<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <!-- Channel metadata for the "vram" tag listing. The generator element
         reports Hugo, so this file is presumably build output. NOTE(review):
         confirm whether changes should be made in the site's RSS template
         rather than in this file. -->
    <title>Vram on d3v0ps.cloud</title>
    <link>https://d3v0ps.cloud/tags/vram/</link>
    <description>Recent content in Vram on d3v0ps.cloud</description>
    <generator>Hugo</generator>
    <language>en</language>
    <!-- Plain text on purpose: only item descriptions are defined to carry
         escaped HTML in RSS 2.0, so an escaped anchor here would be shown to
         readers as literal markup. The licence URL is kept in the text. -->
    <copyright>CC BY-NC 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)</copyright>
    <lastBuildDate>Thu, 28 May 2026 20:22:37 +0000</lastBuildDate>
    <!-- Self-referencing link identifying the URL of this feed document. -->
    <atom:link href="https://d3v0ps.cloud/tags/vram/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Running llama.cpp in Low VRAM: A Practical Setup Guide</title>
      <link>https://d3v0ps.cloud/posts/2026/05/running-llama.cpp-in-low-vram-a-practical-setup-guide/</link>
      <pubDate>Thu, 28 May 2026 20:22:37 +0000</pubDate>
      <guid>https://d3v0ps.cloud/posts/2026/05/running-llama.cpp-in-low-vram-a-practical-setup-guide/</guid>
      <!-- Entity-escaped HTML summary of the post. Each &#xA; is a line feed
           separating the HTML blocks; it is kept as a character reference so
           the description stays on one physical line. -->
      <description>&lt;h2 id="tldr"&gt;TL;DR&lt;/h2&gt;&#xA;&lt;p&gt;I published &lt;a href="https://github.com/aaronbolton/llamacpp-low-vram"&gt;aaronbolton/llamacpp-low-vram&lt;/a&gt; as a minimal, copy-paste starting point for running &lt;code&gt;llama-server&lt;/code&gt; on a GPU with limited VRAM. Adjust &lt;code&gt;--n-cpu-moe&lt;/code&gt; and &lt;code&gt;--ctx-size&lt;/code&gt; for your hardware, and you get an OpenAI-compatible local endpoint without buying a bigger card.&lt;/p&gt;&#xA;&lt;hr&gt;&#xA;&lt;p&gt;Running a large model on a modest GPU is mostly a VRAM budgeting problem. The model weights, the KV cache, and the attention computation all compete for the same pool of memory. This post explains the launch script I use, why each flag is there, and how to tune the two settings that have the most impact.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
