<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Qwen on d3v0ps.cloud</title>
    <link>https://d3v0ps.cloud/tags/qwen/</link>
    <description>Recent content in Qwen on d3v0ps.cloud</description>
    <generator>Hugo</generator>
    <language>en</language>
    <copyright>&lt;a href=&#34;https://creativecommons.org/licenses/by-nc/4.0/&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;CC BY-NC 4.0&lt;/a&gt;</copyright>
    <lastBuildDate>Sun, 03 May 2026 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://d3v0ps.cloud/tags/qwen/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>My Local LLM Setup: One Model, Many Personalities</title>
      <link>https://d3v0ps.cloud/posts/2026/05/my-local-llm-setup-one-model-many-personalities/</link>
      <pubDate>Sun, 03 May 2026 00:00:00 +0000</pubDate>
      <guid>https://d3v0ps.cloud/posts/2026/05/my-local-llm-setup-one-model-many-personalities/</guid>
      <description>&lt;h2 id=&#34;hardware&#34;&gt;Hardware&lt;/h2&gt;&#xA;&lt;table&gt;&#xA;  &lt;thead&gt;&#xA;      &lt;tr&gt;&#xA;          &lt;th&gt;Component&lt;/th&gt;&#xA;          &lt;th&gt;Spec&lt;/th&gt;&#xA;      &lt;/tr&gt;&#xA;  &lt;/thead&gt;&#xA;  &lt;tbody&gt;&#xA;      &lt;tr&gt;&#xA;          &lt;td&gt;CPU&lt;/td&gt;&#xA;          &lt;td&gt;11th Gen Intel Core i7-11700K (16 threads) @ 5.00 GHz&lt;/td&gt;&#xA;      &lt;/tr&gt;&#xA;      &lt;tr&gt;&#xA;          &lt;td&gt;GPU 1&lt;/td&gt;&#xA;          &lt;td&gt;NVIDIA GeForce RTX 4060 Ti 16GB (Discrete)&lt;/td&gt;&#xA;      &lt;/tr&gt;&#xA;      &lt;tr&gt;&#xA;          &lt;td&gt;GPU 2&lt;/td&gt;&#xA;          &lt;td&gt;NVIDIA GeForce RTX 4060 Ti 16GB (Discrete)&lt;/td&gt;&#xA;      &lt;/tr&gt;&#xA;      &lt;tr&gt;&#xA;          &lt;td&gt;Memory&lt;/td&gt;&#xA;          &lt;td&gt;128 GiB&lt;/td&gt;&#xA;      &lt;/tr&gt;&#xA;  &lt;/tbody&gt;&#xA;&lt;/table&gt;&#xA;&lt;hr&gt;&#xA;&lt;p&gt;Running a large language model locally is one thing. Serving it intelligently to a variety of workloads is another. This post walks through how I serve a single Qwen 3.6 model via llama.cpp and expose it as multiple purpose-tuned model aliases through LiteLLM, giving each client the right inference parameters without ever loading a second model. A minimal client-side sketch of the pattern follows.&lt;/p&gt;
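&lt;p&gt;The sketch below assumes a LiteLLM proxy on &lt;code&gt;localhost:4000&lt;/code&gt; speaking the OpenAI-compatible API; the base URL, API key, and alias names (&lt;code&gt;qwen-chat&lt;/code&gt;, &lt;code&gt;qwen-code&lt;/code&gt;) are placeholders rather than the exact values from my setup. Both aliases resolve to the same llama.cpp backend, with the proxy applying different default inference parameters per alias.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Sketch only: base URL, API key, and alias names are placeholders.
# Both aliases hit the same llama.cpp model behind the LiteLLM proxy;
# the proxy supplies different default inference parameters per alias.
from openai import OpenAI

client = OpenAI(base_url=&#34;http://localhost:4000&#34;, api_key=&#34;sk-local&#34;)

# General-purpose chat alias
chat = client.chat.completions.create(
    model=&#34;qwen-chat&#34;,
    messages=[{&#34;role&#34;: &#34;user&#34;, &#34;content&#34;: &#34;Summarise these homelab notes.&#34;}],
)

# Code-focused alias: same weights, different server-side defaults
code = client.chat.completions.create(
    model=&#34;qwen-code&#34;,
    messages=[{&#34;role&#34;: &#34;user&#34;, &#34;content&#34;: &#34;Write a Bash one-liner to tail nginx logs.&#34;}],
)

print(chat.choices[0].message.content)
print(code.choices[0].message.content)
&lt;/code&gt;&lt;/pre&gt;</description>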
    </item>
  </channel>
</rss>
