From 77219de9f8d97fcdb07fa3537a2edd597699f9dd Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Sun, 29 Mar 2026 22:36:08 -0700
Subject: [PATCH] feat(server): add model-load chat_template_kwargs

---
 CHANGELOG.md                   |  1 +
 docs/server.md                 | 25 +++++++++++++++++++++++++
 llama_cpp/llama_chat_format.py |  7 ++++++-
 llama_cpp/server/cli.py        | 34 ++++++++++++++++++++++++++++++++--
 llama_cpp/server/model.py      | 15 +++++++++++++++
 llama_cpp/server/settings.py   |  6 +++++-
 6 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3b4c13ee3..e577324db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
 - ci: Publish release wheels as `py3-none` by @Bing-su in #2166
 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
 
diff --git a/docs/server.md b/docs/server.md
index cd6f86c51..9c09a1f1c 100644
--- a/docs/server.md
+++ b/docs/server.md
@@ -22,6 +22,15 @@ The server can then be started by running the following command:
 python3 -m llama_cpp.server --model <model_path>
 ```
 
+To set chat-template kwargs when the model is loaded, pass them to the CLI as a JSON object:
+
+```bash
+python3 -m llama_cpp.server \
+  --model <model_path> \
+  --chat_format chatml \
+  --chat_template_kwargs '{"enable_thinking": true}'
+```
+
 ### Server options
 
 For a full list of options, run:
@@ -147,6 +156,22 @@ The server supports routing requests to multiple models based on the `model` par
 
 At the moment only a single model is loaded into memory at, the server will automatically load and unload models as needed.
 
+In a config file, `chat_template_kwargs` is set directly on a model entry, shown here for a single-model config:
+
+```json
+{
+    "models": [
+        {
+            "model": "models/Qwen3.5-0.8B/qwen3.5-0.8b-q8_0.gguf",
+            "chat_format": "chatml",
+            "chat_template_kwargs": {
+                "enable_thinking": true
+            }
+        }
+    ]
+}
+```
+
 ```json
 {
     "host": "0.0.0.0",
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index d7910e984..1024fb85b 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -243,6 +243,7 @@ def raise_exception(message: str):
             tools=tools,
             tool_choice=tool_choice,
             strftime_now=self.strftime_now,
+            **kwargs,
         )
 
         stopping_criteria = None
@@ -617,6 +618,7 @@ def chat_completion_handler(
             function_call=function_call,
             tools=tools,
             tool_choice=tool_choice,
+            **kwargs,
         )
         prompt = llama.tokenize(
             result.prompt.encode("utf-8"),
@@ -734,7 +736,9 @@ def format_autotokenizer(
         **kwargs: Any,
     ) -> ChatFormatterResponse:
         tokenizer.use_default_system_prompt = False  # type: ignore
-        prompt: str = tokenizer.apply_chat_template(messages, tokenize=False)  # type: ignore
+        prompt: str = tokenizer.apply_chat_template(  # type: ignore
+            messages, tokenize=False, **kwargs
+        )
         assert isinstance(prompt, str)
         # Return formatted prompt and eos token by default
         return ChatFormatterResponse(
@@ -791,6 +795,7 @@ def format_tokenizer_config(
             messages=messages,
             bos_token=bos_token,
             eos_token=eos_token,
+            **kwargs,
         )
         return ChatFormatterResponse(
             prompt=prompt, stop=[eos_token, bos_token], added_special=True
diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py
index 8ed029063..171b8db30 100644
--- a/llama_cpp/server/cli.py
+++ b/llama_cpp/server/cli.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
 import argparse
+import json
 
-from typing import List, Literal, Union, Any, Type, TypeVar
+from typing import List, Literal, Union, Any, Type, TypeVar, Dict
 
 from pydantic import BaseModel
 
@@ -40,6 +41,17 @@ def _contains_list_type(annotation: Type[Any] | None) -> bool:
         return False
 
 
+def _contains_dict_type(annotation: Type[Any] | None) -> bool:
+    """Return True if annotation is a dict generic or a Union/Literal holding one."""
+    outer = getattr(annotation, "__origin__", None)
+    if outer is dict or outer is Dict:
+        return True
+    if outer not in (Literal, Union):
+        return False
+    # Optional[X] is Union[X, None], so every member has to be inspected.
+    return any(_contains_dict_type(member) for member in annotation.__args__)  # type: ignore
+
+
 def _parse_bool_arg(arg: str | bytes | bool) -> bool:
     if isinstance(arg, bytes):
         arg = arg.decode("utf-8")
@@ -57,6 +69,16 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool:
         raise ValueError(f"Invalid boolean argument: {arg}")
 
 
+def _parse_json_object_arg(arg: str | bytes) -> dict[str, Any]:
+    """Parse arg (UTF-8 decoded if bytes) as JSON; raise ValueError unless it is an object."""
+    text = arg.decode("utf-8") if isinstance(arg, bytes) else arg
+    parsed = json.loads(text)
+    if isinstance(parsed, dict):
+        return parsed
+    # Well-formed JSON whose top level is not an object (list, number, ...) is rejected.
+    raise ValueError(f"Invalid JSON object argument: {text}")
+
+
 def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]):
     """Add arguments from a pydantic model to an argparse parser."""
 
@@ -68,7 +90,15 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel])
             _get_base_type(field.annotation) if field.annotation is not None else str
         )
         list_type = _contains_list_type(field.annotation)
-        if base_type is not bool:
+        dict_type = _contains_dict_type(field.annotation)
+        if dict_type:
+            parser.add_argument(
+                f"--{name}",
+                dest=name,
+                type=_parse_json_object_arg,
+                help=description,
+            )
+        elif base_type is not bool:
             parser.add_argument(
                 f"--{name}",
                 dest=name,
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 9e59e8563..3922ce5df 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -299,6 +299,21 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
             # Misc
             verbose=settings.verbose,
         )
+        if settings.chat_template_kwargs:
+            base_chat_handler = (
+                _model.chat_handler
+                or _model._chat_handlers.get(_model.chat_format)
+                or llama_cpp.llama_chat_format.get_chat_completion_handler(
+                    _model.chat_format
+                )
+            )
+
+            def chat_handler_with_kwargs(*args, **kwargs):
+                # Call-time kwargs override settings.chat_template_kwargs on key clashes.
+                merged = {**settings.chat_template_kwargs, **kwargs}
+                return base_chat_handler(*args, **merged)
+
+            _model.chat_handler = chat_handler_with_kwargs
         if settings.cache:
             if settings.cache_type == "disk":
                 if settings.verbose:
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c951241..3c2bb7fd0 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -2,7 +2,7 @@
 
 import multiprocessing
 
-from typing import Optional, List, Literal, Union, Dict, cast
+from typing import Any, Optional, List, Literal, Union, Dict, cast
 from typing_extensions import Self
 
 from pydantic import Field, model_validator
@@ -131,6 +131,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="Chat format to use.",
     )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Extra keyword arguments forwarded to chat templates at model load time. Matches llama.cpp server `chat_template_kwargs`.",
+    )
     clip_model_path: Optional[str] = Field(
         default=None,
         description="Path to a CLIP model to use for multi-modal chat completion.",