More Gemma4 fixes in the past 24 hours

Posted by andy2na@reddit | LocalLLaMA | View on Reddit | 89 comments

Reasoning budget fix (merged): https://github.com/ggml-org/llama.cpp/pull/21697

New chat templates from Google to fix tool calling:

31B: https://huggingface.co/google/gemma-4-31B-it/blob/main/chat_template.jinja

26B: https://huggingface.co/google/gemma-4-26B-A4B-it/blob/main/chat_template.jinja

E4B: https://huggingface.co/google/gemma-4-E4B-it/blob/main/chat_template.jinja

E2B: https://huggingface.co/google/gemma-4-E2B-it/blob/main/chat_template.jinja

Please correct me if I'm wrong, but you should use these new templates unless you redownload a newly generated GGUF (which would already embed the updated template)

You can point llama.cpp at a specific template with the command-line argument:

--chat-template-file /models/gemma4/gemma4_chat_template_26B.jinja

My current llama-swap/llama.cpp config, 26B example (testing on 16 GB of VRAM, so the context window is limited):

"Gemma4-26B-IQ4_XS":
    ttl: 300  # automatically unload after 5 minutes of inactivity
    # Folded scalar (>): every line must share the same indent. A
    # more-indented line (the original "--port" line had one extra
    # space) is kept verbatim with its newline instead of being
    # folded into the single command line.
    cmd: >
      /usr/local/bin/llama-server
      --port ${PORT}
      --host 127.0.0.1
      --model /models/gemma4/gemma-4-26B-A4B-it-UD-IQ4_XS.gguf
      --mmproj /models/gemma4/gemma-4-26B-A4B-it.mmproj-q8_0.gguf
      --chat-template-file /models/gemma4/gemma4_chat_template_26B_09APR2026.jinja
      --cache-type-k q8_0
      --cache-type-v q8_0
      --n-gpu-layers 99
      --parallel 1
      --batch-size 2048
      --ubatch-size 512
      --ctx-size 16384
      --image-min-tokens 300
      --image-max-tokens 512
      --flash-attn on
      --jinja
      --cache-ram 2048
      -ctxcp 2
    filters:
      # Drop sampling params sent by the client so the per-profile
      # values below always win.
      stripParams: "temperature, top_p, top_k, min_p, presence_penalty, repeat_penalty"

      setParamsByID:
        # Reasoning profile: thinking enabled with a capped budget.
        "${MODEL_ID}:thinking":
          chat_template_kwargs:
            enable_thinking: true
          reasoning_budget: 4096
          temperature: 1.0
          top_p: 0.95
          top_k: 64
          min_p: 0.0
          presence_penalty: 0.0
          repeat_penalty: 1.0

        # Reasoning profile tuned for coding tasks.
        "${MODEL_ID}:thinking-coding":
          chat_template_kwargs:
            enable_thinking: true
          reasoning_budget: 4096
          temperature: 1.5
          top_p: 0.95
          top_k: 65  # NOTE(review): other profiles use 64 — confirm 65 is intentional
          min_p: 0.0
          presence_penalty: 0.0
          repeat_penalty: 1.0

        # Plain instruct profile: thinking disabled.
        "${MODEL_ID}:instruct":
          chat_template_kwargs:
            enable_thinking: false
          temperature: 1.0
          top_p: 0.95
          top_k: 64
          min_p: 0.0
          presence_penalty: 0.0
          repeat_penalty: 1.0