From 807be1183272fac409ce8f08609dbdd0d9f63362 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:48:50 -0700
Subject: [PATCH 01/12] Remove obsolete models/config.yaml and related code

---
 docs/01 - Chat Tab.md        |   2 +-
 docs/12 - OpenAI API.md      |   2 +-
 modules/models.py            |   1 -
 modules/models_settings.py   |   9 +-
 modules/shared.py            |  10 --
 server.py                    |   5 -
 user_data/models/config.yaml | 203 -----------------------------------
 7 files changed, 4 insertions(+), 228 deletions(-)
 delete mode 100644 user_data/models/config.yaml

diff --git a/docs/01 - Chat Tab.md b/docs/01 - Chat Tab.md
index 5104895f28..96b232fa31 100644
--- a/docs/01 - Chat Tab.md	
+++ b/docs/01 - Chat Tab.md	
@@ -112,7 +112,7 @@ Used for talking to an instruction-following model using the prompt format defin
 
 The prompt format is defined by the **Instruction template** parameter in "Parameters" > "Instruction template", which represents a Jinja2 template.
 
-Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any), and will update the values under "Parameters" > "Instruction template" accordingly. This is done using a set of regular expressions defined in `user_data/models/config.yaml`. This detection is not guaranteed to be accurate. You should check the model card on Hugging Face to see if you are using the correct prompt format.
+Note that when you load a model in the "Model" tab, the web UI will try to automatically detect its instruction template (if any) from the model metadata (e.g. `tokenizer_config.json` or GGUF metadata), and will update the values under "Parameters" > "Instruction template" accordingly. You should check the model card on Hugging Face to see if you are using the correct prompt format.
 
 ### Chat-instruct
 
diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md
index 2a7a7f6915..0a076c350a 100644
--- a/docs/12 - OpenAI API.md	
+++ b/docs/12 - OpenAI API.md	
@@ -39,7 +39,7 @@ curl http://127.0.0.1:5000/v1/completions \
 
 #### Chat completions
 
-Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be guessed automatically based on the model name using the regex patterns in `user_data/models/config.yaml`.
+Works best with instruction-following models. If the "instruction_template" variable is not provided, it will be detected automatically from the model metadata.
 
 ```shell
 curl http://127.0.0.1:5000/v1/chat/completions \
diff --git a/modules/models.py b/modules/models.py
index 1d139b89a6..b2665c6bb1 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -67,7 +67,6 @@ def load_model(model_name, loader=None):
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
     logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
-    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
     return model, tokenizer
 
 
diff --git a/modules/models_settings.py b/modules/models_settings.py
index dcface7182..eafa058107 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -23,14 +23,9 @@ def get_fallback_settings():
 
 def get_model_metadata(model):
     model_path = resolve_model_path(model)
-    model_settings = {}
 
-    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
-    settings = shared.model_config
-    for pat in settings:
-        if re.match(pat.lower(), Path(model).name.lower()):
-            for k in settings[pat]:
-                model_settings[k] = settings[pat][k]
+    # Fallback settings
+    model_settings = get_fallback_settings()
 
     path = model_path / 'config.json'
     if path.exists():
diff --git a/modules/shared.py b/modules/shared.py
index 16ccbe77a4..acb103b45d 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -454,17 +454,7 @@ def load_user_config():
 
 args.loader = fix_loader_name(args.loader)
 
-# Load model-specific settings
-p = Path(f'{args.model_dir}/config.yaml')
-if p.exists():
-    model_config = yaml.safe_load(open(p, 'r').read())
-else:
-    model_config = {}
-del p
-
-
 # Load custom model-specific settings
 user_config = load_user_config()
 
-model_config = OrderedDict(model_config)
 user_config = OrderedDict(user_config)
diff --git a/server.py b/server.py
index d224909ce1..88936ca6e8 100644
--- a/server.py
+++ b/server.py
@@ -18,7 +18,6 @@
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
-    get_fallback_settings,
     get_model_metadata,
     update_model_parameters
 )
@@ -271,10 +270,6 @@ def create_interface():
     # Apply CLI overrides for image model settings (CLI flags take precedence over saved settings)
     shared.apply_image_model_cli_overrides()
 
-    # Fallback settings for models
-    shared.model_config['.*'] = get_fallback_settings()
-    shared.model_config.move_to_end('.*', last=False)  # Move to the beginning
-
     # Activate the extensions listed on settings.yaml
     extensions_module.available_extensions = utils.get_available_extensions()
     for extension in shared.settings['default_extensions']:
diff --git a/user_data/models/config.yaml b/user_data/models/config.yaml
deleted file mode 100644
index 038ebcf1ed..0000000000
--- a/user_data/models/config.yaml
+++ /dev/null
@@ -1,203 +0,0 @@
-.*(llama|alpac|vicuna|guanaco|koala|llava|wizardlm|metharme|pygmalion-7b|pygmalion-2|mythalion|wizard-mega|openbuddy|vigogne|h2ogpt-research|manticore):
-  model_type: 'llama'
-.*(opt-|opt_|opt1|opt3|optfor|galactica|galpaca|pygmalion-350m):
-  model_type: 'opt'
-.*(gpt-j|gptj|gpt4all-j|malion-6b|pygway|pygmalion-6b|dolly-v1):
-  model_type: 'gptj'
-.*(gpt-neox|koalpaca-polyglot|polyglot.*koalpaca|polyglot-ko|polyglot_ko|pythia|stablelm|incite|dolly-v2|polycoder|h2ogpt-oig|h2ogpt-oasst1|h2ogpt-gm):
-  model_type: 'gptneox'
-.*bloom:
-  model_type: 'bloom'
-.*gpt2:
-  model_type: 'gpt2'
-.*falcon:
-  model_type: 'falcon'
-.*mpt:
-  model_type: 'mpt'
-.*(starcoder|starchat):
-  model_type: 'starcoder'
-.*dolly-v2:
-  model_type: 'dollyv2'
-.*replit:
-  model_type: 'replit'
-.*(oasst|openassistant-|stablelm-7b-sft-v7-epoch-3):
-  instruction_template: 'Open Assistant'
-  skip_special_tokens: false
-(?!.*galactica)(?!.*reward).*openassistant:
-  instruction_template: 'Open Assistant'
-  skip_special_tokens: false
-.*galactica:
-  skip_special_tokens: false
-.*dolly-v[0-9]-[0-9]*b:
-  instruction_template: 'Alpaca'
-  skip_special_tokens: false
-.*alpaca-native-4bit:
-  instruction_template: 'Alpaca'
-.*llava:
-  instruction_template: 'LLaVA'
-.*llava.*1.5:
-  instruction_template: 'Vicuna-v1.1'
-.*wizard.*mega:
-  instruction_template: 'Wizard-Mega'
-.*starchat-beta:
-  instruction_template: 'Starchat-Beta'
-(?!.*v0)(?!.*1.1)(?!.*1_1)(?!.*stable)(?!.*chinese).*vicuna:
-  instruction_template: 'Vicuna-v0'
-.*vicuna.*v0:
-  instruction_template: 'Vicuna-v0'
-.*vicuna.*(1.1|1_1|1.3|1_3):
-  instruction_template: 'Vicuna-v1.1'
-.*vicuna.*(1.5|1_5):
-  instruction_template: 'Vicuna-v1.1'
-.*stable.*vicuna:
-  instruction_template: 'StableVicuna'
-(?!.*chat).*chinese-vicuna:
-  instruction_template: 'Alpaca'
-.*chinese-vicuna.*chat:
-  instruction_template: 'Chinese-Vicuna-Chat'
-.*alpaca:
-  instruction_template: 'Alpaca'
-.*koala:
-  instruction_template: 'Koala'
-.*chatglm:
-  instruction_template: 'ChatGLM'
-.*(metharme|pygmalion|mythalion):
-  instruction_template: 'Metharme'
-.*raven:
-  instruction_template: 'RWKV-Raven'
-.*moss-moon.*sft:
-  instruction_template: 'MOSS'
-.*stablelm-tuned:
-  instruction_template: 'StableLM'
-.*galactica.*finetuned:
-  instruction_template: 'Galactica Finetuned'
-.*galactica.*-v2:
-  instruction_template: 'Galactica v2'
-(?!.*finetuned)(?!.*-v2).*galactica:
-  instruction_template: 'Galactica'
-.*guanaco:
-  instruction_template: 'Guanaco non-chat'
-.*baize:
-  instruction_template: 'Baize'
-.*mpt-.*instruct:
-  instruction_template: 'Alpaca'
-.*mpt-.*chat:
-  instruction_template: 'ChatML'
-(?!.*-flan-)(?!.*-t5-).*lamini-:
-  instruction_template: 'Alpaca'
-.*incite.*chat:
-  instruction_template: 'INCITE-Chat'
-.*incite.*instruct:
-  instruction_template: 'INCITE-Instruct'
-.*ziya-:
-  instruction_template: 'Ziya'
-.*koalpaca:
-  instruction_template: 'KoAlpaca'
-.*openbuddy:
-  instruction_template: 'OpenBuddy'
-(?!.*chat).*vigogne:
-  instruction_template: 'Vigogne-Instruct'
-.*vigogne.*chat:
-  instruction_template: 'Vigogne-Chat'
-.*(llama-deus|supercot|llama-natural-instructions|open-llama-0.3t-7b-instruct-dolly-hhrlhf|open-llama-0.3t-7b-open-instruct):
-  instruction_template: 'Alpaca'
-.*bactrian:
-  instruction_template: 'Bactrian'
-.*(h2ogpt-oig-|h2ogpt-oasst1-|h2ogpt-research-oasst1-):
-  instruction_template: 'INCITE-Chat'
-.*h2ogpt-gm-:
-  instruction_template: 'H2O-prompt_answer'
-.*manticore:
-  instruction_template: 'Manticore Chat'
-.*bluemoonrp-(30|13)b:
-  instruction_template: 'Bluemoon'
-.*Nous-Hermes-13b:
-  instruction_template: 'Alpaca'
-.*airoboros:
-  instruction_template: 'Vicuna-v1.1'
-.*airoboros.*1.2:
-  instruction_template: 'Airoboros-v1.2'
-.*alpa(cino|sta):
-  instruction_template: 'Alpaca'
-.*hippogriff:
-  instruction_template: 'Hippogriff'
-.*lazarus:
-  instruction_template: 'Alpaca'
-.*guanaco-.*(7|13|33|65)b:
-  instruction_template: 'Vicuna-v0'
-.*hypermantis:
-  instruction_template: 'Alpaca'
-.*open-llama-.*-open-instruct:
-  instruction_template: 'Alpaca'
-.*starcoder-gpteacher-code-instruct:
-  instruction_template: 'Alpaca'
-.*tulu:
-  instruction_template: 'Tulu'
-.*chronos:
-  instruction_template: 'Alpaca'
-.*samantha:
-  instruction_template: 'Samantha'
-.*wizardcoder:
-  instruction_template: 'Alpaca'
-.*minotaur:
-  instruction_template: 'Manticore Chat'
-.*orca_mini:
-  instruction_template: 'Orca Mini'
-.*(platypus|gplatty|superplatty):
-  instruction_template: 'Alpaca'
-.*(openorca-platypus2):
-  instruction_template: 'OpenOrca-Platypus2'
-.*longchat:
-  instruction_template: 'Vicuna-v1.1'
-.*vicuna-33b:
-  instruction_template: 'Vicuna-v1.1'
-.*redmond-hermes-coder:
-  instruction_template: 'Alpaca'
-.*wizardcoder-15b:
-  instruction_template: 'Alpaca'
-.*wizardlm:
-  instruction_template: 'Vicuna-v1.1'
-.*godzilla:
-  instruction_template: 'Alpaca'
-.*llama(-?)(2|v2).*chat:
-  instruction_template: 'Llama-v2'
-.*newhope:
-  instruction_template: 'NewHope'
-.*stablebeluga2:
-  instruction_template: 'StableBeluga2'
-.*openchat:
-  instruction_template: 'OpenChat'
-.*codellama.*instruct:
-  instruction_template: 'Llama-v2'
-.*(mistral|mixtral).*instruct:
-  instruction_template: 'Mistral'
-.*mistral.*openorca:
-  instruction_template: 'ChatML'
-.*(WizardCoder-Python-34B-V1.0|Phind-CodeLlama-34B-v2|CodeBooga-34B-v0.1):
-  instruction_template: 'Alpaca'
-.*orca-2-(13|7)b:
-  instruction_template: 'ChatML'
-.*openhermes.*mistral:
-  instruction_template: 'ChatML'
-.*Yi-34B-Chat:
-  instruction_template: 'ChatML'
-(dolphin).*:
-  instruction_template: 'ChatML'
-.*synthia:
-  instruction_template: 'Synthia'
-.*(hercules|hyperion):
-  instruction_template: 'ChatML'
-.*command-r:
-  instruction_template: 'Command-R'
-.*xwin-lm-70b-v0.1:
-  instruction_template: 'Vicuna-v1.1'
-.*platypus-yi-34b:
-  instruction_template: 'Vicuna-v1.1'
-.*CausalLM-RP-34B:
-  instruction_template: 'ChatML'
-34b-beta:
-  instruction_template: 'ChatML'
-.*airoboros-3_1-yi-34b-200k:
-  instruction_template: 'Llama-v2'
-.*chatqa:
-  instruction_template: 'NVIDIA-ChatQA'

From d6f1485dd189494f6fbe5b6ea7ebd5cc0404233a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 24 Mar 2026 21:45:11 -0700
Subject: [PATCH 02/12] UI: Update the enable_thinking info message

---
 modules/ui_chat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index f1dc7883cf..10d05f659c 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -82,7 +82,7 @@ def create_ui():
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
-                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Seed-OSS and pre-2507 Qwen3.')
+                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 

From 368f37335f634ba001d00d2841902de85c7b48db Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 06:37:45 -0700
Subject: [PATCH 03/12] Fix --idle-timeout issues with encode/decode and
 parallel generation

---
 modules/logits.py          |  4 +---
 modules/models.py          | 15 ++++++++++++++-
 modules/text_generation.py | 18 +++++++++++++-----
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/modules/logits.py b/modules/logits.py
index 1f878f272f..473f589033 100644
--- a/modules/logits.py
+++ b/modules/logits.py
@@ -4,7 +4,6 @@
 
 from modules import models, shared
 from modules.logging_colors import logger
-from modules.models import load_model
 from modules.text_generation import generate_reply
 from modules.utils import check_model_loaded
 
@@ -12,8 +11,7 @@
 
 
 def get_next_logits(*args, **kwargs):
-    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+    models.load_model_if_idle_unloaded()
 
     needs_lock = not args[2]  # use_samplers
     if needs_lock:
diff --git a/modules/models.py b/modules/models.py
index b2665c6bb1..61ca383878 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -1,4 +1,5 @@
 import sys
+import threading
 import time
 
 import modules.shared as shared
@@ -7,6 +8,15 @@
 from modules.utils import resolve_model_path
 
 last_generation_time = time.time()
+active_generation_count = 0
+_generation_count_lock = threading.Lock()
+
+
+def load_model_if_idle_unloaded():  # reload shared.model_name when idle-timeout is enabled and no model is currently loaded
+    global last_generation_time
+    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+        shared.model, shared.tokenizer = load_model(shared.model_name)
+        last_generation_time = time.time()  # restart the idle clock so unload_model_if_idle() does not unload it again right away
 
 
 def load_model(model_name, loader=None):
@@ -158,7 +168,10 @@ def unload_model_if_idle():
     while True:
         shared.generation_lock.acquire()
         try:
-            if time.time() - last_generation_time > shared.args.idle_timeout * 60:
+            with _generation_count_lock:
+                is_active = active_generation_count > 0
+
+            if not is_active and time.time() - last_generation_time > shared.args.idle_timeout * 60:
                 if shared.model is not None:
                     logger.info("Unloading the model for inactivity.")
                     unload_model(keep_model_name=True)
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f77be124f9..3a9ddab531 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -17,9 +17,7 @@
 
 
 def generate_reply(*args, **kwargs):
-    if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
-        from modules.models import load_model
-        shared.model, shared.tokenizer = load_model(shared.model_name)
+    models.load_model_if_idle_unloaded()
 
     state = args[1] if len(args) > 1 else kwargs.get('state', {})
     use_parallel = (
@@ -31,10 +29,16 @@ def generate_reply(*args, **kwargs):
     if not use_parallel:
         shared.generation_lock.acquire()
 
+    with models._generation_count_lock:
+        models.active_generation_count += 1
+
     try:
         for result in _generate_reply(*args, **kwargs):
             yield result
     finally:
+        with models._generation_count_lock:
+            models.active_generation_count -= 1
+
         models.last_generation_time = time.time()
         if not use_parallel:
             shared.generation_lock.release()
@@ -126,7 +130,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
     if shared.tokenizer is None:
-        raise ValueError('No tokenizer is loaded')
+        models.load_model_if_idle_unloaded()
+        if shared.tokenizer is None:
+            raise ValueError('No tokenizer is loaded')
 
     # llama.cpp case
     if shared.model.__class__.__name__ == 'LlamaServer':
@@ -176,7 +182,9 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
 
 def decode(output_ids, skip_special_tokens=True):
     if shared.tokenizer is None:
-        raise ValueError('No tokenizer is loaded')
+        models.load_model_if_idle_unloaded()
+        if shared.tokenizer is None:
+            raise ValueError('No tokenizer is loaded')
 
     return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)
 

From e1541400219043f9b9cebf5f002b48251efc8bf9 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 25 Mar 2026 07:21:02 -0700
Subject: [PATCH 04/12] Rename "truncation length" to "context length" in logs

---
 modules/api/models.py | 2 +-
 modules/models.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/api/models.py b/modules/api/models.py
index c879a860f7..b89397d3dd 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -68,7 +68,7 @@ def _load_model(data):
             if k in shared.settings:
                 shared.settings[k] = settings[k]
                 if k == 'truncation_length':
-                    logger.info(f"TRUNCATION LENGTH (UPDATED): {shared.settings['truncation_length']}")
+                    logger.info(f"CONTEXT LENGTH (UPDATED): {shared.settings['truncation_length']}")
                 elif k == 'instruction_template':
                     logger.info(f"INSTRUCTION TEMPLATE (UPDATED): {shared.settings['instruction_template']}")
 
diff --git a/modules/models.py b/modules/models.py
index 61ca383878..e997d2d864 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -76,7 +76,7 @@ def load_model(model_name, loader=None):
 
     logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
     logger.info(f"LOADER: \"{loader}\"")
-    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
+    logger.info(f"CONTEXT LENGTH: {shared.settings['truncation_length']}")
     return model, tokenizer
 
 

From 4cbea02ed4e0dee2efd066ac48bcdf33631b9eca Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 26 Mar 2026 06:49:39 -0700
Subject: [PATCH 05/12] Add ik_llama.cpp support via `--ik` flag

---
 modules/llama_cpp_server.py | 37 +++++++++++++++++++++++++++++++++++++
 modules/shared.py           |  1 +
 2 files changed, 38 insertions(+)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 2ae01ddc2c..9b9756a95d 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -470,6 +470,10 @@ def _start_server(self):
                         else:
                             cmd.append(f"--{flag_item}")
 
+        # Patch flags for ik_llama.cpp compatibility
+        if shared.args.ik:
+            cmd = _patch_cmd_for_ik(cmd)
+
         env = os.environ.copy()
         if os.name == 'posix':
             current_path = env.get('LD_LIBRARY_PATH', '')
@@ -607,3 +611,36 @@ def filter_stderr_with_progress(process_stderr):
             process_stderr.close()
         except Exception:
             pass
+
+
+def _patch_cmd_for_ik(cmd):
+    """
+    Return a new list with upstream llama.cpp flags rewritten for ik_llama.cpp:
+      --no-webui            → --webui none
+      --fit on | --fit off  → --fit (bare flag) | (removed)
+      --fit-ctx <value>     → (removed, together with its value)
+      --fit-target <value>  → --fit-margin <value> (value kept as-is)
+    """
+    patched = []
+    i = 0
+    while i < len(cmd):
+        arg = cmd[i]
+
+        if arg == "--no-webui":
+            patched += ["--webui", "none"]
+        elif arg == "--fit" and i + 1 < len(cmd) and cmd[i + 1] in ("on", "off"):
+            val = cmd[i + 1]
+            i += 1
+            if val == "on":
+                patched.append("--fit")
+            # "off" → emit nothing (both the flag and its value are dropped)
+        elif arg == "--fit-ctx":
+            i += 1  # also consume the value, so flag and value are both dropped
+        elif arg == "--fit-target":
+            patched.append("--fit-margin")
+        else:
+            patched.append(arg)
+
+        i += 1
+
+    return patched
diff --git a/modules/shared.py b/modules/shared.py
index acb103b45d..c50736d703 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,6 +110,7 @@
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside <venv>/lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')

From bda95172bd6abecba165fc118f140cfc446f3c42 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:09:53 -0700
Subject: [PATCH 06/12] Fix stopping string detection for chromadb/context-1

---
 modules/chat.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index f8088e0fab..edda11b09a 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -671,7 +671,10 @@ def get_stopping_strings(state):
     # Handle GPT-OSS as a special case
     if '<|channel|>final<|message|>' in state['instruction_template_str'] and "<|end|>" in result:
         result.remove("<|end|>")
-        result.append("<|result|>")
+        if '<|result|>' in state['instruction_template_str']:
+            result.append("<|result|>")
+        elif '<|return|>' in state['instruction_template_str']:
+            result.append("<|return|>")
         result = list(set(result))
 
     if shared.args.verbose:

From 9dd04b86ce407507bcaf0862b97aadc64b6e62a6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 06:17:57 -0700
Subject: [PATCH 07/12] Suppress EOS token at logit level for ExLlamav3 when
 ban_eos_token is set

---
 modules/exllamav3.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/modules/exllamav3.py b/modules/exllamav3.py
index 75c76c7c3b..f873503a8b 100644
--- a/modules/exllamav3.py
+++ b/modules/exllamav3.py
@@ -423,6 +423,15 @@ def custom_sort_key(sampler_obj):
         if logit_bias:
             filters.append(LogitBiasFilter(self.tokenizer, logit_bias))
 
+        # Suppress EOS tokens via logit bias so they are never sampled
+        if state['ban_eos_token']:
+            eos_bias = {}
+            for eos_id in self.config.eos_token_id_list:
+                if eos_id is not None:
+                    eos_bias[str(eos_id)] = float('-inf')
+            if eos_bias:
+                filters.append(LogitBiasFilter(self.tokenizer, eos_bias))
+
         # Logprobs support (OpenAI API)
         logprobs = state.get('logprobs', 0) or 0
         return_top_tokens = logprobs if logprobs > 0 else 0

From 4979e87e48c78d5e3186e4d9b2fbc8b30e86164f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 11:49:47 -0300
Subject: [PATCH 08/12] Add ik_llama.cpp support via ik_llama_cpp_binaries
 package

---
 .github/workflows/build-everything-tgw.yml    |  35 +++
 .../build-portable-release-ik-cuda.yml        | 179 +++++++++++++++
 .../workflows/build-portable-release-ik.yml   | 205 ++++++++++++++++++
 modules/llama_cpp_server.py                   |  21 +-
 modules/loaders.py                            |   2 +
 modules/shared.py                             |   2 +-
 modules/ui_model_menu.py                      |   3 +
 requirements/full/requirements.txt            |   6 +-
 requirements/full/requirements_amd.txt        |   4 +-
 .../full/requirements_apple_intel.txt         |   3 +-
 .../full/requirements_apple_silicon.txt       |   3 +-
 requirements/full/requirements_cpu_only.txt   |   6 +-
 requirements/portable/requirements.txt        |   4 +-
 requirements/portable/requirements_amd.txt    |   4 +-
 .../portable/requirements_apple_intel.txt     |   2 +-
 .../portable/requirements_apple_silicon.txt   |   2 +-
 .../portable/requirements_cpu_only.txt        |   4 +-
 .../portable/requirements_cuda131.txt         |   4 +-
 requirements/portable/requirements_vulkan.txt |   4 +-
 19 files changed, 469 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/build-portable-release-ik-cuda.yml
 create mode 100644 .github/workflows/build-portable-release-ik.yml

diff --git a/.github/workflows/build-everything-tgw.yml b/.github/workflows/build-everything-tgw.yml
index 9322f85929..4de591f407 100644
--- a/.github/workflows/build-everything-tgw.yml
+++ b/.github/workflows/build-everything-tgw.yml
@@ -68,3 +68,38 @@ jobs:
     with:
       version: ${{ inputs.version }}
       config: 'os:macos-15-intel,macos-14'
+
+  build_release_ik_cuda_windows:
+    name: ik CUDA Windows
+    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
+  build_release_ik_cuda_linux:
+    name: ik CUDA Linux
+    uses: ./.github/workflows/build-portable-release-ik-cuda.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
+  build_release_ik_cpu_windows:
+    name: ik CPU Windows
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:windows-2022'
+
+  build_release_ik_cpu_linux:
+    name: ik CPU Linux
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:ubuntu-22.04'
+
+  build_release_ik_macos:
+    name: ik macOS
+    uses: ./.github/workflows/build-portable-release-ik.yml
+    with:
+      version: ${{ inputs.version }}
+      config: 'os:macos-14'
diff --git a/.github/workflows/build-portable-release-ik-cuda.yml b/.github/workflows/build-portable-release-ik-cuda.yml
new file mode 100644
index 0000000000..40b4b92f98
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik-cuda.yml
@@ -0,0 +1,179 @@
+name: Build ik CUDA
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CONFIGIN: ${{ inputs.config }}
+      EXCLUDEIN: ${{ inputs.exclude }}
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{
+              'os' = @('ubuntu-22.04', 'windows-2022')
+              'pyver' = @("3.13")
+              'cuda' = @("12.4", "13.1")
+          }
+
+          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+
+          if ($env:EXCLUDEIN -ne 'None') {
+              $exclusions = @()
+              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+              $matrix['exclude'] = $exclusions
+          }
+
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:
+    name: ${{ matrix.os }} ${{ matrix.pyver }} CUDA ${{ matrix.cuda }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      PCKGVER: ${{ inputs.version }}
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: 'oobabooga/text-generation-webui'
+          ref: ${{ inputs.version }}
+          submodules: 'recursive'
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Build Package
+        shell: bash
+        run: |
+            VERSION_CLEAN="${{ inputs.version }}"
+            VERSION_CLEAN="${VERSION_CLEAN#v}"
+            cd ..
+            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Remove extensions that need additional requirements
+            allowed=("character_bias" "gallery" "sd_api_pictures")
+            find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+            # Define common variables
+            CUDA_VERSION="${{ matrix.cuda }}"
+            VERSION="${{ inputs.version }}"
+
+            # 1. Set platform-specific variables
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                PLATFORM="windows"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/python.exe -m pip"
+                PACKAGES_PATH="portable_env/Lib/site-packages"
+                rm start_linux.sh start_macos.sh
+            else
+                PLATFORM="linux"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_macos.sh start_windows.bat
+            fi
+
+            # 2. Download and extract Python
+            cd ..
+            echo "Downloading Python for $PLATFORM..."
+            curl -L -o python-build.tar.gz "$PYTHON_URL"
+            tar -xzf python-build.tar.gz
+            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+            # 3. Prepare requirements file based on CUDA version
+            cd "text-generation-webui-${VERSION_CLEAN}"
+            if [[ "$CUDA_VERSION" == "13.1" ]]; then
+                REQ_FILE="requirements/portable/requirements_cuda131.txt"
+            else
+                REQ_FILE="requirements/portable/requirements.txt"
+            fi
+
+            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+            sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+            sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat start_macos.sh 2>/dev/null || true
+
+            # 5. Install packages
+            echo "Installing Python packages from $REQ_FILE..."
+            $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
+
+            # 6. Clean up
+            rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+            # 7. Create archive
+            cd ..
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+            else
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+            fi
+
+      - name: Upload files to a GitHub release
+        id: upload-release
+        uses: svenstaro/upload-release-action@2.7.0
+        continue-on-error: true
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ../textgen-portable-ik-*
+          tag: ${{ inputs.version }}
+          file_glob: true
+          make_latest: false
+          overwrite: true
diff --git a/.github/workflows/build-portable-release-ik.yml b/.github/workflows/build-portable-release-ik.yml
new file mode 100644
index 0000000000..afb2e76327
--- /dev/null
+++ b/.github/workflows/build-portable-release-ik.yml
@@ -0,0 +1,205 @@
+name: Build ik CPU and macOS
+
+on:
+  workflow_dispatch:  # manual run
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:  # 'Default' keeps the built-in matrix; any other value is parsed by the define_matrix job
+        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:  # 'None' means no matrix exclusions are added
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+  workflow_call:  # same three inputs, so another workflow can invoke this one
+    inputs:
+      version:
+        description: 'Version tag of text-generation-webui to build: v3.0'
+        default: 'v3.0'
+        required: true
+        type: string
+      config:
+        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
+        default: 'Default'
+        required: false
+        type: string
+      exclude:
+        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
+        default: 'None'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+
+jobs:
+  define_matrix:  # builds the JSON matrix that build_wheels expands into one run per combination
+    name: Define Build Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    defaults:
+      run:
+        shell: pwsh
+    env:
+      CONFIGIN: ${{ inputs.config }}
+      EXCLUDEIN: ${{ inputs.exclude }}
+
+    steps:
+      - name: Define Job Output
+        id: set-matrix
+        run: |
+          $matrix = @{
+              'os' = @('ubuntu-22.04', 'windows-2022', 'macos-14')
+              'pyver' = @("3.13")
+          }
+          # Override: every ';'-separated 'key:a,b' entry replaces that matrix key with the list (a, b).
+          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
+          # Exclusions: each ';'-separated group is rewritten to key=value lines and parsed into one entry of matrix.exclude.
+          if ($env:EXCLUDEIN -ne 'None') {
+              $exclusions = @()
+              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
+              $matrix['exclude'] = $exclusions
+          }
+          # Publish the matrix as single-line JSON through the step output file.
+          $matrixOut = ConvertTo-Json $matrix -Compress
+          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
+
+  build_wheels:  # runs once per os/pyver combination produced by define_matrix
+    name: ${{ matrix.os }} ${{ matrix.pyver }}
+    needs: define_matrix
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
+    defaults:
+      run:
+        shell: pwsh  # job default; individual steps may override it
+    env:
+      PCKGVER: ${{ inputs.version }}
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          repository: 'oobabooga/text-generation-webui'  # fixed upstream repository, checked out at the requested tag
+          ref: ${{ inputs.version }}
+          submodules: 'recursive'
+
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.pyver }}
+
+      - name: Build Package
+        shell: bash  # overrides the job's pwsh default; the same script handles Windows, macOS and Linux runners
+        run: |
+            VERSION_CLEAN="$PCKGVER"  # PCKGVER is the job-level env copy of inputs.version (not inlined into the script)
+            VERSION_CLEAN="${VERSION_CLEAN#v}"
+            cd ..
+            cp -r text-generation-webui "text-generation-webui-${VERSION_CLEAN}"
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Remove extensions that need additional requirements
+            allowed=("character_bias" "gallery" "sd_api_pictures")
+            find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
+
+            # Define common variables
+            VERSION="$PCKGVER"
+            OS_TYPE="${{ matrix.os }}"
+
+            # 1. Set platform-specific variables
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                PLATFORM="windows-cpu"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-pc-windows-msvc-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/python.exe -m pip"
+                PACKAGES_PATH="portable_env/Lib/site-packages"
+                rm start_linux.sh start_macos.sh
+            elif [[ "$RUNNER_OS" == "macOS" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then  # not in the default matrix; only reached via a `config` override
+                    PLATFORM="macos-x86_64"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-apple-darwin-install_only_stripped.tar.gz"
+                    REQ_TYPE="apple_intel"
+                else
+                    PLATFORM="macos-arm64"
+                    PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-aarch64-apple-darwin-install_only_stripped.tar.gz"
+                    REQ_TYPE="apple_silicon"
+                fi
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_linux.sh start_windows.bat
+            else
+                # Linux case
+                PLATFORM="linux-cpu"
+                PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20260303/cpython-3.13.12+20260303-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz"
+                PIP_PATH="portable_env/bin/python -m pip"
+                PACKAGES_PATH="portable_env/lib/python3.13/site-packages"
+                rm start_macos.sh start_windows.bat
+            fi
+
+            # 2. Download and extract Python
+            echo "Downloading Python for $PLATFORM..."
+            cd ..
+            curl -fL -o python-build.tar.gz "$PYTHON_URL"  # -f: abort on an HTTP error instead of saving the error page
+            tar -xzf python-build.tar.gz
+            mv python "text-generation-webui-${VERSION_CLEAN}/portable_env"
+
+            # 3. Prepare requirements file based on platform
+            cd "text-generation-webui-${VERSION_CLEAN}"
+
+            # Select requirements file based on platform
+            if [[ "$RUNNER_OS" == "macOS" ]]; then
+                if [[ "$OS_TYPE" == "macos-15-intel" ]]; then
+                    REQ_FILE="requirements/portable/requirements_apple_intel.txt"
+                else
+                    REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
+                fi
+            else
+                REQ_FILE="requirements/portable/requirements_cpu_only.txt"
+            fi
+
+            echo "Using requirements file: $REQ_FILE"
+
+            # 4. Swap llama.cpp wheels for ik_llama.cpp and inject --ik into start scripts
+            if [[ "$RUNNER_OS" == "macOS" ]]; then
+                sed -i '' 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+                sed -i '' 's/--portable/--portable --ik/g' start_macos.sh
+            else
+                sed -i 's|/llama_cpp_binaries-|/ik_llama_cpp_binaries-|g' "$REQ_FILE"
+                sed -i 's/--portable/--portable --ik/g' start_linux.sh start_windows.bat 2>/dev/null || true
+            fi
+
+            # 5. Install packages
+            echo "Installing Python packages from $REQ_FILE..."
+            $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"  # unquoted on purpose: PIP_PATH holds "<python> -m pip"
+
+            # 6. Clean up
+            rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker setup.cfg .github .gitignore requirements/ one_click.py
+
+            # 7. Create archive
+            cd ..
+            if [[ "$RUNNER_OS" == "Windows" ]]; then
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.zip"
+                echo "Creating archive: $ARCHIVE_NAME"
+                powershell -Command "Compress-Archive -Path text-generation-webui-${VERSION_CLEAN} -DestinationPath $ARCHIVE_NAME"
+            else
+                ARCHIVE_NAME="textgen-portable-ik-${VERSION_CLEAN}-${PLATFORM}.tar.gz"
+                echo "Creating archive: $ARCHIVE_NAME"
+                tar czf "$ARCHIVE_NAME" "text-generation-webui-${VERSION_CLEAN}"
+            fi
+
+      - name: Upload files to a GitHub release
+        id: upload-release
+        uses: svenstaro/upload-release-action@2.7.0
+        continue-on-error: true  # a failed upload is reported but does not fail the job
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: ../textgen-portable-ik-*  # the build step runs `cd ..` before archiving, so archives sit one level up
+          tag: ${{ inputs.version }}
+          file_glob: true  # interpret `file` as a glob pattern
+          make_latest: false
+          overwrite: true
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 9b9756a95d..5e2decfa27 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -11,7 +11,6 @@
 from pathlib import Path
 from typing import Any, List
 
-import llama_cpp_binaries
 import requests
 
 from modules import shared
@@ -357,7 +356,16 @@ def _start_server(self):
         """Start the llama.cpp server and wait until it's ready."""
         # Determine the server path
         if self.server_path is None:
-            self.server_path = llama_cpp_binaries.get_binary_path()
+            if shared.args.ik:
+                try:
+                    import ik_llama_cpp_binaries
+                except ImportError:
+                    raise ImportError("--ik requires the ik_llama_cpp_binaries package. Install it with: pip install <ik_llama_cpp_binaries wheel URL>")
+
+                self.server_path = ik_llama_cpp_binaries.get_binary_path()
+            else:
+                import llama_cpp_binaries
+                self.server_path = llama_cpp_binaries.get_binary_path()
 
         # Build the command
         cmd = [
@@ -616,10 +624,12 @@ def filter_stderr_with_progress(process_stderr):
 def _patch_cmd_for_ik(cmd):
     """
     Rewrite upstream llama.cpp flags to ik_llama.cpp equivalents:
-      --no-webui          → --webui none
+      --no-webui           → --webui none
       --fit off            → (removed)
       --fit on / --fit-ctx → --fit (bare flag)
       --fit-target         → --fit-margin
+      --cache-reuse        → (removed, unsupported)
+      --swa-full           → (removed, unsupported)
     """
     patched = []
     i = 0
@@ -635,9 +645,14 @@ def _patch_cmd_for_ik(cmd):
                 patched.append("--fit")
             # "off" → drop entirely
         elif arg == "--fit-ctx":
+            patched.append("--fit")
             i += 1  # skip the value
         elif arg == "--fit-target":
             patched.append("--fit-margin")
+        elif arg == "--cache-reuse":
+            i += 1  # skip the value
+        elif arg == "--swa-full":
+            pass  # bare flag, just drop it
         else:
             patched.append(arg)
 
diff --git a/modules/loaders.py b/modules/loaders.py
index c90f2ebbec..cb1f3d3bad 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -20,6 +20,7 @@
         'no_mmap',
         'mlock',
         'numa',
+        'ik',
         'parallel',
         'model_draft',
         'draft_max',
@@ -345,6 +346,7 @@ def list_model_elements():
         'spec_ngram_size_m',
         'spec_ngram_min_hits',
         'mmproj',
+        'ik',
     ]
 
 
diff --git a/modules/shared.py b/modules/shared.py
index c50736d703..13843f0c52 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -110,7 +110,7 @@
 group.add_argument('--parallel', type=int, default=1, help='Number of parallel request slots. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
 group.add_argument('--fit-target', type=str, default='512', help='Target VRAM margin per device for auto GPU layers, comma-separated list of values in MiB. A single value is broadcast across all devices.')
 group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Example: "--jinja --rpc 192.168.1.100:50052"')
-group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. To install: build ik_llama.cpp, then delete all files inside <venv>/lib/pythonX.Y/site-packages/llama_cpp_binaries/bin/ and copy or symlink the ik_llama.cpp build outputs into that folder.')
+group.add_argument('--ik', action='store_true', help='Use ik_llama.cpp instead of upstream llama.cpp. Requires the ik_llama_cpp_binaries package to be installed.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 5b7621a76e..16505afa55 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -51,6 +51,9 @@ def create_ui():
 
                         with gr.Column():
                             shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+                            if not shared.args.portable:
+                                shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
+
                             shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 5661962700..100c99d17f 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -40,8 +40,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
 https://github.com/turboderp-org/exllamav3/releases/download/v0.0.26/exllamav3-0.0.26+cu128.torch2.9.0-cp313-cp313-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.13"
 https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.9.0cxx11abiFALSE-cp313-cp313-win_amd64.whl; platform_system == "Windows" and python_version == "3.13"
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 620683cce2..66fa4ac76d 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -37,5 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index b1f109b2e0..98dc8be65c 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index a54476a95d..e33264cf85 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -37,4 +37,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index be82c904f7..cd083f6d11 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -37,5 +37,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/ik_llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 188da3809a..671822251c 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 4562b6d01b..5f5b2f8d91 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+rocm7.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 04dcf25e12..f5f7d6eec3 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_x86_64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 4b8af78a73..e51fc29666 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -23,4 +23,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 5b0eaf892e..683f94c890 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cpu-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 90b3234f37..942d087737 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+cu131-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ea72b4ec15..ae784e00b5 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -23,5 +23,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Vulkan wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.98.0/llama_cpp_binaries-0.98.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.99.0/llama_cpp_binaries-0.99.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From be6fc0663ac1b7a60b7fde24afb38de2b0aba57b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sat, 28 Mar 2026 08:11:28 -0700
Subject: [PATCH 09/12] Update the custom gradio wheels

---
 requirements/full/requirements.txt                   | 4 ++--
 requirements/full/requirements_amd.txt               | 4 ++--
 requirements/full/requirements_apple_intel.txt       | 4 ++--
 requirements/full/requirements_apple_silicon.txt     | 4 ++--
 requirements/full/requirements_cpu_only.txt          | 4 ++--
 requirements/full/requirements_nowheels.txt          | 4 ++--
 requirements/portable/requirements.txt               | 4 ++--
 requirements/portable/requirements_amd.txt           | 4 ++--
 requirements/portable/requirements_apple_intel.txt   | 4 ++--
 requirements/portable/requirements_apple_silicon.txt | 4 ++--
 requirements/portable/requirements_cpu_only.txt      | 4 ++--
 requirements/portable/requirements_cuda131.txt       | 4 ++--
 requirements/portable/requirements_nowheels.txt      | 4 ++--
 requirements/portable/requirements_vulkan.txt        | 4 ++--
 14 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 100c99d17f..6e11dd2fa0 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -31,8 +31,8 @@ tqdm
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 66fa4ac76d..c964eff69f 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 98dc8be65c..b1dd6a4fdc 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index e33264cf85..4d03d28007 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index cd083f6d11..9d41d069c2 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 77c254e681..052085cc42 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -28,8 +28,8 @@ trafilatura==2.0.0
 wandb
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index 671822251c..ff80b6c836 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt
index 5f5b2f8d91..318044da94 100644
--- a/requirements/portable/requirements_amd.txt
+++ b/requirements/portable/requirements_amd.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index f5f7d6eec3..1676bffbcd 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index e51fc29666..27fc2da81b 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 683f94c890..0bbdd30a0d 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt
index 942d087737..c3ae3c57dd 100644
--- a/requirements/portable/requirements_cuda131.txt
+++ b/requirements/portable/requirements_cuda131.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index e8457909d9..e38140cebd 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index ae784e00b5..e646c04c93 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -14,8 +14,8 @@ trafilatura==2.0.0
 tqdm
 
 # Gradio
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio-4.37.2+custom.12-py3-none-any.whl
-https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.12/gradio_client-1.0.2+custom.12-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio-4.37.2+custom.13-py3-none-any.whl
+https://github.com/oobabooga/gradio/releases/download/4.37.2-custom.13/gradio_client-1.0.2+custom.13-py3-none-any.whl
 
 # API
 flask_cloudflared==0.0.15

From 0d0287b13574c7dd406c54696be4bf2ea5c3af06 Mon Sep 17 00:00:00 2001
From: ystartgo <startgo@yia.app>
Date: Sun, 15 Feb 2026 20:45:39 +0800
Subject: [PATCH 10/12] Add i18n module with Traditional Chinese (zh_TW) UI translations

---
 modules/i18n.py          | 282 +++++++++++++++++++++++++++++++++++++++
 modules/ui_chat.py       | 125 ++++++++---------
 modules/ui_model_menu.py | 139 +++++++++----------
 modules/ui_parameters.py | 123 ++++++++---------
 4 files changed, 477 insertions(+), 192 deletions(-)
 create mode 100644 modules/i18n.py

diff --git a/modules/i18n.py b/modules/i18n.py
new file mode 100644
index 0000000000..4fb8f60336
--- /dev/null
+++ b/modules/i18n.py
@@ -0,0 +1,282 @@
+from modules import shared
+
+LANG_CHOICES = [  # (language code, display name); NOTE(review): Gradio choice tuples are (label, value) - confirm the order at the call site
+    ("en", "English"),
+    ("zh_TW", "繁體中文"),
+]
+
+_ZH_TW = {
+    "Text Generation Web UI": "文字生成 Web UI",
+    "Chat": "聊天",
+    "Notebook": "筆記本",
+    "Parameters": "參數",
+    "Character": "角色",
+    "Model": "模型",
+    "Image generation": "圖像生成",
+    "Training": "訓練",
+    "Session": "工作階段",
+    "Settings": "設定",
+    "Extensions & flags": "擴充與旗標",
+    "Available extensions": "可用擴充",
+    "Boolean command-line flags": "布林命令列旗標",
+    "Toggle light/dark theme 💡": "切換亮/暗主題 💡",
+    "Show two columns in the Notebook tab": "Notebook 分頁顯示雙欄",
+    "Turn long pasted text into attachments in the Chat tab": "在聊天分頁將長貼上文字轉為附件",
+    "Include attachments/search results from previous messages in the chat prompt": "聊天提示包含前訊息的附件/搜尋結果",
+    "Save extensions settings to user_data/settings.yaml": "儲存擴充設定到 user_data/settings.yaml",
+    "Apply flags/extensions and restart": "套用旗標/擴充並重新啟動",
+    "Language": "語言",
+    "Input": "輸入",
+    "Output": "輸出",
+    "Continue": "繼續",
+    "Stop": "停止",
+    "Generate": "生成",
+    "Prompt": "提示",
+    "New": "新增",
+    "Rename": "重新命名",
+    "Delete": "刪除",
+    "Confirm": "確認",
+    "Cancel": "取消",
+    "Raw": "原始",
+    "Markdown": "Markdown",
+    "HTML": "HTML",
+    "Logits": "Logits",
+    "Tokens": "詞元",
+    "Get next token probabilities": "取得下一個詞元機率",
+    "Use samplers": "使用採樣器",
+    "Previous output": "前一次輸出",
+    "Get token IDs for the input": "取得輸入的詞元 ID",
+    "Render": "渲染",
+    "<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.": "<|character|> 與 <|prompt|> 會分別替換為機器人名稱與一般聊天提示。",
+    "dynatemp_low": "dynatemp_low 低溫",
+    "dynatemp_high": "dynatemp_high 高溫",
+    "dynatemp_exponent": "dynatemp_exponent 指數",
+    "dynamic_temperature": "dynamic_temperature 動態溫度",
+    "mirostat_tau": "mirostat_tau 目標熵",
+    "mirostat_eta": "mirostat_eta 學習率",
+    "Ban the eos_token": "Ban the eos_token 禁用結束詞元",
+    "threads": "threads 執行緒數",
+    "threads_batch": "threads_batch 每批執行緒",
+    "batch_size": "batch_size 批次大小",
+    "ubatch_size": "ubatch_size 微批次大小",
+    "extra-flags": "extra-flags 額外旗標",
+    "rope_freq_base": "rope_freq_base 頻率基準",
+    "mlock": "mlock 鎖定記憶體",
+    "numa": "numa NUMA",
+    "Send a message": "送出訊息",
+    "Incognito chat": "無痕聊天",
+    "no_kv_offload": "no_kv_offload 不卸載 K/V",
+    "Please enter a model path.": "請輸入模型路徑。",
+    "Undo": "復原",
+    "Regenerate": "重新生成",
+    "Generation": "生成",
+    "Preset": "預設",
+    "Restore preset": "還原預設",
+    "Neutralize samplers": "重置採樣器",
+    "Filter by loader": "依載入器篩選",
+    "## Curve shape": "## 曲線形狀",
+    "## Curve cutoff": "## 曲線截斷",
+    "## Repetition suppression": "## 重複抑制",
+    "## Alternative sampling methods": "## 替代採樣方法",
+    "## Other options": "## 其他選項",
+    "Truncate the prompt up to this length": "將提示截斷至此長度",
+    "Seed (-1 for random)": "隨機種子（-1 為隨機）",
+    "Custom system message": "自訂系統訊息",
+    "If not empty, will be used instead of the default one.": "若不為空，將取代預設內容。",
+    "Custom stopping strings": "自訂停止字串",
+    "Token bans": "禁用詞元",
+    "Negative prompt": "負面提示",
+    "Load grammar from file (.gbnf)": "從檔案載入文法（.gbnf）",
+    "Grammar": "文法",
+    "temperature": "temperature 溫度",
+    "Instruction template": "指令模板",
+    "Saved instruction templates": "已儲存的指令模板",
+    "Load": "載入",
+    "Send to notebook": "發送到筆記本",
+    "Send to Notebook": "發送到筆記本",
+    "Chat template": "聊天模板",
+    "Send": "傳送",
+    "Regenerate (Ctrl + Enter)": "重新生成（Ctrl + Enter）",
+    "Continue (Alt + Enter)": "繼續（Alt + Enter）",
+    "Remove last reply (Ctrl + Shift + Backspace)": "移除上一則回覆（Ctrl + Shift + Backspace）",
+    "Impersonate (Ctrl + Shift + M)": "扮演（Ctrl + Shift + M）",
+    "Send dummy message": "送出範例訊息",
+    "Send dummy reply": "送出範例回覆",
+    "Show controls (Ctrl+S)": "顯示控制列（Ctrl+S）",
+    "Start reply with": "以此開頭回覆",
+    "Reasoning effort": "推理強度",
+    "Enable thinking": "啟用思考",
+    "For models with thinking support.": "適用於支援思考的模型。",
+    "Activate web search": "啟用網頁搜尋",
+    "Number of pages to download": "下載頁數",
+    "Mode": "模式",
+    "Chat style": "聊天風格",
+    "Command for chat-instruct mode": "chat-instruct 模式指令",
+    "Count tokens": "計算詞元數",
+    "New chat": "新聊天",
+    "Branch": "分支",
+    "Search chats...": "搜尋聊天…",
+    "Rename to:": "重新命名為：",
+    "New name": "新名稱",
+    "Restore character": "還原角色",
+    "Character's name": "角色名稱",
+    "Context": "情境",
+    "Greeting": "問候語",
+    "User": "使用者",
+    "Name": "名稱",
+    "Description": "描述",
+    "Here you can optionally write a description of yourself.": "此處可選擇性撰寫自我描述。",
+    "Chat history": "聊天記錄",
+    "Save history": "儲存記錄",
+    "Upload History JSON": "上傳記錄 JSON",
+    "Upload character": "上傳角色",
+    "YAML or JSON": "YAML 或 JSON",
+    "JSON or YAML File": "JSON 或 YAML 檔",
+    "Profile Picture (optional)": "大頭貼（可選）",
+    "Submit": "送出",
+    "TavernAI PNG File": "TavernAI PNG 檔",
+    "Character picture": "角色圖片",
+    "Your picture": "你的圖片",
+    "Save settings": "儲存設定",
+    "Unload": "卸載",
+    "Model loader": "模型載入器",
+    "## Main options": "## 主要選項",
+    "See more options": "查看更多選項",
+    "Multimodal (vision)": "多模態（視覺）",
+    "mmproj file": "mmproj 檔案",
+    "Speculative decoding": "投機解碼",
+    "gpu-layers": "GPU 層數",
+    "ctx-size": "上下文長度",
+    "cache-type": "快取類型",
+    "cpu-moe": "將專家移至 CPU（cpu-moe）",
+    "streaming-llm": "啟用 StreamingLLM",
+    "ik": "ik",
+    "Use ik_llama.cpp instead of upstream llama.cpp.": "使用 ik_llama.cpp 取代上游 llama.cpp。",
+    "Download": "下載",
+    "Download model or LoRA": "下載模型或 LoRA",
+    "Get file list": "取得檔案清單",
+    "Customize instruction template": "自訂指令模板",
+    "Select the desired instruction template": "選擇想要的指令模板",
+    "No model is loaded": "尚未載入模型",
+    "Ready": "就緒",
+    "File name (for GGUF models)": "檔名（適用 GGUF 模型）",
+    "This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's medatada, which sometimes is wrong.": "這可為「模型載入器」中目前選取的模型設定自訂模板。之後每次載入該模型時，都會使用此模板，取代模型中標註的模板（有時會標註錯誤）。",
+    "Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.": "輸入 Hugging Face 的使用者/模型路徑，例如：facebook/galactica-125m。若要指定分支，請在最後加上冒號，例如：facebook/galactica-125m:main。若只下載單一檔案，請在第二個輸入框填入檔名。",
+    "Used by GPT-OSS.": "供 GPT-OSS 使用。",
+    "Used by Seed-OSS and pre-2507 Qwen3.": "供 Seed-OSS 與 2507 之前的 Qwen3 使用。",
+    "In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.": "在 instruct 與 chat-instruct 模式下，會使用「參數 > 指令模板」中的模板。",
+    "After selecting the template, click on \"Load\" to load and apply it.": "選擇模板後，點擊「載入」以套用。",
+    "This gets autodetected; you usually don't need to change it. Used in instruct and chat-instruct modes.": "此值通常會自動偵測，通常不需更改。用於 instruct 與 chat-instruct 模式。",
+    "Defines how the chat prompt in chat/chat-instruct modes is generated.": "定義 chat 或 chat-instruct 模式下聊天提示的生成方式。",
+    "Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can't load the model.": "必須大於 0 才會使用 GPU。⚠️ 如果無法載入模型，請降低此值。",
+    "Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072.": "上下文長度。常見值：4096、8192、16384、32768、65536、131072。",
+    "Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7": "每張 GPU 使用的 VRAM（GB）以逗號分隔。範例：20,7,7",
+    "Attention implementation.": "注意力實作。",
+    "Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).": "有效選項：llama.cpp - fp16、q8_0、q4_0；ExLlamaV2 - fp16、fp8、q8、q6、q4；ExLlamaV3 - fp16、q2 到 q8。對 ExLlamaV3，可為 k/v 分別輸入自訂組合（例如 q4_q8）。",
+    "The backend for tensor parallelism.": "張量並行的後端。",
+    "Move the experts to the CPU. Saves VRAM on MoE models.": "將 experts 移至 CPU，可在 MoE 模型節省 VRAM。",
+    "Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.": "啟用 StreamingLLM，移除舊訊息時免重新評估整個提示。",
+    "Used by load-in-4bit.": "供 load-in-4bit 使用。",
+    "Automatically split the model tensors across the available GPUs.": "自動將模型張量分散至可用 GPU。",
+    "Enable tensor parallelism (TP).": "啟用張量並行（TP）。",
+    "Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.": "使用 ModelRunnerCpp 進行推論，通常比預設 ModelRunner 更快。",
+    "Select a file that matches your model. Must be placed in user_data/mmproj/": "選擇與模型相容的檔案，需放在 user_data/mmproj/。",
+    "Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).": "草稿模型。投機解碼僅適用於共享相同詞彙表的模型（例如相同家族）。",
+    "Number of layers to offload to the GPU for the draft model.": "草稿模型要卸載到 GPU 的層數。",
+    "Number of tokens to draft for speculative decoding. Recommended value: 4.": "投機解碼的草稿詞元數。建議值：4。",
+    "Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1": "卸載草稿模型的裝置列表（以逗號分隔）。範例：CUDA0,CUDA1",
+    "Size of the prompt context for the draft model. If 0, uses the same as the main model.": "草稿模型的提示上下文大小。若為 0，則與主模型相同。",
+    "* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.": "* 目前需要在獨立的 Python 3.10 環境手動安裝 TensorRT-LLM。安裝指南請參考此 PR 的說明。\n\n* 僅在勾選 `cpp-runner` 時使用 `ctx_size`。\n\n* 目前 `cpp_runner` 不支援串流輸出。",
+    "Use PyTorch in CPU mode.": "以 CPU 模式使用 PyTorch。",
+    "Split the model by rows across GPUs. This may improve multi-gpu performance.": "將模型依列在多張 GPU 上切分，可能改善多 GPU 效能。",
+    "Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.": "不要將 K、Q、V 卸載至 GPU，可節省 VRAM，但會降低效能。",
+    "NUMA support can help on some systems with non-uniform memory access.": "在非一致性記憶體架構（NUMA）的系統上，啟用可改善效能。",
+    "Necessary to use CFG with this loader.": "使用此載入器啟用 CFG 時所必需。",
+    "Set use_fast=False while loading the tokenizer.": "在載入 tokenizer 時設定 use_fast=False。",
+    "Activates Quadratic Sampling.": "啟用二次取樣（Quadratic Sampling）。",
+    "Adjusts the dropoff curve of Quadratic Sampling.": "調整二次取樣的衰減曲線。",
+    "Probability that the removal will actually happen. 0 disables the sampler. 1 makes it always happen.": "實際執行移除的機率。0 代表停用此採樣器，1 代表必定執行。",
+    "For Contrastive Search. do_sample must be unchecked.": "用於對比式搜尋（Contrastive Search）。需取消勾選 do_sample。",
+    "For CFG. 1.5 is a good value.": "用於 CFG。建議值：1.5。",
+    "Parameter names separated by new lines or commas.": "以換行或逗號分隔參數名稱。",
+    "Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.": "不進行序列比對延續的斷點詞元，以加引號的字串並以逗號分隔。",
+    "Activates Prompt Lookup Decoding.": "啟用提示查找解碼（Prompt Lookup Decoding）。",
+    "Expand max_new_tokens to the available context length.": "將 max_new_tokens 擴展至可用的上下文長度。",
+    "Forces the model to never end the generation prematurely.": "強制模型不要過早結束生成。",
+    "Only applies to text completion (notebook). In chat mode, templates control BOS tokens.": "僅適用於文字補全（Notebook）。聊天模式由模板控制 BOS。",
+    "Some specific models need this unset.": "部分模型需要取消此選項。",
+    "Activate text streaming": "啟用文字串流",
+    "Static KV cache": "靜態 KV 快取",
+    "Use a static cache for improved performance.": "使用靜態快取以提升效能。",
+    "List of proportions to split the model across multiple GPUs. Example: 60,40": "模型在多張 GPU 間的比例設定。範例：60,40",
+    "tensor_split": "張量切分比例",
+    "auto_max_new_tokens": "自動調整最大新詞元數",
+    "Add the bos_token to the beginning of prompts": "在提示開頭加入 BOS 詞元",
+    "Additional flags to pass to llama-server. Format: \"flag1=value1,flag2,flag3=value3\". Example: \"override-tensor=exps=CPU\"": "傳遞給 llama-server 的額外旗標。格式：「flag1=value1,flag2,flag3=value3」。範例：「override-tensor=exps=CPU」。",
+    "Maximum CPU memory in GiB. Use this for CPU offloading.": "最大 CPU 記憶體（GiB）。用於 CPU 卸載。",
+    "Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.": "NTK RoPE 縮放用的位置嵌入 alpha 因子。建議值（NTKv1）：1.75 對應 1.5x 上下文、2.5 對應 2x 上下文。與 compress_pos_emb 擇一使用。",
+    "Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.": "NTK RoPE 縮放用的位置嵌入頻率基準。與 alpha_value 的關係：rope_freq_base = 10000 * alpha_value ^ (64 / 63)。0 表示使用模型內建值。",
+    "Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.": "位置嵌入壓縮係數。應設為（新上下文長度）/（模型原始上下文長度）。等同 1/rope_freq_scale。",
+    "compress_pos_emb": "位置嵌入壓縮係數",
+    "row_split": "列切分（row_split）",
+    "no-mmap": "停用 mmap（no-mmap）",
+    "Only applies to MoE models like Mixtral.": "僅適用於 Mixtral 等 MoE 模型。",
+    "Estimated VRAM to load the model:": "載入模型的預估 VRAM：",
+    "mirostat_mode": "mirostat 模式",
+    "mode=1 is for llama.cpp only.": "mode=1 只適用於 llama.cpp。",
+    "Note that some of these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt": "部分擴充可能需要手動安裝 Python 相依套件：pip install -r extensions/extension_name/requirements.txt",
+    "Used in chat and chat-instruct modes.": "用於 chat 與 chat-instruct 模式。",
+    "max_new_tokens": "最大新詞元數",
+    "⚠️ Setting this too high can cause prompt truncation.": "⚠️ 設定過高可能導致提示被截斷。",
+    "The leftmost tokens are removed if the prompt exceeds this length.": "若提示超過此長度，會移除最左側的詞元。",
+    "Sampler priority": "採樣器優先順序",
+    "Maximum tokens/second": "每秒最大詞元數",
+    "To make text readable in real time.": "讓文字即時可讀。",
+    "min_p": "min_p 最小機率",
+    "xtc_threshold": "xtc_threshold XTC 臨界值",
+    "xtc_probability": "xtc_probability XTC 機率",
+    "dry_base": "dry 基數",
+    # "dry_allowed_length" is defined once, next to "dry_multiplier" below (a duplicate key here was silently overridden).
+    "dry_sequence_breakers": "dry_sequence_breakers 斷序詞元",
+    "repetition_penalty": "repetition_penalty 重複懲罰",
+    "frequency_penalty": "frequency_penalty 頻率懲罰",
+    "presence_penalty": "presence_penalty 出現懲罰",
+    "repetition_penalty_range": "repetition_penalty_range 重複懲罰範圍",
+    "temperature_last": "temperature_last 溫度後置",
+    "If 2 or more tokens have probability above this threshold, consider removing all but the last one.": "若有 2 個以上詞元的機率高於此門檻，考慮只保留最後一個。",
+    "Set to greater than 0 to enable DRY. Recommended value: 0.8.": "設為大於 0 以啟用 DRY。建議值：0.8。",
+    "Longest sequence that can be repeated without being penalized.": "不受懲罰可重複的最長序列長度。",
+    "Controls how fast the penalty grows with increasing sequence length.": "控制懲罰隨序列長度增加的成長速度。",
+    "Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in \"Sampler priority\".": "將溫度/動態溫度/二次取樣移至採樣器堆疊末端，忽略其在「採樣器優先順序」中的位置。",
+    "top_n_sigma": "top_n_sigma Top-N 標準差",
+    "top_p": "top_p Top-p",
+    "top_k": "top_k Top-k",
+    "typical_p": "typical_p Typical-p",
+    "dry_multiplier": "dry 乘數",
+    "dry_allowed_length": "允許重複長度",
+    "Good morning!": "早安！",
+    "Good afternoon!": "午安！",
+    "Good evening!": "晚安！",
+    "How can I help you today?": "今天我可以怎麼幫你？",
+}
+
+_MAP = {  # language code -> translation table; codes without an entry (e.g. "en") are left untranslated by t()
+    "zh_TW": _ZH_TW,
+}
+
+
+def lang():  # Return the UI language code from shared.settings, normalized to "en" / "zh_TW" where recognized.
+    v = shared.settings.get("language", "en")  # falls back to "en" when the setting is absent
+    # Map known aliases (display names and locale-style spellings) onto the canonical codes
+    if v in ("English", "en_US", "en-GB"):
+        return "en"
+    if v in ("zh_TW", "zh-TW", "繁體中文"):
+        return "zh_TW"
+    return v  # any other value is returned unchanged
+
+
+def t(s: str) -> str:  # Translate UI string `s` for the current language; returns `s` itself when no translation exists.
+    d = _MAP.get(lang())  # translation table for the active language, or None
+    if not d:
+        return s  # no (or empty) table for this language: keep the source string
+    return d.get(s, s)  # keys missing from the table fall back to the source string
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 10d05f659c..b3f13029b2 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -9,6 +9,7 @@
 from modules.html_generator import chat_html_wrapper
 from modules.text_generation import stop_everything_event
 from modules.utils import gradio
+from modules.i18n import t
 
 inputs = ('Chat input', 'interface_state')
 reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style', 'character_menu')
@@ -21,28 +22,28 @@ def create_ui():
     shared.gradio['history'] = gr.State({'internal': [], 'visible': [], 'metadata': {}})
     shared.gradio['display'] = gr.Headless(value={})
 
-    with gr.Tab('Chat', elem_id='chat-tab'):
+    with gr.Tab(t('Chat'), elem_id='chat-tab'):
         with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
             with gr.Column():
                 with gr.Row(elem_id='past-chats-buttons'):
-                    shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes=['refresh-button', 'refresh-button-medium'], elem_id='Branch', interactive=not mu)
-                    shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes=['refresh-button', 'refresh-button-medium'], interactive=not mu)
+                    shared.gradio['branch_chat'] = gr.Button(t('Branch'), elem_classes=['refresh-button', 'refresh-button-medium'], elem_id='Branch', interactive=not mu)
+                    shared.gradio['rename_chat'] = gr.Button(t('Rename'), elem_classes=['refresh-button', 'refresh-button-medium'], interactive=not mu)
                     shared.gradio['delete_chat'] = gr.Button('🗑️', visible=False, elem_classes='refresh-button', interactive=not mu, elem_id='delete_chat')
-                    shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'refresh-button-medium', 'focus-on-chat-input'], elem_id='new-chat-btn')
-                    shared.gradio['Start incognito chat'] = gr.Button('Incognito chat', visible=False, elem_id='incognito-chat-btn')
+                    shared.gradio['Start new chat'] = gr.Button(t('New chat'), elem_classes=['refresh-button', 'refresh-button-medium', 'focus-on-chat-input'], elem_id='new-chat-btn')
+                    shared.gradio['Start incognito chat'] = gr.Button(t('Incognito chat'), visible=False, elem_id='incognito-chat-btn')
                     shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True)
 
-                shared.gradio['search_chat'] = gr.Textbox(placeholder='Search chats...', max_lines=1, elem_id='search_chat')
+                shared.gradio['search_chat'] = gr.Textbox(placeholder=t('Search chats...'), max_lines=1, elem_id='search_chat')
 
                 with gr.Row(elem_id='delete-chat-row', visible=False) as shared.gradio['delete-chat-row']:
-                    shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input'], elem_id='delete_chat-cancel')
-                    shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', elem_classes=['refresh-button', 'focus-on-chat-input'], elem_id='delete_chat-confirm')
+                    shared.gradio['delete_chat-cancel'] = gr.Button(t('Cancel'), elem_classes=['refresh-button', 'focus-on-chat-input'], elem_id='delete_chat-cancel')
+                    shared.gradio['delete_chat-confirm'] = gr.Button(t('Confirm'), variant='stop', elem_classes=['refresh-button', 'focus-on-chat-input'], elem_id='delete_chat-confirm')
 
                 with gr.Row(elem_id='rename-row', visible=False) as shared.gradio['rename-row']:
-                    shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', elem_classes=['no-background'])
+                    shared.gradio['rename_to'] = gr.Textbox(label=t('Rename to:'), placeholder=t('New name'), elem_classes=['no-background'])
                     with gr.Row():
-                        shared.gradio['rename_to-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input'])
-                        shared.gradio['rename_to-confirm'] = gr.Button('Confirm', elem_classes=['refresh-button', 'focus-on-chat-input'], variant='primary')
+                        shared.gradio['rename_to-cancel'] = gr.Button(t('Cancel'), elem_classes=['refresh-button', 'focus-on-chat-input'])
+                        shared.gradio['rename_to-confirm'] = gr.Button(t('Confirm'), elem_classes=['refresh-button', 'focus-on-chat-input'], variant='primary')
 
                 with gr.Row():
                     shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats')
@@ -55,40 +56,40 @@ def create_ui():
                         gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')
 
                     with gr.Column(scale=10, elem_id='chat-input-container'):
-                        shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf', 'image'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
+                        shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder=t('Send a message'), file_types=['text', '.pdf', 'image'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
                         shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>', label='typing', elem_id='typing-container')
 
                     with gr.Column(scale=1, elem_id='generate-stop-container'):
                         with gr.Row():
-                            shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop', visible=False)
-                            shared.gradio['Generate'] = gr.Button('Send', elem_id='Generate', variant='primary')
+                            shared.gradio['Stop'] = gr.Button(t('Stop'), elem_id='stop', visible=False)
+                            shared.gradio['Generate'] = gr.Button(t('Send'), elem_id='Generate', variant='primary')
 
         # Hover menu buttons
         with gr.Column(elem_id='chat-buttons'):
-            shared.gradio['Regenerate'] = gr.Button('Regenerate (Ctrl + Enter)', elem_id='Regenerate')
-            shared.gradio['Continue'] = gr.Button('Continue (Alt + Enter)', elem_id='Continue')
-            shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last')
-            shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate')
-            shared.gradio['Send dummy message'] = gr.Button('Send dummy message')
-            shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply')
-            shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook')
-            shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls')
+            shared.gradio['Regenerate'] = gr.Button(t('Regenerate (Ctrl + Enter)'), elem_id='Regenerate')
+            shared.gradio['Continue'] = gr.Button(t('Continue (Alt + Enter)'), elem_id='Continue')
+            shared.gradio['Remove last'] = gr.Button(t('Remove last reply (Ctrl + Shift + Backspace)'), elem_id='Remove-last')
+            shared.gradio['Impersonate'] = gr.Button(t('Impersonate (Ctrl + Shift + M)'), elem_id='Impersonate')
+            shared.gradio['Send dummy message'] = gr.Button(t('Send dummy message'))
+            shared.gradio['Send dummy reply'] = gr.Button(t('Send dummy reply'))
+            shared.gradio['send-chat-to-notebook'] = gr.Button(t('Send to Notebook'))
+            shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label=t('Show controls (Ctrl+S)'), elem_id='show-controls')
 
         with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']):
             with gr.Column():
                 with gr.Row():
-                    shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
+                    shared.gradio['start_with'] = gr.Textbox(label=t('Start reply with'), placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
-                shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label='Reasoning effort', info='Used by GPT-OSS.')
-                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='For models with thinking support.')
+                shared.gradio['reasoning_effort'] = gr.Dropdown(value=shared.settings['reasoning_effort'], choices=['low', 'medium', 'high'], label=t('Reasoning effort'), info=t('Used by GPT-OSS.'))
+                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label=t('Enable thinking'), info=t('For models with thinking support.'))
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
-                shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
+                shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label=t('Activate web search'), elem_id='web-search')
                 with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
-                    shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
+                    shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label=t('Number of pages to download'), minimum=1, maximum=10)
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
@@ -108,18 +109,18 @@ def sync_web_tools(selected):
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 with gr.Row():
-                    shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label='Mode', info='In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.', elem_id='chat-mode')
+                    shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=None, label=t('Mode'), info=t('In instruct and chat-instruct modes, the template under Parameters > Instruction template is used.'), elem_id='chat-mode')
 
                 with gr.Row():
-                    shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
+                    shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label=t('Chat style'), value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
 
                 with gr.Row():
-                    shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
+                    shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label=t('Command for chat-instruct mode'), info=t('<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.'), visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
 
                 gr.HTML("<div class='sidebar-vertical-separator'></div>")
 
                 with gr.Row():
-                    shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm')
+                    shared.gradio['count_tokens'] = gr.Button(t('Count tokens'), size='sm')
 
                 shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display')
 
@@ -137,72 +138,72 @@ def sync_web_tools(selected):
 
 def create_character_settings_ui():
     mu = shared.args.multi_user
-    with gr.Tab('Character', elem_id="character-tab"):
+    with gr.Tab(t('Character'), elem_id="character-tab"):
         with gr.Row():
             with gr.Column(scale=8):
-                with gr.Tab("Character"):
+                with gr.Tab(t("Character")):
                     with gr.Row():
-                        shared.gradio['character_menu'] = gr.Dropdown(value=shared.settings['character'], choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
+                        shared.gradio['character_menu'] = gr.Dropdown(value=shared.settings['character'], choices=utils.get_available_characters(), label=t('Character'), elem_id='character-menu', info=t('Used in chat and chat-instruct modes.'), elem_classes='slim-dropdown')
                         ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
                         shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', elem_id="save-character", interactive=not mu)
                         shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
-                        shared.gradio['restore_character'] = gr.Button('Restore character', elem_classes='refresh-button', interactive=True, elem_id='restore-character')
+                        shared.gradio['restore_character'] = gr.Button(t('Restore character'), elem_classes='refresh-button', interactive=True, elem_id='restore-character')
 
-                    shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name')
-                    shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=10, label='Context', elem_classes=['add_scrollbar'], elem_id="character-context")
-                    shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=5, label='Greeting', elem_classes=['add_scrollbar'], elem_id="character-greeting")
+                    shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label=t('Character\'s name'))
+                    shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=10, label=t('Context'), elem_classes=['add_scrollbar'], elem_id="character-context")
+                    shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=5, label=t('Greeting'), elem_classes=['add_scrollbar'], elem_id="character-greeting")
 
-                with gr.Tab("User"):
+                with gr.Tab(t("User")):
                     with gr.Row():
-                        shared.gradio['user_menu'] = gr.Dropdown(value=shared.settings['user'], choices=utils.get_available_users(), label='User', elem_id='user-menu', info='Select a user profile.', elem_classes='slim-dropdown')
+                        shared.gradio['user_menu'] = gr.Dropdown(value=shared.settings['user'], choices=utils.get_available_users(), label=t('User'), elem_id='user-menu', info=t('Select a user profile.'), elem_classes='slim-dropdown')
                         ui.create_refresh_button(shared.gradio['user_menu'], lambda: None, lambda: {'choices': utils.get_available_users()}, 'refresh-button', interactive=not mu)
                         shared.gradio['save_user'] = gr.Button('💾', elem_classes='refresh-button', elem_id="save-user", interactive=not mu)
                         shared.gradio['delete_user'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
 
-                    shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Name')
-                    shared.gradio['user_bio'] = gr.Textbox(value=shared.settings['user_bio'], lines=10, label='Description', info='Here you can optionally write a description of yourself.', placeholder='{{user}}\'s personality: ...', elem_classes=['add_scrollbar'], elem_id="user-description")
+                    shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label=t('Name'))
+                    shared.gradio['user_bio'] = gr.Textbox(value=shared.settings['user_bio'], lines=10, label=t('Description'), info=t('Here you can optionally write a description of yourself.'), placeholder='{{user}}\'s personality: ...', elem_classes=['add_scrollbar'], elem_id="user-description")
 
-                with gr.Tab('Chat history'):
+                with gr.Tab(t('Chat history')):
                     with gr.Row():
                         with gr.Column():
-                            shared.gradio['save_chat_history'] = gr.Button(value='Save history')
+                            shared.gradio['save_chat_history'] = gr.Button(value=t('Save history'))
 
                         with gr.Column():
-                            shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON')
+                            shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label=t('Upload History JSON'))
 
-                with gr.Tab('Upload character'):
-                    with gr.Tab('YAML or JSON'):
+                with gr.Tab(t('Upload character')):
+                    with gr.Tab(t('YAML or JSON')):
                         with gr.Row():
-                            shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File', interactive=not mu)
-                            shared.gradio['upload_img_bot'] = gr.Image(type='filepath', label='Profile Picture (optional)', interactive=not mu)
+                            shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label=t('JSON or YAML File'), interactive=not mu)
+                            shared.gradio['upload_img_bot'] = gr.Image(type='filepath', label=t('Profile Picture (optional)'), interactive=not mu)
 
-                        shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False)
+                        shared.gradio['Submit character'] = gr.Button(value=t('Submit'), interactive=False)
 
                     with gr.Tab('TavernAI PNG'):
                         with gr.Row():
                             with gr.Column():
-                                shared.gradio['upload_img_tavern'] = gr.Image(type='filepath', label='TavernAI PNG File', elem_id='upload_img_tavern', interactive=not mu)
+                                shared.gradio['upload_img_tavern'] = gr.Image(type='filepath', label=t('TavernAI PNG File'), elem_id='upload_img_tavern', interactive=not mu)
                                 shared.gradio['tavern_json'] = gr.State()
                             with gr.Column():
-                                shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False)
-                                shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=10, label='Description', interactive=False, elem_classes=['add_scrollbar'])
+                                shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label=t('Name'), interactive=False)
+                                shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=10, label=t('Description'), interactive=False, elem_classes=['add_scrollbar'])
 
-                        shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False)
+                        shared.gradio['Submit tavern character'] = gr.Button(value=t('Submit'), interactive=False)
 
             with gr.Column(scale=1):
-                shared.gradio['character_picture'] = gr.Image(label='Character picture', type='filepath', interactive=not mu)
-                shared.gradio['your_picture'] = gr.Image(label='Your picture', type='filepath', value=Image.open(shared.user_data_dir / 'cache' / 'pfp_me.png') if (shared.user_data_dir / 'cache' / 'pfp_me.png').exists() else None, interactive=not mu)
+                shared.gradio['character_picture'] = gr.Image(label=t('Character picture'), type='filepath', interactive=not mu)
+                shared.gradio['your_picture'] = gr.Image(label=t('Your picture'), type='filepath', value=Image.open(shared.user_data_dir / 'cache' / 'pfp_me.png') if (shared.user_data_dir / 'cache' / 'pfp_me.png').exists() else None, interactive=not mu)
 
 
 def create_chat_settings_ui():
     mu = shared.args.multi_user
-    with gr.Tab('Instruction template'):
+    with gr.Tab(t('Instruction template')):
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-                    shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Saved instruction templates', info="After selecting the template, click on \"Load\" to load and apply it.", value='None', elem_classes='slim-dropdown')
+                    shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label=t('Saved instruction templates'), info=t("After selecting the template, click on \"Load\" to load and apply it."), value='None', elem_classes=['slim-dropdown'])
                     ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button', interactive=not mu)
-                    shared.gradio['load_template'] = gr.Button("Load", elem_classes='refresh-button')
+                    shared.gradio['load_template'] = gr.Button(t("Load"), elem_classes='refresh-button')
                     shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
                     shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button', interactive=not mu)
 
@@ -211,12 +212,12 @@ def create_chat_settings_ui():
 
         with gr.Row():
             with gr.Column():
-                shared.gradio['instruction_template_str'] = gr.Textbox(value=shared.settings['instruction_template_str'], label='Instruction template', lines=24, info='This gets autodetected; you usually don\'t need to change it. Used in instruct and chat-instruct modes.', elem_classes=['add_scrollbar', 'monospace'], elem_id='instruction-template-str')
+                shared.gradio['instruction_template_str'] = gr.Textbox(value=shared.settings['instruction_template_str'], label=t('Instruction template'), lines=24, info=t("This gets autodetected; you usually don't need to change it. Used in instruct and chat-instruct modes."), elem_classes=['add_scrollbar', 'monospace'], elem_id='instruction-template-str')
                 with gr.Row():
-                    shared.gradio['send_instruction_to_notebook'] = gr.Button('Send to notebook', elem_classes=['small-button'])
+                    shared.gradio['send_instruction_to_notebook'] = gr.Button(t('Send to notebook'), elem_classes=['small-button'])
 
             with gr.Column():
-                shared.gradio['chat_template_str'] = gr.Textbox(value=shared.settings['chat_template_str'], label='Chat template', lines=22, elem_classes=['add_scrollbar', 'monospace'], info='Defines how the chat prompt in chat/chat-instruct modes is generated.', elem_id='chat-template-str')
+                shared.gradio['chat_template_str'] = gr.Textbox(value=shared.settings['chat_template_str'], label=t('Chat template'), lines=22, elem_classes=['add_scrollbar', 'monospace'], info=t('Defines how the chat prompt in chat/chat-instruct modes is generated.'), elem_id='chat-template-str')
 
 
 def create_event_handlers():
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 16505afa55..9ab87e2fca 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -21,126 +21,127 @@
     update_model_parameters
 )
 from modules.utils import gradio
+from modules.i18n import t
 
 
 def create_ui():
     mu = shared.args.multi_user
 
-    with gr.Tab("Model", elem_id="model-tab"):
+    with gr.Tab(t("Model"), elem_id="model-tab"):
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-                    shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
+                    shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label=t('Model'), elem_classes='slim-dropdown', interactive=not mu)
                     ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
-                    shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu)
-                    shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
-                    shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['load_model'] = gr.Button(t("Load"), elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['unload_model'] = gr.Button(t("Unload"), elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['save_model_settings'] = gr.Button(t("Save settings"), elem_classes='refresh-button', interactive=not mu)
 
-                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)
+                shared.gradio['loader'] = gr.Dropdown(label=t("Model loader"), choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)
                 with gr.Blocks():
-                    gr.Markdown("## Main options")
+                    gr.Markdown(t("## Main options"))
                     with gr.Row():
                         with gr.Column():
-                            shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=-1, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Number of layers to offload to the GPU. -1 = auto.')
-                            shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=0, maximum=1048576, step=1024, value=shared.args.ctx_size, info='Context length. 0 = auto for llama.cpp (requires gpu-layers=-1), 8192 for other loaders. Common values: 4096, 8192, 16384, 32768, 65536, 131072.')
-                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
-                            shared.gradio['attn_implementation'] = gr.Dropdown(label="attn-implementation", choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info='Attention implementation.')
-                            shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
-                            shared.gradio['fit_target'] = gr.Textbox(label='fit-target', value=shared.args.fit_target, info='Target VRAM margin per device for auto GPU layers (MiB). Comma-separated list for multiple devices.')
-                            shared.gradio['tp_backend'] = gr.Dropdown(label="tp-backend", choices=['native', 'nccl'], value=shared.args.tp_backend, info='The backend for tensor parallelism.')
+                            shared.gradio['gpu_layers'] = gr.Slider(label=t("gpu-layers"), minimum=-1, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info=t('Number of layers to offload to the GPU. -1 = auto.'))
+                            shared.gradio['ctx_size'] = gr.Slider(label=t('ctx-size'), minimum=0, maximum=1048576, step=1024, value=shared.args.ctx_size, info=t('Context length. 0 = auto for llama.cpp (requires gpu-layers=-1), 8192 for other loaders. Common values: 4096, 8192, 16384, 32768, 65536, 131072.'))
+                            shared.gradio['gpu_split'] = gr.Textbox(label=t('gpu-split'), info=t('Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7'))
+                            shared.gradio['attn_implementation'] = gr.Dropdown(label=t("attn-implementation"), choices=['sdpa', 'eager', 'flash_attention_2'], value=shared.args.attn_implementation, info=t('Attention implementation.'))
+                            shared.gradio['cache_type'] = gr.Dropdown(label=t("cache-type"), choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info=t('Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).'))
+                            shared.gradio['fit_target'] = gr.Textbox(label=t('fit-target'), value=shared.args.fit_target, info=t('Target VRAM margin per device for auto GPU layers (MiB). Comma-separated list for multiple devices.'))
+                            shared.gradio['tp_backend'] = gr.Dropdown(label=t("tp-backend"), choices=['native', 'nccl'], value=shared.args.tp_backend, info=t('The backend for tensor parallelism.'))
 
                         with gr.Column():
                             shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
                             if not shared.args.portable:
-                                shared.gradio['ik'] = gr.Checkbox(label="ik", value=shared.args.ik, info='Use ik_llama.cpp instead of upstream llama.cpp.')
+                                shared.gradio['ik'] = gr.Checkbox(label=t("ik"), value=shared.args.ik, info=t('Use ik_llama.cpp instead of upstream llama.cpp.'))
 
-                            shared.gradio['cpu_moe'] = gr.Checkbox(label="cpu-moe", value=shared.args.cpu_moe, info='Move the experts to the CPU. Saves VRAM on MoE models.')
-                            shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
+                            shared.gradio['cpu_moe'] = gr.Checkbox(label=t("cpu-moe"), value=shared.args.cpu_moe, info=t('Move the experts to the CPU. Saves VRAM on MoE models.'))
+                            shared.gradio['streaming_llm'] = gr.Checkbox(label=t("streaming-llm"), value=shared.args.streaming_llm, info=t('Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.'))
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                             shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
-                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
-                            shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable tensor parallelism (TP).')
+                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info=t('Used by load-in-4bit.'))
+                            shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info=t('Enable tensor parallelism (TP).'))
                             shared.gradio['tensorrt_llm_info'] = gr.Markdown(
-                                '* TensorRT-LLM has to be installed manually: `pip install tensorrt_llm==1.1.0 --extra-index-url https://pypi.nvidia.com`.\n\n'
-                                '* You can load either a pre-built TensorRT engine or a regular HF model. '
-                                'HF models will be compiled to a TensorRT engine automatically on each load (this can take a while).'
+                                t('* TensorRT-LLM has to be installed manually: `pip install tensorrt_llm==1.1.0 --extra-index-url https://pypi.nvidia.com`.\n\n')
+                                + t('* You can load either a pre-built TensorRT engine or a regular HF model. ')
+                                + t('HF models will be compiled to a TensorRT engine automatically on each load (this can take a while).')
                             )
 
                             # Multimodal
-                            with gr.Accordion("Multimodal (vision)", open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']:
+                            with gr.Accordion(t("Multimodal (vision)"), open=False, elem_classes='tgw-accordion') as shared.gradio['mmproj_accordion']:
                                 with gr.Row():
-                                    shared.gradio['mmproj'] = gr.Dropdown(label="mmproj file", choices=utils.get_available_mmproj(), value=lambda: shared.args.mmproj or 'None', elem_classes='slim-dropdown', info=f'Select a file that matches your model. Must be placed in {shared.user_data_dir}/mmproj/', interactive=not mu)
+                                    shared.gradio['mmproj'] = gr.Dropdown(label=t("mmproj file"), choices=utils.get_available_mmproj(), value=lambda: shared.args.mmproj or 'None', elem_classes='slim-dropdown', info=f"{t('Select a file that matches your model. Must be placed in')} {shared.user_data_dir}/mmproj/", interactive=not mu)
                                     ui.create_refresh_button(shared.gradio['mmproj'], lambda: None, lambda: {'choices': utils.get_available_mmproj()}, 'refresh-button', interactive=not mu)
 
                             # Speculative decoding
-                            with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
-                                shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Maximum number of tokens to draft for speculative decoding. Recommended: 4 for draft model, 64 for n-gram.')
+                            with gr.Accordion(t("Speculative decoding"), open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
+                                shared.gradio['draft_max'] = gr.Number(label=t("draft-max"), precision=0, step=1, value=shared.args.draft_max, info=t('Maximum number of tokens to draft for speculative decoding. Recommended: 4 for draft model, 64 for n-gram.'))
 
-                                gr.Markdown('#### Draft model')
+                                gr.Markdown(t('#### Draft model'))
                                 with gr.Row():
-                                    shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=['None'] + utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Must share the same vocabulary as the main model.', interactive=not mu)
+                                    shared.gradio['model_draft'] = gr.Dropdown(label=t("model-draft"), choices=['None'] + utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info=t('Draft model. Must share the same vocabulary as the main model.'), interactive=not mu)
                                     ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': ['None'] + utils.get_available_models()}, 'refresh-button', interactive=not mu)
 
-                                shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
-                                shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
-                                shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
+                                shared.gradio['gpu_layers_draft'] = gr.Slider(label=t("gpu-layers-draft"), minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info=t('Number of layers to offload to the GPU for the draft model.'))
+                                shared.gradio['device_draft'] = gr.Textbox(label=t("device-draft"), value=shared.args.device_draft, info=t('Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1'))
+                                shared.gradio['ctx_size_draft'] = gr.Number(label=t("ctx-size-draft"), precision=0, step=256, value=shared.args.ctx_size_draft, info=t('Size of the prompt context for the draft model. If 0, uses the same as the main model.'))
 
-                                shared.gradio['ngram_header'] = gr.Markdown('#### N-gram (draftless)')
-                                shared.gradio['spec_type'] = gr.Dropdown(label="spec-type", choices=['none', 'ngram-mod', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-cache'], value=shared.args.spec_type, info='Draftless speculative decoding type. Recommended: ngram-mod.')
-                                shared.gradio['spec_ngram_size_n'] = gr.Number(label="spec-ngram-size-n", precision=0, step=1, value=shared.args.spec_ngram_size_n, info='N-gram lookup size for speculative decoding.', visible=shared.args.spec_type != 'none')
-                                shared.gradio['spec_ngram_size_m'] = gr.Number(label="spec-ngram-size-m", precision=0, step=1, value=shared.args.spec_ngram_size_m, info='Draft n-gram size for speculative decoding.', visible=shared.args.spec_type != 'none')
-                                shared.gradio['spec_ngram_min_hits'] = gr.Number(label="spec-ngram-min-hits", precision=0, step=1, value=shared.args.spec_ngram_min_hits, info='Minimum n-gram hits for ngram-map speculative decoding.', visible=shared.args.spec_type != 'none')
+                                shared.gradio['ngram_header'] = gr.Markdown(t('#### N-gram (draftless)'))
+                                shared.gradio['spec_type'] = gr.Dropdown(label=t("spec-type"), choices=['none', 'ngram-mod', 'ngram-simple', 'ngram-map-k', 'ngram-map-k4v', 'ngram-cache'], value=shared.args.spec_type, info=t('Draftless speculative decoding type. Recommended: ngram-mod.'))
+                                shared.gradio['spec_ngram_size_n'] = gr.Number(label=t("spec-ngram-size-n"), precision=0, step=1, value=shared.args.spec_ngram_size_n, info=t('N-gram lookup size for speculative decoding.'), visible=shared.args.spec_type != 'none')
+                                shared.gradio['spec_ngram_size_m'] = gr.Number(label=t("spec-ngram-size-m"), precision=0, step=1, value=shared.args.spec_ngram_size_m, info=t('Draft n-gram size for speculative decoding.'), visible=shared.args.spec_type != 'none')
+                                shared.gradio['spec_ngram_min_hits'] = gr.Number(label=t("spec-ngram-min-hits"), precision=0, step=1, value=shared.args.spec_ngram_min_hits, info=t('Minimum n-gram hits for ngram-map speculative decoding.'), visible=shared.args.spec_type != 'none')
 
-                    gr.Markdown("## Other options")
-                    with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
+                    gr.Markdown(t("## Other options"))
+                    with gr.Accordion(t("See more options"), open=False, elem_classes='tgw-accordion'):
                         with gr.Row():
                             with gr.Column():
-                                shared.gradio['parallel'] = gr.Slider(label="parallel", minimum=1, step=1, maximum=64, value=shared.args.parallel, info='Number of parallel request slots for the API. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.')
-                                shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
-                                shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
-                                shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
-                                shared.gradio['ubatch_size'] = gr.Slider(label="ubatch_size", minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
-                                shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
-                                shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052', value=shared.args.extra_flags)
-                                shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
-                                shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
-                                shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
+                                shared.gradio['parallel'] = gr.Slider(label=t("parallel"), minimum=1, step=1, maximum=64, value=shared.args.parallel, info=t('Number of parallel request slots for the API. The context size is divided equally among slots. For example, to have 4 slots with 8192 context each, set ctx_size to 32768.'))
+                                shared.gradio['threads'] = gr.Slider(label=t("threads"), minimum=0, step=1, maximum=256, value=shared.args.threads)
+                                shared.gradio['threads_batch'] = gr.Slider(label=t("threads_batch"), minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
+                                shared.gradio['batch_size'] = gr.Slider(label=t("batch_size"), minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
+                                shared.gradio['ubatch_size'] = gr.Slider(label=t("ubatch_size"), minimum=1, maximum=4096, step=1, value=shared.args.ubatch_size)
+                                shared.gradio['tensor_split'] = gr.Textbox(label=t('tensor_split'), info=t('List of proportions to split the model across multiple GPUs. Example: 60,40'))
+                                shared.gradio['extra_flags'] = gr.Textbox(label=t('extra-flags'), info=t('Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052'), value=shared.args.extra_flags)
+                                shared.gradio['cpu_memory'] = gr.Number(label=t("Maximum CPU memory in GiB. Use this for CPU offloading."), value=shared.args.cpu_memory)
+                                shared.gradio['compute_dtype'] = gr.Dropdown(label=t("compute_dtype"), choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info=t('Used by load-in-4bit.'))
+                                shared.gradio['quant_type'] = gr.Dropdown(label=t("quant_type"), choices=["nf4", "fp4"], value=shared.args.quant_type, info=t('Used by load-in-4bit.'))
 
                             with gr.Column():
-                                shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='Use PyTorch in CPU mode.')
+                                shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info=t('Use PyTorch in CPU mode.'))
                                 shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
-                                shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
-                                shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.')
-                                shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
-                                shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
-                                shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
+                                shared.gradio['row_split'] = gr.Checkbox(label=t("row_split"), value=shared.args.row_split, info=t('Split the model by rows across GPUs. This may improve multi-gpu performance.'))
+                                shared.gradio['no_kv_offload'] = gr.Checkbox(label=t("no_kv_offload"), value=shared.args.no_kv_offload, info=t('Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.'))
+                                shared.gradio['no_mmap'] = gr.Checkbox(label=t("no-mmap"), value=shared.args.no_mmap)
+                                shared.gradio['mlock'] = gr.Checkbox(label=t("mlock"), value=shared.args.mlock)
+                                shared.gradio['numa'] = gr.Checkbox(label=t("numa"), value=shared.args.numa, info=t('NUMA support can help on some systems with non-uniform memory access.'))
                                 shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
-                                shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
-                                shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
+                                shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info=t('Necessary to use CFG with this loader.'))
+                                shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info=t('Set use_fast=False while loading the tokenizer.'))
                                 if not shared.args.portable:
                                     with gr.Row():
-                                        shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
+                                        shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label=t('LoRA(s)'), elem_classes='slim-dropdown', interactive=not mu)
                                         ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
-                                        shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
+                                        shared.gradio['lora_menu_apply'] = gr.Button(value=t('Apply LoRAs'), elem_classes='refresh-button', interactive=not mu)
 
             with gr.Column():
-                with gr.Tab("Download"):
-                    shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
-                    shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
+                with gr.Tab(t("Download")):
+                    shared.gradio['custom_model_menu'] = gr.Textbox(label=t("Download model or LoRA"), info=t("Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box."), interactive=not mu)
+                    shared.gradio['download_specific_file'] = gr.Textbox(placeholder=t("File name (for GGUF models)"), show_label=False, max_lines=1, interactive=not mu)
                     with gr.Row():
-                        shared.gradio['download_model_button'] = gr.Button("Download", variant='primary', interactive=not mu)
-                        shared.gradio['get_file_list'] = gr.Button("Get file list", interactive=not mu)
+                        shared.gradio['download_model_button'] = gr.Button(t("Download"), variant='primary', interactive=not mu)
+                        shared.gradio['get_file_list'] = gr.Button(t("Get file list"), interactive=not mu)
 
-                with gr.Tab("Customize instruction template"):
+                with gr.Tab(t("Customize instruction template")):
                     with gr.Row():
-                        shared.gradio['customized_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), value='None', label='Select the desired instruction template', elem_classes='slim-dropdown')
+                        shared.gradio['customized_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), value='None', label=t('Select the desired instruction template'), elem_classes='slim-dropdown')
                         ui.create_refresh_button(shared.gradio['customized_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button', interactive=not mu)
 
-                    shared.gradio['customized_template_submit'] = gr.Button("Submit", variant="primary", interactive=not mu)
-                    gr.Markdown("This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's metadata, which sometimes is wrong.")
+                    shared.gradio['customized_template_submit'] = gr.Button(t("Submit"), variant="primary", interactive=not mu)
+                    gr.Markdown(t("This allows you to set a customized template for the model currently selected in the \"Model loader\" menu. Whenever the model gets loaded, this template will be used in place of the template specified in the model's metadata, which sometimes is wrong."))
 
                 with gr.Row():
-                    shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')
+                    shared.gradio['model_status'] = gr.Markdown(t('No model is loaded') if shared.model_name == 'None' else t('Ready'))
 
 
 def create_event_handlers():
@@ -257,7 +258,7 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
                 return
 
         if not repo_id:
-            yield "Please enter a model path."
+            yield t("Please enter a model path.")
             progress(0.0)
             return
 
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index 5411b29427..81aa5d147e 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -4,107 +4,108 @@
 
 from modules import loaders, presets, shared, ui, ui_chat, utils
 from modules.utils import gradio
+from modules.i18n import t
 
 
 def create_ui():
     mu = shared.args.multi_user
-    with gr.Tab("Parameters", elem_id="parameters"):
-        with gr.Tab("Generation"):
+    with gr.Tab(t("Parameters"), elem_id="parameters"):
+        with gr.Tab(t("Generation")):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
-                        shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=shared.settings['preset'], label='Preset', elem_classes='slim-dropdown')
+                        shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=shared.settings['preset'], label=t('Preset'), elem_classes='slim-dropdown')
                         ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button', interactive=not mu)
                         shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
                         shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
-                        shared.gradio['reset_preset'] = gr.Button('Restore preset', elem_classes='refresh-button', interactive=True)
-                        shared.gradio['neutralize_samplers'] = gr.Button('Neutralize samplers', elem_classes='refresh-button', interactive=True)
+                        shared.gradio['reset_preset'] = gr.Button(t('Restore preset'), elem_classes='refresh-button', interactive=True)
+                        shared.gradio['neutralize_samplers'] = gr.Button(t('Neutralize samplers'), elem_classes='refresh-button', interactive=True)
 
                 with gr.Column():
-                    shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')
+                    shared.gradio['filter_by_loader'] = gr.Dropdown(label=t("Filter by loader"), choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')
 
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
                         with gr.Column():
-                            gr.Markdown('## Curve shape')
-                            shared.gradio['temperature'] = gr.Slider(0.01, 5, value=shared.settings['temperature'], step=0.01, label='temperature')
-                            shared.gradio['dynatemp_low'] = gr.Slider(0.01, 5, value=shared.settings['dynatemp_low'], step=0.01, label='dynatemp_low', visible=shared.settings['dynamic_temperature'])
-                            shared.gradio['dynatemp_high'] = gr.Slider(0.01, 5, value=shared.settings['dynatemp_high'], step=0.01, label='dynatemp_high', visible=shared.settings['dynamic_temperature'])
-                            shared.gradio['dynatemp_exponent'] = gr.Slider(0.01, 5, value=shared.settings['dynatemp_exponent'], step=0.01, label='dynatemp_exponent', visible=shared.settings['dynamic_temperature'])
-                            shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=shared.settings['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.')
-                            shared.gradio['smoothing_curve'] = gr.Slider(1.0, 10.0, value=shared.settings['smoothing_curve'], step=0.01, label='smoothing_curve', info='Adjusts the dropoff curve of Quadratic Sampling.')
-                            shared.gradio['dynamic_temperature'] = gr.Checkbox(value=shared.settings['dynamic_temperature'], label='dynamic_temperature')
-
-                            gr.Markdown('## Curve cutoff')
-                            shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=shared.settings['top_p'], step=0.01, label='top_p')
-                            shared.gradio['top_k'] = gr.Slider(0, 200, value=shared.settings['top_k'], step=1, label='top_k')
-                            shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=shared.settings['min_p'], step=0.01, label='min_p')
-                            shared.gradio['top_n_sigma'] = gr.Slider(0.0, 5.0, value=shared.settings['top_n_sigma'], step=0.01, label='top_n_sigma')
-                            shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=shared.settings['typical_p'], step=0.01, label='typical_p')
-                            shared.gradio['xtc_threshold'] = gr.Slider(0, 0.5, value=shared.settings['xtc_threshold'], step=0.01, label='xtc_threshold', info='If 2 or more tokens have probability above this threshold, consider removing all but the last one.')
-                            shared.gradio['xtc_probability'] = gr.Slider(0, 1, value=shared.settings['xtc_probability'], step=0.01, label='xtc_probability', info='Probability that the removal will actually happen. 0 disables the sampler. 1 makes it always happen.')
+                            gr.Markdown(t('## Curve shape'))
+                            shared.gradio['temperature'] = gr.Slider(0.01, 5, value=shared.settings['temperature'], step=0.01, label=t('temperature'))
+                            shared.gradio['dynatemp_low'] = gr.Slider(0.01, 5, value=shared.settings['dynatemp_low'], step=0.01, label=t('dynatemp_low'), visible=shared.settings['dynamic_temperature'])
+                            shared.gradio['dynatemp_high'] = gr.Slider(0.01, 5, value=shared.settings['dynatemp_high'], step=0.01, label=t('dynatemp_high'), visible=shared.settings['dynamic_temperature'])
+                            shared.gradio['dynatemp_exponent'] = gr.Slider(0.01, 5, value=shared.settings['dynatemp_exponent'], step=0.01, label=t('dynatemp_exponent'), visible=shared.settings['dynamic_temperature'])
+                            shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=shared.settings['smoothing_factor'], step=0.01, label='smoothing_factor', info=t('Activates Quadratic Sampling.'))
+                            shared.gradio['smoothing_curve'] = gr.Slider(1.0, 10.0, value=shared.settings['smoothing_curve'], step=0.01, label='smoothing_curve', info=t('Adjusts the dropoff curve of Quadratic Sampling.'))
+                            shared.gradio['dynamic_temperature'] = gr.Checkbox(value=shared.settings['dynamic_temperature'], label=t('dynamic_temperature'))
+
+                            gr.Markdown(t('## Curve cutoff'))
+                            shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=shared.settings['top_p'], step=0.01, label=t('top_p'))
+                            shared.gradio['top_k'] = gr.Slider(0, 200, value=shared.settings['top_k'], step=1, label=t('top_k'))
+                            shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=shared.settings['min_p'], step=0.01, label=t('min_p'))
+                            shared.gradio['top_n_sigma'] = gr.Slider(0.0, 5.0, value=shared.settings['top_n_sigma'], step=0.01, label=t('top_n_sigma'))
+                            shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=shared.settings['typical_p'], step=0.01, label=t('typical_p'))
+                            shared.gradio['xtc_threshold'] = gr.Slider(0, 0.5, value=shared.settings['xtc_threshold'], step=0.01, label=t('xtc_threshold'), info=t('If 2 or more tokens have probability above this threshold, consider removing all but the last one.'))
+                            shared.gradio['xtc_probability'] = gr.Slider(0, 1, value=shared.settings['xtc_probability'], step=0.01, label=t('xtc_probability'), info=t('Probability that the removal will actually happen. 0 disables the sampler. 1 makes it always happen.'))
                             shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=shared.settings['epsilon_cutoff'], step=0.01, label='epsilon_cutoff')
                             shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=shared.settings['eta_cutoff'], step=0.01, label='eta_cutoff')
                             shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=shared.settings['tfs'], step=0.01, label='tfs')
                             shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=shared.settings['top_a'], step=0.01, label='top_a')
 
-                            gr.Markdown('## Repetition suppression')
-                            shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=shared.settings['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to greater than 0 to enable DRY. Recommended value: 0.8.')
-                            shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=shared.settings['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.')
-                            shared.gradio['dry_base'] = gr.Slider(1, 4, value=shared.settings['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.')
-                            shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=shared.settings['repetition_penalty'], step=0.01, label='repetition_penalty')
-                            shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=shared.settings['frequency_penalty'], step=0.05, label='frequency_penalty')
-                            shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=shared.settings['presence_penalty'], step=0.05, label='presence_penalty')
+                            gr.Markdown(t('## Repetition suppression'))
+                            shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=shared.settings['dry_multiplier'], step=0.01, label=t('dry_multiplier'), info=t('Set to greater than 0 to enable DRY. Recommended value: 0.8.'))
+                            shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=shared.settings['dry_allowed_length'], step=1, label=t('dry_allowed_length'), info=t('Longest sequence that can be repeated without being penalized.'))
+                            shared.gradio['dry_base'] = gr.Slider(1, 4, value=shared.settings['dry_base'], step=0.01, label=t('dry_base'), info=t('Controls how fast the penalty grows with increasing sequence length.'))
+                            shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=shared.settings['repetition_penalty'], step=0.01, label=t('repetition_penalty'))
+                            shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=shared.settings['frequency_penalty'], step=0.05, label=t('frequency_penalty'))
+                            shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=shared.settings['presence_penalty'], step=0.05, label=t('presence_penalty'))
                             shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=shared.settings['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty')
                             shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=shared.settings['no_repeat_ngram_size'], label='no_repeat_ngram_size')
-                            shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=shared.settings['repetition_penalty_range'], label='repetition_penalty_range')
+                            shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=shared.settings['repetition_penalty_range'], label=t('repetition_penalty_range'))
 
                         with gr.Column():
-                            gr.Markdown('## Alternative sampling methods')
-                            shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=shared.settings['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.')
-                            shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=shared.settings['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.')
-                            shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=shared.settings['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.')
-                            shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=shared.settings['mirostat_tau'], label='mirostat_tau')
-                            shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=shared.settings['mirostat_eta'], label='mirostat_eta')
-                            shared.gradio['adaptive_target'] = gr.Slider(0.0, 1.0, value=shared.settings['adaptive_target'], step=0.01, label='adaptive_target', info='Target probability for adaptive-p sampling. Tokens near this probability are favored. 0 disables.')
-                            shared.gradio['adaptive_decay'] = gr.Slider(0.0, 0.99, value=shared.settings['adaptive_decay'], step=0.01, label='adaptive_decay', info='EMA decay rate for adaptive-p. Controls history window (~1/(1-decay) tokens). Default: 0.9.')
-
-                            gr.Markdown('## Other options')
+                            gr.Markdown(t('## Alternative sampling methods'))
+                            shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=shared.settings['penalty_alpha'], label=t('penalty_alpha'), info=t('For Contrastive Search. do_sample must be unchecked.'))
+                            shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=shared.settings['guidance_scale'], label=t('guidance_scale'), info=t('For CFG. 1.5 is a good value.'))
+                            shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=shared.settings['mirostat_mode'], label=t('mirostat_mode'), info=t('mode=1 is for llama.cpp only.'))
+                            shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=shared.settings['mirostat_tau'], label=t('mirostat_tau'))
+                            shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=shared.settings['mirostat_eta'], label=t('mirostat_eta'))
+                            shared.gradio['adaptive_target'] = gr.Slider(0.0, 1.0, value=shared.settings['adaptive_target'], step=0.01, label=t('adaptive_target'), info=t('Target probability for adaptive-p sampling. Tokens near this probability are favored. 0 disables.'))
+                            shared.gradio['adaptive_decay'] = gr.Slider(0.0, 0.99, value=shared.settings['adaptive_decay'], step=0.01, label=t('adaptive_decay'), info=t('EMA decay rate for adaptive-p. Controls history window (~1/(1-decay) tokens). Default: 0.9.'))
+
+                            gr.Markdown(t('## Other options'))
                             shared.gradio['do_sample'] = gr.Checkbox(value=shared.settings['do_sample'], label='do_sample')
-                            shared.gradio['temperature_last'] = gr.Checkbox(value=shared.settings['temperature_last'], label='temperature_last', info='Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".')
-                            shared.gradio['sampler_priority'] = gr.DragDrop(value=shared.settings['sampler_priority'], label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar'])
-                            shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=shared.settings['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.')
+                            shared.gradio['temperature_last'] = gr.Checkbox(value=shared.settings['temperature_last'], label=t('temperature_last'), info=t('Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".'))
+                            shared.gradio['sampler_priority'] = gr.DragDrop(value=shared.settings['sampler_priority'], label=t('Sampler priority'), info=t('Parameter names separated by new lines or commas.'), elem_classes=['add_scrollbar'])
+                            shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=shared.settings['dry_sequence_breakers'], label=t('dry_sequence_breakers'), info=t('Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.'))
 
                 with gr.Column():
                     with gr.Row():
                         with gr.Column():
                             with gr.Blocks():
-                                shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.')
-                                shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
-                                shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
+                                shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label=t('max_new_tokens'), info=t('⚠️ Setting this too high can cause prompt truncation.'))
+                                shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info=t('Activates Prompt Lookup Decoding.'))
+                                shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label=t('Maximum tokens/second'), info=t('To make text readable in real time.'))
 
-                            shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
-                            shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
-                            shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Only applies to text completion (notebook). In chat mode, templates control BOS tokens.')
-                            shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
-                            shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
-                            shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')
+                            shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label=t('auto_max_new_tokens'), info=t('Expand max_new_tokens to the available context length.'))
+                            shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label=t('Ban the eos_token'), info=t('Forces the model to never end the generation prematurely.'))
+                            shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label=t('Add the bos_token to the beginning of prompts'), info=t('Only applies to text completion (notebook). In chat mode, templates control BOS tokens.'))
+                            shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info=t('Some specific models need this unset.'))
+                            shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label=t('Activate text streaming'))
+                            shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label=t('Static KV cache'), info=t('Use a static cache for improved performance.'))
 
                         with gr.Column():
-                            shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length.')
-                            shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)')
-                            shared.gradio['custom_system_message'] = gr.Textbox(value=shared.settings['custom_system_message'], lines=2, label='Custom system message', info='If not empty, will be used instead of the default one.', elem_classes=['add_scrollbar'])
-                            shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"')
-                            shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Token bans', info='Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.')
-                            shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', info='For CFG. Only used when guidance_scale is different than 1.', lines=3, elem_classes=['add_scrollbar'])
+                            shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label=t('Truncate the prompt up to this length'), info=t('The leftmost tokens are removed if the prompt exceeds this length.'))
+                            shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label=t('Seed (-1 for random)'))
+                            shared.gradio['custom_system_message'] = gr.Textbox(value=shared.settings['custom_system_message'], lines=2, label=t('Custom system message'), info=t('If not empty, will be used instead of the default one.'), elem_classes=['add_scrollbar'])
+                            shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label=t('Custom stopping strings'), info=t('Written between "" and separated by commas.'), placeholder='"\\n", "\\nYou:"')
+                            shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label=t('Token bans'), info=t('Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.'))
+                            shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label=t('Negative prompt'), info=t('For CFG. Only used when guidance_scale is different than 1.'), lines=3, elem_classes=['add_scrollbar'])
                             with gr.Row() as shared.gradio['grammar_file_row']:
-                                shared.gradio['grammar_file'] = gr.Dropdown(value='None', choices=utils.get_available_grammars(), label='Load grammar from file (.gbnf)', elem_classes='slim-dropdown')
+                                shared.gradio['grammar_file'] = gr.Dropdown(value='None', choices=utils.get_available_grammars(), label=t('Load grammar from file (.gbnf)'), elem_classes='slim-dropdown')
                                 ui.create_refresh_button(shared.gradio['grammar_file'], lambda: None, lambda: {'choices': utils.get_available_grammars()}, 'refresh-button', interactive=not mu)
                                 shared.gradio['save_grammar'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
                                 shared.gradio['delete_grammar'] = gr.Button('🗑️ ', elem_classes='refresh-button', interactive=not mu)
 
-                            shared.gradio['grammar_string'] = gr.Textbox(value=shared.settings['grammar_string'], label='Grammar', lines=16, elem_classes=['add_scrollbar', 'monospace'])
+                            shared.gradio['grammar_string'] = gr.Textbox(value=shared.settings['grammar_string'], label=t('Grammar'), lines=16, elem_classes=['add_scrollbar', 'monospace'])
 
         ui_chat.create_chat_settings_ui()
 

From e2a8eb936756767f8264fb03efb09577e58e9311 Mon Sep 17 00:00:00 2001
From: ystartgo <startgo@yia.app>
Date: Sun, 15 Feb 2026 20:50:50 +0800
Subject: [PATCH 11/12] lint: remove duplicate i18n keys and unused exception variables

---
 modules/i18n.py          | 4 ++--
 modules/ui_model_menu.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/i18n.py b/modules/i18n.py
index 4fb8f60336..a379dee0a8 100644
--- a/modules/i18n.py
+++ b/modules/i18n.py
@@ -95,7 +95,7 @@
     "Send to notebook": "發送到筆記本",
     "Send to Notebook": "發送到筆記本",
     "Chat template": "聊天模板",
-    "Send": "傳送",
+    # duplicate key removed; keep single mapping for "Send"
     "Regenerate (Ctrl + Enter)": "重新生成（Ctrl + Enter）",
     "Continue (Alt + Enter)": "繼續（Alt + Enter）",
     "Remove last reply (Ctrl + Shift + Backspace)": "移除上一則回覆（Ctrl + Shift + Backspace）",
@@ -253,7 +253,7 @@
     "top_k": "top_k Top-k",
     "typical_p": "typical_p Typical-p",
     "dry_multiplier": "dry 乘數",
-    "dry_allowed_length": "允許重複長度",
+    # duplicate key removed; keep prefixed form above
     "Good morning!": "早安！",
     "Good afternoon!": "午安！",
     "Good evening!": "晚安！",
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 9ab87e2fca..e847477a45 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -343,7 +343,7 @@ def downloader_thread_target():
                     specific_file=specific_file
                 )
                 update_queue.put(("COMPLETED", f"Model successfully saved to `{output_folder}/`."))
-            except Exception as e:
+            except Exception:
                 tb_str = traceback.format_exc().replace('\n', '\n\n')
                 update_queue.put(("ERROR", tb_str))
 
@@ -378,7 +378,7 @@ def downloader_thread_target():
 
         download_thread.join()
 
-    except Exception as e:
+    except Exception:
         progress(0.0)
         tb_str = traceback.format_exc().replace('\n', '\n\n')
         yield tb_str

From aa77a3ac9b1992f7e887cf113a3554f869b79f34 Mon Sep 17 00:00:00 2001
From: ystartgo <startgo@yia.app>
Date: Sun, 15 Feb 2026 21:02:04 +0800
Subject: [PATCH 12/12] fix: translate remaining model menu labels and add loader option controls

---
 modules/i18n.py          | 10 ++++++++++
 modules/ui_model_menu.py | 15 +++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/modules/i18n.py b/modules/i18n.py
index a379dee0a8..d3e3ab860c 100644
--- a/modules/i18n.py
+++ b/modules/i18n.py
@@ -107,6 +107,16 @@
     "Reasoning effort": "推理強度",
     "Enable thinking": "啟用思考",
     "For models with thinking support.": "適用於支援思考的模型。",
+    "compute_dtype": "compute_dtype 計算精度",
+    "quant_type": "quant_type 量化類型",
+    "Number of experts per token": "每個詞元的專家數",
+    "cpu": "cpu CPU 模式",
+    "disk": "disk 硬碟卸載",
+    "bf16": "bf16 bfloat16",
+    "no_flash_attn": "no_flash_attn 停用 Flash-Attn",
+    "no_xformers": "no_xformers 停用 xFormers",
+    "no_sdpa": "no_sdpa 停用 SDPA",
+    "cfg-cache": "cfg-cache CFG 快取",
     "Activate web search": "啟用網頁搜尋",
     "Number of pages to download": "下載頁數",
     "Mode": "模式",
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index e847477a45..d0603e320b 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -104,19 +104,26 @@ def create_ui():
                                 shared.gradio['tensor_split'] = gr.Textbox(label=t('tensor_split'), info=t('List of proportions to split the model across multiple GPUs. Example: 60,40'))
                                 shared.gradio['extra_flags'] = gr.Textbox(label=t('extra-flags'), info=t('Extra flags to pass to llama-server. Example: --jinja --rpc 192.168.1.100:50052'), value=shared.args.extra_flags)
                                 shared.gradio['cpu_memory'] = gr.Number(label=t("Maximum CPU memory in GiB. Use this for CPU offloading."), value=shared.args.cpu_memory)
+                                shared.gradio['alpha_value'] = gr.Number(label=t('alpha_value'), value=shared.args.alpha_value, precision=2, info=t('Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.'))
+                                shared.gradio['rope_freq_base'] = gr.Number(label=t('rope_freq_base'), value=shared.args.rope_freq_base, precision=0, info=t('Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.'))
+                                shared.gradio['compress_pos_emb'] = gr.Number(label=t('compress_pos_emb'), value=shared.args.compress_pos_emb, precision=2, info=t("Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale."))
                                 shared.gradio['compute_dtype'] = gr.Dropdown(label=t("compute_dtype"), choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info=t('Used by load-in-4bit.'))
                                 shared.gradio['quant_type'] = gr.Dropdown(label=t("quant_type"), choices=["nf4", "fp4"], value=shared.args.quant_type, info=t('Used by load-in-4bit.'))
+                                shared.gradio['num_experts_per_token'] = gr.Number(label=t("Number of experts per token"), value=shared.args.num_experts_per_token, info=t('Only applies to MoE models like Mixtral.'))
 
                             with gr.Column():
-                                shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info=t('Use PyTorch in CPU mode.'))
-                                shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
+                                shared.gradio['cpu'] = gr.Checkbox(label=t("cpu"), value=shared.args.cpu, info=t('Use PyTorch in CPU mode.'))
+                                shared.gradio['disk'] = gr.Checkbox(label=t("disk"), value=shared.args.disk)
                                 shared.gradio['row_split'] = gr.Checkbox(label=t("row_split"), value=shared.args.row_split, info=t('Split the model by rows across GPUs. This may improve multi-gpu performance.'))
                                 shared.gradio['no_kv_offload'] = gr.Checkbox(label=t("no_kv_offload"), value=shared.args.no_kv_offload, info=t('Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance.'))
                                 shared.gradio['no_mmap'] = gr.Checkbox(label=t("no-mmap"), value=shared.args.no_mmap)
                                 shared.gradio['mlock'] = gr.Checkbox(label=t("mlock"), value=shared.args.mlock)
                                 shared.gradio['numa'] = gr.Checkbox(label=t("numa"), value=shared.args.numa, info=t('NUMA support can help on some systems with non-uniform memory access.'))
-                                shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
-                                shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info=t('Necessary to use CFG with this loader.'))
+                                shared.gradio['bf16'] = gr.Checkbox(label=t("bf16"), value=shared.args.bf16)
+                                shared.gradio['no_flash_attn'] = gr.Checkbox(label=t("no_flash_attn"), value=shared.args.no_flash_attn)
+                                shared.gradio['no_xformers'] = gr.Checkbox(label=t("no_xformers"), value=shared.args.no_xformers)
+                                shared.gradio['no_sdpa'] = gr.Checkbox(label=t("no_sdpa"), value=shared.args.no_sdpa)
+                                shared.gradio['cfg_cache'] = gr.Checkbox(label=t("cfg-cache"), value=shared.args.cfg_cache, info=t('Necessary to use CFG with this loader.'))
                                 shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info=t('Set use_fast=False while loading the tokenizer.'))
                                 if not shared.args.portable:
                                     with gr.Row():