From 9fdb4cfd16f3ad329f48cd1b2a05e0fc06754179 Mon Sep 17 00:00:00 2001
From: kaka <kaka@example.com>
Date: Mon, 1 Dec 2025 22:26:27 +0700
Subject: [PATCH 1/6] feat: Added sampling denylist parameter

For models that do not support sampling parameters, replace `max_tokens` with `max_completion_tokens` and remove unsupported parameters such as `temperature` and `top_p`.
---
 autoload/ollama/review.vim | 14 ++++++++++++--
 plugin/ollama.vim          |  4 ++++
 python/chat.py             | 35 ++++++++++++++++++++++-------------
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/autoload/ollama/review.vim b/autoload/ollama/review.vim
index ee89a7b..3686c89 100644
--- a/autoload/ollama/review.vim
+++ b/autoload/ollama/review.vim
@@ -136,9 +136,18 @@ function! s:StartChat(lines) abort
     endfunc
 
     let l:model_options = json_encode(g:ollama_chat_options)
-    call ollama#logger#Debug("Connecting to Ollama on " .. g:ollama_host .. " using model " .. g:ollama_model)
+    call ollama#logger#Debug("Chat Connecting to Ollama on " .. g:ollama_host .. " using model " .. g:ollama_model)
     call ollama#logger#Debug("model_options=" .. l:model_options)
 
+    if exists('g:ollama_model_sampling_denylist')
+            \ && len(g:ollama_model_sampling_denylist) > 0
+            \ && index(g:ollama_model_sampling_denylist, g:ollama_chat_model) >= 0
+        let l:sampling_enabled = 0
+    else
+        let l:sampling_enabled = 1
+    endif
+    call ollama#logger#Debug("sampling_enabled=" .. l:sampling_enabled)
+
     " Convert plugin debug level to python logger levels
     let l:log_level = ollama#logger#PythonLogLevel(g:ollama_debug)
     let l:base_url = g:ollama_host
@@ -154,6 +163,7 @@ function! s:StartChat(lines) abort
                 \ '-m', g:ollama_chat_model,
                 \ '-u', l:base_url,
                 \ '-o', l:model_options,
+                \ "-se", l:sampling_enabled,
                 \ '-t', g:ollama_chat_timeout,
                 \ '-l', l:log_level ]
     " Check if a system prompt was configured
@@ -205,7 +215,7 @@ function! s:StartChat(lines) abort
         silent execute 'new' l:bufname
     endif
     " Set the filetype to ollama-chat
-"    setlocal filetype=ollama-chat
+    " setlocal filetype=ollama-chat
     setlocal filetype=markdown
     setlocal buftype=prompt
     " enable BufDelete event when closing buffer usig :q!
diff --git a/plugin/ollama.vim b/plugin/ollama.vim
index 7945af4..fa24a6f 100644
--- a/plugin/ollama.vim
+++ b/plugin/ollama.vim
@@ -98,6 +98,10 @@ if !exists('g:ollama_model_options')
                 \ 'max_tokens': 500
                 \ }
 endif
+if !exists('g:ollama_model_sampling_denylist')
+    " default model sampling denylist
+  let g:ollama_model_sampling_denylist = []
+endif
 " Chat specific settings
 if !exists('g:ollama_chat_provider')
     " Provider for chat models: 'ollama' or 'openai'
diff --git a/python/chat.py b/python/chat.py
index c947c2e..b3bd6e8 100755
--- a/python/chat.py
+++ b/python/chat.py
@@ -90,7 +90,7 @@ async def stream_chat_message_ollama(messages, endpoint, model, options, timeout
         messages.append({"role": "assistant", "content": assistant_message.strip()})
 
 
-async def stream_chat_message_openai(messages, endpoint, model, options, credentialname):
+async def stream_chat_message_openai(messages, endpoint, model, options, sampling_enabled, credentialname):
     """Stream chat responses from OpenAI API."""
     if AsyncOpenAI is None:
         raise ImportError("OpenAI package not found. Please install via 'pip install openai'.")
@@ -114,14 +114,22 @@ async def stream_chat_message_openai(messages, endpoint, model, options, credent
     top_p = options.get('top_p', 1.0)
 
     try:
-        stream = await client.chat.completions.create(
-            model=model,
-            messages=messages,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p,
-            stream=True,
-        )
+        # Build request parameters
+        request_params = {
+            'model': model,
+            'messages': messages,
+            'stream': True,
+        }
+
+        # Check if model supports sampling parameters
+        if sampling_enabled:
+            request_params['temperature'] = temperature
+            request_params['top_p'] = top_p
+            request_params['max_tokens'] = max_tokens
+        else:
+            request_params['max_completion_tokens'] = max_tokens
+
+        stream = await client.chat.completions.create(**request_params)
 
         async for chunk in stream:
             if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
@@ -139,7 +147,7 @@ async def stream_chat_message_openai(messages, endpoint, model, options, credent
         messages.append({"role": "assistant", "content": assistant_message.strip()})
 
 
-async def main(provider, endpoint, model, options, systemprompt, timeout, credentialname):
+async def main(provider, endpoint, model, options, sampling_enabled, systemprompt, timeout, credentialname):
     conversation_history = []
     log.debug("endpoint: " + str(endpoint))
 
@@ -169,7 +177,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
                         )
                     else:
                         task = asyncio.create_task(
-                            stream_chat_message_openai(conversation_history, endpoint, model, options, credentialname)
+                            stream_chat_message_openai(conversation_history, endpoint, model, options, sampling_enabled, credentialname)
                         )
                     await task
                 else:
@@ -189,7 +197,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
                         )
                     else:
                         task = asyncio.create_task(
-                            stream_chat_message_openai(conversation_history, endpoint, model, options, credentialname)
+                            stream_chat_message_openai(conversation_history, endpoint, model, options, sampling_enabled, credentialname)
                         )
                     await task
 
@@ -213,6 +221,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
                         help="Base endpoint URL.")
     parser.add_argument("-o", "--options", type=str, default=DEFAULT_OPTIONS,
                         help="Ollama REST API options.")
+    parser.add_argument("-se", "--sampling-enabled", type=int, default=1, help="Enable or disable sampling.")
     parser.add_argument("-s", "--system-prompt", type=str, default="", help="Specify system prompt.")
     parser.add_argument("-t", "--timeout", type=int, default=DEFAULT_TIMEOUT, help="Timeout in seconds.")
     parser.add_argument("-l", "--log-level", type=int, default=OllamaLogger.ERROR, help="Log level.")
@@ -243,7 +252,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden
     try:
         while True:
             try:
-                asyncio.run(main(args.provider, endpoint, model, options, args.system_prompt, args.timeout, args.keyname))
+                asyncio.run(main(args.provider, endpoint, model, options, args.sampling_enabled, args.system_prompt, args.timeout, args.keyname))
             except KeyboardInterrupt:
                 print("Canceled.")
                 break

From c10e1b808c2a9912e9c2098aa9db16b91a594eba Mon Sep 17 00:00:00 2001
From: kaka <kaka@example.com>
Date: Tue, 2 Dec 2025 13:26:00 +0700
Subject: [PATCH 2/6] feat: Stream tokens on the same line with real-time
 cursor tracking

---
 autoload/ollama/review.vim | 95 ++++++++++++++++++++++++++++++++++++--
 python/chat.py             |  8 +++-
 2 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/autoload/ollama/review.vim b/autoload/ollama/review.vim
index 3686c89..7b34bff 100644
--- a/autoload/ollama/review.vim
+++ b/autoload/ollama/review.vim
@@ -56,6 +56,9 @@ function! s:FindBufferWindow(bufnr)
 endfunction
 
 function! s:StartChat(lines) abort
+    " Counter for reducing redraw frequency
+    let s:token_count = 0
+
     " Function handling a line of text that has been typed.
     func! TextEntered(text)
         call ollama#logger#Debug("TextEntered: " .. a:text)
@@ -65,12 +68,13 @@ function! s:StartChat(lines) abort
         endif
         " Send the text to a shell with Enter appended.
         call ch_sendraw(s:job, a:text .. "\n")
+        " Reset token count for new request
+        let s:token_count = 0
     endfunc
 
-    " Function handling output from the shell: Add it above the prompt.
-    func! GotOutput(channel, msg)
+    " OLD VERSION: Append each token as a new line (non-streaming)
+    func! GotOutputOld(channel, msg)
         call ollama#logger#Debug("GotOutput: " .. a:msg)
-
         " append lines
         let l:lines = split(a:msg, "\n")
         for l:line in l:lines
@@ -96,6 +100,91 @@ function! s:StartChat(lines) abort
         endfor
     endfunc
 
+    " NEW VERSION: Stream tokens on the same line with real-time cursor tracking
+    func! GotOutputNew(channel, msg)
+        " call ollama#logger#Debug("GotOutput: [" .. a:msg .. "]")
+
+        " Check for <EOT> marker
+        let l:idx = stridx(a:msg, "<EOT>")
+        let l:is_eot = l:idx != -1
+        let l:content = l:is_eot ? strpart(a:msg, 0, l:idx) : a:msg
+
+        " Append content to the last line for streaming effect
+        let l:updated_line_num = 0
+        let l:updated_line_content = ""
+        let l:line_count = 0
+
+        if !empty(l:content)
+            " Get buffer line count efficiently
+            let l:buf_info = getbufinfo(s:buf)[0]
+            let l:line_count = l:buf_info.linecount
+            " call ollama#logger#Debug("line_count=" .. l:line_count)
+
+            if l:line_count == 0
+                " Buffer is empty, append as new line
+                " call ollama#logger#Debug("Buffer empty, appending first line")
+                call appendbufline(s:buf, 0, l:content)
+                let l:updated_line_num = 1
+                let l:updated_line_content = l:content
+            else
+                " Get only the last line (much faster than getting all lines)
+                let l:last_line = getbufline(s:buf, l:line_count, l:line_count)[0]
+                let l:updated_line_content = l:last_line .. l:content
+                " call ollama#logger#Debug("Appending to line " .. l:line_count .. ": '" .. l:last_line .. "' + '" .. l:content .. "'")
+                call setbufline(s:buf, l:line_count, l:updated_line_content)
+                let l:updated_line_num = l:line_count
+            endif
+        endif
+
+        " When streaming is done, add a new line for the next input
+        if l:is_eot
+            " call ollama#logger#Debug("EOT received, adding newline")
+            call appendbufline(s:buf, "$", "")
+            " Reuse line_count if we already got it, otherwise fetch
+            if l:line_count > 0
+                let l:updated_line_num = l:line_count + 1
+            else
+                let l:buf_info = getbufinfo(s:buf)[0]
+                let l:updated_line_num = l:buf_info.linecount
+            endif
+            let l:updated_line_content = ""
+        endif
+
+        " Update cursor position if this is the active chat window
+        if bufname() == s:ollama_bufname " Check if current active window is Ollama Chat
+            let l:winid = bufwinid(s:buf)
+            if l:winid != -1 && l:updated_line_num > 0
+                " Set cursor position directly (much faster than feedkeys)
+                let l:col = len(l:updated_line_content) + 1
+                call win_execute(l:winid, 'call cursor(' . l:updated_line_num . ', ' . l:col . ')')
+
+                " Increment token counter and only redraw every N tokens (or always for EOT)
+                let s:token_count += 1
+                if l:is_eot || s:token_count % 5 == 0
+                    redraw
+                endif
+
+                if l:is_eot
+                    " Streaming done, enter insert mode
+                    if mode() == 'i'
+                        call feedkeys("\<Esc>")
+                    endif
+                    call feedkeys("a")
+                endif
+            endif
+        endif
+    endfunc
+
+    " Wrapper function that delegates to new version by default
+    " To use old version, set g:ollama_use_old_output = 1
+    func! GotOutput(channel, msg)
+        if exists('g:ollama_use_old_output') && g:ollama_use_old_output
+            call GotOutputOld(a:channel, a:msg)
+        else
+            call GotOutputNew(a:channel, a:msg)
+        endif
+    endfunc
+
     " Function handling output from the shell: Add it above the prompt.
     func! GotErrors(channel, msg)
         call ollama#logger#Debug("GotErrors: " .. a:msg)
diff --git a/python/chat.py b/python/chat.py
index b3bd6e8..fde4f2e 100755
--- a/python/chat.py
+++ b/python/chat.py
@@ -63,7 +63,9 @@ async def stream_chat_message_ollama(messages, endpoint, model, options, timeout
                             if "message" in message and "content" in message["message"]:
                                 content = message["message"]["content"]
                                 assistant_message += content
-                                print(content, end="", flush=True)
+                                # Print each token followed by newline so Vim's out_cb receives it immediately
+                                # VimScript will need to handle concatenating tokens on the same line
+                                print(content, flush=True)
 
                                 # If <EOT> is detected, stop processing
                                 if "<EOT>" in content:
@@ -135,7 +137,9 @@ async def stream_chat_message_openai(messages, endpoint, model, options, samplin
             if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                 token = chunk.choices[0].delta.content
                 assistant_message += token
-                print(token, end="", flush=True)
+                # Print each token followed by newline so Vim's out_cb receives it immediately
+                # VimScript will need to handle concatenating tokens on the same line
+                print(token, flush=True)
 
         print("<EOT>", flush=True)
 

From 6dbed9c2bf8afe6f5e015247bf0531b2af4a5fa4 Mon Sep 17 00:00:00 2001
From: kaka <kaka@example.com>
Date: Tue, 2 Dec 2025 16:08:14 +0700
Subject: [PATCH 3/6] feat: Added sampling_enabled for
 generate_code_completion_openai

---
 autoload/ollama.vim | 11 +++++++++++
 python/complete.py  | 27 ++++++++++++++++++---------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/autoload/ollama.vim b/autoload/ollama.vim
index 9cc389b..bc7bc89 100644
--- a/autoload/ollama.vim
+++ b/autoload/ollama.vim
@@ -221,6 +221,16 @@ function! ollama#GetSuggestion(timer)
           \ "Connecting to Ollama on " .. g:ollama_host
           \ .. " using model " .. g:ollama_model)
     call ollama#logger#Debug("model_options=" .. l:model_options)
+
+    if exists('g:ollama_model_sampling_denylist')
+            \ && len(g:ollama_model_sampling_denylist) > 0
+            \ && index(g:ollama_model_sampling_denylist, g:ollama_model) >= 0
+        let l:sampling_enabled = 0
+    else
+        let l:sampling_enabled = 1
+    endif
+    call ollama#logger#Debug("sampling_enabled=" .. l:sampling_enabled)
+
     " Convert plugin debug level to python logger levels
     let l:log_level = ollama#logger#PythonLogLevel(g:ollama_debug)
     let l:base_url = g:ollama_host
@@ -234,6 +244,7 @@ function! ollama#GetSuggestion(timer)
         \ "-m", g:ollama_model,
         \ "-u", l:base_url,
         \ "-o", l:model_options,
+        \ "-se", l:sampling_enabled,
         \ "-l", l:log_level
         \ ]
     " Add optional credentialname for looking up the API key
diff --git a/python/complete.py b/python/complete.py
index 4145917..bdd0b01 100755
--- a/python/complete.py
+++ b/python/complete.py
@@ -209,7 +209,7 @@ def extract_stop_marker(after: str) -> str | None:
             return line.rstrip()  # preserve indentation
     return None
 
-def generate_code_completion_openai(prompt, baseurl, model, options, credentialname):
+def generate_code_completion_openai(prompt, baseurl, model, options, sampling_enabled, credentialname):
     """Generate code completion using OpenAI's official Python SDK"""
     if OpenAI is None:
         raise ImportError("OpenAI package not found. Please install via 'pip install openai'.")
@@ -264,13 +264,21 @@ def generate_code_completion_openai(prompt, baseurl, model, options, credentialn
     log.debug('max_tokens: ' + str(max_tokens))
     log.debug('stops: ' + str(stops))
     try:
-        response = client.chat.completions.create(
-            model=model,
-            messages=[{"role": "user", "content": full_prompt}],
-            temperature=temperature,
-            max_tokens=max_tokens,
-            stop=stops
-        )
+        # Build request parameters
+        request_params = {
+            'model': model,
+            'messages': [{"role": "user", "content": full_prompt}],
+        }
+
+        # Check if model supports sampling parameters
+        if sampling_enabled:
+            request_params['temperature'] = temperature
+            request_params['max_tokens'] = max_tokens
+            request_params['stop'] = stops
+        else:
+            request_params['max_completion_tokens'] = max_tokens
+
+        response = client.chat.completions.create(**request_params)
         response = response.choices[0].message.content.strip()
         log.debug('response: ' + response)
     except Exception as e:
@@ -346,6 +354,7 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred
                             help="Base endpoint URL (for Ollama only).")
         parser.add_argument('-o', '--options', type=str, default=DEFAULT_OPTIONS,
                             help="Ollama REST API options (JSON string).")
+        parser.add_argument("-se", "--sampling-enabled", type=int, default=1, help="Enable or disable sampling.")
         parser.add_argument('-l', '--log-level', type=int, default=OllamaLogger.ERROR,
                             help="Specify log level")
         parser.add_argument('-f', '--log-filename', type=str, default="complete.log",
@@ -391,7 +400,7 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred
             else:
                 modelname = DEFAULT_OPENAI_MODEL
             baseurl = args.url or None
-            response = generate_code_completion_openai(prompt, baseurl, modelname, options, args.keyname)
+            response = generate_code_completion_openai(prompt, baseurl, modelname, options, args.sampling_enabled, args.keyname)
         elif args.provider == "openai_legacy":
             if args.model:
                 modelname = args.model

From e7f1c4c9e6f34083692044b11b85948a1b2572b8 Mon Sep 17 00:00:00 2001
From: kaka <kaka@example.com>
Date: Tue, 2 Dec 2025 17:34:20 +0700
Subject: [PATCH 4/6] feat: Added Claude and openai_responses support

---
 autoload/ollama.vim         |   8 ++
 plugin/ollama.vim           |   4 +
 python/OllamaCredentials.py |   3 +
 python/complete.py          | 247 +++++++++++++++++++++++++++++++++++-
 4 files changed, 255 insertions(+), 7 deletions(-)

diff --git a/autoload/ollama.vim b/autoload/ollama.vim
index bc7bc89..c4598ca 100644
--- a/autoload/ollama.vim
+++ b/autoload/ollama.vim
@@ -236,6 +236,9 @@ function! ollama#GetSuggestion(timer)
     let l:base_url = g:ollama_host
     if g:ollama_model_provider =~ '^openai'
         let l:base_url = g:ollama_openai_baseurl
+    elseif g:ollama_model_provider == 'claude'
+        " Claude uses default Anthropic API, don't set base_url
+        let l:base_url = ''
     endif
     " Adjust the command to use the prompt as stdin input
     let l:command = [ g:ollama_python_interpreter,
@@ -258,6 +261,11 @@ function! ollama#GetSuggestion(timer)
             " add credentialname option for Mistral
             let l:command += [ '-k', g:ollama_mistral_credentialname ]
         endif
+    elseif g:ollama_model_provider == 'claude'
+        if exists('g:ollama_claude_credentialname') && g:ollama_claude_credentialname != ''
+            " add credentialname option for Claude
+            let l:command += [ '-k', g:ollama_claude_credentialname ]
+        endif
     endif
     call ollama#logger#Debug("command=" .. join(l:command, " "))
     let l:job_options = {
diff --git a/plugin/ollama.vim b/plugin/ollama.vim
index fa24a6f..8dfecb3 100644
--- a/plugin/ollama.vim
+++ b/plugin/ollama.vim
@@ -67,6 +67,10 @@ if !exists('g:ollama_openai_credentialname')
     " UNIX Pass credential name to lookup API key for OpenAI service
     let g:ollama_openai_credentialname = ''
 endif
+if !exists('g:ollama_claude_credentialname')
+    " UNIX Pass credential name to lookup API key for Anthropic Claude service
+    let g:ollama_claude_credentialname = ''
+endif
 " Tab completion specific settings
 if !exists('g:ollama_debounce_time')
     let g:ollama_debounce_time = 500
diff --git a/python/OllamaCredentials.py b/python/OllamaCredentials.py
index 3affe2a..37595f3 100644
--- a/python/OllamaCredentials.py
+++ b/python/OllamaCredentials.py
@@ -18,6 +18,7 @@ def GetApiKey(self, provider: str, credentialname: str | None) -> str:
           - 'openai'         → use OPENAI_API_KEY env var or pass entry
           - 'openai_legacy'  → same as 'openai', kept for compatibility
           - 'mistral'        → use MISTRAL_API_KEY env var or pass entry
+          - 'anthropic'      → use ANTHROPIC_API_KEY env var or pass entry
 
         Priority:
           1. Environment variable override
@@ -36,6 +37,8 @@ def GetApiKey(self, provider: str, credentialname: str | None) -> str:
             env_var = "OPENAI_API_KEY"
         elif provider == "mistral":
             env_var = "MISTRAL_API_KEY"
+        elif provider == "anthropic":
+            env_var = "ANTHROPIC_API_KEY"
         else:
             raise ValueError(f"Unknown provider: {provider}")
 
diff --git a/python/complete.py b/python/complete.py
index bdd0b01..b65d115 100755
--- a/python/complete.py
+++ b/python/complete.py
@@ -23,13 +23,23 @@
 except ImportError:
     Mistral = None
 
+# try to load Anthropic package if it exists
+try:
+    from anthropic import Anthropic  # type: ignore
+except ImportError:
+    Anthropic = None
+
 # Default values
 DEFAULT_HOST = 'http://localhost:11434'
 DEFAULT_PROVIDER = 'ollama'
 DEFAULT_MODEL = 'codellama:code'
 DEFAULT_OPTIONS = '{ "temperature": 0, "top_p": 0.95 }'
+DEFAULT_TEMPERATURE = 0
+DEFAULT_MAX_TOKENS = 300
 DEFAULT_MISTRAL_MODEL = 'codestral-2501'
 DEFAULT_OPENAI_MODEL = 'gpt-4.1-mini'
+DEFAULT_OPENAI_RESPONSES_MODEL = 'gpt-5.1-codex'
+DEFAULT_CLAUDE_MODEL = 'claude-sonnet-4-20250514'
 
 # When set to true, we use our own templates and don't use the Ollama built-in templates.
 # Is is the only way to make this work reliable. As soon is this works also with Ollama
@@ -170,9 +180,9 @@ def generate_code_completion_mistral(prompt, baseurl, model, options, credential
     stop_marker = extract_stop_marker(suffix)
     stops = [stop_marker] if stop_marker else []
 
-    temperature = options.get('temperature', 0)
-#    min_tokens = options.get('min_tokens', 1)
-    max_tokens = options.get('max_tokens', 300)
+    temperature = options.get('temperature', DEFAULT_TEMPERATURE)
+    # min_tokens = options.get('min_tokens', 1)
+    max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS)
 
     log.debug('model: ' + str(model))
     log.debug('temperature: ' + str(temperature))
@@ -256,13 +266,14 @@ def generate_code_completion_openai(prompt, baseurl, model, options, sampling_en
     stop_marker = extract_stop_marker(after)
     stops = [stop_marker] if stop_marker else []
 
-    temperature = options.get('temperature', 0)
-    max_tokens = options.get('max_tokens', 300)
+    temperature = options.get('temperature', DEFAULT_TEMPERATURE)
+    max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS)
 
     log.debug('model: ' + str(model))
     log.debug('temperature: ' + str(temperature))
     log.debug('max_tokens: ' + str(max_tokens))
     log.debug('stops: ' + str(stops))
+    log.debug('sampling_enabled: ' + str(sampling_enabled))
     try:
         # Build request parameters
         request_params = {
@@ -301,6 +312,85 @@ def generate_code_completion_openai(prompt, baseurl, model, options, sampling_en
 
     return response
 
+def generate_code_completion_claude(prompt, baseurl, model, options, credentialname):
+    """Generate code completion using Anthropic Claude API"""
+    if Anthropic is None:
+        raise ImportError("Anthropic package not found. Please install via 'pip install anthropic'.")
+
+    cred = OllamaCredentials()
+    api_key = cred.GetApiKey('anthropic', credentialname)
+
+    log.debug('Using Anthropic Claude API')
+    if baseurl:
+        log.debug(f'baseurl={baseurl}')
+        client = Anthropic(api_key=api_key, base_url=baseurl)
+    else:
+        log.debug(f'Using default Anthropic URL')
+        client = Anthropic(api_key=api_key)
+
+    parts = prompt.split('<FILL_IN_HERE>')
+    if len(parts) != 2:
+        log.error("Prompt must contain <FILL_IN_HERE> marker for Claude mode.")
+        sys.exit(1)
+    before = parts[0]
+    after = parts[1]
+
+    lang = options.get('lang', 'C')
+    # Claude doesn't support Fill-in-the-middle, use prompt engineering
+    full_prompt = f"""Fill in the missing code between the markers below.
+
+Rules:
+- Do NOT repeat any code that appears in the AFTER section.
+- Return only the exact code that fits between BEFORE and AFTER.
+- Do NOT add explanations or comments.
+- Output the missing code only.
+
+Language: {lang}
+
+BEFORE:
+{before}
+
+AFTER:
+{after}
+"""
+    log.debug('full_prompt: ' + full_prompt)
+
+    temperature = options.get('temperature', DEFAULT_TEMPERATURE)
+    max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS)
+
+    log.debug('model: ' + str(model))
+    log.debug('temperature: ' + str(temperature))
+    log.debug('max_tokens: ' + str(max_tokens))
+    
+    try:
+        response = client.messages.create(
+            model=model or DEFAULT_CLAUDE_MODEL,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            messages=[{"role": "user", "content": full_prompt}]
+        )
+        
+        response_text = response.content[0].text.strip()
+        log.debug('response: ' + response_text)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        log.error(str(e))
+        sys.exit(1)
+
+    # convert response to lines
+    lines = response_text.splitlines()
+    if lines:
+        # remove 1st element from array if it starts with ```
+        if lines[0].startswith("```"):
+            lines.pop(0)
+        # remove last element from array if it starts with ```
+        if lines[-1].startswith("```"):
+            lines.pop()
+
+        response_text = "\n".join(lines)
+
+    return response_text
+
 def generate_code_completion_openai_legacy(prompt, baseurl, model, options, credentialname):
     """Generate code completion using OpenAI's official Python SDK"""
     if OpenAI is None:
@@ -326,8 +416,8 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred
     full_prompt = fill_in_the_middle(config, prompt)
     log.debug('full_prompt: ' + full_prompt)
 
-    temperature = options.get('temperature', 0)
-    max_tokens = options.get('max_tokens', 300)
+    temperature = options.get('temperature', DEFAULT_TEMPERATURE)
+    max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS)
 
     log.debug('model: ' + str(model))
     log.debug('temperature: ' + str(temperature))
@@ -343,6 +433,135 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred
 
     return response.rstrip()
 
+def generate_code_completion_openai_responses(prompt, baseurl, model, options, credentialname):
+    """Generate code completion using OpenAI's /v1/responses endpoint for GPT-5.1-Codex"""
+    if OpenAI is None:
+        raise ImportError("OpenAI package not found. Please install via 'pip install openai'.")
+
+    log.debug('Using OpenAI responses endpoint (for GPT-5.1-Codex)')
+    cred = OllamaCredentials()
+    api_key = cred.GetApiKey('openai', credentialname)
+
+    if baseurl:
+        endpoint = f"{baseurl}/v1/responses"
+    else:
+        endpoint = "https://api.openai.com/v1/responses"
+
+    log.debug(f'endpoint: {endpoint}')
+
+    parts = prompt.split('<FILL_IN_HERE>')
+    if len(parts) != 2:
+        log.error("Prompt must contain <FILL_IN_HERE> marker.")
+        sys.exit(1)
+
+    # For code completion, we just use the before part as input
+    before = parts[0]
+    after = parts[1]
+
+    # Build the input prompt for code completion
+    # gpt-5.1-codex seems to work better with explicit instructions
+    if after.strip():
+        # Fill-in-the-middle style completion
+        full_input = f"Complete the code at <FILL>:\n\n{before}<FILL>{after}\n\nProvide ONLY the code that replaces <FILL>, nothing else."
+    else:
+        # End-of-file completion
+        full_input = f"Continue this code:\n\n{before}\n\nProvide ONLY the next line(s) of code, nothing else."
+
+    # Use higher token limit for gpt-5.1-codex which uses reasoning tokens
+    max_output_tokens = options.get('max_completion_tokens', options.get('max_tokens', DEFAULT_MAX_TOKENS))
+
+    log.debug('model: ' + str(model))
+    log.debug('max_output_tokens: ' + str(max_output_tokens))
+    log.debug('input: ' + full_input)
+
+    headers = {
+        'Authorization': f'Bearer {api_key}',
+        'Content-Type': 'application/json'
+    }
+
+    data = {
+        'model': model,
+        'input': full_input,
+        'max_output_tokens': max_output_tokens
+    }
+
+    try:
+        response = requests.post(endpoint, headers=headers, json=data)
+        if response.status_code != 200:
+            log.error(f'API error: {response.text}')
+        response.raise_for_status()
+        result = response.json()
+        log.debug('response: ' + json.dumps(result, indent=2))
+
+        # Extract the completion from the response
+        # The responses endpoint can return different formats
+        completion = None
+
+        # Check if response has the new format with 'output' array
+        if 'output' in result and isinstance(result['output'], list):
+            log.debug(f'Result has output array with {len(result["output"])} items')
+            # Look for message type items in the output array
+            for idx, item in enumerate(result['output']):
+                log.debug(f'Output item {idx}: type={item.get("type")}')
+                if item.get('type') == 'message' and item.get('status') == 'completed':
+                    content = item.get('content', [])
+                    log.debug(f'Message content has {len(content)} items')
+                    for content_idx, content_item in enumerate(content):
+                        log.debug(f'Content {content_idx}: type={content_item.get("type")}')
+                        if content_item.get('type') == 'output_text':
+                            completion = content_item.get('text', '')
+                            log.debug(f'Found output_text: {completion}')
+                            break
+                    if completion:
+                        break
+
+            # If no message found, log the incomplete status
+            if not completion and result.get('status') == 'incomplete':
+                log.warning(f'Response incomplete: {result.get("incomplete_details")}')
+                # For incomplete responses with only reasoning, we might need to handle differently
+                log.error('No message output found, only reasoning. Model may need different prompt.')
+
+        # Handle list format (old format)
+        elif isinstance(result, list):
+            log.debug(f'Result is a list with {len(result)} items')
+            for idx, item in enumerate(result):
+                log.debug(f'Item {idx}: type={item.get("type")}, status={item.get("status")}')
+                if item.get('type') == 'message' and item.get('status') == 'completed':
+                    content = item.get('content', [])
+                    log.debug(f'Message content has {len(content)} items')
+                    for content_idx, content_item in enumerate(content):
+                        log.debug(f'Content {content_idx}: type={content_item.get("type")}')
+                        if content_item.get('type') == 'output_text':
+                            completion = content_item.get('text', '')
+                            log.debug(f'Found output_text: {completion}')
+                            break
+                    if completion:
+                        break
+
+        # Fallback for other formats
+        elif 'text' in result:
+            completion = result['text']
+        elif 'choices' in result and len(result['choices']) > 0:
+            completion = result['choices'][0].get('text', result['choices'][0].get('message', {}).get('content', ''))
+
+        if not completion:
+            log.error('Could not extract completion from response')
+            log.error('Response structure: ' + json.dumps(result, indent=2))
+            return ""
+
+        # Ensure completion is a string
+        if not isinstance(completion, str):
+            log.error(f'Completion is not a string, type: {type(completion)}')
+            completion = str(completion)
+
+        log.debug('Final completion: ' + completion)
+        return completion.strip()
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        log.error(str(e))
+        sys.exit(1)
+
 if __name__ == "__main__":
     try:
         parser = argparse.ArgumentParser(description="Complete code using Ollama or OpenAI LLM.")
@@ -408,6 +627,20 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred
                 modelname = DEFAULT_OPENAI_MODEL
             baseurl = args.url or None
             response = generate_code_completion_openai_legacy(prompt, baseurl, modelname, options, args.keyname)
+        elif args.provider == "claude":
+            if args.model:
+                modelname = args.model
+            else:
+                modelname = DEFAULT_CLAUDE_MODEL
+            baseurl = args.url or None
+            response = generate_code_completion_claude(prompt, baseurl, modelname, options, args.keyname)
+        elif args.provider == "openai_responses":
+            if args.model:
+                modelname = args.model
+            else:
+                modelname = DEFAULT_OPENAI_RESPONSES_MODEL
+            baseurl = args.url or None
+            response = generate_code_completion_openai_responses(prompt, baseurl, modelname, options, args.keyname)
         else:
             log.error(f"Unknown provider: {args.provider}")
             sys.exit(1)

From 1a01949410ae3aef314c8cd5c5d5225c84f3b20e Mon Sep 17 00:00:00 2001
From: kaka <kaka@example.com>
Date: Tue, 2 Dec 2025 17:39:31 +0700
Subject: [PATCH 5/6] feat: Use the same input prompt for code completion as
 generate_code_completion_openai

---
 python/complete.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/python/complete.py b/python/complete.py
index b65d115..b03c78b 100755
--- a/python/complete.py
+++ b/python/complete.py
@@ -459,13 +459,25 @@ def generate_code_completion_openai_responses(prompt, baseurl, model, options, c
     after = parts[1]
 
     # Build the input prompt for code completion
-    # gpt-5.1-codex seems to work better with explicit instructions
-    if after.strip():
-        # Fill-in-the-middle style completion
-        full_input = f"Complete the code at <FILL>:\n\n{before}<FILL>{after}\n\nProvide ONLY the code that replaces <FILL>, nothing else."
-    else:
-        # End-of-file completion
-        full_input = f"Continue this code:\n\n{before}\n\nProvide ONLY the next line(s) of code, nothing else."
+    # Use similar structure as Claude prompt for better results
+    lang = options.get('lang', 'C')
+
+    full_input = f"""Fill in the missing code between the markers below.
+
+Rules:
+- Do NOT repeat any code that appears in the AFTER section.
+- Return only the exact code that fits between BEFORE and AFTER.
+- Do NOT add explanations or comments.
+- Output the missing code only.
+
+Language: {lang}
+
+BEFORE:
+{before}
+
+AFTER:
+{after}
+"""
 
     # Use higher token limit for gpt-5.1-codex which uses reasoning tokens
     max_output_tokens = options.get('max_completion_tokens', options.get('max_tokens', DEFAULT_MAX_TOKENS))

From b130eb3db9cbc37b0245e700744192a92c14d85d Mon Sep 17 00:00:00 2001
From: kaka <kaka@example.com>
Date: Tue, 2 Dec 2025 17:49:38 +0700
Subject: [PATCH 6/6] refactor: extract the _build_fim_prompt,
 _strip_code_fences and _extract_output_text_from_message helper functions.

---
 python/complete.py | 222 ++++++++++++++++-----------------------------
 1 file changed, 80 insertions(+), 142 deletions(-)

diff --git a/python/complete.py b/python/complete.py
index b03c78b..b92fe8f 100755
--- a/python/complete.py
+++ b/python/complete.py
@@ -219,6 +219,36 @@ def extract_stop_marker(after: str) -> str | None:
             return line.rstrip()  # preserve indentation
     return None
 
+def _build_fim_prompt(before: str, after: str, lang: str = 'C') -> str:
+    """Build fill-in-the-middle prompt for models that don't support native FIM."""
+    return f"""Fill in the missing code between the markers below.
+
+Rules:
+- Do NOT repeat any code that appears in the AFTER section.
+- Return only the exact code that fits between BEFORE and AFTER.
+- Do NOT add explanations or comments.
+- Output the missing code only.
+
+Language: {lang}
+
+BEFORE:
+{before}
+
+AFTER:
+{after}
+"""
+
+def _strip_code_fences(text: str) -> str:
+    """Remove markdown code fence markers (```) from beginning and end of text."""
+    lines = text.splitlines()
+    if lines:
+        if lines[0].startswith("```"):
+            lines.pop(0)
+        if lines and lines[-1].startswith("```"):
+            lines.pop()
+        return "\n".join(lines)
+    return text
+
 def generate_code_completion_openai(prompt, baseurl, model, options, sampling_enabled, credentialname):
     """Generate code completion using OpenAI's official Python SDK"""
     if OpenAI is None:
@@ -298,19 +328,8 @@ def generate_code_completion_openai(prompt, baseurl, model, options, sampling_en
         log.error(str(e))
         sys.exit(1)
 
-    # convert response to lines
-    lines = response.splitlines()
-    if lines:
-        # remove 1st element from array if it starts with ```
-        if lines[0].startswith("```"):
-            lines.pop(0)
-        # remove last element from array if it starts with ```
-        if lines[-1].startswith("```"):
-            lines.pop()
-
-        response = "\n".join(lines)
-
-    return response
+    # Remove markdown code fences if present
+    return _strip_code_fences(response)
 
 def generate_code_completion_claude(prompt, baseurl, model, options, credentialname):
     """Generate code completion using Anthropic Claude API"""
@@ -332,27 +351,10 @@ def generate_code_completion_claude(prompt, baseurl, model, options, credentialn
     if len(parts) != 2:
         log.error("Prompt must contain <FILL_IN_HERE> marker for Claude mode.")
         sys.exit(1)
-    before = parts[0]
-    after = parts[1]
 
+    # Build FIM prompt using helper function
     lang = options.get('lang', 'C')
-    # Claude doesn't support Fill-in-the-middle, use prompt engineering
-    full_prompt = f"""Fill in the missing code between the markers below.
-
-Rules:
-- Do NOT repeat any code that appears in the AFTER section.
-- Return only the exact code that fits between BEFORE and AFTER.
-- Do NOT add explanations or comments.
-- Output the missing code only.
-
-Language: {lang}
-
-BEFORE:
-{before}
-
-AFTER:
-{after}
-"""
+    full_prompt = _build_fim_prompt(parts[0], parts[1], lang)
     log.debug('full_prompt: ' + full_prompt)
 
     temperature = options.get('temperature', DEFAULT_TEMPERATURE)
@@ -377,19 +379,8 @@ def generate_code_completion_claude(prompt, baseurl, model, options, credentialn
         log.error(str(e))
         sys.exit(1)
 
-    # convert response to lines
-    lines = response_text.splitlines()
-    if lines:
-        # remove 1st element from array if it starts with ```
-        if lines[0].startswith("```"):
-            lines.pop(0)
-        # remove last element from array if it starts with ```
-        if lines[-1].startswith("```"):
-            lines.pop()
-
-        response_text = "\n".join(lines)
-
-    return response_text
+    # Remove markdown code fences if present
+    return _strip_code_fences(response_text)
 
 def generate_code_completion_openai_legacy(prompt, baseurl, model, options, credentialname):
     """Generate code completion using OpenAI's official Python SDK"""
@@ -433,140 +424,87 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred
 
     return response.rstrip()
 
+def _extract_output_text_from_message(item):
+    """Extract output_text from a message item in OpenAI responses format"""
+    if item.get('type') == 'message' and item.get('status') == 'completed':
+        content = item.get('content', [])
+        for content_item in content:
+            if content_item.get('type') == 'output_text':
+                return content_item.get('text', '')
+    return None
+
 def generate_code_completion_openai_responses(prompt, baseurl, model, options, credentialname):
     """Generate code completion using OpenAI's /v1/responses endpoint for GPT-5.1-Codex"""
     if OpenAI is None:
         raise ImportError("OpenAI package not found. Please install via 'pip install openai'.")
 
     log.debug('Using OpenAI responses endpoint (for GPT-5.1-Codex)')
+
+    # Get API credentials
     cred = OllamaCredentials()
     api_key = cred.GetApiKey('openai', credentialname)
+    endpoint = f"{baseurl}/v1/responses" if baseurl else "https://api.openai.com/v1/responses"
 
-    if baseurl:
-        endpoint = f"{baseurl}/v1/responses"
-    else:
-        endpoint = "https://api.openai.com/v1/responses"
-
-    log.debug(f'endpoint: {endpoint}')
-
+    # Parse prompt
     parts = prompt.split('<FILL_IN_HERE>')
     if len(parts) != 2:
         log.error("Prompt must contain <FILL_IN_HERE> marker.")
         sys.exit(1)
 
-    # For code completion, we just use the before part as input
-    before = parts[0]
-    after = parts[1]
-
-    # Build the input prompt for code completion
-    # Use similar structure as Claude prompt for better results
+    # Build FIM prompt using helper function
     lang = options.get('lang', 'C')
+    full_input = _build_fim_prompt(parts[0], parts[1], lang)
 
-    full_input = f"""Fill in the missing code between the markers below.
-
-Rules:
-- Do NOT repeat any code that appears in the AFTER section.
-- Return only the exact code that fits between BEFORE and AFTER.
-- Do NOT add explanations or comments.
-- Output the missing code only.
-
-Language: {lang}
-
-BEFORE:
-{before}
-
-AFTER:
-{after}
-"""
-
-    # Use higher token limit for gpt-5.1-codex which uses reasoning tokens
     max_output_tokens = options.get('max_completion_tokens', options.get('max_tokens', DEFAULT_MAX_TOKENS))
+    log.debug(f'endpoint: {endpoint}, model: {model}, max_output_tokens: {max_output_tokens}')
 
-    log.debug('model: ' + str(model))
-    log.debug('max_output_tokens: ' + str(max_output_tokens))
-    log.debug('input: ' + full_input)
-
-    headers = {
-        'Authorization': f'Bearer {api_key}',
-        'Content-Type': 'application/json'
-    }
-
-    data = {
-        'model': model,
-        'input': full_input,
-        'max_output_tokens': max_output_tokens
-    }
-
+    # Make API request
     try:
-        response = requests.post(endpoint, headers=headers, json=data)
+        response = requests.post(
+            endpoint,
+            headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
+            json={'model': model, 'input': full_input, 'max_output_tokens': max_output_tokens}
+        )
+
         if response.status_code != 200:
             log.error(f'API error: {response.text}')
         response.raise_for_status()
+
         result = response.json()
         log.debug('response: ' + json.dumps(result, indent=2))
 
-        # Extract the completion from the response
-        # The responses endpoint can return different formats
+        # Extract completion from response (supports both list and dict formats)
         completion = None
+        output_items = result.get('output', []) if isinstance(result, dict) else result if isinstance(result, list) else []
+
+        for item in output_items:
+            completion = _extract_output_text_from_message(item)
+            if completion:
+                break
 
-        # Check if response has the new format with 'output' array
-        if 'output' in result and isinstance(result['output'], list):
-            log.debug(f'Result has output array with {len(result["output"])} items')
-            # Look for message type items in the output array
-            for idx, item in enumerate(result['output']):
-                log.debug(f'Output item {idx}: type={item.get("type")}')
-                if item.get('type') == 'message' and item.get('status') == 'completed':
-                    content = item.get('content', [])
-                    log.debug(f'Message content has {len(content)} items')
-                    for content_idx, content_item in enumerate(content):
-                        log.debug(f'Content {content_idx}: type={content_item.get("type")}')
-                        if content_item.get('type') == 'output_text':
-                            completion = content_item.get('text', '')
-                            log.debug(f'Found output_text: {completion}')
-                            break
-                    if completion:
-                        break
-
-            # If no message found, log the incomplete status
-            if not completion and result.get('status') == 'incomplete':
-                log.warning(f'Response incomplete: {result.get("incomplete_details")}')
-                # For incomplete responses with only reasoning, we might need to handle differently
-                log.error('No message output found, only reasoning. Model may need different prompt.')
-
-        # Handle list format (old format)
-        elif isinstance(result, list):
-            log.debug(f'Result is a list with {len(result)} items')
-            for idx, item in enumerate(result):
-                log.debug(f'Item {idx}: type={item.get("type")}, status={item.get("status")}')
-                if item.get('type') == 'message' and item.get('status') == 'completed':
-                    content = item.get('content', [])
-                    log.debug(f'Message content has {len(content)} items')
-                    for content_idx, content_item in enumerate(content):
-                        log.debug(f'Content {content_idx}: type={content_item.get("type")}')
-                        if content_item.get('type') == 'output_text':
-                            completion = content_item.get('text', '')
-                            log.debug(f'Found output_text: {completion}')
-                            break
-                    if completion:
-                        break
+        # Check for incomplete responses
+        if not completion and isinstance(result, dict) and result.get('status') == 'incomplete':
+            log.warning(f'Response incomplete: {result.get("incomplete_details")}')
 
         # Fallback for other formats
-        elif 'text' in result:
-            completion = result['text']
-        elif 'choices' in result and len(result['choices']) > 0:
-            completion = result['choices'][0].get('text', result['choices'][0].get('message', {}).get('content', ''))
+        if not completion:
+            if isinstance(result, dict):
+                if 'text' in result:
+                    completion = result['text']
+                elif 'choices' in result and result['choices']:
+                    choice = result['choices'][0]
+                    completion = choice.get('text') or choice.get('message', {}).get('content')
 
         if not completion:
             log.error('Could not extract completion from response')
-            log.error('Response structure: ' + json.dumps(result, indent=2))
             return ""
 
         # Ensure completion is a string
         if not isinstance(completion, str):
-            log.error(f'Completion is not a string, type: {type(completion)}')
-            completion = str(completion)
+            log.error(f'Completion is not a string, type: {type(completion)}, value: {completion}')
+            return ""
 
-        log.debug('Final completion: ' + completion)
+        log.debug(f'Final completion: {completion}')
         return completion.strip()
 
     except Exception as e: