From 9fdb4cfd16f3ad329f48cd1b2a05e0fc06754179 Mon Sep 17 00:00:00 2001 From: kaka Date: Mon, 1 Dec 2025 22:26:27 +0700 Subject: [PATCH 1/6] feat: Added sampling denylist parameter for models that do not support sampling, replace `max_tokens` with `max_completion_tokens`. Remove unsupported parameters such as `temperature` and `top_p`. --- autoload/ollama/review.vim | 14 ++++++++++++-- plugin/ollama.vim | 4 ++++ python/chat.py | 35 ++++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/autoload/ollama/review.vim b/autoload/ollama/review.vim index ee89a7b..3686c89 100644 --- a/autoload/ollama/review.vim +++ b/autoload/ollama/review.vim @@ -136,9 +136,18 @@ function! s:StartChat(lines) abort endfunc let l:model_options = json_encode(g:ollama_chat_options) - call ollama#logger#Debug("Connecting to Ollama on " .. g:ollama_host .. " using model " .. g:ollama_model) + call ollama#logger#Debug("Chat Connecting to Ollama on " .. g:ollama_host .. " using model " .. g:ollama_model) call ollama#logger#Debug("model_options=" .. l:model_options) + if exists('g:ollama_model_sampling_denylist') + \ && len(g:ollama_model_sampling_denylist) > 0 + \ && index(g:ollama_model_sampling_denylist, g:ollama_chat_model) >= 0 + let l:sampling_enabled = 0 + else + let l:sampling_enabled = 1 + endif + call ollama#logger#Debug("sampling_enabled=" .. l:sampling_enabled) + " Convert plugin debug level to python logger levels let l:log_level = ollama#logger#PythonLogLevel(g:ollama_debug) let l:base_url = g:ollama_host @@ -154,6 +163,7 @@ function! s:StartChat(lines) abort \ '-m', g:ollama_chat_model, \ '-u', l:base_url, \ '-o', l:model_options, + \ "-se", l:sampling_enabled, \ '-t', g:ollama_chat_timeout, \ '-l', l:log_level ] " Check if a system prompt was configured @@ -205,7 +215,7 @@ function! 
s:StartChat(lines) abort silent execute 'new' l:bufname endif " Set the filetype to ollama-chat -" setlocal filetype=ollama-chat + " setlocal filetype=ollama-chat setlocal filetype=markdown setlocal buftype=prompt " enable BufDelete event when closing buffer usig :q! diff --git a/plugin/ollama.vim b/plugin/ollama.vim index 7945af4..fa24a6f 100644 --- a/plugin/ollama.vim +++ b/plugin/ollama.vim @@ -98,6 +98,10 @@ if !exists('g:ollama_model_options') \ 'max_tokens': 500 \ } endif +if !exists('g:ollama_model_sampling_denylist') + " default model sampling denylist + let g:ollama_model_sampling_denylist = [] +endif " Chat specific settings if !exists('g:ollama_chat_provider') " Provider for chat models: 'ollama' or 'openai' diff --git a/python/chat.py b/python/chat.py index c947c2e..b3bd6e8 100755 --- a/python/chat.py +++ b/python/chat.py @@ -90,7 +90,7 @@ async def stream_chat_message_ollama(messages, endpoint, model, options, timeout messages.append({"role": "assistant", "content": assistant_message.strip()}) -async def stream_chat_message_openai(messages, endpoint, model, options, credentialname): +async def stream_chat_message_openai(messages, endpoint, model, options, sampling_enabled, credentialname): """Stream chat responses from OpenAI API.""" if AsyncOpenAI is None: raise ImportError("OpenAI package not found. 
Please install via 'pip install openai'.") @@ -114,14 +114,22 @@ async def stream_chat_message_openai(messages, endpoint, model, options, credent top_p = options.get('top_p', 1.0) try: - stream = await client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - stream=True, - ) + # Build request parameters + request_params = { + 'model': model, + 'messages': messages, + 'stream': True, + } + + # Check if model supports sampling parameters + if sampling_enabled: + request_params['temperature'] = temperature + request_params['top_p'] = top_p + request_params['max_tokens'] = max_tokens + else: + request_params['max_completion_tokens'] = max_tokens + + stream = await client.chat.completions.create(**request_params) async for chunk in stream: if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: @@ -139,7 +147,7 @@ async def stream_chat_message_openai(messages, endpoint, model, options, credent messages.append({"role": "assistant", "content": assistant_message.strip()}) -async def main(provider, endpoint, model, options, systemprompt, timeout, credentialname): +async def main(provider, endpoint, model, options, sampling_enabled, systemprompt, timeout, credentialname): conversation_history = [] log.debug("endpoint: " + str(endpoint)) @@ -169,7 +177,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden ) else: task = asyncio.create_task( - stream_chat_message_openai(conversation_history, endpoint, model, options, credentialname) + stream_chat_message_openai(conversation_history, endpoint, model, options, sampling_enabled, credentialname) ) await task else: @@ -189,7 +197,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden ) else: task = asyncio.create_task( - stream_chat_message_openai(conversation_history, endpoint, model, options, credentialname) + stream_chat_message_openai(conversation_history, 
endpoint, model, options, sampling_enabled, credentialname) ) await task @@ -213,6 +221,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden help="Base endpoint URL.") parser.add_argument("-o", "--options", type=str, default=DEFAULT_OPTIONS, help="Ollama REST API options.") + parser.add_argument("-se", "--sampling-enabled", type=int, default=1, help="Enable or disable sampling.") parser.add_argument("-s", "--system-prompt", type=str, default="", help="Specify system prompt.") parser.add_argument("-t", "--timeout", type=int, default=DEFAULT_TIMEOUT, help="Timeout in seconds.") parser.add_argument("-l", "--log-level", type=int, default=OllamaLogger.ERROR, help="Log level.") @@ -243,7 +252,7 @@ async def main(provider, endpoint, model, options, systemprompt, timeout, creden try: while True: try: - asyncio.run(main(args.provider, endpoint, model, options, args.system_prompt, args.timeout, args.keyname)) + asyncio.run(main(args.provider, endpoint, model, options, args.sampling_enabled, args.system_prompt, args.timeout, args.keyname)) except KeyboardInterrupt: print("Canceled.") break From c10e1b808c2a9912e9c2098aa9db16b91a594eba Mon Sep 17 00:00:00 2001 From: kaka Date: Tue, 2 Dec 2025 13:26:00 +0700 Subject: [PATCH 2/6] feat: Stream tokens on the same line with real-time cursor tracking --- autoload/ollama/review.vim | 95 ++++++++++++++++++++++++++++++++++++-- python/chat.py | 8 +++- 2 files changed, 98 insertions(+), 5 deletions(-) diff --git a/autoload/ollama/review.vim b/autoload/ollama/review.vim index 3686c89..7b34bff 100644 --- a/autoload/ollama/review.vim +++ b/autoload/ollama/review.vim @@ -56,6 +56,9 @@ function! s:FindBufferWindow(bufnr) endfunction function! s:StartChat(lines) abort + " Counter for reducing redraw frequency + let s:token_count = 0 + " Function handling a line of text that has been typed. func! TextEntered(text) call ollama#logger#Debug("TextEntered: " .. a:text) @@ -65,12 +68,13 @@ function! 
s:StartChat(lines) abort endif " Send the text to a shell with Enter appended. call ch_sendraw(s:job, a:text .. "\n") + " Reset token count for new request + let s:token_count = 0 endfunc - " Function handling output from the shell: Add it above the prompt. - func! GotOutput(channel, msg) + " OLD VERSION: Append each token as a new line (non-streaming) + func! GotOutputOld(channel, msg) call ollama#logger#Debug("GotOutput: " .. a:msg) - " append lines let l:lines = split(a:msg, "\n") for l:line in l:lines @@ -96,6 +100,91 @@ function! s:StartChat(lines) abort endfor endfunc + " NEW VERSION: Stream tokens on the same line with real-time cursor tracking + func! GotOutputNew(channel, msg) + " call ollama#logger#Debug("GotOutput: [" .. a:msg .. "]") + + " Check for marker + let l:idx = stridx(a:msg, "") + let l:is_eot = l:idx != -1 + let l:content = l:is_eot ? strpart(a:msg, 0, l:idx) : a:msg + + " Append content to the last line for streaming effect + let l:updated_line_num = 0 + let l:updated_line_content = "" + let l:line_count = 0 + + if !empty(l:content) + " Get buffer line count efficiently + let l:buf_info = getbufinfo(s:buf)[0] + let l:line_count = l:buf_info.linecount + " call ollama#logger#Debug("line_count=" .. l:line_count) + + if l:line_count == 0 + " Buffer is empty, append as new line + " call ollama#logger#Debug("Buffer empty, appending first line") + call appendbufline(s:buf, 0, l:content) + let l:updated_line_num = 1 + let l:updated_line_content = l:content + else + " Get only the last line (much faster than getting all lines) + let l:last_line = getbufline(s:buf, l:line_count, l:line_count)[0] + let l:updated_line_content = l:last_line .. l:content + " call ollama#logger#Debug("Appending to line " .. l:line_count .. ": '" .. l:last_line .. "' + '" .. l:content .. 
"'") + call setbufline(s:buf, l:line_count, l:updated_line_content) + let l:updated_line_num = l:line_count + endif + endif + + " When streaming is done, add a new line for the next input + if l:is_eot + " call ollama#logger#Debug("EOT received, adding newline") + call appendbufline(s:buf, "$", "") + " Reuse line_count if we already got it, otherwise fetch + if l:line_count > 0 + let l:updated_line_num = l:line_count + 1 + else + let l:buf_info = getbufinfo(s:buf)[0] + let l:updated_line_num = l:buf_info.linecount + endif + let l:updated_line_content = "" + endif + + " Update cursor position if this is the active chat window + if bufname() == s:ollama_bufname " Check if current active window is Ollama Chat + let l:winid = bufwinid(s:buf) + if l:winid != -1 && l:updated_line_num > 0 + " Set cursor position directly (much faster than feedkeys) + let l:col = len(l:updated_line_content) + 1 + call win_execute(l:winid, 'call cursor(' . l:updated_line_num . ', ' . l:col . ')') + + " Increment token counter and only redraw every N tokens (or always for EOT) + let s:token_count += 1 + if l:is_eot || s:token_count % 5 == 0 + redraw + endif + + if l:is_eot + " Streaming done, enter insert mode + if mode() == 'i' + call feedkeys("\") + endif + call feedkeys("a") + endif + endif + endif + endfunc + + " Wrapper function that delegates to new version by default + " To use old version, set g:ollama_use_old_output = 1 + func! GotOutput(channel, msg) + if exists('g:ollama_use_old_output') && g:ollama_use_old_output + call GotOutputOld(a:channel, a:msg) + else + call GotOutputNew(a:channel, a:msg) + endif + endfunc + " Function handling output from the shell: Add it above the prompt. func! GotErrors(channel, msg) call ollama#logger#Debug("GotErrors: " .. 
a:msg) diff --git a/python/chat.py b/python/chat.py index b3bd6e8..fde4f2e 100755 --- a/python/chat.py +++ b/python/chat.py @@ -63,7 +63,9 @@ async def stream_chat_message_ollama(messages, endpoint, model, options, timeout if "message" in message and "content" in message["message"]: content = message["message"]["content"] assistant_message += content - print(content, end="", flush=True) + # Print each token followed by newline so Vim's out_cb receives it immediately + # VimScript will need to handle concatenating tokens on the same line + print(content, flush=True) # If is detected, stop processing if "" in content: @@ -135,7 +137,9 @@ async def stream_chat_message_openai(messages, endpoint, model, options, samplin if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: token = chunk.choices[0].delta.content assistant_message += token - print(token, end="", flush=True) + # Print each token followed by newline so Vim's out_cb receives it immediately + # VimScript will need to handle concatenating tokens on the same line + print(token, flush=True) print("", flush=True) From 6dbed9c2bf8afe6f5e015247bf0531b2af4a5fa4 Mon Sep 17 00:00:00 2001 From: kaka Date: Tue, 2 Dec 2025 16:08:14 +0700 Subject: [PATCH 3/6] feat: Added sampling_enabled for generate_code_completion_openai --- autoload/ollama.vim | 11 +++++++++++ python/complete.py | 27 ++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/autoload/ollama.vim b/autoload/ollama.vim index 9cc389b..bc7bc89 100644 --- a/autoload/ollama.vim +++ b/autoload/ollama.vim @@ -221,6 +221,16 @@ function! ollama#GetSuggestion(timer) \ "Connecting to Ollama on " .. g:ollama_host \ .. " using model " .. g:ollama_model) call ollama#logger#Debug("model_options=" .. 
l:model_options) + + if exists('g:ollama_model_sampling_denylist') + \ && len(g:ollama_model_sampling_denylist) > 0 + \ && index(g:ollama_model_sampling_denylist, g:ollama_model) >= 0 + let l:sampling_enabled = 0 + else + let l:sampling_enabled = 1 + endif + call ollama#logger#Debug("sampling_enabled=" .. l:sampling_enabled) + " Convert plugin debug level to python logger levels let l:log_level = ollama#logger#PythonLogLevel(g:ollama_debug) let l:base_url = g:ollama_host @@ -234,6 +244,7 @@ function! ollama#GetSuggestion(timer) \ "-m", g:ollama_model, \ "-u", l:base_url, \ "-o", l:model_options, + \ "-se", l:sampling_enabled, \ "-l", l:log_level \ ] " Add optional credentialname for looking up the API key diff --git a/python/complete.py b/python/complete.py index 4145917..bdd0b01 100755 --- a/python/complete.py +++ b/python/complete.py @@ -209,7 +209,7 @@ def extract_stop_marker(after: str) -> str | None: return line.rstrip() # preserve indentation return None -def generate_code_completion_openai(prompt, baseurl, model, options, credentialname): +def generate_code_completion_openai(prompt, baseurl, model, options, sampling_enabled, credentialname): """Generate code completion using OpenAI's official Python SDK""" if OpenAI is None: raise ImportError("OpenAI package not found. 
Please install via 'pip install openai'.") @@ -264,13 +264,21 @@ def generate_code_completion_openai(prompt, baseurl, model, options, credentialn log.debug('max_tokens: ' + str(max_tokens)) log.debug('stops: ' + str(stops)) try: - response = client.chat.completions.create( - model=model, - messages=[{"role": "user", "content": full_prompt}], - temperature=temperature, - max_tokens=max_tokens, - stop=stops - ) + # Build request parameters + request_params = { + 'model': model, + 'messages': [{"role": "user", "content": full_prompt}], + } + + # Check if model supports sampling parameters + if sampling_enabled: + request_params['temperature'] = temperature + request_params['max_tokens'] = max_tokens + request_params['stop'] = stops + else: + request_params['max_completion_tokens'] = max_tokens + + response = client.chat.completions.create(**request_params) response = response.choices[0].message.content.strip() log.debug('response: ' + response) except Exception as e: @@ -346,6 +354,7 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred help="Base endpoint URL (for Ollama only).") parser.add_argument('-o', '--options', type=str, default=DEFAULT_OPTIONS, help="Ollama REST API options (JSON string).") + parser.add_argument("-se", "--sampling-enabled", type=int, default=1, help="Enable or disable sampling.") parser.add_argument('-l', '--log-level', type=int, default=OllamaLogger.ERROR, help="Specify log level") parser.add_argument('-f', '--log-filename', type=str, default="complete.log", @@ -391,7 +400,7 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred else: modelname = DEFAULT_OPENAI_MODEL baseurl = args.url or None - response = generate_code_completion_openai(prompt, baseurl, modelname, options, args.keyname) + response = generate_code_completion_openai(prompt, baseurl, modelname, options, args.sampling_enabled, args.keyname) elif args.provider == "openai_legacy": if args.model: modelname = args.model 
From e7f1c4c9e6f34083692044b11b85948a1b2572b8 Mon Sep 17 00:00:00 2001 From: kaka Date: Tue, 2 Dec 2025 17:34:20 +0700 Subject: [PATCH 4/6] feat: Added Claude and openai_responses support --- autoload/ollama.vim | 8 ++ plugin/ollama.vim | 4 + python/OllamaCredentials.py | 3 + python/complete.py | 247 +++++++++++++++++++++++++++++++++++- 4 files changed, 255 insertions(+), 7 deletions(-) diff --git a/autoload/ollama.vim b/autoload/ollama.vim index bc7bc89..c4598ca 100644 --- a/autoload/ollama.vim +++ b/autoload/ollama.vim @@ -236,6 +236,9 @@ function! ollama#GetSuggestion(timer) let l:base_url = g:ollama_host if g:ollama_model_provider =~ '^openai' let l:base_url = g:ollama_openai_baseurl + elseif g:ollama_model_provider == 'claude' + " Claude uses default Anthropic API, don't set base_url + let l:base_url = '' endif " Adjust the command to use the prompt as stdin input let l:command = [ g:ollama_python_interpreter, @@ -258,6 +261,11 @@ function! ollama#GetSuggestion(timer) " add credentialname option for Mistral let l:command += [ '-k', g:ollama_mistral_credentialname ] endif + elseif g:ollama_model_provider == 'claude' + if exists('g:ollama_claude_credentialname') && g:ollama_claude_credentialname != '' + " add credentialname option for Claude + let l:command += [ '-k', g:ollama_claude_credentialname ] + endif endif call ollama#logger#Debug("command=" .. 
join(l:command, " ")) let l:job_options = { diff --git a/plugin/ollama.vim b/plugin/ollama.vim index fa24a6f..8dfecb3 100644 --- a/plugin/ollama.vim +++ b/plugin/ollama.vim @@ -67,6 +67,10 @@ if !exists('g:ollama_openai_credentialname') " UNIX Pass credential name to lookup API key for OpenAI service let g:ollama_openai_credentialname = '' endif +if !exists('g:ollama_claude_credentialname') + " UNIX Pass credential name to lookup API key for Anthropic Claude service + let g:ollama_claude_credentialname = '' +endif " Tab completion specific settings if !exists('g:ollama_debounce_time') let g:ollama_debounce_time = 500 diff --git a/python/OllamaCredentials.py b/python/OllamaCredentials.py index 3affe2a..37595f3 100644 --- a/python/OllamaCredentials.py +++ b/python/OllamaCredentials.py @@ -18,6 +18,7 @@ def GetApiKey(self, provider: str, credentialname: str | None) -> str: - 'openai' → use OPENAI_API_KEY env var or pass entry - 'openai_legacy' → same as 'openai', kept for compatibility - 'mistral' → use MISTRAL_API_KEY env var or pass entry + - 'anthropic' → use ANTHROPIC_API_KEY env var or pass entry Priority: 1. 
Environment variable override @@ -36,6 +37,8 @@ def GetApiKey(self, provider: str, credentialname: str | None) -> str: env_var = "OPENAI_API_KEY" elif provider == "mistral": env_var = "MISTRAL_API_KEY" + elif provider == "anthropic": + env_var = "ANTHROPIC_API_KEY" else: raise ValueError(f"Unknown provider: {provider}") diff --git a/python/complete.py b/python/complete.py index bdd0b01..b65d115 100755 --- a/python/complete.py +++ b/python/complete.py @@ -23,13 +23,23 @@ except ImportError: Mistral = None +# try to load Anthropic package if it exists +try: + from anthropic import Anthropic # type: ignore +except ImportError: + Anthropic = None + # Default values DEFAULT_HOST = 'http://localhost:11434' DEFAULT_PROVIDER = 'ollama' DEFAULT_MODEL = 'codellama:code' DEFAULT_OPTIONS = '{ "temperature": 0, "top_p": 0.95 }' +DEFAULT_TEMPERATURE = 0 +DEFAULT_MAX_TOKENS = 300 DEFAULT_MISTRAL_MODEL = 'codestral-2501' DEFAULT_OPENAI_MODEL = 'gpt-4.1-mini' +DEFAULT_OPENAI_RESPONSES_MODEL = 'gpt-5.1-codex' +DEFAULT_CLAUDE_MODEL = 'claude-sonnet-4-20250514' # When set to true, we use our own templates and don't use the Ollama built-in templates. # Is is the only way to make this work reliable. 
As soon is this works also with Ollama @@ -170,9 +180,9 @@ def generate_code_completion_mistral(prompt, baseurl, model, options, credential stop_marker = extract_stop_marker(suffix) stops = [stop_marker] if stop_marker else [] - temperature = options.get('temperature', 0) -# min_tokens = options.get('min_tokens', 1) - max_tokens = options.get('max_tokens', 300) + temperature = options.get('temperature', DEFAULT_TEMPERATURE) + # min_tokens = options.get('min_tokens', 1) + max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS) log.debug('model: ' + str(model)) log.debug('temperature: ' + str(temperature)) @@ -256,13 +266,14 @@ def generate_code_completion_openai(prompt, baseurl, model, options, sampling_en stop_marker = extract_stop_marker(after) stops = [stop_marker] if stop_marker else [] - temperature = options.get('temperature', 0) - max_tokens = options.get('max_tokens', 300) + temperature = options.get('temperature', DEFAULT_TEMPERATURE) + max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS) log.debug('model: ' + str(model)) log.debug('temperature: ' + str(temperature)) log.debug('max_tokens: ' + str(max_tokens)) log.debug('stops: ' + str(stops)) + log.debug('sampling_enabled: ' + str(sampling_enabled)) try: # Build request parameters request_params = { @@ -301,6 +312,85 @@ def generate_code_completion_openai(prompt, baseurl, model, options, sampling_en return response +def generate_code_completion_claude(prompt, baseurl, model, options, credentialname): + """Generate code completion using Anthropic Claude API""" + if Anthropic is None: + raise ImportError("Anthropic package not found. 
Please install via 'pip install anthropic'.") + + cred = OllamaCredentials() + api_key = cred.GetApiKey('anthropic', credentialname) + + log.debug('Using Anthropic Claude API') + if baseurl: + log.debug(f'baseurl={baseurl}') + client = Anthropic(api_key=api_key, base_url=baseurl) + else: + log.debug(f'Using default Anthropic URL') + client = Anthropic(api_key=api_key) + + parts = prompt.split('') + if len(parts) != 2: + log.error("Prompt must contain marker for Claude mode.") + sys.exit(1) + before = parts[0] + after = parts[1] + + lang = options.get('lang', 'C') + # Claude doesn't support Fill-in-the-middle, use prompt engineering + full_prompt = f"""Fill in the missing code between the markers below. + +Rules: +- Do NOT repeat any code that appears in the AFTER section. +- Return only the exact code that fits between BEFORE and AFTER. +- Do NOT add explanations or comments. +- Output the missing code only. + +Language: {lang} + +BEFORE: +{before} + +AFTER: +{after} +""" + log.debug('full_prompt: ' + full_prompt) + + temperature = options.get('temperature', DEFAULT_TEMPERATURE) + max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS) + + log.debug('model: ' + str(model)) + log.debug('temperature: ' + str(temperature)) + log.debug('max_tokens: ' + str(max_tokens)) + + try: + response = client.messages.create( + model=model or DEFAULT_CLAUDE_MODEL, + max_tokens=max_tokens, + temperature=temperature, + messages=[{"role": "user", "content": full_prompt}] + ) + + response_text = response.content[0].text.strip() + log.debug('response: ' + response_text) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + log.error(str(e)) + sys.exit(1) + + # convert response to lines + lines = response_text.splitlines() + if lines: + # remove 1st element from array if it starts with ``` + if lines[0].startswith("```"): + lines.pop(0) + # remove last element from array if it starts with ``` + if lines[-1].startswith("```"): + lines.pop() + + response_text = 
"\n".join(lines) + + return response_text + def generate_code_completion_openai_legacy(prompt, baseurl, model, options, credentialname): """Generate code completion using OpenAI's official Python SDK""" if OpenAI is None: @@ -326,8 +416,8 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred full_prompt = fill_in_the_middle(config, prompt) log.debug('full_prompt: ' + full_prompt) - temperature = options.get('temperature', 0) - max_tokens = options.get('max_tokens', 300) + temperature = options.get('temperature', DEFAULT_TEMPERATURE) + max_tokens = options.get('max_tokens', DEFAULT_MAX_TOKENS) log.debug('model: ' + str(model)) log.debug('temperature: ' + str(temperature)) @@ -343,6 +433,135 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred return response.rstrip() +def generate_code_completion_openai_responses(prompt, baseurl, model, options, credentialname): + """Generate code completion using OpenAI's /v1/responses endpoint for GPT-5.1-Codex""" + if OpenAI is None: + raise ImportError("OpenAI package not found. Please install via 'pip install openai'.") + + log.debug('Using OpenAI responses endpoint (for GPT-5.1-Codex)') + cred = OllamaCredentials() + api_key = cred.GetApiKey('openai', credentialname) + + if baseurl: + endpoint = f"{baseurl}/v1/responses" + else: + endpoint = "https://api.openai.com/v1/responses" + + log.debug(f'endpoint: {endpoint}') + + parts = prompt.split('') + if len(parts) != 2: + log.error("Prompt must contain marker.") + sys.exit(1) + + # For code completion, we just use the before part as input + before = parts[0] + after = parts[1] + + # Build the input prompt for code completion + # gpt-5.1-codex seems to work better with explicit instructions + if after.strip(): + # Fill-in-the-middle style completion + full_input = f"Complete the code at :\n\n{before}{after}\n\nProvide ONLY the code that replaces , nothing else." 
+ else: + # End-of-file completion + full_input = f"Continue this code:\n\n{before}\n\nProvide ONLY the next line(s) of code, nothing else." + + # Use higher token limit for gpt-5.1-codex which uses reasoning tokens + max_output_tokens = options.get('max_completion_tokens', options.get('max_tokens', DEFAULT_MAX_TOKENS)) + + log.debug('model: ' + str(model)) + log.debug('max_output_tokens: ' + str(max_output_tokens)) + log.debug('input: ' + full_input) + + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + data = { + 'model': model, + 'input': full_input, + 'max_output_tokens': max_output_tokens + } + + try: + response = requests.post(endpoint, headers=headers, json=data) + if response.status_code != 200: + log.error(f'API error: {response.text}') + response.raise_for_status() + result = response.json() + log.debug('response: ' + json.dumps(result, indent=2)) + + # Extract the completion from the response + # The responses endpoint can return different formats + completion = None + + # Check if response has the new format with 'output' array + if 'output' in result and isinstance(result['output'], list): + log.debug(f'Result has output array with {len(result["output"])} items') + # Look for message type items in the output array + for idx, item in enumerate(result['output']): + log.debug(f'Output item {idx}: type={item.get("type")}') + if item.get('type') == 'message' and item.get('status') == 'completed': + content = item.get('content', []) + log.debug(f'Message content has {len(content)} items') + for content_idx, content_item in enumerate(content): + log.debug(f'Content {content_idx}: type={content_item.get("type")}') + if content_item.get('type') == 'output_text': + completion = content_item.get('text', '') + log.debug(f'Found output_text: {completion}') + break + if completion: + break + + # If no message found, log the incomplete status + if not completion and result.get('status') == 'incomplete': + 
log.warning(f'Response incomplete: {result.get("incomplete_details")}') + # For incomplete responses with only reasoning, we might need to handle differently + log.error('No message output found, only reasoning. Model may need different prompt.') + + # Handle list format (old format) + elif isinstance(result, list): + log.debug(f'Result is a list with {len(result)} items') + for idx, item in enumerate(result): + log.debug(f'Item {idx}: type={item.get("type")}, status={item.get("status")}') + if item.get('type') == 'message' and item.get('status') == 'completed': + content = item.get('content', []) + log.debug(f'Message content has {len(content)} items') + for content_idx, content_item in enumerate(content): + log.debug(f'Content {content_idx}: type={content_item.get("type")}') + if content_item.get('type') == 'output_text': + completion = content_item.get('text', '') + log.debug(f'Found output_text: {completion}') + break + if completion: + break + + # Fallback for other formats + elif 'text' in result: + completion = result['text'] + elif 'choices' in result and len(result['choices']) > 0: + completion = result['choices'][0].get('text', result['choices'][0].get('message', {}).get('content', '')) + + if not completion: + log.error('Could not extract completion from response') + log.error('Response structure: ' + json.dumps(result, indent=2)) + return "" + + # Ensure completion is a string + if not isinstance(completion, str): + log.error(f'Completion is not a string, type: {type(completion)}') + completion = str(completion) + + log.debug('Final completion: ' + completion) + return completion.strip() + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + log.error(str(e)) + sys.exit(1) + if __name__ == "__main__": try: parser = argparse.ArgumentParser(description="Complete code using Ollama or OpenAI LLM.") @@ -408,6 +627,20 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred modelname = DEFAULT_OPENAI_MODEL baseurl = 
args.url or None response = generate_code_completion_openai_legacy(prompt, baseurl, modelname, options, args.keyname) + elif args.provider == "claude": + if args.model: + modelname = args.model + else: + modelname = DEFAULT_CLAUDE_MODEL + baseurl = args.url or None + response = generate_code_completion_claude(prompt, baseurl, modelname, options, args.keyname) + elif args.provider == "openai_responses": + if args.model: + modelname = args.model + else: + modelname = DEFAULT_OPENAI_RESPONSES_MODEL + baseurl = args.url or None + response = generate_code_completion_openai_responses(prompt, baseurl, modelname, options, args.keyname) else: log.error(f"Unknown provider: {args.provider}") sys.exit(1) From 1a01949410ae3aef314c8cd5c5d5225c84f3b20e Mon Sep 17 00:00:00 2001 From: kaka Date: Tue, 2 Dec 2025 17:39:31 +0700 Subject: [PATCH 5/6] feat: Use the same input prompt for code completion as generate_code_completion_openai --- python/complete.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/python/complete.py b/python/complete.py index b65d115..b03c78b 100755 --- a/python/complete.py +++ b/python/complete.py @@ -459,13 +459,25 @@ def generate_code_completion_openai_responses(prompt, baseurl, model, options, c after = parts[1] # Build the input prompt for code completion - # gpt-5.1-codex seems to work better with explicit instructions - if after.strip(): - # Fill-in-the-middle style completion - full_input = f"Complete the code at :\n\n{before}{after}\n\nProvide ONLY the code that replaces , nothing else." - else: - # End-of-file completion - full_input = f"Continue this code:\n\n{before}\n\nProvide ONLY the next line(s) of code, nothing else." + # Use similar structure as Claude prompt for better results + lang = options.get('lang', 'C') + + full_input = f"""Fill in the missing code between the markers below. + +Rules: +- Do NOT repeat any code that appears in the AFTER section. 
+- Return only the exact code that fits between BEFORE and AFTER. +- Do NOT add explanations or comments. +- Output the missing code only. + +Language: {lang} + +BEFORE: +{before} + +AFTER: +{after} +""" # Use higher token limit for gpt-5.1-codex which uses reasoning tokens max_output_tokens = options.get('max_completion_tokens', options.get('max_tokens', DEFAULT_MAX_TOKENS)) From b130eb3db9cbc37b0245e700744192a92c14d85d Mon Sep 17 00:00:00 2001 From: kaka Date: Tue, 2 Dec 2025 17:49:38 +0700 Subject: [PATCH 6/6] refactor: extract the _build_fim_prompt and _strip_code_fences methods. --- python/complete.py | 222 ++++++++++++++++----------------------------- 1 file changed, 80 insertions(+), 142 deletions(-) diff --git a/python/complete.py b/python/complete.py index b03c78b..b92fe8f 100755 --- a/python/complete.py +++ b/python/complete.py @@ -219,6 +219,36 @@ def extract_stop_marker(after: str) -> str | None: return line.rstrip() # preserve indentation return None +def _build_fim_prompt(before: str, after: str, lang: str = 'C') -> str: + """Build fill-in-the-middle prompt for models that don't support native FIM.""" + return f"""Fill in the missing code between the markers below. + +Rules: +- Do NOT repeat any code that appears in the AFTER section. +- Return only the exact code that fits between BEFORE and AFTER. +- Do NOT add explanations or comments. +- Output the missing code only. 
+ +Language: {lang} + +BEFORE: +{before} + +AFTER: +{after} +""" + +def _strip_code_fences(text: str) -> str: + """Remove markdown code fence markers (```) from beginning and end of text.""" + lines = text.splitlines() + if lines: + if lines[0].startswith("```"): + lines.pop(0) + if lines and lines[-1].startswith("```"): + lines.pop() + return "\n".join(lines) + return text + def generate_code_completion_openai(prompt, baseurl, model, options, sampling_enabled, credentialname): """Generate code completion using OpenAI's official Python SDK""" if OpenAI is None: @@ -298,19 +328,8 @@ def generate_code_completion_openai(prompt, baseurl, model, options, sampling_en log.error(str(e)) sys.exit(1) - # convert response to lines - lines = response.splitlines() - if lines: - # remove 1st element from array if it starts with ``` - if lines[0].startswith("```"): - lines.pop(0) - # remove last element from array if it starts with ``` - if lines[-1].startswith("```"): - lines.pop() - - response = "\n".join(lines) - - return response + # Remove markdown code fences if present + return _strip_code_fences(response) def generate_code_completion_claude(prompt, baseurl, model, options, credentialname): """Generate code completion using Anthropic Claude API""" @@ -332,27 +351,10 @@ def generate_code_completion_claude(prompt, baseurl, model, options, credentialn if len(parts) != 2: log.error("Prompt must contain marker for Claude mode.") sys.exit(1) - before = parts[0] - after = parts[1] + # Build FIM prompt using helper function lang = options.get('lang', 'C') - # Claude doesn't support Fill-in-the-middle, use prompt engineering - full_prompt = f"""Fill in the missing code between the markers below. - -Rules: -- Do NOT repeat any code that appears in the AFTER section. -- Return only the exact code that fits between BEFORE and AFTER. -- Do NOT add explanations or comments. -- Output the missing code only. 
- -Language: {lang} - -BEFORE: -{before} - -AFTER: -{after} -""" + full_prompt = _build_fim_prompt(parts[0], parts[1], lang) log.debug('full_prompt: ' + full_prompt) temperature = options.get('temperature', DEFAULT_TEMPERATURE) @@ -377,19 +379,8 @@ def generate_code_completion_claude(prompt, baseurl, model, options, credentialn log.error(str(e)) sys.exit(1) - # convert response to lines - lines = response_text.splitlines() - if lines: - # remove 1st element from array if it starts with ``` - if lines[0].startswith("```"): - lines.pop(0) - # remove last element from array if it starts with ``` - if lines[-1].startswith("```"): - lines.pop() - - response_text = "\n".join(lines) - - return response_text + # Remove markdown code fences if present + return _strip_code_fences(response_text) def generate_code_completion_openai_legacy(prompt, baseurl, model, options, credentialname): """Generate code completion using OpenAI's official Python SDK""" @@ -433,140 +424,87 @@ def generate_code_completion_openai_legacy(prompt, baseurl, model, options, cred return response.rstrip() +def _extract_output_text_from_message(item): + """Extract output_text from a message item in OpenAI responses format""" + if item.get('type') == 'message' and item.get('status') == 'completed': + content = item.get('content', []) + for content_item in content: + if content_item.get('type') == 'output_text': + return content_item.get('text', '') + return None + def generate_code_completion_openai_responses(prompt, baseurl, model, options, credentialname): """Generate code completion using OpenAI's /v1/responses endpoint for GPT-5.1-Codex""" if OpenAI is None: raise ImportError("OpenAI package not found. 
Please install via 'pip install openai'.") log.debug('Using OpenAI responses endpoint (for GPT-5.1-Codex)') + + # Get API credentials cred = OllamaCredentials() api_key = cred.GetApiKey('openai', credentialname) + endpoint = f"{baseurl}/v1/responses" if baseurl else "https://api.openai.com/v1/responses" - if baseurl: - endpoint = f"{baseurl}/v1/responses" - else: - endpoint = "https://api.openai.com/v1/responses" - - log.debug(f'endpoint: {endpoint}') - + # Parse prompt parts = prompt.split('') if len(parts) != 2: log.error("Prompt must contain marker.") sys.exit(1) - # For code completion, we just use the before part as input - before = parts[0] - after = parts[1] - - # Build the input prompt for code completion - # Use similar structure as Claude prompt for better results + # Build FIM prompt using helper function lang = options.get('lang', 'C') + full_input = _build_fim_prompt(parts[0], parts[1], lang) - full_input = f"""Fill in the missing code between the markers below. - -Rules: -- Do NOT repeat any code that appears in the AFTER section. -- Return only the exact code that fits between BEFORE and AFTER. -- Do NOT add explanations or comments. -- Output the missing code only. 
- -Language: {lang} - -BEFORE: -{before} - -AFTER: -{after} -""" - - # Use higher token limit for gpt-5.1-codex which uses reasoning tokens max_output_tokens = options.get('max_completion_tokens', options.get('max_tokens', DEFAULT_MAX_TOKENS)) + log.debug(f'endpoint: {endpoint}, model: {model}, max_output_tokens: {max_output_tokens}') - log.debug('model: ' + str(model)) - log.debug('max_output_tokens: ' + str(max_output_tokens)) - log.debug('input: ' + full_input) - - headers = { - 'Authorization': f'Bearer {api_key}', - 'Content-Type': 'application/json' - } - - data = { - 'model': model, - 'input': full_input, - 'max_output_tokens': max_output_tokens - } - + # Make API request try: - response = requests.post(endpoint, headers=headers, json=data) + response = requests.post( + endpoint, + headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}, + json={'model': model, 'input': full_input, 'max_output_tokens': max_output_tokens} + ) + if response.status_code != 200: log.error(f'API error: {response.text}') response.raise_for_status() + result = response.json() log.debug('response: ' + json.dumps(result, indent=2)) - # Extract the completion from the response - # The responses endpoint can return different formats + # Extract completion from response (supports both list and dict formats) completion = None + output_items = result.get('output', []) if isinstance(result, dict) else result if isinstance(result, list) else [] + + for item in output_items: + completion = _extract_output_text_from_message(item) + if completion: + break - # Check if response has the new format with 'output' array - if 'output' in result and isinstance(result['output'], list): - log.debug(f'Result has output array with {len(result["output"])} items') - # Look for message type items in the output array - for idx, item in enumerate(result['output']): - log.debug(f'Output item {idx}: type={item.get("type")}') - if item.get('type') == 'message' and item.get('status') == 
'completed': - content = item.get('content', []) - log.debug(f'Message content has {len(content)} items') - for content_idx, content_item in enumerate(content): - log.debug(f'Content {content_idx}: type={content_item.get("type")}') - if content_item.get('type') == 'output_text': - completion = content_item.get('text', '') - log.debug(f'Found output_text: {completion}') - break - if completion: - break - - # If no message found, log the incomplete status - if not completion and result.get('status') == 'incomplete': - log.warning(f'Response incomplete: {result.get("incomplete_details")}') - # For incomplete responses with only reasoning, we might need to handle differently - log.error('No message output found, only reasoning. Model may need different prompt.') - - # Handle list format (old format) - elif isinstance(result, list): - log.debug(f'Result is a list with {len(result)} items') - for idx, item in enumerate(result): - log.debug(f'Item {idx}: type={item.get("type")}, status={item.get("status")}') - if item.get('type') == 'message' and item.get('status') == 'completed': - content = item.get('content', []) - log.debug(f'Message content has {len(content)} items') - for content_idx, content_item in enumerate(content): - log.debug(f'Content {content_idx}: type={content_item.get("type")}') - if content_item.get('type') == 'output_text': - completion = content_item.get('text', '') - log.debug(f'Found output_text: {completion}') - break - if completion: - break + # Check for incomplete responses + if not completion and isinstance(result, dict) and result.get('status') == 'incomplete': + log.warning(f'Response incomplete: {result.get("incomplete_details")}') # Fallback for other formats - elif 'text' in result: - completion = result['text'] - elif 'choices' in result and len(result['choices']) > 0: - completion = result['choices'][0].get('text', result['choices'][0].get('message', {}).get('content', '')) + if not completion: + if isinstance(result, dict): + if 'text' 
in result: + completion = result['text'] + elif 'choices' in result and result['choices']: + choice = result['choices'][0] + completion = choice.get('text') or choice.get('message', {}).get('content') if not completion: log.error('Could not extract completion from response') - log.error('Response structure: ' + json.dumps(result, indent=2)) return "" # Ensure completion is a string if not isinstance(completion, str): - log.error(f'Completion is not a string, type: {type(completion)}') - completion = str(completion) + log.error(f'Completion is not a string, type: {type(completion)}, value: {completion}') + return "" - log.debug('Final completion: ' + completion) + log.debug(f'Final completion: {completion}') return completion.strip() except Exception as e: