diff --git a/.gitignore b/.gitignore
index 330f1d4..9e9a4d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
/target/
FnKey.app/
.DS_Store
+.venv/
+__pycache__/
diff --git a/AppIcon.icns b/AppIcon.icns
new file mode 100644
index 0000000..f66f944
Binary files /dev/null and b/AppIcon.icns differ
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..2a75cbe
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,113 @@
+# FnKey — Agent Instructions
+
+## What is this
+
+FnKey is a macOS menu bar app (Rust) that records audio while a hotkey is held, transcribes it via Whisper, sanitizes the text with a small LLM, and pastes the result at the cursor. Single file: `src/main.rs`.
+
+## Architecture
+
+```
+Hotkey held → mic records → release → WAV encoded → Whisper STT → LLM sanitizer → clipboard → ⌘V paste
+```
+
+- **Transcription**: Any OpenAI-compatible `/v1/audio/transcriptions` endpoint
+- **Sanitization**: Any OpenAI-compatible `/v1/chat/completions` endpoint
+- **Config**: `~/.config/fnkey/config.toml` (TOML, deserialized into `Config` struct)
+- **No runtime dependencies** — single static binary, no frameworks beyond macOS system libs
+
+## Text Sanitization Pipeline
+
+The sanitizer is the key differentiator. Raw Whisper output is messy — filler words, repeated words, misheard technical terms. A small LLM (0.6B–3B params) cleans it up in real time (<200ms).
+
+### Setup for contributors
+
+After cloning this repo, set up local inference for development:
+
+1. **Whisper STT** — download and serve a Whisper model:
+ ```bash
+ # Option A: whisper.cpp (any platform with Vulkan/CUDA/Metal)
+ whisper-server -m ggml-large-v3-turbo.bin --port 8100
+
+ # Option B: faster-whisper-server (CUDA)
+ pip install faster-whisper-server
+ faster-whisper-server --model large-v3-turbo --port 8100
+
+ # Option C: MLX (macOS Apple Silicon)
+ pip install mlx-whisper
+ # (use via mlx-whisper Python API, or wrap in a server)
+ ```
+
+2. **Text sanitizer** — download and serve a small LLM:
+ ```bash
+ # Option A: llama.cpp
+ llama-server -m Qwen3-0.6B-Q6_K.gguf --port 8108 -ngl 999 --jinja -c 4096
+
+ # Option B: MLX
+ pip install mlx-lm
+ mlx_lm.server --model mlx-community/Qwen3-0.6B-4bit --port 8108
+ ```
+
+3. **Configure** `~/.config/fnkey/config.toml`:
+ ```toml
+ api_key = "not-needed"
+ transcription_url = "http://localhost:8100/v1/audio/transcriptions"
+ whisper_model = "whisper-large-v3-turbo"
+ polish_url = "http://localhost:8108/v1/chat/completions"
+ polish_model = "qwen3-0.6b"
+ always_polish = true
+ ```
+
+### How the system prompt works
+
+The `polish_prompt` config field overrides the built-in system prompt. For 0.6B models, the prompt must be:
+- **Short** — the model has limited instruction-following at this scale
+- **Explicit** — use a replacement dictionary for domain terms the STT engine misrecognizes
+- **End with `/no_think`** — disables Qwen3 reasoning mode, keeping latency under 200ms
+
+Example for a codebase that uses llama.cpp, Claude, and Tailscale:
+
+```toml
+polish_prompt = """Fix dictation. Remove filler words. Fix grammar and punctuation.
+Replace misheard terms:
+- clod dot MD/cloud dot MD → CLAUDE.md
+- agents dot MD → agents.md
+- lama dot CPP → llama.cpp
+- quan three/qan three → Qwen3
+- tail scale → Tailscale
+Output ONLY the corrected text. /no_think"""
+```
+
+**When helping users customize their prompt**: identify the technical terms they use frequently, figure out how Whisper is likely to misrecognize them, and add those as `misheard → correct` pairs. This is the single most impactful thing you can do to improve dictation accuracy.
+
+## Build & test
+
+```bash
+cargo build --release # Build
+./build-app.sh # Create .app bundle
+cargo check # Type check only
+```
+
+After rebuilding, macOS invalidates permissions — user must re-add the app in System Settings (Input Monitoring, Accessibility).
+
+## Key code locations
+
+- `Config` struct and `load_config()` — config parsing and defaults
+- `polish_text()` — LLM sanitization call
+- `transcribe_and_paste()` — main pipeline: encode WAV → call Whisper → sanitize → paste
+- `run_event_tap()` — hotkey detection via CGEventTap
+- `enhance_audio()` — audio preprocessing (DC removal, high-pass filter, normalization)
+
+## Config fields
+
+| Field | Default | Purpose |
+|-------|---------|---------|
+| `api_key` | `""` | Bearer token for Whisper transcription (also used for sanitizer if `polish_api_key` is empty) |
+| `polish_api_key` | `""` | Separate bearer token for sanitizer endpoint (empty = falls back to `api_key`) |
+| `transcription_url` | Groq | Whisper endpoint |
+| `polish_url` | Groq | Chat completions endpoint for sanitizer |
+| `whisper_model` | `whisper-large-v3` | Model name sent to STT endpoint |
+| `polish_model` | `llama-3.3-70b-versatile` | Model name sent to sanitizer endpoint |
+| `hotkey` | `fn` | Trigger key (fn/option/control/shift/command) |
+| `language` | `""` (auto) | ISO-639-1 hint for Whisper |
+| `always_polish` | `true` | Sanitize every dictation by default |
+| `polish_prompt` | `""` (built-in) | Custom system prompt for sanitizer |
diff --git a/Cargo.lock b/Cargo.lock
index 3a45e69..ea5c74c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -466,6 +466,7 @@ dependencies = [
"reqwest",
"serde",
"serde_json",
+ "toml",
]
[[package]]
@@ -1422,7 +1423,7 @@ version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983"
dependencies = [
- "toml_edit",
+ "toml_edit 0.23.7",
]
[[package]]
@@ -1713,6 +1714,15 @@ dependencies = [
"serde_core",
]
+[[package]]
+name = "serde_spanned"
+version = "0.6.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "serde_urlencoded"
version = "0.7.1"
@@ -1927,6 +1937,27 @@ dependencies = [
"tokio",
]
+[[package]]
+name = "toml"
+version = "0.8.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime 0.6.11",
+ "toml_edit 0.22.27",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "toml_datetime"
version = "0.7.3"
@@ -1936,6 +1967,20 @@ dependencies = [
"serde_core",
]
+[[package]]
+name = "toml_edit"
+version = "0.22.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
+dependencies = [
+ "indexmap",
+ "serde",
+ "serde_spanned",
+ "toml_datetime 0.6.11",
+ "toml_write",
+ "winnow",
+]
+
[[package]]
name = "toml_edit"
version = "0.23.7"
@@ -1943,7 +1988,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d"
dependencies = [
"indexmap",
- "toml_datetime",
+ "toml_datetime 0.7.3",
"toml_parser",
"winnow",
]
@@ -1957,6 +2002,12 @@ dependencies = [
"winnow",
]
+[[package]]
+name = "toml_write"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
+
[[package]]
name = "tower"
version = "0.5.2"
diff --git a/Cargo.toml b/Cargo.toml
index 03692fc..e0c4f37 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,3 +28,6 @@ hound = "3.5"
# JSON parsing for LLM API
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
+
+# Config file parsing
+toml = "0.8"
diff --git a/Info.plist b/Info.plist
index b3eee15..5a15892 100644
--- a/Info.plist
+++ b/Info.plist
@@ -14,6 +14,8 @@
1.0.0
CFBundleShortVersionString
1.0.0
+ CFBundleIconFile
+ AppIcon
CFBundlePackageType
APPL
LSMinimumSystemVersion
diff --git a/README.md b/README.md
index bd0e945..7134725 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# fnkey.ai
-Hold Fn key, speak, paste transcribed text.
+Hold a hotkey, speak, paste transcribed text. Works with any OpenAI-compatible speech-to-text API.
## Install
@@ -14,66 +14,260 @@ Hold Fn key, speak, paste transcribed text.
mv FnKey.app /Applications/
```
-3. Set your Groq API key:
- ```bash
- mkdir -p ~/.config/fnkey
- echo 'your-groq-api-key' > ~/.config/fnkey/api_key
- ```
- Get a key at [console.groq.com](https://console.groq.com)
+3. Grant macOS permissions (see [Permissions](#permissions) below)
4. Launch:
```bash
open /Applications/FnKey.app
```
-5. Grant permissions in **System Settings → Privacy & Security**:
+5. Click the **○** menu bar icon → **Settings...** to configure your API endpoint
+
+## Configuration
+
+FnKey is configured via `~/.config/fnkey/config.toml`. A template is created automatically on first launch. Click **Settings...** in the menu bar to open it.
+
+```toml
+# API keys (optional — some local servers don't need one)
+api_key = "gsk_..." # Used for Whisper transcription
+polish_api_key = "" # Used for sanitizer (empty = use api_key)
+
+# API endpoints (default: Groq — any OpenAI-compatible API works)
+transcription_url = "https://api.groq.com/openai/v1/audio/transcriptions"
+polish_url = "https://api.groq.com/openai/v1/chat/completions"
+
+# Models (sent as-is in the API request — use whatever your server expects)
+whisper_model = "whisper-large-v3"
+polish_model = "llama-3.3-70b-versatile"
+
+# Hotkey: fn | option | control | shift | command
+hotkey = "fn"
+
+# Language hint (ISO-639-1 code: "en", "sk", "de", "fr", etc. Empty = auto-detect)
+language = ""
+
+# Always run text sanitization on every dictation (default: true)
+# When true, hold the polish modifier to get RAW Whisper output instead
+always_polish = true
+
+# Custom system prompt for text sanitization (empty = use built-in)
+polish_prompt = ""
+```
+
+### Custom API endpoints
+
+FnKey works with any OpenAI-compatible API. Examples:
+
+```toml
+# OpenAI for both
+api_key = "sk-..."
+transcription_url = "https://api.openai.com/v1/audio/transcriptions"
+polish_url = "https://api.openai.com/v1/chat/completions"
+whisper_model = "whisper-1"
+polish_model = "gpt-4o-mini"
+
+# Mixed: Groq for Whisper, OpenAI for sanitizer
+api_key = "gsk_..."
+polish_api_key = "sk-..."
+transcription_url = "https://api.groq.com/openai/v1/audio/transcriptions"
+polish_url = "https://api.openai.com/v1/chat/completions"
+whisper_model = "whisper-large-v3"
+polish_model = "gpt-4o-mini"
+
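+# Local / self-hosted Whisper (vLLM, faster-whisper-server, etc.)
+# Note: polish_url still defaults to Groq — also point it at a local server, or set always_polish = false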
+api_key = "not-needed"
+transcription_url = "http://localhost:8000/v1/audio/transcriptions"
+whisper_model = "my-model-name"
+```
+
+Both plain-text and JSON transcription responses are handled automatically.
+
+### Hotkey options
+
+| Hotkey | Config value | Polish modifier |
+|--------|-------------|-----------------|
+| Fn | `"fn"` (default) | Ctrl |
+| Option/Alt | `"option"` | Ctrl |
+| Control | `"control"` | Shift |
+| Shift | `"shift"` | Ctrl |
+| Command | `"command"` | Ctrl |
- | Permission | Purpose | How to Grant |
- |------------|---------|--------------|
- | **Input Monitoring** | Detect Fn key press | Add FnKey.app via + button |
- | **Microphone** | Record voice | Prompted on first use, or add manually |
- | **Accessibility** | Auto-paste text | Add FnKey.app via + button |
+When `hotkey = "control"`, the polish modifier switches to Shift to avoid conflict.
- Note: After rebuilding the app, you may need to remove and re-add it in these settings.
+### Backward compatibility
+
+FnKey checks for configuration in this order:
+1. `~/.config/fnkey/config.toml`
+2. `~/.config/fnkey/api_key` (legacy — plain text API key)
+3. `GROQ_API_KEY` environment variable
## Usage
-- Hold **Fn** and speak → raw transcription
-- Hold **Fn+Ctrl** and speak → polished transcription (removes filler words, improves sentence structure)
-- Release to transcribe and paste
-- Click menu bar icon (○) → Quit to exit
+- Hold **hotkey** → speak → release → cleaned text pasted at cursor
+- Hold **hotkey + polish modifier** → speak → release → raw Whisper output (bypasses sanitization)
+- Click menu bar icon **○** → **Settings...** to edit config, **Quit** to exit
The icon changes: ○ (idle) → ● (recording)
-## Build from source
+When `always_polish = false`, the behavior is inverted: hotkey gives raw output, hotkey + modifier gives polished output.
+
+## Text Sanitization
+
+FnKey includes an LLM-powered text sanitization step that runs after Whisper transcription. It fixes the common artifacts of speech-to-text: filler words, repeated words, broken grammar, and misheard terms.
+
+### How it works
+
+```
+Voice → [Whisper STT] → raw text → [LLM sanitizer] → clean text → clipboard → paste
+```
+
+The sanitizer is a lightweight LLM (as small as 0.6B parameters) that receives the raw Whisper output and a system prompt, then returns cleaned text. It uses any OpenAI-compatible chat completions endpoint.
+
+### Running locally
+
+For real-time dictation, the sanitizer must be fast. A small model (0.6B–3B) running locally can sanitize a sentence in under 200ms. Two recommended setups:
+
+#### llama.cpp (Linux/macOS, GPU or CPU)
+
+Download a small model like [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B-GGUF) and serve it:
```bash
-./build-app.sh
-cp -r FnKey.app /Applications/
+llama-server \
+ -m Qwen3-0.6B-Q6_K.gguf \
+ --port 8108 \
+ --host 0.0.0.0 \
+ -ngl 999 \
+ --jinja \
+ -c 4096
+
+# On macOS with Metal:
+llama-server -m Qwen3-0.6B-Q6_K.gguf --port 8108 -ngl 999 --jinja -c 4096
```
-Note: If cargo isn't found, run with login shell: `/bin/bash -l -c './build-app.sh'`
+Then configure FnKey:
+```toml
+polish_url = "http://localhost:8108/v1/chat/completions"
+polish_model = "Qwen3-0.6B-Q6_K.gguf"
+api_key = "not-needed"
+```
-## Features
+#### MLX (macOS Apple Silicon)
-- **Whisper large-v3** - Full model for best accuracy
-- **Audio enhancement** - DC offset removal, high-pass filter, peak normalization
-- **Config file** - API key stored in `~/.config/fnkey/api_key`
-- **Auto sample rate** - Uses device's native sample rate
+```bash
+pip install mlx-lm
+mlx_lm.server --model mlx-community/Qwen3-0.6B-4bit --port 8108
+```
-## TODO
+Then configure FnKey the same way.
-Features from Ito not yet implemented:
+#### Whisper locally
-- **Vocabulary hints** - Send prompt with proper nouns/technical terms to improve accuracy
-- **No-speech detection** - Use `verbose_json` response format and check `no_speech_prob` to skip silent recordings
-- **Custom dictionary** - User-configurable word list for domain-specific terms
+For the transcription side, run Whisper via [faster-whisper-server](https://github.com/fedirz/faster-whisper-server), [vLLM](https://docs.vllm.ai/), or [whisper.cpp](https://github.com/ggml-org/whisper.cpp)'s bundled `whisper-server`:
-## Notes
+```bash
+# faster-whisper-server (CUDA)
+pip install faster-whisper-server
+faster-whisper-server --model large-v3-turbo --port 8100
+
+# vLLM (CUDA)
+vllm serve openai/whisper-large-v3-turbo --port 8100
+
+# whisper.cpp
+whisper-server -m ggml-large-v3-turbo.bin --port 8100
+```
+
+### Custom system prompt
+
+The built-in prompt handles general dictation cleanup. For domain-specific accuracy, set `polish_prompt` in your config with a replacement dictionary for terms your STT engine commonly misrecognizes:
+
+```toml
+polish_prompt = """Fix dictation. Remove filler words. Fix grammar and punctuation.
+Replace misheard terms:
+- clod dot MD/cloud dot MD → CLAUDE.md
+- agents dot MD → agents.md
+- lama dot CPP/llama dot CPP → llama.cpp
+- quan three/qan three → Qwen3
+- M L X → MLX
+- tailscale/tail scale → Tailscale
+Output ONLY the corrected text. /no_think"""
+```
+
+The `/no_think` suffix disables reasoning on Qwen3 models, keeping response time under 200ms.
+
+**Adapt this to your codebase.** If you dictate about Kubernetes, add `cooper netties → Kubernetes`. If you work on a project called "Nexus", add `nexus/next us → Nexus`. The replacement dictionary is the key to making 0.6B models accurate for your domain.
+
+### Recommended models
+
+| Model | Size | Speed | Notes |
+|-------|------|-------|-------|
+| Qwen3-0.6B | 600MB | ~270 t/s | Best speed, needs explicit replacement dictionary |
+| Qwen2.5-1.5B | 1.5GB | ~150 t/s | Better understanding, less dictionary needed |
+| Qwen3-1.7B | 1.7GB | ~120 t/s | Good balance of speed and quality |
+
+For the sanitizer, smaller is better — the task is simple pattern matching and cleanup, not reasoning. Use `/no_think` with Qwen3 models to disable chain-of-thought and keep latency low.
+
+## Permissions
+
+FnKey requires three macOS permissions. All are configured in **System Settings → Privacy & Security**.
+
+| Permission | Why | How to grant |
+|------------|-----|--------------|
+| **Input Monitoring** | Detect hotkey press/release | System Settings → Input Monitoring → click **+** → select FnKey.app |
+| **Microphone** | Record voice while hotkey is held | Prompted automatically on first recording, or add manually |
+| **Accessibility** | Simulate ⌘V to paste transcribed text | System Settings → Accessibility → click **+** → select FnKey.app |
+
+### After rebuilding from source
+
+When you rebuild and re-codesign the app, macOS **invalidates all previously granted permissions** because the binary signature changes. You must:
+
+1. Open **System Settings → Privacy & Security**
+2. For **Input Monitoring** and **Accessibility**: remove FnKey, then re-add `/Applications/FnKey.app`
+3. Relaunch the app
+
+The **Microphone** permission is usually re-prompted automatically.
+
+### Troubleshooting permissions
+
+| Symptom | Cause | Fix |
+|---------|-------|-----|
+| App launches but hotkey does nothing | Input Monitoring not granted | Add FnKey to Input Monitoring |
+| Hotkey records but text doesn't paste | Accessibility not granted | Add FnKey to Accessibility |
+| No microphone indicator when holding hotkey | Microphone not granted | Add FnKey to Microphone, or approve the prompt |
+| Permissions are granted but app still doesn't work | Stale permission after rebuild | Remove and re-add FnKey in each permission category |
+
+## Build from source
+
+```bash
+cargo build --release
+```
+
+To create an .app bundle:
+
+```bash
+./build-app.sh
+cp -r FnKey.app /Applications/
+```
+
+To regenerate the app icon (requires Python + Pillow):
+
+```bash
+python3 -m venv .venv && source .venv/bin/activate && pip install Pillow
+python3 gen-icon.py
+```
+
+Note: If cargo isn't found, run with login shell: `/bin/bash -l -c './build-app.sh'`
+
+## Features
-- Falls back to Option key if Fn not detected after 5s
-- Floating red dot appears during recording
+- **Text sanitization** — LLM-powered cleanup of filler words, repeated words, grammar, and misheard terms
+- **Configurable hotkey** — Fn, Option, Control, Shift, or Command
+- **Custom API endpoints** — any OpenAI-compatible transcription/chat API (Groq, OpenAI, vLLM, faster-whisper, etc.)
+- **Custom system prompt** — domain-specific replacement dictionaries for accurate technical dictation
+- **Audio enhancement** — DC offset removal, high-pass filter, peak normalization
+- **TOML config** — `~/.config/fnkey/config.toml` with Settings menu item
+- **Auto sample rate** — uses device's native sample rate
+- **JSON response handling** — works with servers that return JSON instead of plain text
## Known Limitations
-**Slight recording delay**: There's a brief moment when you start speaking before audio capture begins. This is a deliberate tradeoff — eliminating this delay would require the microphone to be always active, showing the yellow indicator constantly. The current design prioritizes privacy: the microphone only activates when you press the Fn key.
+**Slight recording delay**: There's a brief moment when you start speaking before audio capture begins. This is a deliberate tradeoff — eliminating this delay would require the microphone to be always active, showing the yellow indicator constantly. The current design prioritizes privacy: the microphone only activates when you press the hotkey.
diff --git a/build-app.sh b/build-app.sh
index fd90dd6..7f745a1 100755
--- a/build-app.sh
+++ b/build-app.sh
@@ -17,6 +17,9 @@ mkdir -p "$BUNDLE_DIR/Contents/Resources"
cp target/release/fnkey "$BUNDLE_DIR/Contents/MacOS/"
cp Info.plist "$BUNDLE_DIR/Contents/"
+if [ -f AppIcon.icns ]; then
+ cp AppIcon.icns "$BUNDLE_DIR/Contents/Resources/"
+fi
echo "Signing app..."
codesign --force --deep --sign "FnKey Dev" "$BUNDLE_DIR"
diff --git a/gen-icon.py b/gen-icon.py
new file mode 100644
index 0000000..cb6bb51
--- /dev/null
+++ b/gen-icon.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""Generate FnKey app icon using Pillow."""
+import math
+import os
+import shutil
+import subprocess
+from PIL import Image, ImageDraw, ImageFont
+
+
+def draw_icon(size):
+ """Draw the FnKey icon at given pixel size."""
+ img = Image.new("RGBA", (size, size), (0, 0, 0, 0))
+ draw = ImageDraw.Draw(img)
+
+ s = size
+ pad = int(s * 0.08)
+ corner_r = int(s * 0.22)
+
+ # === Background: dark rounded square ===
+ draw.rounded_rectangle(
+ [pad, pad, s - pad, s - pad],
+ radius=corner_r,
+ fill=(28, 28, 32, 255),
+ )
+
+ # Subtle border
+ inset = pad + max(1, int(s * 0.006))
+ draw.rounded_rectangle(
+ [inset, inset, s - inset, s - inset],
+ radius=corner_r - max(1, int(s * 0.006)),
+ outline=(60, 60, 72, 130),
+ width=max(1, int(s * 0.004)),
+ )
+
+ cx = s / 2
+ cy = s / 2
+
+ # === Sound wave arcs ===
+ for radius_frac, alpha in [(0.30, 35), (0.23, 55), (0.16, 80)]:
+ r = s * radius_frac
+ arc_w = max(1, int(s * 0.013))
+ arc_color = (100, 190, 255, alpha)
+ # Right arcs
+ bbox = [cx + s*0.02 - r, cy + s*0.04 - r, cx + s*0.02 + r, cy + s*0.04 + r]
+ draw.arc(bbox, start=-50, end=50, fill=arc_color, width=arc_w)
+ # Left arcs
+ bbox = [cx - s*0.02 - r, cy + s*0.04 - r, cx - s*0.02 + r, cy + s*0.04 + r]
+ draw.arc(bbox, start=130, end=230, fill=arc_color, width=arc_w)
+
+ # === Microphone ===
+ mic_w = s * 0.14
+ mic_h = s * 0.24
+ mic_x = cx - mic_w / 2
+ mic_y = cy - mic_h * 0.15
+
+ mic_color = (100, 195, 255, 240)
+
+ # Mic capsule (pill shape)
+ mic_r = mic_w / 2
+ draw.rounded_rectangle(
+ [mic_x, mic_y, mic_x + mic_w, mic_y + mic_h],
+ radius=int(mic_r),
+ fill=mic_color,
+ )
+
+ # Grille lines on mic
+ grille_color = (35, 100, 160, 100)
+ num_lines = 4
+ grille_top = mic_y + mic_h * 0.28
+ grille_bot = mic_y + mic_h * 0.82
+ line_w = max(1, int(s * 0.005))
+ for i in range(num_lines):
+ ly = grille_top + i * (grille_bot - grille_top) / (num_lines - 1)
+ lx1 = mic_x + mic_w * 0.2
+ lx2 = mic_x + mic_w * 0.8
+ draw.line([(lx1, ly), (lx2, ly)], fill=grille_color, width=line_w)
+
+ # === Mic stand ===
+ stand_color = (100, 195, 255, 200)
+ stand_w = max(1, int(s * 0.016))
+
+ # U-cradle arc
+ cradle_r = mic_w * 0.85
+ cradle_cy = mic_y + mic_h * 0.08
+ bbox = [cx - cradle_r, cradle_cy - cradle_r, cx + cradle_r, cradle_cy + cradle_r]
+ draw.arc(bbox, start=0, end=180, fill=stand_color, width=stand_w)
+
+ # Vertical stem
+ stem_top = cradle_cy + cradle_r
+ stem_bottom = stem_top + s * 0.07
+ draw.line([(cx, stem_top), (cx, stem_bottom)], fill=stand_color, width=stand_w)
+
+ # Base
+ base_w = s * 0.12
+ draw.line(
+ [(cx - base_w/2, stem_bottom), (cx + base_w/2, stem_bottom)],
+ fill=stand_color,
+ width=stand_w,
+ )
+
+ # === "fn" text at bottom ===
+ font_size = int(s * 0.16)
+ try:
+ font = ImageFont.truetype("/System/Library/Fonts/HelveticaNeue.ttc", font_size, index=8) # Bold
+ except (OSError, IndexError):
+ try:
+ font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", font_size)
+ except OSError:
+ font = ImageFont.load_default()
+
+ text = "fn"
+ text_bbox = draw.textbbox((0, 0), text, font=font)
+ tw = text_bbox[2] - text_bbox[0]
+ th = text_bbox[3] - text_bbox[1]
+
+ text_x = cx - tw / 2 - text_bbox[0]
+ text_y = pad + s * 0.06
+
+ draw.text((text_x, text_y), text, fill=(255, 255, 255, 230), font=font)
+
+ return img
+
+
+def main():
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ iconset_dir = os.path.join(script_dir, "AppIcon.iconset")
+ os.makedirs(iconset_dir, exist_ok=True)
+
+ sizes = [
+ (16, 1), (16, 2),
+ (32, 1), (32, 2),
+ (128, 1), (128, 2),
+ (256, 1), (256, 2),
+ (512, 1), (512, 2),
+ ]
+
+ for base_size, scale in sizes:
+ px = base_size * scale
+ if scale == 1:
+ name = f"icon_{base_size}x{base_size}.png"
+ else:
+ name = f"icon_{base_size}x{base_size}@{scale}x.png"
+
+ path = os.path.join(iconset_dir, name)
+ img = draw_icon(px)
+ img.save(path, "PNG")
+ print(f" {name} ({px}x{px})")
+
+ # Convert to .icns
+ icns_path = os.path.join(script_dir, "AppIcon.icns")
+ subprocess.run(["iconutil", "-c", "icns", iconset_dir, "-o", icns_path], check=True)
+ print(f"\nCreated {icns_path}")
+
+ shutil.rmtree(iconset_dir)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/main.rs b/src/main.rs
index 8fa3492..13edfd3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -26,7 +26,8 @@ use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::Stream;
use hound::{WavSpec, WavWriter};
-use objc::runtime::Object;
+use objc::declare::ClassDecl;
+use objc::runtime::{Object, Sel};
use objc::{class, msg_send, sel, sel_impl};
// ============================================================================
@@ -135,63 +136,210 @@ fn get_paste_keycode() -> u16 {
// Main application
// ============================================================================
-// Fn key flag in CGEventFlags
+// Modifier key flags in CGEventFlags
const FN_KEY_FLAG: u64 = 0x800000;
-// Option/Alt key flag
const OPTION_KEY_FLAG: u64 = 0x80000;
-// Control key flag
const CONTROL_KEY_FLAG: u64 = 0x40000;
+const SHIFT_KEY_FLAG: u64 = 0x20000;
+const COMMAND_KEY_FLAG: u64 = 0x100000;
-struct AppState {
- audio_buffer: Arc>>,
+// ============================================================================
+// Configuration
+// ============================================================================
+
+fn default_api_key() -> String {
+ String::new()
+}
+fn default_transcription_url() -> String {
+ "https://api.groq.com/openai/v1/audio/transcriptions".to_string()
+}
+fn default_polish_url() -> String {
+ "https://api.groq.com/openai/v1/chat/completions".to_string()
+}
+fn default_whisper_model() -> String {
+ "whisper-large-v3".to_string()
+}
+fn default_polish_model() -> String {
+ "llama-3.3-70b-versatile".to_string()
+}
+fn default_hotkey() -> String {
+ "fn".to_string()
+}
+fn default_language() -> String {
+ String::new() // empty = auto-detect
+}
+fn default_always_polish() -> bool {
+ true
+}
+fn default_polish_prompt() -> String {
+ String::new() // empty = use built-in prompt
+}
+fn default_polish_api_key() -> String {
+ String::new() // empty = use api_key
+}
+
+#[derive(serde::Deserialize, Clone)]
+struct Config {
+ #[serde(default = "default_api_key")]
api_key: String,
- use_fn_key: AtomicBool,
- sample_rate: std::sync::atomic::AtomicU32,
+ #[serde(default = "default_transcription_url")]
+ transcription_url: String,
+ #[serde(default = "default_polish_url")]
+ polish_url: String,
+ #[serde(default = "default_whisper_model")]
+ whisper_model: String,
+ #[serde(default = "default_polish_model")]
+ polish_model: String,
+ #[serde(default = "default_hotkey")]
+ hotkey: String,
+ #[serde(default = "default_language")]
+ language: String,
+ #[serde(default = "default_always_polish")]
+ always_polish: bool,
+ #[serde(default = "default_polish_prompt")]
+ polish_prompt: String,
+ #[serde(default = "default_polish_api_key")]
+ polish_api_key: String,
}
-// Global status item pointer for updating from callbacks
-static mut STATUS_ITEM: *mut Object = std::ptr::null_mut();
-// Global audio stream (not Send, so can't be in Arc)
-static mut AUDIO_STREAM: Option = None;
+impl Config {
+ /// Returns the CGEventFlags bitmask for the configured hotkey
+ fn hotkey_flag(&self) -> u64 {
+ match self.hotkey.as_str() {
+ "option" => OPTION_KEY_FLAG,
+ "control" => CONTROL_KEY_FLAG,
+ "shift" => SHIFT_KEY_FLAG,
+ "command" => COMMAND_KEY_FLAG,
+ _ => FN_KEY_FLAG, // "fn" or any unrecognized value
+ }
+ }
-/// Get API key from config file or environment variable.
-/// Checks ~/.config/fnkey/api_key first, then GROQ_API_KEY env var.
-fn get_api_key() -> Option {
- // Try config file first
+ /// Returns the modifier flag used to trigger polish mode.
+ /// Normally Ctrl, but if hotkey is already Ctrl, use Shift instead.
+ fn polish_flag(&self) -> u64 {
+ if self.hotkey == "control" {
+ SHIFT_KEY_FLAG
+ } else {
+ CONTROL_KEY_FLAG
+ }
+ }
+
+ /// API key for the polish/sanitizer endpoint.
+ /// Falls back to the main api_key when polish_api_key is not set.
+ fn polish_key(&self) -> &str {
+ if self.polish_api_key.is_empty() {
+ &self.api_key
+ } else {
+ &self.polish_api_key
+ }
+ }
+}
+
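+/// Load configuration from TOML file, legacy api_key file, or environment variable.
+/// Always returns a Config — writes a commented-out template to config.toml when no source yields a config.
+/// NOTE(review): a config.toml that exists but fails to parse is skipped silently, and is then overwritten by that template if neither the legacy key file nor GROQ_API_KEY is present.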
+fn load_config() -> Config {
if let Some(home) = env::var_os("HOME") {
- let config_path = std::path::Path::new(&home)
- .join(".config")
- .join("fnkey")
- .join("api_key");
- if let Ok(key) = std::fs::read_to_string(&config_path) {
+ let config_dir = std::path::Path::new(&home).join(".config").join("fnkey");
+
+ // Try config.toml first
+ let toml_path = config_dir.join("config.toml");
+ if let Ok(contents) = std::fs::read_to_string(&toml_path) {
+ if let Ok(config) = toml::from_str::(&contents) {
+ return config;
+ }
+ }
+
+ // Try legacy api_key file
+ let key_path = config_dir.join("api_key");
+ if let Ok(key) = std::fs::read_to_string(&key_path) {
let key = key.trim();
if !key.is_empty() {
- return Some(key.to_string());
+ return Config {
+ api_key: key.to_string(),
+ transcription_url: default_transcription_url(),
+ polish_url: default_polish_url(),
+ whisper_model: default_whisper_model(),
+ polish_model: default_polish_model(),
+ hotkey: default_hotkey(),
+ language: default_language(),
+ always_polish: default_always_polish(),
+ polish_prompt: default_polish_prompt(),
+ polish_api_key: default_polish_api_key(),
+ };
}
}
+
+ // Try environment variable
+ if let Ok(key) = env::var("GROQ_API_KEY") {
+ return Config {
+ api_key: key,
+ transcription_url: default_transcription_url(),
+ polish_url: default_polish_url(),
+ whisper_model: default_whisper_model(),
+ polish_model: default_polish_model(),
+ hotkey: default_hotkey(),
+ language: default_language(),
+ always_polish: default_always_polish(),
+ polish_prompt: default_polish_prompt(),
+ polish_api_key: default_polish_api_key(),
+ };
+ }
+
+ // No config found — create a default config.toml for the user to edit
+ let _ = std::fs::create_dir_all(&config_dir);
+ let default_toml = r#"# FnKey configuration — edit and relaunch
+# api_key = "your-api-key"
+# polish_api_key = "" # Separate key for sanitizer (empty = use api_key)
+# transcription_url = "https://your-server/v1/audio/transcriptions"
+# polish_url = "https://your-server/v1/chat/completions"
+# whisper_model = "whisper-large-v3"
+# polish_model = "llama-3.3-70b-versatile"
+# hotkey = "fn"
+# language = "" # ISO-639-1 code: "en", "sk", "de", "fr", etc. Empty = auto-detect
+# always_polish = true # Always run LLM cleanup on transcriptions (Ctrl modifier skips it)
+# polish_prompt = "" # Custom system prompt for polish mode (empty = use built-in)
+"#;
+ let _ = std::fs::write(&toml_path, default_toml);
}
- // Fall back to environment variable
- env::var("GROQ_API_KEY").ok()
+
+ // Return defaults — app will launch but transcription won't work until configured
+ Config {
+ api_key: default_api_key(),
+ transcription_url: default_transcription_url(),
+ polish_url: default_polish_url(),
+ whisper_model: default_whisper_model(),
+ polish_model: default_polish_model(),
+ hotkey: default_hotkey(),
+ language: default_language(),
+ always_polish: default_always_polish(),
+ polish_prompt: default_polish_prompt(),
+ polish_api_key: default_polish_api_key(),
+ }
+}
+
+struct AppState {
+ audio_buffer: Arc>>,
+ config: Config,
+ sample_rate: std::sync::atomic::AtomicU32,
}
+// Global status item pointer for updating from callbacks
+static mut STATUS_ITEM: *mut Object = std::ptr::null_mut();
+// Global audio stream (not Send, so can't be in Arc)
+static mut AUDIO_STREAM: Option = None;
+
fn main() {
- let api_key = get_api_key().unwrap_or_else(|| {
- show_alert(
- "GROQ_API_KEY not configured",
- "Please create ~/.config/fnkey/api_key with your Groq API key.\n\nExample:\n mkdir -p ~/.config/fnkey\n echo 'gsk_your_key_here' > ~/.config/fnkey/api_key"
- );
- std::process::exit(1);
- });
+ let config = load_config();
- // Check Input Monitoring permission
- if !check_input_monitoring_permission() {
- std::process::exit(1);
- }
+ // Eagerly build keycode map on main thread — Carbon TIS APIs require main thread
+ let _ = get_paste_keycode();
+
+ // Request Input Monitoring permission (non-blocking — app continues either way)
+ check_input_monitoring_permission();
let state = Arc::new(AppState {
audio_buffer: Arc::new(Mutex::new(Vec::new())),
- api_key,
- use_fn_key: AtomicBool::new(true),
+ config,
sample_rate: std::sync::atomic::AtomicU32::new(48000), // Default, will be updated
});
@@ -209,29 +357,18 @@ fn main() {
run_event_tap(state);
}
-fn check_input_monitoring_permission() -> bool {
+/// Request macOS Input Monitoring (listen-event) access when it has not been granted yet.
+///
+/// Calls `CGPreflightListenEventAccess`; only when that returns `false` does it call
+/// `CGRequestListenEventAccess`. The outcome of the request is discarded and nothing is
+/// returned, so the caller continues whether or not access was granted.
+fn check_input_monitoring_permission() {
unsafe {
- // CGPreflightListenEventAccess and CGRequestListenEventAccess
#[link(name = "CoreGraphics", kind = "framework")]
extern "C" {
fn CGPreflightListenEventAccess() -> bool;
fn CGRequestListenEventAccess() -> bool;
}
- if CGPreflightListenEventAccess() {
- return true;
+ if !CGPreflightListenEventAccess() {
+ // Request permission - shows system dialog on first run
+ CGRequestListenEventAccess();
}
-
- // Request permission - this shows system dialog
- if CGRequestListenEventAccess() {
- return true;
- }
-
- show_alert(
- "Input Monitoring Required",
- "FnKey needs Input Monitoring permission to detect the Fn key.\n\nPlease grant access in System Settings → Privacy & Security → Input Monitoring, then relaunch FnKey.",
- );
- false
}
}
@@ -249,6 +386,53 @@ fn show_alert(title: &str, message: &str) {
}
}
+/// Objective-C action callback for the "Settings..." menu item: opens
+/// `$HOME/.config/fnkey/config.toml` through `NSWorkspace`.
+///
+/// The config directory and a fully commented-out template file are created first when
+/// they are missing. File-system errors are deliberately ignored (best effort); the open
+/// request is still issued afterwards.
+///
+/// Does nothing when `HOME` is unset. If the resulting path is not valid UTF-8 the file
+/// is still created but not opened. This function is called from the Objective-C
+/// runtime, so it must not panic: no `unwrap()` is used on any fallible step.
+extern "C" fn open_settings(_this: &Object, _cmd: Sel, _sender: id) {
+    let home = match env::var_os("HOME") {
+        Some(home) => home,
+        None => return,
+    };
+    // Build the directory first so no `parent().unwrap()` is needed later.
+    let config_dir = std::path::Path::new(&home).join(".config").join("fnkey");
+    let config_path = config_dir.join("config.toml");
+
+    // Ensure file exists
+    let _ = std::fs::create_dir_all(&config_dir);
+    if !config_path.exists() {
+        let default_toml = r#"# FnKey configuration — edit and relaunch
+# api_key = "your-api-key"
+# polish_api_key = "" # Separate key for sanitizer (empty = use api_key)
+# transcription_url = "https://your-server/v1/audio/transcriptions"
+# polish_url = "https://your-server/v1/chat/completions"
+# whisper_model = "whisper-large-v3"
+# polish_model = "llama-3.3-70b-versatile"
+# hotkey = "fn"
+# language = "" # ISO-639-1 code: "en", "sk", "de", "fr", etc. Empty = auto-detect
+# always_polish = true # Always run LLM cleanup on transcriptions (Ctrl modifier skips it)
+# polish_prompt = "" # Custom system prompt for polish mode (empty = use built-in)
+"#;
+        let _ = std::fs::write(&config_path, default_toml);
+    }
+
+    // `init_str` needs a `&str`; bail out rather than panic across the FFI boundary.
+    let path = match config_path.to_str() {
+        Some(path) => path,
+        None => return,
+    };
+
+    // SAFETY: only messages well-known Foundation/AppKit classes with arguments of the
+    // types their selectors expect; `path_str` is a valid object until released below.
+    unsafe {
+        let workspace: id = msg_send![class!(NSWorkspace), sharedWorkspace];
+        let path_str = NSString::alloc(nil).init_str(path);
+        let url: id = msg_send![class!(NSURL), fileURLWithPath: path_str];
+        let _: bool = msg_send![workspace, openURL: url];
+        // `alloc` + `init` returned an owned string; release it so repeated clicks on
+        // the menu item do not leak one NSString each time.
+        let _: () = msg_send![path_str, release];
+    }
+}
+
+/// Create the object used as the target of the menu's `openSettings:` action.
+///
+/// Declares an `NSObject` subclass named `FnKeyMenuDelegate` whose `openSettings:`
+/// method is `open_settings`, registers it with the Objective-C runtime and returns a
+/// freshly created instance. When the class cannot be declared (the runtime refuses a
+/// name that is already in use, e.g. on a second call) the existing class is looked up
+/// and reused instead of unwrapping a `None`.
+///
+/// The returned instance is owned by the caller (+1 from `new`) and is never released
+/// here, so it outlives the menu item that targets it.
+///
+/// # Safety
+///
+/// Sends Objective-C messages and registers a method with a raw function pointer; the
+/// Objective-C runtime must be available and `open_settings` must keep the
+/// `extern "C" fn(&Object, Sel, id)` signature registered here.
+unsafe fn register_menu_delegate() -> id {
+    let cls = match ClassDecl::new("FnKeyMenuDelegate", class!(NSObject)) {
+        Some(mut decl) => {
+            decl.add_method(
+                sel!(openSettings:),
+                open_settings as extern "C" fn(&Object, Sel, id),
+            );
+            decl.register()
+        }
+        // Declaration failed, presumably because the name is already registered:
+        // fall back to the class the runtime already knows under that name.
+        None => class!(FnKeyMenuDelegate),
+    };
+    // `new` (alloc + init) already returns an owned reference, so the extra `retain`
+    // the previous version sent is not needed to keep the delegate alive.
+    let obj: id = msg_send![cls, new];
+    obj
+}
+
unsafe fn create_status_item() {
let status_bar: id = msg_send![class!(NSStatusBar), systemStatusBar];
let status_item: id = msg_send![status_bar, statusItemWithLength: -1.0_f64]; // NSVariableStatusItemLength
@@ -260,9 +444,24 @@ unsafe fn create_status_item() {
let button: id = msg_send![status_item, button];
let _: () = msg_send![button, setTitle: title];
+ // Register menu delegate for Settings action
+ let delegate = register_menu_delegate();
+
// Create menu
let menu: id = NSMenu::new(nil);
+ // Settings item
+ let settings_title = NSString::alloc(nil).init_str("Settings...");
+ let settings_key = NSString::alloc(nil).init_str(",");
+ let settings_item: id = msg_send![class!(NSMenuItem), alloc];
+ let settings_item: id = msg_send![settings_item, initWithTitle: settings_title action: sel!(openSettings:) keyEquivalent: settings_key];
+ let _: () = msg_send![settings_item, setTarget: delegate];
+ let _: () = msg_send![menu, addItem: settings_item];
+
+ // Separator
+ let separator: id = msg_send![class!(NSMenuItem), separatorItem];
+ let _: () = msg_send![menu, addItem: separator];
+
// Quit item
let quit_title = NSString::alloc(nil).init_str("Quit FnKey");
let quit_key = NSString::alloc(nil).init_str("q");
@@ -288,13 +487,14 @@ fn update_status_icon(recording: bool) {
fn run_event_tap(state: Arc) {
let state_for_callback = Arc::clone(&state);
- let fn_detected = Arc::new(AtomicBool::new(false));
let was_pressed = Arc::new(AtomicBool::new(false));
- let ctrl_was_held = Arc::new(AtomicBool::new(false));
+ let polish_latched = Arc::new(AtomicBool::new(false));
- let fn_detected_clone = Arc::clone(&fn_detected);
let was_pressed_clone = Arc::clone(&was_pressed);
- let ctrl_latched_clone = Arc::clone(&ctrl_was_held); // Latches true if Ctrl pressed anytime during recording
+ let polish_latched_clone = Arc::clone(&polish_latched);
+
+ let hotkey_flag = state.config.hotkey_flag();
+ let polish_flag = state.config.polish_flag();
let tap = CGEventTap::new(
CGEventTapLocation::HID,
@@ -304,65 +504,54 @@ fn run_event_tap(state: Arc) {
move |_, _, event| {
let flags = event.get_flags().bits();
- // Check Fn key first, then Option as fallback
- let fn_pressed = (flags & FN_KEY_FLAG) != 0;
- let option_pressed = (flags & OPTION_KEY_FLAG) != 0;
- let ctrl_pressed = (flags & CONTROL_KEY_FLAG) != 0;
-
- let use_fn = state_for_callback.use_fn_key.load(Ordering::SeqCst);
- let key_pressed = if use_fn { fn_pressed } else { option_pressed };
-
- // Detect if Fn key works (first time detection)
- if fn_pressed && !fn_detected_clone.load(Ordering::SeqCst) {
- fn_detected_clone.store(true, Ordering::SeqCst);
- }
+ let key_pressed = (flags & hotkey_flag) != 0;
+ let polish_held = (flags & polish_flag) != 0;
let prev_pressed = was_pressed_clone.load(Ordering::SeqCst);
- // Handle key state changes
if key_pressed && !prev_pressed {
- // Key pressed - start recording, reset Ctrl latch
- ctrl_latched_clone.store(false, Ordering::SeqCst);
+ // Key pressed - start recording, reset polish latch
+ polish_latched_clone.store(false, Ordering::SeqCst);
start_recording(&state_for_callback);
} else if !key_pressed && prev_pressed {
// Key released - stop recording and transcribe
- let polish = ctrl_latched_clone.load(Ordering::SeqCst);
+ let polish = polish_latched_clone.load(Ordering::SeqCst);
stop_recording(&state_for_callback, polish);
}
- // Latch Ctrl if pressed anytime during recording
- if key_pressed && ctrl_pressed {
- ctrl_latched_clone.store(true, Ordering::SeqCst);
+ // Latch polish modifier if held anytime during recording
+ if key_pressed && polish_held {
+ polish_latched_clone.store(true, Ordering::SeqCst);
}
was_pressed_clone.store(key_pressed, Ordering::SeqCst);
None
},
- )
- .expect("Failed to create event tap - check Input Monitoring permissions");
+ );
- let source = tap
- .mach_port
- .create_runloop_source(0)
- .expect("Failed to create runloop source");
+ match tap {
+ Ok(tap) => {
+ let source = tap
+ .mach_port
+ .create_runloop_source(0)
+ .expect("Failed to create runloop source");
- let run_loop = CFRunLoop::get_current();
- run_loop.add_source(&source, unsafe { kCFRunLoopCommonModes });
+ let run_loop = CFRunLoop::get_current();
+ run_loop.add_source(&source, unsafe { kCFRunLoopCommonModes });
- tap.enable();
+ tap.enable();
- // Fallback timer: if no Fn detected in 5 seconds, switch to Option
- let state_fallback = Arc::clone(&state);
- let fn_detected_fallback = Arc::clone(&fn_detected);
- thread::spawn(move || {
- thread::sleep(Duration::from_secs(5));
- if !fn_detected_fallback.load(Ordering::SeqCst) && state_fallback.use_fn_key.load(Ordering::SeqCst) {
- state_fallback.use_fn_key.store(false, Ordering::SeqCst);
+ // tap + source must stay alive while the run loop is running
+ unsafe { NSApp().run(); }
+ }
+ Err(_) => {
+ show_alert(
+ "Input Monitoring Required",
+ "FnKey can't detect hotkey presses.\n\nGo to System Settings → Privacy & Security → Input Monitoring, remove FnKey, re-add it, then relaunch.",
+ );
+ // Still run the app so the menu bar icon (Settings/Quit) is usable
+ unsafe { NSApp().run(); }
}
- });
-
- unsafe {
- NSApp().run();
}
}
@@ -451,48 +640,76 @@ fn stop_recording(state: &Arc, polish: bool) {
}
// Transcribe in background
- let api_key = state.api_key.clone();
+ let config = state.config.clone();
let sample_rate = state.sample_rate.load(Ordering::SeqCst);
thread::spawn(move || {
- transcribe_and_paste(audio_data, sample_rate, &api_key, polish);
+ transcribe_and_paste(audio_data, sample_rate, &config, polish);
});
}
-fn transcribe_and_paste(audio: Vec, sample_rate: u32, api_key: &str, polish: bool) {
+fn transcribe_and_paste(audio: Vec, sample_rate: u32, config: &Config, polish: bool) {
+ let duration_secs = audio.len() as f32 / sample_rate as f32;
+ eprintln!("[fnkey] audio: {:.1}s, {} samples, {}Hz, {:.0}KB raw",
+ duration_secs, audio.len(), sample_rate, audio.len() as f32 * 4.0 / 1024.0);
+
let wav_data = match encode_wav(&audio, sample_rate) {
Ok(data) => data,
Err(_) => return,
};
+ eprintln!("[fnkey] wav: {:.0}KB", wav_data.len() as f32 / 1024.0);
let client = reqwest::blocking::Client::new();
- let form = reqwest::blocking::multipart::Form::new()
- .text("model", "whisper-large-v3") // Full model for better accuracy (vs turbo)
- .text("response_format", "text")
- .part(
- "file",
- reqwest::blocking::multipart::Part::bytes(wav_data)
- .file_name("audio.wav")
- .mime_str("audio/wav")
- .unwrap(),
- );
+ let mut form = reqwest::blocking::multipart::Form::new()
+ .text("model", config.whisper_model.clone())
+ .text("response_format", "text");
+
+ // Send language hint to Whisper if configured (ISO-639-1 code)
+ if !config.language.is_empty() {
+ form = form.text("language", config.language.clone());
+ }
+
+ let form = form.part(
+ "file",
+ reqwest::blocking::multipart::Part::bytes(wav_data)
+ .file_name("audio.wav")
+ .mime_str("audio/wav")
+ .unwrap(),
+ );
let response = client
- .post("https://api.groq.com/openai/v1/audio/transcriptions")
- .header("Authorization", format!("Bearer {}", api_key))
+ .post(&config.transcription_url)
+ .header("Authorization", format!("Bearer {}", config.api_key))
.multipart(form)
.timeout(Duration::from_secs(30))
.send();
if let Ok(resp) = response {
if resp.status().is_success() {
- if let Ok(text) = resp.text() {
- let text = text.trim();
+ if let Ok(raw) = resp.text() {
+ eprintln!("[fnkey] whisper raw response ({} bytes): {:.200}", raw.len(), raw);
+ // Handle both plain text and JSON responses
+ // Some servers (e.g. vLLM) return {"text":"..."} even with response_format=text
+ let text = if raw.trim_start().starts_with('{') {
+ serde_json::from_str::(raw.trim())
+ .ok()
+ .and_then(|v| v.get("text")?.as_str().map(String::from))
+ .unwrap_or_else(|| raw.trim().to_string())
+ } else {
+ raw.trim().to_string()
+ };
+
if !text.is_empty() {
- // Apply polish if requested, fallback to raw on error
- let final_text = if polish {
- polish_text(text, api_key).unwrap_or_else(|| text.to_string())
+ eprintln!("[fnkey] whisper text: {}", text);
+ // When always_polish is on: polish by default, Ctrl modifier = raw
+ // When always_polish is off: raw by default, Ctrl modifier = polish
+ let should_polish = if config.always_polish { !polish } else { polish };
+ let final_text = if should_polish {
+ let polished = polish_text(&text, config).unwrap_or_else(|| text.clone());
+ eprintln!("[fnkey] polished: {}", polished);
+ polished
} else {
- text.to_string()
+ eprintln!("[fnkey] raw (no polish)");
+ text
};
if let Ok(mut clipboard) = Clipboard::new() {
@@ -546,27 +763,49 @@ struct ChatMessage {
/// Polish transcribed text using LLM to convert spoken style to written prose.
/// Returns None on any error (caller should fall back to raw text).
-fn polish_text(text: &str, api_key: &str) -> Option {
+fn polish_text(text: &str, config: &Config) -> Option {
let client = reqwest::blocking::Client::new();
+ let system_prompt = if !config.polish_prompt.is_empty() {
+ config.polish_prompt.clone()
+ } else if config.language.is_empty() || config.language == "en" {
+ "Fix dictation. Remove filler words (um, uh, like, you know, basically). \
+ Remove repeated words. Fix grammar and punctuation. \
+ Keep the same tone and meaning. Output ONLY the corrected text. /no_think".to_string()
+ } else {
+ format!(
+ "Fix dictation in language \"{}\". Remove filler words and hesitations. \
+ Remove repeated words. Fix grammar and punctuation. \
+ Keep the same language, tone and meaning. Do NOT translate. \
+ Output ONLY the corrected text. /no_think",
+ config.language
+ )
+ };
+
+ // Cap output tokens: rough estimate of input tokens (words * 1.3) doubled as headroom,
+ // with a floor of 64 and ceiling of 1024. Prevents hallucination runaway on small models.
+ let estimated_tokens = (text.split_whitespace().count() as f32 * 1.3 * 2.0) as u64;
+ let max_tokens = estimated_tokens.clamp(64, 1024);
+
let body = serde_json::json!({
- "model": "llama-3.3-70b-versatile",
+ "model": config.polish_model,
"messages": [
{
"role": "system",
- "content": "Clean up this voice message for texting. Remove filler words (um, uh, like, you know). Fix punctuation and sentence structure. Break up run-on sentences. Keep it casual. No trailing period. Output ONLY the cleaned text - no explanations, no quotes."
+ "content": system_prompt
},
{
"role": "user",
"content": text
}
],
- "temperature": 0.2
+ "temperature": 0.2,
+ "max_tokens": max_tokens
});
let response = client
- .post("https://api.groq.com/openai/v1/chat/completions")
- .header("Authorization", format!("Bearer {}", api_key))
+ .post(&config.polish_url)
+ .header("Authorization", format!("Bearer {}", config.polish_key()))
.header("Content-Type", "application/json")
.json(&body)
.timeout(Duration::from_secs(30))