From f2034e7504aa380689ff09771f011d9068117ea7 Mon Sep 17 00:00:00 2001 From: Edris <47713852+EdrisT@users.noreply.github.com> Date: Wed, 3 Dec 2025 19:01:25 +0100 Subject: [PATCH 1/4] UI part of custom alignment model option --- .../components/TranscriptionConfigDialog.tsx | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/web/frontend/src/components/TranscriptionConfigDialog.tsx b/web/frontend/src/components/TranscriptionConfigDialog.tsx index 8ad0e37..60df03d 100644 --- a/web/frontend/src/components/TranscriptionConfigDialog.tsx +++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx @@ -150,7 +150,8 @@ const PARAM_DESCRIPTIONS = { verbose: "Show detailed progress and debug messages during transcription.", print_progress: "Display processing progress information in the console output.", hf_token: "Hugging Face API token required for accessing private or gated models.", - is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker." + is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker.", + align_model: "Custom alignment model to use (e.g., KBLab/wav2vec2-large-voxrex-swedish).\nThe model format must be WhisperX compatible!\nLeave empty to use default." }; interface TranscriptionConfigDialogProps { @@ -1776,6 +1777,26 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
+
+
+ + + + + + +

{PARAM_DESCRIPTIONS.align_model}

+
+
+
+ updateParam('align_model', e.target.value || undefined)} + className="mt-3 bg-white dark:bg-gray-800 border-gray-300 dark:border-gray-600 text-gray-900 dark:text-gray-100" + /> +
Date: Wed, 3 Dec 2025 19:11:12 +0100 Subject: [PATCH 2/4] Add custom alignment model parameter to WhisperX Added option to use custom alignment model when using Whisper model family. Default is to not use custom alignment model. --- .../transcription/adapters/whisperx_adapter.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/transcription/adapters/whisperx_adapter.go b/internal/transcription/adapters/whisperx_adapter.go index 516c89c..9e27f49 100644 --- a/internal/transcription/adapters/whisperx_adapter.go +++ b/internal/transcription/adapters/whisperx_adapter.go @@ -258,6 +258,16 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter { Description: "VAD offset threshold", Group: "advanced", }, + + // Custom Alignment Model + { + Name: "align_model", + Type: "string", + Required: false, + Default: nil, + Description: "Custom alignment model (e.g. KBLab/wav2vec2-large-voxrex-swedish)", + Group: "advanced", + }, } baseAdapter := NewBaseAdapter("whisperx", filepath.Join(envPath, "WhisperX"), capabilities, schema) @@ -484,6 +494,11 @@ func (w *WhisperXAdapter) buildWhisperXArgs(input interfaces.AudioInput, params args = append(args, "--vad_onset", fmt.Sprintf("%.3f", w.GetFloatParameter(params, "vad_onset"))) args = append(args, "--vad_offset", fmt.Sprintf("%.3f", w.GetFloatParameter(params, "vad_offset"))) + // Custom alignment model + if alignModel := w.GetStringParameter(params, "align_model"); alignModel != "" { + args = append(args, "--align_model", alignModel) + } + // Diarization if w.GetBoolParameter(params, "diarize") { args = append(args, "--diarize") From 8434bdf21de2026dafc25b24f959d70189fc77e4 Mon Sep 17 00:00:00 2001 From: ET <47713852+EdrisT@users.noreply.github.com> Date: Thu, 4 Dec 2025 12:54:11 +0100 Subject: [PATCH 3/4] Configurable OpenAI API Base URL Fix for enhancement issue #194 Added option to use custom OpenAI API base URL. 
If not configured the default OpenAI API base URL (https://api.openai.com/v1) will be used. Does not change current behavior of apiKey, i.e. if apiKey is already configured it will not have to be re-entered when modifying base URL. --- internal/api/chat_handlers.go | 2 +- internal/api/handlers.go | 86 +++++++++++++-------- internal/llm/openai.go | 8 +- internal/models/transcription.go | 15 ++-- web/frontend/src/components/LLMSettings.tsx | 67 ++++++++++------ 5 files changed, 112 insertions(+), 66 deletions(-) diff --git a/internal/api/chat_handlers.go b/internal/api/chat_handlers.go index 107998c..848fded 100644 --- a/internal/api/chat_handlers.go +++ b/internal/api/chat_handlers.go @@ -88,7 +88,7 @@ func (h *Handler) getLLMService(ctx context.Context) (llm.Service, string, error if cfg.APIKey == nil || *cfg.APIKey == "" { return nil, cfg.Provider, fmt.Errorf("OpenAI API key not configured") } - return llm.NewOpenAIService(*cfg.APIKey), cfg.Provider, nil + return llm.NewOpenAIService(*cfg.APIKey, cfg.OpenAIBaseURL), cfg.Provider, nil case "ollama": if cfg.BaseURL == nil || *cfg.BaseURL == "" { return nil, cfg.Provider, fmt.Errorf("Ollama base URL not configured") } diff --git a/internal/api/handlers.go b/internal/api/handlers.go index 9d79a76..03c0953 100644 --- a/internal/api/handlers.go +++ b/internal/api/handlers.go @@ -171,21 +171,23 @@ type YouTubeDownloadResponse struct { // LLMConfigRequest represents the LLM configuration request type LLMConfigRequest struct { - Provider string `json:"provider" binding:"required,oneof=ollama openai"` - BaseURL *string `json:"base_url,omitempty"` - APIKey *string `json:"api_key,omitempty"` - IsActive bool `json:"is_active"` + Provider string `json:"provider" binding:"required,oneof=ollama openai"` + BaseURL *string `json:"base_url,omitempty"` + OpenAIBaseURL *string `json:"openai_base_url,omitempty"` + APIKey *string `json:"api_key,omitempty"` + IsActive bool `json:"is_active"` } // LLMConfigResponse represents the LLM
configuration response type LLMConfigResponse struct { - ID uint `json:"id"` - Provider string `json:"provider"` - BaseURL *string `json:"base_url,omitempty"` - HasAPIKey bool `json:"has_api_key"` // Don't return actual API key - IsActive bool `json:"is_active"` - CreatedAt string `json:"created_at"` - UpdatedAt string `json:"updated_at"` + ID uint `json:"id"` + Provider string `json:"provider"` + BaseURL *string `json:"base_url,omitempty"` + OpenAIBaseURL *string `json:"openai_base_url,omitempty"` + HasAPIKey bool `json:"has_api_key"` // Don't return actual API key + IsActive bool `json:"is_active"` + CreatedAt string `json:"created_at"` + UpdatedAt string `json:"updated_at"` } // APIKeyListResponse represents an API key in the list (without the actual key) @@ -1880,13 +1882,14 @@ func (h *Handler) GetLLMConfig(c *gin.Context) { } response := LLMConfigResponse{ - ID: config.ID, - Provider: config.Provider, - BaseURL: config.BaseURL, - HasAPIKey: config.APIKey != nil && *config.APIKey != "", - IsActive: config.IsActive, - CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"), - UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"), + ID: config.ID, + Provider: config.Provider, + BaseURL: config.BaseURL, + OpenAIBaseURL: config.OpenAIBaseURL, + HasAPIKey: config.APIKey != nil && *config.APIKey != "", + IsActive: config.IsActive, + CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"), + UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"), } c.JSON(http.StatusOK, response) @@ -1914,10 +1917,6 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) { c.JSON(http.StatusBadRequest, gin.H{"error": "Base URL is required for Ollama provider"}) return } - if req.Provider == "openai" && (req.APIKey == nil || *req.APIKey == "") { - c.JSON(http.StatusBadRequest, gin.H{"error": "API key is required for OpenAI provider"}) - return - } // Check if there's an existing active configuration existingConfig, err := h.llmConfigRepo.GetActive(c.Request.Context()) @@ 
-1926,15 +1925,32 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) { return } + // Handle API Key logic for OpenAI + var apiKeyToSave *string + if req.Provider == "openai" { + if req.APIKey != nil && *req.APIKey != "" { + // New key provided + apiKeyToSave = req.APIKey + } else if existingConfig != nil && existingConfig.APIKey != nil && *existingConfig.APIKey != "" { + // Reuse existing key + apiKeyToSave = existingConfig.APIKey + } else { + // No key provided and no existing key + c.JSON(http.StatusBadRequest, gin.H{"error": "API key is required for OpenAI provider"}) + return + } + } + var config *models.LLMConfig if err == gorm.ErrRecordNotFound { // No existing active config, create new one config = &models.LLMConfig{ - Provider: req.Provider, - BaseURL: req.BaseURL, - APIKey: req.APIKey, - IsActive: req.IsActive, + Provider: req.Provider, + BaseURL: req.BaseURL, + OpenAIBaseURL: req.OpenAIBaseURL, + APIKey: apiKeyToSave, + IsActive: req.IsActive, } if err := h.llmConfigRepo.Create(c.Request.Context(), config); err != nil { @@ -1945,7 +1961,8 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) { // Update existing config existingConfig.Provider = req.Provider existingConfig.BaseURL = req.BaseURL - existingConfig.APIKey = req.APIKey + existingConfig.OpenAIBaseURL = req.OpenAIBaseURL + existingConfig.APIKey = apiKeyToSave existingConfig.IsActive = req.IsActive if err := h.llmConfigRepo.Update(c.Request.Context(), existingConfig); err != nil { @@ -1956,13 +1973,14 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) { } response := LLMConfigResponse{ - ID: config.ID, - Provider: config.Provider, - BaseURL: config.BaseURL, - HasAPIKey: config.APIKey != nil && *config.APIKey != "", - IsActive: config.IsActive, - CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"), - UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"), + ID: config.ID, + Provider: config.Provider, + BaseURL: config.BaseURL, + OpenAIBaseURL: config.OpenAIBaseURL, + HasAPIKey: config.APIKey 
!= nil && *config.APIKey != "", + IsActive: config.IsActive, + CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"), + UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"), } c.JSON(http.StatusOK, response) diff --git a/internal/llm/openai.go b/internal/llm/openai.go index 461548c..9d5ec54 100644 --- a/internal/llm/openai.go +++ b/internal/llm/openai.go @@ -21,10 +21,14 @@ type OpenAIService struct { } // NewOpenAIService creates a new OpenAI service -func NewOpenAIService(apiKey string) *OpenAIService { +func NewOpenAIService(apiKey string, baseURL *string) *OpenAIService { + url := "https://api.openai.com/v1" + if baseURL != nil && *baseURL != "" { + url = *baseURL + } return &OpenAIService{ apiKey: apiKey, - baseURL: "https://api.openai.com/v1", + baseURL: url, client: &http.Client{ Timeout: 300 * time.Second, }, diff --git a/internal/models/transcription.go b/internal/models/transcription.go index 06b8c04..94114aa 100644 --- a/internal/models/transcription.go +++ b/internal/models/transcription.go @@ -202,13 +202,14 @@ func (tp *TranscriptionProfile) BeforeSave(tx *gorm.DB) error { // LLMConfig represents LLM configuration settings type LLMConfig struct { - ID uint `json:"id" gorm:"primaryKey"` - Provider string `json:"provider" gorm:"not null;type:varchar(50)"` // "ollama" or "openai" - BaseURL *string `json:"base_url,omitempty" gorm:"type:text"` // For Ollama - APIKey *string `json:"api_key,omitempty" gorm:"type:text"` // For OpenAI (encrypted) - IsActive bool `json:"is_active" gorm:"type:boolean;default:false"` - CreatedAt time.Time `json:"created_at" gorm:"autoCreateTime"` - UpdatedAt time.Time `json:"updated_at" gorm:"autoUpdateTime"` + ID uint `json:"id" gorm:"primaryKey"` + Provider string `json:"provider" gorm:"not null;type:varchar(50)"` // "ollama" or "openai" + BaseURL *string `json:"base_url,omitempty" gorm:"type:text"` // For Ollama + OpenAIBaseURL *string `json:"openai_base_url,omitempty" gorm:"type:text"` // For OpenAI custom endpoint + 
APIKey *string `json:"api_key,omitempty" gorm:"type:text"` // For OpenAI (encrypted) + IsActive bool `json:"is_active" gorm:"type:boolean;default:false"` + CreatedAt time.Time `json:"created_at" gorm:"autoCreateTime"` + UpdatedAt time.Time `json:"updated_at" gorm:"autoUpdateTime"` } // BeforeSave ensures only one LLM config can be active diff --git a/web/frontend/src/components/LLMSettings.tsx b/web/frontend/src/components/LLMSettings.tsx index f362d0f..0dcfec3 100644 --- a/web/frontend/src/components/LLMSettings.tsx +++ b/web/frontend/src/components/LLMSettings.tsx @@ -10,6 +10,7 @@ interface LLMConfig { id?: number; provider: string; base_url?: string; + openai_base_url?: string; has_api_key?: boolean; is_active: boolean; created_at?: string; @@ -22,6 +23,7 @@ export function LLMSettings() { is_active: false, }); const [baseUrl, setBaseUrl] = useState(""); + const [openAIBaseUrl, setOpenAIBaseUrl] = useState(""); const [apiKey, setApiKey] = useState(""); const [loading, setLoading] = useState(true); const [saving, setSaving] = useState(false); @@ -42,6 +44,7 @@ export function LLMSettings() { const data = await response.json(); setConfig(data); setBaseUrl(data.base_url || ""); + setOpenAIBaseUrl(data.openai_base_url || ""); // Don't set API key from response for security } else if (response.status !== 404) { console.error("Failed to fetch LLM config"); @@ -61,7 +64,10 @@ export function LLMSettings() { provider: config.provider, is_active: true, // Always set to active when saving ...(config.provider === "ollama" && { base_url: baseUrl }), - ...(config.provider === "openai" && { api_key: apiKey }), + ...(config.provider === "openai" && { + api_key: apiKey, + openai_base_url: openAIBaseUrl + }), }; try { @@ -228,27 +234,44 @@ export function LLMSettings() { )} {config.provider === "openai" && ( -
- - setApiKey(e.target.value)} - className="mt-1" - /> -

- Your OpenAI API key. {config.has_api_key ? "Leave blank to keep current key." : ""} -

+
+
+ + setApiKey(e.target.value)} + className="mt-1" + /> +

+ Your OpenAI API key. {config.has_api_key ? "Leave blank to keep current key." : ""} +

+
+ +
+ + setOpenAIBaseUrl(e.target.value)} + className="mt-1" + /> +

+ Custom endpoint URL for OpenAI-compatible services. Leave blank for default. +

+
)}
From a6473069751885b401e998de30a29c43f8bf04cd Mon Sep 17 00:00:00 2001 From: ET <47713852+EdrisT@users.noreply.github.com> Date: Fri, 5 Dec 2025 09:57:59 +0100 Subject: [PATCH 4/4] Added KBLab transcription models Added KBLab transcription models (alignment model not included). Also changes the shipped defaults to suit Swedish GPU transcription: model KBLab/kb-whisper-large, device cuda, compute_type float16, language sv, and diarization enabled. --- .../transcription/adapters/whisperx_adapter.go | 17 +++++++++++------ .../components/TranscriptionConfigDialog.tsx | 18 ++++++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/internal/transcription/adapters/whisperx_adapter.go b/internal/transcription/adapters/whisperx_adapter.go index 516c89c..838e6b5 100644 --- a/internal/transcription/adapters/whisperx_adapter.go +++ b/internal/transcription/adapters/whisperx_adapter.go @@ -64,8 +64,8 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter { Name: "model", Type: "string", Required: false, - Default: "small", - Options: []string{"tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2", "large-v3"}, + Default: "KBLab/kb-whisper-large", + Options: []string{"KBLab/kb-whisper-large", "KBLab/kb-whisper-medium", "KBLab/kb-whisper-small", "KBLab/kb-whisper-base", "KBLab/kb-whisper-tiny", "tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2", "large-v3"}, Description: "Whisper model size to use", Group: "basic", }, @@ -75,7 +75,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter { Name: "device", Type: "string", Required: false, - Default: "cpu", + Default: "cuda", Options: []string{"cpu", "cuda"}, Description: "Device to use for computation", Group: "basic", }, @@ -104,7 +104,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter { Name: "compute_type", Type: "string", Required: false, - Default: "float32", + Default: "float16", Options: []string{"float16", "float32", "int8"}, Description: "Computation precision", Group: "advanced", }, @@ -125,7 +125,7 @@ func NewWhisperXAdapter(envPath
string) *WhisperXAdapter { Name: "language", Type: "string", Required: false, - Default: nil, + Default: "sv", Description: "Language code (auto-detect if not specified)", Group: "basic", }, @@ -144,7 +144,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter { Name: "diarize", Type: "bool", Required: false, - Default: false, + Default: true, Description: "Enable speaker diarization", Group: "basic", }, @@ -273,6 +273,11 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter { // GetSupportedModels returns the list of Whisper models supported func (w *WhisperXAdapter) GetSupportedModels() []string { return []string{ + "KBLab/kb-whisper-large", + "KBLab/kb-whisper-medium", + "KBLab/kb-whisper-small", + "KBLab/kb-whisper-base", + "KBLab/kb-whisper-tiny", "tiny", "tiny.en", "base", "base.en", "small", "small.en", diff --git a/web/frontend/src/components/TranscriptionConfigDialog.tsx b/web/frontend/src/components/TranscriptionConfigDialog.tsx index 8ad0e37..ac5059b 100644 --- a/web/frontend/src/components/TranscriptionConfigDialog.tsx +++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx
@@ -168,12 +168,12 @@ interface TranscriptionConfigDialogProps { const DEFAULT_PARAMS: WhisperXParams = { model_family: "whisper", - model: "small", + model: "KBLab/kb-whisper-large", model_cache_only: false, - device: "cpu", + device: "cuda", device_index: 0, batch_size: 8, - compute_type: "float32", + compute_type: "float16", threads: 0, output_format: "all", verbose: true, @@ -185,7 +185,7 @@ const DEFAULT_PARAMS: WhisperXParams = { vad_onset: 0.5, vad_offset: 0.363, chunk_size: 30, - diarize: false, + diarize: true, diarize_model: "pyannote", speaker_embeddings: false, temperature: 0, @@ -210,6 +210,11 @@ const DEFAULT_PARAMS: WhisperXParams = { }; const WHISPER_MODELS = [ + "KBLab/kb-whisper-large", + "KBLab/kb-whisper-medium", + "KBLab/kb-whisper-small", + "KBLab/kb-whisper-base", + "KBLab/kb-whisper-tiny", "tiny", "tiny.en", "base", "base.en", "small", "small.en", @@ -1096,8 +1101,8 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog Language