From f2034e7504aa380689ff09771f011d9068117ea7 Mon Sep 17 00:00:00 2001
From: Edris <47713852+EdrisT@users.noreply.github.com>
Date: Wed, 3 Dec 2025 19:01:25 +0100
Subject: [PATCH 1/4] UI part of custom alignment model option
---
.../components/TranscriptionConfigDialog.tsx | 23 ++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/web/frontend/src/components/TranscriptionConfigDialog.tsx b/web/frontend/src/components/TranscriptionConfigDialog.tsx
index 8ad0e37..60df03d 100644
--- a/web/frontend/src/components/TranscriptionConfigDialog.tsx
+++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx
@@ -150,7 +150,8 @@ const PARAM_DESCRIPTIONS = {
verbose: "Show detailed progress and debug messages during transcription.",
print_progress: "Display processing progress information in the console output.",
hf_token: "Hugging Face API token required for accessing private or gated models.",
- is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker."
+ is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker.",
+ align_model: "Custom alignment model to use (e.g., KBLab/wav2vec2-large-voxrex-swedish).\nThe model format must be WhisperX compatible!\nLeave empty to use default."
};
interface TranscriptionConfigDialogProps {
@@ -1776,6 +1777,26 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
+
+
+
Custom Alignment Model
+
+
+
+
+
+ {PARAM_DESCRIPTIONS.align_model}
+
+
+
+
+                onChange={(e) => updateParam('align_model', e.target.value || undefined)}
+ className="mt-3 bg-white dark:bg-gray-800 border-gray-300 dark:border-gray-600 text-gray-900 dark:text-gray-100"
+ />
+
Date: Wed, 3 Dec 2025 19:11:12 +0100
Subject: [PATCH 2/4] Add custom alignment model parameter to WhisperX
Added option to use custom alignment model when using Whisper model family.
Default is to not use custom alignment model.
---
.../transcription/adapters/whisperx_adapter.go | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/internal/transcription/adapters/whisperx_adapter.go b/internal/transcription/adapters/whisperx_adapter.go
index 516c89c..9e27f49 100644
--- a/internal/transcription/adapters/whisperx_adapter.go
+++ b/internal/transcription/adapters/whisperx_adapter.go
@@ -258,6 +258,16 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
Description: "VAD offset threshold",
Group: "advanced",
},
+
+ // Custom Alignment Model
+ {
+ Name: "align_model",
+ Type: "string",
+ Required: false,
+ Default: nil,
+ Description: "Custom alignment model (e.g. KBLab/wav2vec2-large-voxrex-swedish)",
+ Group: "advanced",
+ },
}
baseAdapter := NewBaseAdapter("whisperx", filepath.Join(envPath, "WhisperX"), capabilities, schema)
@@ -484,6 +494,11 @@ func (w *WhisperXAdapter) buildWhisperXArgs(input interfaces.AudioInput, params
args = append(args, "--vad_onset", fmt.Sprintf("%.3f", w.GetFloatParameter(params, "vad_onset")))
args = append(args, "--vad_offset", fmt.Sprintf("%.3f", w.GetFloatParameter(params, "vad_offset")))
+ // Custom alignment model
+ if alignModel := w.GetStringParameter(params, "align_model"); alignModel != "" {
+ args = append(args, "--align_model", alignModel)
+ }
+
// Diarization
if w.GetBoolParameter(params, "diarize") {
args = append(args, "--diarize")
From 8434bdf21de2026dafc25b24f959d70189fc77e4 Mon Sep 17 00:00:00 2001
From: ET <47713852+EdrisT@users.noreply.github.com>
Date: Thu, 4 Dec 2025 12:54:11 +0100
Subject: [PATCH 3/4] Configurable OpenAI API Base URL
Fix for enhancement issue #194
Added option to use custom OpenAI API base URL.
If not configured the default OpenAI API base URL (https://api.openai.com/v1) will be used.
Does not change the current behavior of apiKey, i.e. if an apiKey is already configured it will not have to be re-entered when modifying the base URL.
---
internal/api/chat_handlers.go | 2 +-
internal/api/handlers.go | 86 +++++++++++++--------
internal/llm/openai.go | 8 +-
internal/models/transcription.go | 15 ++--
web/frontend/src/components/LLMSettings.tsx | 67 ++++++++++------
5 files changed, 112 insertions(+), 66 deletions(-)
diff --git a/internal/api/chat_handlers.go b/internal/api/chat_handlers.go
index 107998c..848fded 100644
--- a/internal/api/chat_handlers.go
+++ b/internal/api/chat_handlers.go
@@ -88,7 +88,7 @@ func (h *Handler) getLLMService(ctx context.Context) (llm.Service, string, error
if cfg.APIKey == nil || *cfg.APIKey == "" {
return nil, cfg.Provider, fmt.Errorf("OpenAI API key not configured")
}
- return llm.NewOpenAIService(*cfg.APIKey), cfg.Provider, nil
+ return llm.NewOpenAIService(*cfg.APIKey, cfg.OpenAIBaseURL), cfg.Provider, nil
case "ollama":
if cfg.BaseURL == nil || *cfg.BaseURL == "" {
return nil, cfg.Provider, fmt.Errorf("Ollama base URL not configured")
diff --git a/internal/api/handlers.go b/internal/api/handlers.go
index 9d79a76..03c0953 100644
--- a/internal/api/handlers.go
+++ b/internal/api/handlers.go
@@ -171,21 +171,23 @@ type YouTubeDownloadResponse struct {
// LLMConfigRequest represents the LLM configuration request
type LLMConfigRequest struct {
- Provider string `json:"provider" binding:"required,oneof=ollama openai"`
- BaseURL *string `json:"base_url,omitempty"`
- APIKey *string `json:"api_key,omitempty"`
- IsActive bool `json:"is_active"`
+ Provider string `json:"provider" binding:"required,oneof=ollama openai"`
+ BaseURL *string `json:"base_url,omitempty"`
+ OpenAIBaseURL *string `json:"openai_base_url,omitempty"`
+ APIKey *string `json:"api_key,omitempty"`
+ IsActive bool `json:"is_active"`
}
// LLMConfigResponse represents the LLM configuration response
type LLMConfigResponse struct {
- ID uint `json:"id"`
- Provider string `json:"provider"`
- BaseURL *string `json:"base_url,omitempty"`
- HasAPIKey bool `json:"has_api_key"` // Don't return actual API key
- IsActive bool `json:"is_active"`
- CreatedAt string `json:"created_at"`
- UpdatedAt string `json:"updated_at"`
+ ID uint `json:"id"`
+ Provider string `json:"provider"`
+ BaseURL *string `json:"base_url,omitempty"`
+ OpenAIBaseURL *string `json:"openai_base_url,omitempty"`
+ HasAPIKey bool `json:"has_api_key"` // Don't return actual API key
+ IsActive bool `json:"is_active"`
+ CreatedAt string `json:"created_at"`
+ UpdatedAt string `json:"updated_at"`
}
// APIKeyListResponse represents an API key in the list (without the actual key)
@@ -1880,13 +1882,14 @@ func (h *Handler) GetLLMConfig(c *gin.Context) {
}
response := LLMConfigResponse{
- ID: config.ID,
- Provider: config.Provider,
- BaseURL: config.BaseURL,
- HasAPIKey: config.APIKey != nil && *config.APIKey != "",
- IsActive: config.IsActive,
- CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"),
- UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"),
+ ID: config.ID,
+ Provider: config.Provider,
+ BaseURL: config.BaseURL,
+ OpenAIBaseURL: config.OpenAIBaseURL,
+ HasAPIKey: config.APIKey != nil && *config.APIKey != "",
+ IsActive: config.IsActive,
+ CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"),
+ UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"),
}
c.JSON(http.StatusOK, response)
@@ -1914,10 +1917,6 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) {
c.JSON(http.StatusBadRequest, gin.H{"error": "Base URL is required for Ollama provider"})
return
}
- if req.Provider == "openai" && (req.APIKey == nil || *req.APIKey == "") {
- c.JSON(http.StatusBadRequest, gin.H{"error": "API key is required for OpenAI provider"})
- return
- }
// Check if there's an existing active configuration
existingConfig, err := h.llmConfigRepo.GetActive(c.Request.Context())
@@ -1926,15 +1925,32 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) {
return
}
+ // Handle API Key logic for OpenAI
+ var apiKeyToSave *string
+ if req.Provider == "openai" {
+ if req.APIKey != nil && *req.APIKey != "" {
+ // New key provided
+ apiKeyToSave = req.APIKey
+ } else if existingConfig != nil && existingConfig.APIKey != nil && *existingConfig.APIKey != "" {
+ // Reuse existing key
+ apiKeyToSave = existingConfig.APIKey
+ } else {
+ // No key provided and no existing key
+ c.JSON(http.StatusBadRequest, gin.H{"error": "API key is required for OpenAI provider"})
+ return
+ }
+ }
+
var config *models.LLMConfig
if err == gorm.ErrRecordNotFound {
// No existing active config, create new one
config = &models.LLMConfig{
- Provider: req.Provider,
- BaseURL: req.BaseURL,
- APIKey: req.APIKey,
- IsActive: req.IsActive,
+ Provider: req.Provider,
+ BaseURL: req.BaseURL,
+ OpenAIBaseURL: req.OpenAIBaseURL,
+ APIKey: apiKeyToSave,
+ IsActive: req.IsActive,
}
if err := h.llmConfigRepo.Create(c.Request.Context(), config); err != nil {
@@ -1945,7 +1961,8 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) {
// Update existing config
existingConfig.Provider = req.Provider
existingConfig.BaseURL = req.BaseURL
- existingConfig.APIKey = req.APIKey
+ existingConfig.OpenAIBaseURL = req.OpenAIBaseURL
+ existingConfig.APIKey = apiKeyToSave
existingConfig.IsActive = req.IsActive
if err := h.llmConfigRepo.Update(c.Request.Context(), existingConfig); err != nil {
@@ -1956,13 +1973,14 @@ func (h *Handler) SaveLLMConfig(c *gin.Context) {
}
response := LLMConfigResponse{
- ID: config.ID,
- Provider: config.Provider,
- BaseURL: config.BaseURL,
- HasAPIKey: config.APIKey != nil && *config.APIKey != "",
- IsActive: config.IsActive,
- CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"),
- UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"),
+ ID: config.ID,
+ Provider: config.Provider,
+ BaseURL: config.BaseURL,
+ OpenAIBaseURL: config.OpenAIBaseURL,
+ HasAPIKey: config.APIKey != nil && *config.APIKey != "",
+ IsActive: config.IsActive,
+ CreatedAt: config.CreatedAt.Format("2006-01-02 15:04:05"),
+ UpdatedAt: config.UpdatedAt.Format("2006-01-02 15:04:05"),
}
c.JSON(http.StatusOK, response)
diff --git a/internal/llm/openai.go b/internal/llm/openai.go
index 461548c..9d5ec54 100644
--- a/internal/llm/openai.go
+++ b/internal/llm/openai.go
@@ -21,10 +21,14 @@ type OpenAIService struct {
}
// NewOpenAIService creates a new OpenAI service
-func NewOpenAIService(apiKey string) *OpenAIService {
+func NewOpenAIService(apiKey string, baseURL *string) *OpenAIService {
+ url := "https://api.openai.com/v1"
+ if baseURL != nil && *baseURL != "" {
+ url = *baseURL
+ }
return &OpenAIService{
apiKey: apiKey,
- baseURL: "https://api.openai.com/v1",
+ baseURL: url,
client: &http.Client{
Timeout: 300 * time.Second,
},
diff --git a/internal/models/transcription.go b/internal/models/transcription.go
index 06b8c04..94114aa 100644
--- a/internal/models/transcription.go
+++ b/internal/models/transcription.go
@@ -202,13 +202,14 @@ func (tp *TranscriptionProfile) BeforeSave(tx *gorm.DB) error {
// LLMConfig represents LLM configuration settings
type LLMConfig struct {
- ID uint `json:"id" gorm:"primaryKey"`
- Provider string `json:"provider" gorm:"not null;type:varchar(50)"` // "ollama" or "openai"
- BaseURL *string `json:"base_url,omitempty" gorm:"type:text"` // For Ollama
- APIKey *string `json:"api_key,omitempty" gorm:"type:text"` // For OpenAI (encrypted)
- IsActive bool `json:"is_active" gorm:"type:boolean;default:false"`
- CreatedAt time.Time `json:"created_at" gorm:"autoCreateTime"`
- UpdatedAt time.Time `json:"updated_at" gorm:"autoUpdateTime"`
+ ID uint `json:"id" gorm:"primaryKey"`
+ Provider string `json:"provider" gorm:"not null;type:varchar(50)"` // "ollama" or "openai"
+ BaseURL *string `json:"base_url,omitempty" gorm:"type:text"` // For Ollama
+ OpenAIBaseURL *string `json:"openai_base_url,omitempty" gorm:"type:text"` // For OpenAI custom endpoint
+ APIKey *string `json:"api_key,omitempty" gorm:"type:text"` // For OpenAI (encrypted)
+ IsActive bool `json:"is_active" gorm:"type:boolean;default:false"`
+ CreatedAt time.Time `json:"created_at" gorm:"autoCreateTime"`
+ UpdatedAt time.Time `json:"updated_at" gorm:"autoUpdateTime"`
}
// BeforeSave ensures only one LLM config can be active
diff --git a/web/frontend/src/components/LLMSettings.tsx b/web/frontend/src/components/LLMSettings.tsx
index f362d0f..0dcfec3 100644
--- a/web/frontend/src/components/LLMSettings.tsx
+++ b/web/frontend/src/components/LLMSettings.tsx
@@ -10,6 +10,7 @@ interface LLMConfig {
id?: number;
provider: string;
base_url?: string;
+ openai_base_url?: string;
has_api_key?: boolean;
is_active: boolean;
created_at?: string;
@@ -22,6 +23,7 @@ export function LLMSettings() {
is_active: false,
});
const [baseUrl, setBaseUrl] = useState("");
+ const [openAIBaseUrl, setOpenAIBaseUrl] = useState("");
const [apiKey, setApiKey] = useState("");
const [loading, setLoading] = useState(true);
const [saving, setSaving] = useState(false);
@@ -42,6 +44,7 @@ export function LLMSettings() {
const data = await response.json();
setConfig(data);
setBaseUrl(data.base_url || "");
+ setOpenAIBaseUrl(data.openai_base_url || "");
// Don't set API key from response for security
} else if (response.status !== 404) {
console.error("Failed to fetch LLM config");
@@ -61,7 +64,10 @@ export function LLMSettings() {
provider: config.provider,
is_active: true, // Always set to active when saving
...(config.provider === "ollama" && { base_url: baseUrl }),
- ...(config.provider === "openai" && { api_key: apiKey }),
+ ...(config.provider === "openai" && {
+ api_key: apiKey,
+ openai_base_url: openAIBaseUrl
+ }),
};
try {
@@ -228,27 +234,44 @@ export function LLMSettings() {
)}
{config.provider === "openai" && (
-
-
-
- OpenAI API Key *
- {config.has_api_key && (
-
- Already configured
-
- )}
-
-
setApiKey(e.target.value)}
- className="mt-1"
- />
-
- Your OpenAI API key. {config.has_api_key ? "Leave blank to keep current key." : ""}
-
+
+
+
+
+ OpenAI API Key *
+ {config.has_api_key && (
+
+ Already configured
+
+ )}
+
+
setApiKey(e.target.value)}
+ className="mt-1"
+ />
+
+ Your OpenAI API key. {config.has_api_key ? "Leave blank to keep current key." : ""}
+
+
+
+
+
OpenAI Base URL (Optional)
+
setOpenAIBaseUrl(e.target.value)}
+ className="mt-1"
+ />
+
+ Custom endpoint URL for OpenAI-compatible services. Leave blank for default.
+
+
)}
From a6473069751885b401e998de30a29c43f8bf04cd Mon Sep 17 00:00:00 2001
From: ET <47713852+EdrisT@users.noreply.github.com>
Date: Fri, 5 Dec 2025 09:57:59 +0100
Subject: [PATCH 4/4] Added KBLab transcription models
Added the KBLab kb-whisper models to the supported transcription model lists
(backend adapter and frontend dialog). Also changes defaults: model
KBLab/kb-whisper-large, device cuda, compute_type float16, language sv,
diarize enabled.
(Alignment model not included.)
---
.../transcription/adapters/whisperx_adapter.go | 17 +++++++++++------
.../components/TranscriptionConfigDialog.tsx | 18 ++++++++++++------
2 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/internal/transcription/adapters/whisperx_adapter.go b/internal/transcription/adapters/whisperx_adapter.go
index 516c89c..838e6b5 100644
--- a/internal/transcription/adapters/whisperx_adapter.go
+++ b/internal/transcription/adapters/whisperx_adapter.go
@@ -64,8 +64,8 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
Name: "model",
Type: "string",
Required: false,
- Default: "small",
- Options: []string{"tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2", "large-v3"},
+ Default: "KBLab/kb-whisper-large",
+ Options: []string{"KBLab/kb-whisper-large","KBLab/kb-whisper-medium","KBLab/kb-whisper-small","KBLab/kb-whisper-base","KBLab/kb-whisper-tiny","tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2", "large-v3"},
Description: "Whisper model size to use",
Group: "basic",
},
@@ -75,7 +75,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
Name: "device",
Type: "string",
Required: false,
- Default: "cpu",
+ Default: "cuda",
Options: []string{"cpu", "cuda"},
Description: "Device to use for computation",
Group: "basic",
@@ -104,7 +104,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
Name: "compute_type",
Type: "string",
Required: false,
- Default: "float32",
+ Default: "float16",
Options: []string{"float16", "float32", "int8"},
Description: "Computation precision",
Group: "advanced",
@@ -125,7 +125,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
Name: "language",
Type: "string",
Required: false,
- Default: nil,
+ Default: "sv",
Description: "Language code (auto-detect if not specified)",
Group: "basic",
},
@@ -144,7 +144,7 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
Name: "diarize",
Type: "bool",
Required: false,
- Default: false,
+ Default: true,
Description: "Enable speaker diarization",
Group: "basic",
},
@@ -273,6 +273,11 @@ func NewWhisperXAdapter(envPath string) *WhisperXAdapter {
// GetSupportedModels returns the list of Whisper models supported
func (w *WhisperXAdapter) GetSupportedModels() []string {
return []string{
+ "KBLab/kb-whisper-large",
+ "KBLab/kb-whisper-medium",
+ "KBLab/kb-whisper-small",
+ "KBLab/kb-whisper-base",
+ "KBLab/kb-whisper-tiny",
"tiny", "tiny.en",
"base", "base.en",
"small", "small.en",
diff --git a/web/frontend/src/components/TranscriptionConfigDialog.tsx b/web/frontend/src/components/TranscriptionConfigDialog.tsx
index 8ad0e37..ac5059b 100644
--- a/web/frontend/src/components/TranscriptionConfigDialog.tsx
+++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx
@@ -151,6 +151,7 @@ const PARAM_DESCRIPTIONS = {
print_progress: "Display processing progress information in the console output.",
hf_token: "Hugging Face API token required for accessing private or gated models.",
-  is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker."
+  is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker.",
+  align_model: "Custom alignment model to use (e.g., KBLab/wav2vec2-large-voxrex-swedish).\nThe model format must be WhisperX compatible!\nLeave empty to use default."
};
interface TranscriptionConfigDialogProps {
@@ -168,12 +169,12 @@ interface TranscriptionConfigDialogProps {
const DEFAULT_PARAMS: WhisperXParams = {
model_family: "whisper",
- model: "small",
+ model: "KBLab/kb-whisper-large",
model_cache_only: false,
- device: "cpu",
+ device: "cuda",
device_index: 0,
batch_size: 8,
- compute_type: "float32",
+ compute_type: "float16",
threads: 0,
output_format: "all",
verbose: true,
@@ -185,7 +186,7 @@ const DEFAULT_PARAMS: WhisperXParams = {
vad_onset: 0.5,
vad_offset: 0.363,
chunk_size: 30,
- diarize: false,
+ diarize: true,
diarize_model: "pyannote",
speaker_embeddings: false,
temperature: 0,
@@ -210,6 +211,11 @@ const DEFAULT_PARAMS: WhisperXParams = {
};
const WHISPER_MODELS = [
+ "KBLab/kb-whisper-large",
+ "KBLab/kb-whisper-medium",
+ "KBLab/kb-whisper-small",
+ "KBLab/kb-whisper-base",
+ "KBLab/kb-whisper-tiny",
"tiny", "tiny.en",
"base", "base.en",
"small", "small.en",
@@ -1096,8 +1102,8 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
Language
-                value={params.language || "auto"}
-                onValueChange={(value) => updateParam('language', value === "auto" ? undefined : value)}
+                value={params.language || "sv"}
+                onValueChange={(value) => updateParam('language', value === "auto" ? undefined : value)}
>