feat: add WhisperX diarization and speaker transcript UI

- add WhisperX diarization support to the Whisper VM server - normalize speaker timestamp segments from Whisper responses - document Hugging Face/pyannote VM setup and health checks - show diarized speaker transcript blocks in record and transcript views - group consecutive segments from the same speaker - remove duplicate paragraph transcript display when diarized segments exist - let diarized transcript content expand without an inner scrollbar
2026-05-20 16:34:50 +05:30 · 2026-05-20 16:34:50 +05:30 · 9517bad3dc
commit 9517bad3dc
parent 3abb5e9281
10 changed files with 674 additions and 73 deletions
--- a/backend/src/config/migrate.js
+++ b/backend/src/config/migrate.js
@ -6,6 +6,15 @@ import { env } from "./env.js";
 import { pool, query } from "./database.js";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const requiredTables = [
  "users",
  "refresh_tokens",
  "audio_assets",
  "transcripts",
  "transcript_shares",
  "audio_metadata",
  "transcription_jobs",
 ];
 function identifier(value) {
  return `\`${String(value).replaceAll("`", "``")}\``;
@ -38,12 +47,24 @@ export async function ensureDatabase() {
 }
 export async function ensureSchema() {
-  await ensureDatabase();
+  try {
-  const schema = await fs.readFile(path.join(__dirname, "schema.sql"), "utf8");
+    await ensureDatabase();
-  for (const statement of splitSql(schema)) {
+  } catch (error) {
-    await pool.query(statement);
+    if (!isReadOnlyError(error)) throw error;
    await ensureExistingSchema();
    return;
  }
  const schema = await fs.readFile(path.join(__dirname, "schema.sql"), "utf8");
  try {
    for (const statement of splitSql(schema)) {
      await pool.query(statement);
    }
    await runMigrations();
  } catch (error) {
    if (!isReadOnlyError(error)) throw error;
    await ensureExistingSchema();
  }
  await runMigrations();
 }
 export async function runMigrations() {
@ -85,6 +106,31 @@ export async function runMigrations() {
  }
 }
 function isReadOnlyError(error) {
  return (
    error?.code === "ER_OPTION_PREVENTS_STATEMENT" ||
    error?.errno === 1290 ||
    String(error?.message ?? "").includes("--read-only")
  );
 }
 async function ensureExistingSchema() {
  const rows = await query(
    `SELECT TABLE_NAME AS tableName
     FROM information_schema.TABLES
     WHERE TABLE_SCHEMA = DATABASE()
       AND TABLE_NAME IN (${requiredTables.map((_, index) => `:table${index}`).join(", ")})`,
    Object.fromEntries(requiredTables.map((table, index) => [`table${index}`, table])),
  );
  const present = new Set(rows.map((row) => row.tableName));
  const missing = requiredTables.filter((table) => !present.has(table));
  if (missing.length > 0) {
    throw new Error(
      `Database is read-only and schema is incomplete. Missing tables: ${missing.join(", ")}`,
    );
  }
 }
 if (import.meta.url === `file://${process.argv[1]}`) {
  ensureSchema()
    .then(() => {
--- a/backend/src/middlewares/security.js
+++ b/backend/src/middlewares/security.js
@ -9,7 +9,7 @@ export function securityMiddleware(app) {
    app.set("trust proxy", 1);
  }
-  const origins = env.clientOrigin.split(",").map((origin) => origin.trim());
+  const origins = allowedOrigins();
  app.use(
    helmet({
      crossOriginResourcePolicy: { policy: "cross-origin" },
@ -35,6 +35,33 @@ export function securityMiddleware(app) {
  );
 }
 function allowedOrigins() {
  const configured = env.clientOrigin
    .split(",")
    .map((origin) => origin.trim())
    .filter(Boolean);
  if (env.isProduction) return configured;
  const expanded = new Set(configured);
  for (const origin of configured) {
    try {
      const url = new URL(origin);
      if (url.protocol === "http:") {
        url.protocol = "https:";
        expanded.add(url.toString().replace(/\/$/, ""));
      } else if (url.protocol === "https:") {
        url.protocol = "http:";
        expanded.add(url.toString().replace(/\/$/, ""));
      }
    } catch {
      // Ignore invalid origin entries; the CORS check below will reject them.
    }
  }
  return [...expanded];
 }
 export const authRateLimiter = rateLimit({
  windowMs: env.rateLimit.windowMs,
  limit: env.rateLimit.authMax,
--- a/backend/src/transcription/whisperClient.js
+++ b/backend/src/transcription/whisperClient.js
@ -94,10 +94,19 @@ function normalizeTranscript(payload, elapsedMs) {
    throw new AppError("Whisper VM returned an empty transcript", 502, "EMPTY_TRANSCRIPT");
  }
  const timestamps = normalizeSegments(
    payload.timestamps ??
      payload.segments ??
      payload.speaker_segments ??
      payload.result?.timestamps ??
      payload.result?.segments ??
      [],
  );
  return {
    transcriptText,
    language: payload.language ?? payload.result?.language ?? null,
-    timestamps: payload.timestamps ?? payload.segments ?? payload.result?.timestamps ?? [],
+    timestamps,
    duration: numberOrNull(payload.duration ?? payload.result?.duration),
    processingTime: Number((elapsedMs / 1000).toFixed(3)),
    modelName: env.whisper.modelName,
@ -108,13 +117,38 @@ function mockTranscript(filename) {
  return {
    transcriptText: `Mock transcript for ${filename}. Configure WHISPER_API_URL to use Faster-Whisper Large v3.`,
    language: "en",
-    timestamps: [{ start: 0, end: 4, text: "Mock transcript generated for local development." }],
+    timestamps: [
      {
        speaker: "Speaker 1",
        start: 0,
        end: 4,
        text: "Mock transcript generated for local development.",
      },
    ],
    duration: null,
    processingTime: 0.15,
    modelName: env.whisper.modelName,
  };
 }
 function normalizeSegments(segments) {
  if (!Array.isArray(segments)) return [];
  return segments
    .map((segment) => {
      const text = String(segment?.text ?? "").trim();
      const start = numberOrNull(segment?.start);
      const end = numberOrNull(segment?.end);
      if (!text || start === null || end === null) return null;
      return {
        ...(segment.speaker ? { speaker: String(segment.speaker) } : {}),
        start,
        end,
        text,
      };
    })
    .filter(Boolean);
 }
 function numberOrNull(value) {
  const number = Number(value);
  return Number.isFinite(number) ? number : null;
--- a/docs/WHISPER_VM.md
+++ b/docs/WHISPER_VM.md
@ -16,7 +16,8 @@ WHISPER_ALLOW_MOCK=false
 Expected endpoints:
- `GET /health` returns any 2xx status when the VM is ready.
+- `GET /health` returns any 2xx status when the VM is ready. For WhisperX diarization, it should
  also report `"whisperx": true` and `"diarization": true`.
 - `POST /transcribe` accepts multipart audio and returns one of:
 ```json
@ -24,9 +25,49 @@ Expected endpoints:
  "transcript_text": "Meeting transcript...",
  "language": "en",
  "duration": 123.45,
-  "timestamps": [{ "start": 0, "end": 5, "text": "Hello" }]
+  "timestamps": [{ "speaker": "Speaker 1", "start": 0, "end": 5, "text": "Hello" }]
 }
 ```
 The API retries failed requests, applies `WHISPER_TIMEOUT_MS`, and marks jobs as failed when the VM
 is unavailable.
 ## Enable WhisperX diarization on the VM
 The systemd unit runs `/home/cezen/whisper/server.py` inside the existing
 `/home/cezen/whisper/venv`. Deploy the updated script without creating a new venv:
 ```bash
 scp scripts/whisper_http_server.py cezen@172.16.10.64:/home/cezen/whisper/server.py
 scp scripts/orphion-whisper.service cezen@172.16.10.64:/tmp/orphion-whisper.service
 ssh cezen@172.16.10.64 'sudo mv /tmp/orphion-whisper.service /etc/systemd/system/orphion-whisper.service'
 ```
 Create `/home/cezen/whisper/.env` on the VM with the HuggingFace token accepted by pyannote:
 ```env
 HUGGINGFACE_TOKEN=your_token_here
 WHISPERX_DIARIZATION=true
 WHISPERX_DEVICE=cuda
 WHISPERX_COMPUTE_TYPE=float16
 WHISPERX_BATCH_SIZE=8
 WHISPERX_DIARIZATION_MODEL=pyannote/speaker-diarization-community-1
 ```
 The HuggingFace account behind the token must be approved for the configured pyannote diarization
 model. If transcription returns `"diarization": "fallback"` and the service log mentions a gated
 repo, visit `https://huggingface.co/pyannote/speaker-diarization-community-1` while signed in to
 that account and accept/request access.
 Restart and verify:
 ```bash
 ssh cezen@172.16.10.64 'sudo systemctl daemon-reload && sudo systemctl restart orphion-whisper'
 curl -sS http://172.16.10.64:8000/health
 ```
 Expected health shape:
 ```json
 { "status": "ok", "model": "large-v3", "device": "cuda", "whisperx": true, "diarization": true }
 ```
--- a/frontend/src/routes/_authenticated/record.tsx
+++ b/frontend/src/routes/_authenticated/record.tsx
@ -4,6 +4,7 @@ import { useEffect, useMemo, useRef, useState } from "react";
 import { motion } from "framer-motion";
 import {
  Check,
  Clock,
  Loader2,
  Mic,
  Pause,
@ -14,11 +15,12 @@ import {
  Sparkles,
  Square,
  Trash2,
  UserRound,
 } from "lucide-react";
 import { toast } from "sonner";
 import { transcribeAudio } from "@/services/audio";
 import { transcriptService } from "@/services/transcripts";
-import type { Transcript } from "@/services/types";
+import type { Transcript, TranscriptSegment } from "@/services/types";
 import { AudioPlayer } from "@/components/audio-player";
 import { SendTranscriptDialog } from "@/components/send-transcript-dialog";
@ -483,12 +485,42 @@ function RecordPage() {
              </Link>
            </div>
          </div>
-          <textarea
+          {currentTranscript.timestamps.length > 0 ? (
-            value={transcript}
+            <div className="mt-5 space-y-3 rounded-xl border border-border bg-background/35 p-4">
-            onChange={(event) => setTranscript(event.target.value)}
+              {groupTranscriptSegments(currentTranscript.timestamps).map((segment, index) => {
-            rows={16}
+                const speaker = segment.speaker ?? "Speaker";
-            className="mt-5 w-full resize-none rounded-xl border border-border bg-background/35 p-4 text-sm leading-relaxed outline-none transition focus:border-primary/60"
+                return (
-          />
+                  <div
                    key={`${segment.start}-${segment.end}-${index}`}
                    className="rounded-lg border border-border bg-secondary/20 p-3 text-sm"
                  >
                    <div className="mb-2 flex flex-wrap items-center gap-2">
                      <span
                        className={`inline-flex items-center gap-1.5 rounded-full border px-2.5 py-1 text-xs font-semibold ${speakerTone(
                          speaker,
                        )}`}
                      >
                        <UserRound className="h-3.5 w-3.5" />
                        {speaker}
                      </span>
                      <span className="inline-flex items-center gap-1.5 font-mono text-xs text-primary">
                        <Clock className="h-3.5 w-3.5" />
                        {formatStamp(segment.start)} - {formatStamp(segment.end)}
                      </span>
                    </div>
                    <p className="leading-relaxed text-muted-foreground">{segment.text}</p>
                  </div>
                );
              })}
            </div>
          ) : (
            <textarea
              value={transcript}
              onChange={(event) => setTranscript(event.target.value)}
              rows={16}
              className="mt-5 w-full resize-none rounded-xl border border-border bg-background/35 p-4 text-sm leading-relaxed outline-none transition focus:border-primary/60"
            />
          )}
        </section>
      )}
@ -506,3 +538,43 @@ function RecordPage() {
 function formatTime(seconds: number) {
  return `${String(Math.floor(seconds / 60)).padStart(2, "0")}:${String(seconds % 60).padStart(2, "0")}`;
 }
 function speakerTone(speaker: string) {
  const tones = [
    "border-primary/25 bg-primary/15 text-primary",
    "border-emerald-400/25 bg-emerald-400/10 text-emerald-300",
    "border-sky-400/25 bg-sky-400/10 text-sky-300",
    "border-amber-400/25 bg-amber-400/10 text-amber-300",
    "border-rose-400/25 bg-rose-400/10 text-rose-300",
  ];
  const number = Number(speaker.match(/\d+/)?.[0] ?? 1);
  return tones[(Math.max(number, 1) - 1) % tones.length];
 }
 function groupTranscriptSegments(segments: TranscriptSegment[]) {
  return segments.reduce<TranscriptSegment[]>((groups, segment) => {
    const previous = groups.at(-1);
    const speaker = segment.speaker ?? "";
    if (previous && (previous.speaker ?? "") === speaker) {
      previous.end = segment.end;
      previous.text = `${previous.text} ${segment.text}`.trim();
      return groups;
    }
    groups.push({ ...segment });
    return groups;
  }, []);
 }
 function formatStamp(seconds: number) {
  const safeSeconds = Number.isFinite(seconds) ? Math.max(seconds, 0) : 0;
  const totalSeconds = Math.floor(safeSeconds);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const secs = totalSeconds % 60;
  if (hours > 0) {
    return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(2, "0")}:${String(
      secs,
    ).padStart(2, "0")}`;
  }
  return `${String(minutes).padStart(2, "0")}:${String(secs).padStart(2, "0")}`;
 }
--- a/frontend/src/routes/_authenticated/transcripts.$id.tsx
+++ b/frontend/src/routes/_authenticated/transcripts.$id.tsx
@ -1,11 +1,22 @@
 import { createFileRoute, Link, useNavigate } from "@tanstack/react-router";
 import { useQuery, useQueryClient } from "@tanstack/react-query";
 import { useEffect, useRef, useState } from "react";
-import { ArrowLeft, Copy, Download, Loader2, Save, Send, Sparkles, Trash2 } from "lucide-react";
+import {
  ArrowLeft,
  Clock,
  Copy,
  Download,
  Loader2,
  Save,
  Send,
  Sparkles,
  Trash2,
  UserRound,
 } from "lucide-react";
 import { toast } from "sonner";
 import { transcriptService } from "@/services/transcripts";
 import { useAuth } from "@/context/auth";
-import type { Transcript } from "@/services/types";
+import type { Transcript, TranscriptSegment } from "@/services/types";
 import { AudioPlayer } from "@/components/audio-player";
 import { ConfirmDialog } from "@/components/confirm-dialog";
 import { SendTranscriptDialog } from "@/components/send-transcript-dialog";
@ -104,7 +115,7 @@ function TranscriptDetailPage() {
  }
  async function copy() {
-    await navigator.clipboard.writeText(text);
+    await navigator.clipboard.writeText(formatTranscriptForClipboard(transcript));
    toast.success("Transcript copied");
  }
@ -277,34 +288,19 @@ function TranscriptDetailPage() {
            </button>
          )}
        </div>
-        <textarea
+        {transcript.timestamps.length > 0 ? (
-          value={text}
+          <SpeakerTranscript transcript={transcript} />
-          onChange={(event) => setText(event.target.value)}
+        ) : (
-          rows={18}
+          <textarea
-          readOnly={isReceived}
+            value={text}
-          className="w-full resize-none bg-transparent text-sm leading-relaxed outline-none read-only:text-muted-foreground"
+            onChange={(event) => setText(event.target.value)}
-        />
+            rows={18}
            readOnly={isReceived}
            className="w-full resize-none bg-transparent text-sm leading-relaxed outline-none read-only:text-muted-foreground"
          />
        )}
      </div>
      {transcript.timestamps.length > 0 && (
        <div className="glass rounded-2xl p-6">
          <h2 className="font-display text-lg font-semibold">Timestamps</h2>
          <div className="mt-4 max-h-72 space-y-2 overflow-y-auto">
            {transcript.timestamps.map((segment, index) => (
              <div
                key={`${segment.start}-${index}`}
                className="grid gap-3 rounded-xl border border-border bg-secondary/20 p-3 text-sm md:grid-cols-[120px_1fr]"
              >
                <span className="font-mono text-xs text-primary">
                  {formatStamp(segment.start)} - {formatStamp(segment.end)}
                </span>
                <span className="text-muted-foreground">{segment.text}</span>
              </div>
            ))}
          </div>
        </div>
      )}
      {showSend && (
        <SendTranscriptDialog sending={sending} onClose={() => setShowSend(false)} onSend={send} />
      )}
@ -322,8 +318,105 @@ function TranscriptDetailPage() {
  );
 }
-function formatStamp(seconds: number) {
+function SpeakerTranscript({ transcript }: { transcript: Transcript }) {
-  const minutes = Math.floor(seconds / 60);
+  const hasSpeakers = transcript.timestamps.some((segment) => segment.speaker);
-  const secs = Math.floor(seconds % 60);
+  const groupedSegments = groupTranscriptSegments(transcript.timestamps);
-  return `${minutes}:${String(secs).padStart(2, "0")}`;
+
  return (
    <div>
      <div className="flex flex-wrap items-center justify-between gap-3">
        <h2 className="font-display text-lg font-semibold">
          {hasSpeakers ? "Speaker transcript" : "Timestamps"}
        </h2>
        {hasSpeakers && (
          <span className="rounded-full border border-primary/20 bg-primary/10 px-3 py-1 text-xs font-medium text-primary">
            {speakerCount(transcript)} {speakerCount(transcript) === 1 ? "speaker" : "speakers"}
          </span>
        )}
      </div>
      <div className="mt-4 space-y-3">
        {groupedSegments.map((segment, index) => {
          const speaker = segment.speaker ?? "Speaker";
          return (
            <div
              key={`${segment.start}-${segment.end}-${index}`}
              className="rounded-xl border border-border bg-secondary/20 p-4 text-sm"
            >
              <div className="mb-2 flex flex-wrap items-center gap-2">
                {segment.speaker && (
                  <span
                    className={`inline-flex items-center gap-1.5 rounded-full border px-2.5 py-1 text-xs font-semibold ${speakerTone(
                      speaker,
                    )}`}
                  >
                    <UserRound className="h-3.5 w-3.5" />
                    {speaker}
                  </span>
                )}
                <span className="inline-flex items-center gap-1.5 font-mono text-xs text-primary">
                  <Clock className="h-3.5 w-3.5" />
                  {formatStamp(segment.start)} - {formatStamp(segment.end)}
                </span>
              </div>
              <p className="leading-relaxed text-muted-foreground">{segment.text}</p>
            </div>
          );
        })}
      </div>
    </div>
  );
 }
 function formatTranscriptForClipboard(transcript: Transcript | null) {
  if (!transcript?.timestamps.length) return transcript?.transcriptText ?? "";
  return groupTranscriptSegments(transcript.timestamps)
    .map((segment) => {
      const speaker = segment.speaker ? `${segment.speaker}\n` : "";
      return `${speaker}${formatStamp(segment.start)} - ${formatStamp(segment.end)}\n${segment.text}`;
    })
    .join("\n\n");
 }
 function speakerCount(transcript: Transcript) {
  return new Set(transcript.timestamps.map((segment) => segment.speaker).filter(Boolean)).size;
 }
 function groupTranscriptSegments(segments: TranscriptSegment[]) {
  return segments.reduce<TranscriptSegment[]>((groups, segment) => {
    const previous = groups.at(-1);
    const speaker = segment.speaker ?? "";
    if (previous && (previous.speaker ?? "") === speaker) {
      previous.end = segment.end;
      previous.text = `${previous.text} ${segment.text}`.trim();
      return groups;
    }
    groups.push({ ...segment });
    return groups;
  }, []);
 }
 function speakerTone(speaker: string) {
  const tones = [
    "border-primary/25 bg-primary/15 text-primary",
    "border-emerald-400/25 bg-emerald-400/10 text-emerald-300",
    "border-sky-400/25 bg-sky-400/10 text-sky-300",
    "border-amber-400/25 bg-amber-400/10 text-amber-300",
    "border-rose-400/25 bg-rose-400/10 text-rose-300",
  ];
  const number = Number(speaker.match(/\d+/)?.[0] ?? 1);
  return tones[(Math.max(number, 1) - 1) % tones.length];
 }
 function formatStamp(seconds: number) {
  const safeSeconds = Number.isFinite(seconds) ? Math.max(seconds, 0) : 0;
  const totalSeconds = Math.floor(safeSeconds);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const secs = totalSeconds % 60;
  if (hours > 0) {
    return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(2, "0")}:${String(
      secs,
    ).padStart(2, "0")}`;
  }
  return `${String(minutes).padStart(2, "0")}:${String(secs).padStart(2, "0")}`;
 }
--- a/frontend/src/services/types.ts
+++ b/frontend/src/services/types.ts
@ -10,6 +10,7 @@ export type User = {
 export type UserSummary = Pick<User, "id" | "fullName" | "username" | "email">;
 export type TranscriptSegment = {
  speaker?: string;
  start: number;
  end: number;
  text: string;
--- a/frontend/vite.config.ts
+++ b/frontend/vite.config.ts
@ -1,3 +1,7 @@
 import { execFileSync } from "node:child_process";
 import { existsSync, mkdirSync, readFileSync } from "node:fs";
 import { tmpdir } from "node:os";
 import path from "node:path";
 import { fileURLToPath, URL } from "node:url";
 import tailwindcss from "@tailwindcss/vite";
 import { tanstackRouter } from "@tanstack/router-plugin/vite";
@ -5,11 +9,49 @@ import viteReact from "@vitejs/plugin-react";
 import { defineConfig, loadEnv } from "vite";
 import tsConfigPaths from "vite-tsconfig-paths";
-export default defineConfig(({ mode }) => {
+function getDevHttpsOptions() {
  const certDir = path.join(tmpdir(), "orphion-vite-https");
  const keyPath = path.join(certDir, "localhost-key.pem");
  const certPath = path.join(certDir, "localhost-cert.pem");
  if (!existsSync(keyPath) || !existsSync(certPath)) {
    mkdirSync(certDir, { recursive: true });
    execFileSync(
      "openssl",
      [
        "req",
        "-x509",
        "-newkey",
        "rsa:2048",
        "-nodes",
        "-sha256",
        "-days",
        "365",
        "-subj",
        "/CN=localhost",
        "-addext",
        "subjectAltName=DNS:localhost,IP:127.0.0.1,IP:0.0.0.0",
        "-keyout",
        keyPath,
        "-out",
        certPath,
      ],
      { stdio: "ignore" },
    );
  }
  return {
    key: readFileSync(keyPath),
    cert: readFileSync(certPath),
  };
 }
 export default defineConfig(({ command, mode }) => {
  const viteEnv = loadEnv(mode, process.cwd(), "");
  const serviceHost = viteEnv.VITE_ORPHION_SERVICE_HOST || "127.0.0.1";
  const apiPort = viteEnv.VITE_API_PORT || "4000";
  const apiProxyTarget = viteEnv.VITE_API_PROXY_TARGET || `http://${serviceHost}:${apiPort}`;
  const https = command === "serve" ? getDevHttpsOptions() : undefined;
  return {
    plugins: [
@ -30,6 +72,7 @@ export default defineConfig(({ mode }) => {
      dedupe: ["react", "react-dom", "@tanstack/react-query", "@tanstack/query-core"],
    },
    server: {
      ...(https ? { https } : {}),
      proxy: {
        "/api": {
          target: apiProxyTarget,
@ -37,6 +80,7 @@ export default defineConfig(({ mode }) => {
        },
      },
    },
    ...(https ? { preview: { https } } : {}),
    build: {
      rollupOptions: {
        output: {
--- a/scripts/orphion-whisper.service
+++ b/scripts/orphion-whisper.service
@ -8,9 +8,15 @@ Type=simple
 User=cezen
 Group=cezen
 WorkingDirectory=/home/cezen/whisper
 EnvironmentFile=-/home/cezen/whisper/.env
 Environment=WHISPER_MODEL=large-v3
 Environment=WHISPER_MODEL_DIR=/home/cezen/whisper/models
 Environment=WHISPER_FILE_FIELD=file
 Environment=WHISPERX_DIARIZATION=true
 Environment=WHISPERX_DEVICE=cuda
 Environment=WHISPERX_COMPUTE_TYPE=float16
 Environment=WHISPERX_BATCH_SIZE=8
 Environment=WHISPERX_DIARIZATION_MODEL=pyannote/speaker-diarization-community-1
 ExecStart=/home/cezen/whisper/venv/bin/python /home/cezen/whisper/server.py --host 0.0.0.0 --port 8000
 Restart=always
 RestartSec=5
--- a/scripts/whisper_http_server.py
+++ b/scripts/whisper_http_server.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Tiny Faster-Whisper HTTP API for Orphion.
+"""Tiny Faster-Whisper/WhisperX HTTP API for Orphion.
 Endpoints:
  GET /health
@ -10,6 +10,8 @@ from __future__ import annotations
 import argparse
 import cgi
 import gc
 import inspect
 import json
 import os
 import tempfile
@ -21,6 +23,21 @@ from pathlib import Path
 from faster_whisper import WhisperModel
 try:
    import torch
 except Exception:  # pragma: no cover - torch is optional for CPU fallback
    torch = None
 try:
    import whisperx
 except Exception:  # pragma: no cover - keeps existing Faster-Whisper path available
    whisperx = None
 try:
    import whisperx.diarize as whisperx_diarize
 except Exception:  # pragma: no cover - older/newer WhisperX layouts vary
    whisperx_diarize = None
 try:
    import ctranslate2
 except Exception:  # pragma: no cover - only used for runtime device selection
@ -29,6 +46,11 @@ except Exception:  # pragma: no cover - only used for runtime device selection
 MODEL_LOCK = threading.Lock()
 MODEL = None
 WHISPERX_LOCK = threading.Lock()
 INFERENCE_LOCK = threading.Lock()
 WHISPERX_MODEL = None
 ALIGN_MODELS = {}
 DIARIZATION_PIPELINE = None
 def choose_device() -> str:
@ -50,7 +72,35 @@ def default_compute_type(device: str) -> str:
    configured = os.getenv("WHISPER_COMPUTE_TYPE", "").strip()
    if configured:
        return configured
-    return "float16" if device == "cuda" else "int8"
+    return "float16" if device.startswith("cuda") else "int8"
 def diarization_enabled() -> bool:
    return os.getenv("WHISPERX_DIARIZATION", "true").lower() in {"1", "true", "yes", "on"}
 def whisperx_device() -> str:
    configured = os.getenv("WHISPERX_DEVICE", "").strip()
    if configured:
        return configured
    selected = choose_device()
    return "cuda" if selected == "cuda" else selected
 def whisperx_compute_type(device: str) -> str:
    configured = os.getenv("WHISPERX_COMPUTE_TYPE", "").strip()
    if configured:
        return configured
    return "float16" if device.startswith("cuda") else default_compute_type(device)
 def huggingface_token() -> str | None:
    return (
        os.getenv("HUGGINGFACE_TOKEN")
        or os.getenv("HF_TOKEN")
        or os.getenv("PYANNOTE_AUTH_TOKEN")
        or None
    )
 def get_model() -> WhisperModel:
@ -71,6 +121,196 @@ def get_model() -> WhisperModel:
    return MODEL
 def get_whisperx_model():
    global WHISPERX_MODEL
    if WHISPERX_MODEL is not None:
        return WHISPERX_MODEL
    if whisperx is None:
        raise RuntimeError("whisperx is not installed")
    with WHISPERX_LOCK:
        if WHISPERX_MODEL is None:
            device = whisperx_device()
            model_name = os.getenv("WHISPERX_MODEL", os.getenv("WHISPER_MODEL", "large-v3"))
            kwargs = {
                "compute_type": whisperx_compute_type(device),
                "download_root": os.getenv("WHISPER_MODEL_DIR") or None,
            }
            try:
                WHISPERX_MODEL = whisperx.load_model(model_name, device, **kwargs)
            except TypeError:
                kwargs.pop("download_root", None)
                WHISPERX_MODEL = whisperx.load_model(model_name, device, **kwargs)
    return WHISPERX_MODEL
 def get_align_model(language_code: str | None, device: str):
    if whisperx is None:
        raise RuntimeError("whisperx is not installed")
    language = language_code or os.getenv("WHISPERX_ALIGN_LANGUAGE", "en")
    key = (language, device)
    if key not in ALIGN_MODELS:
        ALIGN_MODELS[key] = whisperx.load_align_model(language_code=language, device=device)
    return ALIGN_MODELS[key]
 def get_diarization_pipeline(device: str):
    global DIARIZATION_PIPELINE
    if whisperx is None:
        raise RuntimeError("whisperx is not installed")
    if DIARIZATION_PIPELINE is not None:
        return DIARIZATION_PIPELINE
    token = huggingface_token()
    if not token:
        raise RuntimeError("HuggingFace token is required for speaker diarization")
    with WHISPERX_LOCK:
        if DIARIZATION_PIPELINE is None:
            pipeline_factory = getattr(whisperx, "DiarizationPipeline", None)
            if pipeline_factory is None and whisperx_diarize is not None:
                pipeline_factory = getattr(whisperx_diarize, "DiarizationPipeline", None)
            if pipeline_factory is None:
                raise RuntimeError("WhisperX diarization pipeline is not available")
            parameters = inspect.signature(pipeline_factory).parameters
            if "use_auth_token" in parameters:
                DIARIZATION_PIPELINE = pipeline_factory(use_auth_token=token, device=device)
            elif "auth_token" in parameters:
                DIARIZATION_PIPELINE = pipeline_factory(auth_token=token, device=device)
            elif "token" in parameters:
                DIARIZATION_PIPELINE = pipeline_factory(
                    model_name=os.getenv("WHISPERX_DIARIZATION_MODEL") or None,
                    token=token,
                    device=device,
                )
            else:
                DIARIZATION_PIPELINE = pipeline_factory(device=device)
    return DIARIZATION_PIPELINE
 def cleanup_gpu_memory() -> None:
    gc.collect()
    if torch is not None and getattr(torch, "cuda", None) is not None:
        try:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception:
            pass
 def normalize_speaker(raw: str | None, mapping: dict[str, str]) -> str:
    if not raw:
        return "Speaker 1"
    key = str(raw)
    if key not in mapping:
        mapping[key] = f"Speaker {len(mapping) + 1}"
    return mapping[key]
 def segment_speaker(segment: dict) -> str | None:
    if segment.get("speaker"):
        return segment.get("speaker")
    counts: dict[str, int] = {}
    for word in segment.get("words") or []:
        speaker = word.get("speaker")
        if speaker:
            counts[speaker] = counts.get(speaker, 0) + 1
    if not counts:
        return None
    return max(counts.items(), key=lambda item: item[1])[0]
 def normalize_segments(segments: list[dict], include_speakers: bool = True) -> list[dict]:
    speaker_map: dict[str, str] = {}
    normalized = []
    for segment in segments:
        text = str(segment.get("text") or "").strip()
        if not text:
            continue
        item = {
            "start": round(float(segment.get("start") or 0), 3),
            "end": round(float(segment.get("end") or 0), 3),
            "text": text,
        }
        if include_speakers:
            item["speaker"] = normalize_speaker(segment_speaker(segment), speaker_map)
        normalized.append(item)
    return normalized
 def transcribe_with_whisperx(audio_path: str) -> dict:
    device = whisperx_device()
    model = get_whisperx_model()
    audio = whisperx.load_audio(audio_path)
    batch_size = int(os.getenv("WHISPERX_BATCH_SIZE", os.getenv("WHISPER_BATCH_SIZE", "8")))
    result = model.transcribe(audio, batch_size=batch_size)
    language = result.get("language")
    try:
        align_model, metadata = get_align_model(language, device)
        result = whisperx.align(
            result.get("segments", []),
            align_model,
            metadata,
            audio,
            device,
            return_char_alignments=False,
        )
    except Exception:
        traceback.print_exc()
    diarization_failed = False
    try:
        diarize_model = get_diarization_pipeline(device)
        min_speakers = os.getenv("WHISPERX_MIN_SPEAKERS")
        max_speakers = os.getenv("WHISPERX_MAX_SPEAKERS")
        kwargs = {}
        if min_speakers:
            kwargs["min_speakers"] = int(min_speakers)
        if max_speakers:
            kwargs["max_speakers"] = int(max_speakers)
        diarize_segments = diarize_model(audio, **kwargs)
        result = whisperx.assign_word_speakers(diarize_segments, result)
    except Exception:
        diarization_failed = True
        traceback.print_exc()
    timestamps = normalize_segments(result.get("segments", []), include_speakers=not diarization_failed)
    transcript_text = " ".join(segment["text"] for segment in timestamps if segment.get("text"))
    return {
        "transcript_text": transcript_text,
        "language": language,
        "duration": float(audio.shape[0]) / 16000 if hasattr(audio, "shape") else None,
        "timestamps": timestamps,
        "diarization": "fallback" if diarization_failed else "completed",
    }
 def transcribe_with_faster_whisper(audio_path: str) -> dict:
    model = get_model()
    segments, info = model.transcribe(
        audio_path,
        beam_size=int(os.getenv("WHISPER_BEAM_SIZE", "5")),
        vad_filter=os.getenv("WHISPER_VAD_FILTER", "true").lower() in {"1", "true", "yes", "on"},
    )
    timestamps = []
    transcript_parts = []
    for segment in segments:
        text = segment.text.strip()
        transcript_parts.append(text)
        timestamps.append({"start": segment.start, "end": segment.end, "text": text})
    return {
        "transcript_text": " ".join(part for part in transcript_parts if part),
        "language": getattr(info, "language", None),
        "duration": getattr(info, "duration", None),
        "timestamps": timestamps,
        "diarization": "disabled",
    }
 class WhisperHandler(BaseHTTPRequestHandler):
    server_version = "OrphionWhisper/1.0"
@ -84,6 +324,11 @@ class WhisperHandler(BaseHTTPRequestHandler):
                "status": "ok",
                "model": os.getenv("WHISPER_MODEL", "large-v3"),
                "device": choose_device(),
                "whisperx": whisperx is not None,
                "diarization": diarization_enabled(),
                "diarization_ready": whisperx is not None
                and diarization_enabled()
                and bool(huggingface_token()),
            },
        )
@ -129,28 +374,20 @@ class WhisperHandler(BaseHTTPRequestHandler):
                temp_file.write(chunk)
        try:
-            model = get_model()
+            with INFERENCE_LOCK:
-            segments, info = model.transcribe(
+                try:
-                temp_path,
+                    if diarization_enabled():
-                beam_size=int(os.getenv("WHISPER_BEAM_SIZE", "5")),
+                        payload = transcribe_with_whisperx(temp_path)
-                vad_filter=os.getenv("WHISPER_VAD_FILTER", "true").lower() in {"1", "true", "yes", "on"},
+                    else:
-            )
+                        payload = transcribe_with_faster_whisper(temp_path)
-            timestamps = []
+                except Exception:
-            transcript_parts = []
+                    traceback.print_exc()
-            for segment in segments:
+                    payload = transcribe_with_faster_whisper(temp_path)
-                text = segment.text.strip()
+                    payload["diarization"] = "fallback"
                transcript_parts.append(text)
                timestamps.append({"start": segment.start, "end": segment.end, "text": text})
-            self.send_json(
+            self.send_json(payload)
                {
                    "transcript_text": " ".join(part for part in transcript_parts if part),
                    "language": getattr(info, "language", None),
                    "duration": getattr(info, "duration", None),
                    "timestamps": timestamps,
                },
            )
        finally:
            cleanup_gpu_memory()
            try:
                os.unlink(temp_path)
            except OSError: