diff --git a/backend/src/config/migrate.js b/backend/src/config/migrate.js index 46cc707..483fbca 100644 --- a/backend/src/config/migrate.js +++ b/backend/src/config/migrate.js @@ -6,6 +6,15 @@ import { env } from "./env.js"; import { pool, query } from "./database.js"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const requiredTables = [ + "users", + "refresh_tokens", + "audio_assets", + "transcripts", + "transcript_shares", + "audio_metadata", + "transcription_jobs", +]; function identifier(value) { return `\`${String(value).replaceAll("`", "``")}\``; @@ -38,12 +47,24 @@ export async function ensureDatabase() { } export async function ensureSchema() { - await ensureDatabase(); - const schema = await fs.readFile(path.join(__dirname, "schema.sql"), "utf8"); - for (const statement of splitSql(schema)) { - await pool.query(statement); + try { + await ensureDatabase(); + } catch (error) { + if (!isReadOnlyError(error)) throw error; + await ensureExistingSchema(); + return; + } + + const schema = await fs.readFile(path.join(__dirname, "schema.sql"), "utf8"); + try { + for (const statement of splitSql(schema)) { + await pool.query(statement); + } + await runMigrations(); + } catch (error) { + if (!isReadOnlyError(error)) throw error; + await ensureExistingSchema(); } - await runMigrations(); } export async function runMigrations() { @@ -85,6 +106,31 @@ export async function runMigrations() { } } +function isReadOnlyError(error) { + return ( + error?.code === "ER_OPTION_PREVENTS_STATEMENT" || + error?.errno === 1290 || + String(error?.message ?? 
"").includes("--read-only") + ); +} + +async function ensureExistingSchema() { + const rows = await query( + `SELECT TABLE_NAME AS tableName + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() + AND TABLE_NAME IN (${requiredTables.map((_, index) => `:table${index}`).join(", ")})`, + Object.fromEntries(requiredTables.map((table, index) => [`table${index}`, table])), + ); + const present = new Set(rows.map((row) => row.tableName)); + const missing = requiredTables.filter((table) => !present.has(table)); + if (missing.length > 0) { + throw new Error( + `Database is read-only and schema is incomplete. Missing tables: ${missing.join(", ")}`, + ); + } +} + if (import.meta.url === `file://${process.argv[1]}`) { ensureSchema() .then(() => { diff --git a/backend/src/middlewares/security.js b/backend/src/middlewares/security.js index 3f03aa9..076b934 100644 --- a/backend/src/middlewares/security.js +++ b/backend/src/middlewares/security.js @@ -9,7 +9,7 @@ export function securityMiddleware(app) { app.set("trust proxy", 1); } - const origins = env.clientOrigin.split(",").map((origin) => origin.trim()); + const origins = allowedOrigins(); app.use( helmet({ crossOriginResourcePolicy: { policy: "cross-origin" }, @@ -35,6 +35,33 @@ export function securityMiddleware(app) { ); } +function allowedOrigins() { + const configured = env.clientOrigin + .split(",") + .map((origin) => origin.trim()) + .filter(Boolean); + + if (env.isProduction) return configured; + + const expanded = new Set(configured); + for (const origin of configured) { + try { + const url = new URL(origin); + if (url.protocol === "http:") { + url.protocol = "https:"; + expanded.add(url.toString().replace(/\/$/, "")); + } else if (url.protocol === "https:") { + url.protocol = "http:"; + expanded.add(url.toString().replace(/\/$/, "")); + } + } catch { + // Ignore invalid origin entries; the CORS check below will reject them. 
+ } + } + + return [...expanded]; +} + export const authRateLimiter = rateLimit({ windowMs: env.rateLimit.windowMs, limit: env.rateLimit.authMax, diff --git a/backend/src/transcription/whisperClient.js b/backend/src/transcription/whisperClient.js index 71777cf..9bf5dd5 100644 --- a/backend/src/transcription/whisperClient.js +++ b/backend/src/transcription/whisperClient.js @@ -94,10 +94,19 @@ function normalizeTranscript(payload, elapsedMs) { throw new AppError("Whisper VM returned an empty transcript", 502, "EMPTY_TRANSCRIPT"); } + const timestamps = normalizeSegments( + payload.timestamps ?? + payload.segments ?? + payload.speaker_segments ?? + payload.result?.timestamps ?? + payload.result?.segments ?? + [], + ); + return { transcriptText, language: payload.language ?? payload.result?.language ?? null, - timestamps: payload.timestamps ?? payload.segments ?? payload.result?.timestamps ?? [], + timestamps, duration: numberOrNull(payload.duration ?? payload.result?.duration), processingTime: Number((elapsedMs / 1000).toFixed(3)), modelName: env.whisper.modelName, @@ -108,13 +117,38 @@ function mockTranscript(filename) { return { transcriptText: `Mock transcript for ${filename}. Configure WHISPER_API_URL to use Faster-Whisper Large v3.`, language: "en", - timestamps: [{ start: 0, end: 4, text: "Mock transcript generated for local development." }], + timestamps: [ + { + speaker: "Speaker 1", + start: 0, + end: 4, + text: "Mock transcript generated for local development.", + }, + ], duration: null, processingTime: 0.15, modelName: env.whisper.modelName, }; } +function normalizeSegments(segments) { + if (!Array.isArray(segments)) return []; + return segments + .map((segment) => { + const text = String(segment?.text ?? "").trim(); + const start = numberOrNull(segment?.start); + const end = numberOrNull(segment?.end); + if (!text || start === null || end === null) return null; + return { + ...(segment.speaker ? 
{ speaker: String(segment.speaker) } : {}), + start, + end, + text, + }; + }) + .filter(Boolean); +} + function numberOrNull(value) { const number = Number(value); return Number.isFinite(number) ? number : null; diff --git a/docs/WHISPER_VM.md b/docs/WHISPER_VM.md index 99df6fe..aa4c6f4 100644 --- a/docs/WHISPER_VM.md +++ b/docs/WHISPER_VM.md @@ -16,7 +16,8 @@ WHISPER_ALLOW_MOCK=false Expected endpoints: -- `GET /health` returns any 2xx status when the VM is ready. +- `GET /health` returns any 2xx status when the VM is ready. For WhisperX diarization, it should + also report `"whisperx": true` and `"diarization": true`. - `POST /transcribe` accepts multipart audio and returns one of: ```json @@ -24,9 +25,49 @@ Expected endpoints: "transcript_text": "Meeting transcript...", "language": "en", "duration": 123.45, - "timestamps": [{ "start": 0, "end": 5, "text": "Hello" }] + "timestamps": [{ "speaker": "Speaker 1", "start": 0, "end": 5, "text": "Hello" }] } ``` The API retries failed requests, applies `WHISPER_TIMEOUT_MS`, and marks jobs as failed when the VM is unavailable. + +## Enable WhisperX diarization on the VM + +The systemd unit runs `/home/cezen/whisper/server.py` inside the existing +`/home/cezen/whisper/venv`. 
Deploy the updated script without creating a new venv: + +```bash +scp scripts/whisper_http_server.py cezen@172.16.10.64:/home/cezen/whisper/server.py +scp scripts/orphion-whisper.service cezen@172.16.10.64:/tmp/orphion-whisper.service +ssh cezen@172.16.10.64 'sudo mv /tmp/orphion-whisper.service /etc/systemd/system/orphion-whisper.service' +``` + +Create `/home/cezen/whisper/.env` on the VM with the Hugging Face token accepted by pyannote: + +```env +HUGGINGFACE_TOKEN=your_token_here +WHISPERX_DIARIZATION=true +WHISPERX_DEVICE=cuda +WHISPERX_COMPUTE_TYPE=float16 +WHISPERX_BATCH_SIZE=8 +WHISPERX_DIARIZATION_MODEL=pyannote/speaker-diarization-community-1 +``` + +The Hugging Face account behind the token must be approved for the configured pyannote diarization +model. If transcription returns `"diarization": "fallback"` and the service log mentions a gated +repo, visit `https://huggingface.co/pyannote/speaker-diarization-community-1` while signed in to +that account and accept the model's access conditions, or request access if approval is required.
+ +Restart and verify: + +```bash +ssh cezen@172.16.10.64 'sudo systemctl daemon-reload && sudo systemctl restart orphion-whisper' +curl -sS http://172.16.10.64:8000/health +``` + +Expected health shape: + +```json +{ "status": "ok", "model": "large-v3", "device": "cuda", "whisperx": true, "diarization": true } +``` diff --git a/frontend/src/routes/_authenticated/record.tsx b/frontend/src/routes/_authenticated/record.tsx index 45f10cf..1c77e33 100644 --- a/frontend/src/routes/_authenticated/record.tsx +++ b/frontend/src/routes/_authenticated/record.tsx @@ -4,6 +4,7 @@ import { useEffect, useMemo, useRef, useState } from "react"; import { motion } from "framer-motion"; import { Check, + Clock, Loader2, Mic, Pause, @@ -14,11 +15,12 @@ import { Sparkles, Square, Trash2, + UserRound, } from "lucide-react"; import { toast } from "sonner"; import { transcribeAudio } from "@/services/audio"; import { transcriptService } from "@/services/transcripts"; -import type { Transcript } from "@/services/types"; +import type { Transcript, TranscriptSegment } from "@/services/types"; import { AudioPlayer } from "@/components/audio-player"; import { SendTranscriptDialog } from "@/components/send-transcript-dialog"; @@ -483,12 +485,42 @@ function RecordPage() { -