Mom-Portal/server/whisper.ts
2026-05-14 15:00:56 +05:30

357 lines
9.2 KiB
TypeScript

import { randomUUID } from 'node:crypto';
import { createReadStream } from 'node:fs';
import { extname, posix } from 'node:path';
import { Client } from 'ssh2';
import type { ClientChannel, ConnectConfig, SFTPWrapper } from 'ssh2';
/**
 * Settings for one remote transcription run, assembled by `getConfig` from
 * environment variables.
 */
interface WhisperConfig {
  // --- SSH connection -----------------------------------------------------
  /** Host name or address of the remote machine. */
  host: string;
  /** SSH port on the remote machine. */
  port: number;
  /** SSH login name. */
  username: string;
  /** SSH password for `username`. */
  password: string;
  /** Passed to ssh2 as `readyTimeout`, in milliseconds. */
  sshReadyTimeoutMs: number;

  // --- Remote directories -------------------------------------------------
  /** Remote directory that receives the uploaded source file and the converted WAV. */
  audioDir: string;
  /** Remote directory the transcription command writes its `.txt` output to. */
  transcriptDir: string;

  // --- Remote tools and their environment ---------------------------------
  /** Activation script that is sourced when it exists on the remote machine. */
  envActivatePath: string;
  /** Conda environment name used when the activation script is absent. */
  envName: string;
  /** Command used to convert the upload to WAV; inserted into the shell line unquoted. */
  ffmpegCommand: string;
  /** Transcription executable to invoke. */
  command: string;
  /** Value passed to the transcription command as `--model`. */
  model: string;
  /** Value passed to the transcription command as `--language`. */
  language: string;

  // --- Limits ---------------------------------------------------------------
  /** Time limit, in milliseconds, applied to the conversion and to the transcription command. */
  timeoutMs: number;
}
/** Describes the audio file handed to `transcribeOnWhisperVm`. */
interface TranscriptionInput {
/** Path of the local file that is uploaded to the remote machine. */
localFilePath: string;
/**
 * Name the file was originally given. Its extension is reused for the remote
 * source file and a sanitized form of its base name becomes part of the
 * remote file names.
 */
originalName: string;
}
/** Result returned by `transcribeOnWhisperVm` after a successful run. */
interface TranscriptionOutput {
/** Remote path of the WAV file produced by the conversion step. */
remoteAudioPath: string;
/** Remote path of the `.txt` file the transcript was read from. */
remoteTranscriptPath: string;
/** Transcript text with surrounding whitespace trimmed; never empty. */
transcript: string;
}
/**
 * Reads an environment variable that the service cannot run without.
 *
 * @param name - Name of the environment variable.
 * @param fallback - Value to use when the variable is unset or empty.
 * @returns The variable's value, or `fallback` when the variable is unset or empty.
 * @throws Error when neither the variable nor the fallback provides a non-empty value.
 */
const requiredEnv = (name: string, fallback?: string): string => {
  const raw = process.env[name];
  // An empty assignment (e.g. `NAME=` in a .env file) counts as "unset" so it
  // cannot shadow a usable fallback; `??` alone would let '' through and throw.
  const value = raw === undefined || raw === '' ? fallback : raw;
  if (!value) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
};
/**
 * Builds the transcription configuration from environment variables.
 *
 * Every setting except `WHISPER_VM_PASSWORD` has a built-in default. Numeric
 * settings are validated so that a typo cannot silently become `NaN` (which
 * would, for example, make a timeout fire almost immediately).
 *
 * @returns The resolved configuration.
 * @throws Error when a required variable is missing or a numeric variable is
 *   not a positive, finite number.
 */
const getConfig = (): WhisperConfig => {
  // Parses a numeric variable; unset or blank values use the default.
  const positiveNumberEnv = (name: string, fallback: number): number => {
    const raw = process.env[name];
    if (raw === undefined || raw.trim() === '') {
      return fallback;
    }
    const parsed = Number(raw);
    if (!Number.isFinite(parsed) || parsed <= 0) {
      throw new Error(
        `Environment variable ${name} must be a positive number, received "${raw}".`
      );
    }
    return parsed;
  };

  return {
    audioDir: requiredEnv('WHISPER_VM_AUDIO_DIR', '/home/kevin/mom_audio'),
    command: requiredEnv('WHISPER_COMMAND', 'whisper'),
    envActivatePath: requiredEnv(
      'WHISPER_ENV_ACTIVATE',
      '/home/kevin/whisper-env/bin/activate'
    ),
    envName: requiredEnv('WHISPER_ENV_NAME', 'whisper-env'),
    ffmpegCommand: requiredEnv('FFMPEG_COMMAND', 'ffmpeg'),
    host: requiredEnv('WHISPER_VM_HOST', '172.16.10.51'),
    language: requiredEnv('WHISPER_LANGUAGE', 'English'),
    model: requiredEnv('WHISPER_MODEL', 'medium'),
    // No default: the password must always come from the environment.
    password: requiredEnv('WHISPER_VM_PASSWORD'),
    port: positiveNumberEnv('WHISPER_VM_PORT', 22),
    sshReadyTimeoutMs: positiveNumberEnv('WHISPER_SSH_READY_TIMEOUT_MS', 60_000),
    timeoutMs: positiveNumberEnv('WHISPER_TIMEOUT_MS', 1_800_000),
    transcriptDir: requiredEnv(
      'WHISPER_VM_TRANSCRIPT_DIR',
      '/home/kevin/mom_transcripts'
    ),
    username: requiredEnv('WHISPER_VM_USER', 'kevin')
  };
};
/**
 * Removes trailing slashes from a remote (POSIX) directory path.
 *
 * A path made up solely of slashes is the filesystem root and is returned as
 * `/`; reducing it to an empty string would turn later joins into relative
 * paths.
 *
 * @param dir - Remote directory path.
 * @returns The path without trailing slashes, or `/` for the root directory.
 */
const trimRemoteDir = (dir: string): string => {
  const trimmed = dir.replace(/\/+$/, '');
  return trimmed === '' && dir.startsWith('/') ? '/' : trimmed;
};
/**
 * Wraps a value in single quotes so a POSIX shell treats it as one literal
 * word. Each embedded single quote is rewritten as `'\''` (close the quote,
 * emit an escaped quote, reopen the quote).
 */
const shellQuote = (value: string) => {
  const escaped = value.split("'").join("'\\''");
  return "'" + escaped + "'";
};
/**
 * Wraps `command` in a `bash -lc` invocation that first activates the
 * transcription environment.
 *
 * The generated script sources `config.envActivatePath` when that file exists,
 * otherwise activates the conda environment `config.envName` when `conda` is
 * on the PATH, and otherwise prints an error and exits with status 127.
 * `set -e` makes the script stop at the first failing step.
 *
 * @param config - Supplies the activation script path and conda environment name.
 * @param command - Shell command to run once the environment is active.
 * @returns A single shell command line ready to be executed remotely.
 */
const runInsideWhisperEnv = (config: WhisperConfig, command: string) => {
  const activateScript = shellQuote(config.envActivatePath);
  const condaEnvironment = shellQuote(config.envName);
  const activationFailure = shellQuote(
    'Unable to activate the transcription environment. Check the remote activation path.'
  );

  const lines: string[] = [];
  lines.push('set -e');
  lines.push(`if [ -f ${activateScript} ]; then`);
  lines.push(` . ${activateScript}`);
  lines.push('elif command -v conda >/dev/null 2>&1; then');
  lines.push(' eval "$(conda shell.bash hook)"');
  lines.push(` conda activate ${condaEnvironment}`);
  lines.push('else');
  lines.push(` echo ${activationFailure} >&2`);
  lines.push(' exit 127');
  lines.push('fi');
  lines.push(command);

  return 'bash -lc ' + shellQuote(lines.join('\n'));
};
/**
 * Derives a file-system-safe base name (no extension) from an arbitrary file name.
 *
 * The last extension is dropped, every run of characters outside
 * `[a-zA-Z0-9._-]` becomes a single `-`, leading and trailing dashes are
 * removed, and the result is capped at 80 characters.
 *
 * @param fileName - Original file name.
 * @returns The sanitized base name, or `meeting-audio` when nothing usable remains.
 */
const safeBaseName = (fileName: string): string => {
  const withoutExtension = fileName.replace(/\.[^.]+$/, '');
  const safe = withoutExtension
    .replace(/[^a-zA-Z0-9._-]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80)
    // Truncation can cut right after a separator; trim again so the name
    // never ends with a dangling dash.
    .replace(/-+$/, '');
  return safe || 'meeting-audio';
};
/**
 * Opens an SSH connection to the remote machine using password authentication.
 *
 * @param config - Supplies host, port, credentials and the ready timeout.
 * @returns A promise that resolves with the connected client once it is ready,
 *   or rejects with a descriptive error when the client reports an error first.
 */
const connectSsh = (config: WhisperConfig) => {
  const { host, password, port, sshReadyTimeoutMs, username } = config;
  const options: ConnectConfig = {
    host,
    keepaliveInterval: 15_000,
    password,
    port,
    readyTimeout: sshReadyTimeoutMs,
    username
  };

  return new Promise<Client>((resolve, reject) => {
    const ssh = new Client();

    ssh.on('ready', () => {
      resolve(ssh);
    });

    ssh.on('error', (cause) => {
      let details = 'Unknown SSH connection error.';
      if (cause instanceof Error) {
        details = cause.message;
      }
      const summary = `Unable to connect to the transcription service at ${host}:${port}. `;
      const advice = `Check that the VM is powered on, reachable from this machine, and accepting SSH. Details: ${details}`;
      reject(new Error(summary + advice));
    });

    ssh.connect(options);
  });
};
/**
 * Promise wrapper around `client.sftp`.
 *
 * @param client - Connected SSH client.
 * @returns A promise that resolves with the SFTP session or rejects with the
 *   error reported by the client.
 */
const getSftp = (client: Client) =>
  new Promise<SFTPWrapper>((resolve, reject) => {
    client.sftp((failure, session) => {
      if (!failure) {
        resolve(session);
      } else {
        reject(failure);
      }
    });
  });
/**
 * Runs a shell command on the remote machine and collects its output.
 *
 * @param client - Connected SSH client.
 * @param command - Command line to execute remotely.
 * @param timeoutMs - Maximum time to wait before closing the channel and failing.
 * @returns A promise that resolves with stdout followed by stderr (trimmed)
 *   when the command exits with status 0.
 * @throws (rejects) when the command cannot be started, exits with a non-zero
 *   status, is terminated by a signal, or exceeds `timeoutMs`. The error
 *   message is the command's stderr, else its stdout, else a generic message.
 */
const execCommand = (client: Client, command: string, timeoutMs: number) =>
  new Promise<string>((resolve, reject) => {
    let stream: ClientChannel | null = null;
    // Raw chunks are kept and decoded once at the end: decoding chunk by chunk
    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];
    let settled = false;

    const toBuffer = (chunk: Buffer | string) =>
      typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk;
    const decode = (chunks: Buffer[]) => Buffer.concat(chunks).toString('utf8');

    const finish = (error?: Error, output?: string) => {
      if (settled) return;
      settled = true;
      clearTimeout(timeout);
      if (error) {
        reject(error);
        return;
      }
      resolve(output ?? '');
    };

    const timeout = setTimeout(() => {
      stream?.close();
      finish(new Error('Transcription timed out on the processing machine.'));
    }, timeoutMs);

    client.exec(command, (error, commandStream) => {
      if (error) {
        finish(error);
        return;
      }
      if (settled) {
        // The timeout already fired before the channel opened; do not leave
        // the late channel dangling.
        commandStream.close();
        return;
      }
      stream = commandStream;

      commandStream.on('data', (chunk: Buffer | string) => {
        stdoutChunks.push(toBuffer(chunk));
      });
      commandStream.stderr.on('data', (chunk: Buffer | string) => {
        stderrChunks.push(toBuffer(chunk));
      });

      commandStream.on('close', (code: number | null, signal?: string | null) => {
        const stdout = decode(stdoutChunks);
        const stderr = decode(stderrChunks);

        // A process killed by a signal reports no exit code; that is a
        // failure, not a success.
        const exitedWithError = typeof code === 'number' && code !== 0;
        if (exitedWithError || signal) {
          const fallbackMessage = exitedWithError
            ? `Transcription failed with exit code ${code}.`
            : `Transcription was terminated by signal ${signal}.`;
          finish(new Error(stderr.trim() || stdout.trim() || fallbackMessage));
          return;
        }
        finish(undefined, `${stdout}${stderr}`.trim());
      });
    });
  });
/**
 * Streams a local file to the remote machine over SFTP.
 *
 * The remote file is created (or truncated) with mode 0640.
 *
 * @param sftp - Open SFTP session.
 * @param localFilePath - Path of the local file to send.
 * @param remoteFilePath - Destination path on the remote machine.
 * @returns A promise that resolves when the remote write stream has closed,
 *   or rejects with the first error raised by either stream.
 */
const uploadFile = (
  sftp: SFTPWrapper,
  localFilePath: string,
  remoteFilePath: string
) =>
  new Promise<void>((resolve, reject) => {
    let settled = false;
    const source = createReadStream(localFilePath);
    const target = sftp.createWriteStream(remoteFilePath, {
      flags: 'w',
      mode: 0o640
    });

    const finish = (error?: Error) => {
      if (settled) return;
      settled = true;
      if (error) {
        // pipe() does not tear down either side when one of them fails, so
        // release the local file descriptor and the remote handle explicitly.
        source.unpipe(target);
        source.destroy();
        target.destroy();
        reject(error);
        return;
      }
      resolve();
    };

    source.on('error', finish);
    target.on('error', finish);
    target.on('close', () => finish());
    source.pipe(target);
  });
/**
 * Reads a remote file over SFTP and returns its content as UTF-8 text.
 *
 * @param sftp - Open SFTP session.
 * @param remoteFilePath - Path of the file on the remote machine.
 * @returns A promise that resolves with the file content, trimmed of
 *   surrounding whitespace, once the read stream has closed; rejects with the
 *   stream's error otherwise.
 */
const readRemoteFile = (sftp: SFTPWrapper, remoteFilePath: string) =>
  new Promise<string>((resolve, reject) => {
    // Chunks are kept as bytes and decoded once: decoding each chunk on its
    // own corrupts multi-byte UTF-8 characters split across chunk boundaries.
    const chunks: Buffer[] = [];
    let settled = false;
    const source = sftp.createReadStream(remoteFilePath);

    const finish = (error?: Error) => {
      if (settled) return;
      settled = true;
      if (error) {
        reject(error);
        return;
      }
      resolve(Buffer.concat(chunks).toString('utf8').trim());
    };

    source.on('data', (chunk: Buffer | string) => {
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk);
    });
    source.on('error', finish);
    source.on('close', () => finish());
  });
/**
 * Best-effort removal of a remote file.
 *
 * The outcome of the unlink is deliberately ignored: the returned promise
 * always resolves, whether or not the file could be deleted.
 *
 * @param sftp - Open SFTP session.
 * @param remoteFilePath - Path of the file to remove on the remote machine.
 */
const deleteRemoteFile = (sftp: SFTPWrapper, remoteFilePath: string) =>
  new Promise<void>((done) => {
    const ignoreOutcome = () => {
      done();
    };
    sftp.unlink(remoteFilePath, ignoreOutcome);
  });
/**
 * Transcribes a local audio file on the remote machine.
 *
 * Steps, in order: connect over SSH, create the remote audio and transcript
 * directories, upload the file, convert it to 16 kHz mono 16-bit PCM WAV,
 * run the transcription command inside its environment, and read the
 * resulting `.txt` file back.
 *
 * Whether or not the run succeeds, the uploaded source file is removed
 * (best effort) and the SFTP session and SSH connection are closed. The
 * converted WAV and the transcript file are left on the remote machine.
 *
 * @param input - Local file path and the file's original name.
 * @returns Remote paths of the WAV and transcript files plus the transcript text.
 * @throws Error when configuration is incomplete, the connection or any remote
 *   step fails, or the transcript is empty.
 */
export const transcribeOnWhisperVm = async ({
  localFilePath,
  originalName
}: TranscriptionInput): Promise<TranscriptionOutput> => {
  const config = getConfig();
  const audioDir = trimRemoteDir(config.audioDir);
  const transcriptDir = trimRemoteDir(config.transcriptDir);

  // Timestamp plus UUID keeps remote names unique across uploads.
  const extension = extname(originalName).toLowerCase() || '.webm';
  const remoteBaseName = `${Date.now()}-${randomUUID()}-${safeBaseName(
    originalName
  )}`;
  const remotePath = (dir: string, suffix: string) =>
    posix.join(dir, `${remoteBaseName}${suffix}`);
  const remoteSourcePath = remotePath(audioDir, `.source${extension}`);
  const remoteAudioPath = remotePath(audioDir, '.wav');
  const remoteTranscriptPath = remotePath(transcriptDir, '.txt');

  const prepareDirsCommand = `mkdir -p ${shellQuote(audioDir)} ${shellQuote(
    transcriptDir
  )}`;

  // Browser recordings are commonly WebM/Opus; the remote job is more reliable with WAV.
  const convertArgs = [
    config.ffmpegCommand,
    '-y',
    '-hide_banner',
    '-loglevel',
    'error',
    '-i',
    shellQuote(remoteSourcePath),
    '-vn',
    '-ac',
    '1',
    '-ar',
    '16000',
    '-c:a',
    'pcm_s16le',
    shellQuote(remoteAudioPath)
  ];
  const convertCommand = convertArgs.join(' ');

  // Transcription runs remotely so the Node process stays lightweight.
  const transcribeArgs = [
    shellQuote(config.command),
    shellQuote(remoteAudioPath),
    '--model',
    shellQuote(config.model),
    '--language',
    shellQuote(config.language),
    '--task',
    'transcribe',
    '--output_dir',
    shellQuote(transcriptDir),
    '--output_format',
    'txt',
    '--fp16',
    'False'
  ];
  const transcribeCommand = runInsideWhisperEnv(config, transcribeArgs.join(' '));

  const client = await connectSsh(config);
  let sftp: SFTPWrapper | null = null;

  try {
    await execCommand(client, prepareDirsCommand, 30_000);

    sftp = await getSftp(client);
    await uploadFile(sftp, localFilePath, remoteSourcePath);

    await execCommand(client, convertCommand, config.timeoutMs);
    await execCommand(client, transcribeCommand, config.timeoutMs);

    const transcript = await readRemoteFile(sftp, remoteTranscriptPath);
    if (transcript === '') {
      throw new Error('The transcription service returned an empty transcript.');
    }

    return { remoteAudioPath, remoteTranscriptPath, transcript };
  } finally {
    // Only the uploaded source is cleaned up; the session is closed afterwards.
    if (sftp !== null) {
      await deleteRemoteFile(sftp, remoteSourcePath);
      sftp.end();
    }
    client.end();
  }
};