feat(stt): add /api/stt endpoint using OpenAI; mount route; add Transcribe button in Recorder UI; update plan

Ender 2025-10-23 22:15:09 +02:00
parent 4ad9c311a2
commit 498b49c474
4 changed files with 93 additions and 1 deletion
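
For reference, a minimal sketch of the client contract this commit introduces: POST `/api/stt` with `{ bucket?, key }` and read `transcript` from the JSON response. It mirrors the Recorder's `transcribe()` handler and `apps/api/src/stt.ts` below; the helper name and the placeholder key are illustrative only, and it assumes an environment with global `fetch` (browser or Node 18+).

```ts
// Illustrative helper (not part of the commit): calls the new STT endpoint.
async function transcribeUploadedAudio(key: string, bucket?: string): Promise<string> {
  const res = await fetch('/api/stt', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    // bucket may be omitted; the server then falls back to the S3_BUCKET env var
    body: JSON.stringify({ bucket, key }),
  });
  if (!res.ok) {
    throw new Error(`STT failed: ${res.status} ${await res.text()}`);
  }
  // Server responds with { success: true, transcript: "..." }
  const data = (await res.json()) as { success?: boolean; transcript?: string };
  return data.transcript ?? '';
}

// e.g. transcribeUploadedAudio('audio/recording-123.webm') — placeholder key
```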


@@ -68,7 +68,8 @@ Voice-first authoring tool for single-user Ghost blog. Capture audio, refine wit
 ## Upcoming Next Actions
 - [x] Backend endpoint for audio upload `/api/media/audio` (accept WebM/PCM) — implemented with MinIO via AWS SDK v3
 - [x] S3-compatible adapter using MinIO (`S3_ENDPOINT`, `S3_ACCESS_KEY`, `S3_SECRET_KEY`)
-- [ ] Add STT trigger in UI: send blob to backend, call OpenAI STT, render transcript
+- [ ] Backend STT endpoint `/api/stt` (download from MinIO, call OpenAI STT, return transcript)
+- [ ] Add STT trigger in UI: call `/api/stt` with `{ bucket, key }` and render transcript
 
 ## MinIO Integration Checklist
 - [ ] Deploy MinIO on VPS (console `:9001`, API `:9000`).


@@ -8,6 +8,8 @@ export default function Recorder() {
   const [audioUrl, setAudioUrl] = useState<string | null>(null);
   const [audioBlob, setAudioBlob] = useState<Blob | null>(null);
   const [uploadKey, setUploadKey] = useState<string | null>(null);
+  const [uploadBucket, setUploadBucket] = useState<string | null>(null);
+  const [transcript, setTranscript] = useState<string>('');
   const [error, setError] = useState<string>('');
 
   const requestStream = async (): Promise<MediaStream | null> => {
@@ -59,6 +61,8 @@
     try {
       setError('');
       setUploadKey(null);
+      setUploadBucket(null);
+      setTranscript('');
       if (!audioBlob) {
         setError('No audio to upload');
         return;
@@ -75,11 +79,36 @@
       }
       const data = await res.json();
       setUploadKey(data.key || 'uploaded');
+      setUploadBucket(data.bucket || null);
     } catch (e: any) {
       setError(e?.message || 'Upload failed');
     }
   };
 
+  const transcribe = async () => {
+    try {
+      setError('');
+      setTranscript('');
+      if (!uploadKey) {
+        setError('Upload audio before transcribing');
+        return;
+      }
+      const res = await fetch('/api/stt', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ bucket: uploadBucket ?? undefined, key: uploadKey }),
+      });
+      if (!res.ok) {
+        const txt = await res.text();
+        throw new Error(`STT failed: ${res.status} ${txt}`);
+      }
+      const data = await res.json();
+      setTranscript(data.transcript || '');
+    } catch (e: any) {
+      setError(e?.message || 'Transcription failed');
+    }
+  };
+
   useEffect(() => {
     return () => {
       if (audioUrl) URL.revokeObjectURL(audioUrl);
@@ -93,6 +122,7 @@
         <Button variant="contained" disabled={recording} onClick={startRecording}>Start</Button>
         <Button variant="outlined" disabled={!recording} onClick={stopRecording}>Stop</Button>
         <Button variant="text" disabled={!audioBlob} onClick={uploadAudio}>Upload</Button>
+        <Button variant="text" disabled={!uploadKey} onClick={transcribe}>Transcribe</Button>
       </Stack>
       {error && <Typography color="error" sx={{ mb: 2 }}>{error}</Typography>}
       {audioUrl && (
@@ -105,6 +135,12 @@
           Uploaded as key: {uploadKey}
         </Typography>
       )}
+      {transcript && (
+        <Box sx={{ mt: 2 }}>
+          <Typography variant="subtitle1">Transcript</Typography>
+          <Typography variant="body2" sx={{ whiteSpace: 'pre-wrap' }}>{transcript}</Typography>
+        </Box>
+      )}
     </Box>
   );
 }
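
A side note on the `transcribe()` body above: passing `uploadBucket ?? undefined` makes `JSON.stringify` drop the `bucket` field from the payload when the upload response did not include one, so the server's `bodyBucket || process.env.S3_BUCKET` fallback kicks in. A quick illustration (the key is a placeholder):

```ts
// JSON.stringify omits keys whose value is undefined, but keeps null:
JSON.stringify({ bucket: undefined, key: 'audio/clip.webm' }); // '{"key":"audio/clip.webm"}'
JSON.stringify({ bucket: null, key: 'audio/clip.webm' });      // '{"bucket":null,"key":"audio/clip.webm"}'
```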


@@ -5,6 +5,7 @@ import express from 'express';
 import cors from 'cors';
 import authRouter from './auth';
 import mediaRouter from './media';
+import sttRouter from './stt';
 
 const app = express();
 console.log('ENV ADMIN_PASSWORD loaded:', Boolean(process.env.ADMIN_PASSWORD));
@@ -19,6 +20,7 @@ app.use(express.json());
 // API routes
 app.use('/api/auth', authRouter);
 app.use('/api/media', mediaRouter);
+app.use('/api/stt', sttRouter);
 app.get('/api/health', (_req, res) => {
   res.json({ ok: true });
 });

apps/api/src/stt.ts (new file, +53)

@@ -0,0 +1,53 @@
+import express from 'express';
+import { fetch, FormData } from 'undici';
+import { downloadObject } from './storage/s3';
+
+const router = express.Router();
+
+router.post('/', async (req, res) => {
+  try {
+    const { bucket: bodyBucket, key } = req.body as { bucket?: string; key?: string };
+    const bucket = bodyBucket || process.env.S3_BUCKET;
+    if (!bucket || !key) {
+      return res.status(400).json({ error: 'bucket (or env S3_BUCKET) and key are required' });
+    }
+    if (!process.env.OPENAI_API_KEY) {
+      return res.status(500).json({ error: 'OPENAI_API_KEY not configured' });
+    }
+
+    const { buffer, contentType } = await downloadObject({ bucket, key });
+
+    // Build multipart form for OpenAI Whisper
+    const fd = new FormData();
+    // Convert Node Buffer -> ArrayBuffer (TS-accurate) for BlobPart
+    const ab = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) as ArrayBuffer;
+    const blob = new Blob([ab as unknown as ArrayBuffer], { type: contentType || 'audio/webm' });
+    // OpenAI expects a filename
+    fd.append('file', blob, 'audio.webm');
+    fd.append('model', 'whisper-1');
+    // Optional: language hints, prompt, temperature etc.
+
+    const resp = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
+      },
+      body: fd,
+    });
+
+    if (!resp.ok) {
+      const text = await resp.text();
+      console.error('OpenAI STT error:', resp.status, text);
+      return res.status(500).json({ error: 'STT failed', details: text });
+    }
+
+    const data: any = await resp.json();
+    // OpenAI returns { text: "..." }
+    return res.json({ success: true, transcript: data.text || '' });
+  } catch (err: any) {
+    console.error('STT error:', err);
+    return res.status(500).json({ error: 'STT error' });
+  }
+});
+
+export default router;
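
The new route imports `downloadObject` from `./storage/s3`, which is not part of this diff; per the plan above, object storage is MinIO accessed via AWS SDK v3. A minimal sketch of what such a helper could look like under those assumptions (env var names follow the plan; the actual implementation in the repo may differ):

```ts
import { S3Client, GetObjectCommand } from '@aws-sdk/client-s3';

// MinIO is S3-compatible; path-style addressing is typically required.
const s3 = new S3Client({
  endpoint: process.env.S3_ENDPOINT,
  region: 'us-east-1', // MinIO generally ignores the region, but the SDK requires one
  forcePathStyle: true,
  credentials: {
    accessKeyId: process.env.S3_ACCESS_KEY || '',
    secretAccessKey: process.env.S3_SECRET_KEY || '',
  },
});

export async function downloadObject(params: { bucket: string; key: string }) {
  const out = await s3.send(new GetObjectCommand({ Bucket: params.bucket, Key: params.key }));
  // In recent SDK v3 releases the Body stream exposes transformToByteArray().
  const bytes = await out.Body!.transformToByteArray();
  return { buffer: Buffer.from(bytes), contentType: out.ContentType };
}
```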