feat(stt): add /api/stt endpoint using OpenAI; mount route; add Transcribe button in Recorder UI; update plan

This commit is contained in:
Ender 2025-10-23 22:15:09 +02:00
parent 4ad9c311a2
commit 498b49c474
4 changed files with 93 additions and 1 deletions

View File

@ -68,7 +68,8 @@ Voice-first authoring tool for single-user Ghost blog. Capture audio, refine wit
## Upcoming Next Actions
- [x] Backend endpoint for audio upload `/api/media/audio` (accept WebM/PCM) — implemented with MinIO via AWS SDK v3
- [x] S3-compatible adapter using MinIO (`S3_ENDPOINT`, `S3_ACCESS_KEY`, `S3_SECRET_KEY`)
- [ ] Add STT trigger in UI: send blob to backend, call OpenAI STT, render transcript
- [ ] Backend STT endpoint `/api/stt` (download from MinIO, call OpenAI STT, return transcript)
- [ ] Add STT trigger in UI: call `/api/stt` with `{ bucket, key }` and render transcript
## MinIO Integration Checklist
- [ ] Deploy MinIO on VPS (console `:9001`, API `:9000`).

View File

@ -8,6 +8,8 @@ export default function Recorder() {
const [audioUrl, setAudioUrl] = useState<string | null>(null);
const [audioBlob, setAudioBlob] = useState<Blob | null>(null);
const [uploadKey, setUploadKey] = useState<string | null>(null);
const [uploadBucket, setUploadBucket] = useState<string | null>(null);
const [transcript, setTranscript] = useState<string>('');
const [error, setError] = useState<string>('');
const requestStream = async (): Promise<MediaStream | null> => {
@ -59,6 +61,8 @@ export default function Recorder() {
try {
setError('');
setUploadKey(null);
setUploadBucket(null);
setTranscript('');
if (!audioBlob) {
setError('No audio to upload');
return;
@ -75,11 +79,36 @@ export default function Recorder() {
}
const data = await res.json();
setUploadKey(data.key || 'uploaded');
setUploadBucket(data.bucket || null);
} catch (e: any) {
setError(e?.message || 'Upload failed');
}
};
/**
 * Requests a transcript for the most recently uploaded recording.
 *
 * Posts `{ bucket, key }` for the uploaded object to `/api/stt` and stores the
 * returned transcript in component state. Any failure (missing upload, non-2xx
 * response, network or parse error) is reported through `setError`.
 */
const transcribe = async () => {
  setError('');
  setTranscript('');
  if (!uploadKey) {
    setError('Upload audio before transcribing');
    return;
  }
  try {
    const res = await fetch('/api/stt', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // A null bucket becomes undefined so JSON.stringify omits the field entirely.
      body: JSON.stringify({ bucket: uploadBucket ?? undefined, key: uploadKey }),
    });
    if (!res.ok) {
      const txt = await res.text();
      throw new Error(`STT failed: ${res.status} ${txt}`);
    }
    // The response body is untrusted JSON: only accept a string transcript so
    // the `string` state never ends up holding an object or number.
    const data: unknown = await res.json();
    const text =
      typeof data === 'object' && data !== null && 'transcript' in data
        ? data.transcript
        : undefined;
    setTranscript(typeof text === 'string' ? text : '');
  } catch (e: unknown) {
    // Thrown values are not guaranteed to be Error instances; narrow before reading.
    setError(e instanceof Error && e.message ? e.message : 'Transcription failed');
  }
};
useEffect(() => {
return () => {
if (audioUrl) URL.revokeObjectURL(audioUrl);
@ -93,6 +122,7 @@ export default function Recorder() {
<Button variant="contained" disabled={recording} onClick={startRecording}>Start</Button>
<Button variant="outlined" disabled={!recording} onClick={stopRecording}>Stop</Button>
<Button variant="text" disabled={!audioBlob} onClick={uploadAudio}>Upload</Button>
<Button variant="text" disabled={!uploadKey} onClick={transcribe}>Transcribe</Button>
</Stack>
{error && <Typography color="error" sx={{ mb: 2 }}>{error}</Typography>}
{audioUrl && (
@ -105,6 +135,12 @@ export default function Recorder() {
Uploaded as key: {uploadKey}
</Typography>
)}
{transcript && (
<Box sx={{ mt: 2 }}>
<Typography variant="subtitle1">Transcript</Typography>
<Typography variant="body2" sx={{ whiteSpace: 'pre-wrap' }}>{transcript}</Typography>
</Box>
)}
</Box>
);
}

View File

@ -5,6 +5,7 @@ import express from 'express';
import cors from 'cors';
import authRouter from './auth';
import mediaRouter from './media';
import sttRouter from './stt';
const app = express();
console.log('ENV ADMIN_PASSWORD loaded:', Boolean(process.env.ADMIN_PASSWORD));
@ -19,6 +20,7 @@ app.use(express.json());
// API routes
app.use('/api/auth', authRouter);
app.use('/api/media', mediaRouter);
app.use('/api/stt', sttRouter);
// Health probe: replies with a constant JSON body and reads nothing from the request.
app.get('/api/health', (_request, response) => {
  const payload = { ok: true };
  response.json(payload);
});

53
apps/api/src/stt.ts Normal file
View File

@ -0,0 +1,53 @@
import express from 'express';
import { fetch, FormData } from 'undici';
import { downloadObject } from './storage/s3';
const router = express.Router();
/** MIME type used for the upload when the stored object reports none. */
const DEFAULT_AUDIO_MIME = 'audio/webm';

/** Extension used for the upload filename when nothing better can be derived. */
const DEFAULT_AUDIO_EXTENSION = 'webm';

/** File extensions that are passed through from the object key to the upload filename. */
const UPLOAD_EXTENSIONS = new Set(['flac', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm']);

/** Maps common audio MIME types to the extension used for the upload filename. */
const EXTENSION_BY_MIME = new Map<string, string>([
  ['audio/webm', 'webm'],
  ['video/webm', 'webm'],
  ['audio/ogg', 'ogg'],
  ['audio/wav', 'wav'],
  ['audio/wave', 'wav'],
  ['audio/x-wav', 'wav'],
  ['audio/mpeg', 'mp3'],
  ['audio/mp3', 'mp3'],
  ['audio/mp4', 'm4a'],
  ['audio/x-m4a', 'm4a'],
  ['audio/flac', 'flac'],
  ['audio/x-flac', 'flac'],
]);

/**
 * Picks the filename sent with the multipart upload.
 *
 * The stored content type wins; otherwise a known extension on the object key
 * is used; otherwise the name falls back to `audio.webm`.
 * NOTE(review): a filename whose extension does not match the real container
 * can cause the transcription API to reject the file — confirm against the
 * OpenAI API reference.
 */
function transcriptionFilename(key: string, contentType: string | null | undefined): string {
  // Drop parameters such as ";codecs=opus" before looking the type up.
  const mime = (contentType ?? '').split(';')[0]?.trim().toLowerCase() ?? '';
  const byMime = EXTENSION_BY_MIME.get(mime);
  if (byMime) return `audio.${byMime}`;

  const keyExt = /\.([a-z0-9]+)$/i.exec(key)?.[1]?.toLowerCase();
  if (keyExt && UPLOAD_EXTENSIONS.has(keyExt)) return `audio.${keyExt}`;

  return `audio.${DEFAULT_AUDIO_EXTENSION}`;
}

/** Returns the `text` field of the upstream JSON payload, or '' when it is absent or not a string. */
function extractTranscript(payload: unknown): string {
  if (
    typeof payload === 'object' &&
    payload !== null &&
    'text' in payload &&
    typeof payload.text === 'string'
  ) {
    return payload.text;
  }
  return '';
}

/**
 * POST / — transcribes a stored audio object.
 *
 * Request body: `{ bucket?: string; key: string }`. When `bucket` is missing,
 * `process.env.S3_BUCKET` is used instead.
 *
 * Responses:
 * - 200 `{ success: true, transcript }` on success.
 * - 400 when bucket/key are missing or not strings, or the stored object is empty.
 * - 500 when `OPENAI_API_KEY` is unset, the upstream call fails, or anything throws.
 */
router.post('/', async (req, res) => {
  try {
    // req.body may be undefined (no JSON body parsed) and its fields are
    // caller-controlled, so validate instead of destructuring blindly.
    const body: { bucket?: unknown; key?: unknown } = req.body ?? {};
    const bucket =
      typeof body.bucket === 'string' && body.bucket ? body.bucket : process.env.S3_BUCKET;
    const key = typeof body.key === 'string' ? body.key : '';
    if (!bucket || !key) {
      return res.status(400).json({ error: 'bucket (or env S3_BUCKET) and key are required' });
    }

    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      return res.status(500).json({ error: 'OPENAI_API_KEY not configured' });
    }

    const { buffer, contentType } = await downloadObject({ bucket, key });
    if (buffer.byteLength === 0) {
      // Nothing to transcribe; avoid a pointless upstream request.
      return res.status(400).json({ error: 'Stored audio object is empty' });
    }

    // Build multipart form for OpenAI Whisper.
    // Copying into a fresh Uint8Array yields a plain ArrayBuffer-backed view that
    // Blob accepts without type assertions (assumes `buffer` is a Node Buffer — TODO confirm).
    const blob = new Blob([new Uint8Array(buffer)], { type: contentType || DEFAULT_AUDIO_MIME });
    const fd = new FormData();
    fd.append('file', blob, transcriptionFilename(key, contentType));
    fd.append('model', 'whisper-1');
    // Optional: language hints, prompt, temperature etc.

    const resp = await fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
      body: fd,
    });

    if (!resp.ok) {
      const text = await resp.text();
      console.error('OpenAI STT error:', resp.status, text);
      return res.status(500).json({ error: 'STT failed', details: text });
    }

    // OpenAI returns { text: "..." }
    const data: unknown = await resp.json();
    return res.json({ success: true, transcript: extractTranscript(data) });
  } catch (err: unknown) {
    console.error('STT error:', err);
    return res.status(500).json({ error: 'STT error' });
  }
});
export default router;