feat(stt): add /api/stt endpoint using OpenAI; mount route; add Transcribe button in Recorder UI; update plan

2025-10-23 22:15:09 +02:00 · 2025-10-23 22:15:09 +02:00 · 498b49c474
commit 498b49c474
parent 4ad9c311a2
4 changed files with 93 additions and 1 deletions
--- a/PLAN.md
+++ b/PLAN.md
@ -68,7 +68,8 @@ Voice-first authoring tool for single-user Ghost blog. Capture audio, refine wit
 ## Upcoming Next Actions
 - [x] Backend endpoint for audio upload `/api/media/audio` (accept WebM/PCM) — implemented with MinIO via AWS SDK v3
 - [x] S3-compatible adapter using MinIO (`S3_ENDPOINT`, `S3_ACCESS_KEY`, `S3_SECRET_KEY`)
- [ ] Add STT trigger in UI: send blob to backend, call OpenAI STT, render transcript
+- [ ] Backend STT endpoint `/api/stt` (download from MinIO, call OpenAI STT, return transcript)
+- [ ] Add STT trigger in UI: call `/api/stt` with `{ bucket, key }` and render transcript

 ## MinIO Integration Checklist
 - [ ] Deploy MinIO on VPS (console `:9001`, API `:9000`).
--- a/apps/admin/src/features/recorder/Recorder.tsx
+++ b/apps/admin/src/features/recorder/Recorder.tsx
@ -8,6 +8,8 @@ export default function Recorder() {
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [audioBlob, setAudioBlob] = useState<Blob | null>(null);
  const [uploadKey, setUploadKey] = useState<string | null>(null);
+  const [uploadBucket, setUploadBucket] = useState<string | null>(null);
+  const [transcript, setTranscript] = useState<string>('');
  const [error, setError] = useState<string>('');

  const requestStream = async (): Promise<MediaStream | null> => {
@ -59,6 +61,8 @@ export default function Recorder() {
    try {
      setError('');
      setUploadKey(null);
+      setUploadBucket(null);
+      setTranscript('');
      if (!audioBlob) {
        setError('No audio to upload');
        return;
@ -75,11 +79,36 @@ export default function Recorder() {
      }
      const data = await res.json();
      setUploadKey(data.key || 'uploaded');
+      setUploadBucket(data.bucket || null);
    } catch (e: any) {
      setError(e?.message || 'Upload failed');
    }
  };

+  /**
+   * Requests a transcript for the previously uploaded audio from `/api/stt`
+   * and stores it in `transcript`. Any failure is surfaced through `error`.
+   */
+  const transcribe = async () => {
+    try {
+      setError('');
+      setTranscript('');
+      if (!uploadKey) {
+        setError('Upload audio before transcribing');
+        return;
+      }
+      const res = await fetch('/api/stt', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        // JSON.stringify omits an undefined bucket, so the field is only sent when known.
+        body: JSON.stringify({ bucket: uploadBucket ?? undefined, key: uploadKey }),
+      });
+      if (!res.ok) {
+        const txt = await res.text();
+        throw new Error(`STT failed: ${res.status} ${txt}`);
+      }
+      const data: unknown = await res.json();
+      const text = (data as { transcript?: unknown } | null)?.transcript;
+      // Only strings are stored: the value is rendered directly as a React child.
+      setTranscript(typeof text === 'string' ? text : '');
+    } catch (e: unknown) {
+      // Anything can be thrown, so narrow before reading `.message`.
+      setError(e instanceof Error && e.message ? e.message : 'Transcription failed');
+    }
+  };
+
  useEffect(() => {
    return () => {
      if (audioUrl) URL.revokeObjectURL(audioUrl);
@ -93,6 +122,7 @@ export default function Recorder() {
        <Button variant="contained" disabled={recording} onClick={startRecording}>Start</Button>
        <Button variant="outlined" disabled={!recording} onClick={stopRecording}>Stop</Button>
        <Button variant="text" disabled={!audioBlob} onClick={uploadAudio}>Upload</Button>
+        <Button variant="text" disabled={!uploadKey} onClick={transcribe}>Transcribe</Button>
      </Stack>
      {error && <Typography color="error" sx={{ mb: 2 }}>{error}</Typography>}
      {audioUrl && (
@ -105,6 +135,12 @@ export default function Recorder() {
          Uploaded as key: {uploadKey}
        </Typography>
      )}
+      {transcript && (
+        <Box sx={{ mt: 2 }}>
+          <Typography variant="subtitle1">Transcript</Typography>
+          <Typography variant="body2" sx={{ whiteSpace: 'pre-wrap' }}>{transcript}</Typography>
+        </Box>
+      )}
    </Box>
  );
 }
--- a/apps/api/src/index.ts
+++ b/apps/api/src/index.ts
@ -5,6 +5,7 @@ import express from 'express';
 import cors from 'cors';
 import authRouter from './auth';
 import mediaRouter from './media';
+import sttRouter from './stt';

 const app = express();
 console.log('ENV ADMIN_PASSWORD loaded:', Boolean(process.env.ADMIN_PASSWORD));
@ -19,6 +20,7 @@ app.use(express.json());
 // API routes
 app.use('/api/auth', authRouter);
 app.use('/api/media', mediaRouter);
+app.use('/api/stt', sttRouter);
 app.get('/api/health', (_req, res) => {
  res.json({ ok: true });
 });
--- a/apps/api/src/stt.ts
+++ b/apps/api/src/stt.ts
@ -0,0 +1,53 @@
+import express from 'express';
+import { fetch, FormData } from 'undici';
+import { downloadObject } from './storage/s3';
+
+const router = express.Router();
+
+/** JSON body accepted by `POST /`; fields stay `unknown` until validated at runtime. */
+type SttRequestBody = { bucket?: unknown; key?: unknown };
+
+/**
+ * File extension to advertise to OpenAI for a given audio MIME type.
+ * A Map (rather than an object literal) keeps lookups free of inherited keys.
+ */
+const AUDIO_EXTENSIONS = new Map<string, string>([
+  ['audio/webm', 'webm'],
+  ['audio/wav', 'wav'],
+  ['audio/x-wav', 'wav'],
+  ['audio/wave', 'wav'],
+  ['audio/mpeg', 'mp3'],
+  ['audio/mp3', 'mp3'],
+  ['audio/mp4', 'mp4'],
+  ['audio/m4a', 'm4a'],
+  ['audio/x-m4a', 'm4a'],
+  ['audio/ogg', 'ogg'],
+  ['audio/flac', 'flac'],
+]);
+
+/**
+ * Builds the multipart filename for an audio object so that its extension
+ * agrees with the stored content type.
+ *
+ * @param contentType MIME type of the stored object, possibly with parameters
+ *   (e.g. `audio/webm;codecs=opus`) or missing.
+ * @returns `audio.<ext>`; falls back to `audio.webm` for unknown or missing types.
+ */
+function uploadFilename(contentType: string | null | undefined): string {
+  // Drop parameters such as "; codecs=opus" before the lookup.
+  const [mime = ''] = (contentType ?? '').split(';');
+  const ext = AUDIO_EXTENSIONS.get(mime.trim().toLowerCase()) ?? 'webm';
+  return `audio.${ext}`;
+}
+
+/**
+ * POST / — transcribe a stored audio object.
+ *
+ * Body: `{ bucket?: string; key: string }`. `bucket` falls back to the
+ * `S3_BUCKET` environment variable when absent or empty.
+ *
+ * Responses:
+ * - 200 `{ success: true, transcript }`
+ * - 400 when `bucket`/`key` are missing or not strings
+ * - 500 when `OPENAI_API_KEY` is unset, the OpenAI call fails, or anything throws
+ */
+router.post('/', async (req, res) => {
+  try {
+    // req.body may be undefined when no JSON body was parsed; treat it as empty
+    // so the request is rejected with 400 instead of throwing into the 500 path.
+    const { bucket: bodyBucket, key } = (req.body ?? {}) as SttRequestBody;
+    const bucket = bodyBucket || process.env.S3_BUCKET;
+    if (typeof bucket !== 'string' || bucket === '' || typeof key !== 'string' || key === '') {
+      return res.status(400).json({ error: 'bucket (or env S3_BUCKET) and key are required' });
+    }
+    if (!process.env.OPENAI_API_KEY) {
+      return res.status(500).json({ error: 'OPENAI_API_KEY not configured' });
+    }
+
+    const { buffer, contentType } = await downloadObject({ bucket, key });
+
+    // Build multipart form for OpenAI Whisper
+    const fd = new FormData();
+    // Copy exactly the bytes of this view (byteOffset/byteLength) into a
+    // standalone ArrayBuffer usable as a BlobPart.
+    const ab = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) as ArrayBuffer;
+    const blob = new Blob([ab], { type: contentType || 'audio/webm' });
+    // OpenAI expects a filename; keep its extension consistent with the blob type.
+    fd.append('file', blob, uploadFilename(contentType));
+    fd.append('model', 'whisper-1');
+    // Optional: language hints, prompt, temperature etc.
+
+    const resp = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
+      },
+      body: fd,
+    });
+
+    if (!resp.ok) {
+      const text = await resp.text();
+      console.error('OpenAI STT error:', resp.status, text);
+      return res.status(500).json({ error: 'STT failed', details: text });
+    }
+
+    // OpenAI returns { text: "..." }; anything else yields an empty transcript.
+    const payload = (await resp.json()) as { text?: unknown } | null;
+    const text = payload?.text;
+    return res.json({ success: true, transcript: typeof text === 'string' ? text : '' });
+  } catch (err: unknown) {
+    console.error('STT error:', err);
+    return res.status(500).json({ error: 'STT error' });
+  }
+});
+
+export default router;