From 03768b5b270361e8d885e79ca21d3fcb6ccb4b11 Mon Sep 17 00:00:00 2001 From: Kumi Date: Wed, 29 Nov 2023 12:30:26 +0100 Subject: [PATCH] Improve speech-to-text audio handling Enhanced the audio processing in speech-to-text conversion by converting the input audio to MP3 format before transcription. The logging now reflects the word count of the recognized text, providing clearer insight into the output. This should improve compatibility with the transcription service and result in more accurate transcriptions. --- src/gptbot/classes/openai.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gptbot/classes/openai.py b/src/gptbot/classes/openai.py index 0264801..fa55858 100644 --- a/src/gptbot/classes/openai.py +++ b/src/gptbot/classes/openai.py @@ -408,14 +408,18 @@ Only the event_types mentioned above are allowed, you must not respond in any ot """ self.logger.log(f"Generating text from speech...") + audio_file = BytesIO() + AudioSegment.from_file(BytesIO(audio)).export(audio_file, format="mp3") + audio_file.name = "audio.mp3" + response = await self.openai_api.audio.transcriptions.create( model=self.stt_model, - file=BytesIO(audio), + file=audio_file, ) text = response.text - self.logger.log(f"Generated text with {tokens_used} tokens.") + self.logger.log(f"Recognized text: {len(text.split())} words.") return text