From 155ea68e7a0fcaefefc3a0a045323ef579256817 Mon Sep 17 00:00:00 2001
From: Kumi <git@kumi.email>
Date: Sun, 26 Nov 2023 07:58:10 +0100
Subject: [PATCH] feat: Add voice input and output support

This change adds support for voice input and output to the GPTbot. Users can enable this feature using the new `!gptbot roomsettings` command. Voice input and output are currently supported via OpenAI's TTS and Whisper models. However, note that voice input may be unreliable at the moment. This enhancement expands the capabilities of the bot, allowing users to interact with it using their voice. This addresses the need for a more user-friendly and natural way of communication.
---
 README.md                           |  26 ++++--
 pyproject.toml                      |   3 +-
 requirements.txt                    |   1 +
 src/gptbot/classes/bot.py           | 125 ++++++++++++++++++++++++++--
 src/gptbot/classes/openai.py        |  58 ++++++++++++-
 src/gptbot/commands/__init__.py     |   1 +
 src/gptbot/commands/help.py         |   1 +
 src/gptbot/commands/roomsettings.py |   9 +-
 src/gptbot/commands/tts.py          |  23 +++++
 9 files changed, 227 insertions(+), 20 deletions(-)
 create mode 100644 src/gptbot/commands/tts.py

diff --git a/README.md b/README.md
index 6f3122a..3ad94b6 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # GPTbot
 
-GPTbot is a simple bot that uses different APIs to generate responses to 
+GPTbot is a simple bot that uses different APIs to generate responses to
 messages in a Matrix room.
 
 It is called GPTbot because it was originally intended to only use GPT-3 to
-generate responses. However, it supports other services/APIs, and I will 
+generate responses. However, it supports other services/APIs, and I will
 probably add more in the future, so the name is a bit misleading.
 
 ## Features
@@ -12,9 +12,12 @@ probably add more in the future, so the name is a bit misleading.
 - AI-generated responses to messages in a Matrix room (chatbot)
   - Currently supports OpenAI (tested with `gpt-3.5-turbo` and `gpt-4`)
 - AI-generated pictures via the `!gptbot imagine` command
-  - Currently supports OpenAI (DALL-E)
+  - Currently supports OpenAI (DALL-E-2/DALL-E-3)
 - Mathematical calculations via the `!gptbot calculate` command
   - Currently supports WolframAlpha
+- Voice input and output
+  - Currently supports OpenAI (TTS and Whisper)
+  - Beta feature, see dedicated section for details
 - Automatic classification of messages (for `imagine`, `calculate`, etc.)
   - Beta feature, see Usage section for details
 - Really useful commands like `!gptbot help` and `!gptbot coin`
@@ -26,9 +29,9 @@ probably add more in the future, so the name is a bit misleading.
 
 ## Installation
 
-To run the bot, you will need Python 3.10 or newer. 
+To run the bot, you will need Python 3.10 or newer.
 
-The bot has been tested with Python 3.11 on Arch, but should work with any 
+The bot has been tested with Python 3.11 on Arch, but should work with any
 current version, and should not require any special dependencies or operating
 system features.
 
@@ -53,7 +56,7 @@ A release to PyPI is planned, but not yet available.
 
 ### Development
 
-Clone the repository and install the requirements to a virtual environment. 
+Clone the repository and install the requirements to a virtual environment.
 
 ```shell
 # Clone the repository
@@ -145,6 +148,14 @@ Also note that this feature conflicts with the `always_reply false` setting -
 or rather, it doesn't make sense then because you already have to explicitly
 specify the command to use.
 
+## Voice input and output
+
+The bot supports voice input and output, but it is disabled by default. To
+enable it, use the `!gptbot roomsettings` command to change the settings for
+the current room. `!gptbot roomsettings stt true` will enable voice input,
+and `!gptbot roomsettings tts true` will enable voice output. Note that this
+may be a little unreliable at the moment, especially voice input.
+
 ## Troubleshooting
 
 **Help, the bot is not responding!**
@@ -181,4 +192,5 @@ please check the logs and open an issue if you can't figure out what's going on.
 
 ## License
 
-This project is licensed under the terms of the MIT license. See the [LICENSE](LICENSE) file for details.
+This project is licensed under the terms of the MIT license. See the [LICENSE](LICENSE)
+file for details.
diff --git a/pyproject.toml b/pyproject.toml
index 405526d..bc2d818 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ allow-direct-references = true
 
 [project]
 name = "matrix-gptbot"
-version = "0.2.0"
+version = "0.2.1"
 
 authors = [
   { name="Kumi Mitterer", email="gptbot@kumi.email" },
@@ -39,6 +39,7 @@ dependencies = [
 [project.optional-dependencies]
 openai = [
     "openai>=1.2",
+    "pydub",
 ]
 
 wolframalpha = [
diff --git a/requirements.txt b/requirements.txt
index 28a5039..c19cce1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,6 @@ duckdb
 python-magic
 pillow
 wolframalpha
+pydub
 
 git+https://kumig.it/kumitterer/trackingmore-api-tool.git
\ No newline at end of file
diff --git a/src/gptbot/classes/bot.py b/src/gptbot/classes/bot.py
index 2c318df..7682c60 100644
--- a/src/gptbot/classes/bot.py
+++ b/src/gptbot/classes/bot.py
@@ -83,6 +83,8 @@ class GPTBot:
     chat_api: Optional[OpenAI] = None
     image_api: Optional[OpenAI] = None
     classification_api: Optional[OpenAI] = None
+    tts_api: Optional[OpenAI] = None
+    stt_api: Optional[OpenAI] = None
     parcel_api: Optional[TrackingMore] = None
     operator: Optional[str] = None
     room_ignore_list: List[str] = []  # List of rooms to ignore invites from
@@ -149,9 +151,14 @@ class GPTBot:
             if "AllowedUsers" in config["GPTBot"]:
                 bot.allowed_users = json.loads(config["GPTBot"]["AllowedUsers"])
 
-        bot.chat_api = bot.image_api = bot.classification_api = OpenAI(
-            bot, config["OpenAI"]["APIKey"], config["OpenAI"].get("Model"),
-            config["OpenAI"].get("ImageModel"), config["OpenAI"].get("BaseURL"),  bot.logger
+        bot.chat_api = bot.image_api = bot.classification_api = bot.tts_api = bot.stt_api = OpenAI(
+            bot=bot,
+            api_key=config["OpenAI"]["APIKey"], 
+            chat_model=config["OpenAI"].get("Model"),
+            image_model=config["OpenAI"].get("ImageModel"),
+            tts_model=config["OpenAI"].get("TTSModel"),
+            stt_model=config["OpenAI"].get("STTModel"),
+            base_url=config["OpenAI"].get("BaseURL")
         )
         bot.max_tokens = config["OpenAI"].getint("MaxTokens", bot.max_tokens)
         bot.max_messages = config["OpenAI"].getint("MaxMessages", bot.max_messages)
@@ -207,7 +214,7 @@ class GPTBot:
 
         return user_id
 
-    async def _last_n_messages(self, room: str | MatrixRoom, n: Optional[int], ignore_bot_commands: bool = True):
+    async def _last_n_messages(self, room: str | MatrixRoom, n: Optional[int], ignore_bot_commands: bool = False):
         messages = []
         n = n or self.max_messages
         room_id = room.room_id if isinstance(room, MatrixRoom) else room
@@ -264,8 +271,7 @@ class GPTBot:
                     messages.append(event)
 
             if isinstance(event, RoomMessageMedia):
-                if event.sender != self.matrix_client.user_id:
-                    messages.append(event)
+                messages.append(event)
 
         self.logger.log(f"Found {len(messages)} messages (limit: {n})", "debug")
 
@@ -574,6 +580,39 @@ class GPTBot:
 
         self.logger.log("Sent image", "debug")
 
+    async def send_file(
+        self, room: MatrixRoom, file: bytes, filename: str, mime: str, msgtype: str
+    ):
+        """Send a file to a room.
+
+        Args:
+            room (MatrixRoom): The room to send the file to.
+            file (bytes): The file to send.
+            filename (str): The name of the file.
+            mime (str): The MIME type of the file.
+            msgtype (str): The message type placed in the event content (e.g. "m.audio").
+        """
+
+        self.logger.log(
+            f"Sending file of size {len(file)} bytes to room {room.room_id}", "debug"
+        )
+
+        content_uri = await self.upload_file(file, filename, mime)
+
+        self.logger.log("Uploaded file - sending message...", "debug")
+
+        content = {
+            "body": filename,
+            "info": {"mimetype": mime, "size": len(file)},
+            "msgtype": msgtype,
+            "url": content_uri,
+        }
+
+        # The send result is not inspected here, so it is awaited without binding it.
+        await self.matrix_client.room_send(room.room_id, "m.room.message", content)
+
+        self.logger.log("Sent file", "debug")
+
     async def send_message(
         self, room: MatrixRoom | str, message: str, notice: bool = False
     ):
@@ -861,6 +900,46 @@ class GPTBot:
                 space,
             )
 
+    def room_uses_stt(self, room: MatrixRoom | str) -> bool:
+        """Report whether speech-to-text is switched on for a room.
+
+        Args:
+            room (MatrixRoom | str): The room object or room ID to look up.
+
+        Returns:
+            bool: True for a non-zero stored "stt" value; False if zero or unset.
+        """
+        target = room.room_id if isinstance(room, MatrixRoom) else room
+        query = "SELECT value FROM room_settings WHERE room_id = ? AND setting = ?"
+
+        with closing(self.database.cursor()) as cur:
+            cur.execute(query, (target, "stt"))
+            row = cur.fetchone()
+
+        if not row:
+            return False
+        return bool(int(row[0]))
+
+    def room_uses_tts(self, room: MatrixRoom | str) -> bool:
+        """Report whether text-to-speech is switched on for a room.
+
+        Args:
+            room (MatrixRoom | str): The room object or room ID to look up.
+
+        Returns:
+            bool: True for a non-zero stored "tts" value; False if zero or unset.
+        """
+        target = room.room_id if isinstance(room, MatrixRoom) else room
+        query = "SELECT value FROM room_settings WHERE room_id = ? AND setting = ?"
+
+        with closing(self.database.cursor()) as cur:
+            cur.execute(query, (target, "tts"))
+            row = cur.fetchone()
+
+        if not row:
+            return False
+        return bool(int(row[0]))
+
     def respond_to_room_messages(self, room: MatrixRoom | str) -> bool:
         """Check whether the bot should respond to all messages sent in a room.
 
@@ -955,7 +1034,25 @@ class GPTBot:
                     message_body = message.body if not self.chat_api.supports_chat_images() else [{"type": "text", "text": message.body}]
                     chat_messages.append({"role": role, "content": message_body})
 
-            if self.chat_api.supports_chat_images() and isinstance(message, RoomMessageMedia):
+            if isinstance(message, RoomMessageAudio):
+                role = (
+                    "assistant" if message.sender == self.matrix_client.user_id else "user"
+                )
+                if message == event or (not message.event_id == event.event_id):
+                    if self.room_uses_stt(room):
+                        try:
+                            download = await self.download_file(message.url)
+                            message_text = await self.stt_api.speech_to_text(download.body)
+                        except Exception as e:
+                            self.logger.log(f"Error generating text from audio: {e}", "error")
+                            message_text = message.body
+                    else:
+                        message_text = message.body
+
+                    message_body = message_text if not self.chat_api.supports_chat_images() else [{"type": "text", "text": message_text}]
+                    chat_messages.append({"role": role, "content": message_body})
+
+            if self.chat_api.supports_chat_images() and isinstance(message, RoomMessageImage):
                 image_url = message.url
                 download = await self.download_file(image_url)
 
@@ -1001,6 +1098,20 @@ class GPTBot:
 
             self.logger.log(f"Sending response to room {room.room_id}...")
 
+            if self.room_uses_tts(room):
+                self.logger.log("TTS enabled for room", "debug")
+
+                try:
+                    audio = await self.tts_api.text_to_speech(response)
+                    await self.send_file(room, audio, response, "audio/mpeg", "m.audio")
+                    return
+
+                except Exception as e:
+                    self.logger.log(f"Error generating audio: {e}", "error")
+                    await self.send_message(
+                        room, "Something went wrong generating audio file.", True
+                    )
+
             message = await self.send_message(room, response)
 
         else:
diff --git a/src/gptbot/classes/openai.py b/src/gptbot/classes/openai.py
index 68e2c7d..f788344 100644
--- a/src/gptbot/classes/openai.py
+++ b/src/gptbot/classes/openai.py
@@ -3,13 +3,16 @@ import requests
 
 import asyncio
 import json
+
 from functools import partial
 from contextlib import closing
+from typing import Dict, List, Tuple, Generator, AsyncGenerator, Optional, Any
+from io import BytesIO
+
+from pydub import AudioSegment
 
 from .logging import Logger
 
-from typing import Dict, List, Tuple, Generator, AsyncGenerator, Optional, Any
-
 ASSISTANT_CODE_INTERPRETER = [
     {
         "type": "code_interpreter",
@@ -30,17 +33,23 @@ class OpenAI:
 
     classification_api = chat_api
     image_model: str = "dall-e-2"
+    tts_model: str = "tts-1-hd"
+    tts_voice: str = "alloy"
+    stt_model: str = "whisper-1"
 
     operator: str = "OpenAI ([https://openai.com](https://openai.com))"
 
-    def __init__(self, bot, api_key, chat_model=None, image_model=None, base_url=None, logger=None):
+    def __init__(self, bot, api_key, chat_model=None, image_model=None, tts_model=None, tts_voice=None, stt_model=None, base_url=None, logger=None):
         self.bot = bot
         self.api_key = api_key
         self.chat_model = chat_model or self.chat_model
         self.image_model = image_model or self.image_model
-        self.logger = logger or Logger()
+        self.logger = logger or bot.logger or Logger()
         self.base_url = base_url or openai.base_url
         self.openai_api = openai.AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
+        self.tts_model = tts_model or self.tts_model
+        self.tts_voice = tts_voice or self.tts_voice
+        self.stt_model = stt_model or self.stt_model
 
     def supports_chat_images(self):
         return "vision" in self.chat_model
@@ -266,6 +275,47 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
 
         return result, tokens_used
 
+    async def text_to_speech(self, text: str, user: Optional[str] = None) -> bytes:
+        """Generate speech from text with the configured TTS model and voice.
+
+        Args:
+            text (str): The text to use. (`user` is accepted but not used here.)
+
+        Returns:
+            bytes: The audio data, returned in one piece rather than yielded.
+        """
+        self.logger.log(f"Generating speech from text '{text}'...")
+
+        speech = await self.openai_api.audio.speech.create(
+            model=self.tts_model,
+            input=text,
+            voice=self.tts_voice
+        )
+
+        return speech.content
+
+    async def speech_to_text(self, audio: bytes, user: Optional[str] = None) -> str:
+        """Generate text from speech.
+
+        Args:
+            audio (bytes): The audio data.
+            user (Optional[str]): Accepted for signature parity; not used here.
+
+        Returns:
+            str: The transcribed text.
+        """
+        self.logger.log("Generating text from speech...")
+
+        response = await self.openai_api.audio.transcriptions.create(
+            model=self.stt_model,
+            file=BytesIO(audio),
+        )
+        text = response.text
+
+        # No token count is computed in this method, so log the text length instead.
+        self.logger.log(f"Generated text with {len(text)} characters.")
+        return text
+
     async def generate_image(self, prompt: str, user: Optional[str] = None) -> Generator[bytes, None, None]:
         """Generate an image from a prompt.
 
diff --git a/src/gptbot/commands/__init__.py b/src/gptbot/commands/__init__.py
index 17aa5cb..ca65ec5 100644
--- a/src/gptbot/commands/__init__.py
+++ b/src/gptbot/commands/__init__.py
@@ -22,6 +22,7 @@ for command in [
     "dice",
     "parcel",
     "space",
+    "tts",
 ]:
     function = getattr(import_module(
         "." + command, "gptbot.commands"), "command_" + command)
diff --git a/src/gptbot/commands/help.py b/src/gptbot/commands/help.py
index db1a317..784d6e0 100644
--- a/src/gptbot/commands/help.py
+++ b/src/gptbot/commands/help.py
@@ -19,6 +19,7 @@ async def command_help(room: MatrixRoom, event: RoomMessageText, bot):
 - !gptbot chat \<message\> - Send a message to the chat API
 - !gptbot classify \<message\> - Classify a message using the classification API
 - !gptbot custom \<message\> - Used for custom commands handled by the chat model and defined through the room's system message
+- !gptbot roomsettings [use_classification|use_timing|always_reply|system_message|tts] [true|false|\<message\>] - Get or set room settings
 - !gptbot ignoreolder - Ignore messages before this point as context
 """
 
diff --git a/src/gptbot/commands/roomsettings.py b/src/gptbot/commands/roomsettings.py
index 6d96441..7ea04e3 100644
--- a/src/gptbot/commands/roomsettings.py
+++ b/src/gptbot/commands/roomsettings.py
@@ -25,6 +25,8 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot):
                     (room.room_id, "system_message", value, value)
                 )
 
+            bot.database.commit()
+
             await bot.send_message(room, f"Alright, I've stored the system message: '{value}'.", True)
             return
 
@@ -35,7 +37,7 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot):
         await bot.send_message(room, f"The current system message is: '{system_message}'.", True)
         return
 
-    if setting in ("use_classification", "always_reply", "use_timing"):
+    if setting in ("use_classification", "always_reply", "use_timing", "tts", "stt"):
         if value:
             if value.lower() in ["true", "false"]:
                 value = value.lower() == "true"
@@ -49,6 +51,8 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot):
                         (room.room_id, setting, "1" if value else "0", "1" if value else "0")
                     )
 
+                bot.database.commit()
+
                 await bot.send_message(room, f"Alright, I've set {setting} to: '{value}'.", True)
                 return
 
@@ -81,6 +85,9 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot):
 - system_message [message]: Get or set the system message to be sent to the chat model
 - classification [true/false]: Get or set whether the room uses classification
 - always_reply [true/false]: Get or set whether the bot should reply to all messages (if false, only reply to mentions and commands)
+- tts [true/false]: Get or set whether the bot should generate audio files instead of sending text
+- stt [true/false]: Get or set whether the bot should attempt to process information from audio files
+- timing [true/false]: Get or set whether the bot should return information about the time it took to generate a response
 """
 
     await bot.send_message(room, message, True)
diff --git a/src/gptbot/commands/tts.py b/src/gptbot/commands/tts.py
new file mode 100644
index 0000000..d048a54
--- /dev/null
+++ b/src/gptbot/commands/tts.py
@@ -0,0 +1,23 @@
+from nio.events.room_events import RoomMessageText
+from nio.rooms import MatrixRoom
+
+
+async def command_tts(room: MatrixRoom, event: RoomMessageText, bot):
+    """Turn the command's prompt into speech and send it to the room as audio."""
+    # Skips the first two words (presumably the bot prefix and command name).
+    text = " ".join(event.body.split()[2:])
+
+    if not text:
+        await bot.send_message(room, "You need to provide a prompt.", True)
+        return
+
+    bot.logger.log("Generating speech...")
+    try:
+        audio = await bot.tts_api.text_to_speech(text, user=room.room_id)
+    except Exception as e:
+        bot.logger.log(f"Error generating speech: {e}", "error")
+        await bot.send_message(room, "Sorry, I couldn't generate an audio file. Please try again later.", True)
+        return
+
+    bot.logger.log("Sending audio file...")
+    await bot.send_file(room, audio, "audio.mp3", "audio/mpeg", "m.audio")