diff --git a/README.md b/README.md index 6f3122a..3ad94b6 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # GPTbot -GPTbot is a simple bot that uses different APIs to generate responses to +GPTbot is a simple bot that uses different APIs to generate responses to messages in a Matrix room. It is called GPTbot because it was originally intended to only use GPT-3 to -generate responses. However, it supports other services/APIs, and I will +generate responses. However, it supports other services/APIs, and I will probably add more in the future, so the name is a bit misleading. ## Features @@ -12,9 +12,12 @@ probably add more in the future, so the name is a bit misleading. - AI-generated responses to messages in a Matrix room (chatbot) - Currently supports OpenAI (tested with `gpt-3.5-turbo` and `gpt-4`) - AI-generated pictures via the `!gptbot imagine` command - - Currently supports OpenAI (DALL-E) + - Currently supports OpenAI (DALL-E-2/DALL-E-3) - Mathematical calculations via the `!gptbot calculate` command - Currently supports WolframAlpha +- Voice input and output + - Currently supports OpenAI (TTS and Whisper) + - Beta feature, see dedicated section for details - Automatic classification of messages (for `imagine`, `calculate`, etc.) - Beta feature, see Usage section for details - Really useful commands like `!gptbot help` and `!gptbot coin` @@ -26,9 +29,9 @@ probably add more in the future, so the name is a bit misleading. ## Installation -To run the bot, you will need Python 3.10 or newer. +To run the bot, you will need Python 3.10 or newer. -The bot has been tested with Python 3.11 on Arch, but should work with any +The bot has been tested with Python 3.11 on Arch, but should work with any current version, and should not require any special dependencies or operating system features. @@ -53,7 +56,7 @@ A release to PyPI is planned, but not yet available. ### Development -Clone the repository and install the requirements to a virtual environment. 
+Clone the repository and install the requirements to a virtual environment. ```shell # Clone the repository @@ -145,6 +148,14 @@ Also note that this feature conflicts with the `always_reply false` setting - or rather, it doesn't make sense then because you already have to explicitly specify the command to use. +## Voice input and output + +The bot supports voice input and output, but it is disabled by default. To +enable it, use the `!gptbot roomsettings` command to change the settings for +the current room. `!gptbot roomsettings stt true` will enable voice input, +and `!gptbot roomsettings tts true` will enable voice output. Note that this +may be a little unreliable at the moment, especially voice input. + ## Troubleshooting **Help, the bot is not responding!** @@ -181,4 +192,5 @@ please check the logs and open an issue if you can't figure out what's going on. ## License -This project is licensed under the terms of the MIT license. See the [LICENSE](LICENSE) file for details. +This project is licensed under the terms of the MIT license. See the [LICENSE](LICENSE) +file for details. 
diff --git a/pyproject.toml b/pyproject.toml index 405526d..bc2d818 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ allow-direct-references = true [project] name = "matrix-gptbot" -version = "0.2.0" +version = "0.2.1" authors = [ { name="Kumi Mitterer", email="gptbot@kumi.email" }, @@ -39,6 +39,7 @@ dependencies = [ [project.optional-dependencies] openai = [ "openai>=1.2", + "pydub", ] wolframalpha = [ diff --git a/requirements.txt b/requirements.txt index 28a5039..c19cce1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,6 @@ duckdb python-magic pillow wolframalpha +pydub git+https://kumig.it/kumitterer/trackingmore-api-tool.git \ No newline at end of file diff --git a/src/gptbot/classes/bot.py b/src/gptbot/classes/bot.py index 2c318df..7682c60 100644 --- a/src/gptbot/classes/bot.py +++ b/src/gptbot/classes/bot.py @@ -83,6 +83,8 @@ class GPTBot: chat_api: Optional[OpenAI] = None image_api: Optional[OpenAI] = None classification_api: Optional[OpenAI] = None + tts_api: Optional[OpenAI] = None + stt_api: Optional[OpenAI] = None parcel_api: Optional[TrackingMore] = None operator: Optional[str] = None room_ignore_list: List[str] = [] # List of rooms to ignore invites from @@ -149,9 +151,14 @@ class GPTBot: if "AllowedUsers" in config["GPTBot"]: bot.allowed_users = json.loads(config["GPTBot"]["AllowedUsers"]) - bot.chat_api = bot.image_api = bot.classification_api = OpenAI( - bot, config["OpenAI"]["APIKey"], config["OpenAI"].get("Model"), - config["OpenAI"].get("ImageModel"), config["OpenAI"].get("BaseURL"), bot.logger + bot.chat_api = bot.image_api = bot.classification_api = bot.tts_api = bot.stt_api = OpenAI( + bot=bot, + api_key=config["OpenAI"]["APIKey"], + chat_model=config["OpenAI"].get("Model"), + image_model=config["OpenAI"].get("ImageModel"), + tts_model=config["OpenAI"].get("TTSModel"), + stt_model=config["OpenAI"].get("STTModel"), + base_url=config["OpenAI"].get("BaseURL") ) bot.max_tokens = config["OpenAI"].getint("MaxTokens", 
bot.max_tokens) bot.max_messages = config["OpenAI"].getint("MaxMessages", bot.max_messages) @@ -207,7 +214,7 @@ class GPTBot: return user_id - async def _last_n_messages(self, room: str | MatrixRoom, n: Optional[int], ignore_bot_commands: bool = True): + async def _last_n_messages(self, room: str | MatrixRoom, n: Optional[int], ignore_bot_commands: bool = False): messages = [] n = n or self.max_messages room_id = room.room_id if isinstance(room, MatrixRoom) else room @@ -264,8 +271,7 @@ class GPTBot: messages.append(event) if isinstance(event, RoomMessageMedia): - if event.sender != self.matrix_client.user_id: - messages.append(event) + messages.append(event) self.logger.log(f"Found {len(messages)} messages (limit: {n})", "debug") @@ -574,6 +580,39 @@ class GPTBot: self.logger.log("Sent image", "debug") + async def send_file( + self, room: MatrixRoom, file: bytes, filename: str, mime: str, msgtype: str + ): + """Send a file to a room. + + Args: + room (MatrixRoom): The room to send the file to. + file (bytes): The file to send. + filename (str): The name of the file. + mime (str): The MIME type of the file. + """ + + self.logger.log( + f"Sending file of size {len(file)} bytes to room {room.room_id}", "debug" + ) + + content_uri = await self.upload_file(file, filename, mime) + + self.logger.log("Uploaded file - sending message...", "debug") + + content = { + "body": filename, + "info": {"mimetype": mime, "size": len(file)}, + "msgtype": msgtype, + "url": content_uri, + } + + status = await self.matrix_client.room_send( + room.room_id, "m.room.message", content + ) + + self.logger.log("Sent file", "debug") + async def send_message( self, room: MatrixRoom | str, message: str, notice: bool = False ): @@ -861,6 +900,46 @@ class GPTBot: space, ) + def room_uses_stt(self, room: MatrixRoom | str) -> bool: + """Check if a room uses STT. + + Args: + room (MatrixRoom | str): The room to check. + + Returns: + bool: Whether the room uses STT. 
+ """ + room_id = room.room_id if isinstance(room, MatrixRoom) else room + + with closing(self.database.cursor()) as cursor: + cursor.execute( + "SELECT value FROM room_settings WHERE room_id = ? AND setting = ?", + (room_id, "stt"), + ) + result = cursor.fetchone() + + return False if not result else bool(int(result[0])) + + def room_uses_tts(self, room: MatrixRoom | str) -> bool: + """Check if a room uses TTS. + + Args: + room (MatrixRoom | str): The room to check. + + Returns: + bool: Whether the room uses TTS. + """ + room_id = room.room_id if isinstance(room, MatrixRoom) else room + + with closing(self.database.cursor()) as cursor: + cursor.execute( + "SELECT value FROM room_settings WHERE room_id = ? AND setting = ?", + (room_id, "tts"), + ) + result = cursor.fetchone() + + return False if not result else bool(int(result[0])) + def respond_to_room_messages(self, room: MatrixRoom | str) -> bool: """Check whether the bot should respond to all messages sent in a room. @@ -955,7 +1034,25 @@ class GPTBot: message_body = message.body if not self.chat_api.supports_chat_images() else [{"type": "text", "text": message.body}] chat_messages.append({"role": role, "content": message_body}) - if self.chat_api.supports_chat_images() and isinstance(message, RoomMessageMedia): + if isinstance(message, RoomMessageAudio): + role = ( + "assistant" if message.sender == self.matrix_client.user_id else "user" + ) + if message == event or (not message.event_id == event.event_id): + if self.room_uses_stt(room): + try: + download = await self.download_file(message.url) + message_text = await self.stt_api.speech_to_text(download.body) + except Exception as e: + self.logger.log(f"Error generating text from audio: {e}", "error") + message_text = message.body + else: + message_text = message.body + + message_body = message_text if not self.chat_api.supports_chat_images() else [{"type": "text", "text": message_text}] + chat_messages.append({"role": role, "content": message_body}) + + if 
self.chat_api.supports_chat_images() and isinstance(message, RoomMessageImage): image_url = message.url download = await self.download_file(image_url) @@ -1001,6 +1098,20 @@ class GPTBot: self.logger.log(f"Sending response to room {room.room_id}...") + if self.room_uses_tts(room): + self.logger.log("TTS enabled for room", "debug") + + try: + audio = await self.tts_api.text_to_speech(response) + await self.send_file(room, audio, response, "audio/mpeg", "m.audio") + return + + except Exception as e: + self.logger.log(f"Error generating audio: {e}", "error") + await self.send_message( + room, "Something went wrong generating audio file.", True + ) + message = await self.send_message(room, response) else: diff --git a/src/gptbot/classes/openai.py b/src/gptbot/classes/openai.py index 68e2c7d..f788344 100644 --- a/src/gptbot/classes/openai.py +++ b/src/gptbot/classes/openai.py @@ -3,13 +3,16 @@ import requests import asyncio import json + from functools import partial from contextlib import closing +from typing import Dict, List, Tuple, Generator, AsyncGenerator, Optional, Any +from io import BytesIO + +from pydub import AudioSegment from .logging import Logger -from typing import Dict, List, Tuple, Generator, AsyncGenerator, Optional, Any - ASSISTANT_CODE_INTERPRETER = [ { "type": "code_interpreter", @@ -30,17 +33,23 @@ class OpenAI: classification_api = chat_api image_model: str = "dall-e-2" + tts_model: str = "tts-1-hd" + tts_voice: str = "alloy" + stt_model: str = "whisper-1" operator: str = "OpenAI ([https://openai.com](https://openai.com))" - def __init__(self, bot, api_key, chat_model=None, image_model=None, base_url=None, logger=None): + def __init__(self, bot, api_key, chat_model=None, image_model=None, tts_model=None, tts_voice=None, stt_model=None, base_url=None, logger=None): self.bot = bot self.api_key = api_key self.chat_model = chat_model or self.chat_model self.image_model = image_model or self.image_model - self.logger = logger or Logger() + self.logger 
= logger or bot.logger or Logger() self.base_url = base_url or openai.base_url self.openai_api = openai.AsyncOpenAI(api_key=self.api_key, base_url=self.base_url) + self.tts_model = tts_model or self.tts_model + self.tts_voice = tts_voice or self.tts_voice + self.stt_model = stt_model or self.stt_model def supports_chat_images(self): return "vision" in self.chat_model @@ -266,6 +275,47 @@ Only the event_types mentioned above are allowed, you must not respond in any ot return result, tokens_used + async def text_to_speech(self, text: str, user: Optional[str] = None) -> bytes: + """Generate speech from text. + + Args: + text (str): The text to use. + + Returns: + bytes: The audio data. + """ + self.logger.log(f"Generating speech from text '{text}'...") + + speech = await self.openai_api.audio.speech.create( + model=self.tts_model, + input=text, + voice=self.tts_voice + ) + + return speech.content + + async def speech_to_text(self, audio: bytes, user: Optional[str] = None) -> str: + """Generate text from speech. + + Args: + audio (bytes): The audio data. + + Returns: + str: The transcribed text. + """ + self.logger.log("Generating text from speech...") + + response = await self.openai_api.audio.transcriptions.create( + model=self.stt_model, + file=BytesIO(audio), # NOTE(review): bare BytesIO has no filename; the SDK may need one to detect the audio format — verify + ) + + text = response.text + + self.logger.log(f"Generated text '{text}'.") + + return text + async def generate_image(self, prompt: str, user: Optional[str] = None) -> Generator[bytes, None, None]: """Generate an image from a prompt. diff --git a/src/gptbot/commands/__init__.py b/src/gptbot/commands/__init__.py index 17aa5cb..ca65ec5 100644 --- a/src/gptbot/commands/__init__.py +++ b/src/gptbot/commands/__init__.py @@ -22,6 +22,7 @@ for command in [ "dice", "parcel", "space", + "tts", ]: function = getattr(import_module( "."
+ command, "gptbot.commands"), "command_" + command) diff --git a/src/gptbot/commands/help.py b/src/gptbot/commands/help.py index db1a317..784d6e0 100644 --- a/src/gptbot/commands/help.py +++ b/src/gptbot/commands/help.py @@ -19,6 +19,7 @@ async def command_help(room: MatrixRoom, event: RoomMessageText, bot): - !gptbot chat \ - Send a message to the chat API - !gptbot classify \ - Classify a message using the classification API - !gptbot custom \ - Used for custom commands handled by the chat model and defined through the room's system message +- !gptbot roomsettings [use_classification|use_timing|always_reply|system_message|tts] [true|false|\] - Get or set room settings - !gptbot ignoreolder - Ignore messages before this point as context """ diff --git a/src/gptbot/commands/roomsettings.py b/src/gptbot/commands/roomsettings.py index 6d96441..7ea04e3 100644 --- a/src/gptbot/commands/roomsettings.py +++ b/src/gptbot/commands/roomsettings.py @@ -25,6 +25,8 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot): (room.room_id, "system_message", value, value) ) + bot.database.commit() + await bot.send_message(room, f"Alright, I've stored the system message: '{value}'.", True) return @@ -35,7 +37,7 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot): await bot.send_message(room, f"The current system message is: '{system_message}'.", True) return - if setting in ("use_classification", "always_reply", "use_timing"): + if setting in ("use_classification", "always_reply", "use_timing", "tts", "stt"): if value: if value.lower() in ["true", "false"]: value = value.lower() == "true" @@ -49,6 +51,8 @@ async def command_roomsettings(room: MatrixRoom, event: RoomMessageText, bot): (room.room_id, setting, "1" if value else "0", "1" if value else "0") ) + bot.database.commit() + await bot.send_message(room, f"Alright, I've set {setting} to: '{value}'.", True) return @@ -81,6 +85,9 @@ async def command_roomsettings(room: 
MatrixRoom, event: RoomMessageText, bot): - system_message [message]: Get or set the system message to be sent to the chat model - classification [true/false]: Get or set whether the room uses classification - always_reply [true/false]: Get or set whether the bot should reply to all messages (if false, only reply to mentions and commands) +- tts [true/false]: Get or set whether the bot should generate audio files instead of sending text +- stt [true/false]: Get or set whether the bot should attempt to process information from audio files +- timing [true/false]: Get or set whether the bot should return information about the time it took to generate a response """ await bot.send_message(room, message, True) diff --git a/src/gptbot/commands/tts.py b/src/gptbot/commands/tts.py new file mode 100644 index 0000000..d048a54 --- /dev/null +++ b/src/gptbot/commands/tts.py @@ -0,0 +1,23 @@ +from nio.events.room_events import RoomMessageText +from nio.rooms import MatrixRoom + + +async def command_tts(room: MatrixRoom, event: RoomMessageText, bot): + prompt = " ".join(event.body.split()[2:]) + + if prompt: + bot.logger.log("Generating speech...") + + try: + content = await bot.tts_api.text_to_speech(prompt, user=room.room_id) + except Exception as e: + bot.logger.log(f"Error generating speech: {e}", "error") + await bot.send_message(room, "Sorry, I couldn't generate an audio file. Please try again later.", True) + return + + bot.logger.log(f"Sending audio file...") + await bot.send_file(room, content, "audio.mp3", "audio/mpeg", "m.audio") + + return + + await bot.send_message(room, "You need to provide a prompt.", True)