diff --git a/src/gptbot/classes/openai.py b/src/gptbot/classes/openai.py
index fa55858..5352230 100644
--- a/src/gptbot/classes/openai.py
+++ b/src/gptbot/classes/openai.py
@@ -4,6 +4,7 @@
 import tiktoken
 import asyncio
 import json
+import base64
 
 from functools import partial
 from contextlib import closing
@@ -387,7 +388,7 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
         Yields:
             bytes: The audio data.
         """
-        self.logger.log(f"Generating speech from text '{text}'...")
+        self.logger.log(f"Generating speech from text of length: {len(text.split())} words...")
 
         speech = await self.openai_api.audio.speech.create(
             model=self.tts_model,
@@ -475,3 +476,40 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
         images.append(image)
 
         return images, len(images)
+
+    async def describe_images(self, messages: list, user: Optional[str] = None) -> Tuple[str, int]:
+        """Generate descriptions for the images in a conversation.
+
+        Args:
+            messages (list): The conversation messages (may contain images).
+            user (Optional[str]): The user identifier passed through to the API.
+
+        Returns:
+            Tuple[str, int]: The description and the number of tokens used.
+        """
+        self.logger.log("Generating description for images in conversation...")
+
+        system_message = "You are an image description generator. You generate descriptions for all images in the current conversation, one after another."
+
+        # NOTE(review): assumes messages[0] is the original system message — replace it with ours.
+        messages = [
+            {
+                "role": "system",
+                "content": system_message
+            }
+        ] + messages[1:]
+
+        # Fall back to a vision-capable model if the configured chat model is not one.
+        if "vision" not in (chat_model := self.chat_model):
+            chat_model = "gpt-4-vision-preview"
+
+        chat_partial = partial(
+            self.openai_api.chat.completions.create,
+            model=chat_model,
+            messages=messages,
+            user=user,
+        )
+
+        response = await self._request_with_retries(chat_partial)
+
+        return response.choices[0].message.content, response.usage.total_tokens
\ No newline at end of file
diff --git a/src/gptbot/tools/base.py b/src/gptbot/tools/base.py
index 69975b5..e85754d 100644
--- a/src/gptbot/tools/base.py
+++ b/src/gptbot/tools/base.py
@@ -4,9 +4,10 @@
 class BaseTool:
     def __init__(self, **kwargs):
         self.kwargs = kwargs
-        self.bot = kwargs["bot"]
-        self.room = kwargs["room"]
-        self.user = kwargs["user"]
+        self.bot = kwargs.get("bot")
+        self.room = kwargs.get("room")
+        self.user = kwargs.get("user")
+        self.messages = kwargs.get("messages", [])
 
     async def run(self):
         raise NotImplementedError()
diff --git a/src/gptbot/tools/imagedescription.py b/src/gptbot/tools/imagedescription.py
index b82f07c..2c83d98 100644
--- a/src/gptbot/tools/imagedescription.py
+++ b/src/gptbot/tools/imagedescription.py
@@ -1,24 +1,15 @@
 from .base import BaseTool, Handover
 
 class Imagedescription(BaseTool):
-    DESCRIPTION = "Describe the content of an image."
+    DESCRIPTION = "Describe the content of the images in the conversation."
     PARAMETERS = {
         "type": "object",
         "properties": {
-            "image": {
-                "type": "string",
-                "description": "The image to describe.",
-            },
         },
-        "required": ["image"],
     }
 
     async def run(self):
-        """Describe an image.
-
-        This tool only hands over to the original model, if applicable.
-        It is intended to handle the case where GPT-3 thinks it is asked to
-        *generate* an image, but the user actually wants to *describe* an
-        image...
-        """
-        raise Handover()
\ No newline at end of file
+        """Describe images in the conversation."""
+        image_api = self.bot.image_api
+
+        return (await image_api.describe_images(self.messages, self.user))[0]
\ No newline at end of file