Refine logging and add image description feature

Enhanced the speech generation logging to display the word count of the input text instead of the full text. This change prioritizes user privacy and improves log readability. Implemented a new feature to generate descriptions for images within a conversation, expanding the bot's capabilities. Also refactored the `BaseTool` class to access arguments safely through the `.get` method and to include `messages` by default, ensuring graceful handling of missing arguments.
2023-11-29 14:53:19 +01:00 · 2023-11-29 14:53:19 +01:00 · ad600faf4b
commit ad600faf4b
parent 03768b5b27
3 changed files with 45 additions and 18 deletions
--- a/src/gptbot/classes/openai.py
+++ b/src/gptbot/classes/openai.py
@ -4,6 +4,7 @@ import tiktoken

 import asyncio
 import json
+import base64

 from functools import partial
 from contextlib import closing
@ -387,7 +388,7 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
        Yields:
            bytes: The audio data.
        """
-        self.logger.log(f"Generating speech from text '{text}'...")
+        self.logger.log(f"Generating speech from text of length: {len(text.split())} words...")

        speech = await self.openai_api.audio.speech.create(
            model=self.tts_model,
@ -475,3 +476,37 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
            images.append(image)

        return images, len(images)
+
+    async def describe_images(self, messages: list, user: Optional[str] = None) -> Tuple[str, int]:
+        """Generate a description for the images in a conversation.
+
+        Args:
+            messages (list): Conversation messages. The first entry is dropped and replaced by an image-description system prompt.
+            user (Optional[str]): Forwarded as the `user` argument of the chat completion request. Defaults to None.
+
+        Returns:
+            Tuple[str, int]: The description and the number of tokens used.
+        """
+        self.logger.log("Generating description for images in conversation...")
+
+        system_message = "You are an image description generator. You generate descriptions for all images in the current conversation, one after another."
+
+        messages = [
+            {
+                "role": "system",
+                "content": system_message
+            }
+        ] + messages[1:]
+
+        # Keep the configured model only if its name marks it as a vision model; otherwise fall back.
+        chat_model = self.chat_model if "vision" in self.chat_model else "gpt-4-vision-preview"
+
+        chat_partial = partial(
+            self.openai_api.chat.completions.create,
+            model=chat_model,
+            messages=messages,
+            user=user,
+        )
+
+        response = await self._request_with_retries(chat_partial)
+        return response.choices[0].message.content, response.usage.total_tokens
--- a/src/gptbot/tools/base.py
+++ b/src/gptbot/tools/base.py
@ -4,9 +4,10 @@ class BaseTool:

    def __init__(self, **kwargs):
        self.kwargs = kwargs
-        self.bot = kwargs["bot"]
-        self.room = kwargs["room"]
-        self.user = kwargs["user"]
+        self.bot = kwargs.get("bot")
+        self.room = kwargs.get("room")
+        self.user = kwargs.get("user")
+        self.messages = kwargs.get("messages", [])

    async def run(self):
        raise NotImplementedError()
--- a/src/gptbot/tools/imagedescription.py
+++ b/src/gptbot/tools/imagedescription.py
@ -1,24 +1,15 @@
 from .base import BaseTool, Handover

 class Imagedescription(BaseTool):
-    DESCRIPTION = "Describe the content of an image."
+    DESCRIPTION = "Describe the content of the images in the conversation."
    PARAMETERS = {
        "type": "object",
        "properties": {
-            "image": {
-                "type": "string",
-                "description": "The image to describe.",
-            },
        },
-        "required": ["image"],
    }

    async def run(self):
-        """Describe an image.
+        """Describe images in the conversation."""
+        image_api = self.bot.image_api

-        This tool only hands over to the original model, if applicable.
-        It is intended to handle the case where GPT-3 thinks it is asked to
-        *generate* an image, but the user actually wants to *describe* an
-        image...
-        """
-        raise Handover()
+        return (await image_api.describe_images(self.messages, self.user))[0]