From ad600faf4b51f76cec2ce4a90e5c2d38a0884b04 Mon Sep 17 00:00:00 2001
From: Kumi <git@kumi.email>
Date: Wed, 29 Nov 2023 14:53:19 +0100
Subject: [PATCH] Refine logging and add image description feature

Enhanced the speech generation logging to display the word count of the input text instead of the full text. This change prioritizes user privacy and improves log readability. Implemented a new feature to generate descriptions for images within a conversation, expanding the bot's capabilities. Also refactored the `BaseTool` class to safely access arguments through the `.get` method and to include `messages` by default, so that missing arguments are handled gracefully.
---
 src/gptbot/classes/openai.py         | 37 +++++++++++++++++++++++++++-
 src/gptbot/tools/base.py             |  7 +++---
 src/gptbot/tools/imagedescription.py | 19 ++++----------
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/src/gptbot/classes/openai.py b/src/gptbot/classes/openai.py
index fa55858..5352230 100644
--- a/src/gptbot/classes/openai.py
+++ b/src/gptbot/classes/openai.py
@@ -4,6 +4,7 @@ import tiktoken
 
 import asyncio
 import json
+import base64
 
 from functools import partial
 from contextlib import closing
@@ -387,7 +388,7 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
         Yields:
             bytes: The audio data.
         """
-        self.logger.log(f"Generating speech from text '{text}'...")
+        self.logger.log(f"Generating speech from text of length: {len(text.split())} words...")
 
         speech = await self.openai_api.audio.speech.create(
             model=self.tts_model,
@@ -475,3 +476,37 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
             images.append(image)
 
         return images, len(images)
+
+    async def describe_images(self, messages: list, user: Optional[str] = None) -> Tuple[str, int]:
+        """Generate descriptions for the images in a conversation.
+
+        Args:
+            messages (list): The conversation; its first entry is replaced by a dedicated system prompt.
+            user (Optional[str]): The user identifier forwarded to the API. Defaults to None.
+
+        Returns:
+            Tuple[str, int]: The description and the number of tokens used.
+        """
+        self.logger.log("Generating description for images in conversation...")
+
+        system_message = "You are an image description generator. You generate descriptions for all images in the current conversation, one after another."
+
+        # Swap the conversation's first message for the description prompt.
+        messages = [{"role": "system", "content": system_message}] + messages[1:]
+
+        # Fall back to a vision model when the configured model's name does not contain "vision".
+        chat_model = self.chat_model
+        if "vision" not in chat_model:
+            chat_model = "gpt-4-vision-preview"
+
+        # Pass the resolved model, not self.chat_model, so the fallback actually takes effect.
+        chat_partial = partial(
+            self.openai_api.chat.completions.create,
+            model=chat_model,
+            messages=messages,
+            user=user,
+        )
+
+        response = await self._request_with_retries(chat_partial)
+
+        return response.choices[0].message.content, response.usage.total_tokens
\ No newline at end of file
diff --git a/src/gptbot/tools/base.py b/src/gptbot/tools/base.py
index 69975b5..e85754d 100644
--- a/src/gptbot/tools/base.py
+++ b/src/gptbot/tools/base.py
@@ -4,9 +4,10 @@ class BaseTool:
 
     def __init__(self, **kwargs):
         self.kwargs = kwargs
-        self.bot = kwargs["bot"]
-        self.room = kwargs["room"]
-        self.user = kwargs["user"]
+        self.bot = kwargs.get("bot")
+        self.room = kwargs.get("room")
+        self.user = kwargs.get("user")
+        self.messages = kwargs.get("messages", [])
 
     async def run(self):
         raise NotImplementedError()
diff --git a/src/gptbot/tools/imagedescription.py b/src/gptbot/tools/imagedescription.py
index b82f07c..2c83d98 100644
--- a/src/gptbot/tools/imagedescription.py
+++ b/src/gptbot/tools/imagedescription.py
@@ -1,24 +1,15 @@
 from .base import BaseTool, Handover
 
 class Imagedescription(BaseTool):
-    DESCRIPTION = "Describe the content of an image."
+    DESCRIPTION = "Describe the content of the images in the conversation."
     PARAMETERS = {
         "type": "object",
         "properties": {
-            "image": {
-                "type": "string",
-                "description": "The image to describe.",
-            },
         },
-        "required": ["image"],
     }
 
     async def run(self):
-        """Describe an image.
-        
-        This tool only hands over to the original model, if applicable.
-        It is intended to handle the case where GPT-3 thinks it is asked to
-        *generate* an image, but the user actually wants to *describe* an
-        image...
-        """
-        raise Handover()
\ No newline at end of file
+        """Describe the conversation's images via the bot's image API and return the description text."""
+        image_api = self.bot.image_api
+
+        return (await image_api.describe_images(self.messages, self.user))[0]
\ No newline at end of file