feat: enhance tool and image handling
All checks were successful
Docker CI/CD / Docker Build and Push to Docker Hub (push) Successful in 8m22s
Python Package CI/CD / Setup and Test (push) Successful in 1m11s
Python Package CI/CD / Publish to PyPI (push) Successful in 38s

Introduced changes to tool request behavior and image processing. The configuration now allows a dedicated model for tool requests (`ToolModel`), and context images are automatically resized to configurable maximum dimensions before being sent to the AI model, improving compatibility and performance. The update moves away from a hardcoded tool model, accommodating models with varying tool support, and reduces network and processing overhead for images. These adjustments make tool usage more flexible and image handling more efficient in chat interactions.
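
For reference, the tool-model selection introduced in `openai.py` (see the diff below) reduces to roughly the following sketch. The helper name is illustrative only, and the real code path additionally checks `allow_override` and `use_tools`:

```python
from typing import Optional

# Models the bot currently treats as tool-capable (mirrors _is_tool_model below).
KNOWN_TOOL_MODELS = ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o")


def pick_tool_model(chat_model: str, tool_model: Optional[str], force_tools: bool) -> str:
    """Illustrative helper, not part of the bot's code."""
    # Keep the default chat model if it is known to support tools,
    # or if ForceTools overrides the check entirely.
    if chat_model in KNOWN_TOOL_MODELS or force_tools:
        return chat_model
    # Otherwise fall back to the dedicated ToolModel, if one is configured.
    return tool_model or chat_model
```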
This commit is contained in:
Kumi 2024-05-20 10:20:17 +02:00
parent 89f06268a5
commit 3f084ffdd3
Signed by: kumi
GPG key ID: ECBCC9082395383F
5 changed files with 108 additions and 48 deletions

@@ -1,5 +1,11 @@
# Changelog
### 0.3.13 (2024-05-20)
- **Breaking Change**: The `ForceTools` configuration option behavior has changed. Instead of using a separate model for tools, the bot will now try to use the default chat model for tool requests, even if that model is not known to support tools.
- Added `ToolModel` to OpenAI configuration to allow specifying a separate model for tool requests
- Automatically resize context images to a default maximum of 2000x768 pixels before sending them to the AI model
### 0.3.12 (2024-05-17)
- Added `ForceVision` to OpenAI configuration to allow third-party models to be used for image recognition

@@ -107,11 +107,20 @@ APIKey = sk-yoursecretkey
# Whether to force the use of tools in the chat completion model
#
# Currently, only gpt-3.5-turbo supports tools. If you set this to 1, the bot
# will use that model for tools even if you have a different model set as the
# default. It will only generate the final result using the default model.
# This will make the bot allow the use of tools in the chat completion model,
# even if the model you are using isn't known to support tools. This is useful
# if you are using a self-hosted model that supports tools, but the bot doesn't
# know about it.
#
# ForceTools = 0
# ForceTools = 1
# Whether a dedicated model should be used for tools
#
# This will make the bot use a dedicated model for tools. This is useful if you
# want to use a model that doesn't support tools, but still want to be able to
# use tools.
#
# ToolModel = gpt-4o
# Whether to emulate tools in the chat completion model
#
@@ -130,6 +139,16 @@ APIKey = sk-yoursecretkey
#
# ForceVision = 0
# Maximum width and height of images sent to the API if vision is enabled
#
# The OpenAI API has a limit of 2000 pixels for the long side of an image, and
# 768 pixels for the short side. You may have to adjust these values if you're
# using a self-hosted model that has different limits. You can also set these
# to 0 to disable image resizing.
#
# MaxImageLongSide = 2000
# MaxImageShortSide = 768
# Advanced settings for the OpenAI API
#
# These settings are not required for normal operation, but can be used to

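As a rough illustration of how these limits are applied (a minimal sketch of the resizing added in `bot.py` further down; the function name is made up for this example, and Pillow is assumed to be installed):

```python
from PIL import Image  # Pillow


def fit_to_limits(img: Image.Image, max_long: int = 2000, max_short: int = 768) -> Image.Image:
    """Illustrative only: shrink an image to fit the configured long/short side limits."""
    if not (max_long and max_short):
        # A limit of 0 disables resizing, as described above.
        return img
    # thumbnail() preserves the aspect ratio and only ever shrinks the image.
    if img.width > img.height:
        img.thumbnail((max_long, max_short))  # landscape: width is the long side
    else:
        img.thumbnail((max_short, max_long))  # portrait or square: height is the long side
    return img
```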
@@ -7,7 +7,7 @@ allow-direct-references = true
[project]
name = "matrix-gptbot"
version = "0.3.12"
version = "0.3.13"
authors = [
{ name = "Kumi Mitterer", email = "gptbot@kumi.email" },

@@ -97,6 +97,14 @@ class OpenAI(BaseAI):
def force_tools(self):
return self._config.getboolean("ForceTools", fallback=False)
@property
def tool_model(self):
return self._config.get("ToolModel")
@property
def vision_model(self):
return self._config.get("VisionModel")
@property
def emulate_tools(self):
return self._config.getboolean("EmulateTools", fallback=False)
@@ -106,12 +114,26 @@ class OpenAI(BaseAI):
# TODO: This should be model-specific
return self._config.getint("MaxTokens", fallback=4000)
@property
def max_messages(self):
return self._config.getint("MaxMessages", fallback=30)
@property
def max_image_long_side(self):
return self._config.getint("MaxImageLongSide", fallback=2000)
@property
def max_image_short_side(self):
return self._config.getint("MaxImageShortSide", fallback=768)
def _is_tool_model(self, model: str) -> bool:
return model in ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o")
def _is_vision_model(self, model: str) -> bool:
return model in ("gpt-4-turbo", "gpt-4o") or "vision" in model
def supports_chat_images(self):
return (
"vision" in self.chat_model
or self.chat_model in ("gpt-4o",)
or self.force_vision
)
return self._is_vision_model(self.chat_model) or self.force_vision
def json_decode(self, data):
if data.startswith("```json\n"):
@@ -194,11 +216,15 @@ class OpenAI(BaseAI):
original_messages = messages
# TODO: I believe more models support tools now, so this could be adapted
if allow_override and "gpt-3.5-turbo" not in original_model:
if self.force_tools:
if (
allow_override
and use_tools
and self.tool_model
and not (self._is_tool_model(chat_model) or self.force_tools)
):
if self.tool_model:
self.logger.log("Overriding chat model to use tools")
chat_model = "gpt-3.5-turbo"
chat_model = self.tool_model
out_messages = []
@@ -225,8 +251,8 @@
use_tools
and self.emulate_tools
and not self.force_tools
and "gpt-3.5-turbo" not in chat_model
): # TODO: This should be adapted to use tools with more models
and not self._is_tool_model(chat_model)
):
self.bot.logger.log("Using tool emulation mode.", "debug")
messages = (
@@ -272,9 +298,10 @@
"presence_penalty": self.presence_penalty,
}
if "gpt-3.5-turbo" in chat_model and use_tools:
if (self._is_tool_model(chat_model) and use_tools) or self.force_tools:
kwargs["tools"] = tools
# TODO: Look into this
if "gpt-4" in chat_model:
kwargs["max_tokens"] = self.max_tokens
@@ -685,10 +712,10 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
messages = [{"role": "system", "content": system_message}] + messages[1:]
if "vision" not in (chat_model := self.chat_model) and chat_model not in (
"gpt-4o",
):
chat_model = "gpt-4o"
chat_model = self.chat_model
if not self._is_vision_model(chat_model):
chat_model = self.vision_model or "gpt-4o"
chat_partial = partial(
self.openai_api.chat.completions.create,

@@ -29,11 +29,13 @@ from nio import (
RoomMessageAudio,
DownloadError,
RoomGetStateError,
DiskDownloadResponse,
MemoryDownloadResponse,
)
from nio.store import SqliteStore
from typing import Optional, List, Any
from typing import Optional, List, Any, Union
from configparser import ConfigParser
from datetime import datetime
from io import BytesIO
@@ -126,26 +128,6 @@ class GPTBot:
"""
return self.config["GPTBot"].getboolean("ForceSystemMessage", False)
@property
def max_tokens(self) -> int:
"""Maximum number of input tokens.
Returns:
int: The maximum number of input tokens. Defaults to 3000.
"""
return self.config["OpenAI"].getint("MaxTokens", 3000)
# TODO: Move this to OpenAI class
@property
def max_messages(self) -> int:
"""Maximum number of messages to consider as input.
Returns:
int: The maximum number of messages to consider as input. Defaults to 30.
"""
return self.config["OpenAI"].getint("MaxMessages", 30)
# TODO: Move this to OpenAI class
@property
def operator(self) -> Optional[str]:
"""Operator of the bot.
@@ -309,7 +291,7 @@ class GPTBot:
ignore_notices: bool = True,
):
messages = []
n = n or self.max_messages
n = n or self.chat_api.max_messages
room_id = room.room_id if isinstance(room, MatrixRoom) else room
self.logger.log(
@@ -378,7 +360,7 @@
model: Optional[str] = None,
system_message: Optional[str] = None,
):
max_tokens = max_tokens or self.max_tokens
max_tokens = max_tokens or self.chat_api.max_tokens
model = model or self.chat_api.chat_model
system_message = (
self.default_system_message if system_message is None else system_message
@@ -1168,7 +1150,9 @@ class GPTBot:
return
try:
last_messages = await self._last_n_messages(room.room_id, self.max_messages)
last_messages = await self._last_n_messages(
room.room_id, self.chat_api.max_messages
)
except Exception as e:
self.logger.log(f"Error getting last messages: {e}", "error")
await self.send_message(
@@ -1271,7 +1255,27 @@
download = await self.download_file(image_url)
if download:
encoded_url = f"data:{download.content_type};base64,{base64.b64encode(download.body).decode('utf-8')}"
pil_image = Image.open(BytesIO(download.body))
file_format = pil_image.format or "PNG"
max_long_side = self.chat_api.max_image_long_side
max_short_side = self.chat_api.max_image_short_side
if max_long_side and max_short_side:
if pil_image.width > pil_image.height:
if pil_image.width > max_long_side:
pil_image.thumbnail((max_long_side, max_short_side))
else:
if pil_image.height > max_long_side:
pil_image.thumbnail((max_short_side, max_long_side))
bio = BytesIO()
pil_image.save(bio, format=file_format)
encoded_url = f"data:{download.content_type};base64,{base64.b64encode(bio.getvalue()).decode('utf-8')}"
parent = (
chat_messages[-1]
if chat_messages
@@ -1312,7 +1316,9 @@
# Truncate messages to fit within the token limit
self._truncate(
chat_messages[1:], self.max_tokens - 1, system_message=system_message
chat_messages[1:],
self.chat_api.max_tokens - 1,
system_message=system_message,
)
# Check for a model override
@@ -1362,7 +1368,9 @@
await self.matrix_client.room_typing(room.room_id, False)
async def download_file(self, mxc) -> Optional[bytes]:
async def download_file(
self, mxc
) -> Union[DiskDownloadResponse, MemoryDownloadResponse]:
"""Download a file from the homeserver.
Args: