feat: enhance tool and image handling
All checks were successful
Docker CI/CD / Docker Build and Push to Docker Hub (push) Successful in 8m22s
Python Package CI/CD / Setup and Test (push) Successful in 1m11s
Python Package CI/CD / Publish to PyPI (push) Successful in 38s

Introduced changes to the tool request behavior and image processing. Now, the configuration allows a dedicated model for tool requests (`ToolModel`) and enforces automatic resizing of context images to maximal dimensions, improving compatibility and performance with the AI model. The update shifts away from a rigid tool model use, accommodating varied model support for tool requests, and optimizes image handling for network and processing efficiency. These adjustments aim to enhance user experience with more flexible tool usage and efficient image handling in chat interactions.
This commit is contained in:
Kumi 2024-05-20 10:20:17 +02:00
parent 89f06268a5
commit 3f084ffdd3
Signed by: kumi
GPG key ID: ECBCC9082395383F
5 changed files with 108 additions and 48 deletions

View file

@ -1,5 +1,11 @@
# Changelog # Changelog
### 0.3.13 (2024-05-20)
- **Breaking Change**: The `ForceTools` configuration option behavior has changed. Instead of using a separate model for tools, the bot will now try to use the default chat model for tool requests, even if that model is not known to support tools.
- Added `ToolModel` to OpenAI configuration to allow specifying a separate model for tool requests
- Automatically resize context images to a default maximum of 2000x768 pixels before sending them to the AI model
### 0.3.12 (2024-05-17) ### 0.3.12 (2024-05-17)
- Added `ForceVision` to OpenAI configuration to allow third-party models to be used for image recognition - Added `ForceVision` to OpenAI configuration to allow third-party models to be used for image recognition

View file

@ -107,11 +107,20 @@ APIKey = sk-yoursecretkey
# Whether to force the use of tools in the chat completion model # Whether to force the use of tools in the chat completion model
# #
# Currently, only gpt-3.5-turbo supports tools. If you set this to 1, the bot # This will make the bot allow the use of tools in the chat completion model,
# will use that model for tools even if you have a different model set as the # even if the model you are using isn't known to support tools. This is useful
# default. It will only generate the final result using the default model. # if you are using a self-hosted model that supports tools, but the bot doesn't
# know about it.
# #
# ForceTools = 0 # ForceTools = 1
# Whether a dedicated model should be used for tools
#
# This will make the bot use a dedicated model for tool requests. This is
# useful if your default chat model doesn't support tools but you still want
# the bot to be able to use them.
#
# ToolModel = gpt-4o
# Whether to emulate tools in the chat completion model # Whether to emulate tools in the chat completion model
# #
@ -130,6 +139,16 @@ APIKey = sk-yoursecretkey
# #
# ForceVision = 0 # ForceVision = 0
# Maximum width and height of images sent to the API if vision is enabled
#
# The OpenAI API has a limit of 2000 pixels for the long side of an image, and
# 768 pixels for the short side. You may have to adjust these values if you're
# using a self-hosted model that has different limits. You can also set these
# to 0 to disable image resizing.
#
# MaxImageLongSide = 2000
# MaxImageShortSide = 768
# Advanced settings for the OpenAI API # Advanced settings for the OpenAI API
# #
# These settings are not required for normal operation, but can be used to # These settings are not required for normal operation, but can be used to

View file

@ -7,7 +7,7 @@ allow-direct-references = true
[project] [project]
name = "matrix-gptbot" name = "matrix-gptbot"
version = "0.3.12" version = "0.3.13"
authors = [ authors = [
{ name = "Kumi Mitterer", email = "gptbot@kumi.email" }, { name = "Kumi Mitterer", email = "gptbot@kumi.email" },

View file

@ -97,6 +97,14 @@ class OpenAI(BaseAI):
def force_tools(self): def force_tools(self):
return self._config.getboolean("ForceTools", fallback=False) return self._config.getboolean("ForceTools", fallback=False)
@property
def tool_model(self):
    """Optional dedicated model for tool requests (config key `ToolModel`).

    Returns None when the option is unset; in that case the default chat
    model is used for tool requests as well.
    """
    return self._config.get("ToolModel")
@property
def vision_model(self):
    """Optional dedicated model for image recognition (config key `VisionModel`).

    Returns None when the option is unset; callers fall back to a
    vision-capable default model.
    """
    return self._config.get("VisionModel")
@property @property
def emulate_tools(self): def emulate_tools(self):
return self._config.getboolean("EmulateTools", fallback=False) return self._config.getboolean("EmulateTools", fallback=False)
@ -106,12 +114,26 @@ class OpenAI(BaseAI):
# TODO: This should be model-specific # TODO: This should be model-specific
return self._config.getint("MaxTokens", fallback=4000) return self._config.getint("MaxTokens", fallback=4000)
@property
def max_messages(self):
    """Maximum number of prior room messages used as chat context.

    Read from config key `MaxMessages`; defaults to 30.
    """
    return self._config.getint("MaxMessages", fallback=30)
@property
def max_image_long_side(self):
    """Maximum pixel size of the long side of context images.

    Read from config key `MaxImageLongSide`; defaults to 2000 (the OpenAI
    API limit). A value of 0 disables resizing.
    """
    return self._config.getint("MaxImageLongSide", fallback=2000)
@property
def max_image_short_side(self):
    """Maximum pixel size of the short side of context images.

    Read from config key `MaxImageShortSide`; defaults to 768 (the OpenAI
    API limit). A value of 0 disables resizing.
    """
    return self._config.getint("MaxImageShortSide", fallback=768)
def _is_tool_model(self, model: str) -> bool:
return model in ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o")
def _is_vision_model(self, model: str) -> bool:
return model in ("gpt-4-turbo", "gpt-4o") or "vision" in model
def supports_chat_images(self): def supports_chat_images(self):
return ( return self._is_vision_model(self.chat_model) or self.force_vision
"vision" in self.chat_model
or self.chat_model in ("gpt-4o",)
or self.force_vision
)
def json_decode(self, data): def json_decode(self, data):
if data.startswith("```json\n"): if data.startswith("```json\n"):
@ -194,11 +216,15 @@ class OpenAI(BaseAI):
original_messages = messages original_messages = messages
# TODO: I believe more models support tools now, so this could be adapted if (
if allow_override and "gpt-3.5-turbo" not in original_model: allow_override
if self.force_tools: and use_tools
and self.tool_model
and not (self._is_tool_model(chat_model) or self.force_tools)
):
if self.tool_model:
self.logger.log("Overriding chat model to use tools") self.logger.log("Overriding chat model to use tools")
chat_model = "gpt-3.5-turbo" chat_model = self.tool_model
out_messages = [] out_messages = []
@ -225,8 +251,8 @@ class OpenAI(BaseAI):
use_tools use_tools
and self.emulate_tools and self.emulate_tools
and not self.force_tools and not self.force_tools
and "gpt-3.5-turbo" not in chat_model and not self._is_tool_model(chat_model)
): # TODO: This should be adapted to use tools with more models ):
self.bot.logger.log("Using tool emulation mode.", "debug") self.bot.logger.log("Using tool emulation mode.", "debug")
messages = ( messages = (
@ -272,9 +298,10 @@ class OpenAI(BaseAI):
"presence_penalty": self.presence_penalty, "presence_penalty": self.presence_penalty,
} }
if "gpt-3.5-turbo" in chat_model and use_tools: if (self._is_tool_model(chat_model) and use_tools) or self.force_tools:
kwargs["tools"] = tools kwargs["tools"] = tools
# TODO: Look into this
if "gpt-4" in chat_model: if "gpt-4" in chat_model:
kwargs["max_tokens"] = self.max_tokens kwargs["max_tokens"] = self.max_tokens
@ -685,10 +712,10 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
messages = [{"role": "system", "content": system_message}] + messages[1:] messages = [{"role": "system", "content": system_message}] + messages[1:]
if "vision" not in (chat_model := self.chat_model) and chat_model not in ( chat_model = self.chat_model
"gpt-4o",
): if not self._is_vision_model(chat_model):
chat_model = "gpt-4o" chat_model = self.vision_model or "gpt-4o"
chat_partial = partial( chat_partial = partial(
self.openai_api.chat.completions.create, self.openai_api.chat.completions.create,

View file

@ -29,11 +29,13 @@ from nio import (
RoomMessageAudio, RoomMessageAudio,
DownloadError, DownloadError,
RoomGetStateError, RoomGetStateError,
DiskDownloadResponse,
MemoryDownloadResponse,
) )
from nio.store import SqliteStore from nio.store import SqliteStore
from typing import Optional, List, Any from typing import Optional, List, Any, Union
from configparser import ConfigParser from configparser import ConfigParser
from datetime import datetime from datetime import datetime
from io import BytesIO from io import BytesIO
@ -126,26 +128,6 @@ class GPTBot:
""" """
return self.config["GPTBot"].getboolean("ForceSystemMessage", False) return self.config["GPTBot"].getboolean("ForceSystemMessage", False)
@property
def max_tokens(self) -> int:
"""Maximum number of input tokens.
Returns:
int: The maximum number of input tokens. Defaults to 3000.
"""
return self.config["OpenAI"].getint("MaxTokens", 3000)
# TODO: Move this to OpenAI class
@property
def max_messages(self) -> int:
"""Maximum number of messages to consider as input.
Returns:
int: The maximum number of messages to consider as input. Defaults to 30.
"""
return self.config["OpenAI"].getint("MaxMessages", 30)
# TODO: Move this to OpenAI class
@property @property
def operator(self) -> Optional[str]: def operator(self) -> Optional[str]:
"""Operator of the bot. """Operator of the bot.
@ -309,7 +291,7 @@ class GPTBot:
ignore_notices: bool = True, ignore_notices: bool = True,
): ):
messages = [] messages = []
n = n or self.max_messages n = n or self.chat_api.max_messages
room_id = room.room_id if isinstance(room, MatrixRoom) else room room_id = room.room_id if isinstance(room, MatrixRoom) else room
self.logger.log( self.logger.log(
@ -378,7 +360,7 @@ class GPTBot:
model: Optional[str] = None, model: Optional[str] = None,
system_message: Optional[str] = None, system_message: Optional[str] = None,
): ):
max_tokens = max_tokens or self.max_tokens max_tokens = max_tokens or self.chat_api.max_tokens
model = model or self.chat_api.chat_model model = model or self.chat_api.chat_model
system_message = ( system_message = (
self.default_system_message if system_message is None else system_message self.default_system_message if system_message is None else system_message
@ -1168,7 +1150,9 @@ class GPTBot:
return return
try: try:
last_messages = await self._last_n_messages(room.room_id, self.max_messages) last_messages = await self._last_n_messages(
room.room_id, self.chat_api.max_messages
)
except Exception as e: except Exception as e:
self.logger.log(f"Error getting last messages: {e}", "error") self.logger.log(f"Error getting last messages: {e}", "error")
await self.send_message( await self.send_message(
@ -1271,7 +1255,27 @@ class GPTBot:
download = await self.download_file(image_url) download = await self.download_file(image_url)
if download: if download:
encoded_url = f"data:{download.content_type};base64,{base64.b64encode(download.body).decode('utf-8')}" pil_image = Image.open(BytesIO(download.body))
file_format = pil_image.format or "PNG"
max_long_side = self.chat_api.max_image_long_side
max_short_side = self.chat_api.max_image_short_side
if max_long_side and max_short_side:
if pil_image.width > pil_image.height:
if pil_image.width > max_long_side:
pil_image.thumbnail((max_long_side, max_short_side))
else:
if pil_image.height > max_long_side:
pil_image.thumbnail((max_short_side, max_long_side))
bio = BytesIO()
pil_image.save(bio, format=file_format)
encoded_url = f"data:{download.content_type};base64,{base64.b64encode(bio.getvalue()).decode('utf-8')}"
parent = ( parent = (
chat_messages[-1] chat_messages[-1]
if chat_messages if chat_messages
@ -1312,7 +1316,9 @@ class GPTBot:
# Truncate messages to fit within the token limit # Truncate messages to fit within the token limit
self._truncate( self._truncate(
chat_messages[1:], self.max_tokens - 1, system_message=system_message chat_messages[1:],
self.chat_api.max_tokens - 1,
system_message=system_message,
) )
# Check for a model override # Check for a model override
@ -1362,7 +1368,9 @@ class GPTBot:
await self.matrix_client.room_typing(room.room_id, False) await self.matrix_client.room_typing(room.room_id, False)
async def download_file(self, mxc) -> Optional[bytes]: async def download_file(
self, mxc
) -> Union[DiskDownloadResponse, MemoryDownloadResponse]:
"""Download a file from the homeserver. """Download a file from the homeserver.
Args: Args: