feat: enhance tool and image handling
All checks were successful
Docker CI/CD / Docker Build and Push to Docker Hub (push) Successful in 8m22s
Python Package CI/CD / Setup and Test (push) Successful in 1m11s
Python Package CI/CD / Publish to PyPI (push) Successful in 38s

Introduced changes to tool request behavior and image processing. The configuration now allows a dedicated model for tool requests (`ToolModel`), and context images are automatically resized to configurable maximum dimensions before being sent to the AI model, improving compatibility and performance. The update moves away from a hardcoded tool model, accommodating models with varying tool support, and reduces network and processing overhead for images. These adjustments make tool usage more flexible and image handling more efficient in chat interactions.
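
For reference, the tool-model selection introduced in `openai.py` (see the diff below) reduces to roughly the following sketch. The helper name is illustrative only, and the real code path additionally checks `allow_override` and `use_tools`:

```python
from typing import Optional

# Models the bot currently treats as tool-capable (mirrors _is_tool_model below).
KNOWN_TOOL_MODELS = ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o")


def pick_tool_model(chat_model: str, tool_model: Optional[str], force_tools: bool) -> str:
    """Illustrative helper, not part of the bot's code."""
    # Keep the default chat model if it is known to support tools,
    # or if ForceTools overrides the check entirely.
    if chat_model in KNOWN_TOOL_MODELS or force_tools:
        return chat_model
    # Otherwise fall back to the dedicated ToolModel, if one is configured.
    return tool_model or chat_model
```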
This commit is contained in:
Kumi 2024-05-20 10:20:17 +02:00
parent 89f06268a5
commit 3f084ffdd3
Signed by: kumi
GPG key ID: ECBCC9082395383F
5 changed files with 108 additions and 48 deletions

@@ -1,5 +1,11 @@
# Changelog
### 0.3.13 (2024-05-20)
- **Breaking Change**: The `ForceTools` configuration option behavior has changed. Instead of using a separate model for tools, the bot will now try to use the default chat model for tool requests, even if that model is not known to support tools.
- Added `ToolModel` to OpenAI configuration to allow specifying a separate model for tool requests
- Automatically resize context images to a default maximum of 2000x768 pixels before sending them to the AI model
### 0.3.12 (2024-05-17)
- Added `ForceVision` to OpenAI configuration to allow third-party models to be used for image recognition

@@ -107,11 +107,20 @@ APIKey = sk-yoursecretkey
# Whether to force the use of tools in the chat completion model
#
# Currently, only gpt-3.5-turbo supports tools. If you set this to 1, the bot
# will use that model for tools even if you have a different model set as the
# default. It will only generate the final result using the default model.
# This will make the bot allow the use of tools in the chat completion model,
# even if the model you are using isn't known to support tools. This is useful
# if you are using a self-hosted model that supports tools, but the bot doesn't
# know about it.
#
# ForceTools = 0
# ForceTools = 1
# Whether a dedicated model should be used for tools
#
# This will make the bot use a dedicated model for tools. This is useful if you
# want to use a model that doesn't support tools, but still want to be able to
# use tools.
#
# ToolModel = gpt-4o
# Whether to emulate tools in the chat completion model
#
@@ -130,6 +139,16 @@ APIKey = sk-yoursecretkey
#
# ForceVision = 0
# Maximum width and height of images sent to the API if vision is enabled
#
# The OpenAI API has a limit of 2000 pixels for the long side of an image, and
# 768 pixels for the short side. You may have to adjust these values if you're
# using a self-hosted model that has different limits. You can also set these
# to 0 to disable image resizing.
#
# MaxImageLongSide = 2000
# MaxImageShortSide = 768
# Advanced settings for the OpenAI API
#
# These settings are not required for normal operation, but can be used to

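As a rough illustration of how these limits are applied (a minimal sketch of the resizing added in `bot.py` further down; the function name is made up for this example, and Pillow is assumed to be installed):

```python
from PIL import Image  # Pillow


def fit_to_limits(img: Image.Image, max_long: int = 2000, max_short: int = 768) -> Image.Image:
    """Illustrative only: shrink an image to fit the configured long/short side limits."""
    if not (max_long and max_short):
        # A limit of 0 disables resizing, as described above.
        return img
    # thumbnail() preserves the aspect ratio and only ever shrinks the image.
    if img.width > img.height:
        img.thumbnail((max_long, max_short))  # landscape: width is the long side
    else:
        img.thumbnail((max_short, max_long))  # portrait or square: height is the long side
    return img
```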
@@ -7,7 +7,7 @@ allow-direct-references = true
[project]
name = "matrix-gptbot"
version = "0.3.12"
version = "0.3.13"
authors = [
{ name = "Kumi Mitterer", email = "gptbot@kumi.email" },

@@ -97,6 +97,14 @@ class OpenAI(BaseAI):
def force_tools(self):
return self._config.getboolean("ForceTools", fallback=False)
@property
def tool_model(self):
return self._config.get("ToolModel")
@property
def vision_model(self):
return self._config.get("VisionModel")
@property
def emulate_tools(self):
return self._config.getboolean("EmulateTools", fallback=False)
@@ -106,12 +114,26 @@ class OpenAI(BaseAI):
# TODO: This should be model-specific
return self._config.getint("MaxTokens", fallback=4000)
@property
def max_messages(self):
return self._config.getint("MaxMessages", fallback=30)
@property
def max_image_long_side(self):
return self._config.getint("MaxImageLongSide", fallback=2000)
@property
def max_image_short_side(self):
return self._config.getint("MaxImageShortSide", fallback=768)
def _is_tool_model(self, model: str) -> bool:
return model in ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o")
def _is_vision_model(self, model: str) -> bool:
return model in ("gpt-4-turbo", "gpt-4o") or "vision" in model
def supports_chat_images(self):
return (
"vision" in self.chat_model
or self.chat_model in ("gpt-4o",)
or self.force_vision
)
return self._is_vision_model(self.chat_model) or self.force_vision
def json_decode(self, data):
if data.startswith("```json\n"):
@@ -194,11 +216,15 @@ class OpenAI(BaseAI):
original_messages = messages
# TODO: I believe more models support tools now, so this could be adapted
if allow_override and "gpt-3.5-turbo" not in original_model:
if self.force_tools:
if (
allow_override
and use_tools
and self.tool_model
and not (self._is_tool_model(chat_model) or self.force_tools)
):
if self.tool_model:
self.logger.log("Overriding chat model to use tools")
chat_model = "gpt-3.5-turbo"
chat_model = self.tool_model
out_messages = []
@@ -225,8 +251,8 @@
use_tools
and self.emulate_tools
and not self.force_tools
and "gpt-3.5-turbo" not in chat_model
): # TODO: This should be adapted to use tools with more models
and not self._is_tool_model(chat_model)
):
self.bot.logger.log("Using tool emulation mode.", "debug")
messages = (
@@ -272,9 +298,10 @@
"presence_penalty": self.presence_penalty,
}
if "gpt-3.5-turbo" in chat_model and use_tools:
if (self._is_tool_model(chat_model) and use_tools) or self.force_tools:
kwargs["tools"] = tools
# TODO: Look into this
if "gpt-4" in chat_model:
kwargs["max_tokens"] = self.max_tokens
@@ -685,10 +712,10 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
messages = [{"role": "system", "content": system_message}] + messages[1:]
if "vision" not in (chat_model := self.chat_model) and chat_model not in (
"gpt-4o",
):
chat_model = "gpt-4o"
chat_model = self.chat_model
if not self._is_vision_model(chat_model):
chat_model = self.vision_model or "gpt-4o"
chat_partial = partial(
self.openai_api.chat.completions.create,

@@ -29,11 +29,13 @@ from nio import (
RoomMessageAudio,
DownloadError,
RoomGetStateError,
DiskDownloadResponse,
MemoryDownloadResponse,
)
from nio.store import SqliteStore
from typing import Optional, List, Any
from typing import Optional, List, Any, Union
from configparser import ConfigParser
from datetime import datetime
from io import BytesIO
@@ -126,26 +128,6 @@ class GPTBot:
"""
return self.config["GPTBot"].getboolean("ForceSystemMessage", False)
@property
def max_tokens(self) -> int:
"""Maximum number of input tokens.
Returns:
int: The maximum number of input tokens. Defaults to 3000.
"""
return self.config["OpenAI"].getint("MaxTokens", 3000)
# TODO: Move this to OpenAI class
@property
def max_messages(self) -> int:
"""Maximum number of messages to consider as input.
Returns:
int: The maximum number of messages to consider as input. Defaults to 30.
"""
return self.config["OpenAI"].getint("MaxMessages", 30)
# TODO: Move this to OpenAI class
@property
def operator(self) -> Optional[str]:
"""Operator of the bot.
@@ -309,7 +291,7 @@ class GPTBot:
ignore_notices: bool = True,
):
messages = []
n = n or self.max_messages
n = n or self.chat_api.max_messages
room_id = room.room_id if isinstance(room, MatrixRoom) else room
self.logger.log(
@@ -378,7 +360,7 @@
model: Optional[str] = None,
system_message: Optional[str] = None,
):
max_tokens = max_tokens or self.max_tokens
max_tokens = max_tokens or self.chat_api.max_tokens
model = model or self.chat_api.chat_model
system_message = (
self.default_system_message if system_message is None else system_message
@@ -1168,7 +1150,9 @@ class GPTBot:
return
try:
last_messages = await self._last_n_messages(room.room_id, self.max_messages)
last_messages = await self._last_n_messages(
room.room_id, self.chat_api.max_messages
)
except Exception as e:
self.logger.log(f"Error getting last messages: {e}", "error")
await self.send_message(
@@ -1271,7 +1255,27 @@
download = await self.download_file(image_url)
if download:
encoded_url = f"data:{download.content_type};base64,{base64.b64encode(download.body).decode('utf-8')}"
pil_image = Image.open(BytesIO(download.body))
file_format = pil_image.format or "PNG"
max_long_side = self.chat_api.max_image_long_side
max_short_side = self.chat_api.max_image_short_side
if max_long_side and max_short_side:
if pil_image.width > pil_image.height:
if pil_image.width > max_long_side:
pil_image.thumbnail((max_long_side, max_short_side))
else:
if pil_image.height > max_long_side:
pil_image.thumbnail((max_short_side, max_long_side))
bio = BytesIO()
pil_image.save(bio, format=file_format)
encoded_url = f"data:{download.content_type};base64,{base64.b64encode(bio.getvalue()).decode('utf-8')}"
parent = (
chat_messages[-1]
if chat_messages
@@ -1312,7 +1316,9 @@
# Truncate messages to fit within the token limit
self._truncate(
chat_messages[1:], self.max_tokens - 1, system_message=system_message
chat_messages[1:],
self.chat_api.max_tokens - 1,
system_message=system_message,
)
# Check for a model override
@@ -1362,7 +1368,9 @@
await self.matrix_client.room_typing(room.room_id, False)
async def download_file(self, mxc) -> Optional[bytes]:
async def download_file(
self, mxc
) -> Union[DiskDownloadResponse, MemoryDownloadResponse]:
"""Download a file from the homeserver.
Args: