Refine logging and add image description feature

Changed the speech generation log message to show the word count of the input text instead of the full text, which keeps user content out of the logs and makes them easier to read. Implemented a new feature that generates descriptions for the images in a conversation, expanding the bot's capabilities. Also refactored the `BaseTool` class to read its arguments with `kwargs.get(...)` and to accept `messages` (defaulting to an empty list), so that missing arguments become `None` instead of raising a `KeyError`.
This commit is contained in:
Kumi 2023-11-29 14:53:19 +01:00
parent 03768b5b27
commit ad600faf4b
Signed by: kumi
GPG key ID: ECBCC9082395383F
3 changed files with 45 additions and 18 deletions

View file

@ -4,6 +4,7 @@ import tiktoken
import asyncio import asyncio
import json import json
import base64
from functools import partial from functools import partial
from contextlib import closing from contextlib import closing
@ -387,7 +388,7 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
Yields: Yields:
bytes: The audio data. bytes: The audio data.
""" """
self.logger.log(f"Generating speech from text '{text}'...") self.logger.log(f"Generating speech from text of length: {len(text.split())} words...")
speech = await self.openai_api.audio.speech.create( speech = await self.openai_api.audio.speech.create(
model=self.tts_model, model=self.tts_model,
@ -475,3 +476,37 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
images.append(image) images.append(image)
return images, len(images) return images, len(images)
async def describe_images(self, messages: list, user: Optional[str] = None) -> Tuple[str, int]:
    """Generate a description of the images in a conversation.

    Args:
        messages (list): The conversation messages. The first entry is
            dropped and replaced by an image-description system prompt
            (presumably it is the original system message - confirm
            against the caller).
        user (Optional[str]): Forwarded to the API as the ``user`` argument.

    Returns:
        Tuple[str, int]: The description and the number of tokens used.
    """
    self.logger.log("Generating description for images in conversation...")

    system_message = "You are an image description generator. You generate descriptions for all images in the current conversation, one after another."

    # Build a new list so the caller's message list is left untouched.
    messages = [
        {
            "role": "system",
            "content": system_message
        }
    ] + messages[1:]

    # Use the configured model only if it is a vision model; otherwise fall
    # back to the vision model. The fallback must replace the model name
    # (not be appended to it) and must actually be passed to the request.
    chat_model = self.chat_model
    if "vision" not in chat_model:
        chat_model = "gpt-4-vision-preview"

    chat_partial = partial(
        self.openai_api.chat.completions.create,
        model=chat_model,
        messages=messages,
        user=user,
    )
    response = await self._request_with_retries(chat_partial)

    return response.choices[0].message.content, response.usage.total_tokens

View file

@ -4,9 +4,10 @@ class BaseTool:
def __init__(self, **kwargs): def __init__(self, **kwargs):
self.kwargs = kwargs self.kwargs = kwargs
self.bot = kwargs["bot"] self.bot = kwargs.get("bot")
self.room = kwargs["room"] self.room = kwargs.get("room")
self.user = kwargs["user"] self.user = kwargs.get("user")
self.messages = kwargs.get("messages", [])
async def run(self): async def run(self):
raise NotImplementedError() raise NotImplementedError()

View file

@ -1,24 +1,15 @@
from .base import BaseTool, Handover from .base import BaseTool, Handover
class Imagedescription(BaseTool): class Imagedescription(BaseTool):
DESCRIPTION = "Describe the content of an image." DESCRIPTION = "Describe the content of the images in the conversation."
PARAMETERS = { PARAMETERS = {
"type": "object", "type": "object",
"properties": { "properties": {
"image": {
"type": "string",
"description": "The image to describe.",
},
}, },
"required": ["image"],
} }
async def run(self): async def run(self):
"""Describe an image. """Describe images in the conversation."""
image_api = self.bot.image_api
This tool only hands over to the original model, if applicable. return (await image_api.describe_images(self.messages, self.user))[0]
It is intended to handle the case where GPT-3 thinks it is asked to
*generate* an image, but the user actually wants to *describe* an
image...
"""
raise Handover()