Refine logging and add image description feature
Enhanced the speech generation logging to display the word count of the input text instead of the full text, which improves user privacy and log readability. Implemented a new feature that generates descriptions for the images in a conversation, expanding the bot's capabilities. Also refactored the `BaseTool` class to access its arguments safely through the `.get` method and to include `messages` by default, ensuring graceful handling of missing arguments.
This commit is contained in: parent 03768b5b27, commit ad600faf4b
3 changed files with 45 additions and 18 deletions
@@ -4,6 +4,7 @@ import tiktoken
 
 import asyncio
 import json
+import base64
 
 from functools import partial
 from contextlib import closing
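The new `base64` import presumably supports encoding image bytes for the vision model. A minimal sketch of that pattern; the helper name and the data-URL format are assumptions for illustration, not part of this commit:

import base64

def encode_image_as_data_url(image: bytes, mime_type: str = "image/jpeg") -> str:
    # Hypothetical helper: base64-encode raw image bytes into a data URL,
    # the format vision-capable chat models accept for image content parts.
    encoded = base64.b64encode(image).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"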
@@ -387,7 +388,7 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
         Yields:
             bytes: The audio data.
         """
-        self.logger.log(f"Generating speech from text '{text}'...")
+        self.logger.log(f"Generating speech from text of length: {len(text.split())} words...")
 
         speech = await self.openai_api.audio.speech.create(
             model=self.tts_model,
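For illustration, a standalone sketch of the new privacy-friendly logging behaviour; the sample text is a placeholder:

text = "Hello there, this is a secret message."

# Before: the full input text leaked into the logs.
# After: only the word count is recorded.
print(f"Generating speech from text of length: {len(text.split())} words...")
# -> Generating speech from text of length: 7 words...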
@@ -475,3 +476,37 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
             images.append(image)
 
         return images, len(images)
+
+    async def describe_images(self, messages: list, user: Optional[str] = None) -> Tuple[str, int]:
+        """Generate descriptions for the images in a conversation.
+
+        Args:
+            messages (list): The messages in the conversation.
+
+        Returns:
+            Tuple[str, int]: The description and the number of tokens used.
+        """
+        self.logger.log("Generating descriptions for images in conversation...")
+
+        system_message = "You are an image description generator. You generate descriptions for all images in the current conversation, one after another."
+
+        messages = [
+            {
+                "role": "system",
+                "content": system_message
+            }
+        ] + messages[1:]
+
+        if "vision" not in (chat_model := self.chat_model):
+            chat_model = "gpt-4-vision-preview"
+
+        chat_partial = partial(
+            self.openai_api.chat.completions.create,
+            model=chat_model,
+            messages=messages,
+            user=user,
+        )
+
+        response = await self._request_with_retries(chat_partial)
+
+        return response.choices[0].message.content, response.usage.total_tokens
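A hedged usage sketch of the new method; the call site, the user ID, and the message layout are assumptions, following the vision-style format in which images arrive as data-URL content parts:

async def example(api) -> None:
    # `api` is an instance of the class that gained describe_images.
    messages = [
        {"role": "system", "content": "Original system prompt (replaced inside the method)."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this picture?"},
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
            ],
        },
    ]
    description, tokens_used = await api.describe_images(messages, user="@alice:example.org")
    print(description, tokens_used)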
@@ -4,9 +4,10 @@ class BaseTool:
 
     def __init__(self, **kwargs):
         self.kwargs = kwargs
-        self.bot = kwargs["bot"]
-        self.room = kwargs["room"]
-        self.user = kwargs["user"]
+        self.bot = kwargs.get("bot")
+        self.room = kwargs.get("room")
+        self.user = kwargs.get("user")
+        self.messages = kwargs.get("messages", [])
 
     async def run(self):
         raise NotImplementedError()
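A minimal sketch of why `.get` matters here; the subclass and call are hypothetical. A tool constructed without some arguments now gets None (or [] for messages) instead of raising KeyError:

# Assumes BaseTool as defined above.
class EchoTool(BaseTool):
    async def run(self):
        return f"{len(self.messages)} messages from {self.user}"

tool = EchoTool(bot=None, user="@alice:example.org")  # no "room" or "messages" passed
assert tool.room is None    # .get returns None instead of raising KeyError
assert tool.messages == []  # messages falls back to an empty list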
@@ -1,24 +1,15 @@
 from .base import BaseTool, Handover
 
 class Imagedescription(BaseTool):
-    DESCRIPTION = "Describe the content of an image."
+    DESCRIPTION = "Describe the content of the images in the conversation."
     PARAMETERS = {
         "type": "object",
         "properties": {
-            "image": {
-                "type": "string",
-                "description": "The image to describe.",
-            },
         },
-        "required": ["image"],
     }
 
     async def run(self):
-        """Describe an image.
-
-        This tool only hands over to the original model, if applicable.
-        It is intended to handle the case where GPT-3 thinks it is asked to
-        *generate* an image, but the user actually wants to *describe* an
-        image...
-        """
-        raise Handover()
+        """Describe images in the conversation."""
+        image_api = self.bot.image_api
+
+        return (await image_api.describe_images(self.messages, self.user))[0]
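How the rewritten tool might be wired up; everything outside the constructor arguments is assumed for illustration:

async def handle_tool_call(bot, room, user, conversation_messages):
    # Hypothetical dispatch: construct the tool with the kwargs BaseTool reads via .get
    tool = Imagedescription(bot=bot, room=room, user=user, messages=conversation_messages)
    return await tool.run()  # delegates to bot.image_api.describe_images(...)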