import { connect } from '@/scripts/functions/connect'
import { Article, ArticleHeading, ArticleHeadings } from '@/types/content-types'
import { config } from 'dotenv'
import OpenAI from 'openai'
import path from 'path'
import { LocalIndex } from 'vectra'
import { nicelog } from './nicelog'

config()

const MAX_ARTICLES = Infinity
const INCLUDE_API_CONTENT = true
const INCLUDE_CONTENT = true

const index = new LocalIndex(path.join(process.cwd(), 'utils', 'vector-db'))

const openai = new OpenAI({
	apiKey: process.env.OPENAI_KEY,
})

export class ContentVectorDatabase {
	index: LocalIndex
	api: OpenAI

	constructor(opts = {} as { index: LocalIndex; api: OpenAI }) {
		this.index = opts.index
		this.api = opts.api
	}

	/**
	 * Get vector embeddings for one or more pieces of text from OpenAI.
	 *
	 * @param inputs The texts to get embeddings for.
	 *
	 * @returns The embedding vectors, one per input.
	 */
	async getVectorEmbeddings(inputs: string[]) {
		const response = await this.api.embeddings.create({
			model: 'text-embedding-ada-002',
			input: inputs,
		})
		return response.data.map((d) => d.embedding)
	}

	/**
	 * Add an article heading to the index.
	 *
	 * @param article The article that the heading belongs to.
	 * @param heading The heading to add to the index.
	 */
	async addHeadingToIndex(article: Article, heading: ArticleHeading) {
		const id = `${article.id}#${heading.slug}`

		// Skip headings that are already present
		const hash = this.getHashForString(heading.title + heading.slug)
		const existingItem = await this.index.getItem(id)
		if (existingItem) {
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping heading ${id} (already present)`)
				return
			}
			// The heading changed, so delete the stale item before re-adding it
			await this.index.deleteItem(id)
		}

		nicelog(`Adding heading for ${article.title}#${heading.title}`)

		const vectors = await this.getVectorEmbeddings([article.title + '#' + heading.title])

		await this.index.insertItem({
			id,
			vector: vectors[0],
			metadata: { type: 'heading', articleId: article.id, slug: heading.slug, hash },
		})
	}

	/**
	 * Add an article (and its headings) to the index.
	 *
	 * @param article The article to add to the index.
	 * @param headings The article's headings.
	 */
	async addArticleToIndex(article: Article, headings: ArticleHeadings) {
		// This is the content that we'll create the embedding for
		let contentToVectorize: string

		if (article.sectionId === 'gen') {
			// For API docs, we'll just use the title, description, and members as the content.
			// We'll also add a note that the content was generated from the API docs, hopefully
			// so that the embedding better reflects searches for API docs.
			contentToVectorize = `Title: ${article.title}\nPackage: @tldraw/${article.categoryId}\nDescription: ${article.description}\nMembers:${article.keywords}\n\n(content generated from API docs)`
		} else {
			// The content is the raw markdown content, which includes all the markdown
			// headings and annotations, though none of the frontmatter. We'll add the
			// frontmatter information again manually. We may need to also add some information
			// about how "important" this article is, relative to related docs or headings.
			contentToVectorize = `Title: ${article.title}\nDescription: ${article.description}\nKeywords:${article.keywords}\nMarkdown:\n${article.content}`
		}

		if (headings.length) {
			for (const heading of headings) {
				await this.addHeadingToIndex(article, heading)
			}
		}

		// Generate a hash based on the content that we'd be vectorizing
		const hash = this.getHashForString(contentToVectorize)

		// Split the content into 500-character chunks so that each embedding
		// request stays well within the model's input limit
		const chunksToAdd: string[] = []
		const chunkSize = 500
		for (let i = 0; i < contentToVectorize.length; i += chunkSize) {
			const chunk = contentToVectorize.slice(i, i + chunkSize)
			chunksToAdd.push(chunk)
		}

		// Is there already an item with this id?
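		// Chunks are stored under ids of the form `<articleId>_<chunkIndex>`, and
		// every chunk of an article carries the same content hash, so checking the
		// first chunk ('_0') is enough to tell whether the article has been indexed
		// before and whether its content has changed since.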
		const existingItem = await this.index.getItem(article.id + '_0')
		if (existingItem) {
			// ...and if the existing item matches our hash, we can skip it
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping ${article.id} (already present)`)
				return
			}
			// ...otherwise, delete all of the existing chunks so that we can add new
			// ones. The old chunk count may differ from the new one, so keep deleting
			// until we run out of items.
			let i = 0
			while (await this.index.getItem(article.id + '_' + i)) {
				await this.index.deleteItem(article.id + '_' + i)
				i++
			}
		}

		// Add chunks to index
		nicelog(`Adding article ${article.title} (${chunksToAdd.length} chunks)`)

		// Get an embedding / vector for all of the chunks
		const vectors = await this.getVectorEmbeddings(chunksToAdd)

		for (let i = 0; i < vectors.length; i++) {
			const vector = vectors[i]
			// Add the article item to the index (include the hash as metadata)
			await this.index.insertItem({
				id: article.id + '_' + i,
				vector: vector,
				metadata: { type: 'article', articleId: article.id, hash },
			})
		}

		// Sleep briefly (35ms) to avoid rate limiting
		await new Promise((r) => setTimeout(r, 35))

		return
	}

	/**
	 * Query the index for content related to the given text.
	 *
	 * @param text The text to query.
	 * @param limit The maximum number of results to return.
	 *
	 * @returns The query results.
	 */
	async query(text: string, limit = 5) {
		const vector = await this.getVectorEmbeddings([text])
		const results = await this.index.queryItems(vector[0], limit)

		const output: (
			| { id: string; type: 'article'; score: number }
			| { id: string; type: 'heading'; slug: string; score: number }
		)[] = []

		const visited = new Set<string>()

		for (const result of results) {
			const id = result.item.metadata.articleId as string
			const type = result.item.metadata.type as 'article' | 'heading'
			if (type === 'heading') {
				const slug = result.item.metadata.slug as string
				output.push({ id, type, slug, score: result.score })
			} else {
				// multiple chunks of the same article may have been returned;
				// keep only the first
				if (visited.has(id)) continue
				output.push({ id, type, score: result.score })
				visited.add(id)
			}
		}

		return output
	}

	/**
	 * Hash a string using a simple 31-based rolling hash (the same scheme as
	 * Java's String.hashCode), truncated to a 32-bit integer.
	 *
	 * @public
	 */
	getHashForString(string: string) {
		let hash = 0
		for (let i = 0; i < string.length; i++) {
			hash = (hash << 5) - hash + string.charCodeAt(i)
			hash |= 0 // Convert to 32bit integer
		}
		return hash + ''
	}
}

let _cvdb: ContentVectorDatabase

export async function getVectorDb(
	opts = {} as {
		updateContent?: boolean
		rebuildIndex?: boolean
	}
) {
	if (_cvdb) {
		return _cvdb
	}

	if (opts.rebuildIndex || !(await index.isIndexCreated())) {
		await index.createIndex({ deleteIfExists: opts.rebuildIndex, version: 1 })
	}

	_cvdb = new ContentVectorDatabase({ api: openai, index })

	if (opts.updateContent || opts.rebuildIndex) {
		nicelog(`Rebuilding index`)

		const db = await connect({ reset: false })

		nicelog(`Getting articles`)

		const articles =
			INCLUDE_API_CONTENT && INCLUDE_CONTENT
				? await db.all('SELECT * FROM articles')
				: INCLUDE_API_CONTENT
					? await db.all('SELECT * FROM articles WHERE articles.sectionId = ?', 'gen')
					: await db.all('SELECT * FROM articles WHERE articles.sectionId != ?', 'gen')

		nicelog(`Adding articles to index`)

		const max = Math.min(articles.length, MAX_ARTICLES)

		for (let i = 0; i < max; i++) {
			const article = articles[i]
			const headings = await db.all(
				'SELECT * FROM headings WHERE articleId = ? AND slug NOT IN (?, ?, ?, ?)',
				article.id,
				'constructor',
				'properties',
				'example',
				'methods'
			)
			nicelog(`Adding article ${article.id} to index (${i + 1} of ${max})`)
			await _cvdb.addArticleToIndex(article, headings)
		}
	}

	return _cvdb
}
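// Example usage (a minimal sketch, not part of the original module; it assumes
// OPENAI_KEY is set in the environment and that the content database used by
// `connect` has already been built):
//
//   const cvdb = await getVectorDb({ updateContent: true })
//   const results = await cvdb.query('how do I rotate a shape?')
//   for (const result of results) {
//   	if (result.type === 'heading') {
//   		nicelog(`${result.id}#${result.slug} (score: ${result.score})`)
//   	} else {
//   		nicelog(`${result.id} (score: ${result.score})`)
//   	}
//   }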