242 lines
7.2 KiB
TypeScript
242 lines
7.2 KiB
TypeScript
|
import { connect } from '@/scripts/functions/connect'
|
||
|
import { Article, ArticleHeading, ArticleHeadings } from '@/types/content-types'
|
||
|
import { config } from 'dotenv'
|
||
|
import OpenAI from 'openai'
|
||
|
import path from 'path'
|
||
|
import { LocalIndex } from 'vectra'
|
||
|
import { nicelog } from './nicelog'
|
||
|
|
||
|
// Run dotenv's config() before anything below reads from process.env.
config()

// Upper bound on how many articles getVectorDb will index; Infinity means "no cap".
const MAX_ARTICLES = Number.POSITIVE_INFINITY

// Whether articles in the 'gen' section are selected for indexing.
const INCLUDE_API_CONTENT = true

// Whether articles outside the 'gen' section are selected for indexing.
const INCLUDE_CONTENT = true

// The on-disk vector index lives under <cwd>/utils/vector-db.
const vectorDbFolder = path.join(process.cwd(), 'utils', 'vector-db')
const index = new LocalIndex(vectorDbFolder)

// OpenAI client used to create embeddings; the key is read from OPENAI_KEY.
const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY })
|
||
|
|
||
|
/**
 * Stores embeddings for articles and their headings in a vector index and
 * queries them by text similarity.
 *
 * Item ids follow two schemes:
 * - headings: `${article.id}#${heading.slug}`
 * - article chunks: `${article.id}_${chunkNumber}`, numbered from 0
 *
 * Every item carries a `hash` in its metadata so unchanged content can be
 * skipped on subsequent runs.
 */
export class ContentVectorDatabase {
	index: LocalIndex
	api: OpenAI

	constructor(opts = {} as { index: LocalIndex; api: OpenAI }) {
		this.index = opts.index
		this.api = opts.api
	}

	/**
	 * Get embedding vectors for one or more pieces of text.
	 *
	 * @param inputs The texts to embed.
	 *
	 * @returns One vector per entry of the response's `data` array. An empty
	 * `inputs` array returns an empty array without calling the API.
	 */
	async getVectorEmbeddings(inputs: string[]): Promise<number[][]> {
		// Nothing to embed; don't send a request with an empty input list.
		if (inputs.length === 0) return []

		const response = await this.api.embeddings.create({
			model: 'text-embedding-ada-002',
			input: inputs,
		})
		return response.data.map((d) => d.embedding)
	}

	/**
	 * Add a single heading of an article to the index.
	 *
	 * The heading is skipped when an item with the same id and the same hash
	 * (computed from the heading's title and slug) is already present; an item
	 * with a different hash is deleted and re-created.
	 *
	 * @param article The article that owns the heading.
	 * @param heading The heading to add.
	 */
	async addHeadingToIndex(article: Article, heading: ArticleHeading) {
		const id = `${article.id}#${heading.slug}`

		// Skip headings that are already present
		const hash = this.getHashForString(heading.title + heading.slug)
		const existingItem = await this.index.getItem(id)
		if (existingItem) {
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping heading ${id} (already present)`)
				return
			}
			await this.index.deleteItem(id)
		}

		nicelog(`Adding headers for ${article.title}#${heading.title}`)
		const vectors = await this.getVectorEmbeddings([article.title + '#' + heading.title])

		// Await the insert so this method only resolves once the item is stored
		// and so a failed insert rejects here instead of going unhandled.
		await this.index.insertItem({
			id,
			vector: vectors[0],
			metadata: { type: 'heading', articleId: article.id, slug: heading.slug, hash },
		})
	}

	/**
	 * Add an article (and all of the given headings) to the index.
	 *
	 * The article's text is split into fixed-size chunks and each chunk is
	 * stored as its own item. If chunk 0 already exists with a matching hash
	 * the article is skipped; otherwise every existing chunk of the article is
	 * removed before the new chunks are inserted.
	 *
	 * @param article The article to add.
	 * @param headings The article's headings; each is added via addHeadingToIndex.
	 */
	async addArticleToIndex(article: Article, headings: ArticleHeadings) {
		// This is the content that we'll create the embedding for
		let contentToVectorize: string

		if (article.sectionId === 'gen') {
			// For API docs, we'll just use the title, description, and members as the content.
			// We'll also add a note that the content was generated from the API docs, hopefully
			// so that the embedding better reflects searches for api docs.
			contentToVectorize = `Title: ${article.title}\nPackage: @tldraw/${article.categoryId}\nDescription: ${article.description}\nMembers:${article.keywords}\n\n(content generated from API docs)`
		} else {
			// The content is the raw markdown content, which includes all the markdown
			// headings and annotations, though none of the frontmatter. We'll add the
			// frontmatter information again manually. We may need to also add some information
			// about how "important" this article is, relative to related docs or headings.
			contentToVectorize = `Title: ${article.title}\nDescription: ${article.description}\nKeywords:${article.keywords}\nMarkdown:\n${article.content}`
		}

		// Headings are indexed regardless of whether the article body changed.
		for (const heading of headings) {
			await this.addHeadingToIndex(article, heading)
		}

		// Generate a hash based on the content that we'd be vectorizing
		const hash = this.getHashForString(contentToVectorize)

		// If the first chunk is present and matches our hash, nothing changed.
		const existingItem = await this.index.getItem(article.id + '_0')
		if (existingItem && existingItem.metadata.hash === hash) {
			nicelog(`Skipping ${article.id} (already present)`)
			return
		}

		// Split the content into chunks of 500 characters (characters, not tokens).
		const chunkSize = 500
		const chunksToAdd: string[] = []
		for (let i = 0; i < contentToVectorize.length; i += chunkSize) {
			chunksToAdd.push(contentToVectorize.slice(i, i + chunkSize))
		}

		// Remove every chunk stored for this article, including trailing chunks
		// left over from a previous, longer version of the content.
		await this.deleteArticleChunks(article.id, chunksToAdd.length)

		// Add chunks to index
		nicelog(`Adding article ${article.title} (${chunksToAdd.length} chunks)`)

		// Get an embedding / vector for all of the chunks
		const vectors = await this.getVectorEmbeddings(chunksToAdd)

		for (let i = 0; i < vectors.length; i++) {
			// Add the chunk to the index (include the hash as metadata)
			await this.index.insertItem({
				id: article.id + '_' + i,
				vector: vectors[i],
				metadata: { type: 'article', articleId: article.id, hash },
			})
		}

		// Pause for 35ms to avoid rate limiting
		await new Promise((r) => setTimeout(r, 35))

		return
	}

	/**
	 * Delete the stored chunks of an article.
	 *
	 * Chunk ids are probed in order (`_0`, `_1`, ...). Every id that exists is
	 * deleted; probing stops at the first missing id at or beyond `minChunksToCheck`,
	 * so all ids about to be re-inserted are guaranteed to be free.
	 *
	 * @param articleId The id of the article whose chunks should be removed.
	 * @param minChunksToCheck The number of leading chunk ids that must be checked.
	 */
	private async deleteArticleChunks(articleId: string, minChunksToCheck: number) {
		for (let i = 0; ; i++) {
			const chunkId = articleId + '_' + i
			if (await this.index.getItem(chunkId)) {
				await this.index.deleteItem(chunkId)
			} else if (i >= minChunksToCheck) {
				break
			}
		}
	}

	/**
	 * Query the index for the items most similar to a piece of text.
	 *
	 * @param text The text to query.
	 * @param limit The maximum number of index items to request (default 5).
	 *
	 * @returns Matching headings and articles with their scores. An article is
	 * reported once, for the first of its chunks in the result list, so the
	 * output may contain fewer than `limit` entries.
	 */
	async query(text: string, limit = 5) {
		const vector = await this.getVectorEmbeddings([text])
		const results = await this.index.queryItems(vector[0], limit)
		const output: (
			| { id: string; type: 'article'; score: number }
			| { id: string; type: 'heading'; slug: string; score: number }
		)[] = []
		const visited = new Set<string>()
		for (const result of results) {
			const id = result.item.metadata.articleId as string
			const type = result.item.metadata.type as 'article' | 'heading'
			if (type === 'heading') {
				const slug = result.item.metadata.slug as string
				output.push({ id, type, slug, score: result.score })
			} else {
				// multiple chunks may have been returned
				if (visited.has(id)) continue
				output.push({ id, type, score: result.score })
				visited.add(id)
			}
		}
		return output
	}

	/**
	 * Hash a string with a 32-bit rolling hash: for each UTF-16 code unit,
	 * `hash = hash * 31 + charCode`, truncated to a signed 32-bit integer.
	 *
	 * @param string The string to hash.
	 *
	 * @returns The hash as a decimal string (may be negative; '0' for an empty string).
	 *
	 * @public
	 */
	getHashForString(string: string) {
		let hash = 0
		for (let i = 0; i < string.length; i++) {
			// (hash << 5) - hash is hash * 31
			hash = (hash << 5) - hash + string.charCodeAt(i)
			hash |= 0 // Convert to 32bit integer
		}
		return hash + ''
	}
}
|
||
|
|
||
|
// Cached instance; set the first time getVectorDb runs and reused afterwards.
let _cvdb: ContentVectorDatabase

/**
 * Get the shared ContentVectorDatabase, creating it on the first call.
 *
 * Once an instance has been cached, it is returned immediately and `opts`
 * is ignored.
 *
 * @param opts.updateContent When true, (re)index the articles from the database.
 * @param opts.rebuildIndex When true, re-create the index (deleting any existing
 * one) and then index the articles.
 *
 * @returns The ContentVectorDatabase instance.
 */
export async function getVectorDb(
	opts = {} as {
		updateContent?: boolean
		rebuildIndex?: boolean
	}
) {
	if (_cvdb) return _cvdb

	const { updateContent, rebuildIndex } = opts

	// Create the index when asked to rebuild, or when none exists yet.
	if (rebuildIndex || !(await index.isIndexCreated())) {
		await index.createIndex({ deleteIfExists: rebuildIndex, version: 1 })
	}

	_cvdb = new ContentVectorDatabase({ api: openai, index })

	// Nothing more to do unless the caller asked for content to be indexed.
	if (!(updateContent || rebuildIndex)) {
		return _cvdb
	}

	nicelog(`Rebuilding index`)
	const db = await connect({ reset: false })

	nicelog(`Getting articles`)
	// Choose which articles to index based on the INCLUDE_* flags.
	let articles
	if (INCLUDE_API_CONTENT && INCLUDE_CONTENT) {
		articles = await db.all('SELECT * FROM articles')
	} else if (INCLUDE_API_CONTENT) {
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId = ?', 'gen')
	} else {
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId != ?', 'gen')
	}

	nicelog(`Adding articles to index`)
	const max = Math.min(articles.length, MAX_ARTICLES)

	// Headings with these slugs are filtered out by the query below.
	const headingsQuery = 'SELECT * FROM headings WHERE articleId = ? AND slug NOT IN (?, ?, ?, ?)'

	let position = 0
	for (const article of articles.slice(0, max)) {
		const headings = await db.all(
			headingsQuery,
			article.id,
			'constructor',
			'properties',
			'example',
			'methods'
		)
		nicelog(`Adding article ${article.id} to index (${position} of ${max})`)
		await _cvdb.addArticleToIndex(article, headings)
		position++
	}

	return _cvdb
}
|