e6e4e7f6cb
Biome seems to be MUCH faster than Prettier. Unfortunately, it introduces some formatting changes around the ternary operator, so we have to update files in the repo. To make revert easier if we need it, the change is split into two PRs. This PR introduces a Biome CI check and reformats all files accordingly. ## Change Type - [x] `minor` — New feature
241 lines
7.2 KiB
TypeScript
241 lines
7.2 KiB
TypeScript
import { connect } from '@/scripts/functions/connect'
|
|
import { Article, ArticleHeading, ArticleHeadings } from '@/types/content-types'
|
|
import { config } from 'dotenv'
|
|
import OpenAI from 'openai'
|
|
import path from 'path'
|
|
import { LocalIndex } from 'vectra'
|
|
import { nicelog } from './nicelog'
|
|
|
|
// Run dotenv first so that environment variables are in place before the
// OpenAI client below reads OPENAI_KEY from process.env.
config()

// Upper bound on how many articles getVectorDb will index (no limit by default).
const MAX_ARTICLES = Number.POSITIVE_INFINITY
// Whether articles in the 'reference' section are indexed.
const INCLUDE_API_CONTENT = true
// Whether articles outside the 'reference' section are indexed.
const INCLUDE_CONTENT = true

// The vector index lives in <cwd>/utils/vector-db.
const vectorDbFolder = path.join(process.cwd(), 'utils', 'vector-db')
const index = new LocalIndex(vectorDbFolder)

// Client used to request embeddings.
const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY })
|
|
|
|
/**
 * Wraps a vector index and an OpenAI client to store and query embeddings for
 * articles (split into fixed-size chunks) and for their headings.
 *
 * Index item ids:
 * - article chunks: `${article.id}_${chunkNumber}`
 * - headings: `${article.id}#${heading.slug}`
 */
export class ContentVectorDatabase {
	index: LocalIndex
	api: OpenAI

	/**
	 * @param opts - The vector index to read from / write to, and the OpenAI client
	 *   used to create embeddings.
	 */
	constructor(opts = {} as { index: LocalIndex; api: OpenAI }) {
		this.index = opts.index
		this.api = opts.api
	}

	/**
	 * Get embedding vectors for several pieces of text with a single request.
	 *
	 * @param inputs - The texts to embed.
	 *
	 * @returns One embedding per entry of the response's `data` array.
	 */
	async getVectorEmbeddings(inputs: string[]) {
		const response = await this.api.embeddings.create({
			model: 'text-embedding-ada-002',
			input: inputs,
		})
		return response.data.map((d) => d.embedding)
	}

	/**
	 * Add a single heading of an article to the index, unless an item with the same
	 * id and the same hash is already stored. An item with a different hash is
	 * deleted and replaced.
	 *
	 * @param article - The article that owns the heading.
	 * @param heading - The heading to add.
	 */
	async addHeadingToIndex(article: Article, heading: ArticleHeading) {
		const id = `${article.id}#${heading.slug}`

		// Skip headings that are already present
		const hash = this.getHashForString(heading.title + heading.slug)
		const existingItem = await this.index.getItem(id)
		if (existingItem) {
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping heading ${id} (already present)`)
				return
			}
			await this.index.deleteItem(id)
		}

		nicelog(`Adding headers for ${article.title}#${heading.title}`)
		const vectors = await this.getVectorEmbeddings([article.title + '#' + heading.title])

		// Await the insert so that the write has finished (and any failure has
		// surfaced) before the caller moves on to the next index operation.
		await this.index.insertItem({
			id,
			vector: vectors[0],
			metadata: { type: 'heading', articleId: article.id, slug: heading.slug, hash },
		})
	}

	/**
	 * Add an article and its headings to the index.
	 *
	 * The headings are added first. The article text is then split into chunks and
	 * every chunk is stored as its own item. If the first chunk already in the
	 * index carries the hash of the current text, the article is skipped;
	 * otherwise all previously stored chunks are deleted and replaced.
	 *
	 * @param article - The article to add.
	 * @param headings - The headings of the article to add alongside it.
	 */
	async addArticleToIndex(article: Article, headings: ArticleHeadings) {
		// This is the content that we'll create the embedding for
		let contentToVectorize: string

		if (article.sectionId === 'reference') {
			// For API docs, we'll just use the title, description, and members as the content.
			// We'll also add a note that the content was generated from the API docs, hopefully
			// so that the embedding better reflects searches for api docs.
			contentToVectorize = `Title: ${article.title}\nPackage: @tldraw/${article.categoryId}\nDescription: ${article.description}\nMembers:${article.keywords}\n\n(content generated from API docs)`
		} else {
			// The content is the raw markdown content, which includes all the markdown
			// headings and annotations, though none of the frontmatter. We'll add the
			// frontmatter information again manually. We may need to also add some information
			// about how "important" this article is, relative to related docs or headings.
			contentToVectorize = `Title: ${article.title}\nDescription: ${article.description}\nKeywords:${article.keywords}\nMarkdown:\n${article.content}`
		}

		for (const heading of headings) {
			await this.addHeadingToIndex(article, heading)
		}

		// Generate a hash based on the content that we'd be vectorizing
		const hash = this.getHashForString(contentToVectorize)

		// Split the content into chunks of at most 500 characters each; every chunk
		// gets its own embedding and its own index item.
		const chunksToAdd: string[] = []
		const chunkSize = 500
		for (let i = 0; i < contentToVectorize.length; i += chunkSize) {
			chunksToAdd.push(contentToVectorize.slice(i, i + chunkSize))
		}

		// Is there already an item with this id?
		const existingItem = await this.index.getItem(article.id + '_0')

		if (existingItem) {
			// ...and if the existing item matches our hash, we can skip it
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping ${article.id} (already present)`)
				return
			}

			// ...otherwise, delete all of the old chunks. The previous version of the
			// article may have had MORE chunks than the new one, so after clearing the
			// ids we are about to reuse, keep deleting until no stored chunk is left.
			for (let i = 0; ; i++) {
				const chunkId = article.id + '_' + i
				if (i >= chunksToAdd.length && !(await this.index.getItem(chunkId))) break
				await this.index.deleteItem(chunkId)
			}
		}

		// Add chunks to index
		nicelog(`Adding article ${article.title} (${chunksToAdd.length} chunks)`)

		// Get an embedding / vector for all of the chunks
		const vectors = await this.getVectorEmbeddings(chunksToAdd)

		for (let i = 0; i < vectors.length; i++) {
			// Add the article item to the index (include the hash as metadata)
			await this.index.insertItem({
				id: article.id + '_' + i,
				vector: vectors[i],
				metadata: { type: 'article', articleId: article.id, hash },
			})
		}

		// Sleep for 35ms to avoid rate limiting
		await new Promise((r) => setTimeout(r, 35))
	}

	/**
	 * Query the index for the items closest to a piece of text.
	 *
	 * Heading results are always returned. Article results are de-duplicated by
	 * article id, because several chunks of one article may match; only the first
	 * chunk encountered in the results is kept.
	 *
	 * @param text - The text to query.
	 * @param limit - The maximum number of index items to request (before
	 *   de-duplication).
	 *
	 * @returns The matching articles and headings with their scores.
	 */
	async query(text: string, limit = 5) {
		const vector = await this.getVectorEmbeddings([text])
		const results = await this.index.queryItems(vector[0], limit)
		const output: (
			| { id: string; type: 'article'; score: number }
			| { id: string; type: 'heading'; slug: string; score: number }
		)[] = []
		const visited = new Set<string>()
		for (const result of results) {
			const id = result.item.metadata.articleId as string
			const type = result.item.metadata.type as 'article' | 'heading'
			if (type === 'heading') {
				const slug = result.item.metadata.slug as string
				output.push({ id, type, slug, score: result.score })
			} else {
				// multiple chunks may have been returned
				if (visited.has(id)) continue
				output.push({ id, type, score: result.score })
				visited.add(id)
			}
		}
		return output
	}

	/**
	 * Hash a string with the recurrence `hash = 31 * hash + charCode`, wrapped to a
	 * signed 32-bit integer on every step. (This is not FNV-1a.) The hash is only
	 * used to detect whether indexed content has changed.
	 *
	 * @param string - The string to hash.
	 *
	 * @returns The hash as a decimal string; `'0'` for the empty string.
	 *
	 * @public
	 */
	getHashForString(string: string) {
		let hash = 0
		for (let i = 0; i < string.length; i++) {
			// (hash << 5) - hash is hash * 31
			hash = (hash << 5) - hash + string.charCodeAt(i)
			hash |= 0 // Convert to 32bit integer
		}
		return hash + ''
	}
}
|
|
|
|
// Module-level cache: set by the first call to getVectorDb and returned by every later call.
let _cvdb: ContentVectorDatabase

/**
 * Get the shared ContentVectorDatabase, creating it on the first call.
 *
 * The index is created if it does not exist yet, or re-created when
 * `rebuildIndex` is set. When either option is set, the articles are read from
 * the database returned by `connect` and added to the index together with
 * their headings.
 *
 * Once the shared instance exists, later calls return it immediately and
 * their options are ignored.
 *
 * @param opts - `updateContent` adds the articles to the index;
 *   `rebuildIndex` re-creates the index and then adds the articles.
 *
 * @returns The shared ContentVectorDatabase instance.
 */
export async function getVectorDb(
	opts = {} as {
		updateContent?: boolean
		rebuildIndex?: boolean
	}
) {
	if (_cvdb) return _cvdb

	// Only ask whether the index exists when we are not rebuilding it anyway.
	const mustCreateIndex = opts.rebuildIndex ? true : !(await index.isIndexCreated())
	if (mustCreateIndex) {
		await index.createIndex({ deleteIfExists: opts.rebuildIndex, version: 1 })
	}

	_cvdb = new ContentVectorDatabase({ api: openai, index })

	// Nothing more to do unless the caller asked for the content to be (re)indexed.
	if (!opts.updateContent && !opts.rebuildIndex) {
		return _cvdb
	}

	nicelog(`Rebuilding index`)
	const db = await connect({ reset: false })

	nicelog(`Getting articles`)
	let articles
	if (INCLUDE_API_CONTENT && INCLUDE_CONTENT) {
		// Everything
		articles = await db.all('SELECT * FROM articles')
	} else if (INCLUDE_API_CONTENT) {
		// Only the 'reference' section
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId = ?', 'reference')
	} else {
		// Everything except the 'reference' section
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId != ?', 'reference')
	}

	nicelog(`Adding articles to index`)

	// Headings with these slugs are not indexed.
	const excludedSlugs = ['constructor', 'properties', 'example', 'methods']
	const max = Math.min(articles.length, MAX_ARTICLES)

	let i = 0
	while (i < max) {
		const article = articles[i]
		const headings = await db.all(
			'SELECT * FROM headings WHERE articleId = ? AND slug NOT IN (?, ?, ?, ?)',
			article.id,
			...excludedSlugs
		)
		nicelog(`Adding article ${article.id} to index (${i} of ${max})`)
		await _cvdb.addArticleToIndex(article, headings)
		i++
	}

	return _cvdb
}
|