// tldraw/apps/docs/utils/ContentVectorDatabase.ts

import { connect } from '@/scripts/functions/connect'
import { Article, ArticleHeading, ArticleHeadings } from '@/types/content-types'
import { config } from 'dotenv'
import OpenAI from 'openai'
import path from 'path'
import { LocalIndex } from 'vectra'
import { nicelog } from './nicelog'
// Load variables from a local .env file into process.env before anything below reads them.
config()

// Switches read when (re)building the index: a cap on how many articles are
// processed, and which groups of articles are selected from the database.
const MAX_ARTICLES = Infinity
const INCLUDE_API_CONTENT = true
const INCLUDE_CONTENT = true

// The vector index lives in utils/vector-db, relative to the current working directory.
const vectorDbDirectory = path.join(process.cwd(), 'utils', 'vector-db')
const index = new LocalIndex(vectorDbDirectory)

// The OpenAI client used for embeddings; the key comes from the OPENAI_KEY environment variable.
const openAiApiKey = process.env.OPENAI_KEY
const openai = new OpenAI({ apiKey: openAiApiKey })
/**
 * A searchable vector index over the docs content.
 *
 * Articles (split into chunks) and article headings are embedded with the OpenAI embeddings API
 * and stored in a vectra `LocalIndex`. Every stored item carries a content hash in its metadata
 * so that unchanged content can be skipped on later runs.
 */
export class ContentVectorDatabase {
	/** The index that items are read from and written to. */
	index: LocalIndex
	/** The client used to create embeddings. */
	api: OpenAI

	/**
	 * @param opts - The index and API client to use.
	 *   NOTE(review): the `{}` default only satisfies the type checker; constructing without
	 *   arguments leaves `index` and `api` undefined.
	 */
	constructor(opts = {} as { index: LocalIndex; api: OpenAI }) {
		this.index = opts.index
		this.api = opts.api
	}

	/**
	 * Get embedding vectors for a batch of strings from OpenAI.
	 *
	 * @param inputs - The strings to embed.
	 *
	 * @returns One vector per response entry, in response order. An empty input yields an empty
	 *   array without calling the API.
	 */
	async getVectorEmbeddings(inputs: string[]): Promise<number[][]> {
		// Nothing to embed, so there is no reason to make a request.
		if (inputs.length === 0) return []

		const response = await this.api.embeddings.create({
			model: 'text-embedding-ada-002',
			input: inputs,
		})
		return response.data.map((d) => d.embedding)
	}

	/**
	 * Add a single article heading to the index, stored under the id `<articleId>#<slug>`.
	 *
	 * If an item with that id already exists and its stored hash matches, the heading is skipped;
	 * if the hash differs, the existing item is deleted and replaced.
	 *
	 * @param article - The article that the heading belongs to.
	 * @param heading - The heading to add.
	 */
	async addHeadingToIndex(article: Article, heading: ArticleHeading) {
		const id = `${article.id}#${heading.slug}`

		// Skip headings that are already present
		const hash = this.getHashForString(heading.title + heading.slug)
		const existingItem = await this.index.getItem(id)
		if (existingItem) {
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping heading ${id} (already present)`)
				return
			}
			await this.index.deleteItem(id)
		}

		nicelog(`Adding headers for ${article.title}#${heading.title}`)
		const vectors = await this.getVectorEmbeddings([article.title + '#' + heading.title])

		// Await the write so that a failure surfaces here and the item is stored before we return.
		await this.index.insertItem({
			id,
			vector: vectors[0],
			metadata: { type: 'heading', articleId: article.id, slug: heading.slug, hash },
		})
	}

	/**
	 * Add an article, and each of the given headings, to the index.
	 *
	 * The article text is split into chunks that are stored as `<articleId>_<n>`. If chunk 0
	 * already exists with a matching hash, the article is skipped; otherwise every previously
	 * stored chunk is removed before the new chunks are inserted.
	 *
	 * @param article - The article to add.
	 * @param headings - The headings of the article; each one is indexed separately.
	 */
	async addArticleToIndex(article: Article, headings: ArticleHeadings) {
		// This is the content that we'll create the embedding for
		let contentToVectorize: string

		if (article.sectionId === 'reference') {
			// For API docs, we'll just use the title, description, and members as the content.
			// We'll also add a note that the content was generated from the API docs, hopefully
			// so that the embedding better reflects searches for api docs.
			contentToVectorize = `Title: ${article.title}\nPackage: @tldraw/${article.categoryId}\nDescription: ${article.description}\nMembers:${article.keywords}\n\n(content generated from API docs)`
		} else {
			// The content is the raw markdown content, which includes all the markdown
			// headings and annotations, though none of the frontmatter. We'll add the
			// frontmatter information again manually. We may need to also add some information
			// about how "important" this article is, relative to related docs or headings.
			contentToVectorize = `Title: ${article.title}\nDescription: ${article.description}\nKeywords:${article.keywords}\nMarkdown:\n${article.content}`
		}

		for (const heading of headings) {
			await this.addHeadingToIndex(article, heading)
		}

		// Generate a hash based on the content that we'd be vectorizing
		const hash = this.getHashForString(contentToVectorize)

		// Split the content into chunks of at most 500 characters.
		// NOTE(review): the slicing is by characters, not tokens; confirm the intended request limit.
		const chunksToAdd: string[] = []
		const chunkSize = 500
		for (let i = 0; i < contentToVectorize.length; i += chunkSize) {
			chunksToAdd.push(contentToVectorize.slice(i, i + chunkSize))
		}

		// Is there already an item with this id?
		const existingItem = await this.index.getItem(article.id + '_0')
		if (existingItem) {
			// ...and if the existing item matches our hash, we can skip it
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping ${article.id} (already present)`)
				return
			}

			// ...otherwise, delete every chunk that is currently stored for this article. We walk
			// the stored ids until one is missing rather than counting the new chunks, so that an
			// article that got shorter doesn't leave stale trailing chunks behind.
			let staleChunk = 0
			while (await this.index.getItem(article.id + '_' + staleChunk)) {
				await this.index.deleteItem(article.id + '_' + staleChunk)
				staleChunk++
			}
		}

		// Add chunks to index
		nicelog(`Adding article ${article.title} (${chunksToAdd.length} chunks)`)

		// Get an embedding / vector for all of the chunks
		const vectors = await this.getVectorEmbeddings(chunksToAdd)

		for (let i = 0; i < vectors.length; i++) {
			// Add the article item to the index (include the hash as metadata)
			await this.index.insertItem({
				id: article.id + '_' + i,
				vector: vectors[i],
				metadata: { type: 'article', articleId: article.id, hash },
			})
		}

		// Pause for 35ms to avoid rate limiting
		await new Promise((r) => setTimeout(r, 35))
		return
	}

	/**
	 * Query the index for the items that best match a piece of text.
	 *
	 * @param text - The text to search for.
	 * @param limit - The maximum number of items requested from the index. Fewer results may be
	 *   returned, because several chunks of the same article collapse into one result.
	 *
	 * @returns The matches in the order the index returned them. Article results appear at most
	 *   once per article; heading results are returned as-is.
	 */
	async query(text: string, limit = 5) {
		const vector = await this.getVectorEmbeddings([text])
		const results = await this.index.queryItems(vector[0], limit)
		const output: (
			| { id: string; type: 'article'; score: number }
			| { id: string; type: 'heading'; slug: string; score: number }
		)[] = []
		const visited = new Set<string>()
		for (const result of results) {
			const id = result.item.metadata.articleId as string
			const type = result.item.metadata.type as 'article' | 'heading'
			if (type === 'heading') {
				const slug = result.item.metadata.slug as string
				output.push({ id, type, slug, score: result.score })
			} else {
				// multiple chunks may have been returned
				if (visited.has(id)) continue
				output.push({ id, type, score: result.score })
				visited.add(id)
			}
		}
		return output
	}

	/**
	 * Hash a string to a signed 32-bit integer, returned as a decimal string.
	 *
	 * This is the multiply-by-31 rolling hash over UTF-16 code units (`(hash << 5) - hash` is
	 * `hash * 31`), not FNV-1a. The algorithm must stay as it is: hashes already stored in the
	 * index are compared against its output.
	 *
	 * @param string - The string to hash.
	 *
	 * @returns The hash as a string; `'0'` for the empty string.
	 *
	 * @public
	 */
	getHashForString(string: string) {
		let hash = 0
		for (let i = 0; i < string.length; i++) {
			hash = (hash << 5) - hash + string.charCodeAt(i)
			hash |= 0 // Convert to 32bit integer
		}
		return hash + ''
	}
}
// The database instance created by the first call to getVectorDb; later calls reuse it.
let _cvdb: ContentVectorDatabase

/**
 * Get the shared content vector database, creating it on first use.
 *
 * Once an instance exists it is returned immediately and `opts` is not consulted again.
 *
 * @param opts - Options for the first call.
 *   `rebuildIndex` recreates the index (deleting an existing one) and re-adds all content.
 *   `updateContent` re-adds all content to the current index.
 *
 * @returns The shared database instance.
 */
export async function getVectorDb(
	opts = {} as {
		updateContent?: boolean
		rebuildIndex?: boolean
	}
) {
	if (_cvdb) return _cvdb

	const { rebuildIndex, updateContent } = opts

	// Only ask whether the index exists when a rebuild was not requested.
	const shouldCreateIndex = rebuildIndex || !(await index.isIndexCreated())
	if (shouldCreateIndex) {
		await index.createIndex({ deleteIfExists: rebuildIndex, version: 1 })
	}

	_cvdb = new ContentVectorDatabase({ api: openai, index })

	// Without a content update or rebuild there is nothing more to do.
	if (!(updateContent || rebuildIndex)) return _cvdb

	nicelog(`Rebuilding index`)
	const db = await connect({ reset: false, mode: 'readonly' })

	nicelog(`Getting articles`)
	let articles
	if (INCLUDE_API_CONTENT && INCLUDE_CONTENT) {
		articles = await db.all('SELECT * FROM articles')
	} else if (INCLUDE_API_CONTENT) {
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId = ?', 'reference')
	} else {
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId != ?', 'reference')
	}

	nicelog(`Adding articles to index`)
	const max = Math.min(articles.length, MAX_ARTICLES)

	// Headings with these slugs are left out of the index.
	const excludedSlugs = ['constructor', 'properties', 'example', 'methods']

	let i = 0
	while (i < max) {
		const article = articles[i]
		const headings = await db.all(
			'SELECT * FROM headings WHERE articleId = ? AND slug NOT IN (?, ?, ?, ?)',
			article.id,
			...excludedSlugs
		)
		nicelog(`Adding article ${article.id} to index (${i} of ${max})`)
		await _cvdb.addArticleToIndex(article, headings)
		i += 1
	}

	return _cvdb
}