Unfurl bookmarks in worker (#4039)

This PR adds a `GET /api/unfurl?url=blahblah` endpoint to our worker.

I tried the existing cheerio implementation, but it ended up adding 300 kB
to our worker bundle because of its transitive dependencies.

So I implemented the same logic with Cloudflare's sanctioned streaming
HTML parser, `HTMLRewriter`, and it seems to work fine.

I also made the VS Code extension do its fetching locally (from the Node
process, so it's not bound by security policies), retaining the cheerio
version for that. At the same time, I fixed a bug in the RPC layer that
was preventing unfurled metadata from loading correctly.

In a few months we can retire the bookmark-extractor app simply by
deleting it in the Vercel dashboard.

### Change Type


<!--  Please select a 'Type' label ️ -->

- [ ] `feature` — New feature
- [x] `improvement` — Product improvement
- [ ] `api` — API change
- [ ] `bugfix` — Bug fix
- [ ] `other` — Changes that don't affect SDK users, e.g. internal or
.com changes


### Test Plan

1. Add a step-by-step description of how to test your PR here.
2.

- [ ] Unit Tests
- [ ] End to end tests

### Release Notes

- Link unfurling now happens on the same subdomain as all our other API
endpoints.
This commit is contained in:
David Sheldrick 2024-07-01 15:40:03 +01:00 committed by GitHub
parent bfccf98d99
commit ee6aa172b2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 136 additions and 206 deletions

View file

@ -1 +0,0 @@
.vercel

View file

@ -1,3 +0,0 @@
# @tldraw/bookmark-extractor
Deploy this manually with `vercel deploy --prod`.

View file

@ -1,35 +0,0 @@
import Cors from 'cors'
// Origins that are accepted by exact match (see the `origin` callback below).
const whitelist = [
	'http://localhost:3000',
	'http://localhost:4000',
	'http://localhost:5420',
	'https://www.tldraw.com',
	'https://staging.tldraw.com',
	process.env.NEXT_PUBLIC_VERCEL_URL,
	'vercel.app',
]

/**
 * CORS middleware restricted to POST requests.
 *
 * An origin is accepted when it ends with `.tldraw.com`, ends with
 * `-tldraw.vercel.app`, or is listed verbatim in `whitelist`; any other
 * origin (including a missing one) is rejected with an error.
 */
export const cors = Cors({
	methods: ['POST'],
	origin: (origin, callback) => {
		const hasTldrawSuffix = origin?.endsWith('.tldraw.com')
		const hasVercelSuffix = origin?.endsWith('-tldraw.vercel.app')
		const isListed = origin && whitelist.includes(origin)

		if (hasTldrawSuffix || hasVercelSuffix || isListed) {
			callback(null, true)
			return
		}

		callback(new Error(`Not allowed by CORS (${origin})`))
	},
})
/**
 * Promise wrapper around the callback-style `cors` middleware.
 *
 * @param req - The incoming request, passed straight through to `cors`.
 * @param res - The outgoing response, passed straight through to `cors`.
 * @returns A promise that rejects when the middleware reports an `Error` and
 * otherwise resolves with whatever value the middleware handed back.
 */
export function runCorsMiddleware(req: any, res: any) {
	return new Promise((settle, fail) => {
		cors(req, res, (outcome) => {
			if (outcome instanceof Error) {
				fail(outcome)
			} else {
				settle(outcome)
			}
		})
	})
}

View file

@ -1,18 +0,0 @@
import { unfurl } from '../lib/unfurl'
import { runCorsMiddleware } from './_cors'
interface RequestBody {
url: string
}
/**
 * Request handler that unfurls the URL supplied in the request body.
 *
 * Runs the CORS middleware first, then reads `url` from the body (which may
 * arrive either as a JSON string or as an already-parsed object), passes it to
 * `unfurl`, and sends the result back.
 *
 * Any failure along the way (CORS rejection, unparsable body, unfurl error) is
 * logged and answered with status 422 and the error's message as the body.
 */
export default async function handler(req: any, res: any) {
try {
// Rejects (and so jumps to the catch below) when the origin is not allowed
await runCorsMiddleware(req, res)
// The body is parsed here only when it is still a raw string
const { url } = typeof req.body === 'string' ? JSON.parse(req.body) : (req.body as RequestBody)
const results = await unfurl(url)
res.send(results)
} catch (error: any) {
console.error(error)
// The error message is forwarded verbatim to the caller
res.status(422).send(error.message)
}
}

View file

@ -1,26 +0,0 @@
{
"name": "@tldraw/bookmark-extractor",
"description": "A tiny little drawing app (merge server).",
"version": "2.0.0-alpha.11",
"private": true,
"author": {
"name": "tldraw GB Ltd.",
"email": "hello@tldraw.com"
},
"scripts": {
"run-local": "vercel dev",
"lint": "yarn run -T tsx ../../scripts/lint.ts"
},
"dependencies": {
"cheerio": "1.0.0-rc.12",
"cors": "^2.8.5"
},
"devDependencies": {
"@types/cheerio": "0.22.33",
"@types/cors": "^2.8.15",
"lazyrepo": "0.0.0-alpha.27",
"tslib": "^2.6.2",
"typescript": "^5.3.3",
"vercel": "^34.2.4"
}
}

View file

@ -1,33 +0,0 @@
{
"exclude": ["node_modules", "dist", ".tsbuild*", ".vercel"],
"compilerOptions": {
"composite": true,
"declaration": true,
"declarationMap": true,
"allowSyntheticDefaultImports": true,
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"importHelpers": true,
"resolveJsonModule": true,
"incremental": true,
"jsx": "react-jsx",
"lib": ["dom", "DOM.Iterable", "esnext"],
"experimentalDecorators": true,
"module": "CommonJS",
"target": "esnext",
"moduleResolution": "node",
"noFallthroughCasesInSwitch": true,
"noImplicitAny": true,
"noImplicitReturns": true,
"noUnusedLocals": false,
"noUnusedParameters": false,
"skipLibCheck": true,
"strict": true,
"strictFunctionTypes": true,
"strictNullChecks": true,
"useDefineForClassFields": true,
"noImplicitOverride": true,
"noEmit": true
},
"references": []
}

View file

@ -26,6 +26,7 @@
"@tldraw/tlschema": "workspace:*",
"@tldraw/tlsync": "workspace:*",
"@tldraw/utils": "workspace:*",
"@tldraw/validate": "workspace:*",
"itty-router": "^4.0.13",
"nanoid": "4.0.2",
"react": "^18.2.0",

View file

@ -0,0 +1,73 @@
/**
 * Handler object that collects text: every chunk passed to `text` is appended
 * to `string`.
 */
class TextExtractor {
	/** Everything received so far, in arrival order. */
	string = ''

	/**
	 * Appends one incoming piece of text.
	 *
	 * @param chunk - Object whose `text` property holds the piece to append.
	 */
	text(chunk: any) {
		const piece = chunk.text
		this.string = this.string + piece
	}
}
/**
 * Handler object that records metadata from the elements passed to `element`:
 * Open Graph values (`property="og:*"`), Twitter values (`name="twitter:*"`)
 * and the plain `name="description"` value.
 */
class MetaExtractor {
	/** `content` values keyed by the full `og:*` property name. */
	og: { [key: string]: string | undefined } = {}
	/** `content` values keyed by the full `twitter:*` name. */
	twitter: { [key: string]: string | undefined } = {}
	/** `content` of the most recent `name="description"` element, if any. */
	description = null as string | null

	/**
	 * Inspects one element and stores its `content` attribute in the matching
	 * bucket. An `og:` property takes precedence over the `name` attribute;
	 * elements that match none of the three cases are ignored.
	 */
	element(element: Element) {
		const propertyAttr = element.getAttribute('property')
		const nameAttr = element.getAttribute('name')

		if (propertyAttr?.startsWith('og:')) {
			this.og[propertyAttr] = element.getAttribute('content')!
			return
		}
		if (nameAttr?.startsWith('twitter:')) {
			this.twitter[nameAttr] = element.getAttribute('content')!
			return
		}
		if (nameAttr === 'description') {
			this.description = element.getAttribute('content')
		}
	}
}
/**
 * Handler object that records icon URLs from the `link` elements passed to
 * `element`.
 *
 * `rel` is treated as a case-insensitive, whitespace-separated token list, so
 * values such as `shortcut icon` or `Apple-Touch-Icon` are recognised, while
 * unrelated tokens such as `mask-icon` are not mistaken for `icon`.
 */
class IconExtractor {
	/** `href` of the most recent apple-touch-icon link that had one. */
	appleIcon = null as string | null
	/** `href` of the most recent regular icon link that had one. */
	icon = null as string | null

	/**
	 * Inspects one element and stores its `href` when its `rel` marks it as an
	 * icon. Elements without a usable `href` are skipped so they cannot
	 * overwrite an icon that was already found.
	 */
	element(element: Element) {
		const href = element.getAttribute('href')
		if (!href) return

		const relTokens = (element.getAttribute('rel') ?? '')
			.toLowerCase()
			.split(/\s+/)
			.filter(Boolean)

		if (relTokens.includes('icon')) {
			this.icon = href
		} else if (
			relTokens.includes('apple-touch-icon') ||
			relTokens.includes('apple-touch-icon-precomposed')
		) {
			this.appleIcon = href
		}
	}
}
/**
 * Resolves a URL found in a page against the page's own URL.
 *
 * @returns The absolute URL, or undefined when the value is missing/empty or
 * cannot be parsed (so one malformed href cannot fail the whole unfurl).
 */
function resolveAgainstPage(value: string | null | undefined, pageUrl: string): string | undefined {
	if (!value) return undefined
	try {
		// The URL constructor handles absolute, protocol-relative and relative inputs alike
		return new URL(value, pageUrl).href
	} catch {
		return undefined
	}
}

/**
 * Fetches a page and extracts the metadata needed to render a bookmark.
 *
 * Precedence for each field:
 * - title: `og:title`, then `twitter:title`, then the `<title>` text
 * - description: `og:description`, then `twitter:description`, then `<meta name="description">`
 * - image: `og:image:secure_url`, then `og:image`, then `twitter:image`
 * - favicon: apple touch icon, then regular icon
 *
 * @param url - The page to fetch; also used as the base for resolving
 * relative image and favicon URLs.
 * @returns `{ title, description, image, favicon }`, each undefined when the
 * page does not provide it. `image` and `favicon` are absolute URLs.
 */
export async function unfurl(url: string) {
	const meta$ = new MetaExtractor()
	const title$ = new TextExtractor()
	const icon$ = new IconExtractor()

	// we use cloudflare's special html parser https://developers.cloudflare.com/workers/runtime-apis/html-rewriter/
	// The transformed body is read to completion (and discarded) so that the
	// handlers above get to see the whole document.
	await new HTMLRewriter()
		.on('meta', meta$)
		.on('title', title$)
		.on('link', icon$)
		.transform((await fetch(url)) as any)
		.blob?.()

	const { og, twitter } = meta$

	// `title$.string` is always a string, so `??` alone would never fall through
	// to undefined; trim it and treat an empty result as "no title".
	const pageTitle = title$.string.trim() || undefined
	const title = og['og:title'] ?? twitter['twitter:title'] ?? pageTitle
	const description =
		og['og:description'] ?? twitter['twitter:description'] ?? meta$.description ?? undefined

	const image = resolveAgainstPage(
		og['og:image:secure_url'] ?? og['og:image'] ?? twitter['twitter:image'],
		url
	)
	const favicon = resolveAgainstPage(icon$.appleIcon ?? icon$.icon, url)

	return {
		title,
		description,
		image,
		favicon,
	}
}

View file

@ -6,7 +6,8 @@ import {
ROOM_OPEN_MODE,
ROOM_PREFIX,
} from '@tldraw/dotcom-shared'
import { Router, createCors } from 'itty-router'
import { T } from '@tldraw/validate'
import { Router, createCors, json } from 'itty-router'
import { Toucan } from 'toucan-js'
import { createRoom } from './routes/createRoom'
import { createRoomSnapshot } from './routes/createRoomSnapshot'
@ -18,6 +19,7 @@ import { getRoomSnapshot } from './routes/getRoomSnapshot'
import { joinExistingRoom } from './routes/joinExistingRoom'
import { Environment } from './types'
import { fourOhFour } from './utils/fourOhFour'
import { unfurl } from './utils/unfurl'
export { TLDrawDurableObject } from './TLDrawDurableObject'
const { preflight, corsify } = createCors({
@ -42,6 +44,12 @@ const router = Router()
.get(`/${ROOM_PREFIX}/:roomId/history`, getRoomHistory)
.get(`/${ROOM_PREFIX}/:roomId/history/:timestamp`, getRoomHistorySnapshot)
.get('/readonly-slug/:roomId', getReadonlySlug)
.get('/unfurl', async (req) => {
if (typeof req.query.url !== 'string' || !T.httpUrl.isValid(req.query.url)) {
return new Response('url query param is required', { status: 400 })
}
return json(await unfurl(req.query.url))
})
.post(`/${ROOM_PREFIX}/:roomId/restore`, forwardRoomRequest)
.all('*', fourOhFour)

View file

@ -21,6 +21,9 @@
},
{
"path": "../../packages/utils"
},
{
"path": "../../packages/validate"
}
]
}

View file

@ -16,7 +16,6 @@ const cspDirectives: { [key: string]: string[] } = {
`'self'`,
`ws:`,
`wss:`,
`https://bookmark-extractor.tldraw.com`,
`https://assets.tldraw.xyz`,
`https://*.tldraw.workers.dev`,
`https://*.ingest.sentry.io`,

View file

@ -1,4 +1,4 @@
export const BOOKMARK_ENDPOINT = 'https://bookmark-extractor.tldraw.com/api/bookmark'
export const BOOKMARK_ENDPOINT = '/api/unfurl'
// some boilerplate to get the URL of the server to upload/fetch assets

View file

@ -11,17 +11,14 @@ interface ResponseBody {
export async function createAssetFromUrl({ url }: { type: 'url'; url: string }): Promise<TLAsset> {
try {
// First, try to get the meta data from our endpoint
const meta = (await (
await fetch(BOOKMARK_ENDPOINT, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
url,
}),
})
).json()) as ResponseBody
const fetchUrl =
BOOKMARK_ENDPOINT +
'?' +
new URLSearchParams({
url,
}).toString()
const meta = (await (await fetch(fetchUrl)).json()) as ResponseBody
return {
id: AssetRecordType.createId(getHashForString(url)),

View file

@ -5,7 +5,7 @@ export async function onCreateAssetFromUrl({
url,
}: TLExternalAssetContent & { type: 'url' }): Promise<TLAsset> {
try {
// First, try to get the data from vscode
// First, try to get the data from the extension manager process, using node's fetch
const meta = await rpc('vscode:bookmark', { url })
return {

View file

@ -36,7 +36,8 @@ export function rpc(
vscode.postMessage(inMessage)
const handler = ({ data: response }: MessageEvent<ResponseType | ErrorType>) => {
if (uuid === response.uuid) {
// Only handle messages that are meant to be a direct response to the message we sent
if (response.uuid !== uuid + '_response') {
return
}

View file

@ -154,6 +154,7 @@
},
"gitHead": "4b1137849ad07da36fc8f0f19cb64e7535a79296",
"dependencies": {
"cheerio": "^1.0.0-rc.12",
"node-fetch": "^2.0.0"
},
"peerDependencies": {

View file

@ -1,5 +1,4 @@
import { isEqual } from 'lodash'
import fetch from 'node-fetch'
import * as vscode from 'vscode'
import { TLDrawDocument } from './TldrawDocument'
import { loadFile } from './file'
@ -7,8 +6,8 @@ import { loadFile } from './file'
import { UnknownRecord } from 'tldraw'
// @ts-ignore
import type { VscodeMessage } from '../../messages'
import { unfurl } from './unfurl'
import { nicelog } from './utils'
const BOOKMARK_ENDPOINT = 'https://bookmark-extractor.tldraw.com/api/bookmark'
export const GlobalStateKeys = {
ShowV1FileOpenWarning: 'showV1fileOpenWarning',
@ -74,24 +73,12 @@ export class WebViewMessageHandler {
}
case 'vscode:bookmark/request': {
const url = e.data.url
fetch(BOOKMARK_ENDPOINT, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
// We can fake the origin here because we're in node.js
origin: 'https://www.tldraw.com',
},
body: JSON.stringify({
url,
}),
})
.then((resp) => {
return resp.json()
})
.then((json: any) => {
await unfurl(url)
.then(async (json: any) => {
this.webviewPanel.webview.postMessage({
type: 'vscode:bookmark/response',
uuid: e.uuid,
// Add a suffix to the uuid to represent the response.
uuid: e.uuid + '_response',
data: {
url,
title: json.title,

View file

@ -1,4 +1,4 @@
import cheerio from 'cheerio'
import { load } from 'cheerio'
export async function unfurl(url: string) {
const response = await fetch(url)
@ -11,13 +11,17 @@ export async function unfurl(url: string) {
}
const content = await response.text()
const $ = cheerio.load(content)
const $ = load(content)
const og: { [key: string]: string | undefined } = {}
const twitter: { [key: string]: string | undefined } = {}
$('meta[property^=og:]').each((_, el) => (og[$(el).attr('property')!] = $(el).attr('content')))
$('meta[name^=twitter:]').each((_, el) => (twitter[$(el).attr('name')!] = $(el).attr('content')))
$('meta[property^=og:]').each((_, el) => {
og[$(el).attr('property')!] = $(el).attr('content')
})
$('meta[name^=twitter:]').each((_, el) => {
twitter[$(el).attr('name')!] = $(el).attr('content')
})
const title = og['og:title'] ?? twitter['twitter:title'] ?? $('title').text() ?? undefined
const description =

View file

@ -55,6 +55,9 @@ export type ExtractRequiredKeys<T extends object> = {
[K in keyof T]: undefined extends T[K] ? never : K;
}[keyof T];
// @public
const httpUrl: Validator<string>;
// @public
const indexKey: Validator<IndexKey>;
@ -182,6 +185,7 @@ declare namespace T {
jsonValue,
linkUrl,
srcUrl,
httpUrl,
indexKey
}
}

View file

@ -1003,6 +1003,22 @@ export const srcUrl = string.check((value) => {
}
})
/**
 * Validates an http(s) url.
 *
 * The empty string is accepted as-is. Any other value is parsed with
 * `parseUrl` and rejected unless its protocol is `http:` or `https:`
 * (compared case-insensitively).
 *
 * @public
 */
export const httpUrl = string.check((candidate) => {
	// Empty strings are let through without any further checks
	if (candidate === '') return

	const { protocol } = parseUrl(candidate)
	const isHttp = /^https?:$/.test(protocol.toLowerCase())
	if (!isHttp) {
		throw new ValidationError(
			`Expected a valid url, got ${JSON.stringify(candidate)} (invalid protocol)`
		)
	}
})
/**
* Validates that a value is an IndexKey.
* @public

View file

@ -6012,21 +6012,6 @@ __metadata:
languageName: unknown
linkType: soft
"@tldraw/bookmark-extractor@workspace:apps/dotcom-bookmark-extractor":
version: 0.0.0-use.local
resolution: "@tldraw/bookmark-extractor@workspace:apps/dotcom-bookmark-extractor"
dependencies:
"@types/cheerio": "npm:0.22.33"
"@types/cors": "npm:^2.8.15"
cheerio: "npm:1.0.0-rc.12"
cors: "npm:^2.8.5"
lazyrepo: "npm:0.0.0-alpha.27"
tslib: "npm:^2.6.2"
typescript: "npm:^5.3.3"
vercel: "npm:^34.2.4"
languageName: unknown
linkType: soft
"@tldraw/docs@workspace:apps/docs":
version: 0.0.0-use.local
resolution: "@tldraw/docs@workspace:apps/docs"
@ -6103,6 +6088,7 @@ __metadata:
"@tldraw/tlschema": "workspace:*"
"@tldraw/tlsync": "workspace:*"
"@tldraw/utils": "workspace:*"
"@tldraw/validate": "workspace:*"
esbuild: "npm:^0.21.5"
itty-router: "npm:^4.0.13"
lazyrepo: "npm:0.0.0-alpha.27"
@ -6535,15 +6521,6 @@ __metadata:
languageName: node
linkType: hard
"@types/cheerio@npm:0.22.33":
version: 0.22.33
resolution: "@types/cheerio@npm:0.22.33"
dependencies:
"@types/node": "npm:*"
checksum: 21828cccc3da6c1177d884bff4aca3231904e98d262cc3bb98519805144361a39be24a89b7099c38457152b3822b59121a82111479d1d8f65e3703073a9245fd
languageName: node
linkType: hard
"@types/classnames@npm:^2.3.1":
version: 2.3.1
resolution: "@types/classnames@npm:2.3.1"
@ -6574,15 +6551,6 @@ __metadata:
languageName: node
linkType: hard
"@types/cors@npm:^2.8.15":
version: 2.8.17
resolution: "@types/cors@npm:2.8.17"
dependencies:
"@types/node": "npm:*"
checksum: 469bd85e29a35977099a3745c78e489916011169a664e97c4c3d6538143b0a16e4cc72b05b407dc008df3892ed7bf595f9b7c0f1f4680e169565ee9d64966bde
languageName: node
linkType: hard
"@types/debug@npm:^4.0.0":
version: 4.1.12
resolution: "@types/debug@npm:4.1.12"
@ -9103,7 +9071,7 @@ __metadata:
languageName: node
linkType: hard
"cheerio@npm:1.0.0-rc.12, cheerio@npm:^1.0.0-rc.12, cheerio@npm:^1.0.0-rc.9":
"cheerio@npm:^1.0.0-rc.12, cheerio@npm:^1.0.0-rc.9":
version: 1.0.0-rc.12
resolution: "cheerio@npm:1.0.0-rc.12"
dependencies:
@ -9629,16 +9597,6 @@ __metadata:
languageName: node
linkType: hard
"cors@npm:^2.8.5":
version: 2.8.5
resolution: "cors@npm:2.8.5"
dependencies:
object-assign: "npm:^4"
vary: "npm:^1"
checksum: 66e88e08edee7cbce9d92b4d28a2028c88772a4c73e02f143ed8ca76789f9b59444eed6b1c167139e76fa662998c151322720093ba229f9941365ada5a6fc2c6
languageName: node
linkType: hard
"cosmiconfig@npm:7.0.0":
version: 7.0.0
resolution: "cosmiconfig@npm:7.0.0"
@ -17959,7 +17917,7 @@ __metadata:
languageName: node
linkType: hard
"object-assign@npm:^4, object-assign@npm:^4.1.1":
"object-assign@npm:^4.1.1":
version: 4.1.1
resolution: "object-assign@npm:4.1.1"
checksum: fcc6e4ea8c7fe48abfbb552578b1c53e0d194086e2e6bbbf59e0a536381a292f39943c6e9628af05b5528aa5e3318bb30d6b2e53cadaf5b8fe9e12c4b69af23f
@ -21376,6 +21334,7 @@ __metadata:
"@typescript-eslint/eslint-plugin": "npm:^5.57.0"
"@typescript-eslint/parser": "npm:^5.57.0"
assert: "npm:^2.0.0"
cheerio: "npm:^1.0.0-rc.12"
esbuild: "npm:^0.21.5"
eslint: "npm:^8.37.0"
fs-extra: "npm:^11.1.0"
@ -22589,13 +22548,6 @@ __metadata:
languageName: node
linkType: hard
"vary@npm:^1":
version: 1.1.2
resolution: "vary@npm:1.1.2"
checksum: 31389debef15a480849b8331b220782230b9815a8e0dbb7b9a8369559aed2e9a7800cd904d4371ea74f4c3527db456dc8e7ac5befce5f0d289014dbdf47b2242
languageName: node
linkType: hard
"vectra@npm:0.4.4":
version: 0.4.4
resolution: "vectra@npm:0.4.4"