forked from mishushakov/llm-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.ts
144 lines (123 loc) · 3.58 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import { Browser, BrowserContext } from 'playwright'
import Turndown from 'turndown'
import OpenAI from 'openai'
import { LlamaModel } from 'node-llama-cpp'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import {
ScraperCompletionResult,
generateLlamaCompletions,
generateOpenAICompletions,
} from './models.js'
/**
 * Options controlling how a page is captured before it is sent to the model.
 */
export type ScraperLoadOptions = {
/**
 * Representation of the page to capture:
 * - `html`: the full page HTML
 * - `markdown`: the `<body>` HTML converted to Markdown with Turndown
 * - `text`: page title and text content extracted with Mozilla Readability
 * - `image`: a screenshot of the page, base64-encoded
 */
mode?: 'html' | 'text' | 'markdown' | 'image'
/** When true, the scraper closes itself once the completions of a run have finished. */
closeOnFinish?: boolean
}
/**
 * A single loaded page, ready to be handed to a completion generator.
 */
export type ScraperLoadResult = {
/** URL the page was loaded from. */
url: string
/** Captured page content; for `image` mode this is a base64-encoded screenshot. */
content: string
/** Capture mode that produced `content`. */
mode: ScraperLoadOptions['mode']
}
/**
 * Options for a scraping run: the load options plus the settings forwarded
 * to the completion generator.
 */
export type ScraperRunOptions<T extends z.ZodSchema<any>> = {
/** Zod schema for the extracted data; converted to JSON Schema before being passed on. */
schema: T
/** Model name; forwarded only when the client is an OpenAI instance. */
model?: string
/** Optional prompt forwarded to the completion generator. */
prompt?: string
/** Optional sampling temperature forwarded to the completion generator. */
temperature?: number
} & ScraperLoadOptions
/**
 * Loads web pages in a Playwright browser and asks a language model
 * (an OpenAI client or a local Llama model) to extract data matching a
 * Zod schema from each page.
 */
export default class LLMScraper {
  /** Context created by the most recent `load`; undefined until a page is loaded. */
  private context?: BrowserContext

  /**
   * @param browser Playwright browser used to open pages.
   * @param client Model client; must be an `OpenAI` or `LlamaModel` instance.
   */
  constructor(
    private readonly browser: Browser,
    private readonly client: OpenAI | LlamaModel
  ) {}

  /**
   * Opens every URL in a fresh browser context and captures its content.
   *
   * @param url A single URL or a list of URLs.
   * @param options Capture options; a missing `mode` is treated as `'html'`.
   * @returns One promise per URL, in input order. A promise rejects when
   *   navigation or content extraction for that URL fails.
   */
  private async load(
    url: string | string[],
    options: ScraperLoadOptions = { mode: 'html' }
  ): Promise<Promise<ScraperLoadResult>[]> {
    // `run` always passes an options object, so the parameter default alone
    // does not cover a caller that simply left `mode` out.
    const mode = options.mode ?? 'html'

    // Keep a local reference: `this.context` may be replaced by a concurrent
    // `load` call while these pages are still being opened.
    const context = await this.browser.newContext()
    this.context = context

    const urls = Array.isArray(url) ? url : [url]

    return urls.map(async (target) => {
      const page = await context.newPage()
      try {
        await page.goto(target)

        let content: string
        switch (mode) {
          case 'html':
            content = await page.content()
            break
          case 'markdown': {
            const body = await page.innerHTML('body')
            content = new Turndown().turndown(body)
            break
          }
          case 'text': {
            // Evaluated inside the page: Readability is imported from the CDN there.
            const readable = await page.evaluate(async () => {
              const readability = await import(
                // @ts-ignore
                'https://cdn.skypack.dev/@mozilla/readability'
              )
              return new readability.Readability(document).parse()
            })
            // parse() yields null when no article could be extracted.
            if (!readable) {
              throw new Error(`Could not extract readable text from ${target}`)
            }
            content = `Page Title: ${readable.title}\n${readable.textContent}`
            break
          }
          case 'image': {
            const image = await page.screenshot()
            content = image.toString('base64')
            break
          }
          default:
            // Only reachable from untyped callers passing an unknown mode.
            throw new Error(`Unsupported mode: ${String(mode)}`)
        }

        return { url: target, content, mode }
      } finally {
        // Release the page even when navigation or extraction failed.
        await page.close()
      }
    })
  }

  /**
   * Generates a completion for every loaded page with the configured client.
   *
   * @param pages Pending page loads, as returned by `load`.
   * @param options Run options (schema, model, prompt, temperature, closeOnFinish).
   * @returns One promise per page, in input order. A promise rejects with
   *   `Error('Invalid client')` when the client is neither an `OpenAI` nor a
   *   `LlamaModel` instance.
   */
  private generateCompletions<T extends z.ZodSchema<any>>(
    pages: Promise<ScraperLoadResult>[],
    options: ScraperRunOptions<T>
  ): Promise<ScraperCompletionResult<T>>[] {
    const schema = zodToJsonSchema(options.schema)
    const client = this.client

    const completions = pages.map(async (page) => {
      if (client instanceof OpenAI) {
        return generateOpenAICompletions<T>(
          client,
          options.model,
          await page,
          schema,
          options.prompt,
          options.temperature
        )
      }
      if (client instanceof LlamaModel) {
        return generateLlamaCompletions<T>(
          client,
          await page,
          schema,
          options.prompt,
          options.temperature
        )
      }
      throw new Error('Invalid client')
    })

    if (options.closeOnFinish) {
      // Fire-and-forget cleanup. allSettled never rejects, so a failed
      // completion neither skips the close nor causes an unhandled rejection;
      // the caller still observes the failure on the returned promise.
      void Promise.allSettled(completions)
        .then(() => this.close())
        .catch((error: unknown) => {
          // There is no caller to propagate to from here; report and move on.
          console.error('Failed to close scraper:', error)
        })
    }

    return completions
  }

  /**
   * Loads the given page(s) and generates a completion for each of them.
   *
   * @param url A single URL or a list of URLs.
   * @param options Schema, model settings and load options for this run.
   * @returns A promise for an array holding one completion promise per URL.
   */
  async run<T extends z.ZodSchema<any>>(
    url: string | string[],
    options: ScraperRunOptions<T>
  ): Promise<Promise<ScraperCompletionResult<T>>[]> {
    const pages = await this.load(url, options)
    return this.generateCompletions<T>(pages, options)
  }

  /**
   * Closes the current context (if one was created) and then the browser.
   * Safe to call before any page has been loaded.
   */
  async close(): Promise<void> {
    const context = this.context
    this.context = undefined
    try {
      await context?.close()
    } finally {
      // Close the browser even if closing the context failed.
      await this.browser.close()
    }
  }
}