fix: website工具用浏览器

This commit is contained in:
ikechan8370 2023-06-23 23:29:27 +08:00
parent cd13b829e7
commit ed337275d5

View file

@ -4,6 +4,7 @@ import { Config } from '../config.js'
import fetch from 'node-fetch'
import proxy from 'https-proxy-agent'
import { getMaxModelTokens } from '../common.js'
import { ChatGPTPuppeteer } from '../browser.js'
export class WebsiteTool extends AbstractTool {
name = 'website'
@ -20,12 +21,36 @@ export class WebsiteTool extends AbstractTool {
func = async function (opts) {
let { url } = opts
try {
let res = await fetch(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
// let res = await fetch(url, {
// headers: {
// 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
// }
// })
// let text = await res.text()
let origin = false
if (!Config.headless) {
Config.headless = true
origin = true
}
let ppt = new ChatGPTPuppeteer()
let browser = await ppt.getBrowser()
let page = await browser.newPage()
await page.goto(url, {
waitUntil: 'networkidle2'
})
let text = await res.text()
let text = await page.content()
await page.close()
if (origin) {
Config.headless = false
}
// text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
// .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
// .replace(/<head\b[^<]*(?:(?!<\/head>)<[^<]*)*<\/head>/gi, '')
// .replace(/<!--[\s\S]*?-->/gi, '')
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '') // 移除<style>标签及其内容
.replace(/<[^>]+style\s*=\s*(["'])(?:(?!\1).)*\1[^>]*>/gi, '') // 移除带有style属性的标签
.replace(/<[^>]+>/g, '')
let maxModelTokens = getMaxModelTokens(Config.model)
text = text.slice(0, Math.min(text.length, maxModelTokens - 1600))
let api = new ChatGPTAPI({
@ -49,7 +74,7 @@ export class WebsiteTool extends AbstractTool {
},
maxModelTokens
})
const htmlContentSummaryRes = await api.sendMessage(`这是一个网页html的内容,请你从中提取出其中的主体内容告诉我${text}`)
const htmlContentSummaryRes = await api.sendMessage(`这是一个网页html经过筛选的内容请你进一步去掉其中的标签、样式、script等无用信息并从中提取出其中的主体内容转换成自然语言告诉我不需要主观描述性的语言${text}`)
let htmlContentSummary = htmlContentSummaryRes.text
return `this is the main content of website:\n ${htmlContentSummary}`
} catch (err) {