chatgpt-plugin/utils/tools/WebsiteTool.js
2024-03-09 23:37:47 +08:00

118 lines
4.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { AbstractTool } from './AbstractTool.js'
import { ChatGPTAPI } from '../openai/chatgpt-api.js'
import { Config } from '../config.js'
import fetch from 'node-fetch'
import proxy from 'https-proxy-agent'
import { getMaxModelTokens } from '../common.js'
import { ChatGPTPuppeteer } from '../browser.js'
import { CustomGoogleGeminiClient } from '../../client/CustomGoogleGeminiClient.js'
export class WebsiteTool extends AbstractTool {
name = 'website'
parameters = {
properties: {
url: {
type: 'string',
description: '要访问的网站网址'
}
},
required: ['url']
}
func = async function (opts) {
let { url, mode, e } = opts
let browser
try {
// let res = await fetch(url, {
// headers: {
// 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
// }
// })
// let text = await res.text()
let origin = false
if (!Config.headless) {
Config.headless = true
origin = true
}
let ppt = new ChatGPTPuppeteer()
browser = await ppt.getBrowser()
let page = await browser.newPage()
await page.goto(url, {
waitUntil: 'networkidle2'
})
let text = await page.content()
await page.close()
if (origin) {
Config.headless = false
}
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
.replace(/<head\b[^<]*(?:(?!<\/head>)<[^<]*)*<\/head>/gi, '')
.replace(/<figure\b[^<]*(?:(?!<\/figure>)<[^<]*)*<\/figure>/gi, '')
.replace(/<path\b[^<]*(?:(?!<\/path>)<[^<]*)*<\/path>/gi, '')
.replace(/<video\b[^<]*(?:(?!<\/video>)<[^<]*)*<\/video>/gi, '')
.replace(/<audio\b[^<]*(?:(?!<\/audio>)<[^<]*)*<\/audio>/gi, '')
.replace(/<img[^>]*>/gi, '')
.replace(/<!--[\s\S]*?-->/gi, '') // 去除注释
.replace(/<(?!\/?(title|ul|li|td|tr|thead|tbody|blockquote|h[1-6]|H[1-6])[^>]*)\w+\s+[^>]*>/gi, '') // 去除常见语音标签外的含属性标签
.replace(/<(\w+)(\s[^>]*)?>/gi, '<$1>') // 进一步去除剩余标签的属性
.replace(/<\/(?!\/?(title|ul|li|td|tr|thead|tbody|blockquote|h[1-6]|H[1-6])[^>]*)[a-z][a-z0-9]*>/gi, '') // 去除常见语音标签外的含属性结束标签
.replace(/[\n\r]/gi, '') // 去除回车换行
.replace(/\s{2}/g, '') // 多个空格只保留一个空格
.replace('<!DOCTYPE html>', '') // 去除<!DOCTYPE>声明
if (mode === 'gemini') {
let client = new CustomGoogleGeminiClient({
e,
userId: e?.sender?.user_id,
key: Config.geminiKey,
model: Config.geminiModel,
baseUrl: Config.geminiBaseUrl,
debug: Config.debug
})
const htmlContentSummaryRes = await client.sendMessage(`去除与主体内容无关的部分从中整理出主体内容并转换成md格式不需要主观描述性的语言与冗余的空白行。${text}`)
let htmlContentSummary = htmlContentSummaryRes.text
return `this is the main content of website:\n ${htmlContentSummary}`
} else {
let maxModelTokens = getMaxModelTokens(Config.model)
text = text.slice(0, Math.min(text.length, maxModelTokens - 1600))
let completionParams = {
// model: Config.model
model: 'gpt-3.5-turbo-16k'
}
let api = new ChatGPTAPI({
apiBaseUrl: Config.openAiBaseUrl,
apiKey: Config.apiKey,
debug: false,
completionParams,
fetch: (url, options = {}) => {
const defaultOptions = Config.proxy
? {
agent: proxy(Config.proxy)
}
: {}
const mergedOptions = {
...defaultOptions,
...options
}
return fetch(url, mergedOptions)
},
maxModelTokens
})
const htmlContentSummaryRes = await api.sendMessage(`去除与主体内容无关的部分从中整理出主体内容并转换成md格式不需要主观描述性的语言与冗余的空白行。${text}`, { completionParams })
let htmlContentSummary = htmlContentSummaryRes.text
return `this is the main content of website:\n ${htmlContentSummary}`
}
} catch (err) {
return `failed to visit the website, error: ${err.toString()}`
} finally {
if (browser) {
try {
await browser.close()
} catch (err) {}
}
}
}
description = 'Useful when you want to browse a website by url'
}