chatgpt-plugin/utils/tools/WebsiteTool.js

114 lines
4.1 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { AbstractTool } from './AbstractTool.js'
import { ChatGPTAPI } from '../openai/chatgpt-api.js'
import { Config } from '../config.js'
import fetch from 'node-fetch'
import proxy from 'https-proxy-agent'
import { getMaxModelTokens } from '../common.js'
import { ChatGPTPuppeteer } from '../browser.js'
import { CustomGoogleGeminiClient } from '../../client/CustomGoogleGeminiClient.js'
/**
 * Reduce an HTML document to a compact subset of text-bearing markup.
 * (The original implementation was generated by GPT-4o.)
 *
 * Processing steps:
 *  1. Remove whole <style>, <script>, <head>, <figure> elements, <link> tags
 *     and HTML comments.
 *  2. Remove every tag that is not in the allow-list (the text between such
 *     tags is kept).
 *  3. For allowed tags, trim attributes:
 *     - <meta>: keep only quoted charset / name / content attributes;
 *     - <img>, <video>, <audio>, <source>: keep only a quoted, non-`data:` src,
 *       otherwise drop the tag;
 *     - <a>: keep only a quoted, non-`data:` href, otherwise drop the tag;
 *     - every other allowed tag is kept unchanged.
 *  4. Collapse runs of whitespace into single spaces and trim the result.
 *
 * @param {string} html raw HTML markup
 * @returns {string} the cleaned markup; an empty string for empty or non-string input
 */
function cleanHTML (html) {
  if (typeof html !== 'string' || html === '') {
    return ''
  }

  // 1. Strip irrelevant content. The `(?=[\s\/>])` lookahead makes sure the tag
  //    name ends right there, so e.g. `<header>` is not mistaken for `<head>`.
  let cleaned = html
    .replace(/<style(?=[\s\/>])[^>]*>[\s\S]*?<\/style>/gi, '') // inline CSS
    .replace(/<script(?=[\s\/>])[^>]*>[\s\S]*?<\/script>/gi, '') // inline JS
    .replace(/<link(?=[\s\/>])[^>]*>/gi, '') // external resources
    .replace(/<head(?=[\s\/>])[^>]*>[\s\S]*?<\/head>/gi, '') // the whole <head>
    .replace(/<!--[\s\S]*?-->/g, '') // HTML comments
    .replace(/<figure(?=[\s\/>])[^>]*>[\s\S]*?<\/figure>/gi, '') // <figure> elements

  // 2. Tags that are allowed to survive.
  const allowedTags = new Set(['title', 'meta', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img', 'video', 'audio', 'source', 'a'])
  const mediaTags = new Set(['img', 'video', 'audio', 'source'])
  // Non-void tags whose opening tag may be dropped; their closing tag must go too.
  const pairedTags = new Set(['a', 'video', 'audio'])
  // Names of paired tags whose most recent opening tag was dropped.
  const droppedOpenings = new Set()

  // 3. Visit every tag. The tag name runs up to the first whitespace, `/` or `>`,
  //    so `<br/>` and hyphenated names such as `<my-widget>` are recognised too.
  cleaned = cleaned.replace(/<(\/?)([a-zA-Z][^\s\/>]*)([^>]*)>/g, (match, slash, rawName, attrs) => {
    const tagName = rawName.toLowerCase()
    if (!allowedTags.has(tagName)) {
      return '' // not in the allow-list
    }

    if (slash) {
      // Closing tag: drop it only when its opening tag was dropped, so no
      // dangling `</a>` is left behind.
      if (droppedOpenings.has(tagName)) {
        droppedOpenings.delete(tagName)
        return ''
      }
      return match
    }

    if (tagName === 'meta') {
      // Keep only charset, name and content.
      const keptAttrs = attrs.match(/(charset|name|content)=["'][^"']+["']/gi)
      return `<${rawName} ${keptAttrs ? keptAttrs.join(' ') : ''}>`
    }

    let rewritten = match // other allowed tags are kept untouched
    if (mediaTags.has(tagName)) {
      // Keep only `src`, rejecting base64 `data:` URLs; no valid src => drop the tag.
      const srcMatch = attrs.match(/\bsrc=["'](?!data:)[^"']+["']/i)
      rewritten = srcMatch ? `<${rawName} ${srcMatch[0]}>` : ''
    } else if (tagName === 'a') {
      // Keep only `href`, rejecting `data:` URLs; no valid href => drop the tag.
      const hrefMatch = attrs.match(/\bhref=["'](?!data:)[^"']+["']/i)
      rewritten = hrefMatch ? `<a ${hrefMatch[0]}>` : ''
    }

    if (pairedTags.has(tagName)) {
      if (rewritten === '') {
        droppedOpenings.add(tagName)
      } else {
        droppedOpenings.delete(tagName)
      }
    }
    return rewritten
  })

  // 4. Collapse redundant whitespace and line breaks.
  return cleaned.replace(/\s+/g, ' ').trim()
}
/**
 * Tool that opens a URL in a browser page obtained from ChatGPTPuppeteer and
 * returns the page's HTML after it has been reduced by cleanHTML.
 */
export class WebsiteTool extends AbstractTool {
  name = 'website'

  parameters = {
    properties: {
      url: {
        type: 'string',
        description: '要访问的网站网址'
      }
    },
    required: ['url']
  }

  /**
   * Visit `opts.url` and return its cleaned content.
   *
   * Never rejects: any failure is reported through the returned string.
   *
   * @param {object} opts tool arguments
   * @param {string} opts.url address of the page to load
   * @returns {Promise<string>} the cleaned page content prefixed with a short
   *   label, or a "failed to visit" message containing the error text
   */
  func = async function (opts) {
    const { url } = opts
    let browser
    // True when this call switched Config.headless on and therefore has to
    // switch it off again.
    let forcedHeadless = false
    try {
      // Force headless mode for this visit.
      // NOTE(review): presumably ChatGPTPuppeteer reads Config.headless when it
      // launches the browser — confirm against ../browser.js.
      if (!Config.headless) {
        Config.headless = true
        forcedHeadless = true
      }
      const puppeteer = new ChatGPTPuppeteer()
      browser = await puppeteer.getBrowser()
      const page = await browser.newPage()
      await page.goto(url, {
        waitUntil: 'networkidle2'
      })
      const html = await page.content()
      await page.close()
      return `the content of the website is:\n${cleanHTML(html)}`
    } catch (err) {
      return `failed to visit the website, error: ${err.toString()}`
    } finally {
      // Restore the shared config on every path, including navigation failures,
      // so a failed visit does not leave Config.headless switched on.
      if (forcedHeadless) {
        Config.headless = false
      }
      if (browser) {
        try {
          await browser.close()
        } catch (closeErr) {
          // Best-effort cleanup: a failure to close must not mask the result.
        }
      }
    }
  }

  description = 'Useful when you want to browse a website by url, it can be a html or api url'
}