fix: adjust website tools; trim gemini intermediate response

This commit is contained in:
ikechan8370 2025-02-17 22:45:39 +08:00
parent 41be6befec
commit 0fae49d5d1
2 changed files with 56 additions and 61 deletions

View file

@ -6,6 +6,58 @@ import proxy from 'https-proxy-agent'
import { getMaxModelTokens } from '../common.js'
import { ChatGPTPuppeteer } from '../browser.js'
import { CustomGoogleGeminiClient } from '../../client/CustomGoogleGeminiClient.js'
/**
 * Reduce an HTML document to a compact, text-oriented subset of its markup.
 *
 * Processing steps:
 *  1. Remove the doctype, comments and whole <style>, <script>, <head> and
 *     <figure> blocks, as well as <link> tags.
 *  2. Drop every tag that is not in the allow-list (the text between the
 *     removed tags is kept).
 *  3. For allowed tags, trim attributes: <meta> keeps charset/name/content,
 *     media tags keep only a non-`data:` `src`, <a> keeps only a non-`data:`
 *     `href`. Media tags and anchors without such an attribute are removed.
 *  4. Collapse runs of whitespace into a single space and trim the result.
 *
 * NOTE(review): step 1 removes the whole <head> block, so <title>/<meta> tags
 * located inside <head> never reach the allow-list — confirm this is intended.
 *
 * @param {string} html raw HTML source
 * @returns {string} the reduced HTML
 */
function cleanHTML (html) {
  // Tags that survive filtering; everything else is stripped (content kept).
  const allowedTags = new Set(['title', 'meta', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img', 'video', 'audio', 'source', 'a'])
  const mediaTags = new Set(['img', 'video', 'audio', 'source'])

  // 1. Remove content that carries no readable information.
  //    `\b` keeps e.g. <header> from being mistaken for <head>.
  html = html.replace(/<!DOCTYPE[^>]*>/gi, '') // doctype declaration
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '') // inline CSS
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '') // inline JS
    .replace(/<link\b[^>]*>/gi, '') // external resources
    .replace(/<head\b[^>]*>[\s\S]*?<\/head>/gi, '') // the whole <head>
    .replace(/<!--[\s\S]*?-->/g, '') // HTML comments
    .replace(/<figure\b[^>]*>[\s\S]*?<\/figure>/gi, '') // <figure> blocks

  // 2./3. Visit every tag. The name pattern accepts anything up to whitespace,
  // `/` or `>`, so self-closing tags without a space (<br/>) and names that
  // contain `-` or `:` (custom elements, namespaced tags) are matched too.
  html = html.replace(/<(\/?)([a-zA-Z][^\s/>]*)([^>]*)>/g, (match, slash, rawName, attributes) => {
    const tagName = rawName.toLowerCase()
    if (!allowedTags.has(tagName)) {
      return '' // not in the allow-list
    }
    if (slash) {
      return match // closing tags of allowed elements are kept untouched
    }
    if (tagName === 'meta') {
      // Keep only charset / name / content. Each quote style is matched
      // separately so a value may contain the other kind of quote.
      const kept = attributes.match(/(charset|name|content)=(?:"[^"]+"|'[^']+')/gi)
      return `<${rawName} ${kept ? kept.join(' ') : ''}>`
    }
    if (mediaTags.has(tagName)) {
      // Keep only `src`, and only when it is not an inline `data:` URI.
      const srcMatch = attributes.match(/\bsrc=(?:"(?!data:)[^"]+"|'(?!data:)[^']+')/i)
      return srcMatch ? `<${rawName} ${srcMatch[0]}>` : '' // no usable src: drop the tag
    }
    if (tagName === 'a') {
      // Keep only `href`, and only when it is not an inline `data:` URI.
      const hrefMatch = attributes.match(/\bhref=(?:"(?!data:)[^"]+"|'(?!data:)[^']+')/i)
      return hrefMatch ? `<a ${hrefMatch[0]}>` : '' // no usable href: drop the tag
    }
    return match // other allowed tags keep their attributes
  })

  // 4. Collapse whitespace (including line breaks) and trim.
  html = html.replace(/\s+/g, ' ').trim()
  return html
}
export class WebsiteTool extends AbstractTool {
name = 'website'
@ -45,64 +97,7 @@ export class WebsiteTool extends AbstractTool {
if (origin) {
Config.headless = false
}
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
.replace(/<head\b[^<]*(?:(?!<\/head>)<[^<]*)*<\/head>/gi, '')
.replace(/<figure\b[^<]*(?:(?!<\/figure>)<[^<]*)*<\/figure>/gi, '')
.replace(/<path\b[^<]*(?:(?!<\/path>)<[^<]*)*<\/path>/gi, '')
.replace(/<video\b[^<]*(?:(?!<\/video>)<[^<]*)*<\/video>/gi, '')
.replace(/<audio\b[^<]*(?:(?!<\/audio>)<[^<]*)*<\/audio>/gi, '')
.replace(/<img[^>]*>/gi, '')
.replace(/<!--[\s\S]*?-->/gi, '') // 去除注释
.replace(/<(?!\/?(title|ul|li|td|tr|thead|tbody|blockquote|h[1-6]|H[1-6])[^>]*)\w+\s+[^>]*>/gi, '') // 去除常见语音标签外的含属性标签
.replace(/<(\w+)(\s[^>]*)?>/gi, '<$1>') // 进一步去除剩余标签的属性
.replace(/<\/(?!\/?(title|ul|li|td|tr|thead|tbody|blockquote|h[1-6]|H[1-6])[^>]*)[a-z][a-z0-9]*>/gi, '') // 去除常见语音标签外的含属性结束标签
.replace(/[\n\r]/gi, '') // 去除回车换行
.replace(/\s{2}/g, '') // 多个空格只保留一个空格
.replace('<!DOCTYPE html>', '') // 去除<!DOCTYPE>声明
// if (mode === 'gemini') {
// let client = new CustomGoogleGeminiClient({
// e,
// userId: e?.sender?.user_id,
// key: Config.getGeminiKey(),
// model: Config.geminiModel,
// baseUrl: Config.geminiBaseUrl,
// debug: Config.debug
// })
// const htmlContentSummaryRes = await client.sendMessage(`去除与主体内容无关的部分从中整理出主体内容并转换成md格式不需要主观描述性的语言与冗余的空白行。${text}`)
// let htmlContentSummary = htmlContentSummaryRes.text
// return `this is the main content of website:\n ${htmlContentSummary}`
// } else {
// let maxModelTokens = getMaxModelTokens(Config.model)
// text = text.slice(0, Math.min(text.length, maxModelTokens - 1600))
// let completionParams = {
// // model: Config.model
// model: 'gpt-3.5-turbo-16k'
// }
// let api = new ChatGPTAPI({
// apiBaseUrl: Config.openAiBaseUrl,
// apiKey: Config.apiKey,
// debug: false,
// completionParams,
// fetch: (url, options = {}) => {
// const defaultOptions = Config.proxy
// ? {
// agent: proxy(Config.proxy)
// }
// : {}
// const mergedOptions = {
// ...defaultOptions,
// ...options
// }
// return fetch(url, mergedOptions)
// },
// maxModelTokens
// })
// const htmlContentSummaryRes = await api.sendMessage(`去除与主体内容无关的部分从中整理出主体内容并转换成md格式不需要主观描述性的语言与冗余的空白行。${text}`, { completionParams })
// let htmlContentSummary = htmlContentSummaryRes.text
// return `this is the main content of website:\n ${htmlContentSummary}`
// }
text = cleanHTML(text)
return `the content of the website is:\n${text}`
} catch (err) {
return `failed to visit the website, error: ${err.toString()}`