mirror of
https://github.com/ikechan8370/chatgpt-plugin.git
synced 2025-12-16 13:27:08 +00:00
fix: adjust website tools; trim gemini intermediate response
This commit is contained in:
parent
41be6befec
commit
0fae49d5d1
2 changed files with 56 additions and 61 deletions
|
|
@ -6,6 +6,58 @@ import proxy from 'https-proxy-agent'
|
|||
import { getMaxModelTokens } from '../common.js'
|
||||
import { ChatGPTPuppeteer } from '../browser.js'
|
||||
import { CustomGoogleGeminiClient } from '../../client/CustomGoogleGeminiClient.js'
|
||||
|
||||
/**
 * Reduce an HTML document to a compact, mostly-textual form.
 * (The original version of this helper was generated by GPT-4o.)
 *
 * Processing steps:
 *  1. Drop whole <style>, <script>, <head> and <figure> elements, <link> tags
 *     and HTML comments, including their content.
 *  2. Strip every tag that is not on the allow-list; the text between
 *     stripped tags is kept.
 *  3. Reduce the attributes of a few allowed tags:
 *     - <meta>: only `charset`, `name` and `content` are kept; a <meta> with
 *       none of them is dropped.
 *     - <img>/<video>/<audio>/<source>: only a non-`data:` `src` is kept;
 *       without one the tag is dropped.
 *     - <a>: only a non-`data:` `href` is kept; without one the tag is dropped.
 *     When an <a>/<video>/<audio> opening tag is dropped, its closing tag is
 *     dropped as well so no orphan closing tags are left behind.
 *  4. Collapse runs of whitespace into a single space and trim the result.
 *
 * NOTE(review): step 1 removes the whole <head> element, so <title>/<meta>
 * tags only survive when they appear outside <head> — confirm that this is
 * the intended behaviour given that both are on the allow-list.
 *
 * @param {string} html raw HTML; `null`/`undefined` are treated as an empty string
 * @returns {string} the cleaned-up HTML
 */
function cleanHTML (html) {
  let output = html
  if (typeof output !== 'string') {
    output = output == null ? '' : String(output)
  }

  // 1. Remove irrelevant elements together with their content.
  //    The `(?=[\s/>])` look-ahead makes sure the tag name is complete, so
  //    `<head` does not match `<header>`, `<link` does not match `<linker>`, ...
  output = output
    .replace(/<style(?=[\s/>])[^>]*>[\s\S]*?<\/style\s*>/gi, '') // CSS
    .replace(/<script(?=[\s/>])[^>]*>[\s\S]*?<\/script\s*>/gi, '') // JS
    .replace(/<link(?=[\s/>])[^>]*>/gi, '') // external resources
    .replace(/<head(?=[\s/>])[^>]*>[\s\S]*?<\/head\s*>/gi, '') // the whole <head>
    .replace(/<!--[\s\S]*?-->/g, '') // HTML comments
    .replace(/<figure(?=[\s/>])[^>]*>[\s\S]*?<\/figure\s*>/gi, '') // <figure>

  // 2. Allow-list of tags that may stay in the output.
  const allowedTags = new Set(['title', 'meta', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img', 'video', 'audio', 'source', 'a'])
  const mediaTags = new Set(['img', 'video', 'audio', 'source'])

  // For paired tags whose opening tag may be dropped, remember (per tag name,
  // as a stack) whether each opening tag was kept, so the matching closing
  // tag can be dropped too.
  const openingKept = new Map([['a', []], ['video', []], ['audio', []]])

  // 3. Walk over every tag. The name part also accepts `-` and `:` (custom
  //    elements, namespaced tags) and the tail accepts a bare `/` so that
  //    `<br/>` is recognised as a tag as well.
  output = output.replace(/<(\/?)([a-zA-Z0-9][a-zA-Z0-9:-]*)((?:\s[^>]*)?\/?)>/g, (match, slash, rawName, attrs) => {
    const tagName = rawName.toLowerCase()
    if (!allowedTags.has(tagName)) {
      return '' // not on the allow-list: strip the tag, keep surrounding text
    }

    if (slash === '/') {
      // Closing tag: drop it only when its opening tag was dropped.
      const stack = openingKept.get(tagName)
      if (stack && stack.length > 0 && stack.pop() === false) {
        return ''
      }
      return match
    }

    if (tagName === 'meta') {
      // Keep only charset / name / content. The look-behind rejects
      // attributes such as `data-name`; the back-reference requires the
      // closing quote to be the same character as the opening quote.
      const kept = attrs.match(/(?<![\w-])(?:charset|name|content)=(["'])(?:(?!\1)[\s\S])+\1/gi)
      return kept ? `<${rawName} ${kept.join(' ')}>` : ''
    }

    if (mediaTags.has(tagName)) {
      // Keep only `src`, and skip inline base64 `data:` sources.
      const src = attrs.match(/\bsrc=(["'])(?!data:)(?:(?!\1)[\s\S])+\1/i)
      const stack = openingKept.get(tagName)
      if (stack) {
        stack.push(src !== null)
      }
      return src ? `<${rawName} ${src[0]}>` : '' // no usable src: drop the tag
    }

    if (tagName === 'a') {
      // Keep only `href`, and skip `data:` links.
      const href = attrs.match(/\bhref=(["'])(?!data:)(?:(?!\1)[\s\S])+\1/i)
      openingKept.get('a').push(href !== null)
      return href ? `<${rawName} ${href[0]}>` : '' // no usable href: drop the tag
    }

    return match // remaining allowed tags are kept unchanged
  })

  // 4. Collapse redundant whitespace and line breaks.
  return output.replace(/\s+/g, ' ').trim()
}
|
||||
|
||||
export class WebsiteTool extends AbstractTool {
|
||||
name = 'website'
|
||||
|
||||
|
|
@ -45,64 +97,7 @@ export class WebsiteTool extends AbstractTool {
|
|||
if (origin) {
|
||||
Config.headless = false
|
||||
}
|
||||
text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
|
||||
.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
|
||||
.replace(/<head\b[^<]*(?:(?!<\/head>)<[^<]*)*<\/head>/gi, '')
|
||||
.replace(/<figure\b[^<]*(?:(?!<\/figure>)<[^<]*)*<\/figure>/gi, '')
|
||||
.replace(/<path\b[^<]*(?:(?!<\/path>)<[^<]*)*<\/path>/gi, '')
|
||||
.replace(/<video\b[^<]*(?:(?!<\/video>)<[^<]*)*<\/video>/gi, '')
|
||||
.replace(/<audio\b[^<]*(?:(?!<\/audio>)<[^<]*)*<\/audio>/gi, '')
|
||||
.replace(/<img[^>]*>/gi, '')
|
||||
.replace(/<!--[\s\S]*?-->/gi, '') // 去除注释
|
||||
.replace(/<(?!\/?(title|ul|li|td|tr|thead|tbody|blockquote|h[1-6]|H[1-6])[^>]*)\w+\s+[^>]*>/gi, '') // 去除常见语音标签外的含属性标签
|
||||
.replace(/<(\w+)(\s[^>]*)?>/gi, '<$1>') // 进一步去除剩余标签的属性
|
||||
.replace(/<\/(?!\/?(title|ul|li|td|tr|thead|tbody|blockquote|h[1-6]|H[1-6])[^>]*)[a-z][a-z0-9]*>/gi, '') // 去除常见语音标签外的含属性结束标签
|
||||
.replace(/[\n\r]/gi, '') // 去除回车换行
|
||||
.replace(/\s{2}/g, '') // 多个空格只保留一个空格
|
||||
.replace('<!DOCTYPE html>', '') // 去除<!DOCTYPE>声明
|
||||
|
||||
// if (mode === 'gemini') {
|
||||
// let client = new CustomGoogleGeminiClient({
|
||||
// e,
|
||||
// userId: e?.sender?.user_id,
|
||||
// key: Config.getGeminiKey(),
|
||||
// model: Config.geminiModel,
|
||||
// baseUrl: Config.geminiBaseUrl,
|
||||
// debug: Config.debug
|
||||
// })
|
||||
// const htmlContentSummaryRes = await client.sendMessage(`去除与主体内容无关的部分,从中整理出主体内容并转换成md格式,不需要主观描述性的语言与冗余的空白行。${text}`)
|
||||
// let htmlContentSummary = htmlContentSummaryRes.text
|
||||
// return `this is the main content of website:\n ${htmlContentSummary}`
|
||||
// } else {
|
||||
// let maxModelTokens = getMaxModelTokens(Config.model)
|
||||
// text = text.slice(0, Math.min(text.length, maxModelTokens - 1600))
|
||||
// let completionParams = {
|
||||
// // model: Config.model
|
||||
// model: 'gpt-3.5-turbo-16k'
|
||||
// }
|
||||
// let api = new ChatGPTAPI({
|
||||
// apiBaseUrl: Config.openAiBaseUrl,
|
||||
// apiKey: Config.apiKey,
|
||||
// debug: false,
|
||||
// completionParams,
|
||||
// fetch: (url, options = {}) => {
|
||||
// const defaultOptions = Config.proxy
|
||||
// ? {
|
||||
// agent: proxy(Config.proxy)
|
||||
// }
|
||||
// : {}
|
||||
// const mergedOptions = {
|
||||
// ...defaultOptions,
|
||||
// ...options
|
||||
// }
|
||||
// return fetch(url, mergedOptions)
|
||||
// },
|
||||
// maxModelTokens
|
||||
// })
|
||||
// const htmlContentSummaryRes = await api.sendMessage(`去除与主体内容无关的部分,从中整理出主体内容并转换成md格式,不需要主观描述性的语言与冗余的空白行。${text}`, { completionParams })
|
||||
// let htmlContentSummary = htmlContentSummaryRes.text
|
||||
// return `this is the main content of website:\n ${htmlContentSummary}`
|
||||
// }
|
||||
text = cleanHTML(text)
|
||||
return `the content of the website is:\n${text}`
|
||||
} catch (err) {
|
||||
return `failed to visit the website, error: ${err.toString()}`
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue