mirror of
https://github.com/ikechan8370/chatgpt-plugin.git
synced 2025-12-17 13:57:10 +00:00
114 lines
4.1 KiB
JavaScript
114 lines
4.1 KiB
JavaScript
import { AbstractTool } from './AbstractTool.js'
|
||
import { ChatGPTAPI } from '../openai/chatgpt-api.js'
|
||
import { Config } from '../config.js'
|
||
import fetch from 'node-fetch'
|
||
import proxy from 'https-proxy-agent'
|
||
import { getMaxModelTokens } from '../common.js'
|
||
import { ChatGPTPuppeteer } from '../browser.js'
|
||
import { CustomGoogleGeminiClient } from '../../client/CustomGoogleGeminiClient.js'
|
||
|
||
/**
|
||
* Generated by GPT-4o
|
||
* @param html
|
||
* @returns {*}
|
||
*/
|
||
function cleanHTML (html) {
|
||
// 1. 移除 <style>、<script>、<link>、<head> 等无关内容
|
||
html = html.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') // 移除CSS
|
||
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') // 移除JS
|
||
.replace(/<link[^>]*>/gi, '') // 移除外部CSS文件
|
||
.replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '') // 移除整个<head>
|
||
.replace(/<!--[\s\S]*?-->/g, '') // 移除HTML注释
|
||
.replace(/<figure[^>]*>[\s\S]*?<\/figure>/gi, '') // 移除<figure>
|
||
|
||
// 2. 允许的标签列表
|
||
const allowedTags = ['title', 'meta', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img', 'video', 'audio', 'source', 'a']
|
||
|
||
// 3. 处理HTML标签,移除不在允许列表中的标签
|
||
html = html.replace(/<\/?([a-zA-Z0-9]+)(\s[^>]*)?>/g, (match, tagName, attrs) => {
|
||
tagName = tagName.toLowerCase()
|
||
if (allowedTags.includes(tagName)) {
|
||
if (tagName === 'meta') {
|
||
// 允许<meta>标签,仅保留其中的 charset, name, content
|
||
return match.replace(/<(meta)([^>]*)>/gi, (_, tag, attributes) => {
|
||
let allowedAttrs = attributes.match(/(charset|name|content)=["'][^"']+["']/gi)
|
||
return `<${tag} ${allowedAttrs ? allowedAttrs.join(' ') : ''}>`
|
||
})
|
||
} else if (tagName === 'img' || tagName === 'video' || tagName === 'audio' || tagName === 'source') {
|
||
// 仅保留 `src` 属性,并去掉 base64 编码的 `data:` 形式
|
||
return match.replace(/<(img|video|audio|source)([^>]*)>/gi, (_, tag, attributes) => {
|
||
let srcMatch = attributes.match(/\bsrc=["'](?!data:)[^"']+["']/i) // 过滤 base64
|
||
return srcMatch ? `<${tag} ${srcMatch[0]}>` : '' // 没有合法的 src 就移除整个标签
|
||
})
|
||
} else if (tagName === 'a') {
|
||
// 仅保留 `href`,并去掉 base64 `data:` 形式
|
||
return match.replace(/<a([^>]*)>/gi, (_, attributes) => {
|
||
let hrefMatch = attributes.match(/\bhref=["'](?!data:)[^"']+["']/i)
|
||
return hrefMatch ? `<a ${hrefMatch[0]}>` : '' // 没有合法的 href 就移除整个标签
|
||
})
|
||
}
|
||
return match // 其他允许的标签直接保留
|
||
}
|
||
return '' // 过滤不在允许列表中的标签
|
||
})
|
||
|
||
// 4. 移除多余的空格和换行符
|
||
html = html.replace(/\s+/g, ' ').trim()
|
||
|
||
return html
|
||
}
|
||
|
||
export class WebsiteTool extends AbstractTool {
|
||
name = 'website'
|
||
|
||
parameters = {
|
||
properties: {
|
||
url: {
|
||
type: 'string',
|
||
description: '要访问的网站网址'
|
||
}
|
||
},
|
||
required: ['url']
|
||
}
|
||
|
||
func = async function (opts) {
|
||
let { url, mode, e } = opts
|
||
let browser
|
||
try {
|
||
// let res = await fetch(url, {
|
||
// headers: {
|
||
// 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
|
||
// }
|
||
// })
|
||
// let text = await res.text()
|
||
let origin = false
|
||
if (!Config.headless) {
|
||
Config.headless = true
|
||
origin = true
|
||
}
|
||
let ppt = new ChatGPTPuppeteer()
|
||
browser = await ppt.getBrowser()
|
||
let page = await browser.newPage()
|
||
await page.goto(url, {
|
||
waitUntil: 'networkidle2'
|
||
})
|
||
let text = await page.content()
|
||
await page.close()
|
||
if (origin) {
|
||
Config.headless = false
|
||
}
|
||
text = cleanHTML(text)
|
||
return `the content of the website is:\n${text}`
|
||
} catch (err) {
|
||
return `failed to visit the website, error: ${err.toString()}`
|
||
} finally {
|
||
if (browser) {
|
||
try {
|
||
await browser.close()
|
||
} catch (err) {}
|
||
}
|
||
}
|
||
}
|
||
|
||
description = 'Useful when you want to browse a website by url, it can be a html or api url'
|
||
}
|