chatgpt-plugin/utils/tools/WebsiteTool.js

import { AbstractTool } from './AbstractTool.js'
import { ChatGPTAPI } from '../openai/chatgpt-api.js'
import { Config } from '../config.js'
import fetch from 'node-fetch'
import proxy from 'https-proxy-agent'
import { getMaxModelTokens } from '../common.js'
import { ChatGPTPuppeteer } from '../browser.js'
import { CustomGoogleGeminiClient } from '../../client/CustomGoogleGeminiClient.js'

/**
 * Generated by GPT-4o
 * @param html
 * @returns {*}
 */
function cleanHTML (html) {
  // 1. 移除 <style>、<script>、<link>、<head> 等无关内容
  html = html.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') // 移除CSS
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') // 移除JS
    .replace(/<link[^>]*>/gi, '') // 移除外部CSS文件
    .replace(/<head[^>]*>[\s\S]*?<\/head>/gi, '') // 移除整个<head>
    .replace(/<!--[\s\S]*?-->/g, '') // 移除HTML注释
    .replace(/<figure[^>]*>[\s\S]*?<\/figure>/gi, '') // 移除<figure>

  // 2. 允许的标签列表
  const allowedTags = ['title', 'meta', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'img', 'video', 'audio', 'source', 'a']

  // 3. 处理HTML标签，移除不在允许列表中的标签
  html = html.replace(/<\/?([a-zA-Z0-9]+)(\s[^>]*)?>/g, (match, tagName, attrs) => {
    tagName = tagName.toLowerCase()
    if (allowedTags.includes(tagName)) {
      if (tagName === 'meta') {
        // 允许<meta>标签，仅保留其中的 charset, name, content
        return match.replace(/<(meta)([^>]*)>/gi, (_, tag, attributes) => {
          let allowedAttrs = attributes.match(/(charset|name|content)=["'][^"']+["']/gi)
          return `<${tag} ${allowedAttrs ? allowedAttrs.join(' ') : ''}>`
        })
      } else if (tagName === 'img' || tagName === 'video' || tagName === 'audio' || tagName === 'source') {
        // 仅保留 `src` 属性，并去掉 base64 编码的 `data:` 形式
        return match.replace(/<(img|video|audio|source)([^>]*)>/gi, (_, tag, attributes) => {
          let srcMatch = attributes.match(/\bsrc=["'](?!data:)[^"']+["']/i) // 过滤 base64
          return srcMatch ? `<${tag} ${srcMatch[0]}>` : '' // 没有合法的 src 就移除整个标签
        })
      } else if (tagName === 'a') {
        // 仅保留 `href`，并去掉 base64 `data:` 形式
        return match.replace(/<a([^>]*)>/gi, (_, attributes) => {
          let hrefMatch = attributes.match(/\bhref=["'](?!data:)[^"']+["']/i)
          return hrefMatch ? `<a ${hrefMatch[0]}>` : '' // 没有合法的 href 就移除整个标签
        })
      }
      return match // 其他允许的标签直接保留
    }
    return '' // 过滤不在允许列表中的标签
  })

  // 4. 移除多余的空格和换行符
  html = html.replace(/\s+/g, ' ').trim()

  return html
}

export class WebsiteTool extends AbstractTool {
  name = 'website'

  parameters = {
    properties: {
      url: {
        type: 'string',
        description: '要访问的网站网址'
      }
    },
    required: ['url']
  }

  func = async function (opts) {
    let { url, mode, e } = opts
    let browser
    try {
      // let res = await fetch(url, {
      //   headers: {
      //     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
      //   }
      // })
      // let text = await res.text()
      let origin = false
      if (!Config.headless) {
        Config.headless = true
        origin = true
      }
      let ppt = new ChatGPTPuppeteer()
      browser = await ppt.getBrowser()
      let page = await browser.newPage()
      await page.goto(url, {
        waitUntil: 'networkidle2'
      })
      let text = await page.content()
      await page.close()
      if (origin) {
        Config.headless = false
      }
      text = cleanHTML(text)
      return `the content of the website is:\n${text}`
    } catch (err) {
      return `failed to visit the website, error: ${err.toString()}`
    } finally {
      if (browser) {
        try {
          await browser.close()
        } catch (err) {}
      }
    }
  }

  description = 'Useful when you want to browse a website by url, it can be a html or api url'
}