feat: 为必应和claude2提供读取文件能力支持

This commit is contained in:
ikechan8370 2023-10-19 20:15:49 +08:00
parent 6c171b32a1
commit 1a95c67130
7 changed files with 369 additions and 46 deletions

View file

@ -7,7 +7,7 @@ import fetch, {
import crypto from 'crypto'
import WebSocket from 'ws'
import { Config, pureSydneyInstruction } from './config.js'
import { formatDate, getMasterQQ, isCN, getUserData } from './common.js'
import { formatDate, getMasterQQ, isCN, getUserData, limitString } from './common.js'
import delay from 'delay'
import moment from 'moment'
import { getProxy } from './proxy.js'
@ -222,8 +222,8 @@ export default class SydneyAIClient {
timeout = Config.defaultTimeoutMs,
firstMessageTimeout = Config.sydneyFirstMessageTimeout,
groupId, nickname, qq, groupName, chats, botName, masterName,
messageType = 'Chat'
messageType = 'Chat',
toSummaryFileContent
} = opts
// if (messageType === 'Chat') {
// logger.warn('该Bing账户token已被限流降级至使用非搜索模式。本次对话AI将无法使用Bing搜索返回的内容')
@ -372,6 +372,10 @@ export default class SydneyAIClient {
let maxConv = Config.maxNumUserMessagesInConversation
const currentDate = moment().format('YYYY-MM-DDTHH:mm:ssZ')
const imageDate = await this.kblobImage(opts.imageUrl)
if (toSummaryFileContent?.content) {
// message = `请不要进行搜索,用户的问题是:"${message}"`
messageType = 'Chat'
}
let argument0 = {
source: 'cib',
optionsSets,
@ -415,10 +419,12 @@ export default class SydneyAIClient {
text: message,
messageType,
userIpAddress: await generateRandomIP(),
timestamp: currentDate
timestamp: currentDate,
privacy: 'Internal'
// messageType: 'SearchQuery'
},
tone: 'Creative',
privacy: 'Internal',
conversationSignature,
participant: {
id: clientId
@ -440,7 +446,7 @@ export default class SydneyAIClient {
}
// simulates document summary function on Edge's Bing sidebar
// unknown character limit, at least up to 7k
if (groupId) {
if (groupId && !toSummaryFileContent?.content) {
context += '注意你现在正在一个qq群里和人聊天现在问你问题的人是' + `${nickname}(${qq})。`
if (Config.enforceMaster && master) {
if (qq === master) {
@ -493,6 +499,17 @@ export default class SydneyAIClient {
messageType: 'Context',
messageId: 'discover-web--page-ping-mriduna-----'
})
} else if (toSummaryFileContent?.content) {
obj.arguments[0].previousMessages.push({
author: 'user',
description: limitString(toSummaryFileContent?.content, 50000, true),
contextType: 'WebPage',
messageType: 'Context',
sourceName: toSummaryFileContent?.name,
sourceUrl: 'file:///C:/Users/turing/Downloads/Documents/' + toSummaryFileContent?.name || 'file.pdf',
// locale: 'und',
// privacy: 'Internal'
})
} else {
obj.arguments[0].previousMessages.push({
author: 'user',

71
utils/bilibili/wbi.js Normal file
View file

@ -0,0 +1,71 @@
import md5 from 'md5'
import fetch from 'node-fetch'
// Table of character indices used by getMixinKey to reorder the combined key
const mixinKeyEncTab = [
  46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
  33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
  61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
  36, 20, 34, 44, 52
]

/**
 * Scramble the character order of the concatenated imgKey + subKey.
 *
 * Picks the characters of `orig` in the order given by mixinKeyEncTab and
 * keeps the first 32 characters of the result.
 *
 * @param {string} orig imgKey and subKey joined together
 * @returns {string} the mixin key used for wbi signing
 */
function getMixinKey (orig) {
  const shuffled = mixinKeyEncTab.reduce((acc, index) => acc + orig[index], '')
  return shuffled.slice(0, 32)
}
/**
 * Sign request parameters with the wbi scheme.
 *
 * A `wts` field holding the current Unix time in seconds is added, the
 * parameters are ordered by key and URL-encoded, and `w_rid`
 * (md5 of the query string followed by the mixin key) is appended.
 * The caller's `params` object is not modified.
 *
 * @param {object} params request parameters; every value must provide toString()
 * @param {string} imgKey img_key as returned by getWbiKeys
 * @param {string} subKey sub_key as returned by getWbiKeys
 * @returns {string} signed query string ending in `&w_rid=<signature>`
 */
function encWbi (params, imgKey, subKey) {
  const mixinKey = getMixinKey(imgKey + subKey)
  const currTime = Math.round(Date.now() / 1000)
  const chrFilter = /[!'()*]/g
  // Sign a copy: writing wts into the caller's object would leak a stale
  // timestamp into any later reuse of that object
  const signedParams = { ...params, wts: currTime }
  const query = Object.keys(signedParams)
    .sort() // parameters are signed in key order
    .map((key) => {
      // strip the characters "!'()*" from the value before encoding
      const value = signedParams[key].toString().replace(chrFilter, '')
      return `${encodeURIComponent(key)}=${encodeURIComponent(value)}`
    })
    .join('&')
  const wbiSign = md5(query + mixinKey) // w_rid
  return `${query}&w_rid=${wbiSign}`
}
/**
 * Fetch the latest img_key and sub_key.
 *
 * Both keys are the file names (without directory and extension) of the
 * `wbi_img.img_url` / `wbi_img.sub_url` values in the nav response.
 *
 * @returns {Promise<{img_key: string, sub_key: string}>}
 * @throws {Error} when the HTTP request fails or the response lacks wbi_img urls
 */
async function getWbiKeys () {
  const resp = await fetch('https://api.bilibili.com/x/web-interface/nav')
  if (!resp.ok) {
    throw new Error(`get wbi keys http error: status: ${resp.status}`)
  }
  // A fetch Response has no `data` property; the body must be parsed explicitly
  const jsonContent = await resp.json()
  const wbiImg = jsonContent?.data?.wbi_img
  if (!wbiImg?.img_url || !wbiImg?.sub_url) {
    throw new Error('get wbi keys error: wbi_img is missing in the nav response')
  }
  // keep what sits between the last "/" and the last "."
  const keyFromUrl = (url) => url.slice(url.lastIndexOf('/') + 1, url.lastIndexOf('.'))
  return {
    img_key: keyFromUrl(wbiImg.img_url),
    sub_key: keyFromUrl(wbiImg.sub_url)
  }
}
// getWbiKeys().then((wbi_keys) => {
// const query = encWbi(
// {
// foo: '114',
// bar: '514',
// baz: 1919810
// },
// wbi_keys.img_key,
// wbi_keys.sub_key
// )
// console.log(query)
// })

View file

@ -186,6 +186,7 @@ export class ClaudeAIClient {
} else if (streamDataRes.status === 408) {
throw new Error('claude.ai响应超时可能是回复文本太多请调高超时时间重试')
} else {
logger.error(streamDataRes.status, streamDataRes.body)
throw new Error('unknown error')
}
}

View file

@ -1,5 +1,3 @@
// import { remark } from 'remark'
// import stripMarkdown from 'strip-markdown'
import { exec } from 'child_process'
import lodash from 'lodash'
import fs from 'node:fs'
@ -15,12 +13,26 @@ import AzureTTS, { supportConfigurations as azureRoleList } from './tts/microsof
import { translate } from './translate.js'
import uploadRecord from './uploadRecord.js'
import Version from './version.js'
// export function markdownToText (markdown) {
// return remark()
// .use(stripMarkdown)
// .processSync(markdown ?? '')
// .toString()
// }
import fetch from 'node-fetch'
/**
 * Load an optional dependency, yielding undefined when it cannot be loaded.
 * Any failure while loading is deliberately ignored: these packages are
 * optional and the readers that need them check for undefined before use.
 *
 * NOTE(review): `require` only exists in this ES module if it is created
 * elsewhere in the file (e.g. via createRequire) - confirm, otherwise every
 * lookup below silently yields undefined.
 *
 * @param {string} moduleName package name to load
 * @returns {*} the loaded module, or undefined
 */
function loadOptionalDependency (moduleName) {
  try {
    return require(moduleName)
  } catch (err) {
    return undefined
  }
}
// .pdf reader
let pdfjsLib = loadOptionalDependency('pdfjs-dist')
// .docx reader
let mammoth = loadOptionalDependency('mammoth')
// .xlsx reader
let XLSX = loadOptionalDependency('xlsx')
// .pptx reader
let PPTX = loadOptionalDependency('nodejs-pptx')
let _puppeteer
try {
@ -972,3 +984,208 @@ export function getUserSpeaker (userSetting) {
return userSetting.ttsRoleVoiceVox || Config.voicevoxTTSSpeaker
}
}
/**
 * Download a file to local storage.
 *
 * @param url link of the file to download
 * @param destPath destination path such as received/abc.pdf. An existing file with the same name is overwritten
 * @param absolute whether destPath is used as-is. Defaults to false, in which case destPath is
 *   appended to <cwd>/data/chatgpt and the parent directory is created
 * @returns {Promise<string>} location the downloaded file was stored at
 * @throws {Error} when the HTTP status is not ok, or when reading the response / writing the file fails
 */
export async function downloadFile (url, destPath, absolute = false) {
  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`download file http error: status: ${response.status}`)
  }
  let dest = destPath
  if (!absolute) {
    dest = path.join(process.cwd(), 'data', 'chatgpt', dest)
    mkdirs(path.dirname(dest))
  }
  const fileStream = fs.createWriteStream(dest)
  await new Promise((resolve, reject) => {
    const fail = (err) => {
      // release the file handle before reporting the failure
      fileStream.destroy()
      reject(err)
    }
    response.body.on('error', fail)
    // Without this listener a write failure (missing directory, no permission,
    // disk full) would surface as an unhandled stream error instead of a rejection
    fileStream.on('error', fail)
    fileStream.on('finish', () => resolve())
    response.body.pipe(fileStream)
  })
  logger.info(`File downloaded successfully! URL: ${url}, Destination: ${dest}`)
  return dest
}
/**
 * Classify a file by its extension (case-insensitive).
 *
 * @param {string} filename file name or path
 * @returns {string|false} 'text' for extensions readable as plain text, the
 *   extension without the leading dot for types that need additional
 *   processing, or false for everything else
 */
export function isPureText (filename) {
  // Extensions that can be treated as pure text
  const plainTextExts = new Set(['.txt', '.log', '.md', '.csv', '.html', '.css', '.js', '.json', '.xml', '.py', '.java', '.cpp', '.c', '.rb', '.php', '.sql', '.sh', '.pl', '.r', '.swift', '.go', '.ts', '.htm', '.yaml', '.yml', '.ini', '.properties', '.tsv'])
  // Extensions that require additional processing
  const convertibleExts = new Set(['.docx', '.pptx', '.xlsx', '.pdf', '.epub'])
  const suffix = path.extname(filename).toLowerCase()
  if (plainTextExts.has(suffix)) {
    return 'text'
  }
  // Report the bare extension when additional processing is needed
  return convertibleExts.has(suffix) ? suffix.slice(1) : false
}
/**
 * Extract the text content of a file message.
 *
 * The file type is decided by isPureText from the file name. Readable files are
 * downloaded below the "received" directory and converted to text. Reading
 * pdf / docx / xlsx / pptx relies on optional libraries; when the needed one is
 * not available an empty object is returned.
 *
 * @param fileMsgElem MessageElem of the file; its `name` and `fid` are read
 * @param e message event; `isGroup` selects `group.getFileUrl` or `friend.getFileUrl`
 * @returns {Promise<{content?: string, name?: string}>} extracted text and the
 *   original file name, or an empty object when nothing could be extracted
 */
export async function extractContentFromFile (fileMsgElem, e) {
  logger.info('filename: ' + fileMsgElem.name)
  const fileType = isPureText(fileMsgElem.name)
  if (!fileType) {
    // not a readable file type
    return {}
  }
  const name = fileMsgElem.name
  const fileUrl = e.isGroup
    ? await e.group.getFileUrl(fileMsgElem.fid)
    : await e.friend.getFileUrl(fileMsgElem.fid)
  // The file name is chosen by the sender: keep only its last path segment so
  // that a name such as "../../x.txt" cannot be written outside "received"
  const filePath = await downloadFile(fileUrl, path.join('received', path.basename(name)))
  switch (fileType) {
    case 'pdf': {
      if (!pdfjsLib) {
        logger.error('pdfjs-dist is not available, cannot read .pdf file')
        return {}
      }
      const data = new Uint8Array(fs.readFileSync(filePath))
      try {
        const pdfDocument = await pdfjsLib.getDocument(data).promise
        // limit pages to prevent OOM or LLM down
        const maxPage = 100
        const pageCount = Math.min(pdfDocument.numPages, maxPage)
        const pageTexts = []
        for (let pageNum = 1; pageNum <= pageCount; ++pageNum) {
          const page = await pdfDocument.getPage(pageNum)
          const textContent = await page.getTextContent()
          pageTexts.push(textContent.items.map(item => item.str).join(' '))
        }
        // separate pages so the last word of one page does not merge with the
        // first word of the next
        return {
          content: pageTexts.join('\n'),
          name
        }
      } catch (error) {
        logger.error('Error reading PDF file:', error)
        return {}
      }
    }
    case 'docx': {
      if (!mammoth) {
        logger.error('mammoth is not available, cannot read .docx file')
        return {}
      }
      try {
        const { value } = await mammoth.extractRawText({ path: filePath })
        return {
          content: value,
          name
        }
      } catch (error) {
        logger.error('Error reading .docx file:', error)
        return {}
      }
    }
    case 'xlsx': {
      if (!XLSX) {
        logger.error('xlsx is not available, cannot read .xlsx file')
        return {}
      }
      try {
        const workbook = XLSX.readFile(filePath)
        // only the first sheet is read
        const sheet = workbook.Sheets[workbook.SheetNames[0]]
        const rows = XLSX.utils.sheet_to_json(sheet, { header: 1 })
        // convert the 2D array to tab/newline separated text
        return {
          content: rows.map(row => row.join('\t')).join('\n'),
          name
        }
      } catch (error) {
        logger.error('Error reading .xlsx file:', error)
        return {}
      }
    }
    case 'pptx': {
      if (!PPTX) {
        logger.error('nodejs-pptx is not available, cannot read .pptx file')
        return {}
      }
      try {
        const pptx = new PPTX.Composer()
        await pptx.load(filePath)
        const presentationContent = []
        const maxSlideNumber = 60
        for (let slideNumber = 1; slideNumber <= maxSlideNumber; slideNumber++) {
          let slide
          try {
            slide = pptx.getSlide(slideNumber)
          } catch (error) {
            // slide number out of range: no more slides
            break
          }
          // collect the text of every element that carries text
          const slideContent = []
          slide.elements.forEach(element => {
            if (element.text) {
              slideContent.push(element.text)
            }
          })
          presentationContent.push(slideContent.join('\n'))
        }
        return {
          content: presentationContent.join('\n'),
          name
        }
      } catch (error) {
        logger.error('Error reading .pptx file:', error)
        return {}
      }
    }
    case 'doc':
    case 'xls':
    case 'ppt':
    case 'epub': {
      logger.error('not supported file type now')
      return {}
    }
    default: {
      // text type
      const text = String(fs.readFileSync(filePath))
      return text ? { content: text, name } : {}
    }
  }
}