mirror of
https://github.com/ikechan8370/chatgpt-plugin.git
synced 2025-12-16 21:37:11 +00:00
feat: 为必应和claude2提供读取文件能力支持
This commit is contained in:
parent
6c171b32a1
commit
1a95c67130
7 changed files with 369 additions and 46 deletions
|
|
@ -41,6 +41,7 @@
|
||||||
* 2023-05-29 支持gpt-4 API.必应无需cookie即可对话(Sydney和自定义模式)
|
* 2023-05-29 支持gpt-4 API.必应无需cookie即可对话(Sydney和自定义模式)
|
||||||
* 2023-07 支持智能模式,机器人可以实现禁言、群名片/头衔(需给机器人管理员/群主)、分享音乐视频、主动发音频、对接ap,sr和喵喵等插件、联网搜索等,需api模式0613系列模型。智能模式所需的额外api和搜索api分别可以参考[chatgpt-plugin-extras](https://github.com/ikechan8370/chatgpt-plugin-extras) 和 [search-api](https://github.com/ikechan8370/search-api) 自行搭建,其中后者提供了一个公益版本,前者可使用[huggingface](https://huggingface.co/spaces/ikechan8370/cp-extra)部署
|
* 2023-07 支持智能模式,机器人可以实现禁言、群名片/头衔(需给机器人管理员/群主)、分享音乐视频、主动发音频、对接ap,sr和喵喵等插件、联网搜索等,需api模式0613系列模型。智能模式所需的额外api和搜索api分别可以参考[chatgpt-plugin-extras](https://github.com/ikechan8370/chatgpt-plugin-extras) 和 [search-api](https://github.com/ikechan8370/search-api) 自行搭建,其中后者提供了一个公益版本,前者可使用[huggingface](https://huggingface.co/spaces/ikechan8370/cp-extra)部署
|
||||||
* 2023-09-10 支持来自claude.ai的claude-2模型
|
* 2023-09-10 支持来自claude.ai的claude-2模型
|
||||||
|
* 2023-10-19 支持读取文件(目前适配必应模式和Claude2模式)
|
||||||
### 如果觉得这个插件有趣或者对你有帮助,请点一个star吧!
|
### 如果觉得这个插件有趣或者对你有帮助,请点一个star吧!
|
||||||
|
|
||||||
## 版本要求
|
## 版本要求
|
||||||
|
|
|
||||||
74
apps/chat.js
74
apps/chat.js
|
|
@ -26,7 +26,15 @@ import {
|
||||||
getUserReplySetting,
|
getUserReplySetting,
|
||||||
getImageOcrText,
|
getImageOcrText,
|
||||||
getImg,
|
getImg,
|
||||||
getMaxModelTokens, formatDate, generateAudio, formatDate2, mkdirs, getUin
|
getMaxModelTokens,
|
||||||
|
formatDate,
|
||||||
|
generateAudio,
|
||||||
|
formatDate2,
|
||||||
|
mkdirs,
|
||||||
|
getUin,
|
||||||
|
downloadFile,
|
||||||
|
isPureText,
|
||||||
|
extractContentFromFile
|
||||||
} from '../utils/common.js'
|
} from '../utils/common.js'
|
||||||
import { ChatGPTPuppeteer } from '../utils/browser.js'
|
import { ChatGPTPuppeteer } from '../utils/browser.js'
|
||||||
import { KeyvFile } from 'keyv-file'
|
import { KeyvFile } from 'keyv-file'
|
||||||
|
|
@ -1626,6 +1634,20 @@ export class chatgpt extends plugin {
|
||||||
logger.warn('获取群聊聊天记录失败,本次对话不携带聊天记录', err)
|
logger.warn('获取群聊聊天记录失败,本次对话不携带聊天记录', err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
let toSummaryFileContent
|
||||||
|
try {
|
||||||
|
if (e.source) {
|
||||||
|
let msgs = e.isGroup ? await e.group.getChatHistory(e.source.seq, 1) : await e.friend.getChatHistory(e.source.time, 1)
|
||||||
|
let sourceMsg = msgs[0]
|
||||||
|
let fileMsgElem = sourceMsg.message.find(msg => msg.type === 'file')
|
||||||
|
if (fileMsgElem) {
|
||||||
|
toSummaryFileContent = await extractContentFromFile(fileMsgElem, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
logger.warn('读取文件内容出错, 忽略文件内容', err)
|
||||||
|
}
|
||||||
|
opt.toSummaryFileContent = toSummaryFileContent
|
||||||
} else {
|
} else {
|
||||||
// 重新创建client,因为token可能换到别的了
|
// 重新创建client,因为token可能换到别的了
|
||||||
if (bingToken?.indexOf('=') > -1) {
|
if (bingToken?.indexOf('=') > -1) {
|
||||||
|
|
@ -1893,40 +1915,30 @@ export class chatgpt extends plugin {
|
||||||
debug: Config.debug,
|
debug: Config.debug,
|
||||||
proxy: Config.proxy
|
proxy: Config.proxy
|
||||||
})
|
})
|
||||||
let fileUrl, filename, attachments
|
let toSummaryFileContent
|
||||||
if (e.source && e.source.message === '[文件]') {
|
try {
|
||||||
if (e.isGroup) {
|
if (e.source) {
|
||||||
let source = (await e.group.getChatHistory(e.source.seq, 1))[0]
|
let msgs = e.isGroup ? await e.group.getChatHistory(e.source.seq, 1) : await e.friend.getChatHistory(e.source.time, 1)
|
||||||
let file = source.message.find(m => m.type === 'file')
|
let sourceMsg = msgs[0]
|
||||||
if (file) {
|
let fileMsgElem = sourceMsg.message.find(msg => msg.type === 'file')
|
||||||
filename = file.name
|
if (fileMsgElem) {
|
||||||
fileUrl = await e.group.getFileUrl(file.fid)
|
toSummaryFileContent = await extractContentFromFile(fileMsgElem, e)
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let source = (await e.friend.getChatHistory(e.source.time, 1))[0]
|
|
||||||
let file = source.message.find(m => m.type === 'file')
|
|
||||||
if (file) {
|
|
||||||
filename = file.name
|
|
||||||
fileUrl = await e.group.getFileUrl(file.fid)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} catch (err) {
|
||||||
|
logger.warn('读取文件内容出错, 忽略文件内容', err)
|
||||||
}
|
}
|
||||||
if (fileUrl) {
|
|
||||||
logger.info('文件地址:' + fileUrl)
|
let attachments = []
|
||||||
mkdirs('data/chatgpt/files')
|
if (toSummaryFileContent?.content) {
|
||||||
let destinationPath = 'data/chatgpt/files/' + filename
|
attachments.push({
|
||||||
const response = await fetch(fileUrl)
|
extracted_content: toSummaryFileContent.content,
|
||||||
const fileStream = fs.createWriteStream(destinationPath)
|
file_name: toSummaryFileContent.name,
|
||||||
await new Promise((resolve, reject) => {
|
file_type: 'pdf',
|
||||||
response.body.pipe(fileStream)
|
file_size: 200312,
|
||||||
response.body.on('error', (err) => {
|
totalPages: 20
|
||||||
reject(err)
|
|
||||||
})
|
|
||||||
fileStream.on('finish', () => {
|
|
||||||
resolve()
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
attachments = [await client.convertDocument(destinationPath, filename)]
|
logger.info(toSummaryFileContent.content)
|
||||||
}
|
}
|
||||||
if (conversationId) {
|
if (conversationId) {
|
||||||
return await client.sendMessage(prompt, conversationId, attachments)
|
return await client.sendMessage(prompt, conversationId, attachments)
|
||||||
|
|
|
||||||
|
|
@ -35,14 +35,18 @@
|
||||||
"ws": "^8.13.0"
|
"ws": "^8.13.0"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
|
"xlsx": "^0.18.5",
|
||||||
|
"mammoth": "^1.6.0",
|
||||||
|
"pdfjs-dist": "^3.11.174",
|
||||||
|
"nodejs-pptx": "^1.2.4",
|
||||||
"@node-rs/jieba": "^1.6.2",
|
"@node-rs/jieba": "^1.6.2",
|
||||||
|
"cycletls": "^1.0.21",
|
||||||
"jimp": "^0.22.7",
|
"jimp": "^0.22.7",
|
||||||
"node-silk": "^0.1.0",
|
"node-silk": "^0.1.0",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-recaptcha": "^3.6.8",
|
"puppeteer-extra-plugin-recaptcha": "^3.6.8",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||||
"sharp": "^0.32.3",
|
"sharp": "^0.32.3"
|
||||||
"cycletls": "^1.0.21"
|
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"ts-node": "^10.9.1",
|
"ts-node": "^10.9.1",
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import fetch, {
|
||||||
import crypto from 'crypto'
|
import crypto from 'crypto'
|
||||||
import WebSocket from 'ws'
|
import WebSocket from 'ws'
|
||||||
import { Config, pureSydneyInstruction } from './config.js'
|
import { Config, pureSydneyInstruction } from './config.js'
|
||||||
import { formatDate, getMasterQQ, isCN, getUserData } from './common.js'
|
import { formatDate, getMasterQQ, isCN, getUserData, limitString } from './common.js'
|
||||||
import delay from 'delay'
|
import delay from 'delay'
|
||||||
import moment from 'moment'
|
import moment from 'moment'
|
||||||
import { getProxy } from './proxy.js'
|
import { getProxy } from './proxy.js'
|
||||||
|
|
@ -222,8 +222,8 @@ export default class SydneyAIClient {
|
||||||
timeout = Config.defaultTimeoutMs,
|
timeout = Config.defaultTimeoutMs,
|
||||||
firstMessageTimeout = Config.sydneyFirstMessageTimeout,
|
firstMessageTimeout = Config.sydneyFirstMessageTimeout,
|
||||||
groupId, nickname, qq, groupName, chats, botName, masterName,
|
groupId, nickname, qq, groupName, chats, botName, masterName,
|
||||||
messageType = 'Chat'
|
messageType = 'Chat',
|
||||||
|
toSummaryFileContent
|
||||||
} = opts
|
} = opts
|
||||||
// if (messageType === 'Chat') {
|
// if (messageType === 'Chat') {
|
||||||
// logger.warn('该Bing账户token已被限流,降级至使用非搜索模式。本次对话AI将无法使用Bing搜索返回的内容')
|
// logger.warn('该Bing账户token已被限流,降级至使用非搜索模式。本次对话AI将无法使用Bing搜索返回的内容')
|
||||||
|
|
@ -372,6 +372,10 @@ export default class SydneyAIClient {
|
||||||
let maxConv = Config.maxNumUserMessagesInConversation
|
let maxConv = Config.maxNumUserMessagesInConversation
|
||||||
const currentDate = moment().format('YYYY-MM-DDTHH:mm:ssZ')
|
const currentDate = moment().format('YYYY-MM-DDTHH:mm:ssZ')
|
||||||
const imageDate = await this.kblobImage(opts.imageUrl)
|
const imageDate = await this.kblobImage(opts.imageUrl)
|
||||||
|
if (toSummaryFileContent?.content) {
|
||||||
|
// message = `请不要进行搜索,用户的问题是:"${message}"`
|
||||||
|
messageType = 'Chat'
|
||||||
|
}
|
||||||
let argument0 = {
|
let argument0 = {
|
||||||
source: 'cib',
|
source: 'cib',
|
||||||
optionsSets,
|
optionsSets,
|
||||||
|
|
@ -415,10 +419,12 @@ export default class SydneyAIClient {
|
||||||
text: message,
|
text: message,
|
||||||
messageType,
|
messageType,
|
||||||
userIpAddress: await generateRandomIP(),
|
userIpAddress: await generateRandomIP(),
|
||||||
timestamp: currentDate
|
timestamp: currentDate,
|
||||||
|
privacy: 'Internal'
|
||||||
// messageType: 'SearchQuery'
|
// messageType: 'SearchQuery'
|
||||||
},
|
},
|
||||||
tone: 'Creative',
|
tone: 'Creative',
|
||||||
|
privacy: 'Internal',
|
||||||
conversationSignature,
|
conversationSignature,
|
||||||
participant: {
|
participant: {
|
||||||
id: clientId
|
id: clientId
|
||||||
|
|
@ -440,7 +446,7 @@ export default class SydneyAIClient {
|
||||||
}
|
}
|
||||||
// simulates document summary function on Edge's Bing sidebar
|
// simulates document summary function on Edge's Bing sidebar
|
||||||
// unknown character limit, at least up to 7k
|
// unknown character limit, at least up to 7k
|
||||||
if (groupId) {
|
if (groupId && !toSummaryFileContent?.content) {
|
||||||
context += '注意,你现在正在一个qq群里和人聊天,现在问你问题的人是' + `${nickname}(${qq})。`
|
context += '注意,你现在正在一个qq群里和人聊天,现在问你问题的人是' + `${nickname}(${qq})。`
|
||||||
if (Config.enforceMaster && master) {
|
if (Config.enforceMaster && master) {
|
||||||
if (qq === master) {
|
if (qq === master) {
|
||||||
|
|
@ -493,6 +499,17 @@ export default class SydneyAIClient {
|
||||||
messageType: 'Context',
|
messageType: 'Context',
|
||||||
messageId: 'discover-web--page-ping-mriduna-----'
|
messageId: 'discover-web--page-ping-mriduna-----'
|
||||||
})
|
})
|
||||||
|
} else if (toSummaryFileContent?.content) {
|
||||||
|
obj.arguments[0].previousMessages.push({
|
||||||
|
author: 'user',
|
||||||
|
description: limitString(toSummaryFileContent?.content, 50000, true),
|
||||||
|
contextType: 'WebPage',
|
||||||
|
messageType: 'Context',
|
||||||
|
sourceName: toSummaryFileContent?.name,
|
||||||
|
sourceUrl: 'file:///C:/Users/turing/Downloads/Documents/' + toSummaryFileContent?.name || 'file.pdf',
|
||||||
|
// locale: 'und',
|
||||||
|
// privacy: 'Internal'
|
||||||
|
})
|
||||||
} else {
|
} else {
|
||||||
obj.arguments[0].previousMessages.push({
|
obj.arguments[0].previousMessages.push({
|
||||||
author: 'user',
|
author: 'user',
|
||||||
|
|
|
||||||
71
utils/bilibili/wbi.js
Normal file
71
utils/bilibili/wbi.js
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
import md5 from 'md5'
|
||||||
|
import fetch from 'node-fetch'
|
||||||
|
|
||||||
|
// Index permutation applied to the concatenated imgKey + subKey string.
const mixinKeyEncTab = [
  46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
  33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
  61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
  36, 20, 34, 44, 52
]

/**
 * Reorder the characters of `orig` according to `mixinKeyEncTab` and keep
 * the first 32 characters of the result.
 *
 * @param {string} orig concatenation of imgKey and subKey
 * @returns {string} the 32-character mixin key
 */
function getMixinKey (orig) {
  // Plain string concatenation is kept on purpose so that an out-of-range
  // index contributes exactly what the `+=` form would.
  const shuffled = mixinKeyEncTab.reduce((acc, index) => acc + orig[index], '')
  return shuffled.slice(0, 32)
}
|
||||||
|
|
||||||
|
/**
 * Sign request parameters with the wbi scheme.
 *
 * Adds a `wts` field (current Unix time in seconds) to `params` in place,
 * serialises all parameters sorted by key, and appends `w_rid`, the md5 of
 * the serialised query followed by the mixin key.
 *
 * @param {object} params request parameters; mutated to include `wts`
 * @param {string} imgKey
 * @param {string} subKey
 * @returns {string} the signed query string, ending in `&w_rid=<md5>`
 */
function encWbi (params, imgKey, subKey) {
  const mixinKey = getMixinKey(imgKey + subKey)
  // Characters stripped from every value before encoding: ! ' ( ) *
  const chrFilter = /[!'()*]/g

  // Add the wts field (seconds since epoch) to the caller's object
  params.wts = Math.round(Date.now() / 1000)

  // Serialise the parameters in key order
  const pairs = []
  for (const key of Object.keys(params).sort()) {
    const cleanedValue = params[key].toString().replace(chrFilter, '')
    pairs.push(`${encodeURIComponent(key)}=${encodeURIComponent(cleanedValue)}`)
  }
  const query = pairs.join('&')

  // w_rid is the md5 of the query followed by the mixin key
  const wbiSign = md5(query + mixinKey)
  return query + '&w_rid=' + wbiSign
}
|
||||||
|
|
||||||
|
/**
 * Fetch the latest img_key and sub_key from the bilibili nav endpoint.
 *
 * Each key is the file name (without extension) of the corresponding URL in
 * the response's `data.wbi_img` object.
 *
 * @returns {Promise<{img_key: string, sub_key: string}>}
 * @throws {Error} when the response carries no `data.wbi_img` URLs
 */
async function getWbiKeys () {
  const resp = await fetch('https://api.bilibili.com/x/web-interface/nav')
  // A fetch Response has no `data` property; the JSON payload must be read
  // through json().
  const jsonContent = await resp.json()
  const wbiImg = jsonContent?.data?.wbi_img
  if (!wbiImg?.img_url || !wbiImg?.sub_url) {
    throw new Error('bilibili nav response does not contain wbi_img urls')
  }

  // Take the segment between the last '/' and the last '.' of a URL
  const keyFromUrl = (url) => url.slice(url.lastIndexOf('/') + 1, url.lastIndexOf('.'))

  return {
    img_key: keyFromUrl(wbiImg.img_url),
    sub_key: keyFromUrl(wbiImg.sub_url)
  }
}
|
||||||
|
|
||||||
|
// getWbiKeys().then((wbi_keys) => {
|
||||||
|
// const query = encWbi(
|
||||||
|
// {
|
||||||
|
// foo: '114',
|
||||||
|
// bar: '514',
|
||||||
|
// baz: 1919810
|
||||||
|
// },
|
||||||
|
// wbi_keys.img_key,
|
||||||
|
// wbi_keys.sub_key
|
||||||
|
// )
|
||||||
|
// console.log(query)
|
||||||
|
// })
|
||||||
|
|
@ -186,6 +186,7 @@ export class ClaudeAIClient {
|
||||||
} else if (streamDataRes.status === 408) {
|
} else if (streamDataRes.status === 408) {
|
||||||
throw new Error('claude.ai响应超时,可能是回复文本太多,请调高超时时间重试')
|
throw new Error('claude.ai响应超时,可能是回复文本太多,请调高超时时间重试')
|
||||||
} else {
|
} else {
|
||||||
|
logger.error(streamDataRes.status, streamDataRes.body)
|
||||||
throw new Error('unknown error')
|
throw new Error('unknown error')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
233
utils/common.js
233
utils/common.js
|
|
@ -1,5 +1,3 @@
|
||||||
// import { remark } from 'remark'
|
|
||||||
// import stripMarkdown from 'strip-markdown'
|
|
||||||
import { exec } from 'child_process'
|
import { exec } from 'child_process'
|
||||||
import lodash from 'lodash'
|
import lodash from 'lodash'
|
||||||
import fs from 'node:fs'
|
import fs from 'node:fs'
|
||||||
|
|
@ -15,12 +13,26 @@ import AzureTTS, { supportConfigurations as azureRoleList } from './tts/microsof
|
||||||
import { translate } from './translate.js'
|
import { translate } from './translate.js'
|
||||||
import uploadRecord from './uploadRecord.js'
|
import uploadRecord from './uploadRecord.js'
|
||||||
import Version from './version.js'
|
import Version from './version.js'
|
||||||
// export function markdownToText (markdown) {
|
import fetch from 'node-fetch'
|
||||||
// return remark()
|
let pdfjsLib
|
||||||
// .use(stripMarkdown)
|
try {
|
||||||
// .processSync(markdown ?? '')
|
pdfjsLib = require('pdfjs-dist')
|
||||||
// .toString()
|
} catch (err) {}
|
||||||
// }
|
|
||||||
|
let mammoth
|
||||||
|
try {
|
||||||
|
mammoth = require('mammoth')
|
||||||
|
} catch (err) {}
|
||||||
|
|
||||||
|
let XLSX
|
||||||
|
try {
|
||||||
|
XLSX = require('xlsx')
|
||||||
|
} catch (err) {}
|
||||||
|
|
||||||
|
let PPTX
|
||||||
|
try {
|
||||||
|
PPTX = require('nodejs-pptx')
|
||||||
|
} catch (err) {}
|
||||||
|
|
||||||
let _puppeteer
|
let _puppeteer
|
||||||
try {
|
try {
|
||||||
|
|
@ -972,3 +984,208 @@ export function getUserSpeaker (userSetting) {
|
||||||
return userSetting.ttsRoleVoiceVox || Config.voicevoxTTSSpeaker
|
return userSetting.ttsRoleVoiceVox || Config.voicevoxTTSSpeaker
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Download a file over HTTP and store it on disk.
 *
 * @param url URL of the file to download
 * @param destPath target path, e.g. received/abc.pdf. An existing file with
 *   the same name is overwritten.
 * @param absolute whether destPath is used as given. Defaults to false, in
 *   which case destPath is resolved under <cwd>/data/chatgpt and its parent
 *   directory is created via mkdirs.
 * @returns {Promise<string>} the path the file was finally written to
 * @throws {Error} when the HTTP response is not ok, or when reading the
 *   response body or writing the file fails
 */
export async function downloadFile (url, destPath, absolute = false) {
  const response = await fetch(url)
  if (!response.ok) {
    throw new Error(`download file http error: status: ${response.status}`)
  }
  let dest = destPath
  if (!absolute) {
    dest = path.join(process.cwd(), 'data', 'chatgpt', dest)
    mkdirs(path.dirname(dest))
  }
  const fileStream = fs.createWriteStream(dest)
  await new Promise((resolve, reject) => {
    response.body.on('error', err => {
      // Release the file handle; pipe() does not close the destination when
      // the source fails.
      fileStream.destroy()
      reject(err)
    })
    // Without this handler a write failure (disk full, permission denied,
    // invalid path) would leave the promise pending forever.
    fileStream.on('error', reject)
    fileStream.on('finish', () => resolve())
    response.body.pipe(fileStream)
  })
  logger.info(`File downloaded successfully! URL: ${url}, Destination: ${dest}`)
  return dest
}
|
||||||
|
|
||||||
|
/**
 * Classify a file by its extension (case-insensitive).
 *
 * @param {string} filename file name or path
 * @returns {string|false} 'text' for extensions readable as plain text, the
 *   extension without its dot for formats that need a dedicated parser
 *   (docx, pptx, xlsx, pdf, epub), or false for anything else
 */
export function isPureText (filename) {
  // Extensions whose content can be read directly as text
  const plainTextExts = ['.txt', '.log', '.md', '.csv', '.html', '.css', '.js', '.json', '.xml', '.py', '.java', '.cpp', '.c', '.rb', '.php', '.sql', '.sh', '.pl', '.r', '.swift', '.go', '.ts', '.htm', '.yaml', '.yml', '.ini', '.properties', '.tsv']
  // Extensions that need format-specific extraction
  const parserExts = ['.docx', '.pptx', '.xlsx', '.pdf', '.epub']

  const extension = path.extname(filename).toLowerCase()
  if (plainTextExts.includes(extension)) {
    return 'text'
  }
  if (!parserExts.includes(extension)) {
    return false
  }
  // Drop the leading dot, e.g. '.pdf' -> 'pdf'
  return extension.slice(1)
}
|
||||||
|
|
||||||
|
/**
 * Extract the text content of a file attached to a chat message.
 *
 * The file type is decided by isPureText from the file name. Supported files
 * are downloaded to data/chatgpt/received/ and then parsed: pdf via
 * pdfjs-dist, docx via mammoth, xlsx via xlsx (first sheet only), pptx via
 * nodejs-pptx; every other supported extension is read as plain text.
 *
 * @param fileMsgElem file message element; `name` and `fid` are read
 * @param e message event; `isGroup` selects `e.group` or `e.friend` for
 *   resolving the download URL
 * @returns {Promise<{content?: string, name?: string}|undefined>}
 *   `{ content, name }` on success; an empty object when the parser library
 *   is not installed, the format is not supported yet, parsing failed or the
 *   file is empty; undefined when the extension is not a readable type
 */
export async function extractContentFromFile (fileMsgElem, e) {
  logger.info('filename: ' + fileMsgElem.name)
  const fileType = isPureText(fileMsgElem.name)
  if (!fileType) {
    // Not a readable file type
    return undefined
  }

  const fileUrl = e.isGroup ? await e.group.getFileUrl(fileMsgElem.fid) : await e.friend.getFileUrl(fileMsgElem.fid)
  // The file name is chosen by the sender. Keep only its last path segment
  // so a name such as "../../x.js" cannot be written outside received/.
  const safeName = path.basename(fileMsgElem.name)
  const filePath = await downloadFile(fileUrl, path.join('received', safeName))

  switch (fileType) {
    case 'pdf': {
      if (!pdfjsLib) {
        return {}
      }
      const data = new Uint8Array(fs.readFileSync(filePath))
      const loadingTask = pdfjsLib.getDocument(data)
      try {
        const pdfDocument = await loadingTask.promise
        // limit pages to prevent OOM or LLM down
        const maxPage = 100
        const lastPage = Math.min(pdfDocument.numPages, maxPage)
        const pages = []
        // Iterate through each page and extract text
        for (let pageNum = 1; pageNum <= lastPage; ++pageNum) {
          const page = await pdfDocument.getPage(pageNum)
          const textContent = await page.getTextContent()
          pages.push(textContent.items.map(item => item.str).join(' '))
        }
        return {
          // Separate pages so the last word of one page does not fuse with
          // the first word of the next.
          content: pages.join('\n'),
          name: fileMsgElem.name
        }
      } catch (error) {
        logger.error('Error reading PDF file:', error)
        return {}
      }
    }
    case 'docx': {
      if (!mammoth) {
        return {}
      }
      try {
        const { value } = await mammoth.extractRawText({ path: filePath })
        return {
          content: value,
          name: fileMsgElem.name
        }
      } catch (error) {
        logger.error('Error reading .docx file:', error)
        return {}
      }
    }
    case 'xlsx': {
      if (!XLSX) {
        return {}
      }
      try {
        const workbook = XLSX.readFile(filePath)
        // Only the first sheet is read
        const sheet = workbook.Sheets[workbook.SheetNames[0]]
        const rows = XLSX.utils.sheet_to_json(sheet, { header: 1 })
        return {
          // Convert the 2D array to tab/newline separated plain text
          content: rows.map(row => row.join('\t')).join('\n'),
          name: fileMsgElem.name
        }
      } catch (error) {
        logger.error('Error reading .xlsx file:', error)
        return {}
      }
    }
    case 'pptx': {
      if (!PPTX) {
        return {}
      }
      try {
        const pptx = new PPTX.Composer()
        await pptx.load(filePath)
        const presentationContent = []
        const maxSlideNumber = 60
        for (let slideNumber = 1; slideNumber <= maxSlideNumber; slideNumber++) {
          let slide
          try {
            slide = pptx.getSlide(slideNumber)
          } catch (error) {
            // Slide number out of range, stop reading
            break
          }
          // Collect the text of every slide element that has any
          const slideContent = []
          slide.elements.forEach(element => {
            if (element.text) {
              slideContent.push(element.text)
            }
          })
          presentationContent.push(slideContent.join('\n'))
        }
        return {
          content: presentationContent.join('\n'),
          name: fileMsgElem.name
        }
      } catch (error) {
        logger.error('Error reading .pptx file:', error)
        return {}
      }
    }
    case 'doc':
    case 'xls':
    case 'ppt':
    case 'epub': {
      // Every unsupported format yields the same empty object so callers can
      // always test `result.content`.
      logger.error('not supported file type now')
      return {}
    }
    default: {
      // text type
      const text = String(fs.readFileSync(filePath))
      if (text) {
        return {
          content: text,
          name: fileMsgElem.name
        }
      }
    }
  }
  return {}
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue