chatgpt-plugin/utils/wordcloud/tokenizer.js
2024-03-09 23:37:47 +08:00

231 lines
7.9 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { Config } from '../config.js'
import fs from 'fs'
import nodejieba from '@node-rs/jieba'
class Tokenizer {
async getHistory (e, groupId, date = new Date(), duration = 0, userId) {
if (!groupId) {
throw new Error('no valid group id')
}
let group = e.bot.pickGroup(groupId, true)
let latestChat = await group.getChatHistory(0, 1)
let seq = latestChat[0].seq
let chats = latestChat
function compareByTime (a, b) {
const timeA = a.time
const timeB = b.time
if (timeA < timeB) {
return -1
}
if (timeA > timeB) {
return 1
}
return 0
}
// Get the current timestamp
let currentTime = date.getTime()
// Step 2: Set the hours, minutes, seconds, and milliseconds to 0
date.setHours(0, 0, 0, 0)
// Step 3: Calculate the timestamp representing the start of the specified date
// duration represents the number of hours to go back
// if duration is 0, keeping the original date (start of today)
let startOfSpecifiedDate = date.getTime()
// if duration > 0, go back to the specified number of hours
if (duration > 0) {
// duration should be in range [0, 24]
// duration = Math.min(duration, 24)
startOfSpecifiedDate = currentTime - (duration * 60 * 60 * 1000)
}
// Step 4: Get the end of the specified date by current time
const endOfSpecifiedDate = currentTime
while (isTimestampInDateRange(chats[0]?.time, startOfSpecifiedDate, endOfSpecifiedDate) &&
isTimestampInDateRange(chats[chats.length - 1]?.time, startOfSpecifiedDate, endOfSpecifiedDate)) {
let chatHistory = await group.getChatHistory(seq, 20)
if (chatHistory.length === 1) {
if (chats[0].seq === chatHistory[0].seq) {
// 昨天没有聊天记录 比如新建的群 新进群的机器人 会卡在某一条
break
}
}
chats.push(...chatHistory)
chats.sort(compareByTime)
seq = chatHistory?.[0]?.seq
if (!seq) {
break
}
if (Config.debug) {
logger.info(`拉取到${chatHistory.length}条聊天记录,当前已累计获取${chats.length}条聊天记录,继续拉...`)
}
}
chats = chats.filter(chat => isTimestampInDateRange(chat.time, startOfSpecifiedDate, endOfSpecifiedDate))
if (userId) {
chats = chats.filter(chat => chat.sender.user_id === userId)
}
return chats
}
async getKeywordTopK (e, groupId, topK = 100, duration = 0, userId) {
if (!nodejieba) {
throw new Error('未安装node-rs/jieba娱乐功能-词云统计不可用')
}
if (!this.loaded) {
nodejieba.load()
this.loaded = true
}
// duration represents the number of hours to go back, should in range [0, 24]
let chats = await this.getHistory(e, groupId, new Date(), duration, userId)
let durationStr = duration > 0 ? `${duration}小时` : '今日'
logger.mark(`聊天记录拉取完成,获取到${durationStr}${chats.length}条聊天记录,准备分词中`)
const _path = process.cwd()
let stopWordsPath = `${_path}/plugins/chatgpt-plugin/utils/wordcloud/cn_stopwords.txt`
const data = fs.readFileSync(stopWordsPath)
const stopWords = String(data)?.split('\n') || []
let chatContent = chats
.map(c => c.message
// 只统计文本内容
.filter(item => item.type == 'text')
.map(textItem => `${textItem.text}`)
.join('').trim()
)
.map(c => {
// let length = c.length
let threshold = 2
// if (length < 100 && length > 50) {
// threshold = 6
// } else if (length <= 50 && length > 25) {
// threshold = 3
// } else if (length <= 25) {
// threshold = 2
// }
return nodejieba.extract(c, threshold)
})
.reduce((acc, curr) => acc.concat(curr), [])
.map(c => c.keyword)
.filter(c => stopWords.indexOf(c) < 0)
if (Config.debug) {
logger.info(chatContent)
}
const countMap = {}
for (const value of chatContent) {
if (countMap[value]) {
countMap[value]++
} else {
countMap[value] = 1
}
}
let list = Object.keys(countMap).map(k => {
return [k, countMap[k]]
})
function compareByFrequency (a, b) {
const freA = a[1]
const freB = b[1]
if (freA < freB) {
return 1
}
if (freA > freB) {
return -1
}
return 0
}
logger.mark('分词统计完成,绘制词云中...')
return list.filter(s => s[1] > 2).sort(compareByFrequency).slice(0, topK)
}
}
class ShamrockTokenizer extends Tokenizer {
async getHistory (e, groupId, date = new Date(), duration = 0, userId) {
logger.mark('当前使用Shamrock适配器')
if (!groupId) {
throw new Error('no valid group id')
}
let group = e.bot.pickGroup(groupId, true)
// 直接加大力度
let pageSize = 500
let chats = (await group.getChatHistory(0, pageSize, false)) || []
// Get the current timestamp
let currentTime = date.getTime()
// Step 2: Set the hours, minutes, seconds, and milliseconds to 0
date.setHours(0, 0, 0, 0)
// Step 3: Calculate the timestamp representing the start of the specified date
// duration represents the number of hours to go back
// if duration is 0, keeping the original date (start of today)
let startOfSpecifiedDate = date.getTime()
// if duration > 0, go back to the specified number of hours
if (duration > 0) {
// duration should be in range [0, 24]
// duration = Math.min(duration, 24)
startOfSpecifiedDate = currentTime - (duration * 60 * 60 * 1000)
}
// Step 4: Get the end of the specified date by currentTime
const endOfSpecifiedDate = currentTime
let cursor = chats.length
// -------------------------------------------------------
// | | |
// -------------------------------------------------------
// ^ ^
// long ago cursor+pageSize cursor current
while (isTimestampInDateRange(chats[0]?.time, startOfSpecifiedDate, endOfSpecifiedDate)) {
// 由于Shamrock消息是从最新的开始拉结束时由于动态更新一旦有人发送消息就会立刻停止所以不判断结束时间
// 拉到后面会巨卡所以增大page减少次数
pageSize = Math.floor(Math.max(cursor / 2, pageSize))
cursor = cursor + pageSize
let retries = 3
let chatHistory
while (retries >= 0) {
try {
chatHistory = await group.getChatHistory(0, cursor, false)
break
} catch (err) {
if (retries === 0) {
logger.error(err)
}
retries--
}
}
if (retries < 0) {
logger.warn('拉不动了,就这样吧')
break
}
if (chatHistory.length === 1) {
break
}
if (chatHistory.length === chats.length) {
// 没有了!再拉也没有了
break
}
let oldLength = chats.length
chats = chatHistory
// chats.sort(compareByTime)
if (Config.debug) {
logger.info(`拉取到${chats.length - oldLength}条聊天记录,当前已累计获取${chats.length}条聊天记录,继续拉...`)
}
}
chats = chats.filter(chat => isTimestampInDateRange(chat.time, startOfSpecifiedDate, endOfSpecifiedDate))
if (userId) {
chats = chats.filter(chat => chat.sender.user_id === userId)
}
return chats
}
}
function isTimestampInDateRange (timestamp, startOfSpecifiedDate, endOfSpecifiedDate) {
if (!timestamp) {
return false
}
timestamp = timestamp * 1000
// Step 5: Compare the given timestamp with the start and end of the specified date
return timestamp >= startOfSpecifiedDate && timestamp < endOfSpecifiedDate
}
export default {
default: new Tokenizer(),
shamrock: new ShamrockTokenizer()
}