chatgpt-plugin/utils/wordcloud/tokenizer.js
2023-04-20 22:15:55 +08:00

112 lines
3.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { Config } from '../config.js'
// Optional dependency used for keyword extraction by the word-cloud feature.
// When the package cannot be imported, `nodejieba` stays undefined and only an
// informational message is logged.
let nodejieba
try {
  ({ default: nodejieba } = await import('@node-rs/jieba'))
} catch (err) {
  logger.info('未安装nodejieba娱乐功能-词云统计不可用')
}
/**
 * Collects the chat messages a group exchanged during one calendar day and
 * turns them into a keyword frequency list (input of the word-cloud renderer).
 */
export class Tokenizer {
  /**
   * Fetch every chat message of a group that was sent during the local
   * calendar day containing `date`.
   *
   * History is paged backwards from the newest message, 20 records per
   * request, until a message older than the start of the day is reached or a
   * page brings no message that has not been seen yet.
   *
   * @param {number|string} groupId - group id handed to `Bot.pickGroup`
   * @param {Date} [date] - any moment inside the wanted day (defaults to now); never modified
   * @returns {Promise<object[]>} the day's messages, oldest first, each `seq` at most once
   * @throws {Error} when `groupId` is missing
   */
  async getTodayHistory (groupId, date = new Date()) {
    if (!groupId) {
      throw new Error('no valid group id')
    }
    // Work on copies so the caller's Date object is left untouched
    const dayStart = new Date(date)
    dayStart.setHours(0, 0, 0, 0)
    // Next local midnight; a calendar day is not always exactly 24 hours long
    const dayEnd = new Date(dayStart)
    dayEnd.setDate(dayEnd.getDate() + 1)
    const startOfSpecifiedDate = dayStart.getTime()
    const endOfSpecifiedDate = dayEnd.getTime()

    const group = Bot.pickGroup(groupId, true)
    const latestChat = await group.getChatHistory(0, 1)
    if (!latestChat?.length) {
      // The group has no history at all
      return []
    }

    const chats = []
    const seenSeqs = new Set()
    let oldest = latestChat[0]
    // Adds the unseen messages of a page and reports how many were new.
    // A page ends with the message at the cursor, which is already known,
    // so it must be de-duplicated or it would be counted twice.
    const collect = (batch) => {
      let added = 0
      for (const chat of batch) {
        if (seenSeqs.has(chat.seq)) {
          continue
        }
        seenSeqs.add(chat.seq)
        chats.push(chat)
        added++
        if (chat.time < oldest.time) {
          oldest = chat
        }
      }
      return added
    }
    collect(latestChat)

    let seq = latestChat[0].seq
    // Keep paging back while the oldest known message is not yet older than
    // the start of the day (`time` is in seconds, the bounds in milliseconds)
    while (oldest.time * 1000 >= startOfSpecifiedDate) {
      const chatHistory = await group.getChatHistory(seq, 20)
      if (!chatHistory?.length) {
        break
      }
      if (collect(chatHistory) === 0) {
        // Nothing new came back: the beginning of the available history was
        // reached (e.g. a new group, or a bot that has only just joined)
        break
      }
      seq = chatHistory[0].seq
      if (Config.debug) {
        logger.info(`拉取到${chatHistory.length}条聊天记录,当前已累计获取${chats.length}条聊天记录,继续拉...`)
      }
    }

    return chats
      .filter(chat => isTimestampInDateRange(chat.time, startOfSpecifiedDate, endOfSpecifiedDate))
      .sort((a, b) => (a.time - b.time) || (a.seq - b.seq))
  }

  /**
   * Count the keywords of today's chat messages in a group and return the
   * most frequent ones.
   *
   * Up to 10 keywords are extracted from every message after the media
   * placeholders ([图片], [表情], [动画表情], [语音]) have been removed.
   *
   * @param {number|string} groupId - group id handed to `getTodayHistory`
   * @param {number} [topK=100] - maximum number of entries to return
   * @returns {Promise<Array<[string, number]>>} `[keyword, count]` pairs sorted by
   *   descending count; equal counts keep the order of first appearance
   * @throws {Error} when nodejieba is not available, or `groupId` is missing
   */
  async getTodayKeywordTopK (groupId, topK = 100) {
    if (!nodejieba) {
      throw new Error('未安装nodejieba娱乐功能-词云统计不可用')
    }
    const chats = await this.getTodayHistory(groupId)
    logger.mark(`聊天记录拉去完成,获取到今日内${chats.length}条聊天记录,准备分词中`)
    try {
      nodejieba.load()
    } catch (err) {
      // NOTE(review): load() presumably rejects a second call once the
      // dictionary is loaded (confirm against @node-rs/jieba). A repeated call
      // must not break the statistics; a real failure surfaces in extract().
      if (Config.debug) {
        logger.info(`nodejieba.load() skipped: ${err?.message}`)
      }
    }

    const placeholders = ['[图片]', '[表情]', '[动画表情]', '[语音]']
    const chatContent = chats
      // Messages without text are treated as empty
      .map(chat => placeholders.reduce(
        (text, placeholder) => text.replaceAll(placeholder, ''),
        String(chat.raw_message ?? '')
      ))
      .flatMap(text => nodejieba.extract(text, 10))
      .map(item => item.keyword)
    if (Config.debug) {
      logger.info(chatContent)
    }

    // A Map keeps keywords such as "constructor" or "toString" from colliding
    // with properties inherited from Object.prototype
    const countMap = new Map()
    for (const keyword of chatContent) {
      countMap.set(keyword, (countMap.get(keyword) ?? 0) + 1)
    }

    logger.mark('分词统计完成,绘制词云中...')
    return [...countMap.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, topK)
  }
}
/**
 * Tell whether a timestamp given in seconds lies inside the half-open
 * millisecond interval [startOfSpecifiedDate, endOfSpecifiedDate).
 *
 * @param {number} timestamp - point in time in seconds; any falsy value yields false
 * @param {number} startOfSpecifiedDate - inclusive lower bound in milliseconds
 * @param {number} endOfSpecifiedDate - exclusive upper bound in milliseconds
 * @returns {boolean} true when the timestamp falls inside the interval
 */
function isTimestampInDateRange (timestamp, startOfSpecifiedDate, endOfSpecifiedDate) {
  if (timestamp) {
    // Convert seconds to milliseconds before comparing with the bounds
    const millis = timestamp * 1000
    return startOfSpecifiedDate <= millis && millis < endOfSpecifiedDate
  }
  return false
}