-
Notifications
You must be signed in to change notification settings - Fork 2
/
spider.mjs
181 lines (150 loc) · 6.47 KB
/
spider.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
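/**
 * Spider main program.
 * Watches the task directory for new tasks, scrapes the requested data
 * automatically and saves it to the data directory.
 * - Adds a retry mechanism for failed tasks.
 * - Adds reporting of failed tasks.
 * - Adds a timeout for task processing.
 */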
import getConfigs from './config.mjs';
import common from './lib/common.mjs';
import TaskMoniter from "./lib/taskMoniter.mjs";
import TaJian from "./lib/tajian.mjs";
import HeroBot from "./lib/heroBot.mjs";
import Douyin from './bot/Douyin.mjs';
import Kuaishou from './bot/Kuaishou.mjs';
import Xigua from './bot/Xigua.mjs';
import Bilibili from './bot/Bilibili.mjs';
import WebCrawler from './bot/WebCrawler.mjs';
import cron from 'node-cron';
import path from 'node:path';
/**
 * Script entry point (self-invoking async function).
 *
 * Loads the configuration (optionally from the file given as the first CLI
 * argument), starts the task monitor, schedules the scraping job and the
 * HeroUnion heart beat, and sends one heart beat immediately.
 * Any error raised during this start-up sequence is logged and the process
 * exits with status 1.
 */
(async () => {
    // Optional first CLI argument is handed to getConfigs() as the config file.
    const configFile = process.argv.length >= 3 ? process.argv[2] : '';

    // Keep publishing the configuration as a global, exactly like before;
    // presumably other modules read `global.configs` too — verify before removing.
    const configs = await getConfigs(configFile);
    global.configs = configs;

    const taskMoniter = new TaskMoniter(configs.task_list_dir);
    const tajian = new TaJian(configs.data_save_dir);
    taskMoniter.run(); // watch for new tasks

    // HeroUnion integration
    const heroUnionConfig = configs.herounion;
    const heroBot = new HeroBot(
        heroUnionConfig.server_url,
        heroUnionConfig.name,
        heroUnionConfig.description,
        heroUnionConfig.platforms,
        heroUnionConfig.contracts,
        heroUnionConfig.country,
        heroUnionConfig.lang,
        heroUnionConfig.contact,
        heroUnionConfig.data_mode
    );

    // Address of the local cloud server (see ./install_cloud.sh); empty string when not configured.
    const heroCloudServer = configs.cloud_server || '';

    /**
     * Random integer in [0, limit).
     * Math.floor is used instead of parseInt: parseInt stringifies its argument,
     * so a tiny value such as 5e-7 would be parsed as 5 instead of 0.
     * @param {number} limit exclusive upper bound
     * @returns {number}
     */
    const randomSeconds = (limit) => Math.floor(Math.random() * limit);

    /**
     * Build the bot that handles a task, selected by the bot name derived from the task url.
     * Unknown names fall back to the generic WebCrawler, so a bot is always returned.
     * @param {string} botName value returned by common.getBotName()
     * @returns {object} bot instance exposing scrap(url)
     */
    const createBot = (botName) => {
        switch (botName) {
            case 'douyin': {
                const douyin = new Douyin(heroCloudServer);
                douyin.setMode('mob'); // use mobile mode
                return douyin;
            }
            case 'kuaishou':
                return new Kuaishou(heroCloudServer);
            case 'xigua':
                return new Xigua(heroCloudServer);
            case 'bilibili':
                return new Bilibili(heroCloudServer);
            default:
                return new WebCrawler(heroCloudServer, botName);
        }
    };

    /**
     * Persist scraped data for a task and mark the task done, or failed when saving fails.
     * An exception while saving is treated the same way as a falsy save result.
     * @param {object} task task being processed (mutated: task.data is set)
     * @param {object} data result returned by bot.scrap()
     */
    const storeScrapedData = async (task, data) => {
        task.data = data; // keep the scraped data inside the task
        taskMoniter.updateTask(task.id, task);

        let saved = false;
        try {
            saved = Boolean(
                await tajian.saveUrlShortcut(task.id, data)
                && await tajian.saveDescriptionFiles(task.id, data)
            );
        } catch (error) {
            console.error('Save data of task %s failed: %s', task.id, error);
        }

        if (saved) {
            taskMoniter.notifyHandle(task);   // push the data back right away
            taskMoniter.setTaskDone(task.id); // mark the task as done
        } else {
            taskMoniter.setTaskFailed(task.id);
        }
    };

    /**
     * Bookkeeping for a failed scrape: bump the retry counter, then either put the
     * task back into the waiting state or mark it failed and report that to HeroUnion.
     * NOTE(review): the counter starts at 0 on the first failure and the task is given
     * up only when it is strictly greater than configs.max_fail_retry, which allows
     * max_fail_retry + 1 retries — confirm whether that is intended.
     * @param {object} task task being processed (mutated: task.fail_retry)
     */
    const retryOrGiveUp = async (task) => {
        if (typeof task.fail_retry === 'undefined') {
            task.fail_retry = 0;
        } else {
            task.fail_retry++;
        }
        taskMoniter.updateTask(task.id, task);

        if (task.fail_retry > configs.max_fail_retry) {
            taskMoniter.setTaskFailed(task.id);
            // Report the failure to the union. Awaited so that a rejection
            // (if this returns a promise) reaches the caller's error handling.
            await heroBot.saveTaskData(task.id, task.token, [], 'failed');
        } else {
            taskMoniter.setTaskWaiting(task.id); // back to the waiting state
        }
    };

    // spider run
    let spiderIsRunning = false;
    let lastRunTime = 0;
    let activeRunId = 0; // identifies the newest run, see the finally block below
    const taskCheckTime = 20; // look for a new task every 20 seconds

    const taskAutoRun = cron.schedule(`*/${taskCheckTime} * * * * *`, async () => {
        const currentTime = common.getTimestampInSeconds();
        // Never run two tasks at once, unless the previous one exceeded the timeout.
        if (spiderIsRunning && currentTime - lastRunTime < configs.task_do_timeout) {
            return;
        }

        // Random delay so that different spiders do not all run at the same moment.
        // The log text suggests common.delay() takes seconds — TODO confirm.
        const rndSeconds = randomSeconds(taskCheckTime);
        console.log("Sleep %s seconds before crap...", rndSeconds);
        await common.delay(rndSeconds);

        const task = taskMoniter.getNewTask();
        if (!task) {
            return;
        }

        // Mark the spider as busy.
        spiderIsRunning = true;
        lastRunTime = common.getTimestampInSeconds();
        const runId = ++activeRunId;

        try {
            const logFile = path.join(path.resolve(configs.task_log_dir), `tasks_${heroUnionConfig.name}.log`);
            await common.saveLog(logFile, JSON.stringify(task) + "\n");

            const botName = common.getBotName(task.url);
            console.log('New task %s handle by bot %s, url: %s, cloud server: %s', task.id, botName, task.url, heroCloudServer);
            const bot = createBot(botName);

            console.log('Spider craping...');
            taskMoniter.setTaskRunning(task.id);

            // A scrape that throws is handled like a scrape that returned no usable data.
            let data = null;
            try {
                data = await bot.scrap(task.url);
            } catch (error) {
                console.error('Scrap of task %s failed: %s', task.id, error);
            }

            // Loose comparison kept on purpose: the original accepted any value that is == true.
            if (data && data.done == true) {
                await storeScrapedData(task, data);
            } else {
                await retryOrGiveUp(task);
            }
        } catch (error) {
            // Without this the scheduler would swallow the rejection silently.
            console.error('Handle task %s error: %s', task.id, error);
        } finally {
            // Only the newest run may clear the flag; a run that timed out and finishes
            // late must not mark the spider idle while its successor is still working.
            if (runId === activeRunId) {
                spiderIsRunning = false;
            }
        }
    }, {
        scheduled: false
    });
    taskAutoRun.start();
    console.log('[%s] Spider started.', common.getTimeString());

    // Spider heart beat
    const heartBeatFrequence = 5; // report every 5 minutes
    const heroUnionHeartBeat = cron.schedule(`*/${heartBeatFrequence} * * * *`, async () => {
        // Random delay so that different spiders do not all report at the same moment.
        const rndSeconds = randomSeconds(60);
        console.log("Sleep %s seconds before send heart beat...", rndSeconds);
        await common.delay(rndSeconds);

        const status = spiderIsRunning ? 'busy' : 'idle';
        try {
            const res = await heroBot.heartBeat(status);
            console.log('HeroUnion bot heart beat result', res);
        } catch (error) {
            // A failed heart beat is logged; the next scheduled one will try again.
            console.error('HeroUnion bot heart beat failed: %s', error);
        }
    }, {scheduled: false});
    heroUnionHeartBeat.start();

    const heartBeatRes = await heroBot.heartBeat('idle'); // report once immediately
    console.log('[%s] HeroUnion bot heart beat started.', common.getTimeString(), heartBeatRes);
})().catch(error => {
    console.error("Spider error got:\n%s", error);
    process.exit(1);
});