From 66f42aed1c736d2d7ae8e8da80372f2fc0bef504 Mon Sep 17 00:00:00 2001 From: modood Date: Fri, 28 Feb 2020 11:16:21 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E6=B7=BB=E5=8A=A0=E6=9B=B4?= =?UTF-8?q?=E5=A4=9A=E6=8A=93=E5=8F=96=E5=BC=82=E5=B8=B8=E6=83=85=E5=86=B5?= =?UTF-8?q?=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/crawler.js | 13 ++++++++++++- lib/worker.js | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lib/crawler.js b/lib/crawler.js index 2eeb681..a154eff 100644 --- a/lib/crawler.js +++ b/lib/crawler.js @@ -35,6 +35,13 @@ exports.fetch = (host, route, regexp, codeLen) => const bufferHelper = new BufferHelper() const statusCode = res.statusCode + // 302 Move Temporarily + // 这种情况一般重试就可以了,所以视为超时统一重试处理 + if (statusCode === 302) { + res.resume() + return reject(new Error('timeout')) + } + if (statusCode !== 200) { res.resume() return reject(new Error('Request Failed. Status Code: ' + statusCode)) @@ -49,7 +56,11 @@ exports.fetch = (host, route, regexp, codeLen) => let current while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim() if (Object.keys(result).length === 0) { - return reject(new Error('Request Failed. rawData: '), rawData) + const raw = iconv.decode(bufferHelper.toBuffer(), 'UTF-8') + if (raw.includes('请开启JavaScript并刷新该页')) { + console.log('\n温馨提示:请求过于频繁已被目标网站限制,当前抓取进度已保存,请五分钟后再试...\n') + process.exit(0) + } } return resolve(result) diff --git a/lib/worker.js b/lib/worker.js index 7cc4568..39880d4 100644 --- a/lib/worker.js +++ b/lib/worker.js @@ -11,6 +11,11 @@ const limit = 100 * @datetime 2018-01-31 22:11 */ exports.fetchProvinces = async () => { + const count = await Province.count() + if (count !== 0) { + return + } + console.log('[1/1]正在抓取省级数据...') const o = await crawler.fetchProvinces() const rows = [] @@ -29,12 +34,19 @@ exports.fetchProvinces = async () => { exports.fetchCities = async () => { await exports.fetchProvinces() - const count = await Province.count() + const fetchedProvinceCode = await City.aggregate('provinceCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT) + const where = { code: { [Sequelize.Op.notIn]: fetchedProvinceCode } } + const count = await Province.count({ where }) + + if (count === 0) { + return + } + let index = 0 let hasNext = true let after while (hasNext) { - const r = await Province.paginate({ limit, after }) + const r = await Province.paginate({ where, limit, after }) const rows = [] for (let i = 0; i < r.results.length; i++) { const { dataValues: {