Fix/inner text on undefined #77

Open
wants to merge 8 commits into master
49 changes: 19 additions & 30 deletions src/modules/google.js
@@ -13,13 +13,13 @@ class GoogleScraper extends Scraper {

        const results = await this.page.evaluate(() => {

-            let _text = (el, s) => {
+            let _text = (el, s, onlyFirstTextNode) => {
                let n = el.querySelector(s);

                if (n) {
-                    return n.innerText;
+                    return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
                } else {
-                    return '';
+                    return;
                }
            };
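The new onlyFirstTextNode flag matters when an element's first text node is followed by nested markup, since innerText concatenates everything. A minimal sketch of the difference, assuming price markup in the style Google uses (the .price class and the values are hypothetical):

    let el = document.createElement('div');
    el.innerHTML = '<div class="price">$24.99<span>$49.99</span></div>';
    let n = el.querySelector('.price');
    n.innerText;               // "$24.99$49.99" -- the nested span bleeds in
    n.childNodes[0].nodeValue; // "$24.99" -- first text node only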

@@ -29,7 +29,7 @@ class GoogleScraper extends Scraper {
                if (n) {
                    return n.getAttribute(attr);
                } else {
-                    return null;
+                    return;
                }
            };
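One observable effect of returning undefined here instead of '' or null: fields whose selectors no longer match are dropped entirely when the scraped objects are serialized, rather than showing up as empty values. A short sketch:

    JSON.stringify({ price: '', merchant_link: null }); // '{"price":"","merchant_link":null}'
    JSON.stringify({ price: undefined });               // '{}' -- undefined fields are omitted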

@@ -111,14 +111,14 @@ class GoogleScraper extends Scraper {
            // parse right side product information
            results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');

-            let title_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
            if (title_el) {
-                results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
+                results.right_info.title = title_el.innerText;
            }

-            let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
            if (num_reviews_el) {
-                results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
+                results.right_info.num_reviews = num_reviews_el.innerText;
            }

            results.right_info.vendors = [];
@@ -127,20 +127,16 @@ class GoogleScraper extends Scraper {
            document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
                results.right_info.vendors.push({
                    price: _text(el, 'span:nth-of-type(1)'),
-                    merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
+                    merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
                    merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
-                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
+                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO this is not working anymore
                    source_name: _text(el, 'span:nth-child(4) a'),
                    source_link: _attr(el, 'span:nth-child(4) a', 'href'),
-                    info: _text(el, 'div span'),
-                    shipping: _text(el, 'span:last-child > span'),
+                    info: _text(el, '.SdBHnc.e2CF7c'),
+                    shipping: _text(el, '.JfwJme'),
                })
            });

-            if (!results.right_info.title) {
-                results.right_info = {};
-            }
-
            let right_side_info_el = document.getElementById('rhs');

            if (right_side_info_el) {
@@ -151,26 +147,19 @@ class GoogleScraper extends Scraper {
                }
            }

-            // parse top main column product information
-            // #tvcap .pla-unit
-            document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
+            // Parse Google Shopping top or left
+            document.querySelectorAll('.pla-unit').forEach((el) => {
                let top_product = {
                    tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
                    link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
                    title: _text(el, '.pla-unit-title a:nth-child(2) span'),
-                    price: _text(el, '.pla-unit-title + div'),
-                    shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
-                    vendor_link: _attr(el, '.pla-extensions-container div > a', 'href'),
+                    price: _text(el, '.pla-unit-title + div', true),
+                    originalPrice: _text(el, '.pla-unit-title + div > span'),
+                    shipping: _text(el, '.pla-extensions-container .cYBBsb'),
+                    vendor_link: _attr(el, '.pla-extensions-container a.FfKHB', 'href'),
+                    merchant_name: _text(el, '.LbUacb span:nth-child(1)'),
                };

-                let merchant_node = el.querySelector('.pla-unit-title');
-                if (merchant_node) {
-                    let node = merchant_node.parentNode.querySelector('div > span');
-                    if (node) {
-                        top_product.merchant_name = node.innerText;
-                    }
-                }
-
                results.top_products.push(top_product);
            });

10 changes: 8 additions & 2 deletions src/modules/se_scraper.js
@@ -31,8 +31,8 @@ module.exports = class Scraper {
        this.proxy = config.proxy;
        this.keywords = config.keywords;

-        this.STANDARD_TIMEOUT = 10000;
-        this.SOLVE_CAPTCHA_TIME = 45000;
+        this.STANDARD_TIMEOUT = config.standard_timeout;
+        this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;

        this.results = {};
        this.result_rank = 1;
@@ -272,6 +272,12 @@ module.exports = class Scraper {
            await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
        }

+        if (this.config.keep_html_on_error) {
+            const html_error = await this.page.content();
+            e.html_on_error = html_error;
+            e.lastUrl = await this.page.evaluate(() => { return window.location.href; });
+        }
+
        this.metadata.scraping_detected = await this.detected();

        if (this.metadata.scraping_detected === true) {
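A sketch of how a caller could use the two new error fields, assuming a scraper and scrape_job set up as in the test below and keep_html_on_error enabled:

    const fs = require('fs');

    try {
        await scraper.scrape(scrape_job);
    } catch (e) {
        if (e.html_on_error) {
            // persist the page that broke parsing for offline debugging
            fs.writeFileSync('failed_page.html', e.html_on_error);
            console.error('scrape failed on', e.lastUrl);
        }
    }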
3 changes: 3 additions & 0 deletions src/node_scraper.js
@@ -139,6 +139,9 @@ class ScrapeManager {
            //custom_func: resolve('examples/pluggable.js'),
            custom_func: null,
            throw_on_detection: false,
+            keep_html_on_error: false,
+            standard_timeout: 10000,
+            solve_captcha_time: 45000,
            // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
            proxies: null,
            // a file with one proxy per line. Example:
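All three new defaults can be overridden per ScrapeManager instance. A short sketch with illustrative values, mirroring the test added below:

    const se_scraper = require('se-scraper');

    const scraper = new se_scraper.ScrapeManager({
        keep_html_on_error: true,  // attach page html and last url to thrown errors
        standard_timeout: 5000,    // previously hard-coded to 10000 in se_scraper.js
        solve_captcha_time: 60000, // previously hard-coded to 45000 in se_scraper.js
    });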
108 changes: 108 additions & 0 deletions test/keep_html_on_error.js
@@ -0,0 +1,108 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('..');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount the fake search engine on both an http and an https server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('keep_html_on_error', function(){

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the keep_html_on_error option
         */
        it('keep_html_on_error single page single keyword', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test error'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                keep_html_on_error: true,
                logger: testLogger,
                //clean_html_output: false,
                //clean_data_images: false,
                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
                standard_timeout: 500,
            });
            await scraper.start();
            await assert.rejects(
                async () => {
                    await scraper.scrape(scrape_job);
                },
                (error) => {
                    assert(error.html_on_error, 'the error should contain the html output');
                    return /#fbar/.test(error.message);
                }
            );
            await scraper.quit();

        });

    });

});
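Assuming the repo's usual mocha test setup, this test can be run on its own with npx mocha test/keep_html_on_error.js.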