Fix/inner text on undefined #77

Open
wants to merge 8 commits into master
49 changes: 19 additions & 30 deletions src/modules/google.js
@@ -13,13 +13,13 @@ class GoogleScraper extends Scraper {

        const results = await this.page.evaluate(() => {

-            let _text = (el, s) => {
+            let _text = (el, s, onlyFirstTextNode) => {
                let n = el.querySelector(s);

                if (n) {
-                    return n.innerText;
+                    return (onlyFirstTextNode) ? n.childNodes[0].nodeValue : n.innerText;
                } else {
-                    return '';
+                    return;
                }
            };
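The new onlyFirstTextNode flag matters when an element's first text node is followed by nested markup, since innerText concatenates everything. A minimal sketch of the difference, assuming price markup in the style Google uses (the .price class and the values are hypothetical):

    let el = document.createElement('div');
    el.innerHTML = '<div class="price">$24.99<span>$49.99</span></div>';
    let n = el.querySelector('.price');
    n.innerText;               // "$24.99$49.99" -- the nested span bleeds in
    n.childNodes[0].nodeValue; // "$24.99" -- first text node only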

@@ -29,7 +29,7 @@ class GoogleScraper extends Scraper {
                if (n) {
                    return n.getAttribute(attr);
                } else {
-                    return null;
+                    return;
                }
            };
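One observable effect of returning undefined here instead of '' or null: fields whose selectors no longer match are dropped entirely when the scraped objects are serialized, rather than showing up as empty values. A short sketch:

    JSON.stringify({ price: '', merchant_link: null }); // '{"price":"","merchant_link":null}'
    JSON.stringify({ price: undefined });               // '{}' -- undefined fields are omitted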

@@ -111,14 +111,14 @@ class GoogleScraper extends Scraper {
            // parse right side product information
            results.right_info.review = _attr(document, '#rhs .cu-container g-review-stars span', 'aria-label');

-            let title_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let title_el = document.querySelector('#rhs .cu-container .Q7Oxbd');
            if (title_el) {
-                results.right_info.review.title = title_el.parentNode.querySelector('div:first-child').innerText;
+                results.right_info.title = title_el.innerText;
            }

-            let num_reviews_el = document.querySelector('#rhs .cu-container g-review-stars');
+            let num_reviews_el = document.querySelector('#rhs .cu-container .PGDKUd');
            if (num_reviews_el) {
-                results.right_info.num_reviews = num_reviews_el.parentNode.querySelector('div:nth-of-type(2)').innerText;
+                results.right_info.num_reviews = num_reviews_el.innerText;
            }

            results.right_info.vendors = [];
@@ -127,20 +127,16 @@ class GoogleScraper extends Scraper {
            document.querySelectorAll('#rhs .cu-container .rhsvw > div > div:nth-child(4) > div > div:nth-child(3) > div').forEach((el) => {
                results.right_info.vendors.push({
                    price: _text(el, 'span:nth-of-type(1)'),
-                    merchant_name: _text(el, 'span:nth-child(3) a:nth-child(2)'),
+                    merchant_name: _text(el, '.doUe3s0oL2B__jackpot-merchant a'),
                    merchant_ad_link: _attr(el, 'span:nth-child(3) a:first-child', 'href'),
-                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'),
+                    merchant_link: _attr(el, 'span:nth-child(3) a:nth-child(2)', 'href'), // TODO this is not working anymore
                    source_name: _text(el, 'span:nth-child(4) a'),
                    source_link: _attr(el, 'span:nth-child(4) a', 'href'),
-                    info: _text(el, 'div span'),
-                    shipping: _text(el, 'span:last-child > span'),
+                    info: _text(el, '.SdBHnc.e2CF7c'),
+                    shipping: _text(el, '.JfwJme'),
                })
            });

-            if (!results.right_info.title) {
-                results.right_info = {};
-            }
-
            let right_side_info_el = document.getElementById('rhs');

            if (right_side_info_el) {
@@ -151,26 +147,19 @@ class GoogleScraper extends Scraper {
                }
            }

-            // parse top main column product information
-            // #tvcap .pla-unit
-            document.querySelectorAll('#tvcap .pla-unit').forEach((el) => {
+            // Parse Google Shopping top or left
+            document.querySelectorAll('.pla-unit').forEach((el) => {
                let top_product = {
                    tracking_link: _attr(el, '.pla-unit-title a:first-child', 'href'),
                    link: _attr(el, '.pla-unit-title a:nth-child(2)', 'href'),
                    title: _text(el, '.pla-unit-title a:nth-child(2) span'),
-                    price: _text(el, '.pla-unit-title + div'),
-                    shipping: _text(el, '.pla-extensions-container div:nth-of-type(1)'),
-                    vendor_link: _attr(el, '.pla-extensions-container div > a', 'href'),
+                    price: _text(el, '.pla-unit-title + div', true),
+                    originalPrice: _text(el, '.pla-unit-title + div > span'),
+                    shipping: _text(el, '.pla-extensions-container .cYBBsb'),
+                    vendor_link: _attr(el, '.pla-extensions-container a.FfKHB', 'href'),
+                    merchant_name: _text(el, '.LbUacb span:nth-child(1)'),
                };

-                let merchant_node = el.querySelector('.pla-unit-title');
-                if (merchant_node) {
-                    let node = merchant_node.parentNode.querySelector('div > span');
-                    if (node) {
-                        top_product.merchant_name = node.innerText;
-                    }
-                }
-
                results.top_products.push(top_product);
            });

10 changes: 8 additions & 2 deletions src/modules/se_scraper.js
@@ -31,8 +31,8 @@ module.exports = class Scraper {
        this.proxy = config.proxy;
        this.keywords = config.keywords;

-        this.STANDARD_TIMEOUT = 10000;
-        this.SOLVE_CAPTCHA_TIME = 45000;
+        this.STANDARD_TIMEOUT = config.standard_timeout;
+        this.SOLVE_CAPTCHA_TIME = config.solve_captcha_time;

        this.results = {};
        this.result_rank = 1;
@@ -272,6 +272,12 @@ module.exports = class Scraper {
            await this.page.screenshot({ path: `debug_se_scraper_${this.config.search_engine_name}_${keyword}.png` });
        }

+        if (this.config.keep_html_on_error) {
+            const html_error = await this.page.content();
+            e.html_on_error = html_error;
+            e.lastUrl = await this.page.evaluate(() => { return window.location.href; });
+        }
+
        this.metadata.scraping_detected = await this.detected();

        if (this.metadata.scraping_detected === true) {
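A sketch of how a caller could use the two new error fields, assuming a scraper and scrape_job set up as in the test below and keep_html_on_error enabled:

    const fs = require('fs');

    try {
        await scraper.scrape(scrape_job);
    } catch (e) {
        if (e.html_on_error) {
            // persist the page that broke parsing for offline debugging
            fs.writeFileSync('failed_page.html', e.html_on_error);
            console.error('scrape failed on', e.lastUrl);
        }
    }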
3 changes: 3 additions & 0 deletions src/node_scraper.js
@@ -139,6 +139,9 @@ class ScrapeManager {
            //custom_func: resolve('examples/pluggable.js'),
            custom_func: null,
            throw_on_detection: false,
+            keep_html_on_error: false,
+            standard_timeout: 10000,
+            solve_captcha_time: 45000,
            // List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
            proxies: null,
            // a file with one proxy per line. Example:
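All three new defaults can be overridden per ScrapeManager instance. A short sketch with illustrative values, mirroring the test added below:

    const se_scraper = require('se-scraper');

    const scraper = new se_scraper.ScrapeManager({
        keep_html_on_error: true,  // attach page html and last url to thrown errors
        standard_timeout: 5000,    // previously hard-coded to 10000 in se_scraper.js
        solve_captcha_time: 60000, // previously hard-coded to 45000 in se_scraper.js
    });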
108 changes: 108 additions & 0 deletions test/keep_html_on_error.js
@@ -0,0 +1,108 @@
'use strict';
const express = require('express');
const { createLogger, transports } = require('winston');
const http = require('http');
const https = require('https');
const assert = require('assert');
const path = require('path');
const keyCert = require('key-cert');
const Promise = require('bluebird');
const Proxy = require('http-mitm-proxy');

const debug = require('debug')('se-scraper:test');
const se_scraper = require('..');

const httpPort = 3012;
const httpsPort = httpPort + 1;
const proxyPort = httpPort + 2;

const fakeSearchEngine = express();
fakeSearchEngine.get('/search', (req, res) => {
    debug('q=%s', req.query.q);
    const pageNumber = ((req.query.start / 10) || 0) + 1;
    res.sendFile(path.join(__dirname, 'mocks/google/' + req.query.q + '_page' + pageNumber + '.html'));
});
fakeSearchEngine.use(express.static('test/mocks/google', {extensions: ['html']}));

describe('Config', function(){

    let httpServer, httpsServer, proxy;
    before(async function(){
        // Mount the fake search engine on both an http and an https server
        httpServer = http.createServer(fakeSearchEngine);
        httpsServer = https.createServer(await keyCert(), fakeSearchEngine);

        proxy = Proxy();
        proxy.onRequest((ctx, callback) => {
            ctx.proxyToServerRequestOptions.host = 'localhost';
            ctx.proxyToServerRequestOptions.port = (ctx.isSSL) ? httpsPort : httpPort;
            ctx.proxyToServerRequestOptions.headers['X-Forwarded-Host'] = 'ProxiedThroughFakeEngine';
            debug('Proxy request to %s', ctx.clientToProxyRequest.headers.host);
            return callback();
        });

        await Promise.promisify(proxy.listen, {context: proxy})({port: proxyPort});
        await Promise.promisify(httpServer.listen, {context: httpServer})(httpPort);
        await Promise.promisify(httpsServer.listen, {context: httpsServer})(httpsPort);
        debug('Fake http search engine servers started');
    });

    after(function(){
        httpsServer.close();
        httpServer.close();
        proxy.close();
    });

    describe('keep_html_on_error', function(){

        const testLogger = createLogger({
            transports: [
                new transports.Console({
                    level: 'error'
                })
            ]
        });

        /**
         * Test the keep_html_on_error option
         */
        it('keep_html_on_error single page single keyword', async function () {

            const scrape_job = {
                search_engine: 'google',
                /* TODO refactor start_url
                google_settings: {
                    start_url: 'http://localhost:' + httpPort
                },
                */
                keywords: ['test error'],
            };

            var scraper = new se_scraper.ScrapeManager({
                throw_on_detection: true,
                keep_html_on_error: true,
                logger: testLogger,
                //clean_html_output: false,
                //clean_data_images: false,
                // TODO refactor start_url so we can use it instead of depending on the proxy for this test
                proxies: ['http://localhost:' + proxyPort],
                use_proxies_only: true,
                standard_timeout: 500,
            });
            await scraper.start();
            await assert.rejects(
                async () => {
                    await scraper.scrape(scrape_job);
                },
                (error) => {
                    assert(error.html_on_error, 'the error should contain the html output');
                    return /#fbar/.test(error.message);
                }
            );
            await scraper.quit();

        });

    });

});
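Assuming the repo's usual mocha test setup, this test can be run on its own with npx mocha test/keep_html_on_error.js.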