-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.bat
29 lines (21 loc) · 1.45 KB
/
scrape.bat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
@echo off
rem Scrape pipeline: fetch the document set, fetch the browser history in two
rem passes, then load the history into a sqlite database.
rem Whatever the scrapy steps write to stderr is collected in err.txt. The first
rem step starts the file fresh and the later steps append to it, so the messages
rem of every step are kept for inspection.

rem Remove the previous output first: per the scrapy docs the -o option appends
rem to an existing feed file. The exist check keeps a first run free of
rem "Could Not Find" messages.
if exist ".\data\documents.csv" del ".\data\documents.csv"
rem scrape our document set to recommend from (stop_at=500 is passed to the
rem spider; the original note describes this as a 500 record maximum)
echo Scraping documents
scrapy runspider ".\GawkerScraper\ListScraper.py" -o ".\data\Documents.csv" -a stop_at=500 2> err.txt

rem scrape our history (same stop_at=500 argument as above)
rem In order to get data from our history, we have to request it from google.
rem To do that, we need an authorization token and a developer key.
rem Using chrome you can retrieve this information by:
rem   1. navigating to chrome://flags and disabling the QUIC protocol
rem   2. launching fiddler with ssl decryption enabled
rem   3. refreshing the history page and inspecting the response headers to get
rem      the authorization token and developer key
rem Turn fiddler off once you have your token, as it slows scraping a crazy amount.
if exist ".\data\History.csv" del ".\data\History.csv"
echo Scraping History 1/2
scrapy runspider ".\HistoryScraper\HistoryScraper.py" -o ".\data\History.csv" -a auth_token="[AUTH_TOKEN]" -a dev_key="[DEV_KEY]" -a stop_at=500 2>> err.txt

rem second pass: CsvScraper is given the History.csv produced by the previous
rem step through its history argument
if exist ".\data\fullhistory.json" del ".\data\fullhistory.json"
echo Scraping History 2/2
scrapy runspider ".\HistoryScraper\CsvScraper.py" -o ".\data\fullhistory.json" -a history=".\data\History.csv" 2>> err.txt

rem convert json data to sqlite, used sqlite because it should be more efficient in handling condition based queries
echo Converting history to sqlite
python ".\HistoryScraper\Json2Sqlite.py" ".\data\fullhistory.json" ".\Data\browser_history.db"