From e068773b23ae379a4cf79d520f810c78b938c2bc Mon Sep 17 00:00:00 2001
From: Ryan Walek
Date: Thu, 11 Mar 2021 19:42:10 -0800
Subject: [PATCH 1/7] Fix dockerfile

---
 Dockerfile.dev | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.dev b/Dockerfile.dev
index 529a2c6..cfb447c 100644
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -5,7 +5,9 @@ RUN apk update && apk add build-base autoconf automake libtool pkgconfig nasm
 # Add the package.json file and build the node_modules folder
 WORKDIR /app
 COPY ./package*.json ./
-RUN mkdir node_modules && yarn install
+RUN mkdir node_modules
+RUN apk update && apk add yarn python g++ make && rm -rf /var/cache/apk/*
+RUN yarn install
 
 # Get a clean image with gatsby-cli and the pre-built node modules
 FROM node:12-alpine

From 5d19ab944f2c484935cbc63f0bc2ecf667e39f29 Mon Sep 17 00:00:00 2001
From: alessandro-pianetta
Date: Thu, 15 Apr 2021 19:23:30 -0700
Subject: [PATCH 2/7] Updates members

---
 src/pages/aboutUs.js | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/pages/aboutUs.js b/src/pages/aboutUs.js
index 6af1de5..997cd4c 100644
--- a/src/pages/aboutUs.js
+++ b/src/pages/aboutUs.js
@@ -20,10 +20,9 @@ export default function AboutUs() {
   const currentTeam = sortTeamByAlphabeticalOrder([
     {
       name: "Alex P",
-      position: "Frontend / Co-lead",
+      position: "Frontend",
       github: "alessandro-pianetta",
       image: Alex,
-      lead: true,
     },
     {
       name: "Geraldine E",
@@ -31,14 +30,12 @@ export default function AboutUs() {
       position: "Frontend",
       github: "geleazar1000111",
       image: Geraldine,
     },
-    { name: "Ryan W", position: "Backend", image: Ryan },
+    { name: "Ryan W", position: "Backend / Co-lead", image: Ryan, lead: true },
     { name: "Darren P", position: "Backend / Co-lead", lead: true },
-    { name: "Emily J", position: "Frontend" },
     { name: "Mark N", position: "Frontend" },
     { name: "Coco M", position: "Backend" },
     { name: "Diane L", position: "UX & Design" },
     { name: "Irina R", position: "UX & Design" },
-    { name: "Yan-Yin C", position: "Frontend"}
   ])
   const alumni = sortTeamByAlphabeticalOrder([
     { name: "Helen", position: "Project Lead", lead: true },
@@ -57,6 +54,8 @@ export default function AboutUs() {
     { name: "Lynna J", position: "Fullstack" },
     { name: "Gajan N", position: "Fullstack" },
     { name: "Nicole", position: "Fullstack" },
+    { name: "Emily J", position: "Frontend" },
+    { name: "Yan-Yin C", position: "Frontend" },
   ])
 
   return (

From 553fd1990dfebd6c9465b47bdb880ab8bb246d9e Mon Sep 17 00:00:00 2001
From: Steven Hans Limantoro
Date: Tue, 29 Jun 2021 19:53:03 +0700
Subject: [PATCH 3/7] Fixed overwriting downloaded files in headless mode

---
 data_pipeline/scraper/scraper.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/data_pipeline/scraper/scraper.py b/data_pipeline/scraper/scraper.py
index 2802850..824e8fc 100644
--- a/data_pipeline/scraper/scraper.py
+++ b/data_pipeline/scraper/scraper.py
@@ -1,5 +1,6 @@
 import os
 import time
+import asyncio
 from time import sleep
 
 from selenium import webdriver
@@ -83,9 +84,9 @@ def verifySearchTableLoadComplete(self, driver):
     def verifyDownloadFormTableLoadComplete(self, driver):
         WebDriverWait(driver, 10).until(
             EC.presence_of_element_located((By.ID, self.FORM_TABLE_MAIN_TABLE_ID))
-            )
+        )
 
-    def downloadExcel(self, driver):
+    def downloadExcel(self, driver, countFile):
         # Finds all the Excel files linked on the page and downloads them.
         # First create array that handles amendments, to ensure we're only downloading the latest/most accurate
         numFormTableRows = driver.find_elements_by_xpath(
@@ -117,8 +118,16 @@
             else:
                 downloadLinkElement.click()
             count += 1
+            sleep(1)
+            if os.path.exists('./data/transactionExportGrid.xls'):
+                countFile += 1
+                renamedFile = './data/transactionExportGrid' + '(' + str(countFile) + ').xls'
+                os.rename('./data/transactionExportGrid.xls', renamedFile)
+
+
         print('NUM DOWNLOADS {}'.format(count))
         self.preprocessing.insertColumns(count, self.CANDIDATENAME, self.ELECTIONDATE, self.BALLOTITEM)
+        return countFile
 
     # Returns a boolean.
     def errorDialogExists(self, driver):
@@ -230,11 +239,12 @@
 
         # Uncomment block BELOW for headless data-retrieval
         # --> Currently not working 100%, only downloads first link on form table
-        isHeadless = os.environ.get('HEADLESS', False)
+        #isHeadless = os.environ.get('HEADLESS', False)
+        isHeadless = os.environ.get('HEADLESS', True)
         if isHeadless:
             options.add_argument("--headless")
-            # options.add_argument("--disable-gpu")
-            # options.add_argument("--window-size=1280,800")
+            options.add_argument("--disable-gpu")
+            options.add_argument("--window-size=1280,800")
         # Uncomment block ABOVE for headless data-retrieval
 
         options.add_argument("--ignore-certificate-errors")
@@ -255,11 +265,14 @@
         options.add_experimental_option("prefs", prefs)
         self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
 
+
     def scrape(self, election_cycle=None):
         # Navigate to https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx
         self.website.navigateToSearchPage(self.driver, self.SEARCH_FORM_ADDRESS, election_cycle=election_cycle)
         self.website.verifySearchTableLoadComplete(self.driver)
 
+        countFile = 0
+
         for search_page_num in range(1, self.website.numPages(self.driver) + 1):
             print('PAGE {}'.format(search_page_num))
             # Need to navigate to the page upfront so that when we get the number of entries on the page it is accurate.
@@ -282,7 +295,7 @@
             else:
                 # If there are forms, then we will be brought to the "forms" page.
                 self.website.verifyDownloadFormTableLoadComplete(self.driver)
-                self.website.downloadExcel(self.driver)
+                countFile = self.website.downloadExcel(self.driver, countFile)
                 self.website.clickBackButton(self.driver)
                 self.website.verifySearchTableLoadComplete(self.driver)
 

From cfbab7444d374885c02b7f360ae806231546227b Mon Sep 17 00:00:00 2001
From: Steven Hans Limantoro
Date: Tue, 29 Jun 2021 19:55:53 +0700
Subject: [PATCH 4/7] Removed asyncio

---
 data_pipeline/scraper/scraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_pipeline/scraper/scraper.py b/data_pipeline/scraper/scraper.py
index 824e8fc..cb762ec 100644
--- a/data_pipeline/scraper/scraper.py
+++ b/data_pipeline/scraper/scraper.py
@@ -1,6 +1,6 @@
 import os
 import time
-import asyncio
+
 from time import sleep
 
 from selenium import webdriver

From 860c74bbc8b6cfc430e2c98b55f7b934f3c81bbe Mon Sep 17 00:00:00 2001
From: Steven Hans Limantoro
Date: Thu, 1 Jul 2021 11:01:03 +0700
Subject: [PATCH 5/7] Speed up the downloading and renaming process

---
 data_pipeline/scraper/scraper.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/data_pipeline/scraper/scraper.py b/data_pipeline/scraper/scraper.py
index cb762ec..dd6ff35 100644
--- a/data_pipeline/scraper/scraper.py
+++ b/data_pipeline/scraper/scraper.py
@@ -118,11 +118,22 @@
             else:
                 downloadLinkElement.click()
             count += 1
+
+            while(1):
+                if os.path.exists('./data/transactionExportGrid.xls'):
+                    countFile += 1
+                    renamedFile = './data/transactionExportGrid' + '(' + str(countFile) + ').xls'
+                    os.rename('./data/transactionExportGrid.xls', renamedFile)
+                    break
+                sleep(0.1)
+
+            """
             sleep(1)
             if os.path.exists('./data/transactionExportGrid.xls'):
                 countFile += 1
                 renamedFile = './data/transactionExportGrid' + '(' + str(countFile) + ').xls'
                 os.rename('./data/transactionExportGrid.xls', renamedFile)
+            """
 
         print('NUM DOWNLOADS {}'.format(count))

From b02acff3c7b3b038bc6df8abf5dce9469e46439a Mon Sep 17 00:00:00 2001
From: Steven Hans Limantoro
Date: Thu, 1 Jul 2021 11:05:06 +0700
Subject: [PATCH 6/7] Cleaned up headless data retrieval

---
 data_pipeline/scraper/scraper.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/data_pipeline/scraper/scraper.py b/data_pipeline/scraper/scraper.py
index dd6ff35..0c45d34 100644
--- a/data_pipeline/scraper/scraper.py
+++ b/data_pipeline/scraper/scraper.py
@@ -248,15 +248,12 @@ def __init__(self):
 
         options = webdriver.ChromeOptions()
 
-        # Uncomment block BELOW for headless data-retrieval
-        # --> Currently not working 100%, only downloads first link on form table
-        #isHeadless = os.environ.get('HEADLESS', False)
+        # enable headless data retrieval
        isHeadless = os.environ.get('HEADLESS', True)
         if isHeadless:
             options.add_argument("--headless")
             options.add_argument("--disable-gpu")
             options.add_argument("--window-size=1280,800")
-        # Uncomment block ABOVE for headless data-retrieval
 
         options.add_argument("--ignore-certificate-errors")
         options.add_argument("--test_type")

From d35ecd92c4d27fccbdf68e62867fcbf051823ab2 Mon Sep 17 00:00:00 2001
From: Steven Hans Limantoro
Date: Fri, 2 Jul 2021 09:37:49 +0700
Subject: [PATCH 7/7] Deleted the old renaming logic

---
 data_pipeline/scraper/scraper.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/data_pipeline/scraper/scraper.py b/data_pipeline/scraper/scraper.py
index 0c45d34..8c75eed 100644
--- a/data_pipeline/scraper/scraper.py
+++ b/data_pipeline/scraper/scraper.py
@@ -126,14 +126,6 @@ def downloadExcel(self, driver, countFile):
                     os.rename('./data/transactionExportGrid.xls', renamedFile)
                     break
                 sleep(0.1)
-
-            """
-            sleep(1)
-            if os.path.exists('./data/transactionExportGrid.xls'):
-                countFile += 1
-                renamedFile = './data/transactionExportGrid' + '(' + str(countFile) + ').xls'
-                os.rename('./data/transactionExportGrid.xls', renamedFile)
-            """
 
 
         print('NUM DOWNLOADS {}'.format(count))
@@ -313,7 +305,7 @@ def scrape(self, election_cycle=None):
         # Custom module to aggregate data into single CSV
         self.website.preprocessing.aggregateData()
 
-
+"""
 start_time = time.time()
 
 s = Scraper()
@@ -325,3 +317,4 @@ def scrape(self, election_cycle=None):
         time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
     )
 )
+"""
\ No newline at end of file
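
Note on the final download logic (not part of the patches): after patch 7, downloadExcel clicks a link, then polls with while(1) for Chrome's fixed download name transactionExportGrid.xls and renames the file with a running counter, so successive downloads no longer overwrite each other in headless mode. The loop has no upper bound, though: if a click never produces a file, the scraper spins forever. Below is a minimal standalone sketch of the same wait-and-rename pattern with a timeout guard added; the helper name waitAndRenameDownload, the pollInterval parameter, and the 30-second default are illustrative assumptions, not code from the patches.

import os
from time import sleep

def waitAndRenameDownload(downloadDir, countFile, timeout=30.0, pollInterval=0.1):
    # Hypothetical helper sketching the pattern from patches 5 and 7:
    # Chrome always saves the export under the same name, so each download
    # must be renamed before the next click or it will be overwritten.
    source = os.path.join(downloadDir, 'transactionExportGrid.xls')
    waited = 0.0
    while waited < timeout:  # timeout guard is an addition; the patches poll forever
        if os.path.exists(source):
            countFile += 1
            renamed = os.path.join(
                downloadDir, 'transactionExportGrid({}).xls'.format(countFile))
            os.rename(source, renamed)
            return countFile
        sleep(pollInterval)
        waited += pollInterval
    raise RuntimeError('download did not appear within {} seconds'.format(timeout))

Polling for the final filename is workable here because Chrome stages an in-progress download under a temporary .crdownload name and only renames it to the real .xls name on completion, so an os.path.exists hit on the final name generally means the file is whole.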