diff --git a/Dockerfile.dev b/Dockerfile.dev index f1b4add8..67d8a440 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -5,9 +5,10 @@ RUN apk update && apk add build-base autoconf automake libtool pkgconfig nasm # Add the package.json file and build the node_modules folder WORKDIR /app COPY ./package*.json ./ -RUN apk add --no-cache --virtual .gyp \ - python -RUN mkdir node_modules && yarn install + +RUN mkdir node_modules +RUN apk update && apk add yarn python g++ make && rm -rf /var/cache/apk/* +RUN yarn install # Get a clean image with gatsby-cli and the pre-built node modules FROM node:12-alpine diff --git a/data_pipeline/scraper/scraper.py b/data_pipeline/scraper/scraper.py index 4d6575aa..f5d22f90 100644 --- a/data_pipeline/scraper/scraper.py +++ b/data_pipeline/scraper/scraper.py @@ -1,5 +1,6 @@ import os import time + from time import sleep from selenium import webdriver @@ -84,9 +85,9 @@ def verifySearchTableLoadComplete(self, driver): def verifyDownloadFormTableLoadComplete(self, driver): WebDriverWait(driver, 10).until( EC.presence_of_element_located((By.ID, self.FORM_TABLE_MAIN_TABLE_ID)) - ) + ) - def downloadExcel(self, driver): + def downloadExcel(self, driver, countFile): # Finds all the Excel files linked on the page and downloads them. # First create array that handles ammendments, to ensure we're only downloading the latest/most accurate numFormTableRows = driver.find_elements_by_xpath( @@ -118,8 +119,19 @@ def downloadExcel(self, driver): else: downloadLinkElement.click() count += 1 + + while(1): + if os.path.exists('./data/transactionExportGrid.xls'): + countFile += 1 + renamedFile = './data/transactionExportGrid' + '(' + str(countFile) + ').xls' + os.rename('./data/transactionExportGrid.xls', renamedFile) + break + sleep(0.1) + + print('NUM DOWNLOADS {}'.format(count)) self.preprocessing.insertColumns(count, self.CANDIDATENAME, self.ELECTIONDATE, self.BALLOTITEM) + return countFile # Returns a boolean. def errorDialogExists(self, driver): @@ -229,14 +241,12 @@ def __init__(self): options = webdriver.ChromeOptions() - # Uncomment block BELOW for headless data-retrieval - # --> Currently not working 100%, only downloads first link on form table - isHeadless = os.environ.get('HEADLESS', False) + # enable headless data retrieval + isHeadless = os.environ.get('HEADLESS', True) if isHeadless: options.add_argument("--headless") - # options.add_argument("--disable-gpu") - # options.add_argument("--window-size=1280,800") - # Uncomment block ABOVE for headless data-retrieval + options.add_argument("--disable-gpu") + options.add_argument("--window-size=1280,800") options.add_argument("--ignore-certificate-errors") options.add_argument("--test_type") @@ -256,11 +266,14 @@ def __init__(self): options.add_experimental_option("prefs", prefs) self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options) + def scrape(self, election_cycle=None): # Navigate to https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx self.website.navigateToSearchPage(self.driver, self.SEARCH_FORM_ADDRESS, election_cycle=election_cycle) self.website.verifySearchTableLoadComplete(self.driver) + countFile = 0 + for search_page_num in range(1, self.website.numPages(self.driver) + 1): print('PAGE {}'.format(search_page_num)) # Need to navigate to the page upfront so that when we get the number of entries on the page it is accurate. @@ -283,7 +296,7 @@ def scrape(self, election_cycle=None): else: # If there are forms, then we will be brought to the "forms" page. self.website.verifyDownloadFormTableLoadComplete(self.driver) - self.website.downloadExcel(self.driver) + countFile = self.website.downloadExcel(self.driver, countFile) self.website.clickBackButton(self.driver) self.website.verifySearchTableLoadComplete(self.driver) @@ -293,7 +306,7 @@ def scrape(self, election_cycle=None): # Custom module to aggregate data into single CSV self.website.preprocessing.aggregateData() - +""" start_time = time.time() s = Scraper() @@ -305,3 +318,4 @@ def scrape(self, election_cycle=None): time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time)) ) ) +""" \ No newline at end of file diff --git a/src/pages/aboutUs.js b/src/pages/aboutUs.js index 6af1de57..997cd4c8 100644 --- a/src/pages/aboutUs.js +++ b/src/pages/aboutUs.js @@ -20,10 +20,9 @@ export default function AboutUs() { const currentTeam = sortTeamByAlphabeticalOrder([ { name: "Alex P", - position: "Frontend / Co-lead", + position: "Frontend", github: "alessandro-pianetta", image: Alex, - lead: true, }, { name: "Geraldine E", @@ -31,14 +30,12 @@ export default function AboutUs() { github: "geleazar1000111", image: Geraldine, }, - { name: "Ryan W", position: "Backend", image: Ryan }, + { name: "Ryan W", position: "Backend / Co-lead", image: Ryan, lead: true }, { name: "Darren P", position: "Backend / Co-lead", lead: true }, - { name: "Emily J", position: "Frontend" }, { name: "Mark N", position: "Frontend" }, { name: "Coco M", position: "Backend" }, { name: "Diane L", position: "UX & Design" }, { name: "Irina R", position: "UX & Design" }, - { name: "Yan-Yin C", position: "Frontend"} ]) const alumni = sortTeamByAlphabeticalOrder([ { name: "Helen", position: "Project Lead", lead: true }, @@ -57,6 +54,8 @@ export default function AboutUs() { { name: "Lynna J", position: "Fullstack" }, { name: "Gajan N", position: "Fullstack" }, { name: "Nicole", position: "Fullstack" }, + { name: "Emily J", position: "Frontend" }, + { name: "Yan-Yin C", position: "Frontend" }, ]) return (