Skip to content

Commit

Permalink
Merge branch 'master' into data-errors
Browse files Browse the repository at this point in the history
  • Loading branch information
rwalek668 authored Aug 20, 2021
2 parents 8e5f443 + 18f607d commit 683ba5a
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 18 deletions.
7 changes: 4 additions & 3 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ RUN apk update && apk add build-base autoconf automake libtool pkgconfig nasm
# Add the package.json file and build the node_modules folder
WORKDIR /app
COPY ./package*.json ./
RUN apk add --no-cache --virtual .gyp \
python
RUN mkdir node_modules && yarn install

RUN mkdir node_modules
RUN apk update && apk add yarn python g++ make && rm -rf /var/cache/apk/*
RUN yarn install

# Get a clean image with gatsby-cli and the pre-built node modules
FROM node:12-alpine
Expand Down
34 changes: 24 additions & 10 deletions data_pipeline/scraper/scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import time

from time import sleep

from selenium import webdriver
Expand Down Expand Up @@ -84,9 +85,9 @@ def verifySearchTableLoadComplete(self, driver):
def verifyDownloadFormTableLoadComplete(self, driver):
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, self.FORM_TABLE_MAIN_TABLE_ID))
)
)

def downloadExcel(self, driver):
def downloadExcel(self, driver, countFile):
# Finds all the Excel files linked on the page and downloads them.
# First create array that handles amendments, to ensure we're only downloading the latest/most accurate
numFormTableRows = driver.find_elements_by_xpath(
Expand Down Expand Up @@ -118,8 +119,19 @@ def downloadExcel(self, driver):
else:
downloadLinkElement.click()
count += 1

while(1):
if os.path.exists('./data/transactionExportGrid.xls'):
countFile += 1
renamedFile = './data/transactionExportGrid' + '(' + str(countFile) + ').xls'
os.rename('./data/transactionExportGrid.xls', renamedFile)
break
sleep(0.1)


print('NUM DOWNLOADS {}'.format(count))
self.preprocessing.insertColumns(count, self.CANDIDATENAME, self.ELECTIONDATE, self.BALLOTITEM)
return countFile

# Returns a boolean.
def errorDialogExists(self, driver):
Expand Down Expand Up @@ -229,14 +241,12 @@ def __init__(self):

options = webdriver.ChromeOptions()

# Uncomment block BELOW for headless data-retrieval
# --> Currently not working 100%, only downloads first link on form table
isHeadless = os.environ.get('HEADLESS', False)
# enable headless data retrieval
isHeadless = os.environ.get('HEADLESS', True)
if isHeadless:
options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# options.add_argument("--window-size=1280,800")
# Uncomment block ABOVE for headless data-retrieval
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1280,800")

options.add_argument("--ignore-certificate-errors")
options.add_argument("--test_type")
Expand All @@ -256,11 +266,14 @@ def __init__(self):
options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)


def scrape(self, election_cycle=None):
# Navigate to https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx
self.website.navigateToSearchPage(self.driver, self.SEARCH_FORM_ADDRESS, election_cycle=election_cycle)
self.website.verifySearchTableLoadComplete(self.driver)

countFile = 0

for search_page_num in range(1, self.website.numPages(self.driver) + 1):
print('PAGE {}'.format(search_page_num))
# Need to navigate to the page upfront so that when we get the number of entries on the page it is accurate.
Expand All @@ -283,7 +296,7 @@ def scrape(self, election_cycle=None):
else:
# If there are forms, then we will be brought to the "forms" page.
self.website.verifyDownloadFormTableLoadComplete(self.driver)
self.website.downloadExcel(self.driver)
countFile = self.website.downloadExcel(self.driver, countFile)

self.website.clickBackButton(self.driver)
self.website.verifySearchTableLoadComplete(self.driver)
Expand All @@ -293,7 +306,7 @@ def scrape(self, election_cycle=None):

# Custom module to aggregate data into single CSV
self.website.preprocessing.aggregateData()

"""
start_time = time.time()
s = Scraper()
Expand All @@ -305,3 +318,4 @@ def scrape(self, election_cycle=None):
time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
)
)
"""
9 changes: 4 additions & 5 deletions src/pages/aboutUs.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,22 @@ export default function AboutUs() {
const currentTeam = sortTeamByAlphabeticalOrder([
{
name: "Alex P",
position: "Frontend / Co-lead",
position: "Frontend",
github: "alessandro-pianetta",
image: Alex,
lead: true,
},
{
name: "Geraldine E",
position: "Backend",
github: "geleazar1000111",
image: Geraldine,
},
{ name: "Ryan W", position: "Backend", image: Ryan },
{ name: "Ryan W", position: "Backend / Co-lead", image: Ryan, lead: true },
{ name: "Darren P", position: "Backend / Co-lead", lead: true },
{ name: "Emily J", position: "Frontend" },
{ name: "Mark N", position: "Frontend" },
{ name: "Coco M", position: "Backend" },
{ name: "Diane L", position: "UX & Design" },
{ name: "Irina R", position: "UX & Design" },
{ name: "Yan-Yin C", position: "Frontend"}
])
const alumni = sortTeamByAlphabeticalOrder([
{ name: "Helen", position: "Project Lead", lead: true },
Expand All @@ -57,6 +54,8 @@ export default function AboutUs() {
{ name: "Lynna J", position: "Fullstack" },
{ name: "Gajan N", position: "Fullstack" },
{ name: "Nicole", position: "Fullstack" },
{ name: "Emily J", position: "Frontend" },
{ name: "Yan-Yin C", position: "Frontend" },
])

return (
Expand Down

0 comments on commit 683ba5a

Please sign in to comment.