Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TX: improve vote scraping #5136

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
298 changes: 294 additions & 4 deletions scrapers/tx/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@
import ftplib
import re
import time
import urllib3

from io import BytesIO
from urllib import parse as urlparse

import xml.etree.cElementTree as etree
import fitz

from openstates.scrape import Scraper, Bill
from openstates.scrape import Scraper, Bill, VoteEvent
from openstates.scrape.base import ScrapeError
from utils import LXMLMixin
from .actions import Categorizer


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class TXBillScraper(Scraper, LXMLMixin):
_FTP_ROOT = "ftp.legis.state.tx.us"
CHAMBERS = {"H": "lower", "S": "upper"}
Expand Down Expand Up @@ -90,11 +98,14 @@ def scrape(self, session=None, chamber=None):
if "house" in bill_url:
if "lower" in chambers:
yield from self.scrape_bill(session, bill_url)
break
elif "senate" in bill_url:
if "upper" in chambers:
yield from self.scrape_bill(session, bill_url)
break

def scrape_bill(self, session, history_url):
print(history_url)
history_xml = self.get(history_url).text
root = etree.fromstring(history_xml)

Expand Down Expand Up @@ -128,9 +139,8 @@ def scrape_bill(self, session, history_url):
bill.add_source(history_url)

bill_id_for_url = bill_id.replace(" ", "")
bill.add_source(
f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
)
bill_history_url = f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
bill.add_source(bill_history_url)

for subject in root.iterfind("subjects/subject"):
bill.add_subject(subject.text.strip())
Expand Down Expand Up @@ -243,6 +253,9 @@ def scrape_bill(self, session, history_url):
if root.findtext("companions"):
self._get_companion(bill)

# Parse Votes
yield from self.scrape_vote(bill_id, bill_history_url)

yield bill

def _get_companion(self, bill):
Expand All @@ -269,3 +282,280 @@ def _format_session(self, session):

def _format_bill_id(self, bill_id):
return bill_id.replace(" ", "")

def scrape_vote(self, bill_id, bill_history_url):
    """Yield VoteEvents for every "Record vote" link on a bill history page.

    Scans the history page's vote table rows and hands each record-vote
    journal link to ``parse_vote`` after rewriting the PDF link to its
    HTML-path variant.
    """
    history_page = self.lxmlize(bill_history_url)
    vote_links = history_page.xpath('//tr[contains(@id, "vote")]/td/a')
    for link in vote_links:
        if link.text_content() != "Record vote":
            continue
        href = link.attrib["href"]
        # Journal links point at the PDF; rewrite both the lowercase
        # path segment and the uppercase file extension.
        journal_url = href.replace("pdf", "html").replace("PDF", "HTM")
        yield from self.parse_vote(journal_url, bill_id)

def parse_vote(self, url, bill_id, session=None):
    """Parse record votes for *bill_id* from one Texas journal page.

    The journal page number comes from the ``page=`` query fragment of
    *url*; that PDF page (plus the following page, because voter name
    lists can spill over a page break) is scanned for "Yeas/Nays"
    tallies and voter name lists.  Yields one ``VoteEvent`` per tally
    attributed to *bill_id*; if no tally matches the bill id exactly,
    the first tally found on the page is used as a fallback.

    :param url: journal page URL containing a ``page=N`` fragment
    :param bill_id: bill identifier used to match tallies on the page
    :param session: legislative session identifier; optional for
        backward compatibility — when omitted, falls back to *bill_id*
        (NOTE(review): the original always stored *bill_id* as the
        legislative_session, which looks wrong; callers should pass
        the real session)
    """
    if "page=" not in url:
        self.error("No page number for {}".format(url))
        # Without a page number we cannot locate the vote text;
        # previously execution fell through and crashed on int().
        return

    page_number = url.split("page=")[-1]
    try:
        response = self.get(url, verify=False)
    except Exception as e:
        self.error(f"Failed request in {url} - {e}")
        return
    pdf_content = BytesIO(response.content)
    doc = fitz.open("pdf", pdf_content)
    page_index = int(page_number) - 1
    pdf_text = doc[page_index].get_text()

    # Voter lists may continue on the next page; guard against the
    # vote landing on the final page of the journal (the original
    # indexed past the end and raised IndexError).
    if page_index + 1 < len(doc):
        next_pdf_text = doc[page_index + 1].get_text()
    else:
        next_pdf_text = ""

    pdf_text = self.clear_pdf_text(pdf_text.strip())
    next_pdf_text = self.clear_pdf_text(next_pdf_text.strip())

    bill_pattern = r"\b(CSSB|CSHB|HB|SB|SCR|SJR|SR|HCR|HJR|HR|HOUSE.BILL|SENATE.BILL)[^\d]+(\d+)\b"
    voting_pattern1 = (
        r"(\d+)\s+Yeas,\s+(\d+)\s+Nays(?:,\s+(\d+)\s+Present,\s+not voting)?"
    )
    voting_pattern2 = (
        r"Yeas.(\d+),\s+Nays.(\d+)(?:,\s+Present,\s+not voting.(\d+))?"
    )

    # skip_line_cnt: lines on this page already consumed as voter lists.
    # next_line_num: cursor into the NEXT page's lines once a list has
    # spilled over the page break — TODO confirm this cursor cannot run
    # past len(next_pdf_text) on pathological layouts.
    skip_line_cnt = 0
    alt_bill_id = bill_id
    results = []      # tallies attributed to bill_id
    may_results = []  # tallies seen under some other bill id (fallback)
    for line_num, line_text in enumerate(pdf_text):
        if skip_line_cnt > 0:
            skip_line_cnt = skip_line_cnt - 1
            continue

        skip_line_cnt = 0
        # Track the most recently mentioned bill id so tallies are
        # attributed to the right measure.
        bill_match = re.search(bill_pattern, line_text)
        if bill_match:
            alt_bill_id = (
                bill_match.group(0)
                .replace("CS", "")
                .replace("HOUSE BILL", "HB")
                .replace("SENATE BILL", "SB")
                .strip()
            )

        voting_match = re.search(voting_pattern1, line_text) or re.search(
            voting_pattern2, line_text
        )
        if voting_match:
            next_line_num = 0
            yeas, nays, present_not_voting = voting_match.groups()
            present_not_voting = present_not_voting or "0"

            yeas = int(yeas)
            nays = int(nays)
            present_not_voting = int(present_not_voting)

            # Yea voters usually follow on the next line; retry with
            # the next page's first line when the list looks truncated.
            next_text = (
                pdf_text[line_num + 1] if line_num + 1 < len(pdf_text) else ""
            )
            yea_voters, go_next_yea = self.extract_voter_list(
                "Yeas", next_text, yeas
            )
            if go_next_yea:
                yea_voters, _ = self.extract_voter_list(
                    "Yeas", next_text + next_pdf_text[next_line_num], yeas
                )
                next_line_num += 1
            if len(yea_voters) != 0:
                skip_line_cnt = 1
            next_text = (
                pdf_text[line_num + skip_line_cnt + 1]
                if line_num + skip_line_cnt + 1 < len(pdf_text)
                else ""
            )
            nay_voters, go_next_nay = self.extract_voter_list(
                "Nays",
                (next_pdf_text[next_line_num] if next_line_num else next_text),
                nays,
            )
            if next_line_num > 0:
                next_line_num += 1
            if go_next_nay:
                nay_voters, go_next_nay = self.extract_voter_list(
                    "Nays", next_text + next_pdf_text[0], nays
                )
                next_line_num = 1
            if len(nay_voters) != 0:
                skip_line_cnt += 1
            next_text = (
                pdf_text[line_num + skip_line_cnt + 1]
                if line_num + skip_line_cnt + 1 < len(pdf_text)
                else ""
            )
            present_not_voting_voters, go_next_nv = self.extract_voter_list(
                "Present, not voting",
                (next_pdf_text[next_line_num] if next_line_num else next_text),
                present_not_voting,
            )
            if next_line_num > 0:
                next_line_num += 1
            if go_next_nv:
                present_not_voting_voters, go_next_nv = self.extract_voter_list(
                    "Present, not voting",
                    next_text + next_pdf_text[0],
                    present_not_voting,
                )
                next_line_num = 1

            tally = {
                "yes": yeas,
                "no": nays,
                "other": present_not_voting,
                "voters": {
                    "yes": yea_voters,
                    "no": nay_voters,
                    "other": present_not_voting_voters,
                },
            }
            if alt_bill_id == bill_id:
                results.append(tally)
            else:
                may_results.append(tally)

    # Journal pages sometimes omit a bill-id heading before the tally;
    # fall back to the first tally found on the page.
    if len(results) == 0 and len(may_results) > 0:
        results = may_results[0:1]

    chamber = "upper" if "senate" in url else "lower"
    for result in results:
        passed = result["yes"] > result["no"]
        v = VoteEvent(
            chamber=chamber,
            # NOTE(review): journal date is not parsed here — confirm
            # downstream accepts a None start_date.
            start_date=None,
            motion_text="passage" if passed else "other",
            result="pass" if passed else "fail",
            classification="passage" if passed else None,
            legislative_session=session or bill_id,
        )
        v.add_source(url)

        # set_count expects integer tallies (the original passed the
        # lists of voter names); individual voters are recorded via
        # VoteEvent.vote().
        v.set_count("yes", result["yes"])
        v.set_count("no", result["no"])
        v.set_count("other", result["other"])
        for name in result["voters"]["yes"]:
            v.vote("yes", name)
        for name in result["voters"]["no"]:
            v.vote("no", name)
        for name in result["voters"]["other"]:
            v.vote("other", name)

        yield v

# Normalize raw journal-page text
def clear_pdf_text(self, text):
    """Normalize a journal page's raw text into a list of logical lines.

    - Strips stray ``i`` glyphs that OCR/layout inserts inside bill
      identifiers and before page-style numbers (e.g. ``H.iB.`` or
      ``i123``) — presumably an extraction artifact; verify against a
      sample journal page.
    - Drops journal boilerplate lines (bare page numbers, session
      headers, dates, "HOUSE/SENATE JOURNAL" banners) from the end of
      the page.
    - Re-joins the remaining physical lines into sentence-like chunks
      split on ``.\\n``.

    :param text: stripped text of one PDF page
    :return: list of normalized line strings
    """

    def replace_str(match):
        # Inside a matched span the "i" characters are layout noise.
        return match.group(0).replace("i", " ")

    bill_pattern = r"\b(CSSB|CSHB|HB|SB|SCR|SJR|SR|HCR|HJR|HR|HOUSE.BILL|SENATE.BILL)[^\d]+(\d+)\b"
    pdf_text = re.sub(
        bill_pattern,
        replace_str,
        text,
    )
    pdf_text = re.sub(
        r"i\d+",
        replace_str,
        pdf_text.replace("ii", " "),
    )
    ignore_patterns = [
        r"^\d+$",
        r"\b\d{1,3}(?:st|nd|rd|th) LEGISLATURE — [A-Z]+ SESSION\b",
        r"\b\d{1,3}(?:st|nd|rd|th) Day\b",
        r"\b[A-Za-z]+, [A-Za-z]+ \d{1,2}, \d{4}\b",
        r"\b(?:HOUSE|SENATE) JOURNAL\b",
    ]
    # Count boilerplate lines as a negative offset so they can be
    # sliced off the tail of the page (boilerplate is assumed to sit
    # at the end — TODO confirm against header-first layouts).
    ignore_cnt = 0
    for line_text in pdf_text.split("\n"):
        for ignore_pattern in ignore_patterns:
            if re.search(ignore_pattern, line_text, re.IGNORECASE):
                ignore_cnt -= 1
                break

    lines = pdf_text.split("\n")
    # BUG FIX: when no boilerplate matched, ignore_cnt == 0 and the old
    # slice [0:0] discarded the ENTIRE page; keep everything instead.
    kept = lines[0:ignore_cnt] if ignore_cnt else lines
    pdf_text = [
        line.replace("\n", " ")
        for line in re.split(r"\.\n", "\n".join(kept))
    ]

    return pdf_text

def extract_voter_list(self, prefix, text, estimate_cnt):
    """Pull the list of voter names following *prefix* out of *text*.

    Handles both journal layouts: ``"<prefix> — a; b"`` (semicolon
    separated) and ``"<prefix>: a, b"`` (comma separated); the colon
    form takes precedence when both appear.

    Returns ``(names, go_next)`` where ``go_next`` signals that the
    list likely continues on the following line/page: the parsed count
    differs from *estimate_cnt* (unless the estimate is the ``-1``
    "unknown" sentinel), or *text* is blank.  A zero estimate always
    yields ``([], False)``.
    """
    names = []
    colon_match = re.search(rf"{prefix}: ([\s\S]+)$", text)
    dash_match = re.search(rf"{prefix} — ([\s\S]+)$", text)
    if colon_match:
        names = [part.strip() for part in colon_match.group(1).split(",")]
    elif dash_match:
        names = [part.strip() for part in dash_match.group(1).split(";")]

    if estimate_cnt == 0:
        return [], False

    continue_on_next = (
        (estimate_cnt != -1 and len(names) != estimate_cnt)
        or not text.strip()
    )
    return names, continue_on_next
Loading