Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TX: improve vote scraping #5136

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
298 changes: 294 additions & 4 deletions scrapers/tx/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,23 @@
import ftplib
import re
import time
import urllib3

from io import BytesIO
from urllib import parse as urlparse

import xml.etree.cElementTree as etree
import fitz

from openstates.scrape import Scraper, Bill
from openstates.scrape import Scraper, Bill, VoteEvent
from openstates.scrape.base import ScrapeError
from utils import LXMLMixin
from .actions import Categorizer


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class TXBillScraper(Scraper, LXMLMixin):
_FTP_ROOT = "ftp.legis.state.tx.us"
CHAMBERS = {"H": "lower", "S": "upper"}
Expand Down Expand Up @@ -90,11 +98,14 @@ def scrape(self, session=None, chamber=None):
if "house" in bill_url:
if "lower" in chambers:
yield from self.scrape_bill(session, bill_url)
break
elif "senate" in bill_url:
if "upper" in chambers:
yield from self.scrape_bill(session, bill_url)
break

def scrape_bill(self, session, history_url):
print(history_url)
history_xml = self.get(history_url).text
root = etree.fromstring(history_xml)

Expand Down Expand Up @@ -128,9 +139,8 @@ def scrape_bill(self, session, history_url):
bill.add_source(history_url)

bill_id_for_url = bill_id.replace(" ", "")
bill.add_source(
f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
)
bill_history_url = f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
bill.add_source(bill_history_url)

for subject in root.iterfind("subjects/subject"):
bill.add_subject(subject.text.strip())
Expand Down Expand Up @@ -243,6 +253,9 @@ def scrape_bill(self, session, history_url):
if root.findtext("companions"):
self._get_companion(bill)

# Parse Votes
yield from self.scrape_vote(bill_id, bill_history_url)

yield bill

def _get_companion(self, bill):
Expand All @@ -269,3 +282,280 @@ def _format_session(self, session):

def _format_bill_id(self, bill_id):
return bill_id.replace(" ", "")

def scrape_vote(self, bill_id, bill_history_url):
    """Yield VoteEvents for every "Record vote" link on a bill history page.

    Scans the history page's vote table rows and hands each record-vote
    journal link to ``parse_vote`` after rewriting the PDF link to its
    HTML-path variant.
    """
    history_page = self.lxmlize(bill_history_url)
    vote_links = history_page.xpath('//tr[contains(@id, "vote")]/td/a')
    for link in vote_links:
        if link.text_content() != "Record vote":
            continue
        href = link.attrib["href"]
        # Journal links point at the PDF; rewrite both the lowercase
        # path segment and the uppercase file extension.
        journal_url = href.replace("pdf", "html").replace("PDF", "HTM")
        yield from self.parse_vote(journal_url, bill_id)

def parse_vote(self, url, bill_id, session=None):
    """Parse record votes for *bill_id* from one Texas journal page.

    The journal page number comes from the ``page=`` query fragment of
    *url*; that PDF page (plus the following page, because voter name
    lists can spill over a page break) is scanned for "Yeas/Nays"
    tallies and voter name lists.  Yields one ``VoteEvent`` per tally
    attributed to *bill_id*; if no tally matches the bill id exactly,
    the first tally found on the page is used as a fallback.

    :param url: journal page URL containing a ``page=N`` fragment
    :param bill_id: bill identifier used to match tallies on the page
    :param session: legislative session identifier; optional for
        backward compatibility — when omitted, falls back to *bill_id*
        (NOTE(review): the original always stored *bill_id* as the
        legislative_session, which looks wrong; callers should pass
        the real session)
    """
    if "page=" not in url:
        self.error("No page number for {}".format(url))
        # Without a page number we cannot locate the vote text;
        # previously execution fell through and crashed on int().
        return

    page_number = url.split("page=")[-1]
    try:
        response = self.get(url, verify=False)
    except Exception as e:
        self.error(f"Failed request in {url} - {e}")
        return
    pdf_content = BytesIO(response.content)
    doc = fitz.open("pdf", pdf_content)
    page_index = int(page_number) - 1
    pdf_text = doc[page_index].get_text()

    # Voter lists may continue on the next page; guard against the
    # vote landing on the final page of the journal (the original
    # indexed past the end and raised IndexError).
    if page_index + 1 < len(doc):
        next_pdf_text = doc[page_index + 1].get_text()
    else:
        next_pdf_text = ""

    pdf_text = self.clear_pdf_text(pdf_text.strip())
    next_pdf_text = self.clear_pdf_text(next_pdf_text.strip())

    bill_pattern = r"\b(CSSB|CSHB|HB|SB|SCR|SJR|SR|HCR|HJR|HR|HOUSE.BILL|SENATE.BILL)[^\d]+(\d+)\b"
    voting_pattern1 = (
        r"(\d+)\s+Yeas,\s+(\d+)\s+Nays(?:,\s+(\d+)\s+Present,\s+not voting)?"
    )
    voting_pattern2 = (
        r"Yeas.(\d+),\s+Nays.(\d+)(?:,\s+Present,\s+not voting.(\d+))?"
    )

    # skip_line_cnt: lines on this page already consumed as voter lists.
    # next_line_num: cursor into the NEXT page's lines once a list has
    # spilled over the page break — TODO confirm this cursor cannot run
    # past len(next_pdf_text) on pathological layouts.
    skip_line_cnt = 0
    alt_bill_id = bill_id
    results = []      # tallies attributed to bill_id
    may_results = []  # tallies seen under some other bill id (fallback)
    for line_num, line_text in enumerate(pdf_text):
        if skip_line_cnt > 0:
            skip_line_cnt = skip_line_cnt - 1
            continue

        skip_line_cnt = 0
        # Track the most recently mentioned bill id so tallies are
        # attributed to the right measure.
        bill_match = re.search(bill_pattern, line_text)
        if bill_match:
            alt_bill_id = (
                bill_match.group(0)
                .replace("CS", "")
                .replace("HOUSE BILL", "HB")
                .replace("SENATE BILL", "SB")
                .strip()
            )

        voting_match = re.search(voting_pattern1, line_text) or re.search(
            voting_pattern2, line_text
        )
        if voting_match:
            next_line_num = 0
            yeas, nays, present_not_voting = voting_match.groups()
            present_not_voting = present_not_voting or "0"

            yeas = int(yeas)
            nays = int(nays)
            present_not_voting = int(present_not_voting)

            # Yea voters usually follow on the next line; retry with
            # the next page's first line when the list looks truncated.
            next_text = (
                pdf_text[line_num + 1] if line_num + 1 < len(pdf_text) else ""
            )
            yea_voters, go_next_yea = self.extract_voter_list(
                "Yeas", next_text, yeas
            )
            if go_next_yea:
                yea_voters, _ = self.extract_voter_list(
                    "Yeas", next_text + next_pdf_text[next_line_num], yeas
                )
                next_line_num += 1
            if len(yea_voters) != 0:
                skip_line_cnt = 1
            next_text = (
                pdf_text[line_num + skip_line_cnt + 1]
                if line_num + skip_line_cnt + 1 < len(pdf_text)
                else ""
            )
            nay_voters, go_next_nay = self.extract_voter_list(
                "Nays",
                (next_pdf_text[next_line_num] if next_line_num else next_text),
                nays,
            )
            if next_line_num > 0:
                next_line_num += 1
            if go_next_nay:
                nay_voters, go_next_nay = self.extract_voter_list(
                    "Nays", next_text + next_pdf_text[0], nays
                )
                next_line_num = 1
            if len(nay_voters) != 0:
                skip_line_cnt += 1
            next_text = (
                pdf_text[line_num + skip_line_cnt + 1]
                if line_num + skip_line_cnt + 1 < len(pdf_text)
                else ""
            )
            present_not_voting_voters, go_next_nv = self.extract_voter_list(
                "Present, not voting",
                (next_pdf_text[next_line_num] if next_line_num else next_text),
                present_not_voting,
            )
            if next_line_num > 0:
                next_line_num += 1
            if go_next_nv:
                present_not_voting_voters, go_next_nv = self.extract_voter_list(
                    "Present, not voting",
                    next_text + next_pdf_text[0],
                    present_not_voting,
                )
                next_line_num = 1

            tally = {
                "yes": yeas,
                "no": nays,
                "other": present_not_voting,
                "voters": {
                    "yes": yea_voters,
                    "no": nay_voters,
                    "other": present_not_voting_voters,
                },
            }
            if alt_bill_id == bill_id:
                results.append(tally)
            else:
                may_results.append(tally)

    # Journal pages sometimes omit a bill-id heading before the tally;
    # fall back to the first tally found on the page.
    if len(results) == 0 and len(may_results) > 0:
        results = may_results[0:1]

    chamber = "upper" if "senate" in url else "lower"
    for result in results:
        passed = result["yes"] > result["no"]
        v = VoteEvent(
            chamber=chamber,
            # NOTE(review): journal date is not parsed here — confirm
            # downstream accepts a None start_date.
            start_date=None,
            motion_text="passage" if passed else "other",
            result="pass" if passed else "fail",
            classification="passage" if passed else None,
            legislative_session=session or bill_id,
        )
        v.add_source(url)

        # set_count expects integer tallies (the original passed the
        # lists of voter names); individual voters are recorded via
        # VoteEvent.vote().
        v.set_count("yes", result["yes"])
        v.set_count("no", result["no"])
        v.set_count("other", result["other"])
        for name in result["voters"]["yes"]:
            v.vote("yes", name)
        for name in result["voters"]["no"]:
            v.vote("no", name)
        for name in result["voters"]["other"]:
            v.vote("other", name)

        yield v

# Normalize raw journal-page text
def clear_pdf_text(self, text):
    """Normalize a journal page's raw text into a list of logical lines.

    - Strips stray ``i`` glyphs that OCR/layout inserts inside bill
      identifiers and before page-style numbers (e.g. ``H.iB.`` or
      ``i123``) — presumably an extraction artifact; verify against a
      sample journal page.
    - Drops journal boilerplate lines (bare page numbers, session
      headers, dates, "HOUSE/SENATE JOURNAL" banners) from the end of
      the page.
    - Re-joins the remaining physical lines into sentence-like chunks
      split on ``.\\n``.

    :param text: stripped text of one PDF page
    :return: list of normalized line strings
    """

    def replace_str(match):
        # Inside a matched span the "i" characters are layout noise.
        return match.group(0).replace("i", " ")

    bill_pattern = r"\b(CSSB|CSHB|HB|SB|SCR|SJR|SR|HCR|HJR|HR|HOUSE.BILL|SENATE.BILL)[^\d]+(\d+)\b"
    pdf_text = re.sub(
        bill_pattern,
        replace_str,
        text,
    )
    pdf_text = re.sub(
        r"i\d+",
        replace_str,
        pdf_text.replace("ii", " "),
    )
    ignore_patterns = [
        r"^\d+$",
        r"\b\d{1,3}(?:st|nd|rd|th) LEGISLATURE — [A-Z]+ SESSION\b",
        r"\b\d{1,3}(?:st|nd|rd|th) Day\b",
        r"\b[A-Za-z]+, [A-Za-z]+ \d{1,2}, \d{4}\b",
        r"\b(?:HOUSE|SENATE) JOURNAL\b",
    ]
    # Count boilerplate lines as a negative offset so they can be
    # sliced off the tail of the page (boilerplate is assumed to sit
    # at the end — TODO confirm against header-first layouts).
    ignore_cnt = 0
    for line_text in pdf_text.split("\n"):
        for ignore_pattern in ignore_patterns:
            if re.search(ignore_pattern, line_text, re.IGNORECASE):
                ignore_cnt -= 1
                break

    lines = pdf_text.split("\n")
    # BUG FIX: when no boilerplate matched, ignore_cnt == 0 and the old
    # slice [0:0] discarded the ENTIRE page; keep everything instead.
    kept = lines[0:ignore_cnt] if ignore_cnt else lines
    pdf_text = [
        line.replace("\n", " ")
        for line in re.split(r"\.\n", "\n".join(kept))
    ]

    return pdf_text

def extract_voter_list(self, prefix, text, estimate_cnt):
    """Pull the list of voter names following *prefix* out of *text*.

    Handles both journal layouts: ``"<prefix> — a; b"`` (semicolon
    separated) and ``"<prefix>: a, b"`` (comma separated); the colon
    form takes precedence when both appear.

    Returns ``(names, go_next)`` where ``go_next`` signals that the
    list likely continues on the following line/page: the parsed count
    differs from *estimate_cnt* (unless the estimate is the ``-1``
    "unknown" sentinel), or *text* is blank.  A zero estimate always
    yields ``([], False)``.
    """
    names = []
    colon_match = re.search(rf"{prefix}: ([\s\S]+)$", text)
    dash_match = re.search(rf"{prefix} — ([\s\S]+)$", text)
    if colon_match:
        names = [part.strip() for part in colon_match.group(1).split(",")]
    elif dash_match:
        names = [part.strip() for part in dash_match.group(1).split(";")]

    if estimate_cnt == 0:
        return [], False

    continue_on_next = (
        (estimate_cnt != -1 and len(names) != estimate_cnt)
        or not text.strip()
    )
    return names, continue_on_next
Loading