Merge pull request #24 from NREL/gb/ord_example
Gb/ord example
grantbuster authored Jul 10, 2024
2 parents 906eeba + 1043602 commit 1c9cf0d
Showing 13 changed files with 218 additions and 42 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/pytest_ords.yml
@@ -12,17 +12,18 @@ jobs:
python-version: [3.11]

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.ref }}
fetch-depth: 1
- name: Set up Python ${{ matrix.python-version }}
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
with:
auto-update-conda: true
channels: conda-forge,defaults
python-version: ${{ matrix.python-version }}
miniconda-version: "latest"
- name: Install dependencies'
- name: Install dependencies
shell: bash -l {0}
run: |
conda install -c conda-forge poppler
9 changes: 6 additions & 3 deletions elm/ords/README.md
@@ -1,15 +1,18 @@
# Welcome to Energy Language Model - OrdinanceGPT

The ordinance web scraping and data extraction portion of this codebase required a few extra dependencies that do not come out-of-the-box with the base ELM software.
To set up ELM for ordinances, first create a conda environment. Then, _before installing ELM_, run the poppler installation:
The ordinance web scraping and data extraction portion of this codebase
requires a few extra dependencies that do not come out-of-the-box with the base
ELM software. To set up ELM for ordinances, first create a conda environment.
We have had some issues using Python 3.9 and recommend using Python 3.11. Then,
_before installing ELM_, run the poppler installation:

$ conda install -c conda-forge poppler

Then, install `pdftotext`:

$ pip install pdftotext

(OPTIONAL) If you want to have access to Optical Character Recognition (OCR) for PDF parsing, you should also install pytesseract during this step:
(OPTIONAL) If you want to have access to Optical Character Recognition (OCR) for PDF parsing, you should also install pytesseract during this step. Note that there may be additional OS-specific installation steps to get tesseract working properly (see the [pytesseract install instructions](https://pypi.org/project/pytesseract/))

$ pip install pytesseract pdf2image

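
A quick way to confirm that the dependencies above are wired up correctly is a short Python check. This is only a hedged sketch (not part of the README), assuming the packages were installed as shown:

    # Sanity-check sketch for the ordinance dependencies described above.
    import pdftotext  # import fails if poppler/pdftotext were not installed correctly

    try:
        import pdf2image
        import pytesseract
        # pytesseract raises if the tesseract binary itself is missing
        print("OCR extras available, tesseract:", pytesseract.get_tesseract_version())
    except Exception as err:
        print("OCR extras not available (optional):", err)
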
6 changes: 3 additions & 3 deletions elm/ords/extraction/apply.py
@@ -277,7 +277,7 @@ async def extract_ordinance_values(doc, **kwargs):
A document known to contain ordinance text. This means it must
contain an ``"cleaned_ordinance_text"`` key in the metadata. You
can run
:func:`~elm.ords.extraction.apply.extract_ordinance_text`
:func:`~elm.ords.extraction.apply.extract_ordinance_text_with_llm`
to have this attribute populated automatically for documents
that are found to contain ordinance data. Note that if the
document's metadata does not contain the
@@ -297,8 +297,8 @@ async def extract_ordinance_values(doc, **kwargs):
if not doc.metadata.get("cleaned_ordinance_text"):
msg = (
"Input document has no 'cleaned_ordinance_text' key or string "
"does not contain info. Please run `extract_ordinance_text` "
"prior to calling this method."
"does not contain info. Please run "
"`extract_ordinance_text_with_llm` prior to calling this method."
)
logger.warning(msg)
warn(msg, UserWarning)
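
The docstring above implies a two-step call order. A minimal sketch of that order follows; the exact arguments to extract_ordinance_text_with_llm are assumptions, not the documented signature:

    from elm.ords.extraction.apply import (
        extract_ordinance_text_with_llm,
        extract_ordinance_values,
    )

    async def _extract(doc, text_splitter, llm_caller):
        # First populate doc.metadata["cleaned_ordinance_text"] ...
        doc = await extract_ordinance_text_with_llm(doc, text_splitter, llm_caller)
        # ... then parse structured ordinance values from the cleaned text.
        return await extract_ordinance_values(doc)
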
6 changes: 6 additions & 0 deletions elm/ords/extraction/ordinance.py
@@ -173,6 +173,8 @@ async def parse(self, min_chunks_to_process=3):
logger.debug("Text at ind %d is not legal text", ind)
continue

logger.debug("Text at ind %d is legal text", ind)

contains_ord_info = await self.parse_from_ind(
ind, self.CONTAINS_ORD_PROMPT, key="contains_ord_info"
)
@@ -182,6 +184,8 @@
)
continue

logger.debug("Text at ind %d does contain ordinance info", ind)

is_utility_scale = await self.parse_from_ind(
ind, self.IS_UTILITY_SCALE_PROMPT, key="x"
)
@@ -191,6 +195,8 @@
)
continue

logger.debug("Text at ind %d is for utility-scale WECS", ind)

self._ordinance_chunks.append({"text": text, "ind": ind})
logger.debug("Added text at ind %d to ordinances", ind)
# mask, since we got a good result
13 changes: 7 additions & 6 deletions elm/ords/process.py
@@ -269,7 +269,7 @@ async def _process_with_logs(
):
"""Process counties with logging enabled."""
counties = _load_counties_to_process(county_fp)
azure_api_key, azure_version, azure_endpoint = _validate_api_params(
azure_api_key, azure_version, azure_endpoint = validate_api_params(
azure_api_key, azure_version, azure_endpoint
)

@@ -318,7 +318,7 @@ async def _process_with_logs(
)
trackers.append(usage_tracker)
task = asyncio.create_task(
download_docs_for_county_with_logging(
process_county_with_logging(
log_listener,
log_dir,
location,
@@ -379,7 +379,8 @@ def _load_counties_to_process(county_fp):
return load_counties_from_fp(county_fp)


def _validate_api_params(azure_api_key, azure_version, azure_endpoint):
def validate_api_params(azure_api_key=None, azure_version=None,
azure_endpoint=None):
"""Validate OpenAI API parameters."""
azure_api_key = azure_api_key or os.environ.get("AZURE_OPENAI_API_KEY")
azure_version = azure_version or os.environ.get("AZURE_OPENAI_VERSION")
@@ -404,7 +405,7 @@ def _configure_file_loader_kwargs(file_loader_kwargs):
return file_loader_kwargs


async def download_docs_for_county_with_logging(
async def process_county_with_logging(
listener,
log_dir,
county,
@@ -461,7 +462,7 @@ async def download_docs_for_county_with_logging(
listener, log_dir, location=county.full_name, level=level
):
task = asyncio.create_task(
download_doc_for_county(
process_county(
county,
text_splitter,
num_urls=num_urls,
@@ -485,7 +486,7 @@ async def download_docs_for_county_with_logging(
return doc


async def download_doc_for_county(
async def process_county(
county,
text_splitter,
num_urls=5,
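
Since validate_api_params is now public with optional arguments, it can be called with no arguments and rely on environment variables. A hedged sketch of that use is below; the AZURE_OPENAI_ENDPOINT variable name is an assumption based on the keys named elsewhere in this repository:

    import os
    from elm.ords.process import validate_api_params

    os.environ["AZURE_OPENAI_API_KEY"] = "<your key>"       # placeholder values
    os.environ["AZURE_OPENAI_VERSION"] = "<api version>"
    os.environ["AZURE_OPENAI_ENDPOINT"] = "<endpoint url>"  # assumed variable name

    # All three arguments are optional and fall back to the environment variables.
    azure_api_key, azure_version, azure_endpoint = validate_api_params()
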
34 changes: 34 additions & 0 deletions elm/ords/services/provider.py
@@ -150,3 +150,37 @@ async def __aexit__(self, exc_type, exc, tb):
for service in self.services:
logger.debug("Tearing down Service: %s", service.name)
tear_down_service_queue(service.name)

@classmethod
def run(cls, services, coroutine):
"""Run an async function that relies on services.
You can treat this function like the ``asyncio.run`` function
with an extra parameter::
openai_service = OpenAIService(...)
RunningAsyncServices.run(
[openai_service], my_async_func(*args, **kwargs)
)
Parameters
----------
services : iterable of :class:`elm.ords.services.base.Service`
An iterable (i.e. a list) of Services that are needed to run
the asynchronous function.
coroutine : coroutine
A coroutine that should be run with the services.
Returns
-------
Any
Returns the output of the coroutine.
"""
return asyncio.run(cls._run_coroutine(services, coroutine))

@classmethod
async def _run_coroutine(cls, services, coroutine):
"""Run a coroutine under services. """
async with cls(services):
return await coroutine
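
For context, the new classmethod is a synchronous wrapper around the class's existing async-context-manager behavior. A minimal sketch of the equivalent manual form (service construction is an assumption, as in the docstring example above):

    import asyncio
    from elm.ords.services.provider import RunningAsyncServices

    async def _main(services, coroutine):
        # Equivalent to what RunningAsyncServices.run(services, coroutine) wraps
        async with RunningAsyncServices(services):
            return await coroutine

    # result = asyncio.run(_main([openai_service], my_async_func()))
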
1 change: 0 additions & 1 deletion elm/ords/validation/content.py
@@ -125,7 +125,6 @@ async def parse_from_ind(self, ind, prompt, key):
logger.debug("Mem at ind %d is %s", step, mem)
check = mem.get(key)
if check is None:
# logger.debug("text=%s", text)
content = await self.slc.call(
sys_msg=prompt.format(key=key),
content=text,
28 changes: 17 additions & 11 deletions elm/pdf.py
@@ -336,15 +336,19 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp, separator=' '):
"""Function to convert contents of pdf document to txt file.
def convert_to_txt(self, txt_fp=None, separator=' ',
clean_header_kwargs=None):
"""Function to convert contents of pdf document to txt file using
poppler.
Parameters
----------
txt_fp: str
Directory for output txt file.
separator : str
txt_fp: str, optional
Optional Directory for output txt file.
separator : str, optional
Heuristic split string to look for spaces between columns
clean_header_kwargs : dict, optional
Optional kwargs to override clean_headers kwargs
Returns
-------
@@ -354,11 +358,13 @@ def convert_to_txt(self, txt_fp, separator=' '):
text = self.clean_poppler(layout=True)
if is_multi_col(text, separator=separator):
text = self.clean_poppler(layout=False)
text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
split_on='\n',
iheaders=[0, 1, 3, -3, -2, -1])
with open(txt_fp, 'w') as f:
f.write(text)
logger.info(f'Saved: {txt_fp}')

clean_header_kwargs = clean_header_kwargs or {}
text = self.clean_headers(**clean_header_kwargs)

if txt_fp is not None:
with open(txt_fp, 'w') as f:
f.write(text)
logger.info(f'Saved: {txt_fp}')

return text
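
A hedged usage sketch of the updated method; PDFtoTXT is the elm.pdf class that defines convert_to_txt, and the constructor call here is an assumption rather than part of this diff:

    from elm.pdf import PDFtoTXT

    pdf = PDFtoTXT("ordinance.pdf")

    # txt_fp is now optional: the cleaned text is always returned, and the
    # clean_headers() defaults can be overridden via clean_header_kwargs.
    text = pdf.convert_to_txt(
        clean_header_kwargs={"char_thresh": 0.6, "page_thresh": 0.8}
    )
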
2 changes: 1 addition & 1 deletion elm/version.py
@@ -2,4 +2,4 @@
ELM version number
"""

__version__ = "0.0.4"
__version__ = "0.0.5"
41 changes: 41 additions & 0 deletions elm/web/document.py
@@ -3,19 +3,25 @@
from abc import ABC, abstractmethod
from copy import deepcopy
from functools import cached_property
import logging

from elm.utilities.parse import (
combine_pages,
clean_headers,
html_to_text,
remove_blank_pages,
format_html_tables,
read_pdf,
read_pdf_ocr,
replace_common_pdf_conversion_chars,
replace_multi_dot_lines,
remove_empty_lines_or_page_footers,
)


logger = logging.getLogger(__name__)


class BaseDocument(ABC):
"""Base ELM web document representation."""

@@ -173,6 +179,41 @@ def _raw_pages(self):
raw_pages += [page for page in self.pages[self._last_page_index:]]
return raw_pages

@classmethod
def from_file(cls, fp, **init_kwargs):
"""Initialize a PDFDocument object from a .pdf file on disk. This
method will try to use pdftotext (a poppler utility) and then
OCR with pytesseract.
Parameters
----------
fp : str
filepath to .pdf on disk
init_kwargs : dict
Optional kwargs for PDFDocument Initialization
Returns
-------
out : PDFDocument
Initialized PDFDocument class from input fp
"""

with open(fp, 'rb') as f:
pages = read_pdf(f.read())

if all(len(page) < 10 for page in pages):
# fallback to OCR with pytesseract if no pages have more than 10
# chars. Typical scanned document only has weird ascii per page.
with open(fp, 'rb') as f:
pages = read_pdf_ocr(f.read())

if not any(pages):
msg = f'Could not get text from pdf: {fp}'
logger.error(msg)
raise RuntimeError(msg)

return cls(pages, **init_kwargs)


class HTMLDocument(BaseDocument):
"""ELM web HTML document"""
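
A minimal usage sketch of the new classmethod (the print statement is purely illustrative):

    from elm.web.document import PDFDocument

    doc = PDFDocument.from_file("ordinance.pdf")
    print(f"Parsed {len(doc.pages)} pages")  # pages is the list passed to __init__
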
38 changes: 25 additions & 13 deletions examples/ordinance_gpt/README.rst
@@ -2,17 +2,29 @@
Ordinance GPT
*************

This example folder contains supporting documents, results, and code for the
Ordinance GPT experiment.
This example folder contains supporting documents, results, and code for the Ordinance GPT experiment.

Prerequisites
=============
We recommend installing the pytesseract module to allow PDF retrieval for scanned documents.
See the `ordinance-specific installation instructions <https://github.com/NREL/elm/blob/main/elm/ords/README.md>`_
for more details.

Setup
=====
Running from Python
===================
This instruction set presents a simplified example to extract ordinance data from an ordinance document on disk. This corresponds with the ordinance data extraction from PDF results in `Buster et al., 2024 <https://doi.org/10.48550/arXiv.2403.12924>`_.

To run this, first download one or more ordinance documents from `the Box folder <https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq>`_.

After downloading the ordinance document(s), set the relevant path for the ``fp_pdf`` variable, and then run the script:

.. code-block:: bash

    $ python parse_pdf.py
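
The core of what such a script does can be sketched roughly as follows; this is only an illustration (the actual ``parse_pdf.py`` may differ), built on the ``PDFDocument.from_file`` method added in this pull request:

.. code-block:: python

    from elm.web.document import PDFDocument

    fp_pdf = "/path/to/county_ordinance.pdf"  # point this at your downloaded file
    doc = PDFDocument.from_file(fp_pdf)
    # ... the script then runs the LLM-based extraction routines in
    # elm/ords/extraction against doc to pull out structured ordinance values.
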
Running from the Command Line Utility
=====================================
This instruction set describes an experimental process that uses LLMs to search the internet for relevant ordinance documents, download those documents, and then extract the relevant ordinance data.

There are a few key things you need to set up in order to run ordinance retrieval and extraction.
First, you must specify which counties you want to process. You can do this by setting up a CSV file
with a ``County`` and a ``State`` column. Each row in the CSV file then represents a single county to process.
@@ -21,7 +33,7 @@ file for reference.

Once you have set up the county CSV, you can fill out the
`template JSON config <https://github.com/NREL/elm/blob/main/examples/ordinance_gpt/config.json>`_.
See the documentation for the `"process_counties_with_openai" function <https://github.com/NREL/elm/blob/42e9ed69dce8d818bb4fb02b3e041dda370a539f/elm/ords/process.py#L78>`_
See the documentation for the `"process_counties_with_openai" function <https://github.com/NREL/elm/blob/main/elm/ords/process.py#L78>`_
for an explanation of all the allowed inputs to the configuration file.
Some notable inputs here are the ``azure*`` keys, which should be configured to match your Azure OpenAI API
deployment (unless it's defined in your environment with the ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_VERSION``,
@@ -32,7 +44,7 @@ the ``llm_service_rate_limit`` to match your deployment's API tokens-per-minute
paths to all files/directories unless you are executing the program from your working folder.
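
As a concrete starting point, a hypothetical minimal configuration could look like the sketch below; only keys named above or taken from the function parameters in this pull request are shown, and the real template contains more options:

.. code-block:: python

    import json

    config = {
        "county_fp": "./counties.csv",       # CSV with County and State columns
        "azure_api_key": "<your key>",       # or set AZURE_OPENAI_API_KEY
        "azure_version": "<api version>",    # or set AZURE_OPENAI_VERSION
        "azure_endpoint": "<endpoint url>",  # or set the endpoint env variable
        "llm_service_rate_limit": 100000,    # tokens per minute for your deployment
    }

    with open("config.json", "w") as f:
        json.dump(config, f, indent=4)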

Execution
=========
---------
Once you are happy with the configuration parameters, you can kick off the processing using

.. code-block:: bash
@@ -44,18 +56,18 @@

.. WARNING:: Running all of the 85 counties given in the sample county CSV file can cost $700-$1000 in API calls. We recommend running a smaller subset for example purposes.

Debugging
---------
Not sure why things aren't working? No error messages? Make sure you run the CLI call with a ``-v`` flag for "verbose" logging (e.g., ``$ elm ords -c config.json -v``)

Errors on import statements? Trouble importing ``pdftotext`` with cryptic error messages like ``symbol not found in flat namespace``? Follow the `ordinance-specific install instructions <https://github.com/NREL/elm/blob/main/elm/ords/README.md>`_ *exactly*.

Source Ordinance Documents
==========================

The ordinance documents retrieved using (an older version of) this example code can be downloaded `here
<https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq>`_.

Debugging
=========
Not sure why things aren't working? No error messages? Make sure you run the CLI call with a ``-v`` flag for "verbose" logging (e.g., ``$ elm ords -c config.json -v``)

Errors on import statements? Trouble importing ``pdftotext`` with cryptic error messages like ``symbol not found in flat namespace``? Follow the `ordinance-specific install instructions <https://github.com/NREL/elm/blob/main/elm/ords/README.md>`_ *exactly*.

Extension to Other Technologies
===============================
Extending this functionality to other technologies is possible but requires deeper understanding of the underlying processes.
@@ -64,5 +76,5 @@ as well as how they are applied in `parse.py <https://github.com/NREL/elm/blob/m
have a firm understanding of these two modules, look through the
`document validation routines <https://github.com/NREL/elm/blob/main/elm/ords/validation>`_ to get a better sense of how to
adjust the web-scraping portion of the code to your technology. When you have set up the validation and parsing for your
technology, put it all together by adjusting the `"process_counties_with_openai" function <https://github.com/NREL/elm/blob/42e9ed69dce8d818bb4fb02b3e041dda370a539f/elm/ords/process.py#L78>`_
technology, put it all together by adjusting the `"process_counties_with_openai" function <https://github.com/NREL/elm/blob/main/elm/ords/process.py#L78>`_
to call your new routines.