Skip to content

Commit

Permalink
Merge pull request #19 from NREL/sp/rhub_api
Browse files Browse the repository at this point in the history
Sp/rhub api
  • Loading branch information
spodgorny9 authored Jun 4, 2024
2 parents fa0f871 + 268dc93 commit f4ee815
Show file tree
Hide file tree
Showing 10 changed files with 823 additions and 4,753 deletions.
25 changes: 25 additions & 0 deletions elm/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,28 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
iheaders=iheaders)
self.full = combine_pages(self.pages)
return self.full

def convert_to_txt(self, txt_fp):
    """Convert the contents of the pdf document to a txt file.

    The text is extracted with ``clean_poppler`` (re-run without layout
    preservation if ``is_double_col`` reports a double-column document),
    passed through ``clean_headers`` and then written to ``txt_fp``.

    Parameters
    ----------
    txt_fp : str
        File path of the output txt file. The file is opened in write
        mode, so an existing file at this path is overwritten.

    Returns
    -------
    text : str
        Text string containing contents from pdf (the same string that
        is written to ``txt_fp``).
    """
    text = self.clean_poppler(layout=True)
    if self.is_double_col():
        # Double-column documents are re-extracted with layout=False.
        text = self.clean_poppler(layout=False)

    # NOTE(review): the return value of clean_poppler is replaced here;
    # presumably clean_headers operates on page state that clean_poppler
    # updated on the instance -- confirm against those methods.
    text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
                              split_on='\n',
                              iheaders=[0, 1, 3, -3, -2, -1])

    # Write UTF-8 explicitly: without an encoding argument open() falls
    # back to the platform's locale encoding, which can fail on non-ASCII
    # characters extracted from the pdf.
    with open(txt_fp, 'w', encoding='utf-8') as f:
        f.write(text)
    logger.info(f'Saved: {txt_fp}')

    return text
25 changes: 22 additions & 3 deletions elm/web/osti.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,7 @@ def __init__(self, url, n_pages=1):
self._n_pages = 0
self._iter = 0

records = self._get_first()
for page in self._get_pages(n_pages=n_pages):
records += page
records = self._get_all(n_pages)
records = [OstiRecord(single) for single in records]
super().__init__(records)

Expand Down Expand Up @@ -234,6 +232,27 @@ def _get_pages(self, n_pages):
else:
break

def _get_all(self, n_pages):
    """Collect the first page of records plus any follow-on pages.

    Parameters
    ----------
    n_pages : int
        Page count forwarded to ``_get_pages`` to control how many
        pages are retrieved.

    Returns
    -------
    all_records : list
        Records from ``_get_first`` followed by the records of every
        page yielded by ``_get_pages``, in retrieval order.
    """
    # The first request's result is used directly as the accumulator, so
    # the object returned by _get_first() is extended in place.
    all_records = self._get_first()

    remaining_pages = self._get_pages(n_pages)
    for page_records in remaining_pages:
        all_records.extend(page_records)

    return all_records

def download(self, out_dir):
"""Download all PDFs from the records in this OSTI object into a
directory. PDFs will be given file names based on their OSTI record ID
Expand Down
Loading

0 comments on commit f4ee815

Please sign in to comment.