Skip to content

Commit

Permalink
TLDR-844 add auth key to grobid (#503)
Browse files (browse the repository at this point in the history)
  • Loading branch information
NastyBoget authored Oct 31, 2024
1 parent fb0a9b8 commit 9497dba
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 7 deletions.
7 changes: 5 additions & 2 deletions dedoc/readers/article_reader/article_reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -28,6 +28,9 @@ def __init__(self, config: Optional[dict] = None) -> None:
else:
self.grobid_url = f"http://{os.environ.get('GROBID_HOST', 'localhost')}:{os.environ.get('GROBID_PORT', '8070')}"
self.url = f"{self.grobid_url}/api/processFulltextDocument"

auth_key = os.environ.get("GROBID_AUTH_KEY", "")
self.request_headers = {"Authorization": auth_key} if auth_key else {}
self.grobid_is_alive = False

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
Expand All @@ -48,7 +51,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
with open(file_path, "rb") as file:
files = {"input": file}
try:
response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"})
response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"}, headers=self.request_headers)
if response.status_code != 200:
warning = f"GROBID returns code {response.status_code}."
self.logger.warning(warning)
Expand Down Expand Up @@ -106,7 +109,7 @@ def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None:
attempt = max_attempts
while attempt > 0:
try:
response = requests.get(f"{grobid_url}/api/isalive")
response = requests.get(f"{grobid_url}/api/isalive", headers=self.request_headers)
if response.status_code == 200:
self.logger.info(f"GROBID up on {grobid_url}.")
self.grobid_is_alive = True
Expand Down
9 changes: 4 additions & 5 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,6 @@
from collections import namedtuple
from typing import Dict, Iterator, List, Optional, Set, Tuple

import numpy as np
from dedocutils.data_structures.bbox import BBox
from numpy import ndarray

Expand All @@ -13,7 +12,6 @@
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer

ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
Expand Down Expand Up @@ -45,6 +43,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti

from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker
from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor
Expand Down Expand Up @@ -153,8 +152,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
metadata["rotated_page_angles"] = page_angles
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]:
def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]]:
from joblib import Parallel, delayed
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader

Expand All @@ -170,7 +169,7 @@ def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_
return result, gost_analyzed_images

def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None:
gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
# shift unref_tables
for scan_table in unref_tables:
for location in scan_table.locations:
Expand Down

0 comments on commit 9497dba

Please sign in to comment.