diff --git a/dedoc/readers/article_reader/article_reader.py b/dedoc/readers/article_reader/article_reader.py index a63bd8b6..29cb0348 100644 --- a/dedoc/readers/article_reader/article_reader.py +++ b/dedoc/readers/article_reader/article_reader.py @@ -28,6 +28,9 @@ def __init__(self, config: Optional[dict] = None) -> None: else: self.grobid_url = f"http://{os.environ.get('GROBID_HOST', 'localhost')}:{os.environ.get('GROBID_PORT', '8070')}" self.url = f"{self.grobid_url}/api/processFulltextDocument" + + auth_key = os.environ.get("GROBID_AUTH_KEY", "") + self.request_headers = {"Authorization": auth_key} if auth_key else {} self.grobid_is_alive = False def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: @@ -48,7 +51,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure with open(file_path, "rb") as file: files = {"input": file} try: - response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"}) + response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"}, headers=self.request_headers) if response.status_code != 200: warning = f"GROBID returns code {response.status_code}." self.logger.warning(warning) @@ -106,7 +109,7 @@ def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None: attempt = max_attempts while attempt > 0: try: - response = requests.get(f"{grobid_url}/api/isalive") + response = requests.get(f"{grobid_url}/api/isalive", headers=self.request_headers) if response.status_code == 200: self.logger.info(f"GROBID up on {grobid_url}.") self.grobid_is_alive = True diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 0b34ce72..17816b30 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -2,7 +2,6 @@ from collections import namedtuple from typing import Dict, Iterator, List, Optional, Set, Tuple -import numpy as np from dedocutils.data_structures.bbox import BBox from numpy import ndarray @@ -13,7 +12,6 @@ from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ "orient_analysis_cells", @@ -45,6 +43,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.metadata_extractor import LineMetadataExtractor from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor + from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor @@ -153,8 +152,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( metadata["rotated_page_angles"] = page_angles return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata - def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \ - Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]: + def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \ + Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]]: from joblib import Parallel, delayed from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader @@ -170,7 +169,7 @@ def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_ return result, gost_analyzed_images def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment], - gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None: + gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None: # shift unref_tables for scan_table in unref_tables: for location in scan_table.locations: