Skip to content

Commit

Permalink
TLDR-405 remove is_one_column_document_list (#332)
Browse files Browse the repository at this point in the history
* TLDR-405 remove is_one_column_document_list

* TLDR-405 fix tests

* TLDR-405 review fix
  • Loading branch information
NastyBoget authored Sep 20, 2023
1 parent 79f4cb5 commit 2ff929b
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 59 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ There are two ways to install and run dedoc as a web application or a library th

## Install and run dedoc using docker

You should have [`git`] (https://git-scm.com) and [`docker`](https://www.docker.com) installed for running dedoc by this method.
You should have [`git`](https://git-scm.com) and [`docker`](https://www.docker.com) installed for running dedoc by this method.
This method is more flexible because it doesn't depend on the operating system and other user's limitations;
however, the docker application should be installed and configured properly.

Expand Down Expand Up @@ -130,7 +130,7 @@ export TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/

## Install the dedoc library via pip.

You need torch~=1.11.0 and torchvision~=0.12.0 installed. If you already have torch and torchvision in your environment:
You need `torch~=1.11.0` and `torchvision~=0.12.0` installed. If you already have torch and torchvision in your environment:

```bash
pip install dedoc
Expand All @@ -144,10 +144,10 @@ pip install "dedoc[torch]"

## Install and run dedoc from sources

If you want to run dedoc as a service from sources. it's possible to run dedoc locally.
However, it isn't suitable for any operating system (Ubuntu 20+ is recommended) and
If you want to run dedoc as a service from sources, it's possible to run dedoc locally.
However, it is not suitable for all operating systems (`Ubuntu 20+` is recommended), and
the machine may not have enough resources to run it.
You should have `python` (python3.8, python3.9 are recommended) and `pip` installed.
You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed.

### 1. Install necessary packages: according to instructions [install necessary packages](#1-Install-necessary-packages)

Expand Down Expand Up @@ -183,14 +183,14 @@ python dedoc/main.py -c ./dedoc/config.py
Now you can go to the `localhost:1231` and look at the docs and examples.

## Option: You can change the port of service:
you need to change environment DOCREADER_PORT
You need to change the `DOCREADER_PORT` environment variable.

1. For local service launching on your_port (1166 example). [Install instruction from sources](#Install-and-run-dedoc-from-sources) and launch with environment:
1. To launch the service locally on `your_port` (e.g. `1166`), install dedoc ([installation instructions](#Install-and-run-dedoc-from-sources)) and launch it with the environment variable set:
```bash
DOCREADER_PORT=1166 python dedoc/main.py -c ./dedoc/config.py
```

2. For service launching in docker-container you need to change port value in DOCREADER_PORT env and field 'ports' in docker-compose.yml file:
2. To launch the service in a docker container, change the port value in the `DOCREADER_PORT` environment variable and in the `ports` field of the `docker-compose.yml` file:
```yaml
...
dedoc:
Expand Down
6 changes: 2 additions & 4 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@
"first_page",
"last_page",
"need_binarization",
"table_type",
"is_one_column_document_list"])
"table_type"])


class PdfBaseReader(BaseReader):
Expand Down Expand Up @@ -84,8 +83,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
first_page=first_page,
last_page=last_page,
need_binarization=param_utils.get_param_need_binarization(parameters),
table_type=param_utils.get_param_table_type(parameters),
is_one_column_document_list=param_utils.get_is_one_column_document_list(parameters)
table_type=param_utils.get_param_table_type(parameters)
)

lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse)
Expand Down
39 changes: 15 additions & 24 deletions dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _process_one_page(self,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
# --- Step 1: correct orientation and detect column count ---
rotated_image, is_one_column_document = self._detect_columncount_and_orientation(image, parameters)
rotated_image, is_one_column_document = self._detect_column_count_and_orientation(image, parameters)

# --- Step 2: do binarization ---
if parameters.need_binarization:
Expand Down Expand Up @@ -102,37 +102,28 @@ def _process_one_page(self,

return lines, tables, page.attachments

def _detect_columncount_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]:
def _detect_column_count_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]:
"""
Function :
- detects the count of the column
- detects document orientation angle
- rotates document on detected angle
- updates a parameters.is_one_column_document
Return: rotated_image
- detects the number of page columns
- detects page orientation angle
- rotates the page on detected angle
Return: rotated_image and indicator if the page is one-column
"""
angle = 0 # parameters.document_orientation is False
columns = None
columns, angle = None, None

if parameters.is_one_column_document is None or parameters.document_orientation is None:
self.logger.info("Call orientation and columns classifier")
columns, angle = self.column_orientation_classifier.predict(image)
self.logger.info(f"Predicted orientation angle = {angle}, columns = {columns}")

self.logger.debug(f"Predict {angle}")
if columns is not None:
self.logger.info(f"Final number of columns: {columns}")
else:
self.logger.info("Final number of columns: not detected")

if parameters.is_one_column_document is not None:
is_one_column_document = parameters.is_one_column_document
else:
is_one_column_document = True if columns == 1 else False

self.logger.info(f"Final orientation angle: {angle}")
is_one_column_document = columns == 1 if parameters.is_one_column_document is None else parameters.is_one_column_document
angle = angle if parameters.document_orientation is None else 0
self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}")

rotated_image, _ = self.scan_rotator.auto_rotate(image, angle)
if self.config.get("debug_mode"):
self.logger.info(self.config["path_debug"])
cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg"), rotated_image)
img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
self.logger.info(f"Save image to {img_path}")
cv2.imwrite(img_path, rotated_image)

return rotated_image, is_one_column_document
10 changes: 4 additions & 6 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]:
def __extract(self, path: str, start_page: int = None, end_page: int = None) \
-> Tuple[List[LineWithMeta], List[ScanTable], List[List[List[CellPropertyInfo]]]]:
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
all_lines = []
Expand All @@ -134,7 +135,7 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) ->

return all_lines, all_tables, all_cell_properties

def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[ScanTable], List[List[List[CellPropertyInfo]]]]:
tables = []
cell_properties = []
page_number = page["number"]
Expand All @@ -154,10 +155,7 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
cell_property_row_list = []

for cell_property in cell_properties_row:
cell_property_info = CellPropertyInfo(cell_property["col_span"],
cell_property["row_span"],
bool(cell_property["invisible"]))

cell_property_info = CellPropertyInfo(cell_property["col_span"], cell_property["row_span"], bool(cell_property["invisible"]))
cell_property_row_list.append(cell_property_info)

cell_property_list.append(cell_property_row_list)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def _process_one_page(self,
parameters: ParametersForParseDoc,
page_number: int,
path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
gray_image = self._convert_to_gray(image)
if parameters.need_pdf_table_analysis:
gray_image = self._convert_to_gray(image)
cleaned_image, tables = self.table_recognizer.recognize_tables_from_image(
image=gray_image,
page_number=page_number,
Expand All @@ -57,9 +57,7 @@ def _process_one_page(self,
else:
tables = []

is_one_column_document_list = None if parameters.is_one_column_document_list is None else parameters.is_one_column_document_list[page_number]

page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, is_one_column_document=is_one_column_document_list)
page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number)
if page is None:
return [], [], []
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,23 @@ def __init__(self, *, config: dict) -> None:
self.config = config
self.logger = self.config.get("logger", logging.getLogger())

def extract_text_layer(self, path: str, page_number: int, is_one_column_document: bool) -> Optional[PageWithBBox]:
def extract_text_layer(self, path: str, page_number: int) -> Optional[PageWithBBox]:
"""
Extract text information with metadata from pdf with help pdfminer.six
:param path: path to pdf
:param page_number: number of the page to read
:return: pages_with_bbox - page with extracted text
"""
with open(path, "rb") as fp:
pages = PDFPage.get_pages(fp)
for page_num, page in enumerate(pages):
if page_num != page_number:
continue
return self.__handle_page(page=page, page_number=page_number, path=path, is_one_column_document=is_one_column_document)
return self.__handle_page(page=page, page_number=page_number, path=path)

def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_column_document: bool) -> PageWithBBox:
def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithBBox:
directory = os.path.dirname(path)
device, interpreter = self.__get_interpreter(is_one_column_document=is_one_column_document)
device, interpreter = self.__get_interpreter()
try:
interpreter.process_page(page)
except Exception as e:
Expand Down Expand Up @@ -139,12 +140,9 @@ def __get_image(path: str, page_num: int) -> np.ndarray:
image_page = cv2.cvtColor(image_page, cv2.COLOR_GRAY2BGR)
return image_page

def __get_interpreter(self, is_one_column_document: bool) -> Tuple[PDFPageAggregator, PDFPageInterpreter]:
def __get_interpreter(self) -> Tuple[PDFPageAggregator, PDFPageInterpreter]:
rsrcmgr = PDFResourceManager()
if is_one_column_document is not None and is_one_column_document:
laparams = LAParams(line_margin=3.0, line_overlap=0.1, boxes_flow=0.5, word_margin=1.5, char_margin=100.0, detect_vertical=False)
else:
laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False)
laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False) # TODO find the best parameters
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
return device, interpreter
Expand Down
4 changes: 0 additions & 4 deletions dedoc/utils/parameter_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,6 @@ def get_param_table_type(parameters: Optional[dict]) -> str:
return str(parameters.get("table_type", ""))


def get_is_one_column_document_list(parameters: Optional[dict]) -> Optional[bool]:
return None if parameters is None else parameters.get("is_one_column_document_list")


def get_param_page_slice(parameters: Dict[str, Any]) -> Tuple[Optional[int], Optional[int]]:
"""
Parse parameter pages = ["page_number:page_number" | "" | "page_number:" | ":page_number" : ":"]
Expand Down

0 comments on commit 2ff929b

Please sign in to comment.