Skip to content

Commit

Permalink
TLDR-473 add dedoc utils (#340)
Browse files: browse the repository at this point in the history
* use dedoc utils BBox class

* use AdaptiveBinarizer from dedoc-utils

* use SkewCorrector from dedoc-utils

* fix style

* fix rotated angle error

* delete BBox from docs

* fix angles

* delete print

* fix dedocutils

* set dedocutils version to 0.3.5

* fix mistakes and names

---------

Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru>
  • Loading branch information
Travvy88 and Nikita Shevtsov authored Oct 2, 2023
1 parent 840bed2 commit 896a31a
Show file tree
Hide file tree
Showing 44 changed files with 68 additions and 367 deletions.
3 changes: 1 addition & 2 deletions dedoc/data_structures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import dedoc.data_structures.concrete_annotations as annotations
from .annotation import Annotation
from .attached_file import AttachedFile
from .bbox import BBox
from .cell_with_meta import CellWithMeta
from .concrete_annotations import *
from .document_content import DocumentContent
Expand All @@ -17,5 +16,5 @@
from .tree_node import TreeNode
from .unstructured_document import UnstructuredDocument

__all__ = ['Annotation', 'AttachedFile', 'BBox', 'DocumentContent', 'DocumentMetadata', 'HierarchyLevel', 'LineMetadata',
__all__ = ['Annotation', 'AttachedFile', 'DocumentContent', 'DocumentMetadata', 'HierarchyLevel', 'LineMetadata',
'LineWithMeta', 'ParsedDocument', 'Serializable', 'Table', 'TableMetadata', 'CellWithMeta', 'TreeNode', 'UnstructuredDocument', *annotations.__all__]
138 changes: 0 additions & 138 deletions dedoc/data_structures/bbox.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json

from dedocutils.data_structures import BBox
from flask_restx import Api, Model, fields

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.bbox import BBox


class BBoxAnnotation(Annotation):
Expand Down
3 changes: 2 additions & 1 deletion dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import uuid
from typing import List, Optional

from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.line_with_meta import LineWithMeta


Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import total_ordering
from typing import Any, Dict

from dedoc.data_structures.bbox import BBox
from dedocutils.data_structures import BBox


@total_ordering
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from typing import Any, List

import numpy as np
from dedocutils.data_structures import BBox

from dedoc.data_structures.bbox import BBox
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.readers.pdf_reader.data_classes.tables.location import Location

Expand Down
3 changes: 2 additions & 1 deletion dedoc/readers/pdf_reader/data_classes/tables/table_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

import cv2
import numpy as np
from dedocutils.data_structures import BBox

from dedoc.data_structures import BBox, LineWithMeta
from dedoc.data_structures import LineWithMeta
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_cell_extractor import OCRCellExtractor
from dedoc.utils.image_utils import crop_image_text

Expand Down
3 changes: 2 additions & 1 deletion dedoc/readers/pdf_reader/data_classes/text_with_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from typing import List, Optional
from uuid import uuid1

from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.serializable import Serializable


Expand Down
122 changes: 0 additions & 122 deletions dedoc/readers/pdf_reader/pdf_image_reader/adaptive_binarizer.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import List

import numpy as np
from dedocutils.data_structures import BBox

from dedoc.data_structures import BBox
from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.bold_classifier.agglomerative_clusterizer import BoldAgglomerativeClusterizer
from dedoc.readers.pdf_reader.pdf_image_reader.line_metadata_extractor.bold_classifier.valley_emphasis_binarizer import ValleyEmphasisBinarizer

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

import cv2
import numpy as np
from dedocutils.data_structures import BBox

from dedoc.data_structures import BBoxAnnotation, ConfidenceAnnotation, LineMetadata, LineWithMeta
from dedoc.data_structures.bbox import BBox
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_line_extractor import OCRLineExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_page import OcrPage
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_utils import get_text_with_bbox_from_cells
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from collections import defaultdict
from typing import List

from dedoc.data_structures.bbox import BBox
from dedocutils.data_structures import BBox

from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_paragraph import OcrParagraph
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import List

from dedocutils.data_structures import BBox

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from collections import defaultdict
from typing import List

from dedoc.data_structures.bbox import BBox
from dedocutils.data_structures import BBox

from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_line import OcrLine
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, Iterable

from dedoc.data_structures.bbox import BBox
from dedocutils.data_structures import BBox


class OcrElement:
Expand Down
Loading

0 comments on commit 896a31a

Please sign in to comment.