TLDR-748 finished tutorial and add links checking for docs

ispras · Aug 28, 2024 · 2b5e47c · 2b5e47c
1 parent 7229a31
commit 2b5e47c
Show file tree

Hide file tree

Showing 9 changed files with 394 additions and 24 deletions.
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -11,7 +11,7 @@ class PdfAutoReader(BaseReader):
 
     :class:`~dedoc.readers.PdfAutoReader` is used for automatic detection of a correct textual layer in the given PDF file:
 
-    * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
+    * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtlayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
     for document content extraction;
 
     * if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction.

diff --git a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py
@@ -84,7 +84,7 @@ def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractP
 
 
 print("\n\nDocument lines\n")
-for document_line in pdf_document.lines[:50]:
+for document_line in pdf_document.lines[:10]:
     print(document_line, document_line.annotations)
 
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -31,6 +31,24 @@
 exclude_patterns = []
 highlight_language = "python3"
 
+# -- Options for the nitpicky mode -------------------------------------------
+
+nitpicky = True
+nitpick_ignore = [
+    ("py:class", "abc.ABC"),
+    ("py:class", "pydantic.main.BaseModel"),
+    ("py:class", "scipy.stats._multivariate.dirichlet_multinomial_gen.cov"),
+    ("py:class", "pandas.core.series.Series"),
+    ("py:class", "numpy.ndarray"),
+    ("py:class", "pandas.core.frame.DataFrame"),
+    ("py:class", "dedoc.structure_extractors.feature_extractors.toc_feature_extractor.TocItem"),
+    ("py:class", "logging.Logger"),
+    ("py:class", "train_dataset.data_structures.line_with_label.LineWithLabel"),
+    ("py:class", "xgboost.sklearn.XGBClassifier"),
+    ("py:class", "collections.Counter"),
+
+]
+
 # -- Options for HTML output -------------------------------------------------
 
 html_theme = "sphinx_rtd_theme"

diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst
@@ -25,7 +25,7 @@ For this purpose one can use :class:`~dedoc.converters.DocxConverter` class:
     :language: python
     :lines: 10
 
-Method :meth:`~dedoc.converters.DocxConverter.can_convert` allows to check if the converter can convert the given file:
+Method :meth:`~dedoc.converters.AbstractConverter.can_convert` allows one to check if the converter can convert the given file:
 
 .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py
     :language: python
@@ -70,7 +70,7 @@ one can use :class:`~dedoc.readers.DocxReader` class:
     :language: python
     :lines: 17
 
-Method :meth:`~dedoc.readers.DocxReader.can_read` allows to check if the reader can parse the given file:
+Method :meth:`~dedoc.readers.BaseReader.can_read` allows one to check if the reader can parse the given file:
 
 .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py
     :language: python
@@ -196,7 +196,7 @@ we can add some metadata using :class:`~dedoc.metadata_extractors.DocxMetadataEx
     :language: python
     :lines: 64
 
-Method :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.can_extract` allows to check if
+Method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` allows one to check if
 the metadata extractor can extract metadata from the given file:
 
 .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py
@@ -228,7 +228,7 @@ For example, in the :ref:`docx_example_image` we can use :class:`~dedoc.attachme
     :language: python
     :lines: 74
 
-Method :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file:
+Method :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` allows one to check if the attachments extractor can extract attachments from the given file:
 
 .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py
     :language: python

diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
@@ -62,7 +62,7 @@ PDF and images handling
       - rus, eng, rus+eng, fra, spa
       - rus+eng
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
         * :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract`
       - Language of the document without a textual layer. The following values are available:
@@ -77,7 +77,7 @@ PDF and images handling
       - :, start:, :end, start:end
       - :
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - If you need to read a part of the PDF document, you can use page slice to define the reading range.
         If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page``
@@ -96,7 +96,7 @@ PDF and images handling
       - true, false, auto
       - auto
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used to set the number of columns if the PDF document is without a textual layer in case it's known beforehand.
         The following values are available:
@@ -111,7 +111,7 @@ PDF and images handling
       - auto, no_change
       - auto
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used to control document orientation analysis for PDF documents without a textual layer.
         The following values are available:
@@ -125,7 +125,7 @@ PDF and images handling
       - True, False
       - False
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used to **remove** headers and footers of PDF documents from the output result.
         If ``need_header_footer_analysis=False``, header and footer lines will be present in the output as well as all other document lines.
@@ -134,7 +134,7 @@ PDF and images handling
       - True, False
       - False
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used to clean background (binarize) for pages of PDF documents without a textual layer.
         If the document's background is heterogeneous, this option may help to improve the result of document text recognition.
@@ -144,7 +144,7 @@ PDF and images handling
       - True, False
       - True
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used to enable table recognition for PDF documents or images.
         The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`.
@@ -155,7 +155,7 @@ PDF and images handling
       - True, False
       - False
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used for a table recognition for PDF documents or images.
         It is ignored when ``need_pdf_table_analysis=False``.
@@ -166,7 +166,7 @@ PDF and images handling
       - 90, 270
       - 90
       - * :meth:`dedoc.DedocManager.parse`
-        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
         * :meth:`dedoc.readers.ReaderComposition.read`
       - This option is used for a table recognition for PDF documents or images.
         It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``.

diff --git a/docs/source/readers_output/line_types.rst b/docs/source/readers_output/line_types.rst
@@ -19,7 +19,7 @@ Below the readers are enlisted that can return non-empty ``tag_hierarchy_level``
    * - **Reader**
      - **header**
      - **list_item**
-     - **raw_text, unknown**
+     - **unknown**
      - **key**
 
    * - :class:`~dedoc.readers.DocxReader`

diff --git a/docs/source/tutorials/add_new_doc_format.rst b/docs/source/tutorials/add_new_doc_format.rst
@@ -164,8 +164,7 @@ You should implement the following methods:
 * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract()`: use file extension or mime to check if we could read the given file. You can learn more about extensions and mime using file ``dedoc/extensions.py``
 * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract()` : use information about file path and file name to extract attachments from the given file.
 
-The method returns the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` using
-:meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor._content2attach_file` method.
+The method returns the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` using the ``_content2attach_file`` method.
 This method is inherited from the abstract class, it makes the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` from the list of tuples:
 the name of the attached file and binary content of the file.
 

diff --git a/docs/source/tutorials/add_new_structure_type/features_extraction.rst b/docs/source/tutorials/add_new_structure_type/features_extraction.rst
@@ -29,9 +29,6 @@ Let's implement the basic methods of the parent class:
     * :meth:`~dedoc.structure_extractors.feature_extractors.abstract_extractor.AbstractFeatureExtractor.parameters` --
       we don't plan to use any parameters in the ``__init__`` method, so an empty dictionary can be returned;
 
-    * :meth:`~dedoc.structure_extractors.feature_extractors.abstract_extractor.AbstractFeatureExtractor.fit` --
-      we don't need to train our feature extractor, so the method can be empty;
-
     * :meth:`~dedoc.structure_extractors.feature_extractors.abstract_extractor.AbstractFeatureExtractor.transform` --
       here we implement a basic scheme of features extraction from each document and their concatenation.