updated txt layer correctness classifier (#334)
Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
alexander1999-hub and Alexander Golodkov authored Sep 26, 2023
1 parent db202e7 commit e089b4b
Showing 4 changed files with 26 additions and 20 deletions.
2 changes: 1 addition & 1 deletion dedoc/download_models.py
@@ -11,7 +11,7 @@
Keys are the names of repositories with models.
"""
model_hash_dict = dict(
txtlayer_classifier="93b10fea2b661d7eca79381b47e5c4ebe2a22e75",
txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35",
scan_orientation_efficient_net_b0="0160965f8a920d12afacf62b8a5a8a3b365b11ef",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
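
The updated value pins a new revision of the txtlayer_classifier model artifact by commit hash. A minimal sketch of how such a pinned revision could be fetched, assuming the model repositories are published on the Hugging Face Hub under a `dedoc` namespace; the repository id and file name below are assumptions for illustration, not dedoc's actual download code:

```python
# Illustrative only: the repo namespace and file name are assumptions,
# not taken from this commit.
from huggingface_hub import hf_hub_download

# Hash from model_hash_dict, used to pin an exact model revision.
TXTLAYER_CLASSIFIER_REVISION = "94e27e184fa2876883d260e0aa58b042e6ab3e35"

model_path = hf_hub_download(
    repo_id="dedoc/txtlayer_classifier",    # assumed repository name
    filename="txtlayer_classifier.pkl.gz",  # hypothetical file name
    revision=TXTLAYER_CLASSIFIER_REVISION,  # pin the exact model version
)
print(model_path)  # local path to the version-pinned artifact
```
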
@@ -8,17 +8,15 @@
class TxtlayerFeatureExtractor:

def __init__(self) -> None:
eng = "".join(list(map(chr, range(ord("a"), ord("z") + 1))))
rus = "".join([chr(i) for i in range(ord("а"), ord("а") + 32)] + ["ё"])

self.lower_letters = eng + rus
self.eng = "".join(list(map(chr, range(ord("a"), ord("z") + 1))))
self.rus = "".join([chr(i) for i in range(ord("а"), ord("а") + 32)] + ["ё"])
self.lower_letters = self.eng + self.rus
self.upper_letters = self.lower_letters.upper()
self.letters = self.upper_letters + self.lower_letters
self.digits = "".join([str(i) for i in range(10)])
self.special_symbols = "<>~!@#$%^&*_+-/\"|?.,:;'`= "
self.brackets = "{}[]()"
self.symbols = self.letters + self.digits + self.brackets + self.special_symbols
self.consonants = "".join(i for i in self.lower_letters if i not in "аоуыэяёюиеaeiouy")

self.prohibited_symbols = {s: i for i, s in enumerate("[]<")}

@@ -30,13 +28,15 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
num_digits = self.__count_symbols(text, self.digits)
num_special_symbols = self.__count_symbols(text, self.special_symbols)
num_brackets = self.__count_symbols(text, self.brackets)
num_consonants = self.__count_symbols(text.lower(), self.consonants)
num_rus = self.__count_symbols(text, self.rus + self.rus.upper())
num_eng = self.__count_symbols(text, self.eng + self.eng.upper())

features["letters_proportion"].append(num_letters / len(text))
features["digits_proportion"].append(num_digits / len(text))
features["special_symbols_proportion"].append(num_special_symbols / len(text))
features["brackets_proportion"].append(num_brackets / len(text))
features["consonants_proportion"].append(num_consonants / num_letters if num_letters != 0 else 0.0)
features["rus_proportion"].append(num_rus / len(text))
features["eng_proportion"].append(num_eng / len(text))

for symbol in self.letters + self.digits:
n = num_letters + num_digits
@@ -59,6 +59,15 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
features["letter_changes"].append(letter_changes / len(text))

features["mean_word_length"].append(np.mean([len(word) for word in text.split()]))
features["median_word_length"].append(np.median([len(word) for word in text.split()]))

all_characters_ord = [ord(character) for character in text]
trash_chars = sum(1 for s in all_characters_ord if s <= 32 or 160 <= s <= 879)
features["trash_chars_proportion"].append(trash_chars / len(text))
features["trash_chars_number"].append(trash_chars)
features["std_char_ord"].append(np.std(all_characters_ord))
features["mean_char_ord"].append(np.mean(all_characters_ord))
features["median_char_ord"].append(np.median(all_characters_ord))
features = pd.DataFrame(features)
return features[sorted(features.columns)].astype(float)

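For context, a minimal standalone sketch of the new character-level statistics that the updated transform computes alongside the Russian/English letter proportions. The helper name `char_stat_features` is made up for illustration and is not part of dedoc's API; it mirrors the logic in the diff above but as a self-contained function:

```python
# Illustrative sketch of the newly added features, not dedoc's actual interface.
import numpy as np

ENG = "".join(chr(i) for i in range(ord("a"), ord("z") + 1))
RUS = "".join(chr(i) for i in range(ord("а"), ord("а") + 32)) + "ё"

def char_stat_features(text: str) -> dict:
    codes = [ord(c) for c in text]
    # "Trash" characters: code points <= 32 (controls and space) and the
    # 160-879 range (non-breaking space, combining marks, etc.), which are
    # typical of broken PDF text layers.
    trash = sum(1 for c in codes if c <= 32 or 160 <= c <= 879)
    words = text.split()
    return {
        "rus_proportion": sum(c in RUS + RUS.upper() for c in text) / len(text),
        "eng_proportion": sum(c in ENG + ENG.upper() for c in text) / len(text),
        "trash_chars_number": trash,
        "trash_chars_proportion": trash / len(text),
        "mean_char_ord": float(np.mean(codes)),
        "median_char_ord": float(np.median(codes)),
        "std_char_ord": float(np.std(codes)),
        "median_word_length": float(np.median([len(w) for w in words])),
    }

print(char_stat_features("Пример text with a\u00a0broken\u0002layer"))
```
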
21 changes: 9 additions & 12 deletions resources/benchmarks/benchmarks_tl_correctness.json
@@ -1,24 +1,21 @@
{
"version": "0.9.2",
"version": "0.11.2",
"guessing_the_correctness_of_the_text": {
"percentage_of_guessed_correct_tl": 0.9699570815450643,
"percentage_of_guessed_correct_tl": 0.9785407725321889,
"list_of_file_with_incorrect_tl": [
"access-the-vision-for-2013.pdf",
"hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf",
"afcea-spy.pdf",
"b96a__usmc-combat-camera-directory.pdf",
"HBG-JMP-CIM_1616126784_120.pdf",
"demystifying-nge-rock-ridge_1643518222_537.pdf",
"nifog-2009_1616541890_542.pdf",
"hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf"
"access-the-vision-for-2013.pdf",
"demystifying-nge-rock-ridge_1643518222_537.pdf"
],
"percentage_of_guessed_incorrect_tl": 0.75,
"percentage_of_guessed_incorrect_tl": 0.7916666666666666,
"list_of_file_with_correct_tl": [
"slides.pdf",
"ЧММФ_Абакумов_учебник.pdf",
"PE20_1616439522_1.pdf",
"slides.pdf",
"PE157_1616278053_181.pdf",
"EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf",
"cu-spy-holes_1616346633_620.pdf",
"PE157_1616278053_181.pdf"
"ЧММФ_Абакумов_учебник.pdf"
]
}
}
Binary file not shown.
