Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated txt layer correctness classifier features #334

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
2 changes: 1 addition & 1 deletion dedoc/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
Keys are the names of repositories with models.
"""
model_hash_dict = dict(
txtlayer_classifier="93b10fea2b661d7eca79381b47e5c4ebe2a22e75",
txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35",
scan_orientation_efficient_net_b0="0160965f8a920d12afacf62b8a5a8a3b365b11ef",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,15 @@
class TxtlayerFeatureExtractor:

def __init__(self) -> None:
    """Precompute the character classes used by the feature extractor.

    Builds alphabets (English and Russian, lower and upper case), digit,
    punctuation and bracket sets as instance attributes; ``transform``
    counts symbol occurrences against these sets to produce per-text
    features.
    """
    # English and Russian lowercase alphabets are kept as separate
    # attributes so that language-specific proportions (rus/eng shares)
    # can be computed from them later.
    self.eng = "".join(map(chr, range(ord("a"), ord("z") + 1)))
    # Russian lowercase: 32 consecutive code points starting at "а",
    # plus "ё", which lies outside that contiguous range.
    self.rus = "".join([chr(i) for i in range(ord("а"), ord("а") + 32)] + ["ё"])
    self.lower_letters = self.eng + self.rus
    self.upper_letters = self.lower_letters.upper()
    self.letters = self.upper_letters + self.lower_letters
    self.digits = "".join(str(i) for i in range(10))
    self.special_symbols = "<>~!@#$%^&*_+-/\"|?.,:;'`= "
    self.brackets = "{}[]()"
    self.symbols = self.letters + self.digits + self.brackets + self.special_symbols
    # Consonants of both alphabets: every lowercase letter that is not a
    # Russian or English vowel.
    self.consonants = "".join(i for i in self.lower_letters if i not in "аоуыэяёюиеaeiouy")

    # Maps each prohibited symbol to its index; used elsewhere to flag
    # characters that indicate a broken text layer.
    self.prohibited_symbols = {s: i for i, s in enumerate("[]<")}
Expand All @@ -30,13 +28,15 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
num_digits = self.__count_symbols(text, self.digits)
num_special_symbols = self.__count_symbols(text, self.special_symbols)
num_brackets = self.__count_symbols(text, self.brackets)
num_consonants = self.__count_symbols(text.lower(), self.consonants)
num_rus = self.__count_symbols(text, self.rus + self.rus.upper())
num_eng = self.__count_symbols(text, self.eng + self.eng.upper())

features["letters_proportion"].append(num_letters / len(text))
features["digits_proportion"].append(num_digits / len(text))
features["special_symbols_proportion"].append(num_special_symbols / len(text))
features["brackets_proportion"].append(num_brackets / len(text))
features["consonants_proportion"].append(num_consonants / num_letters if num_letters != 0 else 0.0)
features["rus_proportion"].append(num_rus / len(text))
features["eng_proportion"].append(num_eng / len(text))

for symbol in self.letters + self.digits:
n = num_letters + num_digits
Expand All @@ -59,6 +59,15 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
features["letter_changes"].append(letter_changes / len(text))

features["mean_word_length"].append(np.mean([len(word) for word in text.split()]))
features["median_word_length"].append(np.median([len(word) for word in text.split()]))

all_characters_ord = [ord(character) for character in text]
trash_chars = sum(1 for s in all_characters_ord if s <= 32 or 160 <= s <= 879)
features["trash_chars_proportion"].append(trash_chars / len(text))
features["trash_chars_number"].append(trash_chars)
features["std_char_ord"].append(np.std(all_characters_ord))
features["mean_char_ord"].append(np.mean(all_characters_ord))
features["median_char_ord"].append(np.median(all_characters_ord))
features = pd.DataFrame(features)
return features[sorted(features.columns)].astype(float)

Expand Down
21 changes: 9 additions & 12 deletions resources/benchmarks/benchmarks_tl_correctness.json
Original file line number Diff line number Diff line change
@@ -1,24 +1,21 @@
{
"version": "0.9.2",
"version": "0.11.2",
"guessing_the_correctness_of_the_text": {
"percentage_of_guessed_correct_tl": 0.9699570815450643,
"percentage_of_guessed_correct_tl": 0.9785407725321889,
"list_of_file_with_incorrect_tl": [
"access-the-vision-for-2013.pdf",
"hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf",
"afcea-spy.pdf",
"b96a__usmc-combat-camera-directory.pdf",
"HBG-JMP-CIM_1616126784_120.pdf",
"demystifying-nge-rock-ridge_1643518222_537.pdf",
"nifog-2009_1616541890_542.pdf",
"hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf"
"access-the-vision-for-2013.pdf",
"demystifying-nge-rock-ridge_1643518222_537.pdf"
],
"percentage_of_guessed_incorrect_tl": 0.75,
"percentage_of_guessed_incorrect_tl": 0.7916666666666666,
"list_of_file_with_correct_tl": [
"slides.pdf",
"ЧММФ_Абакумов_учебник.pdf",
"PE20_1616439522_1.pdf",
"slides.pdf",
"PE157_1616278053_181.pdf",
"EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf",
"cu-spy-holes_1616346633_620.pdf",
"PE157_1616278053_181.pdf"
"ЧММФ_Абакумов_учебник.pdf"
]
}
}
Binary file not shown.
Loading