das.py
#!/usr/bin/env python
"""A module applying semantic, morphological, and phonological criteria to explain the neuter gender assignment"""
import deepl
from langdetect import detect
from hypernyms import taxonomy
from rules import neut_category_dict, neut_classes, neut_prefixes, neut_suffixes

license_key = "#"  # replace with your own DeepL license key
deepl_translator = deepl.Translator(license_key)


def neut_rule1(hypernyms: list) -> tuple:
    """Returns two sets: the general and the granular semantic categories that the
    hypernyms generated by the input noun (across all of its synsets) share with
    the neuter noun class. To simplify the output, the granular categories are
    also expressed in terms of the general semantic categories defined in
    neut_category_dict."""
    granular_categories = []
    general_categories = []
    if hypernyms is None:  # handles the None object occasionally returned by taxonomy()
        return set(), set()  # keep the two-set shape so callers can always unpack
    for category in hypernyms:
        if category in neut_classes:
            granular_categories.append(category)
    for category in granular_categories:
        for key, value in neut_category_dict.items():
            if category in value:
                general_categories.append(key)
    return set(general_categories), set(granular_categories)
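
# Illustrative example with hypothetical data: if taxonomy() yielded
# ["metal", "chemical element"] and "metal" were listed in neut_classes and
# filed under a key such as "materials" in neut_category_dict, then
#     neut_rule1(["metal", "chemical element"])
# would return ({"materials"}, {"metal"}).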


def neut_rule2(token: str) -> list:
    """Checks the input noun for the prefixes and suffixes associated with the
    neuter noun class and returns the affixes that were found."""
    affixes = []
    suffixes = []  # suffixes may be nested, so collect every match
    prefix = ""  # prefixes are mutually exclusive, so keep at most one
    for suffix in neut_suffixes:
        if token.endswith(suffix):
            suffixes.append("-" + suffix)
    for candidate in neut_prefixes:
        if token.startswith(candidate):
            prefix = candidate + "-"
    if suffixes:
        affixes.append(max(suffixes, key=len))  # keep the longest, most specific suffix
    if prefix:
        affixes.append(prefix)
    return affixes
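
# Illustrative example: assuming "chen" appears in neut_suffixes (diminutives in
# -chen are neuter), neut_rule2("mädchen") would return ["-chen"]; a matching
# prefix from neut_prefixes would be appended as an additional element.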


def neut_evaluate(lemmatized: str, hypernyms: list, parsed_base: str) -> None:
    """Takes a noun and the hypernyms generated over all of its noun synsets and
    determines whether any of the hypernyms are affiliated with the neuter noun
    class. It then performs a simple morphological analysis by checking whether
    the noun carries the prefixes and suffixes associated with the neuter gender,
    and finally checks whether the noun looks like a foreign borrowing."""
    neut_flag = False
    print(f"The noun '{lemmatized}' is neuter.")
    # check the semantic taxonomy
    if hypernyms:  # the noun generated hypernyms of its own
        semantic_general, semantic_granular = neut_rule1(hypernyms)
        if semantic_granular:
            print(
                f"It belongs to the following predominantly neuter semantic categories: {', '.join(semantic_granular)}"
            )
            print(
                f"The above classification can be expressed in terms of the following general semantic categories: {', '.join(semantic_general)}"
            )
            neut_flag = True
        else:
            print(
                "Grammatical gender assignment could not be determined based on the semantic category alone."
            )
    elif hypernyms is None:
        print(f"Couldn't generate a semantic taxonomy for '{lemmatized}'.")
    else:  # the noun returned no hypernyms, so see if its parsed base returns any
        if parsed_base:  # check if the parser returned a base in the first place
            parsed_translation = deepl_translator.translate_text(
                parsed_base, source_lang="DE", target_lang="EN-US"
            )
            translated_base = parsed_translation.text.casefold()
            base_hypernyms = taxonomy(
                translated_base
            )  # generate all possible hypernyms
            if base_hypernyms:
                base_semantic_general, base_semantic_granular = neut_rule1(
                    base_hypernyms
                )
                if base_semantic_granular:
                    print(f"Couldn't find any semantic categories for '{lemmatized}'.")
                    print(
                        f"The base noun '{parsed_base}' belongs to the following predominantly neuter semantic categories: {', '.join(base_semantic_granular)}"
                    )
                    print(
                        f"The above classification can be expressed in terms of the following general semantic categories: {', '.join(base_semantic_general)}"
                    )
                    neut_flag = True
                else:
                    print(
                        f"Couldn't find any semantic categories for '{lemmatized}'. There don't seem to be any predominantly neuter semantic categories to which the base noun '{parsed_base}' belongs."
                    )
            else:
                print(f"Couldn't generate any semantic categories for '{parsed_base}'.")
        else:
            print(f"Couldn't parse '{lemmatized}'.")
    # check the morphology
    morphological = neut_rule2(lemmatized)
    if parsed_base:
        print(f"'{lemmatized}' has the following neuter base noun: '{parsed_base}'.")
    if morphological:
        print(f"The noun has the following neuter affixes: {', '.join(morphological)}")
        neut_flag = True
    else:
        print(
            "Grammatical gender assignment cannot be determined based on the noun's affixes alone."
        )
    # check if the word is borrowed; foreign borrowings are predominantly neuter
    if detect(lemmatized) != "de":
        print(
            f"The word '{lemmatized}' could be a borrowing from another language. Imported words tend to be neuter."
        )
        neut_flag = True
    # print this if none of the above applies
    if not neut_flag:
        print(
            f"The grammatical gender of '{lemmatized}' cannot be explained with the available rules."
        )
        print("For better or worse, it has to be memorized.")