-
Notifications
You must be signed in to change notification settings - Fork 0
/
der.py
145 lines (131 loc) · 6.36 KB
/
der.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
"""A module applying semantic, morphological, and phonological criteria to explain the masculine gender assignment"""
import deepl
import syllables
from hypernyms import taxonomy
from rules import masc_category_dict, masc_classes, masc_prefixes, masc_suffixes
# DeepL credentials. The value below is only a placeholder.
license_key = "#" # replace with your own DeepL licence key
# Module-level DeepL client, created once at import time. masc_evaluate() uses
# it to translate a German base noun into English before the taxonomy lookup.
deepl_translator = deepl.Translator(license_key)
def masc_rule1(hypernyms: list) -> tuple:
    """Match a noun's hypernyms against the masculine semantic classes.

    Args:
        hypernyms: Hypernyms generated by the input noun across all of its
            synsets, or None (taxonomy() occasionally yields None).

    Returns:
        A ``(general_categories, granular_categories)`` pair of sets.
        ``granular_categories`` holds the hypernyms that are listed in
        ``masc_classes``. ``general_categories`` expresses them in simpler
        terms: it holds every key of ``masc_category_dict`` whose value
        contains at least one granular category. Both sets are empty when
        ``hypernyms`` is None or when nothing matches.
    """
    # Always hand back a pair: callers unpack two values, so a bare set()
    # for the None case would raise ValueError on unpacking.
    if hypernyms is None:
        return set(), set()

    granular_categories = {
        category for category in hypernyms if category in masc_classes
    }

    general_categories = set()
    for category in granular_categories:
        for key, value in masc_category_dict.items():
            if category in value:
                general_categories.add(key)

    return general_categories, granular_categories
def masc_rule2(token: str) -> list:
    """Collect the masculine-class affixes found on the input noun.

    Args:
        token: The noun to inspect.

    Returns:
        A list of at most two strings. The first is the longest entry of
        ``masc_suffixes`` that ``token`` ends with, written with a leading
        hyphen. The second is the entry of ``masc_prefixes`` that ``token``
        starts with (the last matching entry wins). The list is empty when
        no affix matches, so the result is always safe to pass to
        ``', '.join(...)``.
    """
    # Suffixes may be nested, so only the longest match is reported.
    matching_suffixes = [
        suffix for suffix in masc_suffixes if token.endswith(suffix)
    ]
    # Prefixes are exclusive; the last matching entry wins.
    matching_prefixes = [
        prefix for prefix in masc_prefixes if token.startswith(prefix)
    ]

    affixes = []
    if matching_suffixes:
        affixes.append("-" + max(matching_suffixes, key=len))
    prefix = matching_prefixes[-1] if matching_prefixes else ""
    if prefix:
        # Kept as a separate entry instead of being glued onto the suffix.
        affixes.append(prefix)
    return affixes
def masc_syllables(token: str) -> bool:
    """Report whether a word is estimated to be monosyllabic.

    Monosyllabic words in DE are overwhelmingly masculine. The 'syllables'
    package used here was written for EN lexemes, but due to the phonological
    similarity between EN and DE it works okay in detecting monosyllabic DE
    words.

    Args:
        token: The word whose syllable count is estimated.

    Returns:
        True when ``syllables.estimate(token)`` equals 1, False otherwise.
    """
    # Return the comparison itself so the result is always a bool, never an
    # implicit None.
    return syllables.estimate(token) == 1
# Transliterations applied before syllable estimation: the EN syllable
# estimator doesn't recognize DE chars. Capital umlauts are included because
# the noun may be passed in capitalized.
_DE_TRANSLITERATION = str.maketrans(
    {
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "Ä": "Ae",
        "Ö": "Oe",
        "Ü": "Ue",
        "ß": "ss",
    }
)


def _masc_semantic_explanation(
    lemmatized: str, hypernyms: list, parsed_base: str
) -> bool:
    """Print the semantic part of the explanation.

    Returns True when a predominantly masculine semantic category was found,
    either for the noun itself or for its base noun, and False otherwise.
    """
    if hypernyms:
        semantic_general, semantic_granular = masc_rule1(hypernyms)
        if not semantic_granular:
            print(
                "Grammatical gender assignment could not be determined based on the semantic category alone."
            )
            return False
        # The categories are sets; sort them so the output is stable.
        print(
            f"It belongs to the following predominantly masculine semantic categories: {', '.join(sorted(semantic_granular))}"
        )
        print(
            f"The above classification can be expressed in terms of the following general semantic categories: {', '.join(sorted(semantic_general))}"
        )
        return True

    # This message used to sit behind an unreachable `elif`; report it here
    # and still try the base noun below.
    if hypernyms is None:
        print(f"Couldn't generate a semantic taxonomy for '{lemmatized}'.")

    # The word returned no hypernyms: see if its base noun returns them.
    if not parsed_base:  # the parser did not return a base in the first place
        print(f"Couldn't parse '{lemmatized}'.")
        return False

    parsed_translation = deepl_translator.translate_text(
        parsed_base, source_lang="DE", target_lang="EN-US"
    )
    translated_base = parsed_translation.text.casefold()
    base_hypernyms = taxonomy(translated_base)  # generate all possible hypernyms
    if not base_hypernyms:
        print(f"Couldn't generate any semantic categories for '{parsed_base}'.")
        return False

    base_semantic_general, base_semantic_granular = masc_rule1(base_hypernyms)
    if not base_semantic_granular:
        print(
            f"Couldn't find any semantic categories for '{lemmatized}'. There don't seem to be any predomiantly masculine semantic categories to which the base noun '{parsed_base}' blelongs."
        )
        return False

    print(f"Couldn't find any semantic categories for '{lemmatized}'.")
    print(
        f"The base noun '{parsed_base}' belongs to the following predominantly masculine semantic categories: {', '.join(sorted(base_semantic_granular))}"
    )
    print(
        f"The above classification can be expressed in terms of the following general semantic categories: {', '.join(sorted(base_semantic_general))}"
    )
    return True


def masc_evaluate(lemmatized: str, hypernyms: list, parsed_base: str) -> None:
    """Print an explanation of why a noun is masculine.

    The function takes the noun and the hypernyms generated over all of its
    synsets and reports whether any of them are affiliated with the masculine
    noun class. When the noun has no hypernyms, its base noun is translated
    with DeepL and checked instead. It then performs simple morphological
    analysis by checking the noun for masculine prefixes and suffixes, and
    finally checks whether the noun is estimated to be monosyllabic.

    Args:
        lemmatized: The noun being explained.
        hypernyms: Hypernyms of the noun; may be empty or None.
        parsed_base: The base noun of ``lemmatized``; may be empty or None.

    Returns:
        None. All results are written to standard output.
    """
    print(f"The noun '{lemmatized}' is masculine.")

    # check the semantic taxonomy
    masc_flag = _masc_semantic_explanation(lemmatized, hypernyms, parsed_base)

    # check the morphology
    morphological = masc_rule2(lemmatized)
    if isinstance(morphological, str):
        # A bare string would be split into single characters by join().
        morphological = [morphological] if morphological else []
    if parsed_base:
        print(f"'{lemmatized}' has the following masculine base noun: '{parsed_base}'.")
    if morphological:
        print(
            f"The noun has the following masculine affixes: {', '.join(morphological)}"
        )
        masc_flag = True
    else:
        print(
            "Grammatical gender assignment cannot be determined based on the noun's affixes alone."
        )

    # check if monosyllabic, after replacing the umlauts and the 'ß' with
    # their alternative transcriptions
    no_umlauts = lemmatized.translate(_DE_TRANSLITERATION)
    if masc_syllables(no_umlauts):
        print(
            f"Monosyllabic nouns are overwhelmingly masculine. '{lemmatized}' might be monosyllabic."
        )
        masc_flag = True

    # print this if none of the above applies
    if not masc_flag:
        print(
            f"The grammatical gender of '{lemmatized}' cannot be explained with the available rules."
        )
        print("For better or worse, it has to be memorized")