der.py

#!/usr/bin/env python
"""A module applying semantic, morphological, and phonological criteria to explain the masculine gender assignment"""

import deepl
import syllables

from hypernyms import taxonomy
from rules import masc_category_dict, masc_classes, masc_prefixes, masc_suffixes

# DeepL credentials. "#" is only a placeholder value.
license_key = "#"  # replace with your own DeepL licence key
# Translator client created once at import time; masc_evaluate() uses it to
# translate a German (DE) base noun into US English (EN-US).
deepl_translator = deepl.Translator(license_key)


def masc_rule1(hypernyms: list) -> tuple:
    """Match a noun's hypernyms against the masculine semantic classes.

    Args:
        hypernyms: Hypernyms generated by the input noun across all of its
            synsets. May be ``None`` (taxonomy() occasionally produces it)
            or empty.

    Returns:
        A ``(general_categories, granular_categories)`` pair of sets.
        ``granular_categories`` holds the hypernyms that are listed in
        ``masc_classes``; ``general_categories`` holds the keys of
        ``masc_category_dict`` whose values contain at least one of those
        granular categories. Both sets are empty when ``hypernyms`` is
        ``None`` or empty.
    """
    if not hypernyms:
        # Keep the two-set shape so callers can always unpack the result.
        return set(), set()

    granular_categories = {
        category for category in hypernyms if category in masc_classes
    }
    # Express the granular classes in terms of the broader categories that
    # contain them, to simplify the output.
    general_categories = {
        key
        for category in granular_categories
        for key, value in masc_category_dict.items()
        if category in value
    }
    return general_categories, granular_categories


def masc_rule2(token: str) -> list:
    """Collect the masculine prefix and suffix found on the input noun.

    Args:
        token: The noun to inspect.

    Returns:
        A list of the matched affixes as separate items: the matching prefix
        from ``masc_prefixes`` (if any), followed by the longest matching
        suffix from ``masc_suffixes`` written with a leading hyphen (if any).
        The list is empty when nothing matches, so the result is always safe
        to pass to ``', '.join(...)``.
    """
    # Suffixes may be nested, so several can match; only the longest is kept.
    matching_suffixes = [
        suffix for suffix in masc_suffixes if token.endswith(suffix)
    ]

    # Prefixes are exclusive: the last match in masc_prefixes wins.
    matching_prefix = ""
    for prefix in masc_prefixes:
        if token.startswith(prefix):
            matching_prefix = prefix

    affixes = []
    if matching_prefix:
        affixes.append(matching_prefix)
    if matching_suffixes:
        affixes.append("-" + max(matching_suffixes, key=len))
    return affixes


def masc_syllables(token: str) -> bool:
    """Report whether the input word is estimated to be monosyllabic.

    Monosyllabic words in DE are overwhelmingly masculine. The 'syllables'
    package used here was written for EN lexemes, but due to the phonological
    similarity between EN and DE it works okay in detecting monosyllabic DE
    words.

    Args:
        token: The word whose syllable count is estimated.

    Returns:
        True if ``syllables.estimate(token)`` equals 1, otherwise False.
    """
    return syllables.estimate(token) == 1


def masc_evaluate(lemmatized: str, hypernyms: list, parsed_base: str) -> None:
    """Print an explanation of the noun's masculine gender assignment.

    The function takes the noun and a set of hypernyms generated over all
    noun synsets and determines if any of the hypernyms are affiliated with
    the masculine noun class. If the noun has no hypernyms, its base noun is
    translated from DE to EN-US with DeepL and the translation's hypernyms
    are checked instead. It then performs simple morphological analysis by
    checking if the noun contains the prefixes and suffixes associated with
    the masculine gender, and finally checks whether the noun is estimated
    to be monosyllabic.

    Args:
        lemmatized: The noun being explained.
        hypernyms: Hypernyms generated for the noun; may be ``None`` or
            empty, in which case the base noun is used as a fallback.
        parsed_base: The base noun of ``lemmatized``; falsy when no base
            was parsed.

    Returns:
        None. All results are printed.
    """
    masc_flag = False
    print(f"The noun '{lemmatized}' is masculine.")

    # check the semantic taxonomy
    if hypernyms:
        semantic_general, semantic_granular = masc_rule1(hypernyms)
        if semantic_granular:
            print(
                f"It belongs to the following predominantly masculine semantic categories: {', '.join(semantic_granular)}"
            )
            print(
                f"The above classification can be expressed in terms of the following general semantic categories: {', '.join(semantic_general)}"
            )
            masc_flag = True
        else:
            print(
                "Grammatical gender assignment could not be determined based on the semantic category alone."
            )
    elif parsed_base:
        # The word returned no hypernyms (None or empty), so see if its base
        # noun returns them.
        parsed_translation = deepl_translator.translate_text(
            parsed_base, source_lang="DE", target_lang="EN-US"
        )
        translated_base = parsed_translation.text.casefold()
        base_hypernyms = taxonomy(translated_base)  # generate all possible hypernyms
        if base_hypernyms:
            base_semantic_general, base_semantic_granular = masc_rule1(
                base_hypernyms
            )
            if base_semantic_granular:
                print(f"Couldn't find any semantic categories for '{lemmatized}'.")
                print(
                    f"The base noun '{parsed_base}' belongs to the following predominantly masculine semantic categories: {', '.join(base_semantic_granular)}"
                )
                print(
                    f"The above classification can be expressed in terms of the following general semantic categories: {', '.join(base_semantic_general)}"
                )
                masc_flag = True
            else:
                print(
                    f"Couldn't find any semantic categories for '{lemmatized}'. There don't seem to be any predominantly masculine semantic categories to which the base noun '{parsed_base}' belongs."
                )
        else:
            print(f"Couldn't generate any semantic categories for '{parsed_base}'.")
    else:
        # No hypernyms and no base noun to fall back on.
        print(f"Couldn't parse '{lemmatized}'.")

    # check the morphology
    morphological = masc_rule2(lemmatized)
    if isinstance(morphological, str):
        # A bare string would be joined character by character below, so
        # wrap it into a (possibly empty) list first.
        morphological = [morphological] if morphological else []
    if parsed_base:
        print(f"'{lemmatized}' has the following masculine base noun: '{parsed_base}'.")
    if morphological:
        print(
            f"The noun has the following masculine affixes: {', '.join(morphological)}"
        )
        masc_flag = True
    else:
        print(
            "Grammatical gender assignment cannot be determined based on the noun's affixes alone."
        )

    # check if monosyllabic
    # but first replace the umlauts and the 'ß' with their alternative
    # transcriptions. The EN syllable estimator doesn't recognize DE chars.
    no_umlauts = (
        lemmatized.replace("ä", "ae")
        .replace("ö", "oe")
        .replace("ü", "ue")
        .replace("ß", "ss")
    )
    if masc_syllables(no_umlauts):
        print(
            f"Monosyllabic nouns are overwhelmingly masculine. '{lemmatized}' might be monosyllabic."
        )
        masc_flag = True

    # print this if none of the above applies
    if not masc_flag:
        print(
            f"The grammatical gender of '{lemmatized}' cannot be explained with the available rules."
        )
        print("For better or worse, it has to be memorized")