Skip to content

Commit

Permalink
Merge pull request #44 from oscar-defelice/master
Browse files Browse the repository at this point in the history
Added the French language support
  • Loading branch information
filyp authored Dec 4, 2021
2 parents d762cfc + e503123 commit 2daa44d
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ MANIFEST
.vscode
autocorrect/data/*
.coverage
.DS_Store
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ bzip2 -d ruwiki-latest-pages-articles.xml.bz2

After that:

First, edit the dictionaries in `autocorrect/constants.py` and add an entry for your language to each one that needs it (for example, its word regex and its alphabet).

Then:

```python
>>> from autocorrect.word_count import count_words
>>> count_words('ruwiki-latest-pages-articles.xml', 'ru')
Expand Down
7 changes: 6 additions & 1 deletion autocorrect/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"cs": r"[AÁBCČDĎEÉĚFGH(Ch)IÍJKLMNŇOÓPQRŘSŠTŤUÚŮVWXYÝZŽaábcčdďeéěfgh(ch)iíjklmnňoópqrřsštťuúůvwxyýzž]+",
"el": r"[α-ωΑ-ΩίϊΐόάέύϋΰήώΊΪΪ́ΌΆΈΎΫΫ́ΉΏ]+",
"it": r"[a-zA-ZãáàâçéêíõóôúüÃÁÀÂÇÉÊÍÕÓÔÚÜ]+",
"fr": r"[a-zA-ZãáàâçéêíõóôúüÃÁÀÂÇÉÊÍÕÓÔÚÜ]+",
"vi": r"[a-zA-ZàáạảãÀÁẠẢÃằắặẳẵẰẮẶẲẴầấậẩẫẦẤẬẨẪèéẹẻẽÈÉẸẺẼềếệểễỀẾỆỂỄìíịỉĩÌÍỊỈĨòóọỏõÒÓỌỎÕồốộổỗỒỐỘỔỖờớợởỡỜỚỢỞỠùúụủũÙÚỤỦŨừứựửữỪỨỰỬỮỳýỵỷỹỲÝỴỶỸ]+",
}

Expand All @@ -23,13 +24,15 @@
"cs": "aábcčdďeéěfgh(ch)iíjklmnňoópqrřsštťuúůvwxyýzž",
"el": "αβγδεζηθικλμνξοπρςτυφχψωίϊΐόάέύϋΰήώ",
"it": "abcdefghijklmnopqrstuvwxzyãáàâçéêíõóôúü",
"fr": "abcdefghijklmnopqrstuvwxzyãáàâçéêíõóôúü",
"vi": "aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz",
}

ipfs_gateways = [
"http://ipfs.io/ipfs/",
"https://gateway.pinata.cloud/ipfs/",
"https://cf-ipfs.com/ipfs/", # this one has the best performance, but doesn't return download progress
# this one has the best performance, but doesn't return download progress
"https://cf-ipfs.com/ipfs/",
]

ipfs_paths = {
Expand All @@ -43,6 +46,7 @@
"pt": ["QmbRSZvfJV6zN12zzWhecphcvE9ZBeQdAJGQ9c9ttJXzcg/pt.tar.gz"],
"el": ["QmbRSZvfJV6zN12zzWhecphcvE9ZBeQdAJGQ9c9ttJXzcg/el.tar.gz"],
"it": ["QmbRSZvfJV6zN12zzWhecphcvE9ZBeQdAJGQ9c9ttJXzcg/it.tar.gz"],
"fr": ["QmPRNDmUDTXikq8gWnGcw3ZGmnoBfvekmAyeyX8y6onf23/fr.tar.gz"],
"vi": ["QmRRJj5i7nkpzTRSKhFe23XMjLRw7f2zD6FLKDrRfzco7f/vi.tar.gz"],
}

Expand Down Expand Up @@ -82,4 +86,5 @@
"it": [
"https://dl.dropboxusercontent.com/s/6xci1wfb387zk23/it.tar.gz?dl=0",
],
"fr": ["https://mega.nz/file/kQByQJAb#rMbmF0HG09MLQQ-FDafHrPAgXigJIpmC1zhtxRMp2dQ"],
}
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"Natural Language :: Portuguese",
"Natural Language :: Greek",
"Natural Language :: Italian",
"Natural Language :: French",
"Natural Language :: Vietnamese",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
Expand Down
52 changes: 52 additions & 0 deletions test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,19 @@
"salto": "saulto",
}

# French fixture consumed by test_french(): it is passed to spelltest() together
# with Speller("fr"), and the test requires the result to be 0.
# NOTE(review): entries appear to be {expected word: misspelling}, mirroring the
# other *_words_all_correct fixtures — confirm against spelltest(), which is not
# visible from here.
# NOTE(review): "leger" is normally spelled "léger" in French; confirm the
# unaccented key is what the French word list actually contains.
french_words_all_correct = {
"ordre": "oldre",
"leger": "lezger",
"cahier": "cnhier",
"saut": "slaut",
"vache": "vacne",
"fromage": "fromae",
"bisous": "biosus",
"possible": "possable",
"position": "posizion",
"populaire": "popularie",
}

single_typos_me = {
"ae",
"ame",
Expand Down Expand Up @@ -1009,6 +1022,38 @@
"cavallo": "cavatlo",
"poltrona": "poltrola",
},
"fr": {
"disparu": "disparue",
"atteint": "atteind",
"croient": "croyent",
"cicogne": "cygogne",
"electronique": "électronique",
"bien": "bein",
"connexion": "connection",
"galerie": "gallerie",
"meilleur": "meiileur",
"obnubiler": "obnibuler",
"oculaire": "ocualire",
"télescope": "teiescope",
"valeur": "vaelur",
"vertu": "veltu",
"salade": "saiade",
"renommer": "renomner",
"sibyllin": "sibuliin",
"successeur": "succaszeur",
"écologie": "ecoiogie",
"éloge": "elogie",
"emménager": "ennenager",
"cheval": "clreval",
"permis": "pennis",
"recueillir": "recuelilir",
"martel": "manel",
"veux": "vuex",
"emploi": "emnloi",
"pôle": "pole",
"qui": "uui",
"experience": "escpehience",
},
"vi": {
"hiếu": "hiéu",
"hiền": "hién",
Expand Down Expand Up @@ -1141,6 +1186,11 @@ def test_italian():
assert spelltest(spell_it, italian_words_all_correct) == 0


def test_french():
    """Run spelltest on the French fixture and require a result of 0."""
    french_speller = Speller("fr")
    outcome = spelltest(french_speller, french_words_all_correct)
    assert outcome == 0


if __name__ == "__main__":
command = sys.argv[1]

Expand All @@ -1161,6 +1211,8 @@ def test_italian():
benchmark("spanish words", spell, optional_language_tests["es"])
spell = Speller("it")
benchmark("italian words", spell, optional_language_tests["it"])
spell = Speller("fr")
benchmark("french words", spell, optional_language_tests["fr"])
elif command == "find_threshold":
lang = sys.argv[2]
test = optional_language_tests[lang]
Expand Down

0 comments on commit 2daa44d

Please sign in to comment.