-
-
Notifications
You must be signed in to change notification settings - Fork 230
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for externally sorting terms
- Loading branch information
1 parent
d63b996
commit 9088323
Showing
3 changed files
with
312 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,4 +20,3 @@ assets/css/to_delete.css | |
.DS_Store | ||
_config.yml | ||
_data/ | ||
count.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,308 @@ | ||
import pprint | ||
import yaml | ||
|
||
from collections import OrderedDict | ||
from pathlib import Path | ||
|
||
import icu | ||
|
||
# Two-letter ISO 639-1 language codes paired with their English names.
# Each code appears exactly once and is lower-case, so it can be used
# directly as a dictionary key and as a glossary language key.
# Georgian is listed only under 'ka'; Micmac is omitted because it has no
# two-letter ISO 639-1 code (a 'Mi' entry would only shadow Maori's 'mi').
languages = [
    ('aa', 'Afar'),
    ('ab', 'Abkhazian'),
    ('af', 'Afrikaans'),
    ('ak', 'Akan'),
    ('sq', 'Albanian'),
    ('am', 'Amharic'),
    ('ar', 'Arabic'),
    ('an', 'Aragonese'),
    ('hy', 'Armenian'),
    ('as', 'Assamese'),
    ('av', 'Avaric'),
    ('ae', 'Avestan'),
    ('ay', 'Aymara'),
    ('az', 'Azerbaijani'),
    ('ba', 'Bashkir'),
    ('bm', 'Bambara'),
    ('eu', 'Basque'),
    ('be', 'Belarusian'),
    ('bn', 'Bengali'),
    ('bh', 'Bihari languages'),
    ('bi', 'Bislama'),
    ('bo', 'Tibetan'),
    ('bs', 'Bosnian'),
    ('br', 'Breton'),
    ('bg', 'Bulgarian'),
    ('my', 'Burmese'),
    ('ca', 'Catalan; Valencian'),
    ('cs', 'Czech'),
    ('ch', 'Chamorro'),
    ('ce', 'Chechen'),
    ('zh', 'Chinese'),
    ('cu', 'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic'),
    ('cv', 'Chuvash'),
    ('kw', 'Cornish'),
    ('co', 'Corsican'),
    ('cr', 'Cree'),
    ('cy', 'Welsh'),
    ('da', 'Danish'),
    ('de', 'German'),
    ('dv', 'Divehi; Dhivehi; Maldivian'),
    ('nl', 'Dutch; Flemish'),
    ('dz', 'Dzongkha'),
    ('el', 'Greek, Modern (1453-)'),
    ('en', 'English'),
    ('eo', 'Esperanto'),
    ('et', 'Estonian'),
    ('ee', 'Ewe'),
    ('fo', 'Faroese'),
    ('fa', 'Persian'),
    ('fj', 'Fijian'),
    ('fi', 'Finnish'),
    ('fr', 'French'),
    ('fy', 'Western Frisian'),
    ('ff', 'Fulah'),
    ('gd', 'Gaelic; Scottish Gaelic'),
    ('ga', 'Irish'),
    ('gl', 'Galician'),
    ('gv', 'Manx'),
    ('gn', 'Guarani'),
    ('gu', 'Gujarati'),
    ('ht', 'Haitian; Haitian Creole'),
    ('ha', 'Hausa'),
    ('he', 'Hebrew'),
    ('hz', 'Herero'),
    ('hi', 'Hindi'),
    ('ho', 'Hiri Motu'),
    ('hr', 'Croatian'),
    ('hu', 'Hungarian'),
    ('ig', 'Igbo'),
    ('is', 'Icelandic'),
    ('io', 'Ido'),
    ('ii', 'Sichuan Yi; Nuosu'),
    ('iu', 'Inuktitut'),
    ('ie', 'Interlingue; Occidental'),
    ('ia', 'Interlingua (International Auxiliary Language Association)'),
    ('id', 'Indonesian'),
    ('ik', 'Inupiaq'),
    ('it', 'Italian'),
    ('jv', 'Javanese'),
    ('ja', 'Japanese'),
    ('kl', 'Kalaallisut; Greenlandic'),
    ('kn', 'Kannada'),
    ('ks', 'Kashmiri'),
    ('ka', 'Georgian'),
    ('kr', 'Kanuri'),
    ('kk', 'Kazakh'),
    ('km', 'Central Khmer'),
    ('ki', 'Kikuyu; Gikuyu'),
    ('rw', 'Kinyarwanda'),
    ('ky', 'Kirghiz; Kyrgyz'),
    ('kv', 'Komi'),
    ('kg', 'Kongo'),
    ('ko', 'Korean'),
    ('kj', 'Kuanyama; Kwanyama'),
    ('ku', 'Kurdish'),
    ('lo', 'Lao'),
    ('la', 'Latin'),
    ('lv', 'Latvian'),
    ('li', 'Limburgan; Limburger; Limburgish'),
    ('ln', 'Lingala'),
    ('lt', 'Lithuanian'),
    ('lb', 'Luxembourgish; Letzeburgesch'),
    ('lu', 'Luba-Katanga'),
    ('lg', 'Ganda'),
    ('mk', 'Macedonian'),
    ('mh', 'Marshallese'),
    ('ml', 'Malayalam'),
    ('mi', 'Maori'),
    ('mr', 'Marathi'),
    ('ms', 'Malay'),
    ('mg', 'Malagasy'),
    ('mt', 'Maltese'),
    ('mn', 'Mongolian'),
    ('na', 'Nauru'),
    ('nv', 'Navajo; Navaho'),
    ('nr', 'Ndebele, South; South Ndebele'),
    ('nd', 'Ndebele, North; North Ndebele'),
    ('ng', 'Ndonga'),
    ('ne', 'Nepali'),
    ('nn', 'Norwegian Nynorsk; Nynorsk, Norwegian'),
    ('nb', 'Bokmål, Norwegian; Norwegian Bokmål'),
    ('no', 'Norwegian'),
    ('oc', 'Occitan (post 1500)'),
    ('oj', 'Ojibwa'),
    ('or', 'Oriya'),
    ('om', 'Oromo'),
    ('os', 'Ossetian; Ossetic'),
    ('pa', 'Panjabi; Punjabi'),
    ('pi', 'Pali'),
    ('pl', 'Polish'),
    ('pt', 'Portuguese'),
    ('ps', 'Pushto; Pashto'),
    ('qu', 'Quechua'),
    ('rm', 'Romansh'),
    ('ro', 'Romanian; Moldavian; Moldovan'),
    ('rn', 'Rundi'),
    ('ru', 'Russian'),
    ('sg', 'Sango'),
    ('sa', 'Sanskrit'),
    ('si', 'Sinhala; Sinhalese'),
    ('sk', 'Slovak'),
    ('sl', 'Slovenian'),
    ('se', 'Northern Sami'),
    ('sm', 'Samoan'),
    ('sn', 'Shona'),
    ('sd', 'Sindhi'),
    ('so', 'Somali'),
    ('st', 'Sotho, Southern'),
    ('es', 'Spanish; Castilian'),
    ('sc', 'Sardinian'),
    ('sr', 'Serbian'),
    ('ss', 'Swati'),
    ('su', 'Sundanese'),
    ('sw', 'Swahili'),
    ('sv', 'Swedish'),
    ('ty', 'Tahitian'),
    ('ta', 'Tamil'),
    ('tt', 'Tatar'),
    ('te', 'Telugu'),
    ('tg', 'Tajik'),
    ('tl', 'Tagalog'),
    ('th', 'Thai'),
    ('ti', 'Tigrinya'),
    ('to', 'Tonga (Tonga Islands)'),
    ('tn', 'Tswana'),
    ('ts', 'Tsonga'),
    ('tk', 'Turkmen'),
    ('tr', 'Turkish'),
    ('tw', 'Twi'),
    ('ug', 'Uighur; Uyghur'),
    ('uk', 'Ukrainian'),
    ('ur', 'Urdu'),
    ('uz', 'Uzbek'),
    ('ve', 'Venda'),
    ('vi', 'Vietnamese'),
    ('vo', 'Volapük'),
    ('wa', 'Walloon'),
    ('wo', 'Wolof'),
    ('xh', 'Xhosa'),
    ('yi', 'Yiddish'),
    ('yo', 'Yoruba'),
    ('za', 'Zhuang; Chuang'),
    ('zu', 'Zulu'),
]
|
||
def _sort_terms(count_dict):
    """Sort every language's terms with a locale-aware ICU collator.

    For each language code in ``count_dict``:

    * when its ``"count"`` is positive, the directory ``data_path/<lang>`` is
      created (``data_path`` is the module-level output directory);
    * its ``"terms"`` are sorted by the sort key of an ICU collator built for
      that language code, and the result is stored under ``"sorted_terms"``.

    Args:
        count_dict: mapping of language code to a dict that holds at least
            the keys ``"count"`` and ``"terms"``.

    Returns:
        The same ``count_dict`` object, updated in place.
    """
    for lang, info in count_dict.items():
        # only create directories for languages with terms
        if info["count"] > 0:
            data_path.joinpath(lang).mkdir(parents=True, exist_ok=True)

        terms = info["terms"]
        if not terms:
            # Nothing to collate: skip building a locale and collator, which
            # would otherwise happen for every language without terms.
            info["sorted_terms"] = []
            continue

        # create a locale from the language code and sort the terms with a collator
        collator = icu.Collator.createInstance(icu.Locale(lang))
        info["sorted_terms"] = sorted(terms, key=collator.getSortKey)
    return count_dict
|
||
def _setup_dict(glossary):
    """Collect the glossary's terms per language and return them sorted.

    One record is created for every code in the module-level ``languages``
    list, holding ``"count"``, ``"name"``, ``"terms"``, ``"sorted_terms"``
    and ``"term_entry_map"``.  Each glossary entry is then scanned: for every
    key that is a known language code, the term is counted, appended to
    ``"terms"`` and mapped to its slug and definition in
    ``"term_entry_map"``.

    NOTE(review): ``"term_entry_map"`` is keyed by the term text, so two
    entries sharing a term in the same language overwrite each other there.

    Args:
        glossary: iterable of entry dicts; each has a ``"slug"`` key plus
            language-code keys whose values contain ``"term"`` and ``"def"``.

    Returns:
        The per-language dict after it has been passed through
        ``_sort_terms``.
    """
    per_language = {
        code: {
            "count": 0,
            "name": name,
            "terms": [],
            "sorted_terms": [],
            "term_entry_map": {},
        }
        for code, name in languages
    }

    for entry in glossary:
        for key in entry:
            # keys such as "slug" are not language codes and are skipped
            if key not in per_language:
                continue
            translation = entry[key]
            term = translation["term"]
            record = per_language[key]
            record["count"] += 1
            record["terms"].append(term)
            record["term_entry_map"][term] = {
                "slug": entry["slug"],
                "def": translation["def"],
            }
    return _sort_terms(per_language)
|
||
def _build_lang_glossary(count_dict):
    """Rebuild one ordered glossary per language from the sorted terms.

    Args:
        count_dict: mapping of language code to a dict with the keys
            ``"sorted_terms"`` (terms in output order) and
            ``"term_entry_map"`` (term -> ``{"slug": ..., "def": ...}``).

    Returns:
        A dict mapping each language code that has at least one entry to a
        list of ``OrderedDict`` items shaped like
        ``{"slug": <slug>, <lang>: {"term": <term>, "def": <definition>}}``,
        in the order given by ``"sorted_terms"``.  Sorted terms missing from
        ``"term_entry_map"`` are skipped; languages with no entries are left
        out.
    """
    glossary_by_lang = {}
    for lang, info in count_dict.items():
        entry_map = info["term_entry_map"]
        entries = [
            OrderedDict({
                "slug": entry_map[term]["slug"],
                lang: {
                    "term": term,
                    "def": entry_map[term]["def"],
                },
            })
            for term in info["sorted_terms"]
            if term in entry_map
        ]
        if entries:
            glossary_by_lang[lang] = entries
    return glossary_by_lang
|
||
def setup_yaml():
    """Register a PyYAML representer that dumps ``OrderedDict`` as a plain map.

    The representer emits the standard ``tag:yaml.org,2002:map`` tag.
    Recipe taken from https://stackoverflow.com/a/8661021
    """
    def _ordered_dict_representer(dumper, mapping):
        # hand over the item pairs rather than the dict itself, as in the
        # linked recipe
        return dumper.represent_mapping('tag:yaml.org,2002:map', mapping.items())

    yaml.add_representer(OrderedDict, _ordered_dict_representer)
|
||
# load main glossary file (read explicitly as UTF-8 so the result does not
# depend on the platform's default encoding)
glos = yaml.safe_load(Path('glossary.yml').read_text(encoding='utf-8'))
data_path = Path("_data/")

# sort terms
count_dict = _setup_dict(glos)

# rebuild glossary per language
sorted_glossary_by_lang = _build_lang_glossary(count_dict)

# setup yaml for outputting
setup_yaml()
for lang, lang_glossary in sorted_glossary_by_lang.items():
    pprint.pprint(lang_glossary)
    # write to <data_path>/<lang>/glossary.yml; the context manager makes
    # sure every output file is flushed and closed
    with (data_path / lang / 'glossary.yml').open('w', encoding='utf-8') as out_file:
        yaml.dump(lang_glossary, out_file)

# output counts
# pprint.pprint(count_dict)