Skip to content

Commit

Permalink
Add support for externally sorting terms
Browse files Browse the repository at this point in the history
  • Loading branch information
froggleston committed Sep 17, 2024
1 parent d63b996 commit 9088323
Show file tree
Hide file tree
Showing 3 changed files with 312 additions and 3 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,3 @@ assets/css/to_delete.css
.DS_Store
_config.yml
_data/
count.py
6 changes: 4 additions & 2 deletions _includes/glossary.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{% assign gloss = site.data.glossary %}
{% assign language = page.permalink | replace: '/', '' %}

{% assign gloss = site.data.[language].glossary %}

{% assign direction = 'ltr' %}
{% if page.direction %}
{% assign direction = page.direction %}
Expand All @@ -13,7 +15,7 @@
- 'actual' is a list of slugs sorted by terms.
{%- endcomment -%}
{%- capture defined -%}{%- for item in gloss -%}{%- if item[language] -%}{{item[language].term | downcase}}IN_ITEM{{item.slug}}BETWEEN_ITEMS{%- endif -%}{%- endfor -%}{%- endcapture -%}
{%- assign sorted = defined | split: 'BETWEEN_ITEMS' | sort -%}
{%- assign sorted = defined | split: 'BETWEEN_ITEMS' -%}
{%- capture ordered -%}{%- for item in sorted -%}{{item | split: 'IN_ITEM' | last}}BETWEEN_ITEMS{%- endfor -%}{%- endcapture -%}
{%- assign actual = ordered | split: 'BETWEEN_ITEMS' -%}

Expand Down
308 changes: 308 additions & 0 deletions utils/sort-glossary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
import pprint
import yaml

from collections import OrderedDict
from pathlib import Path

import icu

# Two-letter language codes paired with their English names.  Every code
# listed here gets a bucket when the glossary is grouped per language; any
# glossary key not listed here is ignored.
#
# Each code appears exactly once.  The entries keep the order of their first
# appearance so that the dictionaries derived from this table iterate in a
# stable order.
languages = [
    ('aa', 'Afar'),
    ('ab', 'Abkhazian'),
    ('af', 'Afrikaans'),
    ('ak', 'Akan'),
    ('sq', 'Albanian'),
    ('am', 'Amharic'),
    ('ar', 'Arabic'),
    ('an', 'Aragonese'),
    ('hy', 'Armenian'),
    ('as', 'Assamese'),
    ('av', 'Avaric'),
    ('ae', 'Avestan'),
    ('ay', 'Aymara'),
    ('az', 'Azerbaijani'),
    ('ba', 'Bashkir'),
    ('bm', 'Bambara'),
    ('eu', 'Basque'),
    ('be', 'Belarusian'),
    ('bn', 'Bengali'),
    ('bh', 'Bihari languages'),
    ('bi', 'Bislama'),
    ('bo', 'Tibetan'),
    ('bs', 'Bosnian'),
    ('br', 'Breton'),
    ('bg', 'Bulgarian'),
    ('my', 'Burmese'),
    ('ca', 'Catalan; Valencian'),
    ('cs', 'Czech'),
    ('ch', 'Chamorro'),
    ('ce', 'Chechen'),
    ('zh', 'Chinese'),
    ('cu', 'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic'),
    ('cv', 'Chuvash'),
    ('kw', 'Cornish'),
    ('co', 'Corsican'),
    ('cr', 'Cree'),
    ('cy', 'Welsh'),
    ('da', 'Danish'),
    ('de', 'German'),
    ('dv', 'Divehi; Dhivehi; Maldivian'),
    ('nl', 'Dutch; Flemish'),
    ('dz', 'Dzongkha'),
    ('el', 'Greek, Modern (1453-)'),
    ('en', 'English'),
    ('eo', 'Esperanto'),
    ('et', 'Estonian'),
    ('ee', 'Ewe'),
    ('fo', 'Faroese'),
    ('fa', 'Persian'),
    ('fj', 'Fijian'),
    ('fi', 'Finnish'),
    ('fr', 'French'),
    ('fy', 'Western Frisian'),
    ('ff', 'Fulah'),
    # NOTE(review): capitalised code; Georgian is already listed as 'ka'
    # below -- confirm this entry is a typo and can be removed.
    ('Ga', 'Georgian'),
    ('gd', 'Gaelic; Scottish Gaelic'),
    ('ga', 'Irish'),
    ('gl', 'Galician'),
    ('gv', 'Manx'),
    ('gn', 'Guarani'),
    ('gu', 'Gujarati'),
    ('ht', 'Haitian; Haitian Creole'),
    ('ha', 'Hausa'),
    ('he', 'Hebrew'),
    ('hz', 'Herero'),
    ('hi', 'Hindi'),
    ('ho', 'Hiri Motu'),
    ('hr', 'Croatian'),
    ('hu', 'Hungarian'),
    ('ig', 'Igbo'),
    ('is', 'Icelandic'),
    ('io', 'Ido'),
    ('ii', 'Sichuan Yi; Nuosu'),
    ('iu', 'Inuktitut'),
    ('ie', 'Interlingue; Occidental'),
    ('ia', 'Interlingua (International Auxiliary Language Association)'),
    ('id', 'Indonesian'),
    ('ik', 'Inupiaq'),
    ('it', 'Italian'),
    ('jv', 'Javanese'),
    ('ja', 'Japanese'),
    ('kl', 'Kalaallisut; Greenlandic'),
    ('kn', 'Kannada'),
    ('ks', 'Kashmiri'),
    ('ka', 'Georgian'),
    ('kr', 'Kanuri'),
    ('kk', 'Kazakh'),
    ('km', 'Central Khmer'),
    ('ki', 'Kikuyu; Gikuyu'),
    ('rw', 'Kinyarwanda'),
    ('ky', 'Kirghiz; Kyrgyz'),
    ('kv', 'Komi'),
    ('kg', 'Kongo'),
    ('ko', 'Korean'),
    ('kj', 'Kuanyama; Kwanyama'),
    ('ku', 'Kurdish'),
    ('lo', 'Lao'),
    ('la', 'Latin'),
    ('lv', 'Latvian'),
    ('li', 'Limburgan; Limburger; Limburgish'),
    ('ln', 'Lingala'),
    ('lt', 'Lithuanian'),
    ('lb', 'Luxembourgish; Letzeburgesch'),
    ('lu', 'Luba-Katanga'),
    ('lg', 'Ganda'),
    ('mk', 'Macedonian'),
    ('mh', 'Marshallese'),
    ('ml', 'Malayalam'),
    ('mi', 'Maori'),
    ('mr', 'Marathi'),
    ('ms', 'Malay'),
    # NOTE(review): capitalised code that differs from 'mi' (Maori) only by
    # case -- confirm whether this entry is intended.
    ('Mi', 'Micmac'),
    ('mg', 'Malagasy'),
    ('mt', 'Maltese'),
    ('mn', 'Mongolian'),
    ('na', 'Nauru'),
    ('nv', 'Navajo; Navaho'),
    ('nr', 'Ndebele, South; South Ndebele'),
    ('nd', 'Ndebele, North; North Ndebele'),
    ('ng', 'Ndonga'),
    ('ne', 'Nepali'),
    ('nn', 'Norwegian Nynorsk; Nynorsk, Norwegian'),
    ('nb', 'Bokmål, Norwegian; Norwegian Bokmål'),
    ('no', 'Norwegian'),
    ('oc', 'Occitan (post 1500)'),
    ('oj', 'Ojibwa'),
    ('or', 'Oriya'),
    ('om', 'Oromo'),
    ('os', 'Ossetian; Ossetic'),
    ('pa', 'Panjabi; Punjabi'),
    ('pi', 'Pali'),
    ('pl', 'Polish'),
    ('pt', 'Portuguese'),
    ('ps', 'Pushto; Pashto'),
    ('qu', 'Quechua'),
    ('rm', 'Romansh'),
    ('ro', 'Romanian; Moldavian; Moldovan'),
    ('rn', 'Rundi'),
    ('ru', 'Russian'),
    ('sg', 'Sango'),
    ('sa', 'Sanskrit'),
    ('si', 'Sinhala; Sinhalese'),
    ('sk', 'Slovak'),
    ('sl', 'Slovenian'),
    ('se', 'Northern Sami'),
    ('sm', 'Samoan'),
    ('sn', 'Shona'),
    ('sd', 'Sindhi'),
    ('so', 'Somali'),
    ('st', 'Sotho, Southern'),
    ('es', 'Spanish; Castilian'),
    ('sc', 'Sardinian'),
    ('sr', 'Serbian'),
    ('ss', 'Swati'),
    ('su', 'Sundanese'),
    ('sw', 'Swahili'),
    ('sv', 'Swedish'),
    ('ty', 'Tahitian'),
    ('ta', 'Tamil'),
    ('tt', 'Tatar'),
    ('te', 'Telugu'),
    ('tg', 'Tajik'),
    ('tl', 'Tagalog'),
    ('th', 'Thai'),
    ('ti', 'Tigrinya'),
    ('to', 'Tonga (Tonga Islands)'),
    ('tn', 'Tswana'),
    ('ts', 'Tsonga'),
    ('tk', 'Turkmen'),
    ('tr', 'Turkish'),
    ('tw', 'Twi'),
    ('ug', 'Uighur; Uyghur'),
    ('uk', 'Ukrainian'),
    ('ur', 'Urdu'),
    ('uz', 'Uzbek'),
    ('ve', 'Venda'),
    ('vi', 'Vietnamese'),
    ('vo', 'Volapük'),
    ('wa', 'Walloon'),
    ('wo', 'Wolof'),
    ('xh', 'Xhosa'),
    ('yi', 'Yiddish'),
    ('yo', 'Yoruba'),
    ('za', 'Zhuang; Chuang'),
    ('zu', 'Zulu'),
]

def _sort_terms(count_dict):
# sort and reassign terms
for lang in count_dict:
# std_lang = standardize_tag(lang)
# print(f"{lang} -> {std_lang} -> {Language.get(std_lang).to_alpha3()}")

# create a locale from the language code and sort the terms with a collator
icu_locale = icu.Locale(lang)
collator = icu.Collator.createInstance(icu_locale)

# only create directories for languages with terms
if count_dict[lang]["count"] > 0:
lang_path = data_path.joinpath(lang)
lang_path.mkdir(parents=True, exist_ok=True)

# sort
sorted_terms = sorted(count_dict[lang]["terms"], key=collator.getSortKey)
count_dict[lang]["sorted_terms"] = sorted_terms
return count_dict

def _setup_dict(glossary):
    """Group the glossary's terms by language and return them sorted.

    A bucket is created for every code in the module-level ``languages``
    table.  Each glossary entry is then scanned; for every key of the entry
    that is a known language code, the bucket's ``"count"`` is incremented,
    the term is appended to ``"terms"``, and ``"term_entry_map"`` records the
    entry's slug and definition under the term text.

    NOTE: ``"term_entry_map"`` is keyed by the term text, so if two entries
    share the same term in one language the later entry replaces the earlier
    one in the map while the term is still listed twice in ``"terms"``.

    Args:
        glossary: Iterable of glossary entries.  Each entry is a mapping with
            a ``"slug"`` key and, per language code, a mapping that has
            ``"term"`` and ``"def"`` keys.

    Returns:
        The per-language dictionary after it has been passed through
        ``_sort_terms``.
    """
    count_dict = {}
    for code, name in languages:
        count_dict[code] = {
            "count": 0,
            "name": name,
            "terms": [],
            "sorted_terms": [],
            "term_entry_map": {},
        }

    for entry in glossary:
        for key in entry:
            # Keys that are not language codes (e.g. "slug") are skipped.
            if key not in count_dict:
                continue

            bucket = count_dict[key]
            translation = entry[key]

            bucket["count"] += 1
            bucket["terms"].append(translation["term"])
            bucket["term_entry_map"][translation["term"]] = {
                "slug": entry["slug"],
                "def": translation["def"],
            }
    return _sort_terms(count_dict)

def _build_lang_glossary(count_dict):
glossary_by_lang = {}
for lang in count_dict:
sorted_glossary = []
for sorted_term in count_dict[lang]["sorted_terms"]:
if sorted_term in count_dict[lang]["term_entry_map"]:
term_map = count_dict[lang]["term_entry_map"][sorted_term]
slug = term_map["slug"]
_def = term_map["def"]

sorted_glossary.append(OrderedDict({
"slug": slug,
lang: {
"term": sorted_term,
"def": _def
}
}))
if sorted_glossary:
glossary_by_lang[lang] = sorted_glossary
return glossary_by_lang

def setup_yaml():
    """Register a PyYAML representer that dumps ``OrderedDict`` as a plain map.

    Approach taken from https://stackoverflow.com/a/8661021
    """
    def _ordered_mapping(dumper, mapping):
        # The items view, not the mapping itself, is handed to the dumper
        # (presumably so the insertion order is what gets written).
        return dumper.represent_mapping('tag:yaml.org,2002:map', mapping.items())

    yaml.add_representer(OrderedDict, _ordered_mapping)

# Load the main glossary file.  The encoding is given explicitly because the
# glossary holds terms in many languages and the platform's default encoding
# is not guaranteed to be UTF-8.
glos = yaml.safe_load(Path('glossary.yml').read_text(encoding='utf-8'))

# Root of the per-language output tree; _sort_terms() reads this global too.
data_path = Path("_data/")

# sort terms
count_dict = _setup_dict(glos)

# rebuild glossary per language
sorted_glossary_by_lang = _build_lang_glossary(count_dict)

# setup yaml for outputting
setup_yaml()
for lang, lang_glossary in sorted_glossary_by_lang.items():
    pprint.pprint(lang_glossary)

    # Make sure the target directory is there before writing into it.
    lang_dir = data_path / lang
    lang_dir.mkdir(parents=True, exist_ok=True)

    # A context manager guarantees the file is flushed and closed after each
    # language instead of leaving the handle open until garbage collection.
    with (lang_dir / 'glossary.yml').open('w', encoding='utf-8') as out_file:
        yaml.dump(lang_glossary, out_file)

# output counts
# pprint.pprint(count_dict)

0 comments on commit 9088323

Please sign in to comment.