From 2b7c79fab0b8f52c484f5dc548bd2a675e5bfdff Mon Sep 17 00:00:00 2001 From: froggleston Date: Tue, 17 Sep 2024 20:41:21 +0100 Subject: [PATCH] Create sorted glossaries in _data by lang --- .github/workflows/yaml-lint.yaml | 5 ++- Makefile | 21 ++++++---- utils/sort-glossary.py | 71 ++++++++++++++++++++++---------- 3 files changed, 64 insertions(+), 33 deletions(-) diff --git a/.github/workflows/yaml-lint.yaml b/.github/workflows/yaml-lint.yaml index 50a37311..a1b86a46 100644 --- a/.github/workflows/yaml-lint.yaml +++ b/.github/workflows/yaml-lint.yaml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.11] steps: - uses: actions/checkout@v3 @@ -17,8 +17,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + apt-get install -y build-essential libicu-dev python -m pip install --upgrade pip - pip install yamllint + pip install yamllint pycld2 pyicu-binary if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint _config.yml with yamllint run: | diff --git a/Makefile b/Makefile index 9c629cf0..8f1e6e55 100644 --- a/Makefile +++ b/Makefile @@ -5,13 +5,21 @@ all : commands commands : @grep -h -E '^##' ${MAKEFILE_LIST} | sed -e 's/## //g' | column -t -s ':' +_data/glossary.yml : ./glossary.yml + @mkdir -p _data + @cp $< $@ + +sort-glossary : _data/glossary.yml + @yamllint glossary.yml + @python utils/sort-glossary.py + ## site : rebuild GitHub Pages site locally. -site : _data/glossary.yml +site : sort-glossary rm -rf .jekyll-cache .jekyll-metadata _site bundle exec jekyll build ## gh-site : builds the website for GitHub pages (part of the GH Actions workflow) -gh-site : _data/glossary.yml +gh-site : sort-glossary @rm -rf _gh-site @mkdir -p _gh-site @cp -r `ls -A | grep -v '.git' | grep -v '_gh-site' | grep -v '_site'` _gh-site @@ -19,7 +27,7 @@ gh-site : _data/glossary.yml @cp $< _gh-site/$< ## serve : serve GitHub Pages site locally. -serve : _data/glossary.yml +serve : sort-glossary rm -rf _site bundle exec jekyll serve -I @@ -27,7 +35,7 @@ serve : _data/glossary.yml clean : @rm -rf _site @find . -name '*~' -exec rm {} \; - @rm -f _data/glossary.yml + @rm -rf _data/* ## check : check glossary consistency. check : @@ -37,8 +45,3 @@ check : ## checkall : check glossary consistency including missing terms in all languages. checkall : @python utils/check-glossary.py -A _config.yml glossary.yml - -# Create copy of glossary file for GitHub Pages site. -_data/glossary.yml : ./glossary.yml - @mkdir -p _data - @cp $< $@ diff --git a/utils/sort-glossary.py b/utils/sort-glossary.py index 299490ce..f38a6518 100644 --- a/utils/sort-glossary.py +++ b/utils/sort-glossary.py @@ -6,7 +6,8 @@ import icu -languages = [ +# set up supported languages +LANGUAGES = [ ('aa', 'Afar'), ('ab', 'Abkhazian'), ('af', 'Afrikaans'), @@ -212,13 +213,15 @@ ('zu', 'Zulu') ] -def _sort_terms(count_dict): + +def _sort_terms(count_dict, data_path): # sort and reassign terms for lang in count_dict: + # check 2-letter language codes vs 3-letter language codes # std_lang = standardize_tag(lang) # print(f"{lang} -> {std_lang} -> {Language.get(std_lang).to_alpha3()}") - # create a locale from the language code and sort the terms with a collator + # create a locale from the language code and a collator to perform sorting icu_locale = icu.Locale(lang) collator = icu.Collator.createInstance(icu_locale) @@ -227,16 +230,18 @@ def _sort_terms(count_dict): lang_path = data_path.joinpath(lang) lang_path.mkdir(parents=True, exist_ok=True) - # sort + # sort and store sorted terms separate from the original list sorted_terms = sorted(count_dict[lang]["terms"], key=collator.getSortKey) count_dict[lang]["sorted_terms"] = sorted_terms return count_dict -def _setup_dict(glossary): + +def _setup_dict(glossary, data_path): + # data structure to hold counts and terms count_dict = {} - lang_codes = [] - for cc in languages: + lang_codes = [] + for cc in LANGUAGES: count_dict[cc[0]] = {} count_dict[cc[0]]["count"] = 0 count_dict[cc[0]]["name"] = cc[1] @@ -248,6 +253,7 @@ def _setup_dict(glossary): # total number of glossary terms # print(len(glos)) + # count terms and store them in the data structure for slug in glossary: for lang in slug.keys(): if lang in lang_codes: @@ -259,18 +265,24 @@ def _setup_dict(glossary): "def": slug[lang]["def"] } ) - return _sort_terms(count_dict) + + # return the data structure including sorted terms + return _sort_terms(count_dict, data_path) + def _build_lang_glossary(count_dict): glossary_by_lang = {} for lang in count_dict: sorted_glossary = [] + + # process the data structure to create a new sorted glossary per language for sorted_term in count_dict[lang]["sorted_terms"]: if sorted_term in count_dict[lang]["term_entry_map"]: term_map = count_dict[lang]["term_entry_map"][sorted_term] slug = term_map["slug"] _def = term_map["def"] + # use an OrderedDict to retain insertion order sorted_glossary.append(OrderedDict({ "slug": slug, lang: { @@ -278,31 +290,46 @@ def _build_lang_glossary(count_dict): "def": _def } })) + + # only include languages with terms if sorted_glossary: glossary_by_lang[lang] = sorted_glossary return glossary_by_lang + def setup_yaml(): """ https://stackoverflow.com/a/8661021 """ def represent_dict_order(self, data): return self.represent_mapping('tag:yaml.org,2002:map', data.items()) yaml.add_representer(OrderedDict, represent_dict_order) -# load main glossary file -glos = yaml.safe_load(Path('glossary.yml').read_text()) -data_path = Path("_data/") -# sort terms -count_dict = _setup_dict(glos) +def main(): + try: + # get path + current_path = Path(__file__).resolve() + + # load main glossary file + data_path = current_path.parent.parent.joinpath("_data/") + glossary_path = data_path.joinpath("glossary.yml") + glos = yaml.safe_load(glossary_path.read_text()) + + # sort terms + sort_dict = _setup_dict(glos, data_path) + + # rebuild glossary per language + sorted_glossary_by_lang = _build_lang_glossary(sort_dict) + + # setup yaml for outputting + setup_yaml() + for lang in sorted_glossary_by_lang: + yaml.dump(sorted_glossary_by_lang[lang], Path(f'_data/{lang}/glossary.yml').open('w')) -# rebuild glossary per language -sorted_glossary_by_lang = _build_lang_glossary(count_dict) + # output counts + # pprint.pprint(count_dict) + except Exception as e: + print(e) -# setup yaml for outputting -setup_yaml() -for lang in sorted_glossary_by_lang: - pprint.pprint(sorted_glossary_by_lang[lang]) - yaml.dump(sorted_glossary_by_lang[lang], Path(f'_data/{lang}/glossary.yml').open('w')) -# output counts -# pprint.pprint(count_dict) +if __name__ == '__main__': + main()