add CLDF data provider

cldf · Apr 18, 2024 · 3f8219c
1 parent 467252b
commit 3f8219c
Show file tree

Hide file tree

Showing 9 changed files with 211 additions and 12 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -21,6 +21,7 @@ python_requires = >=3.8
 install_requires =
     tqdm
     clldutils >= 3.12
+    cldfzenodo
     attrs
     csvw
     TexSoup
@@ -29,7 +30,7 @@ install_requires =
     lxml
     pyglottolog
     pycldf
-    pyigt>=1.4.1
+    pyigt>=2.1
     thefuzz
     unidecode
 

diff --git a/src/linglit/__init__.py b/src/linglit/__init__.py
@@ -5,9 +5,10 @@
 
 from . import langsci
 from . import glossa
+from . import cldf
 from .base import Repository, Glottolog
 
-assert langsci and glossa
+assert langsci and glossa and cldf
 PROVIDERS = {r.id: r for r in Repository.__subclasses__() if r.id}
 
 

diff --git a/src/linglit/base/__init__.py b/src/linglit/base/__init__.py
@@ -1,11 +1,11 @@
 import typing
 import pathlib
+import functools
 import collections
 
 import attr
 from pyigt import IGT
 from pycldf.sources import Source
-from clldutils.misc import lazyproperty
 from pyglottolog import Glottolog as API
 
 from linglit.util import clean_translation
@@ -159,11 +159,11 @@ def is_current(self) -> bool:
     def has_open_license(self) -> bool:
         return self.record.has_open_license
 
-    @lazyproperty
+    @functools.cached_property
     def cited_references(self) -> typing.List[Source]:
         return [ref for ref in self.references.values() if ref.id in self.cited]
 
-    @lazyproperty
+    @functools.cached_property
     def id(self) -> str:
         return '{}{}'.format(self.repos.id, self.record.ID)
 
@@ -172,7 +172,7 @@ def as_source(self) -> Source:
         src.id = self.id
         return src
 
-    @lazyproperty
+    @functools.cached_property
     def references(self) -> typing.OrderedDict[str, Source]:
         res = collections.OrderedDict()
         for src in self.iter_references():
@@ -184,7 +184,7 @@ def references(self) -> typing.OrderedDict[str, Source]:
     def iter_references(self) -> typing.Generator[Source, None, None]:  # pragma: no cover
         raise NotImplementedError()
 
-    @lazyproperty
+    @functools.cached_property
     def cited(self) -> collections.Counter:
         res = collections.Counter()
         for key in self.iter_cited():
@@ -200,7 +200,7 @@ def example_sources(self, ex: Example) -> typing.List[Source]:
         """
         return [self.references[sid] for sid, _ in ex.Source if sid != self.id] + [self.as_source()]
 
-    @lazyproperty
+    @functools.cached_property
     def examples(self):
         res = []
         for ex in self.iter_examples():

diff --git a/src/linglit/bibtex.py b/src/linglit/bibtex.py
@@ -18,6 +18,7 @@
 YEAR_PATTERN = re.compile('([0-9]{4})')
 ACC_FIELDS = {  # Fields where content from merged records should be accumulated.
     'isreferencedby': ' ',
+    'lgcode': '; ',
 }
 
 
@@ -173,6 +174,13 @@ def make_key(e):
     s = s.replace("ö", "oe")
     s = s.replace("ü", "ue")
     s = s.replace('"=', '-')
+    s = s.replace('`', '')
+    s = s.replace('{', '')
+    s = s.replace('}', '')
+    s = s.replace('[', '')
+    s = s.replace(']', '')
+    s = s.replace('?', 'na')
+    s = s.replace('&', '')
     creators = unidecode(s).replace(',', '')  # unidecode converts ogonek to comma!
     for c in "/.'()= ":
         creators = creators.replace(c, '')

diff --git a/src/linglit/cldf/__init__.py b/src/linglit/cldf/__init__.py
@@ -0,0 +1,3 @@
+from .repository import Repository
+
+assert Repository
diff --git a/src/linglit/cldf/publication.py b/src/linglit/cldf/publication.py
@@ -0,0 +1,87 @@
+import functools
+import collections
+
+from pycldf.dataset import iter_datasets
+from pycldf.sources import Sources
+from pyigt import Example
+
+from linglit import base
+
+
+class Publication(base.Publication):
+    @functools.cached_property
+    def ds(self):
+        return next(iter_datasets(self.dir))
+
+    @functools.cached_property
+    def cfg(self):
+        return self.repos.catalog[self.dir.name]
+
+    @functools.cached_property
+    def languages(self):
+        return self.ds.objects('LanguageTable')
+
+    def iter_references(self):
+        sid2langs = collections.defaultdict(set)
+        if not self.cfg.bib:
+            return
+        s2l = self.cfg.source_to_language
+        l2gc = {}
+        for row in self.languages:
+            if row.cldf.glottocode:
+                l2gc[row.id] = row.cldf.glottocode
+                if s2l == 'LanguageTable':
+                    for src in row.cldf.source:
+                        sid, _ = Sources.parse(src)
+                        sid2langs[sid].add(row.cldf.glottocode)
+        if s2l == 'ValueTable':
+            for row in self.ds.iter_rows('ValueTable', 'languageReference', 'source'):
+                if row['languageReference'] in l2gc:
+                    for src in row['source']:
+                        sid, _ = Sources.parse(src)
+                        sid2langs[sid].add(l2gc[row['languageReference']])
+        for src in self.ds.sources:
+            for field in [
+                'besttxt', 'cfn', 'delivered', 'fn',
+                'languageid', 'glottolog_ref_id', 'glottolog_ref',
+                'gbid', 'google_book_search_id', 'google_book_search_viewability',
+                'google_book_viewability',
+            ]:
+                if field in src:
+                    del src[field]
+            if src.id in sid2langs:
+                src['lgcode'] = '; '.join('[{}]'.format(gc) for gc in sorted(sid2langs[src.id]))
+            if self.cfg.hhtype:
+                src['hhtype'] = self.cfg.hhtype
+            yield src
+
+    def iter_cited(self):
+        for src in self.ds.sources:
+            yield src.id
+
+    def iter_examples(self, glottolog=None):
+        abbrs = {}
+        if self.cfg.gloss_abbreviations:
+            fname, abbrcol, defcol = self.cfg.gloss_abbreviations
+            abbrs = collections.OrderedDict(
+                [(r[abbrcol], r[defcol]) for r in self.ds.iter_rows(fname)])
+        l2gc = {l.id: (l.cldf.glottocode, l.cldf.name) for l in self.languages}
+        if self.cfg.igt:
+            for count, ex in enumerate(self.ds.objects('ExampleTable', cls=Example), start=1):
+                if abbrs:
+                    ex.igt.abbrs = abbrs
+                if ex.igt.primary_text and ex.igt.phrase:
+                    yield base.Example(
+                        ID='{}'.format(count),
+                        Local_ID=ex.id,
+                        Primary_Text=ex.igt.primary_text,
+                        Analyzed_Word=ex.igt.phrase,
+                        Gloss=ex.igt.gloss,
+                        Translated_Text=ex.igt.translation or '',
+                        Language_ID=l2gc[ex.cldf.languageReference][0],
+                        Language_Name=l2gc[ex.cldf.languageReference][1],
+                        Source=[],
+                        Abbreviations=ex.igt.gloss_abbrs if ex.igt.is_valid(strict=True) else {},
+                        Meta_Language_ID=ex.cldf.metaLanguageReference or 'stan1293',
+                        Comment=getattr(ex.cldf, 'comment', None),
+                    )
diff --git a/src/linglit/cldf/repository.py b/src/linglit/cldf/repository.py
@@ -0,0 +1,96 @@
+import shutil
+import functools
+import collections
+
+import attr
+import cldfzenodo
+from clldutils.jsonlib import dump, load
+from pycldf import Source
+from csvw.dsv import reader
+
+from linglit import base
+from .publication import Publication
+
+
+@attr.s
+class Record(base.Record):
+    bibtex = attr.ib()
+
+    def as_source(self):
+        return Source.from_bibtex(self.bibtex)
+
+
+@attr.s
+class Dataset:
+    """
+    A row in a repository's catalog.csv file.
+    """
+    id = attr.ib()
+    name = attr.ib()
+    conceptdoi = attr.ib()
+    source_to_language = attr.ib()
+    hhtype = attr.ib()
+    bib = attr.ib(converter=lambda s: bool(s))
+    igt = attr.ib(converter=lambda s: bool(s))
+    gloss_abbreviations = attr.ib(converter=lambda s: s.split())  # triple (fname, abbrcol, defcol)
+
+
+class Repository(base.Repository):
+    id = 'cldf'
+
+    @functools.cached_property
+    def catalog(self):
+        return collections.OrderedDict([
+            (row['name'], Dataset(**row)) for row in reader(self.dir / 'catalog.csv', dicts=True)])
+
+    def __getitem__(self, did):
+        md = self.metadata(did)
+        return Publication(
+            Record(
+                ID=str(self.catalog[did].id),
+                DOI=md['doi'],
+                license=md['license'],
+                creators=md['creators'],
+                title=md['title'],
+                year=md['year'],
+                metalanguage=None,
+                objectlanguage=None,
+                bibtex=cldfzenodo.Record(**md).bibtex,
+            ),
+            self.dir / did,
+            self)
+
+    def metadata(self, did):
+        return load(self.dir / '{}.json'.format(did))
+
+    def iter_publications(self):
+        for i, (did, rmd) in enumerate(self.catalog.items(), start=1):
+            md = self.metadata(did)
+            yield Publication(
+                Record(
+                    ID=str(rmd.id),
+                    DOI=md['doi'],
+                    license=md['license'],
+                    creators=md['creators'],
+                    title=md['title'],
+                    year=md['year'],
+                    metalanguage=None,
+                    objectlanguage=None,
+                    bibtex=cldfzenodo.Record(**md).bibtex,
+                ),
+                self.dir / did,
+                self)
+
+    def create(self, verbose=False):
+        for did, dataset in self.catalog.items():
+            dldir = self.dir / did
+            print(did)
+            rec = cldfzenodo.Record.from_concept_doi(dataset.conceptdoi)
+            if dldir.exists():
+                if self.metadata(did)['version'] == rec.version:
+                    continue
+                shutil.rmtree(dldir)
+            print('downloading {} ...'.format(rec.version))
+            rec.download_dataset(self.dir / did)
+            print('... done')
+            dump(attr.asdict(rec), self.dir / '{}.json'.format(did), indent=2)
diff --git a/src/linglit/commands/mergedbib.py b/src/linglit/commands/mergedbib.py
@@ -3,6 +3,7 @@
 """
 from clldutils.path import TemporaryDirectory
 from tqdm import tqdm
+from pybtex.database import parse_string
 
 from linglit.cli_util import add_provider, get_provider
 from linglit.bibtex import iter_entries, iter_merged
@@ -31,4 +32,6 @@ def bibtex(src):
                 for src in pub.cited_references:
                     bib.write(bibtex(src))
         for src, _ in iter_merged(iter_entries(tmp)):
-            print(bibtex(src))
+            res = bibtex(src)
+            parse_string(res, 'bibtex')
+            print(res)
diff --git a/src/linglit/langsci/publication.py b/src/linglit/langsci/publication.py
@@ -1,8 +1,8 @@
 import re
+import functools
 import collections
 
 from clldutils.path import walk
-from clldutils.misc import lazyproperty
 
 from linglit import base
 from .bibtex import iter_bib, normalize_key
@@ -97,7 +97,7 @@ def read_tex(self, p, with_input=True):
             self._includes_tex[str(p)] = texfixes.read_tex(p, with_input=with_input)
         return self._includes_tex[str(p)]
 
-    @lazyproperty
+    @functools.cached_property
     def gloss_abbreviations(self):
         abbr_pattern = re.compile(r'\\(sub)?section\*?(\[[^]]+])?{Abbreviations(\s+[A-Za-z]+)*}')
         section_pattern = re.compile(r'\\(?:sub)?section')
@@ -131,7 +131,7 @@ def bibs(self):
             self._get_includes_and_bibs()  # pragma: no cover
         return self._bibs
 
-    @lazyproperty
+    @functools.cached_property
     def bibkeys(self):
         res = {}
         for src in self.iter_references():
Original file line number	Diff line number	Diff line change
@@ -0,0 +1,3 @@
+from .repository import Repository
+
+assert Repository