Skip to content

Commit

Permalink
add CLDF data provider
Browse files Browse the repository at this point in the history
  • Loading branch information
xrotwang committed Apr 18, 2024
1 parent 467252b commit 3f8219c
Show file tree
Hide file tree
Showing 9 changed files with 211 additions and 12 deletions.
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ python_requires = >=3.8
install_requires =
tqdm
clldutils >= 3.12
cldfzenodo
attrs
csvw
TexSoup
Expand All @@ -29,7 +30,7 @@ install_requires =
lxml
pyglottolog
pycldf
pyigt>=1.4.1
pyigt>=2.1
thefuzz
unidecode

Expand Down
3 changes: 2 additions & 1 deletion src/linglit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@

from . import langsci
from . import glossa
from . import cldf
from .base import Repository, Glottolog

assert langsci and glossa
assert langsci and glossa and cldf
PROVIDERS = {r.id: r for r in Repository.__subclasses__() if r.id}


Expand Down
12 changes: 6 additions & 6 deletions src/linglit/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import typing
import pathlib
import functools
import collections

import attr
from pyigt import IGT
from pycldf.sources import Source
from clldutils.misc import lazyproperty
from pyglottolog import Glottolog as API

from linglit.util import clean_translation
Expand Down Expand Up @@ -159,11 +159,11 @@ def is_current(self) -> bool:
def has_open_license(self) -> bool:
return self.record.has_open_license

@lazyproperty
@functools.cached_property
def cited_references(self) -> typing.List[Source]:
return [ref for ref in self.references.values() if ref.id in self.cited]

@lazyproperty
@functools.cached_property
def id(self) -> str:
return '{}{}'.format(self.repos.id, self.record.ID)

Expand All @@ -172,7 +172,7 @@ def as_source(self) -> Source:
src.id = self.id
return src

@lazyproperty
@functools.cached_property
def references(self) -> typing.OrderedDict[str, Source]:
res = collections.OrderedDict()
for src in self.iter_references():
Expand All @@ -184,7 +184,7 @@ def references(self) -> typing.OrderedDict[str, Source]:
def iter_references(self) -> typing.Generator[Source, None, None]: # pragma: no cover
raise NotImplementedError()

@lazyproperty
@functools.cached_property
def cited(self) -> collections.Counter:
res = collections.Counter()
for key in self.iter_cited():
Expand All @@ -200,7 +200,7 @@ def example_sources(self, ex: Example) -> typing.List[Source]:
"""
return [self.references[sid] for sid, _ in ex.Source if sid != self.id] + [self.as_source()]

@lazyproperty
@functools.cached_property
def examples(self):
res = []
for ex in self.iter_examples():
Expand Down
8 changes: 8 additions & 0 deletions src/linglit/bibtex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
YEAR_PATTERN = re.compile('([0-9]{4})')
ACC_FIELDS = { # Fields where content from merged records should be accumulated.
'isreferencedby': ' ',
'lgcode': '; ',
}


Expand Down Expand Up @@ -173,6 +174,13 @@ def make_key(e):
s = s.replace("ö", "oe")
s = s.replace("ü", "ue")
s = s.replace('"=', '-')
s = s.replace('`', '')
s = s.replace('{', '')
s = s.replace('}', '')
s = s.replace('[', '')
s = s.replace(']', '')
s = s.replace('?', 'na')
s = s.replace('&', '')
creators = unidecode(s).replace(',', '') # unidecode converts ogonek to comma!
for c in "/.'()= ":
creators = creators.replace(c, '')
Expand Down
3 changes: 3 additions & 0 deletions src/linglit/cldf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Importing the module defines the CLDF `Repository` class as a side effect.
from .repository import Repository

# NOTE(review): presumably only here to mark the import as used (e.g. for linters) - confirm.
assert Repository
87 changes: 87 additions & 0 deletions src/linglit/cldf/publication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import functools
import collections

from pycldf.dataset import iter_datasets
from pycldf.sources import Sources
from pyigt import Example

from linglit import base


class Publication(base.Publication):
    """
    A publication backed by a CLDF dataset located in the publication's directory.

    Per-dataset settings are read from the repository's catalog, keyed by directory name.
    """
    # Bibliography fields which are removed from each source before it is yielded.
    _DROPPED_SOURCE_FIELDS = (
        'besttxt', 'cfn', 'delivered', 'fn',
        'languageid', 'glottolog_ref_id', 'glottolog_ref',
        'gbid', 'google_book_search_id', 'google_book_search_viewability',
        'google_book_viewability',
    )

    @functools.cached_property
    def ds(self):
        """
        The first CLDF dataset found in the publication's directory.

        :raises ValueError: if no dataset is found in the directory.
        """
        ds = next(iter_datasets(self.dir), None)
        if ds is None:
            # A bare StopIteration would surface as an opaque RuntimeError when this property
            # is first accessed from within one of the generator methods below (PEP 479).
            raise ValueError('No CLDF dataset found in {}'.format(self.dir))
        return ds

    @functools.cached_property
    def cfg(self):
        """
        The catalog entry for this dataset, looked up by the name of the directory.
        """
        return self.repos.catalog[self.dir.name]

    @functools.cached_property
    def languages(self):
        """
        The objects of the dataset's LanguageTable.
        """
        return self.ds.objects('LanguageTable')

    def iter_references(self):
        """
        Yield the sources of the dataset, cleaned up and enriched with catalog information.

        Nothing is yielded unless the catalog entry has `bib` set. Depending on the catalog's
        `source_to_language` setting, sources are linked to the Glottocodes of the languages
        which cite them, either directly in LanguageTable or via the rows of ValueTable. The
        linked Glottocodes are written to the `lgcode` field, and `hhtype` is set from the
        catalog if specified there.
        """
        if not self.cfg.bib:
            return
        s2l = self.cfg.source_to_language
        sid2langs = collections.defaultdict(set)
        l2gc = {}
        for row in self.languages:
            glottocode = row.cldf.glottocode
            if not glottocode:
                # Languages without Glottocode can neither be looked up nor linked to sources.
                continue
            l2gc[row.id] = glottocode
            if s2l == 'LanguageTable':
                for src in row.cldf.source:
                    sid, _ = Sources.parse(src)
                    sid2langs[sid].add(glottocode)
        if s2l == 'ValueTable':
            for row in self.ds.iter_rows('ValueTable', 'languageReference', 'source'):
                if row['languageReference'] in l2gc:
                    for src in row['source']:
                        sid, _ = Sources.parse(src)
                        sid2langs[sid].add(l2gc[row['languageReference']])
        for src in self.ds.sources:
            for field in self._DROPPED_SOURCE_FIELDS:
                if field in src:
                    del src[field]
            if src.id in sid2langs:
                src['lgcode'] = '; '.join('[{}]'.format(gc) for gc in sorted(sid2langs[src.id]))
            if self.cfg.hhtype:
                src['hhtype'] = self.cfg.hhtype
            yield src

    def iter_cited(self):
        """
        Yield the ID of each source of the dataset, i.e. all sources count as cited.
        """
        for src in self.ds.sources:
            yield src.id

    def iter_examples(self, glottolog=None):
        """
        Yield a `base.Example` for each row of ExampleTable with primary text and phrase.

        Nothing is yielded unless the catalog entry has `igt` set. If the catalog specifies
        a gloss abbreviations table, its content is attached to each example before the
        example's glosses are evaluated.

        :param glottolog: Not used by this implementation.
        """
        if not self.cfg.igt:
            # Avoid reading the abbreviation and language tables if no examples are wanted.
            return
        abbrs = {}
        if self.cfg.gloss_abbreviations:
            fname, abbrcol, defcol = self.cfg.gloss_abbreviations
            abbrs = collections.OrderedDict(
                (r[abbrcol], r[defcol]) for r in self.ds.iter_rows(fname))
        # Maps language ID to a pair (glottocode, name).
        l2gc = {lang.id: (lang.cldf.glottocode, lang.cldf.name) for lang in self.languages}
        # The running number also counts examples which are skipped below.
        for count, ex in enumerate(self.ds.objects('ExampleTable', cls=Example), start=1):
            if abbrs:
                ex.igt.abbrs = abbrs
            if not (ex.igt.primary_text and ex.igt.phrase):
                continue
            glottocode, language_name = l2gc[ex.cldf.languageReference]
            yield base.Example(
                ID='{}'.format(count),
                Local_ID=ex.id,
                Primary_Text=ex.igt.primary_text,
                Analyzed_Word=ex.igt.phrase,
                Gloss=ex.igt.gloss,
                Translated_Text=ex.igt.translation or '',
                Language_ID=glottocode,
                Language_Name=language_name,
                Source=[],
                # Abbreviations are only reported for examples which pass strict validation.
                Abbreviations=ex.igt.gloss_abbrs if ex.igt.is_valid(strict=True) else {},
                # Falls back to 'stan1293' if the example does not specify a meta language.
                Meta_Language_ID=ex.cldf.metaLanguageReference or 'stan1293',
                Comment=getattr(ex.cldf, 'comment', None),
            )
96 changes: 96 additions & 0 deletions src/linglit/cldf/repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import shutil
import functools
import collections

import attr
import cldfzenodo
from clldutils.jsonlib import dump, load
from pycldf import Source
from csvw.dsv import reader

from linglit import base
from .publication import Publication


@attr.s
class Record(base.Record):
    """
    Publication metadata of a CLDF dataset, extended with the dataset's BibTeX citation.
    """
    # BibTeX record in a form accepted by `Source.from_bibtex`.
    bibtex = attr.ib()

    def as_source(self):
        """
        Return the `Source` object parsed from the record's BibTeX citation.
        """
        source = Source.from_bibtex(self.bibtex)
        return source


@attr.s
class Dataset:
    """
    One entry (row) of the catalog.csv file of a repository.
    """
    id = attr.ib()
    name = attr.ib()
    conceptdoi = attr.ib()
    source_to_language = attr.ib()
    hhtype = attr.ib()
    # Flags: any truthy value (e.g. a non-empty string) switches the feature on.
    bib = attr.ib(converter=bool)
    igt = attr.ib(converter=bool)
    # Whitespace-separated triple (fname, abbrcol, defcol).
    gloss_abbreviations = attr.ib(converter=lambda cell: cell.split())


class Repository(base.Repository):
    """
    Provider for the CLDF datasets listed in the catalog.csv file of the repository directory.

    Each dataset is stored in a sub-directory named after the dataset, with the metadata of
    the downloaded record kept in a JSON file of the same name next to it.
    """
    id = 'cldf'

    @functools.cached_property
    def catalog(self):
        """
        The rows of catalog.csv as `Dataset` objects, keyed by dataset name, in file order.
        """
        return collections.OrderedDict(
            (row['name'], Dataset(**row)) for row in reader(self.dir / 'catalog.csv', dicts=True))

    def _publication(self, did, dataset=None):
        """
        Assemble the `Publication` for the dataset named `did` from its stored metadata.

        :param did: Name of the dataset, i.e. a key of the catalog.
        :param dataset: The catalog entry for `did`; looked up in the catalog if omitted.
        """
        md = self.metadata(did)
        if dataset is None:
            dataset = self.catalog[did]
        return Publication(
            Record(
                ID=str(dataset.id),
                DOI=md['doi'],
                license=md['license'],
                creators=md['creators'],
                title=md['title'],
                year=md['year'],
                metalanguage=None,
                objectlanguage=None,
                bibtex=cldfzenodo.Record(**md).bibtex,
            ),
            self.dir / did,
            self)

    def __getitem__(self, did):
        """
        Return the `Publication` for the dataset named `did`.
        """
        return self._publication(did)

    def metadata(self, did):
        """
        Load the stored record metadata for the dataset named `did` from its JSON file.
        """
        return load(self.dir / '{}.json'.format(did))

    def iter_publications(self):
        """
        Yield a `Publication` for each dataset in the catalog, in catalog order.
        """
        for did, dataset in self.catalog.items():
            yield self._publication(did, dataset)

    def create(self, verbose=False):
        """
        Download all datasets listed in the catalog, replacing outdated downloads.

        A dataset is skipped if its directory exists and the stored metadata reports the same
        version as the record resolved from the concept DOI. Otherwise an existing directory
        is removed and the dataset is downloaded again.

        :param verbose: Not used by this implementation; progress is always printed.
        """
        for did, dataset in self.catalog.items():
            dldir = self.dir / did
            mdpath = self.dir / '{}.json'.format(did)
            print(did)
            rec = cldfzenodo.Record.from_concept_doi(dataset.conceptdoi)
            if dldir.exists():
                # Metadata is written only after a download has finished. A directory without
                # metadata thus stems from an interrupted run and is downloaded again.
                if mdpath.exists() and self.metadata(did)['version'] == rec.version:
                    continue
                shutil.rmtree(dldir)
            print('downloading {} ...'.format(rec.version))
            rec.download_dataset(dldir)
            print('... done')
            dump(attr.asdict(rec), mdpath, indent=2)
5 changes: 4 additions & 1 deletion src/linglit/commands/mergedbib.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from clldutils.path import TemporaryDirectory
from tqdm import tqdm
from pybtex.database import parse_string

from linglit.cli_util import add_provider, get_provider
from linglit.bibtex import iter_entries, iter_merged
Expand Down Expand Up @@ -31,4 +32,6 @@ def bibtex(src):
for src in pub.cited_references:
bib.write(bibtex(src))
for src, _ in iter_merged(iter_entries(tmp)):
print(bibtex(src))
res = bibtex(src)
parse_string(res, 'bibtex')
print(res)
6 changes: 3 additions & 3 deletions src/linglit/langsci/publication.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import re
import functools
import collections

from clldutils.path import walk
from clldutils.misc import lazyproperty

from linglit import base
from .bibtex import iter_bib, normalize_key
Expand Down Expand Up @@ -97,7 +97,7 @@ def read_tex(self, p, with_input=True):
self._includes_tex[str(p)] = texfixes.read_tex(p, with_input=with_input)
return self._includes_tex[str(p)]

@lazyproperty
@functools.cached_property
def gloss_abbreviations(self):
abbr_pattern = re.compile(r'\\(sub)?section\*?(\[[^]]+])?{Abbreviations(\s+[A-Za-z]+)*}')
section_pattern = re.compile(r'\\(?:sub)?section')
Expand Down Expand Up @@ -131,7 +131,7 @@ def bibs(self):
self._get_includes_and_bibs() # pragma: no cover
return self._bibs

@lazyproperty
@functools.cached_property
def bibkeys(self):
res = {}
for src in self.iter_references():
Expand Down

0 comments on commit 3f8219c

Please sign in to comment.