-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
211 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .repository import Repository | ||
|
||
# No-op reference to the imported name. NOTE(review): presumably present so that
# linters do not report the re-exported `Repository` import as unused -- confirm.
assert Repository
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import functools | ||
import collections | ||
|
||
from pycldf.dataset import iter_datasets | ||
from pycldf.sources import Sources | ||
from pyigt import Example | ||
|
||
from linglit import base | ||
|
||
|
||
class Publication(base.Publication):
    """
    A publication backed by a CLDF dataset located in the publication directory (`self.dir`).
    """
    @functools.cached_property
    def ds(self):
        """The first dataset that `iter_datasets` discovers in the publication directory."""
        return next(iter_datasets(self.dir))

    @functools.cached_property
    def cfg(self):
        """The repository's catalog entry for this publication, looked up by directory name."""
        return self.repos.catalog[self.dir.name]

    @functools.cached_property
    def languages(self):
        """The objects of the dataset's `LanguageTable`."""
        return self.ds.objects('LanguageTable')

    def iter_references(self):
        """
        Yield the dataset's sources, stripped of a fixed set of fields and annotated.

        Nothing is yielded unless the catalog entry's `bib` flag is set. Depending on the
        catalog entry's `source_to_language` value, sources are related to the Glottocodes of
        languages either via the sources listed for rows of the `LanguageTable` or via the
        sources listed for rows of the `ValueTable`. Sources related to at least one Glottocode
        get an `lgcode` field of the form `[gc1]; [gc2]`; if the catalog entry specifies an
        `hhtype`, it is assigned to every source.

        .. note:: The source objects of the (cached) dataset are modified in place.
        """
        if not self.cfg.bib:
            return

        s2l = self.cfg.source_to_language
        sid2langs = collections.defaultdict(set)  # source ID -> set of Glottocodes
        l2gc = {}  # language ID -> Glottocode, only for languages which have one
        for row in self.languages:
            if row.cldf.glottocode:
                l2gc[row.id] = row.cldf.glottocode
                if s2l == 'LanguageTable':
                    for src in row.cldf.source:
                        sid, _ = Sources.parse(src)
                        sid2langs[sid].add(row.cldf.glottocode)

        if s2l == 'ValueTable':
            for row in self.ds.iter_rows('ValueTable', 'languageReference', 'source'):
                if row['languageReference'] in l2gc:
                    for src in row['source']:
                        sid, _ = Sources.parse(src)
                        sid2langs[sid].add(l2gc[row['languageReference']])

        # Fields that are removed from every source before it is yielded. Defined once, outside
        # of the loop over sources.
        dropped_fields = (
            'besttxt', 'cfn', 'delivered', 'fn',
            'languageid', 'glottolog_ref_id', 'glottolog_ref',
            'gbid', 'google_book_search_id', 'google_book_search_viewability',
            'google_book_viewability',
        )
        for src in self.ds.sources:
            for field in dropped_fields:
                if field in src:
                    del src[field]
            if src.id in sid2langs:
                src['lgcode'] = '; '.join('[{}]'.format(gc) for gc in sorted(sid2langs[src.id]))
            if self.cfg.hhtype:
                src['hhtype'] = self.cfg.hhtype
            yield src

    def iter_cited(self):
        """Yield the ID of every source of the dataset."""
        for src in self.ds.sources:
            yield src.id

    def iter_examples(self, glottolog=None):
        """
        Yield a `base.Example` for each usable row of the dataset's `ExampleTable`.

        Nothing is yielded unless the catalog entry's `igt` flag is set. Rows lacking a primary
        text or a phrase are skipped; the running `ID` counter still counts them.

        :param glottolog: Not used by this implementation.
        """
        if not self.cfg.igt:
            return

        abbrs = {}
        if self.cfg.gloss_abbreviations:
            # The catalog specifies a table and the two columns holding abbreviation and
            # definition.
            fname, abbrcol, defcol = self.cfg.gloss_abbreviations
            abbrs = collections.OrderedDict(
                (r[abbrcol], r[defcol]) for r in self.ds.iter_rows(fname))

        # language ID -> (Glottocode, name)
        l2gc = {lang.id: (lang.cldf.glottocode, lang.cldf.name) for lang in self.languages}
        for count, ex in enumerate(self.ds.objects('ExampleTable', cls=Example), start=1):
            if abbrs:
                ex.igt.abbrs = abbrs
            if not (ex.igt.primary_text and ex.igt.phrase):
                continue
            glottocode, language_name = l2gc[ex.cldf.languageReference]
            # The ExampleTable may lack a metaLanguageReference column, so the attribute is
            # looked up defensively, just like `comment` below, before applying the default.
            # NOTE(review): 'stan1293' is presumably the Glottocode of English -- confirm.
            meta_language = getattr(ex.cldf, 'metaLanguageReference', None) or 'stan1293'
            yield base.Example(
                ID=str(count),
                Local_ID=ex.id,
                Primary_Text=ex.igt.primary_text,
                Analyzed_Word=ex.igt.phrase,
                Gloss=ex.igt.gloss,
                Translated_Text=ex.igt.translation or '',
                Language_ID=glottocode,
                Language_Name=language_name,
                Source=[],
                Abbreviations=ex.igt.gloss_abbrs if ex.igt.is_valid(strict=True) else {},
                Meta_Language_ID=meta_language,
                Comment=getattr(ex.cldf, 'comment', None),
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import shutil | ||
import functools | ||
import collections | ||
|
||
import attr | ||
import cldfzenodo | ||
from clldutils.jsonlib import dump, load | ||
from pycldf import Source | ||
from csvw.dsv import reader | ||
|
||
from linglit import base | ||
from .publication import Publication | ||
|
||
|
||
@attr.s
class Record(base.Record):
    """
    Publication record which additionally carries its citation as BibTeX.
    """
    bibtex = attr.ib()

    def as_source(self):
        """Return the `Source` object created from the record's BibTeX via `Source.from_bibtex`."""
        source = Source.from_bibtex(self.bibtex)
        return source
|
||
|
||
@attr.s
class Dataset:
    """
    One entry (row) of the catalog.csv file of a repository.
    """
    id = attr.ib()
    name = attr.ib()
    conceptdoi = attr.ib()
    source_to_language = attr.ib()
    hhtype = attr.ib()
    # Flags: converted with `bool`, i.e. any non-empty value counts as true.
    bib = attr.ib(converter=bool)
    igt = attr.ib(converter=bool)
    # Whitespace-separated triple (fname, abbrcol, defcol), split into a list.
    gloss_abbreviations = attr.ib(converter=lambda spec: spec.split())
|
||
|
||
class Repository(base.Repository):
    """
    Repository of the CLDF datasets listed in the `catalog.csv` file of the repository directory.

    For a catalog entry named `<did>`, :meth:`create` downloads the dataset into the directory
    `<did>/` and writes the corresponding record's metadata to `<did>.json`.
    """
    id = 'cldf'

    @functools.cached_property
    def catalog(self):
        """Ordered mapping of dataset name to :class:`Dataset`, read from `catalog.csv`."""
        return collections.OrderedDict(
            (row['name'], Dataset(**row))
            for row in reader(self.dir / 'catalog.csv', dicts=True))

    def _metadata_path(self, did):
        """Return the path of the JSON file holding the metadata of dataset `did`."""
        return self.dir / '{}.json'.format(did)

    def __getitem__(self, did):
        """
        Return the :class:`Publication` for the dataset named `did`.

        The record is assembled from the stored metadata of the dataset and the `id` of its
        catalog entry.
        """
        md = self.metadata(did)
        return Publication(
            Record(
                ID=str(self.catalog[did].id),
                DOI=md['doi'],
                license=md['license'],
                creators=md['creators'],
                title=md['title'],
                year=md['year'],
                metalanguage=None,
                objectlanguage=None,
                bibtex=cldfzenodo.Record(**md).bibtex,
            ),
            self.dir / did,
            self)

    def metadata(self, did):
        """Load and return the metadata stored in the JSON file of dataset `did`."""
        return load(self._metadata_path(did))

    def iter_publications(self):
        """Yield a :class:`Publication` for each entry of the catalog, in catalog order."""
        # Delegates to __getitem__ so that publications are constructed in exactly one place.
        for did in self.catalog:
            yield self[did]

    def create(self, verbose=False):
        """
        Download every dataset of the catalog, unless an up-to-date copy is already present.

        A copy counts as up-to-date if the dataset directory exists and the version in the stored
        metadata equals the version of the record retrieved for the concept DOI. A directory
        without a metadata file (e.g. left behind by an interrupted download - the metadata is
        written only after the download finished) is considered stale and is replaced.

        :param verbose: Currently unused; progress is always printed.
        """
        for did, dataset in self.catalog.items():
            dldir = self.dir / did
            md_path = self._metadata_path(did)
            print(did)
            rec = cldfzenodo.Record.from_concept_doi(dataset.conceptdoi)
            if dldir.exists():
                if md_path.exists() and self.metadata(did)['version'] == rec.version:
                    continue
                shutil.rmtree(dldir)
            print('downloading {} ...'.format(rec.version))
            rec.download_dataset(dldir)
            print('... done')
            dump(attr.asdict(rec), md_path, indent=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters