Commit: Code cleanup, Glottolog 4.0.

chrzyki committed Jul 2, 2019
1 parent 92e8532 commit 986761d
Showing 6 changed files with 48 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -34,4 +34,4 @@ This dataset, which is well-known among Sinologists, comprises 18 dialect varieties
 - **Invalid lexemes:** 0
 - **Tokens:** 121,097
 - **Segments:** 247 (0 BIPA errors, 0 CLTS sound class errors, 247 CLTS modified)
-- **Inventory size (avg):** 61.06
+- **Inventory size (avg):** 61.06
5 changes: 2 additions & 3 deletions cldf/cldf-metadata.json
@@ -13,7 +13,6 @@
     "dc:related": null,
     "dc:source": "sources.bib",
     "dc:title": "Chinese Dialect Vocabularies",
-    "dcat:accessURL": "https://github.com/lexibank/beidasinitic",
     "rdf:ID": "beidasinitic",
     "rdf:type": "http://www.w3.org/ns/dcat#Distribution",
     "dialect": {
@@ -23,8 +22,8 @@
         {
             "dc:title": "environment",
             "properties": {
-                "glottolog_version": "v3.4-1-g07a9b54e37",
-                "concepticon_version": "pyconcepticon-1.4.0-206-g1ad282b"
+                "glottolog_version": "v4.0",
+                "concepticon_version": "v2.0"
             }
         }
     ],
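The rewritten environment block records which Glottolog and Concepticon releases the CLDF data was built against. A minimal sketch of how a consumer might read those versions back out with the standard library, assuming the block sits in the "prov:wasGeneratedBy" list as pycldf writes it (file path and key names taken from the diff above):

import json

# Load the CLDF metadata and look up the "environment" provenance
# record; the key layout is assumed from the diff above.
with open("cldf/cldf-metadata.json", encoding="utf-8") as fp:
    md = json.load(fp)

for record in md.get("prov:wasGeneratedBy", []):
    if record.get("dc:title") == "environment":
        props = record["properties"]
        print(props["glottolog_version"])    # v4.0 after this commit
        print(props["concepticon_version"])  # v2.0 after this commit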
4 changes: 2 additions & 2 deletions cldf/sources.bib
@@ -1,6 +1,6 @@
 @book{Cihui,
     Editor = {北京大学, Běijīng Dàxué},
-    Publisher = {Wénzì Gǎigé 文字改革},
-    Title = {Hànyǔ fāngyán cíhuì},
+    Publisher = {Wénzì Gǎigé 文字改革},
+    Title = {Hànyǔ fāngyán cíhuì 汉语方言词汇},
     Year = {1964}
 }
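For reference, the same record can be built programmatically with pycldf's Source class, which the pre-cleanup script still imported; the field names mirror the BibTeX entry above. A sketch for illustration, not part of the commit:

from pycldf.sources import Source

# Rebuild the Cihui record in code; keyword arguments become BibTeX
# fields (sketch only, not part of the dataset build).
cihui = Source(
    "book",
    "Cihui",
    editor="北京大学, Běijīng Dàxué",
    publisher="Wénzì Gǎigé 文字改革",
    title="Hànyǔ fāngyán cíhuì 汉语方言词汇",
    year="1964",
)
print(cihui.id, cihui.genre)  # Cihui book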
51 changes: 22 additions & 29 deletions lexibank_beidasinitic.py
@@ -1,53 +1,46 @@
 # coding=utf-8
 from __future__ import unicode_literals, print_function
-from itertools import groupby
 
 import attr
 import lingpy
-from pycldf.sources import Source
-
-from lingpy.sequence.sound_classes import syllabify
-
 from clldutils.path import Path
-from clldutils.misc import slug
-from clldutils.misc import lazyproperty
-from pylexibank.dataset import Metadata, Concept
+from lingpy.sequence.sound_classes import syllabify
+from pylexibank.dataset import Concept
 from pylexibank.dataset import Dataset as BaseDataset
 from pylexibank.util import pb, getEvoBibAsBibtex
 
 
-
 @attr.s
 class BDConcept(Concept):
     Chinese = attr.ib(default=None)
 
 
 class Dataset(BaseDataset):
     dir = Path(__file__).parent
-    id = 'beidasinitic'
+    id = "beidasinitic"
     concept_class = BDConcept
 
     def cmd_download(self, **kw):
-        self.raw.write('sources.bib', getEvoBibAsBibtex('Cihui', **kw))
+        self.raw.write("sources.bib", getEvoBibAsBibtex("Cihui", **kw))
 
     def cmd_install(self, **kw):
-        wl = lingpy.Wordlist(self.raw.posix('words.tsv'),
-                             conf=self.raw.posix('wordlist.rc'))
-
+        wl = lingpy.Wordlist(self.raw.posix("words.tsv"), conf=self.raw.posix("wordlist.rc"))
 
         with self.cldf as ds:
             ds.add_sources(*self.raw.read_bib())
             ds.add_concepts(id_factory=lambda c: c.number)
-            ds.add_languages(id_factory=lambda c: c['ID'])
-            for k in pb(wl, desc='wl-to-cldf', total=len(wl)):
-                if wl[k, 'value']:
+            ds.add_languages(id_factory=lambda c: c["ID"])
+            for k in pb(wl, desc="wl-to-cldf", total=len(wl)):
+                if wl[k, "value"]:
                     ds.add_lexemes(
-                        Language_ID=wl[k, 'doculect'],
-                        Parameter_ID=wl[k, 'beida_id'],
-                        Value=wl[k, 'value'],
-                        Form=wl[k, 'form'],
-                        Segments = syllabify([{'t↑h': 'tʰ', 'ᴇ': 'ᴇ/ɛ̝'}.get(
-                            x, x) for x in self.tokenizer(None,
-                            ''.join(wl[k, 'segments']),
-                            column='IPA')]),
-                        Source='Cihui')
+                        Language_ID=wl[k, "doculect"],
+                        Parameter_ID=wl[k, "beida_id"],
+                        Value=wl[k, "value"],
+                        Form=wl[k, "form"],
+                        Segments=syllabify(
+                            [
+                                {"t↑h": "tʰ", "ᴇ": "ᴇ/ɛ̝"}.get(x, x)
+                                for x in self.tokenizer(
+                                    None, "".join(wl[k, "segments"]), column="IPA"
+                                )
+                            ]
+                        ),
+                        Source="Cihui",
+                    )
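Functionally, cmd_install is unchanged by the reformatting: every entry with a value is tokenized via the dataset's orthography profile, two problem symbols are remapped, and the token list is syllabified before landing in the lexeme table. The remap idiom dict.get(x, x) returns the token itself whenever no replacement is listed. A standalone sketch of just that cleaning step (the sample token list is invented for illustration):

from lingpy.sequence.sound_classes import syllabify

# Replacement table from the diff: normalize a garbled aspirated t and
# give the raised open-mid vowel an explicit alternative reading.
REPLACEMENTS = {"t↑h": "tʰ", "ᴇ": "ᴇ/ɛ̝"}

def clean_segments(tokens):
    # dict.get(x, x) falls back to the original token when no
    # replacement is listed, so unlisted segments pass through intact.
    return syllabify([REPLACEMENTS.get(x, x) for x in tokens])

print(clean_segments(["t↑h", "a", "ŋ"]))  # invented example tokens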
14 changes: 9 additions & 5 deletions setup.py
@@ -1,10 +1,8 @@
 from setuptools import setup
-import sys
 import json
 
 
-PY2 = sys.version_info.major == 2
-with open('metadata.json', **({} if PY2 else {'encoding': 'utf-8'})) as fp:
+with open('metadata.json') as fp:
     metadata = json.load(fp)
 
 
@@ -22,6 +20,12 @@
         ]
     },
     install_requires=[
-        'pylexibank>=1.1.1',
-    ]
+        'pylexibank==1.1.1',
+        'segments==2.0.2'
+    ],
+    extras_require={
+        'test': [
+            'pytest-cldf',
+        ],
+    },
 )
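With the new extras_require block the test-only dependency is opt-in: pip install -e ".[test]" pulls in pytest-cldf alongside the package, while a plain install skips it. Pinning pylexibank and segments to exact versions (== rather than >=) trades upgrade flexibility for a reproducible CLDF build.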
16 changes: 12 additions & 4 deletions test.py
@@ -1,7 +1,15 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
 def test_valid(cldf_dataset, cldf_logger):
     assert cldf_dataset.validate(log=cldf_logger)
 
+
+def test_languages(cldf_dataset):
+    assert len(list(cldf_dataset['LanguageTable'])) == 18
+
+
+def test_parameters(cldf_dataset):
+    assert len(list(cldf_dataset['ParameterTable'])) == 905
+
+
+def test_sources(cldf_dataset):
+    assert len(cldf_dataset.sources) == 1

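The new checks use the cldf_dataset fixture from pytest-cldf, which exposes the built dataset as a pycldf object: components are addressed by their CLDF table name and iterated as rows, and the expected counts (18 varieties, 905 concepts, one source) match the README statistics. One more check in the same style could assert that no lexeme lacks a form, echoing the README's "Invalid lexemes: 0". A sketch, not part of the commit:

def test_forms(cldf_dataset):
    # Every FormTable row should carry a non-empty Form value; this
    # mirrors the "Invalid lexemes: 0" statistic (sketch only).
    for row in cldf_dataset['FormTable']:
        assert row['Form']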