-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexibank_halenepal.py
68 lines (56 loc) · 2.2 KB
/
lexibank_halenepal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from collections import defaultdict
from pathlib import Path
import attr
import pylexibank
from clldutils.misc import slug
from pylexibank import Language, Concept
from pylexibank.dataset import Dataset as NonSplittingDataset
from pylexibank.util import progressbar
@attr.s
class CustomLanguage(Language):
    """Language record extended with two optional, dataset-specific fields.

    Both fields default to ``None`` when no value is supplied.
    """

    # NOTE(review): presumably the subgroup label of the language as given in
    # the dataset's language metadata -- confirm against the language list.
    SubGroup = attr.ib(default=None)
    # NOTE(review): presumably the language's number in the source -- confirm.
    Number = attr.ib(default=None)
@attr.s
class CustomConcept(Concept):
    """Concept record extended with an optional ``Number`` field."""

    # Filled from the ``number`` attribute of the concept-list entry when the
    # dataset's concepts are written; ``None`` when not supplied.
    Number = attr.ib(default=None)
class Dataset(NonSplittingDataset):
    """Lexibank dataset definition for ``halenepal``.

    Concepts are taken from the first attached concept list, and forms are
    read from ``AH-CSDPN.tsv`` in the raw directory. Every form is attributed
    to the source key ``Hale1973``.
    """

    dir = Path(__file__).parent
    id = "halenepal"
    writer_options = {"keep_languages": False, "keep_parameters": False}

    language_class = CustomLanguage
    concept_class = CustomConcept

    form_spec = pylexibank.FormSpec(
        missing_data=("?", "-", "*", "---"),
        separators=";/,",
        brackets={"(": ")"},
        strip_inside_brackets=True,
        replacements=[(" ", "_")],
    )

    def cmd_makecldf(self, args):
        """Write concepts, languages, sources and forms to ``args.writer``.

        :param args: Command arguments; only ``args.writer`` is used here.
        :raises KeyError: If a form row names a language or a concept id
            that is missing from the lookups built below.
        """
        # The concept ids recorded in STEDT are unreliable, so the mapping
        # from each corrected id to the ids found in STEDT is read from file.
        stedt_ids = defaultdict(set)
        id_rows = self.raw_dir.read_csv("srcids.tsv", delimiter="\t", dicts=True)
        for id_row in id_rows:
            stedt_ids[id_row["CORRECTED"]].add(id_row["IDINSTEDT"])

        concept_ids = {}
        for entry in self.conceptlists[0].concepts.values():
            # Concept ID: last dash-separated part of the concept-list id,
            # an underscore, then the slugified English gloss.
            id_suffix = entry.id.rpartition("-")[2]
            concept_id = f"{id_suffix}_{slug(entry.english)}"
            args.writer.add_concept(
                ID=concept_id,
                Name=entry.english,
                Concepticon_ID=entry.concepticon_id,
                Concepticon_Gloss=entry.concepticon_gloss,
                Number=entry.number,
            )
            # A form row may reference the concept by its list number or by
            # any of the STEDT ids mapped to that number.
            concept_ids[entry.number] = concept_id
            concept_ids.update(dict.fromkeys(stedt_ids[entry.number], concept_id))

        language_ids = args.writer.add_languages(lookup_factory="Name")
        args.writer.add_sources()

        table = self.raw_dir.read_csv("AH-CSDPN.tsv", delimiter="\t")
        # The first row is skipped (presumably the header -- confirm against
        # the raw file). Columns used: 0 = local id, 1 = value, 6 = language
        # name, 7 = concept id.
        for record in progressbar(table[1:]):
            args.writer.add_forms_from_value(
                Local_ID=record[0],
                Language_ID=language_ids[record[6]],
                Parameter_ID=concept_ids[record[7]],
                Value=record[1],
                Source=["Hale1973"],
            )