lexibank_robinsonap.py
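"""
Lexibank dataset derived from Robinson and Holton's lexical data on the
Alor-Pantar (AP) languages (Robinson 2012).
"""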
from pathlib import Path
import attr
import pylexibank
from clldutils.misc import slug


@attr.s
class CustomLanguage(pylexibank.Language):
    # the tokens used as language identifiers by Robinson and Holton
    Token = attr.ib(default=None)


class Dataset(pylexibank.Dataset):
    dir = Path(__file__).parent
    id = "robinsonap"
    language_class = CustomLanguage
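
    # How raw values become CLDF forms: bracketed material is stripped,
    # ";", "/" and "," separate multiple forms in one cell, and "?", "-"
    # and "--" mark missing data. E.g. a hypothetical cell
    # "abi; api (borrowed)" would yield the two forms "abi" and "api",
    # with the parenthesised note stripped.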
    form_spec = pylexibank.FormSpec(
        brackets={"[": "]", "{": "}", "(": ")", "‘": "’"},
        separators=";/,",
        missing_data=("?", "-", "--"),
        strip_inside_brackets=True,
    )

    def cmd_makecldf(self, args):
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="Token")
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name",
        )
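        # ``languages`` maps a Token (the language identifiers used by
        # Robinson and Holton) to a Language_ID; ``concepts`` maps an English
        # gloss to a Parameter_ID built from the concept's number in the
        # concept list plus a slug of the gloss.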
        seen = []
        for f in ("AP_lexicon_coded.txt", "AP_lexicon.txt"):
            for row in self.raw_dir.read_csv(f, dicts=True, delimiter="\t"):
                concept = row["English"].lower().strip().replace(", ", "/")
                # skip rows in AP_lexicon.txt that we've already seen in
                # AP_lexicon_coded.txt. Note that there are duplicate rows
                # across both files and *within* the same file, so this
                # handles those too.
                if concept in seen:
                    continue
                # manually skip "chase away". There are two glosses for it:
                #   367 "chase away"
                #   368 "chase away, expel"
                # 367 looks only partially complete and 368 contains all
                # forms in 367, so ignore 367.
                if row["English"] == "chase away":
                    continue
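                # In AP_lexicon_coded.txt rows come in pairs: a lexical row
                # (English gloss filled) is followed by a row with an empty
                # gloss whose cells carry cognate-set codes for the forms
                # just added.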
                if row["English"]:
                    # store lexeme IDs for the following cognate row.
                    lexicon_ids = {}
                    for lang in languages:
                        assert concept in concepts, "bad concept %s" % concept
                        value = row[lang]
                        # remove the reconstruction mark (for proto-AP) and
                        # leading/trailing whitespace
                        value = value.lstrip("*").strip()
                        # if the stripped form starts and ends with a slash,
                        # it is a leftover from a transcription; we clean it
                        # here rather than in the orthography profile, where
                        # it could hide errors in parsing multiple forms, and
                        # cleaning here also yields the correct Value.
                        if value.startswith("/") and value.endswith("/"):
                            value = value[1:-1]
                        lex = args.writer.add_forms_from_value(
                            Language_ID=languages[lang],
                            Parameter_ID=concepts[concept],
                            Value=value,
                            Source=["Robinson2012"],
                        )
                        if len(lex) >= 1:
                            # it looks like only the first lexeme of a
                            # combined form has a cognate code, so only keep
                            # the first one.
                            lexicon_ids[lang] = lex[0]
                    seen.append(concept)
                else:
                    # a cognate-code row: it belongs to the last gloss seen.
                    lastword = seen[-1]
                    for lang in languages:
                        # find the lexeme ID belonging to this language & gloss.
                        lex = lexicon_ids.get(lang)
                        if lex and row[lang]:
                            if int(row[lang]) not in range(0, 12):
                                raise ValueError("Invalid cognate id: %s" % row[lang])
                            args.writer.add_cognate(
                                lexeme=lex,
                                Cognateset_ID="%s-%s" % (concepts[lastword], row[lang]),
                                Source=["Robinson2012"],
                            )
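
    # download the two raw lexicon files from the supplementary material
    # hosted on Brill's site.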
    def cmd_download(self, **kw):
        if not self.raw_dir.exists():
            self.raw_dir.mkdir()
        files = ["AP_lexicon_coded.txt", "AP_lexicon.txt"]
        self.raw_dir.download_and_unpack(
            "http://booksandjournals.brillonline.com/upload/"
            "robinson_10.116322105832-20120201.zip?itemId="
            "/content/journals/10.1163/22105832-20120201&mimeType=application/octet-stream",
            *[Path(f) for f in files]
        )
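

# Usage sketch (assumes the standard pylexibank/cldfbench workflow; the
# catalog paths below are placeholders):
#
#   cldfbench download lexibank_robinsonap.py
#   cldfbench lexibank.makecldf lexibank_robinsonap.py \
#       --glottolog PATH --concepticon PATH --clts PATH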