-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_linku.jq
executable file
·61 lines (53 loc) · 2.12 KB
/
filter_linku.jq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/jq -rf
# Takes a combined JSON object (sona Linku JSON + augment*.json)
# as input, reshapes it, and filters it according to dictionary
# inclusion criteria.
{
  # .words is treated as an object (map_values keeps its keys); each value
  # is one word record from the combined input document.
  words: (
    .words
    # Stage 1: reshape every word into a smaller, uniform record. Only the
    # fields listed here survive, so every later stage must refer to these
    # reshaped names (.definition, .commentary, ...), not to the input paths.
    | map_values(
        {
          word,
          pos,
          # Remaining attributes are used for filtering, or as flags.
          definition: (.translations.en.definitions // ""),
          # Usage percentage from the latest survey is always used: take
          # the entry whose key sorts last. A word with no usage table
          # (missing or null) yields null instead of raising an error.
          # NOTE(review): assumes survey keys sort chronologically as
          # strings (e.g. "2023-09") -- confirm against the data.
          usage: (
            (.usage // {})
            | to_entries
            | sort_by(.key)
            | last
            | .value // null
          ),
          # The literal string "none" means "not in any book".
          book: (if .book == "none" then null else .book end),
          # Kept only so stage 2 can detect deprecation; deleted afterwards.
          # NOTE(review): the English commentary is read from the
          # translations object first, falling back to a top-level
          # .commentary field -- confirm which location the input uses.
          commentary: (.translations.en.commentary // .commentary)
        }
      )
    # Stage 2: drop words matching any exclusion criterion. When select/1
    # produces no output, map_values removes that key from the object.
    | map_values(
        select(
          (
            # - words documented as typos;
            (.definition | startswith("[typo "))
            # - words documented as reserved words;
            or (.definition | test("\\bword reserved\\b"))
            # - words deprecated by their creators;
            or (.commentary // "" | test("\\bdeprecated\\b"))
            # - words without a book *and* with a usage percentage of less
            #   than 1/3 of speakers, or with no percentage at all
            #   (a null usage counts as 0 here).
            or (
              (.book == null)
              and ((.usage // 0) < (1/3)*100)
            )
          )
          | not # invert: keep only words that matched none of the above
        )
        # The commentary was needed only for filtering.
        | del(.commentary)
      )
  ),
  # Merge the four name tables with +; for objects, keys from later tables
  # override identical keys from earlier ones.
  names: (
    .places
    + .languages
    + .transliterations
    + .names
  )
}