# sentence_complexity.py
# Initial plan: order Nishnaabemwin sentences, favoring short sentences,
# low transitivity, and independent order.
# Common readability scores for English weight polysyllabicity and sentence
# length, which suggests favoring phonological simplicity too (short words?
# simple clusters? few syncope alternations?).
# Goals: an ordering, a numerical score, and an enumeration of components.
import regex
def interface(postags, *m_parse_los):
    h = []
    assert all(isinstance(s, list) for s in m_parse_los)
    for s in m_parse_los:
        distilled = []
        for m in s:
            formatted = {"analysis": m}
            pos_match = regex.search(r"({0})(.*({0}))?".format(postags), m)
            if pos_match:
                # Denominal words may contain Dim, etc., between two POS tags,
                # but plain nouns will omit this if only POS tags are used as
                # boundaries; keep the last tag of the matched span.
                formatted["pos"] = [x for x in pos_match[0].split("+") if x][-1]
                if formatted["pos"] in ["VAI", "VAIO", "VTA", "VTI", "VII"]:
                    order_match = regex.search(r"(Imp|Cnj)", m)
                    formatted["order"] = order_match[0] if order_match else "Ind"
                else:
                    formatted["order"] = ""
            else:
                formatted["pos"] = ""
                formatted["order"] = ""
            distilled.append(formatted)
        h.append(distilled)
    return h
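# A minimal usage sketch (the parse strings below are hypothetical, not
# attested FST output). Note that VAIO must precede VAI in the alternation,
# or the regex will match the shorter tag inside the longer one.
def _example_interface():
    tags = "VTA|VTI|VAIO|VAI|VII|NA|NI"
    parses = [["waabam+VTA+Cnj+3", "nini+NA+Sg"]]
    return interface(tags, *parses)
    # -> [[{"analysis": "waabam+VTA+Cnj+3", "pos": "VTA", "order": "Cnj"},
    #      {"analysis": "nini+NA+Sg", "pos": "NA", "order": ""}]]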
def paradigm_richness(*m_parse_his):
    h = {
        "VTA": {"Cnj": {}, "Ind": {}},
        "VTI": {"Cnj": {}, "Ind": {}},
        "VAIO": {"Cnj": {}, "Ind": {}},
        "VAI": {"Cnj": {}, "Ind": {}},
        "VII": {"Cnj": {}, "Ind": {}},
    }
    for pos in h:
        for order in h[pos]:
            h[pos][order]["Neg"] = 0
            h[pos][order]["Prt"] = 0
            h[pos][order]["Dub"] = 0
    for subj in ["1", "1Pl", "2", "2Pl", "21Pl", "3", "3Pl", "3Obv", "0"]:
        for obj in ["1", "1Pl", "2", "2Pl", "21Pl", "3", "3Pl", "3Obv"]:
            # TODO: zap subject/object combinations that do not exist
            h["VTA"]["Ind"][subj + "v" + obj] = 0
            h["VTA"]["Cnj"][subj + "v" + obj] = 0
    for m in m_parse_his:
        pass  # TODO: increment the counts for each analysis seen
    return h
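# A hypothetical sketch of the missing counting step, assuming each analysis
# uses the same "+"-delimited tags as interface(); the tag names are guesses.
def _count_paradigm_features(h, analysis):
    parts = analysis.split("+")
    pos = next((p for p in parts if p in h), None)
    if pos is None:
        return
    order = "Cnj" if "Cnj" in parts else "Ind"
    for feat in ("Neg", "Prt", "Dub"):
        if feat in parts:
            h[pos][order][feat] += 1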
def alg_morph_counts(*sentences):
    scores = []
    for s in sentences:
        s_score = [
            [0, 0, 0, 0, 0],  # pos counts (VTA, VAIO, VTI, VAI, VII)
            [0, 0, 0],        # order counts (Cnj, Ind, Imp)
            [0],              # total morphemes
            # [0]  # number of words with stress shift alternations
            # preverbs? mood? discontinuous person/number realization?
        ]
        for w in s:
            if w["pos"] == "VTA": s_score[0][0] += 1
            if w["pos"] == "VAIO": s_score[0][1] += 1
            if w["pos"] == "VTI": s_score[0][2] += 1
            if w["pos"] == "VAI": s_score[0][3] += 1
            if w["pos"] == "VII": s_score[0][4] += 1
            if w["order"] == "Cnj": s_score[1][0] += 1
            if w["order"] == "Ind": s_score[1][1] += 1
            if w["order"] == "Imp": s_score[1][2] += 1
            # if w["alts"]: s_score[2][0] += 1
            s_score[2][0] += morpheme_count(w["analysis"])
        scores.append(s_score)
    return scores
def morpheme_count(analysis):
return len(analysis.split("+"))
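# e.g. morpheme_count("waabam+VTA+Cnj+3") == 4; note that POS and inflection
# tags are counted as "morphemes" alongside the stem.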
def alg_morph_score_rate(*counts):
    at_bats = 0.001  # small epsilon so verb-less sentences cannot divide by zero
    total_bases_pos = 0
    main_order_counts = [0, 0]
    total_morphemes = 0
    for c in counts:
        if sum(c[0]) == 0: print("degenerate verb-less sentence detected")
        at_bats += sum(c[0])
        total_bases_pos += c[0][0]*4  # VTAs are 'home runs', way more complex than the others
        total_bases_pos += c[0][1]*3  # VAIOs are 'triples', a weird corner-case addition to VAIs: more complex only in that you have to add another wrinkle
        total_bases_pos += c[0][2]*3  # VTIs are 'triples', a fairly straightforward variant of VAIs
        total_bases_pos += c[0][3]*2  # VAIs are 'doubles', your run-of-the-mill verb
        total_bases_pos += c[0][4]    # VIIs are 'singles', a super simple type of verb
        main_order_counts[0] += c[1][0]  # Cnj has irregular phonology and a context of use that is difficult to explain
        main_order_counts[1] += c[1][1]  # Ind has long-distance dependencies and stronger ripple-effect phonology, but a simpler context of use
        # imperatives are not worth tracking
        total_morphemes += c[2][0]  # maybe subtract the core morphemes present in every verb (or at least don't count the POS tags)? all just makes things more complicated...
    order_sign = 1 if main_order_counts[0] > main_order_counts[1] else -1
    # Returns: slugging-pct POS complexity; pct Cnj/Ind (positive for mostly
    # Cnj, negative for mostly Ind); average sentence length in morphemes.
    return [round(total_bases_pos/at_bats, 1),
            round(order_sign*(max(main_order_counts)/at_bats), 1),
            round(total_morphemes/len(counts), 1)]
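# Worked example (hypothetical counts): one sentence containing one VTA
# conjunct verb and one VAI independent verb, ten morphemes in total.
def _example_score_rate():
    counts = [[[1, 0, 0, 1, 0], [1, 1, 0], [10]]]
    return alg_morph_score_rate(*counts)
    # -> [3.0, -0.5, 10.0]: bases (4 + 2) over ~2 at-bats; an order tie counts
    #    as mostly-Ind (sign -1); 10 morphemes over 1 sentence.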
def _text_counts(*sentences):
    # Shared tallies for the two Flesch formulas below. Syllables are
    # approximated by counting vowel runs (Nishnaabemwin orthography uses
    # a, e, i, o; long vowels are written double, so "aa" counts once).
    word_count = 0
    sentence_count = 0
    syllable_count = 0
    for s in sentences:
        sentence_count += 1
        for word in s:
            word_count += 1
            syllable_count += len(regex.findall(r"(^|[^aeioAEIO])[aeioAEIO]", word))
    return word_count, sentence_count, syllable_count

def flesch_reading_ease_score(*sentences):
    word_count, sentence_count, syllable_count = _text_counts(*sentences)
    return 206.835 - (1.015*(word_count/sentence_count)) - (84.6*(syllable_count/word_count))

def flesch_reading_grade_level(*sentences):
    # This is the standard Flesch-Kincaid grade-level formula.
    word_count, sentence_count, syllable_count = _text_counts(*sentences)
    return (0.39*(word_count/sentence_count)) + (11.8*(syllable_count/word_count)) - 15.59
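# End-to-end sketch on hypothetical data: distill the parses, count, score,
# and compare with the Flesch ease score over made-up surface word forms.
def _example_pipeline():
    tags = "VTA|VTI|VAIO|VAI|VII|NA|NI"
    parses = [["waabam+VTA+Cnj+3", "nini+NA+Sg"]]
    surface = [["waabamaad", "nini"]]  # illustrative forms, not checked output
    distilled = interface(tags, *parses)
    counts = alg_morph_counts(*distilled)
    return alg_morph_score_rate(*counts), flesch_reading_ease_score(*surface)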