-
Notifications
You must be signed in to change notification settings - Fork 0
/
hypenator-guesser.xfst
130 lines (111 loc) · 4.76 KB
/
hypenator-guesser.xfst
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Empty the network stack before anything is compiled.
clear stack
# Vowels, including the front vowels ä and ö.
define Vowel [ a | e | i | o | u | y | ä | ö ] ;
# A long vowel is one vowel written twice.
define LongVowel [
  [a a] | [e e] | [i i] | [o o] | [u u] | [y y] | [ä ä] | [ö ö]
] ;
define Const [
  b | c | d | f | g | h | j | k | l | m |
  n | p | q | r | s | t | v
] ;
# Marker symbols that may appear in the input next to the letters:
# Hyp is the hyphen itself, Break is '#', which the cleanup step of this
# script later rewrites to a hyphen.
define Hyp %- ;
define Break %# ;
define AnyLetter [ Vowel | Const | Hyp | Break ] ;
# A word is any string over AnyLetter (the empty string included), so the
# hyphenator can be tried out on arbitrary input.
define Word AnyLetter* ;
regex Word ;
# Consonants that may close a syllable: in non-loan words 'd', 'j' and 'v'
# never stand at the end of a syllable, so they are removed from Const.
define LastConst [ Const - [ d | j | v ] ] ;
# Vowel pairs treated as diphthongs, grouped by their second vowel; the last
# three (ie, uo, yö) are listed separately.
define Diphthong [
  [ a | e | o | u | y | ä | ö ] i |
  [ a | e | i | o ] u |
  [ e | i | ä | ö ] y |
  [i e] | [u o] | [y ö]
] ;
# Syllable templates for native words (C = consonant, V = vowel).
# A closing consonant is always taken from LastConst. Note that VV is a long
# vowel (the same vowel twice), whereas the VV part of CVV, CVVC and VVC is
# any two vowels.
#
# Vowel-initial templates:
define V    Vowel ;
define VV   LongVowel ;
define VC   [ V LastConst ] ;
define VCC  [ V Const LastConst ] ;
define VVC  [ V V LastConst ] ;
# Consonant-initial templates, each one consonant plus a pattern from above:
define CV   [ Const V ] ;
define CVC  [ Const VC ] ;
define CVCC [ Const VCC ] ;
define CVV  [ CV V ] ;
define CVVC [ Const VVC ] ;
# Syllable structures in loan words. Unlike the native templates, these start
# with a consonant cluster and accept any consonant (Const, not LastConst) in
# syllable-final position.
define CCV [ Const Const Vowel ];
define CCVC [ Const Const Vowel Const ];
define CCVCC [ Const Const Vowel Const Const ];
define CCVV [ Const Const Vowel Vowel ];
define CCVVC [ Const Const Vowel Vowel Const ];
# Three-consonant onsets that begin with the letter s.
# The s is written bare on purpose: single quotes are not a quoting mechanism
# in xfst regular expressions, so 's' would be read as ONE multicharacter
# symbol (apostrophe, s, apostrophe) that never occurs in ordinary input,
# and these two templates could not match any real word.
define sCCVC [ s [Const Const Vowel Const ]];
define sCCVCC [ s [Const Const Vowel Const Const ]];
# Hyphen or caret. Not referenced by any other definition in the visible part
# of this script.
define End [%- | %^];
define AllSyllables [
# loan word structures:
  sCCVCC | sCCVC | CCVVC | CCVV | CCVCC | CCVC | CCV |
# native word structures:
  CVVC | CVV | CVCC | CVC | CV | VVC | VV | VCC | VC | V
] ;
###########################################################################
#
# The two basic rules from Institute for the Languages of Finland:
#
# derived from:
# http://www.kielitoimistonohjepankki.fi/haku/tavutus/ohje/153
###########################################################################
##
# Rule 1: a syllable border always comes before a combination of a consonant
# and a vowel.
#
# The left context requires a complete syllable (AllSyllables) in front of the
# insertion point; without it a hyphen would also be inserted at the very
# beginning of the word.
define syl [. .] -> Hyp || AllSyllables _ Const [ Diphthong | Vowel ] ;
# Rule 2: a syllable border falls between two different vowels that do not
# form a diphthong.
# Each line inserts a hyphen between the vowel in the left context and any of
# the vowels listed in the right context.
define nonDiphthongSplit [. .] -> %- || a _ [ e | o | y ] .o.     # a-e, a-o, a-y
  [. .] -> %- || e _ [ a | ä | o | y ] .o.                        # e-a, e-ä, e-o, e-y
  [. .] -> %- || i _ [ a | o ] .o.                                # i-a, i-o
  [. .] -> %- || o _ [ a | e ] .o.                                # o-a, o-e
  [. .] -> %- || u _ [ a | e ] .o.                                # u-a, u-e
  [. .] -> %- || ä _ [ e ] .o.                                    # ä-e
  0 -> 0 ;  # no-op rule that closes the chain, so the final ';' is not forgotten
#############################################################################
#
# Additional rules
#
#############################################################################
# Special cases for non-first diphthongs: ie, uo and yö are not treated as
# diphthongs when they are not in the first syllable of the word. The pair
# y-ä is split under the same conditions.
#
# Each vowel pair gets two rules:
# - the first, with left context [%- AnyLetter+], fires when a hyphen already
#   occurs somewhere to the left, i.e. in the 3rd or a later syllable
#   (e.g. kie-li-en, pie-ni-en)
# - the second, with left context AllSyllables, fires when a complete syllable
#   directly precedes the vowel, i.e. in the second syllable (e.g. säi-e)
#
# NOTE(review): AnyLetter also contains Break (#), so the first variant can
# match across a '#' marker in the input - confirm this is intended.
define nonFirstDiphthonSplit i -> i %- || [%- AnyLetter+] _ e .o. # i-e, hyphen earlier in the word
i -> i %- || AllSyllables _ e .o. # i-e, syllable directly before
u -> u %- || [%- AnyLetter+] _ o .o. # u-o, hyphen earlier in the word
u -> u %- || AllSyllables _ o .o. # u-o, syllable directly before
y -> y %- || [%- AnyLetter+] _ ä .o. # y-ä, hyphen earlier in the word
y -> y %- || AllSyllables _ ä .o. # y-ä, syllable directly before
y -> y %- || [%- AnyLetter+] _ ö .o. # y-ö, hyphen earlier in the word
y -> y %- || AllSyllables _ ö .o. # y-ö, syllable directly before
0 -> 0; # no-op rule closing the chain, so the final ';' is NOT forgotten
#
# Specific rules for tricky cases in the 1st and 2nd syllables of words like
# hams-traa-ja, abs-trak-ti, eks-tra.
#
# A hyphen is inserted after a VCC, CVC or CVCC sequence that is directly
# followed by any syllable from AllSyllables. The three rules are composed in
# the order written, so each later rule sees the hyphens inserted by the
# earlier ones.
#
define longSyllableSplit [. .] -> %- || VCC _ AllSyllables .o.
[. .] -> %- || CVC _ AllSyllables .o.
[. .] -> %- || CVCC _ AllSyllables .o.
0 -> 0; # no-op rule closing the chain, so the final ';' is NOT forgotten
#############################################################################
# Clean-up rules, applied after all hyphenation rules:
#   1. delete every caret (^)
#   2. turn every '#' marker into a hyphen
define cleanup [ %^ -> 0 ]
  .o.
  [ %# -> %- ] ;
#############################################################################
#
# Compose the rules together
#
##############################################################################
regex [
# Input language:
  Word
# Rule cascade, composed in this order:
  .o. syl
  .o. nonDiphthongSplit
  .o. nonFirstDiphthonSplit
  .o. longSyllableSplit
  .o. cleanup
] ;