-
Notifications
You must be signed in to change notification settings - Fork 66
/
example_test.go
179 lines (150 loc) · 6.18 KB
/
example_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/*
* Copyright © 2021-present Peter M. Stahl pemistahl@gmail.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package lingua_test
import (
"fmt"
"github.com/pemistahl/lingua-go"
)
// Example_basic builds a detector that decides between English, French,
// German and Spanish and prints the language detected for a short phrase.
func Example_basic() {
	detector := lingua.NewLanguageDetectorBuilder().
		FromLanguages(lingua.English, lingua.French, lingua.German, lingua.Spanish).
		Build()

	detected, ok := detector.DetectLanguageOf("languages are awesome")
	if !ok {
		// Nothing is printed when no language could be determined.
		return
	}
	fmt.Println(detected)

	// Output: English
}
// Example_multipleLanguagesDetection splits a mixed-language text into
// sections and prints, for every section, the detected language together with
// the substring delimited by the section's start and end index.
func Example_multipleLanguagesDetection() {
	detector := lingua.NewLanguageDetectorBuilder().
		FromLanguages(lingua.English, lingua.French, lingua.German).
		Build()

	text := "Parlez-vous français? " +
		"Ich spreche Französisch nur ein bisschen. " +
		"A little bit is better than nothing."

	sections := detector.DetectMultipleLanguagesOf(text)
	for _, section := range sections {
		lang := section.Language()
		// The indices are used to slice the original input text.
		from, to := section.StartIndex(), section.EndIndex()
		fmt.Printf("%s: '%s'\n", lang, text[from:to])
	}

	// Output:
	// French: 'Parlez-vous français? '
	// German: 'Ich spreche Französisch nur ein bisschen. '
	// English: 'A little bit is better than nothing.'
}
// Lingua normally reports the single most likely language for an input text.
// Some words, however, are spelled identically in several languages: the word
// `prologue`, for example, is valid in both English and French. In such a case
// Lingua would pick either English or French, which may be wrong in the given
// context. To guard against that, a minimum relative distance can be
// configured which the logarithmized and summed up probabilities of the
// candidate languages have to satisfy, as demonstrated below.
//
// Note that the distance between the language probabilities depends on the
// length of the input text: the longer the text, the larger the distance
// between the languages. When classifying very short phrases, the minimum
// relative distance should therefore not be set too high. Otherwise Unknown is
// returned most of the time, as happens in this example. Unknown is the return
// value for inputs whose language cannot be detected reliably.
func Example_minimumRelativeDistance() {
	builder := lingua.NewLanguageDetectorBuilder().
		FromLanguages(lingua.English, lingua.French, lingua.German, lingua.Spanish)
	detector := builder.WithMinimumRelativeDistance(0.9).Build()

	detected, reliable := detector.DetectLanguageOf("languages are awesome")
	fmt.Println(detected)
	fmt.Println(reliable)

	// Output:
	// Unknown
	// false
}
// The most likely language alone does not tell how reliable the computed
// likelihood is, nor how much less likely the other examined languages are.
// The example below obtains a slice of ConfidenceValue that contains the
// languages the calling LanguageDetector instance has been built from. The
// entries are sorted by confidence value in descending order. Every value is
// a probability between 0.0 and 1.0, and the probabilities of all languages
// sum to 1.0. If the rule engine identifies a language unambiguously, that
// language always receives the value 1.0 and all other languages receive 0.0.
func Example_confidenceValues() {
	detector := lingua.NewLanguageDetectorBuilder().
		FromLanguages(lingua.English, lingua.French, lingua.German, lingua.Spanish).
		Build()

	for _, confidence := range detector.ComputeLanguageConfidenceValues("languages are awesome") {
		fmt.Printf("%s: %.2f\n", confidence.Language(), confidence.Value())
	}

	// Output:
	// English: 0.93
	// French: 0.04
	// German: 0.02
	// Spanish: 0.01
}
// Lingua loads language models lazily by default: only the models which the
// rule-based filter engine considers relevant are loaded, and only on demand.
// For web services, for instance, it is usually preferable to preload all
// language models into memory so that no unexpected latency occurs while
// waiting for the service response. The eager-loading mode is enabled as shown
// below. Multiple instances of LanguageDetector share the same language models
// in memory, which are accessed asynchronously by the instances.
func Example_eagerLoading() {
	builder := lingua.NewLanguageDetectorBuilder().FromAllLanguages()
	builder.WithPreloadedLanguageModels().Build()
}
// For some classification tasks it is known in advance that the language data
// is definitely not written in, say, Latin. Detection accuracy can improve in
// such cases when certain languages are excluded from the decision process or
// when only the relevant languages are explicitly included.
func Example_builderApi() {
	// Every language available in the library.
	lingua.NewLanguageDetectorBuilder().
		FromAllLanguages()

	// Only languages that are not yet extinct (= currently excludes Latin).
	lingua.NewLanguageDetectorBuilder().
		FromAllSpokenLanguages()

	// Only languages written with Cyrillic script.
	lingua.NewLanguageDetectorBuilder().
		FromAllLanguagesWithCyrillicScript()

	// All languages except Spanish take part in the decision algorithm.
	lingua.NewLanguageDetectorBuilder().
		FromAllLanguagesWithout(lingua.Spanish)

	// Decide between English and German only.
	lingua.NewLanguageDetectorBuilder().
		FromLanguages(lingua.English, lingua.German)

	// Languages selected by their ISO 639-1 codes.
	lingua.NewLanguageDetectorBuilder().
		FromIsoCodes639_1(lingua.EN, lingua.DE)

	// Languages selected by their ISO 639-3 codes.
	lingua.NewLanguageDetectorBuilder().
		FromIsoCodes639_3(lingua.ENG, lingua.DEU)
}