Skip to content

Commit

Permalink
optional find by nom. annotation (close #143)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Jul 31, 2023
1 parent e3123c9 commit e2d4ea1
Show file tree
Hide file tree
Showing 10 changed files with 62 additions and 21 deletions.
7 changes: 7 additions & 0 deletions cmd/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ func allMatchesFlag(cmd *cobra.Command) {
}
}

// findByAnnotFlag reads the boolean "find-by-annotation" flag from cmd and,
// when the flag is set, appends config.OptWithFindByAnnotation(true) to the
// package-level opts slice. The error returned by GetBool is discarded.
func findByAnnotFlag(cmd *cobra.Command) {
	enabled, _ := cmd.Flags().GetBool("find-by-annotation")
	if !enabled {
		// Nothing to add when the flag is not set.
		return
	}
	opts = append(opts, config.OptWithFindByAnnotation(true))
}

func oddsDetailsFlag(cmd *cobra.Command) {
b, _ := cmd.Flags().GetBool("details-odds")
if b {
Expand Down
6 changes: 6 additions & 0 deletions cmd/gnfinder.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@
#
# WithBayesOddsDetails: false

# WithFindByAnnotation allows names to be detected by the presence of a
# nomenclatural annotation. If it is true, dictionaries do not prevent
# detection of a name.
#
# WithFindByAnnotation: false

# WithOddsAdjustment can be set to true to adjust calculated odds using the
# ratio of scientific names found in text to the number of capitalized
# words.
Expand Down
18 changes: 14 additions & 4 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type cfgData struct {
WithAllMatches bool
WithAmbiguousNames bool
WithBayesOddsDetails bool
WithFindByAnnotation bool
WithOddsAdjustment bool
WithPlainInput bool
WithPositionInBytes bool
Expand Down Expand Up @@ -120,6 +121,7 @@ verification results.
inputOnlyFlag(cmd)
langFlag(cmd)
allMatchesFlag(cmd)
findByAnnotFlag(cmd)
oddsDetailsFlag(cmd)
plainInputFlag(cmd)
sourcesFlag(cmd)
Expand Down Expand Up @@ -194,6 +196,9 @@ func init() {
"show details of odds calculation.")
rootCmd.Flags().StringP("verifier-url", "e", "",
"custom URL for name-verification service.")
rootCmd.Flags().BoolP("find-by-annotation", "F", false,
`if there is a nomenclatural annotation ('sp. nov.' etc),
a name will be detected.`)
rootCmd.Flags().StringP("format", "f", "",
`Format of the output: "compact", "pretty", "csv".
compact: compact JSON,
Expand Down Expand Up @@ -272,6 +277,7 @@ func initConfig() {
_ = viper.BindEnv("WithAmbiguousNames", "GNF_WITH_AMBIGUOUS_NAMES")
_ = viper.BindEnv("WithAllMatches", "GNF_WITH_ALL_MATCHES")
_ = viper.BindEnv("WithBayesOddsDetails", "GNF_WITH_BAYES_ODDS_DETAILS")
_ = viper.BindEnv("WithFindByAnnotation", "GNF_WITH_FIND_BY_ANNOTATION")
_ = viper.BindEnv("WithOddsAdjustment", "GNF_WITH_ODDS_ADJUSTMENT")
_ = viper.BindEnv("WithPlainInput", "GNF_WITH_PLAIN_INPUT")
_ = viper.BindEnv("WithPositionInBytes", "GNF_WITH_POSITION_IN_BYTES")
Expand Down Expand Up @@ -362,6 +368,14 @@ func getOpts() {
opts = append(opts, config.OptWithBayesOddsDetails(true))
}

if cfgCli.WithFindByAnnotation {
opts = append(opts, config.OptWithFindByAnnotation(true))
}

if cfgCli.WithOddsAdjustment {
opts = append(opts, config.OptWithOddsAdjustment(true))
}

if cfgCli.WithPlainInput {
opts = append(opts, config.OptWithPlainInput(true))
}
Expand All @@ -370,10 +384,6 @@ func getOpts() {
opts = append(opts, config.OptWithPositonInBytes(true))
}

if cfgCli.WithOddsAdjustment {
opts = append(opts, config.OptWithOddsAdjustment(true))
}

if cfgCli.WithUniqueNames {
opts = append(opts, config.OptWithUniqueNames(true))
}
Expand Down
13 changes: 13 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ type Config struct {
// WithBayesOddsDetails show in detail how odds are calculated.
WithBayesOddsDetails bool

// WithFindByAnnotation allows names to be detected by the presence of a
// nomenclatural annotation. If it is true, dictionaries do not prevent
// detection of a name.
WithFindByAnnotation bool

// WithOddsAdjustment can be set to true to adjust calculated odds using the
// ratio of scientific names found in text to the number of capitalized
// words.
Expand Down Expand Up @@ -208,6 +213,14 @@ func OptWithBayesOddsDetails(b bool) Option {
}
}

// OptWithFindByAnnotation returns an Option that sets the
// WithFindByAnnotation field of a Config to b. When that field is true,
// names can be detected by their nomenclatural annotation.
func OptWithFindByAnnotation(b bool) Option {
	setFlag := func(c *Config) {
		c.WithFindByAnnotation = b
	}
	return setFlag
}

// OptWithOddsAdjustment is an option that triggers recalculation of prior odds
// using number of found names divided by number of all name candidates.
func OptWithOddsAdjustment(b bool) Option {
Expand Down
29 changes: 17 additions & 12 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ func TestConfig(t *testing.T) {

t.Run("returns new Config object", func(t *testing.T) {
cfg := config.New()
assert.Equal(t, cfg.Language, lang.English)
assert.Equal(t, cfg.LanguageDetected, "")
assert.Equal(t, cfg.TokensAround, 0)
assert.Equal(t, lang.English, cfg.Language)
assert.Equal(t, "", cfg.LanguageDetected)
assert.Equal(t, 0, cfg.TokensAround)
assert.True(t, cfg.WithBayes)
assert.False(t, cfg.WithPositionInBytes)
})

t.Run("takes language", func(t *testing.T) {
cfg := config.New(config.OptLanguage(lang.English))
assert.Equal(t, cfg.Language, lang.English)
assert.Equal(t, cfg.LanguageDetected, "")
assert.Equal(t, lang.English, cfg.Language)
assert.Equal(t, "", cfg.LanguageDetected)
})

t.Run("sets bayes", func(t *testing.T) {
Expand All @@ -41,19 +41,24 @@ func TestConfig(t *testing.T) {

t.Run("sets tokens number", func(t *testing.T) {
cfg := config.New(config.OptTokensAround(4))
assert.Equal(t, cfg.TokensAround, 4)
assert.Equal(t, 4, cfg.TokensAround)
})

t.Run("sets find by annotation", func(t *testing.T) {
cfg := config.New(config.OptWithFindByAnnotation(true))
assert.Equal(t, true, cfg.WithFindByAnnotation)
})

t.Run("does not set 'bad' tokens number", func(t *testing.T) {
cfg := config.New(config.OptTokensAround(-1))
assert.Equal(t, cfg.TokensAround, 0)
assert.Equal(t, 0, cfg.TokensAround)
cfg = config.New(config.OptTokensAround(10))
assert.Equal(t, cfg.TokensAround, 5)
assert.Equal(t, 5, cfg.TokensAround)
})

t.Run("sets bayes' threshold", func(t *testing.T) {
cfg := config.New(config.OptBayesOddsThreshold(200))
assert.Equal(t, cfg.BayesOddsThreshold, 200.0)
assert.Equal(t, 200.0, cfg.BayesOddsThreshold)
})

t.Run("sets several options", func(t *testing.T) {
Expand All @@ -62,7 +67,7 @@ func TestConfig(t *testing.T) {
config.OptLanguage(lang.German),
}
cfg := config.New(opts...)
assert.Equal(t, cfg.Language, lang.German)
assert.Equal(t, lang.German, cfg.Language)
assert.True(t, cfg.WithBayes)
})

Expand All @@ -81,11 +86,11 @@ func TestConfig(t *testing.T) {

for _, v := range tests {
l, err := lang.New(v.lang)
assert.Equal(t, err != nil, v.hasErr, v.msg)
assert.Equal(t, v.hasErr, err != nil, v.msg)
langOpt := config.OptLanguage(l)
opts := []config.Option{langOpt}
cfg := config.New(opts...)
assert.Equal(t, cfg.Language, v.langCfg, v.msg)
assert.Equal(t, v.langCfg, cfg.Language, v.msg)
}
})
}
2 changes: 1 addition & 1 deletion pkg/ent/heuristic/heuristic.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
// tokens and sets up token's indices. Indices determine if a token is a
// potential uninomial, binomial or trinomial. Then it fills out a significant
// number of features pertaining to the token.
func TagTokens(ts []token.TokenSN, d *dict.Dictionary) {
func TagTokens(ts []token.TokenSN, d *dict.Dictionary, withAnnot bool) {
l := len(ts)

for i := range ts {
Expand Down
2 changes: 1 addition & 1 deletion pkg/ent/heuristic/heuristic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func TestHeuristic(t *testing.T) {
randomly... Pardosa is a very nice when it is not sad. Drosophila
(Sophophora) melanogaster disagrees!`)
ts := token.Tokenize(txt)
heuristic.TagTokens(ts, dictionary)
heuristic.TagTokens(ts, dictionary, false)
tests := map[int]struct {
name string
decision token.Decision
Expand Down
2 changes: 1 addition & 1 deletion pkg/ent/nlp/bayes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ cheilum, 1 5s. per doz.
Conostylis americana, 2i. 6d.
`)
tokens := token.Tokenize(txt)
heuristic.TagTokens(tokens, dictionary)
heuristic.TagTokens(tokens, dictionary, false)
nb := weights[lang.English]

tkn := tokens[10]
Expand Down
2 changes: 1 addition & 1 deletion pkg/gnfinder.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func (gnf gnfinder) Find(file, txt string) output.Output {
gnf.Language, gnf.LanguageDetected = lang.DetectLanguage(text)
}

heuristic.TagTokens(tokens, gnf.Dictionary)
heuristic.TagTokens(tokens, gnf.Dictionary, gnf.WithFindByAnnotation)
if gnf.WithBayes {
nb := gnf.bayesWeights[gnf.Language]
nlp.TagTokens(tokens, gnf.Dictionary, nb, gnf.BayesOddsThreshold)
Expand Down
2 changes: 1 addition & 1 deletion tools/training/trainer.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func processText(t *TextData, d *dict.Dictionary) []feature.ClassFeatures {
var lfs, lfsText []feature.ClassFeatures
var nd NameData
ts := token.Tokenize(t.Text)
heuristic.TagTokens(ts, d)
heuristic.TagTokens(ts, d, false)
l := len(t.NamesPositions)
var nameIdx, i int
for {
Expand Down

0 comments on commit e2d4ea1

Please sign in to comment.