Skip to content

Commit

Permalink
go: remove Classifier from API
Browse files Browse the repository at this point in the history
Further reduces the public API surface by
hiding the unused Classifier API for providing
pre-trained classifier weights.

Signed-off-by: Alexander Bezzubov <bzz@apache.org>
  • Loading branch information
bzz committed Oct 29, 2019
1 parent 3f0c4e1 commit fa097f4
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 21 deletions.
4 changes: 2 additions & 2 deletions benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ func BenchmarkClassifyTotal(b *testing.B) {
b.Run("Classify()_TOTAL", func(b *testing.B) {
for n := 0; n < b.N; n++ {
for _, sample := range samples {
o = defaultClassifier.Classify(sample.content, nil)
o = defaultClassifier.classify(sample.content, nil)
}

overcomeLanguages = o
Expand Down Expand Up @@ -195,7 +195,7 @@ func BenchmarkClassifyPerSample(b *testing.B) {
for _, sample := range samples {
b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
for n := 0; n < b.N; n++ {
o = defaultClassifier.Classify(sample.content, nil)
o = defaultClassifier.classify(sample.content, nil)
}

overcomeLanguages = o
Expand Down
18 changes: 9 additions & 9 deletions classifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ import (
"github.com/src-d/enry/v2/internal/tokenizer"
)

// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
Classify(content []byte, candidates map[string]float64) (languages []string)
type classifier interface {
classify(content []byte, candidates map[string]float64) (languages []string)
}

type classifier struct {
type naiveBayes struct {
languagesLogProbabilities map[string]float64
tokensLogProbabilities map[string]map[string]float64
tokensTotal float64
Expand All @@ -24,8 +24,8 @@ type scoredLanguage struct {
score float64
}

// Classify returns a slice of possible languages sorted by decreasing probability.
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
// classify returns a slice of possible languages sorted by decreasing probability.
func (c *naiveBayes) classify(content []byte, candidates map[string]float64) []string {

var languages map[string]float64
if len(candidates) == 0 {
Expand Down Expand Up @@ -73,7 +73,7 @@ func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
return sortedLanguages
}

func (c *classifier) knownLangs() map[string]float64 {
func (c *naiveBayes) knownLangs() map[string]float64 {
langs := make(map[string]float64, len(c.languagesLogProbabilities))
for lang := range c.languagesLogProbabilities {
langs[lang]++
Expand All @@ -82,7 +82,7 @@ func (c *classifier) knownLangs() map[string]float64 {
return langs
}

func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
func (c *naiveBayes) tokensLogProbability(tokens []string, language string) float64 {
var sum float64
for _, token := range tokens {
sum += c.tokenProbability(token, language)
Expand All @@ -91,7 +91,7 @@ func (c *classifier) tokensLogProbability(tokens []string, language string) floa
return sum
}

func (c *classifier) tokenProbability(token, language string) float64 {
func (c *naiveBayes) tokenProbability(token, language string) float64 {
tokenProb, ok := c.tokensLogProbabilities[language][token]
if !ok {
tokenProb = math.Log(1.000000 / c.tokensTotal)
Expand Down
16 changes: 8 additions & 8 deletions common.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ var DefaultStrategies = []Strategy{
}

// defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var defaultClassifier Classifier = &classifier{
var defaultClassifier classifier = &naiveBayes{
languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities,
tokensTotal: data.TokensTotal,
Expand Down Expand Up @@ -108,10 +108,10 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
return
}

// GetLanguageBySpecificClassifier returns the most probable language for the given content using
// getLanguageBySpecificClassifier returns the most probable language for the given content using
// classifier to detect language.
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
return getFirstLanguageAndSafe(languages)
}

Expand Down Expand Up @@ -420,17 +420,17 @@ func GetLanguagesByClassifier(filename string, content []byte, candidates []stri
return nil
}

return GetLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
}

// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
// getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
mapCandidates := make(map[string]float64)
for _, candidate := range candidates {
mapCandidates[candidate]++
}

return classifier.Classify(content, mapCandidates)
return classifier.classify(content, mapCandidates)
}

// GetLanguageExtensions returns the different extensions being used by the language.
Expand Down
4 changes: 2 additions & 2 deletions common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
name string
filename string
candidates []string
classifier Classifier
classifier classifier
expected string
}{
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"},
Expand All @@ -348,7 +348,7 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
content, err := ioutil.ReadFile(test.filename)
assert.NoError(s.T(), err)

languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
languages := getLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
var language string
if len(languages) == 0 {
language = OtherLanguage
Expand Down

0 comments on commit fa097f4

Please sign in to comment.