-
Notifications
You must be signed in to change notification settings - Fork 0
/
bm25.py
39 lines (28 loc) · 896 Bytes
/
bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import numpy as np
from text import (composite, remove_punctuation, remove_stop_words,
to_lemmatize, to_stem, to_tokenized)
#
# BM25 implementation that does not take IDF into account
#
def bm25_no_idf(corpus: list[str], doc: str, query: str, **kwargs) -> float:
    """Score ``doc`` against ``query`` with the BM25 formula, leaving out IDF.

    ``doc`` and ``query`` are run through the same preprocessing pipeline,
    built with ``composite`` from the ``text`` helpers (``to_tokenized``,
    ``remove_stop_words``, ``remove_punctuation``, ``to_lemmatize``,
    ``to_stem``). Every processed query token, repeats included, contributes

        tf * (K + 1) / (tf + K * (1 - B + B * doc_len / avg_words))

    where ``tf`` is how many times the token occurs in the processed document
    and ``doc_len`` is the processed document's token count.

    Args:
        corpus: Raw documents used to derive the average processed length.
            Only preprocessed when ``avg_words`` is not supplied.
        doc: Raw text of the document being scored.
        query: Raw query text.
        **kwargs: ``avg_words`` (optional) is a precomputed average number of
            processed tokens per corpus document. ``None`` is treated the
            same as omitting it.

    Returns:
        The sum of the per-token scores. ``0.0`` when the processed document
        is empty or no query token occurs in it.
    """
    K = 2.0
    B = 0.75

    prepare = composite(
        to_tokenized,
        remove_stop_words,
        remove_punctuation,
        to_lemmatize,
        to_stem
    )

    words = prepare(doc)
    query_terms = prepare(query)

    doc_len = len(words)
    if doc_len == 0:
        # No token can match an empty document, so every term scores zero.
        return 0.0

    # Only walk the corpus when the caller did not hand us the average:
    # preprocessing every document is by far the most expensive step here.
    avg_words = kwargs.get('avg_words')
    if avg_words is None:
        lengths = [len(prepare(d)) for d in corpus]
        avg_words = np.mean(lengths) if lengths else 0.0
    avg_words = float(avg_words)

    if np.isfinite(avg_words) and avg_words > 0:
        length_norm = K * (1 - B + B * doc_len / avg_words)
    else:
        # Empty corpus (or only empty documents): there is no meaningful
        # average, so treat the document as average-length instead of
        # dividing by zero / propagating NaN.
        length_norm = K

    score = 0.0
    for term in query_terms:
        tf = words.count(term)
        if tf:
            score += tf * (K + 1) / (tf + length_norm)

    return score