-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokens_p.py
71 lines (50 loc) · 1.67 KB
/
tokens_p.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
tokenizes the blurb bodies
"""
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from typing import List, Tuple
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
def preview_features(
bodies: List[str],
features: csr_matrix, term_names: List[str],
doc_idx: int, num_features: int = 30):
"""
previews the features and the body which they were taken from.
params:
features is a 2D numpy sparse matrix
"""
doc_scores = features[doc_idx, :].todense()
term_idxs_sorted = np.fliplr(doc_scores.argsort())
for i in range(num_features):
term_idx_inorder = term_idxs_sorted[0, i]
term = term_names[term_idx_inorder]
tfidf = doc_scores[0, term_idx_inorder]
print('{0:20s} {1:f} '.format(term, tfidf))
print()
print(bodies[doc_idx])
def tokenize_flow(df: pd.DataFrame, **tf_params) -> Tuple[TfidfVectorizer, csr_matrix]:
"""
tokenizes the blurb bodies in df and returns the sklearn vectorizer object and the feature matrix
"""
if not 'stop_words' in tf_params:
tf_params['stop_words'] = stopwords.words('english') + OUR_STOP_WORDS
vectorizer = TfidfVectorizer(**tf_params)
corpus = df['body']
X = vectorizer.fit_transform(corpus)
return vectorizer, X
OUR_STOP_WORDS = [
'would'
]
if __name__ == "__main__":
import load_p
import genres_p
import os
df = load_p.get_df_flow()
df = genres_p.parse_genres_flow(df)
vectorizer, X = tokenize_flow(df)
preview_features(df['body'], X, vectorizer.get_feature_names(), 113, 50)