-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
109 lines (84 loc) · 4.51 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
import re
# Show every row when pandas prints a DataFrame/Series
pd.set_option('display.max_rows', None)

# Load the dataset from Google Sheets (published as CSV)
df = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQxg-eU4BXWaTJzHKlNW8pS3Jda-4v75jMrSWt0bOYJZga2Tsu8WJ8od5lvRrgTTy89qEiMNPDnid71/pub?gid=2072216167&single=true&output=csv")

# Look at the dataframe (the result is only displayed in an interactive session)
df.head()

# Look at how many different conditions there are in the data
df.condition.value_counts().head(n = 200)

# Alternative filters, currently disabled:
# df_train = df_train.loc[~df_train['condition'].isin(['Acne'])]
# df_train = df_train[df_train['condition'].map(df_train['condition'].value_counts()) > 32]

# Conditions kept for training.
# NOTE(review): several labels look truncated or carry a trailing space
# (e.g. 'ibromyalgia', 'Panic Disorde', 'High Blood Pressure '). They are kept
# byte-for-byte because they presumably match the spelling used in the
# dataset's `condition` column -- verify against the data before "fixing" them.
conditions_to_keep = [
    'Birth Control', 'Depression', 'Pain', 'Anxiety', 'Bipolar Disorder',
    'Insomnia', 'Weight Loss', 'Obesity', 'ADHD', 'Diabetes, Type 2',
    'Emergency Contraception', 'High Blood Pressure ', 'Vaginal Yeast Infection',
    'Abnormal Uterine Bleeding', 'Bowel Preparation', 'ibromyalgia',
    # A comma was missing between 'Migraine' and 'Anxiety and Stress'; Python
    # concatenated them into 'MigraineAnxiety and Stress', so neither
    # condition was ever selected.
    'Smoking Cessation', 'Migraine', 'Anxiety and Stress', 'Constipation',
    'Major Depressive Disorde', 'Chronic Pain', 'Panic Disorde', 'Migraine Prevention',
    'Urinary Tract Infection', 'Opiate Dependence', 'Osteoarthritis', 'Muscle Spasm',
    'Erectile Dysfunction', 'Generalized Anxiety Disorde', 'Allergic Rhinitis',
    'Irritable Bowel Syndrome', 'Rheumatoid Arthritis ', 'Bacterial Infection',
    'Cough', 'Sinusitis', 'Nausea/Vomiting', 'GERD', 'Hepatitis C', 'Hyperhidrosis',
    'Restless Legs Syndrome', 'Overactive Bladde', 'Multiple Sclerosis',
    'High Cholesterol', 'Psoriasis', 'Schizophrenia',
    'Hypogonadism, Male', 'Back Pain', 'Asthma, Maintenance', 'Bronchitis',
    'Not Listed / Othe', 'Headache', 'Underactive Thyroid',
    'Influenza', 'Cough and Nasal Congestion',
    'Opiate Withdrawal', 'Nasal Congestion', 'Pneumonia', 'Osteoporosis', 'Asthma',
    'Ulcerative Colitis', 'Diarrhea', 'Strep Throat', 'Allergies', 'COPD',
    'Diabetes, Type 1 ', 'Cold Symptoms', 'Asthma, acute', 'Atrial Fibrillation',
]

# Filter for the selected conditions
df_train = df[df['condition'].isin(conditions_to_keep)]

# Drop the columns the model does not use (drug name, rating, date, useful count)
X = df_train.drop(['drugName','rating','date','usefulCount'],axis=1)

# Remove double quotes from every remaining column.
# regex=False makes the literal replacement explicit across pandas versions.
for col in X.columns:
    X[col] = X[col].str.replace('"', '', regex=False)

# Get list of stopwords
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
print(english_stopwords)
# Stored as a set: the cleaning function tests every token for membership,
# and a set lookup is O(1) where scanning the list is O(len(list)).
stop = set(english_stopwords)

# Get the nltk wordnet
nltk.download('wordnet')

# Lemmatizer shared by the text-cleaning function
lemmatizer = WordNetLemmatizer()
# Function to clean text
def review_to_words(raw_review):
    """Turn one raw review into a space-separated string of cleaned tokens.

    The review is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, the text is lowercased and split on
    whitespace, tokens found in the module-level ``stop`` collection are
    dropped, and each remaining token is passed through the module-level
    ``lemmatizer``.

    Args:
        raw_review: The review text, possibly containing HTML.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Strip HTML tags and keep only the text content
    plain_text = BeautifulSoup(raw_review, 'html.parser').get_text()

    # Non-letters become spaces so they act as token separators
    tokens = re.sub('[^a-zA-Z]', ' ', plain_text).lower().split()

    cleaned = []
    for token in tokens:
        # Skip stopwords; lemmatize everything else
        if token in stop:
            continue
        cleaned.append(lemmatizer.lemmatize(token))

    return ' '.join(cleaned)
# Apply the cleaning function to the text
X['review_clean'] = X['review'].apply(review_to_words)

# Set feature (cleaned review text) and dependent variable (condition)
X_feat = X['review_clean']
y = X['condition']

# Do train test split: 80/20, stratified on the condition label, seeded
X_train, X_test, y_train, y_test = train_test_split(
    X_feat, y, stratify=y, test_size=0.2, random_state=0
)

# Bag of words: the vocabulary is learned from the training split only and
# then reused to encode the test split
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
# NOTE(review): count_test / y_test are not used further in this script;
# no evaluation of the held-out split happens here.
count_test = count_vectorizer.transform(X_test)

# Fit a RF model.
# Seeded like the split above so that re-running the script reproduces the
# same saved model instead of a different forest on every run.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(count_train, y_train)

# Save the model
# NOTE(review): only the classifier is persisted; the fitted count_vectorizer
# is not saved here, so whatever loads this file needs the same vocabulary
# from somewhere else -- confirm that is handled.
joblib.dump(rf_model, "my_random_forest.joblib", compress=3)