-
Notifications
You must be signed in to change notification settings - Fork 0
/
Code_Dump.txt
140 lines (103 loc) · 6.34 KB
/
Code_Dump.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Build the imbalanced working frame: start from a copy of `data`, replace every
# column named in `important_cols` with its label-encoded (integer category)
# values, then keep only those columns.
imbalanced_data = data.copy()
encoder = LabelEncoder()  # refit on each column in turn by `apply`
encoded_columns = imbalanced_data[important_cols].apply(encoder.fit_transform)
imbalanced_data[important_cols] = encoded_columns
imbalanced_data = imbalanced_data[important_cols]
# (Optional) De-duplicate the imbalanced dataset in place. Two rows count as
# duplicates when they agree on the target and on every feature listed below;
# the first occurrence of each combination is the one that is kept.
dedup_columns = [
    'Had_Heart_Attack',
    'Smoker_Status',
    'Alcohol_Drinkers',
    'Had_Stroke',
    'Difficulty_Walking',
    'Sex',
    'Age_Category',
    'Race_Ethnicity_Category',
    'Had_Diabetes',
    'Physical_Activities',
    'General_Health',
    'Had_Asthma',
    'Had_Kidney_Disease',
    'Had_Skin_Cancer',
]
imbalanced_data.drop_duplicates(subset=dedup_columns, inplace=True)
# Train/test split of the balanced (SMOTE) dataset: 30% of the rows are held
# out for testing, the split is stratified on the `Had_Heart_Attack` label and
# seeded with 42. The result is bound to x_train / x_test / y_train / y_test.
balanced_features = SMOTE_data[[
    'Smoker_Status',
    'Alcohol_Drinkers',
    'Had_Stroke',
    'Difficulty_Walking',
    'Sex',
    'Age_Category',
    'Race_Ethnicity_Category',
    'Had_Diabetes',
    'Physical_Activities',
    'General_Health',
    'Had_Asthma',
    'Had_Kidney_Disease',
    'Had_Skin_Cancer',
]]
balanced_target = SMOTE_data['Had_Heart_Attack']
x_train, x_test, y_train, y_test = train_test_split(
    balanced_features,
    balanced_target,
    test_size=0.30,
    random_state=42,
    stratify=balanced_target,
)
# Train/test split of the imbalanced dataset: 20% of the rows are held out for
# testing, the split is stratified on the `Had_Heart_Attack` label and seeded
# with 42. The result is bound to x_train / x_test / y_train / y_test.
imbalanced_features = imbalanced_data[[
    'Smoker_Status',
    'Alcohol_Drinkers',
    'Had_Stroke',
    'Difficulty_Walking',
    'Sex',
    'Age_Category',
    'Race_Ethnicity_Category',
    'Had_Diabetes',
    'Physical_Activities',
    'General_Health',
    'Had_Asthma',
    'Had_Kidney_Disease',
    'Had_Skin_Cancer',
]]
imbalanced_target = imbalanced_data['Had_Heart_Attack']
x_train, x_test, y_train, y_test = train_test_split(
    imbalanced_features,
    imbalanced_target,
    test_size=0.20,
    random_state=42,
    stratify=imbalanced_target,
)
# Turn every row of dataframes[2] into a {'Sentence', 'Classification'} record.
# The sentence is built from the per-column lookups in `sequences`; the
# classification is the row's `_MICHD` value.
new_dataset_rows = []
for _, record in dataframes[2].iterrows():
    fragments = []  # one "<description>," piece per column found in `sequences`
    for field, raw_value in record.items():
        if field not in sequences:
            # Columns without a description table contribute no text; the
            # only one that is still used is the target column.
            if field == '_MICHD':
                classification_value = raw_value
            continue
        # Values with no entry in the table fall back to an empty description,
        # so they still contribute a lone "," to the sentence.
        fragments.append(sequences[field].get(raw_value, '') + ',')
    # Pieces are joined with single spaces; each keeps its trailing comma.
    new_dataset_rows.append({
        'Sentence': ' '.join(fragments),
        'Classification': classification_value,
    })

# Materialise the collected records as a DataFrame and show it.
text_data = pd.DataFrame(new_dataset_rows)
print(text_data)
# Stratified train/test split of the whole SMOTE dataset: 25% of the rows are
# held out and the `_MICHD` class proportions are preserved in both parts.
# The seed is fixed (42, the value the feature/target splits in this file
# already use) so that repeated runs produce the same partition.
SMOTE_train_data, SMOTE_test_data = train_test_split(
    SMOTE_data,
    test_size=0.25,
    random_state=42,
    stratify=SMOTE_data['_MICHD'],
)
# Build a train/test pair that holds out a different fraction of each class.
# NOTE(review): the code treats `_MICHD == 0` as the minority class and
# `_MICHD == 1` as the majority class -- confirm against the label encoding.
minority_df = SMOTE_data[SMOTE_data['_MICHD'] == 0]
majority_df = SMOTE_data[SMOTE_data['_MICHD'] == 1]

# Hold out 10% of the minority rows and 30% of the majority rows. The seed is
# fixed (42, the value the seeded splits in this file use) so the partition is
# reproducible from run to run.
minority_train, minority_test = train_test_split(
    minority_df, test_size=0.1, random_state=42
)
majority_train, majority_test = train_test_split(
    majority_df, test_size=0.3, random_state=42
)

# Stack the per-class pieces back into one training and one testing frame.
train_set = pd.concat([minority_train, majority_train])
test_set = pd.concat([minority_test, majority_test])

# Shuffle the testing rows (seeded, for the same reproducibility reason).
test_set = test_set.sample(frac=1, random_state=42)

# Drop any testing row whose index label also appears in the training set.
test_set = test_set[~test_set.index.isin(train_set.index)]

# Report the resulting class balance of each set.
print("Training set class distribution:")
print(train_set['_MICHD'].value_counts())
print("\nTesting set class distribution:")
print(test_set['_MICHD'].value_counts())
# Oversample the training set with BorderlineSMOTE: the features are every
# column except `_MICHD`, and `_MICHD` is the label being balanced.
resampled_data = BorderlineSMOTE().fit_resample(
    train_set.drop(columns=['_MICHD']),
    train_set['_MICHD'],
)
features_resampled, labels_resampled = resampled_data

# Rebuild `train_set` from the resampled features and re-attach the label.
# The label column is written under the literal name '_MICHD' -- the same name
# used to drop/select it above and to plot it below -- instead of relying on
# an externally defined `target_column` that could be missing or disagree.
train_set = features_resampled
train_set['_MICHD'] = labels_resampled.to_numpy()

# Bar chart of the per-class row counts of `_MICHD` after resampling.
sns.countplot(data=train_set, x='_MICHD')