-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
60 lines (45 loc) · 1.78 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Alternative: pyforest can auto-import the packages above (disabled below).
#import pyforest
def load_dataset(file_path):
    """Load a dataset from *file_path* into a pandas DataFrame.

    The reader is chosen from the file extension, matched
    case-insensitively: ``.csv`` -> ``pd.read_csv``, ``.json`` ->
    ``pd.read_json``, ``.xlsx`` -> ``pd.read_excel``.

    Args:
        file_path: Path to the data file (a string or path-like object).

    Returns:
        The DataFrame produced by the matching pandas reader.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    readers = {
        '.csv': pd.read_csv,
        '.json': pd.read_json,
        '.xlsx': pd.read_excel,
    }
    # Compare on a lower-cased string so "DATA.CSV" and path-like objects work.
    normalized = str(file_path).lower()
    for extension, reader in readers.items():
        if normalized.endswith(extension):
            return reader(file_path)
    # Fail loudly instead of implicitly returning None for unknown formats.
    raise ValueError(
        f"Unsupported file format for {file_path!r}; "
        f"expected one of: {', '.join(readers)}"
    )
def explore_dataset(data):
    """Print summary information and plot the distribution of every column.

    Prints the DataFrame's shape, per-column dtypes, and per-column count of
    missing values. Then, for each column, shows one figure: a count plot
    for ``object``-dtype columns and a histogram for all other dtypes.

    Args:
        data: The pandas DataFrame to explore.

    Returns:
        None. Output goes to stdout and to matplotlib figures.
    """
    print("Dataset shape:", data.shape)
    print("Dataset data types:\n", data.dtypes)
    print("Number of missing values:\n", data.isna().sum())
    # Visualize the distribution of values for each column
    for col in data.columns:
        if data[col].dtype == 'object':
            # Pass the column as ``x``: the first positional parameter of
            # countplot is ``data`` in current seaborn, not the x variable.
            sns.countplot(x=data[col])
        else:
            sns.histplot(data[col])
        plt.title(col)
        plt.show()
def clean_dataset(data, drop_missing=True, missing_threshold=50):
    """Clean *data* in place and return it.

    Steps, in order:
      1. Remove duplicate rows.
      2. Drop columns whose percentage of missing values exceeds
         ``missing_threshold``.
      3. Forward-fill the remaining missing values.
      4. Optionally drop rows that still contain missing values (for
         example leading NaNs that forward-fill cannot reach).

    Args:
        data: The pandas DataFrame to clean. It is modified in place.
        drop_missing: When True, drop rows still containing missing values
            after forward-filling.
        missing_threshold: Percentage (0-100) of missing values above which
            a column is dropped. Defaults to 50.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    # Remove duplicate rows
    data.drop_duplicates(inplace=True)

    # Measure sparsity BEFORE filling: once values are forward-filled almost
    # nothing is missing, so the threshold check would never trigger.
    missing_percent = data.isna().mean() * 100
    sparse_columns = list(missing_percent[missing_percent > missing_threshold].index)
    data.drop(columns=sparse_columns, inplace=True)

    # Forward-fill missing values (fillna(method='ffill') is deprecated).
    data.ffill(inplace=True)

    # Drop rows with missing values if drop_missing is True
    if drop_missing:
        data.dropna(inplace=True)
    return data
# Debug aid: uncomment the print below to confirm the module loads without errors (keep it commented out otherwise).
#print('The package is working without errors.')