-
Notifications
You must be signed in to change notification settings - Fork 2
/
merge.py
48 lines (35 loc) · 1.64 KB
/
merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
import os
import time
# Merge every CSV in `folder_path` into a single table, drop rows whose
# 'Name' value has already been seen, and write the result to
# `output_folder`/cleaned_data.csv. Both paths are relative to the current
# working directory.
folder_path = 'Scrapped'  # Directory containing CSV files
output_folder = 'Clean Data'  # Directory to save cleaned data

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort them so that
# "keep the first occurrence" below selects the same row on every run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
print(f'Found {len(csv_files)} CSV files.')

# Fail early with an accurate message rather than falling through to the
# "'Name' column is missing" error on an empty DataFrame.
if not csv_files:
    raise ValueError(f"No CSV files found in '{folder_path}'.")

# Read every file first and concatenate once at the end: calling pd.concat
# inside the loop would re-copy all previously accumulated rows per file.
frames = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    print(f'Reading {file_path}...')
    df = pd.read_csv(file_path)
    frames.append(df)
all_data = pd.concat(frames, ignore_index=True)
print(f'Combined data contains {len(all_data)} rows before processing.')

# De-duplication is keyed on 'Name', so the column must be present.
if 'Name' not in all_data.columns:
    raise ValueError("The 'Name' column is missing from the data.")

# Remove duplicates based on the 'Name' column, keeping the first occurrence
# (first in sorted-filename order, then row order within each file).
print('Removing duplicates based on the Name column, keeping the first occurrence...')
cleaned_data = all_data.drop_duplicates(subset=['Name'], keep='first')
print(f'Cleaned data contains {len(cleaned_data)} rows after removing duplicates.')

# Save the cleaned data to a new CSV file in the Clean Data folder
output_file = os.path.join(output_folder, 'cleaned_data.csv')
print(f'Saving cleaned data to {output_file}...')
cleaned_data.to_csv(output_file, index=False)
print(f'Cleaned data saved to {output_file}')

# Pause so the final messages stay visible before the process exits.
print('Exiting in 5 seconds...')
time.sleep(5)