-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
126 lines (89 loc) · 3.55 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Data Wrangling Script for VertNet Mammal Data
Neeka Sewnath
nsewnath@ufl.edu
"""
#===========================================================================================================================================
import pandas as pd
import argparse
import id_assign
import clean_year
import clean_country
import process_na
import m_method_process
import rename
import rearrange
import clean_lifestage
import clean_sex
import add_cols
import save_file
#===========================================================================================================================================
try:
import warnings
warnings.filterwarnings('ignore')
except:
pass
#===========================================================================================================================================
def get_args():
    """Parse and return the command-line arguments.

    Exposes a single option, -f/--file, giving the path (or URL) of the
    input CSV; defaults to the bundled VertNet mammal extract.
    """
    parser = argparse.ArgumentParser(
        description='API data scrape and reformat',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-f', '--file',
        metavar='url',
        type=str,
        default="./../fovt-data-mapping/Original_Data/all_mammals_2021-11-09a/all_mammals_2021-11-09a.csv",
        help='File input')
    return parser.parse_args()
#===========================================================================================================================================
def main():
    """Run the VertNet mammal wrangling pipeline end to end.

    Reads the CSV named by the command line, threads the DataFrame
    through each cleaning/reshaping step in order, and hands the result
    to save_file. Each step prints a progress message first.
    """
    args = get_args()

    print("\nReading in data...")
    data = pd.read_csv(args.file)

    # Ordered table of (progress message, DataFrame -> DataFrame step).
    # Order matters: later steps depend on columns created earlier.
    pipeline = [
        ("\nAssigning individualID...", id_assign.assign_indivdual_ID),
        ("\nCleaning yearCollected column...", clean_year.clean_year_collected),
        ("\nCleaning lifeStage column...", clean_lifestage.clean_lifestage_column),
        ("\nCleaning sex column...", clean_sex.clean_sex_column),
        ("\nCleaning scientificName column...", process_na.fill_unknown),
        ("\nAdding GEOME required column...", add_cols.add_req_cols),
        ("\nAdding verbatimEventDate column...", add_cols.adding_verbatim_date),
        ("\nCleaning country column...", clean_country.clean_country),
        ("\nCreating verbatimElevation columns...", add_cols.verbatim_elev),
        ("\nMatching column names with template names...", rename.match_cols),
        ("\nCreating materialSampleID...", id_assign.create_id),
        ("\nCreating unique measurementMethod column...", m_method_process.create_uni_mm),
        ("\nCreating long version...", rearrange.long_vers),
        ("\nProcessing measurement method...", m_method_process.mm_processing),
        ("\nMatching trait and ontology terms...", rename.match_traits),
        ("\nCreate verbatimMeasurementUnit column...", add_cols.verbatim_mu),
        ("\nCreating diagnosticID column...", id_assign.diagnostic_id),
        ("\nDrop blank measurements...", process_na.drop_na),
    ]
    for message, step in pipeline:
        print(message)
        data = step(data)

    print("\n Saving files...")
    save_file.save_file(data)
#===========================================================================================================================================
if __name__ == '__main__':
    # Run the pipeline only when executed as a script, not when imported.
    main()