# /**
# * @file staticInputs.py
# * @author Samay Pashine (samay@iiti.ac.in)
# * @modified Samay Pashine (samay@iiti.ac.in)
# * @brief Open all dataset files, preprocess them, and save the static inputs to a feather file incrementally.
# * @version 3.0
# * @date 2021-11-12
# * @copyright Copyright (c) 2021
# */
# Importing necessary libraries.
import os
import gc
import glob
import time
import geopandas
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray  # registers the ``.rio`` accessor used for CRS handling and clipping below
from tqdm import tqdm
from constants import *
from shapely.geometry import mapping
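# ``constants`` is expected to provide the path variables used below:
# shape_file_path, plant_dir, maty_dir, yield_dir, soil_file_path, and input_dir.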
def clipData(shape_file_path, plantData, matyData, yieldData):
"""Clip the netcdf data using shape file passed.
Args:
shape_file_path (str): Path to shape file for clipping the data.
plantData (xarray.Dataset): nc4 Data of plant-file
matyData (xarray.Dataset): nc4 Data of maturity-file
yieldData (xarray.Dataset): nc4 Data of yield
Returns:
[xarray.Dataset]: Clipped nc4 data.
"""
# Reading the shape file using geopandas.
geodf = geopandas.read_file(shape_file_path)
# Configuring the rio engine for clipping.
plantData = plantData.rio.write_crs("EPSG:4326", inplace=True)
matyData = matyData.rio.write_crs("EPSG:4326", inplace=True)
yieldData = yieldData.rio.write_crs("EPSG:4326", inplace=True)
# Clipping the data.
plantData = plantData.rio.clip(geodf.geometry.apply(mapping), geodf.crs, from_disk=True)
matyData = matyData.rio.clip(geodf.geometry.apply(mapping), geodf.crs, from_disk=True)
yieldData = yieldData.rio.clip(geodf.geometry.apply(mapping), geodf.crs, from_disk=True)
    # Renaming the coordinates of the yield data to lat/lon.
yieldData = yieldData.rename(x='lon', y='lat')
yieldData = yieldData.rio.set_spatial_dims(x_dim='lon', y_dim='lat', inplace=True)
return plantData, matyData, yieldData
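# Usage note: ``from_disk=True`` lets rioxarray clip without first loading the full
# global grid into memory. Illustrative call (mirroring the driver code below):
#   plant, maty, yld = clipData(os.path.join(shape_file_path, "4_states/4_states.shp"),
#                               plantData, matyData, yieldData)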
def loadDatasets(yield_filePath, plant_filePath, maty_filePath, crop_name):
"""Load datasets, and unify them in a data frame with all essential features.
Args:
yield_filePath ([str]): Complete yield file path.
plant_filePath ([str]): Complete plant-day file path.
maty_filePath ([str]): Complete maty-day file path.
crop_name ([str]): Name of crop.
Returns:
yieldDF([pd.DataFrame]): Pandas DataFrame with all features combined.
"""
    # Loading the dataset files without decoding the time variable.
plantData = xr.open_dataset(plant_filePath, engine='rasterio', decode_times=False)
matyData = xr.open_dataset(maty_filePath, engine='rasterio', decode_times=False)
yieldData = xr.open_dataset(yield_filePath, engine='rasterio', decode_times=False)
# Clipping the data.
plantData, matyData, yieldData = clipData(os.path.join(shape_file_path, "4_states/4_states.shp"), plantData, matyData, yieldData)
    # Decoding the time variable manually: raw values are year offsets from 1979.
initialYear = 1979
timeArray = initialYear + yieldData.variables['time'].values
plantData['time'] = timeArray.astype(int)
matyData['time'] = timeArray.astype(int)
yieldData['time'] = timeArray.astype(int)
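    # Example: a raw time value of 2.0 decodes to the calendar year 1981.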
    # Converting the datasets to DataFrame format for modification.
plantDF = plantData['plant-day_{}'.format(crop_name)].to_dataframe()
matyDF = matyData['maty-day_{}'.format(crop_name)].to_dataframe()
yieldDF = yieldData['yield_{}'.format(crop_name)].to_dataframe()
# Adding columns from plantDF and matyDF in yieldDF.
yieldDF['plant-day'] = plantDF['plant-day_{}'.format(crop_name)]
yieldDF['maturity-day'] = yieldDF['plant-day'] + matyDF['maty-day_{}'.format(crop_name)]
yieldDF = yieldDF.rename(columns={'x': 'lon', 'y': 'lat'})
    # Deleting intermediate variables to free memory.
del plantDF, matyDF, plantData, matyData, yieldData
return yieldDF
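# Illustrative call (the crop name "soy" is an assumed example token from the filename):
#   yieldDF = loadDatasets(yield_path, plant_path, maty_path, "soy")
#   -> a frame with 'yield_soy', 'plant-day' and 'maturity-day' columns,
#      indexed by time and the lat/lon grid.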
def surfaceFeatureExtractor(yieldDF, yield_filePath):
"""Extract surface featueres from individual netCDF file and combine it with yield DataFrame.
Args:
yieldDF (pandas.DataFrame): DataFrame with plant-day, maty-day and yield features.
yield_filePath (str): Path of yield netCDF file.
Returns:
yieldDF (pandas.DataFrame): Return Dataframe with additional CWTN-A features.
"""
    # Splitting the filename to get the individual unprocessed feature tokens.
splittedFilename = yield_filePath.split('/')[-1].split('_')
raw_CO2, raw_W = splittedFilename[9], splittedFilename[11]
raw_T, raw_N = splittedFilename[10], splittedFilename[12]
raw_A = splittedFilename[13]
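    # Assumed token layout (illustrative only, inferred from the indices above):
    #   ..._C360_T0_W0_N200_A0_...  ->  underscore-separated tokens 9..13 carry the
    #   CO2 (C), temperature (T), water (W), nitrogen (N) and adaptation (A) levels.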
    # Processing the individual features. A hyphen inside the T or W token marks a
    # negative level (e.g. "T-1", "W-20"), so the sign is restored after the split.
    CO2, N, A = int(raw_CO2[1:]), int(raw_N[1:]), int(raw_A[1])
    T, W = raw_T.split('-'), raw_W.split('-')
    if len(T) > 1:
        T = -int(T[-1])
    else:
        T = int(T[0][1:])
    if len(W) > 1:
        W = -int(W[-1])
    else:
        W = W[0].strip()
        if len(W) > 3:
            # "Winf" denotes an unlimited water supply.
            W = np.inf
        else:
            W = int(W[1:])
# print("[INFO]. CO2 : ", CO2)
# print("[INFO]. W : ", W)
# print("[INFO]. T : ", T)
# print("[INFO]. N : ", N)
# print("[INFO]. A : ", A)
# Adding surface features in yield DataFrame.
yieldDF['CO2'] = CO2
yieldDF['W'] = W
yieldDF['T'] = T
yieldDF['N'] = N
yieldDF['A'] = A
    # Clearing the memory buffer and deleting the unnecessary variables.
gc.collect()
del raw_CO2, raw_A, raw_N, raw_T, raw_W, splittedFilename, CO2, W, T, N, A
return yieldDF
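# Worked example (hypothetical tokens, for illustration only):
#   raw_CO2="C510", raw_T="T-1", raw_W="Winf", raw_N="N60", raw_A="A0"
#   -> CO2=510, T=-1, W=inf, N=60, A=0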
def soilFeatureCombine(yieldDF, soil_file_path):
"""Access the HWSD soil v2.2 netcdf file and add soil features to the yield DataFrame.
Args:
yieldDF (pandas.DataFrame): yield Dataframe with surface features.
soil_file_path (str): HWSD soil netCDF path.
Returns:
yieldDF (pandas.DataFrame): DataFrame with soil features.
"""
# Reading HWSD file and converting it to dataframe.
soilData = xr.open_dataset(soil_file_path)
soilDF = soilData.to_dataframe().reset_index()
    # Dropping features that are not needed, to save memory and computation.
soilDF = soilDF.drop(columns=['mu_global', 'bulk_density', 'root_obstacles', 'impermeable_layer', 'ece', 'bs_soil', 'issoil'])
# Dropping any row with null value.
soilDF = soilDF.dropna(how='any')
    # Merging the yield DF and soil DF on latitude and longitude.
yieldDF = pd.merge(yieldDF, soilDF, on=['lat', 'lon'], how='inner')
return yieldDF
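# Note: the inner merge keeps only the grid cells present in both frames, so any
# lat/lon cell without HWSD soil data is dropped from the static inputs.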
if __name__ == "__main__":
"""
Driver Code which saves the finalized static inputs dataset in .feather format.
"""
    # Initializing variables. The commented paths below document the constants
    # imported from constants.py.
    # plant_dir = "./ggcmi/phase2_outputs/dataset/plant-day/"
    # maty_dir = "./ggcmi/phase2_outputs/dataset/maty-day/"
    # yield_dir = "./ggcmi/phase2_outputs/dataset/yield/"
    # soil_file_path = "./ggcmi/HWSD/HWSD_soil_data_on_cropland_v2.2.nc"
count, total_files = 1, len(glob.glob(yield_dir + "*.nc4"))
prevFile, DF = pd.DataFrame(), pd.DataFrame()
# Loop to go through each file in plant-day, maty-day and yield folder.
for filename in tqdm(glob.glob(yield_dir + "*.nc4")):
yield_filePath = filename
filenameList = yield_filePath.split('/')[-1].split('_')
crop_name = filenameList[4]
filenameList[3] = 'plant-day'
plant_filePath = plant_dir + '_'.join(filenameList)
filenameList[3] = 'maty-day'
maty_filePath = maty_dir + '_'.join(filenameList)
        # Skip this iteration if the corresponding plant-day or maty-day file does not exist.
        if not os.path.exists(plant_filePath) or not os.path.exists(maty_filePath):
            print("[ERROR]. File does not exist.")
            print("[ERROR]. Skipping this iteration.")
            count += 1
            time.sleep(3)
            continue
# print("[INFO]. plant-day file Path : ", plant_filePath)
# print("[INFO]. maty-day file Path : ", maty_filePath)
# print("[INFO]. yield file Path : ", yield_filePath)
# print("\n\n\n\n[PHASE 1]. Starting Loading Datasets.")
yieldDF = loadDatasets(yield_filePath, plant_filePath, maty_filePath, crop_name)
# print("[PHASE 1]. Datasets Loaded Successfully.")
# print("\n[PHASE 2]. Starting Surface Feature Extraction.")
yieldDF = surfaceFeatureExtractor(yieldDF, yield_filePath)
# print("[PHASE 2]. Surface Feature Extracted Successfully.")
yieldDF = yieldDF.reset_index()
yieldDF = yieldDF.dropna(how='any')
# print("\n[PHASE 3]. Starting Soil Feature Extraction.")
yieldDF = soilFeatureCombine(yieldDF, soil_file_path)
# print("[PHASE 3]. Soil Feature Extracted Successfully.")
# Changing the column format of the dataframe.
# yieldDF.gravel = yieldDF.gravel.astype(int)
# yieldDF.clay = yieldDF.clay.astype(int)
# yieldDF.silt = yieldDF.silt.astype(int)
# yieldDF.sand = yieldDF.sand.astype(int)
# yieldDF.awc = yieldDF.awc.astype(int)
# yieldDF.cec_soil = yieldDF.cec_soil.astype(int)
# yieldDF.texture_class = yieldDF.texture_class.astype(int)
# yieldDF.CO2 = yieldDF.CO2.astype(int)
# yieldDF['plant-day'] = yieldDF['plant-day'].astype(int)
# yieldDF['maturity-day'] = yieldDF['maturity-day'].astype(int)
        # Rounding the yield values before concatenation (pd.concat copies the data,
        # so rounding afterwards would never reach prevFile).
        yieldDF["yield_{}".format(crop_name)] = np.round(yieldDF["yield_{}".format(crop_name)])
        # Shuffling the dataframe.
        yieldDF = yieldDF.sample(frac=1)
        # Concatenating the finalized DF from each file.
        prevFile = pd.concat([prevFile, yieldDF], ignore_index=True)
        count += 1
        # Saving the DF in feather format after every specified iteration or when finished.
        if count % 10 == 0 or count == total_files:
            if os.path.isfile(os.path.join(input_dir, 'static.feather')):
                DF = pd.read_feather(os.path.join(input_dir, 'static.feather'))
                DF = DF.sample(frac=1)
                DF = pd.concat([DF, prevFile], ignore_index=True)
                os.remove(os.path.join(input_dir, 'static.feather'))
            else:
                # First checkpoint: nothing on disk yet, so the accumulated frame is used as-is.
                DF = prevFile
            DF = DF.reset_index(drop=True)
            DF.to_feather(os.path.join(input_dir, 'static.feather'), compression='lz4')
            # Clearing the memory buffer, deleting unnecessary variables, and resetting prevFile and DF.
            del prevFile, DF
            prevFile, DF = pd.DataFrame(), pd.DataFrame()
            gc.collect()
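# Reading the accumulated static inputs back (illustrative):
#   static = pd.read_feather(os.path.join(input_dir, 'static.feather'))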