-
Notifications
You must be signed in to change notification settings - Fork 0
/
value.py
120 lines (96 loc) · 3.54 KB
/
value.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import datetime
import numpy as np
import xarray as xr
import pandas as pd
from load import data_dir
"""
VALUE dataset
"""
def formatting_data(monthly=True):
    """
    Create new file with data in useable format.

    Reads the raw precipitation table ``VALUE_ECA_86_v2/precip.txt`` and the
    station table ``VALUE_ECA_86_v2/stations.txt`` under ``data_dir``, reshapes
    the precipitation into one row per (time, station_id) with the value in
    column ``tp``, optionally resamples each station to month-start means,
    joins the station metadata on ``station_id`` and writes the result to
    ``value_rsamp.csv`` (monthly) or ``value_daily.csv`` (daily) in the same
    directory.

    Args:
        monthly (bool, optional): whether to resample to monthly data.
            Interpreted by truthiness, so exactly one output file is always
            written. Defaults to True.
    """
    # Import precipitation data
    df = pd.read_csv(
        data_dir + 'VALUE_ECA_86_v2/precip.txt')
    df = df.astype(float, errors='ignore')
    df['time'] = pd.to_datetime(df['YYYYMMDD'], format="%Y%m%d")
    df = df.drop(columns=['YYYYMMDD'])
    df.set_index('time', inplace=True)

    # Reformat columns: every remaining column header is treated as a station
    # id; stacking turns the wide table into long (time, station_id, tp) rows.
    df1 = df.stack().reset_index()
    df1 = df1.rename({"level_1": "station_id", 0: "tp"}, axis=1)
    df1['station_id'] = df1['station_id'].astype(int)

    # Resample
    if monthly:
        # NOTE(review): `fill_method` is a legacy resample keyword; confirm it
        # is still accepted by the pandas version in use.
        df1 = df1.groupby('station_id').resample('MS', on='time', fill_method='ffill').mean()
        # The grouping key is restored from the index by reset_index(), so the
        # averaged copy of the column is dropped first.
        df1 = df1.drop(columns=['station_id']).reset_index()

    # Import station data and combine
    df4 = pd.read_csv(data_dir + 'VALUE_ECA_86_v2/stations.txt',
                      sep='\t', lineterminator='\r')
    df4['station_id'] = df4['station_id'].astype(int)
    df7 = df1.join(df4.set_index('station_id'), on='station_id')
    df7 = df7.rename({'longitude': 'lon', 'latitude': 'lat',
                      'altitude': 'z', }, axis=1)
    df7 = df7.drop(['source'], axis=1)

    # Exactly one of the two output files is written for any flag value.
    if monthly:
        df7.to_csv(
            data_dir + 'VALUE_ECA_86_v2/value_rsamp.csv')
    else:
        df7.to_csv(
            data_dir + 'VALUE_ECA_86_v2/value_daily.csv')
def all_gauge_data(minyear: str, maxyear: str, threshold=None, monthly=True) -> pd.DataFrame:
    """
    Load the formatted VALUE data for all stations between two dates.

    Reads ``value_rsamp.csv`` (monthly) or ``value_daily.csv`` (daily) from
    ``data_dir + 'VALUE_ECA_86_v2/'`` and keeps the rows whose ``time`` lies
    between ``minyear`` and ``maxyear``. Both bounds are inclusive and are
    expanded to the full period they name, e.g. ``'2000'`` to ``'2010'`` covers
    1 Jan 2000 through the end of 31 Dec 2010. Row order of the file is kept.

    Args:
        minyear (str): start year (or any date string pandas can parse as a
            period); None leaves the start open.
        maxyear (str): end year (or any date string pandas can parse as a
            period); None leaves the end open.
        threshold (_type_, optional): intended as a minimum number of active
            days during the period (e.g. for 10 year period ->
            4018 - 365 = 3653). Currently accepted but not applied.
            Defaults to None.
        monthly (bool, optional): whether to return monthly or daily data.
            Defaults to True.

    Returns:
        pd.DataFrame: VALUE data with ``time`` as the first column and a
        fresh integer index.
    """
    # if/else guarantees a file path is always selected, whatever the flag.
    if monthly:
        filepath = data_dir + 'VALUE_ECA_86_v2/value_rsamp.csv'
    else:
        filepath = data_dir + 'VALUE_ECA_86_v2/value_daily.csv'

    df = pd.read_csv(filepath)
    # Drop the unnamed index column that to_csv stored in the file.
    df = df.drop(['Unnamed: 0'], axis=1)
    df['time'] = pd.to_datetime(df['time'])
    df.set_index('time', inplace=True)

    # A boolean mask is used instead of `df[minyear:maxyear]` so the selection
    # does not depend on the time index being sorted, and so that bounds are
    # always read as dates rather than row positions.
    in_period = np.ones(len(df), dtype=bool)
    if minyear is not None:
        in_period &= df.index >= pd.Period(str(minyear)).start_time
    if maxyear is not None:
        in_period &= df.index <= pd.Period(str(maxyear)).end_time

    return df[in_period].reset_index()
def gauge_download(station, minyear, maxyear):
    """
    Return the gauge records of one station for the requested period.

    The data comes from all_gauge_data called with its default options, and
    is then narrowed to the rows whose 'name' column equals ``station``.

    Args:
        station (str): station name (capitalised)
        minyear: start of the period, passed through to all_gauge_data
        maxyear: end of the period, passed through to all_gauge_data

    Returns:
        pd.DataFrame: precipitation gauge values for the station
    """
    all_stations = all_gauge_data(minyear, maxyear)
    matches_station = all_stations['name'] == station
    return all_stations[matches_station]
def year_into_days(start_year: float, end_year: float) -> np.ndarray:
    """
    Divide years into days

    Each year in ``[start_year, end_year)`` is split into equal steps of
    1/366 (leap years) or 1/365 (other years), expressed as fractional years.

    Args:
        start_year (float): year to start array
        end_year (float): year to end array (exclusive)

    Returns:
        np.ndarray: array in years with daily resolution; empty when the
        range contains no year.
    """
    daily_chunks = []
    for y in np.arange(start_year, end_year):
        # Full Gregorian rule: divisible by 4, except centuries that are not
        # divisible by 400 (1900 and 2100 are not leap years, 2000 is).
        # The small tolerance lets float-valued years such as 2000.0 match.
        divisible_by_4 = y % 4 < 0.001
        divisible_by_100 = y % 100 < 0.001
        divisible_by_400 = y % 400 < 0.001
        is_leap = divisible_by_4 and (not divisible_by_100 or divisible_by_400)

        days_in_year = 366 if is_leap else 365
        daily_chunks.append(np.arange(y, y + 1, 1 / days_in_year))

    if not daily_chunks:
        # np.concatenate rejects an empty list; keep the empty-array result.
        return np.array([])

    # Joining once avoids re-copying the growing array on every iteration.
    return np.concatenate(daily_chunks)