-
Notifications
You must be signed in to change notification settings - Fork 0
/
weather_missing.py
153 lines (110 loc) · 5.41 KB
/
weather_missing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from datetime import datetime, timedelta
import heapq
import os
import pandas as pd
from pandas import date_range
import csv
from collections import OrderedDict
import shutil
#-------------------------------------------------------------------------------------------
import weather_ilens
import weather_format
#-------------------------------------------------------------------------------------------
#check if there's any data lackage
def check(dates):
#-------------------------------------------------------------------------------------------
data_file_path = "data/"
#-------------------------------------------------------------------------------------------
missing_date = []
for date in dates:
date_str = date.strftime('%Y%m%d')
file_path = data_file_path + date_str
#if the file doesn't exist, return this date
if not os.path.isdir(file_path):
missing_date.append(date)
continue
#if the file is empty, return this date
if not os.listdir(file_path):
missing_date.append(date)
continue
#----------------------------------------------------------------------------------------------
#use inclusive insteade of closed in the lastest version
time_period = date_range(start=date, end=date + timedelta(days=1), freq='10t', closed='left')
#----------------------------------------------------------------------------------------------
for file_name in os.listdir(file_path):
#put the current data in to data_list of this date
with open(file_path + '/' + file_name, 'r') as date_file:
date_file = csv.reader(date_file)
data_list = [row for row in date_file]
file_missing = False
#compare the donwloaded data and the required time interval
for count in range(len(data_list)):
#load data
data = datetime.strptime(data_list[count][0], '%Y/%m/%d %H:%M')
#check if the data and the required time is closer than 10 mins
if data - time_period[count] >= timedelta(minutes=10):
file_missing = True
break
#check if the file is complete
if (data_list[count][1] == '') | (data_list[count][2] == ''):
file_missing = True
break
if file_missing:
missing_date.append(date)
break
return missing_date
#combine the old data with the newly downloaded data
def combine():
#-------------------------------------------------------------------------------------------
data_file_path = 'data/'
missing_file_path = 'missing_data/'
#-------------------------------------------------------------------------------------------
#iterate through all the dates
for date in os.listdir(missing_file_path):
missing_path = missing_file_path + date + '/'
date_path = data_file_path + date + '/'
#if the file does not exist, move the data directly
if not os.path.isdir(date_path):
os.rename(missing_path, date_path)
continue
#if the file is empty, remove the empty file and move the data directly
if not os.listdir(date_path):
shutil.rmtree(date_path)
os.rename(missing_path, date_path)
continue
#iterate through all the sites on that day
for site_name in os.listdir(missing_path):
missing_site_path = missing_path + site_name
date_site_path = date_path + site_name
#load both the old data and the new data
df1 = pd.read_csv(missing_site_path, header=None)
df2 = pd.read_csv(date_site_path, header=None)
#use the last step of merge sort(O(n)) to combine two data
#the data will generate index:'_0', '_1', '_2' after coming out from the generator heapq.merge()
df = pd.DataFrame(heapq.merge(df1.itertuples(index=False), df2.itertuples(index=False)))
#drop duplcates time
df.drop_duplicates('_0', inplace=True)
#store the data
df.to_csv(date_site_path, index=None, header=None)
#delete all the temporary files in missing_data
for missing_file_name in os.listdir(missing_file_path):
shutil.rmtree(missing_file_path + missing_file_name)
def fill(start_date, end_date):
dates = date_range(start = start_date, end = end_date, freq = '1D')
#get the missing dates
missing = check(dates)
#call weather_ilens to download the missing data from ilens
weather_ilens.ilens(missing)
#format the file to suit the other files from HKO
weather_format.format()
#combine the old data with the newly downloaded data
combine()
#-------------------------------------------------------------------------------------------
#delete the zip files
zip_file_path = 'zip_file/'
for date_zip_file_name in os.listdir(zip_file_path):
os.remove(zip_file_path + date_zip_file_name)
#-------------------------------------------------------------------------------------------
if __name__ == '__main__':
combine()
#check_boundaries(check())