forked from aaronpenne/get_noaa_ghcn_data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_station_id.py
172 lines (151 loc) · 5.48 KB
/
get_station_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# -*- coding: utf-8 -*-
"""
Searches list of stations via user input to find the station ID.
Author: Aaron Penne
------------------------------
Variable Columns Type
------------------------------
ID 1-11 Character
LATITUDE 13-20 Real
LONGITUDE 22-30 Real
ELEVATION 32-37 Real
STATE 39-40 Character
NAME 42-71 Character
GSN FLAG 73-75 Character
HCN/CRN FLAG 77-79 Character
WMO ID 81-85 Character
------------------------------
"""
import sys
import pandas as pd
from ftplib import FTP
import os
# Local directory (relative to the current working directory) where the
# downloaded station list is cached.
output_dir = os.path.relpath('output')
# exist_ok=True avoids the check-then-create race of isdir() followed by
# mkdir(); it still raises if 'output' exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)

# Remote locations on the NOAA FTP server.
ftp_path_dly = '/pub/data/ghcn/daily/'
ftp_path_dly_all = ftp_path_dly + 'all/'  # per-station .dly files
ftp_filename = 'ghcnd-stations.txt'
def connect_to_ftp(host='ftp.ncdc.noaa.gov'):
    """Open an anonymous FTP session and return the connected client.

    Args:
        host: FTP server to connect to. Defaults to the NOAA server this
            script was written against.

    Returns:
        A logged-in ``ftplib.FTP`` instance. The caller is responsible for
        closing it (``quit()`` or use as a context manager).

    Raises:
        Whatever ``ftplib`` raises when the connection or login fails.
    """
    # Access NOAA FTP server
    ftp = FTP(host)
    try:
        message = ftp.login()  # No credentials needed
    except Exception:
        # Do not leak the open control socket when the login is rejected.
        ftp.close()
        raise
    print(message)
    return ftp
def _download_stations_file(ftp, local_full_path):
    """Download the station list unless a local copy already exists."""
    if os.path.isfile(local_full_path):
        return
    # The remote path is not a local filesystem path, so build it by plain
    # concatenation (the same way the per-station .dly paths are built).
    ftp_full_path = ftp_path_dly + ftp_filename
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated file that later runs would mistake for a full copy.
    partial_path = local_full_path + '.part'
    try:
        with open(partial_path, 'wb') as f:
            ftp.retrbinary('RETR ' + ftp_full_path, f.write)
        os.replace(partial_path, local_full_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)


def _read_stations(local_full_path):
    """Parse the fixed-width station list into a DataFrame with cleaned columns."""
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_fwf.html
    names = ['STATION_ID', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE',
             'STATION_NAME', 'GSN_FLAG', 'HCN_CRN_FLAG', 'WMO_ID']
    widths = [11,  # Station ID
              9,   # Latitude (decimal degrees)
              10,  # Longitude (decimal degrees)
              7,   # Elevation (meters)
              3,   # State (USA stations only)
              31,  # Station Name
              4,   # GSN Flag
              4,   # HCN/CRN Flag
              6]   # WMO ID
    # Read everything as text first so IDs and flags are never coerced.
    df = pd.read_fwf(local_full_path, widths=widths, names=names,
                     dtype=dict.fromkeys(names, str), header=None)

    # Blank text fields normally come back as NaN; the original code targeted
    # the literal string 'nan', so both are handled here.
    # NOTE(review): whether any pandas version yields 'nan' strings is unconfirmed.
    placeholders = (('STATE', '--'), ('GSN_FLAG', '---'), ('HCN_CRN_FLAG', '---'))
    for column, placeholder in placeholders:
        df[column] = df[column].fillna(placeholder).replace('nan', placeholder)

    # Coordinates and elevation must be numeric for the -999.9 sentinel
    # replacement and the rounded geographic sort to have any effect.
    for column in ('LATITUDE', 'LONGITUDE', 'ELEVATION'):
        numeric = pd.to_numeric(df[column], errors='coerce')
        df[column] = numeric.mask(numeric == -999.9)
    return df


def _lookup_file_sizes(ftp, station_ids):
    """Return each station's .dly file size in kilobytes (NaN when unavailable)."""
    from ftplib import error_perm  # local import: the file only imports FTP

    ftp.voidcmd('TYPE I')  # Needed to avoid FTP error with ftp.size()
    sizes = []
    for station in station_ids:
        # Flush so the progress dots appear while the lookups are running.
        print('.', end='', flush=True)
        try:
            size_bytes = ftp.size(ftp_path_dly + 'all/' + station + '.dly')
        except error_perm:
            # The server refused SIZE for this file (e.g. it does not exist);
            # keep the station in the results with an unknown size.
            size_bytes = None
        if size_bytes is None:
            sizes.append(float('nan'))
        else:
            sizes.append(round(size_bytes / 1000))  # Kilobytes
    return sizes


def get_station_id(ftp):
    """Interactively search the station list and return the chosen station ID.

    Downloads the station list on first use, prompts for a full or partial
    station name, prints the matching stations with the size of each
    station's data file, then prompts for the index of the wanted row.

    Args:
        ftp: A connected, logged-in ``ftplib.FTP`` instance.

    Returns:
        The ``STATION_ID`` string of the selected station.

    Raises:
        SystemExit: If the search term is empty, nothing matches, or the
            selection is not a valid row index.
    """
    # Get stations file
    local_full_path = os.path.join(output_dir, ftp_filename)
    _download_stations_file(ftp, local_full_path)

    # Get user search term
    print()
    query = input('Enter station name, full or partial. (ex. Washington, san fran, USC): ')
    query = query.strip().upper()
    print()
    if not query:
        # An empty term would match every station and trigger one FTP SIZE
        # request per station in the whole list.
        print('Please enter a search term')
        sys.exit()

    df = _read_stations(local_full_path)

    # Get query results, but only the columns we care about.
    # regex=False: the user types a plain substring, not a pattern.
    # na=False: rows without a name never match.
    print('Searching records...')
    matches = df['STATION_NAME'].str.contains(query, regex=False, na=False)
    columns = ['STATION_ID', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'STATE', 'STATION_NAME']
    df = df.loc[matches, columns].reset_index(drop=True)
    if df.empty:
        print('Station not found')
        sys.exit()

    # Get file sizes of each station's records to augment results
    print('Getting file sizes...', end='', flush=True)
    df['SIZE'] = _lookup_file_sizes(ftp, df['STATION_ID'])
    print()
    print()

    # Sort by rounded lat/long values, then size, to group geographic areas
    # and show the stations with the most data first within each area.
    sort_key = df[['LATITUDE', 'LONGITUDE', 'SIZE']].round(0)
    sort_key = sort_key.sort_values(['LATITUDE', 'LONGITUDE', 'SIZE'], ascending=False)
    df = df.loc[sort_key.index].reset_index(drop=True)

    # Print headers and values to facilitate reading
    selection = 'Index'
    lat = 'Latitude'
    lon = 'Longitude'
    state = 'State'
    name = 'Station_Name '
    size = ' File_Size'
    print('{: <6}{: <31}{: <6}({: >8},{: >10}){: >13}'.format(selection, name, state, lat, lon, size))
    print('-'*5 + ' ' + '-'*30 + ' ' + '-'*5 + ' ' + '-'*21 + ' ' + '-'*12)
    for row in df.itertuples():
        size_text = '?' if pd.isna(row.SIZE) else int(row.SIZE)
        print('{: 4}: {: <31}{: <6}({: >8.4f},{: >10.4f}){: >10} Kb'.format(
            row.Index,
            row.STATION_NAME,
            row.STATE,
            row.LATITUDE,
            row.LONGITUDE,
            size_text))

    # Get user selection
    raw_choice = input('Enter selection (ex. 001, 42): ')
    try:
        choice = int(raw_choice)
    except ValueError:
        choice = -1  # falls through to the range check below
    if not 0 <= choice < len(df):
        print('Please enter valid selection (ex. 001, 42)')
        sys.exit()
    return df.loc[choice, 'STATION_ID']
# Main: look up a station interactively and print its ID.
if __name__ == '__main__':
    # ftplib.FTP objects are context managers; leaving the block sends QUIT
    # and closes the connection even when get_station_id() raises or exits
    # early through sys.exit().
    with connect_to_ftp() as ftp:
        station_id = get_station_id(ftp)
        print(station_id)