diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..80b05f3 Binary files /dev/null and b/.DS_Store differ diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index c871289..ecf37d9 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -17,14 +17,18 @@ WATERDATA_BASE_URL = 'https://nwis.waterdata.usgs.gov/' WATERDATA_URL = WATERDATA_BASE_URL + 'nwis/' WATERSERVICE_URL = 'https://waterservices.usgs.gov/nwis/' +PARAMCODES_URL = 'https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?' +ALLPARAMCODES_URL = 'https://help.waterdata.usgs.gov/code/parameter_cd_query?' WATERSERVICES_SERVICES = ['dv', 'iv', 'site', 'stat', 'gwlevels'] WATERDATA_SERVICES = ['qwdata', 'measurements', 'peaks', 'pmcodes', 'water_use', 'ratings'] -def format_response(df, service=None): +def format_response(df, service=None, **kwargs): """Setup index for response from query. """ + mi = kwargs.pop('multi_index', True) + if service == 'peaks': df = preformat_peaks_response(df) @@ -33,7 +37,7 @@ def format_response(df, service=None): # XXX: consider making site_no index return df - elif len(df['site_no'].unique()) > 1: + elif len(df['site_no'].unique()) > 1 and mi: # setup multi-index df.set_index(['site_no', 'datetime'], inplace=True) if hasattr(df.index.levels[1], 'tzinfo') and df.index.levels[1].tzinfo is None: @@ -53,7 +57,8 @@ def preformat_peaks_response(df): return df -def get_qwdata(datetime_index=True, wide_format=True, sites=None, start=None, end=None, **kwargs): +def get_qwdata(datetime_index=True, wide_format=True, sites=None, + start=None, end=None, multi_index=True,**kwargs): """ Get water sample data from qwdata service. @@ -69,6 +74,8 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None, start=None, en If the qwdata parameter begin_date is supplied, it will overwrite the start parameter end: string If the qwdata parameter end_date is supplied, it will overwrite the end parameter + multi_index: boolean + If False, a dataframe with a single-level index (datetime) is returned Returns: DataFrame containing times series data from the NWIS json and Metadata as tuple @@ -79,7 +86,7 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None, start=None, en end = kwargs.pop('end_date', end) sites = kwargs.pop('site_no', sites) return _qwdata(site_no=sites, begin_date=start, end_date=end, datetime_index=datetime_index, - ** kwargs) + multi_index=multi_index, ** kwargs) def _qwdata(datetime_index=True, **kwargs): # check number of sites, may need to create multiindex @@ -122,8 +129,7 @@ def _qwdata(datetime_index=True, **kwargs): df = format_datetime(df, 'sample_dt', 'sample_tm', 'sample_start_time_datum_cd') - df = format_response(df) - return df, _set_metadata(response, **kwargs) + return format_response(df, **kwargs), _set_metadata(response, **kwargs) def get_discharge_measurements(sites=None, start=None, end=None, **kwargs): @@ -153,7 +159,7 @@ def _discharge_measurements(**kwargs): return _read_rdb(response.text), _set_metadata(response, **kwargs) -def get_discharge_peaks(sites=None, start=None, end=None, **kwargs): +def get_discharge_peaks(sites=None, start=None, end=None, multi_index=True, **kwargs): """ Get discharge peaks from the waterdata service. @@ -172,7 +178,7 @@ def get_discharge_peaks(sites=None, start=None, end=None, **kwargs): start = kwargs.pop('begin_date', start) end = kwargs.pop('end_date', end) sites = kwargs.pop('site_no', sites) - return _discharge_peaks(site_no=sites, begin_date=start, end_date=end, **kwargs) + return _discharge_peaks(site_no=sites, begin_date=start, end_date=end, multi_index=multi_index, **kwargs) def _discharge_peaks(**kwargs): @@ -180,10 +186,10 @@ def _discharge_peaks(**kwargs): df = _read_rdb(response.text) - return format_response(df, service='peaks'), _set_metadata(response, **kwargs) + return format_response(df, service='peaks', **kwargs), _set_metadata(response, **kwargs) -def get_gwlevels(start='1851-01-01', end=None, **kwargs): +def get_gwlevels(start='1851-01-01', end=None, multi_index=True, **kwargs): """ Querys the groundwater level service from waterservices @@ -200,16 +206,17 @@ def get_gwlevels(start='1851-01-01', end=None, **kwargs): """ start = kwargs.pop('startDT', start) end = kwargs.pop('endDT', end) - return _gwlevels(startDT=start, endDT=end, **kwargs) + return _gwlevels(startDT=start, endDT=end, multi_index=multi_index, **kwargs) def _gwlevels(**kwargs): + response = query_waterservices('gwlevels', **kwargs) df = _read_rdb(response.text) df = format_datetime(df, 'lev_dt', 'lev_tm', 'lev_tz_cd') - return format_response(df), _set_metadata(response, **kwargs) + return format_response(df, **kwargs), _set_metadata(response, **kwargs) def get_stats(sites, **kwargs): @@ -294,7 +301,7 @@ def query_waterservices(service, **kwargs): return query(url, payload=kwargs) -def get_dv(start=None, end=None, **kwargs): +def get_dv(start=None, end=None, multi_index=True, **kwargs): """ Get daily values data from NWIS and return it as a DataFrame. @@ -312,15 +319,14 @@ def get_dv(start=None, end=None, **kwargs): """ start = kwargs.pop('startDT', start) end = kwargs.pop('endDT', end) - return _dv(startDT=start, endDT=end, **kwargs) + return _dv(startDT=start, endDT=end, multi_index=multi_index, **kwargs) def _dv(**kwargs): response = query_waterservices('dv', format='json', **kwargs) df = _read_json(response.json()) - df = format_response(df) - return df, _set_metadata(response, **kwargs) + return format_response(df, **kwargs), _set_metadata(response, **kwargs) def get_info(**kwargs): @@ -411,7 +417,7 @@ def get_info(**kwargs): return _read_rdb(response.text), _set_metadata(response, **kwargs) -def get_iv(start=None, end=None, **kwargs): +def get_iv(start=None, end=None, multi_index=True, **kwargs): """Get instantaneous values data from NWIS and return it as a DataFrame. Note: If no start or end date are provided, only the most recent record is returned. @@ -428,40 +434,63 @@ def get_iv(start=None, end=None, **kwargs): """ start = kwargs.pop('startDT', start) end = kwargs.pop('endDT', end) - return _iv(startDT=start, endDT=end, **kwargs) + return _iv(startDT=start, endDT=end, multi_index=multi_index, **kwargs) def _iv(**kwargs): response = query_waterservices('iv', format='json', **kwargs) - return _read_json(response.json()), _set_metadata(response, **kwargs) + df = _read_json(response.json()) + return format_response(df, **kwargs), _set_metadata(response, **kwargs) -def get_pmcodes(parameterCd='All', **kwargs): +def get_pmcodes(parameterCd = 'All', partial = True): """ - Return a DataFrame containing all NWIS parameter codes. - - Note: NWIS may return incorrect column names. Rename them with + Returns a DataFrame containing all NWIS parameter code information. - >>> df.rename(columns={key:value}) - - Parameters (Additional parameters, if supplied, will be used as query parameters). + Parameters ---------- - parameterCd: string or listlike + parameterCd: string or list + Accepts parameter codes or names + + partial: boolean + Default is True (partial querying). If False, the funciton will query only exact matches Returns: DataFrame containing the USGS parameter codes and Metadata as tuple """ - payload = {'radio_pm_search' : 'pm_search', - 'pm_group' : 'All+--+include+all+parameter+groups', - 'pm_search' : parameterCd, - 'casrn_search' : None, - 'srsname_search' : None, - 'show' : ['parameter_group_nm', 'casrn', 'srsname','parameter_units', 'parameter_nm'], - 'format' : 'rdb'} + if parameterCd is None: + raise TypeError('The query must include a parameter name or code') - payload.update(kwargs) - url = WATERDATA_URL + 'pmcodes/pmcodes' - response = query(url, payload) - return _read_rdb(response.text), _set_metadata(response, **kwargs) + payload = {'fmt':'rdb'} + url = PARAMCODES_URL + + if isinstance(parameterCd, str): # when a single code or name is given + if parameterCd.lower() == "all": + payload.update({'group_cd': '%'}) + url = ALLPARAMCODES_URL + response = query(url, payload) + return _read_rdb(response.text), _set_metadata(response) + + else: + parameterCd = [parameterCd] + + if not isinstance(parameterCd, list): + raise TypeError('Parameter information (code or name) must be type string or list') + + # Querying with a list of parameters names, codes, or mixed + l = [] + for param in parameterCd: + if isinstance(param, str): + if partial: + param ='%{0}%'.format(param) + payload.update({'parm_nm_cd':param}) + response = query(url, payload) + if len(response.text.splitlines()) < 10: # empty query + raise TypeError('One of the parameter codes or names entered does not return any information,'\ + ' please try a different value') + l.append(_read_rdb(response.text)) + else: + raise TypeError('Parameter information (code or name) must be type string') + return pd.concat(l), _set_metadata(response) def get_water_use(years="ALL", state=None, counties="ALL", categories="ALL"): @@ -625,7 +654,7 @@ def get_record(sites=None, start=None, end=None, state=None, raise TypeError('{} service not yet implemented'.format(service)) -def _read_json(json, multi_index=False): +def _read_json(json): """ Reads a NWIS Water Services formatted JSON into a DataFrame. @@ -690,7 +719,6 @@ def _read_json(json, multi_index=False): merged_df = update_merge(merged_df, record_df, na_only=True, on=['site_no', 'datetime']) - merged_df = format_response(merged_df) return merged_df @@ -714,7 +742,7 @@ def _read_rdb(rdb): fields = re.split("[\t]", rdb.splitlines()[count]) fields = [field.replace(",", "") for field in fields] - dtypes = {'site_no': str, 'dec_long_va': float, 'dec_lat_va': float} + dtypes = {'site_no': str, 'dec_long_va': float, 'dec_lat_va': float, 'parm_cd': str, 'parameter_cd':str} df = pd.read_csv(StringIO(rdb), delimiter='\t', skiprows=count + 2, names=fields, na_values='NaN', dtype=dtypes) @@ -741,7 +769,7 @@ def _set_metadata(response, **parameters): if 'parameterCd' in parameters: md.variable_info = lambda: get_pmcodes(parameterCd=parameters['parameterCd']) - + comments = "" for line in response.text.splitlines(): if line.startswith("#"): diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index a7b447f..82676b0 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -54,9 +54,9 @@ def format_datetime(df, date_field, time_field, tz_field): #create a datetime index from the columns in qwdata response df[tz_field] = df[tz_field].map(tz) - df['datetime'] = pd.to_datetime(df.pop(date_field) + ' ' + - df.pop(time_field) + ' ' + - df.pop(tz_field), + df['datetime'] = pd.to_datetime(df[date_field] + ' ' + + df[time_field] + ' ' + + df[tz_field], format = '%Y-%m-%d %H:%M', utc=True) diff --git a/tests/data/waterdata_pmcodes.txt b/tests/data/waterdata_pmcodes.txt index a2728a1..2dce370 100755 --- a/tests/data/waterdata_pmcodes.txt +++ b/tests/data/waterdata_pmcodes.txt @@ -1,18 +1,10 @@ -# -# -# USGS Parameter Code Definitions File -# -# Column Definitions -# parameter_cd - Parameter Code -# parameter_group_nm - Group Name -# casrn - CASRN -# srsname - SRSName -# parameter_units - Parameter Unit -# parameter_nm - Parameter Name/Description -# -# -# Date Retrieved: 2020-04-16 18:48:43 EDT -# -parameter_cd parameter_group_nm casrn srsname parameter_units parameter_nm -5s 30s 170s 170s 16s 170s -00618 Nutrient Nitrate, water, filtered, milligrams per liter as nitrogen 14797-55-8 Nitrate mg/l as N +# +# National Water Information System +# 2022/06/08 +# +# +# Date Retrieved: USGS Water Data for the Nation Help System +# +parameter_cd group parm_nm epa_equivalence result_statistical_basis result_time_basis result_weight_basis result_particle_size_basis result_sample_fraction result_temperature_basis CASRN SRSName parm_unit +5s 8s 58s 5s 0s 0s 0s 0s 9s 0s 10s 7s 9s +00618 Nutrient Nitrate, water, filtered, milligrams per liter as nitrogen Agree Dissolved 14797-55-8 Nitrate mg/l as N diff --git a/tests/waterservices_test.py b/tests/waterservices_test.py index 13777b8..5b23a43 100755 --- a/tests/waterservices_test.py +++ b/tests/waterservices_test.py @@ -106,7 +106,7 @@ def test_get_info(requests_mock): assert type(df) is DataFrame assert df.size == 24 assert md.url == request_url - assert_metadata(requests_mock, request_url, md, site, parameter_cd, format) + assert_metadata(requests_mock, request_url, md, site, [parameter_cd], format) def test_get_qwdata(requests_mock): @@ -121,7 +121,7 @@ def test_get_qwdata(requests_mock): mock_request(requests_mock, request_url, response_file_path) df, md = get_qwdata(sites=["01491000", "01645000"]) assert type(df) is DataFrame - assert df.size == 1650709 + assert df.size == 1821472 assert_metadata(requests_mock, request_url, md, site, None, format) @@ -135,7 +135,7 @@ def test_get_gwlevels(requests_mock): mock_request(requests_mock, request_url, response_file_path) df, md = get_gwlevels(sites=[site]) assert type(df) is DataFrame - assert df.size == 13 + assert df.size == 16 assert_metadata(requests_mock, request_url, md, site, None, format) @@ -172,14 +172,12 @@ def test_get_pmcodes(requests_mock): """Tests get_discharge_measurements method correctly generates the request url and returns the result in a DataFrame""" format = "rdb" - request_url = 'https://nwis.waterdata.usgs.gov/nwis/pmcodes/pmcodes?radio_pm_search=pm_search' \ - '&pm_group=All%2B--%2Binclude%2Ball%2Bparameter%2Bgroups&pm_search=00618' \ - '&show=parameter_group_nm%2Ccasrn%2Csrsname%2Cparameter_units%2Cparameter_nm&format={}'.format(format) + request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%2500618%25" response_file_path = 'data/waterdata_pmcodes.txt' mock_request(requests_mock, request_url, response_file_path) df, md = get_pmcodes(parameterCd='00618') assert type(df) is DataFrame - assert df.size == 6 + assert df.size == 13 assert_metadata(requests_mock, request_url, md, None, None, format) @@ -236,17 +234,18 @@ def test_what_sites(requests_mock): """Tests what_sites method correctly generates the request url and returns the result in a DataFrame""" format = "rdb" parameter_cd = '00010%2C00060' + parameter_cd_list = ["00010","00060"] request_url = "https://waterservices.usgs.gov/nwis/site?bBox=-83.0%2C36.5%2C-81.0%2C38.5" \ "¶meterCd={}&hasDataTypeCd=dv&format={}".format(parameter_cd, format) response_file_path = 'data/nwis_sites.txt' mock_request(requests_mock, request_url, response_file_path) df, md = what_sites(bBox=[-83.0,36.5,-81.0,38.5], - parameterCd=["00010","00060"], + parameterCd=parameter_cd_list, hasDataTypeCd="dv") assert type(df) is DataFrame assert df.size == 2472 - assert_metadata(requests_mock, request_url, md, None, parameter_cd, format) + assert_metadata(requests_mock, request_url, md, None, parameter_cd_list, format) def test_get_stats(requests_mock): @@ -280,17 +279,17 @@ def assert_metadata(requests_mock, request_url, md, site, parameter_cd, format): site_info, _ = md.site_info() assert type(site_info) is DataFrame if parameter_cd is None: - assert md.variable_info is None + assert md.variable_info is None else: - pcode_request_url = "https://nwis.waterdata.usgs.gov/nwis/pmcodes/pmcodes?radio_pm_search=pm_search" \ - "&pm_group=All%2B--%2Binclude%2Ball%2Bparameter%2Bgroups&pm_search={}" \ - "&show=parameter_group_nm%2Ccasrn%2Csrsname%2Cparameter_units%2Cparameter_nm" \ - "&format=rdb".format(parameter_cd) - with open('data/waterdata_pmcodes.txt') as text: - requests_mock.get(pcode_request_url, text=text.read()) + for param in parameter_cd: + pcode_request_url = "https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?fmt=rdb&parm_nm_cd=%25{}%25".format(param) + with open('data/waterdata_pmcodes.txt') as text: + requests_mock.get(pcode_request_url, text=text.read()) variable_info, _ = md.variable_info() assert type(variable_info) is DataFrame + if format == "rdb": assert md.comment is not None else: assert md.comment is None +