From 0d9c45c3e0a7074d674386d66681d27d33acb9c2 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Wed, 26 Jun 2019 23:05:34 +0200 Subject: [PATCH 01/11] First draft of odata4 implementation. --- README.rst | 30 ++- cbsodata4.py | 463 ++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + docs/reference_v4.rst | 5 + setup.py | 2 +- tests/test_cbsodata4.py | 54 +++++ 6 files changed, 551 insertions(+), 4 deletions(-) create mode 100644 cbsodata4.py create mode 100644 docs/reference_v4.rst create mode 100644 tests/test_cbsodata4.py diff --git a/README.rst b/README.rst index 1438fe0..921b021 100644 --- a/README.rst +++ b/README.rst @@ -43,9 +43,9 @@ Load the package with Tables ~~~~~~ -Statistics Netherlands (CBS) has a large amount of public available +Statistics Netherlands (CBS) has a large amount of publicly available data tables (more than 4000 at the moment of writing). Each table is -identified by a unique identifier (``Identifier``). +identified by a unique identifier (``Identifier``). .. code:: python @@ -93,7 +93,7 @@ Data ~~~~ The function you are looking for!! The function ``get_data`` returns a list of -dicts with the table data. +dictionaries with the table data. .. code:: python @@ -182,3 +182,27 @@ The list of tables can be turned into a pandas DataFrame as well. >>> tables = pandas.DataFrame(cbsodata.get_table_list()) >>> tables.head() + + +OData Version 4 +--------------- + +CBS migrates from `OData version 3`_ to `OData version 4`_. This migration comes +with a lot of other changes. Read about the changes at the website of CBS https://beta.opendata.cbs.nl/OData4/index.html. + +More documentation on this page will follow when the new API is officially released (it's now in beta) + +.. _`OData version 3`: https://www.odata.org/documentation/odata-version-3-0/ +.. _`OData version 4`: https://www.odata.org/documentation/ + +.. 
code:: python + + import cbsodata4 as cbsodata + + obs = cbsodata.get_data("84120NED") + +Transform data into a pivot table with Pandas. + +.. code:: python + + df.pivot(index='PeriodenTitle', columns='BelastingenEnWettelijkePremiesTitle', values='Value') diff --git a/cbsodata4.py b/cbsodata4.py new file mode 100644 index 0000000..99f02df --- /dev/null +++ b/cbsodata4.py @@ -0,0 +1,463 @@ +# Copyright (c) 2019 Jonathan de Bruin + +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+ +"""Statistics Netherlands opendata version 4 API client for Python""" + +__all__ = [ + 'options', + 'get_data', + 'get_dataset', + 'get_dataset_info', + 'get_dataset_list', + 'get_catalog_info', + 'get_catalog_list', + 'get_metadata', + 'get_observations'] + +import os +import json +import copy +import logging +from contextlib import contextmanager + +import requests +from requests import Session, Request + + +class OptionsManager(object): + """Class for option management""" + + def __init__(self): + + # url of cbs odata4 service + self.odata_url = "http://beta.opendata.cbs.nl/OData4" + self.catalog = "CBS" + self.odata_version = "4" + + # Enable in next version + # self.catalog_url = "opendata.cbs.nl" + + def __repr__(self): + return self.__str__() + + def __str__(self): + return "odata_url = {}, catalog = {}, odata_version = {}".format( + self.odata_url, self.catalog, self.api_version) + + def __getitem__(self, arg): + return getattr(self, arg) + + def __setitem__(self, arg, value): + setattr(self, arg, value) + + def _log_setting_change(self, setting_name, old_value, new_value): + logging.info( + "Setting '{}' changed from '{}' to '{}'.".format( + setting_name, old_value, new_value) + ) + + def __getattr__(self, arg): + return getattr(self, arg) + + def __setattr__(self, arg, value): + try: + old_value = copy.copy(getattr(self, arg)) + except Exception: + old_value = "undefined" + + self._log_setting_change(arg, old_value, value) + super(OptionsManager, self).__setattr__(arg, value) + + +# User options +options = OptionsManager() + + +def _odata4_request(url, kind="EntitySet", **kwargs): + + try: + + s = Session() + p = Request('GET', url, params=kwargs).prepare() + + logging.info("Download " + p.url) + + r = s.send(p) + r.raise_for_status() + + except requests.HTTPError as http_err: + raise requests.HTTPError( + "Downloading metadata '{}' failed. 
{}".format( + p.url, str(http_err) + ) + ) + + res = r.json(encoding='utf-8') + + # check the data context + if kind == "Singleton": + del res["@odata.context"] + return res + elif kind == "EntitySet": + data = copy.copy(res['value']) + + if "@odata.nextLink" in res.keys(): + data_next = _odata4_request( + res['@odata.nextLink'], + **kwargs + ) + data.extend(data_next) + + return data + else: + raise ValueError("Unknown kind '{}'.".format(kind)) + + +def _filters(filter): + """Filter rows with a CBS-style query. + + Parameters + ---------- + filter : str + The rows to return. + + Returns + ------- + str + Filter parameter for URL + """ + + return filter + + +def _save_data(data, dir, metadata_name): + """Save the data.""" + + if not os.path.exists(dir): + os.makedirs(dir) + + fp = os.path.join(dir, metadata_name + '.json') + + with open(fp, 'w') as output_file: + json.dump(data, output_file, indent=2) + + +def _read_data(*args, **kwargs): + pass + + +def download_data(table_id, catalog=None): + + raise NotImplementedError + + +def get_metadata(dataset_id, catalog=None, filters=None): + """Get the metadata of the dataset. + + Parameters + ---------- + dataset_id : str + The identifier of the dataset. Find the identifier in the list + of datasets with get_dataset_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + catalog : str + The name of the catalog. Default options.catalog. Get a list + of catalogs with get_catalog_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + filters : str + Return only rows that agree on the filter. 
+ + Returns + ------- + list + A dictionary with the (meta)data of the table + """ + + catalog = options.catalog if catalog is None else catalog + + dataset_url = "{}/{}/{}/".format(options.odata_url, catalog, dataset_id) + dataset_odata_meta_list = _odata4_request(dataset_url) + + # https://beta.opendata.cbs.nl/OData4/CBS/83765NED/ + + metadata = {} + + for metadata_object in dataset_odata_meta_list: + + if metadata_object['name'].endswith("Groups") or \ + metadata_object['name'].endswith("Codes") or \ + metadata_object['name'] == "Dimensions" or \ + metadata_object['name'] == "Properties": + + metadata_url = dataset_url + metadata_object['url'] + metadata_table = _odata4_request( + metadata_url, + kind=metadata_object['kind'] + ) + metadata[metadata_object['name']] = metadata_table + + return metadata + + +def get_observations(table_id, catalog=None, filters=None): + """Get the observation of the dataset. + + Parameters + ---------- + dataset_id : str + The identifier of the dataset. Find the identifier in the list + of datasets with get_dataset_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + catalog : str + The name of the catalog. Default options.catalog. Get a list + of catalogs with get_catalog_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + filters : str + Return only rows that agree on the filter. + + Returns + ------- + list + A dictionary with the observations. + """ + catalog = options.catalog if catalog is None else catalog + + observations_url = "{}/{}/{}/Observations".format( + options.odata_url, + catalog, + table_id + ) + return _odata4_request(observations_url, kind="EntitySet") + + +def get_data(dataset_id, + catalog=None, + filters=None, + add_codes=True, + measure_vars=["Title", "Unit"], + measure_group_vars=["Title"]): + """Get the enriched observation of the dataset. + + Parameters + ---------- + dataset_id : str + The identifier of the dataset. 
Find the identifier in the list + of datasets with get_dataset_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + catalog : str + The name of the catalog. Default options.catalog. Get a list + of catalogs with get_catalog_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + filters : str + Return only rows that agree on the filter. + + Returns + ------- + list + A dictionary with the enriched observations. + """ + + observations = get_observations(dataset_id, catalog) + + if add_codes: + + # add codes + meta = get_metadata(dataset_id, + catalog=catalog, + filters=filters) + + def _lookup_dict(d, meta, key, drop_key=True): + r = dict(d, **meta.get(d[key], {})) + if drop_key: + del r[key] + return r + + # measures codes + if measure_group_vars: + _measure_vars = measure_vars + ["MeasureGroupID"] + else: + _measure_vars = measure_vars + + code_meta_dict = { + d["Identifier"]: {k: d[k] for k in _measure_vars} + for d in meta["MeasureCodes"] + } + observations = [ + _lookup_dict(d, code_meta_dict, "Measure", drop_key=False) + for d in observations + ] + + # measure groups + if "MeasureGroups" in meta.keys() and measure_group_vars: + group_meta_dict = {d["ID"]: { + "MeasureGroupTitle": d["Title"]} + for d in meta["MeasureGroups"]} + observations = [_lookup_dict(d, group_meta_dict, "MeasureGroupID") + for d in observations] + + # dimension codes + dimensions = [dim["Identifier"] for dim in meta['Dimensions']] + + for dim in dimensions: + + # codes + code_meta_dict = { + d["Identifier"]: { + dim + "Title": d["Title"], + dim + "GroupID": d["DimensionGroupID"]} + for d in meta[dim + "Codes"] + } + observations = [ + _lookup_dict(d, code_meta_dict, dim, drop_key=False) + for d in observations + ] + + # groups + meta_group_name = dim + "Groups" + + if meta_group_name in meta.keys(): + group_meta_dict = {d["ID"]: { + dim + "GroupTitle": d["Title"]} + for d in meta[meta_group_name]} + observations = [ + _lookup_dict(d, group_meta_dict, 
dim + "GroupID") + for d in observations + ] + + return observations + + +def get_dataset(*args, **kwargs): + return get_data(*args, **kwargs) + + +def get_catalog_list(): + """Get a list with the available catalogs. + + Returns + ------- + list + A list with the description of catalogs.""" + + catalog_url = "{}/Catalogs".format( + options.odata_url + ) + return _odata4_request(catalog_url) + + +def get_catalog_info(catalog): + """Get information on the catalog. + + Parameters + ---------- + catalog : str + The name of the catalog. Default ``options.catalog``. Get a + list of the available catalogs with get_catalog_list() or + navigate to https://beta.opendata.cbs.nl/OData4/index.html. + + Returns + ------- + dict + A dictionary with the description of the catalog. + """ + + catalog_url = "{}/Catalogs/{}".format( + options.odata_url, + catalog + ) + + return _odata4_request(catalog_url) + + +def get_dataset_list(catalog=None): + """Get a list with the available datasets. + + Parameters + ---------- + catalog : str + The name of the catalog. Default None. Get a + list of the available catalogs with get_catalog_list() or + navigate to https://beta.opendata.cbs.nl/OData4/index.html. + + Returns + ------- + list + A list with the description of datasets. + """ + + catalog = "" if catalog is None else catalog + catalog_url = "{}/{}/Datasets".format( + options.odata_url, + catalog + ) + return _odata4_request(catalog_url) + + +def get_dataset_info(dataset_id, catalog=None): + """Get information on the dataset. + + Parameters + ---------- + dataset_id : str + The identifier of the dataset. Find the identifier in the list + of datasets with get_dataset_list() or navigate to + https://beta.opendata.cbs.nl/OData4/index.html. + catalog : str + The name of the catalog. Default ``options.catalog``. Get a + list of the available catalogs with get_catalog_list() or + navigate to https://beta.opendata.cbs.nl/OData4/index.html. 
+ + Returns + ------- + dict + A dictionary with the description of the dataset. + """ + + catalog = options.catalog if catalog is None else catalog + + url = "{}/{}/{}/Properties".format( + options.odata_url, + catalog, + dataset_id + ) + + return _odata4_request(url) + + +@contextmanager +def catalog(catalog): + """Context manager for catalogs. + + Parameters + ---------- + catalog : str + The catalog. For example: 'CBS' or 'CBS-Maatwerk'. + + """ + + old = copy.copy(options.catalog) + options.catalog = catalog + + yield + + options.catalog = old diff --git a/docs/index.rst b/docs/index.rst index c275acd..5e01ded 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,4 +8,5 @@ Contents: readme_link reference + reference_v4 diff --git a/docs/reference_v4.rst b/docs/reference_v4.rst new file mode 100644 index 0000000..396959f --- /dev/null +++ b/docs/reference_v4.rst @@ -0,0 +1,5 @@ +Reference Odata4 +================ + +.. automodule:: cbsodata4 + :members: \ No newline at end of file diff --git a/setup.py b/setup.py index d20a6a3..6a2e791 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ ], keywords='cbs statistics odata netherlands dutch', install_requires=['requests'], - py_modules=['cbsodata'], + py_modules=['cbsodata', 'cbsodata4'], tests_require=[ 'pytest' ], diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py new file mode 100644 index 0000000..0350cae --- /dev/null +++ b/tests/test_cbsodata4.py @@ -0,0 +1,54 @@ +import os +import shutil + +import requests + +import cbsodata4 as cbsodata +# testing deps +import pytest + + +datasets = [ + '82245NED' +] + + +TEST_ENV = 'test_env' + + +def setup_module(module): + print('\nsetup_module()') + + if not os.path.exists(TEST_ENV): + os.makedirs(TEST_ENV) + + +def teardown_module(module): + print('teardown_module()') + + shutil.rmtree(TEST_ENV) + + +@pytest.mark.parametrize("table_id", datasets) +def test_observations(dataset_id): + + x = cbsodata.get_observations(dataset_id) + + assert len(x) > 100 + 
+ +@pytest.mark.parametrize("dataset_id", datasets) +def test_metadata(dataset_id): + + x = cbsodata.get_metadata(dataset_id) + + assert "MeasurementGroups" in x.keys() + + +@pytest.mark.parametrize("dataset_id", datasets) +def test_info(dataset_id): + + # testing + info = cbsodata.get_dataset_info(dataset_id) + + assert isinstance(info, dict) From e76e433c71aa840a60a713e9ff92f522765f6f81 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 27 Jun 2019 00:27:19 +0200 Subject: [PATCH 02/11] Add support for observation filters --- cbsodata4.py | 42 +++++++++++++++++++++++++---------------- tests/test_cbsodata4.py | 8 ++++++++ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/cbsodata4.py b/cbsodata4.py index 99f02df..acbcb04 100644 --- a/cbsodata4.py +++ b/cbsodata4.py @@ -93,12 +93,12 @@ def __setattr__(self, arg, value): options = OptionsManager() -def _odata4_request(url, kind="EntitySet", **kwargs): +def _odata4_request(url, kind="EntitySet", params={}): try: s = Session() - p = Request('GET', url, params=kwargs).prepare() + p = Request('GET', url, params=params).prepare() logging.info("Download " + p.url) @@ -124,7 +124,8 @@ def _odata4_request(url, kind="EntitySet", **kwargs): if "@odata.nextLink" in res.keys(): data_next = _odata4_request( res['@odata.nextLink'], - **kwargs + kind=kind, + params=params ) data.extend(data_next) @@ -133,7 +134,7 @@ def _odata4_request(url, kind="EntitySet", **kwargs): raise ValueError("Unknown kind '{}'.".format(kind)) -def _filters(filter): +def _filter(filter): """Filter rows with a CBS-style query. Parameters @@ -171,7 +172,7 @@ def download_data(table_id, catalog=None): raise NotImplementedError -def get_metadata(dataset_id, catalog=None, filters=None): +def get_metadata(dataset_id, catalog=None): """Get the metadata of the dataset. Parameters @@ -184,8 +185,6 @@ def get_metadata(dataset_id, catalog=None, filters=None): The name of the catalog. Default options.catalog. 
Get a list of catalogs with get_catalog_list() or navigate to https://beta.opendata.cbs.nl/OData4/index.html. - filters : str - Return only rows that agree on the filter. Returns ------- @@ -219,7 +218,7 @@ def get_metadata(dataset_id, catalog=None, filters=None): return metadata -def get_observations(table_id, catalog=None, filters=None): +def get_observations(table_id, catalog=None, filter=None): """Get the observation of the dataset. Parameters @@ -232,7 +231,7 @@ def get_observations(table_id, catalog=None, filters=None): The name of the catalog. Default options.catalog. Get a list of catalogs with get_catalog_list() or navigate to https://beta.opendata.cbs.nl/OData4/index.html. - filters : str + filter : str Return only rows that agree on the filter. Returns @@ -247,12 +246,18 @@ def get_observations(table_id, catalog=None, filters=None): catalog, table_id ) - return _odata4_request(observations_url, kind="EntitySet") + payload = {"$filter": filter} if filter else {} + + return _odata4_request( + observations_url, + kind="EntitySet", + params=payload + ) def get_data(dataset_id, catalog=None, - filters=None, + filter=None, add_codes=True, measure_vars=["Title", "Unit"], measure_group_vars=["Title"]): @@ -268,8 +273,10 @@ def get_data(dataset_id, The name of the catalog. Default options.catalog. Get a list of catalogs with get_catalog_list() or navigate to https://beta.opendata.cbs.nl/OData4/index.html. - filters : str - Return only rows that agree on the filter. + filter : str + Filter observations. See + https://beta.opendata.cbs.nl/OData4/implement.html for filter. + At the moment, it is only possible to filter on observations. Returns ------- @@ -277,14 +284,17 @@ def get_data(dataset_id, A dictionary with the enriched observations. 
""" - observations = get_observations(dataset_id, catalog) + observations = get_observations( + dataset_id, + catalog, + filter=filter + ) if add_codes: # add codes meta = get_metadata(dataset_id, - catalog=catalog, - filters=filters) + catalog=catalog) def _lookup_dict(d, meta, key, drop_key=True): r = dict(d, **meta.get(d[key], {})) diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index 0350cae..9cb201c 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -52,3 +52,11 @@ def test_info(dataset_id): info = cbsodata.get_dataset_info(dataset_id) assert isinstance(info, dict) + + +@pytest.mark.parametrize("dataset_id", datasets) +def test_filter(dataset_id): + + x = cbsodata.get_dataset(dataset_id, filter="Id ge 1 and Id lt 10") + + assert len(x) == 9 From 1ef7d16ae3803ca507107861a7aab6a3cc64dd73 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 27 Jun 2019 00:36:32 +0200 Subject: [PATCH 03/11] Fix singleton results for info functions --- cbsodata4.py | 4 ++-- tests/test_cbsodata4.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cbsodata4.py b/cbsodata4.py index acbcb04..7a88a9b 100644 --- a/cbsodata4.py +++ b/cbsodata4.py @@ -396,7 +396,7 @@ def get_catalog_info(catalog): catalog ) - return _odata4_request(catalog_url) + return _odata4_request(catalog_url, kind="Singleton") def get_dataset_list(catalog=None): @@ -451,7 +451,7 @@ def get_dataset_info(dataset_id, catalog=None): dataset_id ) - return _odata4_request(url) + return _odata4_request(url, kind="Singleton") @contextmanager diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index 9cb201c..f28efc6 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -46,7 +46,7 @@ def test_metadata(dataset_id): @pytest.mark.parametrize("dataset_id", datasets) -def test_info(dataset_id): +def test_dataset_info(dataset_id): # testing info = cbsodata.get_dataset_info(dataset_id) @@ -54,6 +54,15 @@ def test_info(dataset_id): 
assert isinstance(info, dict) +@pytest.mark.parametrize("dataset_id", datasets) +def test_catalog_info(dataset_id): + + # testing + info = cbsodata.get_catalog_info(dataset_id) + + assert isinstance(info, dict) + + @pytest.mark.parametrize("dataset_id", datasets) def test_filter(dataset_id): From d362224691e06bd7b5133151ed482f18a802a088 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 27 Jun 2019 01:01:29 +0200 Subject: [PATCH 04/11] Add top and skip options --- cbsodata4.py | 24 +++++++++++++++++++++--- tests/test_cbsodata4.py | 11 ++++++++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/cbsodata4.py b/cbsodata4.py index 7a88a9b..da48cc0 100644 --- a/cbsodata4.py +++ b/cbsodata4.py @@ -218,7 +218,8 @@ def get_metadata(dataset_id, catalog=None): return metadata -def get_observations(table_id, catalog=None, filter=None): +def get_observations(table_id, catalog=None, filter=None, + top=None, skip=None): """Get the observation of the dataset. Parameters @@ -233,6 +234,10 @@ def get_observations(table_id, catalog=None, filter=None): https://beta.opendata.cbs.nl/OData4/index.html. filter : str Return only rows that agree on the filter. + top : int + Return the top x observations. Default returns all. + skip : int + Skip the top x observations. Default 0. Returns ------- @@ -248,6 +253,11 @@ def get_observations(table_id, catalog=None, filter=None): ) payload = {"$filter": filter} if filter else {} + if top is not None: + payload["$top"] = top + if skip is not None: + payload["$skip"] = skip + return _odata4_request( observations_url, kind="EntitySet", @@ -260,7 +270,9 @@ def get_data(dataset_id, filter=None, add_codes=True, measure_vars=["Title", "Unit"], - measure_group_vars=["Title"]): + measure_group_vars=["Title"], + top=None, + skip=None): """Get the enriched observation of the dataset. Parameters @@ -277,6 +289,10 @@ def get_data(dataset_id, Filter observations. See https://beta.opendata.cbs.nl/OData4/implement.html for filter. 
At the moment, it is only possible to filter on observations. + top : int + Return the top x observations. Default returns all. + skip : int + Skip the top x observations. Default 0. Returns ------- @@ -287,7 +303,9 @@ def get_data(dataset_id, observations = get_observations( dataset_id, catalog, - filter=filter + filter=filter, + top=top, + skip=skip ) if add_codes: diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index f28efc6..19c8fae 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -29,7 +29,7 @@ def teardown_module(module): shutil.rmtree(TEST_ENV) -@pytest.mark.parametrize("table_id", datasets) +@pytest.mark.parametrize("dataset_id", datasets) def test_observations(dataset_id): x = cbsodata.get_observations(dataset_id) @@ -69,3 +69,12 @@ def test_filter(dataset_id): x = cbsodata.get_dataset(dataset_id, filter="Id ge 1 and Id lt 10") assert len(x) == 9 + + +@pytest.mark.parametrize("dataset_id", datasets) +def test_top_skip(dataset_id): + + x = cbsodata.get_dataset(dataset_id, top=10, skip=5) + + assert len(x) == 10 + assert x[0]["Id"] == 5 From 3c78d67d34885e10e53ea7b7d59b224de17c1edf Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Tue, 23 Jul 2019 00:37:56 +0200 Subject: [PATCH 05/11] Extend measure and dimension handling --- cbsodata4.py | 205 +++++++++++++++++++++++++++++----------- tests/test_cbsodata4.py | 109 +++++++++++++++++++-- 2 files changed, 252 insertions(+), 62 deletions(-) diff --git a/cbsodata4.py b/cbsodata4.py index da48cc0..5ae76ab 100644 --- a/cbsodata4.py +++ b/cbsodata4.py @@ -34,10 +34,11 @@ 'get_metadata', 'get_observations'] -import os -import json import copy +import json import logging +import os +import re from contextlib import contextmanager import requests @@ -106,12 +107,12 @@ def _odata4_request(url, kind="EntitySet", params={}): r.raise_for_status() except requests.HTTPError as http_err: - raise requests.HTTPError( - "Downloading metadata '{}' failed. 
{}".format( - p.url, str(http_err) - ) + http_err.message = "Downloading metadata '{}' failed. {}".format( + p.url, str(http_err) ) + raise http_err + res = r.json(encoding='utf-8') # check the data context @@ -268,9 +269,14 @@ def get_observations(table_id, catalog=None, filter=None, def get_data(dataset_id, catalog=None, filter=None, - add_codes=True, measure_vars=["Title", "Unit"], + include_measure_code_id=True, measure_group_vars=["Title"], + include_measure_group_id=True, + dimension_vars=["Title"], + include_dimension_code_id=True, + dimension_group_vars=["Title"], + include_dimension_group_id=False, top=None, skip=None): """Get the enriched observation of the dataset. @@ -289,6 +295,30 @@ def get_data(dataset_id, Filter observations. See https://beta.opendata.cbs.nl/OData4/implement.html for filter. At the moment, it is only possible to filter on observations. + measure_vars : list + A list of labels and variables to include for each measure + code. Examples are "Title", "Description", "DataType", + "Unit", "Format","Decimals","PresentationType". + Default ["Title", "Unit"] + measure_group_vars : list + A list of labels and variables to include for each measure + group. Examples are "Title", "Description" and "ParentID" + Default ["Title"] + include_measure_group_id : bool + Include the Identifier of the Measure Group. Default True. + dimension_vars : list + A list of labels and variables to include for each dimension + code. Examples are "Title", "Description", "DataType", + "Unit", "Format","Decimals","PresentationType". + Default ["Title", "Unit"] + dimension_group_vars : list + A list of labels and variables to include for each dimension + group. Examples are "Title", "Description" and "ParentID" + Default ["Title"] + include_dimension_group_id : bool + Include the Identifier of the Dimension Group. Default False. + (The default of this option is False because the Group ID + has no added value.) top : int Return the top x observations. 
Default returns all. skip : int @@ -308,69 +338,110 @@ def get_data(dataset_id, skip=skip ) - if add_codes: + # add codes + meta = get_metadata(dataset_id, + catalog=catalog) - # add codes - meta = get_metadata(dataset_id, - catalog=catalog) + def _lookup_dict(d, meta, key, drop_key=True): + r = dict(d, **meta.get(d[key], {})) + if drop_key: + del r[key] + return r - def _lookup_dict(d, meta, key, drop_key=True): - r = dict(d, **meta.get(d[key], {})) - if drop_key: - del r[key] - return r + if measure_vars or measure_group_vars: - # measures codes - if measure_group_vars: - _measure_vars = measure_vars + ["MeasureGroupID"] - else: - _measure_vars = measure_vars + # transform measure codes into key-value pairs + code_meas_meta_dict = {} + for d in meta["MeasureCodes"]: # loop over all meta records + + # include all measure_vars with the name "Measure" + # as a prefix + temp_meas_dict = { + "Measure" + k: d[k] for k in measure_vars + } + + # if there are group variables to include, we need the + # MeasureGroupID. 
+ if measure_group_vars: + temp_meas_dict["MeasureGroupID"] = d["MeasureGroupID"] + + # update the dict + code_meas_meta_dict[d["Identifier"]] = temp_meas_dict - code_meta_dict = { - d["Identifier"]: {k: d[k] for k in _measure_vars} - for d in meta["MeasureCodes"] - } observations = [ - _lookup_dict(d, code_meta_dict, "Measure", drop_key=False) + _lookup_dict(d, code_meas_meta_dict, "Measure", + drop_key=not include_measure_code_id) for d in observations ] # measure groups if "MeasureGroups" in meta.keys() and measure_group_vars: - group_meta_dict = {d["ID"]: { - "MeasureGroupTitle": d["Title"]} + group_meta_dict = { + d["ID"]: {"MeasureGroup" + k: d[k] for k in measure_group_vars} for d in meta["MeasureGroups"]} - observations = [_lookup_dict(d, group_meta_dict, "MeasureGroupID") - for d in observations] - - # dimension codes + observations = [ + _lookup_dict( + d, + group_meta_dict, + "MeasureGroupID", + drop_key=not include_measure_group_id + ) + for d in observations] + + # dimension codes + if dimension_vars or dimension_group_vars: + + # get a list of the dimension names dimensions = [dim["Identifier"] for dim in meta['Dimensions']] + # add code and group info for each dimension for dim in dimensions: - # codes - code_meta_dict = { - d["Identifier"]: { - dim + "Title": d["Title"], - dim + "GroupID": d["DimensionGroupID"]} - for d in meta[dim + "Codes"] - } + # transform codes into key-value pairs + code_dim_meta_dict = {} + for d in meta[dim + "Codes"]: # loop over all meta records + + # include all dimension_vars with the name of the dimension + # as a prefix + temp_dim_dict = { + dim + k: d[k] for k in dimension_vars + } + + # if there are group variables to include, we need the GroupID. 
+ if dimension_group_vars: + temp_dim_dict[dim + "GroupID"] = d["DimensionGroupID"] + + code_dim_meta_dict[d["Identifier"]] = temp_dim_dict + + # Update the observations observations = [ - _lookup_dict(d, code_meta_dict, dim, drop_key=False) + _lookup_dict(d, code_dim_meta_dict, key=dim, + drop_key=not include_dimension_code_id) for d in observations ] - # groups - meta_group_name = dim + "Groups" - - if meta_group_name in meta.keys(): - group_meta_dict = {d["ID"]: { - dim + "GroupTitle": d["Title"]} - for d in meta[meta_group_name]} - observations = [ - _lookup_dict(d, group_meta_dict, dim + "GroupID") - for d in observations - ] + # append dimension group vars. + if dimension_group_vars: + + # groups + meta_group_name = dim + "Groups" + + if meta_group_name in meta.keys(): + group_meta_dict = { + d["ID"]: { + dim + "Group" + k: d[k] + for k in dimension_group_vars + } + for d in meta[meta_group_name]} + observations = [ + _lookup_dict( + d, + group_meta_dict, + dim + "GroupID", + drop_key=not include_dimension_group_id + ) + for d in observations + ] return observations @@ -409,12 +480,34 @@ def get_catalog_info(catalog): A dictionary with the description of the catalog. 
""" - catalog_url = "{}/Catalogs/{}".format( - options.odata_url, - catalog - ) + try: + catalog_url = "{}/Catalogs/{}".format( + options.odata_url, + catalog + ) + + return _odata4_request(catalog_url, kind="Singleton") + + except requests.HTTPError as err: + + # check if catalog is a dataset to get more informative error + pattern = re.compile(r"\d{5,6}[A-Z]{3}") + catalog_is_dataset = pattern.match(catalog) - return _odata4_request(catalog_url, kind="Singleton") + if catalog_is_dataset: + raise ValueError( + "Catalog '{}' seems to be a dataset identifier.".format( + catalog + ) + ) + elif err.response.status_code == 404: + raise ValueError( + "Catalog '{}' not found.".format( + catalog + ) + ) + else: + raise err def get_dataset_list(catalog=None): diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index 19c8fae..07cf1d7 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -12,6 +12,10 @@ '82245NED' ] +catalogs = [ + 'CBS', + 'CBS-Maatwerk' +] TEST_ENV = 'test_env' @@ -42,7 +46,7 @@ def test_metadata(dataset_id): x = cbsodata.get_metadata(dataset_id) - assert "MeasurementGroups" in x.keys() + assert "MeasureGroups" in x.keys() @pytest.mark.parametrize("dataset_id", datasets) @@ -54,17 +58,23 @@ def test_dataset_info(dataset_id): assert isinstance(info, dict) -@pytest.mark.parametrize("dataset_id", datasets) -def test_catalog_info(dataset_id): +@pytest.mark.parametrize("catalog_id", catalogs) +def test_catalog_info(catalog_id): # testing - info = cbsodata.get_catalog_info(dataset_id) + info = cbsodata.get_catalog_info(catalog_id) assert isinstance(info, dict) +def test_catalog_info_error(): + + with pytest.raises(ValueError): + cbsodata.get_catalog_info("CBS-UNKNOWN") + + @pytest.mark.parametrize("dataset_id", datasets) -def test_filter(dataset_id): +def test_dataset_filter(dataset_id): x = cbsodata.get_dataset(dataset_id, filter="Id ge 1 and Id lt 10") @@ -72,9 +82,96 @@ def test_filter(dataset_id): 
@pytest.mark.parametrize("dataset_id", datasets) -def test_top_skip(dataset_id): +def test_dataset_top_skip(dataset_id): x = cbsodata.get_dataset(dataset_id, top=10, skip=5) assert len(x) == 10 assert x[0]["Id"] == 5 + + +def test_dataset_measure_vars(): + + dataset_id = '83487NED' + + # measure codes + x = cbsodata.get_dataset( + dataset_id, + measure_vars=["Description"], + top=10 + ) + + assert "MeasureDescription" in x[0].keys() + assert "MeasureTitle" not in x[0].keys() + assert "Measure" in x[0].keys() + + x = cbsodata.get_dataset( + dataset_id, + include_measure_code_id=False, + top=10 + ) + + assert "Measure" not in x[0].keys() + + x = cbsodata.get_dataset( + dataset_id, + measure_group_vars=["Description"], + include_measure_group_id=False, + top=10 + ) + + assert "MeasureGroupID" not in x[0].keys() + assert "MeasureGroupDescription" in x[0].keys() + assert "MeasureGroupTitle" not in x[0].keys() + + x = cbsodata.get_dataset( + dataset_id, + include_measure_group_id=True, + top=10 + ) + + assert "MeasureGroupID" in x[0].keys() + + +def test_dataset_dimension_vars(): + + dataset_id = '83487NED' + + # measure codes + x = cbsodata.get_dataset( + dataset_id, + dimension_vars=["Description"], + dimension_group_vars=[], + top=10 + ) + + assert "WijkenEnBuurtenDescription" in x[0].keys() + assert "WijkenEnBuurtenTitle" not in x[0].keys() + assert "WijkenEnBuurten" in x[0].keys() + + x = cbsodata.get_dataset( + dataset_id, + include_dimension_code_id=False, + top=10 + ) + + assert "WijkenEnBuurten" not in x[0].keys() + + x = cbsodata.get_dataset( + dataset_id, + dimension_group_vars=["Description"], + include_dimension_group_id=False, + top=10 + ) + + assert "WijkenEnBuurtenGroupID" not in x[0].keys() + assert "WijkenEnBuurtenGroupDescription" in x[0].keys() + assert "WijkenEnBuurtenGroupTitle" not in x[0].keys() + + x = cbsodata.get_dataset( + dataset_id, + include_dimension_group_id=True, + top=10 + ) + + assert "WijkenEnBuurtenGroupID" in x[0].keys() From 
d9436791a976ca1337d5b9f599d7d2a0ccb2d600 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Wed, 24 Jul 2019 12:22:15 +0200 Subject: [PATCH 06/11] Better docstrings, small fixes for rare situations, add tests --- cbsodata4.py | 77 +++++++++++++++++++++++++++++++++++++---- tests/test_cbsodata4.py | 27 +++++++++++++++ 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/cbsodata4.py b/cbsodata4.py index 5ae76ab..d21a56e 100644 --- a/cbsodata4.py +++ b/cbsodata4.py @@ -279,7 +279,15 @@ def get_data(dataset_id, include_dimension_group_id=False, top=None, skip=None): - """Get the enriched observation of the dataset. + """Get the enriched observations of the requested dataset. + + This function can be used to retrieve observation/data from the + API. Each observation can be enriched with metadata of the + measures and dimensions. + + The returned data is in a long format. Each row contains a + single observation. One can transform the data into a wide format + with Python tools like pandas. Parameters ---------- @@ -293,24 +301,29 @@ def get_data(dataset_id, https://beta.opendata.cbs.nl/OData4/index.html. filter : str Filter observations. See - https://beta.opendata.cbs.nl/OData4/implement.html for filter. - At the moment, it is only possible to filter on observations. + https://beta.opendata.cbs.nl/OData4/implement.html for filter + ideas. At the moment, it is only possible to filter on + observations. measure_vars : list A list of labels and variables to include for each measure code. Examples are "Title", "Description", "DataType", - "Unit", "Format","Decimals","PresentationType". + "Unit", "Format", "Decimals", "PresentationType". Default ["Title", "Unit"] + include_measure_code_id : bool + Include the Identifier of the Measure Code. Default True. measure_group_vars : list A list of labels and variables to include for each measure - group. Examples are "Title", "Description" and "ParentID" + group. Examples are "Title", "Description" and "ParentID". 
Default ["Title"] include_measure_group_id : bool Include the Identifier of the Measure Group. Default True. dimension_vars : list A list of labels and variables to include for each dimension code. Examples are "Title", "Description", "DataType", - "Unit", "Format","Decimals","PresentationType". + "Unit", "Format", "Decimals", "PresentationType". Default ["Title", "Unit"] + include_dimension_code_id : bool + Include the Identifier of the Dimension Code. Default True. dimension_group_vars : list A list of labels and variables to include for each dimension group. Examples are "Title", "Description" and "ParentID" @@ -328,6 +341,41 @@ def get_data(dataset_id, ------- list A dictionary with the enriched observations. + + Examples + -------- + + >>> cbsodata.get_data("81589NED", + ... measure_vars=["Title", "DataType"], + ... top=1) + [{'Id': 0, + 'Measure': 'M0000201', + 'ValueAttribute': 'None', + 'Value': 987345.0, + 'BedrijfstakkenBranchesSBI2008': 'T001081', + 'Perioden': '2007KW01', + 'MeasureTitle': 'Totaal bedrijven', + 'MeasureDataType': 'Long', + 'MeasureGroupID': None, + 'BedrijfstakkenBranchesSBI2008Title': 'A-U Alle economische ...', + 'BedrijfstakkenBranchesSBI2008GroupTitle': 'Totaal', + 'PeriodenTitle': '2007 1e kwartaal', + 'PeriodenGroupTitle': 'Kwartalen'}] + + >>> cbsodata.get_data("81589NED", + ... include_measure_code_id=False, + ... include_measure_group_id=False, + ... dimension_vars=["Title"], + ... dimension_group_vars=None, + ... include_dimension_code_id=False, + ... 
top=1) + [{'Id': 0, + 'ValueAttribute': 'None', + 'Value': 987345.0, + 'MeasureTitle': 'Totaal bedrijven', + 'MeasureUnit': 'aantal', + 'BedrijfstakkenBranchesSBI2008Title': 'A-U Alle economische acti...', + 'PeriodenTitle': '2007 1e kwartaal'}] """ observations = get_observations( @@ -348,6 +396,10 @@ def _lookup_dict(d, meta, key, drop_key=True): del r[key] return r + def _drop_key_value(d, key): + del d[key] + return d + if measure_vars or measure_group_vars: # transform measure codes into key-value pairs @@ -387,6 +439,9 @@ def _lookup_dict(d, meta, key, drop_key=True): drop_key=not include_measure_group_id ) for d in observations] + elif not (measure_vars or measure_group_vars) and \ + not include_measure_code_id: + observations = [_drop_key_value(d, "Measure") for d in observations] # dimension codes if dimension_vars or dimension_group_vars: @@ -443,6 +498,16 @@ def _lookup_dict(d, meta, key, drop_key=True): for d in observations ] + elif not (dimension_vars or dimension_group_vars) and \ + not include_dimension_code_id: + + # get a list of the dimension names + dimensions = [dim["Identifier"] for dim in meta['Dimensions']] + + # add code and group info for each dimension + for dim in dimensions: + observations = [_drop_key_value(d, dim) for d in observations] + return observations diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index 07cf1d7..dcfc0d3 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -133,6 +133,19 @@ def test_dataset_measure_vars(): assert "MeasureGroupID" in x[0].keys() +def test_dataset_drop_measure_id(): + + x = cbsodata.get_data( + "81589NED", + measure_vars=None, + measure_group_vars=None, + include_measure_code_id=False, + top=1 + ) + + assert "Measure" not in x[0].keys() + + def test_dataset_dimension_vars(): dataset_id = '83487NED' @@ -175,3 +188,17 @@ def test_dataset_dimension_vars(): ) assert "WijkenEnBuurtenGroupID" in x[0].keys() + + +def test_dataset_drop_dimension_id(): + + x = 
cbsodata.get_data( + "81589NED", + dimension_vars=None, + dimension_group_vars=None, + include_dimension_code_id=False, + top=1 + ) + + assert "Perioden" not in x[0].keys() + assert "BedrijfstakkenBranchesSBI2008" not in x[0].keys() From 71d5aec1ff3c412710730ec3873b668fbb4649bf Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 25 Jul 2019 00:12:14 +0200 Subject: [PATCH 07/11] Fix broken target names --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 921b021..a311164 100644 --- a/README.rst +++ b/README.rst @@ -187,13 +187,13 @@ The list of tables can be turned into a pandas DataFrame as well. OData Version 4 --------------- -CBS migrates from `OData version 3`_ to `OData version 4`_. This migration comes +CBS migrates from `OData version 3`__ to `OData version 4`__. This migration comes with a lot of other changes. Read about the changes at the website of CBS https://beta.opendata.cbs.nl/OData4/index.html. More documentation on this page will follow when the new API is officially released (it's now in beta) -.. _`OData version 3`: https://www.odata.org/documentation/odata-version-3-0/ -.. _`OData version 4`: https://www.odata.org/documentation/ +__ https://www.odata.org/documentation/odata-version-3-0/ +__ https://www.odata.org/documentation/ .. code:: python From efdd80e88d0a533752d4307aa1a592eff72e751b Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 25 Jul 2019 00:22:39 +0200 Subject: [PATCH 08/11] Change link to fix target names --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a311164..cc31de5 100644 --- a/README.rst +++ b/README.rst @@ -184,8 +184,8 @@ The list of tables can be turned into a pandas DataFrame as well. >>> tables.head() -OData Version 4 ---------------- +Odata API Version 4 +------------------- CBS migrates from `OData version 3`__ to `OData version 4`__. 
This migration comes with a lot of other changes. Read about the changes at the website of CBS https://beta.opendata.cbs.nl/OData4/index.html. From 1ae15238d72baded37b84ff194afdfc15d26b69c Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 25 Jul 2019 00:48:45 +0200 Subject: [PATCH 09/11] Remove unused requests import --- tests/test_cbsodata4.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index dcfc0d3..d785084 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -1,8 +1,6 @@ import os import shutil -import requests - import cbsodata4 as cbsodata # testing deps import pytest From b16337d1c819535f1e0b6132658b6c7f4743db9a Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 25 Jul 2019 17:54:46 +0200 Subject: [PATCH 10/11] Add option to download raw dataset --- cbsodata4.py | 95 ++++++++++++++++++++++++++++++++--------- tests/test_cbsodata4.py | 8 ++++ 2 files changed, 84 insertions(+), 19 deletions(-) diff --git a/cbsodata4.py b/cbsodata4.py index d21a56e..83ce984 100644 --- a/cbsodata4.py +++ b/cbsodata4.py @@ -94,7 +94,7 @@ def __setattr__(self, arg, value): options = OptionsManager() -def _odata4_request(url, kind="EntitySet", params={}): +def _download_request(url, params={}): try: @@ -106,6 +106,8 @@ def _odata4_request(url, kind="EntitySet", params={}): r = s.send(p) r.raise_for_status() + return r + except requests.HTTPError as http_err: http_err.message = "Downloading metadata '{}' failed. {}".format( p.url, str(http_err) @@ -113,28 +115,52 @@ def _odata4_request(url, kind="EntitySet", params={}): raise http_err - res = r.json(encoding='utf-8') + +def _parse_odata4_request(json_response, kind): + """Parse Odata4 requests. + + Returns + ------- + tuple + A tuple with the data (list or dict) in the first position + and the next link in the second. The latter one is None + if not given.
+ """ # check the data context if kind == "Singleton": - del res["@odata.context"] - return res + del json_response["@odata.context"] + return json_response, None elif kind == "EntitySet": - data = copy.copy(res['value']) + data = json_response['value'] - if "@odata.nextLink" in res.keys(): - data_next = _odata4_request( - res['@odata.nextLink'], - kind=kind, - params=params - ) - data.extend(data_next) + if "@odata.nextLink" in json_response.keys(): + next_link = json_response['@odata.nextLink'] + else: + next_link = None - return data + return data, next_link else: raise ValueError("Unknown kind '{}'.".format(kind)) +def _odata4_request(url, kind="EntitySet", params={}, follow_next_link=True): + """Make an Odata4 requests. + """ + + # download the page + r = _download_request(url, params=params) + res = r.json(encoding='utf-8') + + data, next_link = _parse_odata4_request(res, kind) + + if kind == "EntitySet" and follow_next_link and next_link: + data_next = _odata4_request(next_link, kind=kind, params=params) + data.extend(data_next) + + return data + + def _filter(filter): """Filter rows with a CBS-style query. 
@@ -160,17 +186,48 @@ def _save_data(data, dir, metadata_name): fp = os.path.join(dir, metadata_name + '.json') - with open(fp, 'w') as output_file: - json.dump(data, output_file, indent=2) + if isinstance(data, dict): + with open(fp, 'w') as output_file: + json.dump(data, output_file, indent=2) + elif isinstance(data, list): + with open(fp, 'a') as output_file: + for line in data: + output_file.write(json.dumps(line) + "\n") + else: + ValueError("Unknown data type to export.") + + +def download_dataset(dataset_id, catalog=None, params={}, + save_dir="tmp", include_metadata=True): + """Download the raw data package.""" + + # https://beta.opendata.cbs.nl/OData4/CBS/83765NED/ + + catalog = options.catalog if catalog is None else catalog + dataset_url = "{}/{}/{}/".format(options.odata_url, catalog, dataset_id) + # download the page + res_index = _odata4_request(dataset_url, params=params) + _save_data(res_index, save_dir, "index") -def _read_data(*args, **kwargs): - pass + # download metadata xml + if include_metadata: + metadata_url = "{}/$metadata".format(dataset_url) + xml_metadata = _download_request(metadata_url) + fp = os.path.join(save_dir, 'metadata.xml') + with open(fp, "w", encoding="utf-8") as f: + f.write(xml_metadata.text) + for metadata_object in res_index: -def download_data(table_id, catalog=None): + metadata_url = dataset_url + metadata_object['url'] - raise NotImplementedError + # download and save data + res_meta = _odata4_request( + metadata_url, + kind=metadata_object['kind'], + params=params) + _save_data(res_meta, save_dir, metadata_object['url']) def get_metadata(dataset_id, catalog=None): diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index d785084..fd7617f 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -31,6 +31,14 @@ def teardown_module(module): shutil.rmtree(TEST_ENV) +def test_download(): + + cbsodata.download_dataset( + "81589NED", + save_dir=os.path.join(TEST_ENV, "81589NED") + ) + + 
@pytest.mark.parametrize("dataset_id", datasets) def test_observations(dataset_id): From 082ec77b62fe890ce4455fa11e17a40b2e964e38 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Sun, 9 Aug 2020 19:12:05 +0200 Subject: [PATCH 11/11] Changes for CBS updates --- cbsodata4.py => cbsodata/cbsodata4.py | 222 +++++++++++++------------- tests/test_cbsodata4.py | 12 +- 2 files changed, 115 insertions(+), 119 deletions(-) rename cbsodata4.py => cbsodata/cbsodata4.py (80%) diff --git a/cbsodata4.py b/cbsodata/cbsodata4.py similarity index 80% rename from cbsodata4.py rename to cbsodata/cbsodata4.py index 83ce984..ede5da8 100644 --- a/cbsodata4.py +++ b/cbsodata/cbsodata4.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 Jonathan de Bruin +# Copyright (c) 2020 Jonathan de Bruin # Permission is hereby granted, free of charge, to any person # obtaining a copy of this software and associated documentation @@ -20,19 +20,13 @@ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. 
- """Statistics Netherlands opendata version 4 API client for Python""" __all__ = [ - 'options', - 'get_data', - 'get_dataset', - 'get_dataset_info', - 'get_dataset_list', - 'get_catalog_info', - 'get_catalog_list', - 'get_metadata', - 'get_observations'] + 'options', 'get_data', 'get_dataset', 'get_dataset_info', + 'get_dataset_list', 'get_catalog_info', 'get_catalog_list', 'get_metadata', + 'get_observations' +] import copy import json @@ -51,13 +45,10 @@ class OptionsManager(object): def __init__(self): # url of cbs odata4 service - self.odata_url = "http://beta.opendata.cbs.nl/OData4" + self.odata_url = "https://odata4.cbs.nl" self.catalog = "CBS" self.odata_version = "4" - # Enable in next version - # self.catalog_url = "opendata.cbs.nl" - def __repr__(self): return self.__str__() @@ -72,10 +63,8 @@ def __setitem__(self, arg, value): setattr(self, arg, value) def _log_setting_change(self, setting_name, old_value, new_value): - logging.info( - "Setting '{}' changed from '{}' to '{}'.".format( - setting_name, old_value, new_value) - ) + logging.info("Setting '{}' changed from '{}' to '{}'.".format( + setting_name, old_value, new_value)) def __getattr__(self, arg): return getattr(self, arg) @@ -94,7 +83,28 @@ def __setattr__(self, arg, value): options = OptionsManager() -def _download_request(url, params={}): +def _get_catalog(catalog=None): + """Return the catalog. + + Parameters + ---------- + catalog : str + If not None, return the catalog. Else the catalog + in options. + + Returns + ------- + str + The catalog. 
+ """ + return _get_catalog(catalog) + + +def _download_request(url, params={}, **kwargs): + + # additional parameters to requests + request_kwargs = options.requests.copy() + request_kwargs.update(kwargs) try: @@ -103,22 +113,28 @@ def _download_request(url, params={}): logging.info("Download " + p.url) - r = s.send(p) + r = s.send(p, **request_kwargs) r.raise_for_status() return r except requests.HTTPError as http_err: http_err.message = "Downloading metadata '{}' failed. {}".format( - p.url, str(http_err) - ) + p.url, str(http_err)) raise http_err -def _parse_odata4_request(json_response, kind): +def _parse_odata4_response(json_response, kind): """Parse Odata4 requests. + Parameters + ---------- + json_response : dict + The decoded JSON body of the OData4 response to parse. + kind : str + Type of response: "Singleton" or "EntitySet". + Returns ------- tuple @@ -144,18 +160,23 @@ def _parse_odata4_request(json_response, kind): raise ValueError("Unknown kind '{}'.".format(kind)) -def _odata4_request(url, kind="EntitySet", params={}, follow_next_link=True): - """Make an Odata4 requests. +def _odata4_request(url, + kind="EntitySet", + params={}, + follow_next_link=True, + **kwargs): + """Make an Odata4 request.
""" # download the page - r = _download_request(url, params=params) + r = _download_request(url, params=params, **kwargs) res = r.json(encoding='utf-8') - data, next_link = _parse_odata4_request(res, kind) + data, next_link = _parse_odata4_response(res, kind) if kind == "EntitySet" and follow_next_link and next_link: - data_next = _odata4_request(next_link, kind=kind, params=params) + data_next = _odata4_request( + next_link, kind=kind, params=params, **kwargs) data.extend(data_next) return data @@ -197,22 +218,26 @@ def _save_data(data, dir, metadata_name): ValueError("Unknown data type to export.") -def download_dataset(dataset_id, catalog=None, params={}, - save_dir="tmp", include_metadata=True): +def download_dataset(dataset_id, + catalog=None, + params={}, + save_dir="tmp", + include_metadata=True, + **kwargs): """Download the raw data package.""" # https://beta.opendata.cbs.nl/OData4/CBS/83765NED/ - catalog = options.catalog if catalog is None else catalog + catalog = _get_catalog(catalog) dataset_url = "{}/{}/{}/".format(options.odata_url, catalog, dataset_id) # download the page - res_index = _odata4_request(dataset_url, params=params) + res_index = _odata4_request(dataset_url, params=params, **kwargs) _save_data(res_index, save_dir, "index") # download metadata xml if include_metadata: - metadata_url = "{}/$metadata".format(dataset_url) + metadata_url = "{}$metadata".format(dataset_url) xml_metadata = _download_request(metadata_url) fp = os.path.join(save_dir, 'metadata.xml') with open(fp, "w", encoding="utf-8") as f: @@ -226,7 +251,8 @@ def download_dataset(dataset_id, catalog=None, params={}, res_meta = _odata4_request( metadata_url, kind=metadata_object['kind'], - params=params) + params=params, + **kwargs) _save_data(res_meta, save_dir, metadata_object['url']) @@ -250,7 +276,7 @@ def get_metadata(dataset_id, catalog=None): A dictionary with the (meta)data of the table """ - catalog = options.catalog if catalog is None else catalog + catalog = 
_get_catalog(catalog) dataset_url = "{}/{}/{}/".format(options.odata_url, catalog, dataset_id) dataset_odata_meta_list = _odata4_request(dataset_url) @@ -268,16 +294,13 @@ def get_metadata(dataset_id, catalog=None): metadata_url = dataset_url + metadata_object['url'] metadata_table = _odata4_request( - metadata_url, - kind=metadata_object['kind'] - ) + metadata_url, kind=metadata_object['kind']) metadata[metadata_object['name']] = metadata_table return metadata -def get_observations(table_id, catalog=None, filter=None, - top=None, skip=None): +def get_observations(table_id, catalog=None, filter=None, top=None, skip=None): """Get the observation of the dataset. Parameters @@ -302,13 +325,10 @@ def get_observations(table_id, catalog=None, filter=None, list A dictionary with the observations. """ - catalog = options.catalog if catalog is None else catalog + catalog = _get_catalog(catalog) - observations_url = "{}/{}/{}/Observations".format( - options.odata_url, - catalog, - table_id - ) + observations_url = "{}/{}/{}/Observations".format(options.odata_url, + catalog, table_id) payload = {"$filter": filter} if filter else {} if top is not None: @@ -316,11 +336,7 @@ def get_observations(table_id, catalog=None, filter=None, if skip is not None: payload["$skip"] = skip - return _odata4_request( - observations_url, - kind="EntitySet", - params=payload - ) + return _odata4_request(observations_url, kind="EntitySet", params=payload) def get_data(dataset_id, @@ -413,7 +429,7 @@ def get_data(dataset_id, 'Perioden': '2007KW01', 'MeasureTitle': 'Totaal bedrijven', 'MeasureDataType': 'Long', - 'MeasureGroupID': None, + 'MeasureGroupId': None, 'BedrijfstakkenBranchesSBI2008Title': 'A-U Alle economische ...', 'BedrijfstakkenBranchesSBI2008GroupTitle': 'Totaal', 'PeriodenTitle': '2007 1e kwartaal', @@ -436,16 +452,10 @@ def get_data(dataset_id, """ observations = get_observations( - dataset_id, - catalog, - filter=filter, - top=top, - skip=skip - ) + dataset_id, catalog, 
filter=filter, top=top, skip=skip) # add codes - meta = get_metadata(dataset_id, - catalog=catalog) + meta = get_metadata(dataset_id, catalog=catalog) def _lookup_dict(d, meta, key, drop_key=True): r = dict(d, **meta.get(d[key], {})) @@ -465,37 +475,40 @@ def _drop_key_value(d, key): # include all measure_vars with the name "Measure" # as a prefix - temp_meas_dict = { - "Measure" + k: d[k] for k in measure_vars - } + temp_meas_dict = {"Measure" + k: d[k] for k in measure_vars} # if there are group variables to include, we need the - # MeasureGroupID. + # MeasureGroupId. if measure_group_vars: - temp_meas_dict["MeasureGroupID"] = d["MeasureGroupID"] + temp_meas_dict["MeasureGroupId"] = d["MeasureGroupId"] # update the dict code_meas_meta_dict[d["Identifier"]] = temp_meas_dict observations = [ - _lookup_dict(d, code_meas_meta_dict, "Measure", - drop_key=not include_measure_code_id) - for d in observations + _lookup_dict( + d, + code_meas_meta_dict, + "Measure", + drop_key=not include_measure_code_id) for d in observations ] # measure groups if "MeasureGroups" in meta.keys() and measure_group_vars: group_meta_dict = { - d["ID"]: {"MeasureGroup" + k: d[k] for k in measure_group_vars} - for d in meta["MeasureGroups"]} + d["Id"]: + {"MeasureGroup" + k: d[k] + for k in measure_group_vars} + for d in meta["MeasureGroups"] + } observations = [ _lookup_dict( d, group_meta_dict, - "MeasureGroupID", - drop_key=not include_measure_group_id - ) - for d in observations] + "MeasureGroupId", + drop_key=not include_measure_group_id) + for d in observations + ] elif not (measure_vars or measure_group_vars) and \ not include_measure_code_id: observations = [_drop_key_value(d, "Measure") for d in observations] @@ -515,20 +528,21 @@ def _drop_key_value(d, key): # include all dimension_vars with the name of the dimension # as a prefix - temp_dim_dict = { - dim + k: d[k] for k in dimension_vars - } + temp_dim_dict = {dim + k: d[k] for k in dimension_vars} - # if there are group variables 
to include, we need the GroupID. + # if there are group variables to include, we need the GroupId. if dimension_group_vars: - temp_dim_dict[dim + "GroupID"] = d["DimensionGroupID"] + temp_dim_dict[dim + "GroupId"] = d["DimensionGroupId"] code_dim_meta_dict[d["Identifier"]] = temp_dim_dict # Update the observations observations = [ - _lookup_dict(d, code_dim_meta_dict, key=dim, - drop_key=not include_dimension_code_id) + _lookup_dict( + d, + code_dim_meta_dict, + key=dim, + drop_key=not include_dimension_code_id) for d in observations ] @@ -540,18 +554,18 @@ def _drop_key_value(d, key): if meta_group_name in meta.keys(): group_meta_dict = { - d["ID"]: { + d["Id"]: { dim + "Group" + k: d[k] for k in dimension_group_vars } - for d in meta[meta_group_name]} + for d in meta[meta_group_name] + } observations = [ _lookup_dict( d, group_meta_dict, - dim + "GroupID", - drop_key=not include_dimension_group_id - ) + dim + "GroupId", + drop_key=not include_dimension_group_id) for d in observations ] @@ -580,9 +594,7 @@ def get_catalog_list(): list A list with the description of catalogs.""" - catalog_url = "{}/Catalogs".format( - options.odata_url - ) + catalog_url = "{}/Catalogs".format(options.odata_url) return _odata4_request(catalog_url) @@ -603,10 +615,7 @@ def get_catalog_info(catalog): """ try: - catalog_url = "{}/Catalogs/{}".format( - options.odata_url, - catalog - ) + catalog_url = "{}/Catalogs/{}".format(options.odata_url, catalog) return _odata4_request(catalog_url, kind="Singleton") @@ -619,15 +628,9 @@ def get_catalog_info(catalog): if catalog_is_dataset: raise ValueError( "Catalog '{}' seems to be a dataset identifier.".format( - catalog - ) - ) + catalog)) elif err.response.status_code == 404: - raise ValueError( - "Catalog '{}' not found.".format( - catalog - ) - ) + raise ValueError("Catalog '{}' not found.".format(catalog)) else: raise err @@ -649,10 +652,7 @@ def get_dataset_list(catalog=None): """ catalog = "" if catalog is None else catalog - catalog_url 
= "{}/{}/Datasets".format( - options.odata_url, - catalog - ) + catalog_url = "{}/{}/Datasets".format(options.odata_url, catalog) return _odata4_request(catalog_url) @@ -676,13 +676,9 @@ def get_dataset_info(dataset_id, catalog=None): A dictionary with the description of the dataset. """ - catalog = options.catalog if catalog is None else catalog + catalog = _get_catalog(catalog) - url = "{}/{}/{}/Properties".format( - options.odata_url, - catalog, - dataset_id - ) + url = "{}/{}/{}/Properties".format(options.odata_url, catalog, dataset_id) return _odata4_request(url, kind="Singleton") @@ -694,7 +690,7 @@ def catalog(catalog): Parameters ---------- catalog : str - The catalog. For example: 'CBS' or 'CBS-Maatwerk'. + The catalog. For example: 'CBS' or 'CBS-asd'. """ diff --git a/tests/test_cbsodata4.py b/tests/test_cbsodata4.py index fd7617f..35a6060 100644 --- a/tests/test_cbsodata4.py +++ b/tests/test_cbsodata4.py @@ -1,7 +1,7 @@ import os import shutil -import cbsodata4 as cbsodata +import cbsodata.cbsodata4 as cbsodata # testing deps import pytest @@ -12,7 +12,7 @@ catalogs = [ 'CBS', - 'CBS-Maatwerk' + 'CBS-asd' ] TEST_ENV = 'test_env' @@ -126,7 +126,7 @@ def test_dataset_measure_vars(): top=10 ) - assert "MeasureGroupID" not in x[0].keys() + assert "MeasureGroupId" not in x[0].keys() assert "MeasureGroupDescription" in x[0].keys() assert "MeasureGroupTitle" not in x[0].keys() @@ -136,7 +136,7 @@ def test_dataset_measure_vars(): top=10 ) - assert "MeasureGroupID" in x[0].keys() + assert "MeasureGroupId" in x[0].keys() def test_dataset_drop_measure_id(): @@ -183,7 +183,7 @@ def test_dataset_dimension_vars(): top=10 ) - assert "WijkenEnBuurtenGroupID" not in x[0].keys() + assert "WijkenEnBuurtenGroupId" not in x[0].keys() assert "WijkenEnBuurtenGroupDescription" in x[0].keys() assert "WijkenEnBuurtenGroupTitle" not in x[0].keys() @@ -193,7 +193,7 @@ def test_dataset_dimension_vars(): top=10 ) - assert "WijkenEnBuurtenGroupID" in x[0].keys() + assert 
"WijkenEnBuurtenGroupId" in x[0].keys() def test_dataset_drop_dimension_id():