From f315ac2f878640ca2e0036c8012019baa676925a Mon Sep 17 00:00:00 2001 From: Zigfrid Zvezdin Date: Tue, 8 Dec 2020 00:28:54 +0100 Subject: [PATCH 1/5] Add support and testing for 'dict' in SchemaGenerator (fixes #58) (#63) * Add support and testing for 'dict' in SchemaGenerator * Fix lines lengths for flake8 * Reformat verify_data_chunk so that git diff is smaller --- bigquery_schema_generator/generate_schema.py | 35 +++++++++++--------- tests/test_generate_schema.py | 34 ++++++++++++++++--- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 7cacbab..3ea9ea1 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -108,12 +108,12 @@ def __init__( # If CSV, force keep_nulls = True self.keep_nulls = True if (input_format == 'csv') else keep_nulls - # If JSON, sort the schema using the name of the column to be + # If JSON or dict, sort the schema using the name of the column to be # consistent with 'bq load'. # If CSV, preserve the original ordering because 'bq load` matches the # CSV column with the respective schema entry using the position of the # column in the schema. - self.sorted_schema = (input_format == 'json') + self.sorted_schema = (input_format in {'json', 'dict'}) self.line_number = 0 self.error_logs = [] @@ -121,8 +121,8 @@ def __init__( def log_error(self, msg): self.error_logs.append({'line_number': self.line_number, 'msg': msg}) - def deduce_schema(self, file, *, schema_map=None): - """Loop through each newlined-delimited line of 'file' and deduce the + def deduce_schema(self, input_data, *, schema_map=None): + """Loop through each element of 'input_data' and deduce the BigQuery schema. The schema is returned as a recursive map that contains both the database schema and some additional metadata about each entry. 
It has the following form: @@ -171,9 +171,11 @@ def deduce_schema(self, file, *, schema_map=None): """ if self.input_format == 'csv': - reader = csv.DictReader(file) + reader = csv.DictReader(input_data) elif self.input_format == 'json' or self.input_format is None: - reader = json_reader(file) + reader = json_reader(input_data) + elif self.input_format == 'dict': + reader = input_data else: raise Exception(f"Unknown input_format '{self.input_format}'") @@ -202,11 +204,12 @@ def deduce_schema(self, file, *, schema_map=None): raise json_object else: self.log_error( - 'Record should be a JSON Object but was a' - f' {type(json_object)}' + 'Record should be a JSON Object ' + f'but was a {type(json_object)}' ) if not self.ignore_invalid_lines: - raise Exception('Record must be a JSON Object') + raise Exception(f'Record must be a JSON Object ' + f'but was a {type(json_object)}') finally: logging.info(f'Processed {self.line_number} lines') @@ -714,15 +717,15 @@ def run( print(file=output_file) -def json_reader(file): +def json_reader(input_data): """A generator that converts an iterable of newline-delimited JSON objects - ('file' could be a 'list' for testing purposes) into an iterable of Python - dict objects. If the line cannot be parsed as JSON, the exception thrown by - the json.loads() is yielded back, instead of the json object. The calling - code can check for this exception with an isinstance() function, then - continue processing the rest of the file. + ('input_data' could be a 'list' for testing purposes) into an iterable of + Python dict objects. If the line cannot be parsed as JSON, the exception + thrown by the json.loads() is yielded back, instead of the json object. + The calling code can check for this exception with an isinstance() function, + then continue processing the rest of the file. 
""" - for line in file: + for line in input_data: try: yield json.loads(line) except Exception as e: diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py index e88dc0f..90c143f 100644 --- a/tests/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -25,6 +25,7 @@ from bigquery_schema_generator.generate_schema import convert_type from bigquery_schema_generator.generate_schema import is_string_type from bigquery_schema_generator.generate_schema import json_full_path +from bigquery_schema_generator.generate_schema import json_reader from .data_reader import DataReader @@ -432,6 +433,7 @@ class TestDataChunksFromFile(unittest.TestCase): schema matches the one produced by SchemaGenerator.deduce_schema(). Multiple test cases are stored in TESTDATA_FILE. The data_reader.py module knows how to parse that file. + JSON chunks are verified as JSON but also as dict. """ TESTDATA_FILE = 'testdata.txt' @@ -456,6 +458,15 @@ def test_all_data_chunks(self): raise e def verify_data_chunk(self, chunk): + self.verify_data_chunk_as_csv_json_dict(chunk=chunk, as_dict=False) + self.verify_data_chunk_as_csv_json_dict(chunk=chunk, as_dict=True) + + def verify_data_chunk_as_csv_json_dict(self, *, chunk, as_dict): + """Verify the given chunk from the testdata.txt file. If `as_dict` is + True, then if the input_format of the chunk is 'json', pretend + that the input data was given as an internal Python dict, and verify + the 'input_format=dict' code path in SchemaGenerator. 
+ """ chunk_count = chunk['chunk_count'] line_number = chunk['line_number'] data_flags = chunk['data_flags'] @@ -471,10 +482,23 @@ def verify_data_chunk(self, chunk): expected_schema = chunk['schema'] existing_schema = chunk['existing_schema'] - print( - f"Test chunk: {chunk_count}; line_number: {line_number}; " - f"first record: {records[0]}" - ) + if as_dict: + if input_format == 'json': + print( + f"Test chunk: {chunk_count}; line_number: {line_number}; " + f"input_format='dict'" + ) + input_format = 'dict' + records = json_reader(records) + else: + # Don't bother converting CSV data chunks into Python dict. + return + else: + print( + f"Test chunk: {chunk_count}; line_number: {line_number}; " + f"first record: {records[0]}" + ) + # Generate schema. generator = SchemaGenerator( input_format=input_format, @@ -549,7 +573,7 @@ def test_bq_schema_to_map_round_trip_permutations(self): schema_map = bq_schema_to_map(schema) for input_format_and_mode in valid_input_formats_and_modes: for keep_null_param in valid_keep_null_params: - for quotes_are_strings in\ + for quotes_are_strings in \ valid_quoted_values_are_strings: generator = SchemaGenerator( input_format=input_format_and_mode[0], From 68d9ffbda3db74f219f38fd91c7c7405913a7cd2 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 7 Dec 2020 16:34:56 -0800 Subject: [PATCH 2/5] Extend INTEGER_MATCHER to accept '+', and FLOAT_MATCHER to detect scientific notation and other variations of floats --- bigquery_schema_generator/generate_schema.py | 4 +- tests/test_generate_schema.py | 89 ++++++++++++++++++++ tests/testdata.txt | 73 ++++++++++++++++ 3 files changed, 164 insertions(+), 2 deletions(-) diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 3ea9ea1..116bf4f 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -62,7 +62,7 @@ class SchemaGenerator: TIME_MATCHER = 
re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$') # Detect integers inside quotes. - INTEGER_MATCHER = re.compile(r'^[-]?\d+$') + INTEGER_MATCHER = re.compile(r'^[-+]?\d+$') # Max INTEGER value supported by 'bq load'. INTEGER_MAX_VALUE = 2**63 - 1 @@ -71,7 +71,7 @@ class SchemaGenerator: INTEGER_MIN_VALUE = -2**63 # Detect floats inside quotes. - FLOAT_MATCHER = re.compile(r'^[-]?\d+\.\d+$') + FLOAT_MATCHER = re.compile(r'^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$') # Valid field name characters of BigQuery FIELD_NAME_MATCHER = re.compile(r'[^a-zA-Z0-9_]') diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py index 90c143f..a49cba3 100644 --- a/tests/test_generate_schema.py +++ b/tests/test_generate_schema.py @@ -107,6 +107,95 @@ def test_time_matcher_invalid(self): self.assertFalse( SchemaGenerator.TIME_MATCHER.match('12:33:01.1234567')) + def test_integer_matcher_valid(self): + self.assertTrue(SchemaGenerator.INTEGER_MATCHER.match('1')) + self.assertTrue(SchemaGenerator.INTEGER_MATCHER.match('-1')) + self.assertTrue(SchemaGenerator.INTEGER_MATCHER.match('+1')) + + def test_integer_matcher_invalid(self): + self.assertFalse(SchemaGenerator.INTEGER_MATCHER.match('')) + self.assertFalse(SchemaGenerator.INTEGER_MATCHER.match('-')) + self.assertFalse(SchemaGenerator.INTEGER_MATCHER.match('+')) + + def test_float_matcher_valid(self): + # Floats w/o exponents + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1.0')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1.0')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1.0')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1.')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1.')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1.')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.1')) + + # Different signs in mantissa + 
self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1e1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1e-1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1e+1')) + + # Decimal point in mantissa. + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('2.e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-2.e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+2.e1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('2.e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-2.e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+2.e-1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('2.e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-2.e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+2.e+1')) + + # Decimal point and fraction in mantissa + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('3.3e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-3.3e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+3.3e1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('3.3e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-3.3e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+3.3e-1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('3.3e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-3.3e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+3.3e+1')) + + # Fraction only in mantissa + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.4e1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.4e1')) + 
self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.4e1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.4e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.4e-1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.4e-1')) + + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.4e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.4e+1')) + self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.4e+1')) + + def test_float_matcher_invalid(self): + # No digit in mantissa + self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('.e1')) + + # No mantissa at all + self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('+e1')) + + # Decimal point in exponent + self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('1e.1')) + + # No exponent digit after 'e' + self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('1e')) + def test_infer_value_type(self): generator = SchemaGenerator() diff --git a/tests/testdata.txt b/tests/testdata.txt index f6b522a..a0324cb 100644 --- a/tests/testdata.txt +++ b/tests/testdata.txt @@ -676,6 +676,79 @@ SCHEMA ] END +# Support less common incantations of integers +DATA +{ "qi": "1" } +{ "qi": "+1" } +{ "qi": "-1" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qi", + "type": "INTEGER" + } +] +END + +# Support less common incantations of floats. The examples should match the +# test cases in TestSchemaGenerator.test_float_matcher_valid() ideally. +DATA +{ "qf": "1.0" } +{ "qf": "-1.0" } +{ "qf": "+1.0" } +{ "qf": "1." } +{ "qf": "-1." } +{ "qf": "+1." 
} +{ "qf": ".1" } +{ "qf": "-.1" } +{ "qf": "+.1" } +{ "qf": "1e1" } +{ "qf": "-1e1" } +{ "qf": "+1e1" } +{ "qf": "1e-1" } +{ "qf": "-1e-1" } +{ "qf": "+1e-1" } +{ "qf": "1e+1" } +{ "qf": "-1e+1" } +{ "qf": "+1e+1" } +{ "qf": "2.e1" } +{ "qf": "-2.e1" } +{ "qf": "+2.e1" } +{ "qf": "2.e-1" } +{ "qf": "-2.e-1" } +{ "qf": "+2.e-1" } +{ "qf": "2.e+1" } +{ "qf": "-2.e+1" } +{ "qf": "+2.e+1" } +{ "qf": "3.3e1" } +{ "qf": "-3.3e1" } +{ "qf": "+3.3e1" } +{ "qf": "3.3e-1" } +{ "qf": "-3.3e-1" } +{ "qf": "+3.3e-1" } +{ "qf": "3.3e+1" } +{ "qf": "-3.3e+1" } +{ "qf": "+3.3e+1" } +{ "qf": ".4e1" } +{ "qf": "-.4e1" } +{ "qf": "+.4e1" } +{ "qf": ".4e-1" } +{ "qf": "-.4e-1" } +{ "qf": "+.4e-1" } +{ "qf": ".4e+1" } +{ "qf": "-.4e+1" } +{ "qf": "+.4e+1" } +SCHEMA +[ + { + "mode": "NULLABLE", + "name": "qf", + "type": "FLOAT" + } +] +END + # Integers in quoted strings that fit inside a signed 64-bit -> INTEGER # See https://github.com/bxparks/bigquery-schema-generator/issues/18. DATA From 2d9bbf1e88c1ce4d194abcc71217fe1ecfc2aa4c Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 7 Dec 2020 22:30:34 -0800 Subject: [PATCH 3/5] DEVELOPER.md: Update instructions to create a new release and push to PyPI --- DEVELOPER.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/DEVELOPER.md b/DEVELOPER.md index 5b11351..dd1ec31 100644 --- a/DEVELOPER.md +++ b/DEVELOPER.md @@ -19,24 +19,25 @@ There are a lot of instructions on the web that uses those are deprecated. The tool that seems to work for me is [Twine](https://github.com/pypa/twine). -[PyPI](https://pypi.python.org/pypi) does not support Markdown, so -we use `pypandoc` and `pandoc` to convert Markdown to RST. -`pypandoc` is a thin Python wrapper around `pandoc`. +[PyPI](https://pypi.python.org/pypi) now supports Markdown so we no longer need +to download `pypandoc` (Python package) and `pandoc` (apt package) to convert +Markdown to RST. 
Install the following packages: ``` -$ sudo apt install pandoc -$ sudo -H pip3 install setuptools wheel twine pypandoc +$ sudo -H pip3 install setuptools wheel twine ``` ### Steps 1. Edit `setup.py` and increment the `version`. 1. Push all changes to `develop` branch. -1. Merge `develop` into `master` branch, and checkout the `master` branch. +1. Create a GitHub pull request (PR) from `develop` into `master` branch. +1. Merge the PR into `master`. +1. Create a new Release in GitHub with the new tag label. 1. Create the dist using `python3 setup.py sdist`. -1. Upload to PyPI using `twine upload dist/*`. - (Need to enter my PyPI login creddentials). +1. Upload to PyPI using `twine upload + dist/bigquery-schema-generator-{version}.tar.gz`. + * Enter my PyPI login credentials. * If `dist/` becomes too cluttered, we can remove the entire `dist/` directory and run `python3 setup.py sdist` again. -1. Tag the `master` branch with the release on GitHub. From 997c6f781bc7fbd902739261f629a1414d2e9938 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Mon, 7 Dec 2020 23:22:40 -0800 Subject: [PATCH 4/5] README.md: Add Table of Contents; add usage info for schema_map=existing_schema_map and input_format='dict' --- CHANGELOG.md | 7 +++ README.md | 175 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 138 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a82f520..2e574d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ # Changelog * Unreleased + * Add 'dict' as a third `input_format` when `SchemaGenerator` is used as a + library. This can be useful when the data has already been transformed + into a list of native Python `dict` objects (see #58, thanks to + ZiggerZZ@). + * Expand the pattern matchers for quoted integers and quoted floating point + numbers to be more cmopatible with the ones recognized by `bq load + --autodetect`.
* 1.3 (2020-12-05) * Allow an existing schema file to be specified using `--existing_schema_path` flag, so that new data can be merged into it. diff --git a/README.md b/README.md index 5989187..652904b 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,41 @@ $ generate-schema < file.data.json > file.schema.json $ generate-schema --input_format csv < file.data.csv > file.schema.json ``` -Version: 1.3 (2020-12-05) - -Changelog: [CHANGELOG.md](CHANGELOG.md) - +**Version**: 1.3 (2020-12-05) + +**Changelog**: [CHANGELOG.md](CHANGELOG.md) + +**Table of Contents** + +* [Background](#Background) +* [Installation](#Installation) + * [Ubuntu Linux](#UbuntuLinux) + * [MacOS](#MacOS) +* [Usage](#Usage) + * [Command Line](#CommandLine) + * [Schema Output](#SchemaOutput) + * [Command Line Flag Options](#FlagOptions) + * [Help (`--help`)](#Help) + * [Input Format (`--input_format`)](#InputFormat) + * [Keep Nulls (`--keep_nulls`)](#KeepNulls) + * [Quoted Values Are Strings(`--quoted_values_are_strings`)](#QuotedValuesAreStrings) + * [Infer Mode (`--infer_mode`)](#InferMode) + * [Debugging Interval (`--debugging_interval`)](#DebuggingInterval) + * [Debugging Map (`--debugging_map`)](#DebuggingMap) + * [Sanitize Names (`--sanitize_names`)](#SanitizedNames) + * [Ignore Invalid Lines (`--ignore_invalid_lines`)](#IgnoreInvalidLines) + * [Existing Schema Path (`--existing_schema_path`)](#ExistingSchemaPath) + * [Using as a Library](#UsingAsLibrary) +* [Schema Types](#SchemaTypes) + * [Supported Types](#SupportedTypes) + * [Type Inferrence](#TypeInferrence) +* [Examples](#Examples) +* [Benchmarks](#Benchmarks) +* [System Requirements](#SystemRequirements) +* [Authors](#Authors) +* [License](#License) + + ## Background Data can be imported into [BigQuery](https://cloud.google.com/bigquery/) using @@ -44,6 +75,7 @@ in JSON format on the STDOUT. This schema file can be fed back into the **bq load** tool to create a table that is more compatible with the data fields in the input dataset. 
+ ## Installation **Prerequisite**: You need have Python 3.6 or higher. @@ -87,6 +119,7 @@ The shell script `generate-schema` will be installed somewhere in your system, depending on how your Python environment is configured. See below for some notes for Ubuntu Linux and MacOS. + ### Ubuntu Linux (18.04, 20.04) After running `pip3 install bigquery_schema_generator`, the `generate-schema` @@ -97,6 +130,7 @@ script may be installed in one the following locations: * `$HOME/.local/bin/generate-schema` * `$HOME/.virtualenvs/{your_virtual_env}/bin/generate-schema` + ### MacOS (10.14 Mojave) I don't use my Mac for software development these days, and I won't upgrade to @@ -119,8 +153,12 @@ You can install Python3 using `generate-schema` script will probably be installed in `/usr/local/bin` but I'm not completely certain. + ## Usage + +### Command Line + The `generate_schema.py` script accepts a newline-delimited JSON or CSV data file on the STDIN. JSON input format has been tested extensively. CSV input format was added more recently (in v0.4) using the `--input_format @@ -161,6 +199,7 @@ then you can invoke the Python script directly: $ ./generate_schema.py < file.data.json > file.schema.json ``` + ### Using the Schema Output The resulting schema file can be given to the **bq load** command using the @@ -226,11 +265,13 @@ $ bq show --schema mydataset.mytable | python3 -m json.tool file. An alternative is the [jq command](https://stedolan.github.io/jq/).) The resulting schema file should be identical to `file.schema.json`. -### Flag Options + +### Command Line Flag Options The `generate_schema.py` script supports a handful of command line flags as shown by the `--help` flag below. + #### Help (`--help`) Print the built-in help strings: @@ -268,6 +309,7 @@ optional arguments: :: ``` + #### Input Format (`--input_format`) Specifies the format of the input file, either `json` (default) or `csv`. 
@@ -280,6 +322,7 @@ order, even if the column contains an empty value for every record. See [Issue #26](https://github.com/bxparks/bigquery-schema-generator/issues/26) for implementation details. + #### Keep Nulls (`--keep_nulls`) Normally when the input data file contains a field which has a null, empty @@ -327,6 +370,7 @@ INFO:root:Processed 1 lines ] ``` + #### Quoted Values Are Strings (`--quoted_values_are_strings`) By default, quoted values are inspected to determine if they can be interpreted @@ -360,6 +404,7 @@ $ generate-schema --quoted_values_are_strings ] ``` + #### Infer Mode (`--infer_mode`) Set the schema `mode` of a field to `REQUIRED` instead of the default @@ -379,6 +424,7 @@ either input_format, CSV or JSON. See [Issue #28](https://github.com/bxparks/bigquery-schema-generator/issues/28) for implementation details. + #### Debugging Interval (`--debugging_interval`) By default, the `generate_schema.py` script prints a short progress message @@ -389,6 +435,7 @@ every 1000 lines of input data. This interval can be changed using the $ generate-schema --debugging_interval 50 < file.data.json > file.schema.json ``` + #### Debugging Map (`--debugging_map`) Instead of printing out the BigQuery schema, the `--debugging_map` prints out @@ -400,6 +447,7 @@ flag is intended to be used for debugging. $ generate-schema --debugging_map < file.data.json > file.schema.json ``` + #### Sanitize Names (`--sanitize_names`) BigQuery column names are [restricted to certain characters and @@ -426,6 +474,7 @@ through the data files to cleanup the column names anyway. See [Issue #14](https://github.com/bxparks/bigquery-schema-generator/issues/14) and [Issue #33](https://github.com/bxparks/bigquery-schema-generator/issues/33). + #### Ignore Invalid Lines (`--ignore_invalid_lines`) By default, if an error is encountered on a particular line, processing stops @@ -446,6 +495,7 @@ deduction logic will handle any missing or extra columns gracefully. 
Fixes [Issue #49](https://github.com/bxparks/bigquery-schema-generator/issues/49). + #### Existing Schema Path (`--existing_schema_path`) There are cases where we would like to start from an existing BigQuery table @@ -478,8 +528,72 @@ See discussion in [PR #57](https://github.com/bxparks/bigquery-schema-generator/pull/57) for more details. + +### Using As a Library + +The `bigquery_schema_generator` module can be used as a library by an external +Python client code by creating an instance of `SchemaGenerator` and calling the +`run(input, output)` method: + +```python +from bigquery_schema_generator.generate_schema import SchemaGenerator + +generator = SchemaGenerator( + input_format=input_format, + infer_mode=infer_mode, + keep_nulls=keep_nulls, + quoted_values_are_strings=quoted_values_are_strings, + debugging_interval=debugging_interval, + debugging_map=debugging_map, + sanitize_names=sanitize_names, + ignore_invalid_lines=ignore_invalid_lines, +) +generator.run(input_file=input_file, output_file=output_file) +``` + +If you need to process the generated schema programmatically, use the +`deduce_schema()` method and process the resulting `schema_map` and `error_log` +data structures like this: + +```python +from bigquery_schema_generator.generate_schema import SchemaGenerator +... +generator = SchemaGenerator( + ...(same as above)... +) + +schema_map, error_logs = generator.deduce_schema(input_data=input_data) + +# Print errors if desired. +for error in error_logs: + logging.info("Problem on line %s: %s", error['line_number'], error['msg']) + +schema = generator.flatten_schema(schema_map) +json.dump(schema, output_file, indent=2) +``` + +The `decude_schema()` now supports starting from an existing `schema_map` +instead of starting from scratch. This is the internal version of the +`--existing_schema_path` functionality. 
+ +```python +schema_map1, error_logs = generator.deduce_schema(input_data=data1) +schema_map2, error_logs = generator.deduce_schema( + input_data=data2, schema_map=schema_map1 +) +``` + +When using the `SchemaGenerator` object directly, the `input_format` parameter +supports `dict` as a third input format in addition to the `json` and `csv` +formats. The `dict` input format tells `SchemaGenerator.deduce_schema()` to +accept a list of Python dict objects as the `input_data`. This is useful if the +input data (usually JSON) has already been read into memory and parsed from +newline-delimited JSON into native Python dict objects. + + ## Schema Types + ### Supported Types The `bq show --schema` command produces a JSON schema file that uses the @@ -531,6 +645,7 @@ The following types are _not_ supported at all: * `BYTES` * `DATETIME` (unable to distinguish from `TIMESTAMP`) + ### Type Inferrence Rules The `generate-schema` script attempts to emulate the various type conversion and @@ -572,6 +687,7 @@ compatibility rules implemented by **bq load**: * integers less than `-2^63` (-9223372036854775808) * (See [Issue #18](https://github.com/bxparks/bigquery-schema-generator/issues/18) for more details) + ## Examples Here is an example of a single JSON data record on the STDIN (the `^D` below @@ -705,41 +821,7 @@ INFO:root:Processed 4 lines ] ``` -## Using As a Library - -The `bigquery_schema_generator` module can be used as a library by an external -Python client code by creating an instance of `SchemaGenerator` and calling the -`run(input, output)` method: - -```python -from bigquery_schema_generator.generate_schema import SchemaGenerator - -generator = SchemaGenerator( - input_format=input_format, - infer_mode=infer_mode, - keep_nulls=keep_nulls, - quoted_values_are_strings=quoted_values_are_strings, - debugging_interval=debugging_interval, - debugging_map=debugging_map) -generator.run(input_file, output_file) -``` - -If you need to process the generated schema
programmatically, use the -`deduce_schema()` method and process the resulting `schema_map` and `error_log` -data structures like this: - -```python -from bigquery_schema_generator.generate_schema import SchemaGenerator -... -schema_map, error_logs = generator.deduce_schema(input_file) - -for error in error_logs: - logging.info("Problem on line %s: %s", error['line'], error['msg']) - -schema = generator.flatten_schema(schema_map) -json.dump(schema, output_file, indent=2) -``` - + ## Benchmarks I wrote the `bigquery_schema_generator/anonymize.py` script to create an @@ -759,6 +841,7 @@ $ bigquery_schema_generator/generate_schema.py < anon1.data.json \ took 67s on a Dell Precision M4700 laptop with an Intel Core i7-3840QM CPU @ 2.80GHz, 32GB of RAM, Ubuntu Linux 18.04, Python 3.6.7. + ## System Requirements This project was initially developed on Ubuntu 17.04 using Python 3.5.3, but it @@ -776,6 +859,12 @@ I have tested it on: The GitHub Actions continuous integration pipeline validates on Python 3.6, 3.7 and 3.8. + +## License + +Apache License 2.0 + + ## Authors * Created by Brian T. Park (brian@xparks.net). @@ -793,8 +882,6 @@ and 3.8. (abroglesc@). * Allow an existing schema file to be specified using `--existing_schema_path`, by Austin Brogle (abroglesc@) and Bozo Dragojevic (bozzzzo@). +* Allow `SchemaGenerator.deduce_schema()` to accept a list of native Python + `dict` objects, by Zigfrid Zvezdin (ZiggerZZ@). 
- -## License - -Apache License 2.0 From cf1c1ad6dbd155852943cb4d1051cf6b9ea66a9a Mon Sep 17 00:00:00 2001 From: Brian Park Date: Wed, 9 Dec 2020 21:23:21 -0800 Subject: [PATCH 5/5] Bump version to 1.4 --- CHANGELOG.md | 6 +++++- README.md | 6 +++--- bigquery_schema_generator/version.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e574d7..f0f16c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,17 @@ # Changelog * Unreleased +* 1.4 (2020-12-09) * Add 'dict' as a third `input_format` when `SchemaGenerator` is used as a library. This can be useful when the data has already been transformed into a list of native Python `dict` objects (see #58, thanks to ZiggerZZ@). * Expand the pattern matchers for quoted integers and quoted floating point - numbers to be more cmopatible with the ones recognized by `bq load + numbers to be more compatible with the patterns recognized by `bq load --autodetect`. + * Add Table of Contents to README.md. Add usage info for the + `schema_map=existing_schema_map` and the `input_format='dict'` parameters + in the `SchemaGenerator()` constructor. * 1.3 (2020-12-05) * Allow an existing schema file to be specified using `--existing_schema_path` flag, so that new data can be merged into it.
diff --git a/README.md b/README.md index 652904b..2f01807 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,11 @@ $ generate-schema < file.data.json > file.schema.json $ generate-schema --input_format csv < file.data.csv > file.schema.json ``` -**Version**: 1.3 (2020-12-05) +**Version**: 1.4 (2020-12-09) **Changelog**: [CHANGELOG.md](CHANGELOG.md) -**Table of Contents** +## Table of Contents * [Background](#Background) * [Installation](#Installation) @@ -572,7 +572,7 @@ schema = generator.flatten_schema(schema_map) json.dump(schema, output_file, indent=2) ``` -The `decude_schema()` now supports starting from an existing `schema_map` +The `deduce_schema()` now supports starting from an existing `schema_map` instead of starting from scratch. This is the internal version of the `--existing_schema_path` functionality. diff --git a/bigquery_schema_generator/version.py b/bigquery_schema_generator/version.py index 6f4fa58..0f66308 100644 --- a/bigquery_schema_generator/version.py +++ b/bigquery_schema_generator/version.py @@ -1 +1 @@ -__version__ = '1.3' +__version__ = '1.4'