From f315ac2f878640ca2e0036c8012019baa676925a Mon Sep 17 00:00:00 2001
From: Zigfrid Zvezdin <ziggerZZ@gmail.com>
Date: Tue, 8 Dec 2020 00:28:54 +0100
Subject: [PATCH 1/5] Add support and testing for 'dict' in SchemaGenerator
 (fixes #58) (#63)

* Add support and testing for 'dict' in SchemaGenerator

* Fix lines lengths for flake8

* Reformat verify_data_chunk so that git diff is smaller
---
 bigquery_schema_generator/generate_schema.py | 35 +++++++++++---------
 tests/test_generate_schema.py                | 34 ++++++++++++++++---
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py
index 7cacbab..3ea9ea1 100755
--- a/bigquery_schema_generator/generate_schema.py
+++ b/bigquery_schema_generator/generate_schema.py
@@ -108,12 +108,12 @@ def __init__(
         # If CSV, force keep_nulls = True
         self.keep_nulls = True if (input_format == 'csv') else keep_nulls
 
-        # If JSON, sort the schema using the name of the column to be
+        # If JSON or dict, sort the schema using the name of the column to be
         # consistent with 'bq load'.
         # If CSV, preserve the original ordering because 'bq load` matches the
         # CSV column with the respective schema entry using the position of the
         # column in the schema.
-        self.sorted_schema = (input_format == 'json')
+        self.sorted_schema = (input_format in {'json', 'dict'})
 
         self.line_number = 0
         self.error_logs = []
@@ -121,8 +121,8 @@ def __init__(
     def log_error(self, msg):
         self.error_logs.append({'line_number': self.line_number, 'msg': msg})
 
-    def deduce_schema(self, file, *, schema_map=None):
-        """Loop through each newlined-delimited line of 'file' and deduce the
+    def deduce_schema(self, input_data, *, schema_map=None):
+        """Loop through each element of 'input_data' and deduce the
         BigQuery schema. The schema is returned as a recursive map that contains
         both the database schema and some additional metadata about each entry.
         It has the following form:
@@ -171,9 +171,11 @@ def deduce_schema(self, file, *, schema_map=None):
         """
 
         if self.input_format == 'csv':
-            reader = csv.DictReader(file)
+            reader = csv.DictReader(input_data)
         elif self.input_format == 'json' or self.input_format is None:
-            reader = json_reader(file)
+            reader = json_reader(input_data)
+        elif self.input_format == 'dict':
+            reader = input_data
         else:
             raise Exception(f"Unknown input_format '{self.input_format}'")
 
@@ -202,11 +204,12 @@ def deduce_schema(self, file, *, schema_map=None):
                         raise json_object
                 else:
                     self.log_error(
-                        'Record should be a JSON Object but was a'
-                        f' {type(json_object)}'
+                        'Record should be a JSON Object '
+                        f'but was a {type(json_object)}'
                     )
                     if not self.ignore_invalid_lines:
-                        raise Exception('Record must be a JSON Object')
+                        raise Exception(f'Record must be a JSON Object '
+                                        f'but was a {type(json_object)}')
         finally:
             logging.info(f'Processed {self.line_number} lines')
 
@@ -714,15 +717,15 @@ def run(
             print(file=output_file)
 
 
-def json_reader(file):
+def json_reader(input_data):
     """A generator that converts an iterable of newline-delimited JSON objects
-    ('file' could be a 'list' for testing purposes) into an iterable of Python
-    dict objects. If the line cannot be parsed as JSON, the exception thrown by
-    the json.loads() is yielded back, instead of the json object. The calling
-    code can check for this exception with an isinstance() function, then
-    continue processing the rest of the file.
+    ('input_data' could be a 'list' for testing purposes) into an iterable of
+    Python dict objects. If the line cannot be parsed as JSON, the exception
+    thrown by the json.loads() is yielded back, instead of the json object.
+    The calling code can check for this exception with an isinstance() function,
+    then continue processing the rest of the file.
     """
-    for line in file:
+    for line in input_data:
         try:
             yield json.loads(line)
         except Exception as e:
diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py
index e88dc0f..90c143f 100644
--- a/tests/test_generate_schema.py
+++ b/tests/test_generate_schema.py
@@ -25,6 +25,7 @@
 from bigquery_schema_generator.generate_schema import convert_type
 from bigquery_schema_generator.generate_schema import is_string_type
 from bigquery_schema_generator.generate_schema import json_full_path
+from bigquery_schema_generator.generate_schema import json_reader
 from .data_reader import DataReader
 
 
@@ -432,6 +433,7 @@ class TestDataChunksFromFile(unittest.TestCase):
     schema matches the one produced by SchemaGenerator.deduce_schema(). Multiple
     test cases are stored in TESTDATA_FILE. The data_reader.py module knows how
     to parse that file.
+    JSON chunks are verified as JSON but also as dict.
     """
 
     TESTDATA_FILE = 'testdata.txt'
@@ -456,6 +458,15 @@ def test_all_data_chunks(self):
                     raise e
 
     def verify_data_chunk(self, chunk):
+        self.verify_data_chunk_as_csv_json_dict(chunk=chunk, as_dict=False)
+        self.verify_data_chunk_as_csv_json_dict(chunk=chunk, as_dict=True)
+
+    def verify_data_chunk_as_csv_json_dict(self, *, chunk, as_dict):
+        """Verify the given chunk from the testdata.txt file. If `as_dict` is
+        True, then if the input_format of the chunk is 'json', pretend
+        that the input data was given as an internal Python dict, and verify
+        the 'input_format=dict' code path in SchemaGenerator.
+        """
         chunk_count = chunk['chunk_count']
         line_number = chunk['line_number']
         data_flags = chunk['data_flags']
@@ -471,10 +482,23 @@ def verify_data_chunk(self, chunk):
         expected_schema = chunk['schema']
         existing_schema = chunk['existing_schema']
 
-        print(
-            f"Test chunk: {chunk_count}; line_number: {line_number}; "
-            f"first record: {records[0]}"
-        )
+        if as_dict:
+            if input_format == 'json':
+                print(
+                    f"Test chunk: {chunk_count}; line_number: {line_number}; "
+                    f"input_format='dict'"
+                )
+                input_format = 'dict'
+                records = json_reader(records)
+            else:
+                # Don't bother converting CSV data chunks into Python dict.
+                return
+        else:
+            print(
+                f"Test chunk: {chunk_count}; line_number: {line_number}; "
+                f"first record: {records[0]}"
+            )
+
         # Generate schema.
         generator = SchemaGenerator(
             input_format=input_format,
@@ -549,7 +573,7 @@ def test_bq_schema_to_map_round_trip_permutations(self):
                 schema_map = bq_schema_to_map(schema)
                 for input_format_and_mode in valid_input_formats_and_modes:
                     for keep_null_param in valid_keep_null_params:
-                        for quotes_are_strings in\
+                        for quotes_are_strings in \
                                 valid_quoted_values_are_strings:
                             generator = SchemaGenerator(
                                 input_format=input_format_and_mode[0],

From 68d9ffbda3db74f219f38fd91c7c7405913a7cd2 Mon Sep 17 00:00:00 2001
From: Brian Park <brian@xparks.net>
Date: Mon, 7 Dec 2020 16:34:56 -0800
Subject: [PATCH 2/5] Extend INTEGER_MATCHER to accept '+', and FLOAT_MATCHER
 to detect scientific notation and other variations of floats

---
 bigquery_schema_generator/generate_schema.py |  4 +-
 tests/test_generate_schema.py                | 89 ++++++++++++++++++++
 tests/testdata.txt                           | 73 ++++++++++++++++
 3 files changed, 164 insertions(+), 2 deletions(-)

diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py
index 3ea9ea1..116bf4f 100755
--- a/bigquery_schema_generator/generate_schema.py
+++ b/bigquery_schema_generator/generate_schema.py
@@ -62,7 +62,7 @@ class SchemaGenerator:
     TIME_MATCHER = re.compile(r'^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$')
 
     # Detect integers inside quotes.
-    INTEGER_MATCHER = re.compile(r'^[-]?\d+$')
+    INTEGER_MATCHER = re.compile(r'^[-+]?\d+$')
 
     # Max INTEGER value supported by 'bq load'.
     INTEGER_MAX_VALUE = 2**63 - 1
@@ -71,7 +71,7 @@ class SchemaGenerator:
     INTEGER_MIN_VALUE = -2**63
 
     # Detect floats inside quotes.
-    FLOAT_MATCHER = re.compile(r'^[-]?\d+\.\d+$')
+    FLOAT_MATCHER = re.compile(r'^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$')
 
     # Valid field name characters of BigQuery
     FIELD_NAME_MATCHER = re.compile(r'[^a-zA-Z0-9_]')
diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py
index 90c143f..a49cba3 100644
--- a/tests/test_generate_schema.py
+++ b/tests/test_generate_schema.py
@@ -107,6 +107,95 @@ def test_time_matcher_invalid(self):
         self.assertFalse(
             SchemaGenerator.TIME_MATCHER.match('12:33:01.1234567'))
 
+    def test_integer_matcher_valid(self):
+        self.assertTrue(SchemaGenerator.INTEGER_MATCHER.match('1'))
+        self.assertTrue(SchemaGenerator.INTEGER_MATCHER.match('-1'))
+        self.assertTrue(SchemaGenerator.INTEGER_MATCHER.match('+1'))
+
+    def test_integer_matcher_invalid(self):
+        self.assertFalse(SchemaGenerator.INTEGER_MATCHER.match(''))
+        self.assertFalse(SchemaGenerator.INTEGER_MATCHER.match('-'))
+        self.assertFalse(SchemaGenerator.INTEGER_MATCHER.match('+'))
+
+    def test_float_matcher_valid(self):
+        # Floats w/o exponents
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1.0'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1.0'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1.0'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1.'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1.'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1.'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.1'))
+
+        # Different signs in mantissa and exponent
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1e1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1e-1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('1e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-1e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+1e+1'))
+
+        # Decimal point in mantissa.
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('2.e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-2.e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+2.e1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('2.e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-2.e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+2.e-1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('2.e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-2.e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+2.e+1'))
+
+        # Decimal point and fraction in mantissa
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('3.3e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-3.3e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+3.3e1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('3.3e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-3.3e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+3.3e-1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('3.3e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-3.3e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+3.3e+1'))
+
+        # Fraction only in mantissa
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.4e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.4e1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.4e1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.4e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.4e-1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.4e-1'))
+
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('.4e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('-.4e+1'))
+        self.assertTrue(SchemaGenerator.FLOAT_MATCHER.match('+.4e+1'))
+
+    def test_float_matcher_invalid(self):
+        # No digit in mantissa
+        self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('.e1'))
+
+        # No mantissa at all
+        self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('+e1'))
+
+        # Decimal point in exponent
+        self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('1e.1'))
+
+        # No exponent digit after 'e'
+        self.assertFalse(SchemaGenerator.FLOAT_MATCHER.match('1e'))
+
     def test_infer_value_type(self):
         generator = SchemaGenerator()
 
diff --git a/tests/testdata.txt b/tests/testdata.txt
index f6b522a..a0324cb 100644
--- a/tests/testdata.txt
+++ b/tests/testdata.txt
@@ -676,6 +676,79 @@ SCHEMA
 ]
 END
 
+# Support less common incantations of integers
+DATA
+{ "qi": "1" }
+{ "qi": "+1" }
+{ "qi": "-1" }
+SCHEMA
+[
+  {
+    "mode": "NULLABLE",
+    "name": "qi",
+    "type": "INTEGER"
+  }
+]
+END
+
+# Support less common incantations of floats. The examples should match the
+# test cases in TestSchemaGenerator.test_float_matcher_valid() ideally.
+DATA
+{ "qf": "1.0" }
+{ "qf": "-1.0" }
+{ "qf": "+1.0" }
+{ "qf": "1." }
+{ "qf": "-1." }
+{ "qf": "+1." }
+{ "qf": ".1" }
+{ "qf": "-.1" }
+{ "qf": "+.1" }
+{ "qf": "1e1" }
+{ "qf": "-1e1" }
+{ "qf": "+1e1" }
+{ "qf": "1e-1" }
+{ "qf": "-1e-1" }
+{ "qf": "+1e-1" }
+{ "qf": "1e+1" }
+{ "qf": "-1e+1" }
+{ "qf": "+1e+1" }
+{ "qf": "2.e1" }
+{ "qf": "-2.e1" }
+{ "qf": "+2.e1" }
+{ "qf": "2.e-1" }
+{ "qf": "-2.e-1" }
+{ "qf": "+2.e-1" }
+{ "qf": "2.e+1" }
+{ "qf": "-2.e+1" }
+{ "qf": "+2.e+1" }
+{ "qf": "3.3e1" }
+{ "qf": "-3.3e1" }
+{ "qf": "+3.3e1" }
+{ "qf": "3.3e-1" }
+{ "qf": "-3.3e-1" }
+{ "qf": "+3.3e-1" }
+{ "qf": "3.3e+1" }
+{ "qf": "-3.3e+1" }
+{ "qf": "+3.3e+1" }
+{ "qf": ".4e1" }
+{ "qf": "-.4e1" }
+{ "qf": "+.4e1" }
+{ "qf": ".4e-1" }
+{ "qf": "-.4e-1" }
+{ "qf": "+.4e-1" }
+{ "qf": ".4e+1" }
+{ "qf": "-.4e+1" }
+{ "qf": "+.4e+1" }
+SCHEMA
+[
+  {
+    "mode": "NULLABLE",
+    "name": "qf",
+    "type": "FLOAT"
+  }
+]
+END
+
 # Integers in quoted strings that fit inside a signed 64-bit -> INTEGER
 # See https://github.com/bxparks/bigquery-schema-generator/issues/18.
 DATA

From 2d9bbf1e88c1ce4d194abcc71217fe1ecfc2aa4c Mon Sep 17 00:00:00 2001
From: Brian Park <brian@xparks.net>
Date: Mon, 7 Dec 2020 22:30:34 -0800
Subject: [PATCH 3/5] DEVELOPER.md: Update instructions to create a new release
 and push to PyPI

---
 DEVELOPER.md | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/DEVELOPER.md b/DEVELOPER.md
index 5b11351..dd1ec31 100644
--- a/DEVELOPER.md
+++ b/DEVELOPER.md
@@ -19,24 +19,25 @@ There are a lot of instructions on the web that uses
 those are deprecated. The tool that seems to work for me is
 [Twine](https://github.com/pypa/twine).
 
-[PyPI](https://pypi.python.org/pypi) does not support Markdown, so
-we use `pypandoc` and `pandoc` to convert Markdown to RST.
-`pypandoc` is a thin Python wrapper around `pandoc`.
+[PyPI](https://pypi.python.org/pypi) now supports Markdown so we no longer need
+to download `pypandoc` (Python package) and `pandoc` (apt package) to convert
+Markdown to RST.
 
 Install the following packages:
 ```
-$ sudo apt install pandoc
-$ sudo -H pip3 install setuptools wheel twine pypandoc
+$ sudo -H pip3 install setuptools wheel twine
 ```
 
 ### Steps
 
 1. Edit `setup.py` and increment the `version`.
 1. Push all changes to `develop` branch.
-1. Merge `develop` into `master` branch, and checkout the `master` branch.
+1. Create a GitHub pull request (PR) from `develop` into `master` branch.
+1. Merge the PR into `master`.
+1. Create a new Release in GitHub with the new tag label.
 1. Create the dist using `python3 setup.py sdist`.
-1. Upload to PyPI using `twine upload dist/*`.
-   (Need to enter my PyPI login creddentials).
+1. Upload to PyPI using `twine upload
+   dist/bigquery-schema-generator-{version}.tar.gz`.
+    * Enter my PyPI login credentials.
     * If `dist/` becomes too cluttered, we can remove the entire `dist/`
       directory and run `python3 setup.py sdist` again.
-1. Tag the `master` branch with the release on GitHub.

From 997c6f781bc7fbd902739261f629a1414d2e9938 Mon Sep 17 00:00:00 2001
From: Brian Park <brian@xparks.net>
Date: Mon, 7 Dec 2020 23:22:40 -0800
Subject: [PATCH 4/5] README.md: Add Table of Contents; add usage info for
 schema_map=existing_schema_map and input_format='dict'

---
 CHANGELOG.md |   7 +++
 README.md    | 175 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 138 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a82f520..2e574d7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 
 * Unreleased
+    * Add 'dict' as a third `input_format` when `SchemaGenerator` is used as a
+      library. This can be useful when the data has already been transformed
+      into a list of native Python `dict` objects (see #58, thanks to
+      ZiggerZZ@).
+    * Expand the pattern matchers for quoted integers and quoted floating point
+      numbers to be more cmopatible with the ones recognized by `bq load
+      --autodetect`.
 * 1.3 (2020-12-05)
     * Allow an existing schema file to be specified using
       `--existing_schema_path` flag, so that new data can be merged into it.
diff --git a/README.md b/README.md
index 5989187..652904b 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,41 @@ $ generate-schema < file.data.json > file.schema.json
 $ generate-schema --input_format csv < file.data.csv > file.schema.json
 ```
 
-Version: 1.3 (2020-12-05)
-
-Changelog: [CHANGELOG.md](CHANGELOG.md)
-
+**Version**: 1.3 (2020-12-05)
+
+**Changelog**: [CHANGELOG.md](CHANGELOG.md)
+
+**Table of Contents**
+
+* [Background](#Background)
+* [Installation](#Installation)
+    * [Ubuntu Linux](#UbuntuLinux)
+    * [MacOS](#MacOS)
+* [Usage](#Usage)
+    * [Command Line](#CommandLine)
+    * [Schema Output](#SchemaOutput)
+    * [Command Line Flag Options](#FlagOptions)
+        * [Help (`--help`)](#Help)
+        * [Input Format (`--input_format`)](#InputFormat)
+        * [Keep Nulls (`--keep_nulls`)](#KeepNulls)
+        * [Quoted Values Are Strings (`--quoted_values_are_strings`)](#QuotedValuesAreStrings)
+        * [Infer Mode (`--infer_mode`)](#InferMode)
+        * [Debugging Interval (`--debugging_interval`)](#DebuggingInterval)
+        * [Debugging Map (`--debugging_map`)](#DebuggingMap)
+        * [Sanitize Names (`--sanitize_names`)](#SanitizedNames)
+        * [Ignore Invalid Lines (`--ignore_invalid_lines`)](#IgnoreInvalidLines)
+        * [Existing Schema Path (`--existing_schema_path`)](#ExistingSchemaPath)
+    * [Using as a Library](#UsingAsLibrary)
+* [Schema Types](#SchemaTypes)
+    * [Supported Types](#SupportedTypes)
+    * [Type Inferrence](#TypeInferrence)
+* [Examples](#Examples)
+* [Benchmarks](#Benchmarks)
+* [System Requirements](#SystemRequirements)
+* [Authors](#Authors)
+* [License](#License)
+
+<a name="Background"></a>
 ## Background
 
 Data can be imported into [BigQuery](https://cloud.google.com/bigquery/) using
@@ -44,6 +75,7 @@ in JSON format on the STDOUT. This schema file can be fed back into the **bq
 load** tool to create a table that is more compatible with the data fields in
 the input dataset.
 
+<a name="Installation"></a>
 ## Installation
 
 **Prerequisite**: You need have Python 3.6 or higher.
@@ -87,6 +119,7 @@ The shell script `generate-schema` will be installed somewhere in your system,
 depending on how your Python environment is configured. See below for
 some notes for Ubuntu Linux and MacOS.
 
+<a name="UbuntuLinux"></a>
 ### Ubuntu Linux (18.04, 20.04)
 
 After running `pip3 install bigquery_schema_generator`, the `generate-schema`
@@ -97,6 +130,7 @@ script may be installed in one the following locations:
 * `$HOME/.local/bin/generate-schema`
 * `$HOME/.virtualenvs/{your_virtual_env}/bin/generate-schema`
 
+<a name="MacOS"></a>
 ### MacOS (10.14 Mojave)
 
 I don't use my Mac for software development these days, and I won't upgrade to
@@ -119,8 +153,12 @@ You can install Python3 using
 `generate-schema` script will probably be installed in `/usr/local/bin` but I'm
 not completely certain.
 
+<a name="Usage"></a>
 ## Usage
 
+<a name="CommandLine"></a>
+### Command Line
+
 The `generate_schema.py` script accepts a newline-delimited JSON or
 CSV data file on the STDIN. JSON input format has been tested extensively.
 CSV input format was added more recently (in v0.4) using the `--input_format
@@ -161,6 +199,7 @@ then you can invoke the Python script directly:
 $ ./generate_schema.py < file.data.json > file.schema.json
 ```
 
+<a name="SchemaOutput"></a>
 ### Using the Schema Output
 
 The resulting schema file can be given to the **bq load** command using the
@@ -226,11 +265,13 @@ $ bq show --schema mydataset.mytable | python3 -m json.tool
 file. An alternative is the [jq command](https://stedolan.github.io/jq/).)
 The resulting schema file should be identical to `file.schema.json`.
 
-### Flag Options
+<a name="FlagOptions"></a>
+### Command Line Flag Options
 
 The `generate_schema.py` script supports a handful of command line flags
 as shown by the `--help` flag below.
 
+<a name="Help"></a>
 #### Help (`--help`)
 
 Print the built-in help strings:
@@ -268,6 +309,7 @@ optional arguments:
                         <project_id>:<dataset>:<table_name>
 ```
 
+<a name="InputFormat"></a>
 #### Input Format (`--input_format`)
 
 Specifies the format of the input file, either `json` (default) or `csv`.
@@ -280,6 +322,7 @@ order, even if the column contains an empty value for every record.
 See [Issue #26](https://github.com/bxparks/bigquery-schema-generator/issues/26)
 for implementation details.
 
+<a name="KeepNulls"></a>
 #### Keep Nulls (`--keep_nulls`)
 
 Normally when the input data file contains a field which has a null, empty
@@ -327,6 +370,7 @@ INFO:root:Processed 1 lines
 ]
 ```
 
+<a name="QuotedValuesAreStrings"></a>
 #### Quoted Values Are Strings (`--quoted_values_are_strings`)
 
 By default, quoted values are inspected to determine if they can be interpreted
@@ -360,6 +404,7 @@ $ generate-schema --quoted_values_are_strings
 ]
 ```
 
+<a name="InferMode"></a>
 #### Infer Mode (`--infer_mode`)
 
 Set the schema `mode` of a field to `REQUIRED` instead of the default
@@ -379,6 +424,7 @@ either input_format, CSV or JSON.
 See [Issue #28](https://github.com/bxparks/bigquery-schema-generator/issues/28)
 for implementation details.
 
+<a name="DebuggingInterval"></a>
 #### Debugging Interval (`--debugging_interval`)
 
 By default, the `generate_schema.py` script prints a short progress message
@@ -389,6 +435,7 @@ every 1000 lines of input data. This interval can be changed using the
 $ generate-schema --debugging_interval 50 < file.data.json > file.schema.json
 ```
 
+<a name="DebuggingMap"></a>
 #### Debugging Map (`--debugging_map`)
 
 Instead of printing out the BigQuery schema, the `--debugging_map` prints out
@@ -400,6 +447,7 @@ flag is intended to be used for debugging.
 $ generate-schema --debugging_map < file.data.json > file.schema.json
 ```
 
+<a name="SanitizedNames"></a>
 #### Sanitize Names (`--sanitize_names`)
 
 BigQuery column names are [restricted to certain characters and
@@ -426,6 +474,7 @@ through the data files to cleanup the column names anyway. See
 [Issue #14](https://github.com/bxparks/bigquery-schema-generator/issues/14) and
 [Issue #33](https://github.com/bxparks/bigquery-schema-generator/issues/33).
 
+<a name="IgnoreInvalidLines"></a>
 #### Ignore Invalid Lines (`--ignore_invalid_lines`)
 
 By default, if an error is encountered on a particular line, processing stops
@@ -446,6 +495,7 @@ deduction logic will handle any missing or extra columns gracefully.
 Fixes
 [Issue #49](https://github.com/bxparks/bigquery-schema-generator/issues/49).
 
+<a name="ExistingSchemaPath"></a>
 #### Existing Schema Path (`--existing_schema_path`)
 
 There are cases where we would like to start from an existing BigQuery table
@@ -478,8 +528,72 @@ See discussion in
 [PR #57](https://github.com/bxparks/bigquery-schema-generator/pull/57) for
 more details.
 
+<a name="UsingAsLibrary"></a>
+### Using As a Library
+
+The `bigquery_schema_generator` module can be used as a library by an external
+Python client code by creating an instance of `SchemaGenerator` and calling the
+`run(input, output)` method:
+
+```python
+from bigquery_schema_generator.generate_schema import SchemaGenerator
+
+generator = SchemaGenerator(
+    input_format=input_format,
+    infer_mode=infer_mode,
+    keep_nulls=keep_nulls,
+    quoted_values_are_strings=quoted_values_are_strings,
+    debugging_interval=debugging_interval,
+    debugging_map=debugging_map,
+    sanitize_names=sanitize_names,
+    ignore_invalid_lines=ignore_invalid_lines,
+)
+generator.run(input_file=input_file, output_file=output_file)
+```
+
+If you need to process the generated schema programmatically, use the
+`deduce_schema()` method and process the resulting `schema_map` and `error_log`
+data structures like this:
+
+```python
+from bigquery_schema_generator.generate_schema import SchemaGenerator
+...
+generator = SchemaGenerator(
+  ...(same as above)...
+)
+
+schema_map, error_logs = generator.deduce_schema(input_data=input_data)
+
+# Print errors if desired.
+for error in error_logs:
+    logging.info("Problem on line %s: %s", error['line_number'], error['msg'])
+
+schema = generator.flatten_schema(schema_map)
+json.dump(schema, output_file, indent=2)
+```
+
+The `decude_schema()` now supports starting from an existing `schema_map`
+instead of starting from scratch. This is the internal version of the
+`--existing_schema_path` functionality.
+
+```python
+schema_map1, error_logs = generator.deduce_schema(input_data=data1)
+schema_map2, error_logs = generator.deduce_schema(
+    input_data=data2, schema_map=schema_map1
+)
+```
+
+When using the `SchemaGenerator` object directly, the `input_format` parameter
+supports `dict` as a third input format in addition to the `json` and `csv`
+formats. The `dict` input format tells `SchemaGenerator.deduce_schema()` to
+accept a list of Python dict objects as the `input_data`. This is useful if the
+input data (usually JSON) has already been read into memory and parsed from
+newline-delimited JSON into native Python dict objects.
+
+<a name="SchemaTypes"></a>
 ## Schema Types
 
+<a name="SupportedTypes"></a>
 ### Supported Types
 
 The `bq show --schema` command produces a JSON schema file that uses the
@@ -531,6 +645,7 @@ The following types are _not_ supported at all:
 * `BYTES`
 * `DATETIME` (unable to distinguish from `TIMESTAMP`)
 
+<a name="TypeInferrence"></a>
 ### Type Inferrence Rules
 
 The `generate-schema` script attempts to emulate the various type conversion and
@@ -572,6 +687,7 @@ compatibility rules implemented by **bq load**:
     * integers less than `-2^63` (-9223372036854775808)
     * (See [Issue #18](https://github.com/bxparks/bigquery-schema-generator/issues/18) for more details)
 
+<a name="Examples"></a>
 ## Examples
 
 Here is an example of a single JSON data record on the STDIN (the `^D` below
@@ -705,41 +821,7 @@ INFO:root:Processed 4 lines
 ]
 ```
 
-## Using As a Library
-
-The `bigquery_schema_generator` module can be used as a library by an external
-Python client code by creating an instance of `SchemaGenerator` and calling the
-`run(input, output)` method:
-
-```python
-from bigquery_schema_generator.generate_schema import SchemaGenerator
-
-generator = SchemaGenerator(
-    input_format=input_format,
-    infer_mode=infer_mode,
-    keep_nulls=keep_nulls,
-    quoted_values_are_strings=quoted_values_are_strings,
-    debugging_interval=debugging_interval,
-    debugging_map=debugging_map)
-generator.run(input_file, output_file)
-```
-
-If you need to process the generated schema programmatically, use the
-`deduce_schema()` method and process the resulting `schema_map` and `error_log`
-data structures like this:
-
-```python
-from bigquery_schema_generator.generate_schema import SchemaGenerator
-...
-schema_map, error_logs = generator.deduce_schema(input_file)
-
-for error in error_logs:
-    logging.info("Problem on line %s: %s", error['line'], error['msg'])
-
-schema = generator.flatten_schema(schema_map)
-json.dump(schema, output_file, indent=2)
-```
-
+<a name="Benchmarks"></a>
 ## Benchmarks
 
 I wrote the `bigquery_schema_generator/anonymize.py` script to create an
@@ -759,6 +841,7 @@ $ bigquery_schema_generator/generate_schema.py < anon1.data.json \
 took 67s on a Dell Precision M4700 laptop with an Intel Core i7-3840QM CPU @
 2.80GHz, 32GB of RAM, Ubuntu Linux 18.04, Python 3.6.7.
 
+<a name="SystemRequirements"></a>
 ## System Requirements
 
 This project was initially developed on Ubuntu 17.04 using Python 3.5.3, but it
@@ -776,6 +859,12 @@ I have tested it on:
 The GitHub Actions continuous integration pipeline validates on Python 3.6, 3.7
 and 3.8.
 
+<a name="License"></a>
+## License
+
+Apache License 2.0
+
+<a name="Authors"></a>
 ## Authors
 
 * Created by Brian T. Park (brian@xparks.net).
@@ -793,8 +882,6 @@ and 3.8.
   (abroglesc@).
 * Allow an existing schema file to be specified using `--existing_schema_path`,
   by Austin Brogle (abroglesc@) and Bozo Dragojevic (bozzzzo@).
+* Allow `SchemaGenerator.deduce_schema()` to accept a list of native Python
+  `dict` objects, by Zigfrid Zvezdin (ZiggerZZ@).
 
-
-## License
-
-Apache License 2.0

From cf1c1ad6dbd155852943cb4d1051cf6b9ea66a9a Mon Sep 17 00:00:00 2001
From: Brian Park <brian@xparks.net>
Date: Wed, 9 Dec 2020 21:23:21 -0800
Subject: [PATCH 5/5] Bump version to 1.4

---
 CHANGELOG.md                         | 6 +++++-
 README.md                            | 6 +++---
 bigquery_schema_generator/version.py | 2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e574d7..f0f16c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,13 +1,17 @@
 # Changelog
 
 * Unreleased
+* 1.4 (2020-12-09)
     * Add 'dict' as a third `input_format` when `SchemaGenerator` is used as a
       library. This can be useful when the data has already been transformed
       into a list of native Python `dict` objects (see #58, thanks to
       ZiggerZZ@).
     * Expand the pattern matchers for quoted integers and quoted floating point
-      numbers to be more cmopatible with the ones recognized by `bq load
+      numbers to be more compatible with the patterns recognized by `bq load
       --autodetect`.
+    * Add Table of Contents to README.md. Add usage info for the
+      `schema_map=existing_schema_map` parameter of `deduce_schema()` and the
+      `input_format='dict'` parameter of the `SchemaGenerator()` constructor.
 * 1.3 (2020-12-05)
     * Allow an existing schema file to be specified using
       `--existing_schema_path` flag, so that new data can be merged into it.
diff --git a/README.md b/README.md
index 652904b..2f01807 100644
--- a/README.md
+++ b/README.md
@@ -12,11 +12,11 @@ $ generate-schema < file.data.json > file.schema.json
 $ generate-schema --input_format csv < file.data.csv > file.schema.json
 ```
 
-**Version**: 1.3 (2020-12-05)
+**Version**: 1.4 (2020-12-09)
 
 **Changelog**: [CHANGELOG.md](CHANGELOG.md)
 
-**Table of Contents**
+## Table of Contents
 
 * [Background](#Background)
 * [Installation](#Installation)
@@ -572,7 +572,7 @@ schema = generator.flatten_schema(schema_map)
 json.dump(schema, output_file, indent=2)
 ```
 
-The `decude_schema()` now supports starting from an existing `schema_map`
+The `deduce_schema()` now supports starting from an existing `schema_map`
 instead of starting from scratch. This is the internal version of the
 `--existing_schema_path` functionality.
 
diff --git a/bigquery_schema_generator/version.py b/bigquery_schema_generator/version.py
index 6f4fa58..0f66308 100644
--- a/bigquery_schema_generator/version.py
+++ b/bigquery_schema_generator/version.py
@@ -1 +1 @@
-__version__ = '1.3'
+__version__ = '1.4'