From b8fe3f9e7c34804d2a5bf5f86c1a9fdeb2672ac3 Mon Sep 17 00:00:00 2001
From: Rajat Venkatesh
Date: Sun, 19 Dec 2021 12:09:42 +0530
Subject: [PATCH] fix: Improve docs on installation and plugins. Change name of
 scan types to metadata and data from shallow and deep.

---
 README.md                  | 67 ++++++++++++++++++++++++++++----------
 piicatcher/__init__.py     |  2 +-
 piicatcher/api.py          | 14 ++++----
 piicatcher/command_line.py |  2 +-
 piicatcher/scanner.py      |  4 +--
 poetry.lock                |  8 ++---
 pyproject.toml             |  4 +--
 tests/test_api.py          |  2 +-
 tests/test_cli.py          |  6 ++--
 tests/test_scanner.py      |  8 ++---
 10 files changed, 75 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index 17e2dc9..89c1522 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,9 @@ and tracks critical data. PIICatcher uses two techniques to detect PII:
 
 Read more in the [blog post](https://tokern.io/blog/scan-pii-data-warehouse/) on both these strategies.
 
-PIICatcher is *battery-included* with a growing set of
-regular expressions for scanning column names as well as data. It also include [Spacy](https://spacy.io).
+PIICatcher is *batteries-included* with a growing set of plugins to scan column metadata as well as data.
+For example, [piicatcher_spacy](https://github.com/tokern/piicatcher_spacy) uses [Spacy](https://spacy.io) to detect
+PII in column data.
 
 PIICatcher supports incremental scans and will only scan new or not-yet scanned columns. Incremental scans
 allow easy scheduling of scans. It also provides powerful options to include or exclude schema and tables to manage compute resources.
@@ -37,24 +38,32 @@ and tables with PII and the type of PII tags.
 
 PIICatcher is available as a docker image or command-line application.
 
-### Docker (preferred)
+### Installation
+
+Docker:
 
     alias piicatcher='docker run -v ${HOME}/.config/tokern:/config -u $(id -u ${USER}):$(id -g ${USER}) -it --add-host=host.docker.internal:host-gateway tokern/piicatcher:latest'
 
-    piicatcher --help
-    piicatcher scan sqlite --name sqldb --path '/db/sqldb'
 
-### Command-line
-To install use pip:
+
+PyPI:
+    # Install development libraries for compiling dependencies.
+    # On Amazon Linux
+    sudo yum install mysql-devel gcc gcc-devel python-devel
 
     python3 -m venv .env
    source .env/bin/activate
    pip install piicatcher
 
-    # Install Spacy English package
-    python -m spacy download en_core_web_sm
-
+    # Install Spacy plugin
+    pip install piicatcher_spacy
+
+
+### Command Line Usage
+
+    # add a sqlite source
+    piicatcher catalog add_sqlite --name sqldb --path '/db/sqldb'
+
     # run piicatcher on a sqlite db and print report to console
-    piicatcher scan sqlite --name sqldb --path '/db/sqldb'
+    piicatcher detect --source-name sqldb
 
     ╭─────────────┬─────────────┬─────────────┬─────────────╮
    │ schema      │ table       │ column      │ has_pii     │
    ├─────────────┼─────────────┼─────────────┼─────────────┤
@@ -67,16 +76,22 @@
     ╰─────────────┴─────────────┴─────────────┴─────────────╯
 
-### API
-    from piicatcher.api import scan_postgresql
+### API Usage
+
+    from dbcat.api import open_catalog, add_postgresql_source
+    from piicatcher.api import scan_database
 
     # PIICatcher uses a catalog to store its state.
    # The easiest option is to use a sqlite memory database.
    # For production usage check, https://tokern.io/docs/data-catalog
-    catalog_params={'catalog_path': ':memory:'}
-    output = scan_postrgresql(catalog_params=catalog_params, name="pg_db", uri="127.0.0.1",
-                              username="piiuser", password="p11secret", database="piidb",
-                              include_table_regex=["sample"])
+    catalog = open_catalog(app_dir='/tmp/.config/piicatcher', path=':memory:', secret='my_secret')
+
+    with catalog.managed_session:
+        # Add a postgresql source
+        source = add_postgresql_source(catalog=catalog, name="pg_db", uri="127.0.0.1", username="piiuser",
+                                       password="p11secret", database="piidb")
+        output = scan_database(catalog=catalog, source=source)
+    print(output)
 
    # Example Output
 
    ['public', 'sample', 'email', 'PiiTypes.EMAIL']]
@@ -90,6 +105,24 @@
 
 
+## Plugins
+
+PIICatcher can be extended by creating new detectors. PIICatcher supports two scanning techniques:
+* Metadata
+* Data
+
+Plugins can be created for either of these two techniques. Plugins are then registered using an API or using
+[Python Entry Points](https://packaging.python.org/en/latest/specifications/entry-points/).
+
+To create a new detector, simply create a new class that inherits from [`MetadataDetector`](https://github.com/tokern/piicatcher/blob/master/piicatcher/detectors.py)
+or [`DatumDetector`](https://github.com/tokern/piicatcher/blob/master/piicatcher/detectors.py).
+
+In the new class, define a function `detect` that returns a [`PiiType`](https://github.com/tokern/dbcat/blob/main/dbcat/catalog/pii_types.py).
+If you are detecting a new PII type, then you can define a new class that inherits from `PiiType`.
+
+For detailed documentation, check the [piicatcher plugin docs](https://tokern.io/docs/piicatcher/detectors/plugins).
+
+
 ## Supported Databases
 
 PIICatcher supports the following databases:
diff --git a/piicatcher/__init__.py b/piicatcher/__init__.py
index f423bf9..de172c1 100644
--- a/piicatcher/__init__.py
+++ b/piicatcher/__init__.py
@@ -1,5 +1,5 @@
 # flake8: noqa
-__version__ = "0.18.2"
+__version__ = "0.19.1"
 
 from dbcat.catalog.pii_types import PiiType
 
diff --git a/piicatcher/api.py b/piicatcher/api.py
index db009fa..d86a3d1 100644
--- a/piicatcher/api.py
+++ b/piicatcher/api.py
@@ -10,14 +10,14 @@
 from piicatcher.detectors import DatumDetector, MetadataDetector, detector_registry
 from piicatcher.generators import SMALL_TABLE_MAX, column_generator, data_generator
 from piicatcher.output import output_dict, output_tabular
-from piicatcher.scanner import deep_scan, shallow_scan
+from piicatcher.scanner import data_scan, metadata_scan
 
 LOGGER = logging.getLogger(__name__)
 
 
 class ScanTypeEnum(str, Enum):
-    shallow = "shallow"
-    deep = "deep"
+    metadata = "metadata"
+    data = "data"
 
 
 class OutputFormat(str, Enum):
@@ -28,7 +28,7 @@ class OutputFormat(str, Enum):
 def scan_database(
     catalog: Catalog,
     source: CatSource,
-    scan_type: ScanTypeEnum = ScanTypeEnum.shallow,
+    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
     incremental: bool = True,
     output_format: OutputFormat = OutputFormat.tabular,
     list_all: bool = False,
@@ -88,14 +88,14 @@ def scan_database(
         exclude_table_regex=exclude_table_regex,
     )
 
-    if scan_type == ScanTypeEnum.shallow:
+    if scan_type == ScanTypeEnum.metadata:
         detector_list = [
             detector()
             for detector in detectors.detector_registry.get_all().values()
             if issubclass(detector, MetadataDetector)
         ]
 
-        shallow_scan(
+        metadata_scan(
             catalog=catalog,
             detectors=detector_list,
             work_generator=column_generator(
@@ -124,7 +124,7 @@ def scan_database(
             if issubclass(detector, DatumDetector)
         ]
 
-        deep_scan(
+        data_scan(
             catalog=catalog,
             detectors=detector_list,
             work_generator=column_generator(
diff --git a/piicatcher/command_line.py b/piicatcher/command_line.py
index b25368f..16b5b60 100644
--- a/piicatcher/command_line.py
+++ b/piicatcher/command_line.py
@@ -157,7 +157,7 @@ def cli(
 def detect(
     source_name: str = typer.Option(..., help="Name of database to scan."),
     scan_type: ScanTypeEnum = typer.Option(
-        ScanTypeEnum.shallow,
-        help="Choose deep(scan data) or shallow(scan column names only)",
+        ScanTypeEnum.metadata,
+        help="Choose data (scan data) or metadata (scan column names only)",
     ),
     incremental: bool = typer.Option(
diff --git a/piicatcher/scanner.py b/piicatcher/scanner.py
index e517952..7862521 100644
--- a/piicatcher/scanner.py
+++ b/piicatcher/scanner.py
@@ -75,7 +75,7 @@ def detect(self, column: CatColumn) -> Optional[PiiType]:
         return None
 
 
-def shallow_scan(
+def metadata_scan(
     catalog: Catalog,
     detectors: List[MetadataDetector],
     work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
@@ -124,7 +124,7 @@ def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
         return None
 
 
-def deep_scan(
+def data_scan(
     catalog: Catalog,
     detectors: List[DatumDetector],
     work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
diff --git a/poetry.lock b/poetry.lock
index 93018a3..f681908 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -560,7 +560,7 @@ python-versions = ">=3.6, <3.7"
 
 [[package]]
 name = "dbcat"
-version = "0.11.0"
+version = "0.12.0"
 description = "Tokern Data Catalog"
 category = "main"
 optional = false
@@ -2734,7 +2734,7 @@ datahub = ["acryl-datahub", "great-expectations"]
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.6.1,<3.9"
-content-hash = "e9afde071a09d7ccfcee8fe85d12cea35e04769fe95730ae732615d445b9953f"
+content-hash = "44f4364e7e87580a7dd0848f86b1fc1d8762ef946f61af1947a99bd0fcdff9a1"
 
 [metadata.files]
 acryl-datahub = [
@@ -3018,8 +3018,8 @@ dataclasses = [
     {file = "dataclasses-0.8.tar.gz", hash = "sha256:8479067f342acf957dc82ec415d355ab5edb7e7646b90dc6e2fd1d96ad084c97"},
 ]
 dbcat = [
-    {file = "dbcat-0.11.0-py3-none-any.whl", hash = "sha256:25d18df96f899ef45d4b1c93931e04e3f34e70c445f8032760f21fce23274fe4"},
-    {file = "dbcat-0.11.0.tar.gz", hash = "sha256:270fc77f04a21fb53194d11ffa1bae922a16480288ff5a9cf45bbf926b3818cf"},
+    {file = "dbcat-0.12.0-py3-none-any.whl", hash = "sha256:1cba6423929e7953cfbeb9096287cf1f611a4872b4ccbad57871f59a82284b06"},
+    {file = "dbcat-0.12.0.tar.gz", hash = "sha256:cb9f247d14c7c233443d81820323cf27677b5e7b14d5b7e47d7c1d922a221f14"},
 ]
 decopatch = [
     {file = "decopatch-1.4.8-py2.py3-none-any.whl", hash = "sha256:29a74d5d753423b188d5b537532da4f4b88e33ddccb95a8a20a5eff5b13265d4"},
diff --git a/pyproject.toml b/pyproject.toml
index 0f8cf48..ec44620 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "piicatcher"
-version = "0.19.0"
+version = "0.19.1"
 description = "Find PII data in databases"
 authors = ["Tokern "]
 license = "Apache 2.0"
@@ -28,7 +28,7 @@ pyyaml = "*"
 click = "*"
 python-json-logger = "^2.0.2"
 commonregex = "^1.5"
-dbcat = "^0.11.0"
+dbcat = "^0.12.0"
 typer = "^0.4.0"
 tabulate = "^0.8.9"
 dataclasses = {version = ">=0.6", markers="python_version >= '3.6' and python_version < '3.7'"}
diff --git a/tests/test_api.py b/tests/test_api.py
index 69b2c9c..51c91d4 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -49,7 +49,7 @@ def test_scan_database_deep(load_sample_data_and_pull):
         catalog=catalog,
         source=source,
         include_table_regex=["sample"],
-        scan_type=ScanTypeEnum.deep,
+        scan_type=ScanTypeEnum.data,
     )
 
     schemata = catalog.search_schema(source_like=source.name, schema_like="%")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e67884a..51c0dcf 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -57,7 +57,7 @@ def test_include_exclude(mocker, temp_sqlite_path, args):
     piicatcher.command_line.scan_database.assert_called_once_with(
         catalog=ANY,
         source=ANY,
-        scan_type=ScanTypeEnum.shallow,
+        scan_type=ScanTypeEnum.metadata,
         incremental=True,
         output_format=OutputFormat.tabular,
         list_all=False,
@@ -105,7 +105,7 @@ def test_multiple_include_exclude(mocker, temp_sqlite_path, args):
     piicatcher.command_line.scan_database.assert_called_once_with(
         catalog=ANY,
         source=ANY,
-        scan_type=ScanTypeEnum.shallow,
+        scan_type=ScanTypeEnum.metadata,
         incremental=True,
         output_format=OutputFormat.tabular,
         list_all=False,
@@ -139,7 +139,7 @@ def test_sample_size(mocker, temp_sqlite_path, args):
     piicatcher.command_line.scan_database.assert_called_once_with(
         catalog=ANY,
         source=ANY,
-        scan_type=ScanTypeEnum.shallow,
+        scan_type=ScanTypeEnum.metadata,
         incremental=True,
         output_format=OutputFormat.tabular,
         list_all=False,
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
index bfa1abb..240c2c9 100644
--- a/tests/test_scanner.py
+++ b/tests/test_scanner.py
@@ -19,8 +19,8 @@ from piicatcher.scanner import (
     ColumnNameRegexDetector,
     DatumRegexDetector,
-    deep_scan,
-    shallow_scan,
+    data_scan,
+    metadata_scan,
 )
 
 
@@ -139,7 +139,7 @@ def test_shallow_scan(load_data_and_pull):
     catalog, source_id = load_data_and_pull
     with catalog.managed_session:
         source = catalog.get_source_by_id(source_id)
-        shallow_scan(
+        metadata_scan(
             catalog=catalog,
             detectors=[ColumnNameRegexDetector()],
             work_generator=column_generator(catalog=catalog, source=source),
@@ -168,7 +168,7 @@ def test_deep_scan(load_data_and_pull):
     catalog, source_id = load_data_and_pull
     with catalog.managed_session:
         source = catalog.get_source_by_id(source_id)
-        deep_scan(
+        data_scan(
             catalog=catalog,
             detectors=[DatumRegexDetector()],
             work_generator=column_generator(catalog=catalog, source=source),
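
The new "Plugins" section in the README describes the detector interface in prose only. Below is a minimal sketch of what a custom metadata detector could look like, based on the `detect()` signature shown in `piicatcher/scanner.py` in this patch; `MetadataDetector` comes from `piicatcher.detectors` and `PiiType` from `dbcat.catalog.pii_types`, both of which appear in the patch. The `EmployeeId` type, the `name` attributes, and the regex are illustrative only, and the exact fields required by `PiiType` should be verified against dbcat's `pii_types.py`.

    import re
    from typing import Optional

    from dbcat.catalog.pii_types import PiiType
    from piicatcher.detectors import MetadataDetector


    class EmployeeId(PiiType):
        # Hypothetical new PII type; the README notes that new types can
        # subclass PiiType. Verify required fields in dbcat's pii_types.py.
        name = "employee_id"


    class EmployeeIdDetector(MetadataDetector):
        """Flag columns whose name looks like an employee identifier."""

        name = "employee_id_detector"  # assumed: detectors are looked up by name

        def detect(self, column) -> Optional[PiiType]:
            # `column` is a dbcat CatColumn; only its name is inspected here.
            if re.search(r"employee.*(id|num)", column.name, re.IGNORECASE):
                return EmployeeId()
            return None

Registering the detector, via the detector registry API or a Python entry point as the README mentions, is left out here because the exact mechanism is documented in the plugin docs rather than in this patch.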