fix: Improve docs on installation and plugins.
Change the names of the scan types from shallow and deep to metadata and data.
vrajat committed Dec 19, 2021
1 parent bc142a4 commit b8fe3f9
Showing 10 changed files with 75 additions and 42 deletions.
67 changes: 50 additions & 17 deletions README.md
@@ -16,8 +16,9 @@ and tracks critical data. PIICatcher uses two techniques to detect PII:

Read more in the [blog post](https://tokern.io/blog/scan-pii-data-warehouse/) on both these strategies.

PIICatcher is *battery-included* with a growing set of
regular expressions for scanning column names as well as data. It also include [Spacy](https://spacy.io).
PIICatcher is *batteries-included* with a growing set of plugins to scan column metadata as well as data.
For example, [piicatcher_spacy](https://github.com/tokern/piicatcher_spacy) uses [Spacy](https://spacy.io) to detect
PII in column data.

PIICatcher supports incremental scans and will only scan new or not-yet-scanned columns. Incremental scans make it easy to
schedule recurring scans. It also provides powerful options to include or exclude schemas and tables to manage compute resources.
@@ -37,24 +37,32 @@ and tables with PII and the type of PII tags.

PIICatcher is available as a docker image or command-line application.

### Docker (preferred)
### Installation

Docker:

alias piicatcher='docker run -v ${HOME}/.config/tokern:/config -u $(id -u ${USER}):$(id -g ${USER}) -it --add-host=host.docker.internal:host-gateway tokern/piicatcher:latest'
piicatcher --help
piicatcher scan sqlite --name sqldb --path '/db/sqldb'

### Command-line
To install, use pip:

Pypi:
# Install development libraries for compiling dependencies.
# On Amazon Linux
sudo yum install mysql-devel gcc gcc-devel python-devel

python3 -m venv .env
source .env/bin/activate
pip install piicatcher

# Install Spacy English package
python -m spacy download en_core_web_sm

# Install Spacy plugin
pip install piicatcher_spacy


### Command Line Usage
# add a sqlite source
piicatcher catalog add_sqlite --name sqldb --path '/db/sqldb'

# run piicatcher on a sqlite db and print report to console
piicatcher scan sqlite --name sqldb --path '/db/sqldb'
piicatcher detect --source-name sqldb
╭─────────────┬─────────────┬─────────────┬─────────────╮
│ schema │ table │ column │ has_pii │
├─────────────┼─────────────┼─────────────┼─────────────┤
@@ -67,16 +76,22 @@ To install use pip:
╰─────────────┴─────────────┴─────────────┴─────────────╯


### API
from piicatcher.api import scan_postgresql
### API Usage

from dbcat.api import open_catalog, add_postgresql_source
from piicatcher.api import scan_database

# PIICatcher uses a catalog to store its state.
# The easiest option is to use a sqlite memory database.
# For production usage, check https://tokern.io/docs/data-catalog
catalog_params={'catalog_path': ':memory:'}
output = scan_postrgresql(catalog_params=catalog_params, name="pg_db", uri="127.0.0.1",
username="piiuser", password="p11secret", database="piidb",
include_table_regex=["sample"])
catalog = open_catalog(app_dir='/tmp/.config/piicatcher', path=':memory:', secret='my_secret')

with catalog.managed_session:
# Add a postgresql source
source = add_postgresql_source(catalog=catalog, name="pg_db", uri="127.0.0.1", username="piiuser",
password="p11secret", database="piidb")
output = scan_database(catalog=catalog, source=source)

print(output)

# Example Output
@@ -90,6 +105,24 @@ To install use pip:
['public', 'sample', 'email', 'PiiTypes.EMAIL']]
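
The example above runs the default metadata scan. As a rough sketch (reusing the `catalog` and `source` objects created above; the regex value and the choice of `incremental=False` are only illustrative), a data scan restricted to matching tables could look like this:

    from piicatcher.api import ScanTypeEnum, scan_database

    with catalog.managed_session:
        # Scan column data instead of only column metadata, limit the scan to
        # tables matching the regex, and re-scan previously scanned columns.
        output = scan_database(
            catalog=catalog,
            source=source,
            scan_type=ScanTypeEnum.data,
            include_table_regex=["sample"],
            incremental=False,
        )

    print(output)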


## Plugins

PIICatcher can be extended by creating new detectors. PIICatcher supports two scanning techniques:
* Metadata
* Data

Plugins can be created for either technique. They are then registered either through an API or via
[Python Entry Points](https://packaging.python.org/en/latest/specifications/entry-points/).

To create a new detector, simply create a new class that inherits from [`MetadataDetector`](https://github.com/tokern/piicatcher/blob/master/piicatcher/detectors.py)
or [`DatumDetector`](https://github.com/tokern/piicatcher/blob/master/piicatcher/detectors.py).

In the new class, define a `detect` function that returns a [`PiiType`](https://github.com/tokern/dbcat/blob/main/dbcat/catalog/pii_types.py).
If you are detecting a new PII type, you can define a new class that inherits from `PiiType`.
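
As a rough sketch (not the exact plugin API: the `CatColumn` import path, the `Phone` type's fields, and the registration step are assumptions for illustration), a metadata detector that flags columns whose names contain "phone" could look like this:

    from typing import Optional

    from dbcat.catalog.models import CatColumn  # import path assumed
    from dbcat.catalog.pii_types import PiiType

    from piicatcher.detectors import MetadataDetector


    class Phone(PiiType):
        """Hypothetical new PII type; the field names are assumptions."""
        name = "Phone"
        type = "phone"


    class PhoneNameDetector(MetadataDetector):
        """Flag columns whose name suggests they store phone numbers."""

        def detect(self, column: CatColumn) -> Optional[PiiType]:
            # `column.name` is assumed to hold the column name from the catalog.
            if "phone" in column.name.lower():
                return Phone()
            return None

    # Register the detector so PIICatcher picks it up, either through the
    # registration API or a Python entry point (see the plugin docs below).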

For detailed documentation, check [piicatcher plugin docs](https://tokern.io/docs/piicatcher/detectors/plugins).


## Supported Databases

PIICatcher supports the following databases:
2 changes: 1 addition & 1 deletion piicatcher/__init__.py
@@ -1,5 +1,5 @@
# flake8: noqa
__version__ = "0.18.2"
__version__ = "0.19.1"

from dbcat.catalog.pii_types import PiiType

14 changes: 7 additions & 7 deletions piicatcher/api.py
@@ -10,14 +10,14 @@
from piicatcher.detectors import DatumDetector, MetadataDetector, detector_registry
from piicatcher.generators import SMALL_TABLE_MAX, column_generator, data_generator
from piicatcher.output import output_dict, output_tabular
from piicatcher.scanner import deep_scan, shallow_scan
from piicatcher.scanner import data_scan, metadata_scan

LOGGER = logging.getLogger(__name__)


class ScanTypeEnum(str, Enum):
shallow = "shallow"
deep = "deep"
metadata = "metadata"
data = "data"


class OutputFormat(str, Enum):
@@ -28,7 +28,7 @@ class OutputFormat(str, Enum):
def scan_database(
catalog: Catalog,
source: CatSource,
scan_type: ScanTypeEnum = ScanTypeEnum.shallow,
scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
incremental: bool = True,
output_format: OutputFormat = OutputFormat.tabular,
list_all: bool = False,
@@ -88,14 +88,14 @@ def scan_database(
exclude_table_regex=exclude_table_regex,
)

if scan_type == ScanTypeEnum.shallow:
if scan_type == ScanTypeEnum.metadata:
detector_list = [
detector()
for detector in detectors.detector_registry.get_all().values()
if issubclass(detector, MetadataDetector)
]

shallow_scan(
metadata_scan(
catalog=catalog,
detectors=detector_list,
work_generator=column_generator(
@@ -124,7 +124,7 @@
if issubclass(detector, DatumDetector)
]

deep_scan(
data_scan(
catalog=catalog,
detectors=detector_list,
work_generator=column_generator(
2 changes: 1 addition & 1 deletion piicatcher/command_line.py
@@ -157,7 +157,7 @@ def cli(
def detect(
source_name: str = typer.Option(..., help="Name of database to scan."),
scan_type: ScanTypeEnum = typer.Option(
ScanTypeEnum.shallow,
ScanTypeEnum.metadata,
help="Choose deep(scan data) or shallow(scan column names only)",
),
incremental: bool = typer.Option(
4 changes: 2 additions & 2 deletions piicatcher/scanner.py
@@ -75,7 +75,7 @@ def detect(self, column: CatColumn) -> Optional[PiiType]:
return None


def shallow_scan(
def metadata_scan(
catalog: Catalog,
detectors: List[MetadataDetector],
work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
@@ -124,7 +124,7 @@ def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
return None


def deep_scan(
def data_scan(
catalog: Catalog,
detectors: List[DatumDetector],
work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "piicatcher"
version = "0.19.0"
version = "0.19.1"
description = "Find PII data in databases"
authors = ["Tokern <info@tokern.io>"]
license = "Apache 2.0"
@@ -28,7 +28,7 @@ pyyaml = "*"
click = "*"
python-json-logger = "^2.0.2"
commonregex = "^1.5"
dbcat = "^0.11.0"
dbcat = "^0.12.0"
typer = "^0.4.0"
tabulate = "^0.8.9"
dataclasses = {version = ">=0.6", markers="python_version >= '3.6' and python_version < '3.7'"}
2 changes: 1 addition & 1 deletion tests/test_api.py
@@ -49,7 +49,7 @@ def test_scan_database_deep(load_sample_data_and_pull):
catalog=catalog,
source=source,
include_table_regex=["sample"],
scan_type=ScanTypeEnum.deep,
scan_type=ScanTypeEnum.data,
)

schemata = catalog.search_schema(source_like=source.name, schema_like="%")
6 changes: 3 additions & 3 deletions tests/test_cli.py
@@ -57,7 +57,7 @@ def test_include_exclude(mocker, temp_sqlite_path, args):
piicatcher.command_line.scan_database.assert_called_once_with(
catalog=ANY,
source=ANY,
scan_type=ScanTypeEnum.shallow,
scan_type=ScanTypeEnum.metadata,
incremental=True,
output_format=OutputFormat.tabular,
list_all=False,
@@ -105,7 +105,7 @@ def test_multiple_include_exclude(mocker, temp_sqlite_path, args):
piicatcher.command_line.scan_database.assert_called_once_with(
catalog=ANY,
source=ANY,
scan_type=ScanTypeEnum.shallow,
scan_type=ScanTypeEnum.metadata,
incremental=True,
output_format=OutputFormat.tabular,
list_all=False,
@@ -139,7 +139,7 @@ def test_sample_size(mocker, temp_sqlite_path, args):
piicatcher.command_line.scan_database.assert_called_once_with(
catalog=ANY,
source=ANY,
scan_type=ScanTypeEnum.shallow,
scan_type=ScanTypeEnum.metadata,
incremental=True,
output_format=OutputFormat.tabular,
list_all=False,
8 changes: 4 additions & 4 deletions tests/test_scanner.py
@@ -19,8 +19,8 @@
from piicatcher.scanner import (
ColumnNameRegexDetector,
DatumRegexDetector,
deep_scan,
shallow_scan,
data_scan,
metadata_scan,
)


@@ -139,7 +139,7 @@ def test_shallow_scan(load_data_and_pull):
catalog, source_id = load_data_and_pull
with catalog.managed_session:
source = catalog.get_source_by_id(source_id)
shallow_scan(
metadata_scan(
catalog=catalog,
detectors=[ColumnNameRegexDetector()],
work_generator=column_generator(catalog=catalog, source=source),
@@ -168,7 +168,7 @@ def test_deep_scan(load_data_and_pull):
catalog, source_id = load_data_and_pull
with catalog.managed_session:
source = catalog.get_source_by_id(source_id)
deep_scan(
data_scan(
catalog=catalog,
detectors=[DatumRegexDetector()],
work_generator=column_generator(catalog=catalog, source=source),
