From b8fe3f9e7c34804d2a5bf5f86c1a9fdeb2672ac3 Mon Sep 17 00:00:00 2001
From: Rajat Venkatesh
Date: Sun, 19 Dec 2021 12:09:42 +0530
Subject: [PATCH] fix: Improve docs on installation and plugins. Change name of
 scan types to metadata and data from shallow and deep.

---
 README.md                  | 67 ++++++++++++++++++++++++++++----------
 piicatcher/__init__.py     |  2 +-
 piicatcher/api.py          | 14 ++++----
 piicatcher/command_line.py |  2 +-
 piicatcher/scanner.py      |  4 +--
 poetry.lock                |  8 ++---
 pyproject.toml             |  4 +--
 tests/test_api.py          |  2 +-
 tests/test_cli.py          |  6 ++--
 tests/test_scanner.py      |  8 ++---
 10 files changed, 75 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index 17e2dc9..89c1522 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,9 @@ and tracks critical data. PIICatcher uses two techniques to detect PII:
 
 Read more in the [blog post](https://tokern.io/blog/scan-pii-data-warehouse/) on both these strategies.
 
-PIICatcher is *battery-included* with a growing set of
-regular expressions for scanning column names as well as data. It also include [Spacy](https://spacy.io).
+PIICatcher is *batteries-included* with a growing set of plugins to scan column metadata as well as data.
+For example, [piicatcher_spacy](https://github.com/tokern/piicatcher_spacy) uses [Spacy](https://spacy.io) to detect
+PII in column data.
 
 PIICatcher supports incremental scans and will only scan new or not-yet scanned columns. Incremental scans
 allow easy scheduling of scans. It also provides powerful options to include or exclude schema and tables to manage compute resources.
@@ -37,24 +38,32 @@ and tables with PII and the type of PII tags.
 
 PIICatcher is available as a docker image or command-line application.
 
-### Docker (preferred)
+### Installation
+
+Docker:
 
     alias piicatcher='docker run -v ${HOME}/.config/tokern:/config -u $(id -u ${USER}):$(id -g ${USER}) -it --add-host=host.docker.internal:host-gateway tokern/piicatcher:latest'
 
-    piicatcher --help
-    piicatcher scan sqlite --name sqldb --path '/db/sqldb'
 
-### Command-line
-To install use pip:
+
+PyPI:
+    # Install development libraries for compiling dependencies.
+    # On Amazon Linux
+    sudo yum install mysql-devel gcc gcc-devel python-devel
 
     python3 -m venv .env
    source .env/bin/activate
    pip install piicatcher
 
-    # Install Spacy English package
-    python -m spacy download en_core_web_sm
-
+    # Install Spacy plugin
+    pip install piicatcher_spacy
+
+
+### Command Line Usage
+
+    # add a sqlite source
+    piicatcher catalog add_sqlite --name sqldb --path '/db/sqldb'
+
     # run piicatcher on a sqlite db and print report to console
-    piicatcher scan sqlite --name sqldb --path '/db/sqldb'
+    piicatcher detect --source-name sqldb
 
     ╭─────────────┬─────────────┬─────────────┬─────────────╮
    │ schema      │ table       │ column      │ has_pii     │
    ├─────────────┼─────────────┼─────────────┼─────────────┤
@@ -67,16 +76,22 @@
     ╰─────────────┴─────────────┴─────────────┴─────────────╯
 
-### API
-    from piicatcher.api import scan_postgresql
+### API Usage
+
+    from dbcat.api import open_catalog, add_postgresql_source
+    from piicatcher.api import scan_database
 
     # PIICatcher uses a catalog to store its state.
    # The easiest option is to use a sqlite memory database.
    # For production usage check, https://tokern.io/docs/data-catalog
-    catalog_params={'catalog_path': ':memory:'}
-    output = scan_postrgresql(catalog_params=catalog_params, name="pg_db", uri="127.0.0.1",
-                              username="piiuser", password="p11secret", database="piidb",
-                              include_table_regex=["sample"])
+    catalog = open_catalog(app_dir='/tmp/.config/piicatcher', path=':memory:', secret='my_secret')
+
+    with catalog.managed_session:
+        # Add a postgresql source
+        source = add_postgresql_source(catalog=catalog, name="pg_db", uri="127.0.0.1", username="piiuser",
+                                       password="p11secret", database="piidb")
+        output = scan_database(catalog=catalog, source=source)
+    print(output)
 
    # Example Output
 
    ['public', 'sample', 'email', 'PiiTypes.EMAIL']]
@@ -90,6 +105,24 @@
 
 
+## Plugins
+
+PIICatcher can be extended by creating new detectors. PIICatcher supports two scanning techniques:
+* Metadata
+* Data
+
+Plugins can be created for either of these two techniques. Plugins are then registered using an API or using
+[Python Entry Points](https://packaging.python.org/en/latest/specifications/entry-points/).
+
+To create a new detector, simply create a new class that inherits from [`MetadataDetector`](https://github.com/tokern/piicatcher/blob/master/piicatcher/detectors.py)
+or [`DatumDetector`](https://github.com/tokern/piicatcher/blob/master/piicatcher/detectors.py).
+
+In the new class, define a function `detect` that returns a [`PiiType`](https://github.com/tokern/dbcat/blob/main/dbcat/catalog/pii_types.py).
+If you are detecting a new PII type, then you can define a new class that inherits from `PiiType`.
+
+For detailed documentation, check the [piicatcher plugin docs](https://tokern.io/docs/piicatcher/detectors/plugins).
+
+
 ## Supported Databases
 
 PIICatcher supports the following databases:
diff --git a/piicatcher/__init__.py b/piicatcher/__init__.py
index f423bf9..de172c1 100644
--- a/piicatcher/__init__.py
+++ b/piicatcher/__init__.py
@@ -1,5 +1,5 @@
 # flake8: noqa
-__version__ = "0.18.2"
+__version__ = "0.19.1"
 
 from dbcat.catalog.pii_types import PiiType
 
diff --git a/piicatcher/api.py b/piicatcher/api.py
index db009fa..d86a3d1 100644
--- a/piicatcher/api.py
+++ b/piicatcher/api.py
@@ -10,14 +10,14 @@
 from piicatcher.detectors import DatumDetector, MetadataDetector, detector_registry
 from piicatcher.generators import SMALL_TABLE_MAX, column_generator, data_generator
 from piicatcher.output import output_dict, output_tabular
-from piicatcher.scanner import deep_scan, shallow_scan
+from piicatcher.scanner import data_scan, metadata_scan
 
 LOGGER = logging.getLogger(__name__)
 
 
 class ScanTypeEnum(str, Enum):
-    shallow = "shallow"
-    deep = "deep"
+    metadata = "metadata"
+    data = "data"
 
 
 class OutputFormat(str, Enum):
@@ -28,7 +28,7 @@ class OutputFormat(str, Enum):
 def scan_database(
     catalog: Catalog,
     source: CatSource,
-    scan_type: ScanTypeEnum = ScanTypeEnum.shallow,
+    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
     incremental: bool = True,
     output_format: OutputFormat = OutputFormat.tabular,
     list_all: bool = False,
@@ -88,14 +88,14 @@ def scan_database(
         exclude_table_regex=exclude_table_regex,
     )
 
-    if scan_type == ScanTypeEnum.shallow:
+    if scan_type == ScanTypeEnum.metadata:
         detector_list = [
             detector()
             for detector in detectors.detector_registry.get_all().values()
             if issubclass(detector, MetadataDetector)
         ]
 
-        shallow_scan(
+        metadata_scan(
             catalog=catalog,
             detectors=detector_list,
             work_generator=column_generator(
@@ -124,7 +124,7 @@ def scan_database(
             if issubclass(detector, DatumDetector)
         ]
 
-        deep_scan(
+        data_scan(
             catalog=catalog,
             detectors=detector_list,
             work_generator=column_generator(
diff --git a/piicatcher/command_line.py b/piicatcher/command_line.py
index b25368f..16b5b60 100644
--- a/piicatcher/command_line.py
+++ b/piicatcher/command_line.py
@@ -157,7 +157,7 @@ def cli(
 def detect(
     source_name: str = typer.Option(..., help="Name of database to scan."),
     scan_type: ScanTypeEnum = typer.Option(
-        ScanTypeEnum.shallow,
-        help="Choose deep(scan data) or shallow(scan column names only)",
+        ScanTypeEnum.metadata,
+        help="Choose data (scan data) or metadata (scan column names only)",
     ),
     incremental: bool = typer.Option(
diff --git a/piicatcher/scanner.py b/piicatcher/scanner.py
index e517952..7862521 100644
--- a/piicatcher/scanner.py
+++ b/piicatcher/scanner.py
@@ -75,7 +75,7 @@ def detect(self, column: CatColumn) -> Optional[PiiType]:
         return None
 
 
-def shallow_scan(
+def metadata_scan(
     catalog: Catalog,
     detectors: List[MetadataDetector],
     work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
@@ -124,7 +124,7 @@ def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
         return None
 
 
-def deep_scan(
+def data_scan(
     catalog: Catalog,
     detectors: List[DatumDetector],
     work_generator: Generator[Tuple[CatSchema, CatTable, CatColumn], None, None],
diff --git a/poetry.lock b/poetry.lock
index 93018a3..f681908 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -560,7 +560,7 @@ python-versions = ">=3.6, <3.7"
 
 [[package]]
 name = "dbcat"
-version = "0.11.0"
+version = "0.12.0"
 description = "Tokern Data Catalog"
 category = "main"
 optional = false
@@ -2734,7 +2734,7 @@ datahub = ["acryl-datahub", "great-expectations"]
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.6.1,<3.9"
-content-hash = "e9afde071a09d7ccfcee8fe85d12cea35e04769fe95730ae732615d445b9953f"
+content-hash = "44f4364e7e87580a7dd0848f86b1fc1d8762ef946f61af1947a99bd0fcdff9a1"
 
 [metadata.files]
 acryl-datahub = [
@@ -3018,8 +3018,8 @@ dataclasses = [
     {file = "dataclasses-0.8.tar.gz", hash = "sha256:8479067f342acf957dc82ec415d355ab5edb7e7646b90dc6e2fd1d96ad084c97"},
 ]
 dbcat = [
-    {file = "dbcat-0.11.0-py3-none-any.whl", hash = "sha256:25d18df96f899ef45d4b1c93931e04e3f34e70c445f8032760f21fce23274fe4"},
-    {file = "dbcat-0.11.0.tar.gz", hash = "sha256:270fc77f04a21fb53194d11ffa1bae922a16480288ff5a9cf45bbf926b3818cf"},
+    {file = "dbcat-0.12.0-py3-none-any.whl", hash = "sha256:1cba6423929e7953cfbeb9096287cf1f611a4872b4ccbad57871f59a82284b06"},
+    {file = "dbcat-0.12.0.tar.gz", hash = "sha256:cb9f247d14c7c233443d81820323cf27677b5e7b14d5b7e47d7c1d922a221f14"},
 ]
 decopatch = [
     {file = "decopatch-1.4.8-py2.py3-none-any.whl", hash = "sha256:29a74d5d753423b188d5b537532da4f4b88e33ddccb95a8a20a5eff5b13265d4"},
diff --git a/pyproject.toml b/pyproject.toml
index 0f8cf48..ec44620 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "piicatcher"
-version = "0.19.0"
+version = "0.19.1"
 description = "Find PII data in databases"
 authors = ["Tokern "]
 license = "Apache 2.0"
@@ -28,7 +28,7 @@ pyyaml = "*"
 click = "*"
 python-json-logger = "^2.0.2"
 commonregex = "^1.5"
-dbcat = "^0.11.0"
+dbcat = "^0.12.0"
 typer = "^0.4.0"
 tabulate = "^0.8.9"
 dataclasses = {version = ">=0.6", markers="python_version >= '3.6' and python_version < '3.7'"}
diff --git a/tests/test_api.py b/tests/test_api.py
index 69b2c9c..51c91d4 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -49,7 +49,7 @@ def test_scan_database_deep(load_sample_data_and_pull):
         catalog=catalog,
         source=source,
         include_table_regex=["sample"],
-        scan_type=ScanTypeEnum.deep,
+        scan_type=ScanTypeEnum.data,
     )
 
     schemata = catalog.search_schema(source_like=source.name, schema_like="%")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e67884a..51c0dcf 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -57,7 +57,7 @@ def test_include_exclude(mocker, temp_sqlite_path, args):
     piicatcher.command_line.scan_database.assert_called_once_with(
         catalog=ANY,
         source=ANY,
-        scan_type=ScanTypeEnum.shallow,
+        scan_type=ScanTypeEnum.metadata,
         incremental=True,
         output_format=OutputFormat.tabular,
         list_all=False,
@@ -105,7 +105,7 @@ def test_multiple_include_exclude(mocker, temp_sqlite_path, args):
     piicatcher.command_line.scan_database.assert_called_once_with(
         catalog=ANY,
         source=ANY,
-        scan_type=ScanTypeEnum.shallow,
+        scan_type=ScanTypeEnum.metadata,
         incremental=True,
         output_format=OutputFormat.tabular,
         list_all=False,
@@ -139,7 +139,7 @@ def test_sample_size(mocker, temp_sqlite_path, args):
     piicatcher.command_line.scan_database.assert_called_once_with(
         catalog=ANY,
         source=ANY,
-        scan_type=ScanTypeEnum.shallow,
+        scan_type=ScanTypeEnum.metadata,
         incremental=True,
         output_format=OutputFormat.tabular,
         list_all=False,
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
index bfa1abb..240c2c9 100644
--- a/tests/test_scanner.py
+++ b/tests/test_scanner.py
@@ -19,8 +19,8 @@ from piicatcher.scanner import (
     ColumnNameRegexDetector,
     DatumRegexDetector,
-    deep_scan,
-    shallow_scan,
+    data_scan,
+    metadata_scan,
 )
 
 
@@ -139,7 +139,7 @@ def test_shallow_scan(load_data_and_pull):
     catalog, source_id = load_data_and_pull
     with catalog.managed_session:
         source = catalog.get_source_by_id(source_id)
-        shallow_scan(
+        metadata_scan(
             catalog=catalog,
             detectors=[ColumnNameRegexDetector()],
             work_generator=column_generator(catalog=catalog, source=source),
@@ -168,7 +168,7 @@ def test_deep_scan(load_data_and_pull):
     catalog, source_id = load_data_and_pull
     with catalog.managed_session:
         source = catalog.get_source_by_id(source_id)
-        deep_scan(
+        data_scan(
             catalog=catalog,
             detectors=[DatumRegexDetector()],
             work_generator=column_generator(catalog=catalog, source=source),
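
The new "Plugins" section in the README describes the detector interface in prose only. Below is a minimal sketch of what a custom metadata detector could look like, based on the `detect()` signature shown in `piicatcher/scanner.py` in this patch; `MetadataDetector` comes from `piicatcher.detectors` and `PiiType` from `dbcat.catalog.pii_types`, both of which appear in the patch. The `EmployeeId` type, the `name` attributes, and the regex are illustrative only, and the exact fields required by `PiiType` should be verified against dbcat's `pii_types.py`.

    import re
    from typing import Optional

    from dbcat.catalog.pii_types import PiiType
    from piicatcher.detectors import MetadataDetector


    class EmployeeId(PiiType):
        # Hypothetical new PII type; the README notes that new types can
        # subclass PiiType. Verify required fields in dbcat's pii_types.py.
        name = "employee_id"


    class EmployeeIdDetector(MetadataDetector):
        """Flag columns whose name looks like an employee identifier."""

        name = "employee_id_detector"  # assumed: detectors are looked up by name

        def detect(self, column) -> Optional[PiiType]:
            # `column` is a dbcat CatColumn; only its name is inspected here.
            if re.search(r"employee.*(id|num)", column.name, re.IGNORECASE):
                return EmployeeId()
            return None

Registering the detector, via the detector registry API or a Python entry point as the README mentions, is left out here because the exact mechanism is documented in the plugin docs rather than in this patch.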