From 40e15c0405a132643fbf82e654bc8bc11943722f Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Fri, 15 Mar 2024 17:51:38 +0100 Subject: [PATCH] Added documentation generator on `make fmt` stage --- Makefile | 1 + README.md | 158 ++++++++++++++++++++++++ scripts/docs.py | 40 ++++++ src/databricks/labs/pylint/airflow.py | 9 +- src/databricks/labs/pylint/dbutils.py | 18 ++- src/databricks/labs/pylint/legacy.py | 5 +- src/databricks/labs/pylint/notebooks.py | 7 +- src/databricks/labs/pylint/spark.py | 7 +- 8 files changed, 230 insertions(+), 15 deletions(-) create mode 100644 scripts/docs.py diff --git a/Makefile b/Makefile index 4b9667f..aa00d24 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,7 @@ lint: hatch run verify fmt: + hatch run python scripts/docs.py hatch run fmt test: diff --git a/README.md b/README.md index 331b32e..10978b5 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,28 @@ Checks for common mistakes and issues in Python code specifically in Databricks * [Databricks Labs PyLint Plugin](#databricks-labs-pylint-plugin) * [Installation](#installation) +* [Automated code analysis](#automated-code-analysis) + * [`databricks-airflow` checker](#databricks-airflow-checker) + * [`E9698`: `unsupported-runtime`](#e9698-unsupported-runtime) + * [`E9699`: `missing-data-security-mode`](#e9699-missing-data-security-mode) + * [`databricks-dbutils` checker](#databricks-dbutils-checker) + * [`E9859`: `internal-api`](#e9859-internal-api) + * [`E9869`: `pat-token-leaked`](#e9869-pat-token-leaked) + * [`E9879`: `dbutils-notebook-run`](#e9879-dbutils-notebook-run) + * [`E9889`: `dbutils-credentials`](#e9889-dbutils-credentials) + * [`E9896`: `dbutils-fs-mount`](#e9896-dbutils-fs-mount) + * [`E9897`: `dbutils-fs-ls`](#e9897-dbutils-fs-ls) + * [`E9898`: `dbutils-fs-head`](#e9898-dbutils-fs-head) + * [`E9899`: `dbutils-fs-cp`](#e9899-dbutils-fs-cp) + * [`databricks-legacy` checker](#databricks-legacy-checker) + * [`E9789`: `incompatible-with-uc`](#e9789-incompatible-with-uc) + * [`E9799`: `legacy-cli`](#e9799-legacy-cli) + * [`databricks-notebooks` checker](#databricks-notebooks-checker) + * [`E9994`: `notebooks-percent-run`](#e9994-notebooks-percent-run) + * [`E9996`: `notebooks-too-many-cells`](#e9996-notebooks-too-many-cells) + * [`spark` checker](#spark-checker) + * [`E9700`: `spark-outside-function`](#e9700-spark-outside-function) + * [`E9701`: `no-spark-argument-in-function`](#e9701-no-spark-argument-in-function) * [Project Support](#project-support) @@ -22,6 +44,142 @@ You can install this project via `pip`: pip install databricks-labs-pylint-plugin ``` +and then use it with `pylint`: + +``` +pylint --load-plugins=databricks.labs.pylint.all .py +``` + +[[back to top](#databricks-labs-pylint-plugin)] + +# Automated code analysis + +[[back to top](#databricks-labs-pylint-plugin)] + + + +## `databricks-airflow` checker + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9698`: `unsupported-runtime` + +XXX cluster has unsupported runtime: XXX. The runtime version is not supported by Unity Catalog. Please upgrade to a runtime greater than or equal to 11.3. + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9699`: `missing-data-security-mode` + +XXX cluster missing `data_security_mode` required for Unity Catalog compatibility. Before you enable Unity Catalog, you must set the `data_security_mode` to 'NONE', so that your existing jobs would keep the same behavior. Failure to do so may cause your jobs to fail with unexpected errors. + +[[back to top](#databricks-labs-pylint-plugin)] + +## `databricks-dbutils` checker + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9859`: `internal-api` + +Do not use internal APIs, rewrite using Databricks SDK. Do not use internal APIs. Use Databricks SDK for Python: https://databricks-sdk-py.readthedocs.io/en/latest/index.html + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9869`: `pat-token-leaked` + +Use Databricks SDK instead: from databricks.sdk import WorkspaceClient(); w = WorkspaceClient(). Do not hardcode secrets in code, use Databricks SDK instead, which natively authenticates in Databricks Notebooks. See more at https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9879`: `dbutils-notebook-run` + +Use Databricks SDK instead: w.jobs.submit( + tasks=[jobs.SubmitTask(existing_cluster_id=..., + notebook_task=jobs.NotebookTask(notebook_path=XXX), + task_key=...) + ]).result(timeout=timedelta(minutes=XXX)). Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at https://databricks-sdk-py.readthedocs.io/en/latest/workspace/jobs/jobs.html + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9889`: `dbutils-credentials` + +Credentials utility is not supported with Unity Catalog. Migrate all usage to Unity Catalog + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9896`: `dbutils-fs-mount` + +Mounts are not supported with Unity Catalog, switch to using Unity Catalog Volumes instead. Migrate all usage to Unity Catalog + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9897`: `dbutils-fs-ls` + +Use Databricks SDK instead: w.dbfs.list(XXX). Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at https://databricks-sdk-py.readthedocs.io/en/latest/workspace/files/dbfs.html + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9898`: `dbutils-fs-head` + +Use Databricks SDK instead: with w.dbfs.download(XXX) as f: f.read(). Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at https://databricks-sdk-py.readthedocs.io/en/latest/workspace/files/dbfs.html + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9899`: `dbutils-fs-cp` + +Use Databricks SDK instead: w.dbfs.copy(XXX, XXX). Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at https://databricks-sdk-py.readthedocs.io/en/latest/workspace/files/dbfs.html + +[[back to top](#databricks-labs-pylint-plugin)] + +## `databricks-legacy` checker + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9789`: `incompatible-with-uc` + +Incompatible with Unified Catalog. Migrate all usage to Databricks Unity Catalog. Use https://github.com/databrickslabs/ucx for more details + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9799`: `legacy-cli` + +Don't use databricks_cli, use databricks.sdk instead: pip install databricks-sdk. Migrate all usage of Legacy CLI to Databricks SDK. See the more detailed documentation at https://databricks-sdk-py.readthedocs.io/en/latest/index.html + +[[back to top](#databricks-labs-pylint-plugin)] + +## `databricks-notebooks` checker + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9994`: `notebooks-percent-run` + +Using %run is not allowed. Use functions instead of %run to avoid side effects and make the code more testable. If you need to share code between notebooks, consider creating a library. If still need to call another code as a separate job, use Databricks SDK for Python: https://databricks-sdk-py.readthedocs.io/en/latest/index.html + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9996`: `notebooks-too-many-cells` + +Notebooks should not have more than 75 cells. Otherwise, it's hard to maintain and understand the notebook for other people and the future you + +[[back to top](#databricks-labs-pylint-plugin)] + +## `spark` checker + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9700`: `spark-outside-function` + +Using spark outside the function is leading to untestable code. Do not use global spark object, pass it as an argument to the function instead, so that the function becomes testable in a CI/CD pipelines. + +[[back to top](#databricks-labs-pylint-plugin)] + +### `E9701`: `no-spark-argument-in-function` + +Function XXX is missing a 'spark' argument. Function refers to a global spark variable, which may not always be available. Pass the spark object as an argument to the function instead, so that the function becomes testable in a CI/CD pipelines. + +[[back to top](#databricks-labs-pylint-plugin)] + + + # Project Support Please note that this project is provided for your exploration only and is not diff --git a/scripts/docs.py b/scripts/docs.py new file mode 100644 index 0000000..892d02a --- /dev/null +++ b/scripts/docs.py @@ -0,0 +1,40 @@ +import re +from pathlib import Path + +from pylint.lint import PyLinter + +from databricks.labs.pylint.airflow import AirflowChecker +from databricks.labs.pylint.dbutils import DbutilsChecker +from databricks.labs.pylint.legacy import LegacyChecker +from databricks.labs.pylint.notebooks import NotebookChecker +from databricks.labs.pylint.spark import SparkChecker + + +def do_something(): + out = ["\n"] + linter = PyLinter() + for checker in [ + AirflowChecker(linter), + DbutilsChecker(linter), + LegacyChecker(linter), + NotebookChecker(linter), + SparkChecker(linter), + ]: + out.append(f"## `{checker.name}` checker") + out.append("\n[[back to top](#databricks-labs-pylint-plugin)]\n") + for msg_def in checker.messages: + out.append(f"### `{msg_def.msgid}`: `{msg_def.symbol}`\n") + out.append(f"{msg_def.msg.replace('%s', 'XXX')}. {msg_def.description}") + out.append("\n[[back to top](#databricks-labs-pylint-plugin)]\n") + out.append("") + checker_docs = "\n".join(out) + readme_file = Path(__file__).parent.parent.joinpath("README.md") + with readme_file.open("r") as f: + pattern = r"\n(.*)\n" + new_readme = re.sub(pattern, checker_docs, f.read(), 0, re.MULTILINE | re.DOTALL) + with readme_file.open("w") as f: + f.write(new_readme) + + +if __name__ == "__main__": + do_something() diff --git a/src/databricks/labs/pylint/airflow.py b/src/databricks/labs/pylint/airflow.py index b40f91f..cb8728b 100644 --- a/src/databricks/labs/pylint/airflow.py +++ b/src/databricks/labs/pylint/airflow.py @@ -9,14 +9,17 @@ class AirflowChecker(BaseChecker): msgs = { "E9699": ( - "%s cluster missing 'data_security_mode' required for Unity Catalog compatibility", + "%s cluster missing `data_security_mode` required for Unity Catalog compatibility", "missing-data-security-mode", - "new_cluster is missing data_security_mode", + "Before you enable Unity Catalog, you must set the `data_security_mode` to 'NONE'," + " so that your existing jobs would keep the same behavior. Failure to do so may cause " + "your jobs to fail with unexpected errors.", ), "E9698": ( "%s cluster has unsupported runtime: %s", "unsupported-runtime", - "new_cluster has unsupported runtime", + "The runtime version is not supported by Unity Catalog. Please upgrade to a runtime greater " + "than or equal to 11.3.", ), } diff --git a/src/databricks/labs/pylint/dbutils.py b/src/databricks/labs/pylint/dbutils.py index 7c3464a..c52805b 100644 --- a/src/databricks/labs/pylint/dbutils.py +++ b/src/databricks/labs/pylint/dbutils.py @@ -10,17 +10,20 @@ class DbutilsChecker(BaseChecker): "E9899": ( "Use Databricks SDK instead: w.dbfs.copy(%s, %s)", "dbutils-fs-cp", - "Migrate all usage of dbutils to Databricks SDK", + "Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at " + "https://databricks-sdk-py.readthedocs.io/en/latest/workspace/files/dbfs.html", ), "E9898": ( "Use Databricks SDK instead: with w.dbfs.download(%s) as f: f.read()", "dbutils-fs-head", - "Migrate all usage of dbutils to Databricks SDK", + "Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at " + "https://databricks-sdk-py.readthedocs.io/en/latest/workspace/files/dbfs.html", ), "E9897": ( "Use Databricks SDK instead: w.dbfs.list(%s)", "dbutils-fs-ls", - "Migrate all usage of dbutils to Databricks SDK", + "Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at " + "https://databricks-sdk-py.readthedocs.io/en/latest/workspace/files/dbfs.html", ), "E9896": ( "Mounts are not supported with Unity Catalog, switch to using Unity Catalog Volumes instead", @@ -39,17 +42,20 @@ class DbutilsChecker(BaseChecker): task_key=...) ]).result(timeout=timedelta(minutes=%s))""", "dbutils-notebook-run", - "Migrate all usage of dbutils to Databricks SDK", + "Migrate all usage of dbutils to Databricks SDK. See the more detailed documentation at " + "https://databricks-sdk-py.readthedocs.io/en/latest/workspace/jobs/jobs.html", ), "E9869": ( "Use Databricks SDK instead: from databricks.sdk import WorkspaceClient(); w = WorkspaceClient()", "pat-token-leaked", - "Do not hardcode secrets in code, use Databricks Scopes instead", + "Do not hardcode secrets in code, use Databricks SDK instead, which natively authenticates in Databricks " + "Notebooks. See more at https://databricks-sdk-py.readthedocs.io/en/latest/authentication.html", ), "E9859": ( "Do not use internal APIs, rewrite using Databricks SDK", "internal-api", - "Do not use internal APIs", + "Do not use internal APIs. Use Databricks SDK for Python: " + "https://databricks-sdk-py.readthedocs.io/en/latest/index.html", ), } diff --git a/src/databricks/labs/pylint/legacy.py b/src/databricks/labs/pylint/legacy.py index 6823c1a..67a3acf 100644 --- a/src/databricks/labs/pylint/legacy.py +++ b/src/databricks/labs/pylint/legacy.py @@ -11,12 +11,13 @@ class LegacyChecker(BaseChecker): "E9799": ( "Don't use databricks_cli, use databricks.sdk instead: pip install databricks-sdk", "legacy-cli", - "Migrate all usage of Legacy CLI to Databricks SDK", + "Migrate all usage of Legacy CLI to Databricks SDK. See the more detailed documentation at " + "https://databricks-sdk-py.readthedocs.io/en/latest/index.html", ), "E9789": ( "Incompatible with Unified Catalog", "incompatible-with-uc", - "Migrate all usage to Databricks Unity Catalog", + "Migrate all usage to Databricks Unity Catalog. Use https://github.com/databrickslabs/ucx for more details", ), } diff --git a/src/databricks/labs/pylint/notebooks.py b/src/databricks/labs/pylint/notebooks.py index e3e2300..91574db 100644 --- a/src/databricks/labs/pylint/notebooks.py +++ b/src/databricks/labs/pylint/notebooks.py @@ -10,12 +10,15 @@ class NotebookChecker(BaseRawFileChecker): "E9996": ( "Notebooks should not have more than 75 cells", "notebooks-too-many-cells", - "Used when the number of cells in a notebook is greater than 75", + "Otherwise, it's hard to maintain and understand the notebook for other people and the future you", ), "E9994": ( "Using %run is not allowed", "notebooks-percent-run", - "Used when `# MAGIC %run` comment is used", + "Use functions instead of %run to avoid side effects and make the code more testable. " + "If you need to share code between notebooks, consider creating a library. " + "If still need to call another code as a separate job, use Databricks SDK for Python:" + " https://databricks-sdk-py.readthedocs.io/en/latest/index.html", ), } diff --git a/src/databricks/labs/pylint/spark.py b/src/databricks/labs/pylint/spark.py index 7bc54de..456690f 100644 --- a/src/databricks/labs/pylint/spark.py +++ b/src/databricks/labs/pylint/spark.py @@ -9,12 +9,15 @@ class SparkChecker(BaseChecker): "E9700": ( "Using spark outside the function is leading to untestable code", "spark-outside-function", - "spark used outside of function", + "Do not use global spark object, pass it as an argument to the function instead, " + "so that the function becomes testable in a CI/CD pipelines.", ), "E9701": ( "Function %s is missing a 'spark' argument", "no-spark-argument-in-function", - "function missing spark argument", + "Function refers to a global spark variable, which may not always be available. " + "Pass the spark object as an argument to the function instead, so that the function " + "becomes testable in a CI/CD pipelines.", ), }