make release-tag: Merge branch 'master' into stable

sdv-dev · Jul 9, 2020 · ab0c4de · ab0c4de
2 parents 859cd46 + d9acc57
commit ab0c4de
Show file tree

Hide file tree

Showing 65 changed files with 9,265 additions and 5,729 deletions.
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -18,6 +18,7 @@ jobs:
 
     - name: Build
       run: |
+        sudo apt install pandoc
         python -m pip install --upgrade pip
         pip install -e .[dev]
         make docs

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -24,12 +24,12 @@ jobs:
     - if: matrix.os == 'ubuntu-latest'
       name: Install graphviz - Ubuntu
       run: |
-        sudo apt-get install graphviz
+        sudo apt-get install graphviz pandoc
 
     - if: matrix.os == 'macos-latest'
       name: Install graphviz - MacOS
       run: |
-        brew install graphviz
+        brew install graphviz pandoc
 
     - name: Install dependencies
       run: |

diff --git a/.gitignore b/.gitignore
@@ -106,3 +106,5 @@ ENV/
 .*.swp
 
 sdv/data/
+tutorials/sdv.pkl
+tutorials/demo_metadata.json
diff --git a/.travis.yml b/.travis.yml
@@ -9,7 +9,7 @@ python:
 # Command to install dependencies
 install:
   - sudo apt-get update
-  - sudo apt-get install graphviz
+  - sudo apt-get install graphviz pandoc
   - pip install -U tox-travis codecov
 
 after_success: codecov

diff --git a/EVALUATION.md b/EVALUATION.md
@@ -12,7 +12,7 @@ generate a simple standardized score.
 After you have modeled your database and generated samples out of the SDV models
 you will be left with a dictionary that contains table names and dataframes.
 
-For exmple, if we model and sample the demo dataset:
+For example, if we model and sample the demo dataset:
 
 ```python3
 from sdv import SDV
@@ -44,3 +44,94 @@ the value will be negative.
 
 For further options, including visualizations and more detailed reports, please refer to
 the [SDMetrics](https://github.com/sdv-dev/SDMetrics) library.
+
+
+## SDV Benchmark
+
+SDV also provides a simple functionality to evaluate the performance of SDV across a
+collection of demo datasets or custom datasets hosted in a local folder.
+
+In order to run this evaluation you can call the function `sdv.benchmark.run_benchmark`:
+
+```python3
+from sdv.benchmark import run_benchmark
+
+scores = run_benchmark()
+```
+
+This function has the following arguments:
+
+* `datasets`: List of dataset names, which can either be names of demo datasets or
+  names of custom datasets stored in a local folder.
+* `datasets_path`: Path where the custom datasets are stored. If not provided, the
+  dataset names are interpreted as demo datasets.
+* `distributed`: Whether to execute the benchmark using Dask. Defaults to True.
+* `timeout`: Maximum time allowed for each dataset to be modeled, sampled and evaluated.
+  Any dataset that takes longer to run will return a score of `None`.
+
+For example, the following command will run the SDV benchmark on all the given demo datasets
+using `dask` and a timeout of 60 seconds:
+
+```python
+scores = run_benchmark(
+    datasets=['DCG_v1', 'trains_v1', 'UTube_v1'],
+    distributed=True,
+    timeout=60
+)
+```
+
+The result will be a DataFrame with the columns `dataset` and `score`:
+
+| dataset | score |
+|:-------:|:-----:|
+| DCG_v1  | -14.49341665631863 |
+| trains_v1  | -30.26840342069557 |
+| UTube_v1  | -8.57618576332235 |
+
+Additionally, if some dataset has raised an error or has reached the timeout, an `error`
+column will be added indicating the details.
+
+### Demo Datasets
+
+The collection of available demo datasets can be seen using the
+`sdv.demo.get_available_demos` function, which returns a table with a description
+of the dataset properties:
+
+```python3
+from sdv.demo import get_available_demos
+
+demos = get_available_demos()
+```
+
+The result is a table indicating the name of the dataset and a few properties, such as the
+number of tables that compose the dataset and the total number of rows and columns:
+
+| name                  |   tables |    rows |   columns |
+|-----------------------|----------|---------|-----------|
+| UTube_v1              |        2 |    2735 |        10 |
+| SAP_v1                |        4 | 3841029 |        71 |
+| NCAA_v1               |        9 |  202305 |       333 |
+| airbnb-simplified     |        2 | 5751408 |        22 |
+| Atherosclerosis_v1    |        4 |   12781 |       307 |
+| rossmann              |        3 | 2035533 |        21 |
+| walmart               |        4 |  544869 |        24 |
+| AustralianFootball_v1 |        4 |  139179 |       193 |
+| Pyrimidine_v1         |        2 |     296 |        38 |
+| world_v1              |        3 |    5302 |        39 |
+| Accidents_v1          |        3 | 1463093 |        87 |
+| trains_v1             |        2 |      83 |        15 |
+| legalActs_v1          |        5 | 1754397 |        50 |
+| DCG_v1                |        2 |    8258 |         9 |
+| imdb_ijs_v1           |        7 | 5647694 |        50 |
+| SalesDB_v1            |        4 | 6735507 |        35 |
+| MuskSmall_v1          |        2 |     568 |       173 |
+| KRK_v1                |        1 |    1000 |         9 |
+| Chess_v1              |        2 |    2052 |        57 |
+| Telstra_v1            |        5 |  148021 |        23 |
+| mutagenesis_v1        |        3 |   10324 |        26 |
+| PremierLeague_v1      |        4 |   11308 |       250 |
+| census                |        1 |   32561 |        15 |
+| FNHK_v1               |        3 | 2113275 |        43 |
+| imdb_MovieLens_v1     |        7 | 1249411 |        58 |
+| financial_v1          |        8 | 1079680 |        84 |
+| ftp_v1                |        2 |   96491 |        13 |
+| Triazine_v1           |        2 |    1302 |        35 |
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,34 @@
 # History
 
+## 0.3.5 - 2020-07-09
+
+This release introduces a new subpackage `sdv.tabular` with models designed specifically
+for single table modeling, while still providing all the usual conveniences from SDV, such
+as:
+
+* Seamless multi-type support
+* Missing data handling
+* PII anonymization
+
+Currently implemented models are:
+
+* GaussianCopula: Multivariate distributions modeled using copula functions. This is a stronger
+  version, with more marginal distributions and options, than the one used to model multi-table
+  datasets.
+* CTGAN: GAN-based data synthesizer that can generate synthetic tabular data with high fidelity.
+
+
+## 0.3.4 - 2020-07-04
+
+### New Features
+
+* Support for Multiple Parents - [Issue #162](https://github.com/sdv-dev/SDV/issues/162) by @csala
+* Sample by default the same number of rows as in the original table - [Issue #163](https://github.com/sdv-dev/SDV/issues/163) by @csala
+
+### General Improvements
+
+* Add benchmark - [Issue #165](https://github.com/sdv-dev/SDV/issues/165) by @csala
+
 ## 0.3.3 - 2020-06-26
 
 ### General Improvements

diff --git a/Makefile b/Makefile
@@ -50,6 +50,7 @@ clean-pyc: ## remove Python file artifacts
 .PHONY: clean-docs
 clean-docs: ## remove previously built docs
 	rm -f docs/api/*.rst
+	rm -rf docs/tutorials
 	-$(MAKE) -C docs clean 2>/dev/null  # this fails if sphinx is not yet installed
 
 .PHONY: clean-coverage
@@ -110,7 +111,7 @@ test-readme: ## run the readme snippets
 
 .PHONY: test-tutorials
 test-tutorials: ## run the tutorial notebooks
-	jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 examples/*.ipynb --stdout > /dev/null
+	jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 tutorials/*.ipynb --stdout > /dev/null
 
 .PHONY: test
 test: test-unit test-readme test-tutorials ## test everything that needs test dependencies
@@ -134,6 +135,7 @@ coverage: ## check code coverage quickly with the default Python
 
 .PHONY: docs
 docs: clean-docs ## generate Sphinx HTML documentation, including API docs
+	cp -r tutorials docs/tutorials
 	sphinx-apidoc --separate --no-toc -o docs/api/ sdv
 	$(MAKE) -C docs html
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -32,6 +32,7 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
     'm2r',
+    'nbsphinx',
     'sphinx.ext.autodoc',
     'sphinx.ext.githubpages',
     'sphinx.ext.viewcode',
@@ -53,6 +54,9 @@
 # The master toctree document.
 master_doc = 'index'
 
+# Jupyter Notebooks
+nbsphinx_execute = 'never'
+
 # General information about the project.
 project = 'SDV'
 slug = 'sdv'

diff --git a/docs/index.rst b/docs/index.rst
@@ -7,9 +7,11 @@
    Overview <readme>
 
 .. toctree::
-   :caption: Advanced Usage
-   :maxdepth: 3
+   :caption: User Guides
+   :maxdepth: 2
 
+   tutorials/02_Single_Table_Modeling
+   tutorials/03_Relational_Data_Modeling
    metadata
 
 .. toctree::

diff --git a/docs/metadata.rst b/docs/metadata.rst
@@ -1,5 +1,5 @@
-Metadata
-========
+Working with Metadata
+=====================
 
 In order to use **SDV** you will need a ``Metadata`` object alongside your data.