feat: implement type generation (#23)

* implemented type writer
* added docs for types generation
* updated readme usage
* added test for types invoke
* removed pretty option
* added test for console
kiran94 · May 1, 2021 · e5fa814 · e5fa814
1 parent 7a7aefe
commit e5fa814
Show file tree

Hide file tree

Showing 6 changed files with 357 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,4 @@ samples/**/xidmap/*
 
 settings.json
 schema.txt
+types.txt
diff --git a/README.md b/README.md
@@ -17,7 +17,9 @@ A Library (with accompanying cli tool) to transform [Pandas](https://pandas.pyda
     - [Edges](#edges)
   - [Configuration](#configuration)
     - [Additional Configuration](#additional-configuration)
-  - [Schema](#schema)
+  - [Schema and Types](#schema-and-types)
+    - [Generating a Schema](#generating-a-schema)
+    - [Generating Types](#generating-types)
   - [Samples](#samples)
   - [Working with Larger Files](#working-with-larger-files)
     - [Command Line](#command-line-1)
@@ -34,8 +36,9 @@ python -m pip install dgraphpandas
 
 ```sh
 ❯ dgraphpandas --help
-usage: dgraphpandas [-h] -f FILE -c CONFIG -ck CONFIG_FILE_KEY [-o OUTPUT_DIR]
-                    [--console] [--export_csv] [--encoding ENCODING]
+usage: dgraphpandas [-h] [-x {upserts,schema,types}] [-f FILE] -c CONFIG
+                    [-ck CONFIG_FILE_KEY] [-o OUTPUT_DIR] [--console]
+                    [--export_csv] [--encoding ENCODING]
                     [--chunk_size CHUNK_SIZE]
                     [--gz_compression_level GZ_COMPRESSION_LEVEL]
                     [--key_separator KEY_SEPARATOR]
@@ -387,7 +390,9 @@ These options can be placed on the root of the config or passed as `kwargs` dire
     - Schema option to define an edge as a list. This will ensure the type is `[uid]` rather then just `uid`
 
 
-## Schema
+## Schema and Types
+
+### Generating a Schema
 
 DGraph allows you to define a [schema](https://dgraph.io/docs/query-language/schema/#sidebar). This can be generated using the same configuration used above but there are also additional options you can add such as `options` and `list_edges` which are exclusively used for schema generation.
 
@@ -440,6 +445,72 @@ class: uid @reverse .
 dgraph live -s schema.txt
 ```
 
+### Generating Types
+
+DGraph also allows you to define [types](https://dgraph.io/docs/query-language/type-system/#sidebar) that can be used to categorize nodes. These can be generated from the same configuration used for data loading.
+
+
+```sh
+# Model the data, define types, edges and any options
+echo '
+{
+  "transform": "horizontal",
+  "files": {
+    "animal": {
+      "subject_fields": ["species_id"],
+      "type_overrides": {
+        "name": "string",
+        "legs": "int",
+        "weight": "float",
+        "height": "float",
+        "discovered": "datetime64",
+        "aquatic": "bool"
+      },
+      "edge_fields": ["class_id", "found_in"],
+      "class": ["@reverse"],
+      "found_in": ["@reverse", "@count"],
+      "list_edges": ["found_in"]
+    },
+    "habitat": {
+      "subject_fields": ["id"],
+      "type_overrides": {
+        "name": "string"
+      }
+    }
+  }
+}' > dgraphpandas.json
+
+# Apply the config to the types generation logic
+❯ dgraphpandas -c dgraphpandas.json -x types -v DEBUG
+
+# Inspect Types
+❯ cat types.txt
+type animal {
+found_in
+aquatic
+discovered
+height
+weight
+legs
+species
+name
+class
+ }
+
+type habitat {
+id
+name
+}
+
+
+# Apply to DGraph
+# NOTE: you should always apply the schema
+# before applying types else dgraph
+# won't know what the predicates are
+dgraph live -s types.txt
+
+```
+
 ## Samples
 
 Samples can be found [here](https://github.com/kiran94/dgraphpandas/tree/main/samples). They follow a convention where the download script can be found within the `input` directory and the config, generate_upsert, publish scripts can be found root of each respective sample.

diff --git a/dgraphpandas/__main__.py b/dgraphpandas/__main__.py
@@ -8,13 +8,14 @@
 from dgraphpandas import __version__, __description__, to_rdf
 from dgraphpandas.strategies.schema import create_schema
 from dgraphpandas.writers.schema import generate_schema
+from dgraphpandas.writers.types import generate_types
 
 pd.set_option('mode.chained_assignment', None)
 
 
 def main():
     parser = argparse.ArgumentParser(description=__description__)
-    parser.add_argument('-x', '--method', choices=['upserts', 'schema'], default='upserts')
+    parser.add_argument('-x', '--method', choices=['upserts', 'schema', 'types'], default='upserts')
     parser.add_argument('-f', '--file', required=False, help='The Data File (CSV) to convert into RDF.')
     parser.add_argument('-c', '--config', required=True, help='The DgraphPandas Configuration. See Documentation for options/examples.')
     parser.add_argument('-ck', '--config_file_key', required=False, help='The Entry in the Configuration to use for this passed file.')
@@ -72,6 +73,10 @@ def main():
         schema_frame = create_schema(args.config, ensure_xid_predicate=True, **(options))
         generate_schema(schema_frame, export_schema=True, **(options))
 
+    elif args.method == 'types':
+        schema_frame = create_schema(args.config, ensure_xid_predicate=True, **(options))
+        generate_types(schema_frame, export_schema=True, **(options))
+
 
 if __name__ == '__main__':
     main()  # pragma: no cover
diff --git a/dgraphpandas/writers/types.py b/dgraphpandas/writers/types.py
@@ -0,0 +1,78 @@
+import os
+import logging
+from typing import List
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+def generate_types(frame: pd.DataFrame, **kwargs) -> List[str]:
+    '''
+    Given the pre-processed DataFrame from the schema
+    strategy, generate types.
+
+    One `type <table> { ... }` definition is built for every distinct
+    value of the `table` column, listing the unique values of `column`
+    for that table in order of first appearance. A column whose
+    `options` value is a string containing `@reverse` is written
+    as `<~column>`.
+
+    Args:
+        frame: schema frame holding the `column`, `type`, `table`
+            and `options` columns. The frame is not modified.
+        **kwargs:
+            output_dir: directory the export file is written to (default `.`)
+            export_schema: when truthy, write the types to disk (default False)
+            export_file: name of the export file (default `types.txt`)
+            console: when truthy, print each type as it is built (default False)
+            encoding: encoding of the export file (default `utf-8`)
+            line_delimeter: separator between the lines of a type (default newline)
+
+    Returns:
+        The type definitions; types without reverse edges come first,
+        followed by the types that have at least one reverse edge.
+
+    Raises:
+        ValueError: frame is None or one of the required columns is missing.
+    '''
+    if frame is None:
+        raise ValueError('frame')
+    for required in ('column', 'type', 'table', 'options'):
+        if required not in frame:
+            raise ValueError(required)
+
+    output_dir = kwargs.get('output_dir', '.')
+    export_schema = kwargs.get('export_schema', False)
+    export_file = kwargs.get('export_file', 'types.txt')
+    console = kwargs.get('console', False)
+    encoding = kwargs.get('encoding', 'utf-8')
+    # The key used to carry a trailing space ('line_delimeter ')
+    # so a delimiter supplied by the caller was never picked up.
+    line_delimeter = kwargs.get('line_delimeter', '\n')
+
+    all_types: List[str] = []
+    all_types_reverse: List[str] = []
+
+    # Group on the scalar key rather than ['table']: with a list key
+    # newer pandas versions hand back the group name as a 1-tuple,
+    # which cannot be placed into the type header as a plain name.
+    for name, current_frame in frame.groupby(by='table'):
+        logger.debug(f'Creating types for {name}')
+
+        # Null or non-string options can never hold @reverse
+        reverse_flags = [
+            isinstance(option, str) and '@reverse' in option
+            for option in current_frame['options']
+        ]
+
+        # Build the (possibly renamed) columns as a new list instead of
+        # assigning into the group, which is a slice of the caller's frame.
+        columns = [
+            '<~' + column + '>' if is_reverse else column
+            for column, is_reverse in zip(current_frame['column'], reverse_flags)
+        ]
+        # dict.fromkeys de-duplicates while keeping first-seen order
+        unique_columns = list(dict.fromkeys(columns))
+
+        type_builder = line_delimeter.join([f'type {name} {{ ', *unique_columns, ' }'])
+        type_builder += line_delimeter
+
+        # Split up types with reverse edges so we can guarantee they are applied after other types.
+        # This is required because if dgraph live encounters a reverse edge for a type defined later in the file
+        # then dgraph live will fail.
+        # NOTE: There might be a better solution here
+        # and we could build a dependency tree based on the references
+        # (topological sort?)
+        # Also this won't detect circular dependencies.
+        if any(reverse_flags):
+            all_types_reverse.append(type_builder)
+        else:
+            all_types.append(type_builder)
+
+        if console:
+            print(type_builder)
+            print(line_delimeter)
+
+    ordered_types = all_types + all_types_reverse
+
+    if export_schema:
+        export_path = os.path.join(output_dir, export_file)
+        logger.debug(f'Writing to {export_path} ({encoding})')
+        with open(export_path, 'w', encoding=encoding) as f:
+            for current_type in ordered_types:
+                f.write(current_type)
+                f.write('\n')
+
+    return ordered_types
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -248,3 +248,57 @@ def test_version(
         main()
 
     assert __version__ in capsys.readouterr().out
+
+
+@patch('dgraphpandas.__main__.logging')
+@patch('dgraphpandas.__main__.generate_types')
+@patch('dgraphpandas.__main__.create_schema')
+@patch('dgraphpandas.__main__.sys')
+def test_types(
+        argv_mock: Mock,
+        create_schema_mock: Mock,
+        generate_types_mock: Mock,
+        logger_mock: Mock,
+        capsys):
+    '''
+    Invoking the cli with `-x types` should build the schema
+    frame and hand it over to the types writer.
+    '''
+    # Options expected to be forwarded to both services
+    shared_options = {
+        'add_dgraph_type_records': True,
+        'drop_na_intrinsic_objects': True,
+        'drop_na_edge_objects': True,
+        'illegal_characters': ['%', '\\.', '\\s', '"', '\\n', '\\r\\n'],
+        'illegal_characters_intrinsic_object': ['"', '\\n', '\\r\\n'],
+        'chunk_size': 10000000,
+    }
+
+    argv_mock.argv = ['script', '-x', 'types', '-c', 'config.json']
+    create_schema_mock.return_value = 'fake_schema'
+
+    main()
+
+    assert create_schema_mock.called
+    assert generate_types_mock.called
+
+    # The schema is created from the config passed on the command line
+    schema_args, schema_kwargs = create_schema_mock.call_args_list[0]
+    assert schema_args == ('config.json',)
+    assert schema_kwargs == {**shared_options, 'ensure_xid_predicate': True}
+
+    # The created schema is then passed to the types writer for export
+    types_args, types_kwargs = generate_types_mock.call_args_list[0]
+    assert types_args == ('fake_schema',)
+    assert types_kwargs == {**shared_options, 'export_schema': True}