Skip to content

Commit

Permalink
feat: implement type generation (#23)
Browse files Browse the repository at this point in the history
* implemented type writer

* added docs for types generation

* updated readme usage

* added test for types invoke

* removed pretty option

* added test for console
  • Loading branch information
kiran94 authored May 1, 2021
1 parent 7a7aefe commit e5fa814
Show file tree
Hide file tree
Showing 6 changed files with 357 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ samples/**/xidmap/*

settings.json
schema.txt
types.txt
79 changes: 75 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ A Library (with accompanying cli tool) to transform [Pandas](https://pandas.pyda
- [Edges](#edges)
- [Configuration](#configuration)
- [Additional Configuration](#additional-configuration)
- [Schema](#schema)
- [Schema and Types](#schema-and-types)
- [Generating a Schema](#generating-a-schema)
- [Generating Types](#generating-types)
- [Samples](#samples)
- [Working with Larger Files](#working-with-larger-files)
- [Command Line](#command-line-1)
Expand All @@ -34,8 +36,9 @@ python -m pip install dgraphpandas

```sh
❯ dgraphpandas --help
usage: dgraphpandas [-h] -f FILE -c CONFIG -ck CONFIG_FILE_KEY [-o OUTPUT_DIR]
[--console] [--export_csv] [--encoding ENCODING]
usage: dgraphpandas [-h] [-x {upserts,schema,types}] [-f FILE] -c CONFIG
[-ck CONFIG_FILE_KEY] [-o OUTPUT_DIR] [--console]
[--export_csv] [--encoding ENCODING]
[--chunk_size CHUNK_SIZE]
[--gz_compression_level GZ_COMPRESSION_LEVEL]
[--key_separator KEY_SEPARATOR]
Expand Down Expand Up @@ -387,7 +390,9 @@ These options can be placed on the root of the config or passed as `kwargs` dire
- Schema option to define an edge as a list. This will ensure the type is `[uid]` rather than just `uid`


## Schema
## Schema and Types

### Generating a Schema

DGraph allows you to define a [schema](https://dgraph.io/docs/query-language/schema/#sidebar). This can be generated using the same configuration used above but there are also additional options you can add such as `options` and `list_edges` which are exclusively used for schema generation.

Expand Down Expand Up @@ -440,6 +445,72 @@ class: uid @reverse .
dgraph live -s schema.txt
```

### Generating Types

DGraph also allows you to define [types](https://dgraph.io/docs/query-language/type-system/#sidebar) that can be used to categorize nodes. These can also be generated from the same configuration used for data loading.


```sh
# Model the data, define types, edges and any options
echo '
{
"transform": "horizontal",
"files": {
"animal": {
"subject_fields": ["species_id"],
"type_overrides": {
"name": "string",
"legs": "int",
"weight": "float",
"height": "float",
"discovered": "datetime64",
"aquatic": "bool"
},
"edge_fields": ["class_id", "found_in"],
"class": ["@reverse"],
"found_in": ["@reverse", "@count"],
"list_edges": ["found_in"]
},
"habitat": {
"subject_fields": ["id"],
"type_overrides": {
"name": "string"
}
}
}
}' > dgraphpandas.json

# Apply the config to the types generation logic
❯ dgraphpandas -c dgraphpandas.json -x types -v DEBUG

# Inspect Types
❯ cat types.txt
type animal {
found_in
aquatic
discovered
height
weight
legs
species
name
class
}

type habitat {
id
name
}


# Apply to DGraph
# NOTE: you should always apply the schema
# before applying types else dgraph
# won't know what the predicates are
dgraph live -s types.txt

```

## Samples

Samples can be found [here](https://github.com/kiran94/dgraphpandas/tree/main/samples). They follow a convention where the download script can be found within the `input` directory and the config, generate_upsert, publish scripts can be found at the root of each respective sample.
Expand Down
7 changes: 6 additions & 1 deletion dgraphpandas/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
from dgraphpandas import __version__, __description__, to_rdf
from dgraphpandas.strategies.schema import create_schema
from dgraphpandas.writers.schema import generate_schema
from dgraphpandas.writers.types import generate_types

pd.set_option('mode.chained_assignment', None)


def main():
parser = argparse.ArgumentParser(description=__description__)
parser.add_argument('-x', '--method', choices=['upserts', 'schema'], default='upserts')
parser.add_argument('-x', '--method', choices=['upserts', 'schema', 'types'], default='upserts')
parser.add_argument('-f', '--file', required=False, help='The Data File (CSV) to convert into RDF.')
parser.add_argument('-c', '--config', required=True, help='The DgraphPandas Configuration. See Documentation for options/examples.')
parser.add_argument('-ck', '--config_file_key', required=False, help='The Entry in the Configuration to use for this passed file.')
Expand Down Expand Up @@ -72,6 +73,10 @@ def main():
schema_frame = create_schema(args.config, ensure_xid_predicate=True, **(options))
generate_schema(schema_frame, export_schema=True, **(options))

elif args.method == 'types':
schema_frame = create_schema(args.config, ensure_xid_predicate=True, **(options))
generate_types(schema_frame, export_schema=True, **(options))


if __name__ == '__main__':
main() # pragma: no cover
78 changes: 78 additions & 0 deletions dgraphpandas/writers/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import logging
from typing import List

import pandas as pd

logger = logging.getLogger(__name__)


def generate_types(frame: pd.DataFrame, **kwargs) -> List[str]:
    '''
    Given the pre-processed DataFrame from the schema
    strategy, generate one DGraph type definition per table.

    The frame must expose the columns `column`, `type`, `table`
    and `options`. Predicates whose options contain `@reverse`
    are emitted as `<~predicate>`.

    Supported kwargs:
        output_dir: directory the export file is written to (default `.`)
        export_schema: when truthy, write the types to disk (default False)
        export_file: name of the export file (default `types.txt`)
        console: when truthy, print each type as it is built (default False)
        encoding: encoding of the export file (default `utf-8`)
        line_delimeter: separator placed between the lines of a type (default newline)

    Returns the generated type definitions. Types without reverse edges
    come first, followed by the types which contain reverse edges.

    Raises ValueError when the frame is None or a required column is missing.
    '''
    if frame is None:
        raise ValueError('frame')
    for required_column in ('column', 'type', 'table', 'options'):
        if required_column not in frame:
            raise ValueError(required_column)

    output_dir = kwargs.get('output_dir', '.')
    export_schema = kwargs.get('export_schema', False)
    export_file = kwargs.get('export_file', 'types.txt')
    console = kwargs.get('console', False)
    encoding = kwargs.get('encoding', 'utf-8')
    # The key used to be looked up with a trailing space ('line_delimeter ')
    # which meant the documented option was never honoured. The misspelt key
    # is still accepted as a fallback so existing callers keep working.
    line_delimeter = kwargs.get('line_delimeter', kwargs.get('line_delimeter ', '\n'))

    all_types: List[str] = []
    all_types_reverse: List[str] = []

    # Group by the plain column label (not a one element list) so the group
    # key is always a scalar; newer pandas yields tuples for list keys.
    for name, current_frame in frame.groupby('table'):
        logger.debug('Creating types for %s', name)

        options = current_frame['options']
        if options.notna().any():
            # na=False treats missing options as "no reverse edge",
            # regex=False matches the directive literally.
            reverse_edge_mask = options.str.contains('@reverse', regex=False, na=False)
        else:
            # An all-null options column may not support the .str accessor.
            reverse_edge_mask = pd.Series(False, index=current_frame.index)

        # Build the predicate names on a new Series instead of assigning
        # into the group, so the frame passed by the caller is never modified.
        columns = current_frame['column'].mask(
            reverse_edge_mask,
            '<~' + current_frame['column'] + '>')

        type_builder = f'type {name}'
        type_builder += ' { '
        type_builder += line_delimeter
        type_builder += line_delimeter.join(columns.unique().tolist())
        type_builder += line_delimeter
        type_builder += ' }'
        type_builder += line_delimeter

        # Split up types with reverse edges so we can guarantee they are applied after other types.
        # This is required because if dgraph live encounters a reverse edge for a type defined
        # later in the file then dgraph live will fail.
        # NOTE: There might be a better solution here, e.g. building a dependency tree
        # based on the references and applying a topological sort.
        # This approach also won't detect circular dependencies.
        if reverse_edge_mask.any():
            all_types_reverse.append(type_builder)
        else:
            all_types.append(type_builder)

        if console:
            print(type_builder)
            print(line_delimeter)

    ordered_types = all_types + all_types_reverse

    if export_schema:
        export_path = os.path.join(output_dir, export_file)
        logger.debug('Writing to %s (%s)', export_path, encoding)
        with open(export_path, 'w', encoding=encoding) as f:
            f.writelines(current_type + '\n' for current_type in ordered_types)

    return ordered_types
54 changes: 54 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,57 @@ def test_version(
main()

assert __version__ in capsys.readouterr().out


@patch('dgraphpandas.__main__.logging')
@patch('dgraphpandas.__main__.generate_types')
@patch('dgraphpandas.__main__.create_schema')
@patch('dgraphpandas.__main__.sys')
def test_types(
        argv_mock: Mock,
        create_schema_mock: Mock,
        generate_types_mock: Mock,
        logger_mock: Mock,
        capsys):
    '''
    Invoking the cli with the types method should build the schema
    frame from the config and hand that frame to the types writer.
    '''
    # argv_mock is the patched sys module, only argv is populated
    argv_mock.argv = ['script', '-x', 'types', '-c', 'config.json']
    create_schema_mock.return_value = 'fake_schema'

    # Options expected to be forwarded to both collaborators
    shared_options = {
        'add_dgraph_type_records': True,
        'drop_na_intrinsic_objects': True,
        'drop_na_edge_objects': True,
        'illegal_characters': ['%', '\\.', '\\s', '"', '\\n', '\\r\\n'],
        'illegal_characters_intrinsic_object': ['"', '\\n', '\\r\\n'],
        'chunk_size': 10000000,
    }

    main()

    assert create_schema_mock.called
    assert generate_types_mock.called

    # The schema is created from the config path with the xid predicate ensured
    schema_args, schema_kwargs = create_schema_mock.call_args_list[0]
    assert schema_args == ('config.json',)
    assert schema_kwargs == {**shared_options, 'ensure_xid_predicate': True}

    # The writer receives the schema frame and is asked to export
    writer_args, writer_kwargs = generate_types_mock.call_args_list[0]
    assert writer_args == ('fake_schema',)
    assert writer_kwargs == {**shared_options, 'export_schema': True}
Loading

0 comments on commit e5fa814

Please sign in to comment.