diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4bc5e30d3..494565c95 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: - python-version: [3.8, 3.9, '3.10', 3.11] + python-version: [3.8, 3.9, '3.10', 3.11, 3.12] steps: @@ -71,7 +71,7 @@ jobs: strategy: matrix: - python-version: [3.8, 3.9, '3.10', 3.11] + python-version: [3.8, 3.9, '3.10', 3.11, 3.12] steps: @@ -93,7 +93,7 @@ jobs: python -m venv pygraphistry source pygraphistry/bin/activate python -m pip install --upgrade pip - python -m pip install -e .[docs,test,build,bolt,igraph,networkx,gremlin,nodexl,jupyter] + python -m pip install -e .[test,build,bolt,igraph,networkx,gremlin,nodexl,jupyter] - name: Lint run: | @@ -110,6 +110,47 @@ jobs: source pygraphistry/bin/activate ./bin/test.sh + test-graphviz: + + needs: [ test-minimal-python ] + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.8, 3.9, '3.10', 3.11, 3.12] + + steps: + + - name: Checkout repo + uses: actions/checkout@v3 + with: + lfs: true + + - name: Checkout LFS objects + run: git lfs pull + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install test dependencies + run: | + python -m venv pygraphistry + source pygraphistry/bin/activate + sudo apt-get install graphviz graphviz-dev + python -m pip install --upgrade pip + python -m pip install -e .[test,pygraphviz] + + - name: Type check + run: | + source pygraphistry/bin/activate + ./bin/typecheck.sh + + - name: Graphviz tests + run: | + source pygraphistry/bin/activate + ./bin/test-graphviz.sh test-core-umap: @@ -118,6 +159,7 @@ jobs: strategy: matrix: + #python-version: [3.8, 3.9, '3.10', 3.11, 3.12] python-version: [3.8, 3.9] steps: @@ -165,6 +207,10 @@ jobs: strategy: matrix: python-version: [3.8, 3.9] + #python-version: [3.8, 3.9, '3.10', 3.11, 3.12] + #include: + # - python-version: 3.12 + # 
continue-on-error: true steps: @@ -284,7 +330,7 @@ jobs: - name: Test building docs run: | - cd docs && ./docker.sh + cd docs && ./ci.sh test-readme: diff --git a/.gitignore b/.gitignore index f8a1ee954..104d69b49 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,8 @@ coverage.xml # Sphinx documentation docs/_build/ +docs/doctrees/ +docs/source/demos/ # PyBuilder target/ @@ -87,3 +89,4 @@ demos/data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt # local jupyter dev jupyter_dev/ +docs/source/demos diff --git a/.readthedocs.yml b/.readthedocs.yml index 609e875f7..e037f6cd8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,24 +5,53 @@ # Required version: 2 -# Build documentation in the docs/ directory with Sphinx -sphinx: - configuration: docs/source/conf.py - build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.12" + apt_packages: + # More closely mirror https://github.com/sphinx-doc/sphinx-docker-images + - graphviz + - imagemagick + - make + - pandoc + - texlive-latex-base + - texlive-latex-recommended + - texlive-latex-extra + - texlive-fonts-recommended + commands: + + # setup + - pip install ".[docs]" + - cp -r demos docs/source/demos + - cp README.md docs/source/README.md + - cp ARCHITECTURE.md docs/source/ARCHITECTURE.md + - cp CONTRIBUTE.md docs/source/CONTRIBUTE.md + - cp DEVELOP.md docs/source/DEVELOP.md + + # build html + - sphinx-build -b html -d docs/doctrees docs/source $READTHEDOCS_OUTPUT/html/ + + # build epub + - sphinx-build -b epub -d docs/doctrees docs/source docs/_build/latexpdf + - mkdir -p $READTHEDOCS_OUTPUT/epub + - cp docs/_build/latexpdf/PyGraphistry.epub $READTHEDOCS_OUTPUT/epub/PyGraphistry.epub -# Optionally build your docs in additional formats such as PDF + # build pdf + - sphinx-build -b latex -d docs/doctrees docs/source docs/_build/latexpdf + - cd docs/_build/latexpdf && pdflatex -file-line-error -interaction=nonstopmode PyGraphistry.tex && pdflatex -file-line-error -interaction=nonstopmode PyGraphistry.tex && echo ok || { 
echo fail && exit 1 ; } + - mkdir -p $READTHEDOCS_OUTPUT/pdf + - cp docs/_build/latexpdf/PyGraphistry.pdf $READTHEDOCS_OUTPUT/pdf/PyGraphistry.pdf + +#for nav links? formats: - pdf - - htmlzip - epub + - htmlzip python: install: - method: pip path: . extra_requirements: - - dev \ No newline at end of file + - docs diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e735dff9..a7cf750fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,155 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Development] +## [0.34.16 - 2024-10-13] + +### Docs + +* Update and streamline readme.md +* Add quicksheet for overall +* More crosslinking + +### Infra + +* Add markdown support to docsite +* ReadTheDocs homepage reuses github README.md +* Docs pip install caches +* Drop SVGs and external images during latexpdf generation + +### Changed + +* Treemap import `squarify` deferred until use to allow core import without squarify installed, such as in `--no-deps` + +## [0.34.15 - 2024-10-11] + +### Docs + +* Improve GFQL translation doc +* Add examples and API links: Shaping, Hypergraphs, AI & ML +* Add performance docs +* Add AI examples + +## [0.34.14 - 2024-10-09] + +### Added + +* HTTP responses with error status codes log a `logging.ERROR`-level message of the status code and response body + +## [0.34.13 - 2024-10-07] + +### Docs + +* Add more GFQL cross-references + +## [0.34.12 - 2024-10-07] + +### Docs + +* Fix ipynb examples in ReadTheDocs distribution + +## [0.34.11 - 2024-10-07] + +### Fix + +* Types + +### Infra + +* Enable more Python version checks + +## [0.34.10 - 2024-10-07] + +### Fix + +* Docs: Notebook builds + +### Docs + +* More links, especially around plugins +* Update color theme to match Graphistry branding + +## [0.34.9 - 2024-10-07] + +### Fix + +* Docs: 10 Minutes to PyGraphistry links + +## [0.34.8 - 2024-10-06] + +### Fix + +* Docs: PDF support +* Docs: Links + +### Docs + +* More accessible theme + +## [0.34.7 - 
2024-10-06] + +### Docs + +* RTD: Added notebook tutorials +* RTD: Added various guides +* RTD: Added cross-references +* RTD: Cleaner navigation + +### Infra + +* Python: Add Python 3.12 to CI and document support +* Docs: Updated dependencies - Sphinx 8, Python 3.12, and various related +* Docs: Added nbsphinx - hub url grounding, ... +* Docs: Redo as a docker compose flow with incremental builds (docker, sphinx) +* Docs: Updated instructions for new flow + +### Fix + +* Docs: 2024 +* Notebooks: Compatibility with nbsphinx - exactly one title heading, no uncommented `!`, correct references, ... + +## [0.34.6 - 2024-10-04] + +### Added + +* Plugins: graphviz bindings, such as `g.layout_graphviz("dot")` + +### Docs + +* Reorganized readthedocs +* Added intro tutorials: `10 Minutes to PyGraphistry`, `10 Minutes to GFQL`, `Login and Sharing` + +## [0.34.5 - 2024-09-23] + +### Fixed + +* GFQL: Fix `chain()` regression around an incorrectly disabled check manifesting as https://github.com/graphistry/pygraphistry/issues/583 +* GFQL: Fix `chain()`, `hop()` traverse filtering logic for multi-hop edge scenarios +* GFQL: Fix `hop()` predicate handling in multihop scenarios + +### Infra + +* GFQL: Expand test suite around multihop edge predicates in `hop()` and `chain()` + +## [0.34.4 - 2024-09-20] + +### Added + +* UMAP: Optional kwargs passthrough to umap library constructor, fit, and transform methods: `g.umap(..., umap_kwargs={...}, umap_fit_kwargs={...}, umap_transform_kwargs={...})` +* Additional GPU support in featurize paths + +### Changed + +* Replace `verbose` with `logging` + +### Refactor + +* Narrow `use_scaler` and `use_scaler_target` typing to `ScalerType` (`Literal[...]`) vs `str` +* Rename `featurize_or_get_nodes_dataframe_if_X_is_None` (and edges variant) as non-private due to being shared + +### Fixed + +* get_indegrees: Fix warning https://github.com/graphistry/pygraphistry/issues/587 + ## [0.34.3 - 2024-08-03] ### Added diff --git a/README.md b/README.md 
index a50e260f7..d5dd94b92 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PyGraphistry: Explore Relationships +# PyGraphistry: Leverage the power of graphs & GPUs to visualize, analyze, and scale your data ![Build Status](https://github.com/graphistry/pygraphistry/workflows/CI%20Tests/badge.svg) [![CodeQL](https://github.com/graphistry/pygraphistry/workflows/CodeQL/badge.svg)](https://github.com/graphistry/pygraphistry/actions?query=workflow%3ACodeQL) @@ -11,1606 +11,201 @@ [![Uptime Robot status](https://img.shields.io/uptimerobot/status/m787548531-e9c7b7508fc76fea927e2313?label=hub.graphistry.com)](https://status.graphistry.com/) [](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g) [![Twitter Follow](https://img.shields.io/twitter/follow/graphistry)](https://twitter.com/graphistry) -**PyGraphistry is a dataframe-native Python visual graph AI library to extract, query, transform, analyze, model, and visualize big graphs, and especially alongside [Graphistry](https://www.graphistry.com) end-to-end GPU server sessions.** The GFQL query language supports running a large subset of the Cypher property graph query language without requiring external software and adds optional GPU acceleration. Installing PyGraphistry with the optional `graphistry[ai]` dependencies adds **graph autoML**, including automatic feature engineering, UMAP, and graph neural net support. Combined, PyGraphistry reduces your **time to graph** for going from raw data to visualizations and AI models down to three lines of code. - -The optional visual engine, Graphistry, gets used on problems like visually mapping the behavior of devices and users, investigating fraud, analyzing machine learning results, and starting in graph AI. It provides point-and-click features like timebars, search, filtering, clustering, coloring, sharing, and more. Graphistry is the only tool built ground-up for large graphs. 
The client's custom WebGL rendering engine renders up to 8MM nodes + edges at a time, and most older client GPUs smoothly support somewhere between 100K and 2MM elements. The serverside GPU analytics engine supports even bigger graphs. It smoothes graph workflows over the PyData ecosystem including Pandas/Spark/Dask dataframes, Nvidia RAPIDS GPU dataframes & GPU graphs, DGL/PyTorch graph neural networks, and various data connectors. - -The PyGraphistry Python client helps several kinds of usage modes: - -* **Data scientists**: Go from data to accelerated visual explorations in a couple lines, share live results, build up more advanced views over time, and do it all from notebook environments like Jupyter and Google Colab -* **Developers**: Quickly prototype stunning Python solutions with PyGraphistry, embed in a language-neutral way with the [REST APIs](https://hub.graphistry.com/docs/api/), and go deep on customizations like colors, icons, layouts, JavaScript, and more -* **Analysts**: Every Graphistry session is a point-and-click environment with interactive search, filters, timebars, histograms, and more -* **Dashboarding**: Embed into your favorite framework. Additionally, see our sister project [Graph-App-Kit](https://github.com/graphistry/graph-app-kit) for quickly building interactive graph dashboards by launching a stack built on PyGraphistry, StreamLit, Docker, and ready recipes for integrating with common graph libraries - -PyGraphistry is a friendly and optimized PyData-native interface to the language-neutral [Graphistry REST APIs](https://hub.graphistry.com/docs/api/). -You can use PyGraphistry with traditional Python data sources like CSVs, SQL, Neo4j, Splunk, and more (see below). Wrangle data however you want, and with especially good support for Pandas dataframes, Apache Arrow tables, Nvidia RAPIDS cuDF dataframes & cuGraph graphs, and DGL/PyTorch graph neural networks. - -1. [Interactive Demo](#demo-of-friendship-communities-on-facebook) -2. 
[Graph Gallery](#gallery) -3. [Install](#install) -4. [Tutorial](#tutorial-les-misérables) -5. [Next Steps](#next-steps) -6. [Resources](#resources) - -## Demo of Friendship Communities on Facebook -
Click to open interactive version! (For server-backed interactive analytics, use an API key) - Source data: SNAP + + Demo: Interactive visualization of 80,000+ Facebook friendships (source data)
-## **PyGraphistry is:** - -* **Fast & gorgeous:** Interactively cluster, filter, inspect large amounts of data, and zip through timebars. It clusters large graphs with a descendant of the gorgeous ForceAtlas2 layout algorithm introduced in Gephi. Our data explorer connects to Graphistry's GPU cluster to layout and render hundreds of thousand of nodes+edges in your browser at unparalleled speeds. - -* **Easy to install:** `pip install` the client in your notebook or web app, and then connect to a [free Graphistry Hub account](https://www.graphistry.com/get-started) or [launch your own private GPU server](https://www.graphistry.com/get-started) - - ```python - # pip install --user graphistry # minimal - # pip install --user graphistry[bolt,gremlin,nodexl,igraph,networkx] # data plugins - # AI modules: Python 3.8+ with scikit-learn 1.0+: - # pip install --user graphistry[umap-learn] # Lightweight: UMAP autoML (without text support); scikit-learn 1.0+ - # pip install --user graphistry[ai] # Heavy: Full UMAP + GNN autoML, including sentence transformers (1GB+) - - import graphistry - graphistry.register(api=3, username='abc', password='xyz') # Free: hub.graphistry.com - #graphistry.register(..., personal_key_id='pkey_id', personal_key_secret='pkey_secret') # Key instead of username+password+org_name - #graphistry.register(..., is_sso_login=True) # SSO instead of password - #graphistry.register(..., org_name='my-org') # Upload into an organization account vs personal - #graphistry.register(..., protocol='https', server='my.site.ngo') # Use with a self-hosted server - # ... and if client (browser) URLs are different than python server<> graphistry server uploads - #graphistry.register(..., client_protocol_hostname='https://public.acme.co') - ``` - -* **Notebook-friendly:** PyGraphistry plays well with interactive notebooks like [Jupyter](http://ipython.org), [Zeppelin](https://zeppelin.incubator.apache.org/), and [Databricks](http://databricks.com). 
Process, visualize, and drill into with graphs directly within your notebooks: - - ```python - graphistry.edges(pd.read_csv('rows.csv'), 'col_a', 'col_b').plot() - ``` - -* **Great for events, CSVs, and more:** Not sure if your data is graph-friendly? PyGraphistry's `hypergraph` transform helps turn any sample data like CSVs, SQL results, and event data into a graph for pattern analysis: - - ```python - rows = pandas.read_csv('transactions.csv')[:1000] - graphistry.hypergraph(rows)['graph'].plot() - ``` - -* **Embeddable:** Drop live views into your web dashboards and apps (and go further with [JS/React](https://hub.graphistry.com/docs)): - - ```python - iframe_url = g.plot(render=False) - print(f'') - ``` - -* **Configurable:** In-tool or via the declarative APIs, use the powerful encodings systems for tasks like coloring by time, sizing by score, clustering by weight, show icons by type, and more. - -* **Shareable:** Share live links, configure who has access, and more! [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb) - -* **Graph AI that is fast & easy:** In oneines of code, turn messy data into feature vectors for modeling, GNNs for training pipelines, lower dimensional embeddings, and visualizations: - - ```python - df = pandas.read_csv('accounts.csv') - - # UMAP dimensionality reduction with automatic feature engineering - g1 = graphistry.nodes(df).umap() - - # Automatically shows top inferred similarity edges g1._edges - g1.plot() - - # Optional: Use subset of columns, supervised learning target, & more - g2.umap(X=['name', 'description', 'amount'], y=['label_col_1']).plot() - ``` - -### Explore any data as a graph - -It is easy to turn arbitrary data into insightful graphs. PyGraphistry comes with many built-in connectors, and by supporting Python dataframes (Pandas, Arrow, RAPIDS), it's easy to bring standard Python data libraries. 
If the data comes as a table instead of a graph, PyGraphistry will help you extract and explore the relationships. - -* [Pandas](http://pandas.pydata.org) - - ```python - edges = pd.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst']) - graphistry.edges(edges, 'src', 'dst').plot() - ``` - - ```python - table_rows = pd.read_csv('honeypot.csv') - graphistry.hypergraph(table_rows, ['attackerIP', 'victimIP', 'victimPort', 'vulnName'])['graph'].plot() - ``` - - ```python - graphistry.hypergraph(table_rows, ['attackerIP', 'victimIP', 'victimPort', 'vulnName'], - direct=True, - opts={'EDGES': { - 'attackerIP': ['victimIP', 'victimPort', 'vulnName'], - 'victimIP': ['victimPort', 'vulnName'], - 'victimPort': ['vulnName'] - }})['graph'].plot() - ``` - - ```python - ### Override smart defaults with custom settings - g1 = graphistry.bind(source='src', destination='dst').edges(edges) - g2 = g1.nodes(nodes).bind(node='col2') - g3 = g2.bind(point_color='col3') - g4 = g3.settings(url_params={'edgeInfluence': 1.0, play: 2000}) - url = g4.plot(render=False) - ``` - - ```python - ### Read back data and create modified variants - enriched_edges = my_function1(g1._edges) - enriched_nodes = my_function2(g1._nodes) - g2 = g1.edges(enriched_edges).nodes(enriched_nodes) - g2.plot() - ``` - -* GFQL: Cypher-style graph pattern mining queries on dataframes with optional GPU acceleration ([ipynb demo](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb), [benchmark](https://github.com/graphistry/pygraphistry/blob/master/demos/gfql/benchmark_hops_cpu_gpu.ipynb)) - - Run Cypher-style graph queries natively on dataframes without going to a database or Java with GFQL: - - ```python - from graphistry import n, e_undirected, is_in - - g2 = g1.chain([ - n({'user': 'Biden'}), - e_undirected(), - n(name='bridge'), - e_undirected(), - n({'user': is_in(['Trump', 'Obama'])}) - ]) - - print('# bridges', 
len(g2._nodes[g2._nodes.bridge])) - g2.plot() - ``` - - Enable GFQL's optional automatic GPU acceleration for 43X+ speedups: - - ```python - # Switch from Pandas CPU dataframes to RAPIDS GPU dataframes - import cudf - g2 = g1.edges(lambda g: cudf.DataFrame(g._edges)) - # GFQL will automaticallly run on a GPU - g3 = g2.chain([n(), e(hops=3), n()]) - g3.plot() - ``` - -* [Spark](https://spark.apache.org/)/[Databricks](https://databricks.com/) ([ipynb demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb), [dbc demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.dbc)) - - ```python - #optional but recommended - spark.conf.set("spark.sql.execution.arrow.enabled", "true") - - edges_df = ( - spark.read.format('json'). - load('/databricks-datasets/iot/iot_devices.json') - .sample(fraction=0.1) - ) - g = graphistry.edges(edges_df, 'device_name', 'cn') - - #notebook - displayHTML(g.plot()) - - #dashboard: pick size of choice - displayHTML( - g.settings(url_params={'splashAfter': 'false'}) - .plot(override_html_style=""" - width: 50em; - height: 50em; - """) - ) - ``` - -* GPU [RAPIDS.ai](https://www.rapids.ai) cudf - - ```python - edges = cudf.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst']) - graphistry.edges(edges, 'src', 'dst').plot() - ``` - -* GPU [RAPIDS.ai](https://www.rapids.ai) cuML - - ```python - g = graphistry.nodes(cudf.read_csv('rows.csv')) - g = graphistry.nodes(G) - g.umap(engine='cuml',metric='euclidean').plot() - ``` - -* GPU [RAPIDS.ai](https://www.rapids.ai) cugraph ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/gpu_rapids/cugraph.ipynb)) - - ```python - g = graphistry.from_cugraph(G) - g2 = g.compute_cugraph('pagerank') - g3 = g2.layout_cugraph('force_atlas2') - g3.plot() - G3 = g.to_cugraph() - ``` - -* [Apache 
Arrow](https://arrow.apache.org/) - - ```python - edges = pa.Table.from_pandas(pd.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst'])) - graphistry.edges(edges, 'src', 'dst').plot() - ``` - -* [Neo4j](http://neo4j.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/neo4j/official/graphistry_bolt_tutorial_public.ipynb)) - - ```python - NEO4J_CREDS = {'uri': 'bolt://my.site.ngo:7687', 'auth': ('neo4j', 'mypwd')} - graphistry.register(bolt=NEO4J_CREDS) - graphistry.cypher("MATCH (n1)-[r1]->(n2) RETURN n1, r1, n2 LIMIT 1000").plot() - ``` - - ```python - graphistry.cypher("CALL db.schema()").plot() - ``` - - ```python - from neo4j import GraphDatabase, Driver - graphistry.register(bolt=GraphDatabase.driver(**NEO4J_CREDS)) - g = graphistry.cypher(""" - MATCH (a)-[p:PAYMENT]->(b) - WHERE p.USD > 7000 AND p.USD < 10000 - RETURN a, p, b - LIMIT 100000""") - print(g._edges.columns) - g.plot() - ``` - -* [Memgraph](https://memgraph.com/) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/memgraph/visualizing_iam_dataset.ipynb)) - - ```python - from neo4j import GraphDatabase - MEMGRAPH = { - 'uri': "bolt://localhost:7687", - 'auth': (" ", " ") - } - graphistry.register(bolt=MEMGRAPH) - ``` - - ```python - driver = GraphDatabase.driver(**MEMGRAPH) - with driver.session() as session: - session.run(""" - CREATE (per1:Person {id: 1, name: "Julie"}) - CREATE (fil2:File {id: 2, name: "welcome_to_memgraph.txt"}) - CREATE (per1)-[:HAS_ACCESS_TO]->(fil2) """) - g = graphistry.cypher(""" - MATCH (node1)-[connection]-(node2) - RETURN node1, connection, node2;""") - g.plot() - ``` - -* [Azure Cosmos DB (Gremlin)](https://azure.microsoft.com/en-us/services/cosmos-db/) - - ```python - # pip install --user gremlinpython - # Options in help(graphistry.cosmos) - g = graphistry.cosmos( - COSMOS_ACCOUNT='', - COSMOS_DB='', - COSMOS_CONTAINER='', - COSMOS_PRIMARY_KEY='' - ) - g2 = 
g.gremlin('g.E().sample(10000)').fetch_nodes() - g2.plot() - ``` - -* [Amazon Neptune (Gremlin)](https://aws.amazon.com/neptune/) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb), [dashboarding demo](https://aws.amazon.com/blogs/database/enabling-low-code-graph-data-apps-with-amazon-neptune-and-graphistry/)) - - ```python - # pip install --user gremlinpython==3.4.10 - # - Deploy tips: https://github.com/graphistry/graph-app-kit/blob/master/docs/neptune.md - # - Versioning tips: https://gist.github.com/lmeyerov/459f6f0360abea787909c7c8c8f04cee - # - Login options in help(graphistry.neptune) - g = graphistry.neptune(endpoint='wss://zzz:8182/gremlin') - g2 = g.gremlin('g.E().limit(100)').fetch_nodes() - g2.plot() - ``` - -* [TigerGraph](https://tigergraph.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/tigergraph/tigergraph_pygraphistry_bindings.ipynb)) - - ```python - g = graphistry.tigergraph(protocol='https', ...) 
- g2 = g.gsql("...", {'edges': '@@eList'}) - g2.plot() - print('# edges', len(g2._edges)) - ``` - - ```python - g.endpoint('my_fn', {'arg': 'val'}, {'edges': '@@eList'}).plot() - ``` - -* [igraph](http://igraph.org) - - ```python - edges = pd.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst']) - g_a = graphistry.edges(edges, 'src', 'dst') - g_b = g_a.layout_igraph('sugiyama', directed=True) # directed: for to_igraph - g_b.compute_igraph('pagerank', params={'damping': 0.85}).plot() #params: for layout - - ig = igraph.read('facebook_combined.txt', format='edgelist', directed=False) - g = graphistry.from_igraph(ig) # full conversion - g.plot() - - ig2 = g.to_igraph() - ig2.vs['spinglass'] = ig2.community_spinglass(spins=3).membership - # selective column updates: preserve g._edges; merge 1 attribute from ig into g._nodes - g2 = g.from_igraph(ig2, load_edges=False, node_attributes=[g._node, 'spinglass']) - ``` - -* [NetworkX](https://networkx.github.io) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/networkx/networkx.ipynb)) - - ```python - graph = networkx.read_edgelist('facebook_combined.txt') - graphistry.bind(source='src', destination='dst', node='nodeid').plot(graph) - ``` - -* [HyperNetX](https://github.com/pnnl/HyperNetX) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/hypernetx/hypernetx.ipynb)) - - ```python - hg.hypernetx_to_graphistry_nodes(H).plot() - ``` - - ```python - hg.hypernetx_to_graphistry_bipartite(H.dual()).plot() - ``` - -* [Splunk](https://www.splunk.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb)) - - ```python - df = splunkToPandas("index=netflow bytes > 100000 | head 100000", {}) - graphistry.edges(df, 'src_ip', 'dest_ip').plot() - ``` - -* [NodeXL](https://www.nodexl.com) ([notebook 
demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb)) - - ```python - graphistry.nodexl('/my/file.xls').plot() - ``` - - ```python - graphistry.nodexl('https://file.xls').plot() - ``` - - ```python - graphistry.nodexl('https://file.xls', 'twitter').plot() - graphistry.nodexl('https://file.xls', verbose=True).plot() - graphistry.nodexl('https://file.xls', engine='xlsxwriter').plot() - graphistry.nodexl('https://file.xls')._nodes - ``` - -## Graph AI in a single line of code - -Graph autoML features including: - -### Generate features from raw data - -Automatically and intelligently transform text, numbers, booleans, and other formats to AI-ready representations: - -* Featurization - - ```python - g = graphistry.nodes(df).featurize(kind='nodes', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...) - - print('X', g._node_features) - print('y', g._node_target) - ``` +PyGraphistry is an open source Python library for data scientists and developers to leverage the power of graph visualization, analytics, AI, including with native GPU acceleration: -* Set `kind='edges'` to featurize edges: +* [**Python dataframe-native graph processing:**](https://pygraphistry.readthedocs.io/en/latest/10min.html) Quickly ingest & prepare data in many formats, shapes, and scales as graphs. Use tools like Pandas, Spark, [RAPIDS (GPU)](https://www.rapids.ai), and [Apache Arrow](https://arrow.apache.org/). - ```python - g = graphistry.edges(df, src, dst).featurize(kind='edges', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...) 
- ``` +* [**Integrations:**](https://pygraphistry.readthedocs.io/en/latest/plugins.html) Plug into [Amazon Neptune](https://docs.aws.amazon.com/neptune/latest/userguide/visualization-graphistry.html) ([notebook](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.html)), [cuGraph](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/gpu_rapids/cugraph.html), [Databricks](https://www.databricks.com/solutions/accelerators/incident-investigation-using-graphistry) ([notebook](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.html)), [graphviz](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/graphviz/graphviz.html), [Neo4j](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/neo4j/official/graphistry_bolt_tutorial_public.html), [Splunk](https://www.splunk.com/en_us/blog/security/supercharge-cybersecurity-investigations-with-splunk-and-graphistry-a-powerful-combination-for-interactive-graph-exploration.html) ([notebook](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/splunk/splunk_demo_public.html)), [TigerGraph](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/tigergraph/tigergraph_pygraphistry_bindings.html), and many more in the [notebook data provider demo gallery](https://pygraphistry.readthedocs.io/en/latest/notebooks/plugins.connectors.html). -* Use generated features with both Graphistry and external libraries: - ```python - # graphistry - g = g.umap() # UMAP, GNNs, use features if already provided, otherwise will compute +* [**Prototype locally and deploy remotely:**](https://www.graphistry.com/get-started) Prototype from notebooks like Jupyter and Databricks using local CPUs & GPUs, and then power production dashboards & pipelines with Graphistry Hub and your own self-hosted servers. 
- # other pydata libraries - X = g._node_features # g._get_feature('nodes') or g.get_matrix() - y = g._node_target # g._get_target('nodes') or g.get_matrix(target=True) - from sklearn.ensemble import RandomForestRegressor - model = RandomForestRegressor().fit(X, y) # assumes train/test split - new_df = pandas.read_csv(...) # mini batch - X_new, _ = g.transform(new_df, None, kind='nodes', return_graph=False) - preds = model.predict(X_new) - ``` +* [**Query graphs with GFQL:**](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html) Use GFQL, the first dataframe-native graph query language, to ask relationship questions that are difficult for tabular tools and without requiring a database. -* Encode model definitions and compare models against each other +* [**graphistry[ai]:**](https://pygraphistry.readthedocs.io/en/latest/gfql/combo.html#) Call streamlined graph ML & AI methods to benefit from clustering, UMAP embeddings, graph neural networks, automatic feature engineering, and more. - ```python - # graphistry - from graphistry.features import search_model, topic_model, ngrams_model, ModelDict, default_featurize_parameters, default_umap_parameters +* [**Visualize & explore large graphs:**](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html#) In just a few minutes, create stunning interactive visualizations with millions of edges and many point-and-click built-ins like drilldowns, timebars, and filtering. When ready, customize with Python, JavaScript, and REST APIs. - g = graphistry.nodes(df) - g2 = g.umap(X=[..], y=[..], **search_model) +* [**Columnar & GPU acceleration:**](https://pygraphistry.readthedocs.io/en/latest/performance.html) CPU-mode ingestion and wrangling is fast due to native use of Apache Arrow and columnar analytics, and the optional RAPIDS-based GPU mode delivers 100X+ speedups. 
- # set custom encoding model with any feature/umap/dbscan kwargs - new_model = ModelDict(message='encoding new model parameters is easy', **default_featurize_parameters) - new_model.update(dict( - y=[...], - kind='edges', - model_name='sbert/cool_transformer_model', - use_scaler_target='kbins', - n_bins=11, - strategy='normal')) - print(new_model) - g3 = g.umap(X=[..], **new_model) - # compare g2 vs g3 or add to different pipelines - ``` +From global top-10 banks, manufacturers, news agencies, and government agencies, to startups, game companies, scientists, biotechs, and NGOs, many teams are tackling their graph workloads with Graphistry. -See `help(g.featurize)` for more options -### [sklearn-based UMAP](https://umap-learn.readthedocs.io/en/latest/), [cuML-based UMAP](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP) -* Reduce dimensionality by plotting a similarity graph from feature vectors: - - ```python - # automatic feature engineering, UMAP - g = graphistry.nodes(df).umap() - - # plot the similarity graph without any explicit edge_dataframe passed in -- it is created during UMAP. - g.plot() - ``` - -* Apply a trained model to new data: - - ```python - new_df = pd.read_csv(...) - embeddings, X_new, _ = g.transform_umap(new_df, None, kind='nodes', return_graph=False) - ``` - -* Infer a new graph from new data using the old umap coordinates to run inference without having to train a new umap model. - - ```python - new_df = pd.read_csv(...) 
- g2 = g.transform_umap(new_df, return_graph=True) # return_graph=True is default - g2.plot() # - - # or if you want the new minibatch to cluster to closest points in previous fit: - g3 = g.transform_umap(new_df, return_graph=True, merge_policy=True) - g3.plot() # useful to see how new data connects to old -- play with `sample` and `n_neighbors` to control how much of old to include - ``` - -* UMAP supports many options, such as supervised mode, working on a subset of columns, and passing arguments to underlying `featurize()` and UMAP implementations (see `help(g.umap)`): - - ```python - g.umap(kind='nodes', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...) - ``` - -* `umap(engine="...")` supports multiple implementations. It defaults to using the GPU-accelerated `engine="cuml"` when a GPU is available, resulting in orders-of-magnitude speedups, and falls back to CPU processing via `engine="umap_learn"`.: - - ```python - g.umap(engine='cuml') - ``` - -You can also featurize edges and UMAP them as we did above. - -UMAP support is rapidly evolving, please contact the team directly or on Slack for additional discussions - -See `help(g.umap)` for more options - -### [GNN models](https://docs.dgl.ai/en/0.6.x/index.html) - -* Graphistry adds bindings and automation to working with popular GNN models, currently focusing on DGL/PyTorch: - - ```python - g = (graphistry - .nodes(ndf) - .edges(edf, src, dst) - .build_gnn( - X_nodes=['col_1', ..., 'col_n'], #columns from nodes_dataframe - y_nodes=['label', ..., 'other_targets'], - X_edges=['col_1_edge', ..., 'col_n_edge'], #columns from edges_dataframe - y_edges=['label_edge', ..., 'other_targets_edge'], - ...) 
- ) - G = g.DGL_graph - - from [your_training_pipeline] import train, model - # Train - g = graphistry.nodes(df).build_gnn(y_nodes=`target`) - G = g.DGL_graph - train(G, model) - # predict on new data - X_new, _ = g.transform(new_df, None, kind='nodes' or 'edges', return_graph=False) # no targets - predictions = model.predict(G_new, X_new) - ``` - -Like `g.umap()`, GNN layers automate feature engineering (`.featurize()`) - -See `help(g.build_gnn)` for options. - -GNN support is rapidly evolving, please contact the team directly or on Slack for additional discussions - -### [Semantic Search](https://www.sbert.net/examples/applications/semantic-search/README.html) - -* Search textual data semantically and see the resulting graph: - - ```python - ndf = pd.read_csv(nodes.csv) - edf = pd.read_csv(edges.csv) - - g = graphistry.nodes(ndf, 'node').edges(edf, 'src', 'dst') - - g2 = g.featurize(X = ['text_col_1', .., 'text_col_n'], kind='nodes', - min_words = 0, # forces all named columns as textual ones - #encode text as paraphrase embeddings, supports any sbert model - model_name = "paraphrase-MiniLM-L6-v2") - - # or use convienence `ModelDict` to store parameters - - from graphistry.features import search_model - g2 = g.featurize(X = ['text_col_1', .., 'text_col_n'], kind='nodes', **search_model) - - # query using the power of transformers to find richly relevant results - - results_df, query_vector = g2.search('my natural language query', ...) - - print(results_df[['_distance', 'text_col', ..]]) #sorted by relevancy - - # or see graph of matching entities and original edges - - g2.search_graph('my natural language query', ...).plot() - - ``` - -* If edges are not given, `g.umap(..)` will supply them: - - ```python - ndf = pd.read_csv(nodes.csv) - g = graphistry.nodes(ndf) - g2 = g.umap(X = ['text_col_1', .., 'text_col_n'], min_words=0, ...) 
- - g2.search_graph('my natural language query', ...).plot() - ``` - -See `help(g.search_graph)` for options - -### Knowledge Graph Embeddings - -* Train a RGCN model and predict: - - ```python - edf = pd.read_csv(edges.csv) - g = graphistry.edges(edf, src, dst) - g2 = g.embed(relation='relationship_column_of_interest', **kwargs) - - # predict links over all nodes - g3 = g2.predict_links_all(threshold=0.95) # score high confidence predicted edges - g3.plot() - - # predict over any set of entities and/or relations. - # Set any `source`, `destination` or `relation` to `None` to predict over all of them. - # if all are None, it is better to use `g.predict_links_all` for speed. - g4 = g2.predict_links(source=['entity_k'], - relation=['relationship_1', 'relationship_4', ..], - destination=['entity_l', 'entity_m', ..], - threshold=0.9, # score threshold - return_dataframe=False) # set to `True` to return dataframe, or just access via `g4._edges` - ``` - -* Detect Anamolous Behavior (example use cases such as Cyber, Fraud, etc) - - ```python - # Score anomolous edges by setting the flag `anomalous` to True and set confidence threshold low - g5 = g.predict_links_all(threshold=0.05, anomalous=True) # score low confidence predicted edges - g5.plot() - - g6 = g.predict_links(source=['ip_address_1', 'user_id_3'], - relation=['attempt_logon', 'phishing', ..], - destination=['user_id_1', 'active_directory', ..], - anomalous=True, - threshold=0.05) - g6.plot() - ``` - -* Train a RGCN model including auto-featurized node embeddings - - ```python - edf = pd.read_csv(edges.csv) - ndf = pd.read_csv(nodes.csv) # adding node dataframe - - g = graphistry.edges(edf, src, dst).nodes(ndf, node_column) - - # inherets all the featurization `kwargs` from `g.featurize` - g2 = g.embed(relation='relationship_column_of_interest', use_feat=True, **kwargs) - g2.predict_links_all(threshold=0.95).plot() - ``` - -See `help(g.embed)`, `help(g.predict_links)` , or `help(g.predict_links_all)` for options 
- -### DBSCAN - -* Enrich UMAP embeddings or featurization dataframe with GPU or CPU DBSCAN - - ```python - g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node') - - # cluster by UMAP embeddings - kind = 'nodes' | 'edges' - g2 = g.umap(kind=kind).dbscan(kind=kind) - print(g2._nodes['_dbscan']) | print(g2._edges['_dbscan']) - - # dbscan in `umap` or `featurize` via flag - g2 = g.umap(dbscan=True, min_dist=0.2, min_samples=1) - - # or via chaining, - g2 = g.umap().dbscan(min_dist=1.2, min_samples=2, **kwargs) - - # cluster by feature embeddings - g2 = g.featurize().dbscan(**kwargs) - - # cluster by a given set of feature column attributes, inhereted from `g.get_matrix(cols)` - g2 = g.featurize().dbscan(cols=['ip_172', 'location', 'alert'], **kwargs) - - # equivalent to above (ie, cols != None and umap=True will still use features dataframe, rather than UMAP embeddings) - g2 = g.umap().dbscan(cols=['ip_172', 'location', 'alert'], umap=True | False, **kwargs) - g2.plot() # color by `_dbscan` - - new_df = pd.read_csv(..) - # transform on new data according to fit dbscan model - g3 = g2.transform_dbscan(new_df) - ``` - -See `help(g.dbscan)` or `help(g.transform_dbscan)` for options - -### Quickly configurable - -Set visual attributes through [quick data bindings](https://hub.graphistry.com/docs/api/2/rest/upload/#createdataset2) and set [all sorts of URL options](https://hub.graphistry.com/docs/api/1/rest/url/). 
Check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-colors.ipynb), [sizes](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-sizes.ipynb), [icons](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-icons.ipynb), [badges](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-badges.ipynb), [weighted clustering](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/edge-weights.ipynb) and [sharing controls](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb): +## Gallery - ```python - g - .privacy(mode='private', invited_users=[{'email': 'friend1@site.ngo', 'action': '10'}], notify=False) - .edges(df, 'col_a', 'col_b') - .edges(my_transform1(g._edges)) - .nodes(df, 'col_c') - .nodes(my_transform2(g._nodes)) - .bind(source='col_a', destination='col_b', node='col_c') - .bind( - point_color='col_a', - point_size='col_b', - point_title='col_c', - point_x='col_d', - point_y='col_e') - .bind( - edge_color='col_m', - edge_weight='col_n', - edge_title='col_o') - .encode_edge_color('timestamp', ["blue", "yellow", "red"], as_continuous=True) - .encode_point_icon('device_type', categorical_mapping={'macbook': 'laptop', ...}) - .encode_point_badge('passport', 'TopRight', categorical_mapping={'Canada': 'flag-icon-ca', ...}) - .encode_point_color('score', ['black', 'white']) - .addStyle(bg={'color': 'red'}, fg={}, page={'title': 'My Graph'}, logo={}) - .settings(url_params={ - 'play': 2000, - 'menu': True, 'info': True, - 'showArrows': True, - 'pointSize': 2.0, 'edgeCurvature': 0.5, - 'edgeOpacity': 1.0, 'pointOpacity': 1.0, - 'lockedX': False, 'lockedY': False, 'lockedR': False, - 'linLog': False, 'strongGravity': False, 
'dissuadeHubs': False, - 'edgeInfluence': 1.0, 'precisionVsSpeed': 1.0, 'gravity': 1.0, 'scalingRatio': 1.0, - 'showLabels': True, 'showLabelOnHover': True, - 'showPointsOfInterest': True, 'showPointsOfInterestLabel': True, 'showLabelPropertiesOnHover': True, - 'pointsOfInterestMax': 5 - }) - .plot() - ``` -### Gallery +The [notebook demo gallery](https://pygraphistry.readthedocs.io/en/latest/demos/for_analysis.html) shares many more live visualizations, demos, and integration examples - - - + + + - - + +
Twitter Botnet
Edit Wars on Wikipedia
Source: SNAP
100,000 Bitcoin Transactions
Twitter Botnet
Edit Wars on Wikipedia
(data)
100,000 Bitcoin Transactions
Port Scan Attack
Protein Interactions
Source: BioGRID
Programming Languages
Source: Socio-PLT project
Protein Interactions
(data)
Programming Languages
(data)
-## Install - -### Get - -You need to install the PyGraphistry Python client and connect it to a Graphistry GPU server of your choice: - -1. Graphistry server account: - * Create a free [Graphistry Hub account](https://www.graphistry.com/get-started) for open data, or [one-click launch your own private AWS/Azure instance](https://www.graphistry.com/get-started) - * Later, [setup and manage](https://github.com/graphistry/graphistry-cli) your own private Docker instance ([contact](https://www.graphistry.com/demo-request)) - -2. PyGraphistry Python client: - * `pip install --user graphistry` (Python 3.8+) or [directly call the HTTP API](https://hub.graphistry.com/docs/api/) - * Use `pip install --user graphistry[all]` for optional dependencies such as Neo4j drivers - * To use from a notebook environment, run your own [Jupyter](https://jupyter.org/) server ([one-click launch your own private AWS/Azure GPU instance](https://www.graphistry.com/get-started)) or another such as [Google Colab](https://colab.research.google.com) - * See immediately following `configure` section for how to connect - -### Configure - -Most users connect to a Graphistry GPU server account via: - -* `graphistry.register(api=3, username='abc', password='xyz')`: personal hub.graphistry.com account -* `graphistry.register(api=3, username='abc', password='xyz', org_name='optional_org')`: team hub.graphistry.com account -* `graphistry.register(api=3, username='abc', password='xyz', org_name='optiona_org', protocol='http', server='my.private_server.org')`: private server - -For more advanced configuration, read on for: - -* Version: Use protocol `api=3`, which will soon become the default, or a legacy version - -* JWT Tokens: Connect to a GPU server by providing a `username='abc'`/`password='xyz'`, or for advanced long-running service account software, a refresh loop using 1-hour-only JWT tokens - -* Organizations: Optionally use `org_name` to set a specific organization - -* Private servers: 
PyGraphistry defaults to using the free [Graphistry Hub](https://hub.graphistry.com) public API - - * Connect to a [private Graphistry server](https://www.graphistry.com/get-started) and provide optional settings specific to it via `protocol`, `server`, and in some cases, `client_protocol_hostname` - -Non-Python users may want to explore the underlying language-neutral [authentication REST API docs](https://hub.graphistry.com/docs/api/1/rest/auth/). - -#### Advanced Login - -* **For people:** Provide your account username/password: - -```python -import graphistry -graphistry.register(api=3, username='username', password='your password') -``` - -* **For service accounts**: Long-running services may prefer to use 1-hour JWT tokens: - -```python -import graphistry -graphistry.register(api=3, username='username', password='your password') -initial_one_hour_token = graphistry.api_token() -graphistry.register(api=3, token=initial_one_hour_token) - -# must run every 59min -graphistry.refresh() -fresh_token = graphistry.api_token() -assert initial_one_hour_token != fresh_token -``` - -Refreshes exhaust their limit every day/month. An upcoming Personal Key feature enables non-expiring use. - -Alternatively, you can rerun `graphistry.register(api=3, username='username', password='your password')`, which will also fetch a fresh token. - -#### Advanced: Private servers - server uploads - -Specify which Graphistry server to reach for Python uploads: - -```python -graphistry.register(protocol='https', server='hub.graphistry.com') -``` - -Private Graphistry notebook environments are preconfigured to fill in this data for you: - -```python -graphistry.register(protocol='http', server='nginx', client_protocol_hostname='') -``` - -Using `'http'`/`'nginx'` ensures uploads stay within the Docker network (vs. going more slowly through an outside network), and client protocol `''` ensures the browser URLs do not show `http://nginx/`, and instead use the server's name. 
(See immediately following **Switch client URL** section.) - -#### Advanced: Private servers - switch client URL for browser views - -In cases such as when the notebook server is the same as the Graphistry server, you may want your Python code to *upload* to a known local Graphistry address without going outside the network (e.g., `http://nginx` or `http://localhost`), but for web viewing, generate and embed URLs to a different public address (e.g., `https://graphistry.acme.ngo/`). In this case, explicitly set a client (browser) location different from `protocol` / `server`: - -```python -graphistry.register( - ### fast local notebook<>graphistry upload - protocol='http', server='nginx', - - ### shareable public URL for browsers - client_protocol_hostname='https://graphistry.acme.ngo' -) -``` - -Prebuilt Graphistry servers are already setup to do this out-of-the-box. - -#### Advanced: Sharing controls - -Graphistry supports flexible sharing permissions that are similar to Google documents and Dropbox links - -By default, visualizations are publicly viewable by anyone with the URL (that is unguessable & unlisted), and only editable by their owner. -* Private-only: You can globally default uploads to private: -```python -graphistry.privacy() # graphistry.privacy(mode='private') -``` - -* Organizations: You can login with an organization and share only within it - -```python -graphistry.register(api=3, username='...', password='...', org_name='my-org123') -graphistry.privacy(mode='organization') -``` - -* Invitees: You can share access to specify users, and optionally, even email them invites - -```python -VIEW = "10" -EDIT = "20" -graphistry.privacy( - mode='private', - invited_users=[ - {"email": "friend1@site1.com", "action": VIEW}, - {"email": "friend2@site2.com", "action": EDIT} - ], - notify=True) -``` +## Install -* Per-visualization: You can choose different rules for global defaults vs. 
for specific visualizations +Common configurations: -```python -graphistry.privacy(invited_users=[...]) -g = graphistry.hypergraph(pd.read_csv('...'))['graph'] -g.privacy(notify=True).plot() -``` +* **Minimal core** -See additional examples in the [sharing tutorial](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb) + Includes: The GFQL dataframe-native graph query language, built-in layouts, Graphistry visualization server client -## Tutorial: Les Misérables + ```bash + pip install graphistry + ``` -Let's visualize relationships between the characters in [Les Misérables](http://en.wikipedia.org/wiki/Les_Misérables). -For this example, we'll choose [Pandas](http://pandas.pydata.org) to wrangle data and [igraph](http://igraph.org) to run a community detection algorithm. You can [view](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/simple/MarvelTutorial.ipynb) the Jupyter notebook containing this example. + Does not include `graphistry[ai]`, plugins -Our [dataset is a CSV file](https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/lesmiserables.csv) that looks like this: +* **No dependencies and user-level** -| source | target | value | -| ------------- |:-------------:| ------:| -| Cravatte | Myriel | 1 -| Valjean | Mme.Magloire | 3 -| Valjean | Mlle.Baptistine | 3 + ```bash + pip install --no-deps --user graphistry + ``` -*Source* and *target* are character names, and the *value* column counts the number of time they meet. Parsing is a one-liner with Pandas: +* **GPU acceleration** - Optional -```python -import pandas -links = pandas.read_csv('./lesmiserables.csv') -``` + Local GPU: Install [RAPIDS](https://www.rapids.ai) and/or deploy a GPU-ready [Graphistry server](https://www.graphistry.com/get-started) + + Remote GPU: Use the [remote endpoints](https://www.graphistry.com/blog/graphistry-2-41-3). 
-### Quick Visualization +For further options, see the [installation guides](https://pygraphistry.readthedocs.io/en/latest/install/index.html) -If you already have graph-like data, use this step. Otherwise, try the [Hypergraph Transform](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_by_use_case/logs/malware-hypergraph/Malware%20Hypergraph.ipynb) for creating graphs from rows of data (logs, samples, records, ...). -PyGraphistry can plot graphs directly from Pandas data frames, Arrow tables, cuGraph GPU data frames, igraph graphs, or NetworkX graphs. Calling *plot* uploads the data to our visualization servers and return an URL to an embeddable webpage containing the visualization. +## Visualization quickstart -To define the graph, we `bind` *source* and *destination* to the columns indicating the start and end nodes of each edges: +Quickly go from raw data to a styled and interactive Graphistry graph visualization: ```python import graphistry -graphistry.register(api=3, username='YOUR_ACCOUNT_HERE', password='YOUR_PASSWORD_HERE') - -g = graphistry.bind(source="source", destination="target") -g.edges(links).plot() -``` - -You should see a beautiful graph like this one: -![Graph of Miserables](http://i.imgur.com/dRHHTyK.png) - -### Adding Labels - -Let's add labels to edges in order to show how many times each pair of characters met. We create a new column called *label* in edge table *links* that contains the text of the label and we bind *edge_label* to it. - -```python -links["label"] = links.value.map(lambda v: "#Meetings: %d" % v) -g = g.bind(edge_title="label") -g.edges(links).plot() -``` - -### Controlling Node Title, Size, Color, and Position - -Let's size nodes based on their [PageRank](http://en.wikipedia.org/wiki/PageRank) score and color them using their [community](https://en.wikipedia.org/wiki/Community_structure). 
- -#### Warmup: igraph for computing statistics - -[igraph](http://igraph.org/python/) already has these algorithms implemented for us for small graphs. (See our cuGraph examples for big graphs.) If igraph is not already installed, fetch it with `pip install igraph`. - -We start by converting our edge dateframe into an igraph. The plotter can do the conversion for us using the *source* and *destination* bindings. Then we compute two new node attributes (*pagerank* & *community*). - -```python -g = g.compute_igraph('pagerank', directed=True, params={'damping': 0.85}).compute_igraph('community_infomap') -``` - -The algorithm names `'pagerank'` and `'community_infomap'` correspond to method names of [igraph.Graph](https://igraph.org/python/api/latest/igraph.Graph.html). Likewise, optional `params={...}` allow specifying additional parameters. - -#### Bind node data to visual node attributes - -We can then bind the node `community` and `pagerank` columns to visualization attributes: - -```python -g.bind(point_color='community', point_size='pagerank').plot() -``` - -See the [color palette documentation](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2) for specifying color values by using built-in ColorBrewer palettes (`int32`) or custom RGB values (`int64`). - -To control the position, we can add `.bind(point_x='colA', point_y='colB').settings(url_params={'play': 0})` ([see demos](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/external_layout) and [additional url parameters](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions)]). In `api=1`, you created columns named `x` and `y`. - -You may also want to bind `point_title`: `.bind(point_title='colA')`. 
- -For more in-depth examples, check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-colors.ipynb) and [sizes](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-sizes.ipynb). - -![Second Graph of Miserables](http://i.imgur.com/P7fm5sn.png) - -### Add edge colors and weights - -By default, edges get colored as a gradient between their source/destination node colors. You can override this by setting `.bind(edge_color='colA')`, similar to how node colors function. ([See color documentation](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2).) - -Similarly, you can bind the edge weight, where higher weights cause nodes to cluster closer together: `.bind(edge_weight='colA')`. [See tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/edge-weights.ipynb). - -For more in-depth examples, check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-colors.ipynb) and [weighted clustering](demos/more_examples/graphistry_features/edge-weights.ipynb). 
- -### More advanced color and size controls - -You may want more controls like using gradients or maping specific values: - -```python -g.encode_edge_color('int_col') # int32 or int64 -g.encode_edge_color('time_col', ["blue", "red"], as_continuous=True) -g.encode_edge_color('type', as_categorical=True, - categorical_mapping={"cat": "red", "sheep": "blue"}, default_mapping='#CCC') -g.encode_edge_color('brand', - categorical_mapping={'toyota': 'red', 'ford': 'blue'}, - default_mapping='#CCC') -g.encode_point_size('numeric_col') -g.encode_point_size('criticality', - categorical_mapping={'critical': 200, 'ok': 100}, - default_mapping=50) -g.encode_point_color('int_col') # int32 or int64 -g.encode_point_color('time_col', ["blue", "red"], as_continuous=True) -g.encode_point_color('type', as_categorical=True, - categorical_mapping={"cat": "red", "sheep": "blue"}, default_mapping='#CCC') -``` - -For more in-depth examples, check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-colors.ipynb). - -### Custom icons and badges - -You can add a main icon and multiple peripherary badges to provide more visual information. Use column `type` for the icon type to appear visually in the legend. The glyph system supports text, icons, flags, and images, as well as multiple mapping and style controls. 
- -#### Main icon - -```python -g.encode_point_icon( - 'some_column', - shape="circle", #clip excess - categorical_mapping={ - 'macbook': 'laptop', #https://fontawesome.com/v4.7.0/icons/ - 'Canada': 'flag-icon-ca', #ISO3611-Alpha-2: https://github.com/datasets/country-codes/blob/master/data/country-codes.csv - 'embedded_smile': 'data:svg...', - 'external_logo': 'http://..../img.png' - }, - default_mapping="question") -g.encode_point_icon( - 'another_column', - continuous_binning=[ - [20, 'info'], - [80, 'exclamation-circle'], - [None, 'exclamation-triangle'] - ] -) -g.encode_point_icon( - 'another_column', - as_text=True, - categorical_mapping={ - 'Canada': 'CA', - 'United States': 'US' - } -) -``` - -For more in-depth examples, check out the tutorials on [icons](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-icons.ipynb). - -#### Badges - -```python -# see icons examples for mappings and glyphs -g.encode_point_badge('another_column', 'TopRight', categorical_mapping=...) - -g.encode_point_badge('another_column', 'TopRight', categorical_mapping=..., - shape="circle", - border={'width': 2, 'color': 'white', 'stroke': 'solid'}, - color={'mapping': {'categorical': {'fixed': {}, 'other': 'white'}}}, - bg={'color': {'mapping': {'continuous': {'bins': [], 'other': 'black'}}}}) -``` +import pandas as pd -For more in-depth examples, check out the tutorials on [badges](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-badges.ipynb). - -#### Axes - -For more automated use, see the section on radial layouts below. 
- -Radial axes support three coloring types (`'external'`, `'internal'`, and `'space'`) and optional labels: - -```python - g.encode_axis([ - {'r': 14, 'external': True, "label": "outermost"}, - {'r': 12, 'external': True}, - {'r': 10, 'space': True}, - {'r': 8, 'space': True}, - {'r': 6, 'internal': True}, - {'r': 4, 'space': True}, - {'r': 2, 'space': True, "label": "innermost"} -]) -``` - -Horizontal axis support optional labels and ranges: - -```python -g.encode_axis([ - {"label": "a", "y": 2, "internal": True }, - {"label": "b", "y": 40, "external": True, - "width": 20, "bounds": {"min": 40, "max": 400}}, -]) -``` - -Radial axis are generally used with radial positioning: - -```python -g2 = (g - .nodes( - g._nodes.assign( - x = 1 + (g._nodes['ring']) * g._nodes['n'].apply(math.cos), - y = 1 + (g._nodes['ring']) * g._nodes['n'].apply(math.sin) - )).settings(url_params={'lockedR': 'true', 'play': 1000}) -``` - -Horizontal axis are often used with pinned y and free x positions: - -```python -g2 = (g - .nodes( - g._nodes.assign( - y = 50 * g._nodes['level']) - )).settings(url_params={'lockedY': 'true', 'play': 1000}) -``` - -### Theming - -You can customize several style options to match your theme: - -```python -g.addStyle(bg={'color': 'red'}) -g.addStyle(bg={ - 'color': '#333', - 'gradient': { - 'kind': 'radial', - 'stops': [ ["rgba(255,255,255, 0.1)", "10%", "rgba(0,0,0,0)", "20%"] ]}}) -g.addStyle(bg={'image': {'url': 'http://site.com/cool.png', 'blendMode': 'multiply'}}) -g.addStyle(fg={'blendMode': 'color-burn'}) -g.addStyle(page={'title': 'My site'}) -g.addStyle(page={'favicon': 'http://site.com/favicon.ico'}) -g.addStyle(logo={'url': 'http://www.site.com/transparent_logo.png'}) -g.addStyle(logo={ - 'url': 'http://www.site.com/transparent_logo.png', - 'dimensions': {'maxHeight': 200, 'maxWidth': 200}, - 'style': {'opacity': 0.5} +# Raw data as Pandas CPU dataframes, cuDF GPU dataframes, Spark, ... 
+df = pd.DataFrame({ + 'src': ['Alice', 'Bob', 'Carol'], + 'dst': ['Bob', 'Carol', 'Alice'], + 'friendship': [0.3, 0.95, 0.8] }) -``` -### Transforms +# Bind +g1 = graphistry.edges(df, 'src', 'dst') -The below methods let you quickly manipulate graphs directly and with dataframe methods: Search, pattern mine, transform, and more: +# Override styling defaults +g1_styled = g1.encode_edge_color('friendship', ['blue', 'red'], as_continuous=True) -```python -from graphistry import n, e_forward, e_reverse, e_undirected, is_in -g = (graphistry - .edges(pd.DataFrame({ - 's': ['a', 'b'], - 'd': ['b', 'c'], - 'k1': ['x', 'y'] - })) - .nodes(pd.DataFrame({ - 'n': ['a', 'b', 'c'], - 'k2': [0, 2, 4, 6] - }) -) - -g2 = graphistry.hypergraph(g._edges, ['s', 'd', 'k1'])['graph'] -g2.plot() # nodes are values from cols s, d, k1 - -(g - .materialize_nodes() - .get_degrees() - .get_indegrees() - .get_outdegrees() - .pipe(lambda g2: g2.nodes(g2._nodes.assign(t=x))) # transform - .filter_edges_by_dict({"k1": "x"}) - .filter_nodes_by_dict({"k2": 4}) - .prune_self_edges() - .hop( # filter to subgraph - #almost all optional - direction='forward', # 'reverse', 'undirected' - hops=2, # number (1..n hops, inclusive) or None if to_fixed_point - to_fixed_point=False, - - #every edge source node must match these - source_node_match={"k2": 0, "k3": is_in(['a', 'b', 3, 4])}, - source_node_query='k2 == 0', - - #every edge must match these - edge_match={"k1": "x"}, - edge_query='k1 == "x"', - - #every edge destination node must match these - destination_node_match={"k2": 2}, - destination_node_query='k2 == 2 or k2 == 4', - ) - .chain([ # filter to subgraph with Cypher-style GFQL - n(), - n({'k2': 0, "m": 'ok'}), #specific values - n({'type': is_in(["type1", "type2"])}), #multiple valid values - n(query='k2 == 0 or k2 == 4'), #dataframe query - n(name="start"), # add column 'start':bool - e_forward({'k1': 'x'}, hops=1), # same API as hop() - e_undirected(name='second_edge'), - e_reverse( - {'k1': 
'x'}, # edge property match - hops=2, # 1 to 2 hops - #same API as hop() - source_node_match={"k2": 2}, - source_node_query='k2 == 2 or k2 == 4', - edge_match={"k1": "x"}, - edge_query='k1 == "x"', - destination_node_match={"k2": 0}, - destination_node_query='k2 == 0') - ]) - # replace as one node the node w/ given id + transitively connected nodes w/ col=attr - .collapse(node='some_id', column='some_col', attribute='some val') -``` - -Both `hop()` and `chain()` (GFQL) match dictionary expressions support dataframe series *predicates*. The above examples show `is_in([x, y, z, ...])`. Additional predicates include: - -* categorical: is_in, duplicated -* temporal: is_month_start, is_month_end, is_quarter_start, is_quarter_end, is_year_start, is_year_end -* numeric: gt, lt, ge, le, eq, ne, between, isna, notna -* string: contains, startswith, endswith, match, isnumeric, isalpha, isdigit, islower, isupper, isspace, isalnum, isdecimal, istitle, isnull, notnull - -Both `hop()` and `chain()` will run on GPUs when passing in RAPIDS dataframes. Specify parameter `engine='cudf'` to be sure. +# Connect: Free GPU accounts and self-hosting @ graphistry.com/get-started +graphistry.register(api=3, username='your_username', password='your_password') -#### Table to graph - -```python -df = pd.read_csv('events.csv') -hg = graphistry.hypergraph(df, ['user', 'email', 'org'], direct=True) -g = hg['graph'] # g._edges: | src, dst, user, email, org, time, ... | -g.plot() +# Upload for GPU server visualization session +g1_styled.plot() ``` -```python -hg = graphistry.hypergraph( - df, - ['from_user', 'to_user', 'email', 'org'], - direct=True, - opts={ - - # when direct=True, can define src -> [ dst1, dst2, ...] 
edges - 'EDGES': { - 'org': ['from_user'], # org->from_user - 'from_user': ['email', 'to_user'], #from_user->email, from_user->to_user - }, - - 'CATEGORIES': { - # determine which columns share the same namespace for node generation: - # - if user 'louie' is both a from_user and to_user, show as 1 node - # - if a user & org are both named 'louie', they will appear as 2 different nodes - 'user': ['from_user', 'to_user'] - } -}) -g = hg['graph'] -g.plot() -``` +Explore [10 Minutes to Graphistry Visualization](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html) for more visualization examples and options -#### Generate node table -```python -g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) -g2 = g.materialize_nodes() -g2._nodes # pd.DataFrame({'id': ['a', 'b', 'c']}) -``` +## PyGraphistry[AI] & GFQL quickstart - CPU & GPU -#### Compute degrees +**CPU graph pipeline** combining graph ML, AI, mining, and visualization: ```python -g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) -g2 = g.get_degrees() -g2._nodes # pd.DataFrame({ - # 'id': ['a', 'b', 'c'], - # 'degree_in': [0, 1, 1], - # 'degree_out': [1, 1, 0], - # 'degree': [1, 1, 1] - #}) -``` - -See also `get_indegrees()` and `get_outdegrees()` +from graphistry import n, e, e_forward, e_reverse -#### Use igraph (CPU) and cugraph (GPU) compute - -Install the plugin of choice and then: - -```python -g2 = g.compute_igraph('pagerank') +# Graph analytics +g2 = g1.compute_igraph('pagerank') assert 'pagerank' in g2._nodes.columns -g3 = g.compute_cugraph('pagerank') -assert 'pagerank' in g2._nodes.columns -``` - -#### Graph pattern matching - -PyGraphistry supports GFQL, its PyData-native variant of the popular Cypher graph query language, meaning you can do graph pattern matching directly from Pandas dataframes without installing a database or Java - -See also [graph pattern matching 
tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb) and the CPU/GPU [benchmark](https://github.com/graphistry/pygraphistry/tree/master/demos/gfql/benchmark_hops_cpu_gpu.ipynb) - -Traverse within a graph, or expand one graph against another - -Simple node and edge filtering via `filter_edges_by_dict()` and `filter_nodes_by_dict()`: - -```python -g = graphistry.edges(pd.read_csv('data.csv'), 's', 'd') -g2 = g.materialize_nodes() +# Graph ML/AI +g3 = g2.umap() +assert ('x' in g3._nodes.columns) and ('y' in g3._nodes.columns) -g3 = g.filter_edges_by_dict({"v": 1, "b": True}) -g4 = g.filter_nodes_by_dict({"v2": 1, "b2": True}) -``` - -Method `.hop()` enables slightly more complicated edge filters: - -```python - -from graphistry import is_in, gt - -# (a)-[{"v": 1, "type": "z"}]->(b) based on g -g2b = g2.hop( - source_node_match={g2._node: "a"}, - edge_match={"v": 1, "type": "z"}, - destination_node_match={g2._node: "b"}) -g2b = g2.hop( - source_node_query='n == "a"', - edge_query='v == 1 and type == "z"', - destination_node_query='n == "b"') - -# (a {x in [1,2] and y > 3})-[e]->(b) based on g -g2c = g2.hop( - source_node_match={ - g2._node: "a", - "x": is_in([1,2]), - "y": gt(3) - }, - destination_node_match={g2._node: "b"}) -) - -# (a or b)-[1 to 8 hops]->(anynode), based on graph g2 -g3 = g2.hop(pd.DataFrame({g2._node: ['a', 'b']}), hops=8) - -# (a or b)-[1 to 8 hops]->(anynode), based on graph g2 -g3 = g2.hop(pd.DataFrame({g2._node: is_in(['a', 'b'])}), hops=8) - -# (c)<-[any number of hops]-(any node), based on graph g3 -# Note multihop matches check source/destination/edge match/query predicates -# against every encountered edge for it to be included -g4 = g3.hop(source_node_match={"node": "c"}, direction='reverse', to_fixed_point=True) - -# (c)-[incoming or outgoing edge]-(any node), -# for c in g4 with expansions against nodes/edges in g2 -g5 = 
g2.hop(pd.DataFrame({g4._node: g4[g4._node]}), hops=1, direction='undirected') - -g5.plot() -``` - -Rich compound patterns are enabled via `.chain()`: - -```python -from graphistry import n, e_forward, e_reverse, e_undirected, is_in - -g2.chain([ n() ]) -g2.chain([ n({"x": 1, "y": True}) ]), -g2.chain([ n(query='x == 1 and y == True') ]), -g2.chain([ n({"z": is_in([1,2,4,'z'])}) ]), # multiple valid values -g2.chain([ e_forward({"type": "x"}, hops=2) ]) # simple multi-hop -g3 = g2.chain([ - n(name="start"), # tag node matches - e_forward(hops=3), - e_forward(name="final_edge"), # tag edge matches - n(name="end") +# Graph querying with GFQL +g4 = g3.chain([ + n(query='pagerank > 0.1'), e_forward(), n(query='pagerank > 0.1') ]) -g2.chain(n(), e_forward(), n(), e_reverse(), n()]) # rich shapes -print('# end nodes: ', len(g3._nodes[ g3._nodes.end ])) -print('# end edges: ', len(g3._edges[ g3._edges.final_edge ])) -``` - -See table above for more predicates like `is_in()` and `gt()` - -Queries can be serialized and deserialized, such as for saving and remote execution: +assert (g4._nodes.pagerank > 0.1).all() -```python -from graphistry.compute.chain import Chain - -pattern = Chain([n(), e(), n()]) -pattern_json = pattern.to_json() -pattern2 = Chain.from_json(pattern_json) -g.chain(pattern2).plot() +# Upload for GPU server visualization session +g4.plot() ``` -Benefit from automatic GPU acceleration by passing in GPU dataframes: +The **automatic GPU modes** require almost no code changes: ```python import cudf +from graphistry import n, e, e_forward, e_reverse -g1 = graphistry.edges(cudf.read_csv('data.csv'), 's', 'd') -g2 = g1.chain(..., engine='cudf') -``` - -The parameter `engine` is optional, defaulting to `'auto'`. 
+# Modified -- Rebind data as a GPU dataframe and swap in a GPU plugin call +g1_gpu = g1.edges(cudf.from_pandas(df)) +g2 = g1_gpu.compute_cugraph('pagerank') -#### Pipelining - -```python -def capitalize(df, col): - df2 = df.copy() - df2[col] df[col].str.capitalize() - return df2 - -g - .cypher('MATCH (a)-[e]->(b) RETURN a, e, b') - .nodes(lambda g: capitalize(g._nodes, 'nTitle')) - .edges(capitalize, None, None, 'eTitle'), - .pipe(lambda g: g.nodes(g._nodes.pipe(capitalize, 'nTitle'))) -``` - -#### Removing nodes - -```python -g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'c'], 'd': ['b', 'c', 'a']})) -g2 = g.drop_nodes(['c']) # drops node c, edge c->a, edge b->c, -``` - -#### Keeping nodes - -```python -# keep nodes [a,b,c] and edges [(a,b),(b,c)] -g2 = g.keep_nodes(['a, b, c']) -g2 = g.keep_nodes(pd.Series(['a, b, c'])) -g2 = g.keep_nodes(cudf.Series(['a, b, c'])) -``` - -#### Collapsing adjacent nodes with specific k=v matches - -One col/val pair: - -```python -g2 = g.collapse( - node='root_node_id', # rooted traversal beginning - column='some_col', # column to inspect - attribute='some val' # value match to collapse on if hit -) -assert len(g2._nodes) <= len(g._nodes) -``` - -Collapse for all possible vals in a column, and assuming a stable root node id: - -```python -g3 = g -for v in g._nodes['some_col'].unique(): - g3 = g3.collapse(node='root_node_id', column='some_col', attribute=v) -``` - -### Hierarchical layouts: Tree and radial - -A hierachical view via horizontal or vertical trees, or radial. Graph data may also be presented using these layouts. 
- -#### Tree - -```python -g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'b'], 'd': ['b', 'c', 'd']})) - -g2a = g.tree_layout() -g2b = g2.tree_layout(allow_cycles=False, remove_self_loops=False, vertical=False) -g2c = g2.tree_layout(ascending=False, level_align='center') -g2d = g2.tree_layout(level_sort_values_by=['type', 'degree'], level_sort_values_by_ascending=False) - -g3a = g2a.layout_settings(locked_r=True, play=1000) -g3b = g2a.layout_settings(locked_y=True, play=0) -g3c = g2a.layout_settings(locked_x=True) - -g4 = g2.tree_layout().rotate(90) -``` - -To use with non-tree data, e.g., graphs with cycles, we recommend computing a tree such as via a minimum spanning tree, and then using that achieved layout with this algorithm. Alternatively, the radial layouts may more naturally support your graph. - -#### Radial - -A hierarchical view via radial rings that may be more space-efficient and aesthetic than the equivalent tree layout - -Supports time-based, continuous, and categorical modes: - -##### Radial: Time-based - -Use when the value column defining the ring order is a time column. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_time_ring.ipynb) - -```python -g.time_ring_layout().plot() # finds a time column and infers all settings - -g.time_ring_layout( - time_col='my_node_time_col', - num_rings=20, - time_start=np.datetime64('2014-01-22'), - time_end=np.datetime64('2015-01-22'), - time_unit= 'Y', # s, m, h, D, W, M, Y, C - min_r=100.0, # smallest ring radius - max_r=1000.0, # biggest ring radius - reverse=False, - #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None, - #format_label: Optional[Callable[[np.datetime64, int, np.timedelta64], str]] = None, - #play_ms: int = 2000, - #engine='auto' # 'auto', 'pandas', 'cudf' -).plot() -``` - -#### Continuous - -Use when the value column defining the ring order is a continuous number, like distance or amount. 
See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_continuous_ring.ipynb) - -```python -g.ring_continuous_layout() # find a numeric column and infers all settings - -g.ring_continuous_layout( - ring_col='my_numeric_col', - #v_start= # first ring at this value - #v_end= # last ring at this value - #v_step= # distance between rings in the value domain - min_r=100.0, # smallest ring radius - max_r=1000.0, # biggest ring radius - normalize_ring_col=True, # remap [v_start,v_end] to [min_r,max_r] - num_rings=20, - ring_step=100, - - #Control axis labels and styles - #axis: Optional[Union[Dict[float,str],List[str]]] = None, - #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None, - #format_labels: Optional[Callable[[float, int, float], str]] = None, - - reverse=False, - play_ms=0, - #engine='auto', # 'auto', 'pandas', 'cudf' -) -``` - -#### Categorical - -Use when the value column defining the ring order is a categorical value, such as a name or ID. 
See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_categorical_ring.ipynb) - -```python -g.ring_categorical_layout('my_categorical_col') # infers all settings - -g.ring_categorical_layout( - ring_col='my_numeric_col', - order=['col1', 'my_col2'], - drop_empty=True, # remove unpopulated rings - combine_unhandled=False, # Put values not covered by order into one ring Other vs a ring per unique value - append_unhandled=True, # Append vs prepend - min_r=100.0, # smallest ring radius - max_r=1000.0, # biggest ring radius - - #Control axis labels and styles - #axis: Optional[Dict[Any,str]] = None, - #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None, - #format_labels: Optional[Callable[[Any, int, float], str]] = None, - - reverse=False, - play_ms=0, - #engine='auto', # 'auto', 'pandas', 'cudf' -) -``` - -### Layout: Modularity weighted - -Weight edges by community membership to emphasize community structure. 
See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_modularity_weighted.ipynb) - -```python -g.modularity_weighted_layout().plot() -g.modularity_weighted_layout('my_community_col').plot() -g.modularity_weighted_layout( - community_alg='louvain', - engine='cudf', - same_community_weight=2.0, - cross_community_weight=0.3, - edge_influence=2.0 -).plot() -``` - -### Plugin: igraph - -With `pip install graphistry[igraph]`, you can also use [`igraph` layouts](https://igraph.org/python/doc/api/igraph.Graph.html#layout): - -```python -g.layout_igraph('sugiyama').plot() -g.layout_igraph('sugiyama', directed=True, params={}).plot() +# Unmodified -- Automatic GPU mode for all ML, AI, GFQL queries, & visualization APIs +g3 = g2.umap() +g4 = g3.chain([ + n(query='pagerank > 0.1'), e_forward(), n(query='pagerank > 0.1') +]) +g4.plot() ``` -See list [`layout_algs`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins/igraph.py#L365) +Explore [10 Minutes to PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/10min.html) for a wider variety of graph processing. 
-### Plugin: cugraph -With [Nvidia RAPIDS cuGraph](https://www.rapids.ai) install: +## PyGraphistry documentation -```python -g.layout_cugraph('force_atlas2').plot() -help(g.layout_cugraph) -``` +* [Main PyGraphistry documentation](https://pygraphistry.readthedocs.io/en/latest/) +* 10 Minutes to: [PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/10min.html), [Visualization](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html), [GFQL](https://pygraphistry.readthedocs.io/en/latest/gfql/about.html) +* Get started: [Install](https://pygraphistry.readthedocs.io/en/latest/install/index.html), [UI Guide](https://hub.graphistry.com/docs/ui/index/), [Notebooks](https://pygraphistry.readthedocs.io/en/latest/demos/for_analysis.html) +* Performance: [PyGraphistry CPU+GPU](https://pygraphistry.readthedocs.io/en/latest/performance.html) & [GFQL CPU+GPU](https://pygraphistry.readthedocs.io/en/latest/gfql/performance.html) +* API References + - [PyGraphistry API Reference](https://pygraphistry.readthedocs.io/en/latest/api/index.html): [Visualization & Compute](https://pygraphistry.readthedocs.io/en/latest/visualization/index.html), [PyGraphistry Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/cheatsheet.html) + - [GFQL Documentation](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html): [GFQL Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/quick.html) and [GFQL Operator Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/predicates/quick.html) + - [Plugins](https://pygraphistry.readthedocs.io/en/latest/plugins.html): Databricks, Splunk, Neptune, Neo4j, RAPIDS, and more + - Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/) -See list 
[`layout_algs`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins/cugraph.py#L315) +## Graphistry ecosystem -#### Group-in-a-box layout +- **Graphistry server:** + - Launch - [Graphistry Hub, Graphistry cloud marketplaces, and self-hosting](https://www.graphistry.com/get-started) + - Self-hosting: [Administration (including Docker)](https://github.com/graphistry/graphistry-cli) & [Kubernetes](https://github.com/graphistry/graphistry-helm) -[Group-in-a-box layout](https://ieeexplore.ieee.org/document/6113135) with igraph/pandas and cugraph/cudf implementations: +- **Graphistry client APIs:** + - Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/) + - [PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/index.html) + - [Graphistry for Microsoft PowerBI](https://hub.graphistry.com/docs/powerbi/pbi/) -```python -g.group_in_a_box_layout().plot() -g.group_in_a_box_layout( - partition_alg='ecg', # see igraph/cugraph algs - #partition_key='some_col', # use existing col - #layout_alg='circle', # see igraph/cugraph algs - #x, y, w, h - #encode_colors=False, - #colors=['#FFF', '#FF0', ...] 
- engine='cudf' -).plot() -``` +- **Additional projects**: + - [Louie.ai](https://louie.ai/): GenAI-native notebooks & dashboards to talk to your databases & Graphistry + - [graph-app-kit](https://github.com/graphistry/graph-app-kit): Streamlit Python dashboards with batteries-included graph packages + - [cu-cat](https://github.com/graphistry/cu-cat): Automatic GPU feature engineering -### Control render settings -```python -g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'b'], 'd': ['b', 'c', 'd']})) -g2 = g.scene_settings( - #hide menus - menu=False, - info=False, - #tweak graph - show_arrows=False, - point_size=1.0, - edge_curvature=0.0, - edge_opacity=0.5, - point_opacity=0.9 -).plot() +## Community and support -``` +- [Blog](https://www.graphistry.com/blog) for tutorials, case studies, and updates +- [Slack](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g): Join the Graphistry Community Slack for discussions and support +- [Twitter](https://twitter.com/graphistry) & [LinkedIn](https://www.linkedin.com/company/graphistry): Follow for updates +- [GitHub Issues](https://github.com/graphistry/pygraphistry/issues) open source support +- [Graphistry ZenDesk](https://graphistry.zendesk.com/) dedicated enterprise support -With `pip install graphistry[igraph]`, you can also use [`igraph` layouts](https://igraph.org/python/doc/api/igraph.Graph.html#layout): +## Contribute -```python -g.layout_igraph('sugiyama').plot() -g.layout_igraph('sugiyama', directed=True, params={}).plot() -``` +See [CONTRIBUTE](https://pygraphistry.readthedocs.io/en/latest/CONTRIBUTE.html) and [DEVELOP](https://pygraphistry.readthedocs.io/en/latest/DEVELOP.html) for participating in PyGraphistry development, or reach out to our team -## Next Steps - -1. Create a free public data [Graphistry Hub](https://www.graphistry.com/get-started) account or [one-click launch a private Graphistry instance in AWS](https://www.graphistry.com/get-started) -2. 
Check out the [analyst](https://github.com/graphistry/pygraphistry/tree/master/demos/for_analysis.ipynb) and [developer](https://github.com/graphistry/pygraphistry/tree/master/demos/for_developers.ipynb) introductions, or [try your own CSV](https://github.com/graphistry/pygraphistry/tree/master/demos/upload_csv_miniapp.ipynb) -3. Explore the [demos folder](https://github.com/graphistry/pygraphistry/tree/master/demos) for your favorite [file format, database, API](https://github.com/graphistry/pygraphistry/tree/master/demos/demos_databases_apis), use case domain, kind of analysis, and [visual analytics feature](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features) - -## Resources - -* Graphistry [In-Tool UI Guide](https://hub.graphistry.com/docs/ui/index/) -* [General and REST API docs](https://hub.graphistry.com/docs/api/): - * [URL settings](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions) - * [Authentication](https://hub.graphistry.com/docs/api/1/rest/auth/) - * [Uploading](https://hub.graphistry.com/docs/api/2/rest/upload/#createdataset2), including multiple file formats and settings - * [Color bindings](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2) and [color palettes](https://hub.graphistry.com/docs/api/api-color-palettes/) (ColorBrewer) - * Bindings and colors, REST API, embedding URLs and URL parameters, dynamic JS API, and more - * JavaScript and more! -* Python-specific - * [Python API ReadTheDocs](http://pygraphistry.readthedocs.org/en/latest/) - * Within a notebook, you can always run `help(graphistry)`, `help(graphistry.hypergraph)`, etc. 
-* [Administration docs](https://github.com/graphistry/graphistry-cli) for sizing, installing, configuring, managing, and updating Graphistry servers -* [Graph-App-Kit Dashboarding](https://github.com/graphistry/graph-app-kit) dashboarding diff --git a/bin/lint.sh b/bin/lint.sh index 8c4d2f3c2..22c986570 100755 --- a/bin/lint.sh +++ b/bin/lint.sh @@ -19,7 +19,7 @@ flake8 \ graphistry \ --exclude graphistry/graph_vector_pb2.py,graphistry/_version.py \ --count \ - --ignore=C901,E121,E122,E123,E124,E125,E128,E131,E144,E201,E202,E203,E231,E251,E265,E301,E302,E303,E401,E501,E722,F401,W291,W293 \ + --ignore=C901,E121,E122,E123,E124,E125,E128,E131,E144,E201,E202,E203,E231,E251,E265,E301,E302,E303,E401,E501,E722,F401,W291,W293,W503 \ --max-complexity=10 \ --max-line-length=127 \ --statistics diff --git a/bin/test-graphviz.sh b/bin/test-graphviz.sh new file mode 100755 index 000000000..765cc8577 --- /dev/null +++ b/bin/test-graphviz.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -ex + +# Run from project root +# - Args get passed to pytest phase +# Non-zero exit code on fail + +# Assume [pygraphviz,test], apt-get install graphviz graphviz-dev + +python -m pytest --version + +python -B -m pytest -vv \ + graphistry/tests/plugins/test_graphviz.py diff --git a/demos/ai/cyber/CyberSecurity-Slim.ipynb b/demos/ai/cyber/CyberSecurity-Slim.ipynb index 9b6cfdb7a..d8ee1c916 100644 --- a/demos/ai/cyber/CyberSecurity-Slim.ipynb +++ b/demos/ai/cyber/CyberSecurity-Slim.ipynb @@ -190,7 +190,7 @@ "id": "125f6ef0", "metadata": {}, "source": [ - "# Fast Incident Response\n", + "## Fast Incident Response\n", "An Incident Responder needs to quickly find which IP is the attacker.\n", "\n", "If, say, a predictive model enriched the data, responders could repeat the pipeline on new data\n", @@ -285,7 +285,7 @@ "id": "caf504e5", "metadata": {}, "source": [ - "# Do we have a predictive model?\n", + "## Do we have a predictive model?\n", "\n", "Using the x, y's we get from autofeaturization, we fit two 
RandomForest models" ] @@ -378,7 +378,7 @@ "id": "671557b5", "metadata": {}, "source": [ - "# Let's remove edges and see if there is a model of just 'common features' (ie no ip addresses)\n", + "## Let's remove edges and see if there is a model of just 'common features' (ie no ip addresses)\n", "\n", "Given learnings, we want to see if there is a model that does not use edge information (ie, no IP addresses, only connection metadata)" ] @@ -525,7 +525,7 @@ "id": "71166b62", "metadata": {}, "source": [ - "# Hence we see that including just common features clusters botnet traffic together under featurization and UMAP" + "## Hence we see that including just common features clusters botnet traffic together under featurization and UMAP" ] }, { @@ -557,7 +557,7 @@ "id": "762b80ed", "metadata": {}, "source": [ - "# Now we dive deeper\n", + "## Now we dive deeper\n", "-----------------------------------------" ] }, @@ -566,7 +566,7 @@ "id": "2bac394b", "metadata": {}, "source": [ - "# Let's encode the graph as a DGL graph for use in Machine Learning" + "## Let's encode the graph as a DGL graph for use in Machine Learning" ] }, { @@ -757,7 +757,7 @@ "id": "00751e3b", "metadata": {}, "source": [ - "# Contributions\n", + "## Contributions\n", "\n", "Now we know how to take raw data and turn them into actionable features and models using the Graphistry[ai] API.\n", "\n", diff --git a/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb b/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb old mode 100644 new mode 100755 index ab4126ce8..515a27057 --- a/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb +++ b/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb @@ -39,128 +39,122 @@ } }, "source": [ - "## Install & connect" + "## Install & authenticate with graphistry server" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, 
"metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "eaf03d3c-d046-4f96-825e-5db2355af383", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "Requirement already satisfied: graphistry in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (0.28.5)\r\n", - "Requirement already satisfied: numpy in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.20.3)\r\n", - "Requirement already satisfied: pandas>=0.17.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.3.4)\r\n", - "Requirement already satisfied: packaging>=20.1 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (21.0)\r\n", - "Requirement already satisfied: squarify in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (0.4.3)\r\n", - "Requirement already satisfied: palettable>=3.0 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (3.3.0)\r\n", - "Requirement already satisfied: typing-extensions in /databricks/python3/lib/python3.9/site-packages (from graphistry) (3.10.0.2)\r\n", - "Requirement already satisfied: pyarrow>=0.15.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (7.0.0)\r\n", - "Requirement already satisfied: requests in /databricks/python3/lib/python3.9/site-packages (from graphistry) (2.26.0)\r\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /databricks/python3/lib/python3.9/site-packages (from packaging>=20.1->graphistry) (3.0.4)\r\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2.8.2)\r\n", - "Requirement already satisfied: pytz>=2017.3 in 
/databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2021.3)\r\n", - "Requirement already satisfied: six>=1.5 in /databricks/python3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.17.0->graphistry) (1.16.0)\r\n", - "Requirement already satisfied: idna<4,>=2.5 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (3.2)\r\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2.0.4)\r\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (1.26.7)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2021.10.8)\r\n", - "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.3.1 is available.\r\n", - "You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "Requirement already satisfied: graphistry in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (0.28.5)\r\nRequirement already satisfied: numpy in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.20.3)\r\nRequirement already satisfied: pandas>=0.17.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.3.4)\r\nRequirement already satisfied: packaging>=20.1 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (21.0)\r\nRequirement already satisfied: squarify in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (0.4.3)\r\nRequirement already 
satisfied: palettable>=3.0 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (3.3.0)\r\nRequirement already satisfied: typing-extensions in /databricks/python3/lib/python3.9/site-packages (from graphistry) (3.10.0.2)\r\nRequirement already satisfied: pyarrow>=0.15.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (7.0.0)\r\nRequirement already satisfied: requests in /databricks/python3/lib/python3.9/site-packages (from graphistry) (2.26.0)\r\nRequirement already satisfied: pyparsing>=2.0.2 in /databricks/python3/lib/python3.9/site-packages (from packaging>=20.1->graphistry) (3.0.4)\r\nRequirement already satisfied: python-dateutil>=2.7.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2.8.2)\r\nRequirement already satisfied: pytz>=2017.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2021.3)\r\nRequirement already satisfied: six>=1.5 in /databricks/python3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.17.0->graphistry) (1.16.0)\r\nRequirement already satisfied: idna<4,>=2.5 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (3.2)\r\nRequirement already satisfied: charset-normalizer~=2.0.0 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2.0.4)\r\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (1.26.7)\r\nRequirement already satisfied: certifi>=2017.4.17 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2021.10.8)\r\n\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.3.1 is available.\r\nYou should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n", - 
"datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "ansi" - } + "outputs": [], + "source": [ + "# Uncomment and run first time or\n", + "# have databricks admin install graphistry python library: \n", + "# https://docs.databricks.com/en/libraries/package-repositories.html#pypi-package\n", + "\n", + "#%pip install graphistry\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 }, - "output_type": "display_data" + "inputWidgets": {}, + "nuid": "8ad9b072-f037-4d4a-a1fa-ca2c14bd639f", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ - "# Uncomment and run first time\n", - "! pip install graphistry\n", - "#! pip install git+https://github.com/graphistry/pygraphistry.git@dev/databricks\n", - " \n", - "# Can sometimes help:\n", - "#dbutils.library.restartPython()" + "# Required to run after pip install to pick up new python package: \n", + "dbutils.library.restartPython()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, - "nuid": "9e649f0e-fca5-4be6-8ad6-fa781bbb81d6", + "nuid": "cfd253ba-c647-4c45-8048-58b0ca427569", "showTitle": false, "title": "" } }, "outputs": [], "source": [ - "#Optional: Uncomment - We find this speeds up calls 10%+ on some datasets\n", - "#spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\")" + "import graphistry # if not yet available, install pygraphistry and/or restart Python kernel using the cells above\n", + "graphistry.__version__" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, - "nuid": 
"cfd253ba-c647-4c45-8048-58b0ca427569", + "nuid": "55e30c26-3a8c-46dc-8eff-bd730d3c7798", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "Out[12]: '0.28.5'" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "Out[12]: '0.28.5'", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "ansi" - } + "source": [ + "### Use databricks secrets to retrieve graphistry creds and pass to register " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 }, - "output_type": "display_data" + "inputWidgets": {}, + "nuid": "b5496fa5-525a-48c9-ad46-0ce17ebdc4f8", + "showTitle": false, + "title": "" } - ], + }, + "outputs": [], "source": [ - "import graphistry # if not yet available, install and/or restart Python kernel using the above\n", "\n", - "# To specify Graphistry account & server, use:\n", - "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", - "# For more options, see https://github.com/graphistry/pygraphistry#configure\n", + "# As a best practice, use databricks secrets to store graphistry personal key (access token)\n", + "# create databricks secrets: https://docs.databricks.com/en/security/secrets/index.html \n", + "# create graphistry personal key: https://hub.graphistry.com/account/tokens\n", "\n", - "graphistry.__version__" + "graphistry.register(api=3, \n", + " personal_key_id=dbutils.secrets.get(scope=\"my-secret-scope\", key=\"graphistry-personal_key_id\"), \n", + " personal_key_secret=dbutils.secrets.get(scope=\"my-secret-scope\", key=\"graphistry-personal_key_secret\"), \n", + " protocol='https',\n", + " server='hub.graphistry.com')\n", + "\n", + "# Alternatively, use username and password: \n", + "# graphistry.register(api=3, username='...', 
password='...', protocol='https', server='hub.graphistry.com')\n", + "# For more options, see https://github.com/graphistry/pygraphistry#configure" ] }, { @@ -188,337 +182,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "c187c650-01c2-4e48-b8e0-803e937cdb11", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "type: \n" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "type: \n", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "ansi" - } - }, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
battery_levelc02_levelcca2cca3cndevice_iddevice_namehumidityiplatitudelcdlongitudescaletemptimestamp
8868USUSAUnited States1meter-gauge-1xbYRYcj5168.161.225.138.0green-97.0Celsius341458444054093
71473NONORNorway2sensor-pad-2n2Pea70213.161.254.162.47red6.15Celsius111458444054119
21556ITITAItaly3device-mac-36TWSKiT4488.36.5.142.83red12.83Celsius191458444054120
61080USUSAUnited States4sensor-pad-4mzWkz3266.39.173.15444.06yellow-121.32Celsius281458444054121
4931PHPHLPhilippines5therm-stick-5gimpUrBB62203.82.41.914.58green120.97Celsius251458444054122
31210USUSAUnited States6sensor-pad-6al7RTAobR51204.116.105.6735.93yellow-85.46Celsius271458444054122
31129CNCHNChina7meter-gauge-7GeDoanM26220.173.179.122.82yellow108.32Celsius181458444054123
01536JPJPNJapan8sensor-pad-8xUD6pzsQI35210.173.177.135.69red139.69Celsius271458444054123
3807JPJPNJapan9device-mac-9GcjZ2pw85118.23.68.22735.69green139.69Celsius131458444054124
71470USUSAUnited States10sensor-pad-10BsywSYUF56208.109.163.21833.61red-111.89Celsius261458444054125
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "aggData": [], - "aggError": "", - "aggOverflow": false, - "aggSchema": [], - "aggSeriesLimitReached": false, - "aggType": "", - "arguments": {}, - "columnCustomDisplayInfos": {}, - "data": [ - [ - 8, - 868, - "US", - "USA", - "United States", - 1, - "meter-gauge-1xbYRYcj", - 51, - "68.161.225.1", - 38, - "green", - -97, - "Celsius", - 34, - 1458444054093 - ], - [ - 7, - 1473, - "NO", - "NOR", - "Norway", - 2, - "sensor-pad-2n2Pea", - 70, - "213.161.254.1", - 62.47, - "red", - 6.15, - "Celsius", - 11, - 1458444054119 - ], - [ - 2, - 1556, - "IT", - "ITA", - "Italy", - 3, - "device-mac-36TWSKiT", - 44, - "88.36.5.1", - 42.83, - "red", - 12.83, - "Celsius", - 19, - 1458444054120 - ], - [ - 6, - 1080, - "US", - "USA", - "United States", - 4, - "sensor-pad-4mzWkz", - 32, - "66.39.173.154", - 44.06, - "yellow", - -121.32, - "Celsius", - 28, - 1458444054121 - ], - [ - 4, - 931, - "PH", - "PHL", - "Philippines", - 5, - "therm-stick-5gimpUrBB", - 62, - "203.82.41.9", - 14.58, - "green", - 120.97, - "Celsius", - 25, - 1458444054122 - ], - [ - 3, - 1210, - "US", - "USA", - "United States", - 6, - "sensor-pad-6al7RTAobR", - 51, - "204.116.105.67", - 35.93, - "yellow", - -85.46, - "Celsius", - 27, - 1458444054122 - ], - [ - 3, - 1129, - "CN", - "CHN", - "China", - 7, - "meter-gauge-7GeDoanM", - 26, - "220.173.179.1", - 22.82, - "yellow", - 108.32, - "Celsius", - 18, - 1458444054123 - ], - [ - 0, - 1536, - "JP", - "JPN", - "Japan", - 8, - "sensor-pad-8xUD6pzsQI", - 35, - "210.173.177.1", - 35.69, - "red", - 139.69, - "Celsius", - 27, - 1458444054123 - ], - [ - 3, - 807, - "JP", - "JPN", - "Japan", - 9, - "device-mac-9GcjZ2pw", - 85, - "118.23.68.227", - 35.69, - "green", - 139.69, - "Celsius", - 13, - 1458444054124 - ], - [ - 7, - 1470, - "US", - "USA", - "United States", - 10, - "sensor-pad-10BsywSYUF", - 56, - "208.109.163.218", - 33.61, - "red", - -111.89, - "Celsius", - 26, - 
1458444054125 - ] - ], - "datasetInfos": [], - "dbfsResultPath": null, - "isJsonSchema": true, - "metadata": {}, - "overflow": false, - "plotOptions": { - "customPlotOptions": {}, - "displayType": "table", - "pivotAggregation": null, - "pivotColumns": [], - "xColumns": [], - "yColumns": [] - }, - "removedWidgets": [], - "schema": [ - { - "metadata": "{}", - "name": "battery_level", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "c02_level", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "cca2", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "cca3", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "cn", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "device_id", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "device_name", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "humidity", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "ip", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "latitude", - "type": "\"double\"" - }, - { - "metadata": "{}", - "name": "lcd", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "longitude", - "type": "\"double\"" - }, - { - "metadata": "{}", - "name": "scale", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "temp", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "timestamp", - "type": "\"long\"" - } - ], - "type": "table" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Load the data from its source.\n", "devices = spark.read \\\n", @@ -532,393 +209,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "c69b91ed-c172-47b7-9bb7-27532202179a", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
device_idcca2cca3cndevice_nameiplocation_rounded1location_rounded2battery_level_minc02_level_minhumidity_mintimestamp_minbattery_level_maxc02_level_maxhumidity_maxtimestamp_maxbattery_level_avgc02_level_avghumidity_avgtimestamp_avg
1USUSAUnited Statesmeter-gauge-1xbYRYcj68.161.225.138_-9740_-100886851145844405409388685114584440540938.0868.051.01.458444054093E12
2NONORNorwaysensor-pad-2n2Pea213.161.254.162_660_1071473701458444054119714737014584440541197.01473.070.01.458444054119E12
3ITITAItalydevice-mac-36TWSKiT88.36.5.143_1340_1021556441458444054120215564414584440541202.01556.044.01.45844405412E12
4USUSAUnited Statessensor-pad-4mzWkz66.39.173.15444_-12140_-12061080321458444054121610803214584440541216.01080.032.01.458444054121E12
5PHPHLPhilippinestherm-stick-5gimpUrBB203.82.41.915_12110_120493162145844405412249316214584440541224.0931.062.01.458444054122E12
6USUSAUnited Statessensor-pad-6al7RTAobR204.116.105.6736_-8540_-9031210511458444054122312105114584440541223.01210.051.01.458444054122E12
7CNCHNChinameter-gauge-7GeDoanM220.173.179.123_10820_11031129261458444054123311292614584440541233.01129.026.01.458444054123E12
8JPJPNJapansensor-pad-8xUD6pzsQI210.173.177.136_14040_14001536351458444054123015363514584440541230.01536.035.01.458444054123E12
9JPJPNJapandevice-mac-9GcjZ2pw118.23.68.22736_14040_140380785145844405412438078514584440541243.0807.085.01.458444054124E12
10USUSAUnited Statessensor-pad-10BsywSYUF208.109.163.21834_-11230_-11071470561458444054125714705614584440541257.01470.056.01.458444054125E12
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "aggData": [], - "aggError": "", - "aggOverflow": false, - "aggSchema": [], - "aggSeriesLimitReached": false, - "aggType": "", - "arguments": {}, - "columnCustomDisplayInfos": {}, - "data": [ - [ - 1, - "US", - "USA", - "United States", - "meter-gauge-1xbYRYcj", - "68.161.225.1", - "38_-97", - "40_-100", - 8, - 868, - 51, - 1458444054093, - 8, - 868, - 51, - 1458444054093, - 8, - 868, - 51, - 1458444054093 - ], - [ - 2, - "NO", - "NOR", - "Norway", - "sensor-pad-2n2Pea", - "213.161.254.1", - "62_6", - "60_10", - 7, - 1473, - 70, - 1458444054119, - 7, - 1473, - 70, - 1458444054119, - 7, - 1473, - 70, - 1458444054119 - ], - [ - 3, - "IT", - "ITA", - "Italy", - "device-mac-36TWSKiT", - "88.36.5.1", - "43_13", - "40_10", - 2, - 1556, - 44, - 1458444054120, - 2, - 1556, - 44, - 1458444054120, - 2, - 1556, - 44, - 1458444054120 - ], - [ - 4, - "US", - "USA", - "United States", - "sensor-pad-4mzWkz", - "66.39.173.154", - "44_-121", - "40_-120", - 6, - 1080, - 32, - 1458444054121, - 6, - 1080, - 32, - 1458444054121, - 6, - 1080, - 32, - 1458444054121 - ], - [ - 5, - "PH", - "PHL", - "Philippines", - "therm-stick-5gimpUrBB", - "203.82.41.9", - "15_121", - "10_120", - 4, - 931, - 62, - 1458444054122, - 4, - 931, - 62, - 1458444054122, - 4, - 931, - 62, - 1458444054122 - ], - [ - 6, - "US", - "USA", - "United States", - "sensor-pad-6al7RTAobR", - "204.116.105.67", - "36_-85", - "40_-90", - 3, - 1210, - 51, - 1458444054122, - 3, - 1210, - 51, - 1458444054122, - 3, - 1210, - 51, - 1458444054122 - ], - [ - 7, - "CN", - "CHN", - "China", - "meter-gauge-7GeDoanM", - "220.173.179.1", - "23_108", - "20_110", - 3, - 1129, - 26, - 1458444054123, - 3, - 1129, - 26, - 1458444054123, - 3, - 1129, - 26, - 1458444054123 - ], - [ - 8, - "JP", - "JPN", - "Japan", - "sensor-pad-8xUD6pzsQI", - "210.173.177.1", - "36_140", - "40_140", - 0, - 1536, - 35, - 1458444054123, - 0, - 1536, - 35, - 
1458444054123, - 0, - 1536, - 35, - 1458444054123 - ], - [ - 9, - "JP", - "JPN", - "Japan", - "device-mac-9GcjZ2pw", - "118.23.68.227", - "36_140", - "40_140", - 3, - 807, - 85, - 1458444054124, - 3, - 807, - 85, - 1458444054124, - 3, - 807, - 85, - 1458444054124 - ], - [ - 10, - "US", - "USA", - "United States", - "sensor-pad-10BsywSYUF", - "208.109.163.218", - "34_-112", - "30_-110", - 7, - 1470, - 56, - 1458444054125, - 7, - 1470, - 56, - 1458444054125, - 7, - 1470, - 56, - 1458444054125 - ] - ], - "datasetInfos": [], - "dbfsResultPath": null, - "isJsonSchema": true, - "metadata": {}, - "overflow": false, - "plotOptions": { - "customPlotOptions": {}, - "displayType": "table", - "pivotAggregation": null, - "pivotColumns": [], - "xColumns": [], - "yColumns": [] - }, - "removedWidgets": [], - "schema": [ - { - "metadata": "{}", - "name": "device_id", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "cca2", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "cca3", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "cn", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "device_name", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "ip", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "location_rounded1", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "location_rounded2", - "type": "\"string\"" - }, - { - "metadata": "{}", - "name": "battery_level_min", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "c02_level_min", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "humidity_min", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "timestamp_min", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "battery_level_max", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "c02_level_max", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "humidity_max", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": 
"timestamp_max", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "battery_level_avg", - "type": "\"double\"" - }, - { - "metadata": "{}", - "name": "c02_level_avg", - "type": "\"double\"" - }, - { - "metadata": "{}", - "name": "humidity_avg", - "type": "\"double\"" - }, - { - "metadata": "{}", - "name": "timestamp_avg", - "type": "\"double\"" - } - ], - "type": "table" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from pyspark.sql import functions as F\n", "from pyspark.sql.functions import concat_ws, col, round\n", @@ -985,56 +289,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "8028c3a6-308a-43ec-8988-0b51d9f1826d", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " " - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "\n \n \n \n ", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " graphistry \n", - " .edges(devices.sample(fraction=0.1), 'device_name', 'cca3') \\\n", + " .edges(devices.sample(fraction=0.1).toPandas(), 'device_name', 'cca3') \\\n", " .settings(url_params={'strongGravity': 'true'}) \\\n", " .plot()\n", ")" @@ -1042,73 +314,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "852b48fe-61af-4953-858f-52680bf07fd2", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "# links 
79200\n", - "# events 19800\n", - "# attrib entities 41197\n" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "# links 79200\n# events 19800\n# attrib entities 41197\n", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "ansi" - } - }, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " " - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "\n \n \n \n ", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "hg = graphistry.hypergraph(\n", " devices_with_rounded_locations.sample(fraction=0.1).toPandas(),\n", @@ -1150,55 +369,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "4e327ad1-169b-4bb6-95c0-8fc0cf452625", "showTitle": false, "title": "" } }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " " - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "\n \n \n \n ", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "textData": null, - "type": "htmlSandbox" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(\n", " g\n", @@ -1227,37 +411,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "a0e6bd79-1172-4cfe-ac6d-83b187d48747", "showTitle": false, "title": "" } }, - 
"outputs": [ - { - "data": { - "text/plain": [ - "Out[18]: 'https://hub.graphistry.com/graph/graph.html?dataset=187d97493ce54498b820f727877eda4b&type=arrow&viztoken=b3106e8a-cbe9-4802-8519-97e1d0d539c3&usertag=50d9aebe-pygraphistry-0.28.5&splashAfter=1669270570&info=true&strongGravity=true'" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "arguments": {}, - "data": "Out[18]: 'https://hub.graphistry.com/graph/graph.html?dataset=187d97493ce54498b820f727877eda4b&type=arrow&viztoken=b3106e8a-cbe9-4802-8519-97e1d0d539c3&usertag=50d9aebe-pygraphistry-0.28.5&splashAfter=1669270570&info=true&strongGravity=true'", - "datasetInfos": [], - "metadata": {}, - "removedWidgets": [], - "type": "ansi" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "url = g.plot(render=False)\n", "url" @@ -1265,12 +432,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, - "nuid": "ed683717-2c64-43b0-9a7e-bbe2115ba880", + "nuid": "cd326621-1224-4b91-890d-9285f7755ad2", "showTitle": false, "title": "" } @@ -1282,12 +449,12 @@ "metadata": { "application/vnd.databricks.v1+notebook": { "dashboards": [], + "environmentMetadata": null, "language": "python", "notebookMetadata": { "pythonIndentUnit": 4 }, - "notebookName": "graphistry-notebook-dashboard", - "notebookOrigID": 382244341032212, + "notebookName": "graphistry-notebook-dashboard (1)", "widgets": {} }, "kernelspec": { @@ -1309,5 +476,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 0 } diff --git a/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb b/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb index 956f05499..931eaad73 100644 --- a/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb +++ b/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb @@ -14,8 +14,10 @@ "This tutorial series 
visually analyzes Zeek/Bro network connection logs using different compute engines:\n", "\n", "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n", - "* Part II: [GPU Dataframse with RAPIDS Python cudf bindings](./part_ii_gpu_cudf)\n", - "\n", + "* Part II: [GPU Dataframes with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n", + "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n", + "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n", + "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n", "\n", "**Part I Contents:**\n", "\n", @@ -81,9 +83,9 @@ "source": [ "%%time\n", "# download data \n", - "!if [ ! -f conn.log ]; then \\\n", - " curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n", - "fi" + "#!if [ ! -f conn.log ]; then \\\n", + "# curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n", + "#fi" ] }, { @@ -92,7 +94,7 @@ "metadata": {}, "outputs": [], "source": [ - "!head -n 3 conn.log" + "#!head -n 3 conn.log" ] }, { @@ -291,7 +293,10 @@ "## Next Steps\n", "\n", "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n", - "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)" + "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n", + "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n", + "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb) \n", + "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n" ] }, { diff --git a/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb b/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb index 294356e76..b81141f1b 100644 --- a/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb +++ b/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb @@ -11,8 +11,10 @@ "This tutorial series visually 
analyzes Zeek/Bro network connection logs using different compute engines:\n", "\n", "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n", - "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf)\n", - "\n", + "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n", + "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n", + "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n", + "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n", "\n", "**Part II Contents:**\n", "\n", @@ -114,9 +116,9 @@ "source": [ "%%time\n", "# download data \n", - "!if [ ! -f conn.log ]; then \\\n", - " curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n", - "fi" + "#!if [ ! -f conn.log ]; then \\\n", + "# curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n", + "#fi" ] }, { @@ -736,7 +738,10 @@ "## Next Steps\n", "\n", "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n", - "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf)" + "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n", + "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n", + "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n", + "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n" ] }, { diff --git a/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb b/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb index a7b8e0794..5910cd629 100644 --- a/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb +++ b/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb @@ -6,7 +6,9 @@ "source": [ "# BlazingSQL + Graphistry: Netflow analysis\n", "\n", - "This tutorial shows running BlazingSQL (GPU-accelerated SQL) on raw 
parquet files and visually analyzing the result with Graphistry" + "This tutorial shows running BlazingSQL (GPU-accelerated SQL) on raw parquet files and visually analyzing the result with Graphistry\n", + "\n", + "**WARNING: Deprecated as BlazingSQL is no longer maintained, see dask-sql instead**" ] }, { diff --git a/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb b/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb index 849887d4b..41a9a5c23 100644 --- a/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb +++ b/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb @@ -4,14 +4,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "UMAP is a popular method of dimensionality reduction, a helpful technique for meaningful analysis of large, complex datasets\n", + "# GPU UMAP\n", + "\n", + "UMAP is a popular method of dimensionality reduction, a helpful technique for meaningful analysis of large, complex datasets. Graphistry provides convenient bindings for working with `cuml.UMAP`.\n", + "\n", "UMAP is:\n", " * interested in the number of nearest numbers\n", " * non-linear, unlike longstanding methods such as PCA\n", " * non-scaling, which keep calculation fast\n", " * stochastic and thus non-deterministic -- and different libraries handle this differently as you will see in this notebook\n", " * `umap-learn` states that [\"variance between runs will exist, however small\"](https://umap-learn.readthedocs.io/en/latest/reproducibility.html)\n", - " * `cuml` currently uses [\"exact kNN\"](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP). This may chance in [future releases](https://github.com/rapidsai/cuml/issues/1653#issuecomment-584357155)\n" + " * `cuml` currently uses [\"exact kNN\"](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP). 
This may change in [future releases](https://github.com/rapidsai/cuml/issues/1653#issuecomment-584357155)\n", + "\n", + "Further reading:\n", + "\n", + "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n", + "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n", + "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n", + "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n", + "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n" ] }, { @@ -24,11 +35,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -64,11 +71,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -237,11 +240,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -278,11 +277,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -312,11 +307,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -353,11 +344,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -394,11 +381,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -427,11 +410,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name":
"stderr", @@ -467,11 +446,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -502,11 +477,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -608,15 +579,29 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "#g3.plot()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n", + "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n", + "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n", + "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n", + "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/demos/demos_databases_apis/graphviz/graphviz.ipynb b/demos/demos_databases_apis/graphviz/graphviz.ipynb new file mode 100644 index 000000000..14fe015a9 --- /dev/null +++ b/demos/demos_databases_apis/graphviz/graphviz.ipynb @@ -0,0 +1,1430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "fEjoJ5eBnuKZ" + }, + "source": [ + "# Graphistry <> graphviz integration quickstart\n", + "\n", + "The [graphviz engine](https://graphviz.org/) is popular for layout of small graphs and rendering to static images. 
The Graphistry Python bindings to graphviz enable using pygraphistry as usual for quickly loading and manipulating your data, and then benefiting from graphviz for layout, and optionally, rendering.\n", + "\n", + "The example below shows laying out and rendering company ownership data that is in a tree and benefits from graphviz's high-quality layout engine." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0BF6FBhDpLas" + }, + "source": [ + "## Setup\n", + "\n", + "* graphviz: Install the graphviz engine and the pygraphviz bindings, see below (official [tutorial](https://pygraphviz.github.io/documentation/stable/install.html) )\n", + "* Graphistry: Install PyGraphistry below, and [get a free GPU account on Graphistry Hub](https://www.graphistry.com/get-started) or run your own server\n", + "\n", + "Notes:\n", + "\n", + "* You must install the graphviz engine, as well as its pygraphviz Python bindings and pygraphistry\n", + "* graphviz is most known for its `\"dot\"` layout engine, and it includes others as well\n", + "* graphviz is generally not recommended for layout of graphs over 10,000 nodes and edges" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3XMNgAvIM9Ep", + "outputId": "b391eb13-0650-433b-bd2b-c905cdef9e18" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for graphistry (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "#!apt-get install graphviz graphviz-dev\n", + "\n", + "#!pip install -q graphistry[pygraphviz]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s40Iw_3vqQZy" + }, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "Cnhc-A4_M2Ad", + "outputId": "0f2fb73f-72a2-4fae-9b28-cea26b85d0ad" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'0.34.5+12.g4dba3e6'" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from typing import Any, Dict, Literal, Optional\n", + "import logging\n", + "try:\n", + " import pygraphviz as pgv\n", + "except (ImportError, ModuleNotFoundError):\n", + " logging.error(\"ImportError: Did you install pygraphviz and the supporting native packages?\")\n", + " raise\n", + "\n", + "import pandas as pd\n", + "import graphistry\n", + "from graphistry import Plottable\n", + "graphistry.register(api=3, username=FILL_ME_IN, password=FILL_ME_IN)\n", + "\n", + "graphistry.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wwl3XdQLqf5k" + }, + "source": [ + "### Sample graph: HSBC Beneficial ownership graph\n", + "\n", + "Sample data from [openownership.org](https://openownership.org/). Corporate ownership graphs often have a deep tree structure, and those of bigger conglomerates with numerous subsidiaries, officers, board officers, suppliers, and lenders can greatly benefit from higher-quality tree layouts."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "8-7OAzDml0RV" + }, + "outputs": [], + "source": [ + "companies_df = pd.DataFrame([{'label': 'Hsbc Finance (Netherlands)', 'n': '1862294673469042014'},\n", + " {'label': 'Hsbc Holdings Plc', 'n': '7622088245850069747'},\n", + " {'label': 'Unknown person(s)', 'n': '7622088245850069747-unknown'},\n", + " {'label': 'HSBC PROPERTY (UK) LIMITED', 'n': '16634236373777089526'},\n", + " {'label': 'HSBC ALTERNATIVE INVESTMENTS LIMITED',\n", + " 'n': '18011320449780894329'},\n", + " {'label': 'HSBC INVESTMENT COMPANY LIMITED', 'n': '9134577322728469115'},\n", + " {'label': 'HSBC IM PENSION TRUST LIMITED', 'n': '1446072728533515665'},\n", + " {'label': 'MERCANTILE COMPANY LIMITED', 'n': '6904185395252167658'},\n", + " {'label': 'Mp Payments Group Limited', 'n': '13630126251685975826'},\n", + " {'label': 'MP PAYMENTS OPERATIONS LIMITED', 'n': '11514603667851101425'},\n", + " {'label': 'MP PAYMENTS UK LIMITED', 'n': '13417892994160273884'},\n", + " {'label': 'Hsbc Asia Pacific Holdings (Uk) Limited',\n", + " 'n': '2173486047275631423'},\n", + " {'label': 'HSBC SECURITIES (JAPAN) LIMITED', 'n': '18045747820524565803'}])\n", + "\n", + "ownership_df = pd.DataFrame([{'s': '7622088245850069747', 'd': '1862294673469042014'},\n", + " {'s': '7622088245850069747-unknown', 'd': '7622088245850069747'},\n", + " {'s': '1862294673469042014', 'd': '16634236373777089526'},\n", + " {'s': '1862294673469042014', 'd': '18011320449780894329'},\n", + " {'s': '1862294673469042014', 'd': '9134577322728469115'},\n", + " {'s': '9134577322728469115', 'd': '1446072728533515665'},\n", + " {'s': '9134577322728469115', 'd': '6904185395252167658'},\n", + " {'s': '9134577322728469115', 'd': '13630126251685975826'},\n", + " {'s': '13630126251685975826', 'd': '11514603667851101425'},\n", + " {'s': '13630126251685975826', 'd': '13417892994160273884'},\n", + " {'s': '9134577322728469115', 'd': '2173486047275631423'},\n", + " {'s': 
'2173486047275631423', 'd': '18045747820524565803'},\n", + " {'s': '9134577322728469115', 'd': '16634236373777089526'}])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "7TmvBE5iI8Tu" + }, + "outputs": [], + "source": [ + "g = graphistry.edges(ownership_df, 's', 'd').nodes(companies_df, 'n').bind(point_title='label')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "y7eC5hOCwfE1" + }, + "outputs": [], + "source": [ + "g = g.nodes(g._nodes.assign(sz=1)).encode_point_size('sz')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gnhwn1v_r3kD" + }, + "source": [ + "## Minimal tree layout and graphviz layout engines\n", + "\n", + "Graphviz provides 15+ layout engines you can use. General guidance is to use them for graphs of up to 10,000 nodes and edges.\n", + "\n", + "The `\"dot\"` layout engine is best known due to its beautiful hierarchical layouts for directed acyclic graphs like trees." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "KiOxkJR_YKrh", + "outputId": "2b9af5e5-b199-452a-839b-d8cc7cc0ea50" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g2 = g.layout_graphviz('dot')\n", + "g2.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dTEjXkhisCui" + }, + "source": [ + "Additional layout engines beyond `\"dot\"` are below. See also the [graphviz layout engines documents](https://graphviz.org/docs/layouts/). The same documentation, and the below section on global graph attributes, describe options you can pass in to different layout engines."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_x28pxBUr7e_", + "outputId": "dd209734-f4cf-425b-dc3c-7cec0d9fac74" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['acyclic',\n", + " 'ccomps',\n", + " 'circo',\n", + " 'dot',\n", + " 'fdp',\n", + " 'gc',\n", + " 'gvcolor',\n", + " 'gvpr',\n", + " 'neato',\n", + " 'nop',\n", + " 'osage',\n", + " 'patchwork',\n", + " 'sccmap',\n", + " 'sfdp',\n", + " 'tred',\n", + " 'twopi',\n", + " 'unflatten']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from graphistry.plugins_types.graphviz_types import PROGS\n", + "PROGS" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "VD5ezMLss9Dw", + "outputId": "553250c2-26a8-4820-f01d-fe138280bcf0" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g2b = g.layout_graphviz('neato')\n", + "g2b.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from graphistry.plugins_types.graphviz_types import PROGS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oF9m9a_WuPjN" + }, + "source": [ + "### Global attributes\n", + "\n", + "You can set global attributes. Parameter [`graph_attr`](https://graphviz.org/docs/graph/) generally refers to layout engine options, while [`edge_attr`](https://graphviz.org/docs/edges/) and [`node_attr`](https://graphviz.org/docs/nodes/) are generally for default colors, sizes, shapes, etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "ACoYzOgCE7Pt", + "outputId": "597b8129-dc9f-4f1a-e086-912e7206b103" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g2b = g.layout_graphviz(\n", + " 'dot',\n", + " graph_attr={'ratio': 10},\n", + " edge_attr={},\n", + " node_attr={}\n", + ")\n", + "g2b.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['_background',\n", + " 'bb',\n", + " 'beautify',\n", + " 'bgcolor',\n", + " 'center',\n", + " 'charset',\n", + " 'class',\n", + " 'clusterrank',\n", + " 'colorscheme',\n", + " 'comment',\n", + " 'compound',\n", + " 'concentrate',\n", + " 'Damping',\n", + " 'defaultdist',\n", + " 'dim',\n", + " 'dimen',\n", + " 'diredgeconstraints',\n", + " 'dpi',\n", + " 'epsilon',\n", + " 'esep',\n", + " 'fontcolor',\n", + " 'fontname',\n", + " 'fontnames',\n", + " 'fontpath',\n", + " 'fontsize',\n", + " 'forcelabels',\n", + " 'gradientangle',\n", + " 'href',\n", + " 'id',\n", + " 'imagepath',\n", + " 'inputscale',\n", + " 'K',\n", + " 'label',\n", + " 'label_scheme',\n", + " 'labeljust',\n", + " 'labelloc',\n", + " 'landscape',\n", + " 'layerlistsep',\n", + " 'layers',\n", + " 'layerselect',\n", + " 'layersep',\n", + " 'layout',\n", + " 'levels',\n", + " 'levelsgap',\n", + " 'lheight',\n", + " 'linelength',\n", + " 'lp',\n", + " 'lwidth',\n", + " 'margin',\n", + " 'maxiter',\n", + " 'mclimit',\n", + " 'mindist',\n", + " 'mode',\n", + " 'model',\n", + " 'newrank',\n", + " 'nodesep',\n", + " 'nojustify',\n", + " 'normalize',\n", + " 'notranslate',\n", + " 'nslimit',\n", + " 'nslimit1',\n", + " 'oneblock',\n", + " 'ordering',\n", + " 'orientation',\n", + " 
'outputorder',\n", + " 'overlap',\n", + " 'overlap_scaling',\n", + " 'overlap_shrink',\n", + " 'pack',\n", + " 'packmode',\n", + " 'pad',\n", + " 'page',\n", + " 'pagedir',\n", + " 'quadtree',\n", + " 'quantum',\n", + " 'rankdir',\n", + " 'ranksep',\n", + " 'ratio',\n", + " 'remincross',\n", + " 'repulsiveforce',\n", + " 'resolution',\n", + " 'root',\n", + " 'rotate',\n", + " 'rotation',\n", + " 'scale',\n", + " 'searchsize',\n", + " 'sep',\n", + " 'showboxes',\n", + " 'size',\n", + " 'smoothing',\n", + " 'sortv',\n", + " 'splines',\n", + " 'start',\n", + " 'style',\n", + " 'stylesheet',\n", + " 'target',\n", + " 'TBbalance',\n", + " 'tooltip',\n", + " 'truecolor',\n", + " 'URL',\n", + " 'viewport',\n", + " 'voro_margin',\n", + " 'xdotversion']" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "from graphistry.plugins_types.graphviz_types import GRAPH_ATTRS\n", + "GRAPH_ATTRS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['arrowhead',\n", + " 'arrowsize',\n", + " 'arrowtail',\n", + " 'class',\n", + " 'color',\n", + " 'colorscheme',\n", + " 'comment',\n", + " 'constraint',\n", + " 'decorate',\n", + " 'dir',\n", + " 'edgehref',\n", + " 'edgetarget',\n", + " 'edgetooltip',\n", + " 'edgeURL',\n", + " 'fillcolor',\n", + " 'fontcolor',\n", + " 'fontname',\n", + " 'fontsize',\n", + " 'head_lp',\n", + " 'headclip',\n", + " 'headhref',\n", + " 'headlabel',\n", + " 'headport',\n", + " 'headtarget',\n", + " 'headtooltip',\n", + " 'headURL',\n", + " 'href',\n", + " 'id',\n", + " 'label',\n", + " 'labelangle',\n", + " 'labeldistance',\n", + " 'labelfloat',\n", + " 'labelfontcolor',\n", + " 'labelfontname',\n", + " 'labelfontsize',\n", + " 'labelhref',\n", + " 'labeltarget',\n", + " 'labeltooltip',\n", + " 'labelURL',\n", + " 'layer',\n", + " 'len',\n", + " 'lhead',\n", + " 'lp',\n", + " 'ltail',\n", + " 'minlen',\n", + " 
'nojustify',\n", + " 'penwidth',\n", + " 'pos',\n", + " 'samehead',\n", + " 'sametail',\n", + " 'showboxes',\n", + " 'style',\n", + " 'tail_lp',\n", + " 'tailclip',\n", + " 'tailhref',\n", + " 'taillabel',\n", + " 'tailport',\n", + " 'tailtarget',\n", + " 'tailtooltip',\n", + " 'tailURL',\n", + " 'target',\n", + " 'tooltip',\n", + " 'URL',\n", + " 'weight',\n", + " 'xlabel',\n", + " 'xlp']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "from graphistry.plugins_types.graphviz_types import EDGE_ATTRS\n", + "EDGE_ATTRS" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['area',\n", + " 'class',\n", + " 'color',\n", + " 'colorscheme',\n", + " 'comment',\n", + " 'distortion',\n", + " 'fillcolor',\n", + " 'fixedsize',\n", + " 'fontcolor',\n", + " 'fontname',\n", + " 'fontsize',\n", + " 'gradientangle',\n", + " 'group',\n", + " 'height',\n", + " 'href',\n", + " 'id',\n", + " 'image',\n", + " 'imagepos',\n", + " 'imagescale',\n", + " 'label',\n", + " 'labelloc',\n", + " 'layer',\n", + " 'margin',\n", + " 'nojustify',\n", + " 'ordering',\n", + " 'orientation',\n", + " 'penwidth',\n", + " 'peripheries',\n", + " 'pin',\n", + " 'pos',\n", + " 'rects',\n", + " 'regular',\n", + " 'root',\n", + " 'samplepoints',\n", + " 'shape',\n", + " 'shapefile',\n", + " 'showboxes',\n", + " 'sides',\n", + " 'skew',\n", + " 'sortv',\n", + " 'style',\n", + " 'target',\n", + " 'tooltip',\n", + " 'URL',\n", + " 'vertices',\n", + " 'width',\n", + " 'xlabel',\n", + " 'xlp',\n", + " 'z']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "from graphistry.plugins_types.graphviz_types import NODE_ATTRS\n", + "NODE_ATTRS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fslt0bjvuyv0" + }, + "source": [ + "## Static image rendering and entity-level attributes\n", + "\n", 
+ "graphviz supports rendering to a static file in various image formats such as png.\n", + "\n", + "You can add graphviz-specific columns to your node and edge dataframes that configure per-row render settings. These use the same names as in the above global attribute guidance, such as `color`, `shape`, and `label`.\n", + "\n", + "Adding a column for an attribute will typically disable the global attribute. For example, consider setting node column `\"shape\"` with values `\"star\"` and `None`, and global node attribute `\"shape\"` with value `\"box\"`. All the nodes with `shape == \"star\"` will render as a star in the static image, and the rows with value `None` will not default to the global node attribute `\"box\"`, but to graphviz's general default of an oval." + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 492 + }, + "id": "-3e_fH-80bhE", + "outputId": "015cb98e-f1eb-4fbe-e29b-18a821b3b3d3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
01862294673469042014
17622088245850069747
27622088245850069747-unknown
316634236373777089526
418011320449780894329
59134577322728469115
61446072728533515665
76904185395252167658
813630126251685975826
911514603667851101425
1013417892994160273884
112173486047275631423
1218045747820524565803
\n", + "

" + ], + "text/plain": [ + "0 1862294673469042014\n", + "1 7622088245850069747\n", + "2 7622088245850069747-unknown\n", + "3 16634236373777089526\n", + "4 18011320449780894329\n", + "5 9134577322728469115\n", + "6 1446072728533515665\n", + "7 6904185395252167658\n", + "8 13630126251685975826\n", + "9 11514603667851101425\n", + "10 13417892994160273884\n", + "11 2173486047275631423\n", + "12 18045747820524565803\n", + "dtype: object" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g._nodes.apply(lambda row: row['n'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 717 + }, + "id": "newXWAjEzo5F", + "outputId": "ff234aa4-5811-42ff-b289-d67e4e5c11b7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "row 1862294673469042014\n", + "row 7622088245850069747\n", + "row 7622088245850069747-unknown\n", + "row 16634236373777089526\n", + "row 18011320449780894329\n", + "row 9134577322728469115\n", + "row 1446072728533515665\n", + "row 6904185395252167658\n", + "row 13630126251685975826\n", + "row 11514603667851101425\n", + "row 13417892994160273884\n", + "row 2173486047275631423\n", + "row 18045747820524565803\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0None
1None
2None
3None
4None
5None
6None
7None
8None
9None
10None
11None
12None
\n", + "

" + ], + "text/plain": [ + "0 None\n", + "1 None\n", + "2 None\n", + "3 None\n", + "4 None\n", + "5 None\n", + "6 None\n", + "7 None\n", + "8 None\n", + "9 None\n", + "10 None\n", + "11 None\n", + "12 None\n", + "dtype: object" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g._nodes.apply(lambda row: print('row', row['n']), 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "72YquJpavfLL", + "outputId": "40739d8a-dbe2-4437-fcc7-1a45c92a5b73" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "repr_error": "Out of range float values are not JSON compliant: nan", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nxylabelszshapecolor
01862294673469042014381.39234.0Hsbc Finance (Netherlands)1Nonered
116634236373777089526140.3990.0HSBC PROPERTY (UK) LIMITED1Nonered
218011320449780894329381.39162.0HSBC ALTERNATIVE INVESTMENTS LIMITED1Nonered
39134577322728469115778.39162.0HSBC INVESTMENT COMPANY LIMITED1Nonered
41446072728533515665454.3990.0HSBC IM PENSION TRUST LIMITED1Nonered
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " n x y label \\\n", + "0 1862294673469042014 381.39 234.0 Hsbc Finance (Netherlands) \n", + "1 16634236373777089526 140.39 90.0 HSBC PROPERTY (UK) LIMITED \n", + "2 18011320449780894329 381.39 162.0 HSBC ALTERNATIVE INVESTMENTS LIMITED \n", + "3 9134577322728469115 778.39 162.0 HSBC INVESTMENT COMPANY LIMITED \n", + "4 1446072728533515665 454.39 90.0 HSBC IM PENSION TRUST LIMITED \n", + "\n", + " sz shape color \n", + "0 1 None red \n", + "1 1 None red \n", + "2 1 None red \n", + "3 1 None red \n", + "4 1 None red " + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# row-level attrs\n", + "\n", + "root_id = '7622088245850069747-unknown'\n", + "\n", + "g2c = g.nodes(g._nodes.assign(\n", + " label=g._nodes.apply(lambda row: \"ROOT: Unknown person(s)\" if row['n'] == root_id else row['label'], axis=1),\n", + " shape=g._nodes.n.apply(lambda n: \"box\" if n == root_id else None),\n", + " color=g._nodes.n.apply(lambda n: \"blue\" if n == root_id else 'red')\n", + ")).edges(g._edges.assign(\n", + " color=g._edges[g._source].apply(lambda n: 'blue' if n == root_id else None)\n", + "))\n", + "\n", + "\n", + "# Save a static graphviz render\n", + "g2c_positioned = g2c.layout_graphviz(\n", + " \"dot\",\n", + " render_to_disk=True,\n", + " path=f'./graph.png',\n", + " graph_attr={},\n", + " edge_attr={},\n", + " node_attr={'color': 'green'}, # ignored due to g2c._nodes.color\n", + " format='png'\n", + ")\n", + "\n", + "g2c_positioned._nodes.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 353 + }, + "id": "Orvnc-p4vwyd", + "outputId": "59f03155-128b-4d55-f216-58b16ce7f914" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import 
Image\n", + "Image(filename='./graph.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "X_e8slJ4LO2z", + "outputId": "c36ff578-dd8b-4626-8e52-7096bc5f0cfb" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g2d = g.layout_graphviz('circo')\n", + "g2d.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pDEzL2UlZGz_" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb b/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb index f54add017..411c1184f 100644 --- a/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb +++ b/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb @@ -4,7 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# In this notebook, we demonstrate how to create and modify a Titan graph in python, and then visualize the result using Graphistry's visual graph explorer. " + "# PyGraphistry <> Titan graph\n", + "\n", + "In this notebook, we demonstrate how to create and modify a Titan graph in python, and then visualize the result using Graphistry's visual graph explorer. 
" ] }, { @@ -12,12 +14,12 @@ "metadata": {}, "source": [ "### We assume the gremlin server for our Titan graph is hosted locally on port 8182\n", - " - This notebook utilizes the python modules aiogremlin and asyncio.\n", - " - The GremlinClient class of aiogremlin communicates asynchronously with the gremlin server using websockets via asyncio coroutines.\n", - " - This implementation allows you to submit additional requests to the server before any responses are recieved, which is much faster than synchronous request / response cycles. \n", - " - For more information about these modules, please visit:\n", - " - aiogremlin: http://aiogremlin.readthedocs.org/en/latest/index.html\n", - " - asyncio: https://pypi.python.org/pypi/asyncio" + "- This notebook utilizes the python modules aiogremlin and asyncio.\n", + "- The GremlinClient class of aiogremlin communicates asynchronously with the gremlin server using websockets via asyncio coroutines.\n", + "- This implementation allows you to submit additional requests to the server before any responses are recieved, which is much faster than synchronous request / response cycles. \n", + "- For more information about these modules, please visit:\n", + " - aiogremlin: http://aiogremlin.readthedocs.org/en/latest/index.html\n", + " - asyncio: https://pypi.python.org/pypi/asyncio" ] }, { diff --git a/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb b/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb index 6621db721..47c707a08 100644 --- a/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb +++ b/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb @@ -65,9 +65,7 @@ "source": [ "## Connect To Neo4j\n", "\n", - "If you haven't already, create an instance of the Russian Twitter Trolls sandbox on [Neo4j Sandbox.](https://neo4j.com/sandbox-v2/) We'll use the [Python driver for Neo4j](https://github.com/neo4j/neo4j-python-driver) to fetch data from Neo4j. 
To do this we'll need to instantiate a `Driver` object, passing in the credentials for our Neo4j instance. If using Neo4j Sandbox you can find the credentials for your Neo4j instance in the \"Details\" tab. Specifically we need the IP address, bolt port, username, and password. Bolt is the binary protocol used by the Neo4j drivers so a typical database URL string takes the form `bolt://:`\n", - "\n", - "![](./img/sandbox.png)" + "If you haven't already, create an instance of the Russian Twitter Trolls sandbox on [Neo4j Sandbox.](https://neo4j.com/sandbox-v2/) We'll use the [Python driver for Neo4j](https://github.com/neo4j/neo4j-python-driver) to fetch data from Neo4j. To do this we'll need to instantiate a `Driver` object, passing in the credentials for our Neo4j instance. If using Neo4j Sandbox you can find the credentials for your Neo4j instance in the \"Details\" tab. Specifically we need the IP address, bolt port, username, and password. Bolt is the binary protocol used by the Neo4j drivers so a typical database URL string takes the form `bolt://:`\n" ] }, { @@ -118,7 +116,6 @@ "source": [ "If we inspect the datamodel in Neo4j we can see that we have inormation about Tweets and specifically Users mentioned in tweets.\n", "\n", - "![](./img/datamodel.png)\n", "\n", "Let's use Graphistry to visualize User-User Tweet mention interactions. We'll do this by querying Neo4j for all tweets that mention users." ] @@ -371,8 +368,6 @@ "source": [ "After running the above Python cell you should see an interactive Graphistry visualization like this:\n", "\n", - "![](./img/graphistry1.png)\n", - "\n", "Known Troll user nodes are colored red, regular users colored blue. By default, the size of the nodes is proportional to the degree of the node (number of relationships). We'll see in the next section how we can use graph algorithms such as PageRank and visualize the results of those algorithms in Graphistry." 
] }, @@ -549,8 +544,6 @@ "source": [ "Now when we render the Graphistry visualization, node size is proprtional to the node's PageRank score. This results in a different set of nodes that are identified as most important. \n", "\n", - "![](./img/graphistry2.png)\n", - "\n", "By binding node size to the results of graph algorithms we are able to draw insight from the data at a glance and further explore the interactive visualization.\n" ] }, diff --git a/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb b/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb index 49cf8e156..c9d6f3c1d 100755 --- a/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb +++ b/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb @@ -5,7 +5,7 @@ "id": "10436f61-3f82-4316-b9be-b6a70746d4f7", "metadata": {}, "source": [ - "## Graphistry for Neptune using pygraphistry bolt connector \n", + "# Graphistry for Neptune using pygraphistry bolt connector \n", "\n", "#### This example uses pygraphistry bolt helper class to run queries against AWS Neptune and retrieve query results as graph, then the bolt helper function extracts all the nodes and edges into the dataframes automatically. Then visualize the resulting datasets using Graphistry. 
\n", "\n" diff --git a/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb b/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb index 5a6ee1101..da07191e6 100644 --- a/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb +++ b/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb @@ -698,7 +698,7 @@ "id": "removed-blair", "metadata": {}, "source": [ - "# Next steps\n", + "## Next steps\n", "\n", "* Go deeper with [PyGraphistry](https://github.com/graphistry/pygraphistry): Examples for customization, GPU graph analytics, and more\n", "* Explore [gremlinpython](https://pypi.org/project/gremlinpython/)\n", diff --git a/demos/demos_databases_apis/networkx/networkx.ipynb b/demos/demos_databases_apis/networkx/networkx.ipynb index 0426751bf..510cc6413 100644 --- a/demos/demos_databases_apis/networkx/networkx.ipynb +++ b/demos/demos_databases_apis/networkx/networkx.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NetworkX\n", + "\n", + "NetworkX is an early graph manipulation library with a variety of algorithms and layouts." 
+ ] + }, { "cell_type": "code", "execution_count": 1, @@ -14,7 +23,8 @@ "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", "# For more options, see https://github.com/graphistry/pygraphistry#configure\n", "\n", - "import networkx as nx" + "import networkx as nx\n", + "import pandas as pd" ] }, { @@ -49,7 +59,7 @@ } ], "source": [ - "G=nx.Graph()\n", + "G = nx.Graph()\n", "G.add_nodes_from([\n", " (1, {\"v\": \"one\"}), \n", " (2, {\"v\": \"two\"}), \n", @@ -64,14 +74,26 @@ "graphistry.bind(source='src', destination='dst', node='nodeid').plot(G)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When manipulating the graph, this form is even easier, as you can then use PyGraphistry methods for tasks like filtering, algorithmic enrichment, GFQL queries, etc:" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "g = graphistry.bind().from_networkx(G)\n", + "\n", + "assert isinstance(g._edges, pd.DataFrame)\n", + "assert isinstance(g._nodes, pd.DataFrame)\n", + "\n", + "g._edges" + ] } ], "metadata": { @@ -90,7 +112,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb b/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb index b11acdc56..f30220175 100644 --- a/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb +++ b/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -80,7 
+80,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -115,12 +115,12 @@ "id": "jK9AXFTjAyDD" }, "source": [ - "# Sample use" + "## Sample use" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -134,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -166,7 +166,7 @@ "id": "a7erX6jnKhHj" }, "source": [ - "# Twitter Demos" + "### Twitter Demos" ] }, { @@ -176,12 +176,12 @@ "id": "uGuj40xkxtMh" }, "source": [ - "## Debate Warren" + "### Debate Warren" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -199,12 +199,12 @@ "id": "UrnlAwkryE10" }, "source": [ - "## CES Samsung" + "### CES Samsung" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -222,12 +222,12 @@ "id": "2a4TOajvC4sb" }, "source": [ - "## Larger Graph" + "### Larger Graph" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -245,7 +245,7 @@ "id": "wY9KhWEgDHzn" }, "source": [ - "# MediaWiki Demos" + "## MediaWiki Demos" ] }, { @@ -255,12 +255,12 @@ "id": "t4Im6padK7Ze" }, "source": [ - "## Demo 1" + "### Demo 1" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", diff --git a/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb 
b/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb index b6d84abee..281bb88f0 100644 --- a/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb +++ b/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb @@ -151,34 +151,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "colab_type": "code", - "id": "XPK5n5Yrvjb5", - "outputId": "04e436c6-5a8b-4148-cd31-874421e6967e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting splunk-sdk\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/bb/408c504f4307fcf4a89909cc85bc912d8529c9ca88200682f94a31a06186/splunk-sdk-1.6.5.tar.gz (103kB)\n", - "\u001b[K 100% |████████████████████████████████| 112kB 2.6MB/s \n", - "\u001b[?25hBuilding wheels for collected packages: splunk-sdk\n", - " Running setup.py bdist_wheel for splunk-sdk ... \u001b[?25l-\b \bdone\n", - "\u001b[?25h Stored in directory: /root/.cache/pip/wheels/87/83/8f/5f78fbc79322715add8f39ba8adc97511f27297852eb4dc270\n", - "Successfully built splunk-sdk\n", - "Installing collected packages: splunk-sdk\n", - "Successfully installed splunk-sdk-1.6.5\n" - ] - } - ], + "metadata": {}, + "outputs": [], "source": [ - "!pip install splunk-sdk\n", + "# !pip install splunk-sdk\n", "\n", "import splunklib" ] diff --git a/demos/demos_databases_apis/sql/postgres.ipynb b/demos/demos_databases_apis/sql/postgres.ipynb index 1422625d8..219e1a101 100644 --- a/demos/demos_databases_apis/sql/postgres.ipynb +++ b/demos/demos_databases_apis/sql/postgres.ipynb @@ -13,10 +13,9 @@ "* Shows several viz modes + a convenience function for sql->interactive viz\n", "* Try: Modify the indicated lines to change to visualize any other table\n", "\n", - "Further docs\n", + "Further reading:\n", " - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n", - " - [More demos: database connectors, 
...](/notebook/tree/demos/demos_databases_apis)\n", - " - [CSV upload notebook app](/notebook/tree/demos/upload_csv_miniapp.ipynb)" + " - [CSV upload notebook app](../../upload_csv_miniapp.ipynb)" ] }, { @@ -357,8 +356,7 @@ "source": [ "## Further docs\n", " - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n", - " - [More demos: database connectors, ...](/notebook/tree/demos/demos_databases_apis)\n", - " - [CSV upload notebook app](/notebook/tree/demos/upload_csv_miniapp.ipynb)" + " - [CSV upload notebook app](../../upload_csv_miniapp.ipynb)" ] }, { diff --git a/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb b/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb index dce85185a..c067b6563 100644 --- a/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb +++ b/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb @@ -104,7 +104,7 @@ "id": "LUEA1fmFOjCD" }, "source": [ - "# 1. Fraud" + "## 1. Fraud" ] }, { @@ -114,7 +114,7 @@ "id": "rY8Ip6WcOnPl" }, "source": [ - "## 1.a circleDetection" + "### 1.a circleDetection" ] }, { @@ -152,7 +152,7 @@ "id": "mXT2bD2UOp3o" }, "source": [ - "## 1.b fraudConnectivity" + "### 1.b fraudConnectivity" ] }, { @@ -190,7 +190,7 @@ "id": "SKepDGbKZLGI" }, "source": [ - "## Combined" + "### Combined" ] }, { diff --git a/demos/for_analysis.ipynb b/demos/for_analysis.ipynb index adf548691..675722246 100644 --- a/demos/for_analysis.ipynb +++ b/demos/for_analysis.ipynb @@ -14,10 +14,10 @@ "3. Advanced plotting\n", "4. 
Further reading\n", " - [PyGraphistry](https://github.com/graphistry/pygraphistry)\n", - " - [PyGraphistry demos: database connectors, ...](demos_databases_apis)\n", + " - [PyGraphistry demos: database connectors, ...](https://github.com/graphistry/pygraphistry/tree/master/demos/demos_databases_apis)\n", " - [graph-app-kit: Streamlit graph dashboarding](https://github.com/graphistry/graph-app-kit)\n", " - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n", - " - [CSV upload notebook app](upload_csv_miniapp.ipynb)\n", + " - [CSV upload notebook app](https://github.com/graphistry/pygraphistry/tree/master/demos/upload_csv_miniapp.ipynb)\n", " \n", "## 1. Register\n" ] @@ -896,10 +896,10 @@ "source": [ "## Further reading:\n", " - [PyGraphistry](https://github.com/graphistry/pygraphistry)\n", - " - [PyGraphistry demos: database connectors, ...](demos_databases_apis)\n", + " - [PyGraphistry demos: database connectors, ...](https://github.com/graphistry/pygraphistry/tree/master/demos/demos_databases_apis)\n", " - [graph-app-kit: Streamlit graph dashboarding](https://github.com/graphistry/graph-app-kit)\n", " - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n", - " - [CSV upload notebook app](upload_csv_miniapp.ipynb)" + " - [CSV upload notebook app](https://github.com/graphistry/pygraphistry/tree/master/demos/upload_csv_miniapp.ipynb)" ] } ], diff --git a/demos/for_developers.ipynb b/demos/for_developers.ipynb index 8d2187a10..25d07d1b7 100644 --- a/demos/for_developers.ipynb +++ b/demos/for_developers.ipynb @@ -7,7 +7,7 @@ "# Tutorial: Graphistry for Developers\n", "\n", "\n", - "**Start by generating interactive graphs in the [Analysis tutorial](for_analysis.ipynb)**\n", + "**Start by generating interactive graphs in the [Analysis tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/for_analysis.ipynb)**\n", "\n", "\n", "**Graphistry is a client/server system:**\n", @@ -48,7 +48,7 @@ "cell_type": "markdown", "metadata": {}, "source": 
[ - "# 1. Backend APIs\n", + "## 1. 
Backend APIs\n", "\n", "Graphistry provides a REST upload API, and you can reuse the Python client for more conveniently using it." ] @@ -57,8 +57,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Python\n", - "* Use the PyGraphistry API as in the [Analysis tutorial](for_analysis.ipynb)\n", + "### Python\n", + "* Use the PyGraphistry API as in the [Analysis tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/for_analysis.ipynb)\n", "* Instead of plotting, get the plot URL for embedding\n" ] }, @@ -173,7 +173,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## iframe" + "### iframe" ] }, { @@ -207,7 +207,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## JavaScript - Browser vanilla JS\n", + "### JavaScript - Browser vanilla JS\n", "* [npm](https://www.npmjs.com/package/@graphistry/client-api)\n", "* `npm install --save \"@graphistry/client-api\"`\n", "* See [vanilla js examples](https://hub.graphistry.com/static/js-docs/examples/toggles.html)]\n", diff --git a/demos/gfql/benchmark_hops_cpu_gpu.ipynb b/demos/gfql/benchmark_hops_cpu_gpu.ipynb index bf17b630e..cafd90815 100644 --- a/demos/gfql/benchmark_hops_cpu_gpu.ipynb +++ b/demos/gfql/benchmark_hops_cpu_gpu.ipynb @@ -1,23 +1,10 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "GZxoiU8sQDk_" + }, "source": [ "# GFQL CPU, GPU Benchmark\n", "\n", @@ -73,33 +60,27 @@ "| **Orkut** | N/A | N/A | 41.50 | N/A | 711.4 |\n", "| **AVG** | 22X | 0.41 | 14.4 | 41.1 | 246.8\n", "| **MAX** | 42X | 0.50 | 41.50 | 50.2 | 711.4\n" - ], - "metadata": { - "id": "GZxoiU8sQDk_" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Optional: GPU setup - Google Colab" - ], "metadata": { "id": "SAj8lhREEOwS" - } + }, + 
"source": [ + "## Optional: GPU setup - Google Colab" + ] }, { "cell_type": "markdown", - "source": [], "metadata": { "id": "4hrEEAAm7DTO" - } + }, + "source": [] }, { "cell_type": "code", - "source": [ - "# Report GPU used when GPU benchmarking\n", - "! nvidia-smi" - ], + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -107,11 +88,10 @@ "id": "W2MF6ZsjDv3B", "outputId": "46088cbc-2db9-4529-f724-dc57ed85dfb7" }, - "execution_count": 1, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:50:30 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -135,27 +115,28 @@ "+---------------------------------------------------------------------------------------+\n" ] } + ], + "source": [ + "# Report GPU used when GPU benchmarking\n", + "# ! nvidia-smi" ] }, { "cell_type": "code", - "source": [ - "# if in google colab\n", - "!git clone https://github.com/rapidsai/rapidsai-csp-utils.git\n", - "!python rapidsai-csp-utils/colab/pip-install.py" - ], + "execution_count": 8, "metadata": { "id": "Aikh0x4ID_wK" }, - "execution_count": 8, - "outputs": [] + "outputs": [], + "source": [ + "# if in google colab\n", + "#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git\n", + "#!python rapidsai-csp-utils/colab/pip-install.py" + ] }, { "cell_type": "code", - "source": [ - "import cudf\n", - "cudf.__version__" - ], + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -164,160 +145,155 @@ "id": "Lwekdei1dH3N", "outputId": "71f5b01d-7917-4283-8338-969167d6e1e8" }, - "execution_count": 3, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "'23.12.01'" - ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" - } + }, + "text/plain": [ + "'23.12.01'" + ] }, + "execution_count": 3, "metadata": {}, - "execution_count": 3 + "output_type": 
"execute_result" } + ], + "source": [ + "import cudf\n", + "cudf.__version__" ] }, { "cell_type": "markdown", - "source": [ - "# 1. Install & configure" - ], "metadata": { "id": "QQpsrtwBT7sa" - } + }, + "source": [ + "## 1. Install & configure" + ] }, { "cell_type": "code", - "source": [ - "#! pip install graphistry[igraph]\n", - "\n", - "!pip install -q igraph\n", - "#!pip install -q git+https://github.com/graphistry/pygraphistry.git@dev/cugfql\n", - "!pip install -q graphistry\n" - ], + "execution_count": 2, "metadata": { - "id": "cYjRbgkU9Sx8", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "cYjRbgkU9Sx8", "outputId": "2cf25531-9b8b-4715-ccc7-e79094d84ebd" }, - "execution_count": 2, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n" ] } + ], + "source": [ + "#! pip install graphistry[igraph]" ] }, { "cell_type": "markdown", - "source": [ - "## Imports" - ], "metadata": { "id": "Ff6Tt9DhkePl" - } + }, + "source": [ + "### Imports" + ] }, { "cell_type": "code", - "source": [ - "import pandas as pd\n", - "\n", - "import graphistry\n", - "\n", - "from graphistry import (\n", - "\n", - " # graph operators\n", - " n, e_undirected, e_forward, e_reverse,\n", - "\n", - " # attribute predicates\n", - " is_in, ge, startswith, contains, match as match_re\n", - ")\n", - "graphistry.__version__" - ], + "execution_count": 3, "metadata": { - "id": "S5_y0CbLkjft", "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, + "id": "S5_y0CbLkjft", "outputId": "a68a9c4b-c9c5-4b8b-ea4f-7bf1e4ddf315" }, - "execution_count": 3, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "'0.32.0+12.g72e778c'" - ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" - } + }, + "text/plain": [ + "'0.32.0+12.g72e778c'" + ] }, + "execution_count": 3, "metadata": {}, - "execution_count": 3 + "output_type": 
"execute_result" } + ], + "source": [ + "import pandas as pd\n", + "\n", + "import graphistry\n", + "\n", + "from graphistry import (\n", + "\n", + " # graph operators\n", + " n, e_undirected, e_forward, e_reverse,\n", + "\n", + " # attribute predicates\n", + " is_in, ge, startswith, contains, match as match_re\n", + ")\n", + "graphistry.__version__" ] }, { "cell_type": "code", - "source": [ - "import cudf" - ], + "execution_count": 6, "metadata": { "id": "I7Fg75jsG4co" }, - "execution_count": 6, - "outputs": [] + "outputs": [], + "source": [ + "import cudf" + ] }, { "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "uLZKph2-a5M4" + }, + "outputs": [], "source": [ "#work around google colab shell encoding bugs\n", "\n", "import locale\n", "locale.getpreferredencoding = lambda: \"UTF-8\"" - ], - "metadata": { - "id": "uLZKph2-a5M4" - }, - "execution_count": 7, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "# 2. Perf benchmarks" - ], "metadata": { "id": "eU9SyauNUHtR" - } + }, + "source": [ + "## 2. Perf benchmarks" + ] }, { "cell_type": "markdown", - "source": [ - "### Facebook: 88K edges" - ], "metadata": { "id": "NA0Ym11fkB8j" - } + }, + "source": [ + "### Facebook: 88K edges" + ] }, { "cell_type": "code", - "source": [ - "df = pd.read_csv('https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/facebook_combined.txt', sep=' ', names=['s', 'd'])\n", - "print(df.shape)\n", - "df.head(5)" - ], + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -326,26 +302,16 @@ "id": "vXuQogHekClJ", "outputId": "64db92c0-2704-438b-d0e4-25865acbb5e9" }, - "execution_count": 10, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(88234, 2)\n" ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " s d\n", - "0 0 1\n", - "1 0 2\n", - "2 0 3\n", - "3 0 4\n", - "4 0 5" - ], "text/html": [ "\n", "
\n", @@ -608,20 +574,30 @@ "
\n", " \n", " \n" + ], + "text/plain": [ + " s d\n", + "0 0 1\n", + "1 0 2\n", + "2 0 3\n", + "3 0 4\n", + "4 0 5" ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 10 + "output_type": "execute_result" } + ], + "source": [ + "df = pd.read_csv('https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/facebook_combined.txt', sep=' ', names=['s', 'd'])\n", + "print(df.shape)\n", + "df.head(5)" ] }, { "cell_type": "code", - "source": [ - "fg = graphistry.edges(df, 's', 'd').materialize_nodes()\n", - "print(fg._nodes.shape, fg._edges.shape)\n", - "fg._nodes.head(5)" - ], + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -630,26 +606,16 @@ "id": "jEma7hvvkzkN", "outputId": "dbf21342-6b80-429c-bd3f-b1494c6854c7" }, - "execution_count": 11, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(4039, 1) (88234, 2)\n" ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " id\n", - "0 0\n", - "1 1\n", - "2 2\n", - "3 3\n", - "4 4" - ], "text/html": [ "\n", "
\n", @@ -906,20 +872,30 @@ "
\n", " \n", " \n" + ], + "text/plain": [ + " id\n", + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4" ] }, + "execution_count": 11, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } + ], + "source": [ + "fg = graphistry.edges(df, 's', 'd').materialize_nodes()\n", + "print(fg._nodes.shape, fg._edges.shape)\n", + "fg._nodes.head(5)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "for i in range(100):\n", - " fg2 = fg.chain([n({'id': 0}), e_forward(hops=2)])" - ], + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -927,30 +903,25 @@ "id": "5lEdCBw9lzd7", "outputId": "ed7451e0-401e-4edc-c8de-79c5afd0c95b" }, - "execution_count": 12, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 13.6 s, sys: 2.08 s, total: 15.7 s\n", "Wall time: 18 s\n" ] } + ], + "source": [ + "%%time\n", + "for i in range(100):\n", + " fg2 = fg.chain([n({'id': 0}), e_forward(hops=2)])" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "for i in range(100):\n", - " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=2)])\n", - "print(fg._nodes.shape, fg._edges.shape)\n", - "print(fg2._nodes.shape, fg2._edges.shape)\n", - "del fg_gdf\n", - "del fg2" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -958,11 +929,10 @@ "id": "JFKIBa8mJCvJ", "outputId": "c22022f0-b33d-483a-db64-29992c5161e8" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(4039, 1) (88234, 2)\n", "(1519, 1) (4060, 2)\n", @@ -970,17 +940,21 @@ "Wall time: 11.9 s\n" ] } + ], + "source": [ + "%%time\n", + "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "for i in range(100):\n", + " fg2 = fg_gdf.chain([n({'id': 
0}), e_forward(hops=2)])\n", + "print(fg._nodes.shape, fg._edges.shape)\n", + "print(fg2._nodes.shape, fg2._edges.shape)\n", + "del fg_gdf\n", + "del fg2" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "for i in range(50):\n", - " fg2 = fg.chain([n({'id': 0}), e_forward(hops=5)])\n", - "print(fg._nodes.shape, fg._edges.shape)\n", - "print(fg2._nodes.shape, fg2._edges.shape)" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -988,11 +962,10 @@ "id": "-KBGLexek5tS", "outputId": "2f462e6c-578a-4fa1-ec29-91bae753f4c5" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(4039, 1) (88234, 2)\n", "(3829, 1) (86074, 2)\n", @@ -1000,20 +973,18 @@ "Wall time: 16.2 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", "for i in range(50):\n", - " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=5)])\n", + " fg2 = fg.chain([n({'id': 0}), e_forward(hops=5)])\n", "print(fg._nodes.shape, fg._edges.shape)\n", - "print(fg2._nodes.shape, fg2._edges.shape)\n", - "del fg_gdf\n", - "del fg2" - ], + "print(fg2._nodes.shape, fg2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1021,11 +992,10 @@ "id": "CVpcbhpdHFEF", "outputId": "aba04ee1-781e-4226-b593-b42415a55fc4" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(4039, 1) (88234, 2)\n", "(3829, 1) (86074, 2)\n", @@ -1033,47 +1003,47 @@ "Wall time: 10.1 s\n" ] } + ], + "source": [ + "%%time\n", + "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "for i in range(50):\n", + " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=5)])\n", + "print(fg._nodes.shape, 
fg._edges.shape)\n", + "print(fg2._nodes.shape, fg2._edges.shape)\n", + "del fg_gdf\n", + "del fg2" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "for i in range(100):\n", - " fg2 = fg.chain([e_forward(source_node_match={'id': 0}, hops=5)])" - ], + "execution_count": null, "metadata": { - "id": "1cFIyJF9pLjE", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "1cFIyJF9pLjE", "outputId": "107329af-8e4b-428c-8b03-77ed00bdf5bf" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 11.8 s, sys: 377 ms, total: 12.1 s\n", "Wall time: 13.1 s\n" ] } + ], + "source": [ + "%%time\n", + "for i in range(100):\n", + " fg2 = fg.chain([e_forward(source_node_match={'id': 0}, hops=5)])" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "for i in range(100):\n", - " fg2 = fg_gdf.chain([e_forward(source_node_match={'id': 0}, hops=5)])\n", - "print(fg._nodes.shape, fg._edges.shape)\n", - "print(fg2._nodes.shape, fg2._edges.shape)\n", - "del fg_gdf\n", - "del fg2\n", - "\n" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1081,11 +1051,10 @@ "id": "M5uRiD6uJVNW", "outputId": "5e938a19-2992-4280-80c2-784382d40113" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(4039, 1) (88234, 2)\n", "(348, 1) (347, 2)\n", @@ -1093,20 +1062,22 @@ "Wall time: 14.2 s\n" ] } + ], + "source": [ + "%%time\n", + "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "for i in range(100):\n", + " fg2 = fg_gdf.chain([e_forward(source_node_match={'id': 0}, hops=5)])\n", + "print(fg._nodes.shape, fg._edges.shape)\n", + "print(fg2._nodes.shape, fg2._edges.shape)\n", + "del fg_gdf\n", + "del fg2\n", + "\n" ] }, { 
"cell_type": "code", - "source": [ - "%%time\n", - "start_nodes = pd.DataFrame({fg._node: [0]})\n", - "for i in range(100):\n", - " fg2 = fg.hop(\n", - " nodes=start_nodes,\n", - " direction='forward',\n", - " hops=2)\n", - "print(fg2._nodes.shape, fg2._edges.shape)" - ], + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1114,35 +1085,31 @@ "id": "Y9vgzfT69x41", "outputId": "6882c1ce-0df8-4087-dda4-0a105a8617e1" }, - "execution_count": 17, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(1519, 1) (4060, 2)\n", "CPU times: user 4.5 s, sys: 1.35 s, total: 5.85 s\n", "Wall time: 6.09 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({fg._node: [0]})\n", - "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n", + "start_nodes = pd.DataFrame({fg._node: [0]})\n", "for i in range(100):\n", - " fg2 = fg_gdf.hop(\n", + " fg2 = fg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=2)\n", - "print(fg2._nodes.shape, fg2._edges.shape)\n", - "del start_nodes\n", - "del fg_gdf\n", - "del fg2" - ], + "print(fg2._nodes.shape, fg2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1150,31 +1117,35 @@ "id": "c7ybJqjc-T31", "outputId": "37ccc1fb-6460-4193-8aa7-22837ff06d0a" }, - "execution_count": 18, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(1519, 1) (4060, 2)\n", "CPU times: user 2.58 s, sys: 6.75 ms, total: 2.59 s\n", "Wall time: 2.58 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({fg._node: [0]})\n", + "start_nodes = cudf.DataFrame({fg._node: [0]})\n", + "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n", "for i in range(100):\n", - " fg2 = fg.hop(\n", 
+ " fg2 = fg_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=5)\n", - "print(fg2._nodes.shape, fg2._edges.shape)" - ], + " hops=2)\n", + "print(fg2._nodes.shape, fg2._edges.shape)\n", + "del start_nodes\n", + "del fg_gdf\n", + "del fg2" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1182,35 +1153,31 @@ "id": "Dy7a4zDZ-7_G", "outputId": "077b5d9c-c9ae-411a-8228-3c026b07a910" }, - "execution_count": 19, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(3829, 1) (86074, 2)\n", "CPU times: user 13.2 s, sys: 2 s, total: 15.2 s\n", "Wall time: 18.3 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({fg._node: [0]})\n", - "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n", + "start_nodes = pd.DataFrame({fg._node: [0]})\n", "for i in range(100):\n", - " fg2 = fg_gdf.hop(\n", + " fg2 = fg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=5)\n", - "print(fg2._nodes.shape, fg2._edges.shape)\n", - "del start_nodes\n", - "del fg_gdf\n", - "del fg2" - ], + "print(fg2._nodes.shape, fg2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1218,49 +1185,58 @@ "id": "N5aUtF1a--ML", "outputId": "0c2b67b8-fac6-45b3-dfbe-8002b5506e91" }, - "execution_count": 20, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(3829, 1) (86074, 2)\n", "CPU times: user 5.72 s, sys: 159 ms, total: 5.88 s\n", "Wall time: 5.86 s\n" ] } + ], + "source": [ + "%%time\n", + "start_nodes = cudf.DataFrame({fg._node: [0]})\n", + "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n", + "for i in range(100):\n", + " fg2 = fg_gdf.hop(\n", + " nodes=start_nodes,\n", + " 
direction='forward',\n", + " hops=5)\n", + "print(fg2._nodes.shape, fg2._edges.shape)\n", + "del start_nodes\n", + "del fg_gdf\n", + "del fg2" ] }, { "cell_type": "markdown", + "metadata": { + "id": "KrJKjXy2KLos" + }, "source": [ "## Twitter\n", "\n", "- edges: 2420766\n", "- nodes: 81306" - ], - "metadata": { - "id": "KrJKjXy2KLos" - } + ] }, { "cell_type": "code", - "source": [ - "! wget 'https://snap.stanford.edu/data/twitter_combined.txt.gz'\n", - "#! curl -L 'https://snap.stanford.edu/data/twitter_combined.txt.gz' -o twitter_combined.txt.gz" - ], + "execution_count": 21, "metadata": { - "id": "fO2qasGqpubr", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "fO2qasGqpubr", "outputId": "d41a110e-9f7c-4710-9ce3-3f4906ab02ae" }, - "execution_count": 21, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2023-12-25 21:58:27-- https://snap.stanford.edu/data/twitter_combined.txt.gz\n", "Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80\n", @@ -1275,24 +1251,25 @@ "\n" ] } + ], + "source": [ + "#! wget 'https://snap.stanford.edu/data/twitter_combined.txt.gz'" ] }, { "cell_type": "code", - "source": [ - "! gunzip twitter_combined.txt.gz" - ], + "execution_count": 22, "metadata": { "id": "fn7zeA3SGlEo" }, - "execution_count": 22, - "outputs": [] + "outputs": [], + "source": [ + "#! gunzip twitter_combined.txt.gz" + ] }, { "cell_type": "code", - "source": [ - "! head -n 5 twitter_combined.txt" - ], + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1300,11 +1277,10 @@ "id": "68TAZkhLGz9g", "outputId": "8ba7c23d-267f-4b59-d6c6-b3f66caec9cf" }, - "execution_count": 24, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "214328887 34428380\n", "17116707 28465635\n", @@ -1313,15 +1289,14 @@ "107830991 17868918\n" ] } + ], + "source": [ + "#! 
head -n 5 twitter_combined.txt" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "te_df = pd.read_csv('twitter_combined.txt', sep=' ', names=['s', 'd'])\n", - "te_df.shape" - ], + "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1329,46 +1304,46 @@ "id": "QU2wNeGXG2GC", "outputId": "349ac9c0-6f6c-4ce6-fec0-8bae75fca635" }, - "execution_count": 25, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 474 ms, sys: 61.9 ms, total: 536 ms\n", "Wall time: 534 ms\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "(2420766, 2)" ] }, + "execution_count": 25, "metadata": {}, - "execution_count": 25 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "te_df = pd.read_csv('twitter_combined.txt', sep=' ', names=['s', 'd'])\n", + "te_df.shape" ] }, { "cell_type": "code", - "source": [ - "import graphistry" - ], + "execution_count": 26, "metadata": { "id": "EK5gQH2iG5UU" }, - "execution_count": 26, - "outputs": [] + "outputs": [], + "source": [ + "import graphistry" + ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "g = graphistry.edges(te_df, 's', 'd').materialize_nodes()\n", - "g._nodes.shape" - ], + "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1376,36 +1351,35 @@ "id": "ZtIW-eFGG_R4", "outputId": "0686e9b3-b684-4b93-da03-289244394338" }, - "execution_count": 27, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 86.4 ms, sys: 106 ms, total: 193 ms\n", "Wall time: 191 ms\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "(81306, 1)" ] }, + "execution_count": 27, "metadata": {}, - "execution_count": 27 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "g = graphistry.edges(te_df, 's', 'd').materialize_nodes()\n", + "g._nodes.shape" ] }, { "cell_type": "code", - "source": [ 
- "%%time\n", - "for i in range(10):\n", - " g2 = g.chain([n({'id': 17116707}), e_forward(hops=1)])\n", - "g2._nodes.shape, g2._edges.shape" - ], + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1413,39 +1387,36 @@ "id": "yUaRfw4FHGMb", "outputId": "3945cc5a-c36c-451b-ac95-8af992a3546f" }, - "execution_count": 29, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 11.8 s, sys: 8.4 s, total: 20.2 s\n", "Wall time: 23 s\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "((140, 1), (615, 2))" ] }, + "execution_count": 29, "metadata": {}, - "execution_count": 29 + "output_type": "execute_result" } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", "for i in range(10):\n", - " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=1)])._nodes\n", - "print(out.shape)\n", - "del g_gdf\n", - "del out" - ], + " g2 = g.chain([n({'id': 17116707}), e_forward(hops=1)])\n", + "g2._nodes.shape, g2._edges.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1453,27 +1424,30 @@ "id": "5hM4NBu2_eks", "outputId": "54505262-4871-44ee-e5e4-ad7ab32c13c2" }, - "execution_count": 30, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(140, 1)\n", "CPU times: user 1.33 s, sys: 46.6 ms, total: 1.38 s\n", "Wall time: 1.63 s\n" ] } + ], + "source": [ + "%%time\n", + "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "for i in range(10):\n", + " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=1)])._nodes\n", + "print(out.shape)\n", + "del g_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "for i in range(10):\n", - " out = g.chain([n({'id': 
17116707}), e_forward(hops=2)])\n", - "print(out._nodes.shape, out._edges.shape)" - ], + "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1481,30 +1455,27 @@ "id": "m2-MxD5lHX6u", "outputId": "e89b9d4b-6c04-45c7-9e7f-cbdbbe0a4730" }, - "execution_count": 31, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(2345, 1) (68536, 2)\n", "CPU times: user 13.3 s, sys: 8.05 s, total: 21.4 s\n", "Wall time: 21.6 s\n" ] } + ], + "source": [ + "%%time\n", + "for i in range(10):\n", + " out = g.chain([n({'id': 17116707}), e_forward(hops=2)])\n", + "print(out._nodes.shape, out._edges.shape)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "for i in range(10):\n", - " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=2)])._nodes\n", - "print(out.shape)\n", - "del g_gdf\n", - "del out" - ], + "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1512,27 +1483,30 @@ "id": "7EQSRbIqLaGw", "outputId": "60c00a03-9e7b-46b5-fce3-f4f567a09430" }, - "execution_count": 36, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(2345, 1)\n", "CPU times: user 1.67 s, sys: 55.8 ms, total: 1.72 s\n", "Wall time: 1.75 s\n" ] } + ], + "source": [ + "%%time\n", + "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "for i in range(10):\n", + " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=2)])._nodes\n", + "print(out.shape)\n", + "del g_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "for i in range(10):\n", - " out = g.chain([n({'id': 17116707}), e_forward(hops=8)])\n", - "print(out._nodes.shape, out._edges.shape)" - ], + "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1540,30 +1514,27 @@ 
"id": "hh6WnjI3ITpB", "outputId": "33138efe-a581-49ed-b2b4-247f8e9bdc09" }, - "execution_count": 37, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(81304, 1) (2417796, 2)\n", "CPU times: user 1min 56s, sys: 17.1 s, total: 2min 13s\n", "Wall time: 2min 22s\n" ] } + ], + "source": [ + "%%time\n", + "for i in range(10):\n", + " out = g.chain([n({'id': 17116707}), e_forward(hops=8)])\n", + "print(out._nodes.shape, out._edges.shape)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "for i in range(10):\n", - " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=8)])._nodes\n", - "print(out.shape)\n", - "del g_gdf\n", - "del out" - ], + "execution_count": 38, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1571,31 +1542,30 @@ "id": "7jFFVUenM87j", "outputId": "2cceb720-9de3-488e-8b74-b820fd06e6c1" }, - "execution_count": 38, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(81304, 1)\n", "CPU times: user 5.3 s, sys: 1.48 s, total: 6.78 s\n", "Wall time: 7.89 s\n" ] } + ], + "source": [ + "%%time\n", + "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "for i in range(10):\n", + " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=8)])._nodes\n", + "print(out.shape)\n", + "del g_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "start_nodes = pd.DataFrame({g._node: [17116707]})\n", - "for i in range(10):\n", - " g2 = g.hop(\n", - " nodes=start_nodes,\n", - " direction='forward',\n", - " hops=1)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + "execution_count": 39, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1603,35 +1573,31 @@ "id": "_5LD0bZB_lU4", "outputId": "bc31bd03-e79f-46d2-ea8f-3b01d9ef39a2" }, - "execution_count": 39, 
"outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(0, 1) (0, 2)\n", "CPU times: user 2.58 s, sys: 1.59 s, total: 4.17 s\n", "Wall time: 6.02 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({g._node: [17116707]})\n", - "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n", + "start_nodes = pd.DataFrame({g._node: [17116707]})\n", "for i in range(10):\n", - " g2 = g_gdf.hop(\n", + " g2 = g.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=5)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del g_gdf\n", - "del g2" - ], + " hops=1)\n", + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1639,31 +1605,35 @@ "id": "M_rHjqtvACQw", "outputId": "8d3e308e-b1e2-452b-f402-573be0dd5b58" }, - "execution_count": 44, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(61827, 1) (1473599, 2)\n", "CPU times: user 822 ms, sys: 179 ms, total: 1 s\n", "Wall time: 997 ms\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({g._node: [17116707]})\n", + "start_nodes = cudf.DataFrame({g._node: [17116707]})\n", + "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n", "for i in range(10):\n", - " g2 = g.hop(\n", + " g2 = g_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=2)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + " hops=5)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del g_gdf\n", + "del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 40, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1671,35 +1641,31 @@ "id": "0zEIucaCAbj_", "outputId": 
"83e64b0f-2b3a-4e4b-d189-3e6a8ef78f53" }, - "execution_count": 40, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(2345, 1) (68536, 2)\n", "CPU times: user 8.93 s, sys: 5.92 s, total: 14.9 s\n", "Wall time: 15.8 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({g._node: [17116707]})\n", - "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n", + "start_nodes = pd.DataFrame({g._node: [17116707]})\n", "for i in range(10):\n", - " g2 = g_gdf.hop(\n", + " g2 = g.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=2)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del g_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1707,31 +1673,35 @@ "id": "LKJh5gRtAdIj", "outputId": "e3c7883d-74c0-4d55-b238-88457296c6bc" }, - "execution_count": 41, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(2345, 1) (68536, 2)\n", "CPU times: user 374 ms, sys: 6.92 ms, total: 381 ms\n", "Wall time: 379 ms\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({g._node: [17116707]})\n", + "start_nodes = cudf.DataFrame({g._node: [17116707]})\n", + "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n", "for i in range(10):\n", - " g2 = g.hop(\n", + " g2 = g_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=8)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + " hops=2)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del g_gdf\n", + "del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1739,35 +1709,31 @@ "id": 
"JZwxdofNAfmb", "outputId": "2731be4c-75d9-47f4-8602-4f2d6cb2ddac" }, - "execution_count": 42, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(81304, 1) (2417796, 2)\n", "CPU times: user 38.8 s, sys: 8.7 s, total: 47.5 s\n", "Wall time: 48.2 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({g._node: [17116707]})\n", - "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n", + "start_nodes = pd.DataFrame({g._node: [17116707]})\n", "for i in range(10):\n", - " g2 = g_gdf.hop(\n", + " g2 = g.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=8)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del g_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1775,36 +1741,47 @@ "id": "9o_og8bSAhe3", "outputId": "dd3e4f8f-f426-4705-98c4-60f1912ba28a" }, - "execution_count": 43, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(81304, 1) (2417796, 2)\n", "CPU times: user 1.8 s, sys: 506 ms, total: 2.3 s\n", "Wall time: 2.3 s\n" ] } + ], + "source": [ + "%%time\n", + "start_nodes = cudf.DataFrame({g._node: [17116707]})\n", + "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n", + "for i in range(10):\n", + " g2 = g_gdf.hop(\n", + " nodes=start_nodes,\n", + " direction='forward',\n", + " hops=8)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del g_gdf\n", + "del g2" ] }, { "cell_type": "markdown", + "metadata": { + "id": "9dZzAAVONCD2" + }, "source": [ - "### GPlus\n", + "## GPlus\n", "\n", "- edges: 30494866\n", "- nodes: 107614" - ], - "metadata": { - "id": "9dZzAAVONCD2" - } + ] }, { "cell_type": "code", - "source": [ - "! 
wget https://snap.stanford.edu/data/gplus_combined.txt.gz" - ], + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1812,11 +1789,10 @@ "id": "-nhWGNekKpcZ", "outputId": "e2175290-337c-4faa-e5d8-4bc401583326" }, - "execution_count": 4, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2023-12-26 18:36:29-- https://snap.stanford.edu/data/gplus_combined.txt.gz\n", "Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80\n", @@ -1831,27 +1807,25 @@ "\n" ] } + ], + "source": [ + "#! wget https://snap.stanford.edu/data/gplus_combined.txt.gz" ] }, { "cell_type": "code", - "source": [ - "! gunzip gplus_combined.txt.gz" - ], + "execution_count": 5, "metadata": { "id": "g5wgA_c2KqwJ" }, - "execution_count": 5, - "outputs": [] + "outputs": [], + "source": [ + "#! gunzip gplus_combined.txt.gz" + ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "ge_df = pd.read_csv('gplus_combined.txt', sep=' ', names=['s', 'd'])\n", - "print(ge_df.shape)\n", - "ge_df.head(5)" - ], + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1860,11 +1834,10 @@ "id": "52hgDbr0Kti6", "outputId": "217203fc-7095-4784-c4c4-d46ee9c78808" }, - "execution_count": 6, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(30494866, 2)\n", "CPU times: user 16 s, sys: 1.45 s, total: 17.5 s\n", @@ -1872,16 +1845,7 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " s d\n", - "0 116374117927631468606 101765416973555767821\n", - "1 112188647432305746617 107727150903234299458\n", - "2 116719211656774388392 100432456209427807893\n", - "3 117421021456205115327 101096322838605097368\n", - "4 116407635616074189669 113556266482860931616" - ], "text/html": [ "\n", "
\n", @@ -2144,22 +2108,31 @@ "
\n", " \n", " \n" + ], + "text/plain": [ + " s d\n", + "0 116374117927631468606 101765416973555767821\n", + "1 112188647432305746617 107727150903234299458\n", + "2 116719211656774388392 100432456209427807893\n", + "3 117421021456205115327 101096322838605097368\n", + "4 116407635616074189669 113556266482860931616" ] }, + "execution_count": 6, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "ge_df = pd.read_csv('gplus_combined.txt', sep=' ', names=['s', 'd'])\n", + "print(ge_df.shape)\n", + "ge_df.head(5)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg = graphistry.edges(ge_df, 's', 'd').materialize_nodes()\n", - "gg = graphistry.edges(ge_df, 's', 'd').nodes(gg._nodes, 'id')\n", - "print(gg._edges.shape, gg._nodes.shape)\n", - "gg._nodes.head(5)" - ], + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2168,11 +2141,10 @@ "id": "w5YkN-nLK6UV", "outputId": "dc98380d-54c2-4b36-c56e-5e8401c4ffa4" }, - "execution_count": 7, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(30494866, 2) (107614, 1)\n", "CPU times: user 4.49 s, sys: 1.25 s, total: 5.74 s\n", @@ -2180,16 +2152,7 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " id\n", - "0 116374117927631468606\n", - "1 112188647432305746617\n", - "2 116719211656774388392\n", - "3 117421021456205115327\n", - "4 116407635616074189669" - ], "text/html": [ "\n", "
\n", @@ -2446,19 +2409,32 @@ "
\n", " \n", " \n" + ], + "text/plain": [ + " id\n", + "0 116374117927631468606\n", + "1 112188647432305746617\n", + "2 116719211656774388392\n", + "3 117421021456205115327\n", + "4 116407635616074189669" ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 7 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "gg = graphistry.edges(ge_df, 's', 'd').materialize_nodes()\n", + "gg = graphistry.edges(ge_df, 's', 'd').nodes(gg._nodes, 'id')\n", + "print(gg._edges.shape, gg._nodes.shape)\n", + "gg._nodes.head(5)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg.chain([ n({'id': '116374117927631468606'})])._nodes" - ], + "execution_count": 49, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2467,23 +2443,17 @@ "id": "NKtz54uELX-8", "outputId": "5d8f3eef-893d-47cc-e7a9-c5cbfec8270c" }, - "execution_count": 49, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 534 ms, sys: 598 ms, total: 1.13 s\n", "Wall time: 1.65 s\n" ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " id\n", - "0 116374117927631468606" - ], "text/html": [ "\n", "
\n", @@ -2597,20 +2567,25 @@ "\n", "
\n", " \n" + ], + "text/plain": [ + " id\n", + "0 116374117927631468606" ] }, + "execution_count": 49, "metadata": {}, - "execution_count": 49 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "gg.chain([ n({'id': '116374117927631468606'})])._nodes" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])._nodes\n", - "out.shape" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2618,38 +2593,35 @@ "id": "iNWdi00VLmZG", "outputId": "ecfb56a6-c564-4bf6-f43f-2c95a103f4be" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 27.5 s, sys: 11.1 s, total: 38.5 s\n", "Wall time: 39.5 s\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "(1473, 1)" ] }, + "execution_count": 75, "metadata": {}, - "execution_count": 75 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])._nodes\n", + "out.shape" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])\n", - "print(out._nodes.shape, out._edges.shape)\n", - "del gg_gdf\n", - "del out" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2657,26 +2629,29 @@ "id": "Q6p3h6uCOABh", "outputId": "817fc80f-ef5d-4070-eb48-a12344be709c" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(1473, 1) (13375, 2)\n", "CPU times: user 4.57 s, sys: 2.11 s, total: 6.68 s\n", "Wall time: 7.63 s\n" ] } + ], + "source": [ + "%%time\n", + "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: 
cudf.DataFrame(g._edges))\n", + "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])\n", + "print(out._nodes.shape, out._edges.shape)\n", + "del gg_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])._nodes\n", - "out.shape" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2684,38 +2659,35 @@ "id": "6UdCcMdqLw-P", "outputId": "70742c79-b22b-4db2-c548-cb1e25d572eb" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 45.8 s, sys: 17 s, total: 1min 2s\n", "Wall time: 1min 5s\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "(44073, 1)" ] }, + "execution_count": 77, "metadata": {}, - "execution_count": 77 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])._nodes\n", + "out.shape" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])\n", - "print(out._nodes.shape, out._edges.shape)\n", - "del gg_gdf\n", - "del out" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2723,26 +2695,29 @@ "id": "QElqatDyNYCS", "outputId": "0e15bd3e-d2d9-4965-df7d-c8856d036680" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(44073, 1) (2069325, 2)\n", "CPU times: user 4.97 s, sys: 2.36 s, total: 7.34 s\n", "Wall time: 10.6 s\n" ] } + ], + "source": [ + "%%time\n", + "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "out = gg_gdf.chain([ n({'id': 
'116374117927631468606'}), e_forward(hops=2)])\n", + "print(out._nodes.shape, out._edges.shape)\n", + "del gg_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])._nodes\n", - "out.shape" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2750,38 +2725,35 @@ "id": "3HJOItZ4MQMG", "outputId": "f5be7bb4-7f09-4f80-c549-e703e99f5067" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 3min 45s, sys: 1min 5s, total: 4min 50s\n", "Wall time: 4min 52s\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "(102414, 1)" ] }, + "execution_count": 79, "metadata": {}, - "execution_count": 79 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])._nodes\n", + "out.shape" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])\n", - "print(out._nodes.shape, out._edges.shape)\n", - "del gg_gdf\n", - "del out" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2789,26 +2761,29 @@ "id": "G32t_xthOUle", "outputId": "7721741f-9c86-41aa-eb0b-2c8f0db2ed54" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(102414, 1) (24851333, 2)\n", "CPU times: user 6.95 s, sys: 2.63 s, total: 9.57 s\n", "Wall time: 9.84 s\n" ] } + ], + "source": [ + "%%time\n", + "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])\n", + 
"print(out._nodes.shape, out._edges.shape)\n", + "del gg_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n", - "print(out._nodes.shape, out._edges.shape)" - ], + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2816,29 +2791,26 @@ "id": "bXy2yyJsMsEG", "outputId": "911f2680-067c-44f2-9ba2-7f27d3c9bc6b" }, - "execution_count": 8, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105479, 1) (30450354, 2)\n", "CPU times: user 4min 36s, sys: 1min 25s, total: 6min 2s\n", "Wall time: 6min 4s\n" ] } + ], + "source": [ + "%%time\n", + "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n", + "print(out._nodes.shape, out._edges.shape)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n", - "print(out._nodes.shape, out._edges.shape)\n", - "del gg_gdf\n", - "del out" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2846,26 +2818,29 @@ "id": "Vt8hhjWDP_W_", "outputId": "824ae644-e1cf-4239-bda9-84aecde52ad8" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105479, 1) (30450354, 2)\n", "CPU times: user 7.44 s, sys: 2.45 s, total: 9.88 s\n", "Wall time: 9.9 s\n" ] } + ], + "source": [ + "%%time\n", + "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n", + "print(out._nodes.shape, out._edges.shape)\n", + "del gg_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "out = gg.chain([ n({'id': 
'116374117927631468606'}), e_forward(hops=5)])\n", - "print(out._nodes.shape, out._edges.shape)" - ], + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2873,61 +2848,56 @@ "id": "_z4KpNZaOH8t", "outputId": "2417f78b-e1b7-452d-8e26-7df259620c88" }, - "execution_count": 9, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105604, 1) (30468335, 2)\n", "CPU times: user 5min 36s, sys: 1min 39s, total: 7min 16s\n", "Wall time: 7min 15s\n" ] } + ], + "source": [ + "%%time\n", + "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n", + "print(out._nodes.shape, out._edges.shape)" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n", - "print(out._nodes.shape, out._edges.shape)\n", - "del gg_gdf\n", - "del out" - ], + "execution_count": null, "metadata": { - "id": "spUBH9EHSz2O", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "spUBH9EHSz2O", "outputId": "22340ce3-e8d4-4a72-b485-9839c667b965" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105604, 1) (30468335, 2)\n", "CPU times: user 8.82 s, sys: 2.71 s, total: 11.5 s\n", "Wall time: 11.9 s\n" ] } + ], + "source": [ + "%%time\n", + "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n", + "print(out._nodes.shape, out._edges.shape)\n", + "del gg_gdf\n", + "del out" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", - "for i in range(1):\n", - " g2 = gg.hop(\n", - " nodes=start_nodes,\n", - " direction='forward',\n", - " hops=1)\n", - 
"print(g2._nodes.shape, g2._edges.shape)" - ], + "execution_count": 50, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2935,35 +2905,31 @@ "id": "vCsdmc62A7OM", "outputId": "adc05d29-c628-49ed-cd6d-8921c6dcd206" }, - "execution_count": 50, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(1473, 1) (13375, 2)\n", "CPU times: user 19.9 s, sys: 9.36 s, total: 29.2 s\n", "Wall time: 41.8 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", - "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", + "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", "for i in range(1):\n", - " g2 = gg_gdf.hop(\n", + " g2 = gg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=1)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del gg_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2971,31 +2937,35 @@ "id": "J3kV8NBYBQdW", "outputId": "76073248-43e1-4c3c-c004-67324cc1d312" }, - "execution_count": 52, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(1473, 1) (13375, 2)\n", "CPU times: user 3.71 s, sys: 2.09 s, total: 5.8 s\n", "Wall time: 6.05 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", + "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", + "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", "for i in range(1):\n", - " g2 = gg.hop(\n", + " g2 = gg_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=2)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + " 
hops=1)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del gg_gdf\n", + "del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 53, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3003,35 +2973,31 @@ "id": "ONv1RQeWBeeK", "outputId": "58d57fa4-be72-45bc-abfa-5de9d1102f55" }, - "execution_count": 53, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(44073, 1) (2069325, 2)\n", "CPU times: user 27.8 s, sys: 13.2 s, total: 41 s\n", "Wall time: 43.9 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", - "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", + "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", "for i in range(1):\n", - " g2 = gg_gdf.hop(\n", + " g2 = gg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=2)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del gg_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3039,31 +3005,35 @@ "id": "ke5SZZ01BgqR", "outputId": "4173fd28-a11b-4300-d28b-6fdb87e8e9f3" }, - "execution_count": 54, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(44073, 1) (2069325, 2)\n", "CPU times: user 4.26 s, sys: 2.37 s, total: 6.63 s\n", "Wall time: 7.91 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", + "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", + "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", "for i in range(1):\n", - " g2 = gg.hop(\n", + " g2 = gg_gdf.hop(\n", " 
nodes=start_nodes,\n", " direction='forward',\n", - " hops=3)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + " hops=2)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del gg_gdf\n", + "del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 55, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3071,35 +3041,31 @@ "id": "U795pIBUBiZV", "outputId": "d499433c-cc0c-4bbf-c69f-36b5d55402d9" }, - "execution_count": 55, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(102414, 1) (24851333, 2)\n", "CPU times: user 1min 3s, sys: 22.7 s, total: 1min 26s\n", "Wall time: 1min 35s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", - "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", + "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", "for i in range(1):\n", - " g2 = gg_gdf.hop(\n", + " g2 = gg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=3)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del gg_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3107,31 +3073,35 @@ "id": "kIZYwSe1Bj2e", "outputId": "b7e1ed9f-47d1-412e-9593-ecc436ac1486" }, - "execution_count": 56, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(102414, 1) (24851333, 2)\n", "CPU times: user 3.96 s, sys: 2.11 s, total: 6.07 s\n", "Wall time: 6.05 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", + "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", + "gg_gdf = 
gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", "for i in range(1):\n", - " g2 = gg.hop(\n", + " g2 = gg_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=4)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + " hops=3)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del gg_gdf\n", + "del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 57, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3139,35 +3109,31 @@ "id": "YTI5sD6YBpYL", "outputId": "b37bf2df-07dc-404c-8a83-a83f28e38bf6" }, - "execution_count": 57, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105479, 1) (30450354, 2)\n", "CPU times: user 1min 34s, sys: 30.6 s, total: 2min 5s\n", "Wall time: 2min 5s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", - "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", + "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", "for i in range(1):\n", - " g2 = gg_gdf.hop(\n", + " g2 = gg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=4)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del gg_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3175,31 +3141,35 @@ "id": "d5WBazICBrSz", "outputId": "ef95e893-3a0f-4d47-ede4-bd8a6faebf98" }, - "execution_count": 58, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105479, 1) (30450354, 2)\n", "CPU times: user 5.25 s, sys: 2.41 s, total: 7.67 s\n", "Wall time: 7.69 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = pd.DataFrame({gg._node: 
['116374117927631468606']})\n", + "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", + "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", "for i in range(1):\n", - " g2 = gg.hop(\n", + " g2 = gg_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=5)\n", - "print(g2._nodes.shape, g2._edges.shape)" - ], + " hops=4)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del gg_gdf\n", + "del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 59, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3207,35 +3177,31 @@ "id": "ozQlRPaFBtPD", "outputId": "4f1655c4-38fd-47f9-942d-836585e0d866" }, - "execution_count": 59, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105604, 1) (30468335, 2)\n", "CPU times: user 2min 16s, sys: 39.1 s, total: 2min 55s\n", "Wall time: 2min 58s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", - "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", + "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n", "for i in range(1):\n", - " g2 = gg_gdf.hop(\n", + " g2 = gg.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=5)\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del gg_gdf\n", - "del g2" - ], + "print(g2._nodes.shape, g2._edges.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3243,35 +3209,46 @@ "id": "-ACkMG20B6HM", "outputId": "f26c03a9-9f25-4f93-c7d3-0e8676694040" }, - "execution_count": 60, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(105604, 1) (30468335, 2)\n", "CPU times: user 5.79 s, sys: 2.51 s, total: 8.3 s\n", "Wall time: 8.29 
s\n" ] } + ], + "source": [ + "%%time\n", + "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n", + "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n", + "for i in range(1):\n", + " g2 = gg_gdf.hop(\n", + " nodes=start_nodes,\n", + " direction='forward',\n", + " hops=5)\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del gg_gdf\n", + "del g2" ] }, { "cell_type": "markdown", + "metadata": { + "id": "R03M_swxarKC" + }, "source": [ - "### Orkut\n", + "## Orkut\n", "- 117M edges\n", "- 3M nodes" - ], - "metadata": { - "id": "R03M_swxarKC" - } + ] }, { "cell_type": "code", - "source": [ - "! wget https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz" - ], + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3279,11 +3256,10 @@ "id": "QoabYR2maxPo", "outputId": "2bb6275d-46bb-42da-ec05-d0e5a58b1f77" }, - "execution_count": 8, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2023-12-26 00:55:52-- https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz\n", "Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80\n", @@ -3298,24 +3274,25 @@ "\n" ] } + ], + "source": [ + "#! wget https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz" ] }, { "cell_type": "code", - "source": [ - "! gunzip com-orkut.ungraph.txt.gz" - ], + "execution_count": 9, "metadata": { "id": "BvvfFPKWbAVJ" }, - "execution_count": 9, - "outputs": [] + "outputs": [], + "source": [ + "#! gunzip com-orkut.ungraph.txt.gz" + ] }, { "cell_type": "code", - "source": [ - "! 
head -n 7 com-orkut.ungraph.txt" - ], + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3323,11 +3300,10 @@ "id": "YsWwRoPqbPIb", "outputId": "2eb4f862-b4e1-42bf-ff5d-eec10b27cedc" }, - "execution_count": 10, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "# Undirected graph: ../../data/output/orkut.txt\n", "# Orkut\n", @@ -3338,10 +3314,33 @@ "1\t4\n" ] } + ], + "source": [ + "#! head -n 7 com-orkut.ungraph.txt" ] }, { "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cbMC8r2ldjbW", + "outputId": "82688d53-7d56-4563-d65e-7c5cd32ac14e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('23.12.01', '0.32.0+12.g72e778c')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -3363,33 +3362,11 @@ "locale.getpreferredencoding = lambda: \"UTF-8\"\n", "\n", "cudf.__version__, graphistry.__version__" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cbMC8r2ldjbW", - "outputId": "82688d53-7d56-4563-d65e-7c5cd32ac14e" - }, - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "('23.12.01', '0.32.0+12.g72e778c')" - ] - }, - "metadata": {}, - "execution_count": 11 - } ] }, { "cell_type": "code", - "source": [ - "! 
nvidia-smi" - ], + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3397,11 +3374,10 @@ "id": "TopFxAvnh_Cv", "outputId": "cc9d9dc9-e594-4190-fe84-3f1b6dce8a1a" }, - "execution_count": 12, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:27 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -3424,18 +3400,14 @@ "+---------------------------------------------------------------------------------------+\n" ] } + ], + "source": [ + "#! nvidia-smi" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "co_df = cudf.read_csv('com-orkut.ungraph.txt', sep='\\t', names=['s', 'd'], skiprows=5).to_pandas()\n", - "print(co_df.shape)\n", - "print(co_df.head(5))\n", - "print(co_df.dtypes)\n", - "#del co_df" - ], + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3443,11 +3415,10 @@ "id": "Oczs87ITbJgw", "outputId": "ac203ddd-e684-4eb9-a586-f6a49fd1625d" }, - "execution_count": 13, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(117185082, 2)\n", " s d\n", @@ -3463,17 +3434,19 @@ "Wall time: 6.76 s\n" ] } + ], + "source": [ + "%%time\n", + "co_df = cudf.read_csv('com-orkut.ungraph.txt', sep='\\t', names=['s', 'd'], skiprows=5).to_pandas()\n", + "print(co_df.shape)\n", + "print(co_df.head(5))\n", + "print(co_df.dtypes)\n", + "#del co_df" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "co_g = graphistry.edges(cudf.DataFrame(co_df), 's', 'd').materialize_nodes(engine='cudf')\n", - "co_g = co_g.nodes(lambda g: g._nodes.to_pandas()).edges(lambda g: g._edges.to_pandas())\n", - "print(co_g._nodes.shape, co_g._edges.shape)\n", - "co_g._nodes.head(5)" - ], + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -3482,11 +3455,10 @@ "id": "gGSDjTtveFAT", "outputId": 
"e7b38f4f-dc07-4f35-9bab-9c80a80bbf0b" }, - "execution_count": 14, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "(3072441, 1) (117185082, 2)\n", "CPU times: user 1.96 s, sys: 2.95 s, total: 4.91 s\n", @@ -3494,16 +3466,7 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " id\n", - "0 1\n", - "1 2\n", - "2 3\n", - "3 4\n", - "4 5" - ], "text/html": [ "\n", "
\n", @@ -3760,18 +3723,32 @@ "
\n", " \n", " \n" + ], + "text/plain": [ + " id\n", + "0 1\n", + "1 2\n", + "2 3\n", + "3 4\n", + "4 5" ] }, + "execution_count": 14, "metadata": {}, - "execution_count": 14 + "output_type": "execute_result" } + ], + "source": [ + "%%time\n", + "co_g = graphistry.edges(cudf.DataFrame(co_df), 's', 'd').materialize_nodes(engine='cudf')\n", + "co_g = co_g.nodes(lambda g: g._nodes.to_pandas()).edges(lambda g: g._edges.to_pandas())\n", + "print(co_g._nodes.shape, co_g._edges.shape)\n", + "co_g._nodes.head(5)" ] }, { "cell_type": "code", - "source": [ - "! nvidia-smi" - ], + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3779,11 +3756,10 @@ "id": "V5qL8K7-dqIZ", "outputId": "e08319fc-74d3-4f33-df0f-f98950dc8c99" }, - "execution_count": 15, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:39 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -3806,18 +3782,14 @@ "+---------------------------------------------------------------------------------------+\n" ] } + ], + "source": [ + "#! 
nvidia-smi" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "# crashes\n", - "if False:\n", - " out = co_g.chain([ n({'id': 1}), e_forward(hops=1)])._nodes\n", - " print(out.shape)\n", - " del out" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3825,31 +3797,28 @@ "id": "hCbxZ8UmhRLp", "outputId": "519aed6c-733d-41f4-d462-e57f5e32b131" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "CPU times: user 4 µs, sys: 1 µs, total: 5 µs\n", "Wall time: 47.7 µs\n" ] } + ], + "source": [ + "%%time\n", + "# crashes\n", + "if False:\n", + " out = co_g.chain([ n({'id': 1}), e_forward(hops=1)])._nodes\n", + " print(out.shape)\n", + " del out" ] }, { - "cell_type": "code", - "source": [ - "%%time\n", - "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", - "for i in range(10):\n", - " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=1)])\n", - "! nvidia-smi\n", - "print(out._nodes.shape, out._edges.shape)\n", - "del co_gdf\n", - "del out" - ], + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3857,11 +3826,10 @@ "id": "Q682scC_eC-S", "outputId": "7ff5f829-0de7-4a6c-a77d-e2857896a8a5" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mon Dec 25 06:23:46 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -3906,21 +3874,22 @@ "Wall time: 4.42 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! nvidia-smi\n", "for i in range(10):\n", - " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=2)])\n", - "! 
nvidia-smi\n", + " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=1)])\n", + "#! nvidia-smi\n", "print(out._nodes.shape, out._edges.shape)\n", "del co_gdf\n", "del out" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3928,11 +3897,10 @@ "id": "i0AXhfqVbVsm", "outputId": "8271f469-a73f-48e3-e1a9-3077026ab8ec" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mon Dec 25 06:24:52 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -3977,21 +3945,22 @@ "Wall time: 6.13 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! nvidia-smi\n", "for i in range(10):\n", - " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=3)])\n", - "! nvidia-smi\n", + " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=2)])\n", + "#! nvidia-smi\n", "print(out._nodes.shape, out._edges.shape)\n", "del co_gdf\n", "del out" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -3999,11 +3968,10 @@ "id": "Hid0-iPKhpOd", "outputId": "ecaeb534-d4d7-48fa-d4e1-c80b22626afe" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mon Dec 25 06:25:25 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4048,21 +4016,22 @@ "Wall time: 6.37 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! 
nvidia-smi\n", "for i in range(10):\n", - " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=4)])\n", - "! nvidia-smi\n", + " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=3)])\n", + "#! nvidia-smi\n", "print(out._nodes.shape, out._edges.shape)\n", "del co_gdf\n", "del out" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4070,11 +4039,10 @@ "id": "buutj-ZjhrEe", "outputId": "ae11addd-6bea-44e9-81c0-b431e1db8089" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mon Dec 25 06:26:04 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4119,21 +4087,22 @@ "Wall time: 9.84 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! nvidia-smi\n", "for i in range(10):\n", - " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=5)])\n", - "! nvidia-smi\n", + " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=4)])\n", + "#! nvidia-smi\n", "print(out._nodes.shape, out._edges.shape)\n", "del co_gdf\n", "del out" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4141,11 +4110,10 @@ "id": "bK4C9Ly0hso-", "outputId": "8a9a32ab-03e2-42b4-8b71-2bcf797b31b1" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mon Dec 25 06:27:18 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4190,10 +4158,26 @@ "Wall time: 39.2 s\n" ] } + ], + "source": [ + "%%time\n", + "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "#! 
nvidia-smi\n", + "for i in range(10):\n", + " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=5)])\n", + "#! nvidia-smi\n", + "print(out._nodes.shape, out._edges.shape)\n", + "del co_gdf\n", + "del out" ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qrga-la0hwhh" + }, + "outputs": [], "source": [ "%%time\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", @@ -4201,18 +4185,11 @@ "print(out.shape)\n", "del co_gdf\n", "del out" - ], - "metadata": { - "id": "qrga-la0hwhh" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "!lscpu\n" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4220,11 +4197,10 @@ "id": "eiXFImxF-rzw", "outputId": "b807cc3d-ed1a-4bef-c6e0-bfc2df7356ff" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Architecture: x86_64\n", " CPU op-mode(s): 32-bit, 64-bit\n", @@ -4276,13 +4252,14 @@ " Tsx async abort: Vulnerable\n" ] } + ], + "source": [ + "#!lscpu\n" ] }, { "cell_type": "code", - "source": [ - "!free -h\n" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4290,36 +4267,24 @@ "id": "wJohLi58-sN5", "outputId": "c3e144f6-c19a-4c68-e867-f5e7fa2e9df4" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ " total used free shared buff/cache available\n", "Mem: 12Gi 717Mi 8.0Gi 1.0Mi 3.9Gi 11Gi\n", "Swap: 0B 0B 0B\n" ] } + ], + "source": [ + "#!free -h\n" ] }, { "cell_type": "code", - "source": [ - "%%time\n", - "start_nodes = pd.DataFrame({'id': [1]})\n", - "! nvidia-smi\n", - "for i in range(1):\n", - " g2 = co_g.hop(\n", - " nodes=start_nodes,\n", - " direction='forward',\n", - " hops=1)\n", - "! 
nvidia-smi\n", - "print(g2._nodes.shape, g2._edges.shape)\n", - "#del start_nodes\n", - "#del co_gdf\n", - "#del g2" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4327,11 +4292,10 @@ "id": "zak4Inhco5il", "outputId": "30bcf2bc-853e-4e5e-8c57-ba0cd9429554" }, - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 01:01:43 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4354,38 +4318,37 @@ "+---------------------------------------------------------------------------------------+\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", - "start_nodes = cudf.DataFrame({'id': [1]})\n", - "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", - "for i in range(10):\n", - " g2 = co_gdf.hop(\n", + "start_nodes = pd.DataFrame({'id': [1]})\n", + "#! nvidia-smi\n", + "for i in range(1):\n", + " g2 = co_g.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", " hops=1)\n", - "! nvidia-smi\n", + "#! 
nvidia-smi\n", "print(g2._nodes.shape, g2._edges.shape)\n", - "del start_nodes\n", - "del co_gdf\n", - "del g2" - ], + "#del start_nodes\n", + "#del co_gdf\n", + "#del g2" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": { - "id": "-SmFlCBS_Bgx", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "-SmFlCBS_Bgx", "outputId": "d2326cf7-3ea6-4f99-9548-f2e98ece59a4" }, - "execution_count": 16, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:45 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4430,26 +4393,27 @@ "Wall time: 1.84 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "start_nodes = cudf.DataFrame({'id': [1]})\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! nvidia-smi\n", "for i in range(10):\n", " g2 = co_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=2)\n", - "! nvidia-smi\n", + " hops=1)\n", + "#! nvidia-smi\n", "print(g2._nodes.shape, g2._edges.shape)\n", "del start_nodes\n", "del co_gdf\n", "del g2" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4457,11 +4421,10 @@ "id": "fjjt3YnYnabv", "outputId": "05762f50-bfe1-4d23-9153-31431418c8e5" }, - "execution_count": 17, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:47 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4506,26 +4469,27 @@ "Wall time: 2.51 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "start_nodes = cudf.DataFrame({'id': [1]})\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! 
nvidia-smi\n", "for i in range(10):\n", " g2 = co_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=3)\n", - "! nvidia-smi\n", + " hops=2)\n", + "#! nvidia-smi\n", "print(g2._nodes.shape, g2._edges.shape)\n", "del start_nodes\n", "del co_gdf\n", "del g2" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4533,11 +4497,10 @@ "id": "oIouuORgnbcY", "outputId": "f07abe4c-5137-4ee3-935a-afbb2c5eaa1e" }, - "execution_count": 18, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:50 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4582,26 +4545,27 @@ "Wall time: 3.25 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "start_nodes = cudf.DataFrame({'id': [1]})\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! nvidia-smi\n", "for i in range(10):\n", " g2 = co_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=4)\n", - "! nvidia-smi\n", + " hops=3)\n", + "#! 
nvidia-smi\n", "print(g2._nodes.shape, g2._edges.shape)\n", "del start_nodes\n", "del co_gdf\n", "del g2" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4609,11 +4573,10 @@ "id": "oNLZGjwInc85", "outputId": "534097cf-4022-48cc-9419-a00c135f69e1" }, - "execution_count": 19, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:53 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4658,26 +4621,27 @@ "Wall time: 5.02 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "start_nodes = cudf.DataFrame({'id': [1]})\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! nvidia-smi\n", "for i in range(10):\n", " g2 = co_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=5)\n", - "! nvidia-smi\n", + " hops=4)\n", + "#! nvidia-smi\n", "print(g2._nodes.shape, g2._edges.shape)\n", "del start_nodes\n", "del co_gdf\n", "del g2" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4685,11 +4649,10 @@ "id": "ePqaeujMneX8", "outputId": "ffd88fff-016e-4ac0-ecb9-fa06baca60f8" }, - "execution_count": 20, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:56:58 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4734,26 +4697,27 @@ "Wall time: 12 s\n" ] } - ] - }, - { - "cell_type": "code", + ], "source": [ "%%time\n", "start_nodes = cudf.DataFrame({'id': [1]})\n", "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", - "! nvidia-smi\n", + "#! 
nvidia-smi\n", "for i in range(10):\n", " g2 = co_gdf.hop(\n", " nodes=start_nodes,\n", " direction='forward',\n", - " hops=6)\n", - "! nvidia-smi\n", + " hops=5)\n", + "#! nvidia-smi\n", "print(g2._nodes.shape, g2._edges.shape)\n", "del start_nodes\n", "del co_gdf\n", "del g2" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -4761,11 +4725,10 @@ "id": "PTBkoIVHnfzK", "outputId": "5615ecd7-47ea-46ab-fd36-13bce4b3c787" }, - "execution_count": 21, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Tue Dec 26 00:57:10 2023 \n", "+---------------------------------------------------------------------------------------+\n", @@ -4810,16 +4773,48 @@ "Wall time: 28.2 s\n" ] } + ], + "source": [ + "%%time\n", + "start_nodes = cudf.DataFrame({'id': [1]})\n", + "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n", + "#! nvidia-smi\n", + "for i in range(10):\n", + " g2 = co_gdf.hop(\n", + " nodes=start_nodes,\n", + " direction='forward',\n", + " hops=6)\n", + "#! 
nvidia-smi\n", + "print(g2._nodes.shape, g2._edges.shape)\n", + "del start_nodes\n", + "del co_gdf\n", + "del g2" ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "Ygc2nrkznlCu" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb b/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb index e79c81cc6..3cb5b0dd6 100644 --- a/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb +++ b/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb @@ -12,10 +12,8 @@ "* Unsupervised graph neural network: RGCN\n", "* Runs on both CPU + GPU: Toggle `is_gpu`\n", "\n", - "See also:\n", - "* Other pygraphistry[ai] gnn notebooks for more advanced modes like incorporating node features\n", - "* Intro to RGCNs - [intro-story.ipynb](intro-story.md)\n", - "* In-depth RGCN - [advanced-identity-protection-40m.ipynb](advanced-identity-protection-40m.ipynb)\n" + "For background, see the RGCN intro: [intro-story.ipynb](../../../talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/advanced-identity-protection-40m.ipynb)\n", + "\n" ] }, { @@ -353,189 +351,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "dbf488b3-2a98-4c19-aa4f-4aef63943412", - "metadata": { - "execution": { - "iopub.execute_input": "2022-12-02T20:52:04.712354Z", - "iopub.status.busy": "2022-12-02T20:52:04.712254Z", - "iopub.status.idle": "2022-12-02T20:52:34.396563Z", - "shell.execute_reply": "2022-12-02T20:52:34.396305Z", -
"shell.execute_reply.started": "2022-12-02T20:52:04.712343Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Preprocessing embedding data\n", - "--Splitting data\n", - "--num_nodes: 97, num_relationships: 20\n", - "Training embedding\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "epoch: 1, loss: 0.4165, score: 0.0000%: 0%| | 0/10 [00:03\n", @@ -487,55 +445,57 @@ "\n", " \n", " \n" + ], + "text/plain": [ + " from to weight\n", + "11112 RepBobbyRush janschakowsky 0.034364\n", + "3836 RepCori Ilhan 0.015936\n", + "5282 RepTedDeutch RepDWStweets 0.003268\n", + "12352 BennieGThompson RepStricklandWA 0.006849\n", + "9358 RepCarolMiller RepTroyNehls 0.005291" ] }, + "execution_count": 40, "metadata": {}, - "execution_count": 40 + "output_type": "execute_result" } + ], + "source": [ + "import json\n", + "\n", + "with open('congress_network/congress_network_data.json', 'r') as file:\n", + " data = json.load(file)\n", + "\n", + "edges = []\n", + "for i, name in enumerate(data[0]['usernameList']):\n", + " for ii, j in enumerate(data[0]['outList'][i]):\n", + " edges.append({\n", + " 'from': name,\n", + " 'to': data[0]['usernameList'][j],\n", + " 'weight': data[0]['outWeight'][i][ii]\n", + " })\n", + "edges_df = pd.DataFrame(edges)\n", + "\n", + "print(edges_df.shape)\n", + "edges_df.sample(5)" ] }, { "cell_type": "markdown", + "metadata": { + "id": "XLFTgDTEDSeA" + }, "source": [ - "## Load dataframe as a PyGraphistry graph\n", + "### Load dataframe as a PyGraphistry graph\n", "\n", "Turn into a graph and precompute some useful graph metrics\n", "\n", "Recall that a `g` object, underneath, is essentially just two dataframes, `g._edges` and `g._nodes`, and with many useful graph methods:" - ], - "metadata": { - "id": "XLFTgDTEDSeA" - } + ] }, { "cell_type": "code", - "source": [ - "# Shape\n", - "g = graphistry.edges(edges_df, 'from', 'to')\n", - "\n", - "# Enrich & style\n", - "# Tip: Switch 
from compute_igraph to compute_cugraph when GPUs are available\n", - "g2 = (g\n", - " .materialize_nodes()\n", - " .nodes(lambda g: g._nodes.assign(title=g._nodes.id))\n", - " .edges(lambda g: g._edges.assign(weight2=g._edges.weight))\n", - " .bind(point_title='title')\n", - " .compute_igraph('community_infomap')\n", - " .compute_igraph('pagerank')\n", - " .get_degrees()\n", - " .encode_point_color(\n", - " 'community_infomap',\n", - " as_categorical=True,\n", - " categorical_mapping={\n", - " 0: '#32a9a2', # vibrant teal\n", - " 1: '#ff6b6b', # soft coral\n", - " 2: '#f9d342', # muted yellow\n", - " }\n", - " )\n", - ")\n", - "\n", - "g2._nodes" - ], + "execution_count": 77, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -544,47 +504,16 @@ "id": "aB1U7e0HXmHh", "outputId": "53b9fa91-0caf-4866-c5a9-d9cf80e3c9ac" }, - "execution_count": 77, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "WARNING:root:edge index g._edge not set so using edge index as ID; set g._edge via g.edges(), or change merge_if_existing to FalseWARNING:root:edge index g._edge __edge_index__ missing as attribute in ig; using ig edge order for IDsWARNING:root:edge index g._edge not set so using edge index as ID; set g._edge via g.edges(), or change merge_if_existing to FalseWARNING:root:edge index g._edge __edge_index__ missing as attribute in ig; using ig edge order for IDs" ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " id title community_infomap pagerank degree_in \\\n", - "0 SenatorBaldwin SenatorBaldwin 0 0.001422 26 \n", - "1 SenJohnBarrasso SenJohnBarrasso 0 0.001179 22 \n", - "2 SenatorBennet SenatorBennet 0 0.001995 33 \n", - "3 MarshaBlackburn MarshaBlackburn 0 0.001331 18 \n", - "4 SenBlumenthal SenBlumenthal 0 0.001672 30 \n", - ".. ... ... ... ... ... 
\n", - "470 RepJoeWilson RepJoeWilson 1 0.001780 21 \n", - "471 RobWittman RobWittman 1 0.001017 13 \n", - "472 rep_stevewomack rep_stevewomack 1 0.002637 35 \n", - "473 RepJohnYarmuth RepJohnYarmuth 2 0.000555 5 \n", - "474 RepLeeZeldin RepLeeZeldin 1 0.000511 3 \n", - "\n", - " degree_out degree \n", - "0 20 46 \n", - "1 19 41 \n", - "2 22 55 \n", - "3 38 56 \n", - "4 35 65 \n", - ".. ... ... \n", - "470 38 59 \n", - "471 19 32 \n", - "472 19 54 \n", - "473 20 25 \n", - "474 25 28 \n", - "\n", - "[475 rows x 7 columns]" - ], "text/html": [ "\n", "
\n", @@ -938,18 +867,73 @@ "
\n", " \n", " \n" + ], + "text/plain": [ + " id title community_infomap pagerank degree_in \\\n", + "0 SenatorBaldwin SenatorBaldwin 0 0.001422 26 \n", + "1 SenJohnBarrasso SenJohnBarrasso 0 0.001179 22 \n", + "2 SenatorBennet SenatorBennet 0 0.001995 33 \n", + "3 MarshaBlackburn MarshaBlackburn 0 0.001331 18 \n", + "4 SenBlumenthal SenBlumenthal 0 0.001672 30 \n", + ".. ... ... ... ... ... \n", + "470 RepJoeWilson RepJoeWilson 1 0.001780 21 \n", + "471 RobWittman RobWittman 1 0.001017 13 \n", + "472 rep_stevewomack rep_stevewomack 1 0.002637 35 \n", + "473 RepJohnYarmuth RepJohnYarmuth 2 0.000555 5 \n", + "474 RepLeeZeldin RepLeeZeldin 1 0.000511 3 \n", + "\n", + " degree_out degree \n", + "0 20 46 \n", + "1 19 41 \n", + "2 22 55 \n", + "3 38 56 \n", + "4 35 65 \n", + ".. ... ... \n", + "470 38 59 \n", + "471 19 32 \n", + "472 19 54 \n", + "473 20 25 \n", + "474 25 28 \n", + "\n", + "[475 rows x 7 columns]" ] }, + "execution_count": 77, "metadata": {}, - "execution_count": 77 + "output_type": "execute_result" } + ], + "source": [ + "# Shape\n", + "g = graphistry.edges(edges_df, 'from', 'to')\n", + "\n", + "# Enrich & style\n", + "# Tip: Switch from compute_igraph to compute_cugraph when GPUs are available\n", + "g2 = (g\n", + " .materialize_nodes()\n", + " .nodes(lambda g: g._nodes.assign(title=g._nodes.id))\n", + " .edges(lambda g: g._edges.assign(weight2=g._edges.weight))\n", + " .bind(point_title='title')\n", + " .compute_igraph('community_infomap')\n", + " .compute_igraph('pagerank')\n", + " .get_degrees()\n", + " .encode_point_color(\n", + " 'community_infomap',\n", + " as_categorical=True,\n", + " categorical_mapping={\n", + " 0: '#32a9a2', # vibrant teal\n", + " 1: '#ff6b6b', # soft coral\n", + " 2: '#f9d342', # muted yellow\n", + " }\n", + " )\n", + ")\n", + "\n", + "g2._nodes" ] }, { "cell_type": "code", - "source": [ - "g2.plot()" - ], + "execution_count": 79, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -958,14 +942,9 @@ "id": 
"GY9Q7KyqBMq8", "outputId": "5b4b277e-17fd-4201-9518-25168b927c6f" }, - "execution_count": 79, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", " ') + ``` + +## Layout + +### Hierarchical layouts: Tree and radial + +A hierarchical view via horizontal or vertical trees, or radial. Graph data may also be presented using these layouts. + +#### Tree + +Tip: Also try `g.layout_graphviz("dot")` and `"circo"` + +```python +g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'b'], 'd': ['b', 'c', 'd']})) + +g2a = g.tree_layout() +g2b = g2.tree_layout(allow_cycles=False, remove_self_loops=False, vertical=False) +g2c = g2.tree_layout(ascending=False, level_align='center') +g2d = g2.tree_layout(level_sort_values_by=['type', 'degree'], level_sort_values_by_ascending=False) + +g3a = g2a.layout_settings(locked_r=True, play=1000) +g3b = g2a.layout_settings(locked_y=True, play=0) +g3c = g2a.layout_settings(locked_x=True) + +g4 = g2.tree_layout().rotate(90) +``` + +To use with non-tree data, e.g., graphs with cycles, we recommend computing a tree such as via a minimum spanning tree, and then using that achieved layout with this algorithm. Alternatively, the radial layouts may more naturally support your graph. + +#### Radial + +A hierarchical view via radial rings that may be more space-efficient and aesthetic than the equivalent tree layout + +Supports time-based, continuous, and categorical modes: + +##### Time-based + +Use when the value column defining the ring order is a time column.
See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_time_ring.ipynb) + +```python +g.time_ring_layout().plot() # finds a time column and infers all settings + +g.time_ring_layout( + time_col='my_node_time_col', + num_rings=20, + time_start=np.datetime64('2014-01-22'), + time_end=np.datetime64('2015-01-22'), + time_unit= 'Y', # s, m, h, D, W, M, Y, C + min_r=100.0, # smallest ring radius + max_r=1000.0, # biggest ring radius + reverse=False, + #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None, + #format_label: Optional[Callable[[np.datetime64, int, np.timedelta64], str]] = None, + #play_ms: int = 2000, + #engine='auto' # 'auto', 'pandas', 'cudf' +).plot() +``` + +##### Continuous + +Use when the value column defining the ring order is a continuous number, like distance or amount. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_continuous_ring.ipynb) + +```python +g.ring_continuous_layout() # finds a numeric column and infers all settings + +g.ring_continuous_layout( + ring_col='my_numeric_col', + #v_start= # first ring at this value + #v_end= # last ring at this value + #v_step= # distance between rings in the value domain + min_r=100.0, # smallest ring radius + max_r=1000.0, # biggest ring radius + normalize_ring_col=True, # remap [v_start,v_end] to [min_r,max_r] + num_rings=20, + ring_step=100, + + #Control axis labels and styles + #axis: Optional[Union[Dict[float,str],List[str]]] = None, + #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None, + #format_labels: Optional[Callable[[float, int, float], str]] = None, + + reverse=False, + play_ms=0, + #engine='auto', # 'auto', 'pandas', 'cudf' +) +``` + +##### Categorical + +Use when the value column defining the ring order is a categorical value, such as a name or ID.
See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_categorical_ring.ipynb) + +```python +g.ring_categorical_layout('my_categorical_col') # infers all settings + +g.ring_categorical_layout( + ring_col='my_numeric_col', + order=['col1', 'my_col2'], + drop_empty=True, # remove unpopulated rings + combine_unhandled=False, # Put values not covered by order into one ring Other vs a ring per unique value + append_unhandled=True, # Append vs prepend + min_r=100.0, # smallest ring radius + max_r=1000.0, # biggest ring radius + + #Control axis labels and styles + #axis: Optional[Dict[Any,str]] = None, + #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None, + #format_labels: Optional[Callable[[Any, int, float], str]] = None, + + reverse=False, + play_ms=0, + #engine='auto', # 'auto', 'pandas', 'cudf' +) +``` + +### Modularity weighted + +Weight edges by community membership to emphasize community structure. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_modularity_weighted.ipynb) + +```python +g.modularity_weighted_layout().plot() +g.modularity_weighted_layout('my_community_col').plot() +g.modularity_weighted_layout( + community_alg='louvain', + engine='cudf', + same_community_weight=2.0, + cross_community_weight=0.3, + edge_influence=2.0 +).plot() +``` + + +### Group-in-a-box + +[Group-in-a-box layout](https://ieeexplore.ieee.org/document/6113135) with igraph/pandas and cugraph/cudf implementations: + +```python +g.group_in_a_box_layout().plot() +g.group_in_a_box_layout( + partition_alg='ecg', # see igraph/cugraph algs + #partition_key='some_col', # use existing col + #layout_alg='circle', # see igraph/cugraph algs + #x, y, w, h + #encode_colors=False, + #colors=['#FFF', '#FF0', ...] 
+ engine='cudf' +).plot() +``` + +## Compute + +### Transforms + +The below methods let you quickly manipulate graphs directly and with dataframe methods: Search, pattern mine, transform, and more: + +```python +from graphistry import n, e_forward, e_reverse, e_undirected, is_in +g = (graphistry + .edges(pd.DataFrame({ + 's': ['a', 'b'], + 'd': ['b', 'c'], + 'k1': ['x', 'y'] + })) + .nodes(pd.DataFrame({ + 'n': ['a', 'b', 'c'], + 'k2': [0, 2, 4, 6] + }) +) + +g2 = graphistry.hypergraph(g._edges, ['s', 'd', 'k1'])['graph'] +g2.plot() # nodes are values from cols s, d, k1 + +(g + .materialize_nodes() + .get_degrees() + .get_indegrees() + .get_outdegrees() + .pipe(lambda g2: g2.nodes(g2._nodes.assign(t=x))) # transform + .filter_edges_by_dict({"k1": "x"}) + .filter_nodes_by_dict({"k2": 4}) + .prune_self_edges() + .hop( # filter to subgraph + #almost all optional + direction='forward', # 'reverse', 'undirected' + hops=2, # number (1..n hops, inclusive) or None if to_fixed_point + to_fixed_point=False, + + #every edge source node must match these + source_node_match={"k2": 0, "k3": is_in(['a', 'b', 3, 4])}, + source_node_query='k2 == 0', + + #every edge must match these + edge_match={"k1": "x"}, + edge_query='k1 == "x"', + + #every edge destination node must match these + destination_node_match={"k2": 2}, + destination_node_query='k2 == 2 or k2 == 4', + ) + .chain([ # filter to subgraph with Cypher-style GFQL + n(), + n({'k2': 0, "m": 'ok'}), #specific values + n({'type': is_in(["type1", "type2"])}), #multiple valid values + n(query='k2 == 0 or k2 == 4'), #dataframe query + n(name="start"), # add column 'start':bool + e_forward({'k1': 'x'}, hops=1), # same API as hop() + e_undirected(name='second_edge'), + e_reverse( + {'k1': 'x'}, # edge property match + hops=2, # 1 to 2 hops + #same API as hop() + source_node_match={"k2": 2}, + source_node_query='k2 == 2 or k2 == 4', + edge_match={"k1": "x"}, + edge_query='k1 == "x"', + destination_node_match={"k2": 0}, + 
destination_node_query='k2 == 0') + ]) + # replace as one node the node w/ given id + transitively connected nodes w/ col=attr + .collapse(node='some_id', column='some_col', attribute='some val') +``` + +Both `hop()` and `chain()` (GFQL) match dictionary expressions support dataframe series *predicates*. The above examples show `is_in([x, y, z, ...])`. Additional predicates include: + +* categorical: is_in, duplicated +* temporal: is_month_start, is_month_end, is_quarter_start, is_quarter_end, is_year_start, is_year_end +* numeric: gt, lt, ge, le, eq, ne, between, isna, notna +* string: contains, startswith, endswith, match, isnumeric, isalpha, isdigit, islower, isupper, isspace, isalnum, isdecimal, istitle, isnull, notnull + +Both `hop()` and `chain()` will run on GPUs when passing in RAPIDS dataframes. Specify parameter `engine='cudf'` to be sure. + +### Table to graph + +```python +df = pd.read_csv('events.csv') +hg = graphistry.hypergraph(df, ['user', 'email', 'org'], direct=True) +g = hg['graph'] # g._edges: | src, dst, user, email, org, time, ... | +g.plot() +``` + +```python +hg = graphistry.hypergraph( + df, + ['from_user', 'to_user', 'email', 'org'], + direct=True, + opts={ + + # when direct=True, can define src -> [ dst1, dst2, ...] 
edges + 'EDGES': { + 'org': ['from_user'], # org->from_user + 'from_user': ['email', 'to_user'], #from_user->email, from_user->to_user + }, + + 'CATEGORIES': { + # determine which columns share the same namespace for node generation: + # - if user 'louie' is both a from_user and to_user, show as 1 node + # - if a user & org are both named 'louie', they will appear as 2 different nodes + 'user': ['from_user', 'to_user'] + } +}) +g = hg['graph'] +g.plot() +``` + +### Generate node table + +```python +g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) +g2 = g.materialize_nodes() +g2._nodes # pd.DataFrame({'id': ['a', 'b', 'c']}) +``` + +### Compute degrees + +```python +g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) +g2 = g.get_degrees() +g2._nodes # pd.DataFrame({ + # 'id': ['a', 'b', 'c'], + # 'degree_in': [0, 1, 1], + # 'degree_out': [1, 1, 0], + # 'degree': [1, 1, 1] + #}) +``` + +See also `get_indegrees()` and `get_outdegrees()` + + +### Graph pattern matching + +PyGraphistry supports GFQL, its PyData-native variant of the popular Cypher graph query language, meaning you can do graph pattern matching directly from Pandas dataframes without installing a database or Java + +See also [graph pattern matching tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb) and the CPU/GPU [benchmark](https://github.com/graphistry/pygraphistry/tree/master/demos/gfql/benchmark_hops_cpu_gpu.ipynb) + +Traverse within a graph, or expand one graph against another + +Simple node and edge filtering via `filter_edges_by_dict()` and `filter_nodes_by_dict()`: + +```python +g = graphistry.edges(pd.read_csv('data.csv'), 's', 'd') +g2 = g.materialize_nodes() + +g3 = g.filter_edges_by_dict({"v": 1, "b": True}) +g4 = g.filter_nodes_by_dict({"v2": 1, "b2": True}) +``` + +Method `.hop()` enables slightly more complicated edge filters: + +```python + +from 
graphistry import is_in, gt + +# (a)-[{"v": 1, "type": "z"}]->(b) based on g +g2b = g2.hop( + source_node_match={g2._node: "a"}, + edge_match={"v": 1, "type": "z"}, + destination_node_match={g2._node: "b"}) +g2b = g2.hop( + source_node_query='n == "a"', + edge_query='v == 1 and type == "z"', + destination_node_query='n == "b"') + +# (a {x in [1,2] and y > 3})-[e]->(b) based on g +g2c = g2.hop( + source_node_match={ + g2._node: "a", + "x": is_in([1,2]), + "y": gt(3) + }, + destination_node_match={g2._node: "b"} +) + +# (a or b)-[1 to 8 hops]->(anynode), based on graph g2 +g3 = g2.hop(pd.DataFrame({g2._node: ['a', 'b']}), hops=8) + +# (a or b)-[1 to 8 hops]->(anynode), based on graph g2 +g3 = g2.hop(pd.DataFrame({g2._node: is_in(['a', 'b'])}), hops=8) + +# (c)<-[any number of hops]-(any node), based on graph g3 +# Note multihop matches check source/destination/edge match/query predicates +# against every encountered edge for it to be included +g4 = g3.hop(source_node_match={"node": "c"}, direction='reverse', to_fixed_point=True) + +# (c)-[incoming or outgoing edge]-(any node), +# for c in g4 with expansions against nodes/edges in g2 +g5 = g2.hop(pd.DataFrame({g4._node: g4[g4._node]}), hops=1, direction='undirected') + +g5.plot() +``` + +Rich compound patterns are enabled via `.chain()`: + +```python +from graphistry import n, e_forward, e_reverse, e_undirected, is_in + +g2.chain([ n() ]) +g2.chain([ n({"x": 1, "y": True}) ]), +g2.chain([ n(query='x == 1 and y == True') ]), +g2.chain([ n({"z": is_in([1,2,4,'z'])}) ]), # multiple valid values +g2.chain([ e_forward({"type": "x"}, hops=2) ]) # simple multi-hop +g3 = g2.chain([ + n(name="start"), # tag node matches + e_forward(hops=3), + e_forward(name="final_edge"), # tag edge matches + n(name="end") +]) +g2.chain([n(), e_forward(), n(), e_reverse(), n()]) # rich shapes +print('# end nodes: ', len(g3._nodes[ g3._nodes.end ])) +print('# end edges: ', len(g3._edges[ g3._edges.final_edge ])) +``` + +See table above for more
predicates like `is_in()` and `gt()` + +Queries can be serialized and deserialized, such as for saving and remote execution: + +```python +from graphistry.compute.chain import Chain + +pattern = Chain([n(), e(), n()]) +pattern_json = pattern.to_json() +pattern2 = Chain.from_json(pattern_json) +g.chain(pattern2).plot() +``` + +Benefit from automatic GPU acceleration by passing in GPU dataframes: + +```python +import cudf + +g1 = graphistry.edges(cudf.read_csv('data.csv'), 's', 'd') +g2 = g1.chain(..., engine='cudf') +``` + +The parameter `engine` is optional, defaulting to `'auto'`. + +### Pipelining + +```python +def capitalize(df, col): + df2 = df.copy() + df2[col] = df[col].str.capitalize() + return df2 + +g + .cypher('MATCH (a)-[e]->(b) RETURN a, e, b') + .nodes(lambda g: capitalize(g._nodes, 'nTitle')) + .edges(capitalize, None, None, 'eTitle') + .pipe(lambda g: g.nodes(g._nodes.pipe(capitalize, 'nTitle'))) +``` + +### Removing nodes + +```python +g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'c'], 'd': ['b', 'c', 'a']})) +g2 = g.drop_nodes(['c']) # drops node c, edge c->a, edge b->c, +``` + +### Keeping nodes + +```python +# keep nodes [a,b,c] and edges [(a,b),(b,c)] +g2 = g.keep_nodes(['a', 'b', 'c']) +g2 = g.keep_nodes(pd.Series(['a', 'b', 'c'])) +g2 = g.keep_nodes(cudf.Series(['a', 'b', 'c'])) +``` + +### Collapsing adjacent nodes with specific k=v matches + +One col/val pair: + +```python +g2 = g.collapse( + node='root_node_id', # rooted traversal beginning + column='some_col', # column to inspect + attribute='some val' # value match to collapse on if hit +) +assert len(g2._nodes) <= len(g._nodes) +``` + +Collapse for all possible vals in a column, and assuming a stable root node id: + +```python +g3 = g +for v in g._nodes['some_col'].unique(): + g3 = g3.collapse(node='root_node_id', column='some_col', attribute=v) +``` + +## Graph AI in a single line of code + +Graph autoML features including: + +### Generate features from raw data + +Automatically and
intelligently transform text, numbers, booleans, and other formats to AI-ready representations: + +* Featurization + + ```python + g = graphistry.nodes(df).featurize(kind='nodes', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...) + + print('X', g._node_features) + print('y', g._node_target) + ``` + +* Set `kind='edges'` to featurize edges: + + ```python + g = graphistry.edges(df, src, dst).featurize(kind='edges', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...) + ``` + +* Use generated features with both Graphistry and external libraries: + + ```python + # graphistry + g = g.umap() # UMAP, GNNs, use features if already provided, otherwise will compute + + # other pydata libraries + X = g._node_features # g._get_feature('nodes') or g.get_matrix() + y = g._node_target # g._get_target('nodes') or g.get_matrix(target=True) + from sklearn.ensemble import RandomForestRegressor + model = RandomForestRegressor().fit(X, y) # assumes train/test split + new_df = pandas.read_csv(...) 
# mini batch + X_new, _ = g.transform(new_df, None, kind='nodes', return_graph=False) + preds = model.predict(X_new) + ``` + +* Encode model definitions and compare models against each other + + ```python + # graphistry + from graphistry.features import search_model, topic_model, ngrams_model, ModelDict, default_featurize_parameters, default_umap_parameters + + g = graphistry.nodes(df) + g2 = g.umap(X=[..], y=[..], **search_model) + + # set custom encoding model with any feature/umap/dbscan kwargs + new_model = ModelDict(message='encoding new model parameters is easy', **default_featurize_parameters) + new_model.update(dict( + y=[...], + kind='edges', + model_name='sbert/cool_transformer_model', + use_scaler_target='kbins', + n_bins=11, + strategy='normal')) + print(new_model) + + g3 = g.umap(X=[..], **new_model) + # compare g2 vs g3 or add to different pipelines + ``` + +See `help(g.featurize)` for more options + +### [sklearn-based UMAP](https://umap-learn.readthedocs.io/en/latest/), [cuML-based UMAP](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP) + +* Reduce dimensionality by plotting a similarity graph from feature vectors: + + ```python + # automatic feature engineering, UMAP + g = graphistry.nodes(df).umap() + + # plot the similarity graph without any explicit edge_dataframe passed in -- it is created during UMAP. + g.plot() + ``` + +* Apply a trained model to new data: + + ```python + new_df = pd.read_csv(...) + embeddings, X_new, _ = g.transform_umap(new_df, None, kind='nodes', return_graph=False) + ``` + +* Infer a new graph from new data using the old umap coordinates to run inference without having to train a new umap model. + + ```python + new_df = pd.read_csv(...) 
+ g2 = g.transform_umap(new_df, return_graph=True) # return_graph=True is default + g2.plot() # + + # or if you want the new minibatch to cluster to closest points in previous fit: + g3 = g.transform_umap(new_df, return_graph=True, merge_policy=True) + g3.plot() # useful to see how new data connects to old -- play with `sample` and `n_neighbors` to control how much of old to include + ``` + +* UMAP supports many options, such as supervised mode, working on a subset of columns, and passing arguments to underlying `featurize()` and UMAP implementations (see `help(g.umap)`): + + ```python + g.umap(kind='nodes', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...) + ``` + +* `umap(engine="...")` supports multiple implementations. It defaults to using the GPU-accelerated `engine="cuml"` when a GPU is available, resulting in orders-of-magnitude speedups, and falls back to CPU processing via `engine="umap_learn"`.: + + ```python + g.umap(engine='cuml') + ``` + +You can also featurize edges and UMAP them as we did above. + +UMAP support is rapidly evolving, please contact the team directly or on Slack for additional discussions + +See `help(g.umap)` for more options + +### [GNN models](https://docs.dgl.ai/en/0.6.x/index.html) + +* Graphistry adds bindings and automation to working with popular GNN models, currently focusing on DGL/PyTorch: + + ```python + g = (graphistry + .nodes(ndf) + .edges(edf, src, dst) + .build_gnn( + X_nodes=['col_1', ..., 'col_n'], #columns from nodes_dataframe + y_nodes=['label', ..., 'other_targets'], + X_edges=['col_1_edge', ..., 'col_n_edge'], #columns from edges_dataframe + y_edges=['label_edge', ..., 'other_targets_edge'], + ...) 
+ ) + G = g.DGL_graph + + from [your_training_pipeline] import train, model + # Train + g = graphistry.nodes(df).build_gnn(y_nodes='target') + G = g.DGL_graph + train(G, model) + # predict on new data + X_new, _ = g.transform(new_df, None, kind='nodes' or 'edges', return_graph=False) # no targets + predictions = model.predict(G_new, X_new) + ``` + +Like `g.umap()`, GNN layers automate feature engineering (`.featurize()`) + +See `help(g.build_gnn)` for options. + +GNN support is rapidly evolving; please contact the team directly or on Slack for additional discussions + +### [Semantic Search](https://www.sbert.net/examples/applications/semantic-search/README.html) + +* Search textual data semantically and see the resulting graph: + + ```python + ndf = pd.read_csv('nodes.csv') + edf = pd.read_csv('edges.csv') + + g = graphistry.nodes(ndf, 'node').edges(edf, 'src', 'dst') + + g2 = g.featurize(X = ['text_col_1', .., 'text_col_n'], kind='nodes', + min_words = 0, # forces all named columns as textual ones + #encode text as paraphrase embeddings, supports any sbert model + model_name = "paraphrase-MiniLM-L6-v2") + + # or use convenience `ModelDict` to store parameters + + from graphistry.features import search_model + g2 = g.featurize(X = ['text_col_1', .., 'text_col_n'], kind='nodes', **search_model) + + # query using the power of transformers to find richly relevant results + + results_df, query_vector = g2.search('my natural language query', ...) + + print(results_df[['_distance', 'text_col', ..]]) #sorted by relevancy + + # or see graph of matching entities and original edges + + g2.search_graph('my natural language query', ...).plot() + + ``` + +* If edges are not given, `g.umap(..)` will supply them: + + ```python + ndf = pd.read_csv('nodes.csv') + g = graphistry.nodes(ndf) + g2 = g.umap(X = ['text_col_1', .., 'text_col_n'], min_words=0, ...)
+ + g2.search_graph('my natural language query', ...).plot() + ``` + +See `help(g.search_graph)` for options + +### Knowledge Graph Embeddings + +* Train an RGCN model and predict: + + ```python + edf = pd.read_csv('edges.csv') + g = graphistry.edges(edf, src, dst) + g2 = g.embed(relation='relationship_column_of_interest', **kwargs) + + # predict links over all nodes + g3 = g2.predict_links_all(threshold=0.95) # score high confidence predicted edges + g3.plot() + + # predict over any set of entities and/or relations. + # Set any `source`, `destination` or `relation` to `None` to predict over all of them. + # if all are None, it is better to use `g.predict_links_all` for speed. + g4 = g2.predict_links(source=['entity_k'], + relation=['relationship_1', 'relationship_4', ..], + destination=['entity_l', 'entity_m', ..], + threshold=0.9, # score threshold + return_dataframe=False) # set to `True` to return dataframe, or just access via `g4._edges` + ``` + +* Detect Anomalous Behavior (example use cases such as Cyber, Fraud, etc.) + + ```python + # Score anomalous edges by setting the flag `anomalous` to True and set confidence threshold low + g5 = g.predict_links_all(threshold=0.05, anomalous=True) # score low confidence predicted edges + g5.plot() + + g6 = g.predict_links(source=['ip_address_1', 'user_id_3'], + relation=['attempt_logon', 'phishing', ..], + destination=['user_id_1', 'active_directory', ..], + anomalous=True, + threshold=0.05) + g6.plot() + ``` + +* Train an RGCN model including auto-featurized node embeddings + + ```python + edf = pd.read_csv('edges.csv') + ndf = pd.read_csv('nodes.csv') # adding node dataframe + + g = graphistry.edges(edf, src, dst).nodes(ndf, node_column) + + # inherits all the featurization `kwargs` from `g.featurize` + g2 = g.embed(relation='relationship_column_of_interest', use_feat=True, **kwargs) + g2.predict_links_all(threshold=0.95).plot() + ``` + +See `help(g.embed)`, `help(g.predict_links)`, or `help(g.predict_links_all)` for options
+ +### DBSCAN + +* Enrich UMAP embeddings or featurization dataframe with GPU or CPU DBSCAN + + ```python + g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node') + + # cluster by UMAP embeddings + kind = 'nodes' | 'edges' + g2 = g.umap(kind=kind).dbscan(kind=kind) + print(g2._nodes['_dbscan']) | print(g2._edges['_dbscan']) + + # dbscan in `umap` or `featurize` via flag + g2 = g.umap(dbscan=True, min_dist=0.2, min_samples=1) + + # or via chaining, + g2 = g.umap().dbscan(min_dist=1.2, min_samples=2, **kwargs) + + # cluster by feature embeddings + g2 = g.featurize().dbscan(**kwargs) + + # cluster by a given set of feature column attributes, inherited from `g.get_matrix(cols)` + g2 = g.featurize().dbscan(cols=['ip_172', 'location', 'alert'], **kwargs) + + # equivalent to above (i.e., cols != None and umap=True will still use features dataframe, rather than UMAP embeddings) + g2 = g.umap().dbscan(cols=['ip_172', 'location', 'alert'], umap=True | False, **kwargs) + g2.plot() # color by `_dbscan` + + new_df = pd.read_csv(..) + # transform on new data according to fit dbscan model + g3 = g2.transform_dbscan(new_df) + ``` + +See `help(g.dbscan)` or `help(g.transform_dbscan)` for options + +### Quickly configurable + +Set visual attributes through [quick data bindings](https://hub.graphistry.com/docs/api/2/rest/upload/#createdataset2) and set [all sorts of URL options](https://hub.graphistry.com/docs/api/1/rest/url/).
Check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-colors.ipynb), [sizes](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-sizes.ipynb), [icons](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-icons.ipynb), [badges](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-badges.ipynb), [weighted clustering](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/edge-weights.ipynb) and [sharing controls](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb): + + ```python + g + .privacy(mode='private', invited_users=[{'email': 'friend1@site.ngo', 'action': '10'}], notify=False) + .edges(df, 'col_a', 'col_b') + .edges(my_transform1(g._edges)) + .nodes(df, 'col_c') + .nodes(my_transform2(g._nodes)) + .bind(source='col_a', destination='col_b', node='col_c') + .bind( + point_color='col_a', + point_size='col_b', + point_title='col_c', + point_x='col_d', + point_y='col_e') + .bind( + edge_color='col_m', + edge_weight='col_n', + edge_title='col_o') + .encode_edge_color('timestamp', ["blue", "yellow", "red"], as_continuous=True) + .encode_point_icon('device_type', categorical_mapping={'macbook': 'laptop', ...}) + .encode_point_badge('passport', 'TopRight', categorical_mapping={'Canada': 'flag-icon-ca', ...}) + .encode_point_color('score', ['black', 'white']) + .addStyle(bg={'color': 'red'}, fg={}, page={'title': 'My Graph'}, logo={}) + .settings(url_params={ + 'play': 2000, + 'menu': True, 'info': True, + 'showArrows': True, + 'pointSize': 2.0, 'edgeCurvature': 0.5, + 'edgeOpacity': 1.0, 'pointOpacity': 1.0, + 'lockedX': False, 'lockedY': False, 'lockedR': False, + 'linLog': False, 'strongGravity': False, 'dissuadeHubs': 
False, + 'edgeInfluence': 1.0, 'precisionVsSpeed': 1.0, 'gravity': 1.0, 'scalingRatio': 1.0, + 'showLabels': True, 'showLabelOnHover': True, + 'showPointsOfInterest': True, 'showPointsOfInterestLabel': True, 'showLabelPropertiesOnHover': True, + 'pointsOfInterestMax': 5 + }) + .plot() + ``` + + +## Plugins: Graph compute & layout + +### Use igraph (CPU) and cugraph (GPU) compute + +Install the plugin of choice and then: + +```python +g2 = g.compute_igraph('pagerank') +assert 'pagerank' in g2._nodes.columns + +g3 = g.compute_cugraph('pagerank') +assert 'pagerank' in g2._nodes.columns +``` + +### igraph + +With `pip install graphistry[igraph]`, you can also use [`igraph` layouts](https://igraph.org/python/doc/api/igraph.Graph.html#layout): + +```python +g.layout_igraph('sugiyama').plot() +g.layout_igraph('sugiyama', directed=True, params={}).plot() +``` + +See list [`layout_algs`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins/igraph.py#L365) + +### graphviz + +With graphviz installed, you can use its many layouts. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/graphviz/graphviz.ipynb) + +```python +# 1. Engine: apt-get install graphviz graphviz-dev +# 2. Bindings: pip install -q graphistry[pygraphviz] + +# graphviz dot layout with graphistry interactive render +g.layout_graphviz('dot').plot() + +# save graphviz render to disk +g.layout_graphviz('dot', render_to_disk=True, path='./graph.png', format='render') + +# custom attributes +assert 'color' in g._edges.columns and 'shape' in g._nodes.columns +g.layout_graphviz( + 'dot', + graph_attrs={}, + node_attrs={'color': 'green'}, + edge_attrs={}).plot() + +help(g.layout_graphviz) +``` + +See layout algorithm list [`prog`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins_types/graphviz_types.py#L14). 
The layout algorithms, and attributes at global and node/edge-level, are in the [graphviz engine documentation](https://graphviz.org/docs/layouts/). + +### cuGraph + +With [Nvidia RAPIDS cuGraph](https://www.rapids.ai) installed: + +```python +g.layout_cugraph('force_atlas2').plot() +help(g.layout_cugraph) +``` + +See list [`layout_algs`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins/cugraph.py#L315) + + +## Resources + +* Graphistry [In-Tool UI Guide](https://hub.graphistry.com/docs/ui/index/) +* [General and REST API docs](https://hub.graphistry.com/docs/api/): + * [URL settings](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions) + * [Authentication](https://hub.graphistry.com/docs/api/1/rest/auth/) + * [Uploading](https://hub.graphistry.com/docs/api/2/rest/upload/#createdataset2), including multiple file formats and settings + * [Color bindings](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2) and [color palettes](https://hub.graphistry.com/docs/api/api-color-palettes/) (ColorBrewer) + * Bindings and colors, REST API, embedding URLs and URL parameters, dynamic JS API, and more + * JavaScript and more! +* Python-specific + * [Python API ReadTheDocs](http://pygraphistry.readthedocs.org/en/latest/) + * Within a notebook, you can always run `help(graphistry)`, `help(graphistry.hypergraph)`, etc. +* [Graph-App-Kit Dashboarding](https://github.com/graphistry/graph-app-kit) \ No newline at end of file diff --git a/docs/source/community.rst b/docs/source/community.rst new file mode 100644 index 000000000..6f17c46e1 --- /dev/null +++ b/docs/source/community.rst @@ -0,0 +1,12 @@ +..
_community: + +Join the Community +================== + +The Graphistry team is active in a few places, so come join us: + +- `Blog `_ +- `Slack Channel `_ +- `GitHub `_ +- `Twitter `_ +- `LinkedIn `_ diff --git a/docs/source/conf.py b/docs/source/conf.py index c166e8cce..ecd171ae4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,23 +10,33 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -import os, sys -from distutils.version import LooseVersion +import docutils.nodes, os, logging, re, sys +from docutils import nodes +from packaging.version import Version + -# sys.path.insert(0, os.path.abspath('.')) sys.path.insert(0, os.path.abspath("../..")) -sys.path.insert(0, os.path.abspath('../../')) import graphistry + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + # -- Project information ----------------------------------------------------- project = "PyGraphistry" -copyright = "2021, Graphistry, Inc." +copyright = "2024, Graphistry, Inc." author = "Graphistry, Inc." +html_title = "PyGraphistry Documentation" +html_short_title = "PyGraphistry" +html_logo = 'graphistry_banner_transparent_colored.png' +html_favicon = 'static/favicon.ico' + # The full version, including alpha/beta/rc tags -version = LooseVersion(graphistry.__version__).vstring -relesae = version +version = str(Version(graphistry.__version__)) +release = version # -- General configuration --------------------------------------------------- @@ -34,20 +44,37 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ + 'myst_parser', + 'nbsphinx', "sphinx.ext.autodoc", #'sphinx.ext.autosummary', - #'sphinx.ext.intersphinx', + 'sphinx.ext.intersphinx', "sphinx.ext.ifconfig", - "sphinx_autodoc_typehints" + #"sphinx_autodoc_typehints", + "sphinx_copybutton", ] +# TODO guarantee most notebooks are executable (=> maintained) +# and switch to opt'ing out the few that are hard, e.g., DB deps +nbsphinx_execute = 'never' +nbsphinx_allow_errors = False # Do not allow errors in notebooks: a failing cell fails the build + +autodoc_typehints = "description" +always_document_param_types = True +typehints_document_rtype = True + +#suppress_warnings = [ +# 'nbsphinx.localfile', # Suppresses local file warnings in notebooks +#] + #FIXME Why is sphinx/autodoc failing here? nitpick_ignore = [ ('py:class', '1'), # Ex: api : Optional[Literal[1, 3]] ('py:class', '3'), ('py:class', ""), ('py:class', ""), + ('py:class', 'BiPartiteGraph'), ('py:class', "graphistry.compute.ASTSerializable.ASTSerializable"), ('py:class', "graphistry.compute.chain.Chain"), ('py:class', "graphistry.compute.predicates.ASTPredicate.ASTPredicate"), @@ -100,6 +127,87 @@ ('py:class', 'graphistry.plugins.igraph.compute_igraph'), ('py:class', 'graphistry.plugins.igraph.from_igraph'), ('py:class', 'graphistry.plugins.igraph.layout_igraph'), + ('py:data', 'graphistry.plugins_types.cugraph_types.CuGraphKind'), + ('py:data', 'graphistry.plugins_types.graphviz_types.EdgeAttr'), + ('py:data', 'graphistry.plugins_types.graphviz_types.EDGE_ATTRS'), + ('py:data', 'graphistry.plugins_types.graphviz_types.Format'), + ('py:data', 'graphistry.plugins_types.graphviz_types.FORMATS'), + ('py:data', 'graphistry.plugins_types.graphviz_types.GraphAttr'), + ('py:data', 'graphistry.plugins_types.graphviz_types.GRAPH_ATTRS'), + ('py:data', 'graphistry.plugins_types.graphviz_types.NodeAttr'), + ('py:data', 'graphistry.plugins_types.graphviz_types.NODE_ATTRS'), + ('py:data', 'graphistry.plugins_types.graphviz_types.Prog'), + ('py:data',
'graphistry.plugins_types.graphviz_types.PROGS'), + + # Suppress individual items from PROGS + ('py:class', 'acyclic'), + ('py:class', 'ccomps'), + ('py:class', 'circo'), + ('py:class', 'dot'), + ('py:class', 'fdp'), + ('py:class', 'gc'), + ('py:class', 'gvcolor'), + ('py:class', 'gvpr'), + ('py:class', 'neato'), + ('py:class', 'nop'), + ('py:class', 'osage'), + ('py:class', 'patchwork'), + ('py:class', 'sccmap'), + ('py:class', 'sfdp'), + ('py:class', 'tred'), + ('py:class', 'twopi'), + ('py:class', 'unflatten'), + + # Suppress items from FORMATS + ('py:class', 'canon'), + ('py:class', 'cmap'), + ('py:class', 'cmapx'), + ('py:class', 'cmapx_np'), + ('py:class', 'dia'), + ('py:class', 'dot'), + ('py:class', 'fig'), + ('py:class', 'gd'), + ('py:class', 'gd2'), + ('py:class', 'gif'), + ('py:class', 'hpgl'), + ('py:class', 'imap'), + ('py:class', 'imap_np'), + ('py:class', 'ismap'), + ('py:class', 'jpe'), + ('py:class', 'jpeg'), + ('py:class', 'jpg'), + ('py:class', 'mif'), + ('py:class', 'mp'), + ('py:class', 'pcl'), + ('py:class', 'pdf'), + ('py:class', 'pic'), + ('py:class', 'plain'), + ('py:class', 'plain-ext'), + ('py:class', 'png'), + ('py:class', 'ps'), + ('py:class', 'ps2'), + ('py:class', 'svg'), + ('py:class', 'svgz'), + ('py:class', 'vml'), + ('py:class', 'vmlz'), + ('py:class', 'vrml'), + ('py:class', 'vtx'), + ('py:class', 'wbmp'), + ('py:class', 'xdot'), + ('py:class', 'xlib'), + + #TimeUnit = Literal['s', 'm', 'h', 'D', 'W', 'M', 'Y', 'C'] + ('py:data', 'graphistry.compute.temporal.TimeUnit'), + ('py:class', 's'), + ('py:class', 'm'), + ('py:class', 'h'), + ('py:class', 'D'), + ('py:class', 'W'), + ('py:class', 'M'), + ('py:class', 'Y'), + ('py:class', 'C'), + + ('py:class', 'abc.ABC'), ('py:class', 'graphistry.feature_utils.FeatureMixin'), ('py:class', 'graphistry.dgl_utils.DGLGraphMixin'), ('py:class', 'graphistry.umap_utils.UMAPMixin'), @@ -111,11 +219,22 @@ ('py:class', 'Plotter'), ('py:class', 'Plottable'), ('py:class', 'CuGraphKind'), + 
('py:class', 'cugraph'), + ('py:class', 'cugraph.BiPartiteGraph'), ('py:class', 'cugraph.Graph'), + ('py:class', 'cugraph.MultiGraph'), ('py:class', 'IGraph graph'), ('py:class', 'igraph'), + ('py:class', 'JSONVal'), ('py:class', 'dgl'), ('py:class', 'matplotlib'), + ('py:class', 'MultiGraph'), + ('py:class', 'numpy'), + ('py:class', 'numpy.datetime64'), + ('py:class', 'numpy.timedelta64'), + ('py:class', 'pandas.core.frame.DataFrame'), + ('py:class', 'pandas.core.series.Series'), + ('py:class', 'pandas._libs.tslibs.offsets.DateOffset'), ('py:class', 'torch'), ('py:class', 'umap'), ('py:class', 'sentence_transformers'), @@ -134,17 +253,19 @@ ('py:class', 'weakref.WeakKeyDictionary'), ('py:data', 'typing.Any'), ('py:data', 'typing.List'), + ('py:data', 'typing.List[typing_extensions.Literal]'), ('py:data', 'typing.Literal'), ('py:data', 'typing.Optional'), ('py:data', 'typing.Callable'), ('py:data', 'typing.Tuple'), ('py:data', 'typing.Union'), + ('py:class', 'typing_extensions.Literal'), ('py:class', 'Mode'), - ('py:class','pandas.core.frame.DataFrame'), ('py:class', 'graphistry.privacy.Privacy') ] -set_type_checking_flag = True +#set_type_checking_flag = True + # typehints_fully_qualified=True always_document_param_types = True typehints_document_rtype = True @@ -155,18 +276,108 @@ # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The encoding of source files. -source_encoding = "utf-8-sig" - +# source_suffix = ['.rst', '.ipynb'] +source_suffix = { + '.md': 'markdown', + '.txt': 'markdown', + '.rst': 'restructuredtext', + #'.ipynb': 'nbsphinx', +} # The master toctree document. -master_doc = "index" +root_doc = "index" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = [] +exclude_patterns = [ + + '_build', + '**/_build/**', + 'doctrees', + '**/doctrees/**', + 'demos/.ipynb_checkpoints', + '**/*.txt', + + # nbsphinx stalls on these + 'demos/ai/Introduction/Ask-HackerNews-Demo.ipynb', + 'demos/ai/OSINT/jack-donations.ipynb', + + #'demos/for_analysis.ipynb', + #'demos/for_developers.ipynb', + #'demos/upload_csv_miniapp.ipynb', + + # not used yet + #'demos/demos_databases_apis/splunk/splunk_demo_public.ipynb', + #'demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb', + #'demos/demos_databases_apis/neptune/neptune_tutorial.ipynb', + #'demos/demos_databases_apis/sql/postgres.ipynb', + #'demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb', + 'demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb', + #'demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb', + #'demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb', + #'demos/demos_databases_apis/gpu_rapids/cugraph.ipynb', + #'demos/demos_databases_apis/memgraph/visualizing_iam_dataset.ipynb', + #'demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb', + #'demos/demos_databases_apis/arango/arango_tutorial.ipynb', + #'demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb', + #'demos/demos_databases_apis/neo4j/official/graphistry_bolt_tutorial_public.ipynb', + #'demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb', + #'demos/demos_databases_apis/alienvault/OTXLockerGoga.ipynb', + #'demos/demos_databases_apis/alienvault/usm.ipynb', + #'demos/demos_databases_apis/alienvault/OTXIndicators.ipynb', + #'demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb', + #'demos/demos_databases_apis/hypernetx/hypernetx.ipynb', + 'demos/demos_databases_apis/umap_learn/umap_learn.ipynb', + #'demos/demos_databases_apis/graphviz/graphviz.ipynb', + #'demos/demos_databases_apis/tigergraph/social_raw_REST_calls.ipynb', + 
#'demos/demos_databases_apis/tigergraph/tigergraph_pygraphistry_bindings.ipynb', + #'demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb', + #'demos/demos_databases_apis/networkx/networkx.ipynb', + 'demos/more_examples/simple/tutorial_csv_mini_app_icij_implants.ipynb', + 'demos/more_examples/simple/MarvelTutorial.ipynb', + 'demos/more_examples/simple/tutorial_basic_LesMiserablesCSV.ipynb', + #'demos/more_examples/graphistry_features/layout_tree.ipynb', + #'demos/more_examples/graphistry_features/encodings-icons.ipynb', + #'demos/more_examples/graphistry_features/layout_time_ring.ipynb', + #'demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb', + #'demos/more_examples/graphistry_features/encodings-colors.ipynb', + #'demos/more_examples/graphistry_features/encodings-sizes.ipynb', + #'demos/more_examples/graphistry_features/layout_modularity_weighted.ipynb', + #'demos/more_examples/graphistry_features/layout_time_ring_dev.ipynb', + #'demos/more_examples/graphistry_features/external_layout/simple_manual_layout.ipynb', + #'demos/more_examples/graphistry_features/external_layout/networkx_layout.ipynb', + #'demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb', + #'demos/more_examples/graphistry_features/sharing_tutorial.ipynb', + #'demos/more_examples/graphistry_features/encodings-badges.ipynb', + #'demos/more_examples/graphistry_features/layout_categorical_ring.ipynb', + #'demos/more_examples/graphistry_features/edge-weights.ipynb', + #'demos/more_examples/graphistry_features/layout_continuous_ring.ipynb', + 'demos/more_examples/graphistry_features/Workbooks.ipynb', + 'demos/demos_by_use_case/bio/BiogridDemo.ipynb', + 'demos/demos_by_use_case/logs/Tutorial Part 1 (Honey Pot).ipynb', + 'demos/demos_by_use_case/logs/malware-hypergraph/Malware Hypergraph.ipynb', + 'demos/demos_by_use_case/logs/aws_vpc_flow_cloudwatch/vpc_flow.ipynb', + 'demos/demos_by_use_case/logs/Tutorial Part 2 (Apache 
Logs).ipynb', + 'demos/demos_by_use_case/logs/network-threat-hunting-masterclass-zeek-bro/graphistry_corelight_webinar.ipynb', + 'demos/demos_by_use_case/logs/owasp-amass-network-enumeration/amass.ipynb', + 'demos/demos_by_use_case/logs/microservices-spigo/SystemArchitectureSpigo.ipynb', + 'demos/demos_by_use_case/fraud/BitcoinTutorial.ipynb', + 'demos/demos_by_use_case/social/Twitter.ipynb', + #'demos/talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/advanced-identity-protection-40m.ipynb', + #'demos/talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/intro-story.ipynb', + #'demos/gfql/benchmark_hops_cpu_gpu.ipynb', + 'demos/data/benchmarking/SparseDatasets.ipynb', + 'demos/data/benchmarking/DenseDatasets.ipynb', + 'demos/data/benchmarking/TestDatasets.ipynb', + 'demos/ai/Introduction/Ask-HackerNews-Demo.ipynb', + 'demos/ai/Introduction/simple-power-of-umap.ipynb', + #'demos/ai/cyber/CyberSecurity-Slim.ipynb', + 'demos/ai/cyber/redteam-umap-gtc-gpu.ipynb', + 'demos/ai/cyber/cyber-redteam-umap-demo.ipynb', + 'demos/ai/OSINT/jack-donations.ipynb', + 'demos/ai/OSINT/Chavismo.ipynb', + +] pygments_style = "sphinx" todo_include_todos = False @@ -176,15 +387,34 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" +#html_theme = "sphinx_rtd_theme" +html_theme = "sphinx_book_theme" + + +html_theme_options = { + "repository_url": "https://github.com/graphistry/pygraphistry", + "use_repository_button": True, + + # Optional top horizontal navigation bar + #"navbar_start": ["navbar-start.html"], + #"navbar_center": ["navbar-center.html"], + #"navbar_end": ["navbar-end.html"], + + "logo": { + #"link": "https://www.graphistry.com/get-started", + #"text": "Graphistry, Inc.", + "alt_text": "Graphistry, Inc." + } +} + # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] # '_static' +html_static_path = ['static'] # '_static' +# html_css_files = ['graphistry.css'] html_show_sphinx = False -html_show_sourcelink = False htmlhelp_basename = "PyGraphistrydoc" @@ -192,22 +422,79 @@ # -- Options for LaTeX output --------------------------------------------- latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - #'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - #'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - #'preamble': '', - # Latex figure (float) alignment - #'figure_align': 'htbp', + 'preamble': r''' + + \usepackage{svg} % Enables SVG handling via Inkscape + + \RequirePackage{etex} % Ensure extended TeX capacities + \usepackage[utf8]{inputenc} % Enable UTF-8 support + \usepackage[T1]{fontenc} % Use T1 font encoding for better character support + \usepackage{lmodern} % Load Latin Modern fonts for better quality + \usepackage{amsmath} % For advanced math formatting + \usepackage{amsfonts} % For math fonts + \usepackage{amssymb} % For additional math symbols + \usepackage{graphicx} % For including images + \usepackage{hyperref} % For hyperlinks + \usepackage{textcomp} % For additional text symbols + \usepackage{breakurl} % Allows line breaks in URLs + \usepackage{listings} % For code listings + \usepackage{float} % Improved control of floating objects + \usepackage{microtype} % Improves text appearance with microtypography + \usepackage{lipsum} % For generating dummy text (if needed) + + + % Increase capacity limits + \setcounter{totalnumber}{200} % Maximum floats + \setcounter{dbltopnumber}{200} % Double float maximum + \setcounter{secnumdepth}{3} % Section numbering depth + \setcounter{tocdepth}{3} % Table of contents depth + + % Increase dimensions and allocations + \usepackage{morefloats} % Allows for more floats + 
\setlength{\emergencystretch}{3em} % Help with overfull hboxes + \setlength{\maxdepth}{100pt} % Sets a high limit for max depth (if applicable) + + % Allocate more memory for TeX + \usepackage{etex} % Use eTeX for more memory + %\reserveinserts{200} % Reserve more inserts + \setcounter{totalnumber}{200} % Ensure maximum floats are increased + + + % Declare Unicode characters + \DeclareUnicodeCharacter{1F389}{\textbf{(party popper)}} + \DeclareUnicodeCharacter{1F3C6}{\textbf{(trophy)}} + \DeclareUnicodeCharacter{1F44D}{\textbf{(thumbs up)}} + \DeclareUnicodeCharacter{1F4AA}{\textbf{Strong}} % Muscle emoji + \DeclareUnicodeCharacter{1F4B0}{\textbf{Money Bag}} % Money bag emoji (text representation) + \DeclareUnicodeCharacter{1F525}{\textbf{(fire)}} + \DeclareUnicodeCharacter{1F600}{\textbf{(grinning)}} + \DeclareUnicodeCharacter{1F609}{\textbf{(winking)}} + \DeclareUnicodeCharacter{1F614}{\textbf{(pensive)}} + \DeclareUnicodeCharacter{1F680}{\textbf{(rocket)}} + \DeclareUnicodeCharacter{2501}{\textbf{━}} % Heavy horizontal line + \DeclareUnicodeCharacter{2588}{\textbf{█}} % Full block character + \DeclareUnicodeCharacter{258A}{\textbf{▊}} % Center right block character + \DeclareUnicodeCharacter{258B}{\textbf{▉}} % Right block character + \DeclareUnicodeCharacter{258C}{\textbf{▌}} % Center block character + \DeclareUnicodeCharacter{258D}{\textbf{▍}} % Center left block character + \DeclareUnicodeCharacter{258E}{\textbf{▎}} % Left third block character + \DeclareUnicodeCharacter{258F}{\textbf{▏}} % Right block character + \DeclareUnicodeCharacter{2728}{\textbf{(sparkles)}} + \DeclareUnicodeCharacter{2764}{\textbf{(heart)}} + \DeclareUnicodeCharacter{2B50}{\textbf{(star)}} + + ''', } +# Use pdflatex as the LaTeX engine +latex_engine = 'pdflatex' + # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ ( - master_doc, + root_doc, "PyGraphistry.tex", u"PyGraphistry Documentation", u"Graphistry, Inc.", @@ -240,7 +527,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "pygraphistry", u"PyGraphistry Documentation", [author], 1)] +man_pages = [(root_doc, "pygraphistry", u"PyGraphistry Documentation", [author], 1)] # If true, show URL addresses after external links. # man_show_urls = False @@ -253,7 +540,7 @@ # dir menu entry, description, category) texinfo_documents = [ ( - master_doc, + root_doc, "PyGraphistry", u"PyGraphistry Documentation", author, @@ -348,3 +635,136 @@ # Example configuration for intersphinx: refer to the Python standard library. # intersphinx_mapping = {'https://docs.python.org/': None} + + + +# -- Custom Preprocessor Configuration --------------------------------------- + +def replace_iframe_src(app, doctree, docname): + """ + Replace relative iframe src paths with absolute URLs in HTML content. + Specifically targets iframe tags with src attributes starting with /graph/. + """ + # Define a flexible regex pattern to match - -For self-hosting and access to a free API key, refer to our Graphistry `Hub `_. +Indices and tables +================== .. toctree:: :maxdepth: 3 + :hidden: + :caption: PyGraphistry documentation graphistry - modules -Articles -================== -* `Graphistry: Visual Graph AI Interactive demo `_ -* `PyGraphistry + Databricks `_ -* `PyGraphistry + UMAP `_ - - -Indices and tables -================== +.. 
toctree:: + :maxdepth: 3 * :ref:`genindex` * :ref:`modindex` -* :ref:`search` - +* :ref:`search` \ No newline at end of file diff --git a/docs/source/install/extended.rst b/docs/source/install/extended.rst new file mode 100644 index 000000000..2e1d0f352 --- /dev/null +++ b/docs/source/install/extended.rst @@ -0,0 +1,201 @@ +Installation Guide - Extended +============================= + +This extended guide provides detailed instructions for installing PyGraphistry, including optional configurations for enhanced performance and functionality. + +GPU Mode System Requirements (Optional) +--------------------------------------- + +* **Nvidia RAPIDS**: PyGraphistry primarily aligns with Nvidia RAPIDS, so check their requirements for your system: + + * **Volta generation GPUs or newer** are the current Nvidia RAPIDS minimum requirement. + + * **cuDF**: Required. + + * **cuML**, **cuGraph**: Recommended. + +* **PyTorch**: PyGraphistry[AI] further aligns with PyTorch for some of its more advanced methods. + +Core Dependencies (Installed by Default) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +PyGraphistry depends on a small set of standard CPU-based Python data science libraries such as pandas, pyarrow, and numpy. If your system is missing these dependencies, they will get installed automatically. + +Optional Dependencies +--------------------- + +PyGraphistry supports a variety of optional dependencies to extend its functionality. + +GPU Acceleration with RAPIDS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To enable GPU acceleration for DataFrames and graph analytics, install **cuDF**, **cuML**, and **cuGraph** from the NVIDIA RAPIDS suite. + +Follow the instructions at the `NVIDIA RAPIDS Installation Guide `_. + +Additional Optional Dependencies +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many of the following can be used in both CPU mode and GPU mode. + +- **AI Libraries**: + + - `torch` (1GB+): PyTorch and related libraries for advanced AI methods in the PyGraphistry AI packages. 
+ + Install with: + + .. code-block:: bash + + pip install graphistry[ai] + +- **Graph Libraries**: + + - `networkx`: Integration with NetworkX graphs. + + Install with: + + .. code-block:: bash + + pip install graphistry[networkx] + + - `igraph`: Support for igraph graphs. + + Install with: + + .. code-block:: bash + + pip install graphistry[igraph] + + - `pygraphviz`: Rendering graphs with Graphviz layouts. + + Install with: + + .. code-block:: bash + + pip install graphistry[pygraphviz] + +- **Graph Databases and Protocols**: + + - `gremlinpython`: Working with Gremlin graph databases. + + Install with: + + .. code-block:: bash + + pip install graphistry[gremlin] + + - `neo4j`, `neotime`: Connecting to Neo4j via the Bolt protocol. + + Install with: + + .. code-block:: bash + + pip install graphistry[bolt] + +- **Data Formats**: + + - `openpyxl`, `xlrd`: Reading NodeXL files. + + Install with: + + .. code-block:: bash + + pip install graphistry[nodexl] + +- **Machine Learning and AI**: + + - `umap-learn`, `dirty-cat`, `scikit-learn`: For dimensionality reduction and clustering. + + Install with: + + .. code-block:: bash + + pip install graphistry[umap-learn] + + - `scipy`, `dgl`, `torch<2`, `sentence-transformers`, `faiss-cpu`, `joblib`: Advanced AI functionalities. + + Install with: + + .. code-block:: bash + + pip install graphistry[ai] + +- **Jupyter Support**: + + - `ipython`: Enhanced Jupyter notebook integration. + + Install with: + + .. code-block:: bash + + pip install graphistry[jupyter] + +Installing Multiple Extras +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can install multiple extras by listing them separated by commas: + +.. code-block:: bash + + pip install graphistry[networkx,umap-learn] + +Installing All Optional Dependencies +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To install all optional dependencies (not generally recommended due to size and potential conflicts): + +.. 
code-block:: bash + + pip install graphistry[all] + +Common Questions +---------------- + +Do I Need a Server? +~~~~~~~~~~~~~~~~~~~ + +- **No**, you can run GFQL and other PyGraphistry CPU and GPU components locally. To use the full visualization capabilities, you do need access to a Graphistry server. + +- **Options**: + + - **Graphistry Hub**: Use the public Graphistry Hub at `hub.graphistry.com `_. + + - **Self-Hosted Server**: Set up your own Graphistry server by following the deployment instructions in the `Graphistry CLI Admin Guide `_. + +Can I Use PyGraphistry Without GPU Support? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **Yes**, PyGraphistry can be used without GPU support. + +- **GPU Acceleration**: To leverage GPU acceleration, install optional GPU libraries like cuDF and have compatible hardware. + +What Are the Benefits of Installing Optional Dependencies? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **Enhanced Functionality**: Support for different graph formats, advanced analytics, machine learning, and integration with various tools and databases. For example, for visualization users needing careful layout of small trees, we recommend `pygraphviz`, while for users of big GFQL workloads, we recommend RAPIDS. + +- **Customization**: Install only what you need for your specific use case. + +How Do I Install Development Dependencies? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For contributors and developers who wish to work on PyGraphistry itself, we recommend using Docker, or for native development: + +- **Install with**: + + .. code-block:: bash + + pip install graphistry[dev] + +- **Includes**: Testing tools, documentation tools, and other development dependencies like `flake8`, `pytest`, `sphinx`, etc. 
+ +References +---------- + +- **PyGraphistry GitHub Repository**: `https://github.com/graphistry/pygraphistry `_ +- **Graphistry Get Started**: `https://www.graphistry.com/get-started `_ +- **Graphistry CLI Admin Guide**: `https://github.com/graphistry/graphistry-cli `_ +- **NVIDIA RAPIDS Installation Guide**: `https://rapids.ai/start.html `_ +- **Graphistry Documentation**: `https://hub.graphistry.com/docs/ `_ + +Happy graphing! diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst new file mode 100644 index 000000000..e7edc64fc --- /dev/null +++ b/docs/source/install/index.rst @@ -0,0 +1,11 @@ +Install +================== + +Welcome to the PyGraphistry installation guide. Choose the section that best fits your needs: + +.. toctree:: + :maxdepth: 1 + + quick + extended + server diff --git a/docs/source/install/quick.rst b/docs/source/install/quick.rst new file mode 100644 index 000000000..3658b2340 --- /dev/null +++ b/docs/source/install/quick.rst @@ -0,0 +1,72 @@ +Installation Guide - Quick Start +================================= + +This quick start guide will help you install PyGraphistry and its essential dependencies to get you up and running quickly. + +Minimum System Requirements +---------------------------- + +Before installing PyGraphistry, ensure your system meets the following minimum requirements: + +- **Operating System**: Windows, macOS, Linux, or any Python-capable environment + +- **Python Version**: Python 3.8 or higher + +- **Hardware**: + + - **CPU**: 1 core + + - **Memory**: 1 GB - in addition to regular OS requirements + + - **GPU**: While optional, we recommend using a browser with WebGL enabled and a GPU, which is most phones and laptops + +Installing PyGraphistry +----------------------- + +Basic Installation +~~~~~~~~~~~~~~~~~~ + +Install PyGraphistry using `pip`: + +.. 
code-block:: bash + + pip install graphistry + +Importing and Version Check +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Verify the installation by importing PyGraphistry and checking its version: + +.. code-block:: python + + import graphistry + print(graphistry.__version__) + +Log in to a Graphistry GPU Server +--------------------------------- + +To use PyGraphistry's visualization server, you need to connect to a Graphistry GPU server: + +- **Get an Account**: Visit the `Graphistry Get Started `_ page and choose: + + - **Graphistry Hub**: For immediate access with no installation, use the public Graphistry Hub, which includes free GPU accounts. + + - **Self-Host**: Quick launch on AWS/Azure, or contact staff for on-premises options. + +- **Log in**: Once you have an account, register in your Python environment: + + .. code-block:: python + + import graphistry + + graphistry.register(api=3, server='hub.graphistry.com', username='YOUR_USERNAME', password='YOUR_PASSWORD') + + Replace `'YOUR_USERNAME'` and `'YOUR_PASSWORD'` with your actual credentials. + + When the command finishes without an exception, you have successfully connected to the server. + + See the authentication guide for additional options such as logging into an organization, SSO, and using API keys. + +For additional authentication options, see the Login and Sharing guide. + +Happy graphing! diff --git a/docs/source/install/server.rst b/docs/source/install/server.rst new file mode 100644 index 000000000..57086bf95 --- /dev/null +++ b/docs/source/install/server.rst @@ -0,0 +1,97 @@ +Using a Server with PyGraphistry +================================= + +While PyGraphistry offers robust functionalities out of the box, leveraging a server enhances its capabilities, especially for GPU-accelerated visualizations and remote operations. This guide helps you decide whether to use PyGraphistry without a server or to set up a server using various available options. 
+ +Using PyGraphistry Without a Server +----------------------------------- + +For most use cases, PyGraphistry can operate seamlessly without the need for a dedicated server. This setup is ideal for: + +- **Local Data Visualization**: Create and interact with visualizations directly within your local environment. +- **Basic Graph Analytics**: Perform standard graph operations and analyses without the overhead of server management. +- **Development and Testing**: Ideal for developers building and testing applications that utilize PyGraphistry. + +**Note**: Without a server, advanced features like GPU-accelerated visualizations and certain remote capabilities will not be available. + +Using a Graphistry Server +------------------------- + +To unlock the full potential of PyGraphistry, especially for GPU-accelerated visualizations and scalable remote operations, consider setting up a Graphistry server. Below are the available options to get started: + +Graphistry Hub +~~~~~~~~~~~~~~ + +**Graphistry Hub** offers a managed solution with the following benefits: + +- **Ease of Use**: No installation required; get started immediately. +- **Free Cloud GPU Tier**: Access free GPU resources for accelerated visualizations. +- **Scalability**: Automatically scales with your project needs. + +**Getting Started with Graphistry Hub**: + +- Visit the `Graphistry Get Started `_ page. +- Choose **Graphistry Hub** to create an account and start using the service without any infrastructure setup. + +Cloud Marketplace Deployments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Deploying Graphistry on cloud platforms like **AWS** and **Azure** provides flexibility and control over your server environment. + +AWS Marketplace +^^^^^^^^^^^^^^^ + +- **Quick Deployment**: Launch Graphistry with pre-configured settings optimized for AWS. +- **Integration**: Seamlessly integrate with other AWS services for enhanced functionality. 
+ +**Deploy on AWS**: + +- Navigate to the `AWS Marketplace `_ and search for "Graphistry." +- Follow the deployment instructions to set up your Graphistry server on AWS. + +Azure Marketplace +^^^^^^^^^^^^^^^^^^ + +- **Azure Integration**: Leverage Azure's robust infrastructure and services. +- **Scalable Resources**: Adjust resources based on your project's demands. + +**Deploy on Azure**: + +- Visit the `Azure Marketplace `_ and search for "Graphistry." +- Follow the provided steps to deploy Graphistry on Azure. + +Kubernetes and Docker-Compose Distributions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For organizations preferring containerized deployments, Graphistry offers support for **Kubernetes** and **Docker-Compose**. + +Kubernetes +^^^^^^^^^^ + +- **Orchestration**: Manage containerized applications with Kubernetes for scalability and reliability. +- **Customization**: Tailor the deployment to fit your infrastructure and scaling requirements. + +**Deploy with Kubernetes**: + +- Access the Kubernetes deployment guides at the `Graphistry CLI Admin Guide `_. +- Follow the instructions to deploy and manage your Graphistry server on a Kubernetes cluster. + +Docker-Compose +^^^^^^^^^^^^^^^ + +- **Simplicity**: Ideal for smaller deployments or development environments. +- **Quick Setup**: Deploy Graphistry using Docker-Compose with minimal configuration. + +**Deploy with Docker-Compose**: + +- Refer to the `Graphistry CLI Admin Guide `_ for Docker-Compose setup instructions. +- Execute the provided Docker-Compose files to launch your Graphistry server locally or on a server. + +Choosing the Right Option +------------------------- + +- **For Beginners or Quick Setup**: Use **Graphistry Hub** for a hassle-free experience. +- **For Enterprise or Scalable Needs**: Deploy via **AWS** or **Azure Marketplace** to leverage cloud infrastructure. 
+- **For Containerized Environments**: Opt for **Kubernetes** or **Docker-Compose** to integrate with your existing container orchestration workflows. + +Happy graphing! diff --git a/docs/source/notebooks/ai.rst b/docs/source/notebooks/ai.rst new file mode 100644 index 000000000..bcdd11004 --- /dev/null +++ b/docs/source/notebooks/ai.rst @@ -0,0 +1,12 @@ +AI +========================== + +.. toctree:: + :maxdepth: 2 + :caption: AI + :titlesonly: + + Story <../demos/talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/intro-story.ipynb> + RGCN <../demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb> + RGCN+UMAP <../demos/talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/advanced-identity-protection-40m.ipynb> + Link prediction with DGL (cyber) <../demos/ai/cyber/CyberSecurity-Slim.ipynb> \ No newline at end of file diff --git a/docs/source/notebooks/gfql.rst b/docs/source/notebooks/gfql.rst new file mode 100644 index 000000000..99682dc5b --- /dev/null +++ b/docs/source/notebooks/gfql.rst @@ -0,0 +1,11 @@ +GFQL Graph queries +========================== + +.. toctree:: + :maxdepth: 2 + :caption: GFQL Graph queries + :titlesonly: + + Intro to graph queries with hop and chain <../demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb> + GPU Benchmarking <../demos/gfql/benchmark_hops_cpu_gpu.ipynb> + diff --git a/docs/source/notebooks/gpu.rst b/docs/source/notebooks/gpu.rst new file mode 100644 index 000000000..a57ee7ec2 --- /dev/null +++ b/docs/source/notebooks/gpu.rst @@ -0,0 +1,12 @@ +GPU +========================== + +.. 
toctree:: + :maxdepth: 2 + :caption: GPU compute with Nvidia RAPIDS + :titlesonly: + + GPU I: CPU Pandas <../demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb> + GPU II: cuDF <../demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb> + GPU IV: cuML UMAP <../demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb> + GPU V: cuGraph <../demos/demos_databases_apis/gpu_rapids/cugraph.ipynb> diff --git a/docs/source/notebooks/index.rst b/docs/source/notebooks/index.rst new file mode 100644 index 000000000..5c354dd6c --- /dev/null +++ b/docs/source/notebooks/index.rst @@ -0,0 +1,15 @@ +.. _notebooks: + +Notebook Tutorials +========================== + +.. toctree:: + :maxdepth: 2 + + intro + visualization + gfql + gpu + ai + plugins.connectors + plugins.compute diff --git a/docs/source/notebooks/intro.rst b/docs/source/notebooks/intro.rst new file mode 100644 index 000000000..fe8e646a0 --- /dev/null +++ b/docs/source/notebooks/intro.rst @@ -0,0 +1,13 @@ +Getting Started +========================== + +.. toctree:: + :maxdepth: 2 + :caption: Getting Started + :titlesonly: + + For analysts <../demos/for_analysis.ipynb> + + For developers <../demos/for_developers.ipynb> + + CSV upload miniapp <../demos/upload_csv_miniapp.ipynb> diff --git a/docs/source/notebooks/plugins.compute.rst b/docs/source/notebooks/plugins.compute.rst new file mode 100644 index 000000000..5d0ec6973 --- /dev/null +++ b/docs/source/notebooks/plugins.compute.rst @@ -0,0 +1,13 @@ +.. _nb-compute: + +Plugins - Compute & Layout +============================= + +.. 
toctree:: + :maxdepth: 2 + :caption: Plugin Tutorials + :titlesonly: + + graphviz <../demos/demos_databases_apis/graphviz/graphviz.ipynb> + HyperNetX <../demos/demos_databases_apis/hypernetx/hypernetx.ipynb> + NetworkX <../demos/demos_databases_apis/networkx/networkx.ipynb> \ No newline at end of file diff --git a/docs/source/notebooks/plugins.connectors.rst b/docs/source/notebooks/plugins.connectors.rst new file mode 100644 index 000000000..c4f9dc04d --- /dev/null +++ b/docs/source/notebooks/plugins.connectors.rst @@ -0,0 +1,27 @@ +.. _nb-connectors: + +Plugins - Data Providers +============================= + +.. toctree:: + :maxdepth: 2 + :caption: Plugin Tutorials + :titlesonly: + + AlienVault: OTX indicators <../demos/demos_databases_apis/alienvault/OTXIndicators.ipynb> + AlienVault: Locker Goga <../demos/demos_databases_apis/alienvault/OTXLockerGoga.ipynb> + AlienVault: USM <../demos/demos_databases_apis/alienvault/usm.ipynb> + Amazon Neptune I <../demos/demos_databases_apis/neptune/neptune_tutorial.ipynb> + Amazon Neptune II <../demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb> + Arango <../demos/demos_databases_apis/arango/arango_tutorial.ipynb> + Databricks <../demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb> + Memgraph <../demos/demos_databases_apis/memgraph/visualizing_iam_dataset.ipynb> + NodeXL <../demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb> + Splunk <../demos/demos_databases_apis/splunk/splunk_demo_public.ipynb> + Titan <../demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb> + Neo4j - Official <../demos/demos_databases_apis/neo4j/official/graphistry_bolt_tutorial_public.ipynb> + Neo4j - Contributed <../demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb> + SQL - Postgres <../demos/demos_databases_apis/sql/postgres.ipynb> + Tigergraph: Bindings <../demos/demos_databases_apis/tigergraph/tigergraph_pygraphistry_bindings.ipynb> + Tigergraph: Fraud 
<../demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb> + Tigergraph: Social <../demos/demos_databases_apis/tigergraph/social_raw_REST_calls.ipynb> diff --git a/docs/source/notebooks/visualization.rst b/docs/source/notebooks/visualization.rst new file mode 100644 index 000000000..ab47101b4 --- /dev/null +++ b/docs/source/notebooks/visualization.rst @@ -0,0 +1,44 @@ +Visualization +========================== + + +Encodings +--------- + +.. toctree:: + :maxdepth: 2 + :caption: Visualization encoding features + :titlesonly: + + Colors <../demos/more_examples/graphistry_features/encodings-colors.ipynb> + Sizes <../demos/more_examples/graphistry_features/encodings-sizes.ipynb> + Icons <../demos/more_examples/graphistry_features/encodings-icons.ipynb> + Badges <../demos/more_examples/graphistry_features/encodings-badges.ipynb> + +Layout +------- + + +.. toctree:: + :maxdepth: 2 + :caption: Layout features + :titlesonly: + + Edge weights <../demos/more_examples/graphistry_features/edge-weights.ipynb> + Ring - categorical <../demos/more_examples/graphistry_features/layout_categorical_ring.ipynb> + Ring - continuous <../demos/more_examples/graphistry_features/layout_continuous_ring.ipynb> + Ring - time <../demos/more_examples/graphistry_features/layout_time_ring.ipynb> + Modularity weighted <../demos/more_examples/graphistry_features/layout_modularity_weighted.ipynb> + Tree <../demos/more_examples/graphistry_features/layout_tree.ipynb> + External - networkx <../demos/more_examples/graphistry_features/external_layout/networkx_layout.ipynb> + External - manual <../demos/more_examples/graphistry_features/external_layout/simple_manual_layout.ipynb> + +Accounts and Sharing +--------------------- + +.. 
toctree:: + :maxdepth: 2 + :caption: Accounts and sharing + :titlesonly: + + ../demos/more_examples/graphistry_features/sharing_tutorial.ipynb diff --git a/docs/source/performance.rst b/docs/source/performance.rst new file mode 100644 index 000000000..d6eae69ac --- /dev/null +++ b/docs/source/performance.rst @@ -0,0 +1,55 @@ +.. _performance: + +CPU & GPU Acceleration in PyGraphistry +============================================== + +Why PyGraphistry is Fast +------------------------ + +PyGraphistry is designed for speed. By focusing on **vectorized processing**, it outperforms most graph libraries on standard CPUs. When you leverage GPUs and AI models, PyGraphistry can become **100X+ faster**, enabling real-time analytics and machine learning at scale. We regularly use it on datasets with millions and billions of rows. + +Just as Apache Spark used in-memory processing to replace racks of Hadoop servers with faster and smaller multicore ones, the PyGraphistry ecosystem uses GPU acceleration to increase speeds and decrease costs even further. + +Flexible GPU Use: Client and Server +----------------------------------- + +Strictly optional, PyGraphistry lets you harness GPUs where they make the most sense for your workflow. For smaller datasets, you can run PyGraphistry on your local GPU. Graph loading, shaping, computing, querying, ML, AI, and visualization tasks all become much more interactive and immediate, making PyGraphistry great for exploration in Jupyter notebooks and dashboards. + +For larger datasets and team projects, you can offload PyGraphistry tasks like **GFQL queries** and **visualization ETL**, and even full GPU Python scripts, to shared Graphistry GPU servers. This setup handles enterprise-grade workloads, helping deliver consistent performance across web apps, dashboards, and AI pipelines. 
+ +Where PyGraphistry Accelerates with Vector Processing and GPUs +---------------------------------------------------------------- + +PyGraphistry uses vector processing and GPU acceleration throughout your data workflow. + +In data processing, it integrates with **Apache Arrow** to seamlessly transition between **pandas** for algorithmic and hardware acceleration of datasets even on CPUs, and **cuDF** (via `NVIDIA RAPIDS `_) for large, GPU-accelerated workloads, keeping your data pipelines efficient at any size. Graphistry is typically used on GPUs with 12-80 GB single-GPU RAM, and we increasingly work with teams experimenting with multi-GPU nodes (128-640 GB GPU RAM) and clusters of them. + +For graph querying, **GFQL** leverages GPUs to speed up queries on massive graph datasets, delivering results in seconds on a single GPU even when traversal steps touch hundreds of millions of rows. + +In visualization, GPUs enable PyGraphistry to render large, complex graphs in real time. Whether you're investigating cybersecurity threats, monitoring supply chains, or analyzing clickstreams, you get responsive visuals at any scale, locally or via shared servers. + +For AI and machine learning, **PyGraphistry[AI]** uses GPUs to accelerate model training and inference for tasks like **UMAP** and **GNNs**, unlocking rapid insights from large graph datasets in areas like security and commercial analytics. When running on real-time data and billions of rows, the combination of GPU training and GPU inferencing unlocks significant velocity. + +Easy Deployment Anywhere +------------------------ + +The Graphistry ecosystem fits into your existing infrastructure. + +You can `deploy Graphistry GPU servers `_ on any modern cloud platform (`AWS `_, `GCP `_, `Azure `_), and on-premises using **Docker Compose** or **Kubernetes**. PyGraphistry works with any NVIDIA GPU that is `RAPIDS-compatible `_. + +If you don't have a GPU, no problem. 
PyGraphistry is a quick `pip install graphistry` away, giving performance optimized for CPU hardware through vectorized columnar processing concepts similar to `ClickHouse `_ and `Apache Spark `_. You can also offload heavy tasks to remote Graphistry shared GPUs, including Graphistry Hub visualization servers. + + +Trusted Security & Compliance +----------------------------- + +Many top organizations with sensitive environments — including global banks and air-gapped government systems — trust PyGraphistry. Regular security practices such as periodic penetration testing ensure systems meet strict security requirements, making it safe for some of the most stringent teams. + +Next Steps +---------- + +Get started with PyGraphistry: + +- **Installation Guide**: `Set up PyGraphistry `_ . +- **Visualization**: Explore :ref:`10min`. +- **GFQL Documentation**: Start with :ref:`10min-gfql`. diff --git a/docs/source/plugins.rst b/docs/source/plugins.rst new file mode 100644 index 000000000..9ab89e084 --- /dev/null +++ b/docs/source/plugins.rst @@ -0,0 +1,117 @@ +.. _plugins: + +Plugins +======= + +PyGraphistry is frequently used with a variety of external tools such as data providers, compute engines, layout engines, and more. + +Users typically prefer to go through PyGraphistry's native dataframe support (Apache Arrow, Pandas, cuDF, ...). That is often an efficient, safe, and easy starting point. + +Occasionally, native PyGraphistry plugins streamline common operations, such as with graph databases. We link to the native API integrations below as appropriate. + +For more examples, see also the :ref:`notebook catalog `.
+ + +Databases +--------------- + +Graph +~~~~~~~~~~~ + +* `Amazon Neptune `_ (:class:`graphistry.gremlin.NeptuneMixin`) +* `ArangoDB `_ +* `Gremlin `_ (:class:`graphistry.gremlin.GremlinMixin`) +* `Memgraph `_ (:meth:`graphistry.PlotterBase.PlotterBase.cypher`) +* `Neo4j `_ (:meth:`graphistry.PlotterBase.PlotterBase.cypher`) +* `TigerGraph `_ (:meth:`graphistry.PlotterBase.PlotterBase.gsql`) +* `Trovares `_ + + +Document, Key-Value, Log, Text, and SIEM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* `Amazon DynamoDB `_ +* `Azure Cosmos DB `_ (:class:`graphistry.gremlin.CosmosMixin`) + +* `Azure Data Explorer (Kusto) `_ +* `Cassandra `_ +* `Elasticsearch `_ +* `OpenSearch `_ +* `Redis `_ +* `Splunk `_ + +SQL +~~~~~~~~~~~ + +Typically accessed via dataframe bindings + +When available, we recommend exploring for accelerated bindings via `ADBC `_ + + +* `Amazon Athena `_ +* `Databricks `_ +* `OpenSearch `_ +* `PostgreSQL `_ +* `Amazon Redshift `_ +* `BigQuery `_ +* `Snowflake `_ +* `SQL Server `_ + + +Compute engines +---------------- + +Natively supported in methods such as :meth:`.nodes() ` and :meth:`.edges() `: + +* `Apache Spark `_ +* `Pandas `_ +* `cuDF `_ + +Partial native support: + +* `cuML `_ +* `Dask `_ +* `Dask-cuDF `__ + +Accelerated interop via `Apache Arrow `_ or Parquet: + +* `DuckDB `_ +* `Polars `_ +* `Spark `_ + +Graph layout and analytics +--------------------------- + +* :ref:`cugraph `: GPU-accelerated graph analytics +* :ref:`graphviz `: CPU graph analytics and layouts +* :ref:`igraph `: CPU graph analytics and layouts +* :ref:`networkx `: CPU graph analytics and layouts + + +Tools +--------- + +We are constantly experimenting, feel free to add: + +* OWASP Amass + +Storage engines and file formats +--------------------------------- + +GPU-accelerated readers via `cuDF `_ (in-memory single-GPU) and `Dask-cuDF `__ (bigger-than-memory, multi-GPU): + +* Arrow +* CSV +* JSON +* JSONL +* LOG +* ORC +* Parquet +* TXT + +Others, often via `fsspec `_: + 
+* Azure blobstore +* GML +* S3 +* XLS(X) diff --git a/docs/source/server/index.rst b/docs/source/server/index.rst new file mode 100644 index 000000000..d6bae8600 --- /dev/null +++ b/docs/source/server/index.rst @@ -0,0 +1,13 @@ +Login and Share +================= + +PyGraphistry streamlines working with optional Graphistry server capabilities such as GPU-accelerated visual analytics, sharing visualizations, simplifying graph pipelines, GFQL compute endpoints, and sharing GPU resources. + +Server interactions are typically by first logging in (`graphistry.register()`) and then sending data, such as via `g.plot()`. You can set access control policies on all of your uploaded data via `graphistry.privacy()`. Read on for more on both. + +.. toctree:: + :maxdepth: 2 + + register + privacy + diff --git a/docs/source/server/privacy.rst b/docs/source/server/privacy.rst new file mode 100644 index 000000000..055cf1217 --- /dev/null +++ b/docs/source/server/privacy.rst @@ -0,0 +1,155 @@ +Sharing and Access Control +========================== + +Graphistry provides powerful tools for visualizing and sharing graph data securely. Understanding how to manage privacy settings and share visualizations appropriately is essential for collaborative work and data security. This guide will help you understand how to control privacy settings using the Graphistry API. For more examples, see the `Sharing Tutorial Notebook `_. + +Overview of Privacy Settings +---------------------------- + +You have full control over who can view or edit your visualizations. By default, Graphistry visualizations are **public** but **unlisted**, meaning you need to have been given the secret ID of the visualization to know where it is, but do not need to log in to see it. Privacy settings can be adjusted when you create a plot using the `plot()` method. + +Key privacy levels include: + +- **Private**: Only you can view the visualization. 
+- **Organization (`"org"`)**: Anyone in your organization can view the visualization. +- **Public** (**unlisted**): Anyone with the link can view the visualization. Graphistry does not make the list of visualizations public, so this is the equivalent of the **unlisted** privacy mode in many platforms. +- **Custom Sharing**: Share with individual users (requires additional configuration). + +When sharing with others, you may also configure settings such as `viewer` vs `editor`. + +Getting Started with Privacy: Public (unlisted) +------------------------------------------------ + +Before adjusting privacy settings, ensure you have registered with Graphistry: + +.. code-block:: python + + import graphistry + + graphistry.register(api=3, username='my_username', password='my_password') + +By default, any plot you create is public (unlisted), meaning others will not know about your visualization, but if you share a link to it, they can see it without logging in. + +Creating a Private Visualization +-------------------------------- + +You can set a visualization to a stricter mode by calling `graphistry.privacy()`: + +.. code-block:: python + + graphistry.privacy() + + + # Sample data + edges = pd.DataFrame({ + 'src': ['A', 'B', 'C'], + 'dst': ['B', 'C', 'A'] + }) + + # Create a private plot + plot_url = graphistry.edges(edges, 'src', 'dst').plot(render=False) + + print(f"Private visualization URL: {plot_url}") + +If you are logged into your personal account, only you can access this plot. If you are logged into an organization, the visualization will be private to organization members. When anyone else obtains the URL, they won't be able to view it until you adjust the privacy settings. + +Sharing Visualizations Within Your Organization +----------------------------------------------- + +To share a visualization with members of your organization: + +..
code-block:: python + + graphistry.privacy(mode='organization') + + # Create an organization-shared plot + plot_url = graphistry.edges(edges, 'src', 'dst').plot(render=False) + + print(f"Organization-shared visualization URL: {plot_url}") + +Now, anyone within your organization who has access to Graphistry can view the plot using the provided URL. + +Making Visualizations Public +---------------------------- + +To make a visualization accessible to anyone with the link: + +.. code-block:: python + + graphistry.privacy(mode='public') + + # Create a public plot + plot_url = graphistry.edges(edges, 'src', 'dst').plot(render=False) + + print(f"Public visualization URL: {plot_url}") + +This setting is useful when sharing with external collaborators or embedding visualizations in public websites. + +Controlling Edit Permissions +---------------------------- + +By default, shared visualizations are editable by same-org members. To allow others to edit or interact with the visualization settings, or set to read-only, you can reconfigure the policy: + +.. code-block:: python + + VIEW = '10' + EDIT = '20' + graphistry.privacy(mode='organization', mode_action=EDIT) + + # Allow others to edit the plot + plot_url = graphistry.edges(edges, 'src', 'dst').plot(render=False) + + print(f"Editable visualization URL: {plot_url}") + + +Understanding Privacy Levels +---------------------------- + +- **Private**: Only accessible to the creator. +- **Organization (`"org"`)**: Accessible to all users within your Graphistry organization. +- **Public**: Unlisted in any public index, but accessible to anyone with the link. Use cautiously, as this allows broad access. +- **Custom**: Advanced configurations for sharing with specific users. + +Best Practices for Data Privacy +------------------------------- + +- **Use Organization Sharing for Internal Collaboration**: Keeps data within your company's control. 
+- **Limit Public Sharing**: Only make visualizations public if the data is non-sensitive and intended for broad distribution. +- **Regularly Review Shared Visualizations**: Periodically check which visualizations are shared and adjust privacy settings as needed. +- **Use Secure Methods for Sharing Links**: When sharing URLs, use secure channels to prevent unauthorized access. + +Advanced Features +------------------------------------------------------ + +Look at the documentation and tutorial for individual parameters for more advanced usage modes: + +- Invite individual users, including with optional notification emails, using parameters `invited_users` and `notify` + +- Use nested privacy settings (`g2 = g1.privacy()`) + +Additional Resources +-------------------- + +For more detailed examples and advanced features, refer to the **Graphistry Sharing Tutorial** available in the official documentation or GitHub repository. + +- **Sharing Tutorial Notebook**: `https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb` + +This tutorial covers topics such as: + +- Creating custom share links +- Embedding visualizations in web applications +- Using access tokens for secure sharing +- Advanced privacy configurations + +Conclusion +---------- + +Managing privacy and sharing settings in Graphistry is straightforward and flexible. By understanding and utilizing these features, you can securely collaborate with others while maintaining control over your data. + +Remember to: + +- Choose the appropriate privacy level for your needs. +- Be cautious when making visualizations public. +- Regularly audit your shared visualizations. +- Use `graphistry.privacy()` to stay informed about your data handling. 
+ diff --git a/docs/source/server/register.rst b/docs/source/server/register.rst new file mode 100644 index 000000000..5cc7512d3 --- /dev/null +++ b/docs/source/server/register.rst @@ -0,0 +1,238 @@ +API authentication to Graphistry servers +======================================== + +`graphistry.register()` is the global method to authenticate your Graphistry client. It sets up your API credentials, specifies the server to connect to, and configures authentication settings. This function should be called before making any Graphistry API calls that use the server such as `.plot()`. + +Underneath, it manages use of JWT session tokens over the Graphistry REST API. Likewise, it streamlines using advanced optional modes such as SSO. + +Basic Usage +----------- + +To register, import Graphistry and call `graphistry.register()`: + +.. code-block:: python + + import graphistry + + # Register with default Graphistry Hub using username/password + graphistry.register(api=3, username="my_username", password="my_password") + +By default, this connects to **Graphistry Hub** (`hub.graphistry.com`) using the `https` protocol and sets `api=3` for the latest API version. You can override the server, authentication details, and other settings as needed. + +Core Concepts +------------- + +Personal Accounts vs Organizational Accounts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- **Personal Accounts**: Meant for individual use, typically on Graphistry Hub. +- **Organizational Accounts**: Managed with roles and permissions, often in an enterprise context. + +.. code-block:: python + + user_info = graphistry.user() + print(user_info.get("organization")) # Returns organization info or None + +Server Configuration +~~~~~~~~~~~~~~~~~~~~~ + +- **Default Server**: By default, `graphistry.register()` connects to the **Graphistry Hub**, including the **free GPU tier** for visual analytics. 
+- **Custom Server**: If using a private deployment, specify the `server` argument to connect to your custom server. + +.. code-block:: python + + # Connect to a custom server + graphistry.register( + api=3, + server="my_custom_graphistry_server.com", + username="my_username", + password="my_password" + ) + +Protocol Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +- **TLS (HTTPS)**: Communication uses `https` by default for secure communication. +- **Non-TLS (HTTP)**: If your server doesn't support TLS, set the `protocol` parameter to `"http"`. + +.. code-block:: python + + # Use HTTP protocol without TLS + graphistry.register( + api=3, + protocol="http", + server="my_custom_graphistry_server.com", + username="my_username", + password="my_password" + ) + +Authentication Methods +~~~~~~~~~~~~~~~~~~~~~~~ + +`graphistry.register()` supports several authentication methods: + +1. **Username & Password**: + + .. code-block:: python + + graphistry.register(api=3, username="my_username", password="my_password") + +2. **Personal Key ID & Secret** (for scripts or automation): + + .. code-block:: python + + graphistry.register(api=3, personal_key_id="my_key_id", personal_key_secret="my_key_secret") + +3. **Single Sign-On (SSO)** (for enterprise users): + + .. code-block:: python + + graphistry.register(api=3, idp_name="my_idp_name", sso_opt_into_type="browser") + + SSO authentication options: `sso_opt_into_type` can be `"browser"`, `"display"`, or `None` (default is print). + +Routing Configuration +~~~~~~~~~~~~~~~~~~~~~ + +- **Server Routing**: By default, server API and browser UI requests route through the same `server`. +- **Custom Browser Routing**: Override browser routing via `client_protocol_hostname`. + +.. 
code-block:: python + + # Override browser routing + graphistry.register( + api=3, + server="my_api_server.com", + username="my_username", + password="my_password", + client_protocol_hostname="https://my_ui_server.com" + ) + +Advanced Features +----------------- + + +JWT Session Handling +~~~~~~~~~~~~~~~~~~~~ + +`graphistry.register()` establishes a **JWT session** after authentication. The session token is managed automatically for future API calls. + +Retrieving the Current JWT Token +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To retrieve the current JWT token, you can use the following command after registering: + +.. code-block:: python + + # Get the current JWT token + current_token = graphistry.api_token() + print(current_token) + +The token is automatically refreshed as needed during the session. + + +Detailed Parameter Reference +---------------------------- + +- **username** *(Optional[str])*: Your Graphistry account username. +- **password** *(Optional[str])*: Your Graphistry account password. +- **personal_key_id** *(Optional[str])*: Your personal key ID for secure access. +- **personal_key_secret** *(Optional[str])*: Corresponding personal key secret. +- **server** *(Optional[str])*: The URL of the Graphistry server to connect to (e.g., `hub.graphistry.com` or a custom server). +- **protocol** *(Optional[str])*: The protocol to use (`https` or `http`), defaults to `https`. +- **api** *(Optional[int])*: The API version to use (always set to `3`). +- **client_protocol_hostname** *(Optional[str])*: Overrides the browser protocol/hostname. +- **org_name** *(Optional[str])*: Organization name for SSO authentication. +- **idp_name** *(Optional[str])*: Identity Provider (IdP) for SSO. +- **sso_opt_into_type** *(Optional[str])*: How to display the SSO URL (`"browser"`, `"display"`, or `None`). + +Examples +---------------------- + +Register with Username and Password +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + import graphistry + + graphistry.register( + api=3, + username="my_username", + password="my_password" + ) + +Register with Personal Key ID and Secret +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import graphistry + + graphistry.register( + api=3, + personal_key_id="my_key_id", + personal_key_secret="my_key_secret" + ) + +Register with SSO (Organization with Specific IdP) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import graphistry + + graphistry.register( + api=3, + org_name="my_org_name", + idp_name="my_idp_name", + sso_opt_into_type="browser" + ) + +Register with Custom Server and Protocol +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import graphistry + + graphistry.register( + api=3, + protocol="http", + server="my_custom_server.com", + username="my_username", + password="my_password" + ) + +Register with Custom Browser Routing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + import graphistry + + graphistry.register( + api=3, + server="my_api_server.com", + username="my_username", + password="my_password", + client_protocol_hostname="https://my_ui_server.com" + ) + +--- + +Best Practices +-------------- + +- **Security**: Always use secure protocols (`https`) and validate certificates. +- **Authentication**: Use `personal_key_id` and `personal_key_secret` for automation. +- **SSO**: For organizations, ensure correct `org_name` and, if needed, `idp_name`. +- **Session Management**: The library handles session tokens automatically; ensure safe credential handling when enabling memory storage. + +Troubleshooting +--------------- + +- **Connection Errors**: Check the `server` and `protocol` parameters and ensure your network allows access. +- **Authentication Failures**: Verify credentials. For SSO, ensure `org_name` and `idp_name` are correct. 
+- **SSL Issues**: Validate that the server certificate is valid or consider disabling SSL validation (`certificate_validation=False`), though not recommended. + diff --git a/docs/source/static/favicon.ico b/docs/source/static/favicon.ico new file mode 100644 index 000000000..a8d22f5c5 Binary files /dev/null and b/docs/source/static/favicon.ico differ diff --git a/docs/source/static/graphistry.css b/docs/source/static/graphistry.css new file mode 100644 index 000000000..79671e64e --- /dev/null +++ b/docs/source/static/graphistry.css @@ -0,0 +1,26 @@ +html[data-theme="light"] { + --pst-color-primary: #1f8e9f; + --pst-color-secondary: #25ba8d; +} + +html[data-theme="dark"] { + --pst-color-primary: #32a8bb; + --pst-color-secondary: #48daae; +} + + +html[data-theme="dark"] .highlight .c1 { + color: #32a8bb !important; +} + +html[data-theme="dark"] .highlight .s1, +html[data-theme="dark"] .highlight .s2, +html[data-theme="dark"] .highlight .o, +html[data-theme="dark"] .highlight .sa { + color: #38e3b9 !important; +} + +html[data-theme="dark"] .highlight .mi, +html[data-theme="dark"] .highlight .nb { + color: #ff9f54 !important; +} \ No newline at end of file diff --git a/docs/source/support.rst b/docs/source/support.rst new file mode 100644 index 000000000..2b6e73d6b --- /dev/null +++ b/docs/source/support.rst @@ -0,0 +1,14 @@ +.. _support: + +Support +===================== + +Stuck or thinking about a new project? Let's chat! + +- `Get Started `_ +- `Blog `_ +- `Slack Channel `_ +- `Zendesk Support `_ +- `GitHub `_ +- `Twitter `_ +- `LinkedIn `_ diff --git a/docs/source/versioneer.rst b/docs/source/versioneer.rst deleted file mode 100644 index 1f5d4bae4..000000000 --- a/docs/source/versioneer.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. versioneer module -.. ================= -.. 
toctree:: - :maxdepth: 2 - - graphistry.plugins_types diff --git a/docs/source/visualization/10min.rst b/docs/source/visualization/10min.rst new file mode 100644 index 000000000..7ab562b4d --- /dev/null +++ b/docs/source/visualization/10min.rst @@ -0,0 +1,236 @@ +.. _10min-viz: + +10 Minutes to Graphistry Visualization +====================================== + +This guide covers core visualization topics like the difference between uploading and viewing graphs, how the client/server architecture works, and how to use PyGraphistry's fluent API to create powerful visualizations by combining ideas like encodings, layouts, and settings. Finally, we overview how to embed visualizations into different workflows. + +Key Concepts +------------ + +- :ref:`Client/Server Architecture ` +- :ref:`Fluent API Style ` +- :ref:`Shaping Your Data ` +- :ref:`Layouts ` +- :ref:`Node & Edge Encodings ` +- :ref:`Global URL settings ` +- :ref:`Plotting: Inline and URL Rendering ` +- :ref:`Additional Resources ` + + +.. _client-server-architecture: + +Client/Server Architecture: Uploading vs. Serving vs. Viewing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +PyGraphistry uses a **client-server** model. By separating the uploader, server, and viewer, we can achieve better performance, new capabilities, and a variety of usage modes. + +- **Upload Client**: In your local environment, you can shape data and call the Graphistry API to upload it to a server (self-hosted or `Graphistry Hub `__). +- **Visualization Server**: The server processes the data using GPU acceleration to handle large graphs. +- **Visualization Client**: The graph is then explored in your browser, where interactions like zooming and filtering are handled smoothly by using local and remote GPU resources as appropriate. + +This split architecture allows scalable, high-performance visualization for even the largest datasets. + + +.. 
_fluent-api-style: + +Fluent API Style +~~~~~~~~~~~~~~~~ + +PyGraphistry uses a **fluent style** API, which means that methods can be chained together. This allows for concise and readable code without an extensive setup: + +.. code-block:: python + + g1 = graphistry.edges(df, 'src', 'dst') + g2 = g1.nodes(df2, 'n') + g3 = g2.encode_point_size('score') + g3.plot() + + # As shorter fluent lines + g = graphistry.edges(df, 'src', 'dst').nodes(df2, 'n') + g.encode_point_size('score').plot() + +This approach lets you layer operations as needed, keeping code light and intuitive. + + +.. _shaping-your-data: + +Shaping Your Data +----------------- + +PyGraphistry supports flexible shaping of your graph data: + +- **`.edges()` & `.nodes()`**: Define edges between entities and optional node attributes + + .. code-block:: python + + # df[['src', 'dst', ...]] + graphistry.edges(df, 'src', 'dst').plot() + + # ... + df2[['n', ...]] + graphistry.edges(df, 'src', 'dst').nodes(df2, 'n').plot() + +- **Hypergraph**: Use multiple columns for nodes for more complex visualizations + + .. code-block:: python + + # df[['actor', 'event', 'location', ...]] + hg = graphistry.hypergraph(df, ['actor', 'event', 'location']) + hg['graph'].plot() + +- **UMAP**: Dimensionality reduction & embedding visualization tool based on row similarity + + .. code-block:: python + + # df[['score', 'time', ...]] + graphistry.nodes(df).umap(X=['score', 'time']).plot() + +These methods ensure you can quickly load & shape data and move into visualizing. + +.. _layouts: + +Layouts +------- + +PyGraphistry's :ref:`Layout catalog ` provides many options, covering: + +- **Live Layout**: Graphistry performs GPU-accelerated force-directed layouts at interaction time. + You can adjust settings, such as gravity, edge weight, and initial clustering time: + + .. 
code-block:: python + + g.settings(url_params={'play': 7000, 'info': True}).plot() + +- **PyGraphistry Layouts**: PyGraphistry ships with special layouts unavailable elsewhere and that work with the rendering engine's special features: + + .. code-block:: python + + g.time_ring_layout('time_col').plot() + +- **Plugin Layouts**: Integrated use of external libraries for specific layouts: + + - :ref:`Graphviz ` for hierarchical and directed layouts such as the `"dot"` engine + - :ref:`cuGraph ` for GPU-accelerated FA2, a weaker version of Graphistry's live layout + - :ref:`igraph ` for CPU-based layouts, similar to GraphViz and with layouts that focus more on medium-sized social networks + +- **External Layouts**: Pass in `x`, `y` columns, such as from your own edits, external data, or external ML/AI packages: + + .. code-block:: python + + # nodes_df[['x', 'y', 'n', ...]] + g = graphistry.edges(e_df, 's', 'd').nodes(nodes_df, 'n') + g2 = g.settings(url_params={'play': 0}) # skip initial loadtime layout + g2.plot() + + +.. _node-edge-encodings: + +Node & Edge Encodings +--------------------- + +You can encode your graph attributes visually using colors, sizes, icons, and more: + +* **Direct Encoding**: Set attributes like color directly on nodes or edges. + + .. code-block:: python + + g.encode_point_color('type', categorical_mapping={'A': 'red', 'B': 'blue'}).plot() + +* **Categorical & Continuous Mappings**: Handle both discrete and continuous data: + + .. 
code-block:: python + + g.encode_point_color('score', ['blue', 'yellow', 'red'], as_continuous=True).plot() + +* **Encodings List**: Beyond colors, you can also adjust edge thickness, node icon, and add badges using the following methods: + + * Points: + + - :meth:`graphistry.PlotterBase.PlotterBase.encode_point_badge` + - :meth:`graphistry.PlotterBase.PlotterBase.encode_point_color` + - :meth:`graphistry.PlotterBase.PlotterBase.encode_point_icon` + - :meth:`graphistry.PlotterBase.PlotterBase.encode_point_size` + + * Edges: + + - :meth:`graphistry.PlotterBase.PlotterBase.encode_edge_badge` + - :meth:`graphistry.PlotterBase.PlotterBase.encode_edge_color` + - :meth:`graphistry.PlotterBase.PlotterBase.encode_edge_icon` + +* **Bind**: Simpler data-driven settings are done through :meth:`graphistry.PlotterBase.PlotterBase.bind`: + + .. code-block:: python + + g.bind(point_title='my_node_title_col') + + + Where: + + .. code-block:: python + + bind(source=None, destination=None, node=None, edge=None, + edge_title=None, edge_label=None, edge_color=None, edge_weight=None, + edge_size=None, edge_opacity=None, edge_icon=None, + edge_source_color=None, edge_destination_color=None, + point_title=None, point_label=None, point_color=None, point_weight=None, + point_size=None, point_opacity=None, point_icon=None, point_x=None, point_y=None + ) + +.. _url-settings: + +Global URL settings +------------------- + +Graphistry visualizations are highly configurable via URL parameters. You can control the look, interaction, and data filters: + +.. code-block:: python + + g.settings(url_params={'play': 7000, 'info': True}).plot() + +For a complete list of parameters, refer to the `official REST URL params page `__. + +.. _plot: + +Plotting: Inline and URL Rendering +---------------------------------- + +Once you're ready to visualize, use `.plot()` to render: + +- **Inline Plotting**: Directly embed interactive visualizations in your notebook or Python environment: + + .. 
code-block:: python + + g.plot() + +- **URL Rendering**: Get a sharable and embeddable URL to view in the browser: + + .. code-block:: python + + url = g.plot(render=False) + print(f"View your graph at: {url}") + + You can further control the embedded visualization using URL parameters and JavaScript + +.. _extra: + + +Next Steps +---------- + +- :ref:`10 Minutes to GFQL <10min-gfql>`: Use GFQL to query and manipulate your graph data before visualization. +- :ref:`Layout guide `: Explore different layouts for your visualizations. +- :ref:`Plugins `: Discover more ways to connect to your data and work with your favorite tools. +- :ref:`Layout catalog `: Dive deeper into the layout options available in PyGraphistry. +- :ref:`PyGraphistry API Reference ` + +External Resources +-------------------- + +To dive deeper into graph analytics and visualizations, check out the following resources: + +- `Graphistry Get Started `__ +- `GraphistryJS Clients: NodeJS, React, & Vanilla `__ +- `Graphistry GitHub `__ +- `Slack Community `__ + +Happy graphing! diff --git a/docs/source/visualization/index.rst b/docs/source/visualization/index.rst new file mode 100644 index 000000000..f453d93dd --- /dev/null +++ b/docs/source/visualization/index.rst @@ -0,0 +1,15 @@ +Visualize +============= + +We recommend getting started with :ref:`10 Minutes to PyGraphistry <10min>`, :ref:`10 Minutes to Graphistry Visualization<10min-viz>`, and the :ref:`layout guide ` + +See also: + +.. toctree:: + :maxdepth: 1 + + 10min + uiguide + layout/intro + layout/catalog + layout/settings diff --git a/docs/source/visualization/layout/catalog.rst b/docs/source/visualization/layout/catalog.rst new file mode 100644 index 000000000..9179aeb69 --- /dev/null +++ b/docs/source/visualization/layout/catalog.rst @@ -0,0 +1,161 @@ +..
_layout-catalog: + +PyGraphistry Layout Catalog +============================ + +This page provides an overview of the main layouts available in PyGraphistry, including through plugins like graphviz and igraph. Each optimizes for different use cases. Click on a plugin to jump to its section. + +- :ref:`PyGraphistry Plugin `: GPU-accelerated layouts like ForceAtlas2, modularity-weighted, UMAP, and more. +- :ref:`cuGraph Plugin `: Large-scale graph layouts with GPU-optimized ForceAtlas2. +- :ref:`Graphviz Plugin `: Hierarchical, directed, and flowchart-like layouts for medium-sized graphs. +- :ref:`igraph Plugin `: Versatile 2D/3D layouts including Fruchterman-Reingold, Kamada-Kawai, and more. +- :ref:`Custom Layouts `: Manually compute or post-process custom layouts. + +.. _pygraphistry-plugin: + +PyGraphistry Plugins +--------------------- + +PyGraphistry supports GPU-accelerated layouts, including ForceAtlas2, modularity-weighted algorithms, and hierarchical ring layouts for large-scale and specialized structures. (:ref:`API reference on Graphistry layouts `) + +**Supported Layouts**: + +- **ForceAtlas2** — Optimized for large, dense graphs. Provides smooth clustering and cluster separation using GPU acceleration. (Implicit: Dynamic load-time run of Graphistry's GPU-accelerated ForceAtlas2) +- **Modularity-Weighted** — Lays out clusters based on modularity, optimizing for visualizing community structures. :ref:`API info on modularity-weighted layouts ` +- **Group-In-A-Box (GIB)** — Organizes nodes into visually distinct boxes based on their group or cluster for clear structure definition. :ref:`API info on group-in-a-box layouts ` +- **UMAP** — Reduces high-dimensional data into a 2D layout based on similarity, best for complex datasets needing dimensionality reduction. :py:meth:`API info on UMAP ` +- **Hierarchical Ring Layouts** — Creates ring layouts that categorize nodes by time, continuous variables, or categorical properties. 
:ref:`API info on ring layouts ` + +**Example**: + +Visit the :ref:`PyGraphistry visualization tutorial <10min-viz>`. + +.. code-block:: python + + g.time_ring_layout('time_col').plot() + +.. _cugraph-plugin: + +cuGraph Plugin +--------------- + +cuGraph provides one GPU-optimized graph layout for scaling large datasets, making it a candidate for massive graphs. (:ref:`API reference on cuGraph `) + +**Supported Layouts**: + +- **ForceAtlas2** — Designed for very large graphs, scaling with GPU acceleration to maintain interactive performance with 100k+ nodes. Less flexible version of the Graphistry ForceAtlas2 GPU algorithm. + +.. code-block:: python + + g.cugraph_layout('force_atlas2').plot() + +.. _graphviz-plugin: + +Graphviz Plugin +---------------- + +Graphviz specializes in directed and hierarchical layouts, useful for flowcharts, dependency trees, and acyclic graphs (DAGs). (:ref:`API reference on graphviz layouts `) + +**Supported Layouts**: + +- **acyclic** — Removes cycles from directed graphs by reversing edges to make the graph acyclic, useful for processing DAGs. +- **ccomps** — Extracts the connected components from a graph and outputs them as subgraphs. +- **circo** — Circular layout, arranging nodes in a radial fashion, ideal for cycle graphs. +- **dot** — Best for directed acyclic graphs (DAGs) like flowcharts, laying out hierarchies in a top-down manner. +- **fdp** — General force-directed layout, good for smaller undirected graphs. +- **gc** — Used for graph coloring, assigning colors to nodes such that no two adjacent nodes have the same color. +- **gvcolor** — Colorizes graphs based on specific attributes, often used for improving visual distinctions between nodes. +- **gvpr** — Graph pattern scanning and rewriting tool used for scripting changes in a graph, allowing custom manipulation of graph structures. +- **neato** — Force-directed layout for undirected graphs, suitable for smaller networks. 
+- **nop** — A no-op layout that performs no layout calculations, often used as a placeholder or for manual layout adjustments. +- **osage** — Array-based layout for clustered graphs, recursively packing the contents of each cluster. +- **patchwork** — Visualizes hierarchical clusters as a nested set of rectangles, similar to a treemap visualization. +- **sccmap** — Finds the strongly connected components in a graph and generates a reduced graph of those components. +- **sfdp** — Force-directed layout optimized for large graphs, providing fast and scalable rendering. +- **tred** — Transitive reduction algorithm that minimizes the number of edges while maintaining reachability between nodes in a directed graph. +- **twopi** — Radial layout that positions nodes in concentric circles, useful for radial hierarchies. +- **unflatten** — Improves readability by adjusting node levels to reduce overlap in hierarchical graphs. + +**Example**: + +Visit the :ref:`API reference on graphviz page ` for more examples. + +.. code-block:: python + + g.layout_graphviz('dot').plot() + +.. _igraph-plugin: + +igraph Plugin +--------------- + +The igraph plugin offers a wide range of layouts for many graph types. (:ref:`API reference on igraph `) + +**Supported Layouts**: + +- **auto / automatic** — Automatically chooses the best layout for the given graph based on its structure and size. +- **bipartite** — Positions nodes in two layers, useful for visualizing bipartite graphs (graphs with two distinct sets of nodes). +- **circle / circular** — Positions nodes in a circular layout, suitable for visualizing cycles and small networks. +- **circle_3d / circular_3d** — 3D version of the circular layout, positioning nodes in a 3D circular structure. +- **davidson_harel / dh** — Force-directed layout algorithm with an iterative approach for improving graph aesthetics, especially useful for smaller graphs. +- **drl** — Distributed Recursive Layout, a force-directed layout algorithm optimized for very large graphs. 
+- **drl_3d** — 3D version of the DRL algorithm, optimized for large graphs in a 3D space. +- **fr / fruchterman_reingold** — Force-directed layout balancing attractive and repulsive forces for clustered yet separated nodes. +- **fr_3d / fruchterman_reingold_3d / fr3d** — 3D version of the Fruchterman-Reingold force-directed layout. +- **grid** — Organizes nodes in a grid structure, useful for matrix-like data. +- **grid_3d** — 3D version of the grid layout, positioning nodes in a 3D grid. +- **graphopt** — Another force-directed layout algorithm, known for its fast convergence on small to medium-sized graphs. +- **kk / kamada_kawai** — Similar to Fruchterman-Reingold, this force-directed layout focuses on preserving geometric distances between nodes. +- **kk_3d / kamada_kawai_3d / kk3d** — 3D version of the Kamada-Kawai algorithm, preserving distances between nodes in a 3D space. +- **lgl / large / large_graph** — Optimized for very large graphs, often used for graphs with thousands of nodes. +- **mds** — Multi-Dimensional Scaling, used for dimensionality reduction and projecting nodes into 2D or 3D space based on similarity. +- **random / random_3d** — Randomly positions nodes in 2D or 3D space, often used for testing or debugging layout algorithms. +- **reingold_tilford / rt / tree** — Specialized for tree structures, arranging nodes hierarchically from top to bottom. +- **reingold_tilford_circular / rt_circular** — Circular version of the Reingold-Tilford tree layout, arranging tree nodes in a radial fashion. +- **sphere / spherical** — 3D layout positioning nodes on the surface of a sphere, useful for 3D graph exploration. +- **star** — Positions nodes in a star configuration, with a central node surrounded by peripheral nodes. +- **sugiyama** — Specialized for hierarchical structures, often used for organizational charts and trees. + +Full list: :ref:`More Info ` + +**Example**: + +Visit the :ref:`API reference on igraph ` for more examples. + +.. 
code-block:: python + + g.layout_igraph('circle').plot() + +.. _custom-layouts: + +Custom Layouts +--------------- + +Users can manually compute layouts from external sources or post-process the results. This allows flexibility in integrating custom embedding algorithms or other specialized layouts into PyGraphistry. (`API reference `_) + +**Example**: + +Manually apply a layout and visualize it, as shown in the `custom layouts notebook <../../demos/more_examples/graphistry_features/external_layout/simple_manual_layout.ipynb>`_. + +.. code-block:: python + + # Input: Precompute some x and y positions + nodes_df : pd.DataFrame = ... + assert 'x' in nodes_df.columns and 'y' in nodes_df.columns + + g2 = (g1 + .nodes(nodes_df) + .bind(point_x='x', point_y='y') + .settings(url_params={'play': 0}) # Prevent loadtime layout from running + ) + +Further reading +---------------- + +- :ref:`PyGraphistry API Reference `: GPU-accelerated layouts such as ForceAtlas2, modularity-weighted, hierarchical rings, UMAP, and group-in-a-box. +- :ref:`cuGraph API Reference `: ForceAtlas2 optimized for large-scale graphs using GPU acceleration. +- :ref:`Graphviz API Reference `: Best for hierarchical and flowchart/DAG layouts, including options like dot, neato, and circo. +- :ref:`igraph API Reference `: Versatile with 2D/3D layouts, including Fruchterman-Reingold, Kamada-Kawai, and Sugiyama. + + +Visit the respective tutorial links to dive deeper into each plugin’s capabilities and usage. diff --git a/docs/source/visualization/layout/intro.rst b/docs/source/visualization/layout/intro.rst new file mode 100644 index 000000000..e01d97588 --- /dev/null +++ b/docs/source/visualization/layout/intro.rst @@ -0,0 +1,160 @@ +.. 
_layout-guide: + +Quick Guide to PyGraphistry layouts +=================================== + +This guide provides a quick introduction to key layout concepts in PyGraphistry. + +Key Concepts Covered +-------------------- + +- :ref:`Precomputed Layouts ` +- :ref:`Internal & Plugin Layouts ` +- :ref:`Runtime Dynamic Layouts ` +- :ref:`Runtime Layout Settings ` + +- Further reading and detailed configuration options for: + + - :ref:`Ring Layout API ` + - :ref:`GIB Layout API ` + - :ref:`Modularity Layout API ` + - Plugin layouts: :ref:`GraphViz `, :ref:`cuGraph `, :ref:`iGraph ` + +---- + +Key Concepts +------------ + +.. _precomputed-layouts: + +Precomputed Layouts +~~~~~~~~~~~~~~~~~~~ + +Precomputed layouts involve manually calculating node positions (`x`, `y` columns) before rendering your graph. + +This is useful when you need to manually control a layout or are visualizing externally provided positions, such as those from embeddings. + + .. code-block:: python + + # Precomputed 'x', 'y' coordinates in a nodes DataFrame + g = graphistry.edges(e_df, 'src', 'dst').nodes(n_df, 'n') + g2 = g.settings(url_params={'play': 0}) # skip initial loadtime layout + g2.plot() + +Precomputed layouts are ideal for handling complex visualizations where precision is key. + +.. _internal-plugin-layouts: + +Internal & Plugin Layouts +~~~~~~~~~~~~~~~~~~~~~~~~~ + +PyGraphistry includes a growing number of built-in layouts. + +These help with several scenarios, including: + +* Faster performance and greater scale +* Leveraging Graphistry runtime layout features +* Combining layouts + +**Graphistry Layouts:** + +- **Native Force-Directed Layout:** PyGraphistry’s default layout automatically arranges the nodes based on their connectivity. + + .. code-block:: python + + g = graphistry.edges(e_df, 'src', 'dst').plot() + +- **Ring Layout:** Ideal for visualizing sorted, hierarchical, or time-based data. + + .. 
code-block:: python + + g.time_ring_layout('my_timestamp').plot() + g.categorical_ring_layout('my_type').plot() + g.continuous_ring_layout('my_score').plot() + + For further details, refer to the :ref:`Ring Layout API `. + +- **Modularity Weighted Layout:** Weights edges based on modularity. + + .. code-block:: python + + # Separate by precomputed modules + assert 'partition' in g._nodes + g.modularity_weighted_layout(community_col='partition').plot() + + # Separate by automatically computed modules + g.modularity_weighted_layout(community_alg='louvain', engine='cudf').plot() + + Read more in the :ref:`Modularity Layout API `. + +- **Group-in-a-Box Layout:** Groups nodes into a grid of clusters. + + Popularized by NodeXL for analyzing large social networks, the PyGraphistry version enables quickly working with larger datasets than possible in other packages. + + .. code-block:: python + + g.gib_layout().plot() + + Learn more in the :ref:`Group-in-a-Box Layout API `. + +**Plugin Layouts:** + +- **cuGraph Plugin (GPU-accelerated force layouts):** Ideal for large-scale graphs requiring performance. + + .. code-block:: python + + g.layout_cugraph('force_atlas2').plot() + + See the :ref:`cuGraph Plugin ` for more details. + +- **GraphViz Plugin (Hierarchical layouts):** Great for tree-like or hierarchical data. + + .. code-block:: python + + g.layout_graphviz('dot').plot() + + Find more details in the :ref:`GraphViz Plugin `. + +- **iGraph Plugin (Kamada-Kawai, Sugiyama, etc.):** Provides classic layout algorithms for a variety of graph types. + + .. code-block:: python + + g.layout_igraph('kamada_kawai').plot() + + See the :ref:`iGraph Plugin ` for more information. + +.. _runtime-dynamic-layouts: + +Runtime Dynamic Layouts +~~~~~~~~~~~~~~~~~~~~~~~ + +Dynamic layouts allow PyGraphistry to adjust node positions in real-time based on user interactions and graph updates. This provides highly interactive and scalable graph visualizations. + + .. 
code-block:: python + + # Run the force-directed layout at viz load time for 5 seconds (5,000 milliseconds) + g = graphistry.edges(e_df, 'src', 'dst') + g.settings(url_params={'play': 5000}).plot() + +For details on runtime settings and customization, explore the :ref:`Layout Settings ` page. + +---- + +Further Reading +--------------- + +Layout in general: + +- :ref:`Layout Catalog ` +- :ref:`Layout Settings ` + +Individual layouts and plugins: + +- :ref:`Ring Layout API ` +- :ref:`GIB Layout API ` +- :ref:`Modularity Layout API ` +- :ref:`GraphViz Plugin ` +- :ref:`cuGraph Plugin ` +- :ref:`iGraph Plugin ` + + + diff --git a/docs/source/visualization/layout/settings.rst b/docs/source/visualization/layout/settings.rst new file mode 100644 index 000000000..8e41ede14 --- /dev/null +++ b/docs/source/visualization/layout/settings.rst @@ -0,0 +1,109 @@ +.. _layout-settings: + +Layout Settings & Visualization Embedding +========================================= + +This guide shows how to embed and configure Graphistry visualizations using the PyGraphistry Python API. For users interested in using URL parameters for embedding in HTML, refer to the external documentation. + +Using PyGraphistry for Customization +------------------------------------- + +You can use the PyGraphistry API to programmatically configure visualizations. Below are some examples of how to use the `g.settings` and `g.addStyle` methods to customize visualizations. + +Scene Settings +~~~~~~~~~~~~~~~ + +Use :meth:`graphistry.PlotterBase.PlotterBase.scene_settings` to modify the appearance of the graph, including menus, node sizes, and edge opacity: + +.. 
code-block:: python + + g2 = g.scene_settings( + # Hide menus + menu=False, + info=False, + # Customize graph + show_arrows=False, + point_size=1.0, + edge_curvature=0.0, + edge_opacity=0.5, + point_opacity=0.9 + ).plot() + + +Styling the Background and Foreground +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With :meth:`graphistry.PlotterBase.PlotterBase.addStyle`, you can configure background and foreground styles, including colors, gradients, and images: + +.. code-block:: python + + # Set a red background + g.addStyle(bg={'color': 'red'}) + + # Apply a radial gradient background + g.addStyle(bg={ + 'color': '#333', + 'gradient': { + 'kind': 'radial', + 'stops': [ + ["rgba(255,255,255, 0.1)", "10%", "rgba(0,0,0,0)", "20%"] + ] + } + }) + + # Use an image as a background with blend mode + g.addStyle(bg={'image': {'url': 'http://site.com/cool.png', 'blendMode': 'multiply'}}) + + # Apply blend mode for the foreground + g.addStyle(fg={'blendMode': 'color-burn'}) + +Page and Logo Settings +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Customize the page title, favicon, and logo using :meth:`graphistry.PlotterBase.PlotterBase.addStyle`: + +.. code-block:: python + + # Set page title and favicon + g.addStyle(page={'title': 'My Site'}) + g.addStyle(page={'favicon': 'http://site.com/favicon.ico'}) + + # Add a logo + g.addStyle(logo={'url': 'http://www.site.com/transparent_logo.png'}) + + # Customize logo dimensions and opacity + g.addStyle(logo={ + 'url': 'http://www.site.com/transparent_logo.png', + 'dimensions': {'maxHeight': 200, 'maxWidth': 200}, + 'style': {'opacity': 0.5} + }) + +For more advanced Python configuration options, refer to the PyGraphistry REST API documentation on `URL parameters `_ and `Branding metadata `_. 
+ +HTML/URL-based Configuration +-------------------------------- + +For users interested in configuring Graphistry visualizations through HTML and URL parameters, please refer to the official documentation: + +- `Graphistry URL Configuration Options `_ + +This guide covers how to embed Graphistry visualizations in web pages and configure visualizations via URL parameters like background color, layout settings, and more. + +IFrame CSS Style Tips +~~~~~~~~~~~~~~~~~~~~~~~ + +When embedding visualizations in HTML, you can customize the appearance using CSS. Below are some common style tips for `