diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4bc5e30d3..494565c95 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
- python-version: [3.8, 3.9, '3.10', 3.11]
+ python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
steps:
@@ -71,7 +71,7 @@ jobs:
strategy:
matrix:
- python-version: [3.8, 3.9, '3.10', 3.11]
+ python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
steps:
@@ -93,7 +93,7 @@ jobs:
python -m venv pygraphistry
source pygraphistry/bin/activate
python -m pip install --upgrade pip
- python -m pip install -e .[docs,test,build,bolt,igraph,networkx,gremlin,nodexl,jupyter]
+ python -m pip install -e .[test,build,bolt,igraph,networkx,gremlin,nodexl,jupyter]
- name: Lint
run: |
@@ -110,6 +110,47 @@ jobs:
source pygraphistry/bin/activate
./bin/test.sh
+ test-graphviz:
+
+ needs: [ test-minimal-python ]
+ runs-on: ubuntu-latest
+
+ strategy:
+ matrix:
+ python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
+
+ steps:
+
+ - name: Checkout repo
+ uses: actions/checkout@v3
+ with:
+ lfs: true
+
+ - name: Checkout LFS objects
+ run: git lfs pull
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install test dependencies
+ run: |
+ python -m venv pygraphistry
+ source pygraphistry/bin/activate
+ sudo apt-get install graphviz graphviz-dev
+ python -m pip install --upgrade pip
+ python -m pip install -e .[test,pygraphviz]
+
+ - name: Type check
+ run: |
+ source pygraphistry/bin/activate
+ ./bin/typecheck.sh
+
+ - name: Graphviz tests
+ run: |
+ source pygraphistry/bin/activate
+ ./bin/test-graphviz.sh
test-core-umap:
@@ -118,6 +159,7 @@ jobs:
strategy:
matrix:
+ #python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
python-version: [3.8, 3.9]
steps:
@@ -165,6 +207,10 @@ jobs:
strategy:
matrix:
python-version: [3.8, 3.9]
+ #python-version: [3.8, 3.9, '3.10', 3.11, 3.12]
+ #include:
+ # - python-version: 3.12
+ # continue-on-error: true
steps:
@@ -284,7 +330,7 @@ jobs:
- name: Test building docs
run: |
- cd docs && ./docker.sh
+ cd docs && ./ci.sh
test-readme:
diff --git a/.gitignore b/.gitignore
index f8a1ee954..104d69b49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,6 +61,8 @@ coverage.xml
# Sphinx documentation
docs/_build/
+docs/doctrees/
+docs/source/demos/
# PyBuilder
target/
@@ -87,3 +89,4 @@ demos/data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt
# local jupyter dev
jupyter_dev/
+docs/source/demos
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 609e875f7..e037f6cd8 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -5,24 +5,53 @@
# Required
version: 2
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
- configuration: docs/source/conf.py
-
build:
os: ubuntu-22.04
tools:
- python: "3.8"
+ python: "3.12"
+ apt_packages:
+ # More closely mirror https://github.com/sphinx-doc/sphinx-docker-images
+ - graphviz
+ - imagemagick
+ - make
+ - pandoc
+ - texlive-latex-base
+ - texlive-latex-recommended
+ - texlive-latex-extra
+ - texlive-fonts-recommended
+ commands:
+
+ # setup
+ - pip install ".[docs]"
+ - cp -r demos docs/source/demos
+ - cp README.md docs/source/README.md
+ - cp ARCHITECTURE.md docs/source/ARCHITECTURE.md
+ - cp CONTRIBUTE.md docs/source/CONTRIBUTE.md
+ - cp DEVELOP.md docs/source/DEVELOP.md
+
+ # build html
+ - sphinx-build -b html -d docs/doctrees docs/source $READTHEDOCS_OUTPUT/html/
+
+ # build epub
+ - sphinx-build -b epub -d docs/doctrees docs/source docs/_build/latexpdf
+ - mkdir -p $READTHEDOCS_OUTPUT/epub
+ - cp docs/_build/latexpdf/PyGraphistry.epub $READTHEDOCS_OUTPUT/epub/PyGraphistry.epub
-# Optionally build your docs in additional formats such as PDF
+ # build pdf
+ - sphinx-build -b latex -d docs/doctrees docs/source docs/_build/latexpdf
+ - cd docs/_build/latexpdf && pdflatex -file-line-error -interaction=nonstopmode PyGraphistry.tex && pdflatex -file-line-error -interaction=nonstopmode PyGraphistry.tex && echo ok || { echo fail && exit 1 ; }
+ - mkdir -p $READTHEDOCS_OUTPUT/pdf
+ - cp docs/_build/latexpdf/PyGraphistry.pdf $READTHEDOCS_OUTPUT/pdf/PyGraphistry.pdf
+
+#for nav links?
formats:
- pdf
- - htmlzip
- epub
+ - htmlzip
python:
install:
- method: pip
path: .
extra_requirements:
- - dev
\ No newline at end of file
+ - docs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8e735dff9..a7cf750fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,155 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
## [Development]
+## [0.34.16 - 2024-10-13]
+
+### Docs
+
+* Update and streamline readme.md
+* Add an overall quicksheet
+* More crosslinking
+
+### Infra
+
+* Add markdown support to docsite
+* ReadTheDocs homepage reuses github README.md
+* Docs pip install caches
+* Drop SVGs and external images during latexpdf generation
+
+### Changed
+
+* Treemap: import of `squarify` is deferred until first use, allowing core import without squarify installed, such as with `--no-deps`
+
+## [0.34.15 - 2024-10-11]
+
+### Docs
+
+* Improve GFQL translation doc
+* Add examples and API links: Shaping, Hypergraphs, AI & ML
+* Add performance docs
+* Add AI examples
+
+## [0.34.14 - 2024-10-09]
+
+### Added
+
+* HTTP responses with error status codes log a `logging.ERROR`-level message of the status code and response body
+
+## [0.34.13 - 2024-10-07]
+
+### Docs
+
+* Add more GFQL cross-references
+
+## [0.34.12 - 2024-10-07]
+
+### Docs
+
+* Fix ipynb examples in ReadTheDocs distribution
+
+## [0.34.11 - 2024-10-07]
+
+### Fix
+
+* Types
+
+### Infra
+
+* Enable more Python version checks
+
+## [0.34.10 - 2024-10-07]
+
+### Fix
+
+* Docs: Notebook builds
+
+### Docs
+
+* More links, especially around plugins
+* Update color theme to match Graphistry branding
+
+## [0.34.9 - 2024-10-07]
+
+### Fix
+
+* Docs: 10 Minutes to PyGraphistry links
+
+## [0.34.8 - 2024-10-06]
+
+### Fix
+
+* Docs: PDF support
+* Docs: Links
+
+### Docs
+
+* More accessible theme
+
+## [0.34.7 - 2024-10-06]
+
+### Docs
+
+* RTD: Added notebook tutorials
+* RTD: Added various guides
+* RTD: Added cross-references
+* RTD: Cleaner navigation
+
+### Infra
+
+* Python: Add Python 3.12 to CI and document support
+* Docs: Updated dependencies - Sphinx 8, Python 3.12, and various related
+* Docs: Added nbsphinx - hub url grounding, ...
+* Docs: Redo as a docker compose flow with incremental builds (docker, sphinx)
+* Docs: Updated instructions for new flow
+
+### Fix
+
+* Docs: 2024
+* Notebooks: Compatibility with nbsphinx - exactly one title heading, no uncommented `!`, correct references, ...
+
+## [0.34.6 - 2024-10-04]
+
+### Added
+
+* Plugins: graphviz bindings, such as `g.layout_graphviz("dot")`
+
+### Docs
+
+* Reorganized readthedocs
+* Added intro tutorials: `10 Minutes to PyGraphistry`, `10 Minutes to GFQL`, `Login and Sharing`
+
+## [0.34.5 - 2024-09-23]
+
+### Fixed
+
+* GFQL: Fix `chain()` regression around an incorrectly disabled check manifesting as https://github.com/graphistry/pygraphistry/issues/583
+* GFQL: Fix `chain()`, `hop()` traverse filtering logic for multi-hop edge scenarios
+* GFQL: Fix `hop()` predicate handling in multihop scenarios
+
+### Infra
+
+* GFQL: Expand test suite around multihop edge predicates in `hop()` and `chain()`
+
+## [0.34.4 - 2024-09-20]
+
+### Added
+
+* UMAP: Optional kwargs passthrough to umap library constructor, fit, and transform methods: `g.umap(..., umap_kwargs={...}, umap_fit_kwargs={...}, umap_transform_kwargs={...})`
+* Additional GPU support in featurize paths
+
+### Changed
+
+* Replace `verbose` with `logging`
+
+### Refactor
+
+* Narrow `use_scaler` and `use_scaler_target` typing to `ScalerType` (`Literal[...]`) vs `str`
+* Rename `featurize_or_get_nodes_dataframe_if_X_is_None` (and edges variant) as non-private due to being shared
+
+### Fixed
+
+* get_indegrees: Fix warning https://github.com/graphistry/pygraphistry/issues/587
+
## [0.34.3 - 2024-08-03]
### Added
diff --git a/README.md b/README.md
index a50e260f7..d5dd94b92 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# PyGraphistry: Explore Relationships
+# PyGraphistry: Leverage the power of graphs & GPUs to visualize, analyze, and scale your data
![Build Status](https://github.com/graphistry/pygraphistry/workflows/CI%20Tests/badge.svg)
[![CodeQL](https://github.com/graphistry/pygraphistry/workflows/CodeQL/badge.svg)](https://github.com/graphistry/pygraphistry/actions?query=workflow%3ACodeQL)
@@ -11,1606 +11,201 @@
[![Uptime Robot status](https://img.shields.io/uptimerobot/status/m787548531-e9c7b7508fc76fea927e2313?label=hub.graphistry.com)](https://status.graphistry.com/) [ ](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g)
[![Twitter Follow](https://img.shields.io/twitter/follow/graphistry)](https://twitter.com/graphistry)
-**PyGraphistry is a dataframe-native Python visual graph AI library to extract, query, transform, analyze, model, and visualize big graphs, and especially alongside [Graphistry](https://www.graphistry.com) end-to-end GPU server sessions.** The GFQL query language supports running a large subset of the Cypher property graph query language without requiring external software and adds optional GPU acceleration. Installing PyGraphistry with the optional `graphistry[ai]` dependencies adds **graph autoML**, including automatic feature engineering, UMAP, and graph neural net support. Combined, PyGraphistry reduces your **time to graph** for going from raw data to visualizations and AI models down to three lines of code.
-
-The optional visual engine, Graphistry, gets used on problems like visually mapping the behavior of devices and users, investigating fraud, analyzing machine learning results, and starting in graph AI. It provides point-and-click features like timebars, search, filtering, clustering, coloring, sharing, and more. Graphistry is the only tool built ground-up for large graphs. The client's custom WebGL rendering engine renders up to 8MM nodes + edges at a time, and most older client GPUs smoothly support somewhere between 100K and 2MM elements. The serverside GPU analytics engine supports even bigger graphs. It smoothes graph workflows over the PyData ecosystem including Pandas/Spark/Dask dataframes, Nvidia RAPIDS GPU dataframes & GPU graphs, DGL/PyTorch graph neural networks, and various data connectors.
-
-The PyGraphistry Python client helps several kinds of usage modes:
-
-* **Data scientists**: Go from data to accelerated visual explorations in a couple lines, share live results, build up more advanced views over time, and do it all from notebook environments like Jupyter and Google Colab
-* **Developers**: Quickly prototype stunning Python solutions with PyGraphistry, embed in a language-neutral way with the [REST APIs](https://hub.graphistry.com/docs/api/), and go deep on customizations like colors, icons, layouts, JavaScript, and more
-* **Analysts**: Every Graphistry session is a point-and-click environment with interactive search, filters, timebars, histograms, and more
-* **Dashboarding**: Embed into your favorite framework. Additionally, see our sister project [Graph-App-Kit](https://github.com/graphistry/graph-app-kit) for quickly building interactive graph dashboards by launching a stack built on PyGraphistry, StreamLit, Docker, and ready recipes for integrating with common graph libraries
-
-PyGraphistry is a friendly and optimized PyData-native interface to the language-neutral [Graphistry REST APIs](https://hub.graphistry.com/docs/api/).
-You can use PyGraphistry with traditional Python data sources like CSVs, SQL, Neo4j, Splunk, and more (see below). Wrangle data however you want, and with especially good support for Pandas dataframes, Apache Arrow tables, Nvidia RAPIDS cuDF dataframes & cuGraph graphs, and DGL/PyTorch graph neural networks.
-
-1. [Interactive Demo](#demo-of-friendship-communities-on-facebook)
-2. [Graph Gallery](#gallery)
-3. [Install](#install)
-4. [Tutorial](#tutorial-les-misérables)
-5. [Next Steps](#next-steps)
-6. [Resources](#resources)
-
-## Demo of Friendship Communities on Facebook
-## **PyGraphistry is:**
-
-* **Fast & gorgeous:** Interactively cluster, filter, inspect large amounts of data, and zip through timebars. It clusters large graphs with a descendant of the gorgeous ForceAtlas2 layout algorithm introduced in Gephi. Our data explorer connects to Graphistry's GPU cluster to layout and render hundreds of thousand of nodes+edges in your browser at unparalleled speeds.
-
-* **Easy to install:** `pip install` the client in your notebook or web app, and then connect to a [free Graphistry Hub account](https://www.graphistry.com/get-started) or [launch your own private GPU server](https://www.graphistry.com/get-started)
-
- ```python
- # pip install --user graphistry # minimal
- # pip install --user graphistry[bolt,gremlin,nodexl,igraph,networkx] # data plugins
- # AI modules: Python 3.8+ with scikit-learn 1.0+:
- # pip install --user graphistry[umap-learn] # Lightweight: UMAP autoML (without text support); scikit-learn 1.0+
- # pip install --user graphistry[ai] # Heavy: Full UMAP + GNN autoML, including sentence transformers (1GB+)
-
- import graphistry
- graphistry.register(api=3, username='abc', password='xyz') # Free: hub.graphistry.com
- #graphistry.register(..., personal_key_id='pkey_id', personal_key_secret='pkey_secret') # Key instead of username+password+org_name
- #graphistry.register(..., is_sso_login=True) # SSO instead of password
- #graphistry.register(..., org_name='my-org') # Upload into an organization account vs personal
- #graphistry.register(..., protocol='https', server='my.site.ngo') # Use with a self-hosted server
- # ... and if client (browser) URLs are different than python server<> graphistry server uploads
- #graphistry.register(..., client_protocol_hostname='https://public.acme.co')
- ```
-
-* **Notebook-friendly:** PyGraphistry plays well with interactive notebooks like [Jupyter](http://ipython.org), [Zeppelin](https://zeppelin.incubator.apache.org/), and [Databricks](http://databricks.com). Process, visualize, and drill into with graphs directly within your notebooks:
-
- ```python
- graphistry.edges(pd.read_csv('rows.csv'), 'col_a', 'col_b').plot()
- ```
-
-* **Great for events, CSVs, and more:** Not sure if your data is graph-friendly? PyGraphistry's `hypergraph` transform helps turn any sample data like CSVs, SQL results, and event data into a graph for pattern analysis:
-
- ```python
- rows = pandas.read_csv('transactions.csv')[:1000]
- graphistry.hypergraph(rows)['graph'].plot()
- ```
-
-* **Embeddable:** Drop live views into your web dashboards and apps (and go further with [JS/React](https://hub.graphistry.com/docs)):
-
- ```python
- iframe_url = g.plot(render=False)
- print(f'')
- ```
-
-* **Configurable:** In-tool or via the declarative APIs, use the powerful encodings systems for tasks like coloring by time, sizing by score, clustering by weight, show icons by type, and more.
-
-* **Shareable:** Share live links, configure who has access, and more! [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb)
-
-* **Graph AI that is fast & easy:** In oneines of code, turn messy data into feature vectors for modeling, GNNs for training pipelines, lower dimensional embeddings, and visualizations:
-
- ```python
- df = pandas.read_csv('accounts.csv')
-
- # UMAP dimensionality reduction with automatic feature engineering
- g1 = graphistry.nodes(df).umap()
-
- # Automatically shows top inferred similarity edges g1._edges
- g1.plot()
-
- # Optional: Use subset of columns, supervised learning target, & more
- g2.umap(X=['name', 'description', 'amount'], y=['label_col_1']).plot()
- ```
-
-### Explore any data as a graph
-
-It is easy to turn arbitrary data into insightful graphs. PyGraphistry comes with many built-in connectors, and by supporting Python dataframes (Pandas, Arrow, RAPIDS), it's easy to bring standard Python data libraries. If the data comes as a table instead of a graph, PyGraphistry will help you extract and explore the relationships.
-
-* [Pandas](http://pandas.pydata.org)
-
- ```python
- edges = pd.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst'])
- graphistry.edges(edges, 'src', 'dst').plot()
- ```
-
- ```python
- table_rows = pd.read_csv('honeypot.csv')
- graphistry.hypergraph(table_rows, ['attackerIP', 'victimIP', 'victimPort', 'vulnName'])['graph'].plot()
- ```
-
- ```python
- graphistry.hypergraph(table_rows, ['attackerIP', 'victimIP', 'victimPort', 'vulnName'],
- direct=True,
- opts={'EDGES': {
- 'attackerIP': ['victimIP', 'victimPort', 'vulnName'],
- 'victimIP': ['victimPort', 'vulnName'],
- 'victimPort': ['vulnName']
- }})['graph'].plot()
- ```
-
- ```python
- ### Override smart defaults with custom settings
- g1 = graphistry.bind(source='src', destination='dst').edges(edges)
- g2 = g1.nodes(nodes).bind(node='col2')
- g3 = g2.bind(point_color='col3')
- g4 = g3.settings(url_params={'edgeInfluence': 1.0, play: 2000})
- url = g4.plot(render=False)
- ```
-
- ```python
- ### Read back data and create modified variants
- enriched_edges = my_function1(g1._edges)
- enriched_nodes = my_function2(g1._nodes)
- g2 = g1.edges(enriched_edges).nodes(enriched_nodes)
- g2.plot()
- ```
-
-* GFQL: Cypher-style graph pattern mining queries on dataframes with optional GPU acceleration ([ipynb demo](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb), [benchmark](https://github.com/graphistry/pygraphistry/blob/master/demos/gfql/benchmark_hops_cpu_gpu.ipynb))
-
- Run Cypher-style graph queries natively on dataframes without going to a database or Java with GFQL:
-
- ```python
- from graphistry import n, e_undirected, is_in
-
- g2 = g1.chain([
- n({'user': 'Biden'}),
- e_undirected(),
- n(name='bridge'),
- e_undirected(),
- n({'user': is_in(['Trump', 'Obama'])})
- ])
-
- print('# bridges', len(g2._nodes[g2._nodes.bridge]))
- g2.plot()
- ```
-
- Enable GFQL's optional automatic GPU acceleration for 43X+ speedups:
-
- ```python
- # Switch from Pandas CPU dataframes to RAPIDS GPU dataframes
- import cudf
- g2 = g1.edges(lambda g: cudf.DataFrame(g._edges))
- # GFQL will automaticallly run on a GPU
- g3 = g2.chain([n(), e(hops=3), n()])
- g3.plot()
- ```
-
-* [Spark](https://spark.apache.org/)/[Databricks](https://databricks.com/) ([ipynb demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb), [dbc demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.dbc))
-
- ```python
- #optional but recommended
- spark.conf.set("spark.sql.execution.arrow.enabled", "true")
-
- edges_df = (
- spark.read.format('json').
- load('/databricks-datasets/iot/iot_devices.json')
- .sample(fraction=0.1)
- )
- g = graphistry.edges(edges_df, 'device_name', 'cn')
-
- #notebook
- displayHTML(g.plot())
-
- #dashboard: pick size of choice
- displayHTML(
- g.settings(url_params={'splashAfter': 'false'})
- .plot(override_html_style="""
- width: 50em;
- height: 50em;
- """)
- )
- ```
-
-* GPU [RAPIDS.ai](https://www.rapids.ai) cudf
-
- ```python
- edges = cudf.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst'])
- graphistry.edges(edges, 'src', 'dst').plot()
- ```
-
-* GPU [RAPIDS.ai](https://www.rapids.ai) cuML
-
- ```python
- g = graphistry.nodes(cudf.read_csv('rows.csv'))
- g = graphistry.nodes(G)
- g.umap(engine='cuml',metric='euclidean').plot()
- ```
-
-* GPU [RAPIDS.ai](https://www.rapids.ai) cugraph ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/gpu_rapids/cugraph.ipynb))
-
- ```python
- g = graphistry.from_cugraph(G)
- g2 = g.compute_cugraph('pagerank')
- g3 = g2.layout_cugraph('force_atlas2')
- g3.plot()
- G3 = g.to_cugraph()
- ```
-
-* [Apache Arrow](https://arrow.apache.org/)
-
- ```python
- edges = pa.Table.from_pandas(pd.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst']))
- graphistry.edges(edges, 'src', 'dst').plot()
- ```
-
-* [Neo4j](http://neo4j.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/neo4j/official/graphistry_bolt_tutorial_public.ipynb))
-
- ```python
- NEO4J_CREDS = {'uri': 'bolt://my.site.ngo:7687', 'auth': ('neo4j', 'mypwd')}
- graphistry.register(bolt=NEO4J_CREDS)
- graphistry.cypher("MATCH (n1)-[r1]->(n2) RETURN n1, r1, n2 LIMIT 1000").plot()
- ```
-
- ```python
- graphistry.cypher("CALL db.schema()").plot()
- ```
-
- ```python
- from neo4j import GraphDatabase, Driver
- graphistry.register(bolt=GraphDatabase.driver(**NEO4J_CREDS))
- g = graphistry.cypher("""
- MATCH (a)-[p:PAYMENT]->(b)
- WHERE p.USD > 7000 AND p.USD < 10000
- RETURN a, p, b
- LIMIT 100000""")
- print(g._edges.columns)
- g.plot()
- ```
-
-* [Memgraph](https://memgraph.com/) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/memgraph/visualizing_iam_dataset.ipynb))
-
- ```python
- from neo4j import GraphDatabase
- MEMGRAPH = {
- 'uri': "bolt://localhost:7687",
- 'auth': (" ", " ")
- }
- graphistry.register(bolt=MEMGRAPH)
- ```
-
- ```python
- driver = GraphDatabase.driver(**MEMGRAPH)
- with driver.session() as session:
- session.run("""
- CREATE (per1:Person {id: 1, name: "Julie"})
- CREATE (fil2:File {id: 2, name: "welcome_to_memgraph.txt"})
- CREATE (per1)-[:HAS_ACCESS_TO]->(fil2) """)
- g = graphistry.cypher("""
- MATCH (node1)-[connection]-(node2)
- RETURN node1, connection, node2;""")
- g.plot()
- ```
-
-* [Azure Cosmos DB (Gremlin)](https://azure.microsoft.com/en-us/services/cosmos-db/)
-
- ```python
- # pip install --user gremlinpython
- # Options in help(graphistry.cosmos)
- g = graphistry.cosmos(
- COSMOS_ACCOUNT='',
- COSMOS_DB='',
- COSMOS_CONTAINER='',
- COSMOS_PRIMARY_KEY=''
- )
- g2 = g.gremlin('g.E().sample(10000)').fetch_nodes()
- g2.plot()
- ```
-
-* [Amazon Neptune (Gremlin)](https://aws.amazon.com/neptune/) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb), [dashboarding demo](https://aws.amazon.com/blogs/database/enabling-low-code-graph-data-apps-with-amazon-neptune-and-graphistry/))
-
- ```python
- # pip install --user gremlinpython==3.4.10
- # - Deploy tips: https://github.com/graphistry/graph-app-kit/blob/master/docs/neptune.md
- # - Versioning tips: https://gist.github.com/lmeyerov/459f6f0360abea787909c7c8c8f04cee
- # - Login options in help(graphistry.neptune)
- g = graphistry.neptune(endpoint='wss://zzz:8182/gremlin')
- g2 = g.gremlin('g.E().limit(100)').fetch_nodes()
- g2.plot()
- ```
-
-* [TigerGraph](https://tigergraph.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/tigergraph/tigergraph_pygraphistry_bindings.ipynb))
-
- ```python
- g = graphistry.tigergraph(protocol='https', ...)
- g2 = g.gsql("...", {'edges': '@@eList'})
- g2.plot()
- print('# edges', len(g2._edges))
- ```
-
- ```python
- g.endpoint('my_fn', {'arg': 'val'}, {'edges': '@@eList'}).plot()
- ```
-
-* [igraph](http://igraph.org)
-
- ```python
- edges = pd.read_csv('facebook_combined.txt', sep=' ', names=['src', 'dst'])
- g_a = graphistry.edges(edges, 'src', 'dst')
- g_b = g_a.layout_igraph('sugiyama', directed=True) # directed: for to_igraph
- g_b.compute_igraph('pagerank', params={'damping': 0.85}).plot() #params: for layout
-
- ig = igraph.read('facebook_combined.txt', format='edgelist', directed=False)
- g = graphistry.from_igraph(ig) # full conversion
- g.plot()
-
- ig2 = g.to_igraph()
- ig2.vs['spinglass'] = ig2.community_spinglass(spins=3).membership
- # selective column updates: preserve g._edges; merge 1 attribute from ig into g._nodes
- g2 = g.from_igraph(ig2, load_edges=False, node_attributes=[g._node, 'spinglass'])
- ```
-
-* [NetworkX](https://networkx.github.io) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/networkx/networkx.ipynb))
-
- ```python
- graph = networkx.read_edgelist('facebook_combined.txt')
- graphistry.bind(source='src', destination='dst', node='nodeid').plot(graph)
- ```
-
-* [HyperNetX](https://github.com/pnnl/HyperNetX) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/hypernetx/hypernetx.ipynb))
-
- ```python
- hg.hypernetx_to_graphistry_nodes(H).plot()
- ```
-
- ```python
- hg.hypernetx_to_graphistry_bipartite(H.dual()).plot()
- ```
-
-* [Splunk](https://www.splunk.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb))
-
- ```python
- df = splunkToPandas("index=netflow bytes > 100000 | head 100000", {})
- graphistry.edges(df, 'src_ip', 'dest_ip').plot()
- ```
-
-* [NodeXL](https://www.nodexl.com) ([notebook demo](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb))
-
- ```python
- graphistry.nodexl('/my/file.xls').plot()
- ```
-
- ```python
- graphistry.nodexl('https://file.xls').plot()
- ```
-
- ```python
- graphistry.nodexl('https://file.xls', 'twitter').plot()
- graphistry.nodexl('https://file.xls', verbose=True).plot()
- graphistry.nodexl('https://file.xls', engine='xlsxwriter').plot()
- graphistry.nodexl('https://file.xls')._nodes
- ```
-
-## Graph AI in a single line of code
-
-Graph autoML features including:
-
-### Generate features from raw data
-
-Automatically and intelligently transform text, numbers, booleans, and other formats to AI-ready representations:
-
-* Featurization
-
- ```python
- g = graphistry.nodes(df).featurize(kind='nodes', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...)
-
- print('X', g._node_features)
- print('y', g._node_target)
- ```
+PyGraphistry is an open source Python library for data scientists and developers to leverage the power of graph visualization, analytics, and AI, including with native GPU acceleration:
-* Set `kind='edges'` to featurize edges:
+* [**Python dataframe-native graph processing:**](https://pygraphistry.readthedocs.io/en/latest/10min.html) Quickly ingest & prepare data in many formats, shapes, and scales as graphs. Use tools like Pandas, Spark, [RAPIDS (GPU)](https://www.rapids.ai), and [Apache Arrow](https://arrow.apache.org/).
- ```python
- g = graphistry.edges(df, src, dst).featurize(kind='edges', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...)
- ```
+* [**Integrations:**](https://pygraphistry.readthedocs.io/en/latest/plugins.html) Plug into [Amazon Neptune](https://docs.aws.amazon.com/neptune/latest/userguide/visualization-graphistry.html) ([notebook](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.html)), [cuGraph](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/gpu_rapids/cugraph.html), [Databricks](https://www.databricks.com/solutions/accelerators/incident-investigation-using-graphistry) ([notebook](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.html)), [graphviz](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/graphviz/graphviz.html), [Neo4j](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/neo4j/official/graphistry_bolt_tutorial_public.html), [Splunk](https://www.splunk.com/en_us/blog/security/supercharge-cybersecurity-investigations-with-splunk-and-graphistry-a-powerful-combination-for-interactive-graph-exploration.html) ([notebook](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/splunk/splunk_demo_public.html)), [TigerGraph](https://pygraphistry.readthedocs.io/en/latest/demos/demos_databases_apis/tigergraph/tigergraph_pygraphistry_bindings.html), and many more in the [notebook data provider demo gallery](https://pygraphistry.readthedocs.io/en/latest/notebooks/plugins.connectors.html).
-* Use generated features with both Graphistry and external libraries:
- ```python
- # graphistry
- g = g.umap() # UMAP, GNNs, use features if already provided, otherwise will compute
+* [**Prototype locally and deploy remotely:**](https://www.graphistry.com/get-started) Prototype from notebooks like Jupyter and Databricks using local CPUs & GPUs, and then power production dashboards & pipelines with Graphistry Hub and your own self-hosted servers.
- # other pydata libraries
- X = g._node_features # g._get_feature('nodes') or g.get_matrix()
- y = g._node_target # g._get_target('nodes') or g.get_matrix(target=True)
- from sklearn.ensemble import RandomForestRegressor
- model = RandomForestRegressor().fit(X, y) # assumes train/test split
- new_df = pandas.read_csv(...) # mini batch
- X_new, _ = g.transform(new_df, None, kind='nodes', return_graph=False)
- preds = model.predict(X_new)
- ```
+* [**Query graphs with GFQL:**](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html) Use GFQL, the first dataframe-native graph query language, to ask relationship questions that are difficult for tabular tools and without requiring a database.
-* Encode model definitions and compare models against each other
+* [**graphistry[ai]:**](https://pygraphistry.readthedocs.io/en/latest/gfql/combo.html#) Call streamlined graph ML & AI methods to benefit from clustering, UMAP embeddings, graph neural networks, automatic feature engineering, and more.
- ```python
- # graphistry
- from graphistry.features import search_model, topic_model, ngrams_model, ModelDict, default_featurize_parameters, default_umap_parameters
+* [**Visualize & explore large graphs:**](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html#) In just a few minutes, create stunning interactive visualizations with millions of edges and many point-and-click built-ins like drilldowns, timebars, and filtering. When ready, customize with Python, JavaScript, and REST APIs.
- g = graphistry.nodes(df)
- g2 = g.umap(X=[..], y=[..], **search_model)
+* [**Columnar & GPU acceleration:**](https://pygraphistry.readthedocs.io/en/latest/performance.html) CPU-mode ingestion and wrangling is fast due to native use of Apache Arrow and columnar analytics, and the optional RAPIDS-based GPU mode delivers 100X+ speedups.
- # set custom encoding model with any feature/umap/dbscan kwargs
- new_model = ModelDict(message='encoding new model parameters is easy', **default_featurize_parameters)
- new_model.update(dict(
- y=[...],
- kind='edges',
- model_name='sbert/cool_transformer_model',
- use_scaler_target='kbins',
- n_bins=11,
- strategy='normal'))
- print(new_model)
- g3 = g.umap(X=[..], **new_model)
- # compare g2 vs g3 or add to different pipelines
- ```
+From global top-10 banks, manufacturers, news agencies, and government agencies, to startups, game companies, scientists, biotechs, and NGOs, many teams are tackling their graph workloads with Graphistry.
-See `help(g.featurize)` for more options
-### [sklearn-based UMAP](https://umap-learn.readthedocs.io/en/latest/), [cuML-based UMAP](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP)
-* Reduce dimensionality by plotting a similarity graph from feature vectors:
-
- ```python
- # automatic feature engineering, UMAP
- g = graphistry.nodes(df).umap()
-
- # plot the similarity graph without any explicit edge_dataframe passed in -- it is created during UMAP.
- g.plot()
- ```
-
-* Apply a trained model to new data:
-
- ```python
- new_df = pd.read_csv(...)
- embeddings, X_new, _ = g.transform_umap(new_df, None, kind='nodes', return_graph=False)
- ```
-
-* Infer a new graph from new data using the old umap coordinates to run inference without having to train a new umap model.
-
- ```python
- new_df = pd.read_csv(...)
- g2 = g.transform_umap(new_df, return_graph=True) # return_graph=True is default
- g2.plot() #
-
- # or if you want the new minibatch to cluster to closest points in previous fit:
- g3 = g.transform_umap(new_df, return_graph=True, merge_policy=True)
- g3.plot() # useful to see how new data connects to old -- play with `sample` and `n_neighbors` to control how much of old to include
- ```
-
-* UMAP supports many options, such as supervised mode, working on a subset of columns, and passing arguments to underlying `featurize()` and UMAP implementations (see `help(g.umap)`):
-
- ```python
- g.umap(kind='nodes', X=['col_1', ..., 'col_n'], y=['label', ..., 'other_targets'], ...)
- ```
-
-* `umap(engine="...")` supports multiple implementations. It defaults to using the GPU-accelerated `engine="cuml"` when a GPU is available, resulting in orders-of-magnitude speedups, and falls back to CPU processing via `engine="umap_learn"`.:
-
- ```python
- g.umap(engine='cuml')
- ```
-
-You can also featurize edges and UMAP them as we did above.
-
-UMAP support is rapidly evolving, please contact the team directly or on Slack for additional discussions
-
-See `help(g.umap)` for more options
-
-### [GNN models](https://docs.dgl.ai/en/0.6.x/index.html)
-
-* Graphistry adds bindings and automation to working with popular GNN models, currently focusing on DGL/PyTorch:
-
- ```python
- g = (graphistry
- .nodes(ndf)
- .edges(edf, src, dst)
- .build_gnn(
- X_nodes=['col_1', ..., 'col_n'], #columns from nodes_dataframe
- y_nodes=['label', ..., 'other_targets'],
- X_edges=['col_1_edge', ..., 'col_n_edge'], #columns from edges_dataframe
- y_edges=['label_edge', ..., 'other_targets_edge'],
- ...)
- )
- G = g.DGL_graph
-
- from [your_training_pipeline] import train, model
- # Train
- g = graphistry.nodes(df).build_gnn(y_nodes=`target`)
- G = g.DGL_graph
- train(G, model)
- # predict on new data
- X_new, _ = g.transform(new_df, None, kind='nodes' or 'edges', return_graph=False) # no targets
- predictions = model.predict(G_new, X_new)
- ```
-
-Like `g.umap()`, GNN layers automate feature engineering (`.featurize()`)
-
-See `help(g.build_gnn)` for options.
-
-GNN support is rapidly evolving, please contact the team directly or on Slack for additional discussions
-
-### [Semantic Search](https://www.sbert.net/examples/applications/semantic-search/README.html)
-
-* Search textual data semantically and see the resulting graph:
-
- ```python
- ndf = pd.read_csv(nodes.csv)
- edf = pd.read_csv(edges.csv)
-
- g = graphistry.nodes(ndf, 'node').edges(edf, 'src', 'dst')
-
- g2 = g.featurize(X = ['text_col_1', .., 'text_col_n'], kind='nodes',
- min_words = 0, # forces all named columns as textual ones
- #encode text as paraphrase embeddings, supports any sbert model
- model_name = "paraphrase-MiniLM-L6-v2")
-
- # or use convienence `ModelDict` to store parameters
-
- from graphistry.features import search_model
- g2 = g.featurize(X = ['text_col_1', .., 'text_col_n'], kind='nodes', **search_model)
-
- # query using the power of transformers to find richly relevant results
-
- results_df, query_vector = g2.search('my natural language query', ...)
-
- print(results_df[['_distance', 'text_col', ..]]) #sorted by relevancy
-
- # or see graph of matching entities and original edges
-
- g2.search_graph('my natural language query', ...).plot()
-
- ```
-
-* If edges are not given, `g.umap(..)` will supply them:
-
- ```python
- ndf = pd.read_csv(nodes.csv)
- g = graphistry.nodes(ndf)
- g2 = g.umap(X = ['text_col_1', .., 'text_col_n'], min_words=0, ...)
-
- g2.search_graph('my natural language query', ...).plot()
- ```
-
-See `help(g.search_graph)` for options
-
-### Knowledge Graph Embeddings
-
-* Train a RGCN model and predict:
-
- ```python
- edf = pd.read_csv(edges.csv)
- g = graphistry.edges(edf, src, dst)
- g2 = g.embed(relation='relationship_column_of_interest', **kwargs)
-
- # predict links over all nodes
- g3 = g2.predict_links_all(threshold=0.95) # score high confidence predicted edges
- g3.plot()
-
- # predict over any set of entities and/or relations.
- # Set any `source`, `destination` or `relation` to `None` to predict over all of them.
- # if all are None, it is better to use `g.predict_links_all` for speed.
- g4 = g2.predict_links(source=['entity_k'],
- relation=['relationship_1', 'relationship_4', ..],
- destination=['entity_l', 'entity_m', ..],
- threshold=0.9, # score threshold
- return_dataframe=False) # set to `True` to return dataframe, or just access via `g4._edges`
- ```
-
-* Detect Anamolous Behavior (example use cases such as Cyber, Fraud, etc)
-
- ```python
- # Score anomolous edges by setting the flag `anomalous` to True and set confidence threshold low
- g5 = g.predict_links_all(threshold=0.05, anomalous=True) # score low confidence predicted edges
- g5.plot()
-
- g6 = g.predict_links(source=['ip_address_1', 'user_id_3'],
- relation=['attempt_logon', 'phishing', ..],
- destination=['user_id_1', 'active_directory', ..],
- anomalous=True,
- threshold=0.05)
- g6.plot()
- ```
-
-* Train a RGCN model including auto-featurized node embeddings
-
- ```python
- edf = pd.read_csv(edges.csv)
- ndf = pd.read_csv(nodes.csv) # adding node dataframe
-
- g = graphistry.edges(edf, src, dst).nodes(ndf, node_column)
-
- # inherets all the featurization `kwargs` from `g.featurize`
- g2 = g.embed(relation='relationship_column_of_interest', use_feat=True, **kwargs)
- g2.predict_links_all(threshold=0.95).plot()
- ```
-
-See `help(g.embed)`, `help(g.predict_links)` , or `help(g.predict_links_all)` for options
-
-### DBSCAN
-
-* Enrich UMAP embeddings or featurization dataframe with GPU or CPU DBSCAN
-
- ```python
- g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node')
-
- # cluster by UMAP embeddings
- kind = 'nodes' | 'edges'
- g2 = g.umap(kind=kind).dbscan(kind=kind)
- print(g2._nodes['_dbscan']) | print(g2._edges['_dbscan'])
-
- # dbscan in `umap` or `featurize` via flag
- g2 = g.umap(dbscan=True, min_dist=0.2, min_samples=1)
-
- # or via chaining,
- g2 = g.umap().dbscan(min_dist=1.2, min_samples=2, **kwargs)
-
- # cluster by feature embeddings
- g2 = g.featurize().dbscan(**kwargs)
-
- # cluster by a given set of feature column attributes, inhereted from `g.get_matrix(cols)`
- g2 = g.featurize().dbscan(cols=['ip_172', 'location', 'alert'], **kwargs)
-
- # equivalent to above (ie, cols != None and umap=True will still use features dataframe, rather than UMAP embeddings)
- g2 = g.umap().dbscan(cols=['ip_172', 'location', 'alert'], umap=True | False, **kwargs)
- g2.plot() # color by `_dbscan`
-
- new_df = pd.read_csv(..)
- # transform on new data according to fit dbscan model
- g3 = g2.transform_dbscan(new_df)
- ```
-
-See `help(g.dbscan)` or `help(g.transform_dbscan)` for options
-
-### Quickly configurable
-
-Set visual attributes through [quick data bindings](https://hub.graphistry.com/docs/api/2/rest/upload/#createdataset2) and set [all sorts of URL options](https://hub.graphistry.com/docs/api/1/rest/url/). Check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-colors.ipynb), [sizes](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-sizes.ipynb), [icons](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-icons.ipynb), [badges](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-badges.ipynb), [weighted clustering](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/edge-weights.ipynb) and [sharing controls](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb):
+## Gallery
- ```python
- g
- .privacy(mode='private', invited_users=[{'email': 'friend1@site.ngo', 'action': '10'}], notify=False)
- .edges(df, 'col_a', 'col_b')
- .edges(my_transform1(g._edges))
- .nodes(df, 'col_c')
- .nodes(my_transform2(g._nodes))
- .bind(source='col_a', destination='col_b', node='col_c')
- .bind(
- point_color='col_a',
- point_size='col_b',
- point_title='col_c',
- point_x='col_d',
- point_y='col_e')
- .bind(
- edge_color='col_m',
- edge_weight='col_n',
- edge_title='col_o')
- .encode_edge_color('timestamp', ["blue", "yellow", "red"], as_continuous=True)
- .encode_point_icon('device_type', categorical_mapping={'macbook': 'laptop', ...})
- .encode_point_badge('passport', 'TopRight', categorical_mapping={'Canada': 'flag-icon-ca', ...})
- .encode_point_color('score', ['black', 'white'])
- .addStyle(bg={'color': 'red'}, fg={}, page={'title': 'My Graph'}, logo={})
- .settings(url_params={
- 'play': 2000,
- 'menu': True, 'info': True,
- 'showArrows': True,
- 'pointSize': 2.0, 'edgeCurvature': 0.5,
- 'edgeOpacity': 1.0, 'pointOpacity': 1.0,
- 'lockedX': False, 'lockedY': False, 'lockedR': False,
- 'linLog': False, 'strongGravity': False, 'dissuadeHubs': False,
- 'edgeInfluence': 1.0, 'precisionVsSpeed': 1.0, 'gravity': 1.0, 'scalingRatio': 1.0,
- 'showLabels': True, 'showLabelOnHover': True,
- 'showPointsOfInterest': True, 'showPointsOfInterestLabel': True, 'showLabelPropertiesOnHover': True,
- 'pointsOfInterestMax': 5
- })
- .plot()
- ```
-### Gallery
+The [notebook demo gallery](https://pygraphistry.readthedocs.io/en/latest/demos/for_analysis.html) shares many more live visualizations, demos, and integration examples.
-## Install
-
-### Get
-
-You need to install the PyGraphistry Python client and connect it to a Graphistry GPU server of your choice:
-
-1. Graphistry server account:
- * Create a free [Graphistry Hub account](https://www.graphistry.com/get-started) for open data, or [one-click launch your own private AWS/Azure instance](https://www.graphistry.com/get-started)
- * Later, [setup and manage](https://github.com/graphistry/graphistry-cli) your own private Docker instance ([contact](https://www.graphistry.com/demo-request))
-
-2. PyGraphistry Python client:
- * `pip install --user graphistry` (Python 3.8+) or [directly call the HTTP API](https://hub.graphistry.com/docs/api/)
- * Use `pip install --user graphistry[all]` for optional dependencies such as Neo4j drivers
- * To use from a notebook environment, run your own [Jupyter](https://jupyter.org/) server ([one-click launch your own private AWS/Azure GPU instance](https://www.graphistry.com/get-started)) or another such as [Google Colab](https://colab.research.google.com)
- * See immediately following `configure` section for how to connect
-
-### Configure
-
-Most users connect to a Graphistry GPU server account via:
-
-* `graphistry.register(api=3, username='abc', password='xyz')`: personal hub.graphistry.com account
-* `graphistry.register(api=3, username='abc', password='xyz', org_name='optional_org')`: team hub.graphistry.com account
-* `graphistry.register(api=3, username='abc', password='xyz', org_name='optiona_org', protocol='http', server='my.private_server.org')`: private server
-
-For more advanced configuration, read on for:
-
-* Version: Use protocol `api=3`, which will soon become the default, or a legacy version
-
-* JWT Tokens: Connect to a GPU server by providing a `username='abc'`/`password='xyz'`, or for advanced long-running service account software, a refresh loop using 1-hour-only JWT tokens
-
-* Organizations: Optionally use `org_name` to set a specific organization
-
-* Private servers: PyGraphistry defaults to using the free [Graphistry Hub](https://hub.graphistry.com) public API
-
- * Connect to a [private Graphistry server](https://www.graphistry.com/get-started) and provide optional settings specific to it via `protocol`, `server`, and in some cases, `client_protocol_hostname`
-
-Non-Python users may want to explore the underlying language-neutral [authentication REST API docs](https://hub.graphistry.com/docs/api/1/rest/auth/).
-
-#### Advanced Login
-
-* **For people:** Provide your account username/password:
-
-```python
-import graphistry
-graphistry.register(api=3, username='username', password='your password')
-```
-
-* **For service accounts**: Long-running services may prefer to use 1-hour JWT tokens:
-
-```python
-import graphistry
-graphistry.register(api=3, username='username', password='your password')
-initial_one_hour_token = graphistry.api_token()
-graphistry.register(api=3, token=initial_one_hour_token)
-
-# must run every 59min
-graphistry.refresh()
-fresh_token = graphistry.api_token()
-assert initial_one_hour_token != fresh_token
-```
-
-Refreshes exhaust their limit every day/month. An upcoming Personal Key feature enables non-expiring use.
-
-Alternatively, you can rerun `graphistry.register(api=3, username='username', password='your password')`, which will also fetch a fresh token.
-
-#### Advanced: Private servers - server uploads
-
-Specify which Graphistry server to reach for Python uploads:
-
-```python
-graphistry.register(protocol='https', server='hub.graphistry.com')
-```
-
-Private Graphistry notebook environments are preconfigured to fill in this data for you:
-
-```python
-graphistry.register(protocol='http', server='nginx', client_protocol_hostname='')
-```
-
-Using `'http'`/`'nginx'` ensures uploads stay within the Docker network (vs. going more slowly through an outside network), and client protocol `''` ensures the browser URLs do not show `http://nginx/`, and instead use the server's name. (See immediately following **Switch client URL** section.)
-
-#### Advanced: Private servers - switch client URL for browser views
-
-In cases such as when the notebook server is the same as the Graphistry server, you may want your Python code to *upload* to a known local Graphistry address without going outside the network (e.g., `http://nginx` or `http://localhost`), but for web viewing, generate and embed URLs to a different public address (e.g., `https://graphistry.acme.ngo/`). In this case, explicitly set a client (browser) location different from `protocol` / `server`:
-
-```python
-graphistry.register(
- ### fast local notebook<>graphistry upload
- protocol='http', server='nginx',
-
- ### shareable public URL for browsers
- client_protocol_hostname='https://graphistry.acme.ngo'
-)
-```
-
-Prebuilt Graphistry servers are already setup to do this out-of-the-box.
-
-#### Advanced: Sharing controls
-
-Graphistry supports flexible sharing permissions that are similar to Google documents and Dropbox links
-
-By default, visualizations are publicly viewable by anyone with the URL (that is unguessable & unlisted), and only editable by their owner.
-* Private-only: You can globally default uploads to private:
-```python
-graphistry.privacy() # graphistry.privacy(mode='private')
-```
-
-* Organizations: You can login with an organization and share only within it
-
-```python
-graphistry.register(api=3, username='...', password='...', org_name='my-org123')
-graphistry.privacy(mode='organization')
-```
-
-* Invitees: You can share access to specify users, and optionally, even email them invites
-
-```python
-VIEW = "10"
-EDIT = "20"
-graphistry.privacy(
- mode='private',
- invited_users=[
- {"email": "friend1@site1.com", "action": VIEW},
- {"email": "friend2@site2.com", "action": EDIT}
- ],
- notify=True)
-```
+## Install
-* Per-visualization: You can choose different rules for global defaults vs. for specific visualizations
+Common configurations:
-```python
-graphistry.privacy(invited_users=[...])
-g = graphistry.hypergraph(pd.read_csv('...'))['graph']
-g.privacy(notify=True).plot()
-```
+* **Minimal core**
-See additional examples in the [sharing tutorial](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/sharing_tutorial.ipynb)
+ Includes: The GFQL dataframe-native graph query language, built-in layouts, Graphistry visualization server client
-## Tutorial: Les Misérables
+ ```bash
+ pip install graphistry
+ ```
-Let's visualize relationships between the characters in [Les Misérables](http://en.wikipedia.org/wiki/Les_Misérables).
-For this example, we'll choose [Pandas](http://pandas.pydata.org) to wrangle data and [igraph](http://igraph.org) to run a community detection algorithm. You can [view](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/simple/MarvelTutorial.ipynb) the Jupyter notebook containing this example.
+ Does not include `graphistry[ai]` or plugins
-Our [dataset is a CSV file](https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/lesmiserables.csv) that looks like this:
+* **No dependencies and user-level**
-| source | target | value |
-| ------------- |:-------------:| ------:|
-| Cravatte | Myriel | 1
-| Valjean | Mme.Magloire | 3
-| Valjean | Mlle.Baptistine | 3
+ ```bash
+ pip install --no-deps --user graphistry
+ ```
-*Source* and *target* are character names, and the *value* column counts the number of time they meet. Parsing is a one-liner with Pandas:
+* **GPU acceleration** - Optional
-```python
-import pandas
-links = pandas.read_csv('./lesmiserables.csv')
-```
+ Local GPU: Install [RAPIDS](https://www.rapids.ai) and/or deploy a GPU-ready [Graphistry server](https://www.graphistry.com/get-started)
+
+ Remote GPU: Use the [remote endpoints](https://www.graphistry.com/blog/graphistry-2-41-3).
-### Quick Visualization
+For further options, see the [installation guides](https://pygraphistry.readthedocs.io/en/latest/install/index.html)
-If you already have graph-like data, use this step. Otherwise, try the [Hypergraph Transform](https://github.com/graphistry/pygraphistry/blob/master/demos/demos_by_use_case/logs/malware-hypergraph/Malware%20Hypergraph.ipynb) for creating graphs from rows of data (logs, samples, records, ...).
-PyGraphistry can plot graphs directly from Pandas data frames, Arrow tables, cuGraph GPU data frames, igraph graphs, or NetworkX graphs. Calling *plot* uploads the data to our visualization servers and return an URL to an embeddable webpage containing the visualization.
+## Visualization quickstart
-To define the graph, we `bind` *source* and *destination* to the columns indicating the start and end nodes of each edges:
+Quickly go from raw data to a styled and interactive Graphistry graph visualization:
```python
import graphistry
-graphistry.register(api=3, username='YOUR_ACCOUNT_HERE', password='YOUR_PASSWORD_HERE')
-
-g = graphistry.bind(source="source", destination="target")
-g.edges(links).plot()
-```
-
-You should see a beautiful graph like this one:
-![Graph of Miserables](http://i.imgur.com/dRHHTyK.png)
-
-### Adding Labels
-
-Let's add labels to edges in order to show how many times each pair of characters met. We create a new column called *label* in edge table *links* that contains the text of the label and we bind *edge_label* to it.
-
-```python
-links["label"] = links.value.map(lambda v: "#Meetings: %d" % v)
-g = g.bind(edge_title="label")
-g.edges(links).plot()
-```
-
-### Controlling Node Title, Size, Color, and Position
-
-Let's size nodes based on their [PageRank](http://en.wikipedia.org/wiki/PageRank) score and color them using their [community](https://en.wikipedia.org/wiki/Community_structure).
-
-#### Warmup: igraph for computing statistics
-
-[igraph](http://igraph.org/python/) already has these algorithms implemented for us for small graphs. (See our cuGraph examples for big graphs.) If igraph is not already installed, fetch it with `pip install igraph`.
-
-We start by converting our edge dateframe into an igraph. The plotter can do the conversion for us using the *source* and *destination* bindings. Then we compute two new node attributes (*pagerank* & *community*).
-
-```python
-g = g.compute_igraph('pagerank', directed=True, params={'damping': 0.85}).compute_igraph('community_infomap')
-```
-
-The algorithm names `'pagerank'` and `'community_infomap'` correspond to method names of [igraph.Graph](https://igraph.org/python/api/latest/igraph.Graph.html). Likewise, optional `params={...}` allow specifying additional parameters.
-
-#### Bind node data to visual node attributes
-
-We can then bind the node `community` and `pagerank` columns to visualization attributes:
-
-```python
-g.bind(point_color='community', point_size='pagerank').plot()
-```
-
-See the [color palette documentation](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2) for specifying color values by using built-in ColorBrewer palettes (`int32`) or custom RGB values (`int64`).
-
-To control the position, we can add `.bind(point_x='colA', point_y='colB').settings(url_params={'play': 0})` ([see demos](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/external_layout) and [additional url parameters](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions)]). In `api=1`, you created columns named `x` and `y`.
-
-You may also want to bind `point_title`: `.bind(point_title='colA')`.
-
-For more in-depth examples, check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-colors.ipynb) and [sizes](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-sizes.ipynb).
-
-![Second Graph of Miserables](http://i.imgur.com/P7fm5sn.png)
-
-### Add edge colors and weights
-
-By default, edges get colored as a gradient between their source/destination node colors. You can override this by setting `.bind(edge_color='colA')`, similar to how node colors function. ([See color documentation](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2).)
-
-Similarly, you can bind the edge weight, where higher weights cause nodes to cluster closer together: `.bind(edge_weight='colA')`. [See tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/edge-weights.ipynb).
-
-For more in-depth examples, check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-colors.ipynb) and [weighted clustering](demos/more_examples/graphistry_features/edge-weights.ipynb).
-
-### More advanced color and size controls
-
-You may want more controls like using gradients or maping specific values:
-
-```python
-g.encode_edge_color('int_col') # int32 or int64
-g.encode_edge_color('time_col', ["blue", "red"], as_continuous=True)
-g.encode_edge_color('type', as_categorical=True,
- categorical_mapping={"cat": "red", "sheep": "blue"}, default_mapping='#CCC')
-g.encode_edge_color('brand',
- categorical_mapping={'toyota': 'red', 'ford': 'blue'},
- default_mapping='#CCC')
-g.encode_point_size('numeric_col')
-g.encode_point_size('criticality',
- categorical_mapping={'critical': 200, 'ok': 100},
- default_mapping=50)
-g.encode_point_color('int_col') # int32 or int64
-g.encode_point_color('time_col', ["blue", "red"], as_continuous=True)
-g.encode_point_color('type', as_categorical=True,
- categorical_mapping={"cat": "red", "sheep": "blue"}, default_mapping='#CCC')
-```
-
-For more in-depth examples, check out the tutorials on [colors](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-colors.ipynb).
-
-### Custom icons and badges
-
-You can add a main icon and multiple peripherary badges to provide more visual information. Use column `type` for the icon type to appear visually in the legend. The glyph system supports text, icons, flags, and images, as well as multiple mapping and style controls.
-
-#### Main icon
-
-```python
-g.encode_point_icon(
- 'some_column',
- shape="circle", #clip excess
- categorical_mapping={
- 'macbook': 'laptop', #https://fontawesome.com/v4.7.0/icons/
- 'Canada': 'flag-icon-ca', #ISO3611-Alpha-2: https://github.com/datasets/country-codes/blob/master/data/country-codes.csv
- 'embedded_smile': 'data:svg...',
- 'external_logo': 'http://..../img.png'
- },
- default_mapping="question")
-g.encode_point_icon(
- 'another_column',
- continuous_binning=[
- [20, 'info'],
- [80, 'exclamation-circle'],
- [None, 'exclamation-triangle']
- ]
-)
-g.encode_point_icon(
- 'another_column',
- as_text=True,
- categorical_mapping={
- 'Canada': 'CA',
- 'United States': 'US'
- }
-)
-```
-
-For more in-depth examples, check out the tutorials on [icons](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-icons.ipynb).
-
-#### Badges
-
-```python
-# see icons examples for mappings and glyphs
-g.encode_point_badge('another_column', 'TopRight', categorical_mapping=...)
-
-g.encode_point_badge('another_column', 'TopRight', categorical_mapping=...,
- shape="circle",
- border={'width': 2, 'color': 'white', 'stroke': 'solid'},
- color={'mapping': {'categorical': {'fixed': {}, 'other': 'white'}}},
- bg={'color': {'mapping': {'continuous': {'bins': [], 'other': 'black'}}}})
-```
+import pandas as pd
-For more in-depth examples, check out the tutorials on [badges](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/encodings-badges.ipynb).
-
-#### Axes
-
-For more automated use, see the section on radial layouts below.
-
-Radial axes support three coloring types (`'external'`, `'internal'`, and `'space'`) and optional labels:
-
-```python
- g.encode_axis([
- {'r': 14, 'external': True, "label": "outermost"},
- {'r': 12, 'external': True},
- {'r': 10, 'space': True},
- {'r': 8, 'space': True},
- {'r': 6, 'internal': True},
- {'r': 4, 'space': True},
- {'r': 2, 'space': True, "label": "innermost"}
-])
-```
-
-Horizontal axis support optional labels and ranges:
-
-```python
-g.encode_axis([
- {"label": "a", "y": 2, "internal": True },
- {"label": "b", "y": 40, "external": True,
- "width": 20, "bounds": {"min": 40, "max": 400}},
-])
-```
-
-Radial axis are generally used with radial positioning:
-
-```python
-g2 = (g
- .nodes(
- g._nodes.assign(
- x = 1 + (g._nodes['ring']) * g._nodes['n'].apply(math.cos),
- y = 1 + (g._nodes['ring']) * g._nodes['n'].apply(math.sin)
- )).settings(url_params={'lockedR': 'true', 'play': 1000})
-```
-
-Horizontal axis are often used with pinned y and free x positions:
-
-```python
-g2 = (g
- .nodes(
- g._nodes.assign(
- y = 50 * g._nodes['level'])
- )).settings(url_params={'lockedY': 'true', 'play': 1000})
-```
-
-### Theming
-
-You can customize several style options to match your theme:
-
-```python
-g.addStyle(bg={'color': 'red'})
-g.addStyle(bg={
- 'color': '#333',
- 'gradient': {
- 'kind': 'radial',
- 'stops': [ ["rgba(255,255,255, 0.1)", "10%", "rgba(0,0,0,0)", "20%"] ]}})
-g.addStyle(bg={'image': {'url': 'http://site.com/cool.png', 'blendMode': 'multiply'}})
-g.addStyle(fg={'blendMode': 'color-burn'})
-g.addStyle(page={'title': 'My site'})
-g.addStyle(page={'favicon': 'http://site.com/favicon.ico'})
-g.addStyle(logo={'url': 'http://www.site.com/transparent_logo.png'})
-g.addStyle(logo={
- 'url': 'http://www.site.com/transparent_logo.png',
- 'dimensions': {'maxHeight': 200, 'maxWidth': 200},
- 'style': {'opacity': 0.5}
+# Raw data as Pandas CPU dataframes, cuDF GPU dataframes, Spark, ...
+df = pd.DataFrame({
+ 'src': ['Alice', 'Bob', 'Carol'],
+ 'dst': ['Bob', 'Carol', 'Alice'],
+ 'friendship': [0.3, 0.95, 0.8]
})
-```
-### Transforms
+# Bind
+g1 = graphistry.edges(df, 'src', 'dst')
-The below methods let you quickly manipulate graphs directly and with dataframe methods: Search, pattern mine, transform, and more:
+# Override styling defaults
+g1_styled = g1.encode_edge_color('friendship', ['blue', 'red'], as_continuous=True)
-```python
-from graphistry import n, e_forward, e_reverse, e_undirected, is_in
-g = (graphistry
- .edges(pd.DataFrame({
- 's': ['a', 'b'],
- 'd': ['b', 'c'],
- 'k1': ['x', 'y']
- }))
- .nodes(pd.DataFrame({
- 'n': ['a', 'b', 'c'],
- 'k2': [0, 2, 4, 6]
- })
-)
-
-g2 = graphistry.hypergraph(g._edges, ['s', 'd', 'k1'])['graph']
-g2.plot() # nodes are values from cols s, d, k1
-
-(g
- .materialize_nodes()
- .get_degrees()
- .get_indegrees()
- .get_outdegrees()
- .pipe(lambda g2: g2.nodes(g2._nodes.assign(t=x))) # transform
- .filter_edges_by_dict({"k1": "x"})
- .filter_nodes_by_dict({"k2": 4})
- .prune_self_edges()
- .hop( # filter to subgraph
- #almost all optional
- direction='forward', # 'reverse', 'undirected'
- hops=2, # number (1..n hops, inclusive) or None if to_fixed_point
- to_fixed_point=False,
-
- #every edge source node must match these
- source_node_match={"k2": 0, "k3": is_in(['a', 'b', 3, 4])},
- source_node_query='k2 == 0',
-
- #every edge must match these
- edge_match={"k1": "x"},
- edge_query='k1 == "x"',
-
- #every edge destination node must match these
- destination_node_match={"k2": 2},
- destination_node_query='k2 == 2 or k2 == 4',
- )
- .chain([ # filter to subgraph with Cypher-style GFQL
- n(),
- n({'k2': 0, "m": 'ok'}), #specific values
- n({'type': is_in(["type1", "type2"])}), #multiple valid values
- n(query='k2 == 0 or k2 == 4'), #dataframe query
- n(name="start"), # add column 'start':bool
- e_forward({'k1': 'x'}, hops=1), # same API as hop()
- e_undirected(name='second_edge'),
- e_reverse(
- {'k1': 'x'}, # edge property match
- hops=2, # 1 to 2 hops
- #same API as hop()
- source_node_match={"k2": 2},
- source_node_query='k2 == 2 or k2 == 4',
- edge_match={"k1": "x"},
- edge_query='k1 == "x"',
- destination_node_match={"k2": 0},
- destination_node_query='k2 == 0')
- ])
- # replace as one node the node w/ given id + transitively connected nodes w/ col=attr
- .collapse(node='some_id', column='some_col', attribute='some val')
-```
-
-Both `hop()` and `chain()` (GFQL) match dictionary expressions support dataframe series *predicates*. The above examples show `is_in([x, y, z, ...])`. Additional predicates include:
-
-* categorical: is_in, duplicated
-* temporal: is_month_start, is_month_end, is_quarter_start, is_quarter_end, is_year_start, is_year_end
-* numeric: gt, lt, ge, le, eq, ne, between, isna, notna
-* string: contains, startswith, endswith, match, isnumeric, isalpha, isdigit, islower, isupper, isspace, isalnum, isdecimal, istitle, isnull, notnull
-
-Both `hop()` and `chain()` will run on GPUs when passing in RAPIDS dataframes. Specify parameter `engine='cudf'` to be sure.
+# Connect: Free GPU accounts and self-hosting @ graphistry.com/get-started
+graphistry.register(api=3, username='your_username', password='your_password')
-#### Table to graph
-
-```python
-df = pd.read_csv('events.csv')
-hg = graphistry.hypergraph(df, ['user', 'email', 'org'], direct=True)
-g = hg['graph'] # g._edges: | src, dst, user, email, org, time, ... |
-g.plot()
+# Upload for GPU server visualization session
+g1_styled.plot()
```
-```python
-hg = graphistry.hypergraph(
- df,
- ['from_user', 'to_user', 'email', 'org'],
- direct=True,
- opts={
-
- # when direct=True, can define src -> [ dst1, dst2, ...] edges
- 'EDGES': {
- 'org': ['from_user'], # org->from_user
- 'from_user': ['email', 'to_user'], #from_user->email, from_user->to_user
- },
-
- 'CATEGORIES': {
- # determine which columns share the same namespace for node generation:
- # - if user 'louie' is both a from_user and to_user, show as 1 node
- # - if a user & org are both named 'louie', they will appear as 2 different nodes
- 'user': ['from_user', 'to_user']
- }
-})
-g = hg['graph']
-g.plot()
-```
+Explore [10 Minutes to Graphistry Visualization](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html) for more visualization examples and options.
-#### Generate node table
-```python
-g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}))
-g2 = g.materialize_nodes()
-g2._nodes # pd.DataFrame({'id': ['a', 'b', 'c']})
-```
+## PyGraphistry[AI] & GFQL quickstart - CPU & GPU
-#### Compute degrees
+**CPU graph pipeline** combining graph ML, AI, mining, and visualization:
```python
-g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}))
-g2 = g.get_degrees()
-g2._nodes # pd.DataFrame({
- # 'id': ['a', 'b', 'c'],
- # 'degree_in': [0, 1, 1],
- # 'degree_out': [1, 1, 0],
- # 'degree': [1, 1, 1]
- #})
-```
-
-See also `get_indegrees()` and `get_outdegrees()`
+from graphistry import n, e, e_forward, e_reverse
-#### Use igraph (CPU) and cugraph (GPU) compute
-
-Install the plugin of choice and then:
-
-```python
-g2 = g.compute_igraph('pagerank')
+# Graph analytics
+g2 = g1.compute_igraph('pagerank')
assert 'pagerank' in g2._nodes.columns
-g3 = g.compute_cugraph('pagerank')
-assert 'pagerank' in g2._nodes.columns
-```
-
-#### Graph pattern matching
-
-PyGraphistry supports GFQL, its PyData-native variant of the popular Cypher graph query language, meaning you can do graph pattern matching directly from Pandas dataframes without installing a database or Java
-
-See also [graph pattern matching tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb) and the CPU/GPU [benchmark](https://github.com/graphistry/pygraphistry/tree/master/demos/gfql/benchmark_hops_cpu_gpu.ipynb)
-
-Traverse within a graph, or expand one graph against another
-
-Simple node and edge filtering via `filter_edges_by_dict()` and `filter_nodes_by_dict()`:
-
-```python
-g = graphistry.edges(pd.read_csv('data.csv'), 's', 'd')
-g2 = g.materialize_nodes()
+# Graph ML/AI
+g3 = g2.umap()
+assert ('x' in g3._nodes.columns) and ('y' in g3._nodes.columns)
-g3 = g.filter_edges_by_dict({"v": 1, "b": True})
-g4 = g.filter_nodes_by_dict({"v2": 1, "b2": True})
-```
-
-Method `.hop()` enables slightly more complicated edge filters:
-
-```python
-
-from graphistry import is_in, gt
-
-# (a)-[{"v": 1, "type": "z"}]->(b) based on g
-g2b = g2.hop(
- source_node_match={g2._node: "a"},
- edge_match={"v": 1, "type": "z"},
- destination_node_match={g2._node: "b"})
-g2b = g2.hop(
- source_node_query='n == "a"',
- edge_query='v == 1 and type == "z"',
- destination_node_query='n == "b"')
-
-# (a {x in [1,2] and y > 3})-[e]->(b) based on g
-g2c = g2.hop(
- source_node_match={
- g2._node: "a",
- "x": is_in([1,2]),
- "y": gt(3)
- },
- destination_node_match={g2._node: "b"})
-)
-
-# (a or b)-[1 to 8 hops]->(anynode), based on graph g2
-g3 = g2.hop(pd.DataFrame({g2._node: ['a', 'b']}), hops=8)
-
-# (a or b)-[1 to 8 hops]->(anynode), based on graph g2
-g3 = g2.hop(pd.DataFrame({g2._node: is_in(['a', 'b'])}), hops=8)
-
-# (c)<-[any number of hops]-(any node), based on graph g3
-# Note multihop matches check source/destination/edge match/query predicates
-# against every encountered edge for it to be included
-g4 = g3.hop(source_node_match={"node": "c"}, direction='reverse', to_fixed_point=True)
-
-# (c)-[incoming or outgoing edge]-(any node),
-# for c in g4 with expansions against nodes/edges in g2
-g5 = g2.hop(pd.DataFrame({g4._node: g4[g4._node]}), hops=1, direction='undirected')
-
-g5.plot()
-```
-
-Rich compound patterns are enabled via `.chain()`:
-
-```python
-from graphistry import n, e_forward, e_reverse, e_undirected, is_in
-
-g2.chain([ n() ])
-g2.chain([ n({"x": 1, "y": True}) ]),
-g2.chain([ n(query='x == 1 and y == True') ]),
-g2.chain([ n({"z": is_in([1,2,4,'z'])}) ]), # multiple valid values
-g2.chain([ e_forward({"type": "x"}, hops=2) ]) # simple multi-hop
-g3 = g2.chain([
- n(name="start"), # tag node matches
- e_forward(hops=3),
- e_forward(name="final_edge"), # tag edge matches
- n(name="end")
+# Graph querying with GFQL
+g4 = g3.chain([
+ n(query='pagerank > 0.1'), e_forward(), n(query='pagerank > 0.1')
])
-g2.chain(n(), e_forward(), n(), e_reverse(), n()]) # rich shapes
-print('# end nodes: ', len(g3._nodes[ g3._nodes.end ]))
-print('# end edges: ', len(g3._edges[ g3._edges.final_edge ]))
-```
-
-See table above for more predicates like `is_in()` and `gt()`
-
-Queries can be serialized and deserialized, such as for saving and remote execution:
+assert (g4._nodes.pagerank > 0.1).all()
-```python
-from graphistry.compute.chain import Chain
-
-pattern = Chain([n(), e(), n()])
-pattern_json = pattern.to_json()
-pattern2 = Chain.from_json(pattern_json)
-g.chain(pattern2).plot()
+# Upload for GPU server visualization session
+g4.plot()
```
-Benefit from automatic GPU acceleration by passing in GPU dataframes:
+The **automatic GPU modes** require almost no code changes:
```python
import cudf
+from graphistry import n, e, e_forward, e_reverse
-g1 = graphistry.edges(cudf.read_csv('data.csv'), 's', 'd')
-g2 = g1.chain(..., engine='cudf')
-```
-
-The parameter `engine` is optional, defaulting to `'auto'`.
+# Modified -- Rebind data as a GPU dataframe and swap in a GPU plugin call
+g1_gpu = g1.edges(cudf.from_pandas(df))
+g2 = g1_gpu.compute_cugraph('pagerank')
-#### Pipelining
-
-```python
-def capitalize(df, col):
- df2 = df.copy()
- df2[col] df[col].str.capitalize()
- return df2
-
-g
- .cypher('MATCH (a)-[e]->(b) RETURN a, e, b')
- .nodes(lambda g: capitalize(g._nodes, 'nTitle'))
- .edges(capitalize, None, None, 'eTitle'),
- .pipe(lambda g: g.nodes(g._nodes.pipe(capitalize, 'nTitle')))
-```
-
-#### Removing nodes
-
-```python
-g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'c'], 'd': ['b', 'c', 'a']}))
-g2 = g.drop_nodes(['c']) # drops node c, edge c->a, edge b->c,
-```
-
-#### Keeping nodes
-
-```python
-# keep nodes [a,b,c] and edges [(a,b),(b,c)]
-g2 = g.keep_nodes(['a, b, c'])
-g2 = g.keep_nodes(pd.Series(['a, b, c']))
-g2 = g.keep_nodes(cudf.Series(['a, b, c']))
-```
-
-#### Collapsing adjacent nodes with specific k=v matches
-
-One col/val pair:
-
-```python
-g2 = g.collapse(
- node='root_node_id', # rooted traversal beginning
- column='some_col', # column to inspect
- attribute='some val' # value match to collapse on if hit
-)
-assert len(g2._nodes) <= len(g._nodes)
-```
-
-Collapse for all possible vals in a column, and assuming a stable root node id:
-
-```python
-g3 = g
-for v in g._nodes['some_col'].unique():
- g3 = g3.collapse(node='root_node_id', column='some_col', attribute=v)
-```
-
-### Hierarchical layouts: Tree and radial
-
-A hierachical view via horizontal or vertical trees, or radial. Graph data may also be presented using these layouts.
-
-#### Tree
-
-```python
-g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'b'], 'd': ['b', 'c', 'd']}))
-
-g2a = g.tree_layout()
-g2b = g2.tree_layout(allow_cycles=False, remove_self_loops=False, vertical=False)
-g2c = g2.tree_layout(ascending=False, level_align='center')
-g2d = g2.tree_layout(level_sort_values_by=['type', 'degree'], level_sort_values_by_ascending=False)
-
-g3a = g2a.layout_settings(locked_r=True, play=1000)
-g3b = g2a.layout_settings(locked_y=True, play=0)
-g3c = g2a.layout_settings(locked_x=True)
-
-g4 = g2.tree_layout().rotate(90)
-```
-
-To use with non-tree data, e.g., graphs with cycles, we recommend computing a tree such as via a minimum spanning tree, and then using that achieved layout with this algorithm. Alternatively, the radial layouts may more naturally support your graph.
-
-#### Radial
-
-A hierarchical view via radial rings that may be more space-efficient and aesthetic than the equivalent tree layout
-
-Supports time-based, continuous, and categorical modes:
-
-##### Radial: Time-based
-
-Use when the value column defining the ring order is a time column. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_time_ring.ipynb)
-
-```python
-g.time_ring_layout().plot() # finds a time column and infers all settings
-
-g.time_ring_layout(
- time_col='my_node_time_col',
- num_rings=20,
- time_start=np.datetime64('2014-01-22'),
- time_end=np.datetime64('2015-01-22'),
- time_unit= 'Y', # s, m, h, D, W, M, Y, C
- min_r=100.0, # smallest ring radius
- max_r=1000.0, # biggest ring radius
- reverse=False,
- #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None,
- #format_label: Optional[Callable[[np.datetime64, int, np.timedelta64], str]] = None,
- #play_ms: int = 2000,
- #engine='auto' # 'auto', 'pandas', 'cudf'
-).plot()
-```
-
-#### Continuous
-
-Use when the value column defining the ring order is a continuous number, like distance or amount. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_continuous_ring.ipynb)
-
-```python
-g.ring_continuous_layout() # find a numeric column and infers all settings
-
-g.ring_continuous_layout(
- ring_col='my_numeric_col',
- #v_start= # first ring at this value
- #v_end= # last ring at this value
- #v_step= # distance between rings in the value domain
- min_r=100.0, # smallest ring radius
- max_r=1000.0, # biggest ring radius
- normalize_ring_col=True, # remap [v_start,v_end] to [min_r,max_r]
- num_rings=20,
- ring_step=100,
-
- #Control axis labels and styles
- #axis: Optional[Union[Dict[float,str],List[str]]] = None,
- #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None,
- #format_labels: Optional[Callable[[float, int, float], str]] = None,
-
- reverse=False,
- play_ms=0,
- #engine='auto', # 'auto', 'pandas', 'cudf'
-)
-```
-
-#### Categorical
-
-Use when the value column defining the ring order is a categorical value, such as a name or ID. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_categorical_ring.ipynb)
-
-```python
-g.ring_categorical_layout('my_categorical_col') # infers all settings
-
-g.ring_categorical_layout(
- ring_col='my_numeric_col',
- order=['col1', 'my_col2'],
- drop_empty=True, # remove unpopulated rings
- combine_unhandled=False, # Put values not covered by order into one ring Other vs a ring per unique value
- append_unhandled=True, # Append vs prepend
- min_r=100.0, # smallest ring radius
- max_r=1000.0, # biggest ring radius
-
- #Control axis labels and styles
- #axis: Optional[Dict[Any,str]] = None,
- #format_axis: Optional[Callable[[List[Dict]], List[Dict]]] = None,
- #format_labels: Optional[Callable[[Any, int, float], str]] = None,
-
- reverse=False,
- play_ms=0,
- #engine='auto', # 'auto', 'pandas', 'cudf'
-)
-```
-
-### Layout: Modularity weighted
-
-Weight edges by community membership to emphasize community structure. See [(Notebook tutorial)](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/layout_modularity_weighted.ipynb)
-
-```python
-g.modularity_weighted_layout().plot()
-g.modularity_weighted_layout('my_community_col').plot()
-g.modularity_weighted_layout(
- community_alg='louvain',
- engine='cudf',
- same_community_weight=2.0,
- cross_community_weight=0.3,
- edge_influence=2.0
-).plot()
-```
-
-### Plugin: igraph
-
-With `pip install graphistry[igraph]`, you can also use [`igraph` layouts](https://igraph.org/python/doc/api/igraph.Graph.html#layout):
-
-```python
-g.layout_igraph('sugiyama').plot()
-g.layout_igraph('sugiyama', directed=True, params={}).plot()
+# Unmodified -- Automatic GPU mode for all ML, AI, GFQL queries, & visualization APIs
+g3 = g2.umap()
+g4 = g3.chain([
+ n(query='pagerank > 0.1'), e_forward(), n(query='pagerank > 0.1')
+])
+g4.plot()
```
-See list [`layout_algs`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins/igraph.py#L365)
+Explore [10 Minutes to PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/10min.html) for a wider variety of graph processing.
-### Plugin: cugraph
-With [Nvidia RAPIDS cuGraph](https://www.rapids.ai) install:
+## PyGraphistry documentation
-```python
-g.layout_cugraph('force_atlas2').plot()
-help(g.layout_cugraph)
-```
+* [Main PyGraphistry documentation](https://pygraphistry.readthedocs.io/en/latest/)
+* 10 Minutes to: [PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/10min.html), [Visualization](https://pygraphistry.readthedocs.io/en/latest/visualization/10min.html), [GFQL](https://pygraphistry.readthedocs.io/en/latest/gfql/about.html)
+* Get started: [Install](https://pygraphistry.readthedocs.io/en/latest/install/index.html), [UI Guide](https://hub.graphistry.com/docs/ui/index/), [Notebooks](https://pygraphistry.readthedocs.io/en/latest/demos/for_analysis.html)
+* Performance: [PyGraphistry CPU+GPU](https://pygraphistry.readthedocs.io/en/latest/performance.html) & [GFQL CPU+GPU](https://pygraphistry.readthedocs.io/en/latest/gfql/performance.html)
+* API References
+ - [PyGraphistry API Reference](https://pygraphistry.readthedocs.io/en/latest/api/index.html): [Visualization & Compute](https://pygraphistry.readthedocs.io/en/latest/visualization/index.html), [PyGraphistry Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/cheatsheet.html)
+ - [GFQL Documentation](https://pygraphistry.readthedocs.io/en/latest/gfql/index.html): [GFQL Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/quick.html) and [GFQL Operator Cheatsheet](https://pygraphistry.readthedocs.io/en/latest/gfql/predicates/quick.html)
+ - [Plugins](https://pygraphistry.readthedocs.io/en/latest/plugins.html): Databricks, Splunk, Neptune, Neo4j, RAPIDS, and more
+ - Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/)
-See list [`layout_algs`](https://github.com/graphistry/pygraphistry/blob/master/graphistry/plugins/cugraph.py#L315)
+## Graphistry ecosystem
-#### Group-in-a-box layout
+- **Graphistry server:**
+ - Launch - [Graphistry Hub, Graphistry cloud marketplaces, and self-hosting](https://www.graphistry.com/get-started)
+ - Self-hosting: [Administration (including Docker)](https://github.com/graphistry/graphistry-cli) & [Kubernetes](https://github.com/graphistry/graphistry-helm)
-[Group-in-a-box layout](https://ieeexplore.ieee.org/document/6113135) with igraph/pandas and cugraph/cudf implementations:
+- **Graphistry client APIs:**
+ - Web: [iframe](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions), [JavaScript](https://hub.graphistry.com/static/js-docs/index.html?path=/docs/introduction--docs), [REST](https://hub.graphistry.com/docs/api/1/rest/auth/)
+ - [PyGraphistry](https://pygraphistry.readthedocs.io/en/latest/index.html)
+ - [Graphistry for Microsoft PowerBI](https://hub.graphistry.com/docs/powerbi/pbi/)
-```python
-g.group_in_a_box_layout().plot()
-g.group_in_a_box_layout(
- partition_alg='ecg', # see igraph/cugraph algs
- #partition_key='some_col', # use existing col
- #layout_alg='circle', # see igraph/cugraph algs
- #x, y, w, h
- #encode_colors=False,
- #colors=['#FFF', '#FF0', ...]
- engine='cudf'
-).plot()
-```
+- **Additional projects**:
+ - [Louie.ai](https://louie.ai/): GenAI-native notebooks & dashboards to talk to your databases & Graphistry
+ - [graph-app-kit](https://github.com/graphistry/graph-app-kit): Streamlit Python dashboards with batteries-included graph packages
+ - [cu-cat](https://github.com/graphistry/cu-cat): Automatic GPU feature engineering
-### Control render settings
-```python
-g = graphistry.edges(pd.DataFrame({'s': ['a', 'b', 'b'], 'd': ['b', 'c', 'd']}))
-g2 = g.scene_settings(
- #hide menus
- menu=False,
- info=False,
- #tweak graph
- show_arrows=False,
- point_size=1.0,
- edge_curvature=0.0,
- edge_opacity=0.5,
- point_opacity=0.9
-).plot()
+## Community and support
-```
+- [Blog](https://www.graphistry.com/blog) for tutorials, case studies, and updates
+- [Slack](https://join.slack.com/t/graphistry-community/shared_invite/zt-53ik36w2-fpP0Ibjbk7IJuVFIRSnr6g): Join the Graphistry Community Slack for discussions and support
+- [Twitter](https://twitter.com/graphistry) & [LinkedIn](https://www.linkedin.com/company/graphistry): Follow for updates
+- [GitHub Issues](https://github.com/graphistry/pygraphistry/issues) open source support
+- [Graphistry ZenDesk](https://graphistry.zendesk.com/) dedicated enterprise support
-With `pip install graphistry[igraph]`, you can also use [`igraph` layouts](https://igraph.org/python/doc/api/igraph.Graph.html#layout):
+## Contribute
-```python
-g.layout_igraph('sugiyama').plot()
-g.layout_igraph('sugiyama', directed=True, params={}).plot()
-```
+See [CONTRIBUTE](https://pygraphistry.readthedocs.io/en/latest/CONTRIBUTE.html) and [DEVELOP](https://pygraphistry.readthedocs.io/en/latest/DEVELOP.html) for participating in PyGraphistry development, or reach out to our team
-## Next Steps
-
-1. Create a free public data [Graphistry Hub](https://www.graphistry.com/get-started) account or [one-click launch a private Graphistry instance in AWS](https://www.graphistry.com/get-started)
-2. Check out the [analyst](https://github.com/graphistry/pygraphistry/tree/master/demos/for_analysis.ipynb) and [developer](https://github.com/graphistry/pygraphistry/tree/master/demos/for_developers.ipynb) introductions, or [try your own CSV](https://github.com/graphistry/pygraphistry/tree/master/demos/upload_csv_miniapp.ipynb)
-3. Explore the [demos folder](https://github.com/graphistry/pygraphistry/tree/master/demos) for your favorite [file format, database, API](https://github.com/graphistry/pygraphistry/tree/master/demos/demos_databases_apis), use case domain, kind of analysis, and [visual analytics feature](https://github.com/graphistry/pygraphistry/tree/master/demos/more_examples/graphistry_features)
-
-## Resources
-
-* Graphistry [In-Tool UI Guide](https://hub.graphistry.com/docs/ui/index/)
-* [General and REST API docs](https://hub.graphistry.com/docs/api/):
- * [URL settings](https://hub.graphistry.com/docs/api/1/rest/url/#urloptions)
- * [Authentication](https://hub.graphistry.com/docs/api/1/rest/auth/)
- * [Uploading](https://hub.graphistry.com/docs/api/2/rest/upload/#createdataset2), including multiple file formats and settings
- * [Color bindings](https://hub.graphistry.com/docs/api/2/rest/upload/colors/#extendedpalette2) and [color palettes](https://hub.graphistry.com/docs/api/api-color-palettes/) (ColorBrewer)
- * Bindings and colors, REST API, embedding URLs and URL parameters, dynamic JS API, and more
- * JavaScript and more!
-* Python-specific
- * [Python API ReadTheDocs](http://pygraphistry.readthedocs.org/en/latest/)
- * Within a notebook, you can always run `help(graphistry)`, `help(graphistry.hypergraph)`, etc.
-* [Administration docs](https://github.com/graphistry/graphistry-cli) for sizing, installing, configuring, managing, and updating Graphistry servers
-* [Graph-App-Kit Dashboarding](https://github.com/graphistry/graph-app-kit) dashboarding
diff --git a/bin/lint.sh b/bin/lint.sh
index 8c4d2f3c2..22c986570 100755
--- a/bin/lint.sh
+++ b/bin/lint.sh
@@ -19,7 +19,7 @@ flake8 \
graphistry \
--exclude graphistry/graph_vector_pb2.py,graphistry/_version.py \
--count \
- --ignore=C901,E121,E122,E123,E124,E125,E128,E131,E144,E201,E202,E203,E231,E251,E265,E301,E302,E303,E401,E501,E722,F401,W291,W293 \
+ --ignore=C901,E121,E122,E123,E124,E125,E128,E131,E144,E201,E202,E203,E231,E251,E265,E301,E302,E303,E401,E501,E722,F401,W291,W293,W503 \
--max-complexity=10 \
--max-line-length=127 \
--statistics
diff --git a/bin/test-graphviz.sh b/bin/test-graphviz.sh
new file mode 100755
index 000000000..765cc8577
--- /dev/null
+++ b/bin/test-graphviz.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -ex
+
+# Run from project root; runs only the graphviz plugin tests
+# - Extra args are forwarded to pytest (e.g., -k <expr>, -x)
+# - Non-zero exit code on fail (via set -e)
+
+# Assume [pygraphviz,test], apt-get install graphviz graphviz-dev
+
+python -m pytest --version
+
+python -B -m pytest -vv \
+    graphistry/tests/plugins/test_graphviz.py "$@"
diff --git a/demos/ai/cyber/CyberSecurity-Slim.ipynb b/demos/ai/cyber/CyberSecurity-Slim.ipynb
index 9b6cfdb7a..d8ee1c916 100644
--- a/demos/ai/cyber/CyberSecurity-Slim.ipynb
+++ b/demos/ai/cyber/CyberSecurity-Slim.ipynb
@@ -190,7 +190,7 @@
"id": "125f6ef0",
"metadata": {},
"source": [
- "# Fast Incident Response\n",
+ "## Fast Incident Response\n",
"An Incident Responder needs to quickly find which IP is the attacker.\n",
"\n",
"If, say, a predictive model enriched the data, responders could repeat the pipeline on new data\n",
@@ -285,7 +285,7 @@
"id": "caf504e5",
"metadata": {},
"source": [
- "# Do we have a predictive model?\n",
+ "## Do we have a predictive model?\n",
"\n",
"Using the x, y's we get from autofeaturization, we fit two RandomForest models"
]
@@ -378,7 +378,7 @@
"id": "671557b5",
"metadata": {},
"source": [
- "# Let's remove edges and see if there is a model of just 'common features' (ie no ip addresses)\n",
+ "## Let's remove edges and see if there is a model of just 'common features' (ie no ip addresses)\n",
"\n",
"Given learnings, we want to see if there is a model that does not use edge information (ie, no IP addresses, only connection metadata)"
]
@@ -525,7 +525,7 @@
"id": "71166b62",
"metadata": {},
"source": [
- "# Hence we see that including just common features clusters botnet traffic together under featurization and UMAP"
+ "## Hence we see that including just common features clusters botnet traffic together under featurization and UMAP"
]
},
{
@@ -557,7 +557,7 @@
"id": "762b80ed",
"metadata": {},
"source": [
- "# Now we dive deeper\n",
+ "## Now we dive deeper\n",
"-----------------------------------------"
]
},
@@ -566,7 +566,7 @@
"id": "2bac394b",
"metadata": {},
"source": [
- "# Let's encode the graph as a DGL graph for use in Machine Learning"
+ "## Let's encode the graph as a DGL graph for use in Machine Learning"
]
},
{
@@ -757,7 +757,7 @@
"id": "00751e3b",
"metadata": {},
"source": [
- "# Contributions\n",
+ "## Contributions\n",
"\n",
"Now we know how to take raw data and turn them into actionable features and models using the Graphistry[ai] API.\n",
"\n",
diff --git a/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb b/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb
old mode 100644
new mode 100755
index ab4126ce8..515a27057
--- a/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb
+++ b/demos/demos_databases_apis/databricks_pyspark/graphistry-notebook-dashboard.ipynb
@@ -39,128 +39,122 @@
}
},
"source": [
- "## Install & connect"
+ "## Install & authenticate with graphistry server"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "eaf03d3c-d046-4f96-825e-5db2355af383",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Requirement already satisfied: graphistry in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (0.28.5)\r\n",
- "Requirement already satisfied: numpy in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.20.3)\r\n",
- "Requirement already satisfied: pandas>=0.17.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.3.4)\r\n",
- "Requirement already satisfied: packaging>=20.1 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (21.0)\r\n",
- "Requirement already satisfied: squarify in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (0.4.3)\r\n",
- "Requirement already satisfied: palettable>=3.0 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (3.3.0)\r\n",
- "Requirement already satisfied: typing-extensions in /databricks/python3/lib/python3.9/site-packages (from graphistry) (3.10.0.2)\r\n",
- "Requirement already satisfied: pyarrow>=0.15.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (7.0.0)\r\n",
- "Requirement already satisfied: requests in /databricks/python3/lib/python3.9/site-packages (from graphistry) (2.26.0)\r\n",
- "Requirement already satisfied: pyparsing>=2.0.2 in /databricks/python3/lib/python3.9/site-packages (from packaging>=20.1->graphistry) (3.0.4)\r\n",
- "Requirement already satisfied: python-dateutil>=2.7.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2.8.2)\r\n",
- "Requirement already satisfied: pytz>=2017.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2021.3)\r\n",
- "Requirement already satisfied: six>=1.5 in /databricks/python3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.17.0->graphistry) (1.16.0)\r\n",
- "Requirement already satisfied: idna<4,>=2.5 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (3.2)\r\n",
- "Requirement already satisfied: charset-normalizer~=2.0.0 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2.0.4)\r\n",
- "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (1.26.7)\r\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2021.10.8)\r\n",
- "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.3.1 is available.\r\n",
- "You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "Requirement already satisfied: graphistry in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (0.28.5)\r\nRequirement already satisfied: numpy in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.20.3)\r\nRequirement already satisfied: pandas>=0.17.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (1.3.4)\r\nRequirement already satisfied: packaging>=20.1 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (21.0)\r\nRequirement already satisfied: squarify in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (0.4.3)\r\nRequirement already satisfied: palettable>=3.0 in /local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/lib/python3.9/site-packages (from graphistry) (3.3.0)\r\nRequirement already satisfied: typing-extensions in /databricks/python3/lib/python3.9/site-packages (from graphistry) (3.10.0.2)\r\nRequirement already satisfied: pyarrow>=0.15.0 in /databricks/python3/lib/python3.9/site-packages (from graphistry) (7.0.0)\r\nRequirement already satisfied: requests in /databricks/python3/lib/python3.9/site-packages (from graphistry) (2.26.0)\r\nRequirement already satisfied: pyparsing>=2.0.2 in /databricks/python3/lib/python3.9/site-packages (from packaging>=20.1->graphistry) (3.0.4)\r\nRequirement already satisfied: python-dateutil>=2.7.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2.8.2)\r\nRequirement already satisfied: pytz>=2017.3 in /databricks/python3/lib/python3.9/site-packages (from pandas>=0.17.0->graphistry) (2021.3)\r\nRequirement already satisfied: six>=1.5 in /databricks/python3/lib/python3.9/site-packages (from python-dateutil>=2.7.3->pandas>=0.17.0->graphistry) (1.16.0)\r\nRequirement already satisfied: idna<4,>=2.5 in /databricks/python3/lib/python3.9/site-packages (from 
requests->graphistry) (3.2)\r\nRequirement already satisfied: charset-normalizer~=2.0.0 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2.0.4)\r\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (1.26.7)\r\nRequirement already satisfied: certifi>=2017.4.17 in /databricks/python3/lib/python3.9/site-packages (from requests->graphistry) (2021.10.8)\r\n\u001b[33mWARNING: You are using pip version 21.2.4; however, version 22.3.1 is available.\r\nYou should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-969db892-92cf-4b34-a5cf-61642fa76e77/bin/python -m pip install --upgrade pip' command.\u001b[0m\r\n",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "type": "ansi"
- }
+ "outputs": [],
+ "source": [
+ "# Uncomment and run first time or\n",
+ "# have databricks admin install graphistry python library: \n",
+ "# https://docs.databricks.com/en/libraries/package-repositories.html#pypi-package\n",
+ "\n",
+ "#%pip install graphistry\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
},
- "output_type": "display_data"
+ "inputWidgets": {},
+ "nuid": "8ad9b072-f037-4d4a-a1fa-ca2c14bd639f",
+ "showTitle": false,
+ "title": ""
}
- ],
+ },
+ "outputs": [],
"source": [
- "# Uncomment and run first time\n",
- "! pip install graphistry\n",
- "#! pip install git+https://github.com/graphistry/pygraphistry.git@dev/databricks\n",
- " \n",
- "# Can sometimes help:\n",
- "#dbutils.library.restartPython()"
+ "# Required to run after pip install to pick up new python package: \n",
+ "dbutils.library.restartPython()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
- "nuid": "9e649f0e-fca5-4be6-8ad6-fa781bbb81d6",
+ "nuid": "cfd253ba-c647-4c45-8048-58b0ca427569",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
- "#Optional: Uncomment - We find this speeds up calls 10%+ on some datasets\n",
- "#spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\")"
+ "import graphistry # if not yet available, install pygraphistry and/or restart Python kernel using the cells above\n",
+ "graphistry.__version__"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
- "nuid": "cfd253ba-c647-4c45-8048-58b0ca427569",
+ "nuid": "55e30c26-3a8c-46dc-8eff-bd730d3c7798",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Out[12]: '0.28.5'"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "Out[12]: '0.28.5'",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "type": "ansi"
- }
+ "source": [
+ "### Use databricks secrets to retrieve graphistry creds and pass to register "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
},
- "output_type": "display_data"
+ "inputWidgets": {},
+ "nuid": "b5496fa5-525a-48c9-ad46-0ce17ebdc4f8",
+ "showTitle": false,
+ "title": ""
}
- ],
+ },
+ "outputs": [],
"source": [
- "import graphistry # if not yet available, install and/or restart Python kernel using the above\n",
"\n",
- "# To specify Graphistry account & server, use:\n",
- "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n",
- "# For more options, see https://github.com/graphistry/pygraphistry#configure\n",
+ "# As a best practice, use databricks secrets to store graphistry personal key (access token)\n",
+ "# create databricks secrets: https://docs.databricks.com/en/security/secrets/index.html \n",
+ "# create graphistry personal key: https://hub.graphistry.com/account/tokens\n",
"\n",
- "graphistry.__version__"
+ "graphistry.register(api=3, \n",
+ " personal_key_id=dbutils.secrets.get(scope=\"my-secret-scope\", key=\"graphistry-personal_key_id\"), \n",
+ " personal_key_secret=dbutils.secrets.get(scope=\"my-secret-scope\", key=\"graphistry-personal_key_secret\"), \n",
+ " protocol='https',\n",
+ " server='hub.graphistry.com')\n",
+ "\n",
+ "# Alternatively, use username and password: \n",
+ "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n",
+ "# For more options, see https://github.com/graphistry/pygraphistry#configure"
]
},
{
@@ -188,337 +182,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "c187c650-01c2-4e48-b8e0-803e937cdb11",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "type: \n"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "type: \n",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "type": "ansi"
- }
- },
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "battery_level c02_level cca2 cca3 cn device_id device_name humidity ip latitude lcd longitude scale temp timestamp 8 868 US USA United States 1 meter-gauge-1xbYRYcj 51 68.161.225.1 38.0 green -97.0 Celsius 34 1458444054093 7 1473 NO NOR Norway 2 sensor-pad-2n2Pea 70 213.161.254.1 62.47 red 6.15 Celsius 11 1458444054119 2 1556 IT ITA Italy 3 device-mac-36TWSKiT 44 88.36.5.1 42.83 red 12.83 Celsius 19 1458444054120 6 1080 US USA United States 4 sensor-pad-4mzWkz 32 66.39.173.154 44.06 yellow -121.32 Celsius 28 1458444054121 4 931 PH PHL Philippines 5 therm-stick-5gimpUrBB 62 203.82.41.9 14.58 green 120.97 Celsius 25 1458444054122 3 1210 US USA United States 6 sensor-pad-6al7RTAobR 51 204.116.105.67 35.93 yellow -85.46 Celsius 27 1458444054122 3 1129 CN CHN China 7 meter-gauge-7GeDoanM 26 220.173.179.1 22.82 yellow 108.32 Celsius 18 1458444054123 0 1536 JP JPN Japan 8 sensor-pad-8xUD6pzsQI 35 210.173.177.1 35.69 red 139.69 Celsius 27 1458444054123 3 807 JP JPN Japan 9 device-mac-9GcjZ2pw 85 118.23.68.227 35.69 green 139.69 Celsius 13 1458444054124 7 1470 US USA United States 10 sensor-pad-10BsywSYUF 56 208.109.163.218 33.61 red -111.89 Celsius 26 1458444054125
"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "aggData": [],
- "aggError": "",
- "aggOverflow": false,
- "aggSchema": [],
- "aggSeriesLimitReached": false,
- "aggType": "",
- "arguments": {},
- "columnCustomDisplayInfos": {},
- "data": [
- [
- 8,
- 868,
- "US",
- "USA",
- "United States",
- 1,
- "meter-gauge-1xbYRYcj",
- 51,
- "68.161.225.1",
- 38,
- "green",
- -97,
- "Celsius",
- 34,
- 1458444054093
- ],
- [
- 7,
- 1473,
- "NO",
- "NOR",
- "Norway",
- 2,
- "sensor-pad-2n2Pea",
- 70,
- "213.161.254.1",
- 62.47,
- "red",
- 6.15,
- "Celsius",
- 11,
- 1458444054119
- ],
- [
- 2,
- 1556,
- "IT",
- "ITA",
- "Italy",
- 3,
- "device-mac-36TWSKiT",
- 44,
- "88.36.5.1",
- 42.83,
- "red",
- 12.83,
- "Celsius",
- 19,
- 1458444054120
- ],
- [
- 6,
- 1080,
- "US",
- "USA",
- "United States",
- 4,
- "sensor-pad-4mzWkz",
- 32,
- "66.39.173.154",
- 44.06,
- "yellow",
- -121.32,
- "Celsius",
- 28,
- 1458444054121
- ],
- [
- 4,
- 931,
- "PH",
- "PHL",
- "Philippines",
- 5,
- "therm-stick-5gimpUrBB",
- 62,
- "203.82.41.9",
- 14.58,
- "green",
- 120.97,
- "Celsius",
- 25,
- 1458444054122
- ],
- [
- 3,
- 1210,
- "US",
- "USA",
- "United States",
- 6,
- "sensor-pad-6al7RTAobR",
- 51,
- "204.116.105.67",
- 35.93,
- "yellow",
- -85.46,
- "Celsius",
- 27,
- 1458444054122
- ],
- [
- 3,
- 1129,
- "CN",
- "CHN",
- "China",
- 7,
- "meter-gauge-7GeDoanM",
- 26,
- "220.173.179.1",
- 22.82,
- "yellow",
- 108.32,
- "Celsius",
- 18,
- 1458444054123
- ],
- [
- 0,
- 1536,
- "JP",
- "JPN",
- "Japan",
- 8,
- "sensor-pad-8xUD6pzsQI",
- 35,
- "210.173.177.1",
- 35.69,
- "red",
- 139.69,
- "Celsius",
- 27,
- 1458444054123
- ],
- [
- 3,
- 807,
- "JP",
- "JPN",
- "Japan",
- 9,
- "device-mac-9GcjZ2pw",
- 85,
- "118.23.68.227",
- 35.69,
- "green",
- 139.69,
- "Celsius",
- 13,
- 1458444054124
- ],
- [
- 7,
- 1470,
- "US",
- "USA",
- "United States",
- 10,
- "sensor-pad-10BsywSYUF",
- 56,
- "208.109.163.218",
- 33.61,
- "red",
- -111.89,
- "Celsius",
- 26,
- 1458444054125
- ]
- ],
- "datasetInfos": [],
- "dbfsResultPath": null,
- "isJsonSchema": true,
- "metadata": {},
- "overflow": false,
- "plotOptions": {
- "customPlotOptions": {},
- "displayType": "table",
- "pivotAggregation": null,
- "pivotColumns": [],
- "xColumns": [],
- "yColumns": []
- },
- "removedWidgets": [],
- "schema": [
- {
- "metadata": "{}",
- "name": "battery_level",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "c02_level",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "cca2",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "cca3",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "cn",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "device_id",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "device_name",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "humidity",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "ip",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "latitude",
- "type": "\"double\""
- },
- {
- "metadata": "{}",
- "name": "lcd",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "longitude",
- "type": "\"double\""
- },
- {
- "metadata": "{}",
- "name": "scale",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "temp",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "timestamp",
- "type": "\"long\""
- }
- ],
- "type": "table"
- }
- },
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"# Load the data from its source.\n",
"devices = spark.read \\\n",
@@ -532,393 +209,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "c69b91ed-c172-47b7-9bb7-27532202179a",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "device_id cca2 cca3 cn device_name ip location_rounded1 location_rounded2 battery_level_min c02_level_min humidity_min timestamp_min battery_level_max c02_level_max humidity_max timestamp_max battery_level_avg c02_level_avg humidity_avg timestamp_avg 1 US USA United States meter-gauge-1xbYRYcj 68.161.225.1 38_-97 40_-100 8 868 51 1458444054093 8 868 51 1458444054093 8.0 868.0 51.0 1.458444054093E12 2 NO NOR Norway sensor-pad-2n2Pea 213.161.254.1 62_6 60_10 7 1473 70 1458444054119 7 1473 70 1458444054119 7.0 1473.0 70.0 1.458444054119E12 3 IT ITA Italy device-mac-36TWSKiT 88.36.5.1 43_13 40_10 2 1556 44 1458444054120 2 1556 44 1458444054120 2.0 1556.0 44.0 1.45844405412E12 4 US USA United States sensor-pad-4mzWkz 66.39.173.154 44_-121 40_-120 6 1080 32 1458444054121 6 1080 32 1458444054121 6.0 1080.0 32.0 1.458444054121E12 5 PH PHL Philippines therm-stick-5gimpUrBB 203.82.41.9 15_121 10_120 4 931 62 1458444054122 4 931 62 1458444054122 4.0 931.0 62.0 1.458444054122E12 6 US USA United States sensor-pad-6al7RTAobR 204.116.105.67 36_-85 40_-90 3 1210 51 1458444054122 3 1210 51 1458444054122 3.0 1210.0 51.0 1.458444054122E12 7 CN CHN China meter-gauge-7GeDoanM 220.173.179.1 23_108 20_110 3 1129 26 1458444054123 3 1129 26 1458444054123 3.0 1129.0 26.0 1.458444054123E12 8 JP JPN Japan sensor-pad-8xUD6pzsQI 210.173.177.1 36_140 40_140 0 1536 35 1458444054123 0 1536 35 1458444054123 0.0 1536.0 35.0 1.458444054123E12 9 JP JPN Japan device-mac-9GcjZ2pw 118.23.68.227 36_140 40_140 3 807 85 1458444054124 3 807 85 1458444054124 3.0 807.0 85.0 1.458444054124E12 10 US USA United States sensor-pad-10BsywSYUF 208.109.163.218 34_-112 30_-110 7 1470 56 1458444054125 7 1470 56 1458444054125 7.0 1470.0 56.0 1.458444054125E12
"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "aggData": [],
- "aggError": "",
- "aggOverflow": false,
- "aggSchema": [],
- "aggSeriesLimitReached": false,
- "aggType": "",
- "arguments": {},
- "columnCustomDisplayInfos": {},
- "data": [
- [
- 1,
- "US",
- "USA",
- "United States",
- "meter-gauge-1xbYRYcj",
- "68.161.225.1",
- "38_-97",
- "40_-100",
- 8,
- 868,
- 51,
- 1458444054093,
- 8,
- 868,
- 51,
- 1458444054093,
- 8,
- 868,
- 51,
- 1458444054093
- ],
- [
- 2,
- "NO",
- "NOR",
- "Norway",
- "sensor-pad-2n2Pea",
- "213.161.254.1",
- "62_6",
- "60_10",
- 7,
- 1473,
- 70,
- 1458444054119,
- 7,
- 1473,
- 70,
- 1458444054119,
- 7,
- 1473,
- 70,
- 1458444054119
- ],
- [
- 3,
- "IT",
- "ITA",
- "Italy",
- "device-mac-36TWSKiT",
- "88.36.5.1",
- "43_13",
- "40_10",
- 2,
- 1556,
- 44,
- 1458444054120,
- 2,
- 1556,
- 44,
- 1458444054120,
- 2,
- 1556,
- 44,
- 1458444054120
- ],
- [
- 4,
- "US",
- "USA",
- "United States",
- "sensor-pad-4mzWkz",
- "66.39.173.154",
- "44_-121",
- "40_-120",
- 6,
- 1080,
- 32,
- 1458444054121,
- 6,
- 1080,
- 32,
- 1458444054121,
- 6,
- 1080,
- 32,
- 1458444054121
- ],
- [
- 5,
- "PH",
- "PHL",
- "Philippines",
- "therm-stick-5gimpUrBB",
- "203.82.41.9",
- "15_121",
- "10_120",
- 4,
- 931,
- 62,
- 1458444054122,
- 4,
- 931,
- 62,
- 1458444054122,
- 4,
- 931,
- 62,
- 1458444054122
- ],
- [
- 6,
- "US",
- "USA",
- "United States",
- "sensor-pad-6al7RTAobR",
- "204.116.105.67",
- "36_-85",
- "40_-90",
- 3,
- 1210,
- 51,
- 1458444054122,
- 3,
- 1210,
- 51,
- 1458444054122,
- 3,
- 1210,
- 51,
- 1458444054122
- ],
- [
- 7,
- "CN",
- "CHN",
- "China",
- "meter-gauge-7GeDoanM",
- "220.173.179.1",
- "23_108",
- "20_110",
- 3,
- 1129,
- 26,
- 1458444054123,
- 3,
- 1129,
- 26,
- 1458444054123,
- 3,
- 1129,
- 26,
- 1458444054123
- ],
- [
- 8,
- "JP",
- "JPN",
- "Japan",
- "sensor-pad-8xUD6pzsQI",
- "210.173.177.1",
- "36_140",
- "40_140",
- 0,
- 1536,
- 35,
- 1458444054123,
- 0,
- 1536,
- 35,
- 1458444054123,
- 0,
- 1536,
- 35,
- 1458444054123
- ],
- [
- 9,
- "JP",
- "JPN",
- "Japan",
- "device-mac-9GcjZ2pw",
- "118.23.68.227",
- "36_140",
- "40_140",
- 3,
- 807,
- 85,
- 1458444054124,
- 3,
- 807,
- 85,
- 1458444054124,
- 3,
- 807,
- 85,
- 1458444054124
- ],
- [
- 10,
- "US",
- "USA",
- "United States",
- "sensor-pad-10BsywSYUF",
- "208.109.163.218",
- "34_-112",
- "30_-110",
- 7,
- 1470,
- 56,
- 1458444054125,
- 7,
- 1470,
- 56,
- 1458444054125,
- 7,
- 1470,
- 56,
- 1458444054125
- ]
- ],
- "datasetInfos": [],
- "dbfsResultPath": null,
- "isJsonSchema": true,
- "metadata": {},
- "overflow": false,
- "plotOptions": {
- "customPlotOptions": {},
- "displayType": "table",
- "pivotAggregation": null,
- "pivotColumns": [],
- "xColumns": [],
- "yColumns": []
- },
- "removedWidgets": [],
- "schema": [
- {
- "metadata": "{}",
- "name": "device_id",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "cca2",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "cca3",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "cn",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "device_name",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "ip",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "location_rounded1",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "location_rounded2",
- "type": "\"string\""
- },
- {
- "metadata": "{}",
- "name": "battery_level_min",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "c02_level_min",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "humidity_min",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "timestamp_min",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "battery_level_max",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "c02_level_max",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "humidity_max",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "timestamp_max",
- "type": "\"long\""
- },
- {
- "metadata": "{}",
- "name": "battery_level_avg",
- "type": "\"double\""
- },
- {
- "metadata": "{}",
- "name": "c02_level_avg",
- "type": "\"double\""
- },
- {
- "metadata": "{}",
- "name": "humidity_avg",
- "type": "\"double\""
- },
- {
- "metadata": "{}",
- "name": "timestamp_avg",
- "type": "\"double\""
- }
- ],
- "type": "table"
- }
- },
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"from pyspark.sql.functions import concat_ws, col, round\n",
@@ -985,56 +289,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "8028c3a6-308a-43ec-8988-0b51d9f1826d",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- " \n",
- " "
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "\n \n \n \n ",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "textData": null,
- "type": "htmlSandbox"
- }
- },
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"(\n",
" graphistry \n",
- " .edges(devices.sample(fraction=0.1), 'device_name', 'cca3') \\\n",
+ " .edges(devices.sample(fraction=0.1).toPandas(), 'device_name', 'cca3') \\\n",
" .settings(url_params={'strongGravity': 'true'}) \\\n",
" .plot()\n",
")"
@@ -1042,73 +314,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "852b48fe-61af-4953-858f-52680bf07fd2",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "# links 79200\n",
- "# events 19800\n",
- "# attrib entities 41197\n"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "# links 79200\n# events 19800\n# attrib entities 41197\n",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "type": "ansi"
- }
- },
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- " \n",
- " "
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "\n \n \n \n ",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "textData": null,
- "type": "htmlSandbox"
- }
- },
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"hg = graphistry.hypergraph(\n",
" devices_with_rounded_locations.sample(fraction=0.1).toPandas(),\n",
@@ -1150,55 +369,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "4e327ad1-169b-4bb6-95c0-8fc0cf452625",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- " \n",
- " "
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "\n \n \n \n ",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "textData": null,
- "type": "htmlSandbox"
- }
- },
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"(\n",
" g\n",
@@ -1227,37 +411,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
"inputWidgets": {},
"nuid": "a0e6bd79-1172-4cfe-ac6d-83b187d48747",
"showTitle": false,
"title": ""
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Out[18]: 'https://hub.graphistry.com/graph/graph.html?dataset=187d97493ce54498b820f727877eda4b&type=arrow&viztoken=b3106e8a-cbe9-4802-8519-97e1d0d539c3&usertag=50d9aebe-pygraphistry-0.28.5&splashAfter=1669270570&info=true&strongGravity=true'"
- ]
- },
- "metadata": {
- "application/vnd.databricks.v1+output": {
- "addedWidgets": {},
- "arguments": {},
- "data": "Out[18]: 'https://hub.graphistry.com/graph/graph.html?dataset=187d97493ce54498b820f727877eda4b&type=arrow&viztoken=b3106e8a-cbe9-4802-8519-97e1d0d539c3&usertag=50d9aebe-pygraphistry-0.28.5&splashAfter=1669270570&info=true&strongGravity=true'",
- "datasetInfos": [],
- "metadata": {},
- "removedWidgets": [],
- "type": "ansi"
- }
- },
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"url = g.plot(render=False)\n",
"url"
@@ -1265,12 +432,12 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
- "nuid": "ed683717-2c64-43b0-9a7e-bbe2115ba880",
+ "nuid": "cd326621-1224-4b91-890d-9285f7755ad2",
"showTitle": false,
"title": ""
}
@@ -1282,12 +449,12 @@
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
+ "environmentMetadata": null,
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 4
},
- "notebookName": "graphistry-notebook-dashboard",
- "notebookOrigID": 382244341032212,
+ "notebookName": "graphistry-notebook-dashboard (1)",
"widgets": {}
},
"kernelspec": {
@@ -1309,5 +476,5 @@
}
},
"nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 0
}
diff --git a/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb b/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb
index 956f05499..931eaad73 100644
--- a/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb
+++ b/demos/demos_databases_apis/gpu_rapids/part_i_cpu_pandas.ipynb
@@ -14,8 +14,10 @@
"This tutorial series visually analyzes Zeek/Bro network connection logs using different compute engines:\n",
"\n",
"* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n",
- "* Part II: [GPU Dataframse with RAPIDS Python cudf bindings](./part_ii_gpu_cudf)\n",
- "\n",
+ "* Part II: [GPU Dataframes with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n",
+ "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n",
+ "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n",
+ "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n",
"\n",
"**Part I Contents:**\n",
"\n",
@@ -81,9 +83,9 @@
"source": [
"%%time\n",
"# download data \n",
- "!if [ ! -f conn.log ]; then \\\n",
- " curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n",
- "fi"
+ "#!if [ ! -f conn.log ]; then \\\n",
+ "# curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n",
+ "#fi"
]
},
{
@@ -92,7 +94,7 @@
"metadata": {},
"outputs": [],
"source": [
- "!head -n 3 conn.log"
+ "#!head -n 3 conn.log"
]
},
{
@@ -291,7 +293,10 @@
"## Next Steps\n",
"\n",
"* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n",
- "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)"
+ "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n",
+ "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n",
+ "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb) \n",
+ "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n"
]
},
{
diff --git a/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb b/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb
index 294356e76..b81141f1b 100644
--- a/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb
+++ b/demos/demos_databases_apis/gpu_rapids/part_ii_gpu_cudf.ipynb
@@ -11,8 +11,10 @@
"This tutorial series visually analyzes Zeek/Bro network connection logs using different compute engines:\n",
"\n",
"* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n",
- "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf)\n",
- "\n",
+ "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n",
+ "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n",
+ "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n",
+ "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n",
"\n",
"**Part II Contents:**\n",
"\n",
@@ -114,9 +116,9 @@
"source": [
"%%time\n",
"# download data \n",
- "!if [ ! -f conn.log ]; then \\\n",
- " curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n",
- "fi"
+ "#!if [ ! -f conn.log ]; then \\\n",
+ "# curl https://www.secrepo.com/maccdc2012/conn.log.gz | gzip -d > conn.log; \\\n",
+ "#fi"
]
},
{
@@ -736,7 +738,10 @@
"## Next Steps\n",
"\n",
"* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n",
- "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf)"
+ "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n",
+ "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n",
+ "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n",
+ "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n"
]
},
{
diff --git a/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb b/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb
index a7b8e0794..5910cd629 100644
--- a/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb
+++ b/demos/demos_databases_apis/gpu_rapids/part_iii_gpu_blazingsql.ipynb
@@ -6,7 +6,9 @@
"source": [
"# BlazingSQL + Graphistry: Netflow analysis\n",
"\n",
- "This tutorial shows running BlazingSQL (GPU-accelerated SQL) on raw parquet files and visually analyzing the result with Graphistry"
+ "This tutorial shows running BlazingSQL (GPU-accelerated SQL) on raw parquet files and visually analyzing the result with Graphistry\n",
+ "\n",
+ "**WARNING: Deprecated, as BlazingSQL is no longer maintained; see Dask-SQL instead**"
]
},
{
diff --git a/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb b/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb
index 849887d4b..41a9a5c23 100644
--- a/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb
+++ b/demos/demos_databases_apis/gpu_rapids/part_iv_gpu_cuml.ipynb
@@ -4,14 +4,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "UMAP is a popular method of dimensionality reduction, a helpful technique for meaningful analysis of large, complex datasets\n",
+ "# GPU UMAP\n",
+ "\n",
+ "UMAP is a popular method of dimensionality reduction, a helpful technique for meaningful analysis of large, complex datasets. Graphistry provides convenient bindings for working with `cuml.UMAP`.\n",
+ "\n",
"UMAP is:\n",
" * interested in the number of nearest numbers\n",
" * non-linear, unlike longstanding methods such as PCA\n",
" * non-scaling, which keep calculation fast\n",
" * stochastic and thus non-deterministic -- and different libraries handle this differently as you will see in this notebook\n",
" * `umap-learn` states that [\"variance between runs will exist, however small\"](https://umap-learn.readthedocs.io/en/latest/reproducibility.html)\n",
- " * `cuml` currently uses [\"exact kNN\"](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP). This may chance in [future releases](https://github.com/rapidsai/cuml/issues/1653#issuecomment-584357155)\n"
+ " * `cuml` currently uses [\"exact kNN\"](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP). This may change in [future releases](https://github.com/rapidsai/cuml/issues/1653#issuecomment-584357155)\n",
+ "\n",
+ "Further reading:\n",
+ "\n",
+ "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n",
+ "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n",
+ "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n",
+ "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n",
+ "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n"
]
},
{
@@ -24,11 +35,7 @@
{
"cell_type": "code",
"execution_count": 9,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -64,11 +71,7 @@
{
"cell_type": "code",
"execution_count": 2,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"data": {
@@ -237,11 +240,7 @@
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -278,11 +277,7 @@
{
"cell_type": "code",
"execution_count": 4,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -312,11 +307,7 @@
{
"cell_type": "code",
"execution_count": 5,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -353,11 +344,7 @@
{
"cell_type": "code",
"execution_count": 6,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -394,11 +381,7 @@
{
"cell_type": "code",
"execution_count": 7,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -427,11 +410,7 @@
{
"cell_type": "code",
"execution_count": 8,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -467,11 +446,7 @@
{
"cell_type": "code",
"execution_count": 12,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stderr",
@@ -502,11 +477,7 @@
{
"cell_type": "code",
"execution_count": 13,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -608,15 +579,29 @@
{
"cell_type": "code",
"execution_count": 16,
- "metadata": {
- "vscode": {
- "languageId": "python"
- }
- },
+ "metadata": {},
"outputs": [],
"source": [
"#g3.plot()"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Next steps\n",
+ "\n",
+ "* Part I: [CPU Baseline in Python Pandas](./part_i_cpu_pandas.ipynb)\n",
+ "* Part II: [GPU Dataframe with RAPIDS Python cudf bindings](./part_ii_gpu_cudf.ipynb)\n",
+ "* Part III: GPU SQL - deprecated as Dask-SQL replaced BlazingSQL in the RAPIDS ecosystem\n",
+ "* Part IV: [GPU ML with RAPIDS cuML UMAP and PyGraphistry](./part_iv_gpu_cuml.ipynb)\n",
+ "* [Graphistry cuGraph bindings](./cugraph.ipynb)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
}
],
"metadata": {
diff --git a/demos/demos_databases_apis/graphviz/graphviz.ipynb b/demos/demos_databases_apis/graphviz/graphviz.ipynb
new file mode 100644
index 000000000..14fe015a9
--- /dev/null
+++ b/demos/demos_databases_apis/graphviz/graphviz.ipynb
@@ -0,0 +1,1430 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fEjoJ5eBnuKZ"
+ },
+ "source": [
+ "# Graphistry <> graphviz integration quickstart\n",
+ "\n",
+ "The [graphviz engine](https://graphviz.org/) is popular for layout of small graphs and rendering to static images. The Graphistry Python bindings to graphviz enable using pygraphistry as usual for quickly loading and manipulating your data, and then benefiting from graphviz for layout, and optionally, rendering.\n",
+ "\n",
+ "The example below shows laying out and rendering company ownership data that is in a tree and benefits from graphviz's high-quality layout engine."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0BF6FBhDpLas"
+ },
+ "source": [
+ "## Setup\n",
+ "\n",
+ "* graphviz: Install the graphviz engine and the pygraphviz bindings, see below (official [tutorial](https://pygraphviz.github.io/documentation/stable/install.html) )\n",
+ "* Graphistry: Install PyGraphistry below, and [get a free GPU account on Graphistry Hub](https://www.graphistry.com/get-started) or run your own server\n",
+ "\n",
+ "Notes:\n",
+ "\n",
+ "* You must install the graphviz engine, as well as its pygraphviz Python bindings and pygraphistry\n",
+ "* graphviz is most known for its `\"dot\"` layout engine, and it includes others as well\n",
+ "* graphviz is generally not recommended for layout of graphs over 10,000 nodes and edges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3XMNgAvIM9Ep",
+ "outputId": "b391eb13-0650-433b-bd2b-c905cdef9e18"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Building wheel for graphistry (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+ ]
+ }
+ ],
+ "source": [
+ "#!apt-get install graphviz graphviz-dev\n",
+ "\n",
+ "#!pip install -q graphistry[pygraphviz]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "s40Iw_3vqQZy"
+ },
+ "source": [
+ "## Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 35
+ },
+ "id": "Cnhc-A4_M2Ad",
+ "outputId": "0f2fb73f-72a2-4fae-9b28-cea26b85d0ad"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ },
+ "text/plain": [
+ "'0.34.5+12.g4dba3e6'"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from typing import Any, Dict, Literal, Optional\n",
+ "import logging\n",
+ "try:\n",
+ " import pygraphviz as pgv\n",
+ "except (ImportError, ModuleNotFoundError):\n",
+ " logging.error(\"ImportError: Did you install pygraphviz and the supporting native packages?\")\n",
+ " raise\n",
+ "\n",
+ "import pandas as pd\n",
+ "import graphistry\n",
+ "from graphistry import Plottable\n",
+ "graphistry.register(api=3, username=FILL_ME_IN, password=FILL_ME_IN)\n",
+ "\n",
+ "graphistry.__version__"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wwl3XdQLqf5k"
+ },
+ "source": [
+ "### Sample graph: HSBC Beneficial ownership graph\n",
+ "\n",
+ "Sample data from [openownership.org](https://openownership.org/). Corporate ownership graphs often have a deep tree structure and, for bigger conglomerates with numerous subsidiaries, officers, board officers, suppliers, and lenders, can greatly benefit from higher-quality tree layouts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "8-7OAzDml0RV"
+ },
+ "outputs": [],
+ "source": [
+ "companies_df = pd.DataFrame([{'label': 'Hsbc Finance (Netherlands)', 'n': '1862294673469042014'},\n",
+ " {'label': 'Hsbc Holdings Plc', 'n': '7622088245850069747'},\n",
+ " {'label': 'Unknown person(s)', 'n': '7622088245850069747-unknown'},\n",
+ " {'label': 'HSBC PROPERTY (UK) LIMITED', 'n': '16634236373777089526'},\n",
+ " {'label': 'HSBC ALTERNATIVE INVESTMENTS LIMITED',\n",
+ " 'n': '18011320449780894329'},\n",
+ " {'label': 'HSBC INVESTMENT COMPANY LIMITED', 'n': '9134577322728469115'},\n",
+ " {'label': 'HSBC IM PENSION TRUST LIMITED', 'n': '1446072728533515665'},\n",
+ " {'label': 'MERCANTILE COMPANY LIMITED', 'n': '6904185395252167658'},\n",
+ " {'label': 'Mp Payments Group Limited', 'n': '13630126251685975826'},\n",
+ " {'label': 'MP PAYMENTS OPERATIONS LIMITED', 'n': '11514603667851101425'},\n",
+ " {'label': 'MP PAYMENTS UK LIMITED', 'n': '13417892994160273884'},\n",
+ " {'label': 'Hsbc Asia Pacific Holdings (Uk) Limited',\n",
+ " 'n': '2173486047275631423'},\n",
+ " {'label': 'HSBC SECURITIES (JAPAN) LIMITED', 'n': '18045747820524565803'}])\n",
+ "\n",
+ "ownership_df = pd.DataFrame([{'s': '7622088245850069747', 'd': '1862294673469042014'},\n",
+ " {'s': '7622088245850069747-unknown', 'd': '7622088245850069747'},\n",
+ " {'s': '1862294673469042014', 'd': '16634236373777089526'},\n",
+ " {'s': '1862294673469042014', 'd': '18011320449780894329'},\n",
+ " {'s': '1862294673469042014', 'd': '9134577322728469115'},\n",
+ " {'s': '9134577322728469115', 'd': '1446072728533515665'},\n",
+ " {'s': '9134577322728469115', 'd': '6904185395252167658'},\n",
+ " {'s': '9134577322728469115', 'd': '13630126251685975826'},\n",
+ " {'s': '13630126251685975826', 'd': '11514603667851101425'},\n",
+ " {'s': '13630126251685975826', 'd': '13417892994160273884'},\n",
+ " {'s': '9134577322728469115', 'd': '2173486047275631423'},\n",
+ " {'s': '2173486047275631423', 'd': '18045747820524565803'},\n",
+ " {'s': '9134577322728469115', 'd': '16634236373777089526'}])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "id": "7TmvBE5iI8Tu"
+ },
+ "outputs": [],
+ "source": [
+ "g = graphistry.edges(ownership_df, 's', 'd').nodes(companies_df, 'n').bind(point_title='label')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "id": "y7eC5hOCwfE1"
+ },
+ "outputs": [],
+ "source": [
+ "g = g.nodes(g._nodes.assign(sz=1)).encode_point_size('sz')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gnhwn1v_r3kD"
+ },
+ "source": [
+ "## Minimal tree layout and graphviz layout engines\n",
+ "\n",
+ "Graphviz provides 15+ layout engines you can use. General guidance is to use them for graphs of up to 10,000 nodes and edges.\n",
+ "\n",
+ "The `\"dot\"` layout engine is best known due to its beautiful hierarchical layouts for directed acyclic graphs like trees."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "KiOxkJR_YKrh",
+ "outputId": "2b9af5e5-b199-452a-839b-d8cc7cc0ea50"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g2 = g.layout_graphviz('dot')\n",
+ "g2.plot()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dTEjXkhisCui"
+ },
+ "source": [
+ "Additional layout engines beyond `\"dot\"` are below. See also the [graphviz layout engines documentation](https://graphviz.org/docs/layouts/). The same documentation, and the below section on global graph attributes, describe options you can pass in to different layout engines."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_x28pxBUr7e_",
+ "outputId": "dd209734-f4cf-425b-dc3c-7cec0d9fac74"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['acyclic',\n",
+ " 'ccomps',\n",
+ " 'circo',\n",
+ " 'dot',\n",
+ " 'fdp',\n",
+ " 'gc',\n",
+ " 'gvcolor',\n",
+ " 'gvpr',\n",
+ " 'neato',\n",
+ " 'nop',\n",
+ " 'osage',\n",
+ " 'patchwork',\n",
+ " 'sccmap',\n",
+ " 'sfdp',\n",
+ " 'tred',\n",
+ " 'twopi',\n",
+ " 'unflatten']"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from graphistry.plugins_types.graphviz_types import PROGS\n",
+ "PROGS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "VD5ezMLss9Dw",
+ "outputId": "553250c2-26a8-4820-f01d-fe138280bcf0"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g2b = g.layout_graphviz('neato')\n",
+ "g2b.plot()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from graphistry.plugins_types.graphviz_types import PROGS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oF9m9a_WuPjN"
+ },
+ "source": [
+ "### Global attributes\n",
+ "\n",
+ "You can set global attributes. Parameter [`graph_attr`](https://graphviz.org/docs/graph/) generally refers to layout engine options, while [`edge_attr`](https://graphviz.org/docs/edges/) and [`node_attr`](https://graphviz.org/docs/nodes/) are generally for default colors, sizes, shapes, etc."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "ACoYzOgCE7Pt",
+ "outputId": "597b8129-dc9f-4f1a-e086-912e7206b103"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g2b = g.layout_graphviz(\n",
+ " 'dot',\n",
+ " graph_attr={'ratio': 10},\n",
+ " edge_attr={},\n",
+ " node_attr={}\n",
+ ")\n",
+ "g2b.plot()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['_background',\n",
+ " 'bb',\n",
+ " 'beautify',\n",
+ " 'bgcolor',\n",
+ " 'center',\n",
+ " 'charset',\n",
+ " 'class',\n",
+ " 'clusterrank',\n",
+ " 'colorscheme',\n",
+ " 'comment',\n",
+ " 'compound',\n",
+ " 'concentrate',\n",
+ " 'Damping',\n",
+ " 'defaultdist',\n",
+ " 'dim',\n",
+ " 'dimen',\n",
+ " 'diredgeconstraints',\n",
+ " 'dpi',\n",
+ " 'epsilon',\n",
+ " 'esep',\n",
+ " 'fontcolor',\n",
+ " 'fontname',\n",
+ " 'fontnames',\n",
+ " 'fontpath',\n",
+ " 'fontsize',\n",
+ " 'forcelabels',\n",
+ " 'gradientangle',\n",
+ " 'href',\n",
+ " 'id',\n",
+ " 'imagepath',\n",
+ " 'inputscale',\n",
+ " 'K',\n",
+ " 'label',\n",
+ " 'label_scheme',\n",
+ " 'labeljust',\n",
+ " 'labelloc',\n",
+ " 'landscape',\n",
+ " 'layerlistsep',\n",
+ " 'layers',\n",
+ " 'layerselect',\n",
+ " 'layersep',\n",
+ " 'layout',\n",
+ " 'levels',\n",
+ " 'levelsgap',\n",
+ " 'lheight',\n",
+ " 'linelength',\n",
+ " 'lp',\n",
+ " 'lwidth',\n",
+ " 'margin',\n",
+ " 'maxiter',\n",
+ " 'mclimit',\n",
+ " 'mindist',\n",
+ " 'mode',\n",
+ " 'model',\n",
+ " 'newrank',\n",
+ " 'nodesep',\n",
+ " 'nojustify',\n",
+ " 'normalize',\n",
+ " 'notranslate',\n",
+ " 'nslimit',\n",
+ " 'nslimit1',\n",
+ " 'oneblock',\n",
+ " 'ordering',\n",
+ " 'orientation',\n",
+ " 'outputorder',\n",
+ " 'overlap',\n",
+ " 'overlap_scaling',\n",
+ " 'overlap_shrink',\n",
+ " 'pack',\n",
+ " 'packmode',\n",
+ " 'pad',\n",
+ " 'page',\n",
+ " 'pagedir',\n",
+ " 'quadtree',\n",
+ " 'quantum',\n",
+ " 'rankdir',\n",
+ " 'ranksep',\n",
+ " 'ratio',\n",
+ " 'remincross',\n",
+ " 'repulsiveforce',\n",
+ " 'resolution',\n",
+ " 'root',\n",
+ " 'rotate',\n",
+ " 'rotation',\n",
+ " 'scale',\n",
+ " 'searchsize',\n",
+ " 'sep',\n",
+ " 'showboxes',\n",
+ " 'size',\n",
+ " 'smoothing',\n",
+ " 'sortv',\n",
+ " 'splines',\n",
+ " 'start',\n",
+ " 'style',\n",
+ " 'stylesheet',\n",
+ " 'target',\n",
+ " 'TBbalance',\n",
+ " 'tooltip',\n",
+ " 'truecolor',\n",
+ " 'URL',\n",
+ " 'viewport',\n",
+ " 'voro_margin',\n",
+ " 'xdotversion']"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "from graphistry.plugins_types.graphviz_types import GRAPH_ATTRS\n",
+ "GRAPH_ATTRS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['arrowhead',\n",
+ " 'arrowsize',\n",
+ " 'arrowtail',\n",
+ " 'class',\n",
+ " 'color',\n",
+ " 'colorscheme',\n",
+ " 'comment',\n",
+ " 'constraint',\n",
+ " 'decorate',\n",
+ " 'dir',\n",
+ " 'edgehref',\n",
+ " 'edgetarget',\n",
+ " 'edgetooltip',\n",
+ " 'edgeURL',\n",
+ " 'fillcolor',\n",
+ " 'fontcolor',\n",
+ " 'fontname',\n",
+ " 'fontsize',\n",
+ " 'head_lp',\n",
+ " 'headclip',\n",
+ " 'headhref',\n",
+ " 'headlabel',\n",
+ " 'headport',\n",
+ " 'headtarget',\n",
+ " 'headtooltip',\n",
+ " 'headURL',\n",
+ " 'href',\n",
+ " 'id',\n",
+ " 'label',\n",
+ " 'labelangle',\n",
+ " 'labeldistance',\n",
+ " 'labelfloat',\n",
+ " 'labelfontcolor',\n",
+ " 'labelfontname',\n",
+ " 'labelfontsize',\n",
+ " 'labelhref',\n",
+ " 'labeltarget',\n",
+ " 'labeltooltip',\n",
+ " 'labelURL',\n",
+ " 'layer',\n",
+ " 'len',\n",
+ " 'lhead',\n",
+ " 'lp',\n",
+ " 'ltail',\n",
+ " 'minlen',\n",
+ " 'nojustify',\n",
+ " 'penwidth',\n",
+ " 'pos',\n",
+ " 'samehead',\n",
+ " 'sametail',\n",
+ " 'showboxes',\n",
+ " 'style',\n",
+ " 'tail_lp',\n",
+ " 'tailclip',\n",
+ " 'tailhref',\n",
+ " 'taillabel',\n",
+ " 'tailport',\n",
+ " 'tailtarget',\n",
+ " 'tailtooltip',\n",
+ " 'tailURL',\n",
+ " 'target',\n",
+ " 'tooltip',\n",
+ " 'URL',\n",
+ " 'weight',\n",
+ " 'xlabel',\n",
+ " 'xlp']"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "from graphistry.plugins_types.graphviz_types import EDGE_ATTRS\n",
+ "EDGE_ATTRS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['area',\n",
+ " 'class',\n",
+ " 'color',\n",
+ " 'colorscheme',\n",
+ " 'comment',\n",
+ " 'distortion',\n",
+ " 'fillcolor',\n",
+ " 'fixedsize',\n",
+ " 'fontcolor',\n",
+ " 'fontname',\n",
+ " 'fontsize',\n",
+ " 'gradientangle',\n",
+ " 'group',\n",
+ " 'height',\n",
+ " 'href',\n",
+ " 'id',\n",
+ " 'image',\n",
+ " 'imagepos',\n",
+ " 'imagescale',\n",
+ " 'label',\n",
+ " 'labelloc',\n",
+ " 'layer',\n",
+ " 'margin',\n",
+ " 'nojustify',\n",
+ " 'ordering',\n",
+ " 'orientation',\n",
+ " 'penwidth',\n",
+ " 'peripheries',\n",
+ " 'pin',\n",
+ " 'pos',\n",
+ " 'rects',\n",
+ " 'regular',\n",
+ " 'root',\n",
+ " 'samplepoints',\n",
+ " 'shape',\n",
+ " 'shapefile',\n",
+ " 'showboxes',\n",
+ " 'sides',\n",
+ " 'skew',\n",
+ " 'sortv',\n",
+ " 'style',\n",
+ " 'target',\n",
+ " 'tooltip',\n",
+ " 'URL',\n",
+ " 'vertices',\n",
+ " 'width',\n",
+ " 'xlabel',\n",
+ " 'xlp',\n",
+ " 'z']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "from graphistry.plugins_types.graphviz_types import NODE_ATTRS\n",
+ "NODE_ATTRS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Fslt0bjvuyv0"
+ },
+ "source": [
+ "## Static image rendering and entity-level attributes\n",
+ "\n",
+ "graphviz supports rendering to a static file in various image formats such as png.\n",
+ "\n",
+ "You can add graphviz-specific columns to your node and edge dataframes that configure per-row render settings. These use the same names as in the above global attribute guidance, such as `color`, `shape`, and `label`.\n",
+ "\n",
+ "Adding a column for an attribute will typically disable the global attribute. For example, consider setting node column `\"shape\"` with values `\"star\"` and `None`, and global node attribute `\"shape\"` with value `\"box\"`. All the nodes with `shape == \"star\"` will render as a star in the static image, and the rows with value `None` will not default to the global node attribute `\"box\"`, but to graphviz's general default of an oval."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 492
+ },
+ "id": "-3e_fH-80bhE",
+ "outputId": "015cb98e-f1eb-4fbe-e29b-18a821b3b3d3"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1862294673469042014 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 7622088245850069747 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 7622088245850069747-unknown \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 16634236373777089526 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 18011320449780894329 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 9134577322728469115 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 1446072728533515665 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 6904185395252167658 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 13630126251685975826 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 11514603667851101425 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " 13417892994160273884 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " 2173486047275631423 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " 18045747820524565803 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: object "
+ ],
+ "text/plain": [
+ "0 1862294673469042014\n",
+ "1 7622088245850069747\n",
+ "2 7622088245850069747-unknown\n",
+ "3 16634236373777089526\n",
+ "4 18011320449780894329\n",
+ "5 9134577322728469115\n",
+ "6 1446072728533515665\n",
+ "7 6904185395252167658\n",
+ "8 13630126251685975826\n",
+ "9 11514603667851101425\n",
+ "10 13417892994160273884\n",
+ "11 2173486047275631423\n",
+ "12 18045747820524565803\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g._nodes.apply(lambda row: row['n'], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 717
+ },
+ "id": "newXWAjEzo5F",
+ "outputId": "ff234aa4-5811-42ff-b289-d67e4e5c11b7"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "row 1862294673469042014\n",
+ "row 7622088245850069747\n",
+ "row 7622088245850069747-unknown\n",
+ "row 16634236373777089526\n",
+ "row 18011320449780894329\n",
+ "row 9134577322728469115\n",
+ "row 1446072728533515665\n",
+ "row 6904185395252167658\n",
+ "row 13630126251685975826\n",
+ "row 11514603667851101425\n",
+ "row 13417892994160273884\n",
+ "row 2173486047275631423\n",
+ "row 18045747820524565803\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " None \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " None \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: object "
+ ],
+ "text/plain": [
+ "0 None\n",
+ "1 None\n",
+ "2 None\n",
+ "3 None\n",
+ "4 None\n",
+ "5 None\n",
+ "6 None\n",
+ "7 None\n",
+ "8 None\n",
+ "9 None\n",
+ "10 None\n",
+ "11 None\n",
+ "12 None\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g._nodes.apply(lambda row: print('row', row['n']), 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "72YquJpavfLL",
+ "outputId": "40739d8a-dbe2-4437-fcc7-1a45c92a5b73"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "repr_error": "Out of range float values are not JSON compliant: nan",
+ "type": "dataframe"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " n \n",
+ " x \n",
+ " y \n",
+ " label \n",
+ " sz \n",
+ " shape \n",
+ " color \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1862294673469042014 \n",
+ " 381.39 \n",
+ " 234.0 \n",
+ " Hsbc Finance (Netherlands) \n",
+ " 1 \n",
+ " None \n",
+ " red \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 16634236373777089526 \n",
+ " 140.39 \n",
+ " 90.0 \n",
+ " HSBC PROPERTY (UK) LIMITED \n",
+ " 1 \n",
+ " None \n",
+ " red \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 18011320449780894329 \n",
+ " 381.39 \n",
+ " 162.0 \n",
+ " HSBC ALTERNATIVE INVESTMENTS LIMITED \n",
+ " 1 \n",
+ " None \n",
+ " red \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 9134577322728469115 \n",
+ " 778.39 \n",
+ " 162.0 \n",
+ " HSBC INVESTMENT COMPANY LIMITED \n",
+ " 1 \n",
+ " None \n",
+ " red \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1446072728533515665 \n",
+ " 454.39 \n",
+ " 90.0 \n",
+ " HSBC IM PENSION TRUST LIMITED \n",
+ " 1 \n",
+ " None \n",
+ " red \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " n x y label \\\n",
+ "0 1862294673469042014 381.39 234.0 Hsbc Finance (Netherlands) \n",
+ "1 16634236373777089526 140.39 90.0 HSBC PROPERTY (UK) LIMITED \n",
+ "2 18011320449780894329 381.39 162.0 HSBC ALTERNATIVE INVESTMENTS LIMITED \n",
+ "3 9134577322728469115 778.39 162.0 HSBC INVESTMENT COMPANY LIMITED \n",
+ "4 1446072728533515665 454.39 90.0 HSBC IM PENSION TRUST LIMITED \n",
+ "\n",
+ " sz shape color \n",
+ "0 1 None red \n",
+ "1 1 None red \n",
+ "2 1 None red \n",
+ "3 1 None red \n",
+ "4 1 None red "
+ ]
+ },
+ "execution_count": 99,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# row-level attrs\n",
+ "\n",
+ "root_id = '7622088245850069747-unknown'\n",
+ "\n",
+ "g2c = g.nodes(g._nodes.assign(\n",
+ " label=g._nodes.apply(lambda row: \"ROOT: Unknown person(s)\" if row['n'] == root_id else row['label'], axis=1),\n",
+ " shape=g._nodes.n.apply(lambda n: \"box\" if n == root_id else None),\n",
+ " color=g._nodes.n.apply(lambda n: \"blue\" if n == root_id else 'red')\n",
+ ")).edges(g._edges.assign(\n",
+ " color=g._edges[g._source].apply(lambda n: 'blue' if n == root_id else None)\n",
+ "))\n",
+ "\n",
+ "\n",
+ "# Save a static graphviz render\n",
+ "g2c_positioned = g2c.layout_graphviz(\n",
+ " \"dot\",\n",
+ " render_to_disk=True,\n",
+ " path=f'./graph.png',\n",
+ " graph_attr={},\n",
+ " edge_attr={},\n",
+ " node_attr={'color': 'green'}, # ignored due to g2c._nodes.color\n",
+ " format='png'\n",
+ ")\n",
+ "\n",
+ "g2c_positioned._nodes.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 353
+ },
+ "id": "Orvnc-p4vwyd",
+ "outputId": "59f03155-128b-4d55-f216-58b16ce7f914"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from IPython.display import Image\n",
+ "Image(filename='./graph.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "X_e8slJ4LO2z",
+ "outputId": "c36ff578-dd8b-4626-8e52-7096bc5f0cfb"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g2d = g.layout_graphviz('circo')\n",
+ "g2d.plot()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "pDEzL2UlZGz_"
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb b/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb
index f54add017..411c1184f 100644
--- a/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb
+++ b/demos/demos_databases_apis/gremlin-tinkerpop/TitanDemo.ipynb
@@ -4,7 +4,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# In this notebook, we demonstrate how to create and modify a Titan graph in python, and then visualize the result using Graphistry's visual graph explorer. "
+ "# PyGraphistry <> Titan graph\n",
+ "\n",
+ "In this notebook, we demonstrate how to create and modify a Titan graph in python, and then visualize the result using Graphistry's visual graph explorer. "
]
},
{
@@ -12,12 +14,12 @@
"metadata": {},
"source": [
"### We assume the gremlin server for our Titan graph is hosted locally on port 8182\n",
- " - This notebook utilizes the python modules aiogremlin and asyncio.\n",
- " - The GremlinClient class of aiogremlin communicates asynchronously with the gremlin server using websockets via asyncio coroutines.\n",
- " - This implementation allows you to submit additional requests to the server before any responses are recieved, which is much faster than synchronous request / response cycles. \n",
- " - For more information about these modules, please visit:\n",
- " - aiogremlin: http://aiogremlin.readthedocs.org/en/latest/index.html\n",
- " - asyncio: https://pypi.python.org/pypi/asyncio"
+ "- This notebook utilizes the python modules aiogremlin and asyncio.\n",
+ "- The GremlinClient class of aiogremlin communicates asynchronously with the gremlin server using websockets via asyncio coroutines.\n",
+ "- This implementation allows you to submit additional requests to the server before any responses are received, which is much faster than synchronous request / response cycles. \n",
+ "- For more information about these modules, please visit:\n",
+ " - aiogremlin: http://aiogremlin.readthedocs.org/en/latest/index.html\n",
+ " - asyncio: https://pypi.python.org/pypi/asyncio"
]
},
{
diff --git a/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb b/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb
index 6621db721..47c707a08 100644
--- a/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb
+++ b/demos/demos_databases_apis/neo4j/contributed/Neo4jTwitter.ipynb
@@ -65,9 +65,7 @@
"source": [
"## Connect To Neo4j\n",
"\n",
- "If you haven't already, create an instance of the Russian Twitter Trolls sandbox on [Neo4j Sandbox.](https://neo4j.com/sandbox-v2/) We'll use the [Python driver for Neo4j](https://github.com/neo4j/neo4j-python-driver) to fetch data from Neo4j. To do this we'll need to instantiate a `Driver` object, passing in the credentials for our Neo4j instance. If using Neo4j Sandbox you can find the credentials for your Neo4j instance in the \"Details\" tab. Specifically we need the IP address, bolt port, username, and password. Bolt is the binary protocol used by the Neo4j drivers so a typical database URL string takes the form `bolt://:`\n",
- "\n",
- "![](./img/sandbox.png)"
+ "If you haven't already, create an instance of the Russian Twitter Trolls sandbox on [Neo4j Sandbox.](https://neo4j.com/sandbox-v2/) We'll use the [Python driver for Neo4j](https://github.com/neo4j/neo4j-python-driver) to fetch data from Neo4j. To do this we'll need to instantiate a `Driver` object, passing in the credentials for our Neo4j instance. If using Neo4j Sandbox you can find the credentials for your Neo4j instance in the \"Details\" tab. Specifically we need the IP address, bolt port, username, and password. Bolt is the binary protocol used by the Neo4j drivers so a typical database URL string takes the form `bolt://:`\n"
]
},
{
@@ -118,7 +116,6 @@
"source": [
"If we inspect the datamodel in Neo4j we can see that we have inormation about Tweets and specifically Users mentioned in tweets.\n",
"\n",
- "![](./img/datamodel.png)\n",
"\n",
"Let's use Graphistry to visualize User-User Tweet mention interactions. We'll do this by querying Neo4j for all tweets that mention users."
]
@@ -371,8 +368,6 @@
"source": [
"After running the above Python cell you should see an interactive Graphistry visualization like this:\n",
"\n",
- "![](./img/graphistry1.png)\n",
- "\n",
"Known Troll user nodes are colored red, regular users colored blue. By default, the size of the nodes is proportional to the degree of the node (number of relationships). We'll see in the next section how we can use graph algorithms such as PageRank and visualize the results of those algorithms in Graphistry."
]
},
@@ -549,8 +544,6 @@
"source": [
"Now when we render the Graphistry visualization, node size is proprtional to the node's PageRank score. This results in a different set of nodes that are identified as most important. \n",
"\n",
- "![](./img/graphistry2.png)\n",
- "\n",
"By binding node size to the results of graph algorithms we are able to draw insight from the data at a glance and further explore the interactive visualization.\n"
]
},
diff --git a/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb b/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb
index 49cf8e156..c9d6f3c1d 100755
--- a/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb
+++ b/demos/demos_databases_apis/neptune/neptune_cypher_viz_using_bolt.ipynb
@@ -5,7 +5,7 @@
"id": "10436f61-3f82-4316-b9be-b6a70746d4f7",
"metadata": {},
"source": [
- "## Graphistry for Neptune using pygraphistry bolt connector \n",
+ "# Graphistry for Neptune using pygraphistry bolt connector \n",
"\n",
"#### This example uses pygraphistry bolt helper class to run queries against AWS Neptune and retrieve query results as graph, then the bolt helper function extracts all the nodes and edges into the dataframes automatically. Then visualize the resulting datasets using Graphistry. \n",
"\n"
diff --git a/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb b/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb
index 5a6ee1101..da07191e6 100644
--- a/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb
+++ b/demos/demos_databases_apis/neptune/neptune_tutorial.ipynb
@@ -698,7 +698,7 @@
"id": "removed-blair",
"metadata": {},
"source": [
- "# Next steps\n",
+ "## Next steps\n",
"\n",
"* Go deeper with [PyGraphistry](https://github.com/graphistry/pygraphistry): Examples for customization, GPU graph analytics, and more\n",
"* Explore [gremlinpython](https://pypi.org/project/gremlinpython/)\n",
diff --git a/demos/demos_databases_apis/networkx/networkx.ipynb b/demos/demos_databases_apis/networkx/networkx.ipynb
index 0426751bf..510cc6413 100644
--- a/demos/demos_databases_apis/networkx/networkx.ipynb
+++ b/demos/demos_databases_apis/networkx/networkx.ipynb
@@ -1,5 +1,14 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# NetworkX\n",
+ "\n",
+ "NetworkX is an early graph manipulation library with a variety of algorithms and layouts."
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
@@ -14,7 +23,8 @@
"# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n",
"# For more options, see https://github.com/graphistry/pygraphistry#configure\n",
"\n",
- "import networkx as nx"
+ "import networkx as nx\n",
+ "import pandas as pd"
]
},
{
@@ -49,7 +59,7 @@
}
],
"source": [
- "G=nx.Graph()\n",
+ "G = nx.Graph()\n",
"G.add_nodes_from([\n",
" (1, {\"v\": \"one\"}), \n",
" (2, {\"v\": \"two\"}), \n",
@@ -64,14 +74,26 @@
"graphistry.bind(source='src', destination='dst', node='nodeid').plot(G)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "When manipulating the graph, this form is even easier, as you can then use PyGraphistry methods for tasks like filtering, algorithmic enrichment, GFQL queries, etc:"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "g = graphistry.bind().from_networkx(G)\n",
+ "\n",
+ "assert isinstance(g._edges, pd.DataFrame)\n",
+ "assert isinstance(g._nodes, pd.DataFrame)\n",
+ "\n",
+ "g._edges"
+ ]
}
],
"metadata": {
@@ -90,7 +112,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.11"
+ "version": "3.9.16"
}
},
"nbformat": 4,
diff --git a/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb b/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb
index b11acdc56..f30220175 100644
--- a/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb
+++ b/demos/demos_databases_apis/nodexl/official/nodexl_graphistry.ipynb
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -80,7 +80,7 @@
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -95,7 +95,7 @@
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -115,12 +115,12 @@
"id": "jK9AXFTjAyDD"
},
"source": [
- "# Sample use"
+ "## Sample use"
]
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -134,7 +134,7 @@
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -148,7 +148,7 @@
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -166,7 +166,7 @@
"id": "a7erX6jnKhHj"
},
"source": [
- "# Twitter Demos"
+ "## Twitter Demos"
]
},
{
@@ -176,12 +176,12 @@
"id": "uGuj40xkxtMh"
},
"source": [
- "## Debate Warren"
+ "### Debate Warren"
]
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -199,12 +199,12 @@
"id": "UrnlAwkryE10"
},
"source": [
- "## CES Samsung"
+ "### CES Samsung"
]
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -222,12 +222,12 @@
"id": "2a4TOajvC4sb"
},
"source": [
- "## Larger Graph"
+ "### Larger Graph"
]
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
@@ -245,7 +245,7 @@
"id": "wY9KhWEgDHzn"
},
"source": [
- "# MediaWiki Demos"
+ "## MediaWiki Demos"
]
},
{
@@ -255,12 +255,12 @@
"id": "t4Im6padK7Ze"
},
"source": [
- "## Demo 1"
+ "### Demo 1"
]
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
diff --git a/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb b/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb
index b6d84abee..281bb88f0 100644
--- a/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb
+++ b/demos/demos_databases_apis/splunk/splunk_demo_public.ipynb
@@ -151,34 +151,10 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 188
- },
- "colab_type": "code",
- "id": "XPK5n5Yrvjb5",
- "outputId": "04e436c6-5a8b-4148-cd31-874421e6967e"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Collecting splunk-sdk\n",
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/bb/408c504f4307fcf4a89909cc85bc912d8529c9ca88200682f94a31a06186/splunk-sdk-1.6.5.tar.gz (103kB)\n",
- "\u001b[K 100% |████████████████████████████████| 112kB 2.6MB/s \n",
- "\u001b[?25hBuilding wheels for collected packages: splunk-sdk\n",
- " Running setup.py bdist_wheel for splunk-sdk ... \u001b[?25l-\b \bdone\n",
- "\u001b[?25h Stored in directory: /root/.cache/pip/wheels/87/83/8f/5f78fbc79322715add8f39ba8adc97511f27297852eb4dc270\n",
- "Successfully built splunk-sdk\n",
- "Installing collected packages: splunk-sdk\n",
- "Successfully installed splunk-sdk-1.6.5\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
- "!pip install splunk-sdk\n",
+ "# !pip install splunk-sdk\n",
"\n",
"import splunklib"
]
diff --git a/demos/demos_databases_apis/sql/postgres.ipynb b/demos/demos_databases_apis/sql/postgres.ipynb
index 1422625d8..219e1a101 100644
--- a/demos/demos_databases_apis/sql/postgres.ipynb
+++ b/demos/demos_databases_apis/sql/postgres.ipynb
@@ -13,10 +13,9 @@
"* Shows several viz modes + a convenience function for sql->interactive viz\n",
"* Try: Modify the indicated lines to change to visualize any other table\n",
"\n",
- "Further docs\n",
+ "Further reading:\n",
" - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n",
- " - [More demos: database connectors, ...](/notebook/tree/demos/demos_databases_apis)\n",
- " - [CSV upload notebook app](/notebook/tree/demos/upload_csv_miniapp.ipynb)"
+ " - [CSV upload notebook app](../../upload_csv_miniapp.ipynb)"
]
},
{
@@ -357,8 +356,7 @@
"source": [
"## Further docs\n",
" - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n",
- " - [More demos: database connectors, ...](/notebook/tree/demos/demos_databases_apis)\n",
- " - [CSV upload notebook app](/notebook/tree/demos/upload_csv_miniapp.ipynb)"
+ " - [CSV upload notebook app](../../upload_csv_miniapp.ipynb)"
]
},
{
diff --git a/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb b/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb
index dce85185a..c067b6563 100644
--- a/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb
+++ b/demos/demos_databases_apis/tigergraph/fraud_raw_REST_calls.ipynb
@@ -104,7 +104,7 @@
"id": "LUEA1fmFOjCD"
},
"source": [
- "# 1. Fraud"
+ "## 1. Fraud"
]
},
{
@@ -114,7 +114,7 @@
"id": "rY8Ip6WcOnPl"
},
"source": [
- "## 1.a circleDetection"
+ "### 1.a circleDetection"
]
},
{
@@ -152,7 +152,7 @@
"id": "mXT2bD2UOp3o"
},
"source": [
- "## 1.b fraudConnectivity"
+ "### 1.b fraudConnectivity"
]
},
{
@@ -190,7 +190,7 @@
"id": "SKepDGbKZLGI"
},
"source": [
- "## Combined"
+ "### Combined"
]
},
{
diff --git a/demos/for_analysis.ipynb b/demos/for_analysis.ipynb
index adf548691..675722246 100644
--- a/demos/for_analysis.ipynb
+++ b/demos/for_analysis.ipynb
@@ -14,10 +14,10 @@
"3. Advanced plotting\n",
"4. Further reading\n",
" - [PyGraphistry](https://github.com/graphistry/pygraphistry)\n",
- " - [PyGraphistry demos: database connectors, ...](demos_databases_apis)\n",
+ " - [PyGraphistry demos: database connectors, ...](https://github.com/graphistry/pygraphistry/tree/master/demos/demos_databases_apis)\n",
" - [graph-app-kit: Streamlit graph dashboarding](https://github.com/graphistry/graph-app-kit)\n",
" - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n",
- " - [CSV upload notebook app](upload_csv_miniapp.ipynb)\n",
+ " - [CSV upload notebook app](https://github.com/graphistry/pygraphistry/tree/master/demos/upload_csv_miniapp.ipynb)\n",
" \n",
"## 1. Register\n"
]
@@ -896,10 +896,10 @@
"source": [
"## Further reading:\n",
" - [PyGraphistry](https://github.com/graphistry/pygraphistry)\n",
- " - [PyGraphistry demos: database connectors, ...](demos_databases_apis)\n",
+ " - [PyGraphistry demos: database connectors, ...](https://github.com/graphistry/pygraphistry/tree/master/demos/demos_databases_apis)\n",
" - [graph-app-kit: Streamlit graph dashboarding](https://github.com/graphistry/graph-app-kit)\n",
" - [UI Guide](https://hub.graphistry.com/docs/ui/index/)\n",
- " - [CSV upload notebook app](upload_csv_miniapp.ipynb)"
+ " - [CSV upload notebook app](https://github.com/graphistry/pygraphistry/tree/master/demos/upload_csv_miniapp.ipynb)"
]
}
],
diff --git a/demos/for_developers.ipynb b/demos/for_developers.ipynb
index 8d2187a10..25d07d1b7 100644
--- a/demos/for_developers.ipynb
+++ b/demos/for_developers.ipynb
@@ -7,7 +7,7 @@
"# Tutorial: Graphistry for Developers\n",
"\n",
"\n",
- "**Start by generating interactive graphs in the [Analysis tutorial](for_analysis.ipynb)**\n",
+ "**Start by generating interactive graphs in the [Analysis tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/for_analysis.ipynb)**\n",
"\n",
"\n",
"**Graphistry is a client/server system:**\n",
@@ -48,7 +48,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# 1. Backend APIs\n",
+ "## 1. Backend APIs\n",
"\n",
"Graphistry provides a REST upload API, and you can reuse the Python client for more conveniently using it."
]
@@ -57,8 +57,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Python\n",
- "* Use the PyGraphistry API as in the [Analysis tutorial](for_analysis.ipynb)\n",
+ "### Python\n",
+ "* Use the PyGraphistry API as in the [Analysis tutorial](https://github.com/graphistry/pygraphistry/tree/master/demos/for_analysis.ipynb)\n",
"* Instead of plotting, get the plot URL for embedding\n"
]
},
@@ -173,7 +173,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## iframe"
+ "### iframe"
]
},
{
@@ -207,7 +207,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## JavaScript - Browser vanilla JS\n",
+ "### JavaScript - Browser vanilla JS\n",
"* [npm](https://www.npmjs.com/package/@graphistry/client-api)\n",
"* `npm install --save \"@graphistry/client-api\"`\n",
"* See [vanilla js examples](https://hub.graphistry.com/static/js-docs/examples/toggles.html)]\n",
diff --git a/demos/gfql/benchmark_hops_cpu_gpu.ipynb b/demos/gfql/benchmark_hops_cpu_gpu.ipynb
index bf17b630e..cafd90815 100644
--- a/demos/gfql/benchmark_hops_cpu_gpu.ipynb
+++ b/demos/gfql/benchmark_hops_cpu_gpu.ipynb
@@ -1,23 +1,10 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": [],
- "gpuType": "T4"
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- },
- "accelerator": "GPU"
- },
"cells": [
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "GZxoiU8sQDk_"
+ },
"source": [
"# GFQL CPU, GPU Benchmark\n",
"\n",
@@ -73,33 +60,27 @@
"| **Orkut** | N/A | N/A | 41.50 | N/A | 711.4 |\n",
"| **AVG** | 22X | 0.41 | 14.4 | 41.1 | 246.8\n",
"| **MAX** | 42X | 0.50 | 41.50 | 50.2 | 711.4\n"
- ],
- "metadata": {
- "id": "GZxoiU8sQDk_"
- }
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "## Optional: GPU setup - Google Colab"
- ],
"metadata": {
"id": "SAj8lhREEOwS"
- }
+ },
+ "source": [
+ "## Optional: GPU setup - Google Colab"
+ ]
},
{
"cell_type": "markdown",
- "source": [],
"metadata": {
"id": "4hrEEAAm7DTO"
- }
+ },
+ "source": []
},
{
"cell_type": "code",
- "source": [
- "# Report GPU used when GPU benchmarking\n",
- "! nvidia-smi"
- ],
+ "execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -107,11 +88,10 @@
"id": "W2MF6ZsjDv3B",
"outputId": "46088cbc-2db9-4529-f724-dc57ed85dfb7"
},
- "execution_count": 1,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:50:30 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -135,27 +115,28 @@
"+---------------------------------------------------------------------------------------+\n"
]
}
+ ],
+ "source": [
+ "# Report GPU used when GPU benchmarking\n",
+ "# ! nvidia-smi"
]
},
{
"cell_type": "code",
- "source": [
- "# if in google colab\n",
- "!git clone https://github.com/rapidsai/rapidsai-csp-utils.git\n",
- "!python rapidsai-csp-utils/colab/pip-install.py"
- ],
+ "execution_count": 8,
"metadata": {
"id": "Aikh0x4ID_wK"
},
- "execution_count": 8,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "# if in google colab\n",
+ "#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git\n",
+ "#!python rapidsai-csp-utils/colab/pip-install.py"
+ ]
},
{
"cell_type": "code",
- "source": [
- "import cudf\n",
- "cudf.__version__"
- ],
+ "execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -164,160 +145,155 @@
"id": "Lwekdei1dH3N",
"outputId": "71f5b01d-7917-4283-8338-969167d6e1e8"
},
- "execution_count": 3,
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- "'23.12.01'"
- ],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
- }
+ },
+ "text/plain": [
+ "'23.12.01'"
+ ]
},
+ "execution_count": 3,
"metadata": {},
- "execution_count": 3
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "import cudf\n",
+ "cudf.__version__"
]
},
{
"cell_type": "markdown",
- "source": [
- "# 1. Install & configure"
- ],
"metadata": {
"id": "QQpsrtwBT7sa"
- }
+ },
+ "source": [
+ "## 1. Install & configure"
+ ]
},
{
"cell_type": "code",
- "source": [
- "#! pip install graphistry[igraph]\n",
- "\n",
- "!pip install -q igraph\n",
- "#!pip install -q git+https://github.com/graphistry/pygraphistry.git@dev/cugfql\n",
- "!pip install -q graphistry\n"
- ],
+ "execution_count": 2,
"metadata": {
- "id": "cYjRbgkU9Sx8",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "cYjRbgkU9Sx8",
"outputId": "2cf25531-9b8b-4715-ccc7-e79094d84ebd"
},
- "execution_count": 2,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
]
}
+ ],
+ "source": [
+ "#! pip install graphistry[igraph]"
]
},
{
"cell_type": "markdown",
- "source": [
- "## Imports"
- ],
"metadata": {
"id": "Ff6Tt9DhkePl"
- }
+ },
+ "source": [
+ "### Imports"
+ ]
},
{
"cell_type": "code",
- "source": [
- "import pandas as pd\n",
- "\n",
- "import graphistry\n",
- "\n",
- "from graphistry import (\n",
- "\n",
- " # graph operators\n",
- " n, e_undirected, e_forward, e_reverse,\n",
- "\n",
- " # attribute predicates\n",
- " is_in, ge, startswith, contains, match as match_re\n",
- ")\n",
- "graphistry.__version__"
- ],
+ "execution_count": 3,
"metadata": {
- "id": "S5_y0CbLkjft",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
+ "id": "S5_y0CbLkjft",
"outputId": "a68a9c4b-c9c5-4b8b-ea4f-7bf1e4ddf315"
},
- "execution_count": 3,
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- "'0.32.0+12.g72e778c'"
- ],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
- }
+ },
+ "text/plain": [
+ "'0.32.0+12.g72e778c'"
+ ]
},
+ "execution_count": 3,
"metadata": {},
- "execution_count": 3
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "import graphistry\n",
+ "\n",
+ "from graphistry import (\n",
+ "\n",
+ " # graph operators\n",
+ " n, e_undirected, e_forward, e_reverse,\n",
+ "\n",
+ " # attribute predicates\n",
+ " is_in, ge, startswith, contains, match as match_re\n",
+ ")\n",
+ "graphistry.__version__"
]
},
{
"cell_type": "code",
- "source": [
- "import cudf"
- ],
+ "execution_count": 6,
"metadata": {
"id": "I7Fg75jsG4co"
},
- "execution_count": 6,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "import cudf"
+ ]
},
{
"cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "uLZKph2-a5M4"
+ },
+ "outputs": [],
"source": [
"#work around google colab shell encoding bugs\n",
"\n",
"import locale\n",
"locale.getpreferredencoding = lambda: \"UTF-8\""
- ],
- "metadata": {
- "id": "uLZKph2-a5M4"
- },
- "execution_count": 7,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "# 2. Perf benchmarks"
- ],
"metadata": {
"id": "eU9SyauNUHtR"
- }
+ },
+ "source": [
+ "## 2. Perf benchmarks"
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "### Facebook: 88K edges"
- ],
"metadata": {
"id": "NA0Ym11fkB8j"
- }
+ },
+ "source": [
+ "### Facebook: 88K edges"
+ ]
},
{
"cell_type": "code",
- "source": [
- "df = pd.read_csv('https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/facebook_combined.txt', sep=' ', names=['s', 'd'])\n",
- "print(df.shape)\n",
- "df.head(5)"
- ],
+ "execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -326,26 +302,16 @@
"id": "vXuQogHekClJ",
"outputId": "64db92c0-2704-438b-d0e4-25865acbb5e9"
},
- "execution_count": 10,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(88234, 2)\n"
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " s d\n",
- "0 0 1\n",
- "1 0 2\n",
- "2 0 3\n",
- "3 0 4\n",
- "4 0 5"
- ],
"text/html": [
"\n",
" \n",
@@ -608,20 +574,30 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " s d\n",
+ "0 0 1\n",
+ "1 0 2\n",
+ "2 0 3\n",
+ "3 0 4\n",
+ "4 0 5"
]
},
+ "execution_count": 10,
"metadata": {},
- "execution_count": 10
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "df = pd.read_csv('https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/facebook_combined.txt', sep=' ', names=['s', 'd'])\n",
+ "print(df.shape)\n",
+ "df.head(5)"
]
},
{
"cell_type": "code",
- "source": [
- "fg = graphistry.edges(df, 's', 'd').materialize_nodes()\n",
- "print(fg._nodes.shape, fg._edges.shape)\n",
- "fg._nodes.head(5)"
- ],
+ "execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -630,26 +606,16 @@
"id": "jEma7hvvkzkN",
"outputId": "dbf21342-6b80-429c-bd3f-b1494c6854c7"
},
- "execution_count": 11,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(4039, 1) (88234, 2)\n"
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " id\n",
- "0 0\n",
- "1 1\n",
- "2 2\n",
- "3 3\n",
- "4 4"
- ],
"text/html": [
"\n",
" \n",
@@ -906,20 +872,30 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " id\n",
+ "0 0\n",
+ "1 1\n",
+ "2 2\n",
+ "3 3\n",
+ "4 4"
]
},
+ "execution_count": 11,
"metadata": {},
- "execution_count": 11
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "fg = graphistry.edges(df, 's', 'd').materialize_nodes()\n",
+ "print(fg._nodes.shape, fg._edges.shape)\n",
+ "fg._nodes.head(5)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "for i in range(100):\n",
- " fg2 = fg.chain([n({'id': 0}), e_forward(hops=2)])"
- ],
+ "execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -927,30 +903,25 @@
"id": "5lEdCBw9lzd7",
"outputId": "ed7451e0-401e-4edc-c8de-79c5afd0c95b"
},
- "execution_count": 12,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 13.6 s, sys: 2.08 s, total: 15.7 s\n",
"Wall time: 18 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "for i in range(100):\n",
+ " fg2 = fg.chain([n({'id': 0}), e_forward(hops=2)])"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "for i in range(100):\n",
- " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=2)])\n",
- "print(fg._nodes.shape, fg._edges.shape)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)\n",
- "del fg_gdf\n",
- "del fg2"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -958,11 +929,10 @@
"id": "JFKIBa8mJCvJ",
"outputId": "c22022f0-b33d-483a-db64-29992c5161e8"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(4039, 1) (88234, 2)\n",
"(1519, 1) (4060, 2)\n",
@@ -970,17 +940,21 @@
"Wall time: 11.9 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "for i in range(100):\n",
+ " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=2)])\n",
+ "print(fg._nodes.shape, fg._edges.shape)\n",
+ "print(fg2._nodes.shape, fg2._edges.shape)\n",
+ "del fg_gdf\n",
+ "del fg2"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "for i in range(50):\n",
- " fg2 = fg.chain([n({'id': 0}), e_forward(hops=5)])\n",
- "print(fg._nodes.shape, fg._edges.shape)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -988,11 +962,10 @@
"id": "-KBGLexek5tS",
"outputId": "2f462e6c-578a-4fa1-ec29-91bae753f4c5"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(4039, 1) (88234, 2)\n",
"(3829, 1) (86074, 2)\n",
@@ -1000,20 +973,18 @@
"Wall time: 16.2 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
"for i in range(50):\n",
- " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=5)])\n",
+ " fg2 = fg.chain([n({'id': 0}), e_forward(hops=5)])\n",
"print(fg._nodes.shape, fg._edges.shape)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)\n",
- "del fg_gdf\n",
- "del fg2"
- ],
+ "print(fg2._nodes.shape, fg2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1021,11 +992,10 @@
"id": "CVpcbhpdHFEF",
"outputId": "aba04ee1-781e-4226-b593-b42415a55fc4"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(4039, 1) (88234, 2)\n",
"(3829, 1) (86074, 2)\n",
@@ -1033,47 +1003,47 @@
"Wall time: 10.1 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "for i in range(50):\n",
+ " fg2 = fg_gdf.chain([n({'id': 0}), e_forward(hops=5)])\n",
+ "print(fg._nodes.shape, fg._edges.shape)\n",
+ "print(fg2._nodes.shape, fg2._edges.shape)\n",
+ "del fg_gdf\n",
+ "del fg2"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "for i in range(100):\n",
- " fg2 = fg.chain([e_forward(source_node_match={'id': 0}, hops=5)])"
- ],
+ "execution_count": null,
"metadata": {
- "id": "1cFIyJF9pLjE",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "1cFIyJF9pLjE",
"outputId": "107329af-8e4b-428c-8b03-77ed00bdf5bf"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 11.8 s, sys: 377 ms, total: 12.1 s\n",
"Wall time: 13.1 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "for i in range(100):\n",
+ " fg2 = fg.chain([e_forward(source_node_match={'id': 0}, hops=5)])"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "for i in range(100):\n",
- " fg2 = fg_gdf.chain([e_forward(source_node_match={'id': 0}, hops=5)])\n",
- "print(fg._nodes.shape, fg._edges.shape)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)\n",
- "del fg_gdf\n",
- "del fg2\n",
- "\n"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1081,11 +1051,10 @@
"id": "M5uRiD6uJVNW",
"outputId": "5e938a19-2992-4280-80c2-784382d40113"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(4039, 1) (88234, 2)\n",
"(348, 1) (347, 2)\n",
@@ -1093,20 +1062,22 @@
"Wall time: 14.2 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "fg_gdf = fg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "for i in range(100):\n",
+ " fg2 = fg_gdf.chain([e_forward(source_node_match={'id': 0}, hops=5)])\n",
+ "print(fg._nodes.shape, fg._edges.shape)\n",
+ "print(fg2._nodes.shape, fg2._edges.shape)\n",
+ "del fg_gdf\n",
+ "del fg2\n",
+ "\n"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "start_nodes = pd.DataFrame({fg._node: [0]})\n",
- "for i in range(100):\n",
- " fg2 = fg.hop(\n",
- " nodes=start_nodes,\n",
- " direction='forward',\n",
- " hops=2)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)"
- ],
+ "execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1114,35 +1085,31 @@
"id": "Y9vgzfT69x41",
"outputId": "6882c1ce-0df8-4087-dda4-0a105a8617e1"
},
- "execution_count": 17,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(1519, 1) (4060, 2)\n",
"CPU times: user 4.5 s, sys: 1.35 s, total: 5.85 s\n",
"Wall time: 6.09 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({fg._node: [0]})\n",
- "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n",
+ "start_nodes = pd.DataFrame({fg._node: [0]})\n",
"for i in range(100):\n",
- " fg2 = fg_gdf.hop(\n",
+ " fg2 = fg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=2)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)\n",
- "del start_nodes\n",
- "del fg_gdf\n",
- "del fg2"
- ],
+ "print(fg2._nodes.shape, fg2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1150,31 +1117,35 @@
"id": "c7ybJqjc-T31",
"outputId": "37ccc1fb-6460-4193-8aa7-22837ff06d0a"
},
- "execution_count": 18,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(1519, 1) (4060, 2)\n",
"CPU times: user 2.58 s, sys: 6.75 ms, total: 2.59 s\n",
"Wall time: 2.58 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({fg._node: [0]})\n",
+ "start_nodes = cudf.DataFrame({fg._node: [0]})\n",
+ "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n",
"for i in range(100):\n",
- " fg2 = fg.hop(\n",
+ " fg2 = fg_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=5)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)"
- ],
+ " hops=2)\n",
+ "print(fg2._nodes.shape, fg2._edges.shape)\n",
+ "del start_nodes\n",
+ "del fg_gdf\n",
+ "del fg2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1182,35 +1153,31 @@
"id": "Dy7a4zDZ-7_G",
"outputId": "077b5d9c-c9ae-411a-8228-3c026b07a910"
},
- "execution_count": 19,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(3829, 1) (86074, 2)\n",
"CPU times: user 13.2 s, sys: 2 s, total: 15.2 s\n",
"Wall time: 18.3 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({fg._node: [0]})\n",
- "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n",
+ "start_nodes = pd.DataFrame({fg._node: [0]})\n",
"for i in range(100):\n",
- " fg2 = fg_gdf.hop(\n",
+ " fg2 = fg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=5)\n",
- "print(fg2._nodes.shape, fg2._edges.shape)\n",
- "del start_nodes\n",
- "del fg_gdf\n",
- "del fg2"
- ],
+ "print(fg2._nodes.shape, fg2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1218,49 +1185,58 @@
"id": "N5aUtF1a--ML",
"outputId": "0c2b67b8-fac6-45b3-dfbe-8002b5506e91"
},
- "execution_count": 20,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(3829, 1) (86074, 2)\n",
"CPU times: user 5.72 s, sys: 159 ms, total: 5.88 s\n",
"Wall time: 5.86 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "start_nodes = cudf.DataFrame({fg._node: [0]})\n",
+ "fg_gdf = fg.nodes(cudf.from_pandas(fg._nodes)).edges(cudf.from_pandas(fg._edges))\n",
+ "for i in range(100):\n",
+ " fg2 = fg_gdf.hop(\n",
+ " nodes=start_nodes,\n",
+ " direction='forward',\n",
+ " hops=5)\n",
+ "print(fg2._nodes.shape, fg2._edges.shape)\n",
+ "del start_nodes\n",
+ "del fg_gdf\n",
+ "del fg2"
]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "KrJKjXy2KLos"
+ },
"source": [
"## Twitter\n",
"\n",
"- edges: 2420766\n",
"- nodes: 81306"
- ],
- "metadata": {
- "id": "KrJKjXy2KLos"
- }
+ ]
},
{
"cell_type": "code",
- "source": [
- "! wget 'https://snap.stanford.edu/data/twitter_combined.txt.gz'\n",
- "#! curl -L 'https://snap.stanford.edu/data/twitter_combined.txt.gz' -o twitter_combined.txt.gz"
- ],
+ "execution_count": 21,
"metadata": {
- "id": "fO2qasGqpubr",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "fO2qasGqpubr",
"outputId": "d41a110e-9f7c-4710-9ce3-3f4906ab02ae"
},
- "execution_count": 21,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"--2023-12-25 21:58:27-- https://snap.stanford.edu/data/twitter_combined.txt.gz\n",
"Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80\n",
@@ -1275,24 +1251,25 @@
"\n"
]
}
+ ],
+ "source": [
+ "#! wget 'https://snap.stanford.edu/data/twitter_combined.txt.gz'"
]
},
{
"cell_type": "code",
- "source": [
- "! gunzip twitter_combined.txt.gz"
- ],
+ "execution_count": 22,
"metadata": {
"id": "fn7zeA3SGlEo"
},
- "execution_count": 22,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "#! gunzip twitter_combined.txt.gz"
+ ]
},
{
"cell_type": "code",
- "source": [
- "! head -n 5 twitter_combined.txt"
- ],
+ "execution_count": 24,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1300,11 +1277,10 @@
"id": "68TAZkhLGz9g",
"outputId": "8ba7c23d-267f-4b59-d6c6-b3f66caec9cf"
},
- "execution_count": 24,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"214328887 34428380\n",
"17116707 28465635\n",
@@ -1313,15 +1289,14 @@
"107830991 17868918\n"
]
}
+ ],
+ "source": [
+ "#! head -n 5 twitter_combined.txt"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "te_df = pd.read_csv('twitter_combined.txt', sep=' ', names=['s', 'd'])\n",
- "te_df.shape"
- ],
+ "execution_count": 25,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1329,46 +1304,46 @@
"id": "QU2wNeGXG2GC",
"outputId": "349ac9c0-6f6c-4ce6-fec0-8bae75fca635"
},
- "execution_count": 25,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 474 ms, sys: 61.9 ms, total: 536 ms\n",
"Wall time: 534 ms\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"(2420766, 2)"
]
},
+ "execution_count": 25,
"metadata": {},
- "execution_count": 25
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "te_df = pd.read_csv('twitter_combined.txt', sep=' ', names=['s', 'd'])\n",
+ "te_df.shape"
]
},
{
"cell_type": "code",
- "source": [
- "import graphistry"
- ],
+ "execution_count": 26,
"metadata": {
"id": "EK5gQH2iG5UU"
},
- "execution_count": 26,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "import graphistry"
+ ]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "g = graphistry.edges(te_df, 's', 'd').materialize_nodes()\n",
- "g._nodes.shape"
- ],
+ "execution_count": 27,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1376,36 +1351,35 @@
"id": "ZtIW-eFGG_R4",
"outputId": "0686e9b3-b684-4b93-da03-289244394338"
},
- "execution_count": 27,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 86.4 ms, sys: 106 ms, total: 193 ms\n",
"Wall time: 191 ms\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"(81306, 1)"
]
},
+ "execution_count": 27,
"metadata": {},
- "execution_count": 27
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "g = graphistry.edges(te_df, 's', 'd').materialize_nodes()\n",
+ "g._nodes.shape"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "for i in range(10):\n",
- " g2 = g.chain([n({'id': 17116707}), e_forward(hops=1)])\n",
- "g2._nodes.shape, g2._edges.shape"
- ],
+ "execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1413,39 +1387,36 @@
"id": "yUaRfw4FHGMb",
"outputId": "3945cc5a-c36c-451b-ac95-8af992a3546f"
},
- "execution_count": 29,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 11.8 s, sys: 8.4 s, total: 20.2 s\n",
"Wall time: 23 s\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"((140, 1), (615, 2))"
]
},
+ "execution_count": 29,
"metadata": {},
- "execution_count": 29
+ "output_type": "execute_result"
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
"for i in range(10):\n",
- " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=1)])._nodes\n",
- "print(out.shape)\n",
- "del g_gdf\n",
- "del out"
- ],
+ " g2 = g.chain([n({'id': 17116707}), e_forward(hops=1)])\n",
+ "g2._nodes.shape, g2._edges.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1453,27 +1424,30 @@
"id": "5hM4NBu2_eks",
"outputId": "54505262-4871-44ee-e5e4-ad7ab32c13c2"
},
- "execution_count": 30,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(140, 1)\n",
"CPU times: user 1.33 s, sys: 46.6 ms, total: 1.38 s\n",
"Wall time: 1.63 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "for i in range(10):\n",
+ " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=1)])._nodes\n",
+ "print(out.shape)\n",
+ "del g_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "for i in range(10):\n",
- " out = g.chain([n({'id': 17116707}), e_forward(hops=2)])\n",
- "print(out._nodes.shape, out._edges.shape)"
- ],
+ "execution_count": 31,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1481,30 +1455,27 @@
"id": "m2-MxD5lHX6u",
"outputId": "e89b9d4b-6c04-45c7-9e7f-cbdbbe0a4730"
},
- "execution_count": 31,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(2345, 1) (68536, 2)\n",
"CPU times: user 13.3 s, sys: 8.05 s, total: 21.4 s\n",
"Wall time: 21.6 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "for i in range(10):\n",
+ " out = g.chain([n({'id': 17116707}), e_forward(hops=2)])\n",
+ "print(out._nodes.shape, out._edges.shape)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "for i in range(10):\n",
- " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=2)])._nodes\n",
- "print(out.shape)\n",
- "del g_gdf\n",
- "del out"
- ],
+ "execution_count": 36,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1512,27 +1483,30 @@
"id": "7EQSRbIqLaGw",
"outputId": "60c00a03-9e7b-46b5-fce3-f4f567a09430"
},
- "execution_count": 36,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(2345, 1)\n",
"CPU times: user 1.67 s, sys: 55.8 ms, total: 1.72 s\n",
"Wall time: 1.75 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "for i in range(10):\n",
+ " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=2)])._nodes\n",
+ "print(out.shape)\n",
+ "del g_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "for i in range(10):\n",
- " out = g.chain([n({'id': 17116707}), e_forward(hops=8)])\n",
- "print(out._nodes.shape, out._edges.shape)"
- ],
+ "execution_count": 37,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1540,30 +1514,27 @@
"id": "hh6WnjI3ITpB",
"outputId": "33138efe-a581-49ed-b2b4-247f8e9bdc09"
},
- "execution_count": 37,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(81304, 1) (2417796, 2)\n",
"CPU times: user 1min 56s, sys: 17.1 s, total: 2min 13s\n",
"Wall time: 2min 22s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "for i in range(10):\n",
+ " out = g.chain([n({'id': 17116707}), e_forward(hops=8)])\n",
+ "print(out._nodes.shape, out._edges.shape)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "for i in range(10):\n",
- " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=8)])._nodes\n",
- "print(out.shape)\n",
- "del g_gdf\n",
- "del out"
- ],
+ "execution_count": 38,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1571,31 +1542,30 @@
"id": "7jFFVUenM87j",
"outputId": "2cceb720-9de3-488e-8b74-b820fd06e6c1"
},
- "execution_count": 38,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(81304, 1)\n",
"CPU times: user 5.3 s, sys: 1.48 s, total: 6.78 s\n",
"Wall time: 7.89 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "g_gdf = g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "for i in range(10):\n",
+ " out = g_gdf.chain([n({'id': 17116707}), e_forward(hops=8)])._nodes\n",
+ "print(out.shape)\n",
+ "del g_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "start_nodes = pd.DataFrame({g._node: [17116707]})\n",
- "for i in range(10):\n",
- " g2 = g.hop(\n",
- " nodes=start_nodes,\n",
- " direction='forward',\n",
- " hops=1)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ "execution_count": 39,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1603,35 +1573,31 @@
"id": "_5LD0bZB_lU4",
"outputId": "bc31bd03-e79f-46d2-ea8f-3b01d9ef39a2"
},
- "execution_count": 39,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(0, 1) (0, 2)\n",
"CPU times: user 2.58 s, sys: 1.59 s, total: 4.17 s\n",
"Wall time: 6.02 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({g._node: [17116707]})\n",
- "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n",
+ "start_nodes = pd.DataFrame({g._node: [17116707]})\n",
"for i in range(10):\n",
- " g2 = g_gdf.hop(\n",
+ " g2 = g.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=5)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del g_gdf\n",
- "del g2"
- ],
+ " hops=1)\n",
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1639,31 +1605,35 @@
"id": "M_rHjqtvACQw",
"outputId": "8d3e308e-b1e2-452b-f402-573be0dd5b58"
},
- "execution_count": 44,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(61827, 1) (1473599, 2)\n",
"CPU times: user 822 ms, sys: 179 ms, total: 1 s\n",
"Wall time: 997 ms\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({g._node: [17116707]})\n",
+ "start_nodes = cudf.DataFrame({g._node: [17116707]})\n",
+ "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n",
"for i in range(10):\n",
- " g2 = g.hop(\n",
+ " g2 = g_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=2)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ " hops=5)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del g_gdf\n",
+ "del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1671,35 +1641,31 @@
"id": "0zEIucaCAbj_",
"outputId": "83e64b0f-2b3a-4e4b-d189-3e6a8ef78f53"
},
- "execution_count": 40,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(2345, 1) (68536, 2)\n",
"CPU times: user 8.93 s, sys: 5.92 s, total: 14.9 s\n",
"Wall time: 15.8 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({g._node: [17116707]})\n",
- "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n",
+ "start_nodes = pd.DataFrame({g._node: [17116707]})\n",
"for i in range(10):\n",
- " g2 = g_gdf.hop(\n",
+ " g2 = g.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=2)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del g_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1707,31 +1673,35 @@
"id": "LKJh5gRtAdIj",
"outputId": "e3c7883d-74c0-4d55-b238-88457296c6bc"
},
- "execution_count": 41,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(2345, 1) (68536, 2)\n",
"CPU times: user 374 ms, sys: 6.92 ms, total: 381 ms\n",
"Wall time: 379 ms\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({g._node: [17116707]})\n",
+ "start_nodes = cudf.DataFrame({g._node: [17116707]})\n",
+ "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n",
"for i in range(10):\n",
- " g2 = g.hop(\n",
+ " g2 = g_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=8)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ " hops=2)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del g_gdf\n",
+ "del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1739,35 +1709,31 @@
"id": "JZwxdofNAfmb",
"outputId": "2731be4c-75d9-47f4-8602-4f2d6cb2ddac"
},
- "execution_count": 42,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(81304, 1) (2417796, 2)\n",
"CPU times: user 38.8 s, sys: 8.7 s, total: 47.5 s\n",
"Wall time: 48.2 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({g._node: [17116707]})\n",
- "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n",
+ "start_nodes = pd.DataFrame({g._node: [17116707]})\n",
"for i in range(10):\n",
- " g2 = g_gdf.hop(\n",
+ " g2 = g.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=8)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del g_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1775,36 +1741,47 @@
"id": "9o_og8bSAhe3",
"outputId": "dd3e4f8f-f426-4705-98c4-60f1912ba28a"
},
- "execution_count": 43,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(81304, 1) (2417796, 2)\n",
"CPU times: user 1.8 s, sys: 506 ms, total: 2.3 s\n",
"Wall time: 2.3 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "start_nodes = cudf.DataFrame({g._node: [17116707]})\n",
+ "g_gdf = g.nodes(cudf.from_pandas(g._nodes)).edges(cudf.from_pandas(g._edges))\n",
+ "for i in range(10):\n",
+ " g2 = g_gdf.hop(\n",
+ " nodes=start_nodes,\n",
+ " direction='forward',\n",
+ " hops=8)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del g_gdf\n",
+ "del g2"
]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "9dZzAAVONCD2"
+ },
"source": [
- "### GPlus\n",
+ "## GPlus\n",
"\n",
"- edges: 30494866\n",
"- nodes: 107614"
- ],
- "metadata": {
- "id": "9dZzAAVONCD2"
- }
+ ]
},
{
"cell_type": "code",
- "source": [
- "! wget https://snap.stanford.edu/data/gplus_combined.txt.gz"
- ],
+ "execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1812,11 +1789,10 @@
"id": "-nhWGNekKpcZ",
"outputId": "e2175290-337c-4faa-e5d8-4bc401583326"
},
- "execution_count": 4,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"--2023-12-26 18:36:29-- https://snap.stanford.edu/data/gplus_combined.txt.gz\n",
"Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80\n",
@@ -1831,27 +1807,25 @@
"\n"
]
}
+ ],
+ "source": [
+ "#! wget https://snap.stanford.edu/data/gplus_combined.txt.gz"
]
},
{
"cell_type": "code",
- "source": [
- "! gunzip gplus_combined.txt.gz"
- ],
+ "execution_count": 5,
"metadata": {
"id": "g5wgA_c2KqwJ"
},
- "execution_count": 5,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "#! gunzip gplus_combined.txt.gz"
+ ]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "ge_df = pd.read_csv('gplus_combined.txt', sep=' ', names=['s', 'd'])\n",
- "print(ge_df.shape)\n",
- "ge_df.head(5)"
- ],
+ "execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1860,11 +1834,10 @@
"id": "52hgDbr0Kti6",
"outputId": "217203fc-7095-4784-c4c4-d46ee9c78808"
},
- "execution_count": 6,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(30494866, 2)\n",
"CPU times: user 16 s, sys: 1.45 s, total: 17.5 s\n",
@@ -1872,16 +1845,7 @@
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " s d\n",
- "0 116374117927631468606 101765416973555767821\n",
- "1 112188647432305746617 107727150903234299458\n",
- "2 116719211656774388392 100432456209427807893\n",
- "3 117421021456205115327 101096322838605097368\n",
- "4 116407635616074189669 113556266482860931616"
- ],
"text/html": [
"\n",
" \n",
@@ -2144,22 +2108,31 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " s d\n",
+ "0 116374117927631468606 101765416973555767821\n",
+ "1 112188647432305746617 107727150903234299458\n",
+ "2 116719211656774388392 100432456209427807893\n",
+ "3 117421021456205115327 101096322838605097368\n",
+ "4 116407635616074189669 113556266482860931616"
]
},
+ "execution_count": 6,
"metadata": {},
- "execution_count": 6
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "ge_df = pd.read_csv('gplus_combined.txt', sep=' ', names=['s', 'd'])\n",
+ "print(ge_df.shape)\n",
+ "ge_df.head(5)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg = graphistry.edges(ge_df, 's', 'd').materialize_nodes()\n",
- "gg = graphistry.edges(ge_df, 's', 'd').nodes(gg._nodes, 'id')\n",
- "print(gg._edges.shape, gg._nodes.shape)\n",
- "gg._nodes.head(5)"
- ],
+ "execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -2168,11 +2141,10 @@
"id": "w5YkN-nLK6UV",
"outputId": "dc98380d-54c2-4b36-c56e-5e8401c4ffa4"
},
- "execution_count": 7,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(30494866, 2) (107614, 1)\n",
"CPU times: user 4.49 s, sys: 1.25 s, total: 5.74 s\n",
@@ -2180,16 +2152,7 @@
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " id\n",
- "0 116374117927631468606\n",
- "1 112188647432305746617\n",
- "2 116719211656774388392\n",
- "3 117421021456205115327\n",
- "4 116407635616074189669"
- ],
"text/html": [
"\n",
" \n",
@@ -2446,19 +2409,32 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " id\n",
+ "0 116374117927631468606\n",
+ "1 112188647432305746617\n",
+ "2 116719211656774388392\n",
+ "3 117421021456205115327\n",
+ "4 116407635616074189669"
]
},
+ "execution_count": 7,
"metadata": {},
- "execution_count": 7
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg = graphistry.edges(ge_df, 's', 'd').materialize_nodes()\n",
+ "gg = graphistry.edges(ge_df, 's', 'd').nodes(gg._nodes, 'id')\n",
+ "print(gg._edges.shape, gg._nodes.shape)\n",
+ "gg._nodes.head(5)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg.chain([ n({'id': '116374117927631468606'})])._nodes"
- ],
+ "execution_count": 49,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -2467,23 +2443,17 @@
"id": "NKtz54uELX-8",
"outputId": "5d8f3eef-893d-47cc-e7a9-c5cbfec8270c"
},
- "execution_count": 49,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 534 ms, sys: 598 ms, total: 1.13 s\n",
"Wall time: 1.65 s\n"
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " id\n",
- "0 116374117927631468606"
- ],
"text/html": [
"\n",
" \n",
@@ -2597,20 +2567,25 @@
"\n",
"
\n",
" \n"
+ ],
+ "text/plain": [
+ " id\n",
+ "0 116374117927631468606"
]
},
+ "execution_count": 49,
"metadata": {},
- "execution_count": 49
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg.chain([ n({'id': '116374117927631468606'})])._nodes"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])._nodes\n",
- "out.shape"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2618,38 +2593,35 @@
"id": "iNWdi00VLmZG",
"outputId": "ecfb56a6-c564-4bf6-f43f-2c95a103f4be"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 27.5 s, sys: 11.1 s, total: 38.5 s\n",
"Wall time: 39.5 s\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"(1473, 1)"
]
},
+ "execution_count": 75,
"metadata": {},
- "execution_count": 75
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])._nodes\n",
+ "out.shape"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])\n",
- "print(out._nodes.shape, out._edges.shape)\n",
- "del gg_gdf\n",
- "del out"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2657,26 +2629,29 @@
"id": "Q6p3h6uCOABh",
"outputId": "817fc80f-ef5d-4070-eb48-a12344be709c"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(1473, 1) (13375, 2)\n",
"CPU times: user 4.57 s, sys: 2.11 s, total: 6.68 s\n",
"Wall time: 7.63 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=1)])\n",
+ "print(out._nodes.shape, out._edges.shape)\n",
+ "del gg_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])._nodes\n",
- "out.shape"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2684,38 +2659,35 @@
"id": "6UdCcMdqLw-P",
"outputId": "70742c79-b22b-4db2-c548-cb1e25d572eb"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 45.8 s, sys: 17 s, total: 1min 2s\n",
"Wall time: 1min 5s\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"(44073, 1)"
]
},
+ "execution_count": 77,
"metadata": {},
- "execution_count": 77
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])._nodes\n",
+ "out.shape"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])\n",
- "print(out._nodes.shape, out._edges.shape)\n",
- "del gg_gdf\n",
- "del out"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2723,26 +2695,29 @@
"id": "QElqatDyNYCS",
"outputId": "0e15bd3e-d2d9-4965-df7d-c8856d036680"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(44073, 1) (2069325, 2)\n",
"CPU times: user 4.97 s, sys: 2.36 s, total: 7.34 s\n",
"Wall time: 10.6 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=2)])\n",
+ "print(out._nodes.shape, out._edges.shape)\n",
+ "del gg_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])._nodes\n",
- "out.shape"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2750,38 +2725,35 @@
"id": "3HJOItZ4MQMG",
"outputId": "f5be7bb4-7f09-4f80-c549-e703e99f5067"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 3min 45s, sys: 1min 5s, total: 4min 50s\n",
"Wall time: 4min 52s\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"(102414, 1)"
]
},
+ "execution_count": 79,
"metadata": {},
- "execution_count": 79
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])._nodes\n",
+ "out.shape"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])\n",
- "print(out._nodes.shape, out._edges.shape)\n",
- "del gg_gdf\n",
- "del out"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2789,26 +2761,29 @@
"id": "G32t_xthOUle",
"outputId": "7721741f-9c86-41aa-eb0b-2c8f0db2ed54"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(102414, 1) (24851333, 2)\n",
"CPU times: user 6.95 s, sys: 2.63 s, total: 9.57 s\n",
"Wall time: 9.84 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=3)])\n",
+ "print(out._nodes.shape, out._edges.shape)\n",
+ "del gg_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n",
- "print(out._nodes.shape, out._edges.shape)"
- ],
+ "execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2816,29 +2791,26 @@
"id": "bXy2yyJsMsEG",
"outputId": "911f2680-067c-44f2-9ba2-7f27d3c9bc6b"
},
- "execution_count": 8,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105479, 1) (30450354, 2)\n",
"CPU times: user 4min 36s, sys: 1min 25s, total: 6min 2s\n",
"Wall time: 6min 4s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n",
+ "print(out._nodes.shape, out._edges.shape)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n",
- "print(out._nodes.shape, out._edges.shape)\n",
- "del gg_gdf\n",
- "del out"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2846,26 +2818,29 @@
"id": "Vt8hhjWDP_W_",
"outputId": "824ae644-e1cf-4239-bda9-84aecde52ad8"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105479, 1) (30450354, 2)\n",
"CPU times: user 7.44 s, sys: 2.45 s, total: 9.88 s\n",
"Wall time: 9.9 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=4)])\n",
+ "print(out._nodes.shape, out._edges.shape)\n",
+ "del gg_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n",
- "print(out._nodes.shape, out._edges.shape)"
- ],
+ "execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2873,61 +2848,56 @@
"id": "_z4KpNZaOH8t",
"outputId": "2417f78b-e1b7-452d-8e26-7df259620c88"
},
- "execution_count": 9,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105604, 1) (30468335, 2)\n",
"CPU times: user 5min 36s, sys: 1min 39s, total: 7min 16s\n",
"Wall time: 7min 15s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "out = gg.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n",
+ "print(out._nodes.shape, out._edges.shape)"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n",
- "print(out._nodes.shape, out._edges.shape)\n",
- "del gg_gdf\n",
- "del out"
- ],
+ "execution_count": null,
"metadata": {
- "id": "spUBH9EHSz2O",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "spUBH9EHSz2O",
"outputId": "22340ce3-e8d4-4a72-b485-9839c667b965"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105604, 1) (30468335, 2)\n",
"CPU times: user 8.82 s, sys: 2.71 s, total: 11.5 s\n",
"Wall time: 11.9 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "gg_gdf = gg.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "out = gg_gdf.chain([ n({'id': '116374117927631468606'}), e_forward(hops=5)])\n",
+ "print(out._nodes.shape, out._edges.shape)\n",
+ "del gg_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
- "for i in range(1):\n",
- " g2 = gg.hop(\n",
- " nodes=start_nodes,\n",
- " direction='forward',\n",
- " hops=1)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ "execution_count": 50,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2935,35 +2905,31 @@
"id": "vCsdmc62A7OM",
"outputId": "adc05d29-c628-49ed-cd6d-8921c6dcd206"
},
- "execution_count": 50,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(1473, 1) (13375, 2)\n",
"CPU times: user 19.9 s, sys: 9.36 s, total: 29.2 s\n",
"Wall time: 41.8 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
- "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
+ "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
"for i in range(1):\n",
- " g2 = gg_gdf.hop(\n",
+ " g2 = gg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=1)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del gg_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2971,31 +2937,35 @@
"id": "J3kV8NBYBQdW",
"outputId": "76073248-43e1-4c3c-c004-67324cc1d312"
},
- "execution_count": 52,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(1473, 1) (13375, 2)\n",
"CPU times: user 3.71 s, sys: 2.09 s, total: 5.8 s\n",
"Wall time: 6.05 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
"for i in range(1):\n",
- " g2 = gg.hop(\n",
+ " g2 = gg_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=2)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ " hops=1)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del gg_gdf\n",
+ "del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3003,35 +2973,31 @@
"id": "ONv1RQeWBeeK",
"outputId": "58d57fa4-be72-45bc-abfa-5de9d1102f55"
},
- "execution_count": 53,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(44073, 1) (2069325, 2)\n",
"CPU times: user 27.8 s, sys: 13.2 s, total: 41 s\n",
"Wall time: 43.9 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
- "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
+ "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
"for i in range(1):\n",
- " g2 = gg_gdf.hop(\n",
+ " g2 = gg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=2)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del gg_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3039,31 +3005,35 @@
"id": "ke5SZZ01BgqR",
"outputId": "4173fd28-a11b-4300-d28b-6fdb87e8e9f3"
},
- "execution_count": 54,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(44073, 1) (2069325, 2)\n",
"CPU times: user 4.26 s, sys: 2.37 s, total: 6.63 s\n",
"Wall time: 7.91 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
"for i in range(1):\n",
- " g2 = gg.hop(\n",
+ " g2 = gg_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=3)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ " hops=2)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del gg_gdf\n",
+ "del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3071,35 +3041,31 @@
"id": "U795pIBUBiZV",
"outputId": "d499433c-cc0c-4bbf-c69f-36b5d55402d9"
},
- "execution_count": 55,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(102414, 1) (24851333, 2)\n",
"CPU times: user 1min 3s, sys: 22.7 s, total: 1min 26s\n",
"Wall time: 1min 35s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
- "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
+ "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
"for i in range(1):\n",
- " g2 = gg_gdf.hop(\n",
+ " g2 = gg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=3)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del gg_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3107,31 +3073,35 @@
"id": "kIZYwSe1Bj2e",
"outputId": "b7e1ed9f-47d1-412e-9593-ecc436ac1486"
},
- "execution_count": 56,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(102414, 1) (24851333, 2)\n",
"CPU times: user 3.96 s, sys: 2.11 s, total: 6.07 s\n",
"Wall time: 6.05 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
"for i in range(1):\n",
- " g2 = gg.hop(\n",
+ " g2 = gg_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=4)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ " hops=3)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del gg_gdf\n",
+ "del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3139,35 +3109,31 @@
"id": "YTI5sD6YBpYL",
"outputId": "b37bf2df-07dc-404c-8a83-a83f28e38bf6"
},
- "execution_count": 57,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105479, 1) (30450354, 2)\n",
"CPU times: user 1min 34s, sys: 30.6 s, total: 2min 5s\n",
"Wall time: 2min 5s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
- "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
+ "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
"for i in range(1):\n",
- " g2 = gg_gdf.hop(\n",
+ " g2 = gg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=4)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del gg_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3175,31 +3141,35 @@
"id": "d5WBazICBrSz",
"outputId": "ef95e893-3a0f-4d47-ede4-bd8a6faebf98"
},
- "execution_count": 58,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105479, 1) (30450354, 2)\n",
"CPU times: user 5.25 s, sys: 2.41 s, total: 7.67 s\n",
"Wall time: 7.69 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
"for i in range(1):\n",
- " g2 = gg.hop(\n",
+ " g2 = gg_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=5)\n",
- "print(g2._nodes.shape, g2._edges.shape)"
- ],
+ " hops=4)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del gg_gdf\n",
+ "del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3207,35 +3177,31 @@
"id": "ozQlRPaFBtPD",
"outputId": "4f1655c4-38fd-47f9-942d-836585e0d866"
},
- "execution_count": 59,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105604, 1) (30468335, 2)\n",
"CPU times: user 2min 16s, sys: 39.1 s, total: 2min 55s\n",
"Wall time: 2min 58s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
- "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
+ "start_nodes = pd.DataFrame({gg._node: ['116374117927631468606']})\n",
"for i in range(1):\n",
- " g2 = gg_gdf.hop(\n",
+ " g2 = gg.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=5)\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del gg_gdf\n",
- "del g2"
- ],
+ "print(g2._nodes.shape, g2._edges.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3243,35 +3209,46 @@
"id": "-ACkMG20B6HM",
"outputId": "f26c03a9-9f25-4f93-c7d3-0e8676694040"
},
- "execution_count": 60,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(105604, 1) (30468335, 2)\n",
"CPU times: user 5.79 s, sys: 2.51 s, total: 8.3 s\n",
"Wall time: 8.29 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "start_nodes = cudf.DataFrame({gg._node: ['116374117927631468606']})\n",
+ "gg_gdf = gg.nodes(cudf.from_pandas(gg._nodes)).edges(cudf.from_pandas(gg._edges))\n",
+ "for i in range(1):\n",
+ " g2 = gg_gdf.hop(\n",
+ " nodes=start_nodes,\n",
+ " direction='forward',\n",
+ " hops=5)\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del gg_gdf\n",
+ "del g2"
]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "R03M_swxarKC"
+ },
"source": [
- "### Orkut\n",
+ "## Orkut\n",
"- 117M edges\n",
"- 3M nodes"
- ],
- "metadata": {
- "id": "R03M_swxarKC"
- }
+ ]
},
{
"cell_type": "code",
- "source": [
- "! wget https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz"
- ],
+ "execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3279,11 +3256,10 @@
"id": "QoabYR2maxPo",
"outputId": "2bb6275d-46bb-42da-ec05-d0e5a58b1f77"
},
- "execution_count": 8,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"--2023-12-26 00:55:52-- https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz\n",
"Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80\n",
@@ -3298,24 +3274,25 @@
"\n"
]
}
+ ],
+ "source": [
+ "#! wget https://snap.stanford.edu/data/bigdata/communities/com-orkut.ungraph.txt.gz"
]
},
{
"cell_type": "code",
- "source": [
- "! gunzip com-orkut.ungraph.txt.gz"
- ],
+ "execution_count": 9,
"metadata": {
"id": "BvvfFPKWbAVJ"
},
- "execution_count": 9,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "#! gunzip com-orkut.ungraph.txt.gz"
+ ]
},
{
"cell_type": "code",
- "source": [
- "! head -n 7 com-orkut.ungraph.txt"
- ],
+ "execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3323,11 +3300,10 @@
"id": "YsWwRoPqbPIb",
"outputId": "2eb4f862-b4e1-42bf-ff5d-eec10b27cedc"
},
- "execution_count": 10,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"# Undirected graph: ../../data/output/orkut.txt\n",
"# Orkut\n",
@@ -3338,10 +3314,33 @@
"1\t4\n"
]
}
+ ],
+ "source": [
+ "#! head -n 7 com-orkut.ungraph.txt"
]
},
{
"cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cbMC8r2ldjbW",
+ "outputId": "82688d53-7d56-4563-d65e-7c5cd32ac14e"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('23.12.01', '0.32.0+12.g72e778c')"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"import pandas as pd\n",
"\n",
@@ -3363,33 +3362,11 @@
"locale.getpreferredencoding = lambda: \"UTF-8\"\n",
"\n",
"cudf.__version__, graphistry.__version__"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "cbMC8r2ldjbW",
- "outputId": "82688d53-7d56-4563-d65e-7c5cd32ac14e"
- },
- "execution_count": 11,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "('23.12.01', '0.32.0+12.g72e778c')"
- ]
- },
- "metadata": {},
- "execution_count": 11
- }
]
},
{
"cell_type": "code",
- "source": [
- "! nvidia-smi"
- ],
+ "execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3397,11 +3374,10 @@
"id": "TopFxAvnh_Cv",
"outputId": "cc9d9dc9-e594-4190-fe84-3f1b6dce8a1a"
},
- "execution_count": 12,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:27 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -3424,18 +3400,14 @@
"+---------------------------------------------------------------------------------------+\n"
]
}
+ ],
+ "source": [
+ "#! nvidia-smi"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "co_df = cudf.read_csv('com-orkut.ungraph.txt', sep='\\t', names=['s', 'd'], skiprows=5).to_pandas()\n",
- "print(co_df.shape)\n",
- "print(co_df.head(5))\n",
- "print(co_df.dtypes)\n",
- "#del co_df"
- ],
+ "execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3443,11 +3415,10 @@
"id": "Oczs87ITbJgw",
"outputId": "ac203ddd-e684-4eb9-a586-f6a49fd1625d"
},
- "execution_count": 13,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(117185082, 2)\n",
" s d\n",
@@ -3463,17 +3434,19 @@
"Wall time: 6.76 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "co_df = cudf.read_csv('com-orkut.ungraph.txt', sep='\\t', names=['s', 'd'], skiprows=5).to_pandas()\n",
+ "print(co_df.shape)\n",
+ "print(co_df.head(5))\n",
+ "print(co_df.dtypes)\n",
+ "#del co_df"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "co_g = graphistry.edges(cudf.DataFrame(co_df), 's', 'd').materialize_nodes(engine='cudf')\n",
- "co_g = co_g.nodes(lambda g: g._nodes.to_pandas()).edges(lambda g: g._edges.to_pandas())\n",
- "print(co_g._nodes.shape, co_g._edges.shape)\n",
- "co_g._nodes.head(5)"
- ],
+ "execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -3482,11 +3455,10 @@
"id": "gGSDjTtveFAT",
"outputId": "e7b38f4f-dc07-4f35-9bab-9c80a80bbf0b"
},
- "execution_count": 14,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(3072441, 1) (117185082, 2)\n",
"CPU times: user 1.96 s, sys: 2.95 s, total: 4.91 s\n",
@@ -3494,16 +3466,7 @@
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " id\n",
- "0 1\n",
- "1 2\n",
- "2 3\n",
- "3 4\n",
- "4 5"
- ],
"text/html": [
"\n",
" \n",
@@ -3760,18 +3723,32 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " id\n",
+ "0 1\n",
+ "1 2\n",
+ "2 3\n",
+ "3 4\n",
+ "4 5"
]
},
+ "execution_count": 14,
"metadata": {},
- "execution_count": 14
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "%%time\n",
+ "co_g = graphistry.edges(cudf.DataFrame(co_df), 's', 'd').materialize_nodes(engine='cudf')\n",
+ "co_g = co_g.nodes(lambda g: g._nodes.to_pandas()).edges(lambda g: g._edges.to_pandas())\n",
+ "print(co_g._nodes.shape, co_g._edges.shape)\n",
+ "co_g._nodes.head(5)"
]
},
{
"cell_type": "code",
- "source": [
- "! nvidia-smi"
- ],
+ "execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3779,11 +3756,10 @@
"id": "V5qL8K7-dqIZ",
"outputId": "e08319fc-74d3-4f33-df0f-f98950dc8c99"
},
- "execution_count": 15,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:39 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -3806,18 +3782,14 @@
"+---------------------------------------------------------------------------------------+\n"
]
}
+ ],
+ "source": [
+ "#! nvidia-smi"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "# crashes\n",
- "if False:\n",
- " out = co_g.chain([ n({'id': 1}), e_forward(hops=1)])._nodes\n",
- " print(out.shape)\n",
- " del out"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3825,31 +3797,28 @@
"id": "hCbxZ8UmhRLp",
"outputId": "519aed6c-733d-41f4-d462-e57f5e32b131"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"CPU times: user 4 µs, sys: 1 µs, total: 5 µs\n",
"Wall time: 47.7 µs\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "# crashes\n",
+ "if False:\n",
+ " out = co_g.chain([ n({'id': 1}), e_forward(hops=1)])._nodes\n",
+ " print(out.shape)\n",
+ " del out"
]
},
{
- "cell_type": "code",
- "source": [
- "%%time\n",
- "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
- "for i in range(10):\n",
- " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=1)])\n",
- "! nvidia-smi\n",
- "print(out._nodes.shape, out._edges.shape)\n",
- "del co_gdf\n",
- "del out"
- ],
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3857,11 +3826,10 @@
"id": "Q682scC_eC-S",
"outputId": "7ff5f829-0de7-4a6c-a77d-e2857896a8a5"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Mon Dec 25 06:23:46 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -3906,21 +3874,22 @@
"Wall time: 4.42 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
- " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=2)])\n",
- "! nvidia-smi\n",
+ " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=1)])\n",
+ "#! nvidia-smi\n",
"print(out._nodes.shape, out._edges.shape)\n",
"del co_gdf\n",
"del out"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3928,11 +3897,10 @@
"id": "i0AXhfqVbVsm",
"outputId": "8271f469-a73f-48e3-e1a9-3077026ab8ec"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Mon Dec 25 06:24:52 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -3977,21 +3945,22 @@
"Wall time: 6.13 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
- " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=3)])\n",
- "! nvidia-smi\n",
+ " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=2)])\n",
+ "#! nvidia-smi\n",
"print(out._nodes.shape, out._edges.shape)\n",
"del co_gdf\n",
"del out"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -3999,11 +3968,10 @@
"id": "Hid0-iPKhpOd",
"outputId": "ecaeb534-d4d7-48fa-d4e1-c80b22626afe"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Mon Dec 25 06:25:25 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4048,21 +4016,22 @@
"Wall time: 6.37 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
- " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=4)])\n",
- "! nvidia-smi\n",
+ " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=3)])\n",
+ "#! nvidia-smi\n",
"print(out._nodes.shape, out._edges.shape)\n",
"del co_gdf\n",
"del out"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4070,11 +4039,10 @@
"id": "buutj-ZjhrEe",
"outputId": "ae11addd-6bea-44e9-81c0-b431e1db8089"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Mon Dec 25 06:26:04 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4119,21 +4087,22 @@
"Wall time: 9.84 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
- " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=5)])\n",
- "! nvidia-smi\n",
+ " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=4)])\n",
+ "#! nvidia-smi\n",
"print(out._nodes.shape, out._edges.shape)\n",
"del co_gdf\n",
"del out"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4141,11 +4110,10 @@
"id": "bK4C9Ly0hso-",
"outputId": "8a9a32ab-03e2-42b4-8b71-2bcf797b31b1"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Mon Dec 25 06:27:18 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4190,10 +4158,26 @@
"Wall time: 39.2 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "#! nvidia-smi\n",
+ "for i in range(10):\n",
+ " out = co_gdf.chain([ n({'id': 1}), e_forward(hops=5)])\n",
+ "#! nvidia-smi\n",
+ "print(out._nodes.shape, out._edges.shape)\n",
+ "del co_gdf\n",
+ "del out"
]
},
{
"cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "qrga-la0hwhh"
+ },
+ "outputs": [],
"source": [
"%%time\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
@@ -4201,18 +4185,11 @@
"print(out.shape)\n",
"del co_gdf\n",
"del out"
- ],
- "metadata": {
- "id": "qrga-la0hwhh"
- },
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
- "source": [
- "!lscpu\n"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4220,11 +4197,10 @@
"id": "eiXFImxF-rzw",
"outputId": "b807cc3d-ed1a-4bef-c6e0-bfc2df7356ff"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Architecture: x86_64\n",
" CPU op-mode(s): 32-bit, 64-bit\n",
@@ -4276,13 +4252,14 @@
" Tsx async abort: Vulnerable\n"
]
}
+ ],
+ "source": [
+ "#!lscpu\n"
]
},
{
"cell_type": "code",
- "source": [
- "!free -h\n"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4290,36 +4267,24 @@
"id": "wJohLi58-sN5",
"outputId": "c3e144f6-c19a-4c68-e867-f5e7fa2e9df4"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
" total used free shared buff/cache available\n",
"Mem: 12Gi 717Mi 8.0Gi 1.0Mi 3.9Gi 11Gi\n",
"Swap: 0B 0B 0B\n"
]
}
+ ],
+ "source": [
+ "#!free -h\n"
]
},
{
"cell_type": "code",
- "source": [
- "%%time\n",
- "start_nodes = pd.DataFrame({'id': [1]})\n",
- "! nvidia-smi\n",
- "for i in range(1):\n",
- " g2 = co_g.hop(\n",
- " nodes=start_nodes,\n",
- " direction='forward',\n",
- " hops=1)\n",
- "! nvidia-smi\n",
- "print(g2._nodes.shape, g2._edges.shape)\n",
- "#del start_nodes\n",
- "#del co_gdf\n",
- "#del g2"
- ],
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4327,11 +4292,10 @@
"id": "zak4Inhco5il",
"outputId": "30bcf2bc-853e-4e5e-8c57-ba0cd9429554"
},
- "execution_count": null,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 01:01:43 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4354,38 +4318,37 @@
"+---------------------------------------------------------------------------------------+\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
- "start_nodes = cudf.DataFrame({'id': [1]})\n",
- "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
- "for i in range(10):\n",
- " g2 = co_gdf.hop(\n",
+ "start_nodes = pd.DataFrame({'id': [1]})\n",
+ "#! nvidia-smi\n",
+ "for i in range(1):\n",
+ " g2 = co_g.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
" hops=1)\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"print(g2._nodes.shape, g2._edges.shape)\n",
- "del start_nodes\n",
- "del co_gdf\n",
- "del g2"
- ],
+ "#del start_nodes\n",
+ "#del co_gdf\n",
+ "#del g2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
"metadata": {
- "id": "-SmFlCBS_Bgx",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "-SmFlCBS_Bgx",
"outputId": "d2326cf7-3ea6-4f99-9548-f2e98ece59a4"
},
- "execution_count": 16,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:45 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4430,26 +4393,27 @@
"Wall time: 1.84 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"start_nodes = cudf.DataFrame({'id': [1]})\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
" g2 = co_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=2)\n",
- "! nvidia-smi\n",
+ " hops=1)\n",
+ "#! nvidia-smi\n",
"print(g2._nodes.shape, g2._edges.shape)\n",
"del start_nodes\n",
"del co_gdf\n",
"del g2"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4457,11 +4421,10 @@
"id": "fjjt3YnYnabv",
"outputId": "05762f50-bfe1-4d23-9153-31431418c8e5"
},
- "execution_count": 17,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:47 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4506,26 +4469,27 @@
"Wall time: 2.51 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"start_nodes = cudf.DataFrame({'id': [1]})\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
" g2 = co_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=3)\n",
- "! nvidia-smi\n",
+ " hops=2)\n",
+ "#! nvidia-smi\n",
"print(g2._nodes.shape, g2._edges.shape)\n",
"del start_nodes\n",
"del co_gdf\n",
"del g2"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4533,11 +4497,10 @@
"id": "oIouuORgnbcY",
"outputId": "f07abe4c-5137-4ee3-935a-afbb2c5eaa1e"
},
- "execution_count": 18,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:50 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4582,26 +4545,27 @@
"Wall time: 3.25 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"start_nodes = cudf.DataFrame({'id': [1]})\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
" g2 = co_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=4)\n",
- "! nvidia-smi\n",
+ " hops=3)\n",
+ "#! nvidia-smi\n",
"print(g2._nodes.shape, g2._edges.shape)\n",
"del start_nodes\n",
"del co_gdf\n",
"del g2"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4609,11 +4573,10 @@
"id": "oNLZGjwInc85",
"outputId": "534097cf-4022-48cc-9419-a00c135f69e1"
},
- "execution_count": 19,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:53 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4658,26 +4621,27 @@
"Wall time: 5.02 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"start_nodes = cudf.DataFrame({'id': [1]})\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
" g2 = co_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=5)\n",
- "! nvidia-smi\n",
+ " hops=4)\n",
+ "#! nvidia-smi\n",
"print(g2._nodes.shape, g2._edges.shape)\n",
"del start_nodes\n",
"del co_gdf\n",
"del g2"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4685,11 +4649,10 @@
"id": "ePqaeujMneX8",
"outputId": "ffd88fff-016e-4ac0-ecb9-fa06baca60f8"
},
- "execution_count": 20,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:56:58 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4734,26 +4697,27 @@
"Wall time: 12 s\n"
]
}
- ]
- },
- {
- "cell_type": "code",
+ ],
"source": [
"%%time\n",
"start_nodes = cudf.DataFrame({'id': [1]})\n",
"co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
- "! nvidia-smi\n",
+ "#! nvidia-smi\n",
"for i in range(10):\n",
" g2 = co_gdf.hop(\n",
" nodes=start_nodes,\n",
" direction='forward',\n",
- " hops=6)\n",
- "! nvidia-smi\n",
+ " hops=5)\n",
+ "#! nvidia-smi\n",
"print(g2._nodes.shape, g2._edges.shape)\n",
"del start_nodes\n",
"del co_gdf\n",
"del g2"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -4761,11 +4725,10 @@
"id": "PTBkoIVHnfzK",
"outputId": "5615ecd7-47ea-46ab-fd36-13bce4b3c787"
},
- "execution_count": 21,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Tue Dec 26 00:57:10 2023 \n",
"+---------------------------------------------------------------------------------------+\n",
@@ -4810,16 +4773,48 @@
"Wall time: 28.2 s\n"
]
}
+ ],
+ "source": [
+ "%%time\n",
+ "start_nodes = cudf.DataFrame({'id': [1]})\n",
+ "co_gdf = co_g.nodes(lambda g: cudf.DataFrame(g._nodes)).edges(lambda g: cudf.DataFrame(g._edges))\n",
+ "#! nvidia-smi\n",
+ "for i in range(10):\n",
+ " g2 = co_gdf.hop(\n",
+ " nodes=start_nodes,\n",
+ " direction='forward',\n",
+ " hops=6)\n",
+ "#! nvidia-smi\n",
+ "print(g2._nodes.shape, g2._edges.shape)\n",
+ "del start_nodes\n",
+ "del co_gdf\n",
+ "del g2"
]
},
{
"cell_type": "code",
- "source": [],
+ "execution_count": null,
"metadata": {
"id": "Ygc2nrkznlCu"
},
- "execution_count": null,
- "outputs": []
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
}
- ]
-}
\ No newline at end of file
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb b/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb
index e79c81cc6..3cb5b0dd6 100644
--- a/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb
+++ b/demos/more_examples/graphistry_features/embed/simple-ssh-logs-rgcn-anomaly-detector.ipynb
@@ -12,10 +12,8 @@
"* Unsupervised graph neural network: RGCN\n",
"* Runs on both CPU + GPU: Toggle `is_gpu`\n",
"\n",
- "See also:\n",
- "* Other pygraphistry[ai] gnn notebooks for more advanced modes like incorporating node features\n",
- "* Intro to RGCNs - [intro-story.ipynb](intro-story.md)\n",
- "* In-depth RGCN - [advanced-identity-protection-40m.ipynb](advanced-identity-protection-40m.ipynb)\n"
+ "For background, see the RGCN intro: [intro-story.ipynb](../../../talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/intro-story.ipynb)\n",
+ "\n"
]
},
{
@@ -353,189 +351,10 @@
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "dbf488b3-2a98-4c19-aa4f-4aef63943412",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-12-02T20:52:04.712354Z",
- "iopub.status.busy": "2022-12-02T20:52:04.712254Z",
- "iopub.status.idle": "2022-12-02T20:52:34.396563Z",
- "shell.execute_reply": "2022-12-02T20:52:34.396305Z",
- "shell.execute_reply.started": "2022-12-02T20:52:04.712343Z"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Preprocessing embedding data\n",
- "--Splitting data\n",
- "--num_nodes: 97, num_relationships: 20\n",
- "Training embedding\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 1, loss: 0.4165, score: 0.0000%: 0%| | 0/10 [00:03, ?it/s]/home/graphistry/.local/lib/python3.8/site-packages/graphistry/embed_utils.py:459: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
- " emb = torch.tensor(self._embeddings)\n",
- "epoch: 2, loss: 0.4542, score: 84.8845%: 10%|█ | 1/10 [00:03<00:28, 3.11s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 3, loss: 0.3594, score: 88.4535%: 20%|██ | 2/10 [00:06<00:24, 3.02s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 4, loss: 0.3348, score: 82.7852%: 30%|███ | 3/10 [00:09<00:21, 3.00s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 5, loss: 0.3213, score: 82.3653%: 40%|████ | 4/10 [00:12<00:17, 2.98s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 6, loss: 0.3318, score: 84.2547%: 50%|█████ | 5/10 [00:15<00:14, 2.93s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 7, loss: 0.2965, score: 80.7558%: 60%|██████ | 6/10 [00:17<00:11, 2.94s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 8, loss: 0.2893, score: 74.9475%: 70%|███████ | 7/10 [00:20<00:08, 2.96s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 9, loss: 0.2932, score: 70.3289%: 80%|████████ | 8/10 [00:24<00:05, 2.98s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 10, loss: 0.3061, score: 65.8502%: 90%|█████████ | 9/10 [00:26<00:02, 2.96s/it]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "epoch: 10, loss: 0.2931, score: 67.88%: 100%|██████████| 10/10 [00:29<00:00, 2.97s/it] "
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating...\n",
- "--took 0.00 minutes to evaluate\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
+ "execution_count": null,
+ "id": "018a64cf",
+ "metadata": {},
+ "outputs": [],
"source": [
"g2 = g.embed( # rerun until happy with quality\n",
" device=dev0,\n",
@@ -841,8 +660,9 @@
"source": [
"## Next steps\n",
"\n",
- "- RGCN intro: [intro-story.ipynb](../../talks/infosec_jupyterthon2022/intro-story.ipynb)\n",
- "- In-depth RGCN: [advanced-identity-protection-40m.ipynb](../../talks/infosec_jupyterthon2022/advanced-identity-protection-40m.ipynb)\n",
+ "- RGCN intro: [intro-story.ipynb](../../../talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/intro-story.ipynb)\n",
+ "- In-depth RGCN: [advanced-identity-protection-40m.ipynb](../../../talks/infosec_jupyterthon2022/rgcn_login_anomaly_detection/advanced-identity-protection-40m.ipynb\n",
+ ")\n",
"- UMAP demo for 97% alert volume reduction & alert correlation\n",
"- [PyGraphistry](http://github.com/graphistry/pygraphistryhttp://github.com/graphistry/pygraphistry) (py, oss) + [Graphistry Hub](https://hub.graphistry.com/https://hub.graphistry.com/) (free)\n",
" - Dashboarding with [graph-app-kit (containerized, gpu, graph Streamlit)](https://github.com/graphistry/graph-app-kithttps://github.com/graphistry/graph-app-kit)\n",
@@ -874,7 +694,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.9"
+ "version": "3.8.10"
},
"vscode": {
"interpreter": {
diff --git a/demos/more_examples/graphistry_features/encodings-badges.ipynb b/demos/more_examples/graphistry_features/encodings-badges.ipynb
index f334b6264..449262ec4 100644
--- a/demos/more_examples/graphistry_features/encodings-badges.ipynb
+++ b/demos/more_examples/graphistry_features/encodings-badges.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Badge encodings tutorial\n",
+ "# Badge encodings tutorial\n",
"\n",
"See the examples below for common ways to map data to node badges in Graphistry. Icons appear in the main area of a node, while badges circle them (`TopRight`, `BottomLeft`, `Right`, etc.). They can be used together.\n",
"\n",
@@ -17,7 +17,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Setup\n",
+ "### Setup\n",
"\n",
"Mode `api=3` is recommended. It is required for `complex_encodings` (ex: `.encode_point_size(...)`). Mode `api=1` works with the simpler `.bind(point_size='col_a')` form."
]
diff --git a/demos/more_examples/graphistry_features/encodings-colors.ipynb b/demos/more_examples/graphistry_features/encodings-colors.ipynb
index 193770d7c..fcf0d6b93 100644
--- a/demos/more_examples/graphistry_features/encodings-colors.ipynb
+++ b/demos/more_examples/graphistry_features/encodings-colors.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Color encodings tutorial\n",
+ "# Color encodings tutorial\n",
"\n",
"See the examples below for common ways to map data to node/edge color in Graphistry.\n",
"\n",
diff --git a/demos/more_examples/graphistry_features/encodings-icons.ipynb b/demos/more_examples/graphistry_features/encodings-icons.ipynb
index fe99afd4d..07a1872df 100644
--- a/demos/more_examples/graphistry_features/encodings-icons.ipynb
+++ b/demos/more_examples/graphistry_features/encodings-icons.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Icons encodings tutorial\n",
+ "# Icons encodings tutorial\n",
"\n",
"See the examples below for common ways to map data to node icon in Graphistry.\n",
"\n",
diff --git a/demos/more_examples/graphistry_features/encodings-sizes.ipynb b/demos/more_examples/graphistry_features/encodings-sizes.ipynb
index 8ccefd501..16fa5832e 100644
--- a/demos/more_examples/graphistry_features/encodings-sizes.ipynb
+++ b/demos/more_examples/graphistry_features/encodings-sizes.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Size encodings tutorial\n",
+ "# Size encodings tutorial\n",
"\n",
"See the examples below for common ways to map data to node size in Graphistry.\n",
"\n",
diff --git a/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb b/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb
index 98cc207cd..98063ac1e 100644
--- a/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb
+++ b/demos/more_examples/graphistry_features/hop_and_chain_graph_pattern_mining.ipynb
@@ -1,23 +1,12 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
"cells": [
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "GZxoiU8sQDk_"
+ },
"source": [
- "# Hop-and-chain: PyGraphistry Cypher-style graph pattern matching on dataframes\n",
+ "# GFQL: Hop-and-chain - PyGraphistry Cypher-style graph pattern matching on dataframes\n",
"\n",
"PyGraphistry supports a rich subset of the popular Cypher graph query language, which you can run on dataframes without needing to install a database nor native libraries. It is natively integrated with dataframes and thus has a Python-native syntax rather than the traditional string syntax.\n",
"\n",
@@ -35,7 +24,7 @@
"\n",
"---\n",
"\n",
- "# Tutorial:\n",
+ "**Tutorial**\n",
"\n",
"1. Install & configure\n",
"1. Load & enrich a US congress twitter interaction dataset\n",
@@ -44,42 +33,44 @@
"1. Advanced filter predicates\n",
"1. Result labeling\n",
"\n"
- ],
- "metadata": {
- "id": "GZxoiU8sQDk_"
- }
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "# 1. Install & configure"
- ],
"metadata": {
"id": "QQpsrtwBT7sa"
- }
+ },
+ "source": [
+ "## 1. Install & configure"
+ ]
},
{
"cell_type": "code",
- "source": [
- "#! pip install graphistry[igraph]"
- ],
+ "execution_count": null,
"metadata": {
"id": "cYjRbgkU9Sx8"
},
- "execution_count": null,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "# ! pip install graphistry[igraph]"
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "## Imports"
- ],
"metadata": {
"id": "Ff6Tt9DhkePl"
- }
+ },
+ "source": [
+ "### Imports"
+ ]
},
{
"cell_type": "code",
+ "execution_count": 141,
+ "metadata": {
+ "id": "S5_y0CbLkjft"
+ },
+ "outputs": [],
"source": [
"import pandas as pd\n",
"\n",
@@ -93,66 +84,57 @@
" # attribute predicates\n",
" is_in, ge, startswith, contains, match as match_re\n",
")"
- ],
- "metadata": {
- "id": "S5_y0CbLkjft"
- },
- "execution_count": 141,
- "outputs": []
+ ]
},
{
"cell_type": "code",
- "source": [
- "graphistry.register(api=3, username='...', password='...')"
- ],
+ "execution_count": null,
"metadata": {
"id": "GQ83i-sKUaw9"
},
- "execution_count": null,
- "outputs": []
+ "outputs": [],
+ "source": [
+ "graphistry.register(api=3, username='...', password='...')"
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "# 2. Load & enrich a US congress twitter interaction dataset"
- ],
"metadata": {
"id": "eU9SyauNUHtR"
- }
+ },
+ "source": [
+ "## 2. Load & enrich a US congress twitter interaction dataset"
+ ]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "AM9JhnaQkRd3"
+ },
"source": [
- "## Data\n",
+ "### Data\n",
"\n",
"* Download\n",
"* Turn json into a Pandas edges dataframe\n",
"* Turn edges dataframe into a PyGraphistry graph\n",
"* Enrich nodes and edges with some useful graph metrics\n",
"* Visualize full graph to test"
- ],
- "metadata": {
- "id": "AM9JhnaQkRd3"
- }
+ ]
},
{
"cell_type": "code",
- "source": [
- "! wget -q https://snap.stanford.edu/data/congress_network.zip\n",
- "! unzip congress_network.zip\n"
- ],
+ "execution_count": 9,
"metadata": {
- "id": "55xeNAyDXhAm",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "55xeNAyDXhAm",
"outputId": "287758f0-0df2-49ff-ecdc-283313f7e07a"
},
- "execution_count": 9,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"total 1.2M\n",
"drwxr-xr-x 1 root root 4.0K Dec 4 03:56 .\n",
@@ -168,29 +150,15 @@
"-rw-r--r-- 1 root root 299K May 9 2017 vertex2aid\n"
]
}
+ ],
+ "source": [
+ "# ! wget -q https://snap.stanford.edu/data/congress_network.zip\n",
+ "# ! unzip congress_network.zip\n"
]
},
{
"cell_type": "code",
- "source": [
- "import json\n",
- "\n",
- "with open('congress_network/congress_network_data.json', 'r') as file:\n",
- " data = json.load(file)\n",
- "\n",
- "edges = []\n",
- "for i, name in enumerate(data[0]['usernameList']):\n",
- " for ii, j in enumerate(data[0]['outList'][i]):\n",
- " edges.append({\n",
- " 'from': name,\n",
- " 'to': data[0]['usernameList'][j],\n",
- " 'weight': data[0]['outWeight'][i][ii]\n",
- " })\n",
- "edges_df = pd.DataFrame(edges)\n",
- "\n",
- "print(edges_df.shape)\n",
- "edges_df.sample(5)"
- ],
+ "execution_count": 40,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -199,26 +167,16 @@
"id": "6CmULn4N-8oh",
"outputId": "61a1a4cf-dfe1-4260-a427-46009f4e4aaf"
},
- "execution_count": 40,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"(13289, 3)\n"
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " from to weight\n",
- "11112 RepBobbyRush janschakowsky 0.034364\n",
- "3836 RepCori Ilhan 0.015936\n",
- "5282 RepTedDeutch RepDWStweets 0.003268\n",
- "12352 BennieGThompson RepStricklandWA 0.006849\n",
- "9358 RepCarolMiller RepTroyNehls 0.005291"
- ],
"text/html": [
"\n",
" \n",
@@ -487,55 +445,57 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " from to weight\n",
+ "11112 RepBobbyRush janschakowsky 0.034364\n",
+ "3836 RepCori Ilhan 0.015936\n",
+ "5282 RepTedDeutch RepDWStweets 0.003268\n",
+ "12352 BennieGThompson RepStricklandWA 0.006849\n",
+ "9358 RepCarolMiller RepTroyNehls 0.005291"
]
},
+ "execution_count": 40,
"metadata": {},
- "execution_count": 40
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "import json\n",
+ "\n",
+ "with open('congress_network/congress_network_data.json', 'r') as file:\n",
+ " data = json.load(file)\n",
+ "\n",
+ "edges = []\n",
+ "for i, name in enumerate(data[0]['usernameList']):\n",
+ " for ii, j in enumerate(data[0]['outList'][i]):\n",
+ " edges.append({\n",
+ " 'from': name,\n",
+ " 'to': data[0]['usernameList'][j],\n",
+ " 'weight': data[0]['outWeight'][i][ii]\n",
+ " })\n",
+ "edges_df = pd.DataFrame(edges)\n",
+ "\n",
+ "print(edges_df.shape)\n",
+ "edges_df.sample(5)"
]
},
{
"cell_type": "markdown",
+ "metadata": {
+ "id": "XLFTgDTEDSeA"
+ },
"source": [
- "## Load dataframe as a PyGraphistry graph\n",
+ "### Load dataframe as a PyGraphistry graph\n",
"\n",
"Turn into a graph and precompute some useful graph metrics\n",
"\n",
"Recall that a `g` object, underneath, is essentially just two dataframes, `g._edges` and `g._nodes`, and with many useful graph methods:"
- ],
- "metadata": {
- "id": "XLFTgDTEDSeA"
- }
+ ]
},
{
"cell_type": "code",
- "source": [
- "# Shape\n",
- "g = graphistry.edges(edges_df, 'from', 'to')\n",
- "\n",
- "# Enrich & style\n",
- "# Tip: Switch from compute_igraph to compute_cugraph when GPUs are available\n",
- "g2 = (g\n",
- " .materialize_nodes()\n",
- " .nodes(lambda g: g._nodes.assign(title=g._nodes.id))\n",
- " .edges(lambda g: g._edges.assign(weight2=g._edges.weight))\n",
- " .bind(point_title='title')\n",
- " .compute_igraph('community_infomap')\n",
- " .compute_igraph('pagerank')\n",
- " .get_degrees()\n",
- " .encode_point_color(\n",
- " 'community_infomap',\n",
- " as_categorical=True,\n",
- " categorical_mapping={\n",
- " 0: '#32a9a2', # vibrant teal\n",
- " 1: '#ff6b6b', # soft coral\n",
- " 2: '#f9d342', # muted yellow\n",
- " }\n",
- " )\n",
- ")\n",
- "\n",
- "g2._nodes"
- ],
+ "execution_count": 77,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -544,47 +504,16 @@
"id": "aB1U7e0HXmHh",
"outputId": "53b9fa91-0caf-4866-c5a9-d9cf80e3c9ac"
},
- "execution_count": 77,
"outputs": [
{
- "output_type": "stream",
"name": "stderr",
+ "output_type": "stream",
"text": [
"WARNING:root:edge index g._edge not set so using edge index as ID; set g._edge via g.edges(), or change merge_if_existing to FalseWARNING:root:edge index g._edge __edge_index__ missing as attribute in ig; using ig edge order for IDsWARNING:root:edge index g._edge not set so using edge index as ID; set g._edge via g.edges(), or change merge_if_existing to FalseWARNING:root:edge index g._edge __edge_index__ missing as attribute in ig; using ig edge order for IDs"
]
},
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " id title community_infomap pagerank degree_in \\\n",
- "0 SenatorBaldwin SenatorBaldwin 0 0.001422 26 \n",
- "1 SenJohnBarrasso SenJohnBarrasso 0 0.001179 22 \n",
- "2 SenatorBennet SenatorBennet 0 0.001995 33 \n",
- "3 MarshaBlackburn MarshaBlackburn 0 0.001331 18 \n",
- "4 SenBlumenthal SenBlumenthal 0 0.001672 30 \n",
- ".. ... ... ... ... ... \n",
- "470 RepJoeWilson RepJoeWilson 1 0.001780 21 \n",
- "471 RobWittman RobWittman 1 0.001017 13 \n",
- "472 rep_stevewomack rep_stevewomack 1 0.002637 35 \n",
- "473 RepJohnYarmuth RepJohnYarmuth 2 0.000555 5 \n",
- "474 RepLeeZeldin RepLeeZeldin 1 0.000511 3 \n",
- "\n",
- " degree_out degree \n",
- "0 20 46 \n",
- "1 19 41 \n",
- "2 22 55 \n",
- "3 38 56 \n",
- "4 35 65 \n",
- ".. ... ... \n",
- "470 38 59 \n",
- "471 19 32 \n",
- "472 19 54 \n",
- "473 20 25 \n",
- "474 25 28 \n",
- "\n",
- "[475 rows x 7 columns]"
- ],
"text/html": [
"\n",
" \n",
@@ -938,18 +867,73 @@
"
\n",
" \n",
" \n"
+ ],
+ "text/plain": [
+ " id title community_infomap pagerank degree_in \\\n",
+ "0 SenatorBaldwin SenatorBaldwin 0 0.001422 26 \n",
+ "1 SenJohnBarrasso SenJohnBarrasso 0 0.001179 22 \n",
+ "2 SenatorBennet SenatorBennet 0 0.001995 33 \n",
+ "3 MarshaBlackburn MarshaBlackburn 0 0.001331 18 \n",
+ "4 SenBlumenthal SenBlumenthal 0 0.001672 30 \n",
+ ".. ... ... ... ... ... \n",
+ "470 RepJoeWilson RepJoeWilson 1 0.001780 21 \n",
+ "471 RobWittman RobWittman 1 0.001017 13 \n",
+ "472 rep_stevewomack rep_stevewomack 1 0.002637 35 \n",
+ "473 RepJohnYarmuth RepJohnYarmuth 2 0.000555 5 \n",
+ "474 RepLeeZeldin RepLeeZeldin 1 0.000511 3 \n",
+ "\n",
+ " degree_out degree \n",
+ "0 20 46 \n",
+ "1 19 41 \n",
+ "2 22 55 \n",
+ "3 38 56 \n",
+ "4 35 65 \n",
+ ".. ... ... \n",
+ "470 38 59 \n",
+ "471 19 32 \n",
+ "472 19 54 \n",
+ "473 20 25 \n",
+ "474 25 28 \n",
+ "\n",
+ "[475 rows x 7 columns]"
]
},
+ "execution_count": 77,
"metadata": {},
- "execution_count": 77
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "# Shape\n",
+ "g = graphistry.edges(edges_df, 'from', 'to')\n",
+ "\n",
+ "# Enrich & style\n",
+ "# Tip: Switch from compute_igraph to compute_cugraph when GPUs are available\n",
+ "g2 = (g\n",
+ " .materialize_nodes()\n",
+ " .nodes(lambda g: g._nodes.assign(title=g._nodes.id))\n",
+ " .edges(lambda g: g._edges.assign(weight2=g._edges.weight))\n",
+ " .bind(point_title='title')\n",
+ " .compute_igraph('community_infomap')\n",
+ " .compute_igraph('pagerank')\n",
+ " .get_degrees()\n",
+ " .encode_point_color(\n",
+ " 'community_infomap',\n",
+ " as_categorical=True,\n",
+ " categorical_mapping={\n",
+ " 0: '#32a9a2', # vibrant teal\n",
+ " 1: '#ff6b6b', # soft coral\n",
+ " 2: '#f9d342', # muted yellow\n",
+ " }\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "g2._nodes"
]
},
{
"cell_type": "code",
- "source": [
- "g2.plot()"
- ],
+ "execution_count": 79,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -958,14 +942,9 @@
"id": "GY9Q7KyqBMq8",
"outputId": "5b4b277e-17fd-4201-9518-25168b927c6f"
},
- "execution_count": 79,
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- ""
- ],
"text/html": [
"\n",
"