From 30d80453595c7305329fbc1aede87be92258c187 Mon Sep 17 00:00:00 2001 From: Oliver Alvarado Rodriguez <41132909+alvaradoo@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:17:01 -0400 Subject: [PATCH] New Sample and Release (#116) * Updated tests * update tutorial * updated notebook * update sample notebook and version metadata --- README.md | 2 +- arachne/README.md | 2 +- arachne/arachne_sample.ipynb | 1224 ++++++++++++++++++++-- arachne/client/arachne/propgraphclass.py | 5 - arachne/client/setup.py | 2 +- 5 files changed, 1136 insertions(+), 99 deletions(-) diff --git a/README.md b/README.md index a4b7da0f..8a5b1373 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This is an external repository to build functionality for [Arkouda](https://gith ## Installing Prerequisites We recommend following the installation instructions provided by the Arkouda development team. Most specifically, follow the [Prerequisites](https://github.com/Bears-R-Us/arkouda?tab=readme-ov-file#prerequisites-toc) section in its entirety, and only the [Dependency Configuration](https://github.com/Bears-R-Us/arkouda/blob/master/pydoc/setup/BUILD.md#building-the-server) section of the build instructions. The installation steps usually involve the following: -1. Download [Arkouda](https://github.com/Bears-R-Us/arkouda). **Use Arkouda version v2024.03.18.** A specified version can be selected for download by clicking on `Releases` in the main repository for Arkouda. +1. Download [Arkouda](https://github.com/Bears-R-Us/arkouda). **Use Arkouda version v2024.04.19.** A specified version can be selected for download by clicking on `Releases` in the main repository for Arkouda. 2. Install dependencies with `Anaconda`. An environment containing all dependencies can be installed from `arkouda-env.yml` or `arkouda-env-dev.yml` within your Arkouda home directory. 3. Download and build [Chapel](https://chapel-lang.org/download.html). **Use Chapel versions 1.33.0.** 4. 
[Configure your dependencies](https://github.com/Bears-R-Us/arkouda/blob/master/pydoc/setup/BUILD.md#dependency-configuration). This involves creating (or modifying) the `Makefile.paths` within your Arkouda home directory. diff --git a/arachne/README.md b/arachne/README.md index f735c4c1..56d0b281 100644 --- a/arachne/README.md +++ b/arachne/README.md @@ -11,7 +11,7 @@ python3 module_configuration.py --ak_loc=/complete/path/to/arkouda/ --pkg_path=/ ``` ## Usage -To see an example on how to run and use Arachne, please use `arkouda-njit/arachne/arachne_sample.ipynb` to build a random property graph and run queries. This assumes that you have started an Arkouda server using `./arkouda_server` in the Arkouda home directory. +To see an example on how to run and use Arachne, please use `arkouda-njit/arachne/arachne_sample.ipynb`. This assumes that you have started an Arkouda server using `./arkouda_server` in the Arkouda home directory. ## Testing The Arachne tests are executed from the arkouda-njit/arachne directory by running the `pytest` command. 
diff --git a/arachne/arachne_sample.ipynb b/arachne/arachne_sample.ipynb index 34efd3a9..eb0e6f7c 100644 --- a/arachne/arachne_sample.ipynb +++ b/arachne/arachne_sample.ipynb @@ -11,21 +11,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " _ _ _ \n", + " / \\ _ __| | _____ _ _ __| | __ _ \n", + " / _ \\ | '__| |/ / _ \\| | | |/ _` |/ _` |\n", + " / ___ \\| | | < (_) | |_| | (_| | (_| |\n", + "/_/ \\_\\_| |_|\\_\\___/ \\__,_|\\__,_|\\__,_|\n", + " \n", + "\n", + "Client Version: v2024.04.19\n" + ] + } + ], "source": [ "import arkouda as ak\n", "import arachne as ar\n", + "import scipy as sp\n", "import networkx as nx\n", + "import matplotlib.pyplot as plt\n", "import os" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "connected to arkouda server tcp://*:5555\n" + ] + } + ], "source": [ "# NOTE: Make sure to change the server name to whatever is applicable in your environment. If running locally, then use only ak.connect().\n", "ak.connect(\"n116\", 5555)" @@ -37,12 +62,12 @@ "metadata": {}, "source": [ "### **Graph Generation and Loading**\n", - "Graphs can be built from existing data or generated with our suite of random graph generators. The preferred way to load a graph into memory is from Arkouda arrays, however we provide a method to read a graph in from matrix market format, or to generate them from existing Arkouda data." + "Graphs can be built from existing data or generated with our suite of random graph generators. The preferred way to load a graph into memory is from Arkouda arrays, however we provide a method to read a graph in from matrix market format or randomly generate some graphs." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "4b6a0f2f", "metadata": {}, "outputs": [], @@ -54,34 +79,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "b625a57b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated graph has 24212 vertices and 441463 edges\n", + "Generated graph has 100 vertices and 2640 edges\n", + "Generated graph has 100 vertices and 100 edges\n", + "Generated graph has 100 vertices and 2018 edges\n" + ] + } + ], "source": [ - "# Generate a random RMAT graph.\n", - "random_graph = ar.rmat(20, create_using=ar.Graph)" + "# Generate a random graph using any variety of random generators available.\n", + "n = 100\n", + "rmat_graph = ar.rmat(15, create_using=ar.Graph)\n", + "gnp_graph = ar.gnp(n, 0.75, create_using=ar.Graph)\n", + "rtree_graph = ar.random_tree(n, create_using=ar.Graph)\n", + "ws_graph = ar.watts_strogatz_graph(n, 25, 0.56, create_using=ar.Graph)\n", + "graph_list = [rmat_graph, gnp_graph, rtree_graph, ws_graph]\n", + "for g in graph_list:\n", + " print(f\"Generated graph has {len(g)} vertices and {g.size()} edges\")" ] }, { "cell_type": "code", - "execution_count": null, - "id": "78d1993d", + "execution_count": 5, + "id": "47d13bbc", "metadata": {}, "outputs": [], "source": [ - "n = 1_000\n", - "m = 1_000_000\n", - "k = 2" + "# Create a property graph from Arkouda dataframes, usually read in from HDF5, Parquet, or CSV files. For demonstrative purposes, we create some random dataset here.\n", + "n = 1_000 # Number of vertices.\n", + "m = 1_000_000 # Number of edges.\n", + "k = 2 # Value to cap the randomness at." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5d0712b5", "metadata": {}, "outputs": [], "source": [ + "# Create variously different random arrays of different types: integers, unsigned integers, floats, booleans, strings, and categoricals.\n", "src_array = ak.randint(0, n, m, dtype=ak.dtype('int64'), seed=2)\n", "dst_array = ak.randint(0, n, m, dtype=ak.dtype('int64'), seed=4)\n", "int_array = ak.randint(-1, k, m, dtype=ak.dtype('int64'), seed=6)\n", @@ -94,21 +139,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "9c9b38b7", "metadata": {}, "outputs": [], "source": [ + "# Initialize an empty graph object.\n", "prop_graph = ar.PropGraph()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "af0f2c50", "metadata": {}, "outputs": [], "source": [ + "# Create a dataframe with the edge data.\n", "test_edge_dict = {\n", " \"src\":src_array,\n", " \"dst\":dst_array,\n", @@ -124,72 +171,222 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "ca0bbe6e", - "metadata": {}, - "outputs": [], - "source": [ - "prop_graph.load_edge_attributes(test_edge_df, source_column=\"src\", destination_column=\"dst\", relationship_columns=[\"data5\", \"data1\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b8d7246", - "metadata": {}, - "outputs": [], - "source": [ - "prop_graph.get_edge_attributes()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c3ae3b", + "execution_count": 9, + "id": "0adda3be", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
srcdstdata1data2data3data4data5data6
0351671000.638154FalseBB
127871001.366654FalseLL
2828401010.887981TrueNN
3358706-101.219599False
4681898101.191131TrueQQ
...........................
999995619263101.663987False
999996927756-100.781876TrueII
99999763449101.614944False
999998938493100.226886True
999999465523010.014557True
\n", + "

1000000 rows x 8 columns

" + ], + "text/plain": [ + " src dst data1 data2 data3 data4 data5 data6\n", + "0 351 671 0 0 0.638154 False B B\n", + "1 278 71 0 0 1.366654 False L L\n", + "2 828 401 0 1 0.887981 True N N\n", + "3 358 706 -1 0 1.219599 False \n", + "4 681 898 1 0 1.191131 True Q Q\n", + "... ... ... ... ... ... ... ... ...\n", + "999995 619 263 1 0 1.663987 False \n", + "999996 927 756 -1 0 0.781876 True I I\n", + "999997 63 449 1 0 1.614944 False \n", + "999998 938 493 1 0 0.226886 True \n", + "999999 465 523 0 1 0.014557 True (1000000 rows x 8 columns)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "prop_graph.get_edge_relationships()" + "test_edge_df" ] }, { "cell_type": "code", - "execution_count": null, - "id": "c2703260", - "metadata": {}, - "outputs": [], - "source": [ - "print(type(prop_graph.edge_attributes[\"data5\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84834dbe", + "execution_count": 10, + "id": "ca0bbe6e", "metadata": {}, "outputs": [], "source": [ - "# prop_graph.add_edges_from(src_array,dst_array)" + "# Load in the edge attributes, with sorts the edges and handles storing their data.\n", + "prop_graph.load_edge_attributes(test_edge_df, source_column=\"src\", destination_column=\"dst\", relationship_columns=[\"data5\", \"data1\"])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "e94bf24e", "metadata": {}, "outputs": [], "source": [ + "# Create sizes for vertex information.\n", "m = len(prop_graph)\n", "k = 2" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "5dc37703", "metadata": {}, "outputs": [], "source": [ + "# Create data of different types for vertices.\n", "int_array = ak.randint(-1, k, m, dtype=ak.dtype('int64'), seed=6)\n", "uint_array = ak.randint(0, k, m, dtype=ak.dtype('uint64'), seed=8)\n", "real_array = ak.randint(0, k, m, dtype=ak.dtype('float64'), seed=10)\n", @@ -200,11 
+397,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "d849d4ce", "metadata": {}, "outputs": [], "source": [ + "# Create a dataframe with vertex data.\n", "test_node_dict = {\n", " \"nodes\":prop_graph.nodes(),\n", " \"data1\":int_array,\n", @@ -219,114 +417,958 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "56581839", + "execution_count": 14, + "id": "52c54e83", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodesdata1data2data3data4data5data6
00000.638154FalseBB
11001.366654FalseLL
22010.887981TrueNN
33-101.219599False
44101.191131TrueQQ
........................
995995110.253123False
996996-110.924143TrueBB
997997011.303909False
998998-101.732572True
999999-101.904719True
\n", + "

1000 rows x 7 columns

" + ], + "text/plain": [ + " nodes data1 data2 data3 data4 data5 data6\n", + "0 0 0 0 0.638154 False B B\n", + "1 1 0 0 1.366654 False L L\n", + "2 2 0 1 0.887981 True N N\n", + "3 3 -1 0 1.219599 False \n", + "4 4 1 0 1.191131 True Q Q\n", + ".. ... ... ... ... ... ... ...\n", + "995 995 1 1 0.253123 False \n", + "996 996 -1 1 0.924143 True B B\n", + "997 997 0 1 1.303909 False \n", + "998 998 -1 0 1.732572 True \n", + "999 999 -1 0 1.904719 True (1000 rows x 7 columns)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "prop_graph.load_node_attributes(test_node_df, node_column=\"nodes\", label_columns=[\"data5\", \"data2\"])" + "test_node_df" ] }, { "cell_type": "code", - "execution_count": null, - "id": "acdeada4", + "execution_count": 15, + "id": "56581839", "metadata": {}, "outputs": [], "source": [ - "prop_graph.get_node_attributes()" + "# Load in the vertex data.\n", + "prop_graph.load_node_attributes(test_node_df, node_column=\"nodes\", label_columns=[\"data5\", \"data2\"])" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "b193b125", + "cell_type": "markdown", + "id": "61f19115", "metadata": {}, - "outputs": [], "source": [ - "prop_graph.get_node_labels()" + "### **Graph Processing and Querying**\n", + "Treating the graphs as dataframes allows us to exploit Arkouda's array searches to generate subgraphs in seconds." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "2343803e", "metadata": {}, "outputs": [], "source": [ + "# Create filters for vertices.\n", "def node_filter(node_attributes):\n", " return node_attributes[\"data2\"] == 0" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "94b1b23a", "metadata": {}, "outputs": [], "source": [ + "# Create filters for edges.\n", "def edge_filter(edge_attributes):\n", " return edge_attributes[\"data1\"] > -1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "ce31b434", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Subgraph generated with edge size: 467134\n", + "Subgraph generated with edge size: 420930\n", + "Subgraph generated with edge size: 100891\n" + ] + } + ], "source": [ - "subgraph_nodes = prop_graph.subgraph_view(filter_node=node_filter)" + "# Create different subgraphs from each demo and together.\n", + "subgraph_nodes = prop_graph.subgraph_view(filter_node=node_filter)\n", + "print(f\"Subgraph generated with edge size: {subgraph_nodes.size()}\")\n", + "subgraph_edges = prop_graph.subgraph_view(filter_edge=edge_filter)\n", + "print(f\"Subgraph generated with edge size: {subgraph_edges.size()}\")\n", + "subgraph_together = prop_graph.subgraph_view(filter_node=node_filter, filter_edge=edge_filter)\n", + "print(f\"Subgraph generated with edge size: {subgraph_together.size()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "eded8d6c", + "metadata": {}, + "source": [ + "### **Graph Algorithms**\n", + "Let's now revisit the karate graph from above and do some analyses with Arachne and NetworkX together! First, let's start out by reading the matrix market file again, both with Arachne and NetworkX." 
] }, { "cell_type": "code", - "execution_count": null, - "id": "2461eb43", + "execution_count": 19, + "id": "61aa774c", "metadata": {}, "outputs": [], "source": [ - "subgraph_edges = prop_graph.subgraph_view(filter_edge=edge_filter)" + "# First, let's build the graph with Arachne.\n", + "filepath = os.path.abspath(\"data/karate.mtx\")\n", + "G = ar.read_matrix_market_file(filepath)\n", + "\n", + "edge_src, edge_dst = G.edges()\n", + "edge_src = edge_src.to_list()\n", + "edge_dst = edge_dst.to_list()\n", + "edge_list = []\n", + "for (u,v) in zip(edge_src,edge_dst):\n", + " edge_list.append((u,v))\n", + "nodes = G.nodes()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3623966a", + "execution_count": 20, + "id": "7378b0cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Let's make sure that both graphs have the same number of vertices and edges. Arachne graph has 34 vertices and 78 edges. NetworkX has 34 vertices and 78 edges.\n" + ] + } + ], + "source": [ + "# Secondly, let's build the graph with NetworkX. NOTE: Arachne has a direct read_matrix_market_file method whereas NetworkX requires you to use SciPy to read in the matrix market file.\n", + "fh = open(filepath, \"rb\")\n", + "H = nx.from_scipy_sparse_array(sp.io.mmread(fh))\n", + "print(f\"Let's make sure that both graphs have the same number of vertices and edges. Arachne graph has {len(G)} vertices and {G.size()} edges. NetworkX has {len(H)} vertices and {H.size()} edges.\")" + ] + }, + { + "cell_type": "markdown", + "id": "10dadb6a", + "metadata": {}, + "source": [ + "We are also able to display our graph with NetworkX methods by exporting our edgelist and building a graph from the edges stored by Arachne. **Note: Visualization is resource-intensive, so you can really only do it for small graphs. 
However, we perform these steps to show how Arachne can complement NetworkX and how graphs can be exchanged between them.**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "21ef43f7", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "nx_display = nx.Graph()\n", + "nx_display.add_edges_from(edge_list)\n", + "\n", + "pos = nx.kamada_kawai_layout(nx_display)\n", + "nx.draw_networkx(nx_display, pos, with_labels=True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f1cba34d", + "metadata": {}, + "source": [ + "Below we also see how we can extract a degree view for a graph to see the dispersion of degrees amongst the vertices. This also works for directed graphs by using the `G.in_degree()` and `G.out_degree()` methods." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "07067f46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[16 9 10 6 3 4 4 4 5 2 3 1 2 5 2 2 2 2 2 3 2 2 2 5 3 3 2 4 3 4 4 6 12 17]\n" + ] + } + ], + "source": [ + "degrees = G.degree()\n", + "print(degrees)" + ] + }, + { + "cell_type": "markdown", + "id": "94f7772e", + "metadata": {}, + "source": [ + "Using Arkouda arrays we can also extract the node with maximum degree." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c33d3fac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Node 34 has maximum degree of 17\n" + ] + } + ], + "source": [ + "print(f\"Node {G.nodes()[ak.argmax(degrees)]} has maximum degree of {ak.max(degrees)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3b71fed3", + "metadata": {}, + "source": [ + "### **Breadth-First Search**\n", + "\n", + "Given a graph $G$ (as defined above) and a source vertex $s$, breadth-first search (BFS) traverses the graph in a level-centric manner. It can return the tree inherently generated during BFS, the vertices found at each layer, or an ordering of nodes as predecessors or successors. 
For our implementation we opted to mimic NetworkX's `bfs_layers` function that returns the nodes and the layer they belong to.\n", + "\n", + "Our BFS method, written in Chapel, expands the next frontier in parallel on each locale, if it is run on a distributed cluster. Each locale iterates over the frontier, and if there are any local edges, it inspects the neighbors, and if not yet visited, updates the `depth` (layer) pdarray. The final `depth` array is returned which can be used to provide a different view of a graph." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "34d4c13b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 1 1 1 1 1 1 1 1 2 1 1 1 1 3 3 2 1 3 1 3 1 3 3 2 2 3 2 2 3 2 1 2 2]\n" + ] + } + ], + "source": [ + "d = ar.bfs_layers(G, 1)\n", + "print(d)" + ] + }, + { + "cell_type": "markdown", + "id": "1345792d", + "metadata": {}, + "source": [ + "We may also extract a histogram of the sizes of each level computed by BFS. This can be a useful way of seeing how big each group of vertices at a particular level is." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2bef7b00", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 16 9 8]\n" + ] + } + ], + "source": [ + "d_histogram = ak.histogram(d, bins=ak.max(d)+1)\n", + "print(d_histogram[0])" + ] + }, + { + "cell_type": "markdown", + "id": "0b8924b9", + "metadata": {}, + "source": [ + "From the above we can see that at level 0 we obviously only have one vertex. Then, we can see that at level 1 we have 16, level 2 we have 9, and level 3 we have 8. The maximum level is 3. What happens if we change the source vertex? Let's see the results of bfs_layers for each vertex." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "32bc98b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of breadth-first search layers for vertex 1: [1 16 9 8]\n", + "Size of breadth-first search layers for vertex 2: [1 9 13 11]\n", + "Size of breadth-first search layers for vertex 3: [1 10 20 3]\n", + "Size of breadth-first search layers for vertex 4: [1 6 16 11]\n", + "Size of breadth-first search layers for vertex 5: [1 3 14 8 8]\n", + "Size of breadth-first search layers for vertex 6: [1 4 13 8 8]\n", + "Size of breadth-first search layers for vertex 7: [1 4 13 8 8]\n", + "Size of breadth-first search layers for vertex 8: [1 4 17 11 1]\n", + "Size of breadth-first search layers for vertex 9: [1 5 25 3]\n", + "Size of breadth-first search layers for vertex 10: [1 2 20 10 1]\n", + "Size of breadth-first search layers for vertex 11: [1 3 14 8 8]\n", + "Size of breadth-first search layers for vertex 12: [1 1 15 9 8]\n", + "Size of breadth-first search layers for vertex 13: [1 2 14 9 8]\n", + "Size of breadth-first search layers for vertex 14: [1 5 25 3]\n", + "Size of breadth-first search layers for vertex 15: [1 2 16 6 8 1]\n", + "Size of breadth-first search layers for vertex 16: [1 2 16 6 8 1]\n", + "Size of breadth-first search layers for vertex 17: [1 2 3 12 8 8]\n", + "Size of breadth-first search layers for vertex 18: [1 2 15 8 8]\n", + "Size of breadth-first search layers for vertex 19: [1 2 16 6 8 1]\n", + "Size of breadth-first search layers for vertex 20: [1 3 27 3]\n", + "Size of breadth-first search layers for vertex 21: [1 2 16 6 8 1]\n", + "Size of breadth-first search layers for vertex 22: [1 2 15 8 8]\n", + "Size of breadth-first search layers for vertex 23: [1 2 16 6 8 1]\n", + "Size of breadth-first search layers for vertex 24: [1 5 15 4 8 1]\n", + "Size of breadth-first search layers for vertex 25: [1 3 6 23 1]\n", + "Size of breadth-first search layers for vertex 
26: [1 3 6 23 1]\n", + "Size of breadth-first search layers for vertex 27: [1 2 15 6 9 1]\n", + "Size of breadth-first search layers for vertex 28: [1 4 20 8 1]\n", + "Size of breadth-first search layers for vertex 29: [1 3 21 8 1]\n", + "Size of breadth-first search layers for vertex 30: [1 4 15 5 8 1]\n", + "Size of breadth-first search layers for vertex 31: [1 4 20 8 1]\n", + "Size of breadth-first search layers for vertex 32: [1 6 26 1]\n", + "Size of breadth-first search layers for vertex 33: [1 12 12 8 1]\n", + "Size of breadth-first search layers for vertex 34: [1 17 6 9 1]\n" + ] + } + ], + "source": [ + "for i in range(1,len(G)+1):\n", + " d = ar.bfs_layers(G, i)\n", + " d_histogram = ak.histogram(d, bins=ak.max(d)+1)\n", + " print(f\"Size of breadth-first search layers for vertex {i}: {d_histogram[0]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b13d9f2a", + "metadata": {}, + "source": [ + "From the output above we notice some main things:\n", + "1. Starting BFS from different sources arise to different number of layers for the output of each vertex.\n", + "2. Some nodes have the same number of vertices at each layer. \n", + "3. Vertex 19 provides the largest number of layers for any of the vertices. \n", + "\n", + "Let's color the nodes for node 19 and show the output." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c100625a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3 3 2 3 4 4 4 3 2 2 4 4 4 2 2 2 5 4 0 2 2 4 2 2 3 3 2 2 2 2 2 2 1 1]\n", + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 20, 22, 32, 31, 10, 28, 29, 33, 17, 34, 15, 16, 19, 21, 23, 24, 26, 30, 25, 27]\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "degrees = [16 9 10 6 3 4 4 4 5 2 3 1 2 5 2 2 2 2 2 3 2 2 2 5 3 3 2 4 3 4 4 6 12 17]\n" + ] + } + ], + "source": [ + "d = ar.bfs_layers(G, 19)\n", + "color_dict = {0:\"blue\", 1:\"green\", 2:\"red\", 3:\"purple\", 4:\"orange\", 5:\"yellow\"}\n", + "print(d)\n", + "\n", + "color_map = []\n", + "nodes = nx_display.nodes()\n", + "for u in nodes:\n", + " color_map.append(color_dict[d[u-1]])\n", + " \n", + "print(nx_display.nodes)\n", + "\n", + "pos = nx.kamada_kawai_layout(nx_display)\n", + "nx.draw_networkx(nx_display, pos, with_labels=True, node_color=color_map)\n", + "plt.show()\n", + "\n", + "print(f\"degrees = {degrees}\")" + ] + }, + { + "cell_type": "markdown", + "id": "891c38f1", + "metadata": {}, + "source": [ + "In the returned image we can see the first frontier are the green vertices $<0, 1, 33>$. If we look at the degree view of the graph we notice that 0, 1, and 33 are in the top 4 of vertices with most degree, which now makes sense why vertex 19 had the largest expanded frontier. " + ] + }, + { + "cell_type": "markdown", + "id": "056f8568", + "metadata": {}, + "source": [ + "### **Connected Components**\n", + "\n", + "Given a graph G, (as defined above) if there is a path from a vertex $u$ to every other vertex $v$ then the graph is said to be connected. If there is not a path, then the graph is said to be disconnected and composed of multiple connected components. There may be a large number of connected components of varying sizes in a graph.\n", + "\n", + "One of the most ancient manners of calculating connected components involve running BFS until every vertex has been visited, at every iteration looking for the node whose value `-1` and start BFS from it. These steps are repeated until all the vertices have been labeled. The other is using union-find to build a tree induced by each connected component. 
For our method we use a label propagation technique that sends the minimum vertex label to all the other vertices in a connected component.\n", + "\n", + "To run our connected components, you just have to call the `connected_components()` method. **We use one of the randomly generated graphs above for this example**." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ab22ea5c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/users/oaa9/arkouda-2024.04.19/arkouda/alignment.py:159: UserWarning: Duplicate terms present in search space. Only first instance of each query term will be reported.\n", + " warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0 0 ... 0 0 0]\n" + ] + } + ], + "source": [ + "cc = ar.connected_components(rmat_graph)\n", + "print(cc)" + ] + }, + { + "cell_type": "markdown", + "id": "4a9dfc0a", + "metadata": {}, + "source": [ + "We can use Arkouda methods to get the size of each component in our file. This will let us know which vertices will be included in the induced subgraph of the largest component." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "3affcb7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The largest component is labeled: 0\n", + "The smallest component is labeled: 571\n" + ] + } + ], + "source": [ + "g = ak.GroupBy(cc)\n", + "keys, count = g.count()\n", + "label_of_largest_component = ak.argmax(count)\n", + "label_of_smallest_component = ak.argmin(count)\n", + "print(f\"The largest component is labeled: {keys[label_of_largest_component]}\")\n", + "print(f\"The smallest component is labeled: {keys[label_of_smallest_component]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e26d4aab", + "metadata": {}, + "source": [ + "### **Triangle Counting**\n", + "One of the most basic community structures that can be found in graphs are triangles. 
Triangles are 3-cliques in an undirected graph which means three nodes are strongly connected to each other. Triangle counting can be used to detect how cohesive communities are, the more triangles there are, the better connected a community is. It can also be used to drive other graph analytical algorithms such as centrality measures (triangle centrality) and substructure detection (k-truss analytics).\n", + "\n", + "Our triangle counting method involves inspecting every edge in a graph and the adjacency lists of both endpoints to find the intersection point. Most algorithms perform list intersection, for our case we perform a binary search of every vertex in the smaller adjacency list into the bigger one. This allows us to efficiently find triangles with less work.\n", + "\n", + "Please note that you have to divide the total number of triangles found by 3." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "ef580d45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The whole graph has 45.0 triangles.\n" + ] + } + ], + "source": [ + "G_tri = ar.triangles(G)\n", + "print(f\"The whole graph has {G_tri/3} triangles.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a3119c65", + "metadata": {}, + "source": [ + "You can also pass an array of vertex names to return only the number of triangles those vertices belong to." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "922ec7c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vertex 1 has 21 triangles and vertex 19 has 1 triangles.\n" + ] + } + ], + "source": [ + "G_tris = ar.triangles(G, ak.array([1,19]))\n", + "print(f\"Vertex 1 has {G_tris[0]} triangles and vertex 19 has {G_tris[1]} triangles.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a7808a40", + "metadata": {}, + "source": [ + "### **Truss Analytics**\n", + "\n", + "Given a graph $G$ (as defined above), the $k$-truss of a graph is one where every edge is a part of at least $k-2$ triangles. This creates a cohesive subgraph where only the edges that meet this requirement are kept. It involves recalculating the number of triangles multiple times, which we avoid by tracking the support of each edge at every iteration of the algorithm. We provide novel algorithmic implementations for three truss analytical algorithms:\n", + "1. $k$-truss\n", + "2. max-truss\n", + "3. truss decomposition\n", + "\n", + "Below we run `k_truss` and display the edges that belong to at least 2 triangles. We could build an induced subgraph from these edges in the same manner as we showed above." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "453d1389", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-1 -1 -1 3 3 3 -1 3 3 3 3 -1 3 3 3 3 -1 -1 -1 -1 3 3 3 3 -1 -1 3 3 -1 3 3 3 -1 3 -1 3 3 3 3 3 3 -1 -1 -1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 -1 -1 -1 3 3 3 3 3 3 3 3 3 -1 -1 -1 -1 3 3 -1]\n" + ] + } + ], + "source": [ + "kt = ar.k_truss(G, 4)\n", + "print(kt)" + ] + }, + { + "cell_type": "markdown", + "id": "3f569d48", "metadata": {}, - "outputs": [], "source": [ - "subgraph_together = prop_graph.subgraph_view(filter_node=node_filter, filter_edge=edge_filter)" + "Here, we see that the maximum truss of this graph is 5. 
This means that the $k$-truss is no longer defined for any value of $k$ larger than 5." ] }, { "cell_type": "code", - "execution_count": null, - "id": "9a8f70ad", + "execution_count": 33, + "id": "d0cf789b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n" + ] + } + ], + "source": [ + "mt = ar.max_truss(G)\n", + "print(mt)" + ] + }, + { + "cell_type": "markdown", + "id": "5ebe66d5", + "metadata": {}, + "source": [ + "Lastly, the truss decomposition shows the trussness of every edge. This means that the largest value of k for every edge is the one presented in the edge index. " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "f9453589", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[5 5 5 3 3 3 5 3 3 2 3 5 3 3 3 2 5 5 5 5 3 3 3 2 5 5 3 2 5 2 2 3 5 3 5 3 3 3 3 3 3 4 4 4 2 2 3 3 3 3 3 3 2 3 3 3 3 2 3 4 4 4 3 2 3 3 3 3 3 3 3 4 4 4 4 3 3 4]\n" + ] + } + ], + "source": [ + "td = ar.truss_decomposition(G)\n", + "print(td)" + ] + }, + { + "cell_type": "markdown", + "id": "ada7b537", + "metadata": {}, + "source": [ + "Using similar coloring steps as above, we can actually color the edges of the graph. The steps for that are below." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "c6bfe6fe", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "color_dict = {5:\"red\", 4:\"purple\", 3:\"green\", 2:\"blue\"}\n", + "edge_color = []\n", + "\n", + "for i in range(G.size()):\n", + " edge_color.append(color_dict[td[i]])\n", + " \n", + "pos = nx.spring_layout(nx_display, seed=200)\n", + "nx.draw_networkx(nx_display, pos, with_labels=True, edge_color=edge_color)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "0f764afd", + "metadata": {}, + "source": [ + "### **Triangle Centrality**\n", + "\n", + "Triangle centrality is a centrality measure to decide how important a vertex is based off how many triangles surround a particular vertex. It is calculated based off the following formula as seen in the paper titled \"Triangle Centrality\" by Paul Burkhardt.\n", + "\n", + "$$TC(v) = \\frac{\\frac{1}{3} \\sum_{u \\in N_{\\Delta}^{+}(v)}{\\Delta(u)} + \\sum_{w \\in (N(v) \\backslash N_{\\Delta}(v))} {\\Delta (w)}}{\\Delta(G)}$$\n", + "\n", + "Where $N(v)$ is the neighborhood set of a vertex $v$, $N_{\\Delta}(v)$ is the set of neighbors that are in triangles with $v$, and $N_{\\Delta}^{+}$ is the closed set that includes $v$. There is not an equivalent metric of this form available in NetworkX." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "4aa9be7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "One of the most important vertices in the graph is: 14\n" + ] + } + ], + "source": [ + "tce = ar.triangle_centrality(G)\n", + "print(f\"One of the most important vertices in the graph is: {G.nodes()[ak.argmax(tce)]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "04f3f8ea", + "metadata": {}, + "source": [ + "### **Subgraph Isomorphism**\n", + "\n", + "The problem of motif finding in graphs is one of pattern matching with a smaller subgraph to search inside of a larger host graph. 
Here, we have an implementation of parallel VF2 that returns the subgraph mappings of vertices inside of the graph that contain the same structure as the given subgraph. Currently, our subgraph isomorphism method works only for property graphs. Further, it returns monomorphisms." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "7fd4d820", "metadata": {}, "outputs": [], "source": [ - "print(f\"Subgraph generated with edge size: {subgraph_nodes.size()}\")" + "G_directed = ar.PropGraph()\n", + "edges_to_add = G.edges()\n", + "G_directed.add_edges_from(edges_to_add[0], edges_to_add[1])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "3a608027", + "execution_count": 38, + "id": "999dfc01", "metadata": {}, "outputs": [], "source": [ - "print(f\"Subgraph generated with edge size: {subgraph_edges.size()}\")" + "subgraph = ar.PropGraph()\n", + "src = [0, 1, 2, 1]\n", + "dst = [1, 2, 0, 3]\n", + "subgraph.add_edges_from(ak.array(src), ak.array(dst))" ] }, { "cell_type": "code", - "execution_count": null, - "id": "ecc5c635", + "execution_count": 39, + "id": "36d90c88", "metadata": {}, "outputs": [], "source": [ - "print(f\"Subgraph generated with edge size: {subgraph_together.size()}\")" + "isos = ar.subgraph_isomorphism(G_directed, subgraph)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "db263405", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We found 1848.0 triangles with tails inside of the karate graph\n" + ] + } + ], + "source": [ + "print(f\"We found {len(isos)/4} triangles with tails inside of the karate graph\")" + ] + }, + { + "cell_type": "markdown", + "id": "57a27650", + "metadata": {}, + "source": [ + "### **Diameter**\n", + "\n", + "The diameter of a graph is the longest of the shortest paths between two vertices in a graph. Here, we show an experimental version that approximates the diameter with connected components."
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "60bb8a86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The diameter of karate is 4\n" + ] + } + ], + "source": [ + "di = ar.diameter(G)\n", + "print(f\"The diameter of karate is {di}\") " ] } ], diff --git a/arachne/client/arachne/propgraphclass.py b/arachne/client/arachne/propgraphclass.py index ee692e65..adc76a35 100644 --- a/arachne/client/arachne/propgraphclass.py +++ b/arachne/client/arachne/propgraphclass.py @@ -624,11 +624,6 @@ def subgraph_view(self, filter_node=no_filter, filter_edge=no_filter): nodes = nodes[filtered_nodes] - print(edges[0]) - print(edges[1]) - print(filtered_nodes) - print(filtered_edges) - src = edges[0][filtered_edges] dst = edges[1][filtered_edges] diff --git a/arachne/client/setup.py b/arachne/client/setup.py index b97b40f7..4aad61ad 100644 --- a/arachne/client/setup.py +++ b/arachne/client/setup.py @@ -10,7 +10,7 @@ setup( name="arachne", - version="2024.01.22", + version="2024.04.23", description="Graph functionality for Arkouda.", long_description=long_description, long_description_content_type="text/markdown",