Release 1.0.0rc1

jiadiwu · Apr 22, 2020 · 625a99b · 625a99b
2 parents 55a4ba3 + 682513a
commit 625a99b
Show file tree

Hide file tree

Showing 218 changed files with 12,563 additions and 3,942 deletions.
diff --git a/.buildkite/docker-compose.yml b/.buildkite/docker-compose.yml
@@ -7,13 +7,16 @@ services:
       dockerfile: docker/stellargraph-ci-runner/Dockerfile
       args:
         PYTHON_VERSION: "3.6"
+        # this can be set at the build level to have the whole build use pre-release versions of
+        # dependencies
+        PRERELEASE_VERSIONS: &prerelease "${PRERELEASE_VERSIONS:-0}"
     working_dir: /workdir
-    volumes:
+    volumes: &runner-volumes
       - $PWD:/workdir
       # We need to manually bring in buildkite-agent
       # (https://github.com/buildkite-plugins/docker-compose-buildkite-plugin/issues/158)
       - ${BUILDKITE_AGENT_BINARY_PATH:-/usr/bin/buildkite-agent}:/usr/bin/buildkite-agent
-    environment:
+    environment: &runner-environment
       - PYTHONDONTWRITEBYTECODE=1
       # make sure that no new uses of the legacy constructor are added (see also: test in
       # test_graph.py, filterwarnings in pytest.ini)
@@ -37,18 +40,33 @@ services:
       - BUILDKITE_COMMIT
       - BUILDKITE_TAG
       - CODECOV_TOKEN
+      - NEO4J_VERSION
     depends_on:
-      - neo4j
+      - ${RUNNER_DEPENDS_ON:-empty-service}
 
   runner-3_7:
     <<: *runner-config
     build:
       <<: *runner-build
       args:
         PYTHON_VERSION: "3.7"
+        PRERELEASE_VERSIONS: *prerelease
+
+  # a service that does nothing to use as a default depends_on for the runners
+  empty-service:
+    image: alpine:3.10
 
   neo4j:
-    build: docker/stellargraph-neo4j
+    build:
+      context: docker/stellargraph-neo4j
+      args:
+        - NEO4J_VERSION
     environment:
       # this is running entirely locally on a CI machine, so authentication is unnecessary
       NEO4J_AUTH: "none"
+
+  conda:
+    image: continuumio/anaconda3
+    working_dir: /workdir
+    volumes: *runner-volumes
+    environment: *runner-environment
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -83,7 +83,7 @@ steps:
     <<: *timeout
     key: "test-notebooks"
     depends_on: "runner-3_6"
-    parallelism: 43
+    parallelism: 45
     command: ".buildkite/steps/test-demo-notebooks.sh"
     plugins:
       <<: *plugins
@@ -95,9 +95,9 @@ steps:
     soft_fail:
       - exit_status: 2
 
-  - label: ":python::book: test neo4j notebooks"
+  - &test-neo4j-notebooks
+    label: ":python::book: test neo4j notebooks against neo4j 4.0"
     <<: *timeout
-    key: "test-neo4j-notebooks"
     depends_on: "runner-3_6"
     command: ".buildkite/steps/test-neo4j-notebooks.sh"
     plugins:
@@ -107,9 +107,18 @@ steps:
         run: runner-3_6
         env:
           - STELLARGRAPH_NEO4J_HOST=neo4j
+    env:
+      RUNNER_DEPENDS_ON: neo4j
+      NEO4J_VERSION: "4.0"
     agents:
       queue: "t2large"
 
+  - <<: *test-neo4j-notebooks
+    label: ":python::book: test neo4j notebooks against neo4j 3.5"
+    env:
+      RUNNER_DEPENDS_ON: neo4j
+      NEO4J_VERSION: "3.5"
+
   - label: ":python-black: format"
     <<: *timeout
     plugins:
@@ -188,6 +197,26 @@ steps:
     <<: *timeout
     command: "scripts/whitespace.sh --ci"
 
+  - label: ":book: check demo table"
+    command: "python scripts/demo_table.py --action=compare"
+    <<: *timeout
+    plugins:
+      <<: *plugins
+      docker#v3.5.0:
+        image: "python:3.6"
+        propagate_environment: true
+
+  - label: ":snake: conda build"
+    <<: *timeout
+    command: ".buildkite/steps/conda-build.sh"
+    plugins:
+      <<: *plugins
+      docker-compose#v3.2.0:
+        <<: *compose-config
+        run: conda
+    agents:
+      queue: "t2medium"
+
   - label: ":docker: build image"
     <<: *timeout
     plugins:

diff --git a/.buildkite/steps/conda-build.sh b/.buildkite/steps/conda-build.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+set -xeo pipefail
+
+echo "+++ :snake: :construction_worker: conda build"
+conda build . --no-anaconda-upload
+
+echo "+++ :snake::buildkite: upload package"
+conda_package="$(conda build . --output)"
+buildkite-agent artifact upload "${conda_package}"
diff --git a/.buildkite/steps/test-demo-notebooks.sh b/.buildkite/steps/test-demo-notebooks.sh
@@ -34,17 +34,16 @@ f=${NOTEBOOKS[$INDEX]}
 case $(basename "$f") in
   'attacks_clustering_analysis.ipynb' | 'hateful-twitters-interpretability.ipynb' | 'hateful-twitters.ipynb' | 'stellargraph-attri2vec-DBLP.ipynb' | \
     'node-link-importance-demo-gat.ipynb' | 'node-link-importance-demo-gcn.ipynb' | 'node-link-importance-demo-gcn-sparse.ipynb' | 'rgcn-aifb-node-classification-example.ipynb' | \
-    'stellargraph-metapath2vec.ipynb')
+    'stellargraph-metapath2vec.ipynb' | 'gcn-lstm-LA.ipynb')
     # These notebooks do not yet work on CI:
     # FIXME #818: datasets can't be downloaded
     # FIXME #819: out-of-memory
-    # FIXME #849: CI does not have neo4j
-    # FIXME #907: socialcomputing.asu.edu is down
+    # FIXME #1303: METR_LA dataset can't be downloaded automatically
     echo "+++ :python: :skull_and_crossbones: skipping $f"
     exit 2 # this will be a soft-fail for buildkite
     ;;
 
-  'directed-graphsage-on-cora-neo4j-example.ipynb' | 'undirected-graphsage-on-cora-neo4j-example.ipynb' | 'load-cora-into-neo4j.ipynb')
+  'loading-saving-neo4j.ipynb' | 'directed-graphsage-on-cora-neo4j-example.ipynb' | 'undirected-graphsage-on-cora-neo4j-example.ipynb' | 'load-cora-into-neo4j.ipynb')
     # these are tested separately (see test-neo4j-notebooks.sh)
     echo "+++ :python: skipping Neo4j notebook $f"
     exit 0

diff --git a/.buildkite/steps/test-neo4j-notebooks.sh b/.buildkite/steps/test-neo4j-notebooks.sh
@@ -13,8 +13,13 @@ echo "--- listing dependency versions"
 pip freeze
 
 directory="$PWD/demos/connector/neo4j"
-notebooks=('load-cora-into-neo4j.ipynb' 'directed-graphsage-on-cora-neo4j-example.ipynb' 'undirected-graphsage-on-cora-neo4j-example.ipynb')
+notebooks=(
+  "../../basics/loading-saving-neo4j.ipynb"
+  "load-cora-into-neo4j.ipynb"
+  "directed-graphsage-on-cora-neo4j-example.ipynb"
+  "undirected-graphsage-on-cora-neo4j-example.ipynb"
+)
 
 for name in "${notebooks[@]}"; do
-  .buildkite/steps/test-single-notebook.sh "$directory/$name"
+  .buildkite/steps/test-single-notebook.sh "$directory/$name" " using Neo4j ${NEO4J_VERSION}"
 done
diff --git a/.buildkite/steps/test-single-notebook.sh b/.buildkite/steps/test-single-notebook.sh
@@ -4,6 +4,7 @@ set -xeo pipefail
 
 stellargraph_dir="$PWD"
 f="$1"
+extra_info="${2-}"
 
 echo "+++ :python: running $f"
 cd "$(dirname "$f")"
@@ -36,8 +37,8 @@ echo "This notebook can be viewed at <$url>"
 
 if [ "$exitCode" -ne 0 ]; then
   # the notebook failed, so let's flag that more obviously, with helpful links
-  buildkite-agent annotate --style "error" --context "$filename" << EOF
-Notebook \`$filename\` had an error: [failed job](#${BUILDKITE_JOB_ID}), [rendered notebook]($url)
+  buildkite-agent annotate --style "error" --context "$filename-${BUILDKITE_JOB_ID}" << EOF
+Notebook \`$filename\` had an error${extra_info}: [failed job](#${BUILDKITE_JOB_ID}), [rendered notebook]($url)
 EOF
 fi
 

diff --git a/.dockerignore b/.dockerignore
@@ -123,9 +123,7 @@ site
 .buildkite/
 .github/
 
-# docker configuration isn't needed inside the image
-docker/
 # buildkite creates a docker-compose.buildkite-...-override.yml file
-# that changes each build, that also needs to be ignored
+# that changes each build, that needs to be ignored
 docker-compose*.yml
 .dockerignore
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,73 @@
 # Change Log
 
+## [1.0.0rc1](https://github.com/stellargraph/stellargraph/tree/v1.0.0rc1)
+
+[Full Changelog](https://github.com/stellargraph/stellargraph/compare/v0.11.0...v1.0.0rc1)
+
+This is the first release candidate for StellarGraph 1.0. The 1.0 release will be the culmination of 2 years of active development, and this release candidate is the first milestone for that release.
+
+Jump in to this release, with the new demos and examples:
+
+- [More helpful indexing and guidance in demo READMEs](demos/)
+- [Loading from Neo4j][neo4j]
+- [More explanatory Node2Vec link prediction][n2v-lp]
+- [Unsupervised `GraphSAGE` and `HinSAGE` via `DeepGraphInfomax`][dgi]
+- [Graph classification with `GCNSupervisedGraphClassification`][gc]
+- [Time series prediction using spatial information, using `GraphConvolutionLSTM`][gcn-lstm] (experimental)
+
+[neo4j]: demos/basics/loading-saving-neo4j.ipynb
+[n2v-lp]: demos/link-prediction/random-walks/cora-lp-demo.ipynb
+[dgi]: demos/embeddings/deep-graph-infomax-cora.ipynb
+[gc]: demos/graph-classification/supervised-graph-classification.ipynb
+[gcn-lstm]: demos/spatio-temporal/gcn-lstm-LA.ipynb
+
+### Major features and improvements
+
+- Better demonstration notebooks and documentation to make the library more accessible to new and existing users:
+  - The [demos READMEs](demos/) now contain more guidance and explanation to make it easier to find a relevant example [\#1200](https://github.com/stellargraph/stellargraph/pull/1200)
+  - A [demo for loading data from Neo4j][neo4j] has been added [\#1184](https://github.com/stellargraph/stellargraph/pull/1184)
+  - The [demo for link prediction using Node2Vec][n2v-lp] has been rewritten to be clearer [\#1190](https://github.com/stellargraph/stellargraph/pull/1190)
+  - Notebooks are [now included in the API documentation](https://stellargraph.readthedocs.io/en/latest/demos/index.html), for more convenient access [\#1279](https://github.com/stellargraph/stellargraph/pull/1279)
+  - Notebooks now detect if they're being used with an incorrect version of the StellarGraph library, eliminating confusion about version mismatches [\#1242](https://github.com/stellargraph/stellargraph/pull/1242)
+- New algorithms:
+  - `GCNSupervisedGraphClassification`: supervised graph classification model based on Graph Convolutional layers (GCN) [\#929](https://github.com/stellargraph/stellargraph/issues/929), [demo][gc].
+- `DeepGraphInfomax` can be used to train almost any model in an unsupervised way, via the `corrupt_index_groups` parameter to `CorruptedGenerator` [\#1243](https://github.com/stellargraph/stellargraph/pull/1243), [demo][dgi]. Additionally, many algorithms provide defaults and so can be used with `DeepGraphInfomax` without specifying this parameter:
+  - any model using `FullBatchNodeGenerator`, including models supported in StellarGraph 0.11: `GCN`, `GAT`, `PPNP` and `APPNP`
+  - `GraphSAGE` [\#1162](https://github.com/stellargraph/stellargraph/pull/1162)
+  - `HinSAGE` for heterogeneous graphs with node features [\#1254](https://github.com/stellargraph/stellargraph/pull/1254)
+- `UnsupervisedSampler` supports a `walker` parameter to use other random walking algorithms such as `BiasedRandomWalk`, in addition to the default `UniformRandomWalk`. [\#1187](https://github.com/stellargraph/stellargraph/pull/1187)
+- The `StellarGraph` class is now smaller, faster and easier to construct:
+  - The `StellarGraph(..., edge_type_column=...)` parameter can be used to construct a heterogeneous graph from a single flat `DataFrame`, containing a column of the edge types [\#1284](https://github.com/stellargraph/stellargraph/pull/1284). This avoids the need to build separate `DataFrame`s for each type, and is significantly faster when there are many types. Using `edge_type_column` gives a 2.6× speedup for loading the `stellargraph.datasets.FB15k` dataset (with almost 600 thousand edges across 1345 types).
+  - `StellarGraph`'s internal cache of node adjacencies now uses the smallest integer type it can [\#1289](https://github.com/stellargraph/stellargraph/pull/1289). This reduces memory use by 31% on the `FB15k` dataset, and 36% on a reddit dataset (with 11.6 million edges).
+
+### Breaking changes
+
+- Edge weights are now validated to be numeric when creating a `StellarGraph`; previously edge weights could be any type, but all algorithms that use them would fail. [\#1191](https://github.com/stellargraph/stellargraph/pull/1191)
+- Full batch layers no longer support an "output indices" tensor to filter the output rows to a selected set of nodes [\#1204](https://github.com/stellargraph/stellargraph/pull/1204) (this does **not** affect models like `GCN`, only the layers within them: `APPNPPropagationLayer`, `ClusterGraphConvolution`, `GraphConvolution`, `GraphAttention`, `GraphAttentionSparse`, `PPNPPropagationLayer`, `RelationalGraphConvolution`). Migration: post-process the output using `tf.gather` manually or the new `sg.layer.misc.GatherIndices` layer.
+- `GraphConvolution` has been generalised to work with batch size > 1, subsuming the functionality of the now-deprecated `ClusterGraphConvolution` (and `GraphClassificationConvolution`) [\#1205](https://github.com/stellargraph/stellargraph/pull/1205). Migration: replace `stellargraph.layer.ClusterGraphConvolution` with `stellargraph.layer.GraphConvolution`.
+
+### Experimental features
+
+Some new algorithms and features are still under active development, and are available as an experimental preview. However, they may not be easy to use: their documentation or testing may be incomplete, and they may change dramatically from release to release. The experimental status is noted in the documentation and at runtime via prominent warnings.
+
+- `SortPooling` layer: the node pooling layer introduced in [Zhang et al](https://www.cse.wustl.edu/~muhan/papers/AAAI_2018_DGCNN.pdf) [\#1210](https://github.com/stellargraph/stellargraph/pull/1210)
+- `DeepGraphConvolutionalNeuralNetwork` (DGCNN): supervised graph classification using a stack of graph convolutional layers followed by `SortPooling`, and standard convolutional and pooling layers (such as `Conv1D` and `MaxPool1D`) [\#1212](https://github.com/stellargraph/stellargraph/pull/1212) [\#1265](https://github.com/stellargraph/stellargraph/pull/1265)
+- `GraphConvolutionLSTM`: time series prediction on spatio-temporal data, combining GCN with a [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) model to augment the conventional time-series model with information from nearby data points [\#1085](https://github.com/stellargraph/stellargraph/pull/1085), [demo][gcn-lstm]
+
+### Bug fixes and other changes
+
+- Random walk classes like `UniformRandomWalk` and `BiasedRandomWalk` can have their hyperparameters set on construction, in addition to in each call to `run` [\#1179](https://github.com/stellargraph/stellargraph/pull/1179)
+- Node feature sampling was made ~4× faster by ensuring a better data layout; this makes some configurations of `GraphSAGE` (and `HinSAGE`) noticeably faster [\#1225](https://github.com/stellargraph/stellargraph/pull/1225)
+- The `PROTEINS` dataset has been added to `stellargraph.datasets`, for graph classification [\#1282](https://github.com/stellargraph/stellargraph/pull/1282)
+- The `BlogCatalog3` dataset can now be successfully downloaded again [\#1283](https://github.com/stellargraph/stellargraph/pull/1283)
+- Knowledge graph model evaluation via `rank_edges_against_all_nodes` now defaults to the `random` strategy for breaking ties, and supports `top` (previous default) and `bottom` as alternatives [\#1223](https://github.com/stellargraph/stellargraph/pull/1223)
+- Creating a `RelationalFullBatchNodeGenerator` is now significantly faster and requires much less memory (18× speedup and 560× smaller for the `stellargraph.datasets.AIFB` dataset) [\#1274](https://github.com/stellargraph/stellargraph/pull/1274)
+- `StellarGraph.info` now shows a summary of the edge weights for each edge type [\#1240](https://github.com/stellargraph/stellargraph/pull/1240)
+- Various documentation, demo and error message fixes and improvements: [\#1141](https://github.com/stellargraph/stellargraph/pull/1141), [\#1219](https://github.com/stellargraph/stellargraph/pull/1219), [\#1246](https://github.com/stellargraph/stellargraph/pull/1246), [\#1260](https://github.com/stellargraph/stellargraph/pull/1260), [\#1266](https://github.com/stellargraph/stellargraph/pull/1266)
+- DevOps changes:
+  - CI: [\#1161](https://github.com/stellargraph/stellargraph/pull/1161), [\#1189](https://github.com/stellargraph/stellargraph/pull/1189), [\#1230](https://github.com/stellargraph/stellargraph/pull/1230), [\#1122](https://github.com/stellargraph/stellargraph/pull/1122)
+  - Other: [\#1197](https://github.com/stellargraph/stellargraph/pull/1197)
+
 ## [0.11.1](https://github.com/stellargraph/stellargraph/tree/v0.11.1)
 
 [Full Changelog](https://github.com/stellargraph/stellargraph/compare/v0.11.0...v0.11.1)

diff --git a/README.md b/README.md
@@ -64,7 +64,7 @@ The StellarGraph library offers state-of-the-art algorithms for [graph machine l
 - Link prediction;
 - [Interpretation of node classification](https://medium.com/stellargraph/https-medium-com-stellargraph-saliency-maps-for-graph-machine-learning-5cca536974da) [8].
 
-Graph-structured data represent entities as nodes (or vertices) and relationships between them as edges (or links), and can data include data associated with either as attributes. For example, a graph can contain people as nodes and friendships between them as links, with data like a person's age and the date a friendship was established. StellarGraph supports analysis of many kinds of graphs:
+Graph-structured data represent entities as nodes (or vertices) and relationships between them as edges (or links), and can include data associated with either as attributes. For example, a graph can contain people as nodes and friendships between them as links, with data like a person's age and the date a friendship was established. StellarGraph supports analysis of many kinds of graphs:
 
 - homogeneous (with nodes and links of one type),
 - heterogeneous (with more than one type of nodes and/or links)
@@ -184,6 +184,7 @@ The StellarGraph library currently includes the following algorithms for graph m
 | Deep Graph Infomax [15] | Deep Graph Infomax trains unsupervised GNNs to maximize the shared information between node level and graph level features. |
 | Continuous-Time Dynamic Network Embeddings (CTDNE) [16] | Supports time-respecting random walks which can be used in a similar way as in Node2Vec for unsupervised representation learning. |
 | DistMult [17] | The DistMult algorithm computes embeddings for nodes (entities) and edge types (relations) in knowledge graphs, and can use these for link prediction |
+| DGCNN [18] | The Deep Graph Convolutional Neural Network (DGCNN) algorithm for supervised graph classification. |
 
 ## Installation
 
@@ -298,3 +299,5 @@ International Conference on Machine Learning (ICML), 2019. ([link](https://arxiv
 16. Continuous-Time Dynamic Network Embeddings. Giang Hoang Nguyen, John Boaz Lee, Ryan A. Rossi, Nesreen K. Ahmed, Eunyee Koh, and Sungchul Kim. Proceedings of the 3rd International Workshop on Learning Representations for Big Networks (WWW BigNet) 2018. ([link](https://dl.acm.org/doi/10.1145/3184558.3191526))
 
 17. Embedding Entities and Relations for Learning and Inference in Knowledge Bases. Bishan Yang, Wen-tau Yih, Xiaodong He, Jianfeng Gao, and Li Deng, ICLR, 2015. arXiv:1412.6575 ([link](https://arxiv.org/pdf/1412.6575))
+
+18. An End-to-End Deep Learning Architecture for Graph Classification. Muhan Zhang, Zhicheng Cui, Marion Neumann, and Yixin Chen, AAAI, 2018. ([link](https://www.cse.wustl.edu/~muhan/papers/AAAI_2018_DGCNN.pdf))