From e3f5c0005bfb56b8ed737382eca01784fd948588 Mon Sep 17 00:00:00 2001 From: Ashwin Sekar Date: Fri, 28 May 2021 12:53:11 -0700 Subject: [PATCH] Add partial network outage before test functionality (#17291) * Add partial network outage before testing functionality Allow a percentage of the stake to go offline before an automation test starts * code review fixes --- net/net.sh | 2 +- system-test/automation_utils.sh | 12 ++++++- .../gce-partition-with-offline.yml | 27 +++++++++++++++ system-test/testnet-automation.sh | 33 ++++++++++++++++--- 4 files changed, 68 insertions(+), 6 deletions(-) create mode 100755 system-test/partition-testcases/gce-partition-with-offline.yml diff --git a/net/net.sh b/net/net.sh index 3dbf78e6ece4d7..87930fd51e7fc7 100755 --- a/net/net.sh +++ b/net/net.sh @@ -705,7 +705,7 @@ stopNode() { declare pid=$! ln -sf "stop-validator-$ipAddress.log" "$netLogDir/stop-validator-$pid.log" if $block; then - wait $pid + wait $pid || true else pids+=("$pid") fi diff --git a/system-test/automation_utils.sh b/system-test/automation_utils.sh index e8b01713309478..4124f957925e5d 100755 --- a/system-test/automation_utils.sh +++ b/system-test/automation_utils.sh @@ -60,7 +60,7 @@ function analyze_packet_loss { ) } -function wait_for_bootstrap_validator_stake_drop { +function wait_for_max_stake { max_stake="$1" if [[ $max_stake -eq 100 ]]; then return @@ -74,6 +74,16 @@ function wait_for_bootstrap_validator_stake_drop { ssh "${sshOptions[@]}" "${validatorIpList[0]}" "RUST_LOG=info \$HOME/.cargo/bin/solana wait-for-max-stake $max_stake --url http://127.0.0.1:8899" } +function wait_for_equal_stake { + source "${REPO_ROOT}"/net/common.sh + loadConfigFile + + max_stake=$((100 / ${#validatorIpList[@]} + 1)) + execution_step "Waiting for max stake to fall below ${max_stake}%" + + wait_for_max_stake $max_stake +} + function get_slot { source "${REPO_ROOT}"/net/common.sh loadConfigFile diff --git a/system-test/partition-testcases/gce-partition-with-offline.yml b/system-test/partition-testcases/gce-partition-with-offline.yml new file mode 100755 index 00000000000000..85f18826911591 --- /dev/null +++ b/system-test/partition-testcases/gce-partition-with-offline.yml @@ -0,0 +1,27 @@ +steps: + - command: "system-test/testnet-automation.sh" + label: "GCE - CPU Only 5 Node - 20% network offline with 2 partitions" + env: + UPLOAD_RESULTS_TO_SLACK: "true" + CLOUD_PROVIDER: "gce" + TESTNET_TAG: "gce-perf-cpu-only" + NUMBER_OF_VALIDATOR_NODES: 4 + ENABLE_GPU: "false" + VALIDATOR_NODE_MACHINE_TYPE: "--machine-type n1-standard-16" + NUMBER_OF_CLIENT_NODES: 1 + CLIENT_OPTIONS: "bench-tps=1=--tx_count 10000 --thread-batch-sleep-ms 250" + TESTNET_ZONES: "us-west1-a" + USE_PUBLIC_IP_ADDRESSES: "true" + ADDITIONAL_FLAGS: "--dedicated" + APPLY_PARTITIONS: "true" + NETEM_CONFIG_FILE: "system-test/netem-configs/complete-loss-two-partitions" + PARTITION_ACTIVE_DURATION: 30 + PARTITION_INACTIVE_DURATION: 30 + PARTITION_ITERATION_COUNT: 5 + TEST_TYPE: "partition" + EXTRA_PRIMORDIAL_STAKES: 4 + WAIT_FOR_EQUAL_STAKE: "true" + WARMUP_SLOTS_BEFORE_TEST: 400 + NUMBER_OF_OFFLINE_NODES: 1 + agents: + - "queue=gce-deploy" diff --git a/system-test/testnet-automation.sh b/system-test/testnet-automation.sh index 3c12e266415760..7b2d865ac1bf12 100755 --- a/system-test/testnet-automation.sh +++ b/system-test/testnet-automation.sh @@ -142,8 +142,12 @@ function launch_testnet() { -c idle=$NUMBER_OF_CLIENT_NODES $maybeStartAllowBootFailures \ --gpu-mode $startGpuMode $maybeWarpSlot $maybeAsyncNodeInit $maybeExtraPrimordialStakes - execution_step "Waiting for bootstrap validator's stake to fall below ${BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD}%" - wait_for_bootstrap_validator_stake_drop "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD" + if [[ -n "$WAIT_FOR_EQUAL_STAKE" ]]; then + wait_for_equal_stake + else + execution_step "Waiting for bootstrap validator's stake to fall below ${BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD}%" + wait_for_max_stake "$BOOTSTRAP_VALIDATOR_MAX_STAKE_THRESHOLD" + fi if [[ $NUMBER_OF_CLIENT_NODES -gt 0 ]]; then execution_step "Starting ${NUMBER_OF_CLIENT_NODES} client nodes" @@ -153,6 +157,24 @@ function launch_testnet() { sleep 180 fi + if [[ -n "$WARMUP_SLOTS_BEFORE_TEST" ]]; then + # Allow the network to run for a bit before beginning the test + while [[ "$WARMUP_SLOTS_BEFORE_TEST" -gt $(get_slot) ]]; do + sleep 5 + done + fi + + # Stop the specified number of nodes + num_online_nodes=$(( NUMBER_OF_VALIDATOR_NODES + 1 )) + if [[ -n "$NUMBER_OF_OFFLINE_NODES" ]]; then + execution_step "Stopping $NUMBER_OF_OFFLINE_NODES nodes" + for (( i=NUMBER_OF_VALIDATOR_NODES; i>$(( NUMBER_OF_VALIDATOR_NODES - NUMBER_OF_OFFLINE_NODES )); i-- )); do + # shellcheck disable=SC2154 + "${REPO_ROOT}"/net/net.sh stopnode -i "${validatorIpList[$i]}" + done + num_online_nodes=$(( num_online_nodes - NUMBER_OF_OFFLINE_NODES )) + fi + SECONDS=0 START_SLOT=$(get_slot) SLOT_COUNT_START_SECONDS=$SECONDS @@ -170,11 +192,11 @@ function launch_testnet() { for (( i=1; i<=PARTITION_ITERATION_COUNT; i++ )); do execution_step "Partition Iteration $i of $PARTITION_ITERATION_COUNT" execution_step "Applying netem config $NETEM_CONFIG_FILE for $PARTITION_ACTIVE_DURATION seconds" - "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" + "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" -n $num_online_nodes sleep "$PARTITION_ACTIVE_DURATION" execution_step "Resolving partitions for $PARTITION_INACTIVE_DURATION seconds" - "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup + "${REPO_ROOT}"/net/net.sh netem --config-file "$NETEM_CONFIG_FILE" --netem-cmd cleanup -n $num_online_nodes sleep "$PARTITION_INACTIVE_DURATION" done STATS_FINISH_SECONDS=$SECONDS @@ -325,6 +347,9 @@ TEST_PARAMS_TO_DISPLAY=(CLOUD_PROVIDER \ ADDITIONAL_FLAGS \ APPLY_PARTITIONS \ NETEM_CONFIG_FILE \ + WAIT_FOR_EQUAL_STAKE \ + WARMUP_SLOTS_BEFORE_TEST \ + NUMBER_OF_OFFLINE_NODES \ PARTITION_ACTIVE_DURATION \ PARTITION_INACTIVE_DURATION \ PARTITION_ITERATION_COUNT \