Skip to content

Commit

Permalink
[Compatibility] Use snapshots to shard (MystenLabs#10931)
Browse files Browse the repository at this point in the history
## Description 

This PR uses a matrix to run across n hosts, with all but one hosts
restoring from snapshot. This should help with runtime and disk
utilization.

## Test Plan 

Temporarily set `pull_request` trigger to test in CI

---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] user-visible impact
- [ ] breaking change for a client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitate either a data wipe or data migration

### Release notes
  • Loading branch information
williampsmith authored Apr 15, 2023
1 parent 7370a21 commit 1065418
Show file tree
Hide file tree
Showing 4 changed files with 198 additions and 6 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/compatibility_checker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ concurrency:

on:
schedule:
- cron: "0 6 * * *" # UTC timing is every day at 10pm PST
- cron: "0 10 * * *" # UTC timing is every day at 2am PST
workflow_dispatch:
inputs:
verbose:
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
- name: Run sync script
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 720 ssh ubuntu@fullnode-compat-test-01 "cd ~/sui && echo "Running compatibility checker against binary built at commit hash ${{ steps.get-ci-tag.outputs.CI_COMMIT_HASH }}" && CARGO_TERM_COLOR=always ./scripts/fullnode-sync.sh -p /var/lib/sui/bin/sui-node -n testnet ${{ github.event.inputs.verbose == true && '-v' || '' }}"
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 720 ssh ubuntu@fullnode-compat-test-01 "cd ~/sui && echo "Running compatibility checker against binary built at commit hash ${{ steps.get-ci-tag.outputs.CI_COMMIT_HASH }}" && CARGO_TERM_COLOR=always ./scripts/compatibility/fullnode-sync.sh -p /var/lib/sui/bin/sui-node -n testnet ${{ github.event.inputs.verbose == true && '-v' || '' }}"
notify:
name: Notify
Expand Down Expand Up @@ -110,7 +110,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
SUI_SHA: ${{ env.sui_sha }}
GH_JOB_LINK: ${{ env.gh_job_link }}
with:
with: # TODO fix commit hash below, as we are reunning on last commit that passed CI
channel-id: "compatibility-test-nightly"
payload: |
{
Expand Down
192 changes: 192 additions & 0 deletions .github/workflows/compatibility_sharded.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
name: Compatibility Checker (Sharded)

concurrency:
group: ${{ github.workflow }}

on:
schedule:
- cron: "0 6 * * *" # UTC timing is every day at 10pm PST
workflow_dispatch:
inputs:
verbose:
type: boolean
required: false
description: "Run with verbose logging"
default: false
preserve-state:
type: boolean
required: false
description: "Preserve database state of previous run"
default: true

jobs:
genesis-sync-test-sharded:
strategy:
matrix:
shard_manifest:
[
{
hostname: "fullnode-compat-test-01",
start_epoch: 0,
end_epoch: 748,
},
{
hostname: "fullnode-compat-test-02",
start_epoch: 748,
end_epoch: 753,
},
{
hostname: "fullnode-compat-test-03",
start_epoch: 753,
end_epoch: -1,
},
]
timeout-minutes: 720
permissions:
# The "id-token: write" permission is required or Machine ID will not be
# able to authenticate with the cluster.
id-token: write
contents: read
runs-on: ubuntu-latest

steps:
- name: Install Teleport
uses: teleport-actions/setup@75c810ac63a7e9b28ca4795c6111619f2a13ac71 # pin@v1
with:
version: 11.3.1
- name: Authorize against Teleport
id: auth
uses: teleport-actions/auth@6e73aa03a98e781ca6692817fbb6e554640a56b3 # pin@v1
with:
# Specify the publically accessible address of your Teleport proxy.
proxy: proxy.mysten-int.com:443
# Specify the name of the join token for your bot.
token: fullnode-compat-test
# Specify the length of time that the generated credentials should be
# valid for. This is optional and defaults to "1h"
certificate-ttl: 12h

- uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Get latest CI-tagged binary commit hash
run: echo "CI_COMMIT_HASH=$(git for-each-ref --sort=creatordate refs/tags | grep -E 'sui_v(.*)_ci' | tail -1 | awk '{ print $1 }')" >> $GITHUB_OUTPUT
id: get-ci-tag

# Checkout the latest sui repo
- name: Checkout sui repo
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 10 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && source ~/.cargo/env && cd ~/sui && git fetch origin main && git rebase origin/main"
# Cargo clean and git restore on any left over files from git checkout
- name: Environment clean
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 5 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && source ~/.cargo/env && cd ~/sui && cargo clean && git restore ."
- name: Fetch sui-node binary
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 30 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && mkdir -p /var/lib/sui/bin && cd /var/lib/sui/bin && wget https://sui-releases.s3.us-east-1.amazonaws.com/${{ steps.get-ci-tag.outputs.CI_COMMIT_HASH }}/sui-node && chmod +x sui-node"
# Wipe database from previous runs
- name: Wipe SuiDB
if: ${{ github.event.inputs.preserve-state == false }}
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 5 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && sudo rm -rf /var/lib/sui/suidb || true"
- name: Setup DB structure
if: ${{ (matrix.shard_manifest.start_epoch != 0) && (github.event.inputs.preserve-state == false) }}
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 5 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && mkdir -p /var/lib/sui/suidb/epoch_${{ matrix.shard_manifest.start_epoch }} && cd /var/lib/sui/suidb/epoch_${{ matrix.shard_manifest.start_epoch }} && mkdir checkpoints && mkdir epochs && mkdir -p store/perpetual"
- name: Download epoch snapshot artifacts
if: ${{ (matrix.shard_manifest.start_epoch != 0) && (github.event.inputs.preserve-state == false) }}
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 60 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && cd /var/lib/sui/suidb/epoch_${{ matrix.shard_manifest.start_epoch }}/epochs && wget -i /var/lib/sui/s3_urls/epochs"
- name: Download checkpoint snapshot artifacts
if: ${{ (matrix.shard_manifest.start_epoch != 0) && (github.event.inputs.preserve-state == false) }}
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 60 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && cd /var/lib/sui/suidb/epoch_${{ matrix.shard_manifest.start_epoch }}/checkpoints && wget -i /var/lib/sui/s3_urls/checkpoints"
- name: Download perpetual snapshot artifacts
if: ${{ (matrix.shard_manifest.start_epoch != 0) && (github.event.inputs.preserve-state == false) }}
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 60 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && cd /var/lib/sui/suidb/epoch_${{ matrix.shard_manifest.start_epoch }}/store/perpetual && wget -i /var/lib/sui/s3_urls/store_perpetual"
- name: Mark snapshot as live
if: ${{ (matrix.shard_manifest.start_epoch != 0) && (github.event.inputs.preserve-state == false) }}
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 5 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && mv /var/lib/sui/suidb/epoch_${{ matrix.shard_manifest.start_epoch }} /var/lib/sui/suidb/live"
- name: Run sync script
run: |
tsh -i ${{ steps.auth.outputs.identity-file }} --ttl 720 ssh ubuntu@${{ matrix.shard_manifest.hostname }} "source ~/.bashrc && source ~/.cargo/env && cd ~/sui && echo "Running compatibility checker against binary built at commit hash ${{ steps.get-ci-tag.outputs.CI_COMMIT_HASH }}" && CARGO_TERM_COLOR=always ./scripts/compatibility/fullnode-sync.sh -p /var/lib/sui/bin/sui-node -n testnet -e ${{ matrix.shard_manifest.end_epoch }} ${{ github.event.inputs.verbose == true && '-v' || '' }}"
notify:
name: Notify
needs: [genesis-sync-test-sharded]
runs-on: ubuntu-latest
if: always() # always notify

steps:
- uses: technote-space/workflow-conclusion-action@v3

- name: Checkout sui repo main branch
uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # pin@v3

- name: Get sui commit
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
export sui_sha=$(git rev-parse HEAD)
echo "sui_sha=${sui_sha}" >> $GITHUB_ENV
- name: Get link to logs
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh_job_link=$(gh api -X GET 'repos/MystenLabs/sui/actions/runs/${{ github.run_id }}/jobs' --jq '.jobs.[0].html_url')
echo "gh_job_link=${gh_job_link}" >> $GITHUB_ENV
- name: Post to slack
uses: slackapi/slack-github-action@936158bbe252e9a6062e793ea4609642c966e302 # [email protected]
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
SUI_SHA: ${{ env.sui_sha }}
GH_JOB_LINK: ${{ env.gh_job_link }}
with: # TODO fix commit hash below, as we are reunning on last commit that passed CI
channel-id: "compatibility-test-nightly"
payload: |
{
"text": "*${{ github.workflow }}* workflow status: `${{ env.WORKFLOW_CONCLUSION }}`",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*${{ github.workflow }}* workflow status: `${{ env.WORKFLOW_CONCLUSION }}`"
}
},
{
"type": "divider"
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "Sui commit: <https://github.com/MystenLabs/sui/commit/${{ env.SUI_SHA }}|${{ env.SUI_SHA }}> \nRun: <${{ env.GH_JOB_LINK }}|${{ github.run_id }}>"
}
},
{
"type": "divider"
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "To debug failures: See run logs; Rerun with `verbose` logging enabled, `clean` disabled in order to sync from last place); `tsh ssh ubuntu@fullnode-compat-test-01` if needed. \nMetrics can be viewed at https://metrics.sui.io/d/_nu1mWC7zx/sui-fullnode?orgId=1&refresh=1m&var-Environment=mysten-metrics-internal&var-network=testnet&var-host=fullnode-compat-test-01"
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ while getopts "hvn:e:p:" OPT; do
>&2 echo " -n NETWORK The network to run the fullnode on."
>&2 echo " (Default: ${DEFAULT_NETWORK})"
>&2 echo " -e END_EPOCH EpochID at which to stop syncing and declare success."
>&2 echo " If unspecified, will use current epoch of NETWORK."
>&2 echo " If unspecified or -1, will use current epoch of NETWORK."
>&2 echo " -t EPOCH_TIMEOUT Number of minutes to wait until epoch advancement before timing out."
>&2 exit 0
;;
Expand Down Expand Up @@ -106,7 +106,7 @@ SUI_NODE_PID=$!

# start monitoring script
END_EPOCH_ARG=""
if [[ ! -z $END_EPOCH ]]; then
if [[ ! -z $END_EPOCH && $END_EPOCH != -1 ]]; then
END_EPOCH_ARG="--end-epoch $END_EPOCH"
fi

Expand All @@ -115,7 +115,7 @@ if [[ ! -z $EPOCH_TIMEOUT ]]; then
EPOCH_TIMEOUT_ARG="--epoch-timeout $EPOCH_TIMEOUT"
fi

./scripts/monitor_synced.py $END_EPOCH_ARG $EPOCH_TIMEOUT_ARG --env $NETWORK $VERBOSE
./scripts/compatibility/monitor_synced.py $END_EPOCH_ARG $EPOCH_TIMEOUT_ARG --env $NETWORK $VERBOSE

kill $SUI_NODE_PID
exit 0
Expand Down
File renamed without changes.

0 comments on commit 1065418

Please sign in to comment.