Skip to content

Commit

Permalink
[Compatibility] Introduce monitor_synced script (MystenLabs#10547)
Browse files Browse the repository at this point in the history
## Description 

Introduces a script that can be run on a fullnode running from an
arbitrary checkpoint/epoch (e.g. genesis, or from a snapshot).

Given an end epoch, the script monitors the local syncing progress of the
fullnode towards that epoch and exits successfully once it is reached. If the
node stops making progress, in either checkpoint or epoch, the script fails.

If no end epoch is provided, the script retrieves the current epoch of the
specified network and targets that epoch.

The configurable end epoch allows us to use this for checking syncing
over a large period in a sharded manner across many machines, or to
monitor a single machine syncing from genesis to the current network
state, or anywhere in between.

## Test Plan 

1. Run fullnode locally
2. Run script and ensure that it correctly tracks the progress of the
fullnode, i.e. fails if stuck, updates if making progress, succeeds once
synced

```
williamsmith in ~/github/sui on monitor-synced-script λ scripts/monitor_synced.py --end-epoch=2 --env=testnet
Will attempt to sync to epoch 2
Current local epoch: 0
Locally highest executed checkpoint: 0
New highest executed checkpoint: 3169
New highest executed checkpoint: 7397
New local epoch: 1
New highest executed checkpoint: 12001
New highest executed checkpoint: 15575
New highest executed checkpoint: 20227
New highest executed checkpoint: 24864
New highest executed checkpoint: 29523
New highest executed checkpoint: 34325
New highest executed checkpoint: 38285
New highest executed checkpoint: 42996
New highest executed checkpoint: 47683
New highest executed checkpoint: 52405
New highest executed checkpoint: 57066
New highest executed checkpoint: 61712
New highest executed checkpoint: 66268
New highest executed checkpoint: 70302
New highest executed checkpoint: 74441
New local epoch: 2
New highest executed checkpoint: 79057
-------------------------------
Successfully synced to epoch 2 from epoch 0 (79057 checkpoints) in 3.01 minutes
```
---
If your changes are not user-facing and not a breaking change, you can
skip the following section. Otherwise, please indicate what changed, and
then add to the Release Notes section as highlighted during the release
process.

### Type of Change (Check all that apply)

- [ ] user-visible impact
- [ ] breaking change for client SDKs
- [ ] breaking change for FNs (FN binary must upgrade)
- [ ] breaking change for validators or node operators (must upgrade
binaries)
- [ ] breaking change for on-chain data layout
- [ ] necessitates either a data wipe or data migration

### Release notes
  • Loading branch information
williampsmith authored Apr 8, 2023
1 parent 83a81eb commit 70c4611
Showing 1 changed file with 138 additions and 0 deletions.
138 changes: 138 additions & 0 deletions scripts/monitor_synced.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
# Copyright (c) Mysten Labs, Inc.
# SPDX-License-Identifier: Apache-2.0

import json
import os
import sys
import subprocess
import getopt
from enum import Enum
import time
from datetime import datetime


# How many times each network RPC / local metrics query is attempted before
# the script gives up.
NUM_RETRIES = 5

# Thresholds (in seconds) reported when the checkpoint or the epoch stops
# advancing.
CHECKPOINT_STUCK_THRESHOLD_SEC = 10
EPOCH_STUCK_THRESHOLD_SEC = 600  # 10 minutes


class Metric(Enum):
    """Local node metrics this script tracks.

    Each member's value is the text used to pick the matching line out of
    the output of the node's local metrics endpoint.
    """

    CHECKPOINT = "last_executed_checkpoint"
    EPOCH = "current_epoch"


def get_current_network_epoch(env='testnet'):
    """Return the epoch reported by the network's explorer RPC endpoint.

    Issues a ``suix_getEpochs`` JSON-RPC request to
    ``https://explorer-rpc.{env}.sui.io/`` through ``curl`` and reads the
    ``epoch`` field of the first entry in ``result.data``. The request is
    attempted up to ``NUM_RETRIES`` times, pausing 3 seconds between attempts.

    Args:
        env: Network name interpolated into the RPC host name.

    Returns:
        The epoch as an ``int``.

    Raises:
        SystemExit: With status 1 when every attempt fails.
    """
    # NOTE(review): params [null, 1, true] presumably mean "no cursor, limit 1,
    # descending order", i.e. the most recent epoch -- confirm against the RPC docs.
    cmd = ['curl', '--location', '--request', 'POST', f'https://explorer-rpc.{env}.sui.io/',
           '--header', 'Content-Type: application/json', '--data-raw',
           '{"jsonrpc":"2.0", "method":"suix_getEpochs", "params":[null, 1, true], "id":1}']

    for attempt in range(NUM_RETRIES):
        if attempt:
            # Back off only between attempts, never after the final one.
            time.sleep(3)

        try:
            raw = subprocess.check_output(cmd, stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            # curl writes its diagnostics to stderr; stdout is normally empty.
            print(f'curl command failed with error {e.returncode}: {e.stderr}')
            continue

        try:
            return int(json.loads(raw)['result']['data'][0]['epoch'])
        except (KeyError, IndexError, TypeError, ValueError):
            # ValueError covers invalid JSON (JSONDecodeError) and a
            # non-integer epoch; TypeError covers null / unexpectedly shaped
            # payloads such as {"result": null}.
            print(f'suix_getEpochs rpc request failed: {raw}')

    print(f"Failed to get current network epoch after {NUM_RETRIES} tries")
    sys.exit(1)


def get_local_metric(metric: Metric):
    """Read an integer metric from the local node's metrics endpoint.

    Fetches ``http://localhost:9184/metrics`` with ``curl``, keeps the lines
    that contain ``metric.value`` and do not start with ``#`` or ``;``, and
    parses the second whitespace-separated field of the first such line.
    The query is attempted up to ``NUM_RETRIES`` times, pausing 3 seconds
    between attempts.

    Args:
        metric: Which metric to look up.

    Returns:
        The metric value as an ``int``.

    Raises:
        SystemExit: With status 1 when every attempt fails.
    """
    needle = metric.value.encode()

    for attempt in range(NUM_RETRIES):
        if attempt:
            # Back off only between attempts, never after the final one.
            time.sleep(3)

        # A single blocking call (instead of a curl | grep | grep pipeline of
        # Popen objects) ensures no child process or pipe is left unreaped.
        try:
            body = subprocess.check_output(
                ['curl', '-s', 'http://localhost:9184/metrics'],
                stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            print(f'curl command failed with error {e.returncode}: {e.stderr}')
            continue

        # NOTE(review): if several metric lines contain the same text, the
        # first one wins (as with the previous grep pipeline) -- confirm that
        # this is the intended series.
        samples = [
            line for line in body.splitlines()
            if needle in line and not line.startswith((b'#', b';'))
        ]

        try:
            return int(samples[0].split()[1])
        except (IndexError, ValueError):
            # IndexError: no matching line, or a line without a value field.
            # ValueError: the value field is not an integer.
            print(f'Failed to get local metric {metric.value}: {samples}')

    print(
        f"Failed to get local metric {metric.value} after {NUM_RETRIES} tries")
    sys.exit(1)


def main(argv):
    """Monitor the local fullnode until it reaches the target epoch.

    Recognized options are ``--end-epoch=N`` (target epoch; defaults to the
    current epoch of the selected network) and ``--env=NAME`` (network,
    default ``testnet``). The loop polls the local checkpoint and epoch
    metrics every ``CHECKPOINT_STUCK_THRESHOLD_SEC`` seconds.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: Status 0 once the local epoch reaches the target epoch;
            status 1 on invalid arguments, when the checkpoint did not
            advance between two polls, or when the epoch has not advanced
            for more than ``EPOCH_STUCK_THRESHOLD_SEC`` seconds.
    """
    env = 'testnet'
    end_epoch = None
    try:
        opts, args = getopt.getopt(argv, '', ["env=", "end-epoch="])
        if args:
            # Stray positional arguments are a usage error, not silently ignored.
            raise getopt.GetoptError(f'unexpected arguments: {args}')
        for opt, arg in opts:
            if opt == '--env':
                env = arg
            elif opt == '--end-epoch':
                end_epoch = int(arg)
    except (getopt.GetoptError, ValueError) as e:
        # Unknown option, missing option value, or non-integer epoch.
        print(f'Error: {e}')
        print(
            "Usage: monitor_synced.py [--end-epoch=END_EPOCH] [--env=ENVIRONMENT]")
        sys.exit(1)

    if end_epoch is None:
        end_epoch = get_current_network_epoch(env)
    print(f'Will attempt to sync to epoch {end_epoch}')

    current_epoch = get_local_metric(Metric.EPOCH)
    print(f'Current local epoch: {current_epoch}')
    start_epoch = current_epoch

    current_checkpoint = get_local_metric(Metric.CHECKPOINT)
    print(f'Locally highest executed checkpoint: {current_checkpoint}')
    start_checkpoint = current_checkpoint

    # Monotonic timestamps: immune to wall-clock adjustments while waiting.
    start_time = time.monotonic()
    last_epoch_change = start_time
    while current_epoch < end_epoch:
        # The poll interval doubles as the checkpoint stuck threshold, so the
        # duration reported below matches the time actually waited.
        time.sleep(CHECKPOINT_STUCK_THRESHOLD_SEC)
        new_checkpoint = get_local_metric(Metric.CHECKPOINT)

        if new_checkpoint == current_checkpoint:
            print(
                f'Checkpoint is stuck at {current_checkpoint} for over {CHECKPOINT_STUCK_THRESHOLD_SEC} seconds')
            sys.exit(1)
        current_checkpoint = new_checkpoint

        new_epoch = get_local_metric(Metric.EPOCH)
        if new_epoch > current_epoch:
            current_epoch = new_epoch
            print(f'New local epoch: {current_epoch}')
            last_epoch_change = time.monotonic()
        elif time.monotonic() - last_epoch_change > EPOCH_STUCK_THRESHOLD_SEC:
            # No epoch change for longer than the configured threshold.
            print(
                f'Epoch is stuck at {current_epoch} for over {EPOCH_STUCK_THRESHOLD_SEC} seconds')
            sys.exit(1)
        print(f'New highest executed checkpoint: {current_checkpoint}')

    elapsed_minutes = (time.monotonic() - start_time) / 60
    print('-------------------------------')
    print(f"Successfully synced to epoch {end_epoch} from epoch {start_epoch} ({current_checkpoint - start_checkpoint} checkpoints) in {elapsed_minutes:.2f} minutes")
    sys.exit(0)


if __name__ == "__main__":
    # Script entry point: hand over the command-line arguments, minus the
    # program name.
    cli_args = sys.argv[1:]
    main(cli_args)

0 comments on commit 70c4611

Please sign in to comment.