Skip to content

Commit

Permalink
Merge branch 'watchdog/gather-number-of-queries' into compatible
Browse files: browse the repository at this point in the history
  • Loading branch information
nholland94 committed May 8, 2021
2 parents f9ce4a9 + 49bf051 commit 5d1906a
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 13 deletions.
4 changes: 2 additions & 2 deletions automation/services/watchdog/make_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,8 @@ def add_resp(resp):
else:
print("Errored response: {}".format(error_str))
err_others += 1
except _:
print("Errored response: {}".format(error_str))
except:
print("Errored response: {}".format(p))
err_others += 1

print('\t%s valid responses from peers'%(str(len(list(peers)))))
Expand Down
31 changes: 22 additions & 9 deletions automation/services/watchdog/node_status_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def peer_to_multiaddr(peer):
peer['libp2p_port'],
peer['peer_id'] )

def collect_node_status_metrics(v1, namespace, nodes_synced_near_best_tip, nodes_synced, nodes_queried, nodes_responded, nodes_errored, context_deadline_exceeded, failed_security_protocol_negotiation, connection_refused_errors, size_limit_exceeded_errors, timed_out_errors, stream_reset_errors, other_connection_errors, prover_errors):
def collect_node_status_metrics(v1, namespace, nodes_synced_near_best_tip, nodes_synced, nodes_queried, nodes_responded, seed_nodes_queried, seed_nodes_responded, nodes_errored, context_deadline_exceeded, failed_security_protocol_negotiation, connection_refused_errors, size_limit_exceeded_errors, timed_out_errors, stream_reset_errors, other_connection_errors, prover_errors):
print('collecting node status metrics')

pods = v1.list_namespaced_pod(namespace, watch=False)
Expand All @@ -30,7 +30,7 @@ def collect_node_status_metrics(v1, namespace, nodes_synced_near_best_tip, nodes

seeds = [ p for p in pod_names if 'seed' in p ]

resp_count, valid_resps, error_resps = collect_node_status(v1, namespace, seeds, pods)
resp_count, valid_resps, error_resps = collect_node_status(v1, namespace, seeds, pods, seed_nodes_responded, seed_nodes_queried)

err_context_deadline = 0
err_negotiate_security_protocol = 0
Expand Down Expand Up @@ -149,10 +149,11 @@ def get_deepest_child(p):

# ========================================================================

def collect_node_status(v1, namespace, seeds, pods):
def collect_node_status(v1, namespace, seeds, pods, seed_nodes_responded, seed_nodes_queried):
peer_table = {}
error_resps = []
all_resps = []
peer_set = set()

def contains_error(resp):
try:
Expand All @@ -164,12 +165,16 @@ def contains_error(resp):
def no_error(resp):
return (not (contains_error(resp)))

def add_resp(raw):
def add_resp(raw, peers, seed, seed_node_responded, seed_node_queried):
resps = [ ast.literal_eval(s) for s in raw.split('\n') if s != '' ]

valid_resps = list(filter(no_error, resps))
error_resps.extend(list(filter(contains_error, resps)))
all_resps.extend(resps)
peer_set.update(set(peers))

seed_node_responded.labels(seed= seed).set(len(valid_resps))
seed_node_queried.labels(seed= seed).set(len(peers))

peer_resp_map = [ ((r['node_ip_addr'], r['node_peer_id']), r) for r in valid_resps ]

Expand All @@ -183,14 +188,22 @@ def add_resp(raw):
seed_vars_dict = [ v for v in seed_daemon_container['env'] ]
seed_daemon_port = [ v['value'] for v in seed_vars_dict if v['name'] == 'DAEMON_CLIENT_PORT'][0]

cmd = "mina advanced node-status -daemon-port " + seed_daemon_port + " -daemon-peers" + " -show-errors"
resp = util.exec_on_pod(v1, namespace, seed, 'coda', cmd)
try:
cmd = "mina advanced get-peers"
peers = util.exec_on_pod(v1, namespace, seed, 'coda', cmd).rstrip().split('\n')

cmd = "mina advanced node-status -daemon-port " + seed_daemon_port + " -peers " + ",".join(peers) + " -show-errors"
resp = util.exec_on_pod(v1, namespace, seed, 'coda', cmd)

if not 'Error: Unable to connect to Mina Daemon.' in resp:
add_resp(resp)
if not 'Error: Unable to connect to Mina Daemon.' in resp:
add_resp(resp, peers, seed, seed_nodes_responded, seed_nodes_queried)
except Exception as e:
print("failed to exec command on pod: {}".format(e))
continue


valid_resps = peer_table.values()

return (len(all_resps), valid_resps, error_resps)
return (len(peer_set), valid_resps, error_resps)

# ========================================================================
2 changes: 1 addition & 1 deletion automation/services/watchdog/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.4.10
0.4.11
4 changes: 3 additions & 1 deletion automation/services/watchdog/watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def main():
prover_errors = Counter('Coda_watchdog_prover_errors', 'Description of gauge')
pods_with_no_new_logs = Gauge('Coda_watchdog_pods_with_no_new_logs', 'Number of nodes whose latest log is older than 10 minutes')
nodes_queried=Gauge('Coda_watchdog_nodes_queried', 'Number of nodes that were queried for node-status')
seed_nodes_responded=Gauge('Coda_watchdog_nodes_responded_to_seed', 'Number of nodes that responded to the last status query on each seed', ['seed'])
seed_nodes_queried=Gauge('Coda_watchdog_nodes_queried_by_seed', 'Number of nodes that were queried for node-status on each seed', ['seed'])
context_deadline_exceeded=Gauge('Coda_watchdog_deadline_exceeded', 'Number of nodes that failed with the context-deadline-exceeded error to a node-status query')
failed_security_protocol_negotiation=Gauge('Coda_watchdog_failed_negotiation', 'Number of nodes that failed with the security-protocol-negotiation error to a node-status query')
connection_refused_errors=Gauge('Coda_watchdog_connection_refused', 'Number of nodes that failed with the connection-refused error to a node-status query')
Expand All @@ -47,7 +49,7 @@ def main():

fns = [
( lambda: metrics.collect_cluster_crashes(v1, namespace, cluster_crashes), 30*60 ),
( lambda: metrics.collect_node_status_metrics(v1, namespace, nodes_synced_near_best_tip, nodes_synced, nodes_queried, nodes_responded,nodes_errored, context_deadline_exceeded, failed_security_protocol_negotiation, connection_refused_errors,size_limit_exceeded_errors, timed_out_errors, stream_reset_errors, other_connection_errors, prover_errors), 10*60 ),
( lambda: metrics.collect_node_status_metrics(v1, namespace, nodes_synced_near_best_tip, nodes_synced, nodes_queried, nodes_responded, seed_nodes_queried, seed_nodes_responded, nodes_errored, context_deadline_exceeded, failed_security_protocol_negotiation, connection_refused_errors,size_limit_exceeded_errors, timed_out_errors, stream_reset_errors, other_connection_errors, prover_errors), 10*60 ),
( lambda: metrics.check_seed_list_up(v1, namespace, seeds_reachable), 60*60 ),
( lambda: metrics.pods_with_no_new_logs(v1, namespace, pods_with_no_new_logs), 60*10 ),
]
Expand Down

0 comments on commit 5d1906a

Please sign in to comment.