Skip to content

Commit

Permalink
Add resolver to DNS metrics (letsencrypt#3874)
Browse files (browse the repository at this point in the history)
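Helpful for debugging in multi-resolver setups: the DNS query-time, total-lookup-time, and timeout metrics now carry a "resolver" label identifying the server chosen for the query.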
  • Loading branch information
Roland Bracewell Shoemaker authored and jsha committed Oct 1, 2018
1 parent 484fd31 commit 97d1788
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 13 deletions.
32 changes: 25 additions & 7 deletions bdns/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,22 +193,22 @@ func NewDNSClientImpl(
Help: "Time taken to perform a DNS query",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"qtype", "result", "authenticated_data"},
[]string{"qtype", "result", "authenticated_data", "resolver"},
)
totalLookupTime := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "dns_total_lookup_time",
Help: "Time taken to perform a DNS lookup, including all retried queries",
Buckets: metrics.InternetFacingBuckets,
},
[]string{"qtype", "result", "authenticated_data", "retries"},
[]string{"qtype", "result", "authenticated_data", "retries", "resolver"},
)
timeoutCounter := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "dns_timeout",
Help: "Counter of various types of DNS query timeouts",
},
[]string{"qtype", "type"},
[]string{"qtype", "type", "resolver"},
)
stats.MustRegister(queryTime, totalLookupTime, timeoutCounter)

Expand Down Expand Up @@ -276,6 +276,7 @@ func (dnsClient *DNSClientImpl) exchangeOne(ctx context.Context, hostname string
"result": result,
"authenticated_data": authenticated,
"retries": strconv.Itoa(tries),
"resolver": chosenServer,
}).Observe(dnsClient.clk.Since(start).Seconds())
}()
for {
Expand All @@ -292,17 +293,30 @@ func (dnsClient *DNSClientImpl) exchangeOne(ctx context.Context, hostname string
"qtype": qtypeStr,
"result": result,
"authenticated_data": authenticated,
"resolver": chosenServer,
}).Observe(rtt.Seconds())
ch <- dnsResp{m: rsp, err: err}
}()
select {
case <-ctx.Done():
if ctx.Err() == context.DeadlineExceeded {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "deadline exceeded"}).Inc()
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "deadline exceeded",
"resolver": chosenServer,
}).Inc()
} else if ctx.Err() == context.Canceled {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "canceled"}).Inc()
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "canceled",
"resolver": chosenServer,
}).Inc()
} else {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "unknown"}).Inc()
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "unknown",
"resolver": chosenServer,
}).Inc()
}
err = ctx.Err()
return
Expand All @@ -321,7 +335,11 @@ func (dnsClient *DNSClientImpl) exchangeOne(ctx context.Context, hostname string
chosenServer = dnsClient.servers[chosenServerIndex]
continue
} else if isRetryable && !hasRetriesLeft {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "out of retries"}).Inc()
dnsClient.timeoutCounter.With(prometheus.Labels{
"qtype": qtypeStr,
"type": "out of retries",
"resolver": chosenServer,
}).Inc()
}
}
resp, err = r.m, r.err
Expand Down
15 changes: 9 additions & 6 deletions bdns/dns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -611,8 +611,9 @@ func TestRetry(t *testing.T) {
}
if tc.metricsAllRetries > 0 {
count := test.CountCounter(dr.timeoutCounter.With(prometheus.Labels{
"qtype": "TXT",
"type": "out of retries",
"qtype": "TXT",
"type": "out of retries",
"resolver": dnsLoopbackAddr,
}))
if count != tc.metricsAllRetries {
t.Errorf("wrong count for timeoutCounter: got %d, expected %d", count, tc.metricsAllRetries)
Expand Down Expand Up @@ -648,16 +649,18 @@ func TestRetry(t *testing.T) {
}

count := test.CountCounter(dr.timeoutCounter.With(prometheus.Labels{
"qtype": "TXT",
"type": "canceled",
"qtype": "TXT",
"type": "canceled",
"resolver": dnsLoopbackAddr,
}))
if count != 1 {
t.Errorf("wrong count for timeoutCounter canceled: got %d, expected %d", count, 1)
}

count = test.CountCounter(dr.timeoutCounter.With(prometheus.Labels{
"qtype": "TXT",
"type": "deadline exceeded",
"qtype": "TXT",
"type": "deadline exceeded",
"resolver": dnsLoopbackAddr,
}))
if count != 2 {
t.Errorf("wrong count for timeoutCounter deadline exceeded: got %d, expected %d", count, 2)
Expand Down

0 comments on commit 97d1788

Please sign in to comment.