Skip to content

Commit

Permalink
Make DNS timeout stat more specific. (letsencrypt#3627)
Browse files Browse the repository at this point in the history
Distinguish between deadline exceeded vs canceled. Also, combine those
two cases with "out of retries" into a single stat with a label
determining type.
  • Loading branch information
jsha authored and cpu committed Apr 9, 2018
1 parent c591f8a commit 4951153
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 28 deletions.
37 changes: 17 additions & 20 deletions bdns/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,9 @@ type DNSClientImpl struct {
maxTries int
clk clock.Clock

queryTime *prometheus.HistogramVec
totalLookupTime *prometheus.HistogramVec
cancelCounter *prometheus.CounterVec
usedAllRetriesCounter *prometheus.CounterVec
queryTime *prometheus.HistogramVec
totalLookupTime *prometheus.HistogramVec
timeoutCounter *prometheus.CounterVec
}

var _ DNSClient = &DNSClientImpl{}
Expand Down Expand Up @@ -203,21 +202,14 @@ func NewDNSClientImpl(
},
[]string{"qtype", "result", "authenticated_data", "retries"},
)
cancelCounter := prometheus.NewCounterVec(
timeoutCounter := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "dns_query_cancels",
Help: "Counter of canceled DNS queries",
Name: "dns_timeout",
Help: "Counter of various types of DNS query timeouts",
},
[]string{"qtype"},
[]string{"qtype", "type"},
)
usedAllRetriesCounter := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "dns_query_used_all_retries",
Help: "Counter of DNS queries which used all its retries",
},
[]string{"qtype"},
)
stats.MustRegister(queryTime, totalLookupTime, cancelCounter, usedAllRetriesCounter)
stats.MustRegister(queryTime, totalLookupTime, timeoutCounter)

return &DNSClientImpl{
dnsClient: dnsClient,
Expand All @@ -227,8 +219,7 @@ func NewDNSClientImpl(
clk: clk,
queryTime: queryTime,
totalLookupTime: totalLookupTime,
cancelCounter: cancelCounter,
usedAllRetriesCounter: usedAllRetriesCounter,
timeoutCounter: timeoutCounter,
}
}

Expand Down Expand Up @@ -304,7 +295,13 @@ func (dnsClient *DNSClientImpl) exchangeOne(ctx context.Context, hostname string
}()
select {
case <-ctx.Done():
dnsClient.cancelCounter.With(prometheus.Labels{"qtype": qtypeStr}).Inc()
if ctx.Err() == context.DeadlineExceeded {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "deadline exceeded"}).Inc()
} else if ctx.Err() == context.Canceled {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "canceled"}).Inc()
} else {
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "unknown"}).Inc()
}
err = ctx.Err()
return
case r := <-ch:
Expand All @@ -316,7 +313,7 @@ func (dnsClient *DNSClientImpl) exchangeOne(ctx context.Context, hostname string
tries++
continue
} else if isRetryable && !hasRetriesLeft {
dnsClient.usedAllRetriesCounter.With(prometheus.Labels{"qtype": qtypeStr}).Inc()
dnsClient.timeoutCounter.With(prometheus.Labels{"qtype": qtypeStr, "type": "out of retries"}).Inc()
}
}
resp, err = r.m, r.err
Expand Down
31 changes: 23 additions & 8 deletions bdns/dns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/letsencrypt/boulder/metrics"
"github.com/letsencrypt/boulder/test"
"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
)

const dnsLoopbackAddr = "127.0.0.1:4053"
Expand Down Expand Up @@ -609,10 +610,13 @@ func TestRetry(t *testing.T) {
t.Errorf("#%d, error, expectedCount %v, got %v", i, tc.expectedCount, tc.te.count)
}
if tc.metricsAllRetries > 0 {
test.AssertEquals(t, test.CountCounterVec(
"qtype",
"TXT",
dr.usedAllRetriesCounter), tc.metricsAllRetries)
count := test.CountCounter(dr.timeoutCounter.With(prometheus.Labels{
"qtype": "TXT",
"type": "out of retries",
}))
if count != tc.metricsAllRetries {
t.Errorf("wrong count for timeoutCounter: got %d, expected %d", count, tc.metricsAllRetries)
}
}
}

Expand Down Expand Up @@ -643,10 +647,21 @@ func TestRetry(t *testing.T) {
t.Errorf("expected %s, got %s", context.DeadlineExceeded, err)
}

test.AssertEquals(t, test.CountCounterVec(
"qtype",
"TXT",
dr.cancelCounter), 3)
count := test.CountCounter(dr.timeoutCounter.With(prometheus.Labels{
"qtype": "TXT",
"type": "canceled",
}))
if count != 1 {
t.Errorf("wrong count for timeoutCounter canceled: got %d, expected %d", count, 1)
}

count = test.CountCounter(dr.timeoutCounter.With(prometheus.Labels{
"qtype": "TXT",
"type": "deadline exceeded",
}))
if count != 2 {
t.Errorf("wrong count for timeoutCounter deadline exceeded: got %d, expected %d", count, 2)
}
}

type tempError bool
Expand Down

0 comments on commit 4951153

Please sign in to comment.