Skip to content

Commit

Permalink
cert-checker: improve querying (letsencrypt#7088)
Browse files Browse the repository at this point in the history
The certificates table has no index on expires, so including expires in
the query was causing unnecessarily slow queries.

Expires was part of the query in order to support the "UnexpiredOnly"
feature. This feature isn't really necessary; if we tell cert-checker to
check certain certificates, we want to know about problems even if the
certificates are expired. Deprecate the feature and always include all
certificates in the time range. Remove "expires" from the queries.

This allows simplifying the query structure a bit and inlining query
arguments in each site where they are used.

Update the error message returned when no rows are returned for the
MIN(id) query.

The precursor to UnexpiredOnly was introduced in
letsencrypt#1644, but I can't find any
reference to a requirement for the flag.

Fixes letsencrypt#7085
  • Loading branch information
jsha authored Oct 3, 2023
1 parent 98a3f14 commit b68b21c
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 40 deletions.
104 changes: 71 additions & 33 deletions cmd/cert-checker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"crypto/x509"
"database/sql"
"encoding/json"
"errors"
"flag"
"fmt"
"log/syslog"
Expand Down Expand Up @@ -141,56 +140,94 @@ func newChecker(saDbMap certDB,
}
}

func (c *certChecker) getCerts(ctx context.Context, unexpiredOnly bool) error {
c.issuedReport.end = c.clock.Now()
c.issuedReport.begin = c.issuedReport.end.Add(-c.checkPeriod)

args := map[string]interface{}{"issued": c.issuedReport.begin, "now": 0}
if unexpiredOnly {
args["now"] = c.clock.Now()
}

var sni sql.NullInt64
// findStartingID returns the lowest `id` in the certificates table within the
// time window specified. The time window is a half-open interval [begin, end).
func (c *certChecker) findStartingID(ctx context.Context, begin, end time.Time) (int64, error) {
var output sql.NullInt64
var err error
var retries int
for {
sni, err = c.dbMap.SelectNullInt(

// Rather than querying `MIN(id)` across that whole window, we query it across the first
// hour of the window. This allows the query planner to use the index on `issued` more
// effectively. For a busy, actively issuing CA, that will always return results in the
// first query. For a less busy CA, or during integration tests, there may only exist
// certificates towards the end of the window, so we try querying later hourly chunks until
// we find a certificate or hit the end of the window. We also retry transient errors.
queryBegin := begin
queryEnd := begin.Add(time.Hour)

for queryBegin.Compare(end) < 0 {
output, err = c.dbMap.SelectNullInt(
ctx,
"SELECT MIN(id) FROM certificates WHERE issued >= :issued AND expires >= :now",
args,
`SELECT MIN(id) FROM certificates
WHERE issued >= :begin AND
issued < :end`,
map[string]interface{}{
"begin": queryBegin,
"end": queryEnd,
},
)
if err != nil {
c.logger.AuditErrf("finding starting certificate: %s", err)
retries++
time.Sleep(core.RetryBackoff(retries, time.Second, time.Minute, 2))
continue
}
retries = 0
break
}
if !sni.Valid {
// a nil response was returned by the DB, so return error and fail
return errors.New("the SELECT query resulted in a NULL response from the DB")
// https://mariadb.com/kb/en/min/
// MIN() returns NULL if there were no matching rows
// https://pkg.go.dev/database/sql#NullInt64
// Valid is true if Int64 is not NULL
if !output.Valid {
// No matching rows, try the next hour
queryBegin = queryBegin.Add(time.Hour)
queryEnd = queryEnd.Add(time.Hour)
if queryEnd.Compare(end) > 0 {
queryEnd = end
}
continue
}

return output.Int64, nil
}

initialID := sni.Int64
// Fell through the loop without finding a valid ID
return 0, fmt.Errorf("no rows found for certificates issued between %s and %s", begin, end)
}

func (c *certChecker) getCerts(ctx context.Context) error {
// The end of the report is the current time, rounded up to the nearest second.
c.issuedReport.end = c.clock.Now().Truncate(time.Second).Add(time.Second)
// The beginning of the report is the end minus the check period, rounded down to the nearest second.
c.issuedReport.begin = c.issuedReport.end.Add(-c.checkPeriod).Truncate(time.Second)

initialID, err := c.findStartingID(ctx, c.issuedReport.begin, c.issuedReport.end)
if err != nil {
return err
}
if initialID > 0 {
// decrement the initial ID so that we select below as we aren't using >=
initialID -= 1
}

// Retrieve certs in batches of 1000 (the size of the certificate channel)
// so that we don't eat unnecessary amounts of memory and avoid the 16MB MySQL
// packet limit.
args["limit"] = batchSize
args["id"] = initialID

batchStartID := initialID
var retries int
for {
certs, err := sa.SelectCertificates(
ctx,
c.dbMap,
"WHERE id > :id AND issued >= :issued AND expires >= :now ORDER BY id LIMIT :limit",
args,
`WHERE id > :id AND
issued >= :begin AND
issued < :end
ORDER BY id LIMIT :limit`,
map[string]interface{}{
"begin": c.issuedReport.begin,
"end": c.issuedReport.end,
// Retrieve certs in batches of 1000 (the size of the certificate channel)
// so that we don't eat unnecessary amounts of memory and avoid the 16MB MySQL
// packet limit.
"limit": batchSize,
"id": batchStartID,
},
)
if err != nil {
c.logger.AuditErrf("selecting certificates: %s", err)
Expand All @@ -206,7 +243,7 @@ func (c *certChecker) getCerts(ctx context.Context, unexpiredOnly bool) error {
break
}
lastCert := certs[len(certs)-1]
args["id"] = lastCert.ID
batchStartID = lastCert.ID
if lastCert.Issued.After(c.issuedReport.end) {
break
}
Expand Down Expand Up @@ -444,7 +481,8 @@ type Config struct {
DB cmd.DBConfig
cmd.HostnamePolicyConfig

Workers int `validate:"required,min=1"`
Workers int `validate:"required,min=1"`
// Deprecated: this is ignored, and cert checker always checks both expired and unexpired.
UnexpiredOnly bool
BadResultsOnly bool
CheckPeriod config.Duration
Expand Down Expand Up @@ -567,7 +605,7 @@ func main() {
// is finished it will close the certificate channel which allows the range
// loops in checker.processCerts to break
go func() {
err := checker.getCerts(context.TODO(), config.CertChecker.UnexpiredOnly)
err := checker.getCerts(context.TODO())
cmd.FailOnError(err, "Batch retrieval of certificates failed")
}()

Expand Down
59 changes: 52 additions & 7 deletions cmd/cert-checker/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ func TestGetAndProcessCerts(t *testing.T) {
}

batchSize = 2
err = checker.getCerts(context.Background(), false)
err = checker.getCerts(context.Background())
test.AssertNotError(t, err, "Failed to retrieve certificates")
test.AssertEquals(t, len(checker.certs), 5)
wg := new(sync.WaitGroup)
Expand Down Expand Up @@ -431,7 +431,7 @@ func TestGetCertsEmptyResults(t *testing.T) {
checker.dbMap = mismatchedCountDB{}

batchSize = 3
err = checker.getCerts(context.Background(), false)
err = checker.getCerts(context.Background())
test.AssertNotError(t, err, "Failed to retrieve certificates")
}

Expand All @@ -453,13 +453,58 @@ func (db emptyDB) SelectNullInt(_ context.Context, _ string, _ ...interface{}) (
// expected if the DB finds no certificates to match the SELECT query and
// should return an error.
func TestGetCertsNullResults(t *testing.T) {
saDbMap, err := sa.DBMapForTest(vars.DBConnSA)
test.AssertNotError(t, err, "Couldn't connect to database")
checker := newChecker(saDbMap, clock.NewFake(), pa, kp, time.Hour, testValidityDurations, blog.NewMock())
checker.dbMap = emptyDB{}
checker := newChecker(emptyDB{}, clock.NewFake(), pa, kp, time.Hour, testValidityDurations, blog.NewMock())

err = checker.getCerts(context.Background(), false)
err := checker.getCerts(context.Background())
test.AssertError(t, err, "Should have gotten error from empty DB")
if !strings.Contains(err.Error(), "no rows found for certificates issued between") {
t.Errorf("expected error to contain 'no rows found for certificates issued between', got '%s'", err.Error())
}
}

// lateDB is a certDB object that helps with TestGetCertsLate.
// It pretends to contain a single cert issued at the given time.
type lateDB struct {
issuedTime time.Time
selectedACert bool
}

// SelectNullInt is a method that returns a false sql.NullInt64 struct to
// mock a null DB response
func (db *lateDB) SelectNullInt(_ context.Context, _ string, args ...interface{}) (sql.NullInt64, error) {
args2 := args[0].(map[string]interface{})
begin := args2["begin"].(time.Time)
end := args2["end"].(time.Time)
if begin.Compare(db.issuedTime) < 0 && end.Compare(db.issuedTime) > 0 {
return sql.NullInt64{Int64: 23, Valid: true}, nil
}
return sql.NullInt64{Valid: false}, nil
}

func (db *lateDB) Select(_ context.Context, output interface{}, _ string, args ...interface{}) ([]interface{}, error) {
db.selectedACert = true
// For expediency we respond with an empty list of certificates; the checker will treat this as if it's
// reached the end of the list of certificates to process.
return nil, nil
}

func (db *lateDB) SelectOne(_ context.Context, _ interface{}, _ string, _ ...interface{}) error {
return nil
}

// TestGetCertsLate checks for correct behavior when certificates exist only late in the provided window.
func TestGetCertsLate(t *testing.T) {
clk := clock.NewFake()
db := &lateDB{issuedTime: clk.Now().Add(-time.Hour)}
checkPeriod := 24 * time.Hour
checker := newChecker(db, clk, pa, kp, checkPeriod, testValidityDurations, blog.NewMock())

err := checker.getCerts(context.Background())
test.AssertNotError(t, err, "getting certs")

if !db.selectedACert {
t.Errorf("checker never selected a certificate after getting a MIN(id)")
}
}

func TestSaveReport(t *testing.T) {
Expand Down

0 comments on commit b68b21c

Please sign in to comment.