Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
105816: kvserver: propagate `ReplicaUnavailableError` from intent resolution r=erikgrinaker a=erikgrinaker

If intent resolution attempted to access txn info on a different range, and hit a poisoned latch because that range had lost quorum, the returned error would incorrectly claim that the range with the intent was unavailable rather than the range that had lost quorum. This patch correctly propagates the `ReplicaUnavailableError` from the other range.

Resolves cockroachdb#105798.
Resolves cockroachdb#102936.
Epic: none

Release note: None

Co-authored-by: Erik Grinaker <[email protected]>
  • Loading branch information
craig[bot] and erikgrinaker committed Jun 29, 2023
2 parents c1d73cd + 3a68186 commit 6ff48ac
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 13 deletions.
50 changes: 50 additions & 0 deletions pkg/kv/kvserver/client_replica_circuit_breaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,56 @@ func TestReplicaCircuitBreaker_Liveness_QuorumLoss(t *testing.T) {
require.NoError(t, tc.Write(n1))
}

// In this test, a txn anchored on the range losing quorum also has an intent on
// a healthy range. Quorum is lost before committing, poisoning latches for the
// txn info. When resolving the intent on the healthy range, it will hit a
// poisoned latch. This should result in a ReplicaUnavailableError from the
// original range that lost quorum, not from the range with the intent.
func TestReplicaCircuitBreaker_ResolveIntent_QuorumLoss(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()
tc := setupCircuitBreakerTest(t)
defer tc.Stopper().Stop(ctx)

// Get lease on n1.
require.NoError(t, tc.Write(n1))

// Split off a healthy range, which will inherit a lease on n1. Remove the
// replica on n2, so that it remains healthy when we take down n2.
failKey := tc.ScratchRange(t)
okKey := failKey.Next()
failDesc, _ := tc.SplitRangeOrFatal(t, okKey)
okDesc := tc.RemoveVotersOrFatal(t, okKey, tc.Target(n2))
t.Logf("failDesc=%s", failDesc)
t.Logf("okDesc=%s", okDesc)

// Start a transaction, anchoring it on the faulty range.
db := tc.Server(n1).DB()
txn := db.NewTxn(ctx, "test")
require.NoError(t, txn.Put(ctx, failKey, "fail"))
require.NoError(t, txn.Put(ctx, okKey, "ok"))

// Lose quorum.
tc.StopServer(n2)
tc.HeartbeatNodeLiveness(t, n1)

// Attempt to commit. It should fail, but will poison latches on
// the faulty range.
tc.SetSlowThreshold(time.Second)
err := txn.Commit(ctx)
tc.RequireIsBreakerOpen(t, err)

// Read the key from the healthy range. It should fail due to a poisoned latch
// on the txn's anchored range. This error should appear to come from the
// failed range, not from the healthy range.
_, err = db.Get(ctx, okKey)
tc.RequireIsBreakerOpen(t, err)
ruErr := &kvpb.ReplicaUnavailableError{}
require.True(t, errors.As(err, &ruErr))
require.Equal(t, failDesc.RangeID, ruErr.Desc.RangeID)
}

type dummyStream struct {
name string
ctx context.Context
Expand Down
32 changes: 19 additions & 13 deletions pkg/kv/kvserver/replica_send.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,19 +477,25 @@ func (r *Replica) executeBatchWithConcurrencyRetries(
}, requestEvalKind)
if pErr != nil {
if poisonErr := (*poison.PoisonedError)(nil); errors.As(pErr.GoError(), &poisonErr) {
// NB: we make the breaker error (which may be nil at this point, but
// usually is not) a secondary error, meaning it is not in the error
// chain. That is fine; the important bits to investigate
// programmatically are the ReplicaUnavailableError (which contains the
// descriptor) and the *PoisonedError (which contains the concrete
// subspan that caused this request to fail). We mark
// circuit.ErrBreakerOpen into the chain as well so that we have the
// invariant that all replica circuit breaker errors contain both
// ErrBreakerOpen and ReplicaUnavailableError.
pErr = kvpb.NewError(r.replicaUnavailableError(errors.CombineErrors(
errors.Mark(poisonErr, circuit.ErrBreakerOpen),
r.breaker.Signal().Err(),
)))
// It's possible that intent resolution accessed txn info anchored on a
// different range and hit a poisoned latch there, in which case we want
// to propagate its ReplicaUnavailableError instead of creating one for
// this range (which likely isn't tripped).
if !errors.HasType(pErr.GoError(), (*kvpb.ReplicaUnavailableError)(nil)) {
// NB: we make the breaker error (which may be nil at this point, but
// usually is not) a secondary error, meaning it is not in the error
// chain. That is fine; the important bits to investigate
// programmatically are the ReplicaUnavailableError (which contains the
// descriptor) and the *PoisonedError (which contains the concrete
// subspan that caused this request to fail). We mark
// circuit.ErrBreakerOpen into the chain as well so that we have the
// invariant that all replica circuit breaker errors contain both
// ErrBreakerOpen and ReplicaUnavailableError.
pErr = kvpb.NewError(r.replicaUnavailableError(errors.CombineErrors(
errors.Mark(poisonErr, circuit.ErrBreakerOpen),
r.breaker.Signal().Err(),
)))
}
}
return nil, nil, pErr
} else if resp != nil {
Expand Down

0 comments on commit 6ff48ac

Please sign in to comment.