Skip to content

Commit

Permalink
Expiration Manager: Handle Presumed Irrevocable Leases Separately (ha…
Browse files Browse the repository at this point in the history
…shicorp#11452)

* build out zombie lease system

* add typo for CI

* undo test CI commit

* time equality test isn't working on CI, so let's see what this does...

* protect against nil pointer receiver calls
  • Loading branch information
swayne275 committed Apr 29, 2021
1 parent a20b005 commit aafede4
Show file tree
Hide file tree
Showing 2 changed files with 253 additions and 11 deletions.
102 changes: 91 additions & 11 deletions vault/expiration.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ const (
numExpirationWorkersTest = 10

fairshareWorkersOverrideVar = "VAULT_LEASE_REVOCATION_WORKERS"

// limit zombie error messages to 240 characters to be respectful of storage
// requirements
maxZombieErrorLength = 240
)

type pendingInfo struct {
Expand Down Expand Up @@ -97,6 +101,11 @@ type ExpirationManager struct {
leaseCount int
pendingLock sync.RWMutex

// Track expired leases that have been determined to be irrevocable (without
// manual intervention). These irrevocable leases are referred to as
// "zombies" or "zombie leases"
zombies sync.Map

// The uniquePolicies map holds policy sets, so they can
// be deduplicated. It is periodically emptied to prevent
// unbounded growth.
Expand Down Expand Up @@ -210,6 +219,13 @@ func (r *revocationJob) OnFailure(err error) {

if pending.revokesAttempted >= maxRevokeAttempts {
r.m.logger.Trace("lease has consumed all retry attempts", "lease_id", r.leaseID)
le, loadErr := r.m.loadEntry(r.nsCtx, r.leaseID)
if loadErr != nil {
r.m.logger.Warn("failed to mark lease as zombie - failed to load", "lease_id", r.leaseID, "err", loadErr)
return
}

r.m.markLeaseAsZombie(r.nsCtx, le, errors.New("lease has consumed all retry attempts"))
return
}

Expand Down Expand Up @@ -824,6 +840,10 @@ func (m *ExpirationManager) Stop() error {
return true
})
m.uniquePolicies = make(map[string][]string)
m.zombies.Range(func(key, _ interface{}) bool {
m.zombies.Delete(key)
return true
})
m.pendingLock.Unlock()

if m.inRestoreMode() {
Expand Down Expand Up @@ -938,17 +958,9 @@ func (m *ExpirationManager) revokeCommon(ctx context.Context, leaseID string, fo

// Clear the expiration handler
m.pendingLock.Lock()
if info, ok := m.pending.Load(leaseID); ok {
pending := info.(pendingInfo)
pending.timer.Stop()
m.pending.Delete(leaseID)
m.leaseCount--
// Log but do not fail; unit tests (and maybe Tidy on production systems)
if err := m.core.quotasHandleLeases(ctx, quotas.LeaseActionDeleted, []string{leaseID}); err != nil {
m.logger.Error("failed to update quota on revocation", "error", err)
}
}
m.removeFromPending(ctx, leaseID)
m.nonexpiring.Delete(leaseID)
m.zombies.Delete(leaseID)
m.pendingLock.Unlock()

if m.logger.IsInfo() && !skipToken && m.logLeaseExpirations {
Expand Down Expand Up @@ -1580,6 +1592,11 @@ func (m *ExpirationManager) FetchLeaseTimes(ctx context.Context, leaseID string)
return m.leaseTimesForExport(info.(pendingInfo).cachedLeaseInfo), nil
}

info, ok = m.zombies.Load(leaseID)
if ok && info.(*leaseEntry) != nil {
return m.leaseTimesForExport(info.(*leaseEntry)), nil
}

// Load the entry
le, err := m.loadEntryInternal(ctx, leaseID, true, false)
if err != nil {
Expand Down Expand Up @@ -1664,11 +1681,14 @@ func (m *ExpirationManager) updatePending(le *leaseEntry) {
// updatePendingInternal is the locked version of updatePending; do not call
// this without a write lock on m.pending
func (m *ExpirationManager) updatePendingInternal(le *leaseEntry) {
var pending pendingInfo
if le.isZombie() {
return
}

// Check for an existing timer
info, ok := m.pending.Load(le.LeaseID)

var pending pendingInfo
if le.ExpireTime.IsZero() {
if le.nonexpiringToken() {
// Store this in the nonexpiring map instead of pending.
Expand Down Expand Up @@ -1859,6 +1879,11 @@ func (m *ExpirationManager) loadEntryInternal(ctx context.Context, leaseID strin
}
le.namespace = ns

if le.isZombie() {
m.zombies.Store(le.LeaseID, le)
return le, nil
}

if restoreMode {
if checkRestored {
// If we have already loaded this lease, we don't need to update on
Expand Down Expand Up @@ -2291,6 +2316,47 @@ func (m *ExpirationManager) walkLeases(walkFn leaseWalkFunction) error {
return nil
}

// must be called with m.pendingLock held
func (m *ExpirationManager) removeFromPending(ctx context.Context, leaseID string) {
if info, ok := m.pending.Load(leaseID); ok {
pending := info.(pendingInfo)
pending.timer.Stop()
m.pending.Delete(leaseID)
m.leaseCount--
// Log but do not fail; unit tests (and maybe Tidy on production systems)
if err := m.core.quotasHandleLeases(ctx, quotas.LeaseActionDeleted, []string{leaseID}); err != nil {
m.logger.Error("failed to update quota on revocation", "error", err)
}
}
}

// note: must be called with pending lock held
func (m *ExpirationManager) markLeaseAsZombie(ctx context.Context, le *leaseEntry, err error) {
if le == nil {
m.logger.Warn("attempted to mark nil lease as zombie")
return
}
if le.isZombie() {
m.logger.Info("attempted to re-mark lease as zombie", "original_error", le.RevokeErr, "new_error", err.Error())
return
}

errStr := err.Error()
if len(errStr) == 0 {
errStr = "no error message given"
}
if len(errStr) > maxZombieErrorLength {
errStr = errStr[:maxZombieErrorLength]
}

le.RevokeErr = errStr
m.persistEntry(ctx, le)

m.zombies.Store(le.LeaseID, le)
m.removeFromPending(ctx, le.LeaseID)
m.nonexpiring.Delete(le.LeaseID)
}

// leaseEntry is used to structure the values the expiration
// manager stores. This is used to handle renew and revocation.
type leaseEntry struct {
Expand All @@ -2311,6 +2377,12 @@ type leaseEntry struct {
Version int `json:"version"`

namespace *namespace.Namespace

// RevokeErr tracks if a lease has failed revocation in a way that is
// unlikely to be automatically resolved. The first time this happens,
// RevokeErr will be set, thus marking this leaseEntry as a zombie that will
// have to be manually removed.
RevokeErr string `json:"revokeErr"`
}

// encode is used to JSON encode the lease entry
Expand All @@ -2324,6 +2396,9 @@ func (le *leaseEntry) renewable() (bool, error) {
case le == nil:
return false, fmt.Errorf("lease not found")

case le.isZombie():
return false, fmt.Errorf("lease is not renewable and has failed previous revocation attempts")

case le.ExpireTime.IsZero():
return false, fmt.Errorf("lease is not renewable")

Expand Down Expand Up @@ -2356,6 +2431,11 @@ func (le *leaseEntry) nonexpiringToken() bool {
return !le.Auth.LeaseEnabled()
}

// TODO maybe lock RevokeErr once this goes in: https://github.com/hashicorp/vault/pull/11122
func (le *leaseEntry) isZombie() bool {
return le.RevokeErr != ""
}

// decodeLeaseEntry is used to reverse encode and return a new entry
func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
out := new(leaseEntry)
Expand Down
162 changes: 162 additions & 0 deletions vault/expiration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2502,3 +2502,165 @@ func TestExpiration_FairsharingEnvVar(t *testing.T) {
}
}
}

// register one lease ID and return the leaseID
func registerOneLease(t *testing.T, ctx context.Context, exp *ExpirationManager) string {
t.Helper()

req := &logical.Request{
Operation: logical.ReadOperation,
Path: "zombie/lease",
ClientToken: "sometoken",
}
req.SetTokenEntry(&logical.TokenEntry{ID: "sometoken", NamespaceID: "root"})
resp := &logical.Response{
Secret: &logical.Secret{
LeaseOptions: logical.LeaseOptions{
TTL: 10 * time.Hour,
},
},
}

leaseID, err := exp.Register(ctx, req, resp)
if err != nil {
t.Fatal(err)
}

return leaseID
}

func TestExpiration_MarkZombie(t *testing.T) {
exp := mockExpiration(t)
ctx := namespace.RootContext(nil)

leaseID := registerOneLease(t, ctx, exp)
loadedLE, err := exp.loadEntry(ctx, leaseID)
if err != nil {
t.Fatalf("error loading non zombie lease: %v", err)
}

if loadedLE.isZombie() {
t.Fatalf("lease is zombie and shouldn't be")
}
if _, ok := exp.zombies.Load(leaseID); ok {
t.Fatalf("lease included in zombie map")
}
if _, ok := exp.pending.Load(leaseID); !ok {
t.Fatalf("lease not included in pending map")
}

zombieErr := fmt.Errorf("test zombie error")
exp.markLeaseAsZombie(ctx, loadedLE, zombieErr)

if !loadedLE.isZombie() {
t.Fatalf("zombie lease is not zombie and should be")
}
if loadedLE.RevokeErr != zombieErr.Error() {
t.Errorf("zombie lease has wrong error message. expected %s, got %s", zombieErr.Error(), loadedLE.RevokeErr)
}
if _, ok := exp.zombies.Load(leaseID); !ok {
t.Fatalf("zombie lease not included in zombie map")
}
if _, ok := exp.pending.Load(leaseID); ok {
t.Fatalf("zombie lease included in pending map")
}
if _, ok := exp.nonexpiring.Load(leaseID); ok {
t.Fatalf("zombie lease included in nonexpiring map")
}

// stop and restore to verify that zombies are properly loaded from storage
err = exp.Stop()
if err != nil {
t.Fatalf("error stopping expiration manager: %v", err)
}

err = exp.Restore(nil)
if err != nil {
t.Fatalf("error restoring expiration manager: %v", err)
}

loadedLE, err = exp.loadEntry(ctx, leaseID)
if err != nil {
t.Fatalf("error loading non zombie lease after restore: %v", err)
}

if !loadedLE.isZombie() {
t.Fatalf("zombie lease is not zombie and should be")
}
if loadedLE.RevokeErr != zombieErr.Error() {
t.Errorf("zombie lease has wrong error message. expected %s, got %s", zombieErr.Error(), loadedLE.RevokeErr)
}
if _, ok := exp.zombies.Load(leaseID); !ok {
t.Fatalf("zombie lease not included in zombie map")
}
if _, ok := exp.pending.Load(leaseID); ok {
t.Fatalf("zombie lease included in pending map")
}
if _, ok := exp.nonexpiring.Load(leaseID); ok {
t.Fatalf("zombie lease included in nonexpiring map")
}
}

func TestExpiration_FetchLeaseTimesZombies(t *testing.T) {
exp := mockExpiration(t)
ctx := namespace.RootContext(nil)

leaseID := registerOneLease(t, ctx, exp)
expectedLeaseTimes, err := exp.FetchLeaseTimes(ctx, leaseID)
if err != nil {
t.Fatalf("error getting lease times: %v", err)
}
if expectedLeaseTimes == nil {
t.Fatal("got nil lease")
}

le, err := exp.loadEntry(ctx, leaseID)
if err != nil {
t.Fatalf("error loading lease: %v", err)
}
exp.markLeaseAsZombie(ctx, le, fmt.Errorf("test zombie error"))

zombieLeaseTimes, err := exp.FetchLeaseTimes(ctx, leaseID)
if err != nil {
t.Fatalf("error getting zombie lease times: %v", err)
}
if zombieLeaseTimes == nil {
t.Fatal("got nil zombie lease")
}

// strip monotonic clock reading
expectedLeaseTimes.IssueTime = expectedLeaseTimes.IssueTime.Round(0)
expectedLeaseTimes.ExpireTime = expectedLeaseTimes.ExpireTime.Round(0)
expectedLeaseTimes.LastRenewalTime = expectedLeaseTimes.LastRenewalTime.Round(0)

if !zombieLeaseTimes.IssueTime.Equal(expectedLeaseTimes.IssueTime) {
t.Errorf("bad issue time. expected %v, got %v", expectedLeaseTimes.IssueTime, zombieLeaseTimes.IssueTime)
}
if !zombieLeaseTimes.ExpireTime.Equal(expectedLeaseTimes.ExpireTime) {
t.Errorf("bad expire time. expected %v, got %v", expectedLeaseTimes.ExpireTime, zombieLeaseTimes.ExpireTime)
}
if !zombieLeaseTimes.LastRenewalTime.Equal(expectedLeaseTimes.LastRenewalTime) {
t.Errorf("bad last renew time. expected %v, got %v", expectedLeaseTimes.LastRenewalTime, zombieLeaseTimes.LastRenewalTime)
}
}

func TestExpiration_StopClearsZombieCache(t *testing.T) {
exp := mockExpiration(t)
ctx := namespace.RootContext(nil)

leaseID := registerOneLease(t, ctx, exp)
le, err := exp.loadEntry(ctx, leaseID)
if err != nil {
t.Fatalf("error loading non zombie lease: %v", err)
}

exp.markLeaseAsZombie(ctx, le, fmt.Errorf("test zombie error"))
err = exp.Stop()
if err != nil {
t.Fatalf("error stopping expiration manager: %v", err)
}

if _, ok := exp.zombies.Load(leaseID); ok {
t.Error("expiration manager zombies cache should be cleared on stop")
}
}

0 comments on commit aafede4

Please sign in to comment.