diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 7025d7e215..419206e605 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -261,41 +261,7 @@ type backend struct { issuersLock sync.RWMutex } -type ( - tidyStatusState int - roleOperation func(ctx context.Context, req *logical.Request, data *framework.FieldData, role *roleEntry) (*logical.Response, error) -) - -const ( - tidyStatusInactive tidyStatusState = iota - tidyStatusStarted = iota - tidyStatusFinished = iota - tidyStatusError = iota - tidyStatusCancelling = iota - tidyStatusCancelled = iota -) - -type tidyStatus struct { - // Parameters used to initiate the operation - safetyBuffer int - issuerSafetyBuffer int - tidyCertStore bool - tidyRevokedCerts bool - tidyRevokedAssocs bool - tidyExpiredIssuers bool - tidyBackupBundle bool - pauseDuration string - - // Status - state tidyStatusState - err error - timeStarted time.Time - timeFinished time.Time - message string - certStoreDeletedCount uint - revokedCertDeletedCount uint - missingIssuerCertCount uint -} +type roleOperation func(ctx context.Context, req *logical.Request, data *framework.FieldData, role *roleEntry) (*logical.Response, error) const backendHelp = ` The PKI backend dynamically generates X509 server and client certificates. @@ -480,19 +446,21 @@ func (b *backend) invalidate(ctx context.Context, key string) { if !strings.HasSuffix(key, "/confirmed") { cluster := split[len(split)-2] serial := split[len(split)-1] - // Only process confirmations on the perf primary. b.crlBuilder.addCertForRevocationCheck(cluster, serial) } else { if len(split) >= 3 { cluster := split[len(split)-3] serial := split[len(split)-2] + // Only process confirmations on the perf primary. The + // performance secondaries cannot remove other clusters' + // entries, and so do not need to track them (only to + // ignore them). On performance primary nodes though, + // we do want to track them to remove them. if !isNotPerfPrimary { b.crlBuilder.addCertForRevocationRemoval(cluster, serial) } } } - - b.Logger().Debug("got replicated cross-cluster revocation: " + key) } } diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 329dcf17de..a7c526db50 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -3952,6 +3952,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { "missing_issuer_cert_count": json.Number("0"), "current_cert_store_count": json.Number("0"), "current_revoked_cert_count": json.Number("0"), + "revocation_queue_deleted_count": json.Number("0"), } // Let's copy the times from the response so that we can use deep.Equal() timeStarted, ok := tidyStatus.Data["time_started"] diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index 9616354e31..cae71ae5af 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -508,9 +508,7 @@ func (cb *crlBuilder) maybeGatherQueueForFirstProcess(sc *storageContext, isNotP cb.revQueue.Add(entry) } else if !isNotPerfPrimary { cb.removalQueue.Add(entry) - } else { - sc.Backend.Logger().Debug(fmt.Sprintf("ignoring confirmed revoked serial %v: %v vs %v ", serial, err, removalEntry)) - } + } // Else, this is a confirmation but we're on a perf secondary so ignore it. // Overwrite the error; we don't really care about its contents // at this step. @@ -543,12 +541,27 @@ func (cb *crlBuilder) processRevocationQueue(sc *storageContext) error { removalQueue := cb.removalQueue.Iterate() sc.Backend.Logger().Debug(fmt.Sprintf("gathered %v revocations and %v confirmation entries", len(revQueue), len(removalQueue))) + crlConfig, err := cb.getConfigWithUpdate(sc) if err != nil { return err } + + ourClusterId, err := sc.Backend.System().ClusterID(sc.Context) + if err != nil { + return fmt.Errorf("unable to fetch clusterID to ignore local revocation entries: %w", err) + } + for _, req := range revQueue { - sc.Backend.Logger().Debug(fmt.Sprintf("handling revocation request: %v", req)) + // Regardless of whether we're on the perf primary or a secondary + // cluster, we can safely ignore revocation requests originating + // from our node, because we've already checked them once (when + // they were created). + if ourClusterId != "" && ourClusterId == req.Cluster { + continue + } + + // Fetch the revocation entry to ensure it exists. rPath := crossRevocationPrefix + req.Cluster + "/" + req.Serial entry, err := sc.Storage.Get(sc.Context, rPath) if err != nil { @@ -562,7 +575,6 @@ func (cb *crlBuilder) processRevocationQueue(sc *storageContext) error { } resp, err := tryRevokeCertBySerial(sc, crlConfig, req.Serial) - sc.Backend.Logger().Debug(fmt.Sprintf("checked local revocation entry: %v / %v", resp, err)) if err == nil && resp != nil && !resp.IsError() && resp.Data != nil && resp.Data["state"].(string) == "revoked" { if isNotPerfPrimary { // Write a revocation queue removal entry. @@ -597,7 +609,9 @@ func (cb *crlBuilder) processRevocationQueue(sc *storageContext) error { } if isNotPerfPrimary { - sc.Backend.Logger().Debug(fmt.Sprintf("not on perf primary so done; ignoring any revocation confirmations")) + sc.Backend.Logger().Debug(fmt.Sprintf("not on perf primary so ignoring any revocation confirmations")) + + // See note in pki/backend.go; this should be empty. cb.removalQueue.RemoveAll() cb.haveInitializedQueue = true return nil @@ -609,7 +623,6 @@ func (cb *crlBuilder) processRevocationQueue(sc *storageContext) error { } for _, entry := range removalQueue { - sc.Backend.Logger().Debug(fmt.Sprintf("handling revocation confirmation: %v", entry)) // First remove the revocation request. for cIndex, cluster := range clusters { eEntry := crossRevocationPrefix + cluster + entry.Serial diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index c2cb47a7af..8a31d75222 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -19,6 +19,40 @@ import ( var tidyCancelledError = errors.New("tidy operation cancelled") +type tidyStatusState int + +const ( + tidyStatusInactive tidyStatusState = iota + tidyStatusStarted = iota + tidyStatusFinished = iota + tidyStatusError = iota + tidyStatusCancelling = iota + tidyStatusCancelled = iota +) + +type tidyStatus struct { + // Parameters used to initiate the operation + safetyBuffer int + issuerSafetyBuffer int + tidyCertStore bool + tidyRevokedCerts bool + tidyRevokedAssocs bool + tidyExpiredIssuers bool + tidyBackupBundle bool + pauseDuration string + + // Status + state tidyStatusState + err error + timeStarted time.Time + timeFinished time.Time + message string + certStoreDeletedCount uint + revokedCertDeletedCount uint + missingIssuerCertCount uint + revQueueDeletedCount uint +} + type tidyConfig struct { Enabled bool `json:"enabled"` Interval time.Duration `json:"interval_duration"` @@ -683,8 +717,15 @@ func (b *backend) doTidyRevocationQueue(ctx context.Context, req *logical.Reques return fmt.Errorf("failed to list cross-cluster revocation queue participating clusters: %w", err) } + // Grab locks as we're potentially modifying revocation-related storage. + b.revokeStorageLock.Lock() + defer b.revokeStorageLock.Unlock() + for cIndex, cluster := range clusters { - cluster = cluster[0 : len(cluster)-1] + if cluster[len(cluster)-1] == '/' { + cluster = cluster[0 : len(cluster)-1] + } + cPath := crossRevocationPrefix + cluster + "/" serials, err := sc.Storage.List(sc.Context, cPath) if err != nil { @@ -692,17 +733,50 @@ func (b *backend) doTidyRevocationQueue(ctx context.Context, req *logical.Reques } for _, serial := range serials { + // Check for pause duration to reduce resource consumption. + if config.PauseDuration > (0 * time.Second) { + b.revokeStorageLock.Unlock() + time.Sleep(config.PauseDuration) + b.revokeStorageLock.Lock() + } + // Confirmation entries _should_ be handled by this cluster's // processRevocationQueue(...) invocation; if not, when the plugin // reloads, maybeGatherQueueForFirstProcess(...) will remove all - // stale confirmation requests. + // stale confirmation requests. However, we don't want to force an + // operator to reload their in-use plugin, so allow tidy to also + // clean up confirmation values without reloading. if serial[len(serial)-1] == '/' { - continue - } + // Check if we have a confirmed entry. + confirmedPath := cPath + serial + "confirmed" + removalEntry, err := sc.Storage.Get(sc.Context, confirmedPath) + if err != nil { + return fmt.Errorf("error reading revocation confirmation (%v) during tidy: %w", confirmedPath, err) + } + if removalEntry == nil { + continue + } - // Check for pause duration to reduce resource consumption. - if config.PauseDuration > (0 * time.Second) { - time.Sleep(config.PauseDuration) + // Remove potential revocation requests from all clusters. + for _, subCluster := range clusters { + if subCluster[len(subCluster)-1] == '/' { + subCluster = subCluster[0 : len(subCluster)-1] + } + + reqPath := subCluster + "/" + serial[0:len(serial)-1] + if err := sc.Storage.Delete(sc.Context, reqPath); err != nil { + return fmt.Errorf("failed to remove confirmed revocation request on candidate cluster (%v): %w", reqPath, err) + } + } + + // Then delete the confirmation. + if err := sc.Storage.Delete(sc.Context, confirmedPath); err != nil { + return fmt.Errorf("failed to remove confirmed revocation confirmation (%v): %w", confirmedPath, err) + } + + // No need to handle a revocation request at this path: it can't + // still exist on this cluster after we deleted it above. + continue } ePath := cPath + serial @@ -719,9 +793,7 @@ func (b *backend) doTidyRevocationQueue(ctx context.Context, req *logical.Reques return fmt.Errorf("error reading revocation request (%v) to tidy: %w", ePath, err) } - now := time.Now() - afterBuffer := now.Add(-1 * config.QueueSafetyBuffer) - if revRequest.RequestedAt.After(afterBuffer) { + if time.Since(revRequest.RequestedAt) > config.QueueSafetyBuffer { continue } @@ -729,6 +801,20 @@ func (b *backend) doTidyRevocationQueue(ctx context.Context, req *logical.Reques if err := sc.Storage.Delete(sc.Context, ePath); err != nil { return fmt.Errorf("error deleting revocation request (%v): %w", ePath, err) } + + // Assumption: there should never be a need to remove this from + // the processing queue on this node. We're on the active primary, + // so our writes don't cause invalidations. This means we'd have + // to have slated it for deletion very quickly after it'd been + // sent (i.e., inside of the 1-minute boundary that periodicFunc + // executes at). While this is possible, because we grab the + // revocationStorageLock above, we can't execute interleaved + // with that periodicFunc, so the periodicFunc would've had to + // finished before we actually did this deletion (or it wouldn't + // have ignored this serial because our deletion would've + // happened prior to it reading the storage entry). Thus we should + // be safe to ignore the revocation queue removal here. + b.tidyStatusIncRevQueueCount() } } @@ -782,6 +868,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f "missing_issuer_cert_count": nil, "current_cert_store_count": nil, "current_revoked_cert_count": nil, + "revocation_queue_deleted_count": nil, }, } @@ -802,6 +889,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount resp.Data["revoked_cert_deleted_count"] = b.tidyStatus.revokedCertDeletedCount resp.Data["missing_issuer_cert_count"] = b.tidyStatus.missingIssuerCertCount + resp.Data["revocation_queue_deleted_count"] = b.tidyStatus.revQueueDeletedCount switch b.tidyStatus.state { case tidyStatusStarted: @@ -1038,6 +1126,13 @@ func (b *backend) tidyStatusIncMissingIssuerCertCount() { b.tidyStatus.missingIssuerCertCount++ } +func (b *backend) tidyStatusIncRevQueueCount() { + b.tidyStatusLock.Lock() + defer b.tidyStatusLock.Unlock() + + b.tidyStatus.revQueueDeletedCount++ +} + const pathTidyHelpSyn = ` Tidy up the backend by removing expired certificates, revocation information, or both.