feat(hosterrorscache): add Remove and MarkFailedOrRemove methods (#5984)

* feat(hosterrorscache): add `Remove` and `MarkFailedOrRemove` methods and also deprecating `MarkFailed` Signed-off-by: Dwi Siswanto <git@dw1.io> * refactor(*): unwraps `hosterrorscache\.MarkFailed` invocation Signed-off-by: Dwi Siswanto <git@dw1.io> * feat(hosterrorscache): add sync in `Check` and `MarkFailedOrRemove` methods * test(hosterrorscache): add concurrent test for `Check` method * refactor(hosterrorscache): do NOT change `MarkFailed` behavior Signed-off-by: Dwi Siswanto <git@dw1.io> * feat(*): use `MarkFailedOrRemove` explicitly Signed-off-by: Dwi Siswanto <git@dw1.io> --------- Signed-off-by: Dwi Siswanto <git@dw1.io>
2026-02-01 08:13:09 +08:00 · 2025-01-31 17:16:57 +07:00
parent 5a52e93113
commit 052fd8b79a
8 changed files with 185 additions and 86 deletions
--- a/pkg/protocols/common/hosterrorscache/hosterrorscache.go
+++ b/pkg/protocols/common/hosterrorscache/hosterrorscache.go
@@ -1,6 +1,7 @@
 package hosterrorscache

 import (
+	"errors"
 	"net"
 	"net/url"
 	"regexp"
@@ -20,10 +21,12 @@ import (
 // CacheInterface defines the signature of the hosterrorscache so that
 // users of Nuclei as embedded lib may implement their own cache
 type CacheInterface interface {
-	SetVerbose(verbose bool)                                          // log verbosely
-	Close()                                                           // close the cache
-	Check(protoType string, ctx *contextargs.Context) bool            // return true if the host should be skipped
-	MarkFailed(protoType string, ctx *contextargs.Context, err error) // record a failure (and cause) for the host
+	SetVerbose(verbose bool)                                                  // log verbosely
+	Close()                                                                   // close the cache
+	Check(protoType string, ctx *contextargs.Context) bool                    // return true if the host should be skipped
+	Remove(ctx *contextargs.Context)                                          // remove a host from the cache
+	MarkFailed(protoType string, ctx *contextargs.Context, err error)         // record a failure (and cause) for the host
+	MarkFailedOrRemove(protoType string, ctx *contextargs.Context, err error) // record a failure (and cause) for the host or remove it
 }

 var (
@@ -47,16 +50,20 @@ type cacheItem struct {
 	errors         atomic.Int32
 	isPermanentErr bool
 	cause          error // optional cause
+	mu             sync.Mutex
 }

 const DefaultMaxHostsCount = 10000

 // New returns a new host max errors cache
 func New(maxHostError, maxHostsCount int, trackError []string) *Cache {
-	gc := gcache.New[string, *cacheItem](maxHostsCount).
-		ARC().
-		Build()
-	return &Cache{failedTargets: gc, MaxHostError: maxHostError, TrackError: trackError}
+	gc := gcache.New[string, *cacheItem](maxHostsCount).ARC().Build()
+
+	return &Cache{
+		failedTargets: gc,
+		MaxHostError:  maxHostError,
+		TrackError:    trackError,
+	}
 }

 // SetVerbose sets the cache to log at verbose level
@@ -118,47 +125,108 @@ func (c *Cache) NormalizeCacheValue(value string) string {
 func (c *Cache) Check(protoType string, ctx *contextargs.Context) bool {
 	finalValue := c.GetKeyFromContext(ctx, nil)

-	existingCacheItem, err := c.failedTargets.GetIFPresent(finalValue)
+	cache, err := c.failedTargets.GetIFPresent(finalValue)
 	if err != nil {
 		return false
 	}
-	if existingCacheItem.isPermanentErr {
+
+	cache.mu.Lock()
+	defer cache.mu.Unlock()
+
+	if cache.isPermanentErr {
 		// skipping permanent errors is expected so verbose instead of info
-		gologger.Verbose().Msgf("Skipped %s from target list as found unresponsive permanently: %s", finalValue, existingCacheItem.cause)
+		gologger.Verbose().Msgf("Skipped %s from target list as found unresponsive permanently: %s", finalValue, cache.cause)
 		return true
 	}

-	if existingCacheItem.errors.Load() >= int32(c.MaxHostError) {
-		existingCacheItem.Do(func() {
-			gologger.Info().Msgf("Skipped %s from target list as found unresponsive %d times", finalValue, existingCacheItem.errors.Load())
+	if cache.errors.Load() >= int32(c.MaxHostError) {
+		cache.Do(func() {
+			gologger.Info().Msgf("Skipped %s from target list as found unresponsive %d times", finalValue, cache.errors.Load())
 		})
 		return true
 	}
+
 	return false
 }

+// Remove removes a host from the cache
+func (c *Cache) Remove(ctx *contextargs.Context) {
+	key := c.GetKeyFromContext(ctx, nil)
+	_ = c.failedTargets.Remove(key) // remove even the cache is not present
+}
+
 // MarkFailed marks a host as failed previously
+//
+// Deprecated: Use MarkFailedOrRemove instead.
 func (c *Cache) MarkFailed(protoType string, ctx *contextargs.Context, err error) {
-	if !c.checkError(protoType, err) {
+	if err == nil {
 		return
 	}
-	finalValue := c.GetKeyFromContext(ctx, err)
-	existingCacheItem, err := c.failedTargets.GetIFPresent(finalValue)
-	if err != nil || existingCacheItem == nil {
-		newItem := &cacheItem{errors: atomic.Int32{}}
-		newItem.errors.Store(1)
-		if errkit.IsKind(err, errkit.ErrKindNetworkPermanent) {
-			// skip this address altogether
-			// permanent errors are always permanent hence this is created once
-			// and never updated so no need to synchronize
-			newItem.isPermanentErr = true
-			newItem.cause = err
-		}
-		_ = c.failedTargets.Set(finalValue, newItem)
+
+	c.MarkFailedOrRemove(protoType, ctx, err)
+}
+
+// MarkFailedOrRemove marks a host as failed previously or removes it
+func (c *Cache) MarkFailedOrRemove(protoType string, ctx *contextargs.Context, err error) {
+	if err != nil && !c.checkError(protoType, err) {
 		return
 	}
-	existingCacheItem.errors.Add(1)
-	_ = c.failedTargets.Set(finalValue, existingCacheItem)
+
+	if err == nil {
+		// Remove the host from cache
+		//
+		// NOTE(dwisiswant0): The decision was made to completely remove the
+		// cached entry for the host instead of simply decrementing the error
+		// count (using `(atomic.Int32).Swap` to update the value to `N-1`).
+		// This approach was chosen because the error handling logic operates
+		// concurrently, and decrementing the count could lead to UB (unexpected
+		// behavior) even when the error is `nil`.
+		//
+		// To clarify, consider the following scenario where the error
+		// encountered does NOT belong to the permanent network error category
+		// (`errkit.ErrKindNetworkPermanent`):
+		//
+		// 1. Iteration 1: A timeout error occurs, and the error count for the
+		//    host is incremented.
+		// 2. Iteration 2: Another timeout error is encountered, leading to
+		//    another increment in the host's error count.
+		// 3. Iteration 3: A third timeout error happens, which increments the
+		//    error count further. At this point, the host is flagged as
+		//    unresponsive.
+		// 4. Iteration 4: The host becomes reachable (no error or a transient
+		//    issue resolved). Instead of performing a no-op and leaving the
+		//    host in the cache, the host entry is removed entirely to reset its
+		//    state.
+		// 5. Iteration 5: A subsequent timeout error occurs after the host was
+		//    removed and re-added to the cache. The error count is reset and
+		//    starts from 1 again.
+		//
+		// This removal strategy ensures the cache is updated dynamically to
+		// reflect the current state of the host without persisting stale or
+		// irrelevant error counts that could interfere with future error
+		// handling and tracking logic.
+		c.Remove(ctx)
+
+		return
+	}
+
+	cacheKey := c.GetKeyFromContext(ctx, err)
+	cache, cacheErr := c.failedTargets.GetIFPresent(cacheKey)
+	if errors.Is(cacheErr, gcache.KeyNotFoundError) {
+		cache = &cacheItem{errors: atomic.Int32{}}
+	}
+
+	cache.mu.Lock()
+	defer cache.mu.Unlock()
+
+	if errkit.IsKind(err, errkit.ErrKindNetworkPermanent) {
+		cache.isPermanentErr = true
+	}
+
+	cache.cause = err
+	cache.errors.Add(1)
+
+	_ = c.failedTargets.Set(cacheKey, cache)
 }

 // GetKeyFromContext returns the key for the cache from the context