diff --git a/cmd/gluetun/main.go b/cmd/gluetun/main.go index d81b9b20..d868cea2 100644 --- a/cmd/gluetun/main.go +++ b/cmd/gluetun/main.go @@ -253,9 +253,10 @@ func _main(ctx context.Context, buildInfo models.BuildInformation, } // TODO move inside firewall? wg := &sync.WaitGroup{} + healthy := make(chan bool) openvpnLooper := openvpn.NewLooper(allSettings.OpenVPN, nonRootUsername, puid, pgid, allServers, - ovpnConf, firewallConf, routingConf, logger, httpClient, os.OpenFile, tunnelReadyCh, cancel) + ovpnConf, firewallConf, routingConf, logger, httpClient, os.OpenFile, tunnelReadyCh, healthy, cancel) wg.Add(1) // wait for restartOpenvpn go openvpnLooper.Run(ctx, wg) @@ -302,7 +303,7 @@ func _main(ctx context.Context, buildInfo models.BuildInformation, healthcheckServer := healthcheck.NewServer( constants.HealthcheckAddress, logger) wg.Add(1) - go healthcheckServer.Run(ctx, wg) + go healthcheckServer.Run(ctx, healthy, wg) // Start openvpn for the first time in a blocking call // until openvpn is launched diff --git a/internal/healthcheck/health.go b/internal/healthcheck/health.go index 4cb90bfa..c51d3020 100644 --- a/internal/healthcheck/health.go +++ b/internal/healthcheck/health.go @@ -10,7 +10,7 @@ import ( "time" ) -func (s *server) runHealthcheckLoop(ctx context.Context, wg *sync.WaitGroup) { +func (s *server) runHealthcheckLoop(ctx context.Context, healthy chan<- bool, wg *sync.WaitGroup) { defer wg.Done() for { previousErr := s.handler.getErr() @@ -18,6 +18,12 @@ func (s *server) runHealthcheckLoop(ctx context.Context, wg *sync.WaitGroup) { err := healthCheck(ctx, s.resolver) s.handler.setErr(err) + // Notify the healthy channel, or not if it's already full + select { + case healthy <- err == nil: + default: + } + if previousErr != nil && err == nil { s.logger.Info("healthy!") } else if previousErr == nil && err != nil { @@ -36,8 +42,8 @@ func (s *server) runHealthcheckLoop(ctx context.Context, wg *sync.WaitGroup) { } continue } - // Success, check again in 10 minutes - const period = 10 * time.Minute + // Success, check again in 5 seconds + const period = 5 * time.Second timer := time.NewTimer(period) select { case <-ctx.Done(): diff --git a/internal/healthcheck/server.go b/internal/healthcheck/server.go index d7d42914..69b7c988 100644 --- a/internal/healthcheck/server.go +++ b/internal/healthcheck/server.go @@ -12,7 +12,7 @@ import ( ) type Server interface { - Run(ctx context.Context, wg *sync.WaitGroup) + Run(ctx context.Context, healthy chan<- bool, wg *sync.WaitGroup) } type server struct { @@ -32,12 +32,12 @@ func NewServer(address string, logger logging.Logger) Server { } } -func (s *server) Run(ctx context.Context, wg *sync.WaitGroup) { +func (s *server) Run(ctx context.Context, healthy chan<- bool, wg *sync.WaitGroup) { defer wg.Done() internalWg := &sync.WaitGroup{} internalWg.Add(1) - go s.runHealthcheckLoop(ctx, internalWg) + go s.runHealthcheckLoop(ctx, healthy, internalWg) server := http.Server{ Addr: s.address, diff --git a/internal/openvpn/loop.go b/internal/openvpn/loop.go index 750a5be3..7788561d 100644 --- a/internal/openvpn/loop.go +++ b/internal/openvpn/loop.go @@ -45,6 +45,7 @@ type looper struct { client *http.Client openFile os.OpenFileFunc tunnelReady chan<- struct{} + healthy <-chan bool cancel context.CancelFunc // Internal channels and locks loopLock sync.Mutex @@ -54,15 +55,19 @@ type looper struct { portForwardSignals chan net.IP crashed bool backoffTime time.Duration + healthWaitTime time.Duration } -const defaultBackoffTime = 15 * time.Second +const ( + defaultBackoffTime = 15 * time.Second + defaultHealthWaitTime = 6 * time.Second +) func NewLooper(settings configuration.OpenVPN, username string, puid, pgid int, allServers models.AllServers, conf Configurator, fw firewall.Configurator, routing routing.Routing, logger logging.Logger, client *http.Client, openFile os.OpenFileFunc, - tunnelReady chan<- struct{}, cancel context.CancelFunc) Looper { + tunnelReady chan<- struct{}, healthy <-chan bool, cancel context.CancelFunc) Looper { return &looper{ state: state{ status: constants.Stopped, @@ -80,6 +85,7 @@ func NewLooper(settings configuration.OpenVPN, client: client, openFile: openFile, tunnelReady: tunnelReady, + healthy: healthy, cancel: cancel, start: make(chan struct{}), running: make(chan models.LoopStatus), @@ -87,6 +93,7 @@ func NewLooper(settings configuration.OpenVPN, stopped: make(chan struct{}), portForwardSignals: make(chan net.IP), backoffTime: defaultBackoffTime, + healthWaitTime: defaultHealthWaitTime, } } @@ -215,6 +222,22 @@ func (l *looper) Run(ctx context.Context, wg *sync.WaitGroup) { //nolint:gocogni l.logAndWait(ctx, err) l.crashed = true stayHere = false + case healthy := <-l.healthy: + if healthy { + continue + } + // ensure it stays unhealthy for some time before restarting it + healthy = l.waitForHealth(ctx) + if healthy || ctx.Err() != nil { + continue + } + l.crashed = true // flag as crashed + l.state.setStatusWithLock(constants.Stopping) + l.logger.Warn("unhealthy program: restarting openvpn") + openvpnCancel() + <-waitError + l.state.setStatusWithLock(constants.Stopped) + stayHere = false } } close(waitError) @@ -240,6 +263,35 @@ func (l *looper) logAndWait(ctx context.Context, err error) { } } +// waitForHealth waits for a true healthy signal +// after restarting openvpn in order to avoid restarting +// openvpn in a loop as it requires a few seconds to connect. +func (l *looper) waitForHealth(ctx context.Context) (healthy bool) { + l.logger.Info("unhealthy program: waiting %s for it to change to healthy", l.healthWaitTime) + timer := time.NewTimer(l.healthWaitTime) + l.healthWaitTime *= 2 + for { + select { + case healthy = <-l.healthy: + if !healthy { + break + } + if !timer.Stop() { + <-timer.C + } + l.healthWaitTime = defaultHealthWaitTime + return true + case <-timer.C: + return false + case <-ctx.Done(): + if !timer.Stop() { + <-timer.C + } + return false + } + } +} + // portForward is a blocking operation which may or may not be infinite. // You should therefore always call it in a goroutine. func (l *looper) portForward(ctx context.Context, wg *sync.WaitGroup,