Files
xingrin/worker/internal/activity/runner.go

393 lines
10 KiB
Go

package activity
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"sync"
"syscall"
"time"
"github.com/orbit/worker/internal/pkg"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/mem"
"go.uber.org/zap"
)
// ansiRegex matches ANSI escape sequences (colors, cursor movement, etc.)
var ansiRegex = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]`)
// controlCharReplacer removes control characters in a single pass
var controlCharReplacer = strings.NewReplacer(
"\x00", "", // NUL
"\r", "", // CR
"\b", "", // Backspace
"\f", "", // Form feed
"\v", "", // Vertical tab
)
const (
DefaultDirPerm = 0755
ExitCodeTimeout = -1
ExitCodeError = -1
// System load thresholds
DefaultCPUThreshold = 90.0 // CPU usage percentage
DefaultMemThreshold = 80.0 // Memory usage percentage
LoadCheckInterval = 180 * time.Second // 3 minutes between checks
CommandStartupDelay = 5 * time.Second // Initial delay before first check
// Scanner buffer sizes
ScannerInitBufSize = 64 * 1024 // 64KB initial buffer
ScannerMaxBufSize = 1024 * 1024 // 1MB max buffer for long lines
)
// Result represents the result of an activity execution
type Result struct {
Name string
OutputFile string
LogFile string
ExitCode int
Duration time.Duration
Error error
}
// Command represents a command to execute
type Command struct {
Name string
Command string
OutputFile string
LogFile string
Timeout time.Duration
}
// Runner executes activities (external tools)
type Runner struct {
workDir string
}
// NewRunner creates a new activity runner
func NewRunner(workDir string) *Runner {
return &Runner{workDir: workDir}
}
// killProcessGroup terminates the entire process group
// When using shell=true (sh -c), the actual tool runs as a child of the shell.
// If we only kill the shell process, the child becomes an orphan and keeps running.
// By killing the process group, we ensure all child processes are terminated.
func killProcessGroup(cmd *exec.Cmd) {
if cmd == nil || cmd.Process == nil {
return
}
pid := cmd.Process.Pid
// Try to kill the process group first
// The negative PID signals the entire process group
if err := syscall.Kill(-pid, syscall.SIGKILL); err != nil {
pkg.Logger.Debug("Failed to kill process group, trying single process",
zap.Int("pid", pid),
zap.Error(err))
// Fallback: kill single process
_ = cmd.Process.Kill()
} else {
pkg.Logger.Debug("Killed process group", zap.Int("pgid", pid))
}
}
// waitForSystemLoad blocks until system CPU and memory usage are below thresholds.
// This prevents OOM when starting multiple concurrent commands.
func waitForSystemLoad(ctx context.Context) {
// Initial delay to let previous commands consume resources
select {
case <-ctx.Done():
return
case <-time.After(CommandStartupDelay):
}
for {
cpu, mem, err := getSystemLoad()
if err != nil {
pkg.Logger.Debug("Failed to get system load, skipping check", zap.Error(err))
return
}
if cpu < DefaultCPUThreshold && mem < DefaultMemThreshold {
return
}
pkg.Logger.Info("System load high, waiting before starting command",
zap.Float64("cpu", cpu),
zap.Float64("cpuThreshold", DefaultCPUThreshold),
zap.Float64("mem", mem),
zap.Float64("memThreshold", DefaultMemThreshold))
select {
case <-ctx.Done():
return
case <-time.After(LoadCheckInterval):
}
}
}
// getSystemLoad returns current CPU and memory usage percentages using gopsutil.
// Correctly handles container cgroup limits.
func getSystemLoad() (cpuPercent, memPercent float64, err error) {
// Get CPU usage (0 interval means since last call, false means aggregate all CPUs)
cpuPercents, err := cpu.Percent(0, false)
if err != nil {
return 0, 0, fmt.Errorf("failed to get CPU usage: %w", err)
}
if len(cpuPercents) == 0 {
return 0, 0, fmt.Errorf("no CPU data available")
}
// Get memory usage
memInfo, err := mem.VirtualMemory()
if err != nil {
return 0, 0, fmt.Errorf("failed to get memory usage: %w", err)
}
return cpuPercents[0], memInfo.UsedPercent, nil
}
// Run executes a single activity with streaming output
func (r *Runner) Run(ctx context.Context, cmd Command) *Result {
start := time.Now()
result := &Result{
Name: cmd.Name,
OutputFile: cmd.OutputFile,
LogFile: cmd.LogFile,
}
if ctx.Err() != nil {
result.Error = fmt.Errorf("context cancelled before execution: %w", ctx.Err())
result.ExitCode = ExitCodeError
return result
}
// Wait for system load to be acceptable before starting
waitForSystemLoad(ctx)
if ctx.Err() != nil {
result.Error = fmt.Errorf("context cancelled while waiting for system load: %w", ctx.Err())
result.ExitCode = ExitCodeError
return result
}
execCtx, cancel := context.WithTimeout(ctx, cmd.Timeout)
defer cancel()
execCmd := exec.CommandContext(execCtx, "sh", "-c", cmd.Command)
execCmd.Dir = r.workDir
// Create new process group so we can kill all child processes
execCmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
// Setup pipes for streaming
stdout, err := execCmd.StdoutPipe()
if err != nil {
result.Error = fmt.Errorf("failed to create stdout pipe: %w", err)
result.ExitCode = ExitCodeError
return result
}
stderr, err := execCmd.StderrPipe()
if err != nil {
result.Error = fmt.Errorf("failed to create stderr pipe: %w", err)
result.ExitCode = ExitCodeError
return result
}
// Prepare log file
logFile := r.prepareLogFile(cmd)
if logFile != nil {
defer func() { _ = logFile.Close() }()
r.writeLogHeader(logFile, cmd)
}
// Start command
if err := execCmd.Start(); err != nil {
result.Error = fmt.Errorf("failed to start command: %w", err)
result.ExitCode = ExitCodeError
return result
}
// Ensure process cleanup on any exit path
defer killProcessGroup(execCmd)
// Stream output
var wg sync.WaitGroup
wg.Add(2)
go r.streamOutput(&wg, stdout, logFile, cmd.Name, "stdout")
go r.streamOutput(&wg, stderr, logFile, cmd.Name, "stderr")
wg.Wait()
// Wait for command to finish
err = execCmd.Wait()
result.Duration = time.Since(start)
// Write duration to log
if logFile != nil {
r.writeLogFooter(logFile, result)
}
// Handle result
if execCtx.Err() == context.DeadlineExceeded {
result.Error = fmt.Errorf("activity execution timeout after %v", cmd.Timeout)
result.ExitCode = ExitCodeTimeout
pkg.Logger.Error("Activity timeout",
zap.String("activity", cmd.Name),
zap.Duration("timeout", cmd.Timeout))
return result
}
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
result.ExitCode = exitErr.ExitCode()
} else {
result.ExitCode = ExitCodeError
}
result.Error = fmt.Errorf("activity execution failed: %w", err)
pkg.Logger.Error("Activity failed",
zap.String("activity", cmd.Name),
zap.Int("exitCode", result.ExitCode),
zap.Error(err))
return result
}
result.ExitCode = 0
pkg.Logger.Info("Activity completed",
zap.String("activity", cmd.Name),
zap.Duration("duration", result.Duration))
return result
}
// RunParallel executes multiple activities in parallel
func (r *Runner) RunParallel(ctx context.Context, commands []Command) []*Result {
if len(commands) == 0 {
return nil
}
results := make([]*Result, len(commands))
var wg sync.WaitGroup
for i, cmd := range commands {
if ctx.Err() != nil {
results[i] = &Result{
Name: cmd.Name,
ExitCode: ExitCodeError,
Error: fmt.Errorf("context cancelled: %w", ctx.Err()),
}
continue
}
wg.Add(1)
go func(idx int, c Command) {
defer wg.Done()
results[idx] = r.Run(ctx, c)
}(i, cmd)
}
wg.Wait()
return results
}
func (r *Runner) prepareLogFile(cmd Command) *os.File {
if cmd.LogFile == "" {
return nil
}
dir := filepath.Dir(cmd.LogFile)
if err := os.MkdirAll(dir, DefaultDirPerm); err != nil {
pkg.Logger.Warn("Failed to create log directory",
zap.String("activity", cmd.Name),
zap.Error(err))
return nil
}
f, err := os.Create(cmd.LogFile)
if err != nil {
pkg.Logger.Warn("Failed to create log file",
zap.String("activity", cmd.Name),
zap.Error(err))
return nil
}
return f
}
func (r *Runner) streamOutput(wg *sync.WaitGroup, reader io.Reader, logFile *os.File, activityName, streamName string) {
defer wg.Done()
scanner := bufio.NewScanner(reader)
scanner.Buffer(make([]byte, ScannerInitBufSize), ScannerMaxBufSize)
for scanner.Scan() {
line := cleanLine(scanner.Text())
if line == "" {
continue
}
// Write to log file
if logFile != nil {
_, _ = fmt.Fprintln(logFile, line)
}
// Log debug output (optional, can be removed for less verbose logs)
pkg.Logger.Debug("Activity output",
zap.String("activity", activityName),
zap.String("stream", streamName),
zap.String("line", line))
}
if err := scanner.Err(); err != nil {
pkg.Logger.Warn("Error reading output stream",
zap.String("activity", activityName),
zap.String("stream", streamName),
zap.Error(err))
}
}
// cleanLine removes ANSI escape sequences and control characters from output
func cleanLine(line string) string {
line = ansiRegex.ReplaceAllString(line, "")
line = controlCharReplacer.Replace(line)
return strings.TrimSpace(line)
}
const logSeparator = "============================================================"
func (r *Runner) writeLogHeader(f *os.File, cmd Command) {
_, _ = fmt.Fprintf(f, "$ %s\n", cmd.Command)
_, _ = fmt.Fprintln(f, logSeparator)
_, _ = fmt.Fprintf(f, "# Tool: %s\n", cmd.Name)
_, _ = fmt.Fprintf(f, "# Started: %s\n", time.Now().Format("2006-01-02 15:04:05"))
_, _ = fmt.Fprintf(f, "# Timeout: %v\n", cmd.Timeout)
_, _ = fmt.Fprintln(f, "# Status: Running...")
_, _ = fmt.Fprintln(f, logSeparator)
_, _ = fmt.Fprintln(f)
}
func (r *Runner) writeLogFooter(f *os.File, result *Result) {
status := "✓ Success"
if result.ExitCode != 0 {
status = "✗ Failed"
}
_, _ = fmt.Fprintln(f)
_, _ = fmt.Fprintln(f, logSeparator)
_, _ = fmt.Fprintf(f, "# Finished: %s\n", time.Now().Format("2006-01-02 15:04:05"))
_, _ = fmt.Fprintf(f, "# Duration: %.2fs\n", result.Duration.Seconds())
_, _ = fmt.Fprintf(f, "# Exit Code: %d\n", result.ExitCode)
_, _ = fmt.Fprintf(f, "# Status: %s\n", status)
_, _ = fmt.Fprintln(f, logSeparator)
}