gitcaddy-server/models/actions/runner_health.go

// Copyright 2026 MarketAlly. All rights reserved.
// SPDX-License-Identifier: MIT

package actions

import (
	"context"
	"time"

	"code.gitcaddy.com/server/models/db"
	"code.gitcaddy.com/server/modules/json"
	"code.gitcaddy.com/server/modules/log"
	"code.gitcaddy.com/server/modules/setting"
	"code.gitcaddy.com/server/modules/timeutil"
)

// RunnerCapabilities is the decoded form of a runner's CapabilitiesJSON
// payload (see ActionRunner.GetCapabilities).
//
// Disk, CPU and Bandwidth are pointers: a section that is missing from the
// payload stays nil, and GetHealthStatus skips the corresponding check in
// that case.
type RunnerCapabilities struct {
	OS        string         `json:"os"`
	Arch      string         `json:"arch"`
	Disk      *DiskInfo      `json:"disk"`
	CPU       *CPUInfo       `json:"cpu"`
	Bandwidth *BandwidthInfo `json:"bandwidth"`
}

// DiskInfo contains disk usage information reported by a runner.
//
// GetHealthStatus treats UsedPercent as a value on a 0-100 scale: it derives
// the free percentage as 100 - UsedPercent and copies UsedPercent and
// FreeBytes into the resulting RunnerHealthStatus.
type DiskInfo struct {
	TotalBytes  int64   `json:"total_bytes"`
	FreeBytes   int64   `json:"free_bytes"`
	UsedBytes   int64   `json:"used_bytes"`
	UsedPercent float64 `json:"used_percent"`
}

// CPUInfo contains CPU load information reported by a runner.
//
// Only LoadPercent is consulted by GetHealthStatus; the remaining fields are
// carried as reported.
type CPUInfo struct {
	NumCPU      int     `json:"num_cpu"`      // Number of logical CPUs
	LoadAvg1m   float64 `json:"load_avg_1m"`  // 1-minute load average
	LoadAvg5m   float64 `json:"load_avg_5m"`  // 5-minute load average
	LoadAvg15m  float64 `json:"load_avg_15m"` // 15-minute load average
	LoadPercent float64 `json:"load_percent"` // (load_avg_1m / num_cpu) * 100
}

// BandwidthInfo contains network performance information reported by a runner.
//
// GetHealthStatus only reads LatencyMs, which it compares against the
// configured MaxLatencyMs threshold. DownloadMbps and TestedAt are not used by
// the health evaluation in this file.
type BandwidthInfo struct {
	DownloadMbps float64   `json:"download_mbps"`
	LatencyMs    float64   `json:"latency_ms"`
	TestedAt     time.Time `json:"tested_at"`
}

// RunnerHealthStatus represents the health status of a runner as computed by
// ActionRunner.GetHealthStatus.
//
// Healthy is the overall verdict and is false as soon as any of DiskHealthy,
// CPUHealthy or LatencyHealthy is false. The numeric fields echo the values
// the runner reported and stay zero when the matching capability section was
// absent. Reason is a "; "-separated list of the failed checks, or a note
// that no capabilities were reported. NeedsCleanup is set when disk space is
// insufficient or disk usage has reached the configured cleanup threshold.
type RunnerHealthStatus struct {
	Healthy         bool    `json:"healthy"`
	DiskHealthy     bool    `json:"disk_healthy"`
	CPUHealthy      bool    `json:"cpu_healthy"`
	LatencyHealthy  bool    `json:"latency_healthy"`
	DiskUsedPercent float64 `json:"disk_used_percent"`
	DiskFreeBytes   int64   `json:"disk_free_bytes"`
	CPULoadPercent  float64 `json:"cpu_load_percent"`
	LatencyMs       float64 `json:"latency_ms"`
	Reason          string  `json:"reason,omitempty"`
	NeedsCleanup    bool    `json:"needs_cleanup"`
}

// GetCapabilities decodes the runner's CapabilitiesJSON into a
// RunnerCapabilities value.
//
// It returns nil when CapabilitiesJSON is empty or cannot be decoded; a decode
// failure is additionally logged together with the runner's name.
func (r *ActionRunner) GetCapabilities() *RunnerCapabilities {
	raw := r.CapabilitiesJSON
	if raw == "" {
		return nil
	}

	parsed := new(RunnerCapabilities)
	err := json.Unmarshal([]byte(raw), parsed)
	if err != nil {
		log.Error("Failed to parse runner %s capabilities: %v", r.Name, err)
		return nil
	}
	return parsed
}

// GetHealthStatus evaluates the runner's reported capabilities against the
// thresholds in setting.Actions.RunnerHealthCheck and returns the detailed
// result.
//
// A runner without usable capabilities is reported as healthy, with Reason
// set to "no capabilities reported". Otherwise the disk, CPU and latency
// checks run in that order, each only when the matching section was reported;
// every failed check clears Healthy and adds its description to Reason,
// joined with "; ".
func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus {
	result := &RunnerHealthStatus{
		Healthy:        true,
		DiskHealthy:    true,
		CPUHealthy:     true,
		LatencyHealthy: true,
	}

	caps := r.GetCapabilities()
	if caps == nil {
		// Nothing to evaluate: keep the optimistic defaults but say why.
		result.Reason = "no capabilities reported"
		return result
	}

	// fail records one failed check: it clears the overall flag and appends
	// the explanation to Reason, separating entries with "; ".
	fail := func(why string) {
		result.Healthy = false
		if result.Reason == "" {
			result.Reason = why
			return
		}
		result.Reason += "; " + why
	}

	limits := setting.Actions.RunnerHealthCheck

	// Disk: too little free space makes the runner unhealthy; crossing the
	// usage threshold alone only requests a cleanup.
	if disk := caps.Disk; disk != nil {
		result.DiskUsedPercent = disk.UsedPercent
		result.DiskFreeBytes = disk.FreeBytes

		if 100.0-disk.UsedPercent < limits.MinDiskPercent {
			result.DiskHealthy = false
			result.NeedsCleanup = true
			fail("insufficient disk space")
		}
		if disk.UsedPercent >= limits.MaxDiskUsagePercent {
			result.NeedsCleanup = true
		}
	}

	// CPU: compare the reported load percentage with the configured maximum.
	if cpu := caps.CPU; cpu != nil {
		result.CPULoadPercent = cpu.LoadPercent
		if cpu.LoadPercent > limits.MaxCPULoadPercent {
			result.CPUHealthy = false
			fail("CPU overloaded")
		}
	}

	// Latency: compare the reported latency with the configured maximum.
	if net := caps.Bandwidth; net != nil {
		result.LatencyMs = net.LatencyMs
		if net.LatencyMs > limits.MaxLatencyMs {
			result.LatencyHealthy = false
			fail("high latency")
		}
	}

	return result
}

// IsHealthy reports whether the runner is healthy enough for job assignment.
//
// When runner health checks are disabled in the settings every runner counts
// as healthy and the health status is not evaluated at all.
func (r *ActionRunner) IsHealthy() bool {
	checksEnabled := setting.Actions.RunnerHealthCheck.Enabled
	return !checksEnabled || r.GetHealthStatus().Healthy
}

// NeedsCleanup reports whether the runner should perform cleanup, as
// determined by GetHealthStatus. Unlike IsHealthy, it does not consult the
// RunnerHealthCheck.Enabled setting.
func (r *ActionRunner) NeedsCleanup() bool {
	return r.GetHealthStatus().NeedsCleanup
}

// RunnerCleanupRequest tracks cleanup requests sent to runners. It is stored
// in the "runner_cleanup_request" table (see TableName).
//
// A request is considered pending while CompletedAt is zero:
// GetPendingCleanupRequest selects rows with completed_at = 0, and
// CompleteCleanupRequest fills in CompletedAt, Success, BytesFreed and
// ErrorMsg. RequestedAt drives both the "latest request" ordering and the
// cooldown check in CanRequestCleanup.
type RunnerCleanupRequest struct {
	ID          int64              `xorm:"pk autoincr"`
	RunnerID    int64              `xorm:"INDEX NOT NULL"`
	RequestedAt timeutil.TimeStamp `xorm:"created INDEX"` // presumably filled in by xorm on insert via the "created" tag — confirm
	CompletedAt timeutil.TimeStamp `xorm:"INDEX"`
	Success     bool
	BytesFreed  int64
	ErrorMsg    string `xorm:"TEXT"`
}

// init registers RunnerCleanupRequest with the db package.
func init() {
	db.RegisterModel(&RunnerCleanupRequest{})
}

// TableName returns the name of the database table that stores
// RunnerCleanupRequest rows.
func (RunnerCleanupRequest) TableName() string {
	const table = "runner_cleanup_request"
	return table
}

// CreateCleanupRequest inserts a new cleanup request row for the given runner
// and returns it.
//
// The request value is returned together with any insert error, so callers
// must check the error before relying on the returned request.
func CreateCleanupRequest(ctx context.Context, runnerID int64) (*RunnerCleanupRequest, error) {
	created := &RunnerCleanupRequest{RunnerID: runnerID}
	if _, err := db.GetEngine(ctx).Insert(created); err != nil {
		return created, err
	}
	return created, nil
}

// GetLastCleanupRequest returns the cleanup request with the most recent
// requested_at value for the given runner.
//
// It returns (nil, nil) when the runner has no cleanup requests, and
// (nil, err) when the query fails.
func GetLastCleanupRequest(ctx context.Context, runnerID int64) (*RunnerCleanupRequest, error) {
	var latest RunnerCleanupRequest
	found, err := db.GetEngine(ctx).
		Where("runner_id = ?", runnerID).
		OrderBy("requested_at DESC").
		Limit(1).
		Get(&latest)
	switch {
	case err != nil:
		return nil, err
	case !found:
		return nil, nil
	}
	return &latest, nil
}

// GetPendingCleanupRequest returns the newest cleanup request for the given
// runner that has not been completed yet (completed_at is still 0).
//
// It returns (nil, nil) when no such request exists, and (nil, err) when the
// query fails.
func GetPendingCleanupRequest(ctx context.Context, runnerID int64) (*RunnerCleanupRequest, error) {
	pending := new(RunnerCleanupRequest)
	query := db.GetEngine(ctx).
		Where("runner_id = ? AND completed_at = 0", runnerID).
		OrderBy("requested_at DESC").
		Limit(1)

	found, err := query.Get(pending)
	if err != nil || !found {
		// err is nil here when the row simply does not exist.
		return nil, err
	}
	return pending, nil
}

// CanRequestCleanup reports whether a new cleanup request may be issued for
// the given runner.
//
// A request is allowed when the runner has no earlier request, or when at
// least RunnerHealthCheck.CleanupCooldown has elapsed since the most recent
// one was requested. A lookup failure yields (false, err).
func CanRequestCleanup(ctx context.Context, runnerID int64) (bool, error) {
	previous, err := GetLastCleanupRequest(ctx, runnerID)
	switch {
	case err != nil:
		return false, err
	case previous == nil:
		return true, nil
	}

	elapsed := time.Since(previous.RequestedAt.AsTime())
	return elapsed >= setting.Actions.RunnerHealthCheck.CleanupCooldown, nil
}

// CompleteCleanupRequest marks a cleanup request as completed
func CompleteCleanupRequest(ctx context.Context, id int64, success bool, bytesFreed int64, errorMsg string) error {
	_, err := db.GetEngine(ctx).ID(id).Cols("completed_at", "success", "bytes_freed", "error_msg").Update(&RunnerCleanupRequest{
		CompletedAt: timeutil.TimeStampNow(),
		Success:     success,
		BytesFreed:  bytesFreed,
		ErrorMsg:    errorMsg,
	})
	return err
}

// GetUnhealthyRunners returns every runner that is currently online and fails
// IsHealthy.
//
// Runners are loaded with the filter deleted_unix = 0; offline runners are
// skipped without being health-checked. The result is nil when no runner
// qualifies.
//
// NOTE(review): the filter assumes ActionRunner has a deleted_unix column —
// confirm against the ActionRunner model definition.
func GetUnhealthyRunners(ctx context.Context) ([]*ActionRunner, error) {
	var candidates []*ActionRunner
	if err := db.GetEngine(ctx).Where("deleted_unix = 0").Find(&candidates); err != nil {
		return nil, err
	}

	var failing []*ActionRunner
	for _, runner := range candidates {
		// Short-circuit keeps offline runners from being health-checked.
		if runner.IsOnline() && !runner.IsHealthy() {
			failing = append(failing, runner)
		}
	}
	return failing, nil
}