internal queue to track job runs vs pod

This commit is contained in:
2026-01-11 22:27:56 +08:00
parent 13f04e1d84
commit 409a114532
3 changed files with 115 additions and 68 deletions

View File

@@ -21,6 +21,7 @@ import (
"fmt"
"math/rand"
"strings"
"sync"
"time"
batchv1 "k8s.io/api/batch/v1"
@@ -40,8 +41,9 @@ import (
// RunnerGroupReconciler reconciles a RunnerGroup object
type RunnerGroupReconciler struct {
client.Client
Scheme *runtime.Scheme
GiteaClient gitea.Client
Scheme *runtime.Scheme
GiteaClient gitea.Client
SpawnedJobsCache sync.Map
}
// +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete
@@ -121,7 +123,7 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
effectiveLabels := r.getEffectiveLabels(runnerGroup.Spec.Labels)
// Query for queued workflow runs
queuedJobs, err := r.GiteaClient.GetQueuedRuns(
stats, err := r.GiteaClient.GetRunnerStats(
ctx,
runnerGroup.Spec.GiteaURL,
authToken,
@@ -131,45 +133,78 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
effectiveLabels,
)
if err != nil {
logger.Error(err, "Failed to query Gitea for queued runs")
logger.Error(err, "Failed to query Gitea for runner stats")
return ctrl.Result{RequeueAfter: 10 * time.Second}, err
}
logger.Info("Gitea query result", "queuedJobs", queuedJobs)
logger.Info("Gitea query result", "queuedJobs", len(stats.QueuedJobs))
// 6. Scale Up
// 6. Scale Up and Cache Management
availableSlots := runnerGroup.Spec.MaxActiveRunners - activeRunners
toSpawn := min(queuedJobs, availableSlots)
if toSpawn > 0 {
logger.Info("Spawning runners",
"queuedJobs", queuedJobs,
"availableSlots", availableSlots,
"toSpawn", toSpawn)
// Track current queued IDs for cache cleanup
currentQueuedIDs := make(map[int64]bool)
// Retrieve Registration Token from Secret
registrationToken, err := r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef)
// Retrieve Registration Token from Secret (only if we need to spawn)
var registrationToken string
tokenFetched := false
for _, giteaJob := range stats.QueuedJobs {
currentQueuedIDs[giteaJob.ID] = true
if availableSlots <= 0 {
continue
}
// Check if we already spawned a runner for this job
if value, loaded := r.SpawnedJobsCache.Load(giteaJob.ID); loaded {
spawnTime := value.(time.Time)
if time.Since(spawnTime) < 5*time.Minute {
// Already handling this job recently
continue
}
// TTL expired (runner likely failed to start), retry spawning
logger.Info("Job stuck in queue for too long, retrying runner spawn", "giteaJobID", giteaJob.ID)
}
// Need to spawn a runner
if !tokenFetched {
registrationToken, err = r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef)
if err != nil {
logger.Error(err, "Failed to get registration token from secret")
return ctrl.Result{}, err
}
tokenFetched = true
}
job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels)
if err != nil {
logger.Error(err, "Failed to get registration token from secret")
logger.Error(err, "Failed to construct Job")
return ctrl.Result{}, err
}
// Spawn jobs
for i := 0; i < toSpawn; i++ {
job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels)
if err != nil {
logger.Error(err, "Failed to construct Job")
return ctrl.Result{}, err
}
if err := r.Create(ctx, job); err != nil {
logger.Error(err, "Failed to create Job", "jobName", job.Name)
return ctrl.Result{}, err
}
logger.Info("Created Job", "jobName", job.Name)
if err := r.Create(ctx, job); err != nil {
logger.Error(err, "Failed to create Job", "jobName", job.Name)
return ctrl.Result{}, err
}
logger.Info("Created Job for Gitea Run", "jobName", job.Name, "giteaJobID", giteaJob.ID)
// Mark as spawned
r.SpawnedJobsCache.Store(giteaJob.ID, time.Now())
availableSlots--
}
// Cleanup cache: remove jobs that are no longer queued in Gitea
r.SpawnedJobsCache.Range(func(key, value any) bool {
jobID := key.(int64)
if !currentQueuedIDs[jobID] {
// Job is no longer in the queue (running, completed, or cancelled)
r.SpawnedJobsCache.Delete(key)
}
return true
})
// 7. Requeue for continuous polling
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}
@@ -239,6 +274,7 @@ func (r *RunnerGroupReconciler) constructJobForRunnerGroup(runnerGroup *giteav1a
{Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL},
{Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken},
{Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"},
{Name: "GITEA_RUNNER_NAME", Value: name},
{Name: "DOCKER_HOST", Value: "tcp://localhost:2376"},
{Name: "DOCKER_CERT_PATH", Value: "/certs/client"},
{Name: "DOCKER_TLS_VERIFY", Value: "1"},