From 409a114532d5bd1169284968aa2d5162cf77cb56 Mon Sep 17 00:00:00 2001 From: Bapung Date: Sun, 11 Jan 2026 22:27:56 +0800 Subject: [PATCH] internal queue to track job runs vs pod --- config/manager/kustomization.yaml | 2 +- internal/controller/runnergroup_controller.go | 92 +++++++++++++------ internal/gitea/client.go | 89 ++++++++++-------- 3 files changed, 115 insertions(+), 68 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 81dc759..86cca29 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,7 +5,7 @@ kind: Kustomization images: - name: controller newName: ghcr.io/bapung/gitea-runner-operator - newTag: sha-a34dedb + newTag: sha-13f04e1 patchesStrategicMerge: - image_pull_secret_patch.yaml diff --git a/internal/controller/runnergroup_controller.go b/internal/controller/runnergroup_controller.go index 7ca0448..ba1030b 100644 --- a/internal/controller/runnergroup_controller.go +++ b/internal/controller/runnergroup_controller.go @@ -21,6 +21,7 @@ import ( "fmt" "math/rand" "strings" + "sync" "time" batchv1 "k8s.io/api/batch/v1" @@ -40,8 +41,9 @@ import ( // RunnerGroupReconciler reconciles a RunnerGroup object type RunnerGroupReconciler struct { client.Client - Scheme *runtime.Scheme - GiteaClient gitea.Client + Scheme *runtime.Scheme + GiteaClient gitea.Client + SpawnedJobsCache sync.Map } // +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete @@ -121,7 +123,7 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) effectiveLabels := r.getEffectiveLabels(runnerGroup.Spec.Labels) // Query for queued workflow runs - queuedJobs, err := r.GiteaClient.GetQueuedRuns( + stats, err := r.GiteaClient.GetRunnerStats( ctx, runnerGroup.Spec.GiteaURL, authToken, @@ -131,45 +133,78 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) effectiveLabels, ) if err != nil { - logger.Error(err, "Failed to query Gitea for queued runs") + logger.Error(err, "Failed to query Gitea for runner stats") return ctrl.Result{RequeueAfter: 10 * time.Second}, err } - logger.Info("Gitea query result", "queuedJobs", queuedJobs) + logger.Info("Gitea query result", "queuedJobs", len(stats.QueuedJobs)) - // 6. Scale Up + // 6. Scale Up and Cache Management availableSlots := runnerGroup.Spec.MaxActiveRunners - activeRunners - toSpawn := min(queuedJobs, availableSlots) - if toSpawn > 0 { - logger.Info("Spawning runners", - "queuedJobs", queuedJobs, - "availableSlots", availableSlots, - "toSpawn", toSpawn) + // Track current queued IDs for cache cleanup + currentQueuedIDs := make(map[int64]bool) - // Retrieve Registration Token from Secret - registrationToken, err := r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef) + // Retrieve Registration Token from Secret (only if we need to spawn) + var registrationToken string + tokenFetched := false + + for _, giteaJob := range stats.QueuedJobs { + currentQueuedIDs[giteaJob.ID] = true + + if availableSlots <= 0 { + continue + } + + // Check if we already spawned a runner for this job + if value, loaded := r.SpawnedJobsCache.Load(giteaJob.ID); loaded { + spawnTime := value.(time.Time) + if time.Since(spawnTime) < 5*time.Minute { + // Already handling this job recently + continue + } + // TTL expired (runner likely failed to start), retry spawning + logger.Info("Job stuck in queue for too long, retrying runner spawn", "giteaJobID", giteaJob.ID) + } + + // Need to spawn a runner + if !tokenFetched { + registrationToken, err = r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef) + if err != nil { + logger.Error(err, "Failed to get registration token from secret") + return ctrl.Result{}, err + } + tokenFetched = true + } + + job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels) if err != nil { - logger.Error(err, "Failed to get registration token from secret") + logger.Error(err, "Failed to construct Job") return ctrl.Result{}, err } - // Spawn jobs - for i := 0; i < toSpawn; i++ { - job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels) - if err != nil { - logger.Error(err, "Failed to construct Job") - return ctrl.Result{}, err - } - - if err := r.Create(ctx, job); err != nil { - logger.Error(err, "Failed to create Job", "jobName", job.Name) - return ctrl.Result{}, err - } - logger.Info("Created Job", "jobName", job.Name) + if err := r.Create(ctx, job); err != nil { + logger.Error(err, "Failed to create Job", "jobName", job.Name) + return ctrl.Result{}, err } + + logger.Info("Created Job for Gitea Run", "jobName", job.Name, "giteaJobID", giteaJob.ID) + + // Mark as spawned + r.SpawnedJobsCache.Store(giteaJob.ID, time.Now()) + availableSlots-- } + // Cleanup cache: remove jobs that are no longer queued in Gitea + r.SpawnedJobsCache.Range(func(key, value any) bool { + jobID := key.(int64) + if !currentQueuedIDs[jobID] { + // Job is no longer in the queue (running, completed, or cancelled) + r.SpawnedJobsCache.Delete(key) + } + return true + }) + // 7. Requeue for continuous polling return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } @@ -239,6 +274,7 @@ func (r *RunnerGroupReconciler) constructJobForRunnerGroup(runnerGroup *giteav1a {Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL}, {Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken}, {Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"}, + {Name: "GITEA_RUNNER_NAME", Value: name}, {Name: "DOCKER_HOST", Value: "tcp://localhost:2376"}, {Name: "DOCKER_CERT_PATH", Value: "/certs/client"}, {Name: "DOCKER_TLS_VERIFY", Value: "1"}, diff --git a/internal/gitea/client.go b/internal/gitea/client.go index 53676b8..30f5ffd 100644 --- a/internal/gitea/client.go +++ b/internal/gitea/client.go @@ -31,9 +31,8 @@ import ( // Client defines the interface for interacting with Gitea API type Client interface { - // GetQueuedRuns queries Gitea for queued workflow runs matching the scope and labels - // Returns the count of queued jobs that match the criteria - GetQueuedRuns( + // GetRunnerStats queries Gitea for queued workflow runs matching the scope and labels + GetRunnerStats( ctx context.Context, giteaURL string, authToken string, @@ -41,7 +40,12 @@ type Client interface { org string, repo string, labels []string, - ) (int, error) + ) (*RunnerStats, error) +} + +// RunnerStats contains lists of jobs in different states +type RunnerStats struct { + QueuedJobs []ActionWorkflowJob } // HTTPClient is the default implementation of the Gitea Client interface @@ -107,8 +111,8 @@ type ActionWorkflowJob struct { RunnerName string `json:"runner_name"` } -// GetQueuedRuns implements the Client interface -func (c *HTTPClient) GetQueuedRuns( +// GetRunnerStats implements the Client interface +func (c *HTTPClient) GetRunnerStats( ctx context.Context, giteaURL string, authToken string, @@ -116,44 +120,51 @@ func (c *HTTPClient) GetQueuedRuns( org string, repo string, labels []string, -) (int, error) { +) (*RunnerStats, error) { switch scope { case v1alpha1.RunnerGroupScopeRepo: - return c.getQueuedRunsForRepo(ctx, giteaURL, authToken, org, repo, labels) + return c.getRunnerStatsForRepo(ctx, giteaURL, authToken, org, repo, labels) case v1alpha1.RunnerGroupScopeOrg: - return c.getQueuedRunsForOrg(ctx, giteaURL, authToken, org, labels) + return c.getRunnerStatsForOrg(ctx, giteaURL, authToken, org, labels) case v1alpha1.RunnerGroupScopeGlobal: - return c.getQueuedRunsGlobal(ctx, giteaURL, authToken, labels) + return c.getRunnerStatsGlobal(ctx, giteaURL, authToken, labels) default: - return 0, fmt.Errorf("unknown scope: %s", scope) + return nil, fmt.Errorf("unknown scope: %s", scope) } } -// getQueuedRunsForRepo fetches queued runs for a specific repository -func (c *HTTPClient) getQueuedRunsForRepo(ctx context.Context, giteaURL, authToken, owner, repo string, labels []string) (int, error) { - // Use jobs endpoint since it contains the runner labels we need for filtering +// getRunnerStatsForRepo fetches queued runs for a specific repository +func (c *HTTPClient) getRunnerStatsForRepo(ctx context.Context, giteaURL, authToken, owner, repo string, labels []string) (*RunnerStats, error) { endpoint := fmt.Sprintf("%s/api/v1/repos/%s/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), owner, repo) - return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels) + return c.fetchRunnerStats(ctx, endpoint, authToken, labels) } -// getQueuedRunsForOrg fetches queued runs for all repos under an organization -func (c *HTTPClient) getQueuedRunsForOrg(ctx context.Context, giteaURL, authToken, org string, labels []string) (int, error) { - // Use direct org-level jobs endpoint for better performance +// getRunnerStatsForOrg fetches queued runs for all repos under an organization +func (c *HTTPClient) getRunnerStatsForOrg(ctx context.Context, giteaURL, authToken, org string, labels []string) (*RunnerStats, error) { endpoint := fmt.Sprintf("%s/api/v1/orgs/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), org) - return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels) + return c.fetchRunnerStats(ctx, endpoint, authToken, labels) } -// getQueuedRunsGlobal fetches queued runs using admin-level API for global scope -func (c *HTTPClient) getQueuedRunsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (int, error) { - // Use admin-level jobs endpoint which provides global view of all queued jobs +// getRunnerStatsGlobal fetches queued runs using admin-level API for global scope +func (c *HTTPClient) getRunnerStatsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (*RunnerStats, error) { endpoint := fmt.Sprintf("%s/api/v1/admin/actions/jobs", strings.TrimSuffix(giteaURL, "/")) - return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels) + return c.fetchRunnerStats(ctx, endpoint, authToken, labels) +} + +func (c *HTTPClient) fetchRunnerStats(ctx context.Context, endpoint, authToken string, labels []string) (*RunnerStats, error) { + queuedJobs, err := c.fetchWorkflowJobs(ctx, endpoint, authToken, labels, []string{"queued", "waiting", "pending"}) + if err != nil { + return nil, err + } + + return &RunnerStats{ + QueuedJobs: queuedJobs, + }, nil } // fetchWorkflowJobs fetches workflow jobs from a given endpoint with label filtering and pagination -func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken string, labels []string) (int, error) { - totalCount := 0 - statuses := []string{"queued", "waiting", "pending"} +func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken string, labels []string, statuses []string) ([]ActionWorkflowJob, error) { + var allJobs []ActionWorkflowJob for _, status := range statuses { page := 1 @@ -174,7 +185,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) if err != nil { - return 0, err + return nil, err } req.Header.Set("Authorization", "token "+authToken) @@ -183,7 +194,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken resp, err := c.httpClient.Do(req) if err != nil { fmt.Printf("DEBUG: Request failed: %v\n", err) - return 0, err + return nil, err } fmt.Printf("DEBUG: Response status: %s\n", resp.Status) @@ -192,7 +203,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken body, _ := io.ReadAll(resp.Body) resp.Body.Close() fmt.Printf("DEBUG: Error body: %s\n", string(body)) - return 0, c.handleHTTPError(resp.StatusCode, body, "fetch workflow jobs") + return nil, c.handleHTTPError(resp.StatusCode, body, "fetch workflow jobs") } body, _ := io.ReadAll(resp.Body) @@ -202,15 +213,15 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken var result ActionWorkflowJobsResponse if err := json.Unmarshal(body, &result); err != nil { fmt.Printf("DEBUG: Failed to decode response: %v\n", err) - return 0, err + return nil, err } fmt.Printf("DEBUG: Found %d jobs, total in Gitea: %d\n", len(result.Jobs), result.TotalCount) - // Filter and count matching jobs for this page - pageCount := c.filterQueuedJobs(result.Jobs, labels) - fmt.Printf("DEBUG: %d jobs matched labels %v\n", pageCount, labels) - totalCount += pageCount + // Filter and collect matching jobs for this page + matchedJobs := c.filterQueuedJobs(result.Jobs, labels) + fmt.Printf("DEBUG: %d jobs matched labels %v\n", len(matchedJobs), labels) + allJobs = append(allJobs, matchedJobs...) // Break if we've fetched all available results if len(result.Jobs) < limit { @@ -221,7 +232,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken } } - return totalCount, nil + return allJobs, nil } // fetchWorkflowRuns fetches workflow runs from a given endpoint (deprecated - use jobs for label filtering) @@ -465,16 +476,16 @@ func (c *HTTPClient) fetchUserRepos(ctx context.Context, giteaURL, authToken str } // filterQueuedJobs filters workflow jobs by labels -func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) int { - count := 0 +func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) []ActionWorkflowJob { + var matched []ActionWorkflowJob for _, job := range jobs { match := c.jobMatchesLabels(job.Labels, runnerLabels) fmt.Printf("DEBUG: Job %d (Status: %s, Labels: %v) matches runner capabilities %v? %v\n", job.ID, job.Status, job.Labels, runnerLabels, match) if match { - count++ + matched = append(matched, job) } } - return count + return matched } // jobMatchesLabels checks if a job's requirements are satisfied by the runner's supported labels