internal queue to track job runs vs pod

This commit is contained in:
2026-01-11 22:27:56 +08:00
parent 13f04e1d84
commit 409a114532
3 changed files with 115 additions and 68 deletions

View File

@@ -5,7 +5,7 @@ kind: Kustomization
images: images:
- name: controller - name: controller
newName: ghcr.io/bapung/gitea-runner-operator newName: ghcr.io/bapung/gitea-runner-operator
newTag: sha-a34dedb newTag: sha-13f04e1
patchesStrategicMerge: patchesStrategicMerge:
- image_pull_secret_patch.yaml - image_pull_secret_patch.yaml

View File

@@ -21,6 +21,7 @@ import (
"fmt" "fmt"
"math/rand" "math/rand"
"strings" "strings"
"sync"
"time" "time"
batchv1 "k8s.io/api/batch/v1" batchv1 "k8s.io/api/batch/v1"
@@ -40,8 +41,9 @@ import (
// RunnerGroupReconciler reconciles a RunnerGroup object // RunnerGroupReconciler reconciles a RunnerGroup object
type RunnerGroupReconciler struct { type RunnerGroupReconciler struct {
client.Client client.Client
Scheme *runtime.Scheme Scheme *runtime.Scheme
GiteaClient gitea.Client GiteaClient gitea.Client
SpawnedJobsCache sync.Map
} }
// +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete
@@ -121,7 +123,7 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
effectiveLabels := r.getEffectiveLabels(runnerGroup.Spec.Labels) effectiveLabels := r.getEffectiveLabels(runnerGroup.Spec.Labels)
// Query for queued workflow runs // Query for queued workflow runs
queuedJobs, err := r.GiteaClient.GetQueuedRuns( stats, err := r.GiteaClient.GetRunnerStats(
ctx, ctx,
runnerGroup.Spec.GiteaURL, runnerGroup.Spec.GiteaURL,
authToken, authToken,
@@ -131,45 +133,78 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
effectiveLabels, effectiveLabels,
) )
if err != nil { if err != nil {
logger.Error(err, "Failed to query Gitea for queued runs") logger.Error(err, "Failed to query Gitea for runner stats")
return ctrl.Result{RequeueAfter: 10 * time.Second}, err return ctrl.Result{RequeueAfter: 10 * time.Second}, err
} }
logger.Info("Gitea query result", "queuedJobs", queuedJobs) logger.Info("Gitea query result", "queuedJobs", len(stats.QueuedJobs))
// 6. Scale Up // 6. Scale Up and Cache Management
availableSlots := runnerGroup.Spec.MaxActiveRunners - activeRunners availableSlots := runnerGroup.Spec.MaxActiveRunners - activeRunners
toSpawn := min(queuedJobs, availableSlots)
if toSpawn > 0 { // Track current queued IDs for cache cleanup
logger.Info("Spawning runners", currentQueuedIDs := make(map[int64]bool)
"queuedJobs", queuedJobs,
"availableSlots", availableSlots,
"toSpawn", toSpawn)
// Retrieve Registration Token from Secret // Retrieve Registration Token from Secret (only if we need to spawn)
registrationToken, err := r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef) var registrationToken string
tokenFetched := false
for _, giteaJob := range stats.QueuedJobs {
currentQueuedIDs[giteaJob.ID] = true
if availableSlots <= 0 {
continue
}
// Check if we already spawned a runner for this job
if value, loaded := r.SpawnedJobsCache.Load(giteaJob.ID); loaded {
spawnTime := value.(time.Time)
if time.Since(spawnTime) < 5*time.Minute {
// Already handling this job recently
continue
}
// TTL expired (runner likely failed to start), retry spawning
logger.Info("Job stuck in queue for too long, retrying runner spawn", "giteaJobID", giteaJob.ID)
}
// Need to spawn a runner
if !tokenFetched {
registrationToken, err = r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef)
if err != nil {
logger.Error(err, "Failed to get registration token from secret")
return ctrl.Result{}, err
}
tokenFetched = true
}
job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels)
if err != nil { if err != nil {
logger.Error(err, "Failed to get registration token from secret") logger.Error(err, "Failed to construct Job")
return ctrl.Result{}, err return ctrl.Result{}, err
} }
// Spawn jobs if err := r.Create(ctx, job); err != nil {
for i := 0; i < toSpawn; i++ { logger.Error(err, "Failed to create Job", "jobName", job.Name)
job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels) return ctrl.Result{}, err
if err != nil {
logger.Error(err, "Failed to construct Job")
return ctrl.Result{}, err
}
if err := r.Create(ctx, job); err != nil {
logger.Error(err, "Failed to create Job", "jobName", job.Name)
return ctrl.Result{}, err
}
logger.Info("Created Job", "jobName", job.Name)
} }
logger.Info("Created Job for Gitea Run", "jobName", job.Name, "giteaJobID", giteaJob.ID)
// Mark as spawned
r.SpawnedJobsCache.Store(giteaJob.ID, time.Now())
availableSlots--
} }
// Cleanup cache: remove jobs that are no longer queued in Gitea
r.SpawnedJobsCache.Range(func(key, value any) bool {
jobID := key.(int64)
if !currentQueuedIDs[jobID] {
// Job is no longer in the queue (running, completed, or cancelled)
r.SpawnedJobsCache.Delete(key)
}
return true
})
// 7. Requeue for continuous polling // 7. Requeue for continuous polling
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
} }
@@ -239,6 +274,7 @@ func (r *RunnerGroupReconciler) constructJobForRunnerGroup(runnerGroup *giteav1a
{Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL}, {Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL},
{Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken}, {Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken},
{Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"}, {Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"},
{Name: "GITEA_RUNNER_NAME", Value: name},
{Name: "DOCKER_HOST", Value: "tcp://localhost:2376"}, {Name: "DOCKER_HOST", Value: "tcp://localhost:2376"},
{Name: "DOCKER_CERT_PATH", Value: "/certs/client"}, {Name: "DOCKER_CERT_PATH", Value: "/certs/client"},
{Name: "DOCKER_TLS_VERIFY", Value: "1"}, {Name: "DOCKER_TLS_VERIFY", Value: "1"},

View File

@@ -31,9 +31,8 @@ import (
// Client defines the interface for interacting with Gitea API // Client defines the interface for interacting with Gitea API
type Client interface { type Client interface {
// GetQueuedRuns queries Gitea for queued workflow runs matching the scope and labels // GetRunnerStats queries Gitea for queued workflow runs matching the scope and labels
// Returns the count of queued jobs that match the criteria GetRunnerStats(
GetQueuedRuns(
ctx context.Context, ctx context.Context,
giteaURL string, giteaURL string,
authToken string, authToken string,
@@ -41,7 +40,12 @@ type Client interface {
org string, org string,
repo string, repo string,
labels []string, labels []string,
) (int, error) ) (*RunnerStats, error)
}
// RunnerStats contains lists of jobs in different states
type RunnerStats struct {
QueuedJobs []ActionWorkflowJob
} }
// HTTPClient is the default implementation of the Gitea Client interface // HTTPClient is the default implementation of the Gitea Client interface
@@ -107,8 +111,8 @@ type ActionWorkflowJob struct {
RunnerName string `json:"runner_name"` RunnerName string `json:"runner_name"`
} }
// GetQueuedRuns implements the Client interface // GetRunnerStats implements the Client interface
func (c *HTTPClient) GetQueuedRuns( func (c *HTTPClient) GetRunnerStats(
ctx context.Context, ctx context.Context,
giteaURL string, giteaURL string,
authToken string, authToken string,
@@ -116,44 +120,51 @@ func (c *HTTPClient) GetQueuedRuns(
org string, org string,
repo string, repo string,
labels []string, labels []string,
) (int, error) { ) (*RunnerStats, error) {
switch scope { switch scope {
case v1alpha1.RunnerGroupScopeRepo: case v1alpha1.RunnerGroupScopeRepo:
return c.getQueuedRunsForRepo(ctx, giteaURL, authToken, org, repo, labels) return c.getRunnerStatsForRepo(ctx, giteaURL, authToken, org, repo, labels)
case v1alpha1.RunnerGroupScopeOrg: case v1alpha1.RunnerGroupScopeOrg:
return c.getQueuedRunsForOrg(ctx, giteaURL, authToken, org, labels) return c.getRunnerStatsForOrg(ctx, giteaURL, authToken, org, labels)
case v1alpha1.RunnerGroupScopeGlobal: case v1alpha1.RunnerGroupScopeGlobal:
return c.getQueuedRunsGlobal(ctx, giteaURL, authToken, labels) return c.getRunnerStatsGlobal(ctx, giteaURL, authToken, labels)
default: default:
return 0, fmt.Errorf("unknown scope: %s", scope) return nil, fmt.Errorf("unknown scope: %s", scope)
} }
} }
// getQueuedRunsForRepo fetches queued runs for a specific repository // getRunnerStatsForRepo fetches queued runs for a specific repository
func (c *HTTPClient) getQueuedRunsForRepo(ctx context.Context, giteaURL, authToken, owner, repo string, labels []string) (int, error) { func (c *HTTPClient) getRunnerStatsForRepo(ctx context.Context, giteaURL, authToken, owner, repo string, labels []string) (*RunnerStats, error) {
// Use jobs endpoint since it contains the runner labels we need for filtering
endpoint := fmt.Sprintf("%s/api/v1/repos/%s/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), owner, repo) endpoint := fmt.Sprintf("%s/api/v1/repos/%s/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), owner, repo)
return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels) return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
} }
// getQueuedRunsForOrg fetches queued runs for all repos under an organization // getRunnerStatsForOrg fetches queued runs for all repos under an organization
func (c *HTTPClient) getQueuedRunsForOrg(ctx context.Context, giteaURL, authToken, org string, labels []string) (int, error) { func (c *HTTPClient) getRunnerStatsForOrg(ctx context.Context, giteaURL, authToken, org string, labels []string) (*RunnerStats, error) {
// Use direct org-level jobs endpoint for better performance
endpoint := fmt.Sprintf("%s/api/v1/orgs/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), org) endpoint := fmt.Sprintf("%s/api/v1/orgs/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), org)
return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels) return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
} }
// getQueuedRunsGlobal fetches queued runs using admin-level API for global scope // getRunnerStatsGlobal fetches queued runs using admin-level API for global scope
func (c *HTTPClient) getQueuedRunsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (int, error) { func (c *HTTPClient) getRunnerStatsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (*RunnerStats, error) {
// Use admin-level jobs endpoint which provides global view of all queued jobs
endpoint := fmt.Sprintf("%s/api/v1/admin/actions/jobs", strings.TrimSuffix(giteaURL, "/")) endpoint := fmt.Sprintf("%s/api/v1/admin/actions/jobs", strings.TrimSuffix(giteaURL, "/"))
return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels) return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
}
func (c *HTTPClient) fetchRunnerStats(ctx context.Context, endpoint, authToken string, labels []string) (*RunnerStats, error) {
queuedJobs, err := c.fetchWorkflowJobs(ctx, endpoint, authToken, labels, []string{"queued", "waiting", "pending"})
if err != nil {
return nil, err
}
return &RunnerStats{
QueuedJobs: queuedJobs,
}, nil
} }
// fetchWorkflowJobs fetches workflow jobs from a given endpoint with label filtering and pagination // fetchWorkflowJobs fetches workflow jobs from a given endpoint with label filtering and pagination
func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken string, labels []string) (int, error) { func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken string, labels []string, statuses []string) ([]ActionWorkflowJob, error) {
totalCount := 0 var allJobs []ActionWorkflowJob
statuses := []string{"queued", "waiting", "pending"}
for _, status := range statuses { for _, status := range statuses {
page := 1 page := 1
@@ -174,7 +185,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
if err != nil { if err != nil {
return 0, err return nil, err
} }
req.Header.Set("Authorization", "token "+authToken) req.Header.Set("Authorization", "token "+authToken)
@@ -183,7 +194,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
resp, err := c.httpClient.Do(req) resp, err := c.httpClient.Do(req)
if err != nil { if err != nil {
fmt.Printf("DEBUG: Request failed: %v\n", err) fmt.Printf("DEBUG: Request failed: %v\n", err)
return 0, err return nil, err
} }
fmt.Printf("DEBUG: Response status: %s\n", resp.Status) fmt.Printf("DEBUG: Response status: %s\n", resp.Status)
@@ -192,7 +203,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
body, _ := io.ReadAll(resp.Body) body, _ := io.ReadAll(resp.Body)
resp.Body.Close() resp.Body.Close()
fmt.Printf("DEBUG: Error body: %s\n", string(body)) fmt.Printf("DEBUG: Error body: %s\n", string(body))
return 0, c.handleHTTPError(resp.StatusCode, body, "fetch workflow jobs") return nil, c.handleHTTPError(resp.StatusCode, body, "fetch workflow jobs")
} }
body, _ := io.ReadAll(resp.Body) body, _ := io.ReadAll(resp.Body)
@@ -202,15 +213,15 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
var result ActionWorkflowJobsResponse var result ActionWorkflowJobsResponse
if err := json.Unmarshal(body, &result); err != nil { if err := json.Unmarshal(body, &result); err != nil {
fmt.Printf("DEBUG: Failed to decode response: %v\n", err) fmt.Printf("DEBUG: Failed to decode response: %v\n", err)
return 0, err return nil, err
} }
fmt.Printf("DEBUG: Found %d jobs, total in Gitea: %d\n", len(result.Jobs), result.TotalCount) fmt.Printf("DEBUG: Found %d jobs, total in Gitea: %d\n", len(result.Jobs), result.TotalCount)
// Filter and count matching jobs for this page // Filter and collect matching jobs for this page
pageCount := c.filterQueuedJobs(result.Jobs, labels) matchedJobs := c.filterQueuedJobs(result.Jobs, labels)
fmt.Printf("DEBUG: %d jobs matched labels %v\n", pageCount, labels) fmt.Printf("DEBUG: %d jobs matched labels %v\n", len(matchedJobs), labels)
totalCount += pageCount allJobs = append(allJobs, matchedJobs...)
// Break if we've fetched all available results // Break if we've fetched all available results
if len(result.Jobs) < limit { if len(result.Jobs) < limit {
@@ -221,7 +232,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
} }
} }
return totalCount, nil return allJobs, nil
} }
// fetchWorkflowRuns fetches workflow runs from a given endpoint (deprecated - use jobs for label filtering) // fetchWorkflowRuns fetches workflow runs from a given endpoint (deprecated - use jobs for label filtering)
@@ -465,16 +476,16 @@ func (c *HTTPClient) fetchUserRepos(ctx context.Context, giteaURL, authToken str
} }
// filterQueuedJobs filters workflow jobs by labels // filterQueuedJobs filters workflow jobs by labels
func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) int { func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) []ActionWorkflowJob {
count := 0 var matched []ActionWorkflowJob
for _, job := range jobs { for _, job := range jobs {
match := c.jobMatchesLabels(job.Labels, runnerLabels) match := c.jobMatchesLabels(job.Labels, runnerLabels)
fmt.Printf("DEBUG: Job %d (Status: %s, Labels: %v) matches runner capabilities %v? %v\n", job.ID, job.Status, job.Labels, runnerLabels, match) fmt.Printf("DEBUG: Job %d (Status: %s, Labels: %v) matches runner capabilities %v? %v\n", job.ID, job.Status, job.Labels, runnerLabels, match)
if match { if match {
count++ matched = append(matched, job)
} }
} }
return count return matched
} }
// jobMatchesLabels checks if a job's requirements are satisfied by the runner's supported labels // jobMatchesLabels checks if a job's requirements are satisfied by the runner's supported labels