mirror of
https://github.com/bapung/gitea-runner-operator.git
synced 2026-06-22 07:58:44 +00:00
internal queue to track job runs vs pod
This commit is contained in:
@@ -5,7 +5,7 @@ kind: Kustomization
|
|||||||
images:
|
images:
|
||||||
- name: controller
|
- name: controller
|
||||||
newName: ghcr.io/bapung/gitea-runner-operator
|
newName: ghcr.io/bapung/gitea-runner-operator
|
||||||
newTag: sha-a34dedb
|
newTag: sha-13f04e1
|
||||||
|
|
||||||
patchesStrategicMerge:
|
patchesStrategicMerge:
|
||||||
- image_pull_secret_patch.yaml
|
- image_pull_secret_patch.yaml
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
batchv1 "k8s.io/api/batch/v1"
|
batchv1 "k8s.io/api/batch/v1"
|
||||||
@@ -42,6 +43,7 @@ type RunnerGroupReconciler struct {
|
|||||||
client.Client
|
client.Client
|
||||||
Scheme *runtime.Scheme
|
Scheme *runtime.Scheme
|
||||||
GiteaClient gitea.Client
|
GiteaClient gitea.Client
|
||||||
|
SpawnedJobsCache sync.Map
|
||||||
}
|
}
|
||||||
|
|
||||||
// +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete
|
// +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete
|
||||||
@@ -121,7 +123,7 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
|
|||||||
effectiveLabels := r.getEffectiveLabels(runnerGroup.Spec.Labels)
|
effectiveLabels := r.getEffectiveLabels(runnerGroup.Spec.Labels)
|
||||||
|
|
||||||
// Query for queued workflow runs
|
// Query for queued workflow runs
|
||||||
queuedJobs, err := r.GiteaClient.GetQueuedRuns(
|
stats, err := r.GiteaClient.GetRunnerStats(
|
||||||
ctx,
|
ctx,
|
||||||
runnerGroup.Spec.GiteaURL,
|
runnerGroup.Spec.GiteaURL,
|
||||||
authToken,
|
authToken,
|
||||||
@@ -131,31 +133,50 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
|
|||||||
effectiveLabels,
|
effectiveLabels,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error(err, "Failed to query Gitea for queued runs")
|
logger.Error(err, "Failed to query Gitea for runner stats")
|
||||||
return ctrl.Result{RequeueAfter: 10 * time.Second}, err
|
return ctrl.Result{RequeueAfter: 10 * time.Second}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Info("Gitea query result", "queuedJobs", queuedJobs)
|
logger.Info("Gitea query result", "queuedJobs", len(stats.QueuedJobs))
|
||||||
|
|
||||||
// 6. Scale Up
|
// 6. Scale Up and Cache Management
|
||||||
availableSlots := runnerGroup.Spec.MaxActiveRunners - activeRunners
|
availableSlots := runnerGroup.Spec.MaxActiveRunners - activeRunners
|
||||||
toSpawn := min(queuedJobs, availableSlots)
|
|
||||||
|
|
||||||
if toSpawn > 0 {
|
// Track current queued IDs for cache cleanup
|
||||||
logger.Info("Spawning runners",
|
currentQueuedIDs := make(map[int64]bool)
|
||||||
"queuedJobs", queuedJobs,
|
|
||||||
"availableSlots", availableSlots,
|
|
||||||
"toSpawn", toSpawn)
|
|
||||||
|
|
||||||
// Retrieve Registration Token from Secret
|
// Retrieve Registration Token from Secret (only if we need to spawn)
|
||||||
registrationToken, err := r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef)
|
var registrationToken string
|
||||||
|
tokenFetched := false
|
||||||
|
|
||||||
|
for _, giteaJob := range stats.QueuedJobs {
|
||||||
|
currentQueuedIDs[giteaJob.ID] = true
|
||||||
|
|
||||||
|
if availableSlots <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we already spawned a runner for this job
|
||||||
|
if value, loaded := r.SpawnedJobsCache.Load(giteaJob.ID); loaded {
|
||||||
|
spawnTime := value.(time.Time)
|
||||||
|
if time.Since(spawnTime) < 5*time.Minute {
|
||||||
|
// Already handling this job recently
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// TTL expired (runner likely failed to start), retry spawning
|
||||||
|
logger.Info("Job stuck in queue for too long, retrying runner spawn", "giteaJobID", giteaJob.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Need to spawn a runner
|
||||||
|
if !tokenFetched {
|
||||||
|
registrationToken, err = r.getSecretValue(ctx, runnerGroup.Namespace, runnerGroup.Spec.RegistrationTokenRef)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error(err, "Failed to get registration token from secret")
|
logger.Error(err, "Failed to get registration token from secret")
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
|
tokenFetched = true
|
||||||
|
}
|
||||||
|
|
||||||
// Spawn jobs
|
|
||||||
for i := 0; i < toSpawn; i++ {
|
|
||||||
job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels)
|
job, err := r.constructJobForRunnerGroup(runnerGroup, registrationToken, effectiveLabels)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error(err, "Failed to construct Job")
|
logger.Error(err, "Failed to construct Job")
|
||||||
@@ -166,9 +187,23 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request)
|
|||||||
logger.Error(err, "Failed to create Job", "jobName", job.Name)
|
logger.Error(err, "Failed to create Job", "jobName", job.Name)
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
logger.Info("Created Job", "jobName", job.Name)
|
|
||||||
|
logger.Info("Created Job for Gitea Run", "jobName", job.Name, "giteaJobID", giteaJob.ID)
|
||||||
|
|
||||||
|
// Mark as spawned
|
||||||
|
r.SpawnedJobsCache.Store(giteaJob.ID, time.Now())
|
||||||
|
availableSlots--
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cleanup cache: remove jobs that are no longer queued in Gitea
|
||||||
|
r.SpawnedJobsCache.Range(func(key, value any) bool {
|
||||||
|
jobID := key.(int64)
|
||||||
|
if !currentQueuedIDs[jobID] {
|
||||||
|
// Job is no longer in the queue (running, completed, or cancelled)
|
||||||
|
r.SpawnedJobsCache.Delete(key)
|
||||||
}
|
}
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
|
||||||
// 7. Requeue for continuous polling
|
// 7. Requeue for continuous polling
|
||||||
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
|
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
|
||||||
@@ -239,6 +274,7 @@ func (r *RunnerGroupReconciler) constructJobForRunnerGroup(runnerGroup *giteav1a
|
|||||||
{Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL},
|
{Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL},
|
||||||
{Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken},
|
{Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken},
|
||||||
{Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"},
|
{Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"},
|
||||||
|
{Name: "GITEA_RUNNER_NAME", Value: name},
|
||||||
{Name: "DOCKER_HOST", Value: "tcp://localhost:2376"},
|
{Name: "DOCKER_HOST", Value: "tcp://localhost:2376"},
|
||||||
{Name: "DOCKER_CERT_PATH", Value: "/certs/client"},
|
{Name: "DOCKER_CERT_PATH", Value: "/certs/client"},
|
||||||
{Name: "DOCKER_TLS_VERIFY", Value: "1"},
|
{Name: "DOCKER_TLS_VERIFY", Value: "1"},
|
||||||
|
|||||||
@@ -31,9 +31,8 @@ import (
|
|||||||
|
|
||||||
// Client defines the interface for interacting with Gitea API
|
// Client defines the interface for interacting with Gitea API
|
||||||
type Client interface {
|
type Client interface {
|
||||||
// GetQueuedRuns queries Gitea for queued workflow runs matching the scope and labels
|
// GetRunnerStats queries Gitea for queued workflow runs matching the scope and labels
|
||||||
// Returns the count of queued jobs that match the criteria
|
GetRunnerStats(
|
||||||
GetQueuedRuns(
|
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
giteaURL string,
|
giteaURL string,
|
||||||
authToken string,
|
authToken string,
|
||||||
@@ -41,7 +40,12 @@ type Client interface {
|
|||||||
org string,
|
org string,
|
||||||
repo string,
|
repo string,
|
||||||
labels []string,
|
labels []string,
|
||||||
) (int, error)
|
) (*RunnerStats, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunnerStats contains lists of jobs in different states
|
||||||
|
type RunnerStats struct {
|
||||||
|
QueuedJobs []ActionWorkflowJob
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTTPClient is the default implementation of the Gitea Client interface
|
// HTTPClient is the default implementation of the Gitea Client interface
|
||||||
@@ -107,8 +111,8 @@ type ActionWorkflowJob struct {
|
|||||||
RunnerName string `json:"runner_name"`
|
RunnerName string `json:"runner_name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetQueuedRuns implements the Client interface
|
// GetRunnerStats implements the Client interface
|
||||||
func (c *HTTPClient) GetQueuedRuns(
|
func (c *HTTPClient) GetRunnerStats(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
giteaURL string,
|
giteaURL string,
|
||||||
authToken string,
|
authToken string,
|
||||||
@@ -116,44 +120,51 @@ func (c *HTTPClient) GetQueuedRuns(
|
|||||||
org string,
|
org string,
|
||||||
repo string,
|
repo string,
|
||||||
labels []string,
|
labels []string,
|
||||||
) (int, error) {
|
) (*RunnerStats, error) {
|
||||||
switch scope {
|
switch scope {
|
||||||
case v1alpha1.RunnerGroupScopeRepo:
|
case v1alpha1.RunnerGroupScopeRepo:
|
||||||
return c.getQueuedRunsForRepo(ctx, giteaURL, authToken, org, repo, labels)
|
return c.getRunnerStatsForRepo(ctx, giteaURL, authToken, org, repo, labels)
|
||||||
case v1alpha1.RunnerGroupScopeOrg:
|
case v1alpha1.RunnerGroupScopeOrg:
|
||||||
return c.getQueuedRunsForOrg(ctx, giteaURL, authToken, org, labels)
|
return c.getRunnerStatsForOrg(ctx, giteaURL, authToken, org, labels)
|
||||||
case v1alpha1.RunnerGroupScopeGlobal:
|
case v1alpha1.RunnerGroupScopeGlobal:
|
||||||
return c.getQueuedRunsGlobal(ctx, giteaURL, authToken, labels)
|
return c.getRunnerStatsGlobal(ctx, giteaURL, authToken, labels)
|
||||||
default:
|
default:
|
||||||
return 0, fmt.Errorf("unknown scope: %s", scope)
|
return nil, fmt.Errorf("unknown scope: %s", scope)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// getQueuedRunsForRepo fetches queued runs for a specific repository
|
// getRunnerStatsForRepo fetches queued runs for a specific repository
|
||||||
func (c *HTTPClient) getQueuedRunsForRepo(ctx context.Context, giteaURL, authToken, owner, repo string, labels []string) (int, error) {
|
func (c *HTTPClient) getRunnerStatsForRepo(ctx context.Context, giteaURL, authToken, owner, repo string, labels []string) (*RunnerStats, error) {
|
||||||
// Use jobs endpoint since it contains the runner labels we need for filtering
|
|
||||||
endpoint := fmt.Sprintf("%s/api/v1/repos/%s/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), owner, repo)
|
endpoint := fmt.Sprintf("%s/api/v1/repos/%s/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), owner, repo)
|
||||||
return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels)
|
return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
|
||||||
}
|
}
|
||||||
|
|
||||||
// getQueuedRunsForOrg fetches queued runs for all repos under an organization
|
// getRunnerStatsForOrg fetches queued runs for all repos under an organization
|
||||||
func (c *HTTPClient) getQueuedRunsForOrg(ctx context.Context, giteaURL, authToken, org string, labels []string) (int, error) {
|
func (c *HTTPClient) getRunnerStatsForOrg(ctx context.Context, giteaURL, authToken, org string, labels []string) (*RunnerStats, error) {
|
||||||
// Use direct org-level jobs endpoint for better performance
|
|
||||||
endpoint := fmt.Sprintf("%s/api/v1/orgs/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), org)
|
endpoint := fmt.Sprintf("%s/api/v1/orgs/%s/actions/jobs", strings.TrimSuffix(giteaURL, "/"), org)
|
||||||
return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels)
|
return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
|
||||||
}
|
}
|
||||||
|
|
||||||
// getQueuedRunsGlobal fetches queued runs using admin-level API for global scope
|
// getRunnerStatsGlobal fetches queued runs using admin-level API for global scope
|
||||||
func (c *HTTPClient) getQueuedRunsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (int, error) {
|
func (c *HTTPClient) getRunnerStatsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (*RunnerStats, error) {
|
||||||
// Use admin-level jobs endpoint which provides global view of all queued jobs
|
|
||||||
endpoint := fmt.Sprintf("%s/api/v1/admin/actions/jobs", strings.TrimSuffix(giteaURL, "/"))
|
endpoint := fmt.Sprintf("%s/api/v1/admin/actions/jobs", strings.TrimSuffix(giteaURL, "/"))
|
||||||
return c.fetchWorkflowJobs(ctx, endpoint, authToken, labels)
|
return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *HTTPClient) fetchRunnerStats(ctx context.Context, endpoint, authToken string, labels []string) (*RunnerStats, error) {
|
||||||
|
queuedJobs, err := c.fetchWorkflowJobs(ctx, endpoint, authToken, labels, []string{"queued", "waiting", "pending"})
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &RunnerStats{
|
||||||
|
QueuedJobs: queuedJobs,
|
||||||
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchWorkflowJobs fetches workflow jobs from a given endpoint with label filtering and pagination
|
// fetchWorkflowJobs fetches workflow jobs from a given endpoint with label filtering and pagination
|
||||||
func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken string, labels []string) (int, error) {
|
func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken string, labels []string, statuses []string) ([]ActionWorkflowJob, error) {
|
||||||
totalCount := 0
|
var allJobs []ActionWorkflowJob
|
||||||
statuses := []string{"queued", "waiting", "pending"}
|
|
||||||
|
|
||||||
for _, status := range statuses {
|
for _, status := range statuses {
|
||||||
page := 1
|
page := 1
|
||||||
@@ -174,7 +185,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
|
|||||||
|
|
||||||
req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
|
req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
req.Header.Set("Authorization", "token "+authToken)
|
req.Header.Set("Authorization", "token "+authToken)
|
||||||
@@ -183,7 +194,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
|
|||||||
resp, err := c.httpClient.Do(req)
|
resp, err := c.httpClient.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Printf("DEBUG: Request failed: %v\n", err)
|
fmt.Printf("DEBUG: Request failed: %v\n", err)
|
||||||
return 0, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Printf("DEBUG: Response status: %s\n", resp.Status)
|
fmt.Printf("DEBUG: Response status: %s\n", resp.Status)
|
||||||
@@ -192,7 +203,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
|
|||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(resp.Body)
|
||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
fmt.Printf("DEBUG: Error body: %s\n", string(body))
|
fmt.Printf("DEBUG: Error body: %s\n", string(body))
|
||||||
return 0, c.handleHTTPError(resp.StatusCode, body, "fetch workflow jobs")
|
return nil, c.handleHTTPError(resp.StatusCode, body, "fetch workflow jobs")
|
||||||
}
|
}
|
||||||
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
body, _ := io.ReadAll(resp.Body)
|
||||||
@@ -202,15 +213,15 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
|
|||||||
var result ActionWorkflowJobsResponse
|
var result ActionWorkflowJobsResponse
|
||||||
if err := json.Unmarshal(body, &result); err != nil {
|
if err := json.Unmarshal(body, &result); err != nil {
|
||||||
fmt.Printf("DEBUG: Failed to decode response: %v\n", err)
|
fmt.Printf("DEBUG: Failed to decode response: %v\n", err)
|
||||||
return 0, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Printf("DEBUG: Found %d jobs, total in Gitea: %d\n", len(result.Jobs), result.TotalCount)
|
fmt.Printf("DEBUG: Found %d jobs, total in Gitea: %d\n", len(result.Jobs), result.TotalCount)
|
||||||
|
|
||||||
// Filter and count matching jobs for this page
|
// Filter and collect matching jobs for this page
|
||||||
pageCount := c.filterQueuedJobs(result.Jobs, labels)
|
matchedJobs := c.filterQueuedJobs(result.Jobs, labels)
|
||||||
fmt.Printf("DEBUG: %d jobs matched labels %v\n", pageCount, labels)
|
fmt.Printf("DEBUG: %d jobs matched labels %v\n", len(matchedJobs), labels)
|
||||||
totalCount += pageCount
|
allJobs = append(allJobs, matchedJobs...)
|
||||||
|
|
||||||
// Break if we've fetched all available results
|
// Break if we've fetched all available results
|
||||||
if len(result.Jobs) < limit {
|
if len(result.Jobs) < limit {
|
||||||
@@ -221,7 +232,7 @@ func (c *HTTPClient) fetchWorkflowJobs(ctx context.Context, endpoint, authToken
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return totalCount, nil
|
return allJobs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchWorkflowRuns fetches workflow runs from a given endpoint (deprecated - use jobs for label filtering)
|
// fetchWorkflowRuns fetches workflow runs from a given endpoint (deprecated - use jobs for label filtering)
|
||||||
@@ -465,16 +476,16 @@ func (c *HTTPClient) fetchUserRepos(ctx context.Context, giteaURL, authToken str
|
|||||||
}
|
}
|
||||||
|
|
||||||
// filterQueuedJobs filters workflow jobs by labels
|
// filterQueuedJobs filters workflow jobs by labels
|
||||||
func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) int {
|
func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) []ActionWorkflowJob {
|
||||||
count := 0
|
var matched []ActionWorkflowJob
|
||||||
for _, job := range jobs {
|
for _, job := range jobs {
|
||||||
match := c.jobMatchesLabels(job.Labels, runnerLabels)
|
match := c.jobMatchesLabels(job.Labels, runnerLabels)
|
||||||
fmt.Printf("DEBUG: Job %d (Status: %s, Labels: %v) matches runner capabilities %v? %v\n", job.ID, job.Status, job.Labels, runnerLabels, match)
|
fmt.Printf("DEBUG: Job %d (Status: %s, Labels: %v) matches runner capabilities %v? %v\n", job.ID, job.Status, job.Labels, runnerLabels, match)
|
||||||
if match {
|
if match {
|
||||||
count++
|
matched = append(matched, job)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return count
|
return matched
|
||||||
}
|
}
|
||||||
|
|
||||||
// jobMatchesLabels checks if a job's requirements are satisfied by the runner's supported labels
|
// jobMatchesLabels checks if a job's requirements are satisfied by the runner's supported labels
|
||||||
|
|||||||
Reference in New Issue
Block a user