From ce5c7644025d46b69315c12a4e36924eea7a08f9 Mon Sep 17 00:00:00 2001 From: Bapung Date: Mon, 12 Jan 2026 21:07:05 +0800 Subject: [PATCH] add user scope --- .github/workflows/lint.yml | 2 +- .github/workflows/test-e2e.yml | 32 --- .github/workflows/test.yml | 2 +- Makefile | 2 +- README.md | 5 +- api/v1alpha1/runnergroup_types.go | 10 +- config/manager/kustomization.yaml | 2 +- .../samples/gitea_v1alpha1_runnergroup.yaml | 7 +- implementation.md | 205 +++++++----------- internal/controller/runnergroup_controller.go | 1 + internal/gitea/client.go | 90 ++++++++ internal/gitea/client_test.go | 37 +++- specification.md | 118 +++++----- 13 files changed, 291 insertions(+), 222 deletions(-) delete mode 100644 .github/workflows/test-e2e.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 86e3845..2681d30 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,4 +20,4 @@ jobs: - name: Run linter uses: golangci/golangci-lint-action@v8 with: - version: v2.1.0 + version: v2.7.2 diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml deleted file mode 100644 index 68fd1ed..0000000 --- a/.github/workflows/test-e2e.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: E2E Tests - -on: - push: - pull_request: - -jobs: - test-e2e: - name: Run on Ubuntu - runs-on: ubuntu-latest - steps: - - name: Clone the code - uses: actions/checkout@v4 - - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Install the latest version of kind - run: | - curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64 - chmod +x ./kind - sudo mv ./kind /usr/local/bin/kind - - - name: Verify kind installation - run: kind version - - - name: Running Test e2e - run: | - go mod tidy - make test-e2e diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fc2e80d..0cfb3e3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,4 +20,4 @@ jobs: - name: Running 
Tests run: | go mod tidy - make test + make test ENVTEST_K8S_VERSION=1.31 diff --git a/Makefile b/Makefile index 8b5a24c..24a323e 100644 --- a/Makefile +++ b/Makefile @@ -242,7 +242,7 @@ CONTROLLER_TOOLS_VERSION ?= v0.18.0 ENVTEST_VERSION ?= $(shell go list -m -f "{{ .Version }}" sigs.k8s.io/controller-runtime | awk -F'[v.]' '{printf "release-%d.%d", $$2, $$3}') #ENVTEST_K8S_VERSION is the version of Kubernetes to use for setting up ENVTEST binaries (i.e. 1.31) ENVTEST_K8S_VERSION ?= $(shell go list -m -f "{{ .Version }}" k8s.io/api | awk -F'[v.]' '{printf "1.%d", $$3}') -GOLANGCI_LINT_VERSION ?= v2.1.0 +GOLANGCI_LINT_VERSION ?= v2.7.2 .PHONY: kustomize kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. diff --git a/README.md b/README.md index bb37d2d..76f92dd 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ metadata: name: my-repo-runner-1 namespace: gitea-runner-system spec: - scope: repo - org: myorg # optional; ommited if scope == global + scope: repo # valid options: global, org, user, repo + org: myorg # optional; omitted if scope == global; mutually exclusive with user + user: myusername # optional; omitted if scope == global; mutually exclusive with org repo: myreponame # optional; ommited if scope == org || scope == global gitea: url: https://gitea.bpg.pw diff --git a/api/v1alpha1/runnergroup_types.go b/api/v1alpha1/runnergroup_types.go index 656c572..3e4b52f 100644 --- a/api/v1alpha1/runnergroup_types.go +++ b/api/v1alpha1/runnergroup_types.go @@ -32,14 +32,16 @@ const ( RunnerGroupScopeGlobal RunnerGroupScope = "global" // RunnerGroupScopeOrg means the runner group is scoped to an organization RunnerGroupScopeOrg RunnerGroupScope = "org" + // RunnerGroupScopeUser means the runner group is scoped to a user + RunnerGroupScopeUser RunnerGroupScope = "user" // RunnerGroupScopeRepo means the runner group is scoped to a repository RunnerGroupScopeRepo RunnerGroupScope = "repo" ) // RunnerGroupSpec defines the desired state of
RunnerGroup. type RunnerGroupSpec struct { - // Scope defines the scope of the runner (global, org, repo) - // +kubebuilder:validation:Enum=global;org;repo + // Scope defines the scope of the runner (global, org, user, repo) + // +kubebuilder:validation:Enum=global;org;user;repo // +kubebuilder:validation:Required Scope RunnerGroupScope `json:"scope"` @@ -47,6 +49,10 @@ type RunnerGroupSpec struct { // +optional Org string `json:"org,omitempty"` + // User is required if scope is 'user' + // +optional + User string `json:"user,omitempty"` + // Repo is required if scope is 'repo' // +optional Repo string `json:"repo,omitempty"` diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 86cca29..f48d7b4 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,7 +5,7 @@ kind: Kustomization images: - name: controller newName: ghcr.io/bapung/gitea-runner-operator - newTag: sha-13f04e1 + newTag: sha-6bc93a2 patchesStrategicMerge: - image_pull_secret_patch.yaml diff --git a/config/samples/gitea_v1alpha1_runnergroup.yaml b/config/samples/gitea_v1alpha1_runnergroup.yaml index 2ae0feb..cda507a 100644 --- a/config/samples/gitea_v1alpha1_runnergroup.yaml +++ b/config/samples/gitea_v1alpha1_runnergroup.yaml @@ -7,7 +7,7 @@ metadata: app.kubernetes.io/managed-by: kustomize stringData: # The Gitea API Token (for the Operator to poll for jobs) - auth-token: "3430680995113a33a17715bb552882d504f5cf98" + auth-token: "<gitea-api-token>" # The Runner Registration Token (for the Runner to register itself) registration-token: "5r4lpLA9rKCZZEHyUyKHeA187DoaElcTBySITRRi" --- @@ -23,9 +23,10 @@ spec: giteaURL: "https://gitea.bpg.pw" # Scope of the runners (global, org, or repo) - scope: "repo" + scope: "org" org: "bapung" # Required if scope is 'org' or 'repo' - repo: "dummy-service-workflow" # Required if scope is 'repo' + user: "" # Required if scope is 'user'; leave empty otherwise + #repo: "dummy-service-workflow"
# Required if scope is 'repo' # Labels to identify this runner group labels: diff --git a/implementation.md b/implementation.md index 9ef2d52..598ace2 100644 --- a/implementation.md +++ b/implementation.md @@ -30,18 +30,23 @@ type RunnerGroupScope string const ( RunnerGroupScopeGlobal RunnerGroupScope = "global" RunnerGroupScopeOrg RunnerGroupScope = "org" + RunnerGroupScopeUser RunnerGroupScope = "user" RunnerGroupScopeRepo RunnerGroupScope = "repo" ) type RunnerGroupSpec struct { - // Scope defines the scope of the runner (global, org, repo) - // +kubebuilder:validation:Enum=global;org;repo + // Scope defines the scope of the runner (global, org, user, repo) + // +kubebuilder:validation:Enum=global;org;user;repo Scope RunnerScope `json:"scope"` // Org is required if scope is 'org' // +optional Org string `json:"org,omitempty"` + // User is required if scope is 'user' + // +optional + User string `json:"user,omitempty"` + // Repo is required if scope is 'repo' // +optional Repo string `json:"repo,omitempty"` @@ -49,7 +54,8 @@ type RunnerGroupSpec struct { // GiteaURL is the base URL of the Gitea instance GiteaURL string `json:"giteaURL"` - // Labels to assign to the runner + // Labels to assign to the runner. + // Defaults (e.g. ubuntu-latest) are merged automatically by the controller. // +optional Labels []string `json:"labels,omitempty"` @@ -79,154 +85,103 @@ type RunnerGroupStatus struct { ## 4. Controller Implementation (`internal/controller/runnergroup_controller.go`) -The controller handles the reconciliation loop. +The controller handles the reconciliation loop and manages the lifecycle of ephemeral runners. -### 4.1 RBAC Permissions +### 4.1 Struct Definition -Add markers to generate RBAC roles: +The reconciler includes a thread-safe map to cache spawned jobs and prevent duplicate scheduling. 
```go -// +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=gitea.bpg.pw,resources=runnergroups/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch +type RunnerGroupReconciler struct { + client.Client + Scheme *runtime.Scheme + GiteaClient gitea.Client + SpawnedJobsCache sync.Map // Stores [int64]time.Time (JobID -> SpawnTime) +} ``` ### 4.2 Reconcile Logic -The `Reconcile` function should follow this flow: +The `Reconcile` function follows this flow: -1. **Fetch RunnerGroup**: Get the `RunnerGroup` CR instance. If not found, ignore (deleted). -2. **List Jobs**: List all `batchv1.Job` resources in the same namespace that are owned by this RunnerGroup. - - Filter by label `gitea.bpg.pw/runnergroup-name=`. -3. **Update Status**: Update `status.activeRunners` with the count of non-completed jobs. -4. **Capacity Check**: - - If `activeRunners >= spec.maxActiveRunners`, stop and requeue. -5. **Poll Gitea**: - - Retrieve the Auth Token from the Secret referenced in `spec.authToken`. - - Instantiate a Gitea API Client. - - Query for queued workflow runs matching the scope and labels. -6. **Scale Up**: - - Calculate `needed = count(queued_jobs)`. - - Calculate `available_slots = spec.maxActiveRunners - activeRunners`. - - `to_spawn = min(needed, available_slots)`. - - Loop `to_spawn` times: - - Create a new `batchv1.Job`. -7. **Requeue**: Return `ctrl.Result{RequeueAfter: 10 * time.Second}` to ensure continuous polling. +1. **Fetch RunnerGroup**: Get the `RunnerGroup` CR instance. +2. **List Jobs**: List all `batchv1.Job` resources owned by this CR to calculate `activeRunners`. +3. **Update Status**: Update `status.activeRunners`. +4. **Capacity Check**: Stop scaling if `activeRunners >= spec.maxActiveRunners`. +5. 
**Label Calculation**: Call `getEffectiveLabels` to merge `spec.labels` with hardcoded Gitea defaults (e.g., `ubuntu-latest:docker://node:16-bullseye`). +6. **Poll Gitea**: + - Retrieve Auth Token. + - Call `GiteaClient.GetRunnerStats` with the effective labels. + - This returns a list of `QueuedJobs`. +7. **Scale Up & Deduplication**: + - Iterate through `stats.QueuedJobs`. + - **Check Cache**: If Job ID exists in `SpawnedJobsCache`: + - If TTL (< 5 min) is valid: **Skip** (already handled). + - If TTL expired: **Retry** (assume previous runner failed). + - If Job ID not in cache or expired: + - Check `availableSlots`. + - Retrieve Registration Token (if not yet fetched). + - **Spawn Job**: Create `batchv1.Job`. + - **Update Cache**: Store Job ID in `SpawnedJobsCache`. + - Decrement `availableSlots`. +8. **Cache Cleanup**: Remove IDs from `SpawnedJobsCache` if they are not present in the latest `QueuedJobs` list from Gitea. +9. **Requeue**: Return `ctrl.Result{RequeueAfter: 10 * time.Second}`. -### 4.3 Job Construction +### 4.3 Helper Functions -Helper function to create the Job object: +#### getEffectiveLabels -```go -func (r *RunnerGroupReconciler) constructJobForRunnerGroup(runnerGroup *giteav1alpha1.RunnerGroup, registrationToken string) (*batchv1.Job, error) { - // Generate random suffix for name - name := fmt.Sprintf("%s-%s", runnerGroup.Name, randString(5)) +Merges user-defined labels with Gitea defaults. If a user defines `ubuntu-latest`, it overrides the default `ubuntu-latest:docker://...`. - // Construct Env Vars - envVars := []corev1.EnvVar{ - {Name: "GITEA_INSTANCE_URL", Value: runnerGroup.Spec.GiteaURL}, - {Name: "GITEA_RUNNER_REGISTRATION_TOKEN", Value: registrationToken}, - {Name: "GITEA_RUNNER_EPHEMERAL", Value: "true"}, - {Name: "DOCKER_HOST", Value: "tcp://localhost:2376"}, - // ... 
other envs from README - } +#### constructJobForRunnerGroup - if len(runnerGroup.Spec.Labels) > 0 { - labelsStr := strings.Join(runnerGroup.Spec.Labels, ",") - envVars = append(envVars, corev1.EnvVar{Name: "GITEA_RUNNER_LABELS", Value: labelsStr}) - } +Creates the Job object with: - // Construct Job - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: runnerGroup.Namespace, - Labels: map[string]string{ - "app": runnerGroup.Name, - "gitea.bpg.pw/runnergroup-name": runnerGroup.Name, - "gitea.bpg.pw/managed-by": "gitea-runner-operator", - }, - }, - Spec: batchv1.JobSpec{ - TTLSecondsAfterFinished: pointer.Int32(600), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - RestartPolicy: corev1.RestartPolicyOnFailure, - Containers: []corev1.Container{ - { - Name: "runner", - Image: "gitea/act_runner:nightly-dind-rootless", - ImagePullPolicy: corev1.PullAlways, - SecurityContext: &corev1.SecurityContext{Privileged: pointer.Bool(true)}, - Env: envVars, - VolumeMounts: []corev1.VolumeMount{ - {Name: "runner-data", MountPath: "/data"}, - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "runner-data", - VolumeSource: corev1.VolumeSource{ - PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ - ClaimName: "act-runner-vol", // Note: Consider making this configurable or EmptyDir - }, - }, - }, - }, - }, - }, - }, - } - - // Set Controller Reference - if err := ctrl.SetControllerReference(runnerGroup, job, r.Scheme); err != nil { - return nil, err - } - - return job, nil -} -``` +- **Name**: `{runnergroup-name}-{random-suffix}` +- **Env**: + - `GITEA_RUNNER_NAME`: Set to the Job name. + - `GITEA_RUNNER_LABELS`: Comma-separated effective labels. + - Standard runner envs (`GITEA_INSTANCE_URL`, etc). ## 5. Gitea Client (`internal/gitea/client.go`) -A simple HTTP client wrapper to interact with Gitea. +A specialized client to interact with Gitea's Actions API. 
### 5.1 Interface ```go +type RunnerStats struct { + QueuedJobs []ActionWorkflowJob + Running int +} + type Client interface { - GetQueuedRuns(ctx context.Context, scope RunnerGroupScope, owner, repo string, labels []string) (int, error) + GetRunnerStats(ctx context.Context, giteaURL, authToken string, scope RunnerGroupScope, org, user, repo string, labels []string) (*RunnerStats, error) } ``` -### 5.2 Implementation Details +### 5.2 Logic -- **Endpoint**: `/api/v1/repos/{owner}/{repo}/actions/runs` -- **Query Params**: `status=queued` -- **Filtering**: - - The API might return all queued runs. - - The client must filter these runs locally to ensure they match the `labels` defined in the RunnerGroup CR. - - _Note_: Gitea API might not support filtering by labels directly in the list endpoint, so client-side filtering is necessary. +1. **Endpoints**: + - Repo/Org/Global: Uses `/actions/jobs` endpoints. + - User: Fetches repos via `/users/{user}/repos`, then queries `/actions/jobs` for each repo. +2. **Fetching**: + - Fetches jobs with `status=queued`, `waiting`, `pending`. + - Handles pagination (fetches all pages). +3. **Filtering**: + - Iterates through fetched jobs. + - **Matches Labels**: Checks if the job's required labels are a subset of the runner's supported labels (effective labels). + - Supports exact match (`linux` == `linux`) + - Supports schema match (`ubuntu-latest` matches `ubuntu-latest:docker://...`) + - Returns only matching jobs in `QueuedJobs`. -## 6. Configuration & Deployment +## 6. Testing Strategy -### 6.1 Dockerfile - -Standard Operator SDK Dockerfile. Ensure the base image is minimal (e.g., `gcr.io/distroless/static:nonroot`). - -### 6.2 Kustomize - -Update `config/default/kustomization.yaml` to include the CRD and RBAC configurations. - -## 7. Testing Strategy - -1. **Unit Tests**: - - Test `constructJobForRunnerGroup` to ensure Env vars and Labels are set correctly. - - Test Gitea Client response parsing. -2.
**Integration Tests (EnvTest)**: - - Spin up a local k8s control plane. - - Create a `RunnerGroup` CR. - - Verify the controller creates a `Job` when the mocked Gitea client returns queued jobs. - - Verify the controller respects `MaxActiveRunners`. +1. **Unit Tests (`internal/gitea/client_test.go`)**: + - Mock Gitea API server. + - Verify `GetRunnerStats` correctly parses JSON and handles pagination. + - Verify label matching logic (subset, schema matching). +2. **Controller Tests**: + - Verify `SpawnedJobsCache` prevents double scheduling. + - Verify TTL logic allows retries for stuck jobs. + - Verify `getEffectiveLabels` merging logic. diff --git a/internal/controller/runnergroup_controller.go b/internal/controller/runnergroup_controller.go index ba1030b..c37747f 100644 --- a/internal/controller/runnergroup_controller.go +++ b/internal/controller/runnergroup_controller.go @@ -129,6 +129,7 @@ func (r *RunnerGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request) authToken, runnerGroup.Spec.Scope, runnerGroup.Spec.Org, + runnerGroup.Spec.User, runnerGroup.Spec.Repo, effectiveLabels, ) diff --git a/internal/gitea/client.go b/internal/gitea/client.go index 70764bf..779f06f 100644 --- a/internal/gitea/client.go +++ b/internal/gitea/client.go @@ -38,6 +38,7 @@ type Client interface { authToken string, scope v1alpha1.RunnerGroupScope, org string, + user string, repo string, labels []string, ) (*RunnerStats, error) @@ -118,6 +119,7 @@ func (c *HTTPClient) GetRunnerStats( authToken string, scope v1alpha1.RunnerGroupScope, org string, + user string, repo string, labels []string, ) (*RunnerStats, error) { @@ -126,6 +128,8 @@ func (c *HTTPClient) GetRunnerStats( return c.getRunnerStatsForRepo(ctx, giteaURL, authToken, org, repo, labels) case v1alpha1.RunnerGroupScopeOrg: return c.getRunnerStatsForOrg(ctx, giteaURL, authToken, org, labels) + case v1alpha1.RunnerGroupScopeUser: + return c.getRunnerStatsForUser(ctx, giteaURL, authToken, user, labels) case 
v1alpha1.RunnerGroupScopeGlobal:
 		return c.getRunnerStatsGlobal(ctx, giteaURL, authToken, labels)
 	default:
@@ -145,6 +149,38 @@ func (c *HTTPClient) getRunnerStatsForOrg(ctx context.Context, giteaURL, authTok
 	return c.fetchRunnerStats(ctx, endpoint, authToken, labels)
 }
 
+// getRunnerStatsForUser collects the queued jobs that fetchRunnerStats reports
+// for every repository listed for the given user. It lists the user's
+// repositories first and then queries each repository's actions/jobs endpoint.
+func (c *HTTPClient) getRunnerStatsForUser(ctx context.Context, giteaURL, authToken, user string, labels []string) (*RunnerStats, error) {
+	if user == "" {
+		return nil, fmt.Errorf("user is required when scope is %q", v1alpha1.RunnerGroupScopeUser)
+	}
+
+	repos, err := c.fetchReposForUser(ctx, giteaURL, authToken, user)
+	if err != nil {
+		return nil, err
+	}
+
+	baseURL := strings.TrimSuffix(giteaURL, "/")
+
+	var allQueuedJobs []ActionWorkflowJob
+	for _, repo := range repos {
+		// Escape both path segments so unusual owner/repo names cannot alter the request path.
+		endpoint := fmt.Sprintf("%s/api/v1/repos/%s/%s/actions/jobs", baseURL, url.PathEscape(repo.Owner.Login), url.PathEscape(repo.Name))
+		stats, err := c.fetchRunnerStats(ctx, endpoint, authToken, labels)
+		if err != nil {
+			return nil, fmt.Errorf("fetching jobs for repo %s/%s: %w", repo.Owner.Login, repo.Name, err)
+		}
+		allQueuedJobs = append(allQueuedJobs, stats.QueuedJobs...)
+	}
+
+	// Only QueuedJobs is aggregated; any other RunnerStats fields keep their zero value.
+	return &RunnerStats{
+		QueuedJobs: allQueuedJobs,
+	}, nil
+}
+
 // getRunnerStatsGlobal fetches queued runs using admin-level API for global scope
 func (c *HTTPClient) getRunnerStatsGlobal(ctx context.Context, giteaURL, authToken string, labels []string) (*RunnerStats, error) {
 	endpoint := fmt.Sprintf("%s/api/v1/admin/actions/jobs", strings.TrimSuffix(giteaURL, "/"))
@@ -475,6 +511,68 @@ func (c *HTTPClient) fetchUserRepos(ctx context.Context, giteaURL, authToken str
 	return allRepos, nil
 }
 
+// fetchReposForUser lists the repositories Gitea reports for username via
+// /api/v1/users/{username}/repos, requesting pages of `limit` entries until a
+// short page is returned.
+//
+// NOTE(review): if the server caps page size below `limit`, a full page looks
+// short and pagination stops early — confirm against the instance's API settings.
+func (c *HTTPClient) fetchReposForUser(ctx context.Context, giteaURL, authToken, username string) ([]Repository, error) {
+	const limit = 50
+
+	// The path is the same for every page; only the query string changes.
+	// PathEscape keeps an unusual username from altering the request path.
+	endpoint := fmt.Sprintf("%s/api/v1/users/%s/repos", strings.TrimSuffix(giteaURL, "/"), url.PathEscape(username))
+	u, err := url.Parse(endpoint)
+	if err != nil {
+		return nil, err
+	}
+
+	var allRepos []Repository
+	for page := 1; ; page++ {
+		q := u.Query()
+		q.Set("page", fmt.Sprintf("%d", page))
+		q.Set("limit", fmt.Sprintf("%d", limit))
+		u.RawQuery = q.Encode()
+
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
+		if err != nil {
+			return nil, err
+		}
+		req.Header.Set("Authorization", "token "+authToken)
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := c.httpClient.Do(req)
+		if err != nil {
+			return nil, err
+		}
+
+		// Read and close here rather than with defer: a deferred Close inside the
+		// loop would keep every page's body open until the function returns.
+		body, readErr := io.ReadAll(resp.Body)
+		_ = resp.Body.Close() // nothing useful to do if Close fails after the read
+
+		if resp.StatusCode != http.StatusOK {
+			return nil, c.handleHTTPError(resp.StatusCode, body, "fetch user repos")
+		}
+		if readErr != nil {
+			return nil, fmt.Errorf("reading user repos response: %w", readErr)
+		}
+
+		var repos []Repository
+		if err := json.Unmarshal(body, &repos); err != nil {
+			return nil, fmt.Errorf("decoding user repos response: %w", err)
+		}
+		allRepos = append(allRepos, repos...)
+
+		if len(repos) < limit {
+			break
+		}
+	}
+
+	return allRepos, nil
+}
+
 // filterQueuedJobs filters workflow jobs by labels
 func (c *HTTPClient) filterQueuedJobs(jobs []ActionWorkflowJob, runnerLabels []string) []ActionWorkflowJob {
 	var matched []ActionWorkflowJob
diff --git a/internal/gitea/client_test.go b/internal/gitea/client_test.go
index 407c43b..f6fa857 100644
--- a/internal/gitea/client_test.go
+++ b/internal/gitea/client_test.go
@@ -32,6 +32,7 @@ func TestHTTPClient_GetRunnerStats(t *testing.T) {
 		name           string
 		scope          v1alpha1.RunnerGroupScope
 		org            string
+		user           string
 		repo           string
 		labels         []string
 		mockResponse   ActionWorkflowJobsResponse
@@ -87,12 +88,43 @@ func TestHTTPClient_GetRunnerStats(t *testing.T) {
 			expectedQueued: 2,
 			expectedError:  false,
 		},
+		{
+			name:   "user scope",
+			scope:  v1alpha1.RunnerGroupScopeUser,
+			user:   "testuser",
+			labels: []string{"linux"},
+			mockResponse: ActionWorkflowJobsResponse{
+				TotalCount: 1,
+				Jobs: []ActionWorkflowJob{
+					{ID: 1, Status: "queued", Labels: []string{"linux"}},
+				},
+			},
+			expectedQueued: 1,
+			expectedError:  false,
+		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			// Create mock server
 			server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				w.Header().Set("Content-Type", "application/json")
+
+				// Handle User Repos call for User Scope
+				if tt.scope == v1alpha1.RunnerGroupScopeUser && strings.Contains(r.URL.Path, "/repos") && !strings.Contains(r.URL.Path, "/actions/jobs") {
+					repos := []Repository{
+						{
+							Name: "testrepo",
+							Owner: struct {
+								Login string `json:"login"`
+							}{Login: tt.user},
+							FullName: tt.user + "/testrepo",
+						},
+					}
+					json.NewEncoder(w).Encode(repos)
+					return
+				}
+
 				// Verify correct endpoint is called
 				expectedPath := ""
 				switch tt.scope {
@@ -102,6 +134,8 @@ func TestHTTPClient_GetRunnerStats(t *testing.T) {
 					expectedPath = "/api/v1/orgs/testorg/actions/jobs"
 				case v1alpha1.RunnerGroupScopeGlobal:
 					expectedPath =
"/api/v1/admin/actions/jobs" + case v1alpha1.RunnerGroupScopeUser: + expectedPath = "/api/v1/repos/" + tt.user + "/testrepo/actions/jobs" } if !strings.HasPrefix(r.URL.Path, expectedPath) { @@ -114,8 +148,6 @@ func TestHTTPClient_GetRunnerStats(t *testing.T) { t.Errorf("Expected Authorization header to start with 'token ', got %s", authHeader) } - w.Header().Set("Content-Type", "application/json") - // Only return jobs for 'queued' status to simplify counting if r.URL.Query().Get("status") == "queued" { json.NewEncoder(w).Encode(tt.mockResponse) @@ -132,6 +164,7 @@ func TestHTTPClient_GetRunnerStats(t *testing.T) { "test-token", tt.scope, tt.org, + tt.user, tt.repo, tt.labels, ) diff --git a/specification.md b/specification.md index dd42805..668f2b7 100644 --- a/specification.md +++ b/specification.md @@ -10,6 +10,8 @@ The Gitea Runner Operator is a Kubernetes controller designed to manage ephemera - **RunnerGroup CR**: The custom resource instance defining a runner pool. - **Ephemeral Runner**: A runner that executes exactly one job and then terminates. - **Gitea Instance**: The target Gitea server where CI/CD workflows are triggered. +- **Runner Capabilities**: The set of labels a runner provides (e.g., `ubuntu-latest`). +- **Job Requirements**: The set of labels a job requests (e.g., `ubuntu-latest`). ## 3. Custom Resource Definition (CRD) @@ -24,16 +26,17 @@ The Gitea Runner Operator is a Kubernetes controller designed to manage ephemera The `spec` defines the configuration for the runner pool. -| Field | Type | Required | Description | -| :------------------ | :----------------------------- | :---------- | :---------------------------------------------------------------------------------------------------------- | -| `scope` | Enum (`global`, `org`, `repo`) | Yes | The scope of the runner. | -| `org` | String | Conditional | The organization name. Required if `scope` is `org`. | -| `repo` | String | Conditional | The repository name. 
Required if `scope` is `repo`. | -| `gitea.url` | String | Yes | The base URL of the Gitea instance (e.g., `https://gitea.example.com`). | -| `labels` | []String | No | List of labels for the runner (e.g., `ubuntu-latest`, `app:infra`). Used by Gitea to match jobs to runners. | -| `maxActiveRunners` | Integer | Yes | The maximum number of concurrent runner Jobs allowed for this specific RunnerGroup CR. | -| `registrationToken` | SecretKeySelector | Yes | Reference to a Secret containing the runner registration token. | -| `authToken` | SecretKeySelector | Yes | Reference to a Secret containing an API token to query Gitea for job statuses. | +| Field | Type | Required | Description | +| :------------------ | :------------------------------------- | :---------- | :---------------------------------------------------------------------------------------------------------- | +| `scope` | Enum (`global`, `org`, `user`, `repo`) | Yes | The scope of the runner. | +| `org` | String | Conditional | The organization name. Required if `scope` is `org`. | +| `user` | String | Conditional | The username. Required if `scope` is `user`. | +| `repo` | String | Conditional | The repository name. Required if `scope` is `repo`. | +| `gitea.url` | String | Yes | The base URL of the Gitea instance (e.g., `https://gitea.example.com`). | +| `labels` | []String | No | List of labels for the runner (e.g., `app:infra`). Defaults (e.g. `ubuntu-latest`) are added automatically. | +| `maxActiveRunners` | Integer | Yes | The maximum number of concurrent runner Jobs allowed for this specific RunnerGroup CR. | +| `registrationToken` | SecretKeySelector | Yes | Reference to a Secret containing the runner registration token. | +| `authToken` | SecretKeySelector | Yes | Reference to a Secret containing an API token to query Gitea for job statuses. | #### 3.2.1 SecretKeySelector @@ -42,7 +45,7 @@ Standard Kubernetes Secret reference: - `secretRef.name`: Name of the secret. 
- `secretRef.key`: Key within the secret containing the value. -### 3.3 Status Schema (Optional but Recommended) +### 3.3 Status Schema - `activeRunners`: Integer. Current count of running Jobs managed by this CR. - `lastCheckTime`: Timestamp. Last time the controller polled Gitea. @@ -54,37 +57,44 @@ Standard Kubernetes Secret reference: The controller watches for changes to `RunnerGroup` resources. 1. **Validation**: Ensure `org` or `repo` are present based on `scope`. -2. **Job Cleanup**: (Optional) Check for and remove "stuck" jobs if TTL doesn't cover edge cases, though `ttlSecondsAfterFinished` is primary. -3. **Metric Collection**: Update status with current running job count. -4. **Polling**: The controller must implement a polling mechanism (loop) independent of the standard Reconcile trigger, or requeue the Reconcile event periodically (e.g., every 10-30 seconds). +2. **Job List**: List child Jobs to determine `activeRunners` count. +3. **Status Update**: Update CR status with current metrics. +4. **Capacity Check**: If `activeRunners >= maxActiveRunners`, stop scaling up. +5. **Polling**: Fetch job statistics from Gitea. -### 4.2 Polling & Scaling Logic +### 4.2 Polling & Scaling Strategy -On every poll interval for a specific `RunnerGroup` CR: +The operator uses a robust polling strategy to handle the disconnect between Kubernetes Pod startup time and Gitea's job queue state. -1. **Check Capacity**: - - Query Kubernetes for active `Jobs` owned by this `RunnerGroup` CR. - - If `count(active_jobs) >= maxActiveRunners`, stop. Do not spawn new runners. +#### 4.2.1 Fetching Stats (`GetRunnerStats`) -2. **Fetch Queued Jobs**: - - Call Gitea API using `authToken`. - - Endpoint depends on scope: - - **Global**: Recursively fetch all workflow runs: - 1. Fetch all organizations in the Gitea instance - 2. For each organization, fetch all repositories under that org - 3. For each repository, query `/repos/{owner}/{repo}/actions/runs?status=queued` - 4. 
Additionally, fetch all user-owned repositories and query their workflow runs - - **Org**: Fetch all workflow runs in repos under the organization: - 1. Fetch all repositories under the specified organization - 2. For each repository, query `/repos/{owner}/{repo}/actions/runs?status=queued` - - **Repo**: Directly query `/repos/{owner}/{repo}/actions/runs?status=queued` - - Filter the returned runs: - - Must match the `labels` defined in the `RunnerGroup` CR. +The controller queries Gitea for: -3. **Spawn Runner**: - - If a queued job is found and capacity allows, create a Kubernetes `Job`. - - **One Job per Queued Workflow**: Ideally, the logic should map 1 queued run -> 1 Runner Job. - - **Concurrency Control**: Ensure we don't spawn more jobs than `maxActiveRunners - currentActiveRunners`. +1. **Queued Jobs**: Jobs with status `queued`, `waiting`, or `pending`. + - **Label Filtering**: Jobs are filtered client-side. A job is considered a match if the RunnerGroup's capabilities (Spec labels + Default labels) are a superset of the Job's required labels. +2. **Running Jobs**: Jobs with status `running` that belong to this specific runner group (filtered by runner name prefix). + +#### 4.2.2 Deduplication Cache (`SpawnedJobsCache`) + +To prevent "double scheduling" (where multiple reconciliation loops spawn multiple runners for the same queued job before the first runner can pick it up), the controller maintains an in-memory cache: + +- **Key**: Gitea Job ID. +- **Value**: Timestamp when the runner was spawned. +- **TTL**: 5 minutes. + +#### 4.2.3 Scaling Algorithm + +1. **Identify Candidates**: Iterate through the list of Queued Jobs from Gitea. +2. **Check Cache**: + - If Job ID is in cache and TTL has not expired: **Skip** (Runner already spawned). + - If Job ID is in cache and TTL expired: **Retry** (Runner likely failed to start). + - If Job ID is not in cache: **Candidate for spawning**. +3. 
**Calculate Slots**: `availableSlots = maxActiveRunners - activeRunners`. +4. **Spawn**: For each candidate, if `availableSlots > 0`: + - Create Kubernetes Job. + - Add Job ID to `SpawnedJobsCache`. + - Decrement `availableSlots`. +5. **Cleanup**: Remove Job IDs from the cache if they are no longer present in the Queued Jobs list returned by Gitea (implies they are now Running, Completed, or Cancelled). ## 5. Kubernetes Resource Generation @@ -94,40 +104,44 @@ The controller creates a `batch/v1 Job`. **Metadata:** -- `name`: `{runnergroup-cr-name}-{random-suffix}` +- `name`: `{runnergroup-name}-{random-suffix}` - `namespace`: Same as `RunnerGroup` CR. - `labels`: - - `app`: `{runnergroup-cr-name}` + - `gitea.bpg.pw/runnergroup-name`: `{runnergroup-name}` - `gitea.bpg.pw/managed-by`: `gitea-runner-operator` - - `gitea.bpg.pw/runnergroup-name`: `{runnergroup-cr-name}` - `ownerReferences`: Pointing to the `RunnerGroup` CR. **Spec:** -- `ttlSecondsAfterFinished`: 600 (Clean up finished jobs). +- `ttlSecondsAfterFinished`: 600 (Auto-cleanup). - `template`: - `spec`: - `restartPolicy`: `OnFailure` - `containers`: - **Name**: `runner` - - **Image**: `gitea/act_runner:nightly-dind-rootless` (Default, potentially configurable in CR later). - - **SecurityContext**: `privileged: true` (Required for DIND). + - **Image**: `gitea/act_runner:nightly-dind-rootless` - **Env**: - `GITEA_INSTANCE_URL`: From `spec.gitea.url`. - - `GITEA_RUNNER_REGISTRATION_TOKEN`: From `spec.registrationToken`. + - `GITEA_RUNNER_REGISTRATION_TOKEN`: From Secret. - `GITEA_RUNNER_EPHEMERAL`: `"true"`. - - `GITEA_RUNNER_LABELS`: Comma-separated list from `spec.labels`. - - `DOCKER_HOST`: `tcp://localhost:2376` - - **VolumeMounts**: - - Mount docker socket or storage if necessary. The README example uses a PVC `act-runner-vol` mounted to `/data`. _Note: Using a shared PVC for ephemeral runners might cause race conditions. 
EmptyDir is preferred for truly ephemeral runners unless caching is strictly required and managed._ + - `GITEA_RUNNER_NAME`: `{job-name}` (Matches Pod name for easier debugging). + - `GITEA_RUNNER_LABELS`: Comma-separated list of **Effective Labels**. + - **Effective Labels** = `spec.labels` + Default Gitea Labels (e.g., `ubuntu-latest:docker://node:16-bullseye`, `ubuntu-22.04:...`, etc.) unless explicitly overridden. ## 6. Gitea API Interaction - **Authentication**: Bearer token provided in `authToken`. -- **Client**: HTTP Client with timeout. +- **Endpoints Used**: + - `/api/v1/repos/{owner}/{repo}/actions/jobs` (Repo scope) + - `/api/v1/orgs/{org}/actions/jobs` (Org scope) + - `/api/v1/users/{user}/repos` + `/api/v1/repos/{owner}/{repo}/actions/jobs` (User scope) + - `/api/v1/admin/actions/jobs` (Global scope) +- **Label Matching**: + - The controller implements logic to check: `Job.Labels ⊆ Runner.EffectiveLabels`. + - Supports both exact matches (`linux`) and schema matches (`ubuntu-latest` matches `ubuntu-latest:docker://...`). ## 7. Security Considerations -- **Token Handling**: Registration and Auth tokens are read from Kubernetes Secrets and injected as Environment Variables. They are not stored in plain text in the CR. -- **Privileged Mode**: The default `act_runner` image (dind) requires privileged mode. The Operator creates Jobs with this permission. -- **Namespace Isolation**: The Operator should respect RBAC and only operate within allowed namespaces. +- **Token Handling**: Tokens are injected via `valueFrom: secretKeyRef` env vars. +- **Privileged Mode**: `act_runner` dind mode requires privileged security context. +- **Namespace Isolation**: Controller operates within the namespace of the RunnerGroup.