feat(09-tooling-portable-setup-02): implement file/cron setup — scripts, skills, cron jobs, gateway restart

- write_session_init_script: mount verification via shell_init_files (D-10) - write_archive_script: DRY_RUN=true archive script for stale sessions (D-10) - write_jira_skill, write_aws_skill, write_confluence_skill, write_bitbucket_skill, write_session_skill: all 5 skills with 2 reference files embedded as heredocs - register_cron_jobs: 3 cron jobs via hermes cron create (ngn-daily-report, ngn-weekly-stale-summary, ngn-weekly-archive) - offer_gateway_restart: prompt to restart Hermes gateway at end - Main execution block [1/14] through [14/14] with progress indicators - Best-effort error handling for non-critical steps - D-10 referenced throughout for traceability
2026-06-15 23:30:27 +08:00
parent 9da972842d
commit 5a8c18380e
1 changed files with 944 additions and 0 deletions
--- a/ngn-agent/setup-ngn-agent.sh
+++ b/ngn-agent/setup-ngn-agent.sh
@@ -394,3 +394,947 @@ generate_cron_env_config() {
    echo "  ✓ Cron env vars configured"
 }
 # =============================================================================
 # Task 3: File/Cron Setup (D-10)
 # =============================================================================
 # ---- Write session-init.sh (D-10) ----
 # Install ~/.hermes/scripts/session-init.sh and mark it executable.
 # The generated script inspects the comma-separated DEFAULT_REPOS variable,
 # reports whether /workspace/<repo>/.git exists for each entry, and always
 # exits 0 so that it never blocks a session.
 # Globals:   HOME (read)
 # Outputs:   progress lines on stdout
 write_session_init_script() {
    echo "  → Writing session-init.sh (D-10)..."
    local init_script="$HOME/.hermes/scripts/session-init.sh"
    # Quoted delimiter: the payload is written literally, nothing expands here.
    cat << 'SCRIPT' > "$init_script"
 #!/bin/bash
 # session-init.sh — Verify DEFAULT_REPOS mounts at session start
 # Runs via shell_init_files before agent prompt. Non-blocking.
 # Reads DEFAULT_REPOS from environment (forwarded via docker_forward_env).
 set -uo pipefail
 DEFAULT_REPOS="${DEFAULT_REPOS:-}"
 if [ -z "$DEFAULT_REPOS" ]; then
  echo "[session-init] DEFAULT_REPOS not set — skipping verification"
  exit 0
 fi
 # Split comma-separated list
 IFS=',' read -ra REPOS <<< "$DEFAULT_REPOS"
 ALL_OK=true
 for repo in "${REPOS[@]}"; do
  # Trim whitespace
  repo="${repo#"${repo%%[![:space:]]*}"}"
  repo="${repo%"${repo##*[![:space:]]}"}"
  if [ -d "/workspace/$repo/.git" ]; then
    echo "[session-init] ✓ $repo — mounted at /workspace/$repo"
  else
    echo "[session-init] ⚠ $repo — NOT FOUND at /workspace/$repo"
    ALL_OK=false
  fi
 done
 if [ "$ALL_OK" = true ]; then
  echo "[session-init] All DEFAULT_REPOS verified"
 else
  echo "[session-init] Some repos missing — check docker_volumes in config.yaml"
 fi
 exit 0  # always exit cleanly — non-blocking
 SCRIPT
    chmod +x "$init_script"
    echo "  ✓ session-init.sh written and executable"
 }
 # ---- Write archive-stale-sessions.sh (D-10) ----
 # Install ~/.hermes/scripts/archive-stale-sessions.sh and mark it executable.
 # The generated script exports the session store to a timestamped JSONL file
 # under ~/.hermes/archive/sessions, prunes sessions older than 30 days only
 # when its DRY_RUN variable is false (it ships as true), then prints stats.
 # Globals:   HOME (read)
 # Outputs:   progress lines on stdout
 write_archive_script() {
    echo "  → Writing archive-stale-sessions.sh (D-10)..."
    local archive_script="$HOME/.hermes/scripts/archive-stale-sessions.sh"
    # Quoted delimiter: $HOME, $(date) etc. are expanded when the generated
    # script runs, not while it is being written.
    cat << 'SCRIPT' > "$archive_script"
 #!/bin/bash
 # Archive stale sessions (inactive >30 days) and prune from live DB
 # This script runs via hermes cron with --no-agent
 # Stdout is delivered to Telegram via --deliver telegram
 # Dry-run mode: export only, no prune — safe default for first run
 set -euo pipefail
 DRY_RUN=true
 ARCHIVE_DIR="$HOME/.hermes/archive/sessions"
 mkdir -p "$ARCHIVE_DIR"
 TIMESTAMP=$(date +%Y%m%d_%H%M%S)
 OUTPUT_FILE="$ARCHIVE_DIR/sessions-${TIMESTAMP}.jsonl"
 echo "=== Stale Session Archive ==="
 echo "Started: $(date)"
 echo "Dry run: $DRY_RUN"
 echo ""
 echo "[1/3] Exporting session store..."
 echo "  Output: $OUTPUT_FILE"
 hermes sessions export "$OUTPUT_FILE"
 echo "  -> $(wc -l < "$OUTPUT_FILE") sessions exported"
 echo "  -> Size: $(du -h "$OUTPUT_FILE" | cut -f1)"
 echo ""
 if [ "$DRY_RUN" = false ]; then
  echo "[2/3] Pruning sessions older than 30 days..."
  hermes sessions prune --older-than 30 --yes
  echo "  Done."
 else
  echo "[2/3] SKIPPED (dry run) — set DRY_RUN=false to enable prune"
  echo "  Review $OUTPUT_FILE before enabling."
 fi
 echo ""
 echo "[3/3] Post-archive stats:"
 hermes sessions stats
 echo ""
 echo "✓ Archive complete."
 SCRIPT
    chmod +x "$archive_script"
    echo "  ✓ archive-stale-sessions.sh written and executable"
 }
 # ---- Write skill files (D-10) ----
 # Install the jira-query skill definition at
 # ~/.hermes/skills/ngn-agent/jira/SKILL.md, creating the directory if needed.
 # Globals:   HOME (read)
 # Outputs:   one confirmation line on stdout
 write_jira_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/jira"
    mkdir -p "$skill_dir"
    # Quoted delimiter keeps the markdown (backticks, braces) literal.
    cat << 'SKILL' > "$skill_dir/SKILL.md"
 ---
 name: jira-query
 description: Query Jira Cloud issues, search, and manage tickets
 metadata:
  hermes:
    tags: [jira, project-management]
    category: devops
    requires_toolsets: [terminal]
 version: 1.0.0
 ---
 # Jira Cloud Query
 ## When to Use
 When the user asks to search Jira issues, check ticket status, or list project work.
 ## Procedure
 ### 1. Search issues by JQL
 ```bash
 ngn-jira GET '/rest/api/3/search?jql=ORDER BY created DESC&maxResults=10'
 ```
 For specific project:
 ```bash
 ngn-jira GET '/rest/api/3/search?jql=project=PROJ ORDER BY created DESC&maxResults=10'
 ```
 ### 2. Get issue details
 ```bash
 ngn-jira GET '/rest/api/3/issue/PROJ-123'
 ```
 ### 3. List sprints (if Jira Software)
 ```bash
 ngn-jira GET '/rest/agile/1.0/board'
 ngn-jira GET '/rest/agile/1.0/board/{boardId}/sprint?state=active'
 ```
 ### 4. Get issue comments
 ```bash
 ngn-jira GET '/rest/api/3/issue/PROJ-123/comment'
 ```
 ## Pitfalls
 - JQL is case-sensitive for field names
 - maxResults defaults to 50; set explicitly for large queries
 - Agile REST API may not be available on all plans
 ## Required Environment
 - `JIRA_EMAIL` — your Atlassian account email
 - `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
 SKILL
    echo "  ✓ jira/SKILL.md written"
 }
 # Install the aws-diagnostics skill: SKILL.md plus its reference document
 # references/multiregional-patterns.md under
 # ~/.hermes/skills/ngn-agent/aws-diagnostics.
 # Globals:   HOME (read)
 # Outputs:   one confirmation line on stdout
 write_aws_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/aws-diagnostics"
    local refs_dir="$skill_dir/references"
    # Creating the nested references directory also creates the skill directory.
    mkdir -p "$refs_dir"
    cat << 'SKILL' > "$skill_dir/SKILL.md"
 ---
 name: aws-diagnostics
 description: Read-only AWS diagnostics for platform engineering
 metadata:
  hermes:
    tags: [aws, diagnostics, platform-engineering]
    category: devops
    requires_toolsets: [terminal]
 version: 1.0.0
 ---
 # AWS Diagnostics
 ## When to Use
 When the user asks to check AWS resources, investigate issues, or audit infrastructure in any account.
 ## Important
 - ALWAYS determine the correct AWS_PROFILE before running commands
 - NEVER run mutating AWS commands (delete, terminate, stop, modify)
 - Prefer read-only AWS CLI commands (describe, list, get)
 ## Procedure
 ### 1. Identify the target account
 Ask the user which account/environment they want to target. Available profiles in `/.aws/config`:
 - `rzaws-sw-rai-ava-dev/prod/rc` — AVA service
 - `rzaws-sw-rai-cs-dev/prod/rc` — CS service
 - `rzaws-sw-rai-qac-dev/prod` — QAC
 - `rzaws-sw-rai-ops` — Ops account
 - `rzaws-sw-rai-voicekit-dev/prod/rc` — VoiceKit
 - `rzaws-sw-rai-preprod` — Pre-production
 - `rzaws-sw-rai-nonprod` — Non-production
 ### 2. Set the profile
 ```bash
 export AWS_PROFILE=rzaws-sw-rai-<service>-<env>
 ```
 ### 3. Diagnostic commands
 **EC2 instances:**
 ```bash
 aws ec2 describe-instances --query 'Reservations[*].Instances[*].[InstanceId,State.Name,InstanceType,Tags[?Key==`Name`].Value|[0]]' --output table
 ```
 **ECS services:**
 ```bash
 aws ecs list-clusters && aws ecs list-services --cluster <name>
 ```
 **S3 buckets:**
 ```bash
 aws s3 ls
 ```
 **CloudWatch alarms:**
 ```bash
 aws cloudwatch describe-alarms --state-value ALARM --output table
 ```
 **ECS task health:**
 ```bash
 aws ecs describe-tasks --cluster <name> --tasks <task-ids>
 ```
 **RDS instances:**
 ```bash
 aws rds describe-db-instances --query 'DBInstances[*].[DBInstanceIdentifier,DBInstanceStatus,Engine,DBInstanceClass]' --output table
 ```
 **Lambda functions:**
 ```bash
 aws lambda list-functions --query 'Functions[*].[FunctionName,Runtime,LastModified]' --output table
 ```
 **ELB target group health:**
 ```bash
 aws elbv2 describe-target-groups --query 'TargetGroups[*].[TargetGroupName,TargetType]' --output table
 ```
 ### 4. Report findings
 Format as a concise table. Include account ID and profile used.
 ## Alternative: Infrastructure Code Analysis
 When AWS CLI access is unavailable (Docker containers, credential issues), examine existing infrastructure code instead:
 ```bash
 # Search for region patterns
 search_files --pattern="us-west-2" --path="/workspace"
 # Check terraform configurations
 read_file /workspace/rai-ops/aws/<account>/us-east-1/app/main.tf
 read_file /workspace/rai-ops/aws/<account>/us-east-1/app/<app>.tfvars
 # Look for provider configurations
 search_files --pattern="provider.*replica" --path="/workspace/rai-ops"
 # Check S3 migration data
 search_files --pattern="s3-mapping" --target="files"
 ```
 **When to use code analysis:**
 - Docker container with read-only filesystem
 - Missing AWS CLI or credentials
 - Need to understand intended architecture vs live state
 - Investigating multiregional setup patterns
 ## Pitfalls
 - SSO tokens expire (~6-8h). If you get auth errors, ask the user to run `aws sso login`
 - Some accounts may not have all services — check `aws sts get-caller-identity` first
 - Don't pipe large results directly — use `--query` and `--output table` for readability
 - **Don't persist with CLI installation in constrained environments** — switch to code analysis quickly when installation fails
 ## Verification
 Run `aws sts get-caller-identity` to confirm the correct profile is active before running diagnostics.
 ## References
 - `references/multiregional-patterns.md` - Terraform patterns for cross-region infrastructure setup
 SKILL
    # Companion reference document cited at the end of SKILL.md
    cat << 'REF' > "$refs_dir/multiregional-patterns.md"
 # Multiregional Infrastructure Patterns
 ## AVA Multiregional Setup
 ### Provider Configuration Pattern
 ```hcl
 # Primary provider (us-east-1)
 provider "aws" {
  region = var.region
  # ... assume_role block
 }
 # Replica provider (us-west-2)
 provider "aws" {
  alias  = "replica"
  region = "us-west-2"
  # ... same assume_role block
 }
 ```
 ### Module Consumption
 ```hcl
 module "app" {
  providers = {
    aws         = aws
    aws.replica = aws.replica  # Required by tf-modules/app/versions.tf
  }
  # ... other config
 }
 ```
 ### Database Replication Options
 **RDS Aurora PostgreSQL (Current Pattern)**
 - Engine: `aurora-postgresql`
 - Version: `16.11`
 - Cross-region read replicas supported
 - Can promote replica for DR scenarios
 **DynamoDB Global Tables (Available)**
 - Global Table v2 with us-east-1 + us-west-2 replicas
 - Per-region CMKs for encryption
 - Feature-flagged via `var.tenant_registry`
 - Documented in RAID-352
 ### S3 Cross-Region Replication
 Extensive existing pattern from migration data:
 - `ava-{env}-west-*` buckets in us-west-2
 - Matching `rai-s3-usw2-*` naming convention
 - Covers: bug reports, screenshots, game logs, shiny moments
 ### Key Files for Multiregional Analysis
 - `aws/<account>/us-east-1/app/provider.tf` - Replica provider config
 - `aws/<account>/us-east-1/app/<app>.tfvars` - App-specific resources
 - `raid-migration/raid-s3-migration/s3-mapping.csv` - Cross-region S3 inventory
 - `RAID-352-PR-DESCRIPTION.md` - DynamoDB Global Tables documentation
 REF
    echo "  ✓ aws-diagnostics/SKILL.md + references written"
 }
 # Install the confluence-search skill definition at
 # ~/.hermes/skills/ngn-agent/confluence/SKILL.md, creating the directory if
 # needed.
 # Globals:   HOME (read)
 # Outputs:   one confirmation line on stdout
 write_confluence_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/confluence"
    mkdir -p "$skill_dir"
    # Quoted delimiter keeps the markdown (backticks, braces, quotes) literal.
    cat << 'SKILL' > "$skill_dir/SKILL.md"
 ---
 name: confluence-search
 description: Search and retrieve Confluence pages
 metadata:
  hermes:
    tags: [confluence, documentation]
    category: devops
    requires_toolsets: [terminal]
 version: 1.0.0
 ---
 # Confluence Search
 ## When to Use
 When the user asks to find documentation, search Confluence pages, or retrieve page content.
 ## Procedure
 ### 1. Search pages by text
 ```bash
 ngn-confluence GET '/rest/api/search?cql=text~"search terms"&limit=10'
 ```
 ### 2. Search by space
 ```bash
 ngn-confluence GET '/rest/api/search?cql=space=ADM&limit=10'
 ```
 ### 3. Get page content
 ```bash
 ngn-confluence GET '/rest/api/content/{pageId}?expand=body.storage'
 ```
 ### 4. List pages in space
 ```bash
 ngn-confluence GET '/rest/api/content?spaceKey=ADM&limit=50'
 ```
 ### 5. Get page children
 ```bash
 ngn-confluence GET '/rest/api/content/{pageId}/child/page?limit=50'
 ```
 ## Pitfalls
 - CQL is different from JQL — `text~"query"` for full-text search
 - Page body needs `expand=body.storage` to retrieve content
 - Use `limit` parameter — defaults to 25
 ## Required Environment
 - `JIRA_EMAIL` — your Atlassian account email
 - `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
 SKILL
    echo "  ✓ confluence/SKILL.md written"
 }
 # Install the bitbucket-pr skill definition at
 # ~/.hermes/skills/ngn-agent/bitbucket/SKILL.md, creating the directory if
 # needed.
 # Globals:   HOME (read)
 # Outputs:   one confirmation line on stdout
 write_bitbucket_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/bitbucket"
    mkdir -p "$skill_dir"
    # Quoted delimiter keeps the markdown (backticks, braces) literal.
    cat << 'SKILL' > "$skill_dir/SKILL.md"
 ---
 name: bitbucket-pr
 description: Review Bitbucket pull requests and repositories
 metadata:
  hermes:
    tags: [bitbucket, git, code-review]
    category: devops
    requires_toolsets: [terminal]
 version: 1.0.0
 ---
 # Bitbucket Pull Requests
 ## When to Use
 When the user asks to check PRs, review code, or list repositories.
 ## Procedure
 ### 1. List repositories
 ```bash
 ngn-bitbucket GET '/repositories/razersw?pagelen=20'
 ```
 ### 2. List open PRs for a repo
 ```bash
 ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests?state=OPEN&pagelen=20'
 ```
 ### 3. Get PR details
 ```bash
 ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}'
 ```
 ### 4. Get PR diff
 ```bash
 ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}/diff'
 ```
 ### 5. Get PR comments
 ```bash
 ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}/comments'
 ```
 ### 6. List branch list
 ```bash
 ngn-bitbucket GET '/repositories/razersw/{repo}/refs/branches?pagelen=20'
 ```
 ## Pitfalls
 - Bitbucket pagination uses `pagelen` and `page` params (not `maxResults`)
 - Diff endpoint returns raw diff text — may be large
 - PR comments include inline code comments, not just summary
 ## Required Environment
 - `JIRA_EMAIL` — your Atlassian account email
 - `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
 SKILL
    echo "  ✓ bitbucket/SKILL.md written"
 }
 # Install the session lifecycle skill: SKILL.md plus its reference document
 # references/operational-monitoring.md under
 # ~/.hermes/skills/ngn-agent/session.
 # Globals:   HOME (read)
 # Outputs:   one confirmation line on stdout
 write_session_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/session"
    local refs_dir="$skill_dir/references"
    # Creating the nested references directory also creates the skill directory.
    mkdir -p "$refs_dir"
    cat << 'SKILL' > "$skill_dir/SKILL.md"
 ---
 name: session
 description: Main ngn-agent session lifecycle — init, work, close
 metadata:
  hermes:
    tags: [ngn-agent, platform-engineering, session]
    category: devops
    requires_toolsets: [terminal]
 version: 1.0.0
 ---
 # ngn-agent Session Lifecycle
 ## When to Use
 Load this skill at the START of EVERY platform engineering session, before any other work. This skill defines the standard session workflow.
 Specific triggers:
 - When the user starts any infrastructure or platform engineering task
 - When the user asks to create a Jira ticket or find a ticket
 - When the user wants to search or load Confluence documentation
 - When a session is ending and you need to document progress
 - When you need to save context for future sessions
 ## Important
 - **Keep this skill loaded for the entire session** — if context grows large, reload via `skill_view("session")` before the session-end steps (Steps 5–7)
 - **Never create Jira tickets without asking the user first** (D-02)
 - **Never update Confluence without asking the user first** (D-11)
 - **Always save session summary to hindsight at end** — this step has no user prompt, it is automatic (D-12)
 - User must confirm before any Jira mutation (create, comment, transition) — D-08
 - Repos are already mounted at `/workspace/` from Phase 6 (rai-ops, rai-deployment, rai-devtools)
 - This skill replaces the ad-hoc session workflow with a repeatable init→work→close pattern
 ## Procedure
 ### 1. Check for Similar Previous Sessions
 At the very start of a session, use `hindsight_recall` with a query describing the user's current task to find similar sessions from the last 2 weeks.
 Call `hindsight_recall` with a budget of low:
 ```
 Tool: hindsight_recall
 Query: "<user's task description>"
 Budget: low
 ```
 Present any matches to the user in this format:
 ```
 Found [N] similar sessions from the last 2 weeks:
 1. [Session Title] — [Date] — [one-line summary]
 2. [Session Title] — [Date] — [one-line summary]
 ```
 Ask the user: "Would you like to resume any of these sessions, or start fresh?"
 - If they choose to resume: load that session's context and continue
 - If they choose fresh: proceed to step 2
 If no similar sessions are found (normal for first sessions), proceed to step 2.
 ### 2. Prompt: Create Jira Ticket
 Ask the user: "Would you like to create a Jira Task ticket for this session?"
 If YES:
 1. Ask which Jira project to use (e.g., "PLATFORM", "DEVOPS") — do not hardcode (D-06)
 2. Check hindsight for cached epics:
   ```
   Tool: hindsight_recall
   Query: "jira epics cached"
   Budget: low
   ```
 3. If epics are cached, check the cache timestamp:
   - If the cache is more than 24 hours old OR the user says the list looks wrong, refresh from Jira:
     ```bash
     ngn-jira GET '/rest/api/3/search?jql=issuetype=Epic AND project=<PROJECT>&fields=summary,id&maxResults=50'
     ```
     Save fresh epics to hindsight:
     ```
     Tool: hindsight_retain
     tier: "epic-cache"
     content: "Epic Cache [<date>]: PROJECT=<PROJECT>: [EPIC-KEY-1: Summary, EPIC-KEY-2: Summary, ...]"
     ```
   - If the cache is fresh (less than 24 hours old), use the cached list
 4. If no cached epics found, query Jira for current epics:
   ```bash
   ngn-jira GET '/rest/api/3/search?jql=issuetype=Epic AND project=<PROJECT>&fields=summary,id&maxResults=50'
   ```
   Save to hindsight for future sessions:
   ```
   Tool: hindsight_retain
   tier: "epic-cache"
   content: "Epic Cache [<date>]: PROJECT=<PROJECT>: [EPIC-KEY-1: Summary, ...]"
   ```
 5. Present cached/refreshed epics to the user: "Available epics: [list]. Would you like to set a parent epic?"
 6. If user selects an epic, include it as parent when creating the ticket
 7. Create the Task via Jira REST API:
   ```bash
   ngn-jira POST '/rest/api/3/issue' --body '{
     "fields": {
       "project": {"key": "<PROJECT>"},
       "summary": "<session task description>",
       "issuetype": {"name": "Task"},
       "parent": {"key": "<EPIC_KEY>"}
     }
   }'
   ```
 8. Note the ticket key (e.g., `PLATFORM-123`) — save it for session-end steps (Step 5)
 If NO: proceed to step 3 (no Jira ticket this session)
 ### 3. Prompt: Load Confluence Documentation
 Ask the user: "Would you like to load relevant Confluence documentation?"
 If YES:
 1. Search by the `ngn-agent` tag:
   ```bash
   ngn-confluence GET '/rest/api/search?cql=tag="ngn-agent"&limit=20'
   ```
 2. Present matching pages to the user:
   ```
   Found [N] pages tagged 'ngn-agent':
   - [Title] — [Space] — [Last Modified]
   ```
 3. Ask: "Which pages would you like me to load?"
 4. For each selected page, load its full content:
   ```bash
   ngn-confluence GET '/rest/api/content/{pageId}?expand=body.storage'
   ```
 5. Review the loaded content with the user
 If NO: proceed to step 4
 ### 4. Work Phase
 Repos are already mounted at `/workspace/` (rai-ops, rai-deployment, rai-devtools). Proceed with the task using standard Hermes tools.
 If you need to clone additional repos:
 ```bash
 git clone git@bitbucket.org:razersw/<repo>.git /workspace/<repo>
 ```
 The session skill remains loaded for the session-end steps below. If the skill is evicted from context during a long session, reload it with `skill_view("session")` before proceeding to Steps 5–7.
 ### 5. Session-End: Update Jira
 When the user indicates work is complete or the session wraps up:
 Ask the user: "Would you like me to update the Jira ticket with a summary comment?"
 If YES (and a ticket was created in Step 2):
 ```bash
 ngn-jira POST '/rest/api/3/issue/<TICKET-KEY>/comment' --body '{
  "body": "<summary of work done, key decisions, next steps>"
 }'
 ```
 If NO: proceed without updating Jira.
 **Important (D-08):** Do NOT transition tickets (e.g., close, resolve, move to Done) without explicit user confirmation. Only add comments unless the user specifically asks for a status change.
 ### 6. Session-End: Update Confluence
 Ask the user: "Would you like me to create or update a Confluence page documenting this session?"
 If YES:
 - For a new page:
  ```bash
  ngn-confluence POST '/rest/api/content' --body '{
    "type": "page",
    "title": "<Session Date>: <Task Description>",
    "space": {"key": "<SPACE_KEY>"},
    "body": {
      "storage": {
        "value": "<h1>Session Summary</h1><p><task summary, key decisions, outcomes></p>",
        "representation": "storage"
      }
    },
    "metadata": {
      "properties": {
        "content-appearance": {"value": "page"}
      }
    },
    "labels": [{"name": "ngn-agent"}]
  }'
  ```
 - For updating an existing page: ask the user which page to update, then PUT to update its content
 - **Important (D-11):** Do NOT create or update any Confluence page without the user confirming first
 If NO: proceed without updating Confluence.
 ### 7. Session-End: Save to Hindsight (Automatic — No Prompt)
 ALWAYS save a session summary to hindsight memory. Do NOT ask the user — this step is automatic and unconditional (D-12).
 ```bash
 Tool: hindsight_retain
 tier: "session-summary"
 content: "
 Session Summary
 ===============
 Date: <today>
 Task: <task description>
 Repos: <repos worked on>
 Jira: <ticket key or \"none\">
 Key Decisions:
 - <decision 1>
 - <decision 2>
 Outcomes:
 - <outcome 1>
 - <outcome 2>
 Next Steps:
 - <next step 1>
 "
 ```
 This summary allows future `hindsight_recall` queries to find this session for similarity matching (D-13). The structured content includes: date, task description, repos worked on, Jira ticket reference (or "none"), key decisions, outcomes, and next steps.
 ## Pitfalls
 - **Skill not loaded at session start:** If you find yourself midway through a session without having run Steps 1–3, you missed the session start workflow. Run Step 1 (hindsight_recall) retroactively and ask the user if they want to create a Jira ticket or load Confluence docs. For future sessions, make sure to load this skill at the very start.
 - **Epic cache too old:** Epics may change between sessions. Check the cache timestamp and refresh if more than 24 hours old. If the user says "that's wrong," always refresh regardless of age.
 - **Confluence tag mismatch:** If the `ngn-agent` tag returns no results, try `platform-engineering` as a fallback, or ask the user what tag they use for session documentation.
 - **Jira project doesn't exist:** If the create ticket call fails with a 404, the project key may be wrong. Ask the user to confirm the correct project key.
 - **Empty hindsight recall (first sessions):** The first few sessions will have no similar sessions to find. That is normal — proceed with a fresh session. Over time, hindsight will accumulate session summaries.
 - **Long sessions may evict this skill:** If the conversation grows long, the session skill content may be evicted from the agent's context. Reload it with `skill_view("session")` before the session-end steps (Steps 5–7) to ensure the Jira/Confluence prompts and hindsight save are not missed.
 - **Missing Jira credentials in cron jobs:** The ngn-jira tool requires both `JIRA_EMAIL` and `JIRA_API_TOKEN` environment variables. If either is missing, Jira operations will fail with "unbound variable" errors. Check environment setup before attempting Jira updates in automated workflows.
 ## Operational Automation
 ### Daily Session Monitoring (Cron Job)
 When running as a scheduled cron job for operational monitoring:
 1. **Discover Active Sessions**:
   ```bash
   hermes sessions export -  # NOT 'hermes sessions list' - no --json flag available
   ```
   Parse JSONL output with Python to find sessions with `last_active` within last 7 days
 2. **Find Associated Jira Tickets**:
   - Use `hindsight_recall` with query 'session summary jira' for each active session
   - Search session messages for Jira patterns: `PLATFORM-\d+`, `AIOPS-\d+`, `RAID-\d+`, etc.
   - Note: One session may have multiple Jira tickets (1-to-many mapping)
 3. **Update Jira with Progress**:
   ```bash
   ngn-jira POST '/rest/api/3/issue/<KEY>/comment' --body '{
     "body": "Session activity update — Date: <today>, Last active: <last_active>. Session: <session_id>. Progress: See session transcript for details."
   }'
   ```
 4. **Generate Telegram Report**:
   - Structure: Active Sessions + Jira Updated + Issues/Summary
   - Keep under 4096 character limit
   - Format with emoji sections for clarity
 **Environment Requirements for Operational Jobs**:
 - `JIRA_EMAIL` — Required for ngn-jira authentication
 - `JIRA_API_TOKEN` — API token from Atlassian account
 - Both must be set or Jira updates will fail
 See `references/operational-monitoring.md` for detailed patterns, templates, and troubleshooting.
 **Important Constraints (Cron Mode)**:
 - DO NOT transition ticket statuses (D-05) - only add comments
 - DO NOT update stale sessions (D-15) - only active within 7 days
 - Use silent mode `[SILENT]` if no active sessions found
 ## Verification
 1. On session start, agent checks for similar sessions via `hindsight_recall` ✓
 2. Jira Task ticket created (or user declined) ✓
 3. Confluence docs loaded by `ngn-agent` tag search (or user declined) ✓
 4. At session end, user prompted for Jira update ✓
 5. At session end, user prompted for Confluence update ✓
 6. Session summary automatically saved to hindsight via `hindsight_retain` (no prompt) ✓
 7. **Operational cron jobs can discover active sessions and update Jira tickets** ✓
 SKILL
    # Companion reference document cited from the Operational Automation section
    cat << 'REF' > "$refs_dir/operational-monitoring.md"
 # Operational Session Monitoring
 ## Jira Ticket Pattern Detection
 When scanning session content for associated Jira tickets, search for these patterns:
 ```python
 jira_patterns = [
    r'(PLATFORM-\d+)',    # Platform engineering tickets
    r'(AIOPS-\d+)',       # AI Operations tickets
    r'(RAID-\d+)',        # RAID project tickets
    r'(DEVOPS-\d+)',      # DevOps tickets
    r'(QAC-\d+)'          # QAC tickets
 ]
 ```
 ## Session Export vs List Commands
 **CORRECT**: `hermes sessions export -`
 - Returns machine-readable JSONL format
 - Each line is a complete session object
 - Includes `last_active` timestamps for filtering
 **INCORRECT**: `hermes sessions list --json`
 - The `--json` flag does not exist (Pitfall from RESEARCH.md)
 - Use export for automation, list for human viewing only
 ## Environment Variable Requirements
 The `ngn-jira` tool wrapper expects:
 - `JIRA_EMAIL` - Atlassian account email
 - `JIRA_API_TOKEN` - From https://id.atlassian.com/manage/api-tokens
 Missing either variable causes: `bash: line 10: JIRA_EMAIL: unbound variable`
 ## Telegram Report Template
 ```
 📋 **ACTIVE SESSIONS** — {date}
 🔹 **{session_id}**
   Title: {title}
   Last Active: {timestamp}
   Jira: {ticket_keys or "None"}
 🔄 **JIRA UPDATED**: {list of updated ticket keys}
 ❌ **ISSUES**: {any operational problems}
 📊 **SUMMARY**: {count} active sessions found, {count} with Jira tickets
 ```
 Character limit: 4096 for Telegram delivery
 REF
    echo "  ✓ session/SKILL.md + references written"
 }
 # ---- Register cron jobs (D-10) ----
 # Create a single Hermes cron job and report the outcome.
 # Arguments: $1   - display name used in the progress messages
 #            $2.. - arguments forwarded verbatim to `hermes cron create`
 # Outputs:   progress lines on stdout (hermes' own stdout passes through);
 #            on failure the stderr captured from hermes is echoed to stderr
 # Returns:   0 if `hermes cron create` succeeded, 1 otherwise
 _register_cron_job() {
    local job_name=$1
    shift
    local create_err=""
    echo "  → Creating ${job_name}..."
    # Capture only stderr: inside the substitution stderr goes to the capture
    # pipe while stdout is routed through fd 3 back to the caller's stdout.
    if { create_err=$(hermes cron create "$@" 2>&1 >&3); } 3>&1; then
        echo "  ✓ ${job_name} registered"
        return 0
    fi
    echo "  ⚠ ${job_name} may already exist"
    # Show the real reason instead of discarding it; "may already exist" is
    # only a guess at why the command failed.
    if [ -n "$create_err" ]; then
        printf '      hermes: %s\n' "$create_err" >&2
    fi
    return 1
 }
 # Register the three ngn-agent cron jobs (D-10). Registration is best-effort:
 # every job is attempted even if an earlier one fails.
 # Outputs:   progress lines on stdout, hermes error text on stderr
 # Returns:   0 if all jobs were registered, 1 if at least one failed, so a
 #            caller's `register_cron_jobs || ...` fallback can actually run
 register_cron_jobs() {
    local failed=0
    echo "  → Registering cron jobs (D-10)..."
    # 1. ngn-daily-report (daily at 09:00 SGT)
    _register_cron_job "ngn-daily-report" \
        --deliver telegram --skill session --skill jira-query \
        '0 9 * * *' \
        'Daily session report. Export sessions, find active ones, check Jira, compose Telegram summary.' \
        || failed=$((failed + 1))
    # 2. ngn-weekly-stale-summary (Sunday 20:00 SGT)
    _register_cron_job "ngn-weekly-stale-summary" \
        --deliver telegram --skill session \
        '0 20 * * 0' \
        'Weekly stale session summary. Review sessions inactive >30 days, compose Telegram summary.' \
        || failed=$((failed + 1))
    # 3. ngn-weekly-archive (Sunday 20:05 SGT — 5 min after summary, per D-10)
    _register_cron_job "ngn-weekly-archive" \
        --no-agent --script archive-stale-sessions.sh \
        '5 20 * * 0' \
        || failed=$((failed + 1))
    if [ "$failed" -gt 0 ]; then
        return 1
    fi
    return 0
 }
 # ---- Offer gateway restart (per CONTEXT.md "Specific Ideas") ----
 # Announce that setup finished and offer to restart the Hermes gateway.
 # An empty answer, "y" or "Y" restarts; any other answer skips.
 # Outputs:   status lines on stdout; a warning on stderr if the restart fails
 # Returns:   0 when the restart succeeded or was skipped, otherwise the exit
 #            status of `hermes gateway restart`
 offer_gateway_restart() {
    local restart=""
    local rc=0
    echo ""
    echo "==> Setup complete!"
    echo ""
    # -r keeps backslashes literal. `|| true` tolerates EOF on stdin (e.g. an
    # unattended run): whatever was read so far, possibly nothing, is used as
    # the answer instead of the failed read aborting a `set -e` script.
    read -r -p "Restart Hermes gateway now? [Y/n]: " restart || true
    if [[ "$restart" =~ ^[Yy]?$ ]]; then
        if hermes gateway restart; then
            echo "  → Gateway restarted."
        else
            rc=$?
            # Do not claim success when the restart command failed.
            echo "  ⚠ Gateway restart failed (exit $rc). Run 'hermes gateway restart' manually." >&2
            return "$rc"
        fi
    else
        echo "  → Skipped. Run 'hermes gateway restart' when ready."
    fi
 }
 # =============================================================================
 # Main Execution Block
 # =============================================================================
 # Run the complete ngn-agent setup in order: prerequisite check, secret
 # collection, directory/config generation, script and skill installation,
 # cron registration, and finally the gateway-restart prompt.
 # Globals:   NONINTERACTIVE (read); JIRA_API_TOKEN, JIRA_EMAIL,
 #            TELEGRAM_BOT_TOKEN, OPENROUTER_API_KEY (written when interactive,
 #            the first three required from the environment otherwise)
 # Outputs:   progress lines "[n/14] ..." on stdout
 main() {
    printf '%s\n' \
        "" \
        "=== ngn-agent Setup Script ===" \
        "Embedded file snapshots frozen at: 2026-06-15" \
        ""
    # Argument parsing has already happened before main is invoked.

    # [1/14] Prerequisites, followed by the path summary
    echo "[1/14] Checking prerequisites..."
    check_prerequisites
    print_summary

    # [2/14] Secrets: prompt interactively, otherwise require them in the env
    echo "[2/14] Collecting secrets..."
    case "$NONINTERACTIVE" in
        false)
            JIRA_API_TOKEN=$(prompt_secret "JIRA_API_TOKEN" "JIRA API Token (https://id.atlassian.com/manage/api-tokens): ")
            JIRA_EMAIL=$(prompt_secret "JIRA_EMAIL" "JIRA Email: ")
            TELEGRAM_BOT_TOKEN=$(prompt_secret "TELEGRAM_BOT_TOKEN" "Telegram Bot Token (from @BotFather): ")
            OPENROUTER_API_KEY=$(prompt_secret "OPENROUTER_API_KEY" "OpenRouter API Key (leave blank to keep existing): " "true")
            ;;
        *)
            echo "  → Non-interactive mode — using environment variables"
            # ${VAR:?msg} aborts with msg when VAR is unset or empty
            : "${JIRA_API_TOKEN:?JIRA_API_TOKEN not set}"
            : "${JIRA_EMAIL:?JIRA_EMAIL not set}"
            : "${TELEGRAM_BOT_TOKEN:?TELEGRAM_BOT_TOKEN not set}"
            ;;
    esac
    echo "  ✓ Secrets collected"

    # [3/14] Directories
    echo "[3/14] Creating directories..."
    create_directories

    # [4/14] Backup of any existing config
    echo "[4/14] Backing up existing config..."
    backup_config

    # [5/14] config.yaml
    echo "[5/14] Generating config.yaml..."
    generate_config_yaml

    # [6/14] .env
    echo "[6/14] Generating .env..."
    generate_env_file

    # [7/14] Hindsight config
    echo "[7/14] Generating hindsight config..."
    generate_hindsight_config

    # [8/14] Cron environment
    echo "[8/14] Configuring cron environment..."
    generate_cron_env_config

    # [9/14] session-init script
    echo "[9/14] Writing session-init script..."
    write_session_init_script

    # [10/14] Archive script
    echo "[10/14] Writing archive script..."
    write_archive_script

    # [11/14] Skill files, written in a fixed order
    echo "[11/14] Writing skill files..."
    local skill_writer
    for skill_writer in \
        write_jira_skill \
        write_aws_skill \
        write_confluence_skill \
        write_bitbucket_skill \
        write_session_skill; do
        "$skill_writer"
    done

    # [12/14] Cron jobs — best-effort, a failure must not abort the setup
    echo "[12/14] Registering cron jobs..."
    if ! register_cron_jobs; then
        echo "  ⚠ Cron registration had issues (may already exist)"
    fi

    # [13/14] and [14/14] Wrap-up and optional gateway restart
    echo "[13/14] Setup complete."
    echo "[14/14] Offering gateway restart..."
    offer_gateway_restart
 }
 # Script entry point: run main, forwarding all command-line arguments.
 main "$@"