ngn-agent/ngn-agent/setup-ngn-agent.sh

#!/bin/bash
# setup-ngn-agent.sh — Portable ngn-agent configuration setup
#
# Phase 9, Plan 2 — recreates all configuration on a fresh macOS machine
# Assumes Hermes v0.16+ is installed (per D-07)
#
# Embedded file snapshots frozen at: 2026-06-15
# Regenerate by re-running this phase.
#
# D-06: Single script recreating all ngn-agent configuration
# D-07: Requires Hermes v0.16+ on PATH
# D-08: Interactive secrets: JIRA_API_TOKEN, JIRA_EMAIL, TELEGRAM_BOT_TOKEN, OPENROUTER_API_KEY
# D-09: Configurable paths via arguments (SSH keys, repos, timezone)
# D-10: Creates/updates: config.yaml, .env, hindsight/config.json, scripts, skills, cron jobs
#
set -euo pipefail

# ---- Usage ----
# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so the text is emitted verbatim
# (no variable or command expansion).
usage() {
    cat <<'HELP_TEXT'
usage: setup-ngn-agent.sh [OPTIONS]

Portable ngn-agent configuration setup for macOS + Hermes v0.16+

Options:
  -s1, --ssh-key-1 PATH      SSH private key path 1  (default: ~/.ssh/id_ed25519razer)
  -s2, --ssh-key-2 PATH      SSH private key path 2  (default: ~/.ssh/id_rsa)
  -sc, --ssh-config PATH     SSH config path          (default: ~/.ssh/config)
  -sh, --ssh-known-hosts PATH SSH known_hosts path     (default: ~/.ssh/known_hosts)
  -r1, --repo-ops PATH       rai-ops repo path         (default: ~/Razer/rai-ops)
  -r2, --repo-deploy PATH    rai-deployment repo path  (default: ~/Razer/rai-deployment)
  -r3, --repo-devtools PATH  rai-devtools repo path    (default: ~/Razer/rai-devtools)
  -t,  --timezone ZONE       Timezone                  (default: Asia/Singapore)
  -d,  --docker-image TAG    Docker image tag          (default: ngn-agent:latest)
  -y,  --yes                 Non-interactive mode (skip prompts, use env vars)
  -h,  --help                Show this help message

Secrets are prompted interactively with masked input unless -y is set,
in which case they are read from environment variables.
HELP_TEXT
}

# ---- Argument defaults ----
# Each setting keeps a value inherited from the environment when it is set
# and non-empty; otherwise it falls back to the default shown here.
: "${SSH_KEY_1:=$HOME/.ssh/id_ed25519razer}"
: "${SSH_KEY_2:=$HOME/.ssh/id_rsa}"
: "${SSH_CONFIG:=$HOME/.ssh/config}"
: "${SSH_KNOWN_HOSTS:=$HOME/.ssh/known_hosts}"
: "${REPO_OPS:=$HOME/Razer/rai-ops}"
: "${REPO_DEPLOY:=$HOME/Razer/rai-deployment}"
: "${REPO_DEVTOOLS:=$HOME/Razer/rai-devtools}"
: "${TIMEZONE:=Asia/Singapore}"
: "${DOCKER_IMAGE:=ngn-agent:latest}"
# Always starts false; only the -y/--yes flag turns it on.
NONINTERACTIVE=false

# ---- Argument parsing (per D-09) ----
# Abort with a clear message when an option that takes a value is the last
# word on the command line. Without this guard, "$2" is unset and `set -u`
# kills the script with a bare "unbound variable" error.
#   $1 - the option as typed by the user
#   $2 - number of words still unparsed (pass "$#")
_require_optarg() {
    if [ "$2" -lt 2 ]; then
        echo "Error: option $1 requires an argument" >&2
        exit 1
    fi
}

# Walk the command line left to right; every value-taking option consumes
# two words, flags consume one. Later occurrences override earlier ones.
while [[ $# -gt 0 ]]; do
    case "$1" in
        -s1|--ssh-key-1)
            _require_optarg "$1" "$#"; SSH_KEY_1="$2"; shift 2 ;;
        -s2|--ssh-key-2)
            _require_optarg "$1" "$#"; SSH_KEY_2="$2"; shift 2 ;;
        -sc|--ssh-config)
            _require_optarg "$1" "$#"; SSH_CONFIG="$2"; shift 2 ;;
        -sh|--ssh-known-hosts)
            _require_optarg "$1" "$#"; SSH_KNOWN_HOSTS="$2"; shift 2 ;;
        -r1|--repo-ops)
            _require_optarg "$1" "$#"; REPO_OPS="$2"; shift 2 ;;
        -r2|--repo-deploy)
            _require_optarg "$1" "$#"; REPO_DEPLOY="$2"; shift 2 ;;
        -r3|--repo-devtools)
            _require_optarg "$1" "$#"; REPO_DEVTOOLS="$2"; shift 2 ;;
        -t|--timezone)
            _require_optarg "$1" "$#"; TIMEZONE="$2"; shift 2 ;;
        -d|--docker-image)
            _require_optarg "$1" "$#"; DOCKER_IMAGE="$2"; shift 2 ;;
        -y|--yes)
            NONINTERACTIVE=true; shift ;;
        -h|--help)
            usage; exit 0 ;;
        *)
            # Diagnostics belong on stderr; the help text follows on stdout.
            echo "Unknown option: $1" >&2
            usage; exit 1 ;;
    esac
done

# ---- Interactive secret prompt (per D-08) ----
# T-09-05 mitigation: read -s for masked input, no echo to terminal
# Obtain a secret value and print it (and nothing else) on stdout.
#
# Arguments:
#   $1 - name of the variable to look up / report (read via indirection)
#   $2 - prompt text shown to the user
#   $3 - "true" if an empty answer is acceptable (default: false)
# Outputs:
#   stdout - the secret followed by a newline (empty line if optional+empty)
#   stderr - every status line, warning and the newline after masked input,
#            so that `value=$(prompt_secret ...)` captures only the secret
# Returns:
#   0 on success; 1 if stdin hits EOF before a required value was entered
prompt_secret() {
    local var_name="$1"
    local prompt_text="$2"
    local is_optional="${3:-false}"
    local val=""

    # If env var is already set (e.g., user exported it), skip prompt
    if [ -n "${!var_name:-}" ]; then
        echo "  → ${var_name} already set (using environment value)" >&2
        printf '%s\n' "${!var_name}"
        return 0
    fi

    while [ -z "$val" ]; do
        # -s masks the input, -r keeps backslashes in tokens intact.
        # read fails on EOF; with nothing read there is no point looping.
        if ! read -r -s -p "${prompt_text}" val && [ -z "$val" ]; then
            echo >&2
            if [ "$is_optional" = "true" ]; then
                echo ""
                return 0
            fi
            echo "  ERROR: no input available for ${var_name}" >&2
            return 1
        fi
        # read -s swallows the user's Enter; restore the line break on the
        # terminal without polluting the captured value.
        echo >&2
        if [ -z "$val" ]; then
            if [ "$is_optional" = "true" ]; then
                # Optional and empty — return empty string
                echo ""
                return 0
            fi
            echo "  ⚠ Value cannot be empty. Press Ctrl+C to cancel." >&2
        fi
    done
    printf '%s\n' "$val"
}

# ---- Prerequisite checks ----
# Verify the tools and paths the setup depends on.
# Hard requirements (Hermes CLI on PATH, a responding Docker daemon) abort
# the script with exit 1; missing SSH files or repo directories only
# produce a warning line. Reads the SSH_* and REPO_* globals.
check_prerequisites() {
    echo "  → Checking prerequisites..."

    # 1. Hermes CLI installed (per D-07)
    command -v hermes >/dev/null 2>&1 || {
        echo "  ERROR: Hermes CLI not found — install v0.16+ first."
        echo "  See: https://github.com/nousresearch/hermes"
        exit 1
    }
    echo "  ✓ Hermes CLI found: $(hermes --version 2>/dev/null || echo 'unknown version')"

    # 2. Docker running
    docker info >/dev/null 2>&1 || {
        echo "  ERROR: Docker is not running."
        echo "  Start Docker Desktop or Orbstack first."
        exit 1
    }
    echo "  ✓ Docker is running"

    local idx ref target

    # 3. SSH key files exist — parallel tables: variable name, label used
    #    in the warning, label used in the success line.
    local -a file_vars=(SSH_KEY_1 SSH_KEY_2 SSH_CONFIG SSH_KNOWN_HOSTS)
    local -a file_warn=("SSH key" "SSH key" "SSH config" "SSH known_hosts")
    local -a file_ok=("SSH key 1" "SSH key 2" "SSH config" "SSH known_hosts")
    for idx in "${!file_vars[@]}"; do
        ref="${file_vars[$idx]}"
        target="${!ref}"
        if [ -f "$target" ]; then
            echo "  ✓ ${file_ok[$idx]}: ${target}"
        else
            echo "  ⚠ ${file_warn[$idx]} not found: ${target}"
        fi
    done

    # 4. Repo paths exist
    local -a repo_vars=(REPO_OPS REPO_DEPLOY REPO_DEVTOOLS)
    local -a repo_ok=("Repo (ops)" "Repo (deploy)" "Repo (devtools)")
    for idx in "${!repo_vars[@]}"; do
        ref="${repo_vars[$idx]}"
        target="${!ref}"
        if [ -d "$target" ]; then
            echo "  ✓ ${repo_ok[$idx]}: ${target}"
        else
            echo "  ⚠ Repo not found: ${target}"
        fi
    done
}

# ---- Print path summary ----
# Print the effective path/timezone/image settings, framed by one blank
# line above and below. The heredoc is unquoted so the globals expand.
print_summary() {
    cat <<SUMMARY

  Configuration paths:
    SSH key 1:       ${SSH_KEY_1}
    SSH key 2:       ${SSH_KEY_2}
    SSH config:      ${SSH_CONFIG}
    SSH known_hosts: ${SSH_KNOWN_HOSTS}
    Repo (ops):      ${REPO_OPS}
    Repo (deploy):   ${REPO_DEPLOY}
    Repo (devtools): ${REPO_DEVTOOLS}
    Timezone:        ${TIMEZONE}
    Docker image:    ${DOCKER_IMAGE}

SUMMARY
}

# ---- Create config directories ----
# Create the directory skeleton under ~/.hermes that later steps write into.
# mkdir -p makes this safe to re-run on an existing installation.
create_directories() {
    echo "  → Creating config directories..."
    local hermes_root="$HOME/.hermes"
    local subdir
    for subdir in scripts hindsight skills/ngn-agent archive/sessions; do
        mkdir -p "${hermes_root}/${subdir}"
    done
    echo "  ✓ Directories created"
}

# ---- Backup existing config (per Anti-Pattern 4, T-09-07 mitigation) ----
# Copy an existing ~/.hermes/config.yaml to a timestamped sibling
# (config.yaml.bak.YYYYmmdd_HHMMSS) before it gets modified.
# Prints one status line; does nothing else when no config exists yet.
backup_config() {
    local src="$HOME/.hermes/config.yaml"

    if [ ! -f "$src" ]; then
        echo "  → No existing config.yaml to backup"
        return 0
    fi

    # Declared separately from the assignment so a failing `date` is not
    # masked by the exit status of `local`.
    local stamp bak_file
    stamp=$(date +%Y%m%d_%H%M%S)
    bak_file="${src}.bak.${stamp}"

    cp "$src" "$bak_file"
    # ${var##*/} strips the directory part without word-splitting, so the
    # message stays correct when $HOME contains spaces.
    echo "  ✓ Backed up config.yaml → ${bak_file##*/}"
}

# =============================================================================
# Task 2: Config Generation (D-10)
# =============================================================================

# ---- Generate config.yaml ----
# Uses `hermes config set` for scalar keys; array values are written with Python yaml when available, otherwise appended to config.yaml via a heredoc
# Populate ~/.hermes/config.yaml (D-10).
#
# Scalar keys are set through `hermes config set`. The list-valued keys
# (docker_volumes, docker_forward_env, shell_init_files) are written with
# PyYAML when python3 can import it, otherwise appended as a YAML fragment.
#
# Globals read: DOCKER_IMAGE, TIMEZONE, SSH_KEY_1, SSH_KEY_2, SSH_CONFIG,
#               SSH_KNOWN_HOSTS, REPO_OPS, REPO_DEPLOY, REPO_DEVTOOLS, HOME
generate_config_yaml() {
    echo "  → Generating config.yaml (D-10)..."

    # Scalars via hermes config set
    hermes config set terminal.backend docker
    hermes config set terminal.docker_image "${DOCKER_IMAGE}"
    hermes config set terminal.cwd /workspace
    hermes config set terminal.container_memory 5120
    hermes config set terminal.container_disk 51200
    hermes config set terminal.container_cpu 1
    hermes config set terminal.lifetime_seconds 300
    hermes config set memory.provider hindsight
    hermes config set terminal.timezone "${TIMEZONE}"
    hermes config set telegram.reactions false
    hermes config set terminal.docker_env.AWS_REGION us-east-1
    hermes config set terminal.container_persistent true
    hermes config set terminal.docker_mount_cwd_to_workspace true
    echo "  ✓ Scalar config keys set"

    local config_file="$HOME/.hermes/config.yaml"

    # Complex structures (arrays): try Python yaml first, fall back to append
    if python3 -c "import yaml; import os" 2>/dev/null; then
        echo "  → Using Python yaml for array structures..."
        # Values travel through the environment instead of being spliced
        # into the Python source, so quotes or backslashes in a path cannot
        # break (or inject into) the program text.
        NGN_CONFIG_FILE="$config_file" \
        NGN_SSH_KEY_1="$SSH_KEY_1" \
        NGN_SSH_KEY_2="$SSH_KEY_2" \
        NGN_SSH_CONFIG="$SSH_CONFIG" \
        NGN_SSH_KNOWN_HOSTS="$SSH_KNOWN_HOSTS" \
        NGN_REPO_OPS="$REPO_OPS" \
        NGN_REPO_DEPLOY="$REPO_DEPLOY" \
        NGN_REPO_DEVTOOLS="$REPO_DEVTOOLS" \
        python3 - <<'PYEOF'
import os
import yaml

env = os.environ
path = env['NGN_CONFIG_FILE']

# A missing or empty file, or an empty `terminal:` section, must not crash.
try:
    with open(path) as f:
        config = yaml.safe_load(f) or {}
except FileNotFoundError:
    config = {}

terminal = config.get('terminal') or {}
config['terminal'] = terminal
home = os.path.expanduser('~')

terminal['docker_volumes'] = [
    env['NGN_SSH_KEY_1'] + ':/root/.ssh/id_ed25519razer:ro',
    env['NGN_SSH_KEY_2'] + ':/root/.ssh/id_rsa:ro',
    env['NGN_SSH_CONFIG'] + ':/root/.ssh/config:ro',
    env['NGN_SSH_KNOWN_HOSTS'] + ':/root/.ssh/known_hosts:ro',
    home + '/.aws/config:/root/.aws/config:ro',
    home + '/.aws/sso/cache:/root/.aws/sso/cache:rw',
    env['NGN_REPO_OPS'] + ':/workspace/rai-ops:rw',
    env['NGN_REPO_DEPLOY'] + ':/workspace/rai-deployment:rw',
    env['NGN_REPO_DEVTOOLS'] + ':/workspace/rai-devtools:rw',
    home + '/.hermes/scripts:/usr/local/bin:ro',
]

terminal['docker_forward_env'] = ['JIRA_EMAIL', 'JIRA_API_TOKEN', 'DEFAULT_REPOS']
terminal['shell_init_files'] = ['/usr/local/bin/session-init.sh']

with open(path, 'w') as f:
    yaml.dump(config, f, default_flow_style=False)
PYEOF
        echo "  ✓ Array structures set via Python yaml"
    else
        echo "  → Python yaml not available, using sed fallback..."
        # Fallback: append the arrays to config.yaml.
        # The heredoc delimiter is deliberately unquoted so the path
        # variables expand; a quoted delimiter would write the literal
        # text "${SSH_KEY_1}" into the config.
        # NOTE(review): this appends a second top-level `terminal:` key when
        # the file already has one (e.g. from the scalar step above); how the
        # consumer treats duplicate keys is not visible here — hence the
        # "review config.yaml" hint in the message below.
        cat >> "$config_file" << SEDVOL

terminal:
  docker_volumes:
  - ${SSH_KEY_1}:/root/.ssh/id_ed25519razer:ro
  - ${SSH_KEY_2}:/root/.ssh/id_rsa:ro
  - ${SSH_CONFIG}:/root/.ssh/config:ro
  - ${SSH_KNOWN_HOSTS}:/root/.ssh/known_hosts:ro
  - ${HOME}/.aws/config:/root/.aws/config:ro
  - ${HOME}/.aws/sso/cache:/root/.aws/sso/cache:rw
  - ${REPO_OPS}:/workspace/rai-ops:rw
  - ${REPO_DEPLOY}:/workspace/rai-deployment:rw
  - ${REPO_DEVTOOLS}:/workspace/rai-devtools:rw
  - ${HOME}/.hermes/scripts:/usr/local/bin:ro
  docker_forward_env:
  - JIRA_EMAIL
  - JIRA_API_TOKEN
  - DEFAULT_REPOS
  shell_init_files:
  - /usr/local/bin/session-init.sh
SEDVOL
        echo "  ✓ Array structures set via sed (partial — review config.yaml)"
    fi

    # Validate: read one key back; "unset" is shown if the lookup fails
    local img
    img=$(hermes config get terminal.docker_image 2>/dev/null || echo "unset")
    echo "  ✓ Verified: terminal.docker_image = ${img}"
}

# ---- Generate .env file ----
# T-09-06 mitigation: chmod 600 on .env immediately after writing
# Write ~/.hermes/.env with the collected secrets (D-08, D-10).
#
# T-09-06: the file must never be readable by other users, not even
# briefly. An existing file is tightened before it is rewritten, and a new
# file is created under umask 077, so the secrets are never on disk with
# wider permissions. The final chmod is kept as a belt-and-braces step.
#
# Globals read: OPENROUTER_API_KEY, JIRA_API_TOKEN, JIRA_EMAIL,
#               TELEGRAM_BOT_TOKEN, HOME
# Optional overrides: HINDSIGHT_LLM_API_KEY (defaults to OPENROUTER_API_KEY),
#                     TELEGRAM_ALLOWED_USERS (defaults to 474440517)
generate_env_file() {
    echo "  → Generating .env (D-08, D-10)..."

    local env_file="$HOME/.hermes/.env"

    # Resolve HINDSIGHT_LLM_API_KEY — defaults to OPENROUTER_API_KEY if not separately provided
    local hind_key="${HINDSIGHT_LLM_API_KEY:-${OPENROUTER_API_KEY}}"
    # The allow-list keeps its previous value unless the caller exports one.
    local allowed_users="${TELEGRAM_ALLOWED_USERS:-474440517}"

    # `>` keeps the mode of an existing file, so restrict it up front.
    if [ -e "$env_file" ]; then
        chmod 600 "$env_file"
    fi

    # Subshell: the tightened umask applies to this write only.
    (
        umask 077
        cat > "$env_file" << ENVEOF
# ngn-agent Environment — generated by setup-ngn-agent.sh
# Embedded file snapshots frozen at: 2026-06-15

# =============================================================================
# LLM PROVIDER (OpenRouter)
# =============================================================================
OPENROUTER_API_KEY=${OPENROUTER_API_KEY}

# =============================================================================
# ATLASSIAN INTEGRATION
# =============================================================================
JIRA_API_TOKEN=${JIRA_API_TOKEN}
JIRA_EMAIL=${JIRA_EMAIL}

# =============================================================================
# TELEGRAM GATEWAY
# =============================================================================
TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
TELEGRAM_ALLOWED_USERS=${allowed_users}

# =============================================================================
# HINDSIGHT MEMORY
# =============================================================================
HINDSIGHT_LLM_API_KEY=${hind_key}

# =============================================================================
# NGN-AGENT CONFIG
# =============================================================================
DEFAULT_REPOS=rai-ops,rai-deployment,rai-devtools
TERMINAL_TIMEOUT=60
TERMINAL_LIFETIME_SECONDS=300
ENVEOF
    )

    # T-09-06: Restrict permissions immediately after writing
    chmod 600 "$env_file"
    echo "  ✓ .env written with chmod 600 (T-09-06)"
}

# ---- Generate hindsight config.json (D-10) ----
# Write the static hindsight memory settings to
# ~/.hermes/hindsight/config.json (D-10). The JSON is emitted verbatim;
# the target directory must already exist.
generate_hindsight_config() {
    local json_path="$HOME/.hermes/hindsight/config.json"

    echo "  → Generating hindsight/config.json (D-10)..."

    cat << 'JSONEOF' > "$json_path"
{
  "mode": "local_embedded",
  "llm_provider": "openrouter",
  "llm_base_url": "https://openrouter.ai/api/v1",
  "llm_model": "qwen/qwen3.5-9b",
  "bank_id": "hermes",
  "recall_budget": "low",
  "recall_prefetch_method": "recall",
  "auto_recall": true,
  "recall_types": "observation",
  "auto_retain": true,
  "retain_async": true,
  "retain_every_n_turns": 5,
  "memory_mode": "hybrid"
}
JSONEOF

    echo "  ✓ hindsight/config.json written"
}

# ---- Generate cron env config (D-10) ----
# Copy the Jira credentials into the Hermes cron environment (D-10).
#
# Best effort: a failing `hermes config set` produces a warning and the
# function still returns 0, but the closing status line now reflects
# whether every key was actually stored.
#
# Globals read: JIRA_EMAIL, JIRA_API_TOKEN
generate_cron_env_config() {
    echo "  → Configuring cron environment (D-10)..."

    local key
    local failed=0
    for key in JIRA_EMAIL JIRA_API_TOKEN; do
        # ${!key} reads the variable whose name is stored in $key.
        # NOTE(review): the value is passed as a command-line argument and
        # may be visible in process listings while the command runs; switch
        # to a stdin/env-based input if the hermes CLI offers one.
        if ! hermes config set "cron.env.${key}" "${!key}" 2>/dev/null; then
            echo "  ⚠ Could not set cron.env.${key}"
            failed=$((failed + 1))
        fi
    done

    if [ "$failed" -eq 0 ]; then
        echo "  ✓ Cron env vars configured"
    else
        echo "  ⚠ Cron env vars not fully configured (${failed} failed)"
    fi
}

# =============================================================================
# Task 3: File/Cron Setup (D-10)
# =============================================================================

# ---- Write session-init.sh (D-10) ----
# Install session-init.sh into ~/.hermes/scripts and mark it executable
# (D-10). The script body is written verbatim (quoted heredoc delimiter).
write_session_init_script() {
    local init_script="$HOME/.hermes/scripts/session-init.sh"

    echo "  → Writing session-init.sh (D-10)..."
    cat << 'SCRIPT' > "$init_script"
#!/bin/bash
# session-init.sh — Verify DEFAULT_REPOS mounts at session start
# Runs via shell_init_files before agent prompt. Non-blocking.
# Reads DEFAULT_REPOS from environment (forwarded via docker_forward_env).
set -uo pipefail

DEFAULT_REPOS="${DEFAULT_REPOS:-}"

if [ -z "$DEFAULT_REPOS" ]; then
  echo "[session-init] DEFAULT_REPOS not set — skipping verification"
  exit 0
fi

# Split comma-separated list
IFS=',' read -ra REPOS <<< "$DEFAULT_REPOS"
ALL_OK=true

for repo in "${REPOS[@]}"; do
  # Trim whitespace
  repo="${repo#"${repo%%[![:space:]]*}"}"
  repo="${repo%"${repo##*[![:space:]]}"}"

  if [ -d "/workspace/$repo/.git" ]; then
    echo "[session-init] ✓ $repo — mounted at /workspace/$repo"
  else
    echo "[session-init] ⚠ $repo — NOT FOUND at /workspace/$repo"
    ALL_OK=false
  fi
done

if [ "$ALL_OK" = true ]; then
  echo "[session-init] All DEFAULT_REPOS verified"
else
  echo "[session-init] Some repos missing — check docker_volumes in config.yaml"
fi

exit 0  # always exit cleanly — non-blocking
SCRIPT
    chmod +x "$init_script"
    echo "  ✓ session-init.sh written and executable"
}

# ---- Write archive-stale-sessions.sh (D-10) ----
# Install archive-stale-sessions.sh into ~/.hermes/scripts and mark it
# executable (D-10).
#
# The generated script exports the session store and, only when dry-run is
# off, prunes sessions older than 30 days. Dry-run remains the default, but
# can now be switched off by exporting DRY_RUN=false — which is what the
# script's own "set DRY_RUN=false" hint tells the operator to do.
write_archive_script() {
    local archive_script="$HOME/.hermes/scripts/archive-stale-sessions.sh"

    echo "  → Writing archive-stale-sessions.sh (D-10)..."
    cat > "$archive_script" << 'SCRIPT'
#!/bin/bash
# Archive stale sessions (inactive >30 days) and prune from live DB
# This script runs via hermes cron with --no-agent
# Stdout is delivered to Telegram via --deliver telegram
# Dry-run mode: export only, no prune — safe default for first run
# Export DRY_RUN=false in the environment to enable the prune step
set -euo pipefail

DRY_RUN="${DRY_RUN:-true}"

ARCHIVE_DIR="$HOME/.hermes/archive/sessions"
mkdir -p "$ARCHIVE_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
OUTPUT_FILE="$ARCHIVE_DIR/sessions-${TIMESTAMP}.jsonl"

echo "=== Stale Session Archive ==="
echo "Started: $(date)"
echo "Dry run: $DRY_RUN"
echo ""

echo "[1/3] Exporting session store..."
echo "  Output: $OUTPUT_FILE"
hermes sessions export "$OUTPUT_FILE"
echo "  -> $(wc -l < "$OUTPUT_FILE") sessions exported"
echo "  -> Size: $(du -h "$OUTPUT_FILE" | cut -f1)"
echo ""

if [ "$DRY_RUN" = false ]; then
  echo "[2/3] Pruning sessions older than 30 days..."
  hermes sessions prune --older-than 30 --yes
  echo "  Done."
else
  echo "[2/3] SKIPPED (dry run) — set DRY_RUN=false to enable prune"
  echo "  Review $OUTPUT_FILE before enabling."
fi
echo ""

echo "[3/3] Post-archive stats:"
hermes sessions stats
echo ""

echo "✓ Archive complete."
SCRIPT
    chmod +x "$archive_script"
    echo "  ✓ archive-stale-sessions.sh written and executable"
}

# ---- Write skill files (D-10) ----
# Install the Jira query skill definition at
# ~/.hermes/skills/ngn-agent/jira/SKILL.md. The Markdown is written
# verbatim (quoted heredoc delimiter).
write_jira_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/jira"

    mkdir -p "$skill_dir"
    cat << 'SKILL' > "${skill_dir}/SKILL.md"
---
name: jira-query
description: Query Jira Cloud issues, search, and manage tickets
metadata:
  hermes:
    tags: [jira, project-management]
    category: devops
    requires_toolsets: [terminal]
version: 1.0.0
---
# Jira Cloud Query

## When to Use
When the user asks to search Jira issues, check ticket status, or list project work.

## Procedure

### 1. Search issues by JQL
```bash
ngn-jira GET '/rest/api/3/search?jql=ORDER BY created DESC&maxResults=10'
```

For specific project:
```bash
ngn-jira GET '/rest/api/3/search?jql=project=PROJ ORDER BY created DESC&maxResults=10'
```

### 2. Get issue details
```bash
ngn-jira GET '/rest/api/3/issue/PROJ-123'
```

### 3. List sprints (if Jira Software)
```bash
ngn-jira GET '/rest/agile/1.0/board'
ngn-jira GET '/rest/agile/1.0/board/{boardId}/sprint?state=active'
```

### 4. Get issue comments
```bash
ngn-jira GET '/rest/api/3/issue/PROJ-123/comment'
```

## Pitfalls
- JQL is case-sensitive for field names
- maxResults defaults to 50; set explicitly for large queries
- Agile REST API may not be available on all plans

## Required Environment
- `JIRA_EMAIL` — your Atlassian account email
- `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
SKILL
    echo "  ✓ jira/SKILL.md written"
}

# Install the AWS diagnostics skill and its reference document under
# ~/.hermes/skills/ngn-agent/aws-diagnostics/. Both files are written
# verbatim (quoted heredoc delimiters).
#
# The profile-list sentence now points at `~/.aws/config`; the previous
# `/.aws/config` does not match where this setup mounts the AWS config in
# the container (/root/.aws/config).
write_aws_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/aws-diagnostics"

    mkdir -p "${skill_dir}/references"
    cat > "${skill_dir}/SKILL.md" << 'SKILL'
---
name: aws-diagnostics
description: Read-only AWS diagnostics for platform engineering
metadata:
  hermes:
    tags: [aws, diagnostics, platform-engineering]
    category: devops
    requires_toolsets: [terminal]
version: 1.0.0
---
# AWS Diagnostics

## When to Use
When the user asks to check AWS resources, investigate issues, or audit infrastructure in any account.

## Important
- ALWAYS determine the correct AWS_PROFILE before running commands
- NEVER run mutating AWS commands (delete, terminate, stop, modify)
- Prefer read-only AWS CLI commands (describe, list, get)

## Procedure

### 1. Identify the target account
Ask the user which account/environment they want to target. Available profiles in `~/.aws/config`:
- `rzaws-sw-rai-ava-dev/prod/rc` — AVA service
- `rzaws-sw-rai-cs-dev/prod/rc` — CS service
- `rzaws-sw-rai-qac-dev/prod` — QAC
- `rzaws-sw-rai-ops` — Ops account
- `rzaws-sw-rai-voicekit-dev/prod/rc` — VoiceKit
- `rzaws-sw-rai-preprod` — Pre-production
- `rzaws-sw-rai-nonprod` — Non-production

### 2. Set the profile
```bash
export AWS_PROFILE=rzaws-sw-rai-<service>-<env>
```

### 3. Diagnostic commands

**EC2 instances:**
```bash
aws ec2 describe-instances --query 'Reservations[*].Instances[*].[InstanceId,State.Name,InstanceType,Tags[?Key==`Name`].Value|[0]]' --output table
```

**ECS services:**
```bash
aws ecs list-clusters && aws ecs list-services --cluster <name>
```

**S3 buckets:**
```bash
aws s3 ls
```

**CloudWatch alarms:**
```bash
aws cloudwatch describe-alarms --state-value ALARM --output table
```

**ECS task health:**
```bash
aws ecs describe-tasks --cluster <name> --tasks <task-ids>
```

**RDS instances:**
```bash
aws rds describe-db-instances --query 'DBInstances[*].[DBInstanceIdentifier,DBInstanceStatus,Engine,DBInstanceClass]' --output table
```

**Lambda functions:**
```bash
aws lambda list-functions --query 'Functions[*].[FunctionName,Runtime,LastModified]' --output table
```

**ELB target group health:**
```bash
aws elbv2 describe-target-groups --query 'TargetGroups[*].[TargetGroupName,TargetType]' --output table
```

### 4. Report findings
Format as a concise table. Include account ID and profile used.

## Alternative: Infrastructure Code Analysis

When AWS CLI access is unavailable (Docker containers, credential issues), examine existing infrastructure code instead:

```bash
# Search for region patterns
search_files --pattern="us-west-2" --path="/workspace"

# Check terraform configurations
read_file /workspace/rai-ops/aws/<account>/us-east-1/app/main.tf
read_file /workspace/rai-ops/aws/<account>/us-east-1/app/<app>.tfvars

# Look for provider configurations
search_files --pattern="provider.*replica" --path="/workspace/rai-ops"

# Check S3 migration data
search_files --pattern="s3-mapping" --target="files"
```

**When to use code analysis:**
- Docker container with read-only filesystem
- Missing AWS CLI or credentials
- Need to understand intended architecture vs live state
- Investigating multiregional setup patterns

## Pitfalls
- SSO tokens expire (~6-8h). If you get auth errors, ask the user to run `aws sso login`
- Some accounts may not have all services — check `aws sts get-caller-identity` first
- Don't pipe large results directly — use `--query` and `--output table` for readability
- **Don't persist with CLI installation in constrained environments** — switch to code analysis quickly when installation fails

## Verification
Run `aws sts get-caller-identity` to confirm the correct profile is active before running diagnostics.

## References
- `references/multiregional-patterns.md` - Terraform patterns for cross-region infrastructure setup
SKILL

    # AWS reference file
    cat > "${skill_dir}/references/multiregional-patterns.md" << 'REF'
# Multiregional Infrastructure Patterns

## AVA Multiregional Setup

### Provider Configuration Pattern
```hcl
# Primary provider (us-east-1)
provider "aws" {
  region = var.region
  # ... assume_role block
}

# Replica provider (us-west-2)
provider "aws" {
  alias  = "replica"
  region = "us-west-2"
  # ... same assume_role block
}
```

### Module Consumption
```hcl
module "app" {
  providers = {
    aws         = aws
    aws.replica = aws.replica  # Required by tf-modules/app/versions.tf
  }
  # ... other config
}
```

### Database Replication Options

**RDS Aurora PostgreSQL (Current Pattern)**
- Engine: `aurora-postgresql`
- Version: `16.11`
- Cross-region read replicas supported
- Can promote replica for DR scenarios

**DynamoDB Global Tables (Available)**
- Global Table v2 with us-east-1 + us-west-2 replicas
- Per-region CMKs for encryption
- Feature-flagged via `var.tenant_registry`
- Documented in RAID-352

### S3 Cross-Region Replication
Extensive existing pattern from migration data:
- `ava-{env}-west-*` buckets in us-west-2
- Matching `rai-s3-usw2-*` naming convention
- Covers: bug reports, screenshots, game logs, shiny moments

### Key Files for Multiregional Analysis
- `aws/<account>/us-east-1/app/provider.tf` - Replica provider config
- `aws/<account>/us-east-1/app/<app>.tfvars` - App-specific resources
- `raid-migration/raid-s3-migration/s3-mapping.csv` - Cross-region S3 inventory
- `RAID-352-PR-DESCRIPTION.md` - DynamoDB Global Tables documentation
REF
    echo "  ✓ aws-diagnostics/SKILL.md + references written"
}

# Install the Confluence search skill definition at
# ~/.hermes/skills/ngn-agent/confluence/SKILL.md. The Markdown is written
# verbatim (quoted heredoc delimiter).
write_confluence_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/confluence"

    mkdir -p "$skill_dir"
    cat << 'SKILL' > "${skill_dir}/SKILL.md"
---
name: confluence-search
description: Search and retrieve Confluence pages
metadata:
  hermes:
    tags: [confluence, documentation]
    category: devops
    requires_toolsets: [terminal]
version: 1.0.0
---
# Confluence Search

## When to Use
When the user asks to find documentation, search Confluence pages, or retrieve page content.

## Procedure

### 1. Search pages by text
```bash
ngn-confluence GET '/rest/api/search?cql=text~"search terms"&limit=10'
```

### 2. Search by space
```bash
ngn-confluence GET '/rest/api/search?cql=space=ADM&limit=10'
```

### 3. Get page content
```bash
ngn-confluence GET '/rest/api/content/{pageId}?expand=body.storage'
```

### 4. List pages in space
```bash
ngn-confluence GET '/rest/api/content?spaceKey=ADM&limit=50'
```

### 5. Get page children
```bash
ngn-confluence GET '/rest/api/content/{pageId}/child/page?limit=50'
```

## Pitfalls
- CQL is different from JQL — `text~"query"` for full-text search
- Page body needs `expand=body.storage` to retrieve content
- Use `limit` parameter — defaults to 25

## Required Environment
- `JIRA_EMAIL` — your Atlassian account email
- `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
SKILL
    echo "  ✓ confluence/SKILL.md written"
}

# Install the Bitbucket pull-request skill definition at
# ~/.hermes/skills/ngn-agent/bitbucket/SKILL.md. The Markdown is written
# verbatim (quoted heredoc delimiter).
write_bitbucket_skill() {
    local skill_dir="$HOME/.hermes/skills/ngn-agent/bitbucket"

    mkdir -p "$skill_dir"
    cat << 'SKILL' > "${skill_dir}/SKILL.md"
---
name: bitbucket-pr
description: Review Bitbucket pull requests and repositories
metadata:
  hermes:
    tags: [bitbucket, git, code-review]
    category: devops
    requires_toolsets: [terminal]
version: 1.0.0
---
# Bitbucket Pull Requests

## When to Use
When the user asks to check PRs, review code, or list repositories.

## Procedure

### 1. List repositories
```bash
ngn-bitbucket GET '/repositories/razersw?pagelen=20'
```

### 2. List open PRs for a repo
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests?state=OPEN&pagelen=20'
```

### 3. Get PR details
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}'
```

### 4. Get PR diff
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}/diff'
```

### 5. Get PR comments
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}/comments'
```

### 6. List branch list
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/refs/branches?pagelen=20'
```

## Pitfalls
- Bitbucket pagination uses `pagelen` and `page` params (not `maxResults`)
- Diff endpoint returns raw diff text — may be large
- PR comments include inline code comments, not just summary

## Required Environment
- `JIRA_EMAIL` — your Atlassian account email
- `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
SKILL
    echo "  ✓ bitbucket/SKILL.md written"
}

write_session_skill() {
    mkdir -p "$HOME/.hermes/skills/ngn-agent/session/references"
    cat > "$HOME/.hermes/skills/ngn-agent/session/SKILL.md" << 'SKILL'
---
name: session
description: Main ngn-agent session lifecycle — init, work, close
metadata:
  hermes:
    tags: [ngn-agent, platform-engineering, session]
    category: devops
    requires_toolsets: [terminal]
version: 1.0.0
---

# ngn-agent Session Lifecycle

## When to Use

Load this skill at the START of EVERY platform engineering session, before any other work. This skill defines the standard session workflow.

Specific triggers:
- When the user starts any infrastructure or platform engineering task
- When the user asks to create a Jira ticket or find a ticket
- When the user wants to search or load Confluence documentation
- When a session is ending and you need to document progress
- When you need to save context for future sessions

## Important

- **Keep this skill loaded for the entire session** — if context grows large, reload via `skill_view("session")` before the session-end steps (Steps 5–7)
- **Never create Jira tickets without asking the user first** (D-02)
- **Never update Confluence without asking the user first** (D-11)
- **Always save session summary to hindsight at end** — this step has no user prompt, it is automatic (D-12)
- User must confirm before any Jira mutation (create, comment, transition) — D-08
- Repos are already mounted at `/workspace/` from Phase 6 (rai-ops, rai-deployment, rai-devtools)
- This skill replaces the ad-hoc session workflow with a repeatable init→work→close pattern

## Procedure

### 1. Check for Similar Previous Sessions

At the very start of a session, use `hindsight_recall` with a query describing the user's current task to find similar sessions from the last 2 weeks.

Call `hindsight_recall` with a budget of low:

```
Tool: hindsight_recall
Query: "<user's task description>"
Budget: low
```

Present any matches to the user in this format:

```
Found [N] similar sessions from the last 2 weeks:
1. [Session Title] — [Date] — [one-line summary]
2. [Session Title] — [Date] — [one-line summary]
```

Ask the user: "Would you like to resume any of these sessions, or start fresh?"
- If they choose to resume: load that session's context and continue
- If they choose fresh: proceed to step 2

If no similar sessions are found (normal for first sessions), proceed to step 2.

### 2. Prompt: Create Jira Ticket

Ask the user: "Would you like to create a Jira Task ticket for this session?"

If YES:
1. Ask which Jira project to use (e.g., "PLATFORM", "DEVOPS") — do not hardcode (D-06)
2. Check hindsight for cached epics:

   ```
   Tool: hindsight_recall
   Query: "jira epics cached"
   Budget: low
   ```

3. If epics are cached, check the cache timestamp:
   - If the cache is more than 24 hours old OR the user says the list looks wrong, refresh from Jira:
     ```bash
     ngn-jira GET '/rest/api/3/search?jql=issuetype=Epic AND project=<PROJECT>&fields=summary,id&maxResults=50'
     ```
     Save fresh epics to hindsight:
     ```
     Tool: hindsight_retain
     tier: "epic-cache"
     content: "Epic Cache [<date>]: PROJECT=<PROJECT>: [EPIC-KEY-1: Summary, EPIC-KEY-2: Summary, ...]"
     ```
   - If the cache is fresh (less than 24 hours old), use the cached list
4. If no cached epics found, query Jira for current epics:
   ```bash
   ngn-jira GET '/rest/api/3/search?jql=issuetype=Epic AND project=<PROJECT>&fields=summary,id&maxResults=50'
   ```
   Save to hindsight for future sessions:
   ```
   Tool: hindsight_retain
   tier: "epic-cache"
   content: "Epic Cache [<date>]: PROJECT=<PROJECT>: [EPIC-KEY-1: Summary, ...]"
   ```
5. Present cached/refreshed epics to the user: "Available epics: [list]. Would you like to set a parent epic?"
6. If user selects an epic, include it as parent when creating the ticket
7. Create the Task via Jira REST API:

   ```bash
   ngn-jira POST '/rest/api/3/issue' --body '{
     "fields": {
       "project": {"key": "<PROJECT>"},
       "summary": "<session task description>",
       "issuetype": {"name": "Task"},
       "parent": {"key": "<EPIC_KEY>"}
     }
   }'
   ```

8. Note the ticket key (e.g., `PLATFORM-123`) — save it for session-end steps (Step 5)

If NO: proceed to step 3 (no Jira ticket this session)

### 3. Prompt: Load Confluence Documentation

Ask the user: "Would you like to load relevant Confluence documentation?"

If YES:
1. Search by the `ngn-agent` tag:

   ```bash
   ngn-confluence GET '/rest/api/search?cql=tag="ngn-agent"&limit=20'
   ```

2. Present matching pages to the user:
   ```
   Found [N] pages tagged 'ngn-agent':
   - [Title] — [Space] — [Last Modified]
   ```

3. Ask: "Which pages would you like me to load?"
4. For each selected page, load its full content:

   ```bash
   ngn-confluence GET '/rest/api/content/{pageId}?expand=body.storage'
   ```

5. Review the loaded content with the user

If NO: proceed to step 4

### 4. Work Phase

Repos are already mounted at `/workspace/` (rai-ops, rai-deployment, rai-devtools). Proceed with the task using standard Hermes tools.

If you need to clone additional repos:
```bash
git clone git@bitbucket.org:razersw/<repo>.git /workspace/<repo>
```

The session skill remains loaded for the session-end steps below. If the skill is evicted from context during a long session, reload it with `skill_view("session")` before proceeding to Steps 5–7.

### 5. Session-End: Update Jira

When the user indicates work is complete or the session wraps up:

Ask the user: "Would you like me to update the Jira ticket with a summary comment?"

If YES (and a ticket was created in Step 2):
```bash
ngn-jira POST '/rest/api/3/issue/<TICKET-KEY>/comment' --body '{
  "body": "<summary of work done, key decisions, next steps>"
}'
```

If NO: proceed without updating Jira.

**Important (D-08):** Do NOT transition tickets (e.g., close, resolve, move to Done) without explicit user confirmation. Only add comments unless the user specifically asks for a status change.

### 6. Session-End: Update Confluence

Ask the user: "Would you like me to create or update a Confluence page documenting this session?"

If YES:
- For a new page:
  ```bash
  ngn-confluence POST '/rest/api/content' --body '{
    "type": "page",
    "title": "<Session Date>: <Task Description>",
    "space": {"key": "<SPACE_KEY>"},
    "body": {
      "storage": {
        "value": "<h1>Session Summary</h1><p><task summary, key decisions, outcomes></p>",
        "representation": "storage"
      }
    },
    "metadata": {
      "properties": {
        "content-appearance": {"value": "page"}
      }
    },
    "labels": [{"name": "ngn-agent"}]
  }'
  ```

- For updating an existing page: ask the user which page to update, then PUT to update its content

- **Important (D-11):** Do NOT create or update any Confluence page without the user confirming first

If NO: proceed without updating Confluence.

### 7. Session-End: Save to Hindsight (Automatic — No Prompt)

ALWAYS save a session summary to hindsight memory. Do NOT ask the user — this step is automatic and unconditional (D-12).

```
Tool: hindsight_retain
tier: "session-summary"
content: "
Session Summary
===============
Date: <today>
Task: <task description>
Repos: <repos worked on>
Jira: <ticket key or \"none\">
Key Decisions:
- <decision 1>
- <decision 2>
Outcomes:
- <outcome 1>
- <outcome 2>
Next Steps:
- <next step 1>
"
```

This summary allows future `hindsight_recall` queries to find this session for similarity matching (D-13). The structured content includes: date, task description, repos worked on, Jira ticket reference (or "none"), key decisions, outcomes, and next steps.

## Pitfalls

- **Skill not loaded at session start:** If you find yourself midway through a session without having run Steps 1–3, you missed the session start workflow. Run Step 1 (hindsight_recall) retroactively and ask the user if they want to create a Jira ticket or load Confluence docs. For future sessions, make sure to load this skill at the very start.
- **Epic cache too old:** Epics may change between sessions. Check the cache timestamp and refresh if more than 24 hours old. If the user says "that's wrong," always refresh regardless of age.
- **Confluence tag mismatch:** If the `ngn-agent` tag returns no results, try `platform-engineering` as a fallback, or ask the user what tag they use for session documentation.
- **Jira project doesn't exist:** If the create ticket call fails with a 404, the project key may be wrong. Ask the user to confirm the correct project key.
- **Empty hindsight recall (first sessions):** The first few sessions will have no similar sessions to find. That is normal — proceed with a fresh session. Over time, hindsight will accumulate session summaries.
- **Long sessions may evict this skill:** If the conversation grows long, the session skill content may be evicted from the agent's context. Reload it with `skill_view("session")` before the session-end steps (Steps 5–7) to ensure the Jira/Confluence prompts and hindsight save are not missed.
- **Missing Jira credentials in cron jobs:** The ngn-jira tool requires both `JIRA_EMAIL` and `JIRA_API_TOKEN` environment variables. If either is missing, Jira operations will fail with "unbound variable" errors. Check environment setup before attempting Jira updates in automated workflows.

## Operational Automation

### Daily Session Monitoring (Cron Job)

When running as a scheduled cron job for operational monitoring:

1. **Discover Active Sessions**:
   ```bash
   hermes sessions export -  # NOT 'hermes sessions list' - no --json flag available
   ```
   Parse the JSONL output with Python to find sessions whose `last_active` falls within the last 7 days

2. **Find Associated Jira Tickets**:
   - Use `hindsight_recall` with query 'session summary jira' for each active session
   - Search session messages for Jira patterns: `PLATFORM-\d+`, `AIOPS-\d+`, `RAID-\d+`, etc.
   - Note: One session may have multiple Jira tickets (1-to-many mapping)

3. **Update Jira with Progress**:
   ```bash
   ngn-jira POST '/rest/api/3/issue/<KEY>/comment' --body '{
     "body": "Session activity update — Date: <today>, Last active: <last_active>. Session: <session_id>. Progress: See session transcript for details."
   }'
   ```

4. **Generate Telegram Report**:
   - Structure: Active Sessions + Jira Updated + Issues/Summary
   - Keep under 4096 character limit
   - Format with emoji sections for clarity

**Environment Requirements for Operational Jobs**:
- `JIRA_EMAIL` — Required for ngn-jira authentication
- `JIRA_API_TOKEN` — API token from Atlassian account
- Both must be set or Jira updates will fail

See `references/operational-monitoring.md` for detailed patterns, templates, and troubleshooting.

**Important Constraints (Cron Mode)**:
- DO NOT transition ticket statuses (D-05) - only add comments
- DO NOT update stale sessions (D-15) - only active within 7 days
- Use silent mode `[SILENT]` if no active sessions found

## Verification

1. On session start, agent checks for similar sessions via `hindsight_recall` ✓
2. Jira Task ticket created (or user declined) ✓
3. Confluence docs loaded by `ngn-agent` tag search (or user declined) ✓
4. At session end, user prompted for Jira update ✓
5. At session end, user prompted for Confluence update ✓
6. Session summary automatically saved to hindsight via `hindsight_retain` (no prompt) ✓
7. **Operational cron jobs can discover active sessions and update Jira tickets** ✓
SKILL

    # Session reference file
    cat > "$HOME/.hermes/skills/ngn-agent/session/references/operational-monitoring.md" << 'REF'
# Operational Session Monitoring

## Jira Ticket Pattern Detection

When scanning session content for associated Jira tickets, search for these patterns:

```python
jira_patterns = [
    r'(PLATFORM-\d+)',    # Platform engineering tickets
    r'(AIOPS-\d+)',       # AI Operations tickets
    r'(RAID-\d+)',        # RAID project tickets
    r'(DEVOPS-\d+)',      # DevOps tickets
    r'(QAC-\d+)'          # QAC tickets
]
```

## Session Export vs List Commands

**CORRECT**: `hermes sessions export -`
- Returns machine-readable JSONL format
- Each line is a complete session object
- Includes `last_active` timestamps for filtering

**INCORRECT**: `hermes sessions list --json`
- The `--json` flag does not exist (Pitfall from RESEARCH.md)
- Use export for automation, list for human viewing only

## Environment Variable Requirements

The `ngn-jira` tool wrapper expects:
- `JIRA_EMAIL` - Atlassian account email
- `JIRA_API_TOKEN` - From https://id.atlassian.com/manage/api-tokens

Missing either variable causes: `bash: line 10: JIRA_EMAIL: unbound variable`

## Telegram Report Template

```
📋 **ACTIVE SESSIONS** — {date}

🔹 **{session_id}**
   Title: {title}
   Last Active: {timestamp}
   Jira: {ticket_keys or "None"}

🔄 **JIRA UPDATED**: {list of updated ticket keys}

❌ **ISSUES**: {any operational problems}

📊 **SUMMARY**: {count} active sessions found, {count} with Jira tickets
```

Character limit: 4096 for Telegram delivery
REF
    echo "  ✓ session/SKILL.md + references written"
}

# ---- Register cron jobs (D-10) ----
register_cron_jobs() {
    # Registers the three ngn-agent cron jobs with `hermes cron create`.
    #
    # Every job is attempted even if an earlier one fails (best effort).
    # Outputs: progress lines on stdout; hermes diagnostics on stderr.
    # Returns: 0 if all jobs registered, 1 if at least one did not, so the
    #          caller's `register_cron_jobs || ...` fallback can actually fire.
    local failures=0

    echo "  → Registering cron jobs (D-10)..."

    # 1. ngn-daily-report — daily at 09:00 (SGT per the original schedule
    #    notes; assumes Hermes evaluates cron in that timezone — TODO confirm)
    _register_cron_job "ngn-daily-report" \
        --deliver telegram --skill session --skill jira-query \
        '0 9 * * *' \
        'Daily session report. Export sessions, find active ones, check Jira, compose Telegram summary.' \
        || failures=$((failures + 1))

    # 2. ngn-weekly-stale-summary — Sunday 20:00
    _register_cron_job "ngn-weekly-stale-summary" \
        --deliver telegram --skill session \
        '0 20 * * 0' \
        'Weekly stale session summary. Review sessions inactive >30 days, compose Telegram summary.' \
        || failures=$((failures + 1))

    # 3. ngn-weekly-archive — Sunday 20:05, five minutes after the summary
    #    (per D-10); runs a script, no agent prompt.
    _register_cron_job "ngn-weekly-archive" \
        --no-agent --script archive-stale-sessions.sh \
        '5 20 * * 0' \
        || failures=$((failures + 1))

    if (( failures > 0 )); then
        return 1
    fi
    return 0
}

# Runs one `hermes cron create` call and reports the outcome.
# Arguments: $1 - label used in progress messages only (it is not passed to
#                 hermes); remaining args are forwarded to `hermes cron create`.
# Outputs:   progress lines and hermes' own stdout on stdout; on failure the
#            captured hermes stderr is echoed to stderr so the real cause is
#            visible instead of being assumed to be "already exists".
# Returns:   0 on success, 1 if hermes reported a failure.
_register_cron_job() {
    local label=$1
    shift
    local diagnostics="" line

    echo "  → Creating ${label}..."

    # fd 3 is a copy of our stdout: hermes stdout passes straight through
    # while its stderr alone is captured into $diagnostics.
    if { diagnostics=$(hermes cron create "$@" 2>&1 1>&3); } 3>&1; then
        echo "  ✓ ${label} registered"
        return 0
    fi

    echo "  ⚠ ${label} may already exist"
    if [[ -n "$diagnostics" ]]; then
        while IFS= read -r line; do
            printf '      hermes: %s\n' "$line" >&2
        done <<<"$diagnostics"
    fi
    return 1
}

# ---- Offer gateway restart (per CONTEXT.md "Specific Ideas") ----
offer_gateway_restart() {
    # Announces completion and restarts the Hermes gateway if the user agrees.
    #
    # Globals: NONINTERACTIVE (read) — any value other than "false" skips the
    #          prompt and takes its default answer (yes), matching -y/--yes.
    # Returns: 0 if the gateway was restarted or the restart was skipped,
    #          1 if `hermes gateway restart` failed.
    local restart=""

    echo ""
    echo "==> Setup complete!"
    echo ""

    if [[ "${NONINTERACTIVE:-false}" != false ]]; then
        echo "  → Non-interactive mode — restarting Hermes gateway without prompting"
    elif ! read -r -p "Restart Hermes gateway now? [Y/n]: " restart; then
        # read failed (EOF / closed stdin). Keep a partial answer if one was
        # typed; otherwise treat the missing confirmation as "no" rather than
        # letting `set -e` abort a setup that has already finished.
        [[ -n "$restart" ]] || restart="n"
    fi

    # Empty input means the default (yes); only y/Y or empty restarts.
    if [[ "$restart" =~ ^[Yy]?$ ]]; then
        if ! hermes gateway restart; then
            echo "  ✗ Gateway restart failed. Run 'hermes gateway restart' manually." >&2
            return 1
        fi
        echo "  → Gateway restarted."
    else
        echo "  → Skipped. Run 'hermes gateway restart' when ready."
    fi
}

# =============================================================================
# Main Execution Block
# =============================================================================
main() {
    # Runs the whole setup in order: prerequisites, secrets, directories,
    # config generation, helper scripts, skill files, cron jobs, and finally
    # the gateway-restart offer.
    #
    # Globals: NONINTERACTIVE (read); JIRA_API_TOKEN, JIRA_EMAIL,
    #          TELEGRAM_BOT_TOKEN, OPENROUTER_API_KEY (written).
    # Arguments: none are read here — argument parsing happens earlier in the
    #            file, before main is invoked.
    # Returns: status of the final step; with `set -e` active, any failing
    #          step other than cron registration aborts the run.
    echo ""
    echo "=== ngn-agent Setup Script ==="
    echo "Embedded file snapshots frozen at: 2026-06-15"
    echo ""

    # [1/14] Prerequisites, then a summary of the resolved paths
    echo "[1/14] Checking prerequisites..."
    check_prerequisites
    print_summary

    # [2/14] Secrets: prompted interactively, or taken from the environment
    echo "[2/14] Collecting secrets..."
    if [[ "$NONINTERACTIVE" == false ]]; then
        JIRA_API_TOKEN=$(prompt_secret "JIRA_API_TOKEN" "JIRA API Token (https://id.atlassian.com/manage/api-tokens): ")
        JIRA_EMAIL=$(prompt_secret "JIRA_EMAIL" "JIRA Email: ")
        TELEGRAM_BOT_TOKEN=$(prompt_secret "TELEGRAM_BOT_TOKEN" "Telegram Bot Token (from @BotFather): ")
        # Third argument "true" presumably marks the secret as optional
        # (blank allowed) — confirm against prompt_secret.
        OPENROUTER_API_KEY=$(prompt_secret "OPENROUTER_API_KEY" "OpenRouter API Key (leave blank to keep existing): " "true")
    else
        echo "  → Non-interactive mode — using environment variables"
        : "${JIRA_API_TOKEN:?JIRA_API_TOKEN not set}"
        : "${JIRA_EMAIL:?JIRA_EMAIL not set}"
        : "${TELEGRAM_BOT_TOKEN:?TELEGRAM_BOT_TOKEN not set}"
        # Optional secret: give it an (empty) value so later steps can
        # reference it under `set -u`, mirroring the interactive path where
        # a blank answer still assigns the variable.
        OPENROUTER_API_KEY="${OPENROUTER_API_KEY:-}"
    fi
    echo "  ✓ Secrets collected"

    # [3/14] Directory layout
    echo "[3/14] Creating directories..."
    create_directories

    # [4/14] Back up whatever config is already present before overwriting
    echo "[4/14] Backing up existing config..."
    backup_config

    # [5/14]–[8/14] Generated configuration files
    echo "[5/14] Generating config.yaml..."
    generate_config_yaml

    echo "[6/14] Generating .env..."
    generate_env_file

    echo "[7/14] Generating hindsight config..."
    generate_hindsight_config

    echo "[8/14] Configuring cron environment..."
    generate_cron_env_config

    # [9/14]–[10/14] Helper scripts
    echo "[9/14] Writing session-init script..."
    write_session_init_script

    echo "[10/14] Writing archive script..."
    write_archive_script

    # [11/14] Skill files (all five writers share one progress step)
    echo "[11/14] Writing skill files..."
    write_jira_skill
    write_aws_skill
    write_confluence_skill
    write_bitbucket_skill
    write_session_skill

    # [12/14] Cron jobs — best effort: a failure here only warns
    echo "[12/14] Registering cron jobs..."
    register_cron_jobs || echo "  ⚠ Cron registration had issues (may already exist)"

    # [13/14]–[14/14] Wrap-up
    echo "[13/14] Setup complete."
    echo "[14/14] Offering gateway restart..."
    offer_gateway_restart
}

# Script entry point: run the full setup, forwarding the command-line arguments.
main "$@"