Files
ngn-agent/ngn-agent/setup-ngn-agent.sh
Bagas Purwa Sentika 5a8c18380e feat(09-tooling-portable-setup-02): implement file/cron setup — scripts, skills, cron jobs, gateway restart
- write_session_init_script: mount verification via shell_init_files (D-10)
- write_archive_script: DRY_RUN=true archive script for stale sessions (D-10)
- write_jira_skill, write_aws_skill, write_confluence_skill, write_bitbucket_skill, write_session_skill: all 5 skills with 2 reference files embedded as heredocs
- register_cron_jobs: 3 cron jobs via hermes cron create (ngn-daily-report, ngn-weekly-stale-summary, ngn-weekly-archive)
- offer_gateway_restart: prompt to restart Hermes gateway at end
- Main execution block [1/14] through [14/14] with progress indicators
- Best-effort error handling for non-critical steps
- D-10 referenced throughout for traceability
2026-06-15 23:30:27 +08:00

1341 lines
44 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# setup-ngn-agent.sh — Portable ngn-agent configuration setup
#
# Phase 9, Plan 2 — recreates all configuration on a fresh macOS machine
# Assumes Hermes v0.16+ is installed (per D-07)
#
# Embedded file snapshots frozen at: 2026-06-15
# Regenerate by re-running this phase.
#
# D-06: Single script recreating all ngn-agent configuration
# D-07: Requires Hermes v0.16+ on PATH
# D-08: Interactive secrets: JIRA_API_TOKEN, JIRA_EMAIL, TELEGRAM_BOT_TOKEN, OPENROUTER_API_KEY
# D-09: Configurable paths via arguments (SSH keys, repos, timezone)
# D-10: Creates/updates: config.yaml, .env, hindsight/config.json, scripts, skills, cron jobs
#
set -euo pipefail
# ---- Usage ----
# Print the command-line help (every option with its default) to stdout.
usage() {
  printf '%s\n' \
    'usage: setup-ngn-agent.sh [OPTIONS]' \
    'Portable ngn-agent configuration setup for macOS + Hermes v0.16+' \
    'Options:' \
    '-s1, --ssh-key-1 PATH SSH private key path 1 (default: ~/.ssh/id_ed25519razer)' \
    '-s2, --ssh-key-2 PATH SSH private key path 2 (default: ~/.ssh/id_rsa)' \
    '-sc, --ssh-config PATH SSH config path (default: ~/.ssh/config)' \
    '-sh, --ssh-known-hosts PATH SSH known_hosts path (default: ~/.ssh/known_hosts)' \
    '-r1, --repo-ops PATH rai-ops repo path (default: ~/Razer/rai-ops)' \
    '-r2, --repo-deploy PATH rai-deployment repo path (default: ~/Razer/rai-deployment)' \
    '-r3, --repo-devtools PATH rai-devtools repo path (default: ~/Razer/rai-devtools)' \
    '-t, --timezone ZONE Timezone (default: Asia/Singapore)' \
    '-d, --docker-image TAG Docker image tag (default: ngn-agent:latest)' \
    '-y, --yes Non-interactive mode (skip prompts, use env vars)' \
    '-h, --help Show this help message' \
    'Secrets are prompted interactively with masked input unless -y is set,' \
    'in which case they are read from environment variables.'
}
# ---- Argument defaults ----
# Every setting keeps a non-empty value already present in the environment and
# otherwise falls back to the built-in default below. Command-line flags parsed
# afterwards override both.
: "${SSH_KEY_1:=$HOME/.ssh/id_ed25519razer}"
: "${SSH_KEY_2:=$HOME/.ssh/id_rsa}"
: "${SSH_CONFIG:=$HOME/.ssh/config}"
: "${SSH_KNOWN_HOSTS:=$HOME/.ssh/known_hosts}"
: "${REPO_OPS:=$HOME/Razer/rai-ops}"
: "${REPO_DEPLOY:=$HOME/Razer/rai-deployment}"
: "${REPO_DEVTOOLS:=$HOME/Razer/rai-devtools}"
: "${TIMEZONE:=Asia/Singapore}"
: "${DOCKER_IMAGE:=ngn-agent:latest}"
# Interactive prompting is the default; -y/--yes flips this to true.
NONINTERACTIVE=false
# ---- Argument parsing (per D-09) ----

# Abort with a readable message when a value-taking option is the last
# argument (otherwise "$2" is unbound under `set -u` and `shift 2` fails).
# Arguments:
#   $1 - the option exactly as typed
#   $2 - number of arguments remaining, including the option itself
require_option_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: option $1 requires a value" >&2
    usage >&2
    exit 1
  fi
}

# Parse command-line options into the configuration globals.
# Globals (written): SSH_KEY_1, SSH_KEY_2, SSH_CONFIG, SSH_KNOWN_HOSTS,
#                    REPO_OPS, REPO_DEPLOY, REPO_DEVTOOLS, TIMEZONE,
#                    DOCKER_IMAGE, NONINTERACTIVE
# Arguments: the script's own "$@"
# Exits:     0 after printing help (-h), 1 on an unknown option or a
#            missing option value (diagnostics go to stderr)
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -s1|--ssh-key-1)
        require_option_value "$1" "$#"; SSH_KEY_1="$2"; shift 2 ;;
      -s2|--ssh-key-2)
        require_option_value "$1" "$#"; SSH_KEY_2="$2"; shift 2 ;;
      -sc|--ssh-config)
        require_option_value "$1" "$#"; SSH_CONFIG="$2"; shift 2 ;;
      -sh|--ssh-known-hosts)
        require_option_value "$1" "$#"; SSH_KNOWN_HOSTS="$2"; shift 2 ;;
      -r1|--repo-ops)
        require_option_value "$1" "$#"; REPO_OPS="$2"; shift 2 ;;
      -r2|--repo-deploy)
        require_option_value "$1" "$#"; REPO_DEPLOY="$2"; shift 2 ;;
      -r3|--repo-devtools)
        require_option_value "$1" "$#"; REPO_DEVTOOLS="$2"; shift 2 ;;
      -t|--timezone)
        require_option_value "$1" "$#"; TIMEZONE="$2"; shift 2 ;;
      -d|--docker-image)
        require_option_value "$1" "$#"; DOCKER_IMAGE="$2"; shift 2 ;;
      -y|--yes)
        NONINTERACTIVE=true; shift ;;
      -h|--help)
        usage; exit 0 ;;
      *)
        echo "Unknown option: $1" >&2
        usage >&2; exit 1 ;;
    esac
  done
}
parse_args "$@"
# The previous inline loop consumed every argument; keep "$@" empty for the
# rest of the script so nothing downstream sees the already-parsed options.
set --
# ---- Interactive secret prompt (per D-08) ----
# T-09-05 mitigation: read -s for masked input, no echo to terminal
#
# Resolve one secret and print ONLY its value on stdout, so the result can be
# captured with $(...). Status messages, warnings and the newline that follows
# the masked prompt are written to stderr.
# Arguments:
#   $1 - name of the variable that may already hold the secret
#   $2 - prompt text shown to the user
#   $3 - "true" when an empty answer is acceptable (default: false)
# Outputs: the secret on stdout (an empty line for an empty optional secret)
# Returns: 0 on success, 1 when stdin is exhausted and a required secret is
#          still empty
prompt_secret() {
  local var_name="$1"
  local prompt_text="$2"
  local is_optional="${3:-false}"
  local val=""

  # If the variable is already set (e.g. the user exported it), skip the prompt.
  if [ -n "${!var_name:-}" ]; then
    echo "${var_name} already set (using environment value)" >&2
    printf '%s\n' "${!var_name}"
    return 0
  fi

  while [ -z "$val" ]; do
    # -r keeps backslashes in the secret intact; -s masks the typed input.
    if ! read -r -s -p "${prompt_text}" val; then
      # EOF / closed stdin: stop instead of re-prompting forever.
      echo >&2
      if [ -z "$val" ] && [ "$is_optional" != "true" ]; then
        echo " ERROR: no input available for ${var_name}" >&2
        return 1
      fi
      break
    fi
    echo >&2
    if [ -z "$val" ]; then
      if [ "$is_optional" = "true" ]; then
        # Optional and empty — hand back an empty value.
        break
      fi
      echo " ⚠ Value cannot be empty. Press Ctrl+C to cancel." >&2
    fi
  done
  printf '%s\n' "$val"
}
# ---- Prerequisite checks ----
# Verify the tools this script depends on and report on the configured paths.
# A missing Hermes CLI or a stopped Docker daemon ends the script with status 1;
# missing SSH files or repo directories only produce a warning line.
# Globals (read): SSH_KEY_1, SSH_KEY_2, SSH_CONFIG, SSH_KNOWN_HOSTS,
#                 REPO_OPS, REPO_DEPLOY, REPO_DEVTOOLS
check_prerequisites() {
  echo " → Checking prerequisites..."

  # 1. Hermes CLI installed (per D-07)
  command -v hermes >/dev/null 2>&1 || {
    echo " ERROR: Hermes CLI not found — install v0.16+ first."
    echo " See: https://github.com/nousresearch/hermes"
    exit 1
  }
  local hermes_version
  hermes_version="$(hermes --version 2>/dev/null || echo 'unknown version')"
  echo " ✓ Hermes CLI found: ${hermes_version}"

  # 2. Docker running
  docker info >/dev/null 2>&1 || {
    echo " ERROR: Docker is not running."
    echo " Start Docker Desktop or Orbstack first."
    exit 1
  }
  echo " ✓ Docker is running"

  local idx

  # 3. SSH files: parallel tables of (label when present, label when missing, path)
  local -a file_ok_labels=("SSH key 1" "SSH key 2" "SSH config" "SSH known_hosts")
  local -a file_missing_labels=("SSH key" "SSH key" "SSH config" "SSH known_hosts")
  local -a file_paths=("$SSH_KEY_1" "$SSH_KEY_2" "$SSH_CONFIG" "$SSH_KNOWN_HOSTS")
  for idx in 0 1 2 3; do
    if [ -f "${file_paths[idx]}" ]; then
      echo " ✓ ${file_ok_labels[idx]}: ${file_paths[idx]}"
    else
      echo " ⚠ ${file_missing_labels[idx]} not found: ${file_paths[idx]}"
    fi
  done

  # 4. Repo directories
  local -a repo_labels=("Repo (ops)" "Repo (deploy)" "Repo (devtools)")
  local -a repo_paths=("$REPO_OPS" "$REPO_DEPLOY" "$REPO_DEVTOOLS")
  for idx in 0 1 2; do
    if [ -d "${repo_paths[idx]}" ]; then
      echo " ✓ ${repo_labels[idx]}: ${repo_paths[idx]}"
    else
      echo " ⚠ Repo not found: ${repo_paths[idx]}"
    fi
  done
}
# ---- Print path summary ----
# Show the effective paths and settings, framed by one blank line above and below.
# Globals (read): SSH_KEY_1, SSH_KEY_2, SSH_CONFIG, SSH_KNOWN_HOSTS,
#                 REPO_OPS, REPO_DEPLOY, REPO_DEVTOOLS, TIMEZONE, DOCKER_IMAGE
print_summary() {
  printf '%s\n' \
    "" \
    " Configuration paths:" \
    " SSH key 1: ${SSH_KEY_1}" \
    " SSH key 2: ${SSH_KEY_2}" \
    " SSH config: ${SSH_CONFIG}" \
    " SSH known_hosts: ${SSH_KNOWN_HOSTS}" \
    " Repo (ops): ${REPO_OPS}" \
    " Repo (deploy): ${REPO_DEPLOY}" \
    " Repo (devtools): ${REPO_DEVTOOLS}" \
    " Timezone: ${TIMEZONE}" \
    " Docker image: ${DOCKER_IMAGE}" \
    ""
}
# ---- Create config directories ----
# Ensure the ~/.hermes sub-directories used by the later steps exist
# (scripts, hindsight, skills/ngn-agent, archive/sessions).
create_directories() {
  local hermes_root="$HOME/.hermes"
  local subdir
  echo " → Creating config directories..."
  for subdir in scripts hindsight skills/ngn-agent archive/sessions; do
    mkdir -p "${hermes_root}/${subdir}"
  done
  echo " ✓ Directories created"
}
# ---- Backup existing config (per Anti-Pattern 4, T-09-07 mitigation) ----
# Copy ~/.hermes/config.yaml to a timestamped sibling
# (config.yaml.bak.YYYYMMDD_HHMMSS) before it is modified. Does nothing
# except print a note when there is no config.yaml yet.
backup_config() {
  local config_file="$HOME/.hermes/config.yaml"
  local stamp bak_file

  if [ -f "$config_file" ]; then
    # Declared and assigned separately so a failing `date` is not masked by `local`.
    stamp="$(date +%Y%m%d_%H%M%S)"
    bak_file="${config_file}.bak.${stamp}"
    cp "$config_file" "$bak_file"
    # ${var##*/} yields the file name without word-splitting, so a $HOME that
    # contains spaces is reported correctly.
    echo " ✓ Backed up config.yaml → ${bak_file##*/}"
  else
    echo " → No existing config.yaml to backup"
  fi
}
# =============================================================================
# Task 2: Config Generation (D-10)
# =============================================================================
# ---- Generate config.yaml ----
# Scalar keys are written with `hermes config set`. The list-valued keys
# (docker_volumes, docker_forward_env, shell_init_files) are written with
# Python + PyYAML when available; otherwise a text block is inserted under the
# existing top-level `terminal:` key.
#
# Globals (read): DOCKER_IMAGE, TIMEZONE, SSH_KEY_1, SSH_KEY_2, SSH_CONFIG,
#                 SSH_KNOWN_HOSTS, REPO_OPS, REPO_DEPLOY, REPO_DEVTOOLS, HOME
# Side effects:   modifies ~/.hermes/config.yaml
# Returns:        non-zero when the fallback cannot rewrite config.yaml
generate_config_yaml() {
  echo " → Generating config.yaml (D-10)..."
  local config_file="$HOME/.hermes/config.yaml"

  # Scalars via hermes config set
  hermes config set terminal.backend docker
  hermes config set terminal.docker_image "${DOCKER_IMAGE}"
  hermes config set terminal.cwd /workspace
  hermes config set terminal.container_memory 5120
  hermes config set terminal.container_disk 51200
  hermes config set terminal.container_cpu 1
  hermes config set terminal.lifetime_seconds 300
  hermes config set memory.provider hindsight
  hermes config set terminal.timezone "${TIMEZONE}"
  hermes config set telegram.reactions false
  hermes config set terminal.docker_env.AWS_REGION us-east-1
  hermes config set terminal.container_persistent true
  hermes config set terminal.docker_mount_cwd_to_workspace true
  echo " ✓ Scalar config keys set"

  # Complex structures (arrays): try Python yaml first, fall back to text insertion
  if python3 -c "import yaml; import os" 2>/dev/null; then
    echo " → Using Python yaml for array structures..."
    # Paths travel through the environment instead of being spliced into the
    # Python source, so quotes or backslashes in a path cannot break (or
    # inject into) the generated program.
    NGN_CONFIG_FILE="$config_file" \
    NGN_SSH_KEY_1="$SSH_KEY_1" \
    NGN_SSH_KEY_2="$SSH_KEY_2" \
    NGN_SSH_CONFIG="$SSH_CONFIG" \
    NGN_SSH_KNOWN_HOSTS="$SSH_KNOWN_HOSTS" \
    NGN_REPO_OPS="$REPO_OPS" \
    NGN_REPO_DEPLOY="$REPO_DEPLOY" \
    NGN_REPO_DEVTOOLS="$REPO_DEVTOOLS" \
    python3 -c '
import os
import yaml

env = os.environ
path = env["NGN_CONFIG_FILE"]
home = os.path.expanduser("~")

with open(path) as f:
    # An empty file parses to None; start from an empty mapping instead.
    config = yaml.safe_load(f) or {}

terminal = config.get("terminal")
if not isinstance(terminal, dict):
    terminal = {}
    config["terminal"] = terminal

terminal["docker_volumes"] = [
    env["NGN_SSH_KEY_1"] + ":/root/.ssh/id_ed25519razer:ro",
    env["NGN_SSH_KEY_2"] + ":/root/.ssh/id_rsa:ro",
    env["NGN_SSH_CONFIG"] + ":/root/.ssh/config:ro",
    env["NGN_SSH_KNOWN_HOSTS"] + ":/root/.ssh/known_hosts:ro",
    home + "/.aws/config:/root/.aws/config:ro",
    home + "/.aws/sso/cache:/root/.aws/sso/cache:rw",
    env["NGN_REPO_OPS"] + ":/workspace/rai-ops:rw",
    env["NGN_REPO_DEPLOY"] + ":/workspace/rai-deployment:rw",
    env["NGN_REPO_DEVTOOLS"] + ":/workspace/rai-devtools:rw",
    home + "/.hermes/scripts:/usr/local/bin:ro",
]
terminal["docker_forward_env"] = ["JIRA_EMAIL", "JIRA_API_TOKEN", "DEFAULT_REPOS"]
terminal["shell_init_files"] = ["/usr/local/bin/session-init.sh"]

with open(path, "w") as f:
    yaml.dump(config, f, default_flow_style=False)
'
    echo " ✓ Array structures set via Python yaml"
  else
    echo " → Python yaml not available, using sed fallback..."
    [ -f "$config_file" ] || : > "$config_file"

    if grep -qE '^[[:space:]]+(docker_volumes|docker_forward_env|shell_init_files):' "$config_file"; then
      # Inserting again would create duplicate keys inside the mapping.
      echo " ⚠ Array keys already present in config.yaml — left untouched (review config.yaml)"
    else
      # Build the block with the variables expanded. Volume entries are
      # double-quoted so spaces or '#' in a path stay part of the value.
      # NOTE(review): assumes children of `terminal:` are indented by two
      # spaces in the existing file — confirm against Hermes' output.
      local yaml_block tmp_file
      yaml_block="$(
        printf '  docker_volumes:\n'
        printf '    - "%s"\n' \
          "${SSH_KEY_1}:/root/.ssh/id_ed25519razer:ro" \
          "${SSH_KEY_2}:/root/.ssh/id_rsa:ro" \
          "${SSH_CONFIG}:/root/.ssh/config:ro" \
          "${SSH_KNOWN_HOSTS}:/root/.ssh/known_hosts:ro" \
          "${HOME}/.aws/config:/root/.aws/config:ro" \
          "${HOME}/.aws/sso/cache:/root/.aws/sso/cache:rw" \
          "${REPO_OPS}:/workspace/rai-ops:rw" \
          "${REPO_DEPLOY}:/workspace/rai-deployment:rw" \
          "${REPO_DEVTOOLS}:/workspace/rai-devtools:rw" \
          "${HOME}/.hermes/scripts:/usr/local/bin:ro"
        printf '  docker_forward_env:\n'
        printf '    - %s\n' JIRA_EMAIL JIRA_API_TOKEN DEFAULT_REPOS
        printf '  shell_init_files:\n'
        printf '    - %s\n' /usr/local/bin/session-init.sh
      )"

      # Insert directly after the existing top-level `terminal:` line so the
      # file never ends up with two `terminal:` keys; append a fresh
      # `terminal:` section only when none exists. The block is passed via
      # ENVIRON because `awk -v` does not accept embedded newlines portably.
      tmp_file="$(mktemp "${config_file}.XXXXXX")"
      if NGN_YAML_BLOCK="$yaml_block" awk '
          { print }
          !inserted && /^terminal:[[:space:]]*$/ {
            print ENVIRON["NGN_YAML_BLOCK"]
            inserted = 1
          }
          END {
            if (!inserted) {
              print "terminal:"
              print ENVIRON["NGN_YAML_BLOCK"]
            }
          }
        ' "$config_file" > "$tmp_file"; then
        # Write back through the original file to keep its permissions.
        cat "$tmp_file" > "$config_file"
        rm -f "$tmp_file"
      else
        rm -f "$tmp_file"
        echo " ERROR: could not update ${config_file}" >&2
        return 1
      fi
      echo " ✓ Array structures set via sed (partial — review config.yaml)"
    fi
  fi

  # Validate
  local img
  img=$(hermes config get terminal.docker_image 2>/dev/null || echo "unset")
  echo " ✓ Verified: terminal.docker_image = ${img}"
}
# ---- Generate .env file ----
# T-09-06 mitigation: the file is created (or reset) with owner-only
# permissions BEFORE any secret is written, so it is never readable by other
# users — not even briefly, and not when an older, wider-permission .env
# already exists.
#
# Globals (read): OPENROUTER_API_KEY, JIRA_API_TOKEN, JIRA_EMAIL,
#                 TELEGRAM_BOT_TOKEN,
#                 HINDSIGHT_LLM_API_KEY  (optional; defaults to OPENROUTER_API_KEY),
#                 TELEGRAM_ALLOWED_USERS (optional; defaults to 474440517)
# Side effects:   overwrites ~/.hermes/.env
generate_env_file() {
  echo " → Generating .env (D-08, D-10)..."
  local env_file="$HOME/.hermes/.env"
  # Resolve HINDSIGHT_LLM_API_KEY — defaults to OPENROUTER_API_KEY if not separately provided
  local hind_key="${HINDSIGHT_LLM_API_KEY:-${OPENROUTER_API_KEY}}"
  # Allow-list of Telegram user IDs; overridable so the script stays portable.
  local allowed_users="${TELEGRAM_ALLOWED_USERS:-474440517}"

  # Truncate/create under a restrictive umask, then force 600 in case the file
  # already existed with looser permissions.
  ( umask 077; : > "$env_file" )
  chmod 600 "$env_file"

  cat > "$env_file" << ENVEOF
# ngn-agent Environment — generated by setup-ngn-agent.sh
# Embedded file snapshots frozen at: 2026-06-15
# =============================================================================
# LLM PROVIDER (OpenRouter)
# =============================================================================
OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
# =============================================================================
# ATLASSIAN INTEGRATION
# =============================================================================
JIRA_API_TOKEN=${JIRA_API_TOKEN}
JIRA_EMAIL=${JIRA_EMAIL}
# =============================================================================
# TELEGRAM GATEWAY
# =============================================================================
TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
TELEGRAM_ALLOWED_USERS=${allowed_users}
# =============================================================================
# HINDSIGHT MEMORY
# =============================================================================
HINDSIGHT_LLM_API_KEY=${hind_key}
# =============================================================================
# NGN-AGENT CONFIG
# =============================================================================
DEFAULT_REPOS=rai-ops,rai-deployment,rai-devtools
TERMINAL_TIMEOUT=60
TERMINAL_LIFETIME_SECONDS=300
ENVEOF
  # T-09-06: re-assert permissions after writing (defence in depth)
  chmod 600 "$env_file"
  echo " ✓ .env written with chmod 600 (T-09-06)"
}
# ---- Generate hindsight config.json (D-10) ----
# Write the fixed Hindsight memory settings to ~/.hermes/hindsight/config.json,
# replacing any existing file. The JSON is emitted verbatim (no expansion).
generate_hindsight_config() {
  local json_path="$HOME/.hermes/hindsight/config.json"
  echo " → Generating hindsight/config.json (D-10)..."
  cat << 'JSONEOF' > "$json_path"
{
"mode": "local_embedded",
"llm_provider": "openrouter",
"llm_base_url": "https://openrouter.ai/api/v1",
"llm_model": "qwen/qwen3.5-9b",
"bank_id": "hermes",
"recall_budget": "low",
"recall_prefetch_method": "recall",
"auto_recall": true,
"recall_types": "observation",
"auto_retain": true,
"retain_async": true,
"retain_every_n_turns": 5,
"memory_mode": "hybrid"
}
JSONEOF
  echo " ✓ hindsight/config.json written"
}
# ---- Generate cron env config (D-10) ----
# Copy the Jira credentials into Hermes' cron environment (cron.env.*).
# Best-effort: a failing `hermes config set` produces a warning and the
# function still returns 0, but the success line is printed only when every
# variable was actually stored.
# Globals (read): JIRA_EMAIL, JIRA_API_TOKEN
generate_cron_env_config() {
  echo " → Configuring cron environment (D-10)..."
  local name
  local failed=0

  for name in JIRA_EMAIL JIRA_API_TOKEN; do
    # ${!name} reads the variable whose name is stored in $name.
    if ! hermes config set "cron.env.${name}" "${!name}" 2>/dev/null; then
      echo " ⚠ Could not set cron.env.${name}"
      failed=$((failed + 1))
    fi
  done

  if [ "$failed" -eq 0 ]; then
    echo " ✓ Cron env vars configured"
  else
    echo " ⚠ Cron env incomplete: ${failed} of 2 variable(s) not stored — set them manually with 'hermes config set'"
  fi
}
# =============================================================================
# Task 3: File/Cron Setup (D-10)
# =============================================================================
# ---- Write session-init.sh (D-10) ----
# Install ~/.hermes/scripts/session-init.sh and mark it executable. The
# embedded script is written verbatim (quoted heredoc, nothing is expanded
# at install time).
write_session_init_script() {
  local init_script="$HOME/.hermes/scripts/session-init.sh"
  echo " → Writing session-init.sh (D-10)..."
  cat << 'SCRIPT' > "$init_script"
#!/bin/bash
# session-init.sh — Verify DEFAULT_REPOS mounts at session start
# Runs via shell_init_files before agent prompt. Non-blocking.
# Reads DEFAULT_REPOS from environment (forwarded via docker_forward_env).
set -uo pipefail
DEFAULT_REPOS="${DEFAULT_REPOS:-}"
if [ -z "$DEFAULT_REPOS" ]; then
echo "[session-init] DEFAULT_REPOS not set — skipping verification"
exit 0
fi
# Split comma-separated list
IFS=',' read -ra REPOS <<< "$DEFAULT_REPOS"
ALL_OK=true
for repo in "${REPOS[@]}"; do
# Trim whitespace
repo="${repo#"${repo%%[![:space:]]*}"}"
repo="${repo%"${repo##*[![:space:]]}"}"
if [ -d "/workspace/$repo/.git" ]; then
echo "[session-init] ✓ $repo — mounted at /workspace/$repo"
else
echo "[session-init] ⚠ $repo — NOT FOUND at /workspace/$repo"
ALL_OK=false
fi
done
if [ "$ALL_OK" = true ]; then
echo "[session-init] All DEFAULT_REPOS verified"
else
echo "[session-init] Some repos missing — check docker_volumes in config.yaml"
fi
exit 0 # always exit cleanly — non-blocking
SCRIPT
  chmod +x "$init_script"
  echo " ✓ session-init.sh written and executable"
}
# ---- Write archive-stale-sessions.sh (D-10) ----
# Install ~/.hermes/scripts/archive-stale-sessions.sh and mark it executable.
# The generated script defaults to dry-run (export only). Because this file is
# regenerated on every setup run, pruning is enabled through the environment
# (DRY_RUN=false) rather than by editing the generated script.
write_archive_script() {
  local archive_script="$HOME/.hermes/scripts/archive-stale-sessions.sh"
  echo " → Writing archive-stale-sessions.sh (D-10)..."
  cat > "$archive_script" << 'SCRIPT'
#!/bin/bash
# Archive stale sessions (inactive >30 days) and prune from live DB
# This script runs via hermes cron with --no-agent
# Stdout is delivered to Telegram via --deliver telegram
# Dry-run mode (default): export only, no prune — safe default for first run.
# Run with DRY_RUN=false in the environment to enable pruning.
set -euo pipefail
DRY_RUN="${DRY_RUN:-true}"
ARCHIVE_DIR="$HOME/.hermes/archive/sessions"
mkdir -p "$ARCHIVE_DIR"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
OUTPUT_FILE="$ARCHIVE_DIR/sessions-${TIMESTAMP}.jsonl"
echo "=== Stale Session Archive ==="
echo "Started: $(date)"
echo "Dry run: $DRY_RUN"
echo ""
echo "[1/3] Exporting session store..."
echo " Output: $OUTPUT_FILE"
hermes sessions export "$OUTPUT_FILE"
echo " -> $(wc -l < "$OUTPUT_FILE") sessions exported"
echo " -> Size: $(du -h "$OUTPUT_FILE" | cut -f1)"
echo ""
if [ "$DRY_RUN" = false ]; then
echo "[2/3] Pruning sessions older than 30 days..."
hermes sessions prune --older-than 30 --yes
echo " Done."
else
echo "[2/3] SKIPPED (dry run) — set DRY_RUN=false to enable prune"
echo " Review $OUTPUT_FILE before enabling."
fi
echo ""
echo "[3/3] Post-archive stats:"
hermes sessions stats
echo ""
echo "✓ Archive complete."
SCRIPT
  chmod +x "$archive_script"
  echo " ✓ archive-stale-sessions.sh written and executable"
}
# ---- Write skill files (D-10) ----
# Install the Jira query skill at ~/.hermes/skills/ngn-agent/jira/SKILL.md.
# The markdown is written verbatim (quoted heredoc, no expansion).
write_jira_skill() {
  local skill_dir="$HOME/.hermes/skills/ngn-agent/jira"
  mkdir -p "$skill_dir"
  cat << 'SKILL' > "${skill_dir}/SKILL.md"
---
name: jira-query
description: Query Jira Cloud issues, search, and manage tickets
metadata:
hermes:
tags: [jira, project-management]
category: devops
requires_toolsets: [terminal]
version: 1.0.0
---
# Jira Cloud Query
## When to Use
When the user asks to search Jira issues, check ticket status, or list project work.
## Procedure
### 1. Search issues by JQL
```bash
ngn-jira GET '/rest/api/3/search?jql=ORDER BY created DESC&maxResults=10'
```
For specific project:
```bash
ngn-jira GET '/rest/api/3/search?jql=project=PROJ ORDER BY created DESC&maxResults=10'
```
### 2. Get issue details
```bash
ngn-jira GET '/rest/api/3/issue/PROJ-123'
```
### 3. List sprints (if Jira Software)
```bash
ngn-jira GET '/rest/agile/1.0/board'
ngn-jira GET '/rest/agile/1.0/board/{boardId}/sprint?state=active'
```
### 4. Get issue comments
```bash
ngn-jira GET '/rest/api/3/issue/PROJ-123/comment'
```
## Pitfalls
- JQL is case-sensitive for field names
- maxResults defaults to 50; set explicitly for large queries
- Agile REST API may not be available on all plans
## Required Environment
- `JIRA_EMAIL` — your Atlassian account email
- `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
SKILL
  echo " ✓ jira/SKILL.md written"
}
# Install the AWS diagnostics skill and its reference document under
# ~/.hermes/skills/ngn-agent/aws-diagnostics/. Both files are written verbatim
# (quoted heredocs, no expansion).
# The profile list points at `~/.aws/config`: the setup mounts the host AWS
# config at /root/.aws/config inside the container, so the former
# `/.aws/config` referred to a path that does not exist.
write_aws_skill() {
  local skill_dir="$HOME/.hermes/skills/ngn-agent/aws-diagnostics"
  mkdir -p "${skill_dir}/references"
  cat > "${skill_dir}/SKILL.md" << 'SKILL'
---
name: aws-diagnostics
description: Read-only AWS diagnostics for platform engineering
metadata:
hermes:
tags: [aws, diagnostics, platform-engineering]
category: devops
requires_toolsets: [terminal]
version: 1.0.0
---
# AWS Diagnostics
## When to Use
When the user asks to check AWS resources, investigate issues, or audit infrastructure in any account.
## Important
- ALWAYS determine the correct AWS_PROFILE before running commands
- NEVER run mutating AWS commands (delete, terminate, stop, modify)
- Prefer read-only AWS CLI commands (describe, list, get)
## Procedure
### 1. Identify the target account
Ask the user which account/environment they want to target. Available profiles in `~/.aws/config`:
- `rzaws-sw-rai-ava-dev/prod/rc` — AVA service
- `rzaws-sw-rai-cs-dev/prod/rc` — CS service
- `rzaws-sw-rai-qac-dev/prod` — QAC
- `rzaws-sw-rai-ops` — Ops account
- `rzaws-sw-rai-voicekit-dev/prod/rc` — VoiceKit
- `rzaws-sw-rai-preprod` — Pre-production
- `rzaws-sw-rai-nonprod` — Non-production
### 2. Set the profile
```bash
export AWS_PROFILE=rzaws-sw-rai-<service>-<env>
```
### 3. Diagnostic commands
**EC2 instances:**
```bash
aws ec2 describe-instances --query 'Reservations[*].Instances[*].[InstanceId,State.Name,InstanceType,Tags[?Key==`Name`].Value|[0]]' --output table
```
**ECS services:**
```bash
aws ecs list-clusters && aws ecs list-services --cluster <name>
```
**S3 buckets:**
```bash
aws s3 ls
```
**CloudWatch alarms:**
```bash
aws cloudwatch describe-alarms --state-value ALARM --output table
```
**ECS task health:**
```bash
aws ecs describe-tasks --cluster <name> --tasks <task-ids>
```
**RDS instances:**
```bash
aws rds describe-db-instances --query 'DBInstances[*].[DBInstanceIdentifier,DBInstanceStatus,Engine,DBInstanceClass]' --output table
```
**Lambda functions:**
```bash
aws lambda list-functions --query 'Functions[*].[FunctionName,Runtime,LastModified]' --output table
```
**ELB target group health:**
```bash
aws elbv2 describe-target-groups --query 'TargetGroups[*].[TargetGroupName,TargetType]' --output table
```
### 4. Report findings
Format as a concise table. Include account ID and profile used.
## Alternative: Infrastructure Code Analysis
When AWS CLI access is unavailable (Docker containers, credential issues), examine existing infrastructure code instead:
```bash
# Search for region patterns
search_files --pattern="us-west-2" --path="/workspace"
# Check terraform configurations
read_file /workspace/rai-ops/aws/<account>/us-east-1/app/main.tf
read_file /workspace/rai-ops/aws/<account>/us-east-1/app/<app>.tfvars
# Look for provider configurations
search_files --pattern="provider.*replica" --path="/workspace/rai-ops"
# Check S3 migration data
search_files --pattern="s3-mapping" --target="files"
```
**When to use code analysis:**
- Docker container with read-only filesystem
- Missing AWS CLI or credentials
- Need to understand intended architecture vs live state
- Investigating multiregional setup patterns
## Pitfalls
- SSO tokens expire (~6-8h). If you get auth errors, ask the user to run `aws sso login`
- Some accounts may not have all services — check `aws sts get-caller-identity` first
- Don't pipe large results directly — use `--query` and `--output table` for readability
- **Don't persist with CLI installation in constrained environments** — switch to code analysis quickly when installation fails
## Verification
Run `aws sts get-caller-identity` to confirm the correct profile is active before running diagnostics.
## References
- `references/multiregional-patterns.md` - Terraform patterns for cross-region infrastructure setup
SKILL
  # AWS reference file
  cat > "${skill_dir}/references/multiregional-patterns.md" << 'REF'
# Multiregional Infrastructure Patterns
## AVA Multiregional Setup
### Provider Configuration Pattern
```hcl
# Primary provider (us-east-1)
provider "aws" {
region = var.region
# ... assume_role block
}
# Replica provider (us-west-2)
provider "aws" {
alias = "replica"
region = "us-west-2"
# ... same assume_role block
}
```
### Module Consumption
```hcl
module "app" {
providers = {
aws = aws
aws.replica = aws.replica # Required by tf-modules/app/versions.tf
}
# ... other config
}
```
### Database Replication Options
**RDS Aurora PostgreSQL (Current Pattern)**
- Engine: `aurora-postgresql`
- Version: `16.11`
- Cross-region read replicas supported
- Can promote replica for DR scenarios
**DynamoDB Global Tables (Available)**
- Global Table v2 with us-east-1 + us-west-2 replicas
- Per-region CMKs for encryption
- Feature-flagged via `var.tenant_registry`
- Documented in RAID-352
### S3 Cross-Region Replication
Extensive existing pattern from migration data:
- `ava-{env}-west-*` buckets in us-west-2
- Matching `rai-s3-usw2-*` naming convention
- Covers: bug reports, screenshots, game logs, shiny moments
### Key Files for Multiregional Analysis
- `aws/<account>/us-east-1/app/provider.tf` - Replica provider config
- `aws/<account>/us-east-1/app/<app>.tfvars` - App-specific resources
- `raid-migration/raid-s3-migration/s3-mapping.csv` - Cross-region S3 inventory
- `RAID-352-PR-DESCRIPTION.md` - DynamoDB Global Tables documentation
REF
  echo " ✓ aws-diagnostics/SKILL.md + references written"
}
# Install the Confluence search skill at
# ~/.hermes/skills/ngn-agent/confluence/SKILL.md. The markdown is written
# verbatim (quoted heredoc, no expansion).
write_confluence_skill() {
  local skill_dir="$HOME/.hermes/skills/ngn-agent/confluence"
  mkdir -p "$skill_dir"
  cat << 'SKILL' > "${skill_dir}/SKILL.md"
---
name: confluence-search
description: Search and retrieve Confluence pages
metadata:
hermes:
tags: [confluence, documentation]
category: devops
requires_toolsets: [terminal]
version: 1.0.0
---
# Confluence Search
## When to Use
When the user asks to find documentation, search Confluence pages, or retrieve page content.
## Procedure
### 1. Search pages by text
```bash
ngn-confluence GET '/rest/api/search?cql=text~"search terms"&limit=10'
```
### 2. Search by space
```bash
ngn-confluence GET '/rest/api/search?cql=space=ADM&limit=10'
```
### 3. Get page content
```bash
ngn-confluence GET '/rest/api/content/{pageId}?expand=body.storage'
```
### 4. List pages in space
```bash
ngn-confluence GET '/rest/api/content?spaceKey=ADM&limit=50'
```
### 5. Get page children
```bash
ngn-confluence GET '/rest/api/content/{pageId}/child/page?limit=50'
```
## Pitfalls
- CQL is different from JQL — `text~"query"` for full-text search
- Page body needs `expand=body.storage` to retrieve content
- Use `limit` parameter — defaults to 25
## Required Environment
- `JIRA_EMAIL` — your Atlassian account email
- `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
SKILL
  echo " ✓ confluence/SKILL.md written"
}
# Install the Bitbucket pull-request skill at
# ~/.hermes/skills/ngn-agent/bitbucket/SKILL.md. The markdown is written
# verbatim (quoted heredoc, no expansion).
write_bitbucket_skill() {
  local skill_dir="$HOME/.hermes/skills/ngn-agent/bitbucket"
  mkdir -p "$skill_dir"
  cat << 'SKILL' > "${skill_dir}/SKILL.md"
---
name: bitbucket-pr
description: Review Bitbucket pull requests and repositories
metadata:
hermes:
tags: [bitbucket, git, code-review]
category: devops
requires_toolsets: [terminal]
version: 1.0.0
---
# Bitbucket Pull Requests
## When to Use
When the user asks to check PRs, review code, or list repositories.
## Procedure
### 1. List repositories
```bash
ngn-bitbucket GET '/repositories/razersw?pagelen=20'
```
### 2. List open PRs for a repo
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests?state=OPEN&pagelen=20'
```
### 3. Get PR details
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}'
```
### 4. Get PR diff
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}/diff'
```
### 5. Get PR comments
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/pullrequests/{prId}/comments'
```
### 6. List branch list
```bash
ngn-bitbucket GET '/repositories/razersw/{repo}/refs/branches?pagelen=20'
```
## Pitfalls
- Bitbucket pagination uses `pagelen` and `page` params (not `maxResults`)
- Diff endpoint returns raw diff text — may be large
- PR comments include inline code comments, not just summary
## Required Environment
- `JIRA_EMAIL` — your Atlassian account email
- `JIRA_API_TOKEN` — from https://id.atlassian.com/manage/api-tokens
SKILL
  echo " ✓ bitbucket/SKILL.md written"
}
write_session_skill() {
mkdir -p "$HOME/.hermes/skills/ngn-agent/session/references"
cat > "$HOME/.hermes/skills/ngn-agent/session/SKILL.md" << 'SKILL'
---
name: session
description: Main ngn-agent session lifecycle — init, work, close
metadata:
hermes:
tags: [ngn-agent, platform-engineering, session]
category: devops
requires_toolsets: [terminal]
version: 1.0.0
---
# ngn-agent Session Lifecycle
## When to Use
Load this skill at the START of EVERY platform engineering session, before any other work. This skill defines the standard session workflow.
Specific triggers:
- When the user starts any infrastructure or platform engineering task
- When the user asks to create a Jira ticket or find a ticket
- When the user wants to search or load Confluence documentation
- When a session is ending and you need to document progress
- When you need to save context for future sessions
## Important
- **Keep this skill loaded for the entire session** — if context grows large, reload via `skill_view("session")` before the session-end steps (Steps 57)
- **Never create Jira tickets without asking the user first** (D-02)
- **Never update Confluence without asking the user first** (D-11)
- **Always save session summary to hindsight at end** — this step has no user prompt, it is automatic (D-12)
- User must confirm before any Jira mutation (create, comment, transition) — D-08
- Repos are already mounted at `/workspace/` from Phase 6 (rai-ops, rai-deployment, rai-devtools)
- This skill replaces the ad-hoc session workflow with a repeatable init→work→close pattern
## Procedure
### 1. Check for Similar Previous Sessions
At the very start of a session, use `hindsight_recall` with a query describing the user's current task to find similar sessions from the last 2 weeks.
Call `hindsight_recall` with a budget of low:
```
Tool: hindsight_recall
Query: "<user's task description>"
Budget: low
```
Present any matches to the user in this format:
```
Found [N] similar sessions from the last 2 weeks:
1. [Session Title] — [Date] — [one-line summary]
2. [Session Title] — [Date] — [one-line summary]
```
Ask the user: "Would you like to resume any of these sessions, or start fresh?"
- If they choose to resume: load that session's context and continue
- If they choose fresh: proceed to step 2
If no similar sessions are found (normal for first sessions), proceed to step 2.
### 2. Prompt: Create Jira Ticket
Ask the user: "Would you like to create a Jira Task ticket for this session?"
If YES:
1. Ask which Jira project to use (e.g., "PLATFORM", "DEVOPS") — do not hardcode (D-06)
2. Check hindsight for cached epics:
```
Tool: hindsight_recall
Query: "jira epics cached"
Budget: low
```
3. If epics are cached, check the cache timestamp:
- If the cache is more than 24 hours old OR the user says the list looks wrong, refresh from Jira:
```bash
ngn-jira GET '/rest/api/3/search?jql=issuetype=Epic AND project=<PROJECT>&fields=summary,id&maxResults=50'
```
Save fresh epics to hindsight:
```
Tool: hindsight_retain
tier: "epic-cache"
content: "Epic Cache [<date>]: PROJECT=<PROJECT>: [EPIC-KEY-1: Summary, EPIC-KEY-2: Summary, ...]"
```
- If the cache is fresh (less than 24 hours old), use the cached list
4. If no cached epics found, query Jira for current epics:
```bash
ngn-jira GET '/rest/api/3/search?jql=issuetype=Epic AND project=<PROJECT>&fields=summary,id&maxResults=50'
```
Save to hindsight for future sessions:
```
Tool: hindsight_retain
tier: "epic-cache"
content: "Epic Cache [<date>]: PROJECT=<PROJECT>: [EPIC-KEY-1: Summary, ...]"
```
5. Present cached/refreshed epics to the user: "Available epics: [list]. Would you like to set a parent epic?"
6. If user selects an epic, include it as parent when creating the ticket
7. Create the Task via Jira REST API:
```bash
ngn-jira POST '/rest/api/3/issue' --body '{
"fields": {
"project": {"key": "<PROJECT>"},
"summary": "<session task description>",
"issuetype": {"name": "Task"},
"parent": {"key": "<EPIC_KEY>"}
}
}'
```
8. Note the ticket key (e.g., `PLATFORM-123`) — save it for session-end steps (Step 5)
If NO: proceed to step 3 (no Jira ticket this session)
### 3. Prompt: Load Confluence Documentation
Ask the user: "Would you like to load relevant Confluence documentation?"
If YES:
1. Search by the `ngn-agent` tag:
```bash
ngn-confluence GET '/rest/api/search?cql=tag="ngn-agent"&limit=20'
```
2. Present matching pages to the user:
```
Found [N] pages tagged 'ngn-agent':
- [Title] — [Space] — [Last Modified]
```
3. Ask: "Which pages would you like me to load?"
4. For each selected page, load its full content:
```bash
ngn-confluence GET '/rest/api/content/{pageId}?expand=body.storage'
```
5. Review the loaded content with the user
If NO: proceed to step 4
### 4. Work Phase
Repos are already mounted at `/workspace/` (rai-ops, rai-deployment, rai-devtools). Proceed with the task using standard Hermes tools.
If you need to clone additional repos:
```bash
git clone git@bitbucket.org:razersw/<repo>.git /workspace/<repo>
```
The session skill remains loaded for the session-end steps below. If the skill is evicted from context during a long session, reload it with `skill_view("session")` before proceeding to Steps 57.
### 5. Session-End: Update Jira
When the user indicates work is complete or the session wraps up:
Ask the user: "Would you like me to update the Jira ticket with a summary comment?"
If YES (and a ticket was created in Step 2):
```bash
ngn-jira POST '/rest/api/3/issue/<TICKET-KEY>/comment' --body '{
"body": "<summary of work done, key decisions, next steps>"
}'
```
If NO: proceed without updating Jira.
**Important (D-08):** Do NOT transition tickets (e.g., close, resolve, move to Done) without explicit user confirmation. Only add comments unless the user specifically asks for a status change.
### 6. Session-End: Update Confluence
Ask the user: "Would you like me to create or update a Confluence page documenting this session?"
If YES:
- For a new page:
```bash
ngn-confluence POST '/rest/api/content' --body '{
"type": "page",
"title": "<Session Date>: <Task Description>",
"space": {"key": "<SPACE_KEY>"},
"body": {
"storage": {
"value": "<h1>Session Summary</h1><p><task summary, key decisions, outcomes></p>",
"representation": "storage"
}
},
"metadata": {
"properties": {
"content-appearance": {"value": "page"}
}
},
"labels": [{"name": "ngn-agent"}]
}'
```
- For updating an existing page: ask the user which page to update, then PUT to update its content
- **Important (D-11):** Do NOT create or update any Confluence page without the user confirming first
If NO: proceed without updating Confluence.
### 7. Session-End: Save to Hindsight (Automatic — No Prompt)
ALWAYS save a session summary to hindsight memory. Do NOT ask the user — this step is automatic and unconditional (D-12).
```bash
Tool: hindsight_retain
tier: "session-summary"
content: "
Session Summary
===============
Date: <today>
Task: <task description>
Repos: <repos worked on>
Jira: <ticket key or \"none\">
Key Decisions:
- <decision 1>
- <decision 2>
Outcomes:
- <outcome 1>
- <outcome 2>
Next Steps:
- <next step 1>
"
```
This summary allows future `hindsight_recall` queries to find this session for similarity matching (D-13). The structured content includes: date, task description, repos worked on, Jira ticket reference (or "none"), key decisions, outcomes, and next steps.
## Pitfalls
- **Skill not loaded at session start:** If you find yourself midway through a session without having run Steps 1–3, you missed the session start workflow. Run Step 1 (hindsight_recall) retroactively and ask the user if they want to create a Jira ticket or load Confluence docs. For future sessions, make sure to load this skill at the very start.
- **Epic cache too old:** Epics may change between sessions. Check the cache timestamp and refresh if more than 24 hours old. If the user says "that's wrong," always refresh regardless of age.
- **Confluence tag mismatch:** If the `ngn-agent` tag returns no results, try `platform-engineering` as a fallback, or ask the user what tag they use for session documentation.
- **Jira project doesn't exist:** If the create ticket call fails with a 404, the project key may be wrong. Ask the user to confirm the correct project key.
- **Empty hindsight recall (first sessions):** The first few sessions will have no similar sessions to find. That is normal — proceed with a fresh session. Over time, hindsight will accumulate session summaries.
- **Long sessions may evict this skill:** If the conversation grows long, the session skill content may be evicted from the agent's context. Reload it with `skill_view("session")` before the session-end steps (Steps 5–7) to ensure the Jira/Confluence prompts and hindsight save are not missed.
- **Missing Jira credentials in cron jobs:** The ngn-jira tool requires both `JIRA_EMAIL` and `JIRA_API_TOKEN` environment variables. If either is missing, Jira operations will fail with "unbound variable" errors. Check environment setup before attempting Jira updates in automated workflows.
## Operational Automation
### Daily Session Monitoring (Cron Job)
When running as a scheduled cron job for operational monitoring:
1. **Discover Active Sessions**:
```bash
hermes sessions export - # NOT 'hermes sessions list' - no --json flag available
```
Parse JSONL output with Python to find sessions with `last_active` within last 7 days
2. **Find Associated Jira Tickets**:
- Use `hindsight_recall` with query 'session summary jira' for each active session
- Search session messages for Jira patterns: `PLATFORM-\d+`, `AIOPS-\d+`, `RAID-\d+`, etc.
- Note: One session may have multiple Jira tickets (1-to-many mapping)
3. **Update Jira with Progress**:
```bash
ngn-jira POST '/rest/api/3/issue/<KEY>/comment' --body '{
"body": "Session activity update — Date: <today>, Last active: <last_active>. Session: <session_id>. Progress: See session transcript for details."
}'
```
4. **Generate Telegram Report**:
- Structure: Active Sessions + Jira Updated + Issues/Summary
- Keep under 4096 character limit
- Format with emoji sections for clarity
**Environment Requirements for Operational Jobs**:
- `JIRA_EMAIL` — Required for ngn-jira authentication
- `JIRA_API_TOKEN` — API token from Atlassian account
- Both must be set or Jira updates will fail
See `references/operational-monitoring.md` for detailed patterns, templates, and troubleshooting.
**Important Constraints (Cron Mode)**:
- DO NOT transition ticket statuses (D-05) - only add comments
- DO NOT update stale sessions (D-15) - only active within 7 days
- Use silent mode `[SILENT]` if no active sessions found
## Verification
1. On session start, agent checks for similar sessions via `hindsight_recall` ✓
2. Jira Task ticket created (or user declined) ✓
3. Confluence docs loaded by `ngn-agent` tag search (or user declined) ✓
4. At session end, user prompted for Jira update ✓
5. At session end, user prompted for Confluence update ✓
6. Session summary automatically saved to hindsight via `hindsight_retain` (no prompt) ✓
7. **Operational cron jobs can discover active sessions and update Jira tickets** ✓
SKILL
# Session reference file
cat > "$HOME/.hermes/skills/ngn-agent/session/references/operational-monitoring.md" << 'REF'
# Operational Session Monitoring
## Jira Ticket Pattern Detection
When scanning session content for associated Jira tickets, search for these patterns:
```python
jira_patterns = [
r'(PLATFORM-\d+)', # Platform engineering tickets
r'(AIOPS-\d+)', # AI Operations tickets
r'(RAID-\d+)', # RAID project tickets
r'(DEVOPS-\d+)', # DevOps tickets
r'(QAC-\d+)' # QAC tickets
]
```
## Session Export vs List Commands
**CORRECT**: `hermes sessions export -`
- Returns machine-readable JSONL format
- Each line is a complete session object
- Includes `last_active` timestamps for filtering
**INCORRECT**: `hermes sessions list --json`
- The `--json` flag does not exist (Pitfall from RESEARCH.md)
- Use export for automation, list for human viewing only
## Environment Variable Requirements
The `ngn-jira` tool wrapper expects:
- `JIRA_EMAIL` - Atlassian account email
- `JIRA_API_TOKEN` - From https://id.atlassian.com/manage/api-tokens
Missing either variable causes: `bash: line 10: JIRA_EMAIL: unbound variable`
## Telegram Report Template
```
📋 **ACTIVE SESSIONS** — {date}
🔹 **{session_id}**
Title: {title}
Last Active: {timestamp}
Jira: {ticket_keys or "None"}
🔄 **JIRA UPDATED**: {list of updated ticket keys}
❌ **ISSUES**: {any operational problems}
📊 **SUMMARY**: {count} active sessions found, {count} with Jira tickets
```
Character limit: 4096 for Telegram delivery
REF
echo " ✓ session/SKILL.md + references written"
}
# ---- Register cron jobs (D-10) ----

#######################################
# Create a single Hermes cron job and report the outcome.
# Arguments:
#   $1   - job label used only in progress messages
#   $2.. - arguments passed verbatim to `hermes cron create`
# Outputs:
#   progress lines (and hermes' own stdout) on stdout;
#   hermes' captured stderr on stderr when creation fails
# Returns:
#   0 if `hermes cron create` succeeded, 1 otherwise
#######################################
_register_cron_job() {
  local job_label=$1
  shift
  local err_output=""

  echo " → Creating ${job_label}..."

  # fd 3 is a copy of our stdout: hermes' stdout still reaches the user
  # while only its stderr is captured into err_output.
  if { err_output=$(hermes cron create "$@" 2>&1 1>&3 3>&-); } 3>&1; then
    echo " ✓ ${job_label} registered"
    return 0
  fi

  echo " ⚠ ${job_label} may already exist"
  # Show the real reason instead of discarding it, so genuine failures
  # are not mistaken for "already exists".
  if [[ -n "$err_output" ]]; then
    printf '   hermes: %s\n' "$err_output" >&2
  fi
  return 1
}

#######################################
# Register the three ngn-agent cron jobs via `hermes cron create`.
# Every job is attempted even if an earlier one fails (best effort).
# Outputs:
#   progress lines on stdout; failure details on stderr
# Returns:
#   0 if all jobs were registered, 1 if at least one could not be created
#######################################
register_cron_jobs() {
  local failed=0

  echo " → Registering cron jobs (D-10)..."

  # 1. ngn-daily-report (daily at 09:00 SGT)
  _register_cron_job "ngn-daily-report" \
    --deliver telegram --skill session --skill jira-query \
    '0 9 * * *' \
    'Daily session report. Export sessions, find active ones, check Jira, compose Telegram summary.' \
    || failed=$((failed + 1))

  # 2. ngn-weekly-stale-summary (Sunday 20:00 SGT)
  _register_cron_job "ngn-weekly-stale-summary" \
    --deliver telegram --skill session \
    '0 20 * * 0' \
    'Weekly stale session summary. Review sessions inactive >30 days, compose Telegram summary.' \
    || failed=$((failed + 1))

  # 3. ngn-weekly-archive (Sunday 20:05 SGT — 5 min after summary, per D-10)
  _register_cron_job "ngn-weekly-archive" \
    --no-agent --script archive-stale-sessions.sh \
    '5 20 * * 0' \
    || failed=$((failed + 1))

  if (( failed > 0 )); then
    return 1
  fi
  return 0
}
# ---- Offer gateway restart (per CONTEXT.md "Specific Ideas") ----

#######################################
# Announce completion and ask whether to restart the Hermes gateway.
# An empty answer or y/Y restarts; any other answer skips.
# If no answer can be read at all (stdin closed / EOF), the restart is
# skipped instead of aborting the script or restarting unprompted.
# Outputs:
#   status lines on stdout; a warning on stderr if the restart fails
# Returns:
#   0 when the restart succeeded or was skipped;
#   the exit status of `hermes gateway restart` when it failed
#######################################
offer_gateway_restart() {
  local restart=""
  local rc=0

  echo ""
  echo "==> Setup complete!"
  echo ""

  # read fails on EOF; used as an `if` condition so `set -e` does not kill
  # the script after a successful setup.
  if ! read -r -p "Restart Hermes gateway now? [Y/n]: " restart; then
    if [[ -z "$restart" ]]; then
      echo ""
      echo " → No input available; skipped. Run 'hermes gateway restart' when ready."
      return 0
    fi
  fi

  if [[ "$restart" =~ ^[Yy]?$ ]]; then
    hermes gateway restart || rc=$?
    if (( rc == 0 )); then
      echo " → Gateway restarted."
    else
      echo " ⚠ Gateway restart failed (exit ${rc}). Run 'hermes gateway restart' manually." >&2
      return "$rc"
    fi
  else
    echo " → Skipped. Run 'hermes gateway restart' when ready."
  fi
}
# =============================================================================
# Main Execution Block
# =============================================================================

#######################################
# Run the whole setup in order, printing an [n/14] marker before each step.
# Globals:
#   NONINTERACTIVE (read)  - "false" prompts for secrets; any other value
#                            takes them from the environment
#   JIRA_API_TOKEN, JIRA_EMAIL, TELEGRAM_BOT_TOKEN, OPENROUTER_API_KEY
#                            (written when prompting; validated/defaulted
#                            in non-interactive mode)
# Arguments:
#   none used here — argument parsing happens before main is invoked
# Returns:
#   non-zero (via `set -e` / `:?`) if a required secret is missing or a
#   step fails; cron registration is best-effort and only warns
#######################################
main() {
  echo ""
  echo "=== ngn-agent Setup Script ==="
  echo "Embedded file snapshots frozen at: 2026-06-15"
  echo ""

  # Argument parsing is already done above, before main runs.

  # [1/14] Prerequisites, then the resolved path summary
  echo "[1/14] Checking prerequisites..."
  check_prerequisites
  print_summary

  # [2/14] Secrets: prompt interactively, otherwise require them from env
  echo "[2/14] Collecting secrets..."
  if [[ "$NONINTERACTIVE" == false ]]; then
    JIRA_API_TOKEN=$(prompt_secret "JIRA_API_TOKEN" "JIRA API Token (https://id.atlassian.com/manage/api-tokens): ")
    JIRA_EMAIL=$(prompt_secret "JIRA_EMAIL" "JIRA Email: ")
    TELEGRAM_BOT_TOKEN=$(prompt_secret "TELEGRAM_BOT_TOKEN" "Telegram Bot Token (from @BotFather): ")
    # NOTE(review): the trailing "true" presumably marks this secret as
    # optional in prompt_secret — confirm against its definition.
    OPENROUTER_API_KEY=$(prompt_secret "OPENROUTER_API_KEY" "OpenRouter API Key (leave blank to keep existing): " "true")
  else
    echo " → Non-interactive mode — using environment variables"
    : "${JIRA_API_TOKEN:?JIRA_API_TOKEN not set}"
    : "${JIRA_EMAIL:?JIRA_EMAIL not set}"
    : "${TELEGRAM_BOT_TOKEN:?TELEGRAM_BOT_TOKEN not set}"
    # Optional secret: the interactive path always defines it (possibly
    # blank), so define it here too. Otherwise any later step that reads it
    # would abort with "unbound variable" under `set -u`.
    OPENROUTER_API_KEY="${OPENROUTER_API_KEY:-}"
  fi
  echo " ✓ Secrets collected"

  # [3/14] Directories
  echo "[3/14] Creating directories..."
  create_directories

  # [4/14] Backup of any existing config
  echo "[4/14] Backing up existing config..."
  backup_config

  # [5/14] config.yaml
  echo "[5/14] Generating config.yaml..."
  generate_config_yaml

  # [6/14] .env
  echo "[6/14] Generating .env..."
  generate_env_file

  # [7/14] hindsight config
  echo "[7/14] Generating hindsight config..."
  generate_hindsight_config

  # [8/14] cron environment
  echo "[8/14] Configuring cron environment..."
  generate_cron_env_config

  # [9/14] session-init script
  echo "[9/14] Writing session-init script..."
  write_session_init_script

  # [10/14] archive script
  echo "[10/14] Writing archive script..."
  write_archive_script

  # [11/14] skill files
  echo "[11/14] Writing skill files..."
  write_jira_skill
  write_aws_skill
  write_confluence_skill
  write_bitbucket_skill
  write_session_skill

  # [12/14] cron jobs — non-critical, so a failure only warns
  echo "[12/14] Registering cron jobs..."
  register_cron_jobs || echo " ⚠ Cron registration had issues (may already exist)"

  echo "[13/14] Setup complete."

  echo "[14/14] Offering gateway restart..."
  offer_gateway_restart
}
# Entry point: run main, forwarding all of the script's positional parameters.
main "$@"