diff --git a/ops/data-retention/data-retention-policy.yml b/ops/data-retention/data-retention-policy.yml
new file mode 100644
index 00000000..583eead8
--- /dev/null
+++ b/ops/data-retention/data-retention-policy.yml
@@ -0,0 +1,207 @@
+# RemitFlow — Data Retention & Privacy Policy (Technical Implementation)
+#
+# Compliance: GDPR (EU), NDPR (Nigeria), POPIA (South Africa), Data Protection Act 2019 (Kenya)
+# Financial: CBN regulations, FCA record-keeping, FATF Recommendation 11
+
+---
+data_categories:
+
+  # ─── User Identity Data ──────────────────────────────────────────────────────
+  - category: "user_identity"
+    description: "PII: name, email, phone, address, date of birth"
+    storage: "PostgreSQL (encrypted at rest, AES-256)"
+    retention:
+      active_user: "Duration of account + 7 years (financial regulations)"
+      inactive_user: "5 years after last login (then anonymize)"
+      deleted_user: "Anonymize within 30 days of deletion request"
+    legal_basis:
+      gdpr: "Article 6(1)(b) — Contract performance + Article 6(1)(c) — Legal obligation"
+      ndpr: "Section 2.2 — Consent + legitimate interest"
+    deletion_procedure:
+      - "Replace PII with random surrogate tokens (preserves referential integrity; plain hashes of email/phone/DOB are re-identifiable)"
+      - "Retain transaction history with anonymized references"
+      - "Remove from search indices (OpenSearch)"
+      - "Purge from Redis cache"
+      - "Log deletion in audit trail (GDPR Article 30)"
+    automated: true
+    cron: "0 2 * * 0"  # Weekly Sunday 2am UTC
+
+  # ─── KYC Documents ──────────────────────────────────────────────────────────
+  - category: "kyc_documents"
+    description: "ID scans, selfies, proof of address, BVN/NIN verification results"
+    storage: "Object storage (S3/GCS, encrypted, separate bucket)"
+    retention:
+      active_user: "Duration of account + 7 years"
+      post_verification: "Original documents deleted after 90 days; verification result retained"
+      rejected_user: "6 months after rejection (regulatory requirement)"
+    legal_basis:
+      cbn: "CBN AML/CFT Regulations 2022 — 5 year minimum"
+      fca: "FCA SYSC 9.1 — 5 years after relationship ends"
+      fatf: "Recommendation 11 — 5 years minimum"
+    deletion_procedure:
+      - "Securely delete document files (cryptographic erasure)"
+      - "Retain verification metadata (passed/failed, date, tier)"
+      - "Retain document type and issuing country (no content)"
+
+  # ─── Transaction Records ─────────────────────────────────────────────────────
+  - category: "transactions"
+    description: "Transfer records, payment intents, settlement records, batch payouts"
+    storage: "PostgreSQL + TigerBeetle (immutable ledger)"
+    retention:
+      all: "7 years minimum (financial regulation requirement)"
+      tigerbeetle: "Permanent (append-only, cannot delete)"
+      postgresql: "2 years active, then archived to cold storage for the remainder of the 7-year minimum"
+    legal_basis:
+      cbn: "CBN Prudential Guidelines — 7 years"
+      fca: "FCA record-keeping — 5 years (we retain 7 for safety)"
+      tax: "Tax authority requirements — typically 6-7 years"
+    archival:
+      trigger: "Records older than 2 years"
+      destination: "S3 Glacier Deep Archive"
+      format: "Parquet (compressed, queryable)"
+      cron: "0 3 1 * *"  # Monthly 1st at 3am
+
+  # ─── Audit Logs ─────────────────────────────────────────────────────────────
+  - category: "audit_logs"
+    description: "System actions, admin operations, access logs, Kafka events"
+    storage: "Kafka (30 days hot) → S3 (7 years cold)"
+    retention:
+      kafka_hot: "30 days"
+      s3_cold: "7 years"
+      security_events: "10 years (fraud investigations)"
+    deletion_procedure:
+      - "Kafka topic retention.ms = 2592000000 (30 days)"
+      - "Kafka Connect archives to S3 before expiry"
+      - "S3 lifecycle policy moves to Glacier after 1 year"
+
+  # ─── SAR & Compliance Reports ────────────────────────────────────────────────
+  - category: "compliance_reports"
+    description: "SARs, CTRs, PEP screening results, sanctions hits"
+    storage: "PostgreSQL (encrypted, restricted access)"
+    retention:
+      all: "10 years (FATF Recommendation 11, CBN AML/CFT)"
+      active_investigation: "Duration of investigation + 10 years"
+    access_control:
+      - "Compliance team only (Permify role: compliance_officer)"
+      - "Audit trail on every access"
+      - "Cannot be modified or deleted (append-only)"
+    legal_basis:
+      fatf: "Recommendation 11 — record-keeping for 5+ years"
+      cbn: "CBN AML/CFT — 10 years"
+
+  # ─── Session & Auth Data ─────────────────────────────────────────────────────
+  - category: "sessions"
+    description: "Login sessions, OAuth tokens, device fingerprints"
+    storage: "Redis (active) + PostgreSQL (historical)"
+    retention:
+      active_session: "24 hours (auto-expire)"
+      refresh_token: "30 days"
+      login_history: "2 years"
+      device_fingerprints: "Duration of account"
+    deletion_procedure:
+      - "Redis TTL handles active session expiry"
+      - "Login history purged with account deletion"
+
+  # ─── Analytics & Metrics ─────────────────────────────────────────────────────
+  - category: "analytics"
+    description: "Prometheus metrics, Grafana data, usage statistics"
+    storage: "Prometheus TSDB + Lakehouse (DuckDB/Delta)"
+    retention:
+      prometheus_raw: "30 days"
+      prometheus_downsampled: "1 year (5m resolution)"
+      lakehouse: "3 years (aggregated, no PII)"
+    anonymization:
+      - "All analytics are aggregated (no individual user tracking)"
+      - "Corridor volumes, not individual transfer amounts"
+
+  # ─── Communication Data ──────────────────────────────────────────────────────
+  - category: "communications"
+    description: "SMS, email, push notification logs, webhook payloads"
+    storage: "PostgreSQL"
+    retention:
+      notification_content: "90 days"
+      delivery_metadata: "2 years (delivery status, timestamps)"
+      webhook_payloads: "30 days"
+
+---
+# DSAR (Data Subject Access Request) Implementation
+dsar:
+  right_to_access:
+    endpoint: "/api/trpc/privacy.exportData"
+    format: "JSON + PDF (machine-readable + human-readable)"
+    response_time: "30 days maximum (GDPR Article 12)"
+    includes:
+      - "All PII"
+      - "Transaction history"
+      - "KYC verification status"
+      - "Communication preferences"
+    excludes:
+      - "SAR filings (legal exemption)"
+      - "Internal risk scores"
+      - "Fraud investigation notes"
+
+  right_to_erasure:
+    endpoint: "/api/trpc/privacy.requestDeletion"
+    response_time: "30 days maximum"
+    exceptions:
+      - "Active financial obligations"
+      - "Regulatory retention requirements (7-10 years)"
+      - "Ongoing investigations"
+    process:
+      1: "User requests deletion via app or support"
+      2: "System checks for legal holds / obligations"
+      3: "If clear: schedule anonymization in 30 days"
+      4: "Notify user of timeline and any exceptions"
+      5: "Execute anonymization (replace PII with surrogate tokens)"
+      6: "Confirm deletion to user"
+
+  right_to_portability:
+    endpoint: "/api/trpc/privacy.exportPortable"
+    format: "JSON (structured, machine-readable)"
+    includes: "All data provided by user + generated during use"
+
+---
+# Automated Retention Jobs
+automation:
+  jobs:
+    - name: "anonymize_inactive_users"
+      schedule: "0 2 * * 0"  # Weekly
+      query: "SELECT id FROM users WHERE last_login < now() - interval '5 years' AND NOT anonymized"
+      action: "anonymize_user(id)"
+
+    - name: "archive_old_transactions"
+      schedule: "0 3 1 * *"  # Monthly
+      # NOTE(review): this predicate re-selects rows archived by earlier runs;
+      # confirm archive_to_s3_glacier is idempotent or filter on an archived marker.
+      query: "SELECT * FROM transfers WHERE created_at < now() - interval '2 years'"
+      action: "archive_to_s3_glacier(records)"
+
+    - name: "purge_expired_sessions"
+      schedule: "0 * * * *"  # Hourly
+      action: "redis SCAN + DEL expired keys"
+
+    - name: "purge_old_notifications"
+      schedule: "0 4 1 * *"  # Monthly
+      # NOTE(review): deleting whole rows at 90 days also removes delivery metadata,
+      # which the communications category above retains for 2 years. Confirm the
+      # table schema and redact only the content column here if they share rows.
+      query: "DELETE FROM notifications WHERE created_at < now() - interval '90 days'"
+
+    - name: "kafka_archival"
+      schedule: "0 5 * * *"  # Daily
+      action: "Kafka Connect S3 sink (all topics > 30 days)"
+
+---
+# Encryption Standards
+encryption:
+  at_rest:
+    postgresql: "AES-256 (Transparent Data Encryption)"
+    s3: "AES-256-GCM (SSE-S3 or SSE-KMS)"
+    redis: "TLS in transit, no at-rest (ephemeral)"
+    tigerbeetle: "Built-in encryption"
+  in_transit:
+    external: "TLS 1.3 (minimum TLS 1.2)"
+    internal: "mTLS between services"
+  key_rotation:
+    schedule: "90
days"
+    method: "AWS KMS / HashiCorp Vault"
diff --git a/ops/monitoring/alertmanager/alertmanager.yml b/ops/monitoring/alertmanager/alertmanager.yml
new file mode 100644
index 00000000..fd090f1e
--- /dev/null
+++ b/ops/monitoring/alertmanager/alertmanager.yml
@@ -0,0 +1,147 @@
+# RemitFlow — Alertmanager Configuration
+#
+# Routes alerts to appropriate channels based on severity and team.
+# Integrates: PagerDuty (critical), Opsgenie (warning), Slack (info)
+#
+# Credentials are read from files under /etc/alertmanager/secrets because
+# Alertmanager does not expand ${ENV_VAR} placeholders in this file.
+
+global:
+  resolve_timeout: 5m
+  pagerduty_url: "https://events.pagerduty.com/v2/enqueue"
+  opsgenie_api_url: "https://api.opsgenie.com/"
+  slack_api_url_file: "/etc/alertmanager/secrets/slack_webhook_url"
+
+# Notification templates
+templates:
+  - "/etc/alertmanager/templates/*.tmpl"
+
+# Inhibition: suppress lower severity if higher is firing
+inhibit_rules:
+  - source_matchers:
+      - severity="critical"
+    target_matchers:
+      - severity="warning"
+    equal: ["alertname", "team"]
+
+  - source_matchers:
+      - alertname="ServiceDown"
+    target_matchers:
+      - alertname=~".*Latency.*|.*ErrorRate.*"
+    equal: ["job"]
+
+# Routing tree
+route:
+  receiver: "default-slack"
+  group_by: ["alertname", "team", "severity"]
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+
+  routes:
+    # Critical: Page immediately
+    - matchers:
+        - severity="critical"
+      receiver: "pagerduty-critical"
+      group_wait: 10s
+      repeat_interval: 1h
+      routes:
+        # Financial integrity: separate escalation
+        - matchers:
+            - alertname="LedgerImbalance"
+          receiver: "pagerduty-finance-critical"
+          group_wait: 0s
+          repeat_interval: 15m
+
+        # Compliance critical: separate channel
+        - matchers:
+            - team="compliance"
+          receiver: "pagerduty-compliance"
+          group_wait: 10s
+
+    # Warning: Create ticket
+    - matchers:
+        - severity="warning"
+      receiver: "opsgenie-warning"
+      group_wait: 1m
+      repeat_interval: 8h
+      routes:
+        - matchers:
+            - team="finance"
+          receiver: "opsgenie-finance"
+
+        - matchers:
+            - team="compliance"
+          receiver: "opsgenie-compliance"
+
+    # Info: Slack only
+    - matchers:
+        - severity="info"
+      receiver: "slack-info"
+      group_wait: 5m
+      repeat_interval: 24h
+
+# Receivers
+receivers:
+  - name: "default-slack"
+    slack_configs:
+      - channel: "#remitflow-alerts"
+        send_resolved: true
+        title: '{{ template "slack.title" . }}'
+        text: '{{ template "slack.text" . }}'
+
+  - name: "pagerduty-critical"
+    pagerduty_configs:
+      - routing_key_file: "/etc/alertmanager/secrets/pagerduty_platform_key"
+        severity: critical
+        description: '{{ template "pagerduty.description" . }}'
+        details:
+          firing: '{{ template "pagerduty.firing" . }}'
+          runbook: "{{ (index .Alerts 0).Labels.runbook }}"
+
+  - name: "pagerduty-finance-critical"
+    pagerduty_configs:
+      - routing_key_file: "/etc/alertmanager/secrets/pagerduty_finance_key"
+        severity: critical
+        description: "FINANCIAL INTEGRITY: {{ .CommonAnnotations.summary }}"
+        details:
+          firing: '{{ template "pagerduty.firing" . }}'
+          runbook: "{{ (index .Alerts 0).Labels.runbook }}"
+    slack_configs:
+      - channel: "#remitflow-finance-emergency"
+        send_resolved: true
+        color: danger
+        title: "🚨 LEDGER ALERT: {{ .CommonAnnotations.summary }}"
+
+  - name: "pagerduty-compliance"
+    pagerduty_configs:
+      - routing_key_file: "/etc/alertmanager/secrets/pagerduty_compliance_key"
+        severity: critical
+        description: "COMPLIANCE: {{ .CommonAnnotations.summary }}"
+
+  - name: "opsgenie-warning"
+    opsgenie_configs:
+      - api_key_file: "/etc/alertmanager/secrets/opsgenie_api_key"
+        message: "{{ .CommonAnnotations.summary }}"
+        priority: P3
+        tags: "remitflow,{{ .CommonLabels.team }}"
+
+  - name: "opsgenie-finance"
+    opsgenie_configs:
+      - api_key_file: "/etc/alertmanager/secrets/opsgenie_api_key"
+        message: "FINANCE: {{ .CommonAnnotations.summary }}"
+        priority: P2
+        tags: "remitflow,finance"
+
+  - name: "opsgenie-compliance"
+    opsgenie_configs:
+      - api_key_file: "/etc/alertmanager/secrets/opsgenie_api_key"
+        message: "COMPLIANCE: {{ .CommonAnnotations.summary }}"
+        priority: P2
+        tags: "remitflow,compliance"
+
+  - name: "slack-info"
+    slack_configs:
+      - channel: "#remitflow-alerts-info"
+        send_resolved: true
+        title: "ℹ️ {{ .CommonAnnotations.summary }}"
diff --git a/ops/monitoring/docker-compose.monitoring.yml b/ops/monitoring/docker-compose.monitoring.yml
new file mode 100644
index 00000000..09428330
--- /dev/null
+++ b/ops/monitoring/docker-compose.monitoring.yml
@@ -0,0 +1,63 @@
+# RemitFlow — Monitoring Stack
+#
+# Usage:
+#   docker compose -f ops/monitoring/docker-compose.monitoring.yml up -d
+#
+# Access:
+#   Grafana:      http://localhost:3100 (admin / $GRAFANA_ADMIN_PASSWORD, default "remitflow")
+#   Prometheus:   http://localhost:9090
+#   Alertmanager: http://localhost:9093
+#
+# Secrets:
+#   Alertmanager reads credentials from one-value files in ./alertmanager/secrets
+#   (slack_webhook_url, pagerduty_*_key, opsgenie_api_key). Never commit that directory.
+
+services:
+  prometheus:
+    image: prom/prometheus:v2.51.0
+    container_name: remitflow-prometheus
+    ports:
+      - "9090:9090"
+    # Scrape targets use host.docker.internal, which Linux engines only resolve via host-gateway.
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
+      - prometheus_data:/prometheus
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.retention.time=30d"
+      - "--web.enable-lifecycle"
+    restart: unless-stopped
+
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    container_name: remitflow-alertmanager
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - ./alertmanager/templates:/etc/alertmanager/templates:ro
+      - ./alertmanager/secrets:/etc/alertmanager/secrets:ro
+    restart: unless-stopped
+
+  grafana:
+    image: grafana/grafana:10.4.0
+    container_name: remitflow-grafana
+    ports:
+      - "3100:3000"
+    volumes:
+      - ./grafana/provisioning:/etc/grafana/provisioning
+      - ./grafana/dashboards:/var/lib/grafana/dashboards
+      - grafana_data:/var/lib/grafana
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-remitflow}
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/remitflow-transfers.json
+    restart: unless-stopped
+
+volumes:
+  prometheus_data:
+  grafana_data:
diff --git
a/ops/monitoring/grafana/dashboards/remitflow-infrastructure.json b/ops/monitoring/grafana/dashboards/remitflow-infrastructure.json
new file mode 100644
index 00000000..e0fb6e4e
--- /dev/null
+++ b/ops/monitoring/grafana/dashboards/remitflow-infrastructure.json
@@ -0,0 +1,144 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "panels": [
+    {
+      "title": "Service Health Overview",
+      "type": "state-timeline",
+      "gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
+      "targets": [
+        {
+          "expr": "up{job=~\"remitflow.*\"}",
+          "legendFormat": "{{job}}"
+        }
+      ]
+    },
+    {
+      "title": "CPU Usage by Service",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+      "targets": [
+        {
+          "expr": "rate(process_cpu_seconds_total{job=~\"remitflow.*\"}[5m]) * 100",
+          "legendFormat": "{{job}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "percent" } }
+    },
+    {
+      "title": "Memory Usage by Service",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+      "targets": [
+        {
+          "expr": "process_resident_memory_bytes{job=~\"remitflow.*\"}",
+          "legendFormat": "{{job}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "bytes" } }
+    },
+    {
+      "title": "PostgreSQL — Active Connections",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 },
+      "targets": [
+        {
+          "expr": "pg_stat_activity_count",
+          "legendFormat": "Active"
+        },
+        {
+          "expr": "pg_settings_max_connections",
+          "legendFormat": "Max"
+        }
+      ]
+    },
+    {
+      "title": "Redis — Operations/sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 },
+      "targets": [
+        {
+          "expr": "rate(redis_commands_processed_total[5m])",
+          "legendFormat": "Ops/sec"
+        }
+      ]
+    },
+    {
+      "title": "Kafka — Messages/sec by Topic",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 },
+      "targets": [
+        {
+          "expr": "sum(rate(kafka_server_brokertopicmetrics_messagesin_total[5m])) by (topic)",
+          "legendFormat": "{{topic}}"
+        }
+      ]
+    },
+    {
+      "title": "TigerBeetle — Transactions/sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
+      "targets": [
+        {
+          "expr": "rate(tigerbeetle_transfers_total[5m])",
+          "legendFormat": "Transfers/sec"
+        },
+        {
+          "expr": "rate(tigerbeetle_accounts_total[5m])",
+          "legendFormat": "Account Ops/sec"
+        }
+      ]
+    },
+    {
+      "title": "Temporal — Active Workflows",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
+      "targets": [
+        {
+          "expr": "temporal_workflow_active_count",
+          "legendFormat": "{{workflow_type}}"
+        }
+      ]
+    },
+    {
+      "title": "Go Services — Goroutines",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 8, "x": 0, "y": 30 },
+      "targets": [
+        {
+          "expr": "go_goroutines{job=~\"remitflow-go.*\"}",
+          "legendFormat": "{{job}}"
+        }
+      ]
+    },
+    {
+      "title": "Rust Services — Request Duration",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 8, "x": 8, "y": 30 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(rust_http_request_duration_seconds_bucket[5m])) by (le, service))",
+          "legendFormat": "{{service}} p95"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "Python Services — Request Queue",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 8, "x": 16, "y": 30 },
+      "targets": [
+        {
+          "expr": "python_request_queue_size{job=~\"remitflow-python.*\"}",
+          "legendFormat": "{{job}}"
+        }
+      ]
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["remitflow", "infrastructure"],
+  "time": { "from": "now-1h", "to": "now" },
+  "title": "RemitFlow — Infrastructure",
+  "uid": "remitflow-infra",
+  "version": 1
+}
diff --git a/ops/monitoring/grafana/dashboards/remitflow-transfers.json b/ops/monitoring/grafana/dashboards/remitflow-transfers.json
new file mode 100644
index 00000000..7549289f
--- /dev/null
+++ b/ops/monitoring/grafana/dashboards/remitflow-transfers.json
@@ -0,0 +1,276 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "title": "Transfer Success Rate (SLO: 99.9%)",
+      "type": "gauge",
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 },
+      "targets": [
+        {
+          "expr": "sum(rate(transfers_completed_total[5m])) / sum(rate(transfers_initiated_total[5m])) * 100",
+          "legendFormat": "Success Rate %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "steps": [
+              { "color": "red", "value": 0 },
+              { "color": "orange", "value": 99 },
+              { "color": "green", "value": 99.9 }
+            ]
+          },
+          "unit": "percent"
+        }
+      }
+    },
+    {
+      "title": "Fund Delivery Latency (p95)",
+      "type": "timeseries",
+      "gridPos": { "h": 6, "w": 9, "x": 6, "y": 0 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(transfer_delivery_duration_seconds_bucket[5m])) by (le, corridor))",
+          "legendFormat": "{{corridor}} p95"
+        },
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(transfer_delivery_duration_seconds_bucket[5m])) by (le))",
+          "legendFormat": "Global p50"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": { "thresholdsStyle": { "mode": "line" } },
+          "thresholds": {
+            "steps": [
+              { "color": "green", "value": 0 },
+              { "color": "red", "value": 30 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Active Transfers",
+      "type": "stat",
+      "gridPos": { "h": 6, "w": 3, "x": 15, "y": 0 },
+      "targets": [
+        {
+          "expr": "sum(transfers_in_flight)",
+          "legendFormat": "In Flight"
+        }
+      ]
+    },
+    {
+      "title": "Failed Transfers (last 1h)",
+      "type": "stat",
+      "gridPos": { "h": 6, "w": 3, "x": 18, "y": 0 },
+      "targets": [
+        {
+          "expr": "sum(increase(transfers_failed_total[1h]))",
+          "legendFormat": "Failed"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "steps": [
+              { "color": "green", "value": 0 },
+              { "color": "orange", "value": 5 },
+              { "color": "red", "value": 20 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Corridor Volume (Transfers/min)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
+      "targets": [
+        {
+          "expr": "sum(rate(transfers_initiated_total[5m])) by (corridor) * 60",
+          "legendFormat": "{{corridor}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "tpm" } }
+    },
+    {
+      "title": "TigerBeetle Ledger Balance (Debits - Credits)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
+      "targets": [
+        {
+          "expr": "sum(tigerbeetle_debits_total) - sum(tigerbeetle_credits_total)",
+          "legendFormat": "Imbalance (should be 0)"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "custom": { "thresholdsStyle": { "mode": "area" } },
+          "thresholds": {
+            "steps": [
+              { "color": "red", "value": null },
+              { "color": "green", "value": -0.01 },
+              { "color": "red", "value": 0.01 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Error Rate by Endpoint",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
+      "targets": [
+        {
+          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (route) / sum(rate(http_requests_total[5m])) by (route) * 100",
+          "legendFormat": "{{route}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "percent", "max": 10 } }
+    },
+    {
+      "title": "Circuit Breaker Status",
+      "type": "table",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
+      "targets": [
+        {
+          "expr": "circuit_breaker_state",
+          "legendFormat": "{{service}}",
+          "format": "table",
+          "instant": true
+        }
+      ],
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "renameByName": { "service": "Service", "Value": "State (0=closed, 1=open, 2=half-open)" }
+          }
+        }
+      ]
+    },
+    {
+      "title": "FX Rate Spread (Live vs Quoted)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
+      "targets": [
+        {
+          "expr": "abs(fx_live_rate - fx_quoted_rate) / fx_live_rate * 100",
+          "legendFormat": "{{pair}} spread %"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "thresholds": {
+            "steps": [
+              { "color": "green", "value": 0 },
+              { "color": "red", "value": 2 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Settlement Queue Depth",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
+      "targets": [
+        {
+          "expr": "sum(settlement_queue_depth) by (rail)",
+          "legendFormat": "{{rail}}"
+        }
+      ]
+    },
+    {
+      "title": "Dead Letter Queue Size",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 30 },
+      "targets": [
+        {
+          "expr": "sum(dead_letter_queue_size)",
+          "legendFormat": "DLQ Messages"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "steps": [
+              { "color": "green", "value": 0 },
+              { "color": "orange", "value": 10 },
+              { "color": "red", "value": 50 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Kafka Consumer Lag",
+      "type": "timeseries",
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 30 },
+      "targets": [
+        {
+          "expr": "sum(kafka_consumer_group_lag) by (group)",
+          "legendFormat": "{{group}}"
+        }
+      ]
+    },
+    {
+      "title": "KYC Verification Queue",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 30 },
+      "targets": [
+        {
+          "expr": "sum(kyc_pending_verifications)",
+          "legendFormat": "Pending"
+        }
+      ]
+    },
+    {
+      "title": "SAR Filings (24h)",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 30 },
+      "targets": [
+        {
+          "expr": "sum(increase(sar_filings_total[24h]))",
+          "legendFormat": "SARs Filed"
+        }
+      ]
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["remitflow", "transfers", "financial"],
+  "templating": {
+    "list": [
+      {
+        "name": "corridor",
+        "type": "query",
+        "query": "label_values(transfers_initiated_total, corridor)",
+        "multi": true,
+        "includeAll": true
+      },
+      {
+        "name": "environment",
+        "type": "custom",
+        "options": [
+          { "text": "production", "value": "production" },
+          { "text": "staging", "value": "staging" }
+        ]
+      }
+    ]
+  },
+  "time": { "from": "now-6h", "to": "now" },
+  "title": "RemitFlow — Transfer Operations",
+  "uid": "remitflow-transfers",
+  "version": 1
+}
diff --git a/ops/monitoring/grafana/provisioning/dashboards.yml b/ops/monitoring/grafana/provisioning/dashboards.yml
new file mode 100644
index 00000000..56b7b4d5
--- /dev/null
+++
b/ops/monitoring/grafana/provisioning/dashboards.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+providers:
+  - name: "RemitFlow"
+    orgId: 1
+    folder: "RemitFlow"
+    type: file
+    disableDeletion: false
+    editable: true
+    options:
+      path: /var/lib/grafana/dashboards
+      foldersFromFilesStructure: false
diff --git a/ops/monitoring/grafana/provisioning/datasources.yml b/ops/monitoring/grafana/provisioning/datasources.yml
new file mode 100644
index 00000000..c9f4f3a9
--- /dev/null
+++ b/ops/monitoring/grafana/provisioning/datasources.yml
@@ -0,0 +1,8 @@
+apiVersion: 1
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
diff --git a/ops/monitoring/prometheus/alerts.yml b/ops/monitoring/prometheus/alerts.yml
new file mode 100644
index 00000000..79777192
--- /dev/null
+++ b/ops/monitoring/prometheus/alerts.yml
@@ -0,0 +1,215 @@
+# RemitFlow — Prometheus Alerting Rules
+#
+# Integrated with Alertmanager → PagerDuty/Opsgenie/Slack
+# Severity levels: critical (page immediately), warning (ticket), info (log)
+
+groups:
+  # ─── Financial Integrity Alerts ──────────────────────────────────────────────
+  - name: financial_integrity
+    rules:
+      - alert: LedgerImbalance
+        expr: abs(sum(tigerbeetle_debits_total) - sum(tigerbeetle_credits_total)) > 0
+        for: 1m
+        labels:
+          severity: critical
+          team: finance
+          runbook: ops/runbooks/ledger-imbalance.md
+        annotations:
+          summary: "TigerBeetle ledger imbalance detected"
+          description: "Debits and credits do not balance. Imbalance: {{ $value }}. Immediate investigation required."
+          impact: "Potential fund loss or duplication"
+
+      - alert: TransferStuckInFlight
+        expr: sum(transfers_in_flight) > 100 and sum(rate(transfers_completed_total[5m])) == 0
+        for: 5m
+        labels:
+          severity: critical
+          team: platform
+          runbook: ops/runbooks/stuck-transfers.md
+        annotations:
+          summary: "{{ $value }} transfers stuck in flight with no completions"
+          description: "Transfers are being initiated but none are completing. Settlement pipeline may be blocked."
+
+      - alert: DeadLetterQueueGrowing
+        expr: sum(dead_letter_queue_size) > 50
+        for: 10m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "Dead letter queue has {{ $value }} messages"
+          description: "Failed transfers accumulating in DLQ. Manual reconciliation may be needed."
+
+      - alert: FXRateStale
+        expr: time() - fx_rate_last_updated_timestamp > 300
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "FX rates not updated in {{ $value }}s"
+          description: "Live FX rate provider may be down. Users may be quoted stale rates."
+
+  # ─── SLA Breach Alerts ──────────────────────────────────────────────────────
+  - name: sla_breach
+    rules:
+      - alert: TransferDeliverySlowP95
+        expr: histogram_quantile(0.95, sum(rate(transfer_delivery_duration_seconds_bucket[5m])) by (le)) > 30
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+          runbook: ops/runbooks/slow-delivery.md
+        annotations:
+          summary: "Transfer delivery p95 latency {{ $value }}s exceeds 30s SLO"
+          description: "Fund delivery is taking longer than the 30-second SLO target."
+
+      - alert: TransferSuccessRateLow
+        expr: (sum(rate(transfers_completed_total[5m])) / sum(rate(transfers_initiated_total[5m]))) < 0.999
+        for: 5m
+        labels:
+          severity: critical
+          team: platform
+          runbook: ops/runbooks/low-success-rate.md
+        annotations:
+          summary: "Transfer success rate {{ $value | humanizePercentage }} below 99.9% SLO"
+          description: "More than 0.1% of transfers are failing. Check settlement services and external rails."
+
+      - alert: APILatencyHigh
+        expr: histogram_quantile(0.95, sum(rate(http_request_duration_ms_bucket[5m])) by (le)) > 500
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "API p95 latency {{ $value }}ms exceeds 500ms SLO"
+
+      - alert: ErrorRateHigh
+        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.01
+        for: 3m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Error rate {{ $value | humanizePercentage }} exceeds 1% threshold"
+          description: "More than 1% of requests are returning 5xx errors."
+
+  # ─── Infrastructure Alerts ──────────────────────────────────────────────────
+  - name: infrastructure
+    rules:
+      - alert: ServiceDown
+        expr: up{job=~"remitflow.*"} == 0
+        for: 2m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Service {{ $labels.job }} is DOWN"
+          description: "Service has been unreachable for 2 minutes. Circuit breaker should have activated."
+
+      - alert: CircuitBreakerOpen
+        expr: circuit_breaker_state == 1
+        for: 1m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "Circuit breaker OPEN for {{ $labels.service }}"
+          description: "External service {{ $labels.service }} is failing. Requests are being short-circuited."
+
+      - alert: PostgresConnectionPoolExhaustion
+        expr: sum by (instance) (pg_stat_activity_count) / on (instance) pg_settings_max_connections > 0.8
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "PostgreSQL connection pool at {{ $value | humanizePercentage }}"
+          description: "Connection pool nearing exhaustion. May cause request failures."
+
+      - alert: KafkaConsumerLag
+        expr: sum(kafka_consumer_group_lag) by (group) > 10000
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "Kafka consumer group {{ $labels.group }} lag: {{ $value }} messages"
+          description: "Events not being processed in real-time. Audit trail and analytics may be delayed."
+
+      - alert: RedisMemoryHigh
+        expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85 and redis_memory_max_bytes > 0
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "Redis memory at {{ $value | humanizePercentage }}"
+
+  # ─── Compliance Alerts ──────────────────────────────────────────────────────
+  - name: compliance
+    rules:
+      - alert: SARFilingSpike
+        expr: sum(increase(sar_filings_total[1h])) > 10
+        for: 0m
+        labels:
+          severity: warning
+          team: compliance
+        annotations:
+          summary: "{{ $value }} SAR filings in the last hour"
+          description: "Unusual spike in Suspicious Activity Reports. Review for potential AML event."
+
+      - alert: KYCVerificationBacklog
+        expr: sum(kyc_pending_verifications) > 100
+        for: 30m
+        labels:
+          severity: warning
+          team: compliance
+        annotations:
+          summary: "{{ $value }} KYC verifications pending"
+          description: "Users waiting for identity verification. May impact onboarding SLA."
+
+      - alert: SanctionsScreeningDown
+        expr: rate(sanctions_screening_errors_total[5m]) > 0
+        for: 5m
+        labels:
+          severity: critical
+          team: compliance
+          runbook: ops/runbooks/sanctions-screening-down.md
+        annotations:
+          summary: "Sanctions screening service errors detected"
+          description: "OFAC/UN/EU sanctions screening is failing. All transfers must be held until resolved."
+
+  # ─── Settlement & Rail Alerts ────────────────────────────────────────────────
+  - name: settlement
+    rules:
+      - alert: SettlementQueueBacklog
+        expr: sum(settlement_queue_depth) by (rail) > 500
+        for: 10m
+        labels:
+          severity: warning
+          team: finance
+        annotations:
+          summary: "Settlement queue for {{ $labels.rail }}: {{ $value }} pending"
+          description: "Payment rail {{ $labels.rail }} has a growing backlog. Check rail provider status."
+
+      - alert: RailProviderDown
+        expr: rail_provider_health == 0
+        for: 3m
+        labels:
+          severity: critical
+          team: finance
+          runbook: ops/runbooks/rail-provider-down.md
+        annotations:
+          summary: "Payment rail {{ $labels.rail }} is DOWN"
+          description: "{{ $labels.rail }} provider is unreachable. Transfers on this rail will fail."
+
+      - alert: HighSettlementLatency
+        expr: histogram_quantile(0.95, sum(rate(settlement_duration_seconds_bucket[5m])) by (le, rail)) > 300
+        for: 10m
+        labels:
+          severity: warning
+          team: finance
+        annotations:
+          summary: "Settlement latency for {{ $labels.rail }}: {{ $value }}s (p95)"
+          description: "Settlements taking longer than 5 minutes. Users may experience delayed fund delivery."
diff --git a/ops/monitoring/prometheus/prometheus.yml b/ops/monitoring/prometheus/prometheus.yml
new file mode 100644
index 00000000..7c90834e
--- /dev/null
+++ b/ops/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,58 @@
+# RemitFlow — Prometheus Scrape Configuration
+
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+rule_files:
+  - "alerts.yml"
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]
+
+scrape_configs:
+  # Main API server
+  - job_name: "remitflow-api"
+    metrics_path: "/metrics/features"
+    static_configs:
+      - targets: ["host.docker.internal:3001"]
+    scrape_interval: 10s
+
+  # Go services
+  - job_name: "remitflow-go-fiat-rails"
+    static_configs:
+      - targets: ["host.docker.internal:8125"]
+  - job_name: "remitflow-go-qr-gateway"
+    static_configs:
+      - targets: ["host.docker.internal:8122"]
+
+  # Rust services
+  - job_name: "remitflow-rust-search"
+    static_configs:
+      - targets: ["host.docker.internal:8126"]
+  - job_name: "remitflow-rust-qr-crypto"
+    static_configs:
+      - targets: ["host.docker.internal:8123"]
+
+  # Python services
+  - job_name: "remitflow-python-voice"
+    static_configs:
+      - targets: ["host.docker.internal:8127"]
+  - job_name: "remitflow-python-analytics"
+    static_configs:
+      - targets:
["host.docker.internal:8124"] + + # Infrastructure + - job_name: "postgres" + static_configs: + - targets: ["host.docker.internal:9187"] # postgres_exporter + + - job_name: "redis" + static_configs: + - targets: ["host.docker.internal:9121"] # redis_exporter + + - job_name: "kafka" + static_configs: + - targets: ["host.docker.internal:9308"] # kafka_exporter diff --git a/ops/runbooks/incident-response.md b/ops/runbooks/incident-response.md new file mode 100644 index 00000000..28f1dd45 --- /dev/null +++ b/ops/runbooks/incident-response.md @@ -0,0 +1,158 @@ +# RemitFlow — Incident Response Procedure + +## Severity Classification + +| Level | Definition | Response Time | Example | +|-------|-----------|---------------|---------| +| **SEV1** | Service fully down, funds at risk | 5 min | Ledger imbalance, all transfers failing | +| **SEV2** | Major feature degraded, some users impacted | 15 min | One corridor down, high error rate | +| **SEV3** | Minor degradation, workaround exists | 1 hour | Slow quotes, analytics delayed | +| **SEV4** | Cosmetic / informational | Next business day | Dashboard rendering issue | + +## Incident Lifecycle + +``` +DETECT → TRIAGE → MITIGATE → RESOLVE → POST-MORTEM + │ │ │ │ │ + │ │ │ │ └─ Within 48h + │ │ │ └─ Fix root cause + │ │ └─ Stop bleeding (failover, rollback, hold) + │ └─ Assess severity, assign IC + └─ Alert fires or user reports +``` + +## Roles + +| Role | Responsibility | +|------|---------------| +| **Incident Commander (IC)** | Coordinates response, makes decisions, communicates | +| **Tech Lead** | Investigates root cause, implements fix | +| **Comms Lead** | Updates status page, notifies affected users | +| **Finance Lead** | Assesses financial impact, authorizes compensations | + +## Step-by-Step Response + +### 1. DETECT (Automated) +- Prometheus alert fires → PagerDuty pages on-call +- User reports via support channel +- Automated monitoring detects anomaly + +### 2. 
TRIAGE (First 5 minutes) +``` +IC Checklist: +□ Acknowledge alert in PagerDuty +□ Open incident channel: #incident-YYYY-MM-DD-