# dlx-ansible/playbooks/configure-storage-monitoring.yml

---
# Configure proactive storage monitoring and alerting for Proxmox hosts
# Monitors: Filesystem usage, Docker storage, Container allocation
# Alerts at: 75%, 85%, 95% capacity thresholds

# Play 1: install the monitoring scripts, cron schedule and logrotate policy
# on every host in the "proxmox" group.
- name: "Setup storage monitoring and alerting"
  hosts: proxmox
  gather_facts: true
  # NOTE(review): no task in this file references these variables; the
  # thresholds and schedules are written literally into the scripts and cron
  # entries below. Confirm before relying on them as configuration knobs.
  vars:
    alert_email: "admin@directlx.dev"
    monitoring_interval: "5m"  # Check every 5 minutes
    alert_threshold_75: true   # Alert when >75% full
    alert_threshold_85: true   # Alert when >85% full
    alert_threshold_95: true   # Alert when >95% full (critical)
  tasks:
    # Directory that receives every script installed by the tasks below.
    - name: Create storage monitoring directory
      become: true
      file:
        state: directory
        path: /usr/local/bin/storage-monitoring
        mode: "0755"

    # Installs check-capacity.sh. The script compares filesystem usage with
    # the 75/85/95 percent thresholds and reports to a log file and syslog.
    - name: Create filesystem capacity check script
      copy:
        content: |
          #!/bin/bash
          # Filesystem capacity monitoring
          # Writes to $LOGFILE and syslog when a threshold is exceeded

          HOSTNAME=$(hostname)
          THRESHOLD_75=75
          THRESHOLD_85=85
          THRESHOLD_95=95
          LOGFILE="/var/log/storage-monitor.log"

          # log_event LEVEL PATH USAGE
          # Append one timestamped line to $LOGFILE.
          log_event() {
              local level=$1 fs=$2 usage=$3 timestamp
              timestamp=$(date '+%Y-%m-%d %H:%M:%S')
              echo "[$timestamp] [$level] $fs: ${usage}% used" >> "$LOGFILE"
          }

          # check_filesystem PATH
          # Report the usage of the filesystem holding PATH at the highest
          # threshold it exceeds; stay silent at or below 75 percent.
          check_filesystem() {
              local fs=$1 usage

              # -P keeps every filesystem on a single output line, so field 5
              # is the usage column even when the device name is very long.
              usage=$(df -P -- "$fs" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

              # df can fail or print "-"; skip instead of passing a
              # non-numeric value to the integer comparisons below.
              case "$usage" in
                  '' | *[!0-9]*) return 0 ;;
              esac

              if [ "$usage" -gt "$THRESHOLD_95" ]; then
                  log_event "CRITICAL" "$fs" "$usage"
                  echo "CRITICAL: $HOSTNAME $fs is $usage% full" | \
                    logger -t storage-monitor -p local0.crit
              elif [ "$usage" -gt "$THRESHOLD_85" ]; then
                  log_event "WARNING" "$fs" "$usage"
                  echo "WARNING: $HOSTNAME $fs is $usage% full" | \
                    logger -t storage-monitor -p local0.warning
              elif [ "$usage" -gt "$THRESHOLD_75" ]; then
                  log_event "ALERT" "$fs" "$usage"
                  echo "ALERT: $HOSTNAME $fs is $usage% full" | \
                    logger -t storage-monitor -p local0.notice
              fi
          }

          # Check root filesystem
          check_filesystem "/"

          # Check Proxmox-specific mounts. An unmatched glob stays literal
          # and is filtered out by the directory test.
          for mount in /mnt/pve/* /mnt/dlx-*; do
              if [ -d "$mount" ]; then
                  check_filesystem "$mount"
              fi
          done

          # Check specific critical mounts
          for mount in /var /home; do
              if [ -d "$mount" ]; then
                  check_filesystem "$mount"
              fi
          done
        dest: /usr/local/bin/storage-monitoring/check-capacity.sh
        mode: "0755"
      become: true

    # Installs check-docker.sh. The script exits immediately on hosts without
    # a docker binary and otherwise watches /mnt/pve/dlx-docker.
    - name: Create Docker-specific monitoring script
      copy:
        content: |
          #!/bin/bash
          # Docker storage utilization monitoring
          # Only runs on hosts with Docker installed

          if ! command -v docker &> /dev/null; then
              exit 0
          fi

          HOSTNAME=$(hostname)
          LOGFILE="/var/log/docker-monitor.log"
          DOCKER_MOUNT="/mnt/pve/dlx-docker"
          THRESHOLD_75=75
          THRESHOLD_85=85
          THRESHOLD_95=95

          # log_docker_event LEVEL USAGE
          # Append one timestamped line to $LOGFILE.
          log_docker_event() {
              local level=$1 usage=$2 timestamp
              timestamp=$(date '+%Y-%m-%d %H:%M:%S')
              echo "[$timestamp] [$level] Docker storage: ${usage}% used" >> "$LOGFILE"
          }

          # check_docker_mount
          # Report usage of $DOCKER_MOUNT at the highest threshold exceeded.
          check_docker_mount() {
              local usage

              # -P keeps the filesystem on one output line, so field 5 is
              # always the usage column.
              usage=$(df -P -- "$DOCKER_MOUNT" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

              # Skip the comparisons when df failed or printed "-".
              case "$usage" in
                  '' | *[!0-9]*) return 0 ;;
              esac

              if [ "$usage" -gt "$THRESHOLD_95" ]; then
                  log_docker_event "CRITICAL" "$usage"
                  echo "CRITICAL: Docker storage $usage% full on $HOSTNAME" | \
                    logger -t docker-monitor -p local0.crit
              elif [ "$usage" -gt "$THRESHOLD_85" ]; then
                  log_docker_event "WARNING" "$usage"
                  echo "WARNING: Docker storage $usage% full on $HOSTNAME" | \
                    logger -t docker-monitor -p local0.warning
              elif [ "$usage" -gt "$THRESHOLD_75" ]; then
                  log_docker_event "ALERT" "$usage"
                  echo "ALERT: Docker storage $usage% full on $HOSTNAME" | \
                    logger -t docker-monitor -p local0.notice
              fi
          }

          # Check dlx-docker mount (proxmox-01)
          if [ -d "$DOCKER_MOUNT" ]; then
              check_docker_mount

              # Also check Docker disk usage
              docker system df >> "$LOGFILE" 2>&1
          fi
        dest: /usr/local/bin/storage-monitoring/check-docker.sh
        mode: "0755"
      become: true

    # Installs check-containers.sh, which appends an audit of LXC root disk
    # allocations and a list of QEMU guests to /var/log/container-monitor.log.
    - name: Create container allocation tracking script
      copy:
        content: |
          #!/bin/bash
          # Track LXC/KVM guest disk allocations
          # Lists LXC containers whose rootfs allocation is larger than 50GB
          # and QEMU guests whose config mentions scsi

          # cron starts jobs with a minimal PATH that may omit /usr/sbin,
          # where pct and qm are installed; their errors are discarded below,
          # so a missing binary would otherwise go unnoticed.
          PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
          export PATH

          LOGFILE="/var/log/container-monitor.log"
          TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

          # size_to_gb SIZE
          # Convert a size such as 512M, 8G or 2T to whole gigabytes
          # (truncated). A value without a unit suffix is treated as bytes.
          size_to_gb() {
              awk -v s="$1" 'BEGIN {
                  n = s + 0
                  u = substr(s, length(s))
                  if (u == "T") n *= 1024
                  else if (u == "M") n /= 1024
                  else if (u == "K") n /= 1048576
                  else if (u != "G") n /= 1073741824
                  printf "%d", n
              }'
          }

          echo "[$TIMESTAMP] Container allocation audit:" >> "$LOGFILE"

          # pct list columns are VMID, Status, Lock, Name. Lock is usually
          # empty, so the name is taken from the last field, not a fixed one.
          pct list 2>/dev/null | awk 'NR > 1 { print $1, $2, $NF }' | \
          while read -r vmid status name; do
              # Size attribute of the rootfs line, e.g. "size=8G".
              size=$(pct config "$vmid" 2>/dev/null | \
                     sed -n 's/^rootfs:.*size=\([0-9][0-9.]*[KMGT]\{0,1\}\).*/\1/p' | \
                     head -n 1)

              # No rootfs size found (or pct config failed): nothing to report.
              [ -n "$size" ] || continue

              maxdisk=$(size_to_gb "$size")
              if [ "$maxdisk" -gt 50 ]; then
                  echo "  [$status] $vmid ($name): ${maxdisk}GB allocated" >> "$LOGFILE"
              fi
          done

          # Also check KVM/QEMU VMs. qm list columns start with
          # VMID, NAME, STATUS.
          qm list 2>/dev/null | tail -n +2 | while read -r vmid name status _; do
              # NOTE(review): this counts every config line containing
              # "scsi", which includes the scsihw controller setting and not
              # only disks. Confirm whether that is the intended filter.
              scsi_lines=$(qm config "$vmid" 2>/dev/null | grep -ci scsi)
              if [ "$scsi_lines" -gt 0 ]; then
                  echo "  [$status] QEMU:$vmid ($name)" >> "$LOGFILE"
              fi
          done
        dest: /usr/local/bin/storage-monitoring/check-containers.sh
        mode: "0755"
      become: true

    # Registers the three check scripts in root's crontab. Each job appends
    # its stdout and stderr to /var/log/storage-cron.log.
    - name: Install monitoring cron jobs
      become: true
      cron:
        user: root
        name: "{{ item.name }}"
        minute: "{{ item.minute }}"
        hour: "{{ item.hour }}"
        job: "{{ item.job }} >> /var/log/storage-cron.log 2>&1"
      loop:
        # Every 5 minutes
        - name: "Storage capacity check"
          job: "/usr/local/bin/storage-monitoring/check-capacity.sh"
          minute: "*/5"
          hour: "*"
        # Every 10 minutes
        - name: "Docker storage check"
          job: "/usr/local/bin/storage-monitoring/check-docker.sh"
          minute: "*/10"
          hour: "*"
        # At minute 0 of every fourth hour
        - name: "Container allocation audit"
          job: "/usr/local/bin/storage-monitoring/check-containers.sh"
          minute: "0"
          hour: "*/4"

    # Rotates the four monitoring logs daily and keeps 14 compressed copies.
    - name: Configure logrotate for monitoring logs
      copy:
        content: |
          /var/log/storage-monitor.log
          /var/log/docker-monitor.log
          /var/log/container-monitor.log
          /var/log/storage-cron.log {
              daily
              rotate 14
              compress
              missingok
              notifempty
              create 0640 root root
          }
        dest: /etc/logrotate.d/storage-monitoring
        # Explicit ownership and mode: logrotate ignores configuration files
        # that are writable by group or others, so do not rely on the umask.
        owner: root
        group: root
        mode: "0644"
      become: true

    # Installs cluster-status.sh, an on-demand report that connects over SSH
    # to each listed host as dlxadmin and prints df figures.
    - name: Create storage monitoring summary script
      copy:
        content: |
          #!/bin/bash
          # Summarize storage status across cluster
          # Run this for quick dashboard view

          echo "╔════════════════════════════════════════════════════════════╗"
          echo "║         PROXMOX CLUSTER STORAGE STATUS                     ║"
          echo "╚════════════════════════════════════════════════════════════╝"
          echo ""

          for host in proxmox-00 proxmox-01 proxmox-02; do
              echo "[$host]"

              # Use the inventory's ansible_host when it can be resolved here.
              # "// empty" makes jq print nothing for a missing or null key,
              # and jq also prints nothing when ansible-inventory is absent,
              # so an empty result reliably means "fall back to the name".
              target=$(ansible-inventory --host "$host" 2>/dev/null | \
                       jq -r '.ansible_host // empty' 2>/dev/null)
              if [ -z "$target" ]; then
                  target=$host
              fi

              ssh -o ConnectTimeout=5 "dlxadmin@$target" \
                  "df -h / | tail -1 | awk '{printf \"  Root: %s (used: %s)\\n\", \$5, \$3}'; \
                   [ -d /mnt/pve/dlx-docker ] && df -h /mnt/pve/dlx-docker | tail -1 | awk '{printf \"  Docker: %s (used: %s)\\n\", \$5, \$3}'; \
                   df -h /mnt/pve/* 2>/dev/null | tail -n +2 | awk '{printf \"  %s: %s (used: %s)\\n\", \$NF, \$5, \$3}'" 2>/dev/null || \
              echo "  [unreachable]"
              echo ""
          done

          echo "Monitoring logs:"
          echo "  tail -f /var/log/storage-monitor.log"
          echo "  tail -f /var/log/docker-monitor.log"
          echo "  tail -f /var/log/container-monitor.log"
        dest: /usr/local/bin/storage-monitoring/cluster-status.sh
        mode: "0755"
      become: true

    # Prints a recap of the scripts, cron schedule, thresholds and log files
    # set up by this play. The text is static: it contains no template
    # expressions, so it is not derived from the results of earlier tasks.
    - name: Display monitoring setup summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║         STORAGE MONITORING CONFIGURED                        ║
          ╚══════════════════════════════════════════════════════════════╝

          Monitoring scripts installed:
          ✓ /usr/local/bin/storage-monitoring/check-capacity.sh
          ✓ /usr/local/bin/storage-monitoring/check-docker.sh
          ✓ /usr/local/bin/storage-monitoring/check-containers.sh
          ✓ /usr/local/bin/storage-monitoring/cluster-status.sh

          Cron Jobs Configured:
          ✓ Every 5 min: Filesystem capacity checks
          ✓ Every 10 min: Docker storage checks
          ✓ Every 4 hours: Container allocation audit

          Alert Thresholds:
          ⚠️  75%: ALERT (notice level)
          ⚠️  85%: WARNING (warning level)
          🔴 95%: CRITICAL (critical level)

          Log Files:
          • /var/log/storage-monitor.log
          • /var/log/docker-monitor.log
          • /var/log/container-monitor.log
          • /var/log/storage-cron.log (cron execution log)

          Quick Status Commands:
          $ /usr/local/bin/storage-monitoring/cluster-status.sh
          $ tail -f /var/log/storage-monitor.log
          $ grep CRITICAL /var/log/storage-monitor.log

          System Integration:
          - Logs sent to syslog (logger -t storage-monitor)
          - Searchable with: journalctl -t storage-monitor
          - Can integrate with rsyslog for forwarding
          - Can integrate with monitoring tools (Prometheus, Grafana)

# Optional play: Prometheus text-format export of df figures.
# It is deliberately not preceded by a "---" marker: ansible-playbook loads a
# playbook as one YAML document and rejects a stream with several documents.

- name: "Create Prometheus metrics export (optional)"
  hosts: proxmox
  gather_facts: true
  tasks:
    # Installs prometheus-metrics.sh, which prints metrics on stdout.
    - name: Create Prometheus metrics script
      copy:
        content: |
          #!/bin/bash
          # Export storage metrics in Prometheus text format on stdout.
          # For node_exporter, write the output to a *.prom file in the
          # directory passed to --collector.textfile.directory.

          # One df snapshot feeds both metric families so their values agree.
          # -P keeps every filesystem on a single line with six columns:
          # filesystem, total, used, available, capacity, mount point.
          SNAPSHOT=$(df -P -B1 | tail -n +2)

          # skip_mount MOUNT
          # Succeed for pseudo and boot filesystems that are not exported.
          skip_mount() {
              [[ "$1" =~ ^/(dev|proc|sys|run|boot)(/|$) ]]
          }

          # All samples of one metric are printed together, directly after
          # its HELP and TYPE lines, as the text format expects.
          echo '# HELP pve_storage_capacity_bytes Storage capacity in bytes'
          echo '# TYPE pve_storage_capacity_bytes gauge'
          while read -r fs total used available percent mount; do
              [ -n "$mount" ] || continue
              skip_mount "$mount" && continue

              echo "pve_storage_capacity_bytes{mount=\"$mount\",type=\"total\"} $total"
              echo "pve_storage_capacity_bytes{mount=\"$mount\",type=\"used\"} $used"
              echo "pve_storage_capacity_bytes{mount=\"$mount\",type=\"available\"} $available"
          done <<< "$SNAPSHOT"

          echo '# HELP pve_storage_percent Storage usage in percent'
          echo '# TYPE pve_storage_percent gauge'
          while read -r fs total used available percent mount; do
              [ -n "$mount" ] || continue
              skip_mount "$mount" && continue

              # df prints "-" instead of a percentage for some filesystems;
              # those would be invalid sample values, so leave them out.
              value=${percent//%/}
              [[ "$value" =~ ^[0-9]+$ ]] || continue

              echo "pve_storage_percent{mount=\"$mount\"} $value"
          done <<< "$SNAPSHOT"
        dest: /usr/local/bin/storage-monitoring/prometheus-metrics.sh
        mode: "0755"
      become: true

    # Static usage note for the metrics script. Step 1 describes writing the
    # script's output to a .prom file, because the node_exporter textfile
    # collector reads metric files rather than executing scripts.
    - name: Display Prometheus integration note
      debug:
        msg: |
          Prometheus Integration Available:
          $ /usr/local/bin/storage-monitoring/prometheus-metrics.sh

          To integrate with node_exporter:
          1. Write the script output to a .prom file in the node_exporter
             textfile directory (--collector.textfile.directory)
          2. Add collector to Prometheus scrape config
          3. Create dashboards in Grafana

          Example Prometheus queries:
          - Storage usage: pve_storage_capacity_bytes{type="used"}
          - Available space: pve_storage_capacity_bytes{type="available"}
          - Percentage: pve_storage_percent

# Final play: prints a static recap on the control node only.
# It is deliberately not preceded by a "---" marker: ansible-playbook loads a
# playbook as one YAML document and rejects a stream with several documents.

- name: "Display final configuration summary"
  hosts: localhost
  # The message below uses no host facts, so fact gathering stays off.
  gather_facts: false
  tasks:
    - name: Summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║     STORAGE MONITORING & REMEDIATION COMPLETE                ║
          ╚══════════════════════════════════════════════════════════════╝

          Playbooks Created:
          1. remediate-storage-critical-issues.yml
             - Cleans logs on proxmox-00
             - Prunes Docker on proxmox-01
             - Audits SonarQube usage

          2. remediate-docker-storage.yml
             - Detailed Docker cleanup
             - Removes dangling resources
             - Sets up automated weekly prune

          3. remediate-stopped-containers.yml
             - Safely removes unused containers
             - Creates config backups
             - Recoverable deletions

          4. configure-storage-monitoring.yml
             - Continuous capacity monitoring
             - Alert thresholds (75/85/95%)
             - Prometheus integration

          To Execute All Remediations:
          $ ansible-playbook playbooks/remediate-storage-critical-issues.yml
          $ ansible-playbook playbooks/remediate-docker-storage.yml
          $ ansible-playbook playbooks/configure-storage-monitoring.yml

          To Check Monitoring Status:
          SSH to any Proxmox host and run:
          $ tail -f /var/log/storage-monitor.log
          $ /usr/local/bin/storage-monitoring/cluster-status.sh

          Next Steps:
          1. Review and test playbooks with --check
          2. Run on one host first (proxmox-00)
          3. Monitor for 48 hours for stability
          4. Extend to other hosts once verified
          5. Schedule regular execution (weekly)

          Expected Results:
          - proxmox-00 root: 84.5% → 70%
          - proxmox-01 docker: 81.1% → 70%
          - Freed space: 500+ GB
          - Monitoring active and alerting