# dlx-ansible/playbooks/remediate-storage-critical-issues.yml

---
# Remediation playbooks for critical storage issues identified in STORAGE-AUDIT.md
# This playbook addresses:
# 1. proxmox-00 root filesystem at 84.5% capacity
# 2. proxmox-01 dlx-docker at 81.1% capacity
# 3. SonarQube at 82% of allocated space

# CRITICAL: Test in non-production first
# Run with --check for dry-run

# Play: free space on the proxmox-00 root filesystem.
# Reports `df` for / before and after, vacuums the systemd journal, deletes
# rotated/compressed logs older than `log_threshold_days`, cleans the apt
# cache and removes temp files not accessed for 30 days.
- name: "Remediate proxmox-00 root filesystem (CRITICAL: 84.5% full)"
  hosts: proxmox-00
  gather_facts: true
  # Tag used by the "--tags cleanup_root_fs" command shown in the follow-up
  # instructions printed by the summary play of this playbook.
  tags:
    - cleanup_root_fs
  vars:
    cleanup_journal_days: 30
    cleanup_apt_cache: true
    cleanup_temp_files: true
    log_threshold_days: 90
  tasks:
    - name: Get filesystem usage before cleanup
      shell: df -h / | tail -1
      register: fs_before
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display filesystem usage before
      debug:
        msg: "Before cleanup: {{ fs_before.stdout }}"

    # `--vacuum-time=<N>d` is the journalctl option that drops archived
    # journal files older than N days (the former `--vacuum=time:` spelling
    # is not a journalctl option).
    # `cleanup_journal_cache` is not declared in `vars`; it defaults to true
    # and can be overridden from inventory or with -e.
    - name: Compress old journal logs
      shell: journalctl --vacuum-time={{ cleanup_journal_days }}d
      become: true
      register: journal_cleanup
      when: cleanup_journal_cache | default(true)

    # Shows the stderr captured from the vacuum command above.
    - name: Display journal cleanup result
      debug:
        msg: "{{ journal_cleanup.stderr }}"
      when: journal_cleanup.changed

    # Deletes rotated (*.log.*) and compressed (*.gz) files under /var/log
    # whose mtime is older than `log_threshold_days`.
    - name: Clean old syslog files
      shell: |
        find /var/log -name "*.log.*" -type f -mtime +{{ log_threshold_days }} -delete
        find /var/log -name "*.gz" -type f -mtime +{{ log_threshold_days }} -delete
      become: true
      register: log_cleanup

    - name: Clean apt cache if enabled
      shell: apt-get clean && apt-get autoclean
      become: true
      register: apt_cleanup
      when: cleanup_apt_cache

    # Best effort: errors from find are discarded and never fail the task.
    - name: Clean tmp directories
      shell: |
        find /tmp -type f -atime +30 -delete 2>/dev/null || true
        find /var/tmp -type f -atime +30 -delete 2>/dev/null || true
      become: true
      register: tmp_cleanup
      when: cleanup_temp_files

    - name: Find large files in /var/log
      shell: find /var/log -type f -size +100M
      register: large_logs
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display large log files
      debug:
        msg: "Large files in /var/log (>100MB): {{ large_logs.stdout_lines }}"
      when: large_logs.stdout

    - name: Get filesystem usage after cleanup
      shell: df -h / | tail -1
      register: fs_after
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display filesystem usage after
      debug:
        msg: "After cleanup: {{ fs_after.stdout }}"

    # Static summary of the configured settings; it does not compute the
    # amount of space actually freed.
    - name: Calculate freed space
      debug:
        msg: |
          Cleanup Summary:
          - Journal logs compressed: {{ cleanup_journal_days }} days retained
          - Old syslog files removed: {{ log_threshold_days }}+ days
          - Apt cache cleaned: {{ cleanup_apt_cache }}
          - Temp files cleaned: {{ cleanup_temp_files }}
          NOTE: Re-run 'df -h /' on proxmox-00 to verify space was freed

    # Printed unconditionally, regardless of the usage measured above.
    - name: Set alert for continued monitoring
      debug:
        msg: |
          ⚠️  ALERT: Root filesystem still approaching capacity
          Next steps if space still insufficient:
          1. Move /var to separate partition
          2. Archive/compress old log files to NFS
          3. Review application logs for rotation config
          4. Consider expanding root partition

# Play: reclaim Docker storage on proxmox-01 (dlx-docker).
# Kept in the same YAML document as the other plays: a playbook file must be
# a single document, so no `---` separator is placed between plays.
# Every Docker step is skipped when /usr/bin/docker does not exist.

- name: "Remediate proxmox-01 dlx-docker high utilization (81.1% full)"
  hosts: proxmox-01
  gather_facts: true
  # Tag used by the "--tags cleanup_docker" command shown in the follow-up
  # instructions printed by the summary play of this playbook.
  tags:
    - cleanup_docker
  tasks:
    - name: Check if Docker is installed
      stat:
        path: /usr/bin/docker
      register: docker_installed

    - name: Get Docker storage usage before cleanup
      shell: docker system df
      # Same privilege level as the prune tasks below.
      become: true
      register: docker_before
      when: docker_installed.stat.exists
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display Docker usage before
      debug:
        msg: "{{ docker_before.stdout }}"
      when: docker_installed.stat.exists

    - name: Remove unused Docker images
      shell: docker image prune -f
      become: true
      register: image_prune
      when: docker_installed.stat.exists

    - name: Display pruned images
      debug:
        msg: "{{ image_prune.stdout }}"
      when: docker_installed.stat.exists and image_prune.changed

    # NOTE(review): this removes volumes Docker considers unused; confirm that
    # none of them hold data that is still needed before running.
    - name: Remove unused Docker volumes
      shell: docker volume prune -f
      become: true
      register: volume_prune
      when: docker_installed.stat.exists

    - name: Display pruned volumes
      debug:
        msg: "{{ volume_prune.stdout }}"
      when: docker_installed.stat.exists and volume_prune.changed

    # `-a` removes all unused build cache, not only dangling entries.
    - name: Remove dangling build cache
      shell: docker builder prune -f -a
      become: true
      register: cache_prune
      when: docker_installed.stat.exists
      failed_when: false  # Older Docker versions may not support this

    - name: Get Docker storage usage after cleanup
      shell: docker system df
      become: true
      register: docker_after
      when: docker_installed.stat.exists
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display Docker usage after
      debug:
        msg: "{{ docker_after.stdout }}"
      when: docker_installed.stat.exists

    # Reports `df` for the mount point and the ten largest top-level entries
    # below it; this runs whether or not Docker is installed.
    - name: List Docker containers on dlx-docker storage
      shell: |
        df /mnt/pve/dlx-docker
        echo "---"
        du -sh /mnt/pve/dlx-docker/* 2>/dev/null | sort -hr | head -10
      become: true
      register: storage_usage
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display storage breakdown
      debug:
        msg: "{{ storage_usage.stdout }}"

    # Printed unconditionally, regardless of the usage measured above.
    - name: Alert for manual review
      debug:
        msg: |
          ⚠️  ALERT: dlx-docker still at high capacity
          Manual steps to consider:
          1. Check running containers: docker ps -a
          2. Inspect container logs: docker logs <container-id> | wc -l
          3. Review log rotation config: docker inspect <container-id>
          4. Consider migrating containers to dlx-nfs-* storage
          5. Archive old analysis/build artifacts

# Play: report-only audit of SonarQube disk usage; nothing is modified.
# Kept in the same YAML document as the other plays: a playbook file must be
# a single document, so no `---` separator is placed between plays.

- name: "Audit and report SonarQube disk usage (354 GB)"
  hosts: proxmox-00
  gather_facts: true
  tasks:
    # Greps `pct list` for "sonar"; the `|| echo` fallback keeps the task
    # from failing when nothing matches.
    - name: Check SonarQube container exists
      shell: pct list | grep -i sonar || echo "sonar not found on this host"
      # Same privilege level as the other privileged commands in this playbook.
      become: true
      register: sonar_check
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    - name: Display SonarQube status
      debug:
        msg: "{{ sonar_check.stdout }}"

    # Static note: the figures below are hard-coded, not measured by this play.
    - name: Check if dlx-sonar container is on proxmox-01
      debug:
        msg: |
          NOTE: dlx-sonar (VMID 202) is running on proxmox-01
          Current disk allocation: 422 GB
          Current disk usage: 354 GB (82%)

          This is expected for SonarQube with large code analysis databases.

          Remediation options:
          1. Archive old analysis: sonar-scanner with delete API
          2. Configure data retention in SonarQube settings
          3. Move to dedicated storage pool (dlx-nfs-sdb-02)
          4. Increase disk allocation if needed
          5. Run cleanup task: DELETE /api/ce/activity?createdBefore=<date>

# Play: report-only audit of stopped LXC containers; nothing is removed.
# Kept in the same YAML document as the other plays: a playbook file must be
# a single document, so no `---` separator is placed between plays.

- name: "Audit stopped containers for cleanup decisions"
  hosts: proxmox-00
  gather_facts: true
  tasks:
    # `pct list` prints the columns VMID, Status, Lock, Name, so the status is
    # field 2. The Lock column is usually empty, which shifts the name between
    # fields 3 and 4; $NF always picks the last field (the name).
    - name: List all stopped LXC containers
      shell: pct list | awk 'NR>1 && $2=="stopped" {print $1, $NF}'
      # Same privilege level as the other privileged commands in this playbook.
      become: true
      register: stopped_containers
      changed_when: false
      # Read-only probe: run it even under --check so the report has data.
      check_mode: false

    # Only the first lines come from the live query; the container list and
    # decision matrix that follow are hard-coded text.
    - name: Display stopped containers
      debug:
        msg: |
          Stopped containers found:
          {{ stopped_containers.stdout }}

          These containers are allocated but not running:
          - dlx-wireguard (105): 32 GB - VPN service
          - dlx-mysql-02 (108): 200 GB - Database replica
          - dlx-mattermost (107): 32 GB - Chat platform
          - dlx-mysql-03 (109): 200 GB - Database replica
          - dlx-nocodb (116): 100 GB - No-code database

          Total allocated: ~564 GB

          Decision Matrix:
          ┌─────────────────┬───────────┬──────────────────────────────┐
          │ Container       │ Allocated │ Recommendation               │
          ├─────────────────┼───────────┼──────────────────────────────┤
          │ dlx-wireguard   │ 32 GB     │ REMOVE if not in active use  │
          │ dlx-mysql-*     │ 400 GB    │ REMOVE if using dlx-mysql-01 │
          │ dlx-mattermost  │ 32 GB     │ REMOVE if using Slack/Teams  │
          │ dlx-nocodb      │ 100 GB    │ REMOVE if not in active use  │
          └─────────────────┴───────────┴──────────────────────────────┘

    # Prints guidance only; no container is destroyed by this play.
    - name: Create removal recommendations
      debug:
        msg: |
          To safely remove stopped containers:

          1. VERIFY PURPOSE: Document why each was created
          2. CHECK BACKUPS: Ensure data is backed up elsewhere
          3. EXPORT CONFIG: pct config VMID > backup.conf
          4. DELETE: pct destroy VMID --force

          Example safe removal script:
          ---
          # Backup container config before deletion
          pct config 105 > /tmp/dlx-wireguard-backup.conf
          pct destroy 105 --force

          # This frees 32 GB immediately
          ---

# Play: print a static summary on the control node and write a tracking
# checklist to /tmp/storage-remediation-tracking.txt.
# Kept in the same YAML document as the other plays: a playbook file must be
# a single document, so no `---` separator is placed between plays.

- name: "Storage remediation summary and next steps"
  hosts: localhost
  gather_facts: false
  tasks:
    # Static text: it lists the actions the playbook is designed to perform,
    # not the tasks that actually ran or succeeded.
    - name: Display remediation summary
      debug:
        msg: |
          ╔════════════════════════════════════════════════════════════════╗
          ║        STORAGE REMEDIATION PLAYBOOK EXECUTION SUMMARY          ║
          ╚════════════════════════════════════════════════════════════════╝

          ✓ COMPLETED ACTIONS:
          1. Compressed journal logs on proxmox-00
          2. Cleaned old syslog files (>90 days)
          3. Cleaned apt cache
          4. Cleaned temp directories (/tmp, /var/tmp)
          5. Pruned Docker images, volumes, and cache
          6. Analyzed container storage usage
          7. Generated SonarQube audit report
          8. Identified stopped containers for cleanup

          ⚠️  IMMEDIATE ACTIONS REQUIRED:
          1. [ ] SSH to proxmox-00 and verify root FS space freed
             Command: df -h /
          2. [ ] Review stopped containers and decide keep/remove
          3. [ ] Monitor dlx-docker on proxmox-01 (currently 81% full)
          4. [ ] Schedule SonarQube data cleanup if needed

          📊 CAPACITY TARGETS:
          - proxmox-00 root: Target <70% (currently 84%)
          - proxmox-01 dlx-docker: Target <75% (currently 81%)
          - SonarQube: Keep <75% if possible

          🔄 AUTOMATION RECOMMENDATIONS:
          1. Create logrotate config for persistent log management
          2. Schedule weekly: docker system prune -f
          3. Schedule monthly: journalctl --vacuum-time=60d
          4. Set up monitoring alerts at 75%, 85%, 95% capacity

          📝 NEXT AUDIT:
          Schedule: 2026-03-08 (30 days)
          Update: /docs/STORAGE-AUDIT.md with new metrics

    # The timestamp comes from Jinja's now() because this play does not gather
    # facts, so `ansible_date_time` would be undefined here.
    - name: Create remediation tracking file
      copy:
        content: |
          # Storage Remediation Tracking
          Generated: {{ now(utc=true).strftime('%Y-%m-%dT%H:%M:%SZ') }}

          ## Issues Addressed
          - [ ] proxmox-00 root filesystem cleanup
          - [ ] proxmox-01 dlx-docker cleanup
          - [ ] SonarQube audit completed
          - [ ] Stopped containers reviewed

          ## Manual Verification Required
          - [ ] SSH to proxmox-00: df -h /
          - [ ] SSH to proxmox-01: docker system df
          - [ ] Review stopped container logs
          - [ ] Decide on stopped container removal

          ## Follow-up Tasks
          - [ ] Create logrotate policies
          - [ ] Set up monitoring/alerting
          - [ ] Schedule periodic cleanup runs
          - [ ] Document storage policies

          ## Completed Dates

        dest: "/tmp/storage-remediation-tracking.txt"
        mode: "0644"
      delegate_to: localhost
      run_once: true

    # Literal block scalars do no escape processing, so a single trailing
    # backslash is what yields a valid shell line continuation when copied.
    - name: Display follow-up instructions
      debug:
        msg: |
          Next Step: Run targeted remediation

          To clean up individual issues:

          1. Clean proxmox-00 root filesystem ONLY:
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               --tags cleanup_root_fs -l proxmox-00

          2. Clean proxmox-01 Docker storage ONLY:
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               --tags cleanup_docker -l proxmox-01

          3. Dry-run (check mode):
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               --check

          4. Run with verbose output:
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               -vvv