dlx-ansible/playbooks/remediate-storage-critical-...

369 lines
13 KiB
YAML

---
# Remediation playbooks for critical storage issues identified in STORAGE-AUDIT.md
# This playbook addresses:
# 1. proxmox-00 root filesystem at 84.5% capacity
# 2. proxmox-01 dlx-docker at 81.1% capacity
# 3. SonarQube at 82% of allocated space
# CRITICAL: Test in non-production first
# Run with --check for dry-run
# Play: reclaim space on the proxmox-00 root filesystem.
#
# Vacuums the systemd journal, deletes rotated logs older than
# log_threshold_days, optionally cleans the apt cache and stale temp files,
# then reports root filesystem usage before and after.
#
# Vars (override with -e):
#   cleanup_journal_cache    run the journal vacuum step
#   cleanup_journal_days     journal retention passed to --vacuum-time
#   cleanup_apt_cache        run apt-get clean/autoclean
#   cleanup_temp_files       delete files in /tmp and /var/tmp not accessed
#                            for 30+ days
#   log_threshold_days       age (mtime) after which rotated logs are deleted
#   root_fs_alert_threshold  usage percentage at or above which the follow-up
#                            alert is printed
#
# Tagged cleanup_root_fs so the play can be selected with
# `--tags cleanup_root_fs`.
- name: "Remediate proxmox-00 root filesystem (CRITICAL: 84.5% full)"
  hosts: proxmox-00
  gather_facts: true
  tags:
    - cleanup_root_fs
  vars:
    cleanup_journal_cache: true
    cleanup_journal_days: 30
    cleanup_apt_cache: true
    cleanup_temp_files: true
    log_threshold_days: 90
    root_fs_alert_threshold: 70
  tasks:
    # Read-only probes use check_mode: false so they still run under --check;
    # otherwise the registered result has no stdout and the debug tasks that
    # print it fail.
    - name: Get filesystem usage before cleanup
      shell: df -h / | tail -1
      register: fs_before
      changed_when: false
      check_mode: false

    - name: Display filesystem usage before
      debug:
        msg: "Before cleanup: {{ fs_before.stdout }}"

    # --vacuum-time removes archived journal files older than the given age.
    - name: Compress old journal logs
      command: journalctl --vacuum-time={{ cleanup_journal_days }}d
      become: true
      register: journal_cleanup
      when: cleanup_journal_cache | bool

    # journalctl writes its vacuum report to stderr.
    - name: Display journal cleanup result
      debug:
        msg: "{{ journal_cleanup.stderr }}"
      when: journal_cleanup is changed

    # Matches rotated logs (*.log.N) and compressed archives (*.gz) only;
    # live *.log files are left alone.
    - name: Clean old syslog files
      shell: |
        find /var/log -name "*.log.*" -type f -mtime +{{ log_threshold_days }} -delete
        find /var/log -name "*.gz" -type f -mtime +{{ log_threshold_days }} -delete
      become: true
      register: log_cleanup

    - name: Clean apt cache if enabled
      shell: apt-get clean && apt-get autoclean
      become: true
      register: apt_cleanup
      when: cleanup_apt_cache | bool

    # Best effort: errors from find are discarded and never fail the task.
    - name: Clean tmp directories
      shell: |
        find /tmp -type f -atime +30 -delete 2>/dev/null || true
        find /var/tmp -type f -atime +30 -delete 2>/dev/null || true
      become: true
      register: tmp_cleanup
      when: cleanup_temp_files | bool

    # Report only; nothing is deleted here. Runs privileged so unreadable
    # subdirectories do not make find exit non-zero.
    - name: Find large files in /var/log
      shell: find /var/log -type f -size +100M
      become: true
      register: large_logs
      changed_when: false
      check_mode: false

    - name: Display large log files
      debug:
        msg: "Large files in /var/log (>100MB): {{ large_logs.stdout_lines }}"
      when: large_logs.stdout | length > 0

    - name: Get filesystem usage after cleanup
      shell: df -h / | tail -1
      register: fs_after
      changed_when: false
      check_mode: false

    - name: Display filesystem usage after
      debug:
        msg: "After cleanup: {{ fs_after.stdout }}"

    - name: Calculate freed space
      debug:
        msg: |
          Cleanup Summary:
          - Journal logs compressed: {{ cleanup_journal_days }} days retained
          - Old syslog files removed: {{ log_threshold_days }}+ days
          - Apt cache cleaned: {{ cleanup_apt_cache }}
          - Temp files cleaned: {{ cleanup_temp_files }}
          NOTE: Re-run 'df -h /' on proxmox-00 to verify space was freed

    # Numeric usage (digits only) so the alert below fires only when the
    # filesystem is actually still above the threshold.
    - name: Get root filesystem usage percentage after cleanup
      shell: df --output=pcent / | tail -1 | tr -dc '0-9'
      register: fs_pcent_after
      changed_when: false
      check_mode: false

    - name: Set alert for continued monitoring
      debug:
        msg: |
          ⚠️ ALERT: Root filesystem still approaching capacity
          Next steps if space still insufficient:
          1. Move /var to separate partition
          2. Archive/compress old log files to NFS
          3. Review application logs for rotation config
          4. Consider expanding root partition
      when: (fs_pcent_after.stdout | int) >= (root_fs_alert_threshold | int)

# Play: reclaim Docker storage on proxmox-01 (dlx-docker).
#
# Prunes unused images, unused volumes and build cache when /usr/bin/docker
# exists, then reports `docker system df` before/after and the largest
# entries under /mnt/pve/dlx-docker.
#
# WARNING: `docker volume prune` permanently deletes volumes that no container
# references. Confirm nothing of value lives in unattached volumes first.
#
# Tagged cleanup_docker so the play can be selected with
# `--tags cleanup_docker`.
- name: "Remediate proxmox-01 dlx-docker high utilization (81.1% full)"
  hosts: proxmox-01
  gather_facts: true
  tags:
    - cleanup_docker
  tasks:
    - name: Check if Docker is installed
      stat:
        path: /usr/bin/docker
      register: docker_installed

    # Read-only probes use check_mode: false so they still run under --check;
    # otherwise the registered result has no stdout and the debug tasks fail.
    - name: Get Docker storage usage before cleanup
      shell: docker system df
      become: true
      register: docker_before
      when: docker_installed.stat.exists
      changed_when: false
      check_mode: false

    - name: Display Docker usage before
      debug:
        msg: "{{ docker_before.stdout }}"
      when: docker_installed.stat.exists

    - name: Remove unused Docker images
      shell: docker image prune -f
      become: true
      register: image_prune
      when: docker_installed.stat.exists

    - name: Display pruned images
      debug:
        msg: "{{ image_prune.stdout }}"
      when: docker_installed.stat.exists and image_prune is changed

    - name: Remove unused Docker volumes
      shell: docker volume prune -f
      become: true
      register: volume_prune
      when: docker_installed.stat.exists

    - name: Display pruned volumes
      debug:
        msg: "{{ volume_prune.stdout }}"
      when: docker_installed.stat.exists and volume_prune is changed

    # NOTE: -a removes all unused build cache, not only dangling entries.
    - name: Remove dangling build cache
      shell: docker builder prune -f -a
      become: true
      register: cache_prune
      when: docker_installed.stat.exists
      failed_when: false  # Older Docker versions may not support this

    - name: Get Docker storage usage after cleanup
      shell: docker system df
      become: true
      register: docker_after
      when: docker_installed.stat.exists
      changed_when: false
      check_mode: false

    - name: Display Docker usage after
      debug:
        msg: "{{ docker_after.stdout }}"
      when: docker_installed.stat.exists

    # Runs regardless of whether Docker is installed: prints df for the mount
    # followed by its ten largest top-level entries.
    - name: List Docker containers on dlx-docker storage
      shell: |
        df /mnt/pve/dlx-docker
        echo "---"
        du -sh /mnt/pve/dlx-docker/* 2>/dev/null | sort -hr | head -10
      become: true
      register: storage_usage
      changed_when: false
      check_mode: false

    - name: Display storage breakdown
      debug:
        msg: "{{ storage_usage.stdout }}"

    # Printed unconditionally; it does not check the post-cleanup usage.
    - name: Alert for manual review
      debug:
        msg: |
          ⚠️ ALERT: dlx-docker still at high capacity
          Manual steps to consider:
          1. Check running containers: docker ps -a
          2. Inspect container logs: docker logs <container-id> | wc -l
          3. Review log rotation config: docker inspect <container-id>
          4. Consider migrating containers to dlx-nfs-* storage
          5. Archive old analysis/build artifacts

# Play: report-only audit of SonarQube disk usage.
#
# Greps `pct list` on proxmox-00 for a container whose line contains "sonar"
# and prints a static note with remediation options. Nothing is modified.
# NOTE(review): the note below says dlx-sonar runs on proxmox-01 while this
# play targets proxmox-00 - confirm which host is intended.
- name: "Audit and report SonarQube disk usage (354 GB)"
  hosts: proxmox-00
  gather_facts: true
  tasks:
    # The `|| echo` fallback keeps the task from failing when grep finds
    # nothing. check_mode: false lets this read-only probe run under --check
    # so sonar_check.stdout is always defined for the next task.
    - name: Check SonarQube container exists
      shell: pct list | grep -i sonar || echo "sonar not found on this host"
      register: sonar_check
      changed_when: false
      check_mode: false

    - name: Display SonarQube status
      debug:
        msg: "{{ sonar_check.stdout }}"

    # Static text: the figures are hard-coded, not measured by this play.
    - name: Check if dlx-sonar container is on proxmox-01
      debug:
        msg: |
          NOTE: dlx-sonar (VMID 202) is running on proxmox-01
          Current disk allocation: 422 GB
          Current disk usage: 354 GB (82%)
          This is expected for SonarQube with large code analysis databases.
          Remediation options:
          1. Archive old analysis: sonar-scanner with delete API
          2. Configure data retention in SonarQube settings
          3. Move to dedicated storage pool (dlx-nfs-sdb-02)
          4. Increase disk allocation if needed
          5. Run cleanup task: DELETE /api/ce/activity?createdBefore=<date>

# Play: report-only audit of stopped LXC containers on proxmox-00.
#
# Lists the VMID and name of every container that `pct list` reports as
# stopped, then prints a static decision matrix and removal guidance.
# Nothing is modified or destroyed by this play.
- name: "Audit stopped containers for cleanup decisions"
  hosts: proxmox-00
  gather_facts: true
  tasks:
    # `pct list` columns are: VMID Status Lock Name. Status is field 2; the
    # Lock column is usually empty, so the name is taken as the last field.
    # check_mode: false lets this read-only probe run under --check.
    - name: List all stopped LXC containers
      shell: pct list | awk 'NR>1 && $2=="stopped" {print $1, $NF}'
      register: stopped_containers
      changed_when: false
      check_mode: false

    # Only the first section is live data; the container list, sizes and the
    # matrix below it are hard-coded text.
    - name: Display stopped containers
      debug:
        msg: |
          Stopped containers found:
          {{ stopped_containers.stdout }}
          These containers are allocated but not running:
          - dlx-wireguard (105): 32 GB - VPN service
          - dlx-mysql-02 (108): 200 GB - Database replica
          - dlx-mattermost (107): 32 GB - Chat platform
          - dlx-mysql-03 (109): 200 GB - Database replica
          - dlx-nocodb (116): 100 GB - No-code database
          Total allocated: ~564 GB
          Decision Matrix:
          ┌─────────────────┬───────────┬──────────────────────────────┐
          │ Container       │ Allocated │ Recommendation               │
          ├─────────────────┼───────────┼──────────────────────────────┤
          │ dlx-wireguard   │ 32 GB     │ REMOVE if not in active use  │
          │ dlx-mysql-*     │ 400 GB    │ REMOVE if using dlx-mysql-01 │
          │ dlx-mattermost  │ 32 GB     │ REMOVE if using Slack/Teams  │
          │ dlx-nocodb      │ 100 GB    │ REMOVE if not in active use  │
          └─────────────────┴───────────┴──────────────────────────────┘

    # Guidance text only. The `---` lines are inside the block scalar and are
    # printed verbatim; they are not YAML document markers.
    - name: Create removal recommendations
      debug:
        msg: |
          To safely remove stopped containers:
          1. VERIFY PURPOSE: Document why each was created
          2. CHECK BACKUPS: Ensure data is backed up elsewhere
          3. EXPORT CONFIG: pct config VMID > backup.conf
          4. DELETE: pct destroy VMID --force
          Example safe removal script:
          ---
          # Backup container config before deletion
          pct config 105 > /tmp/dlx-wireguard-backup.conf
          pct destroy 105 --force
          # This frees 32 GB immediately
          ---

# Play: print the run summary and write a tracking checklist on the control
# node.
#
# Runs on localhost. The summary text is static: it lists what the preceding
# plays attempt, not what actually succeeded. Writes
# /tmp/storage-remediation-tracking.txt stamped with the current time.
#
# Minimal fact gathering is enabled because the tracking file template reads
# ansible_date_time, which is undefined when facts are not gathered.
- name: "Storage remediation summary and next steps"
  hosts: localhost
  gather_facts: true
  gather_subset:
    - min
  tasks:
    - name: Display remediation summary
      debug:
        msg: |
          ╔════════════════════════════════════════════════════════════════╗
          ║         STORAGE REMEDIATION PLAYBOOK EXECUTION SUMMARY         ║
          ╚════════════════════════════════════════════════════════════════╝
          ✓ COMPLETED ACTIONS:
          1. Compressed journal logs on proxmox-00
          2. Cleaned old syslog files (>90 days)
          3. Cleaned apt cache
          4. Cleaned temp directories (/tmp, /var/tmp)
          5. Pruned Docker images, volumes, and cache
          6. Analyzed container storage usage
          7. Generated SonarQube audit report
          8. Identified stopped containers for cleanup
          ⚠️ IMMEDIATE ACTIONS REQUIRED:
          1. [ ] SSH to proxmox-00 and verify root FS space freed
          Command: df -h /
          2. [ ] Review stopped containers and decide keep/remove
          3. [ ] Monitor dlx-docker on proxmox-01 (currently 81% full)
          4. [ ] Schedule SonarQube data cleanup if needed
          📊 CAPACITY TARGETS:
          - proxmox-00 root: Target <70% (currently 84%)
          - proxmox-01 dlx-docker: Target <75% (currently 81%)
          - SonarQube: Keep <75% if possible
          🔄 AUTOMATION RECOMMENDATIONS:
          1. Create logrotate config for persistent log management
          2. Schedule weekly: docker system prune -f
          3. Schedule monthly: journalctl --vacuum-time=60d
          4. Set up monitoring alerts at 75%, 85%, 95% capacity
          📝 NEXT AUDIT:
          Schedule: 2026-03-08 (30 days)
          Update: /docs/STORAGE-AUDIT.md with new metrics

    # Lines beginning with '#' inside `content` are Markdown headings in the
    # generated file, not YAML comments.
    - name: Create remediation tracking file
      copy:
        content: |
          # Storage Remediation Tracking
          Generated: {{ ansible_date_time.iso8601 }}
          ## Issues Addressed
          - [ ] proxmox-00 root filesystem cleanup
          - [ ] proxmox-01 dlx-docker cleanup
          - [ ] SonarQube audit completed
          - [ ] Stopped containers reviewed
          ## Manual Verification Required
          - [ ] SSH to proxmox-00: df -h /
          - [ ] SSH to proxmox-01: docker system df
          - [ ] Review stopped container logs
          - [ ] Decide on stopped container removal
          ## Follow-up Tasks
          - [ ] Create logrotate policies
          - [ ] Set up monitoring/alerting
          - [ ] Schedule periodic cleanup runs
          - [ ] Document storage policies
          ## Completed Dates
        dest: "/tmp/storage-remediation-tracking.txt"
        mode: "0644"
      delegate_to: localhost
      run_once: true

    # Backslashes are literal inside a `|` block, so a single `\` is what the
    # user needs to see for a copy-pasteable shell line continuation.
    - name: Display follow-up instructions
      debug:
        msg: |
          Next Step: Run targeted remediation
          To clean up individual issues:
          1. Clean proxmox-00 root filesystem ONLY:
          ansible-playbook playbooks/remediate-storage-critical-issues.yml \
          --tags cleanup_root_fs -l proxmox-00
          2. Clean proxmox-01 Docker storage ONLY:
          ansible-playbook playbooks/remediate-storage-critical-issues.yml \
          --tags cleanup_docker -l proxmox-01
          3. Dry-run (check mode):
          ansible-playbook playbooks/remediate-storage-critical-issues.yml \
          --check
          4. Run with verbose output:
          ansible-playbook playbooks/remediate-storage-critical-issues.yml \
          -vvv