---
# Remediation playbooks for critical storage issues identified in
# STORAGE-AUDIT.md.
#
# This playbook addresses:
#   1. proxmox-00 root filesystem at 84.5% capacity
#   2. proxmox-01 dlx-docker at 81.1% capacity
#   3. SonarQube at 82% of allocated space
#
# CRITICAL: Test in a non-production environment first.
# Run with --check for a dry run.

# Play: reclaim space on the proxmox-00 root filesystem.
#
# Vacuums the systemd journal, deletes rotated/compressed logs under /var/log
# older than `log_threshold_days`, cleans the apt cache, removes files under
# /tmp and /var/tmp not accessed for 30 days, lists remaining log files over
# 100 MB, and prints `df` output before and after.
- name: "Remediate proxmox-00 root filesystem (CRITICAL: 84.5% full)"
  hosts: proxmox-00
  gather_facts: true
  # Allows this play to be selected on its own with `--tags cleanup_root_fs`.
  tags:
    - cleanup_root_fs
  vars:
    # Journal entries newer than this many days are retained.
    cleanup_journal_days: 30
    cleanup_apt_cache: true
    cleanup_temp_files: true
    # Rotated logs older than this many days are deleted.
    log_threshold_days: 90
    # Root usage (percent) at or above which the closing alert is printed.
    root_fs_alert_threshold: 70
  tasks:
    - name: Get filesystem usage before cleanup
      shell: df -h / | tail -1
      register: fs_before
      changed_when: false
      # Read-only probe: run it under --check as well so a dry run still
      # reports real numbers instead of a skipped result.
      check_mode: false

    - name: Display filesystem usage before
      debug:
        msg: "Before cleanup: {{ fs_before.stdout }}"

    # `--vacuum-time` deletes archived journal files older than the given age
    # (it does not compress them, despite the task name).
    # `cleanup_journal_cache` is not declared in `vars`; unless it is supplied
    # from inventory or the command line, the step defaults to enabled.
    - name: Compress old journal logs
      command: journalctl --vacuum-time={{ cleanup_journal_days }}d
      become: true
      register: journal_cleanup
      when: cleanup_journal_cache | default(true) | bool

    # The original reads the vacuum report from stderr, so that is what is shown.
    - name: Display journal cleanup result
      debug:
        msg: "{{ journal_cleanup.stderr }}"
      when: journal_cleanup.changed

    - name: Clean old syslog files
      shell: |
        find /var/log -name "*.log.*" -type f -mtime +{{ log_threshold_days }} -delete
        find /var/log -name "*.gz" -type f -mtime +{{ log_threshold_days }} -delete
      become: true
      register: log_cleanup

    - name: Clean apt cache if enabled
      shell: apt-get clean && apt-get autoclean
      become: true
      register: apt_cleanup
      when: cleanup_apt_cache | bool

    # Errors are deliberately ignored (best effort): files may vanish or be
    # busy while find is walking the tree.
    - name: Clean tmp directories
      shell: |
        find /tmp -type f -atime +30 -delete 2>/dev/null || true
        find /var/tmp -type f -atime +30 -delete 2>/dev/null || true
      become: true
      register: tmp_cleanup
      when: cleanup_temp_files | bool

    # Needs root: an unreadable directory under /var/log makes find exit
    # non-zero, which would fail the task.
    - name: Find large files in /var/log
      command: find /var/log -type f -size +100M
      become: true
      register: large_logs
      changed_when: false
      check_mode: false

    - name: Display large log files
      debug:
        msg: "Large files in /var/log (>100MB): {{ large_logs.stdout_lines }}"
      when: large_logs.stdout_lines | length > 0

    - name: Get filesystem usage after cleanup
      shell: df -h / | tail -1
      register: fs_after
      changed_when: false
      check_mode: false

    - name: Display filesystem usage after
      debug:
        msg: "After cleanup: {{ fs_after.stdout }}"

    - name: Calculate freed space
      debug:
        msg: |
          Cleanup Summary:
          - Root filesystem before: {{ fs_before.stdout }}
          - Root filesystem after:  {{ fs_after.stdout }}
          - Journal logs compressed: {{ cleanup_journal_days }} days retained
          - Old syslog files removed: {{ log_threshold_days }}+ days
          - Apt cache cleaned: {{ cleanup_apt_cache }}
          - Temp files cleaned: {{ cleanup_temp_files }}
          NOTE: Re-run 'df -h /' on proxmox-00 to verify space was freed

    # Only raise the alert when the measured usage is still at or above the
    # threshold. If the percentage cannot be parsed from the df output, it is
    # treated as 100% so the alert is shown rather than silently dropped.
    - name: Set alert for continued monitoring
      debug:
        msg: |
          ⚠️ ALERT: Root filesystem still approaching capacity
          Next steps if space still insufficient:
          1. Move /var to separate partition
          2. Archive/compress old log files to NFS
          3. Review application logs for rotation config
          4. Consider expanding root partition
      when: >-
        (fs_after.stdout | regex_search('[0-9]+%') | default('100%', true)
        | replace('%', '') | int) >= (root_fs_alert_threshold | int)

# Play: reclaim Docker storage on proxmox-01.
#
# If /usr/bin/docker exists, prunes images, volumes and build cache, printing
# `docker system df` before and after. Then reports usage of
# /mnt/pve/dlx-docker and its ten largest entries.
- name: "Remediate proxmox-01 dlx-docker high utilization (81.1% full)"
  hosts: proxmox-01
  gather_facts: true
  # Allows this play to be selected on its own with `--tags cleanup_docker`.
  tags:
    - cleanup_docker
  vars:
    # dlx-docker usage (percent) at or above which the closing alert is printed.
    docker_storage_alert_threshold: 75
  tasks:
    - name: Check if Docker is installed
      stat:
        path: /usr/bin/docker
      register: docker_installed

    - name: Get Docker storage usage before cleanup
      command: docker system df
      # Same privilege level as the prune tasks below, so the probe does not
      # depend on the login user being allowed to talk to the Docker daemon.
      become: true
      register: docker_before
      when: docker_installed.stat.exists
      changed_when: false
      # Read-only probe: keep it running under --check so the dry run is useful.
      check_mode: false

    - name: Display Docker usage before
      debug:
        msg: "{{ docker_before.stdout }}"
      when: docker_installed.stat.exists

    # Without `-a`, `docker image prune` removes dangling images only.
    - name: Remove unused Docker images
      command: docker image prune -f
      become: true
      register: image_prune
      when: docker_installed.stat.exists

    - name: Display pruned images
      debug:
        msg: "{{ image_prune.stdout }}"
      when: docker_installed.stat.exists and image_prune.changed

    # WARNING: destructive. Volumes not referenced by any container are
    # deleted, including data belonging to containers that were removed.
    - name: Remove unused Docker volumes
      command: docker volume prune -f
      become: true
      register: volume_prune
      when: docker_installed.stat.exists

    - name: Display pruned volumes
      debug:
        msg: "{{ volume_prune.stdout }}"
      when: docker_installed.stat.exists and volume_prune.changed

    # NOTE: `-a` removes all unused build cache, not only dangling entries
    # (the task name understates this).
    - name: Remove dangling build cache
      command: docker builder prune -f -a
      become: true
      register: cache_prune
      when: docker_installed.stat.exists
      failed_when: false  # Older Docker versions may not support this

    - name: Get Docker storage usage after cleanup
      command: docker system df
      become: true
      register: docker_after
      when: docker_installed.stat.exists
      changed_when: false
      check_mode: false

    - name: Display Docker usage after
      debug:
        msg: "{{ docker_after.stdout }}"
      when: docker_installed.stat.exists

    # Only the exit status of the final pipeline decides success, so a missing
    # mount point does not fail the task; it just yields an empty report.
    - name: List Docker containers on dlx-docker storage
      shell: |
        df /mnt/pve/dlx-docker
        echo "---"
        du -sh /mnt/pve/dlx-docker/* 2>/dev/null | sort -hr | head -10
      become: true
      register: storage_usage
      changed_when: false
      check_mode: false

    - name: Display storage breakdown
      debug:
        msg: "{{ storage_usage.stdout }}"

    # Only raise the alert when the Use% printed by df is still at or above the
    # threshold. If no percentage can be parsed (e.g. df failed), it is treated
    # as 100% so the alert is shown rather than silently dropped.
    - name: Alert for manual review
      debug:
        msg: |
          ⚠️ ALERT: dlx-docker still at high capacity
          Manual steps to consider:
          1. Check running containers: docker ps -a
          2. Inspect container logs: docker logs <container-id> | wc -l
          3. Review log rotation config: docker inspect <container-id>
          4. Consider migrating containers to dlx-nfs-* storage
          5. Archive old analysis/build artifacts
      when: >-
        (storage_usage.stdout | regex_search('[0-9]+%') | default('100%', true)
        | replace('%', '') | int) >= (docker_storage_alert_threshold | int)

# Play: report-only audit for SonarQube storage. Nothing is modified.
#
# NOTE(review): this play targets proxmox-00, but the note printed below says
# dlx-sonar runs on proxmox-01, so the `pct list` probe here is expected to
# print "sonar not found on this host". Confirm which host was intended.
- name: "Audit and report SonarQube disk usage (354 GB)"
  hosts: proxmox-00
  gather_facts: true
  tasks:
    # `|| echo ...` keeps the task green when grep finds no match.
    - name: Check SonarQube container exists
      shell: pct list | grep -i sonar || echo "sonar not found on this host"
      # Run with the same privileges as the other host-level tasks in this file.
      become: true
      register: sonar_check
      changed_when: false
      # Read-only probe: keep it running under --check so the dry run is useful.
      check_mode: false

    - name: Display SonarQube status
      debug:
        msg: "{{ sonar_check.stdout }}"

    # Static advisory text; the figures below are hard-coded, not measured.
    - name: Check if dlx-sonar container is on proxmox-01
      debug:
        msg: |
          NOTE: dlx-sonar (VMID 202) is running on proxmox-01
          Current disk allocation: 422 GB
          Current disk usage: 354 GB (82%)

          This is expected for SonarQube with large code analysis databases.

          Remediation options:
          1. Archive old analysis: sonar-scanner with delete API
          2. Configure data retention in SonarQube settings
          3. Move to dedicated storage pool (dlx-nfs-sdb-02)
          4. Increase disk allocation if needed
          5. Run cleanup task: DELETE /api/ce/activity?createdBefore=<date>

# Play: report-only audit of stopped LXC containers on proxmox-00.
# Nothing is removed; the play prints the stopped containers found plus a
# static decision matrix and removal guidance.
- name: "Audit stopped containers for cleanup decisions"
  hosts: proxmox-00
  gather_facts: true
  tasks:
    # `pct list` prints the columns VMID, Status, Lock, Name. Status is
    # therefore field 2; Lock is usually empty, so the name is taken as the
    # last field ($NF) rather than a fixed position.
    - name: List all stopped LXC containers
      shell: pct list | awk 'NR>1 && $2=="stopped" {print $1, $NF}'
      # Run with the same privileges as the other host-level tasks in this file.
      become: true
      register: stopped_containers
      changed_when: false
      # Read-only probe: keep it running under --check so the dry run is useful.
      check_mode: false

    # Only the "Stopped containers found" section is live data; the container
    # list and decision matrix that follow are hard-coded.
    - name: Display stopped containers
      debug:
        msg: |
          Stopped containers found:
          {{ stopped_containers.stdout }}

          These containers are allocated but not running:
          - dlx-wireguard (105): 32 GB - VPN service
          - dlx-mysql-02 (108): 200 GB - Database replica
          - dlx-mattermost (107): 32 GB - Chat platform
          - dlx-mysql-03 (109): 200 GB - Database replica
          - dlx-nocodb (116): 100 GB - No-code database

          Total allocated: ~564 GB

          Decision Matrix:
          ┌─────────────────┬───────────┬──────────────────────────────┐
          │ Container       │ Allocated │ Recommendation               │
          ├─────────────────┼───────────┼──────────────────────────────┤
          │ dlx-wireguard   │ 32 GB     │ REMOVE if not in active use  │
          │ dlx-mysql-*     │ 400 GB    │ REMOVE if using dlx-mysql-01 │
          │ dlx-mattermost  │ 32 GB     │ REMOVE if using Slack/Teams  │
          │ dlx-nocodb      │ 100 GB    │ REMOVE if not in active use  │
          └─────────────────┴───────────┴──────────────────────────────┘

    # Guidance only: this task prints text and runs none of the commands shown.
    - name: Create removal recommendations
      debug:
        msg: |
          To safely remove stopped containers:

          1. VERIFY PURPOSE: Document why each was created
          2. CHECK BACKUPS: Ensure data is backed up elsewhere
          3. EXPORT CONFIG: pct config VMID > backup.conf
          4. DELETE: pct destroy VMID --force

          Example safe removal script:
          ---
          # Backup container config before deletion
          pct config 105 > /tmp/dlx-wireguard-backup.conf
          pct destroy 105 --force

          # This frees 32 GB immediately
          ---

# Play: print the run summary on the control node and write a tracking
# checklist to /tmp/storage-remediation-tracking.txt.
- name: "Storage remediation summary and next steps"
  hosts: localhost
  # Facts are not gathered, so fact variables such as `ansible_date_time` are
  # unavailable here; the timestamp below uses `now()` instead.
  gather_facts: false
  tasks:
    # Static text: the "completed" list is not derived from task results.
    - name: Display remediation summary
      debug:
        msg: |
          ╔════════════════════════════════════════════════════════════════╗
          ║         STORAGE REMEDIATION PLAYBOOK EXECUTION SUMMARY         ║
          ╚════════════════════════════════════════════════════════════════╝

          ✓ COMPLETED ACTIONS:
          1. Compressed journal logs on proxmox-00
          2. Cleaned old syslog files (>90 days)
          3. Cleaned apt cache
          4. Cleaned temp directories (/tmp, /var/tmp)
          5. Pruned Docker images, volumes, and cache
          6. Analyzed container storage usage
          7. Generated SonarQube audit report
          8. Identified stopped containers for cleanup

          ⚠️ IMMEDIATE ACTIONS REQUIRED:
          1. [ ] SSH to proxmox-00 and verify root FS space freed
                 Command: df -h /
          2. [ ] Review stopped containers and decide keep/remove
          3. [ ] Monitor dlx-docker on proxmox-01 (currently 81% full)
          4. [ ] Schedule SonarQube data cleanup if needed

          📊 CAPACITY TARGETS:
          - proxmox-00 root: Target <70% (currently 84%)
          - proxmox-01 dlx-docker: Target <75% (currently 81%)
          - SonarQube: Keep <75% if possible

          🔄 AUTOMATION RECOMMENDATIONS:
          1. Create logrotate config for persistent log management
          2. Schedule weekly: docker system prune -f
          3. Schedule monthly: journalctl --vacuum-time=60d
          4. Set up monitoring alerts at 75%, 85%, 95% capacity

          📝 NEXT AUDIT:
          Schedule: 2026-03-08 (30 days)
          Update: /docs/STORAGE-AUDIT.md with new metrics

    # `now(utc=true)` yields the same UTC ISO-8601 shape the
    # `ansible_date_time.iso8601` fact would have, without gathering facts.
    - name: Create remediation tracking file
      copy:
        content: |
          # Storage Remediation Tracking
          Generated: {{ now(utc=true).strftime('%Y-%m-%dT%H:%M:%SZ') }}

          ## Issues Addressed
          - [ ] proxmox-00 root filesystem cleanup
          - [ ] proxmox-01 dlx-docker cleanup
          - [ ] SonarQube audit completed
          - [ ] Stopped containers reviewed

          ## Manual Verification Required
          - [ ] SSH to proxmox-00: df -h /
          - [ ] SSH to proxmox-01: docker system df
          - [ ] Review stopped container logs
          - [ ] Decide on stopped container removal

          ## Follow-up Tasks
          - [ ] Create logrotate policies
          - [ ] Set up monitoring/alerting
          - [ ] Schedule periodic cleanup runs
          - [ ] Document storage policies

          ## Completed Dates

        dest: "/tmp/storage-remediation-tracking.txt"
        # Explicit permissions so the result does not depend on the umask.
        mode: "0644"
      delegate_to: localhost
      run_once: true

    # Literal block scalar: a single backslash is already a literal backslash,
    # which is what a shell line continuation needs when copy-pasted.
    - name: Display follow-up instructions
      debug:
        msg: |
          Next Step: Run targeted remediation

          To clean up individual issues:

          1. Clean proxmox-00 root filesystem ONLY:
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               --tags cleanup_root_fs -l proxmox-00

          2. Clean proxmox-01 Docker storage ONLY:
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               --tags cleanup_docker -l proxmox-01

          3. Dry-run (check mode):
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               --check

          4. Run with verbose output:
             ansible-playbook playbooks/remediate-storage-critical-issues.yml \
               -vvv