# Repository viewer header (not part of the playbook):
#   dlx-ansible/playbooks/remediate-stopped-container... (path truncated)
#   279 lines
#   11 KiB
#   YAML

---
# Safe removal of stopped containers in Proxmox cluster
# Purpose: Reclaim space from unused LXC containers
# Safety: Saves each container's config and status (not its data) before removal
# Play 1: audit the LXC containers on every host in the `proxmox` group and,
# when dry_run is false, destroy the selected ones that are stopped.
#
# Variables (override with -e or in inventory):
#   backup_dir            directory on each host that receives the config dumps
#   containers_to_remove  list of {vmid, name, size}; when left empty the
#                         built-in default list further down is used
#   containers_to_keep    not referenced by any task in this play
#   create_backups        dump `pct config` / `pct status` before removal
#   dry_run               true = only report what would be removed
#
# NOTE: `pct list` prints its columns as "VMID Status Lock Name", so the status
# is field 2 and the name is the last field ($NF).
- name: "Audit and safely remove stopped containers"
  hosts: proxmox
  gather_facts: true
  vars:
    backup_dir: "/tmp/pve-container-backups"
    containers_to_remove: []
    containers_to_keep: []
    create_backups: true
    dry_run: true  # Set to false to actually remove containers
  tasks:
    # Runs on every host: the backup task below writes into this directory on
    # each host, so creating it only once (run_once) would break the others.
    - name: Create backup directory
      file:
        path: "{{ backup_dir }}"
        state: directory
        mode: "0755"
      when: create_backups | bool

    # Output order is "VMID Name Status" to match the header printed below.
    - name: List all LXC containers
      shell: pct list | tail -n +2 | awk '{print $1, $NF, $2}' | sort
      register: all_containers
      changed_when: false

    - name: Parse container list
      set_fact:
        container_list: "{{ all_containers.stdout_lines }}"

    - name: Display all containers on this host
      debug:
        msg: |
          All containers on {{ inventory_hostname }}:
          VMID Name Status
          ──────────────────────────────────────
          {% for line in container_list %}
          {{ line }}
          {% endfor %}

    # One "VMID Name" line per stopped container.
    - name: Identify stopped containers
      shell: |
        pct list | tail -n +2 | awk '$2 == "stopped" {print $1, $NF}' | sort
      register: stopped_containers
      changed_when: false

    - name: Display stopped containers
      debug:
        msg: |
          Stopped containers on {{ inventory_hostname }}:
          {{ stopped_containers.stdout or "None found" }}

    # Everything below is skipped on hosts that have no stopped containers.
    - name: "Block: Backup and prepare removal (if stopped containers exist)"
      when: stopped_containers.stdout_lines | length > 0
      block:
        # Reports the rootfs "size=" value from `pct config` as the allocated
        # space; prints "unknown" when it cannot be read. Additional mount
        # points are not included.
        - name: Get detailed info for each stopped container
          shell: |
            pct list | tail -n +2 | awk '$2 == "stopped" {print $1, $NF}' |
            while read -r vmid name; do
              size="$(pct config "$vmid" 2>/dev/null |
                sed -n 's/^rootfs:.*size=\([^,]*\).*/\1/p')"
              echo "$vmid $name ${size:-unknown}"
            done
          become: true
          register: container_sizes
          changed_when: false

        - name: Display container space usage
          debug:
            msg: |
              Stopped Container Sizes:
              VMID Name Allocated Space
              ─────────────────────────────────────────────
              {% for line in container_sizes.stdout_lines %}
              {{ line }}
              {% endfor %}

        - name: Create container backups
          block:
            # Saves configuration and status text only, not container data.
            # `set -eu` makes a failed dump fail the task so that removal
            # does not continue without the files.
            - name: Backup container configs
              shell: |
                set -eu
                for vmid in $(pct list | tail -n +2 | awk '$2 == "stopped" {print $1}'); do
                  NAME=$(pct list | awk -v id="$vmid" '$1 == id {print $NF}')
                  echo "Backing up config for $vmid ($NAME)..."
                  pct config "$vmid" > "{{ backup_dir }}/container-${vmid}-${NAME}.conf"
                  echo "Backing up state for $vmid ($NAME)..."
                  pct status "$vmid" > "{{ backup_dir }}/container-${vmid}-${NAME}.status"
                done
              become: true
              register: backup_result
              when: (create_backups | bool) and not (dry_run | bool)

            - name: Display backup completion
              debug:
                msg: |
                  ✓ Container configurations backed up to {{ backup_dir }}/
                  Files:
                  {{ backup_result.stdout }}
              when:
                - create_backups | bool
                - not (dry_run | bool)
                - backup_result is changed

        # Static guidance text; the sizes and actions are not computed.
        - name: "Decision: Which containers to keep/remove"
          debug:
            msg: |
              CONTAINER REMOVAL DECISION MATRIX:
              ╔════════════════════════════════════════════════════════════════╗
              ║ Container │ Size │ Purpose │ Action ║
              ╠════════════════════════════════════════════════════════════════╣
              ║ dlx-wireguard (105) │ 32 GB │ VPN service │ REVIEW ║
              ║ dlx-mysql-02 (108) │ 200 GB │ MySQL replica │ REMOVE ║
              ║ dlx-mysql-03 (109) │ 200 GB │ MySQL replica │ REMOVE ║
              ║ dlx-mattermost (107)│ 32 GB │ Chat/comms │ REMOVE ║
              ║ dlx-nocodb (116) │ 100 GB │ No-code database │ REMOVE ║
              ║ dlx-swarm-* (*) │ 65 GB │ Docker swarm nodes │ REMOVE ║
              ║ dlx-kube-* (*) │ 50 GB │ Kubernetes nodes │ REMOVE ║
              ╚════════════════════════════════════════════════════════════════╝
              SAFE REMOVAL CANDIDATES (assuming dlx-mysql-01 is in use):
              - dlx-mysql-02, dlx-mysql-03: 400 GB combined
              - dlx-mattermost: 32 GB (if not using for comms)
              - dlx-nocodb: 100 GB (if not in use)
              - dlx-swarm nodes: 195 GB (if Swarm not active)
              - dlx-kube nodes: 150 GB (if Kubernetes not used)
              CONSERVATIVE APPROACH (recommended):
              - Keep: dlx-wireguard (has specific purpose)
              - Remove: All database replicas, swarm/kube nodes = 750+ GB

        - name: "Safety check: Verify before removal"
          debug:
            msg: |
              ⚠️ SAFETY CHECK - DO NOT PROCEED WITHOUT VERIFICATION:
              1. VERIFY BACKUPS:
              ls -lh {{ backup_dir }}/
              Should show .conf and .status files for all containers
              2. CHECK DEPENDENCIES:
              - Is dlx-mysql-01 running and taking load?
              - Are swarm/kube services actually needed?
              - Is wireguard currently in use?
              3. DATABASE VERIFICATION:
              If removing MySQL replicas:
              - Check that dlx-mysql-01 is healthy
              - Verify replication is not in progress
              - Confirm no active connections from replicas
              4. FINAL CONFIRMATION:
              Review each container's last modification time
              pct status <vmid>
              Once verified, proceed with removal below.

        - name: "REMOVAL: Delete selected stopped containers"
          block:
            # Default selection; only applied when the caller did not supply
            # a containers_to_remove list of their own.
            - name: Set containers to remove (customize as needed)
              set_fact:
                containers_to_remove:
                  - vmid: 108
                    name: dlx-mysql-02
                    size: 200
                  - vmid: 109
                    name: dlx-mysql-03
                    size: 200
                  - vmid: 107
                    name: dlx-mattermost
                    size: 32
                  - vmid: 116
                    name: dlx-nocodb
                    size: 100
              when: containers_to_remove | length == 0

            # Safety properties of this task:
            #  * a container is touched only if `pct status` reports it as
            #    stopped on this host; anything else prints SKIPPED
            #  * dry_run is normalised with `bool | lower`, because a YAML
            #    boolean renders as "True" and would not equal "true"
            #  * `--force` is not passed, so pct refuses a running container
            #  * `set -eu` stops "Removed:" being printed after a failed destroy
            - name: Remove containers (DRY RUN - set dry_run=false to execute)
              shell: |
                set -eu
                ct_status="$(pct status {{ item.vmid | int }} 2>/dev/null || true)"
                if [ "$ct_status" != "status: stopped" ]; then
                  echo "SKIPPED: container {{ item.vmid }} ({{ item.name }}) is not stopped on this host"
                elif [ "{{ dry_run | bool | lower }}" = "true" ]; then
                  echo "DRY RUN: Would remove container {{ item.vmid }} ({{ item.name }})"
                else
                  echo "Removing container {{ item.vmid }} ({{ item.name }})..."
                  pct destroy {{ item.vmid | int }}
                  echo "Removed: {{ item.vmid }}"
                fi
              become: true
              loop: "{{ containers_to_remove }}"
              register: removal_result
              changed_when: "'Removed: ' in removal_result.stdout"

            - name: Display removal results
              debug:
                msg: "{{ removal_result.results | map(attribute='stdout') | list }}"

            - name: Verify space freed
              shell: |
                df -h / | tail -1
                du -sh /var/lib/lxc/ 2>/dev/null || echo "LXC directory info"
              register: space_after
              changed_when: false

            # The summary counts only containers that were not skipped, so it
            # matches what the removal task actually handled on this host.
            - name: Display freed space
              vars:
                processed_containers: >-
                  {{ removal_result.results
                     | rejectattr('stdout', 'search', '^SKIPPED')
                     | map(attribute='item') | list }}
              debug:
                msg: |
                  Space verification after removal:
                  {{ space_after.stdout }}
                  Summary:
                  Removed: {{ processed_containers | length }} containers
                  Space recovered: {{ processed_containers | map(attribute='size') | sum }} GB
                  Status: {% if not (dry_run | bool) %}✓ REMOVED{% else %}DRY RUN - not removed{% endif %}
# Play 2: report the container counts per host and write a recovery guide.
#
# Play-level vars are not shared between plays, so backup_dir is declared
# again here; keep it in sync with the removal play (or pass it with -e).
# Facts are not gathered in this play: ansible_date_time is expected to come
# from the facts gathered for the same hosts earlier in the run.
- name: "Post-removal validation and reporting"
  hosts: proxmox
  gather_facts: false
  vars:
    backup_dir: "/tmp/pve-container-backups"
  tasks:
    # `pct list` prints "VMID Status Lock Name", so the status is field 2.
    # printf with %d prints 0 when a counter was never incremented.
    - name: Final container count
      shell: |
        pct list | tail -n +2 | awk '
          { total++ }
          $2 == "running" { running++ }
          $2 == "stopped" { stopped++ }
          END {
            printf "Total: %d (Running: %d, Stopped: %d)\n", total, running, stopped
          }'
      register: final_count
      changed_when: false

    - name: Display final summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║ STOPPED CONTAINER REMOVAL COMPLETED ║
          ╚══════════════════════════════════════════════════════════════╝
          Final Container Status on {{ inventory_hostname }}:
          {{ final_count.stdout }}
          Backup Location: {{ backup_dir }}/
          (Configs retained for 30 days before automatic cleanup)
          To recover a removed container:
          pct restore <new-vmid> <vzdump-archive>
          (the saved .conf/.status files hold configuration only, not container data)
          Monitoring:
          - Watch for error messages from removed services
          - Monitor CPU and disk I/O for 48 hours
          - Review application logs for missing dependencies
          Next Step:
          Run: ansible-playbook playbooks/remediate-storage-critical-issues.yml
          To verify final storage utilization

    # Written on every host, because the backups and the "Host:" line are
    # per-host. NOTE(review): no task in this playbook implements the 30-day
    # cleanup mentioned in the text - confirm it exists elsewhere.
    - name: Create recovery guide
      copy:
        content: |
          # Container Recovery Guide
          Generated: {{ ansible_date_time.iso8601 }}
          Host: {{ inventory_hostname }}
          ## Backed Up Containers
          Location: {{ backup_dir }}/
          Only the container configuration (.conf) and status (.status) were
          saved. These files do not contain container data and cannot be
          passed to `pct restore`.
          To review a saved configuration:
          ```bash
          cat {{ backup_dir }}/container-VMID-NAME.conf
          ```
          To restore a container from a vzdump archive (if one exists):
          ```bash
          # Restore to new VMID (e.g., 1000)
          pct restore 1000 /path/to/vzdump-lxc-VMID-archive.tar.zst
          # Verify
          pct list | grep 1000
          pct status 1000
          ```
          ## Backup Retention
          - Automatic cleanup: 30 days
          - Manual archive: Copy to dlx-nfs-sdb-02 for longer retention
          - Format: container-{VMID}-{NAME}.conf
        dest: "/tmp/container-recovery-guide.txt"
        mode: "0644"