# dlx-ansible/playbooks/remediate-stopped-container...

---
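# Safe removal of stopped containers in Proxmox cluster
# Purpose: Reclaim space from unused LXC containers
# Safety: Saves each container's `pct config` and `pct status` output before
#         removal (container data itself is NOT backed up); dry_run defaults
#         to true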

# Play 1: audit the LXC containers on every Proxmox node and, unless running
# in dry-run mode, remove the selected stopped ones.
- name: "Audit and safely remove stopped containers"
  hosts: proxmox
  gather_facts: true
  vars:
    # Set to false to actually remove containers
    dry_run: true
    # When true, container config/status output is saved before removal
    create_backups: true
    # Directory on each node that receives the saved config/status files
    backup_dir: "/tmp/pve-container-backups"
    # Placeholder lists; containers_to_remove is filled in by a later task
    containers_to_remove: []
    containers_to_keep: []
  tasks:
    # Runs on every host in the play: the config backup task later writes
    # into this directory locally on each node, so creating it on only one
    # node (run_once) would make those writes fail everywhere else.
    # `| bool` keeps the condition correct when create_backups is passed as
    # a string, e.g. `-e create_backups=false`.
    - name: Create backup directory
      file:
        path: "{{ backup_dir }}"
        state: directory
        mode: "0755"
      when: create_backups | bool

    # `pct list` prints the columns VMID, Status, Lock, Name. Lock is usually
    # blank, so the name is read as the last field ($NF) rather than by a
    # fixed position. Fields are emitted as "VMID Name Status" to match the
    # header printed by the display task below. NR > 1 skips the header row.
    - name: List all LXC containers
      shell: pct list | awk 'NR > 1 {print $1, $NF, $2}' | sort
      register: all_containers
      changed_when: false

    # Keep the parsed lines in a fact so later tasks can reuse them.
    - name: Parse container list
      set_fact:
        container_list: "{{ all_containers.stdout_lines }}"

    - name: Display all containers on this host
      debug:
        msg: |
          All containers on {{ inventory_hostname }}:
          VMID  Name                  Status
          ──────────────────────────────────────
          {% for line in container_list %}
          {{ line }}
          {% endfor %}

    # Registers one "VMID Name" line per stopped container on this host.
    # In `pct list` output the status is the SECOND column (VMID, Status,
    # Lock, Name); the name is taken as the last field so that a populated
    # Lock column does not shift it.
    - name: Identify stopped containers
      shell: |
        pct list | awk 'NR > 1 && $2 == "stopped" {print $1, $NF}' | sort
      register: stopped_containers
      changed_when: false

    - name: Display stopped containers
      debug:
        msg: |
          Stopped containers on {{ inventory_hostname }}:
          {{ stopped_containers.stdout or "None found" }}

    # Reports the configured root disk size of every stopped container and,
    # outside dry-run mode, saves each one's `pct config` / `pct status`
    # output under backup_dir. These files are reference copies of the
    # configuration only; no container data is backed up here.
    - name: "Block: Backup and prepare removal (if stopped containers exist)"
      block:
        # Status is column 2 of `pct list`; the name is the last column.
        # The size is read from the rootfs line of the container config
        # (e.g. "rootfs: <volume>,size=200G"); additional mount points are
        # not included. "unknown" is printed when no size can be parsed.
        - name: Get detailed info for each stopped container
          shell: |
            for vmid in $(pct list | awk 'NR > 1 && $2 == "stopped" {print $1}'); do
              name=$(pct list | awk -v id="$vmid" '$1 == id {print $NF}')
              size=$(pct config "$vmid" 2>/dev/null | sed -n 's/^rootfs:.*size=\([^,]*\).*/\1/p')
              echo "$vmid $name ${size:-unknown}"
            done
          register: container_sizes
          changed_when: false

        - name: Display container space usage
          debug:
            msg: |
              Stopped Container Sizes:
              VMID  Name                  Allocated Space
              ─────────────────────────────────────────────
              {% for line in container_sizes.stdout_lines %}
              {{ line }}
              {% endfor %}

        - name: Create container backups
          # `| bool` makes string values such as `-e dry_run=false` behave
          # like real booleans.
          when: (create_backups | bool) and not (dry_run | bool)
          block:
            # `set -eu` makes the task fail as soon as any config cannot be
            # written, instead of reporting only the last iteration's status.
            - name: Backup container configs
              shell: |
                set -eu
                for vmid in $(pct list | awk 'NR > 1 && $2 == "stopped" {print $1}'); do
                  name=$(pct list | awk -v id="$vmid" '$1 == id {print $NF}')
                  echo "Backing up config for $vmid ($name)..."
                  pct config "$vmid" > "{{ backup_dir }}/container-${vmid}-${name}.conf"
                  echo "Backing up state for $vmid ($name)..."
                  pct status "$vmid" > "{{ backup_dir }}/container-${vmid}-${name}.status"
                done
              become: true
              register: backup_result

            - name: Display backup completion
              debug:
                msg: |
                  ✓ Container configurations backed up to {{ backup_dir }}/
                  Files:
                  {{ backup_result.stdout }}
              when: backup_result.changed

    # Prints a static, hand-written decision table. Nothing in this message
    # is templated or computed from the host: the container names, VMIDs,
    # sizes and actions are all literals and must be kept up to date by hand.
    - name: "Decision: Which containers to keep/remove"
      debug:
        msg: |
          CONTAINER REMOVAL DECISION MATRIX:

          ╔════════════════════════════════════════════════════════════════╗
          ║ Container           │ Size   │ Purpose              │ Action  ║
          ╠════════════════════════════════════════════════════════════════╣
          ║ dlx-wireguard (105) │ 32 GB  │ VPN service          │ REVIEW  ║
          ║ dlx-mysql-02 (108)  │ 200 GB │ MySQL replica        │ REMOVE  ║
          ║ dlx-mysql-03 (109)  │ 200 GB │ MySQL replica        │ REMOVE  ║
          ║ dlx-mattermost (107)│ 32 GB  │ Chat/comms           │ REMOVE  ║
          ║ dlx-nocodb (116)    │ 100 GB │ No-code database     │ REMOVE  ║
          ║ dlx-swarm-* (*)     │ 65 GB  │ Docker swarm nodes   │ REMOVE  ║
          ║ dlx-kube-* (*)      │ 50 GB  │ Kubernetes nodes     │ REMOVE  ║
          ╚════════════════════════════════════════════════════════════════╝

          SAFE REMOVAL CANDIDATES (assuming dlx-mysql-01 is in use):
          - dlx-mysql-02, dlx-mysql-03: 400 GB combined
          - dlx-mattermost: 32 GB (if not using for comms)
          - dlx-nocodb: 100 GB (if not in use)
          - dlx-swarm nodes: 195 GB (if Swarm not active)
          - dlx-kube nodes: 150 GB (if Kubernetes not used)

          CONSERVATIVE APPROACH (recommended):
          - Keep: dlx-wireguard (has specific purpose)
          - Remove: All database replicas, swarm/kube nodes = 750+ GB

    # Prints a manual verification checklist for the operator. This is a
    # debug message only: the task performs none of the listed checks and
    # does not pause or stop the play. The only templated value is
    # backup_dir.
    # NOTE(review): because nothing waits for confirmation here, the play
    # continues straight to the following tasks - confirm that is intended.
    - name: "Safety check: Verify before removal"
      debug:
        msg: |
          ⚠️  SAFETY CHECK - DO NOT PROCEED WITHOUT VERIFICATION:

          1. VERIFY BACKUPS:
             ls -lh {{ backup_dir }}/
             Should show .conf and .status files for all containers

          2. CHECK DEPENDENCIES:
             - Is dlx-mysql-01 running and taking load?
             - Are swarm/kube services actually needed?
             - Is wireguard currently in use?

          3. DATABASE VERIFICATION:
             If removing MySQL replicas:
             - Check that dlx-mysql-01 is healthy
             - Verify replication is not in progress
             - Confirm no active connections from replicas

          4. FINAL CONFIRMATION:
             Review each container's last modification time
             pct status <vmid>

          Once verified, proceed with removal below.

    # Default removal candidates (customize as needed). `size` is the
    # nominal disk size in GB and is only used for the summary message.
    - name: Set containers to remove (customize as needed)
      set_fact:
        containers_to_remove:
          - vmid: 108
            name: dlx-mysql-02
            size: 200
          - vmid: 109
            name: dlx-mysql-03
            size: 200
          - vmid: 107
            name: dlx-mattermost
            size: 32
          - vmid: 116
            name: dlx-nocodb
            size: 100

    # The candidate list is static, so re-read the live state right before
    # acting on it. Status is column 2 of `pct list`; NR > 1 skips the
    # header row.
    - name: Re-check which VMIDs are stopped on this host
      shell: pct list | awk 'NR > 1 && $2 == "stopped" {print $1}'
      register: stopped_vmids_now
      changed_when: false

    # Only candidates that exist on THIS node and are currently stopped are
    # acted upon; running containers and VMIDs hosted on other nodes are
    # dropped from the list.
    - name: Limit removal to candidates that are stopped on this host
      set_fact:
        removal_targets: >-
          {{ containers_to_remove
             | selectattr('vmid', 'in',
                          stopped_vmids_now.stdout_lines | map('int') | list)
             | list }}

    - name: "REMOVAL: Delete selected stopped containers"
      when: removal_targets | length > 0
      block:
        # Dry-run and real removal are separate tasks gated on
        # `dry_run | bool`. Comparing the rendered variable with the string
        # "true" inside a shell script is unsafe: Jinja renders a boolean
        # true as "True", so that test fails and the destructive branch
        # would run during a dry run.
        - name: Preview removal (DRY RUN - set dry_run=false to execute)
          debug:
            msg: "DRY RUN: Would remove container {{ item.vmid }} ({{ item.name }})"
          with_items: "{{ removal_targets }}"
          loop_control:
            label: "{{ item.vmid }}"
          when: dry_run | bool

        # No --force: the targets were just verified as stopped, and without
        # the flag `pct destroy` refuses to remove a container that has been
        # started in the meantime.
        - name: Remove containers
          command: "pct destroy {{ item.vmid }}"
          become: true
          with_items: "{{ removal_targets }}"
          loop_control:
            label: "{{ item.vmid }} ({{ item.name }})"
          register: removal_result
          when: not (dry_run | bool)

        - name: Display removal results
          debug:
            msg: "Removed: {{ removal_result.results | map(attribute='item.vmid') | list }}"
          when: not (dry_run | bool)

        - name: Verify space freed
          shell: |
            df -h / | tail -1
            du -sh /var/lib/lxc/ 2>/dev/null || echo "LXC directory info"
          register: space_after
          changed_when: false

        # "Space recovered" is the sum of the nominal sizes from the
        # candidate list, not a measured value.
        - name: Display freed space
          debug:
            msg: |
              Space verification after removal:
              {{ space_after.stdout }}

              Summary:
              Removed: {{ removal_targets | length }} containers
              Space recovered: {{ removal_targets | map(attribute='size') | sum }} GB
              Status: {% if not (dry_run | bool) %}✓ REMOVED{% else %}DRY RUN - not removed{% endif %}


# Play 2: report the container counts per node and write a short recovery
# note next to the saved configuration files.
- name: "Post-removal validation and reporting"
  hosts: proxmox
  gather_facts: false
  vars:
    # Play-level vars are scoped to a single play, so the backup location
    # must be declared here for the templates below to resolve.
    backup_dir: "/tmp/pve-container-backups"
  tasks:
    # Single awk pass over `pct list` (columns: VMID, Status, Lock, Name).
    # printf with %d prints 0 for counters that were never incremented,
    # instead of an empty string.
    - name: Final container count
      shell: |
        pct list | awk 'NR > 1 {
          total++
          if ($2 == "running") running++
          if ($2 == "stopped") stopped++
        } END {
          printf "Total: %d (Running: %d, Stopped: %d)\n", total, running, stopped
        }'
      register: final_count
      changed_when: false

    - name: Display final summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║      STOPPED CONTAINER REMOVAL COMPLETED                     ║
          ╚══════════════════════════════════════════════════════════════╝

          Final Container Status on {{ inventory_hostname }}:
          {{ final_count.stdout }}

          Backup Location: {{ backup_dir }}/
          (Config/status text files only; this playbook does not schedule
          any cleanup - copy them elsewhere if they must be retained)

          To recover a removed container:
          The saved .conf files are reference copies of the settings and
          cannot be restored directly. Restoring requires a vzdump archive
          taken before removal:
          pct restore <new-vmid> <vzdump-archive>

          Monitoring:
          - Watch for error messages from removed services
          - Monitor CPU and disk I/O for 48 hours
          - Review application logs for missing dependencies

          Next Step:
          Run: ansible-playbook playbooks/remediate-storage-critical-issues.yml
          To verify final storage utilization

    # Written on every node, since each node holds its own backup files.
    # ansible_date_time is a gathered fact; this play does not gather facts
    # itself, so it relies on facts collected for the host earlier in the
    # same run.
    - name: Create recovery guide
      copy:
        content: |
          # Container Recovery Guide
          Generated: {{ ansible_date_time.iso8601 }}
          Host: {{ inventory_hostname }}

          ## Saved Container Configurations
          Location: {{ backup_dir }}/

          These files contain the output of `pct config` and `pct status`
          only. They document how each container was set up; they do NOT
          contain container data and cannot be passed to `pct restore`.

          To review a saved configuration:
          ```bash
          cat {{ backup_dir }}/container-VMID-NAME.conf
          ```

          To restore a container from a vzdump archive (if one was taken
          before removal) to a new VMID (e.g., 1000):
          ```bash
          pct restore 1000 /path/to/vzdump-lxc-VMID-<timestamp>.tar.zst

          # Verify
          pct list | grep 1000
          pct status 1000
          ```

          ## Backup Retention
          - No automatic cleanup is configured by this playbook
          - Manual archive: Copy to dlx-nfs-sdb-02 for longer retention
          - Format: container-{VMID}-{NAME}.conf

        dest: "/tmp/container-recovery-guide.txt"
        mode: "0644"