# Source: dlx-ansible/playbooks/configure-storage-monitoring.yml
# (385 lines, 15 KiB, YAML)

---
# Configure proactive storage monitoring and alerting for Proxmox hosts
# Monitors: Filesystem usage, Docker storage, Container allocation
# Alerts at: 75%, 85%, 95% capacity thresholds
# Play 1: runs against every host in the "proxmox" inventory group and
# installs the monitoring scripts, their cron schedule and a logrotate policy.
- name: "Setup storage monitoring and alerting"
  hosts: proxmox
  gather_facts: true
  vars:
    # NOTE(review): none of the tasks visible in this play reference these
    # variables; the scripts below hard-code 75/85/95 and the cron schedule.
    # Confirm whether they are consumed elsewhere before relying on them.
    alert_threshold_75: true # Alert when >75% full
    alert_threshold_85: true # Alert when >85% full
    alert_threshold_95: true # Alert when >95% full (critical)
    alert_email: "admin@directlx.dev"
    monitoring_interval: "5m" # Check every 5 minutes
  tasks:
# Directory that holds every script installed by the tasks that follow.
- name: Create storage monitoring directory
  become: true
  file:
    state: directory
    path: /usr/local/bin/storage-monitoring
    mode: "0755"
# Installs check-capacity.sh: compares the Use% of selected filesystems with
# three thresholds and records breaches in a log file and in syslog.
- name: Create filesystem capacity check script
  copy:
    content: |
      #!/bin/bash
      # Filesystem capacity monitoring
      # For each checked path: read Use% from df and, when it is above a
      # threshold, append a line to $LOGFILE and send the event to syslog.
      HOSTNAME=$(hostname)
      THRESHOLD_75=75
      THRESHOLD_85=85
      THRESHOLD_95=95
      LOGFILE="/var/log/storage-monitor.log"

      # log_event LEVEL PATH USAGE - append one timestamped line to $LOGFILE.
      log_event() {
        local level=$1 fs=$2 usage=$3
        local timestamp
        timestamp=$(date '+%Y-%m-%d %H:%M:%S')
        echo "[$timestamp] [$level] $fs: ${usage}% used" >> "$LOGFILE"
      }

      # check_filesystem PATH - evaluate one path against the thresholds.
      # Returns 1 (after a syslog error) when usage cannot be determined.
      check_filesystem() {
        local fs=$1
        local usage
        # -P keeps each filesystem on a single line even when the device
        # name is long, so Use% is always field 5 of line 2.
        usage=$(df -P -- "$fs" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
        # df can fail (stale mount, path vanished); never hand a non-number
        # to the numeric comparisons below.
        if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
          echo "ERROR: $HOSTNAME could not determine usage of $fs" | \
            logger -t storage-monitor -p local0.err
          return 1
        fi
        if [ "$usage" -gt "$THRESHOLD_95" ]; then
          log_event "CRITICAL" "$fs" "$usage"
          echo "CRITICAL: $HOSTNAME $fs is $usage% full" | \
            logger -t storage-monitor -p local0.crit
        elif [ "$usage" -gt "$THRESHOLD_85" ]; then
          log_event "WARNING" "$fs" "$usage"
          echo "WARNING: $HOSTNAME $fs is $usage% full" | \
            logger -t storage-monitor -p local0.warning
        elif [ "$usage" -gt "$THRESHOLD_75" ]; then
          log_event "ALERT" "$fs" "$usage"
          echo "ALERT: $HOSTNAME $fs is $usage% full" | \
            logger -t storage-monitor -p local0.notice
        fi
      }

      # Check root filesystem
      check_filesystem "/"
      # Check Proxmox-specific mounts. An unmatched glob stays literal, so
      # the -d test also filters out patterns that matched nothing.
      for mount in /mnt/pve/* /mnt/dlx-*; do
        if [ -d "$mount" ]; then
          check_filesystem "$mount"
        fi
      done
      # Check specific critical mounts
      [ -d "/var" ] && check_filesystem "/var"
      [ -d "/home" ] && check_filesystem "/home"
    dest: /usr/local/bin/storage-monitoring/check-capacity.sh
    mode: "0755"
  become: yes
# Installs check-docker.sh: threshold check for the /mnt/pve/dlx-docker
# filesystem plus a "docker system df" snapshot appended to the log.
- name: Create Docker-specific monitoring script
  copy:
    content: |
      #!/bin/bash
      # Docker storage utilization monitoring
      # Only runs on hosts with Docker installed
      if ! command -v docker &> /dev/null; then
        exit 0
      fi
      HOSTNAME=$(hostname)
      LOGFILE="/var/log/docker-monitor.log"
      THRESHOLD_75=75
      THRESHOLD_85=85
      THRESHOLD_95=95
      DOCKER_MOUNT="/mnt/pve/dlx-docker"

      # log_docker_event LEVEL USAGE - append one timestamped line to $LOGFILE.
      log_docker_event() {
        local level=$1 usage=$2
        local timestamp
        timestamp=$(date '+%Y-%m-%d %H:%M:%S')
        echo "[$timestamp] [$level] Docker storage: ${usage}% used" >> "$LOGFILE"
      }

      # Check dlx-docker mount (proxmox-01)
      if [ -d "$DOCKER_MOUNT" ]; then
        # -P keeps the df record on one line so Use% is field 5 of line 2.
        USAGE=$(df -P -- "$DOCKER_MOUNT" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
        # Skip the comparisons when df produced no usable number rather than
        # letting the numeric tests fail on an empty string.
        if ! [[ "$USAGE" =~ ^[0-9]+$ ]]; then
          echo "ERROR: could not determine Docker storage usage on $HOSTNAME" | \
            logger -t docker-monitor -p local0.err
        elif [ "$USAGE" -gt "$THRESHOLD_95" ]; then
          log_docker_event "CRITICAL" "$USAGE"
          echo "CRITICAL: Docker storage $USAGE% full on $HOSTNAME" | \
            logger -t docker-monitor -p local0.crit
        elif [ "$USAGE" -gt "$THRESHOLD_85" ]; then
          log_docker_event "WARNING" "$USAGE"
          echo "WARNING: Docker storage $USAGE% full on $HOSTNAME" | \
            logger -t docker-monitor -p local0.warning
        elif [ "$USAGE" -gt "$THRESHOLD_75" ]; then
          log_docker_event "ALERT" "$USAGE"
          echo "ALERT: Docker storage $USAGE% full on $HOSTNAME" | \
            logger -t docker-monitor -p local0.notice
        fi
        # Also check Docker disk usage
        docker system df >> "$LOGFILE" 2>&1
      fi
    dest: /usr/local/bin/storage-monitoring/check-docker.sh
    mode: "0755"
  become: yes
# Installs check-containers.sh: a periodic audit of LXC rootfs allocations
# and of QEMU VMs, appended to /var/log/container-monitor.log.
- name: Create container allocation tracking script
  copy:
    content: |
      #!/bin/bash
      # Track LXC/KVM container disk allocations
      # Reports LXC containers whose rootfs allocation is larger than 50GB and
      # lists QEMU VMs whose configuration mentions a SCSI device.
      LOGFILE="/var/log/container-monitor.log"
      TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

      # size_to_gb SIZE - convert a size such as 8G, 512M or 2T to whole GB.
      # Prints 0 for anything unparsable so callers can always compare
      # numerically.
      size_to_gb() {
        local raw=$1 num
        [[ "$raw" =~ ^([0-9]+)(\.[0-9]+)?([KMGT]?)$ ]] || { echo 0; return; }
        # Force base 10 so a leading zero is not read as octal.
        num=$(( 10#${BASH_REMATCH[1]} ))
        case "${BASH_REMATCH[3]}" in
          T) echo $(( num * 1024 )) ;;
          G) echo "$num" ;;
          M) echo $(( num / 1024 )) ;;
          K) echo $(( num / 1024 / 1024 )) ;;
          *) echo $(( num / 1024 / 1024 / 1024 )) ;;
        esac
      }

      echo "[$TIMESTAMP] Container allocation audit:" >> "$LOGFILE"
      # "pct list" prints the columns VMID, Status, Lock, Name. Lock is
      # normally blank, so the name is taken as the last field instead of a
      # fixed position.
      pct list 2>/dev/null | tail -n +2 | while read -r vmid status rest; do
        [ -n "$vmid" ] || continue
        name=${rest##* }
        # Get max disk allocation from the size= option of the rootfs line.
        size=$(pct config "$vmid" 2>/dev/null | \
          sed -n 's/^rootfs:.*size=\([0-9.]*[KMGT]\{0,1\}\).*/\1/p')
        maxdisk=$(size_to_gb "$size")
        if [ "$maxdisk" -gt 50 ]; then
          echo " [$status] $vmid ($name): ${maxdisk}GB allocated" >> "$LOGFILE"
        fi
      done
      # Also check KVM/QEMU VMs ("qm list" columns start VMID, NAME, STATUS).
      qm list 2>/dev/null | tail -n +2 | while read -r vmid name status _; do
        [ -n "$vmid" ] || continue
        # Counts config lines mentioning "scsi"; this is a presence check,
        # not a size. NOTE(review): it also matches the scsihw controller
        # line - confirm whether only scsiN disk entries were intended.
        scsi_lines=$(qm config "$vmid" 2>/dev/null | grep -ci scsi)
        if [ "$scsi_lines" -gt 0 ]; then
          echo " [$status] QEMU:$vmid ($name)" >> "$LOGFILE"
        fi
      done
    dest: /usr/local/bin/storage-monitoring/check-containers.sh
    mode: "0755"
  become: yes
# Registers the three monitoring scripts in root's crontab. Each job appends
# its stdout and stderr to /var/log/storage-cron.log.
- name: Install monitoring cron jobs
  become: true
  cron:
    user: root
    name: "{{ item.name }}"
    minute: "{{ item.minute }}"
    hour: "{{ item.hour }}"
    job: "{{ item.job }} >> /var/log/storage-cron.log 2>&1"
  loop:
    # Every 5 minutes
    - { name: "Storage capacity check", minute: "*/5", hour: "*", job: "/usr/local/bin/storage-monitoring/check-capacity.sh" }
    # Every 10 minutes
    - { name: "Docker storage check", minute: "*/10", hour: "*", job: "/usr/local/bin/storage-monitoring/check-docker.sh" }
    # At minute 0 of every 4th hour
    - { name: "Container allocation audit", minute: "0", hour: "*/4", job: "/usr/local/bin/storage-monitoring/check-containers.sh" }
# Rotates the four monitoring logs daily and keeps 14 compressed generations.
- name: Configure logrotate for monitoring logs
  copy:
    content: |
      /var/log/storage-monitor.log
      /var/log/docker-monitor.log
      /var/log/container-monitor.log
      /var/log/storage-cron.log {
          daily
          rotate 14
          compress
          missingok
          notifempty
          create 0640 root root
      }
    dest: /etc/logrotate.d/storage-monitoring
    # Set ownership and mode explicitly instead of inheriting the umask:
    # logrotate skips config files whose permissions it considers unsafe.
    owner: root
    group: root
    mode: "0644"
  become: yes
# Installs cluster-status.sh: an on-demand overview that queries each
# cluster node over ssh and prints its filesystem usage.
- name: Create storage monitoring summary script
  copy:
    content: |
      #!/bin/bash
      # Summarize storage status across cluster
      # Run this for quick dashboard view
      # Usage: cluster-status.sh [host ...]
      #   With no arguments the three default cluster nodes are queried.
      if [ "$#" -gt 0 ]; then
        hosts=("$@")
      else
        hosts=(proxmox-00 proxmox-01 proxmox-02)
      fi

      # resolve_address HOST - print the inventory's ansible_host for HOST.
      # Falls back to HOST itself when ansible-inventory or jq is missing or
      # the inventory has no ansible_host ("// empty" stops jq printing the
      # literal string "null").
      resolve_address() {
        local host=$1 addr
        addr=$(ansible-inventory --host "$host" 2>/dev/null | \
          jq -r '.ansible_host // empty' 2>/dev/null)
        echo "${addr:-$host}"
      }

      # Commands executed on each node. The quoted heredoc keeps them
      # literal, so no escaping of quotes or dollar signs is needed.
      # read returns non-zero at end of input, hence the "|| true".
      read -r -d '' remote_cmd <<'REMOTE' || true
      df -h / | tail -1 | awk '{printf " Root: %s (used: %s)\n", $5, $3}'
      [ -d /mnt/pve/dlx-docker ] && df -h /mnt/pve/dlx-docker | tail -1 | awk '{printf " Docker: %s (used: %s)\n", $5, $3}'
      df -h /mnt/pve/* 2>/dev/null | tail -n +2 | awk '{printf " %s: %s (used: %s)\n", $NF, $5, $3}'
      REMOTE

      echo "╔════════════════════════════════════════════════════════════╗"
      echo "║ PROXMOX CLUSTER STORAGE STATUS ║"
      echo "╚════════════════════════════════════════════════════════════╝"
      echo ""
      for host in "${hosts[@]}"; do
        echo "[$host]"
        ssh -o ConnectTimeout=5 "dlxadmin@$(resolve_address "$host")" \
          "$remote_cmd" 2>/dev/null || \
          echo " [unreachable]"
        echo ""
      done
      echo "Monitoring logs:"
      echo " tail -f /var/log/storage-monitor.log"
      echo " tail -f /var/log/docker-monitor.log"
      echo " tail -f /var/log/container-monitor.log"
    dest: /usr/local/bin/storage-monitoring/cluster-status.sh
    mode: "0755"
  become: yes
# Informational only: prints a recap of the scripts, cron schedule, alert
# thresholds and log files set up by the preceding tasks. The debug module
# just emits the message below and changes nothing on the host.
- name: Display monitoring setup summary
debug:
msg: |
╔══════════════════════════════════════════════════════════════╗
║ STORAGE MONITORING CONFIGURED ║
╚══════════════════════════════════════════════════════════════╝
Monitoring scripts installed:
✓ /usr/local/bin/storage-monitoring/check-capacity.sh
✓ /usr/local/bin/storage-monitoring/check-docker.sh
✓ /usr/local/bin/storage-monitoring/check-containers.sh
✓ /usr/local/bin/storage-monitoring/cluster-status.sh
Cron Jobs Configured:
✓ Every 5 min: Filesystem capacity checks
✓ Every 10 min: Docker storage checks
✓ Every 4 hours: Container allocation audit
Alert Thresholds:
⚠️ 75%: ALERT (notice level)
⚠️ 85%: WARNING (warning level)
🔴 95%: CRITICAL (critical level)
Log Files:
• /var/log/storage-monitor.log
• /var/log/docker-monitor.log
• /var/log/container-monitor.log
• /var/log/storage-cron.log (cron execution log)
Quick Status Commands:
$ /usr/local/bin/storage-monitoring/cluster-status.sh
$ tail -f /var/log/storage-monitor.log
$ grep CRITICAL /var/log/storage-monitor.log
System Integration:
- Logs sent to syslog (logger -t storage-monitor)
- Searchable with: journalctl -t storage-monitor
- Can integrate with rsyslog for forwarding
- Can integrate with monitoring tools (Prometheus, Grafana)
# Play 2 (optional): installs a script that prints storage metrics in the
# Prometheus text format.
# Deliberately no '---' marker before this play: a playbook is loaded as a
# single YAML document, so further plays are just more items of the same
# top-level list.
- name: "Create Prometheus metrics export (optional)"
  hosts: proxmox
  gather_facts: true
  tasks:
# Installs prometheus-metrics.sh, which prints per-mount capacity and usage
# metrics to stdout in the Prometheus text exposition format.
- name: Create Prometheus metrics script
  copy:
    content: |
      #!/bin/bash
      # Export storage metrics in Prometheus format (written to stdout).
      # NOTE(review): to publish through node_exporter, the output would
      # presumably be redirected into a .prom file in its textfile collector
      # directory - confirm against the deployment.
      cat << 'EOF'
      # HELP pve_storage_capacity_bytes Storage capacity in bytes
      # TYPE pve_storage_capacity_bytes gauge
      EOF
      # Percent samples are buffered and printed after the loop so each
      # metric family is emitted as one contiguous group.
      percent_lines=""
      # "df -P -B1" prints six columns per filesystem on a single line:
      # filesystem, size, used, available, use%, mount point. The last read
      # variable receives the rest of the line, so mount points containing
      # spaces stay intact.
      while read -r _fs total used available percent mount; do
        # Skip certain mounts (pseudo filesystems and /boot).
        [[ "$mount" =~ ^/(dev|proc|sys|run|boot)(/|$) ]] && continue
        printf 'pve_storage_capacity_bytes{mount="%s",type="total"} %s\n' "$mount" "$total"
        printf 'pve_storage_capacity_bytes{mount="%s",type="used"} %s\n' "$mount" "$used"
        printf 'pve_storage_capacity_bytes{mount="%s",type="available"} %s\n' "$mount" "$available"
        percent=${percent//%/}
        # df shows "-" for filesystems without a usable size; only emit
        # numeric samples.
        if [[ "$percent" =~ ^[0-9]+$ ]]; then
          percent_lines+="pve_storage_percent{mount=\"$mount\"} $percent"$'\n'
        fi
      done < <(df -P -B1 | tail -n +2)
      cat << 'EOF'
      # HELP pve_storage_percent Storage usage in percent
      # TYPE pve_storage_percent gauge
      EOF
      printf '%s' "$percent_lines"
    dest: /usr/local/bin/storage-monitoring/prometheus-metrics.sh
    mode: "0755"
  become: yes
# Informational only: prints how to run the metrics script and outlines the
# steps for wiring it into node_exporter, Prometheus and Grafana. Nothing on
# the host is changed by this task.
- name: Display Prometheus integration note
debug:
msg: |
Prometheus Integration Available:
$ /usr/local/bin/storage-monitoring/prometheus-metrics.sh
To integrate with node_exporter:
1. Copy script to node_exporter textfile directory
2. Add collector to Prometheus scrape config
3. Create dashboards in Grafana
Example Prometheus queries:
- Storage usage: pve_storage_capacity_bytes{type="used"}
- Available space: pve_storage_capacity_bytes{type="available"}
- Percentage: pve_storage_percent
# Play 3: runs on the control node only and prints a closing summary of the
# related playbooks and suggested next steps. It gathers no facts and changes
# nothing.
# Deliberately no '---' marker before this play: a playbook is loaded as a
# single YAML document, so further plays are just more items of the same
# top-level list.
- name: "Display final configuration summary"
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║ STORAGE MONITORING & REMEDIATION COMPLETE ║
          ╚══════════════════════════════════════════════════════════════╝
          Playbooks Created:
          1. remediate-storage-critical-issues.yml
          - Cleans logs on proxmox-00
          - Prunes Docker on proxmox-01
          - Audits SonarQube usage
          2. remediate-docker-storage.yml
          - Detailed Docker cleanup
          - Removes dangling resources
          - Sets up automated weekly prune
          3. remediate-stopped-containers.yml
          - Safely removes unused containers
          - Creates config backups
          - Recoverable deletions
          4. configure-storage-monitoring.yml
          - Continuous capacity monitoring
          - Alert thresholds (75/85/95%)
          - Prometheus integration
          To Execute All Remediations:
          $ ansible-playbook playbooks/remediate-storage-critical-issues.yml
          $ ansible-playbook playbooks/remediate-docker-storage.yml
          $ ansible-playbook playbooks/configure-storage-monitoring.yml
          To Check Monitoring Status:
          SSH to any Proxmox host and run:
          $ tail -f /var/log/storage-monitor.log
          $ /usr/local/bin/storage-monitoring/cluster-status.sh
          Next Steps:
          1. Review and test playbooks with --check
          2. Run on one host first (proxmox-00)
          3. Monitor for 48 hours for stability
          4. Extend to other hosts once verified
          5. Schedule regular execution (weekly)
          Expected Results:
          - proxmox-00 root: 84.5% → 70%
          - proxmox-01 docker: 81.1% → 70%
          - Freed space: 500+ GB
          - Monitoring active and alerting