385 lines
15 KiB
YAML
385 lines
15 KiB
YAML
---
# Configure proactive storage monitoring and alerting for Proxmox hosts
# Monitors: Filesystem usage, Docker storage, Container allocation
# Alerts at: 75%, 85%, 95% capacity thresholds

# Play 1: install the monitoring scripts under /usr/local/bin/storage-monitoring,
# schedule them through root's crontab and rotate the logs they write.
- name: "Setup storage monitoring and alerting"
  hosts: proxmox
  gather_facts: true
  vars:
    # NOTE(review): none of these vars is referenced by the tasks in this
    # play; the scripts hard-code the 75/85/95 thresholds and the cron task
    # hard-codes the schedule. They are kept so existing overrides still load.
    alert_threshold_75: true  # Alert when >75% full
    alert_threshold_85: true  # Alert when >85% full
    alert_threshold_95: true  # Alert when >95% full (critical)
    alert_email: "admin@directlx.dev"
    monitoring_interval: "5m"  # Check every 5 minutes
  tasks:
    # All scripts created below live in this directory.
    - name: Create storage monitoring directory
      file:
        path: /usr/local/bin/storage-monitoring
        state: directory
        mode: "0755"
      become: true

    # Checks "/", every directory under /mnt/pve and /mnt/dlx-*, and /var and
    # /home when they are separate mounts. Writes to /var/log/storage-monitor.log
    # and to syslog (tag storage-monitor, facility local0).
    - name: Create filesystem capacity check script
      copy:
        content: |
          #!/bin/bash
          # Filesystem capacity monitoring
          # Alerts when thresholds are exceeded

          HOSTNAME=$(hostname)
          THRESHOLD_75=75
          THRESHOLD_85=85
          THRESHOLD_95=95
          LOGFILE="/var/log/storage-monitor.log"

          # Append one timestamped record to $LOGFILE.
          # Args: $1 level, $2 filesystem path, $3 usage percent.
          log_event() {
            local level=$1 fs=$2 usage=$3
            local timestamp
            timestamp=$(date '+%Y-%m-%d %H:%M:%S')
            echo "[$timestamp] [$level] $fs: ${usage}% used" >> "$LOGFILE"
          }

          # Compare the usage of the filesystem holding $1 with the thresholds
          # and report the highest one exceeded to the log file and to syslog.
          check_filesystem() {
            local fs=$1 usage
            # -P keeps each filesystem on a single line even when the device
            # name is long, so column 5 is always the usage percentage.
            usage=$(df -P "$fs" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

            # df failed or reported a non-numeric value ("-"): skip instead of
            # letting the integer comparisons below raise an error.
            case "$usage" in
              ''|*[!0-9]*) return 0 ;;
            esac

            if [ "$usage" -gt "$THRESHOLD_95" ]; then
              log_event "CRITICAL" "$fs" "$usage"
              echo "CRITICAL: $HOSTNAME $fs is ${usage}% full" | \
                logger -t storage-monitor -p local0.crit
            elif [ "$usage" -gt "$THRESHOLD_85" ]; then
              log_event "WARNING" "$fs" "$usage"
              echo "WARNING: $HOSTNAME $fs is ${usage}% full" | \
                logger -t storage-monitor -p local0.warning
            elif [ "$usage" -gt "$THRESHOLD_75" ]; then
              log_event "ALERT" "$fs" "$usage"
              echo "ALERT: $HOSTNAME $fs is ${usage}% full" | \
                logger -t storage-monitor -p local0.notice
            fi
          }

          # Check root filesystem
          check_filesystem "/"

          # Check Proxmox-specific mounts
          for mount in /mnt/pve/* /mnt/dlx-*; do
            if [ -d "$mount" ]; then
              check_filesystem "$mount"
            fi
          done

          # Check specific critical mounts. /var and /home always exist as
          # directories, so only check them when they are separate mount
          # points; otherwise they would repeat the "/" result.
          if mountpoint -q /var; then
            check_filesystem "/var"
          fi
          if mountpoint -q /home; then
            check_filesystem "/home"
          fi
        dest: /usr/local/bin/storage-monitoring/check-capacity.sh
        mode: "0755"
      become: true

    # Exits immediately on hosts without the docker CLI. Otherwise checks
    # /mnt/pve/dlx-docker and appends `docker system df` to the log.
    - name: Create Docker-specific monitoring script
      copy:
        content: |
          #!/bin/bash
          # Docker storage utilization monitoring
          # Only runs on hosts with Docker installed

          if ! command -v docker &> /dev/null; then
            exit 0
          fi

          HOSTNAME=$(hostname)
          LOGFILE="/var/log/docker-monitor.log"
          DOCKER_MOUNT="/mnt/pve/dlx-docker"
          THRESHOLD_75=75
          THRESHOLD_85=85
          THRESHOLD_95=95

          # Append one timestamped record to $LOGFILE.
          # Args: $1 level, $2 usage percent.
          log_docker_event() {
            local level=$1 usage=$2
            local timestamp
            timestamp=$(date '+%Y-%m-%d %H:%M:%S')
            echo "[$timestamp] [$level] Docker storage: ${usage}% used" >> "$LOGFILE"
          }

          # Check dlx-docker mount (proxmox-01)
          if [ -d "$DOCKER_MOUNT" ]; then
            # -P keeps the df record on one line so column 5 is the percentage.
            usage=$(df -P "$DOCKER_MOUNT" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

            # Only compare when df produced a plain integer.
            case "$usage" in
              ''|*[!0-9]*) usage="" ;;
            esac

            if [ -n "$usage" ]; then
              if [ "$usage" -gt "$THRESHOLD_95" ]; then
                log_docker_event "CRITICAL" "$usage"
                echo "CRITICAL: Docker storage ${usage}% full on $HOSTNAME" | \
                  logger -t docker-monitor -p local0.crit
              elif [ "$usage" -gt "$THRESHOLD_85" ]; then
                log_docker_event "WARNING" "$usage"
                echo "WARNING: Docker storage ${usage}% full on $HOSTNAME" | \
                  logger -t docker-monitor -p local0.warning
              elif [ "$usage" -gt "$THRESHOLD_75" ]; then
                log_docker_event "ALERT" "$usage"
                echo "ALERT: Docker storage ${usage}% full on $HOSTNAME" | \
                  logger -t docker-monitor -p local0.notice
              fi
            fi

            # Also check Docker disk usage
            docker system df >> "$LOGFILE" 2>&1
          fi
        dest: /usr/local/bin/storage-monitoring/check-docker.sh
        mode: "0755"
      become: true

    # Appends an audit section to /var/log/container-monitor.log on each run.
    - name: Create container allocation tracking script
      copy:
        content: |
          #!/bin/bash
          # Track LXC/KVM container disk allocations
          # Reports containers whose rootfs allocation exceeds 50GB and
          # QEMU VMs that have at least one disk attached

          LOGFILE="/var/log/container-monitor.log"
          TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
          MAX_ROOTFS_GB=50

          echo "[$TIMESTAMP] Container allocation audit:" >> "$LOGFILE"

          # pct list prints: VMID Status Lock Name. Lock is usually blank, so
          # the name is taken as the last field rather than a fixed column.
          pct list 2>/dev/null | tail -n +2 | while read -r vmid status rest; do
            name=${rest##* }

            # Get max disk allocation from the rootfs "size=<n><unit>" option.
            size=$(pct config "$vmid" 2>/dev/null | \
              sed -n 's/^rootfs:.*size=\([0-9][0-9]*[KMGT]\).*/\1/p' | head -n 1)

            # Normalise to whole gigabytes; a missing size or a size below
            # 1G (K/M units) counts as 0 and is never reported.
            case "$size" in
              *T) size_gb=$(( ${size%T} * 1024 )) ;;
              *G) size_gb=${size%G} ;;
              *) size_gb=0 ;;
            esac

            if [ "$size_gb" -gt "$MAX_ROOTFS_GB" ]; then
              echo " [$status] $vmid ($name): ${size_gb}GB allocated" >> "$LOGFILE"
            fi
          done

          # Also check KVM/QEMU VMs
          # qm list prints: VMID NAME STATUS MEM(MB) BOOTDISK(GB) PID.
          qm list 2>/dev/null | tail -n +2 | while read -r vmid name status _; do
            # Count real disk entries only: anchoring on "<bus><index>:" keeps
            # the "scsihw:" controller line out, and CD-ROM drives are dropped.
            disk_count=$(qm config "$vmid" 2>/dev/null | \
              grep -E '^(scsi|virtio|sata|ide)[0-9]+:' | grep -vc 'media=cdrom')
            if [ "$disk_count" -gt 0 ]; then
              echo " [$status] QEMU:$vmid ($name)" >> "$LOGFILE"
            fi
          done
        dest: /usr/local/bin/storage-monitoring/check-containers.sh
        mode: "0755"
      become: true

    # One crontab entry per item; stdout/stderr of every job is appended to
    # /var/log/storage-cron.log.
    - name: Install monitoring cron jobs
      cron:
        name: "{{ item.name }}"
        hour: "{{ item.hour }}"
        minute: "{{ item.minute }}"
        job: "{{ item.job }} >> /var/log/storage-cron.log 2>&1"
        user: root
      become: true
      with_items:
        - name: "Storage capacity check"
          hour: "*"
          minute: "*/5"
          job: "/usr/local/bin/storage-monitoring/check-capacity.sh"
        - name: "Docker storage check"
          hour: "*"
          minute: "*/10"
          job: "/usr/local/bin/storage-monitoring/check-docker.sh"
        - name: "Container allocation audit"
          hour: "*/4"
          minute: "0"
          job: "/usr/local/bin/storage-monitoring/check-containers.sh"

    # Daily rotation, 14 compressed generations, for all four log files.
    - name: Configure logrotate for monitoring logs
      copy:
        content: |
          /var/log/storage-monitor.log
          /var/log/docker-monitor.log
          /var/log/container-monitor.log
          /var/log/storage-cron.log {
              daily
              rotate 14
              compress
              missingok
              notifempty
              create 0640 root root
          }
        dest: /etc/logrotate.d/storage-monitoring
        # Explicit mode so the result does not depend on the remote umask.
        mode: "0644"
      become: true

    # Interactive helper: prints df summaries for the three cluster nodes
    # over SSH as user dlxadmin.
    - name: Create storage monitoring summary script
      copy:
        content: |
          #!/bin/bash
          # Summarize storage status across cluster
          # Run this for quick dashboard view

          echo "╔════════════════════════════════════════════════════════════╗"
          echo "║ PROXMOX CLUSTER STORAGE STATUS ║"
          echo "╚════════════════════════════════════════════════════════════╝"
          echo ""

          for host in proxmox-00 proxmox-01 proxmox-02; do
            echo "[$host]"
            # Use the inventory's ansible_host when it can be resolved; fall
            # back to the host name when ansible-inventory or jq is missing,
            # fails, or the host has no ansible_host ("// empty" drops null).
            target=$(ansible-inventory --host "$host" 2>/dev/null | \
              jq -r '.ansible_host // empty' 2>/dev/null)
            if [ -z "$target" ]; then
              target=$host
            fi
            ssh -o ConnectTimeout=5 "dlxadmin@$target" \
              "df -h / | tail -1 | awk '{printf \" Root: %s (used: %s)\\n\", \$5, \$3}'; \
              [ -d /mnt/pve/dlx-docker ] && df -h /mnt/pve/dlx-docker | tail -1 | awk '{printf \" Docker: %s (used: %s)\\n\", \$5, \$3}'; \
              df -h /mnt/pve/* 2>/dev/null | tail -n +2 | awk '{printf \" %s: %s (used: %s)\\n\", \$NF, \$5, \$3}'" 2>/dev/null || \
              echo " [unreachable]"
            echo ""
          done

          echo "Monitoring logs:"
          echo " tail -f /var/log/storage-monitor.log"
          echo " tail -f /var/log/docker-monitor.log"
          echo " tail -f /var/log/container-monitor.log"
        dest: /usr/local/bin/storage-monitoring/cluster-status.sh
        mode: "0755"
      become: true

    # Informational only; changes nothing on the hosts.
    - name: Display monitoring setup summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║ STORAGE MONITORING CONFIGURED ║
          ╚══════════════════════════════════════════════════════════════╝

          Monitoring scripts installed:
          ✓ /usr/local/bin/storage-monitoring/check-capacity.sh
          ✓ /usr/local/bin/storage-monitoring/check-docker.sh
          ✓ /usr/local/bin/storage-monitoring/check-containers.sh
          ✓ /usr/local/bin/storage-monitoring/cluster-status.sh

          Cron Jobs Configured:
          ✓ Every 5 min: Filesystem capacity checks
          ✓ Every 10 min: Docker storage checks
          ✓ Every 4 hours: Container allocation audit

          Alert Thresholds:
          ⚠️ 75%: ALERT (notice level)
          ⚠️ 85%: WARNING (warning level)
          🔴 95%: CRITICAL (critical level)

          Log Files:
          • /var/log/storage-monitor.log
          • /var/log/docker-monitor.log
          • /var/log/container-monitor.log
          • /var/log/storage-cron.log (cron execution log)

          Quick Status Commands:
          $ /usr/local/bin/storage-monitoring/cluster-status.sh
          $ tail -f /var/log/storage-monitor.log
          $ grep CRITICAL /var/log/storage-monitor.log

          System Integration:
          - Logs sent to syslog (logger -t storage-monitor)
          - Searchable with: journalctl -t storage-monitor
          - Can integrate with rsyslog for forwarding
          - Can integrate with monitoring tools (Prometheus, Grafana)

# Play 2: install a script that prints per-mount storage metrics in the
# Prometheus text format. It continues the play list of this document: a
# second `---` here would turn the file into a multi-document stream, which
# ansible-playbook refuses to load.
- name: "Create Prometheus metrics export (optional)"
  hosts: proxmox
  gather_facts: true
  tasks:
    - name: Create Prometheus metrics script
      copy:
        content: |
          #!/bin/bash
          # Export storage metrics in Prometheus format
          # The script only prints to stdout; to expose the values through
          # node_exporter, redirect the output to a *.prom file in its
          # textfile collector directory.

          # One record per filesystem: -P prevents wrapped lines and -B1
          # reports sizes in bytes. The header line is dropped.
          DF_ROWS=$(df -P -B1 | tail -n +2)

          cat << 'EOF'
          # HELP pve_storage_capacity_bytes Storage capacity in bytes
          # TYPE pve_storage_capacity_bytes gauge
          EOF

          # df prints six columns, so "mount" must be the sixth name; it also
          # receives the rest of the line when a mount point contains spaces.
          while read -r fs total used available use mount; do
            # Skip certain mounts
            [[ "$mount" =~ ^/(dev|proc|sys|run|boot) ]] && continue

            echo "pve_storage_capacity_bytes{mount=\"$mount\",type=\"total\"} $total"
            echo "pve_storage_capacity_bytes{mount=\"$mount\",type=\"used\"} $used"
            echo "pve_storage_capacity_bytes{mount=\"$mount\",type=\"available\"} $available"
          done <<< "$DF_ROWS"

          # Emitted as a separate group so the samples of each metric stay
          # together under their own HELP/TYPE header.
          cat << 'EOF'
          # HELP pve_storage_percent Storage usage in percent
          # TYPE pve_storage_percent gauge
          EOF

          while read -r fs total used available use mount; do
            # Skip certain mounts
            [[ "$mount" =~ ^/(dev|proc|sys|run|boot) ]] && continue

            percent=${use//%/}
            # df prints "-" for filesystems without a size; not a valid sample.
            [[ "$percent" =~ ^[0-9]+$ ]] || continue
            echo "pve_storage_percent{mount=\"$mount\"} $percent"
          done <<< "$DF_ROWS"
        dest: /usr/local/bin/storage-monitoring/prometheus-metrics.sh
        mode: "0755"
      become: true

    # Informational only; changes nothing on the hosts.
    - name: Display Prometheus integration note
      debug:
        msg: |
          Prometheus Integration Available:
          $ /usr/local/bin/storage-monitoring/prometheus-metrics.sh

          To integrate with node_exporter:
          1. Copy script to node_exporter textfile directory
          2. Add collector to Prometheus scrape config
          3. Create dashboards in Grafana

          Example Prometheus queries:
          - Storage usage: pve_storage_capacity_bytes{type="used"}
          - Available space: pve_storage_capacity_bytes{type="available"}
          - Percentage: pve_storage_percent

# Play 3: print a closing summary on the control node. It continues the play
# list of this document: a further `---` here would turn the file into a
# multi-document stream, which ansible-playbook refuses to load.
- name: "Display final configuration summary"
  hosts: localhost
  gather_facts: false
  tasks:
    # Informational only; the message is static text.
    - name: Summary
      debug:
        msg: |
          ╔══════════════════════════════════════════════════════════════╗
          ║ STORAGE MONITORING & REMEDIATION COMPLETE ║
          ╚══════════════════════════════════════════════════════════════╝

          Playbooks Created:
          1. remediate-storage-critical-issues.yml
          - Cleans logs on proxmox-00
          - Prunes Docker on proxmox-01
          - Audits SonarQube usage

          2. remediate-docker-storage.yml
          - Detailed Docker cleanup
          - Removes dangling resources
          - Sets up automated weekly prune

          3. remediate-stopped-containers.yml
          - Safely removes unused containers
          - Creates config backups
          - Recoverable deletions

          4. configure-storage-monitoring.yml
          - Continuous capacity monitoring
          - Alert thresholds (75/85/95%)
          - Prometheus integration

          To Execute All Remediations:
          $ ansible-playbook playbooks/remediate-storage-critical-issues.yml
          $ ansible-playbook playbooks/remediate-docker-storage.yml
          $ ansible-playbook playbooks/configure-storage-monitoring.yml

          To Check Monitoring Status:
          SSH to any Proxmox host and run:
          $ tail -f /var/log/storage-monitor.log
          $ /usr/local/bin/storage-monitoring/cluster-status.sh

          Next Steps:
          1. Review and test playbooks with --check
          2. Run on one host first (proxmox-00)
          3. Monitor for 48 hours for stability
          4. Extend to other hosts once verified
          5. Schedule regular execution (weekly)

          Expected Results:
          - proxmox-00 root: 84.5% → 70%
          - proxmox-01 docker: 81.1% → 70%
          - Freed space: 500+ GB
          - Monitoring active and alerting