diff --git a/docs/KAFKA-LOCALHOST-FIX.md b/docs/KAFKA-LOCALHOST-FIX.md new file mode 100644 index 0000000..a2f4471 --- /dev/null +++ b/docs/KAFKA-LOCALHOST-FIX.md @@ -0,0 +1,106 @@ +# Kafka Admin Client `localhost:9092` Warning Fix + +## Symptom + +During `sj_api` (Spring Boot) startup, the following warnings appear repeatedly: + +``` +WARN [kafka-admin-client-thread | smart-api-admin-0] + Connection to node -1 (localhost/127.0.0.1:9092) could not be established. + Node may not be available. +``` + +The application eventually starts successfully but takes ~60 seconds due to retry loops. + +## Root Cause + +Two separate issues compound each other: + +### 1. Kafka has two listeners — services were using the wrong one + +`services/kafka.yaml` defines: +```yaml +KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,EXTERNAL_LISTENER://192.168.200.114:9092 +KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:29092,EXTERNAL_LISTENER://0.0.0.0:9092 +``` + +- `PLAINTEXT://kafka:29092` — internal Docker network (for container-to-container) +- `EXTERNAL_LISTENER://192.168.200.114:9092` — external host access (for outside Docker) + +The `.env` had `kafkaservice=kafka:9092`, which connects to the **external** listener. +When a container connects via the external listener, Kafka returns metadata advertising +`192.168.200.114:9092` as the broker address. From inside a container, this routes back +through the host and causes connection confusion, including resolving to `localhost`. + +**Fix:** Change `.env` to use the internal PLAINTEXT listener: +``` +kafkaservice=kafka:29092 +``` + +### 2. Spring Boot `dev` profile hardcodes `localhost:9092` for the Kafka admin client + +The application jar's `application-dev.yml` has `localhost:9092` as the default Kafka +bootstrap server. The `KAFKASERVICE` env var only overrides the producer/consumer +clients — the Spring Kafka admin client reads from `spring.kafka.bootstrap-servers` +which was still falling back to the dev profile's `localhost:9092`. 
+ +**Fix:** Add `SPRING_KAFKA_BOOTSTRAP_SERVERS` to the api service environment in +`docker-compose-prod.yaml`, pointing at the same value as `KAFKASERVICE`: + +```yaml +environment: + - KAFKASERVICE=${kafkaservice} + - SPRING_KAFKA_BOOTSTRAP_SERVERS=${kafkaservice} # <-- add this +``` + +This overrides the dev profile default for the admin client at container startup. + +## Files Changed + +| File | Change | +|---|---| +| `/opt/smartjournal/.env` | `kafkaservice=kafka:9092` → `kafkaservice=kafka:29092` | +| `/opt/smartjournal/docker-compose-prod.yaml` | Added `SPRING_KAFKA_BOOTSTRAP_SERVERS=${kafkaservice}` to `api` service environment | + +## Result + +- No more `localhost:9092` warnings +- Startup time: ~60 seconds → ~20 seconds + +## Applying to Another Environment + +1. **Check Kafka listeners** — ensure the internal listener (PLAINTEXT) is on a different + port from the external listener and that `kafkaservice` in `.env` points to the internal one (port 29092 in this setup; use your own internal listener port): + ``` + kafkaservice=kafka:29092 + ``` + +2. **Add the Spring override** to the api service in `docker-compose-prod.yaml`: + ```yaml + - SPRING_KAFKA_BOOTSTRAP_SERVERS=${kafkaservice} + ``` + +3. **Recreate the api container** (restart is not sufficient — env vars require recreate): + ```bash + docker compose -f docker-compose-prod.yaml up -d --force-recreate api + ``` + +4. **Verify** — startup should complete in ~20 seconds with no `localhost` warnings: + ```bash + docker logs sj_api 2>&1 | grep -E 'localhost.*9092|Started UiApplication' + ``` + Expected: only the `Started UiApplication in XX seconds` line, no localhost warnings. + +## Related Issues Found During This Session + +- `mfa_enabled=fasle` typo in `.env` — caused `Invalid boolean value` startup crash. + Fixed by correcting to `mfa_enabled=false`. 
+ +- Duplicate env vars with hyphens vs underscores in `docker-compose-prod.yaml`: + ```yaml + - SAML-MAPPER-GRAPH-PROXY-PORT=${saml-mapper-graph-proxy-port} # broken (hyphen) + - SAML-MAPPER-GRAPH-PROXY-PORT=${saml_mapper_graph_proxy_port} # correct (underscore) + ``` + Docker Compose variable interpolation reads `${saml-mapper-graph-proxy-port}` as `${saml}` with default + `mapper-graph-proxy-port`, so the port env var receives a string instead of an integer, + crashing Spring Boot. Fixed by removing the hyphenated duplicate lines. diff --git a/host_vars/hiveops.yml b/host_vars/hiveops.yml deleted file mode 100644 index d1284ae..0000000 --- a/host_vars/hiveops.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -# HiveOps specific variables - -# Disable firewall (too many ports needed) -common_firewall_enabled: false - -# Enable IP forwarding for Docker networking -common_sysctl_settings: - net.ipv4.ip_forward: 1 - net.ipv4.conf.all.send_redirects: 0 - net.ipv4.conf.default.send_redirects: 0 diff --git a/inventory/hosts.yml b/inventory/hosts.yml index 3a7250c..4a88789 100644 --- a/inventory/hosts.yml +++ b/inventory/hosts.yml @@ -44,12 +44,18 @@ all: application: hosts: - hiveops: - ansible_host: 192.168.200.112 smartjournal: ansible_host: 192.168.200.114 - odoo: - ansible_host: 192.168.200.61 + + + kubernetes: + hosts: + dlx-kube-01: + ansible_host: 192.168.200.215 + dlx-kube-02: + ansible_host: 192.168.200.216 + dlx-kube-03: + ansible_host: 192.168.200.217 local: hosts: diff --git a/playbooks/configure-directlx-dev-dns.yml b/playbooks/configure-directlx-dev-dns.yml index 36a8f91..9b553d0 100644 --- a/playbooks/configure-directlx-dev-dns.yml +++ b/playbooks/configure-directlx-dev-dns.yml @@ -7,11 +7,7 @@ dns_records: - { ip: "192.168.200.71", hostname: "www" } - { ip: "192.168.200.71", hostname: "gitea" } - - { ip: "192.168.200.71", hostname: "mgmt" } - - { ip: "192.168.200.71", hostname: "hiveops" } - - { ip: "192.168.200.71", hostname: "browser" } - { ip: "192.168.200.71", hostname: "smartjournal" 
} - - { ip: "192.168.200.71", hostname: "incidents" } - { ip: "192.168.200.71", hostname: "remote" } - { ip: "192.168.200.71", hostname: "registry" } diff --git a/playbooks/configure-local-dns-localhost.yml b/playbooks/configure-local-dns-localhost.yml index 93738a6..c57623b 100644 --- a/playbooks/configure-local-dns-localhost.yml +++ b/playbooks/configure-local-dns-localhost.yml @@ -6,10 +6,6 @@ vars: npm_server_ip: "192.168.200.71" directlx_domains: - - incident.directlx.dev - - hiveops.directlx.dev - - mgmt.directlx.dev - - release.directlx.dev - gitea.directlx.dev - smartjournal.directlx.dev - directlx.dev @@ -40,10 +36,10 @@ - name: Test DNS resolution ansible.builtin.shell: | echo "Testing DNS resolution..." - getent hosts incident.directlx.dev + getent hosts gitea.directlx.dev echo "" echo "Testing HTTPS connectivity..." - curl -I --max-time 5 https://incident.directlx.dev 2>&1 | head -3 + curl -I --max-time 5 https://gitea.directlx.dev 2>&1 | head -3 register: test_results changed_when: false failed_when: false @@ -58,10 +54,6 @@ {{ test_results.stdout }} You can now access DirectLX services reliably: - - https://incident.directlx.dev - - https://hiveops.directlx.dev - - https://mgmt.directlx.dev - - https://release.directlx.dev - https://gitea.directlx.dev - https://smartjournal.directlx.dev - https://registry.directlx.dev (Docker Registry) diff --git a/playbooks/pihole-dns.yml b/playbooks/pihole-dns.yml index affd699..fcfc367 100644 --- a/playbooks/pihole-dns.yml +++ b/playbooks/pihole-dns.yml @@ -17,9 +17,7 @@ - { ip: "192.168.200.100", hostname: "pihole" } - { ip: "192.168.200.102", hostname: "gitea" } - { ip: "192.168.200.91", hostname: "jenkins" } - - { ip: "192.168.200.112", hostname: "hiveops" } - { ip: "192.168.200.114", hostname: "smartjournal" } - - { ip: "192.168.200.61", hostname: "odoo" } tasks: - name: Copy DNS update script