# Check nodes
kubectl get nodes -o wide
# Check all pods (show non-running)
kubectl get pods -A | grep -v "Running\|Completed"
# Events (recent issues)
kubectl get events -A --sort-by='.lastTimestamp' | tail -20
# Resource usage
kubectl top nodes
kubectl top pods -A --sort-by=memory | head -20
# Check Longhorn volumes
kubectl get volumes.longhorn.io -n longhorn-system
# Follow logs
kubectl logs -n cardboard deployment/cardboard -f --tail=50
# Last 100 lines
kubectl logs -n trade-bot deployment/trade-bot --tail=100
# All containers in a pod
kubectl logs -n dev-workspace <pod-name> --all-containers
# Previous container (after crash)
kubectl logs -n <namespace> <pod-name> --previous
kubectl rollout restart deployment/cardboard -n cardboard
kubectl rollout status deployment/cardboard -n cardboard --timeout=300s
# Watch rollout
kubectl rollout status deployment/myapp -n <namespace>
# View history
kubectl rollout history deployment/myapp -n <namespace>
# Rollback to previous
kubectl rollout undo deployment/myapp -n <namespace>
# Exec into running pod
kubectl exec -it -n <namespace> <pod-name> -- /bin/bash
# Debug crashed pod
kubectl debug -n <namespace> <pod-name> -it --image=busybox
# Force-delete stuck pod
kubectl delete pod <name> -n <namespace> --force --grace-period=0
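If the pod still won't go away after a force delete, a stuck finalizer is the usual culprit. Clearing finalizers is a last resort; this is a standard kubectl merge patch, not something specific to this cluster:
# Last resort: strip finalizers so the API server can remove the object
kubectl patch pod <name> -n <namespace> --type=merge -p '{"metadata":{"finalizers":null}}'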
ssh -F ssh_config k3s-server-1
# Or directly:
ssh debian@192.168.20.20
kubectl cordon k3s-agent-1
kubectl drain k3s-agent-1 --ignore-daemonsets --delete-emptydir-data
# Perform maintenance...
kubectl uncordon k3s-agent-1
Plex on k3s-agent-4: Plex has a PodDisruptionBudget with maxUnavailable: 0, so kubectl drain will block until Plex is manually scaled down:

kubectl scale deployment plex -n media --replicas=0
kubectl drain k3s-agent-4 --ignore-daemonsets --delete-emptydir-data
# After maintenance:
kubectl uncordon k3s-agent-4
kubectl scale deployment plex -n media --replicas=1
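To confirm the PDB is what's blocking the drain:
kubectl get pdb -n media
# ALLOWED DISRUPTIONS of 0 means drain will hang until replicas are scaled down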
# For agents (safer)
kubectl drain k3s-agent-1 --ignore-daemonsets --delete-emptydir-data
ssh k3s-agent-1 'sudo reboot'
kubectl uncordon k3s-agent-1
# For servers — do ONE at a time, check etcd first:
kubectl get pods -n kube-system -l component=etcd
sudo k3s etcd-snapshot list
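Before touching any server, it's also worth taking a fresh on-demand snapshot (built into k3s):
# Run on a server node; the snapshot lands in the default snapshot directory
sudo k3s etcd-snapshot save --name pre-upgrade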
Critical gotcha: k3s agent upgrades wipe /etc/systemd/system/k3s-agent.service.env. Always restore K3S_TOKEN and K3S_URL after every agent upgrade or the agent will fail to rejoin the cluster.
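A cheap guard is to copy the env file aside before each agent upgrade. A minimal sketch (the .bak path is arbitrary, and k3s-agent-1 stands in for whichever agent you're upgrading):
# Save a copy before upgrading
ssh -F ssh_config k3s-agent-1 "sudo cp /etc/systemd/system/k3s-agent.service.env /root/k3s-agent.service.env.bak"
# Restore it afterwards if the upgrade wiped it
ssh -F ssh_config k3s-agent-1 "sudo cp /root/k3s-agent.service.env.bak /etc/systemd/system/k3s-agent.service.env && sudo systemctl daemon-reload && sudo systemctl restart k3s-agent"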
# Use the helper script
./scripts/upgrade-k3s.sh v1.34.5+k3s1
# Or manually — upgrade servers first, one at a time
for server in k3s-server-1 k3s-server-2 k3s-server-3; do
  ssh -F ssh_config "$server" "curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.XX.X+k3s1 sh -"
  sleep 30
  kubectl get nodes  # verify the server rejoined
done
# Then upgrade agents
for agent in k3s-agent-1 k3s-agent-2 k3s-agent-3 k3s-agent-4; do
  ssh -F ssh_config "$agent" "curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.XX.X+k3s1 INSTALL_K3S_EXEC=agent sh -"
  # IMMEDIATELY verify the env file after each agent upgrade (restore it if wiped; see below):
  ssh -F ssh_config "$agent" "sudo systemctl show k3s-agent --property=EnvironmentFiles"
  # Verify K3S_TOKEN and K3S_URL are in the env file:
  ssh -F ssh_config "$agent" "sudo cat /etc/systemd/system/k3s-agent.service.env"
done
If an agent fails to rejoin (e.g., k3s-agent service shows K3S_URL not set error):
# SSH to the affected agent
ssh debian@192.168.20.30 # adjust IP
# Restore the environment file
sudo tee /etc/systemd/system/k3s-agent.service.env <<EOF
K3S_TOKEN=<cluster-token-from-group_vars>
K3S_URL=https://192.168.20.20:6443
EOF
sudo systemctl daemon-reload
sudo systemctl restart k3s-agent
sudo systemctl status k3s-agent
The K3S_TOKEN is in ansible/group_vars/all.yml. The K3S_URL always points to k3s-server-1's static IP (192.168.20.20:6443).
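If group_vars/all.yml is SOPS-encrypted (see the SOPS section below), one way to pull the token locally; the exact variable name is an assumption:
# Assumes the token variable name contains "token"
sops -d ansible/group_vars/all.yml | grep -i token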
# All nodes should show new version
kubectl get nodes -o wide | awk '{print $1, $5}'
# Verify no pods stuck in terminating
kubectl get pods -A | grep -v "Running\|Completed"
| Tier | What | When (UTC) | Retention | S3 Path |
|---|---|---|---|---|
| etcd snapshots | Cluster state (k3s) | 2:00 AM daily | 7 snapshots | etcd-snapshots/ |
| PostgreSQL dumps | 15 app databases | 3:00–4:05 AM daily | 7 days | postgres-backups/<app>/ (openclaw-memory-db → zolty-homelab-backups/openclaw-memory/) |
| Longhorn volumes | PVC data | 5:00 AM daily | 7 snapshots | longhorn/ |
| Velero (daily) | All Kubernetes objects | 2:00 AM daily | 30 days | velero/ |
| Velero (weekly) | All Kubernetes objects | Sundays 3:00 AM | ~360 days (8640h) | velero/ |
| Velero (monthly) | All Kubernetes objects | 1st of month 4:00 AM | 365 days (8760h) | velero/ |
Most CronJobs dump via pg_dump to s3://k3s-homelab-backups-855878721457/postgres-backups/<namespace>/. Retention: 7 daily backups per database. Exception: openclaw-memory-db backs up to a separate bucket: s3://zolty-homelab-backups/openclaw-memory/.
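Each CronJob's manifest lives in the file listed in the table below; conceptually the job runs something like this sketch. Host, user, and credential handling are assumptions here, not the literal manifest command:
# Illustrative only; real values come from each CronJob's env vars and secrets
pg_dump -h <db-host> -U <db-user> <db-name> | gzip | \
  aws s3 cp - "s3://k3s-homelab-backups-855878721457/postgres-backups/<namespace>/<db-name>-$(date +%F).sql.gz"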
| Database | Namespace | DB Name | Schedule (UTC) | CronJob File |
|---|---|---|---|---|
| cardboard | cardboard | cardboard | 3:00 AM | cardboard/postgres-backup-cronjob.yaml |
| openclaw-memory-db | open-webui | openclaw-memory-db | 3:00 AM | openclaw/openclaw-db.yaml (separate bucket: zolty-homelab-backups/openclaw-memory/) |
| ham | ham | habittracker | 3:15 AM | ham/postgres-backup-cronjob.yaml |
| trade-bot | trade-bot | tradebot | 3:15 AM | trade-bot/postgres-backup-cronjob.yaml |
| aja-recipes | aja-recipes | recipes | 3:20 AM | aja-recipes/postgres-backup-cronjob.yaml |
| dnd | dnd | dndmulti | 3:25 AM | dnd/postgres-backup-cronjob.yaml |
| wiki | wiki | wiki | 3:30 AM | wiki/postgres-backup-cronjob.yaml |
| jellyfin | media | jellyfin | 3:30 AM | media/jellyfin-pg-backup.yaml |
| digital-signage | digital-signage | digitalsignage | 3:35 AM | digital-signage/postgres-backup-cronjob.yaml |
| openclaw-ops | openclaw-ops | openclaw_ops | 3:40 AM | openclaw-ops/postgres-backup-cronjob.yaml |
| openclaw-personal | openclaw-personal | openclaw_personal | 3:45 AM | openclaw-personal/postgres-backup-cronjob.yaml |
| media-profiler | media-profiler | mediaprofiler | 3:50 AM | media-profiler/postgres-backup-cronjob.yaml |
| authentik | authentik | authentik | 4:00 AM | embedded in authentik/authentik.yaml |
| media-controller | media | media_controller | 4:00 AM | media/media-controller-postgres-backup.yaml |
| polymarket-lab | polymarket-lab | polymarket | 4:05 AM | polymarket-lab/postgres-backup-cronjob.yaml |
No PostgreSQL backup CronJob exists for auto-brand (its PostgreSQL, NATS, and Redis StatefulSets have no automated backup). Data loss risk if a PVC is lost. Verify current state:

kubectl get cronjob -A | grep backup
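Until a CronJob exists for auto-brand, a manual dump is the stopgap. The pod, user, and database names below are assumptions; adjust them to match the StatefulSet:
# One-off manual dump of the auto-brand database
kubectl exec -n auto-brand <postgres-pod> -- pg_dump -U postgres <db-name> | gzip > auto-brand-$(date +%F).sql.gz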
# Check backup status
kubectl get backups -n velero
kubectl get schedules -n velero
# Describe a backup
kubectl describe backup k3s-daily-backup-<timestamp> -n velero
# Trigger manual backup
velero backup create manual-$(date +%Y%m%d) --include-namespaces="*" --exclude-namespaces="velero,kube-system"
# Check restore status
kubectl get restores -n velero
# Restore from backup
velero restore create --from-backup k3s-daily-backup-<timestamp>
Note: Velero backs up Kubernetes objects (Deployments, Services, ConfigMaps, PVCs, etc.) — not PVC data. Use Longhorn backups for volume data restoration. Together they provide full cluster + data recovery.
See Backup & Restoration Guide for full restore procedures.
kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80
# Open http://localhost:8080
# Or navigate to https://longhorn.k3s.internal.strommen.systems
kubectl get volumes -n longhorn-system
kubectl get nodes.longhorn.io -n longhorn-system
kubectl get certificates -A
kubectl get certificaterequests -A
kubectl get challenges -A
kubectl delete secret <tls-secret-name> -n <namespace>
# cert-manager automatically renews
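After deleting the secret, you can watch the replacement certificate get issued:
kubectl get certificate -n <namespace> -w
# READY flips back to True once the new secret is issued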
kubectl get svc -A | grep LoadBalancer
kubectl get ipaddresspools -n metallb-system
# View assigned IPs
kubectl get svc -A -o custom-columns='NS:.metadata.namespace,NAME:.metadata.name,IP:.status.loadBalancer.ingress[0].ip' | grep -v none
# Promote image from staging to production
gh workflow run promote-image.yml -f image=<name> -f tag=sha-<commit>
# Check Harbor robot account tokens (admin access available)
# Navigate to: https://harbor.k3s.internal.strommen.systems/harbor/projects
cd terraform/environments/homelab-prod
terraform plan
terraform apply
# AWS resources
cd terraform/environments/aws
terraform plan
terraform apply
# Unlock stuck state
terraform force-unlock <lock-id>
kubectl drain k3s-agent-1 --ignore-daemonsets --delete-emptydir-data --force
terraform destroy -target='module.k3s_agents["k3s-agent-1"]'
terraform apply
ansible-playbook -i inventory/homelab playbooks/site.yml --limit k3s-agent-1
Symptoms: Volume shows "Degraded" in Longhorn UI, pod stuck in ContainerCreating.
kubectl get volumes.longhorn.io -n longhorn-system

Most common cause: Service selector is missing app.kubernetes.io/component: web.
kubectl describe svc <service-name> -n <namespace>
# Check Selector: — must include component: web when PostgreSQL shares namespace
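If the selector is indeed missing the label, a standard merge patch adds it without touching the rest of the selector (the service name is whatever you found above):
kubectl patch svc <service-name> -n <namespace> --type=merge -p '{"spec":{"selector":{"app.kubernetes.io/component":"web"}}}'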
kubectl get certificates -A
kubectl get challenges -A
kubectl logs -n cert-manager deployment/cert-manager
# On the Proxmox host, check cluster quorum:
pvecm status

Symptoms: All pods fail DNS resolution.

kubectl get pods -n kube-system -l k8s-app=kube-dns -o wide
kubectl rollout restart deployment/coredns -n kube-system
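A quick end-to-end check once CoreDNS is back (assumes the busybox image is pullable):
kubectl run dns-test --rm -it --restart=Never --image=busybox -- nslookup kubernetes.default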
Symptom: k3s-agent service fails, logs show "K3S_URL not set" or an invalid token.

sudo cat /etc/systemd/system/k3s-agent.service.env
# Restore K3S_TOKEN and K3S_URL (see the k3s upgrade section above)
sudo systemctl daemon-reload && sudo systemctl restart k3s-agent

Alert: VeleroBackupFailed fires if any backup hasn't completed in 15 minutes.
kubectl get backups -n velero --sort-by='.metadata.creationTimestamp'
kubectl describe backup <name> -n velero
kubectl logs -n velero deployment/velero
# If backups fail, check credentials (cloud-credentials secret) and S3 bucket permissions

Sensitive files are encrypted with SOPS + age:
- .sops.yaml defines encryption rules
- scripts/setup-sops.sh — init/encrypt/decrypt/edit/verify commands
- age public key: age1t3wct0qrj3pdvgxzmky2rapyaq8aa2xd58kxr4s3z8tmputdfvms3cx8hf

# Add to ~/.zshrc
alias k='kubectl'
alias kgp='kubectl get pods -A'
alias kgn='kubectl get nodes'
alias kgs='kubectl get svc -A'
alias klog='kubectl logs -f'
alias kex='kubectl exec -it'