diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 1426906..c523f1e 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,7 +1,11 @@ { "permissions": { "allow": [ - "Bash(podman run:*)" + "Bash(podman run:*)", + "Bash(git stash:*)", + "Bash(make lint:*)", + "Bash(ls:*)", + "Bash(git add:*)" ] } } diff --git a/AGENTS.md b/AGENTS.md index 54e47e2..dfbade4 100755 --- a/AGENTS.md +++ b/AGENTS.md @@ -344,9 +344,9 @@ Serves the hyperfleet REST API with full authentication, database connectivity, - `--ocm-debug` - Enable OCM API debug logging - **Monitoring & Health Checks:** - - `--health-check-server-bindaddress` - Health check server address (default: "localhost:8083") - - `--enable-health-check-https` - Enable HTTPS for health check server - - `--metrics-server-bindaddress` - Metrics server address (default: "localhost:8080") + - `--health-server-bindaddress` - Health endpoints server address (default: "localhost:8080") + - `--enable-health-https` - Enable HTTPS for health server + - `--metrics-server-bindaddress` - Metrics endpoint server address (default: "localhost:9090") - `--enable-metrics-https` - Enable HTTPS for metrics server - **Performance Tuning:** @@ -686,8 +686,8 @@ The server is configured in cmd/hyperfleet/server/: **Ports**: - `8000` - Main API server -- `8080` - Metrics endpoint -- `8083` - Health check endpoint +- `8080` - Health endpoints (`/healthz`, `/readyz`) +- `9090` - Metrics endpoint (`/metrics`) **Middleware Chain**: 1. Request logging @@ -774,7 +774,7 @@ The API is designed to be stateless and horizontally scalable: **Health Check**: `GET /healthcheck` returns 200 OK when database is accessible -**Metrics**: Prometheus metrics available at `/metrics` +**Metrics**: Prometheus metrics available at `/metrics` (port 9090) ## References diff --git a/README.md b/README.md index f5e27fe..c5533ed 100755 --- a/README.md +++ b/README.md @@ -80,8 +80,9 @@ The service starts on `localhost:8000`: - **REST API**: `http://localhost:8000/api/hyperfleet/v1/` - **OpenAPI spec**: `http://localhost:8000/api/hyperfleet/v1/openapi` - **Swagger UI**: `http://localhost:8000/api/hyperfleet/v1/openapi.html` -- **Health check**: `http://localhost:8083/healthcheck` -- **Metrics**: `http://localhost:8080/metrics` +- **Liveness probe**: `http://localhost:8080/healthz` +- **Readiness probe**: `http://localhost:8080/readyz` +- **Metrics**: `http://localhost:9090/metrics` ```bash # Test the API diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index 7ea0b7c..2f3f215 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -53,17 +53,17 @@ spec: args: - serve - --api-server-bindaddress={{ .Values.server.bindAddress | default ":8000" }} - - --health-check-server-bindaddress={{ .Values.server.healthBindAddress | default ":8083" }} - - --metrics-server-bindaddress={{ .Values.server.metricsBindAddress | default ":8080" }} + - --health-server-bindaddress={{ .Values.server.healthBindAddress | default ":8080" }} + - --metrics-server-bindaddress={{ .Values.server.metricsBindAddress | default ":9090" }} ports: - name: http containerPort: 8000 protocol: TCP - name: health - containerPort: 8083 + containerPort: 8080 protocol: TCP - name: metrics - containerPort: 8080 + containerPort: 9090 protocol: TCP env: {{- if .Values.auth.jwksUrl }} @@ -82,15 +82,15 @@ spec: {{- end }} livenessProbe: httpGet: - path: /healthcheck + path: /healthz port: health - initialDelaySeconds: 30 - periodSeconds: 10 + initialDelaySeconds: 15 + periodSeconds: 20 timeoutSeconds: 5 failureThreshold: 3 readinessProbe: httpGet: - path: /healthcheck + path: /readyz port: health initialDelaySeconds: 5 periodSeconds: 5 diff --git a/charts/templates/service.yaml b/charts/templates/service.yaml index 6eb3628..8517485 100644 --- a/charts/templates/service.yaml +++ b/charts/templates/service.yaml @@ -11,11 +11,11 @@ spec: targetPort: http protocol: TCP name: http - - port: 8083 + - port: 8080 targetPort: health protocol: TCP name: health - - port: 8080 + - port: 9090 targetPort: metrics protocol: TCP name: metrics diff --git a/charts/values.yaml b/charts/values.yaml index d5535f8..7a6fd7a 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -17,8 +17,8 @@ fullnameOverride: "" # Use ":PORT" format to bind to all interfaces (required for Kubernetes) server: bindAddress: ":8000" - healthBindAddress: ":8083" - metricsBindAddress: ":8080" + healthBindAddress: ":8080" + metricsBindAddress: ":9090" serviceAccount: # Specifies whether a service account should be created diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index f19d3ff..1b2f072 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -14,6 +14,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/cmd/hyperfleet-api/server" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db/db_session" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/health" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/telemetry" ) @@ -79,8 +80,17 @@ func runServe(cmd *cobra.Command, args []string) { metricsServer := server.NewMetricsServer() go metricsServer.Start() - healthcheckServer := server.NewHealthCheckServer() - go healthcheckServer.Start() + healthServer := server.NewHealthServer() + go healthServer.Start() + + // Wait for health server to be listening before marking as ready + if notifier, ok := healthServer.(server.ListenNotifier); ok { + <-notifier.NotifyListening() + } + + // Mark application as ready to receive traffic + health.GetReadinessState().SetReady() + logger.Info(ctx, "Application ready to receive traffic") sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) @@ -88,8 +98,12 @@ func runServe(cmd *cobra.Command, args []string) { logger.Info(ctx, "Shutdown signal received, starting graceful shutdown...") - if err := healthcheckServer.Stop(); err != nil { - logger.WithError(ctx, err).Error("Failed to stop healthcheck server") + // Mark application as not ready (returns 503 on /readyz) + health.GetReadinessState().SetShuttingDown() + logger.Info(ctx, "Marked as not ready, draining in-flight requests...") + + if err := healthServer.Stop(); err != nil { + logger.WithError(ctx, err).Error("Failed to stop health server") } if err := apiServer.Stop(); err != nil { logger.WithError(ctx, err).Error("Failed to stop API server") @@ -99,11 +113,16 @@ func runServe(cmd *cobra.Command, args []string) { } if tp != nil { - if err := telemetry.Shutdown(context.Background(), tp); err != nil { + shutdownCtx, cancel := context.WithTimeout(context.Background(), environments.Environment().Config.Health.ShutdownTimeout) + defer cancel() + if err := telemetry.Shutdown(shutdownCtx, tp); err != nil { logger.WithError(ctx, err).Error("Failed to shutdown OpenTelemetry") } } + // Close database connections + environments.Environment().Teardown() + logger.Info(ctx, "Graceful shutdown completed") } diff --git a/cmd/hyperfleet-api/server/health_server.go b/cmd/hyperfleet-api/server/health_server.go new file mode 100644 index 0000000..9e78151 --- /dev/null +++ b/cmd/hyperfleet-api/server/health_server.go @@ -0,0 +1,99 @@ +package server + +import ( + "context" + "fmt" + "net" + "net/http" + "time" + + "github.com/gorilla/mux" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/health" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" +) + +func NewHealthServer() Server { + mainRouter := mux.NewRouter() + mainRouter.NotFoundHandler = http.HandlerFunc(api.SendNotFound) + + // health endpoints (HyperFleet standard) + healthHandler := health.NewHandler(env().Database.SessionFactory) + mainRouter.HandleFunc("/healthz", healthHandler.LivenessHandler).Methods(http.MethodGet) + mainRouter.HandleFunc("/readyz", healthHandler.ReadinessHandler).Methods(http.MethodGet) + + var mainHandler http.Handler = mainRouter + + s := &healthServer{ + shutdownTimeout: env().Config.Health.ShutdownTimeout, + listening: make(chan struct{}), + } + s.httpServer = &http.Server{ + Addr: env().Config.Health.BindAddress, + Handler: mainHandler, + } + return s +} + +type healthServer struct { + httpServer *http.Server + shutdownTimeout time.Duration + listening chan struct{} +} + +var _ Server = &healthServer{} + +func (s *healthServer) Listen() (listener net.Listener, err error) { + return net.Listen("tcp", s.httpServer.Addr) +} + +func (s *healthServer) Serve(listener net.Listener) { + ctx := context.Background() + var err error + + if env().Config.Health.EnableHTTPS { + if env().Config.Server.HTTPSCertFile == "" || env().Config.Server.HTTPSKeyFile == "" { + check( + fmt.Errorf("unspecified required --https-cert-file, --https-key-file"), + "Can't start https server", + ) + } + + logger.With(ctx, logger.FieldBindAddress, env().Config.Health.BindAddress).Info("Serving Health with TLS") + err = s.httpServer.ServeTLS(listener, env().Config.Server.HTTPSCertFile, env().Config.Server.HTTPSKeyFile) + } else { + logger.With(ctx, logger.FieldBindAddress, env().Config.Health.BindAddress).Info("Serving Health without TLS") + err = s.httpServer.Serve(listener) + } + if err != nil && err != http.ErrServerClosed { + check(err, "Health server terminated with errors") + } else { + logger.Info(ctx, "Health server terminated") + } +} + +// Start is a convenience wrapper that calls Listen() and Serve() +func (s *healthServer) Start() { + listener, err := s.Listen() + if err != nil { + check(err, "Failed to create health server listener") + return + } + + // Signal that we're listening + close(s.listening) + + s.Serve(listener) +} + +// NotifyListening returns a channel that is closed when the server is listening +func (s *healthServer) NotifyListening() <-chan struct{} { + return s.listening +} + +func (s healthServer) Stop() error { + ctx, cancel := context.WithTimeout(context.Background(), s.shutdownTimeout) + defer cancel() + return s.httpServer.Shutdown(ctx) +} diff --git a/cmd/hyperfleet-api/server/healthcheck_server.go b/cmd/hyperfleet-api/server/healthcheck_server.go deleted file mode 100755 index 63f8631..0000000 --- a/cmd/hyperfleet-api/server/healthcheck_server.go +++ /dev/null @@ -1,87 +0,0 @@ -package server - -import ( - "context" - "fmt" - "net" - "net/http" - - health "github.com/docker/go-healthcheck" - "github.com/gorilla/mux" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" -) - -var ( - updater = health.NewStatusUpdater() -) - -var _ Server = &healthCheckServer{} - -type healthCheckServer struct { - httpServer *http.Server -} - -func NewHealthCheckServer() *healthCheckServer { - router := mux.NewRouter() - health.DefaultRegistry = health.NewRegistry() - health.Register("maintenance_status", updater) - router.HandleFunc("/healthcheck", health.StatusHandler).Methods(http.MethodGet) - router.HandleFunc("/healthcheck/down", downHandler).Methods(http.MethodPost) - router.HandleFunc("/healthcheck/up", upHandler).Methods(http.MethodPost) - - srv := &http.Server{ - Handler: router, - Addr: env().Config.HealthCheck.BindAddress, - } - - return &healthCheckServer{ - httpServer: srv, - } -} - -func (s healthCheckServer) Start() { - ctx := context.Background() - var err error - if env().Config.HealthCheck.EnableHTTPS { - if env().Config.Server.HTTPSCertFile == "" || env().Config.Server.HTTPSKeyFile == "" { - check( - fmt.Errorf("unspecified required --https-cert-file, --https-key-file"), - "Can't start https server", - ) - } - - // Serve with TLS - logger.With(ctx, logger.FieldBindAddress, env().Config.HealthCheck.BindAddress).Info("Serving HealthCheck with TLS") - err = s.httpServer.ListenAndServeTLS(env().Config.Server.HTTPSCertFile, env().Config.Server.HTTPSKeyFile) - } else { - logger.With(ctx, logger.FieldBindAddress, env().Config.HealthCheck.BindAddress).Info("Serving HealthCheck without TLS") - err = s.httpServer.ListenAndServe() - } - if err != nil && err != http.ErrServerClosed { - check(err, "HealthCheck server terminated with errors") - } else { - logger.Info(ctx, "HealthCheck server terminated") - } -} - -func (s healthCheckServer) Stop() error { - return s.httpServer.Shutdown(context.Background()) -} - -// Listen Unimplemented -func (s healthCheckServer) Listen() (listener net.Listener, err error) { - return nil, nil -} - -// Serve Unimplemented -func (s healthCheckServer) Serve(listener net.Listener) { -} - -func upHandler(w http.ResponseWriter, r *http.Request) { - updater.Update(nil) -} - -func downHandler(w http.ResponseWriter, r *http.Request) { - updater.Update(fmt.Errorf("maintenance mode")) -} diff --git a/cmd/hyperfleet-api/server/metrics_server.go b/cmd/hyperfleet-api/server/metrics_server.go index 198ad69..b985ef7 100755 --- a/cmd/hyperfleet-api/server/metrics_server.go +++ b/cmd/hyperfleet-api/server/metrics_server.go @@ -17,7 +17,7 @@ func NewMetricsServer() Server { mainRouter := mux.NewRouter() mainRouter.NotFoundHandler = http.HandlerFunc(api.SendNotFound) - // metrics endpoint + // metrics endpoint only (health endpoints moved to health_server.go on port 8080) prometheusMetricsHandler := handlers.NewPrometheusMetricsHandler() mainRouter.Handle("/metrics", prometheusMetricsHandler.Handler()) diff --git a/cmd/hyperfleet-api/server/server.go b/cmd/hyperfleet-api/server/server.go index 02a5d14..e3928c7 100755 --- a/cmd/hyperfleet-api/server/server.go +++ b/cmd/hyperfleet-api/server/server.go @@ -17,6 +17,12 @@ type Server interface { Serve(net.Listener) } +// ListenNotifier is an optional interface that servers can implement +// to signal when they are ready to accept connections +type ListenNotifier interface { + NotifyListening() <-chan struct{} +} + func removeTrailingSlash(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { r.URL.Path = strings.TrimSuffix(r.URL.Path, "/") diff --git a/docs/deployment.md b/docs/deployment.md index b374b28..414e58b 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -78,8 +78,8 @@ export OPENAPI_SCHEMA_PATH=/path/to/custom-schema.yaml **Server:** - `PORT` - API server port (default: `8000`) -- `METRICS_PORT` - Metrics endpoint port (default: `8080`) -- `HEALTH_PORT` - Health check port (default: `8083`) +- `HEALTH_PORT` - Health endpoints port (default: `8080`) +- `METRICS_PORT` - Metrics endpoint port (default: `9090`) **Logging:** - `LOG_LEVEL` - Logging level: `debug`, `info`, `warn`, `error` (default: `info`) @@ -272,7 +272,10 @@ kubectl get configmaps --namespace hyperfleet-system ## Health Checks -The deployment includes liveness and readiness probes at `GET /healthcheck` (port 8083). +The deployment includes: +- Liveness probe: `GET /healthz` (port 8080) - Returns 200 if the process is alive +- Readiness probe: `GET /readyz` (port 8080) - Returns 200 when ready to receive traffic, 503 during startup/shutdown +- Metrics: `GET /metrics` (port 9090) - Prometheus metrics endpoint ## Scaling @@ -291,7 +294,7 @@ Enable autoscaling via Helm values (`autoscaling.enabled=true`). ## Monitoring -Prometheus metrics available at `http://:8080/metrics`. +Prometheus metrics available at `http://:9090/metrics`. For Prometheus Operator, enable ServiceMonitor via Helm values (`serviceMonitor.enabled=true`). diff --git a/docs/development.md b/docs/development.md index e0107eb..664b0dd 100644 --- a/docs/development.md +++ b/docs/development.md @@ -95,8 +95,9 @@ The service starts on `localhost:8000`: - REST API: `http://localhost:8000/api/hyperfleet/v1/` - OpenAPI spec: `http://localhost:8000/api/hyperfleet/v1/openapi` - Swagger UI: `http://localhost:8000/api/hyperfleet/v1/openapi.html` -- Health check: `http://localhost:8083/healthcheck` -- Metrics: `http://localhost:8080/metrics` +- Liveness probe: `http://localhost:8080/healthz` +- Readiness probe: `http://localhost:8080/readyz` +- Metrics: `http://localhost:9090/metrics` ### Testing the API diff --git a/docs/hyperfleet-api.http b/docs/hyperfleet-api.http index 969cc9f..61dbd7f 100644 --- a/docs/hyperfleet-api.http +++ b/docs/hyperfleet-api.http @@ -4,8 +4,8 @@ @host = localhost @port = 8000 -@metrics_port = 8080 -@health_port = 8083 +@health_port = 8080 +@metrics_port = 9090 @baseUrl = http://{{host}}:{{port}} @authToken = @@ -193,8 +193,12 @@ Content-Type: application/json } ### -# @name healthCheck -GET http://{{host}}:{{health_port}}/healthcheck +# @name livenessProbe +GET http://{{host}}:{{health_port}}/healthz + +### +# @name readinessProbe +GET http://{{host}}:{{health_port}}/readyz ### # @name metrics diff --git a/go.mod b/go.mod index cf19c36..3702583 100755 --- a/go.mod +++ b/go.mod @@ -125,6 +125,7 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 // indirect go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/crypto v0.45.0 // indirect golang.org/x/net v0.47.0 // indirect golang.org/x/sync v0.18.0 // indirect @@ -136,4 +137,5 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect gorm.io/driver/mysql v1.5.6 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 3e25a85..abbab1c 100755 --- a/go.sum +++ b/go.sum @@ -679,6 +679,8 @@ go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9E go.uber.org/zap v1.9.1/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -1087,3 +1089,5 @@ honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/pkg/config/config.go b/pkg/config/config.go index d5851d9..c0092ae 100755 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -13,22 +13,22 @@ import ( ) type ApplicationConfig struct { - Server *ServerConfig `json:"server"` - Metrics *MetricsConfig `json:"metrics"` - HealthCheck *HealthCheckConfig `json:"health_check"` - Database *DatabaseConfig `json:"database"` - OCM *OCMConfig `json:"ocm"` - Logging *LoggingConfig `json:"logging"` + Server *ServerConfig `json:"server"` + Metrics *MetricsConfig `json:"metrics"` + Health *HealthConfig `json:"health"` + Database *DatabaseConfig `json:"database"` + OCM *OCMConfig `json:"ocm"` + Logging *LoggingConfig `json:"logging"` } func NewApplicationConfig() *ApplicationConfig { return &ApplicationConfig{ - Server: NewServerConfig(), - Metrics: NewMetricsConfig(), - HealthCheck: NewHealthCheckConfig(), - Database: NewDatabaseConfig(), - OCM: NewOCMConfig(), - Logging: NewLoggingConfig(), + Server: NewServerConfig(), + Metrics: NewMetricsConfig(), + Health: NewHealthConfig(), + Database: NewDatabaseConfig(), + OCM: NewOCMConfig(), + Logging: NewLoggingConfig(), } } @@ -36,7 +36,7 @@ func (c *ApplicationConfig) AddFlags(flagset *pflag.FlagSet) { flagset.AddGoFlagSet(flag.CommandLine) c.Server.AddFlags(flagset) c.Metrics.AddFlags(flagset) - c.HealthCheck.AddFlags(flagset) + c.Health.AddFlags(flagset) c.Database.AddFlags(flagset) c.OCM.AddFlags(flagset) c.Logging.AddFlags(flagset) @@ -51,7 +51,7 @@ func (c *ApplicationConfig) ReadFiles() []string { {c.Database.ReadFiles, "Database"}, {c.OCM.ReadFiles, "OCM"}, {c.Metrics.ReadFiles, "Metrics"}, - {c.HealthCheck.ReadFiles, "HealthCheck"}, + {c.Health.ReadFiles, "Health"}, {c.Logging.ReadFiles, "Logging"}, } var messages []string diff --git a/pkg/config/health.go b/pkg/config/health.go new file mode 100644 index 0000000..601ac52 --- /dev/null +++ b/pkg/config/health.go @@ -0,0 +1,31 @@ +package config + +import ( + "time" + + "github.com/spf13/pflag" +) + +type HealthConfig struct { + BindAddress string `json:"bind_address"` + EnableHTTPS bool `json:"enable_https"` + ShutdownTimeout time.Duration `json:"shutdown_timeout"` +} + +func NewHealthConfig() *HealthConfig { + return &HealthConfig{ + BindAddress: "localhost:8080", + EnableHTTPS: false, + ShutdownTimeout: 20 * time.Second, + } +} + +func (s *HealthConfig) AddFlags(fs *pflag.FlagSet) { + fs.StringVar(&s.BindAddress, "health-server-bindaddress", s.BindAddress, "Health server bind address") + fs.BoolVar(&s.EnableHTTPS, "enable-health-https", s.EnableHTTPS, "Enable HTTPS for health server") + fs.DurationVar(&s.ShutdownTimeout, "health-shutdown-timeout", s.ShutdownTimeout, "Health server shutdown timeout") +} + +func (s *HealthConfig) ReadFiles() error { + return nil +} diff --git a/pkg/config/health_check.go b/pkg/config/health_check.go deleted file mode 100755 index e3aaa79..0000000 --- a/pkg/config/health_check.go +++ /dev/null @@ -1,26 +0,0 @@ -package config - -import ( - "github.com/spf13/pflag" -) - -type HealthCheckConfig struct { - BindAddress string `json:"bind_address"` - EnableHTTPS bool `json:"enable_https"` -} - -func NewHealthCheckConfig() *HealthCheckConfig { - return &HealthCheckConfig{ - BindAddress: "localhost:8083", - EnableHTTPS: false, - } -} - -func (c *HealthCheckConfig) AddFlags(fs *pflag.FlagSet) { - fs.StringVar(&c.BindAddress, "health-check-server-bindaddress", c.BindAddress, "Health check server bind adddress") - fs.BoolVar(&c.EnableHTTPS, "enable-health-check-https", c.EnableHTTPS, "Enable HTTPS for health check server") -} - -func (c *HealthCheckConfig) ReadFiles() error { - return nil -} diff --git a/pkg/config/metrics.go b/pkg/config/metrics.go index b01c3c3..0008f75 100755 --- a/pkg/config/metrics.go +++ b/pkg/config/metrics.go @@ -14,7 +14,7 @@ type MetricsConfig struct { func NewMetricsConfig() *MetricsConfig { return &MetricsConfig{ - BindAddress: "localhost:8080", + BindAddress: "localhost:9090", EnableHTTPS: false, LabelMetricsInclusionDuration: 7 * 24 * time.Hour, } diff --git a/pkg/health/handler.go b/pkg/health/handler.go new file mode 100644 index 0000000..06ceb49 --- /dev/null +++ b/pkg/health/handler.go @@ -0,0 +1,80 @@ +package health + +import ( + "encoding/json" + "net/http" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/db" +) + +// Handler provides HTTP handlers for health checks +type Handler struct { + sessionFactory db.SessionFactory +} + +// NewHandler creates a new health handler +func NewHandler(sessionFactory db.SessionFactory) *Handler { + return &Handler{ + sessionFactory: sessionFactory, + } +} + +// LivenessHandler handles the /healthz endpoint +// Returns 200 OK if the application is alive +func (h *Handler) LivenessHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) +} + +// ReadinessHandler handles the /readyz endpoint +// Returns 200 OK if the application is ready to receive traffic +// Returns 503 Service Unavailable if the application is shutting down or not ready +func (h *Handler) ReadinessHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + + state := GetReadinessState() + + if state.IsShuttingDown() { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{ + "status": "shutting_down", + "reason": "Application is shutting down", + }) + return + } + + if !state.IsReady() { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{ + "status": "not_ready", + "reason": "Application is not ready", + }) + return + } + + // Check database connectivity if session factory is available + if h.sessionFactory != nil { + sqlDB := h.sessionFactory.DirectDB() + if sqlDB == nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{ + "status": "not_ready", + "reason": "Database connection not available", + }) + return + } + + if err := sqlDB.PingContext(r.Context()); err != nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{ + "status": "not_ready", + "reason": "Database ping failed", + }) + return + } + } + + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) +} diff --git a/pkg/health/readiness.go b/pkg/health/readiness.go new file mode 100644 index 0000000..299c65a --- /dev/null +++ b/pkg/health/readiness.go @@ -0,0 +1,59 @@ +package health + +import ( + "sync" +) + +// ReadinessState represents the current readiness state of the application +type ReadinessState struct { + mu sync.RWMutex + ready bool + shuttingDown bool +} + +var ( + globalState *ReadinessState + globalStateOnce sync.Once +) + +// GetReadinessState returns the singleton ReadinessState instance +func GetReadinessState() *ReadinessState { + globalStateOnce.Do(func() { + globalState = &ReadinessState{ + ready: false, + shuttingDown: false, + } + }) + return globalState +} + +// SetReady marks the application as ready to receive traffic +func (r *ReadinessState) SetReady() { + r.mu.Lock() + defer r.mu.Unlock() + r.ready = true + r.shuttingDown = false +} + +// SetShuttingDown marks the application as shutting down +// This will cause the readiness check to fail, signaling to Kubernetes +// to stop routing traffic to this instance +func (r *ReadinessState) SetShuttingDown() { + r.mu.Lock() + defer r.mu.Unlock() + r.shuttingDown = true +} + +// IsReady returns true if the application is ready and not shutting down +func (r *ReadinessState) IsReady() bool { + r.mu.RLock() + defer r.mu.RUnlock() + return r.ready && !r.shuttingDown +} + +// IsShuttingDown returns true if the application is shutting down +func (r *ReadinessState) IsShuttingDown() bool { + r.mu.RLock() + defer r.mu.RUnlock() + return r.shuttingDown +} diff --git a/test/helper.go b/test/helper.go index 505bb96..19ba246 100755 --- a/test/helper.go +++ b/test/helper.go @@ -60,7 +60,7 @@ type Helper struct { AppConfig *config.ApplicationConfig APIServer server.Server MetricsServer server.Server - HealthCheckServer server.Server + HealthServer server.Server TimeFunc TimeFunc JWTPrivateKey *rsa.PrivateKey JWTCA *rsa.PublicKey @@ -116,7 +116,7 @@ func NewHelper(t *testing.T) *Helper { } helper.startAPIServer() helper.startMetricsServer() - helper.startHealthCheckServer() + helper.startHealthServer() }) helper.T = t return helper @@ -181,12 +181,12 @@ func (helper *Helper) stopMetricsServer() error { return nil } -func (helper *Helper) startHealthCheckServer() { +func (helper *Helper) startHealthServer() { ctx := context.Background() - helper.HealthCheckServer = server.NewHealthCheckServer() + helper.HealthServer = server.NewHealthServer() go func() { logger.Debug(ctx, "Test health check server started") - helper.HealthCheckServer.Start() + helper.HealthServer.Start() logger.Debug(ctx, "Test health check server stopped") }() } @@ -258,8 +258,8 @@ func (helper *Helper) MetricsURL(path string) string { return fmt.Sprintf("http://%s%s", helper.AppConfig.Metrics.BindAddress, path) } -func (helper *Helper) HealthCheckURL(path string) string { - return fmt.Sprintf("http://%s%s", helper.AppConfig.HealthCheck.BindAddress, path) +func (helper *Helper) HealthURL(path string) string { + return fmt.Sprintf("http://%s%s", helper.AppConfig.Health.BindAddress, path) } func (helper *Helper) NewApiClient() *openapi.ClientWithResponses {