diff --git a/.gitignore b/.gitignore index ff7161e57..ed34f7842 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ node_modules # draw.io temp files .$*.bkp -.$*.dtmp \ No newline at end of file +.$*.dtmp +venv \ No newline at end of file diff --git a/metrics-collector/.ceignore b/metrics-collector/.ceignore new file mode 100644 index 000000000..a5ed506b1 --- /dev/null +++ b/metrics-collector/.ceignore @@ -0,0 +1,2 @@ +images/ +setup/ \ No newline at end of file diff --git a/metrics-collector/Dockerfile b/metrics-collector/Dockerfile index 111dd9481..3796e7b37 100644 --- a/metrics-collector/Dockerfile +++ b/metrics-collector/Dockerfile @@ -1,11 +1,71 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +# Stage 1: Build Go binary +FROM quay.io/projectquay/golang:1.25 AS go-builder WORKDIR /go/src/app -COPY . . - +COPY go.mod go.sum ./ RUN go mod download -RUN CGO_ENABLED=0 go build -o /go/bin/app main.go +COPY main.go ./ +RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o app main.go + +# Stage 2: Download and extract Prometheus +FROM busybox:1.36-glibc AS prometheus-downloader +ARG PROMETHEUS_VERSION=3.10.0 +ARG TARGETARCH=amd64 + +WORKDIR /tmp +RUN wget https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \ + tar xzf prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \ + mv prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}/prometheus /prometheus + +# Stage 3: Get CA certificates and curl +FROM alpine:latest AS certs +RUN apk --no-cache add ca-certificates curl + +# Stage 4: Runtime image +FROM busybox:1.36-glibc + +# Copy curl binary and all its dependencies from alpine +# Use following command locally to determine the correct source paths: +# docker run --rm alpine:latest sh -c "apk add --no-cache curl > /dev/null 2>&1 && ldd /usr/bin/curl" +COPY --from=certs /usr/bin/curl /usr/bin/curl +COPY --from=certs /lib/ld-musl-x86_64.so.1 
/lib/ld-musl-x86_64.so.1 +COPY --from=certs /usr/lib/libcurl.so.4 /usr/lib/libcurl.so.4 +COPY --from=certs /usr/lib/libz.so.1 /usr/lib/libz.so.1 +COPY --from=certs /usr/lib/libnghttp2.so.14 /usr/lib/libnghttp2.so.14 +COPY --from=certs /usr/lib/libnghttp3.so.9 /usr/lib/libnghttp3.so.9 +COPY --from=certs /usr/lib/libbrotlidec.so.1 /usr/lib/libbrotlidec.so.1 +COPY --from=certs /usr/lib/libbrotlicommon.so.1 /usr/lib/libbrotlicommon.so.1 +COPY --from=certs /usr/lib/libssl.so.3 /usr/lib/libssl.so.3 +COPY --from=certs /usr/lib/libcrypto.so.3 /usr/lib/libcrypto.so.3 +COPY --from=certs /usr/lib/libcares.so.2 /usr/lib/libcares.so.2 +COPY --from=certs /usr/lib/libidn2.so.0 /usr/lib/libidn2.so.0 +COPY --from=certs /usr/lib/libpsl.so.5 /usr/lib/libpsl.so.5 +COPY --from=certs /usr/lib/libzstd.so.1 /usr/lib/libzstd.so.1 +COPY --from=certs /usr/lib/libunistring.so.5 /usr/lib/libunistring.so.5 + +# Copy CA certificates for TLS verification +COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt + +# Copy Go binary +COPY --from=go-builder /go/src/app/app /app + +# Copy Prometheus binary +COPY --from=prometheus-downloader /prometheus /bin/prometheus + +# Copy configuration and scripts +COPY prometheus.yml.template /etc/prometheus/prometheus.yml.template +COPY start.sh /start.sh +RUN chmod +x /start.sh + +# Create necessary directories with proper permissions +RUN mkdir -p /tmp/agent-data && \ + mkdir -p /etc/secrets && \ + chmod 777 /tmp/agent-data && \ + chmod 777 /etc/secrets + +# Set SSL certificate path environment variable +ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt + +# Use non-root user +USER 1000:1000 -# Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 -COPY --from=build-env /go/bin/app / -CMD ["/app"] +ENTRYPOINT ["/start.sh"] diff --git a/metrics-collector/README.md b/metrics-collector/README.md index 9abb6844a..7cbe4de6e 100644 --- a/metrics-collector/README.md +++ b/metrics-collector/README.md @@ 
-1,59 +1,207 @@ # IBM Cloud Code Engine - Metrics Collector -Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds +Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds. Those metrics can either be rendered + +in **IBM Cloud Monitoring** (see [instructions](#Send-metrics-to-IBM-Cloud-Monitoring)) + +![](./images/monitoring-dashboard-ce-component-resources.png) + +or in **IBM Cloud Logs** (see [instructions](#ibm-cloud-logs-setup)) ![Dashboard overview](./images/icl-dashboard-overview.png) -## Installation -### Capture metrics every n seconds +## Send metrics to IBM Cloud Monitoring + +### How It Works + +![](./images/metrics-collector.overview.png) + +1. The metrics collector exposes Prometheus metrics on `localhost:9100/metrics` +2. The embedded Prometheus agent scrapes these metrics every 30 seconds +3. The agent also discovers and scrapes pods with the `codeengine.cloud.ibm.com/userMetricsScrape: 'true'` annotation +4. All metrics are forwarded to IBM Cloud Monitoring via remote write +5. 
If either the collector or Prometheus agent crashes, the container exits with a non-zero code to trigger a restart + +### Setup Instructions + +The metrics collector supports two authentication methods for accessing IBM Cloud Monitoring: + +- **Option 1: Trusted Profile Authentication (Recommended)** - Automatically obtains monitoring API keys using IBM Cloud Trusted Profiles +- **Option 2: Explicit API Key Secret** - Manually create and mount a monitoring API key as a secret + +#### Prerequisites (Both Options) + +**Step 1:** You need an IBM Cloud Monitoring instance +```bash +REGION= +MONITORING_INSTANCE_NAME="" +MONITORING_INSTANCE_GUID=$(ibmcloud resource service-instance "$MONITORING_INSTANCE_NAME" -o JSON|jq -r '.[0].guid') +echo "MONITORING_INSTANCE_GUID: '$MONITORING_INSTANCE_GUID'" +``` + +**Step 2:** The collector must run in a Code Engine project +```bash +# Create new Code Engine project +ibmcloud ce project create --name -* Create Code Engine job template +# Select an existing Code Engine project +ibmcloud ce project select --name ``` -$ ibmcloud ce job create \ + +#### Option 1: Trusted Profile Authentication (Recommended) + +This method automatically obtains the monitoring API key using IBM Cloud Trusted Profiles. No manual secret creation or rotation is required. 
+ +**Step 3a:** Create an IBM Cloud Trusted Profile + +Create a Trusted Profile that allows Code Engine compute resources to authenticate: + +```bash +# Create the Trusted Profile +ibmcloud iam trusted-profile-create metrics-collector-profile \ + --description "Trusted profile for Code Engine metrics collector" + +# Get the Trusted Profile ID +TRUSTED_PROFILE_ID=$(ibmcloud iam trusted-profiles --output json | jq -r '.[] | select(.name=="metrics-collector-profile") | .id') + +# Add Code Engine compute resources as trusted entities +ibmcloud iam trusted-profile-rule-create metrics-collector-profile \ + --name code-engine-rule \ + --type Profile-CR \ + --cr-type CE \ + --conditions claim:project_name,operator:EQUALS,value:"$(ibmcloud ce proj current --output json|jq -r '.name')" \ + --conditions claim:component_type,operator:EQUALS,value:"job" \ + --conditions claim:component_name,operator:EQUALS,value:"metrics-collector" + +# Grant the profile access to the Monitoring instance +ibmcloud iam trusted-profile-policy-create metrics-collector-profile \ + --roles Viewer,Writer \ + --service-name sysdig-monitor \ + --service-instance $MONITORING_INSTANCE_GUID +``` + +**Note:** Replace `` with your Code Engine project's Kubernetes namespace (typically in the format ``). + +**Step 4a:** Create your metrics-collector job with Trusted Profile configuration +```bash +ibmcloud ce job create \ --name metrics-collector \ - --src . \ + --src "." 
\ --mode daemon \ --cpu 0.25 \ --memory 0.5G \ - --wait + --service-account reader \ + --build-size xlarge \ + --trusted-profiles-enabled \ + --env INTERVAL=30 \ + --env METRICS_ENABLED=true \ + --env METRICS_REMOTE_WRITE_FQDN=ingest.prws.private.${REGION}.monitoring.cloud.ibm.com \ + --env CE_PROJECT_NAME="$(ibmcloud ce proj current --output json|jq -r '.name')" \ + --env MONITORING_INSTANCE_GUID="$MONITORING_INSTANCE_GUID" \ + --env MONITORING_REGION="$REGION" \ + --env TRUSTED_PROFILE_NAME="metrics-collector-profile" ``` -* Submit a daemon job that collects metrics in an endless loop. The daemon job queries the Metrics API every 30 seconds -``` -$ ibmcloud ce jobrun submit \ - --job metrics-collector \ - --env INTERVAL=30 +**Step 5a:** Submit a daemon job run +```bash +ibmcloud ce jobrun submit \ + --job metrics-collector ``` +#### Option 2: Explicit API Key Secret -### Capture metrics every n minutes +This method requires manually creating and mounting a monitoring API key. Use this option for local development, testing, or when Trusted Profiles are not available. 
-* Create Code Engine job template +**Step 3b:** Create a secret with your IBM Cloud Monitoring API token +```bash +# Obtain the Monitoring API token of the IBM Cloud Monitoring instance +# using the IAM access token of the current IBM CLI Session +MONITORING_INSTANCE_MONITORING_API_KEY=$(curl --silent -X GET https://$REGION.monitoring.cloud.ibm.com/api/token -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" -H "content-type: application/json"|jq -r '.token.key') + +# Create a Code Engine secret that stores the Monitoring API Key +ibmcloud ce secret create \ + --name monitoring-apikey \ + --from-literal monitoring-apikey=$MONITORING_INSTANCE_MONITORING_API_KEY ``` -$ ibmcloud ce job create \ + +**Step 4b:** Create your metrics-collector job with the required configuration +```bash +ibmcloud ce job create \ --name metrics-collector \ - --src . \ - --mode task \ + --src "." \ + --mode daemon \ --cpu 0.25 \ --memory 0.5G \ - --wait + --service-account reader \ + --build-size xlarge \ + --env INTERVAL=30 \ + --env METRICS_ENABLED=true \ + --env METRICS_REMOTE_WRITE_FQDN=ingest.prws.private.${REGION}.monitoring.cloud.ibm.com \ + --env CE_PROJECT_NAME="$(ibmcloud ce proj current --output json|jq -r '.name')" \ + --mount-secret /etc/secrets=monitoring-apikey ``` -* Submit a Code Engine cron subscription that triggers the metrics collector every minute to query the Metrics API +**Step 5b:** Submit a daemon job run +```bash +ibmcloud ce jobrun submit \ + --job metrics-collector ``` -$ ibmcloud ce subscription cron create \ - --name collect-metrics-every-minute \ - --destination-type job \ - --destination metrics-collector \ - --schedule '*/1 * * * *' + +#### Import Dashboard + +After setting up the metrics collector with either authentication method, import the dashboard: + +```bash +# Load the most recent dashboard configuration +CE_MONITORING_DASHBOARD=$(curl -sL 
https://raw.githubusercontent.com/IBM/CodeEngine/main/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json) + +# Import the dashboard +curl -X POST https://$REGION.monitoring.cloud.ibm.com/api/v3/dashboards \ + -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" \ + -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" \ + -H "Content-Type: application/json" \ + -d "{\"dashboard\": $CE_MONITORING_DASHBOARD}" + ``` -## Configuration +**Note:** A more elaborate approach to manage custom Cloud Monitoring dashboards can be found [here](setup/ibm-cloud-monitoring/README.md) + + +#### Exposed Metrics + +The following Prometheus metrics are exposed as gauges: + +Container Metrics: +- **`ibm_codeengine_instance_cpu_usage_millicores`**: Current CPU usage in millicores +- **`ibm_codeengine_instance_cpu_limit_millicores`**: Configured CPU limit in millicores +- **`ibm_codeengine_instance_memory_usage_bytes`**: Current memory usage in bytes +- **`ibm_codeengine_instance_memory_limit_bytes`**: Configured memory limit in bytes +- **`ibm_codeengine_instance_ephemeral_storage_usage_bytes`**: Current ephemeral storage usage in bytes (if `COLLECT_DISKUSAGE=true`) -Per default the metrics collector collects memory and CPU statistics, like `usage`, `current` and `configured`. +The following 3 metrics are used to monitor the collector itself: +- **`ibm_codeengine_collector_collection_duration_seconds`**: Time taken to collect metrics in seconds (if `METRICS_INTERNAL_STATS=true`) +- **`ibm_codeengine_collector_last_collection_timestamp_seconds`**: Unix timestamp of last successful collection (if `METRICS_INTERNAL_STATS=true`) +- **`ibm_codeengine_collector_collection_errors_total`**: Total number of collection errors (counter) (if `METRICS_INTERNAL_STATS=true`) -One can use the environment variable `COLLECT_DISKUSAGE=true` to also collect the amount of disk space that is used. 
Please note, the metrics collector can only calculate the overall file size stored in the pods filesystem which includes files that are part of the container image, the epheremal storage as well as mounted COS buckets. Hence, this metric cannot be used to calculate the ephemeral storage usage. +#### Metric Labels + +All container metrics include the following labels: +- `ibm_codeengine_instance_name`: Name of the pod instance +- `ibm_codeengine_component_type`: Type of component (`app`, `job`, or `build`) +- `ibm_codeengine_component_name`: Name of the Code Engine component + +#### Example Metrics Output + +```prometheus +# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores +# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge +ibm_codeengine_instance_cpu_usage_millicores{ibm_codeengine_instance_name="myapp-00001-deployment-abc123",ibm_codeengine_component_type="app",ibm_codeengine_component_name="myapp"} 250 + +# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes +# TYPE ibm_codeengine_instance_memory_usage_bytes gauge +ibm_codeengine_instance_memory_usage_bytes{ibm_codeengine_instance_name="myapp-00001-deployment-abc123",ibm_codeengine_component_type="app",ibm_codeengine_component_name="myapp"} 134217728 +``` ## IBM Cloud Logs setup @@ -71,7 +219,7 @@ Follow the steps below to create a custom dashboard in your IBM Cloud Logs insta ![New dashboard](./images/icl-dashboard-new.png) -* In the "Import" modal, select the file [./setup/dashboard-code_engine_resource_consumption_metrics.json](./setup/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" +* In the "Import" modal, select the file [./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json](./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" ![Import modal](./images/icl-dashboard-import.png) @@ -117,8 
+265,6 @@ app:"codeengine" AND message.metric:"instance-resources" * In the top-right corner, select `1-line` as view mode -![View](./images/icl-logs-view-mode.png) - * In the graph title it says "**Count** all grouped by **Severity**". Click on `Severity` and select `message.component_name` instead. Furthermore, select `Max` as aggregation metric and choose `message.memory.usage` as aggregation field ![Graph](./images/icl-logs-view-graph.png) @@ -132,54 +278,38 @@ app:"codeengine" AND message.metric:"instance-resources" ![Logs overview](./images/icl-logs-view-overview.png) -## IBM Log Analysis setup (deprecated) - -### Log lines - -Along with a human readable message, like `Captured metrics of app instance 'load-generator-00001-deployment-677d5b7754-ktcf6': 3m vCPU, 109 MB memory, 50 MB ephemeral storage`, each log line passes specific resource utilization details in a structured way allowing to apply advanced filters on them. - -E.g. -- `cpu.usage:>80`: Filter for all log lines that noticed a CPU utilization of 80% or higher -- `memory.current:>1000`: Filter for all log lines that noticed an instance that used 1GB or higher of memory -- `component_type:app`: Filter only for app instances. Possible values are `app`, `job`, and `build` -- `component_name:`: Filter for all instances of a specific app, job, or build -- `name:`: Filter for a specific instance - -![IBM Cloud Logs](./images/ibm-cloud-logs--loglines.png) - -### Log graphs - -Best is to create IBM Cloud Logs Board, in order to visualize the CPU and Memory usage per Code Engine component. +### Troubleshooting & Configuration -1. In your log instance navigate to Boards -1. Give it a proper name, enter `metric:instance-resources` as query and submit by clicking `Add Graph` -![New Board](./images/new-board.png) -1. Now the graph shows the overall amount of logs captured for the specified query per time interval -![Count of metrics log lines ](./images/count-of-metrics-lines.png) -1. 
Click on the filter icon above the graph and put in `metric:instance-resources AND component_name:` -1. Switch the metric of the Graph to `Maximums` -1. Below the graph Add a new plot`cpu.usage` as field and choose `ANY` as field values -![Configure Graph plots](./images/configure-plots.png) -1. Add another plot for the field `memory.usage` and values `ANY` -1. Finally delete the plot `metrics:instance-resources` and adjust the plot colors to your likings -![Resource Usage graph](./images/resource-usage-graph.png) -1. The usage graph above renders the utilization in % of the CPU and Memory +#### Common Issues -#### Add CPU utilization -1. Duplicate the graph, change its name to CPU and replace its plots with `cpu.configured` and `cpu.current`. -- The resulting graph will render the actual CPU usage compared to the configured limit. The the unit is milli vCPUs (1000 -> 1 vCPU). -![](./images/cpu-utilization.png) +**Trusted Profile Authentication Failures:** +- **Missing environment variables**: Ensure `MONITORING_INSTANCE_GUID`, `MONITORING_REGION`, and `TRUSTED_PROFILE_NAME` are all set +- **Container resource token not found**: Verify the job is running in Code Engine with proper service account permissions +- **IAM token request failed**: Check that the Trusted Profile exists and has the correct trust relationship configured +- **Monitoring API key retrieval failed**: Verify the Trusted Profile has appropriate permissions (Viewer, Writer) for the Monitoring instance +- **Invalid region**: Ensure `MONITORING_REGION` matches your Monitoring instance region (e.g., `us-south`, `eu-de`, `eu-gb`) +**Explicit Secret Authentication Failures:** +- **Missing `/etc/secrets/monitoring-apikey` file**: Ensure the secret is created and properly mounted +- **Invalid API key**: Regenerate the monitoring API key and update the secret -#### Add memory utilization -1. Duplicate the graph, change its name to Memory and replace its plots with `memory.configured` and `memory.current`. 
-1. The resulting graph will render the actual memory usage compared to the configured limit. The the unit is MB (1000 -> 1 GB). -![](./images/memory-utilization.png) +**General Issues:** +- **Missing or wrong `METRICS_REMOTE_WRITE_FQDN`**: Verify the endpoint matches your region's ingestion endpoint +- **Prometheus agent fails to start**: Check the logs for configuration errors or network connectivity issues +#### Environment Variables +**Core Configuration:** +- **`INTERVAL`** (default: `30`): Collection interval in seconds (minimum 30 seconds). Controls how frequently metrics are collected from the Kubernetes API endpoint in daemon mode. +- **`COLLECT_DISKUSAGE`** (default: `false`): Set to `true` to collect disk space usage. Note: The metrics collector calculates the overall file size stored in the pod's filesystem, which includes files from the container image, ephemeral storage, and mounted COS buckets. This metric cannot be used to calculate ephemeral storage usage alone. +- **`METRICS_ENABLED`** (default: `false`): Set to `true` to enable the HTTP metrics server. When disabled, the collector still runs and logs metrics to stdout but does not expose the HTTP endpoint. +- **`METRICS_REMOTE_WRITE_FQDN`**: IBM Cloud Monitoring ingestion endpoint FQDN (required when `METRICS_ENABLED=true`) +- **`METRICS_PORT`** (default: `9100`): HTTP server port for the Prometheus metrics endpoint. Only used when `METRICS_ENABLED=true` in daemon mode. -#### Add disk utilization -1. Duplicate the graph or create a new one, change its name to "Disk usage" and replace its plots with `disk_usage.current`. -1. The resulting graph will render the actual disk usage. While this does not allow to identify the usage of disk space compared with the configured ephemeral storage limit, this graph gives an impression on whether the disk usage is growing over time. The the unit is MB (1000 -> 1 GB). 
-![](./images/disk-utilization.png) +**Trusted Profile Authentication (Option 1):** +- **`MONITORING_INSTANCE_GUID`**: The GUID of your IBM Cloud Monitoring instance (required for Trusted Profile authentication) +- **`MONITORING_REGION`**: The region where your Monitoring instance is deployed, e.g., `us-south`, `eu-de`, `eu-gb` (required for Trusted Profile authentication) +- **`TRUSTED_PROFILE_NAME`**: The name of the IBM Cloud Trusted Profile to use for authentication (required for Trusted Profile authentication) +- **`CR_TOKEN_FILENAME`** (optional): Override the default container resource token file path (default: `/var/run/secrets/codeengine.cloud.ibm.com/compute-resource-token/token`) +**Note:** When all three Trusted Profile environment variables are set, the collector will attempt Trusted Profile authentication first. If it fails or the variables are not set, it will fall back to using a mounted secret at `/etc/secrets/monitoring-apikey`. diff --git a/metrics-collector/docs/indicator-metric-design.md b/metrics-collector/docs/indicator-metric-design.md new file mode 100644 index 000000000..1ce67ba39 --- /dev/null +++ b/metrics-collector/docs/indicator-metric-design.md @@ -0,0 +1,272 @@ +# Indicator Metric Design for Sysdig Dashboard Auto-Discovery + +## Overview + +This document analyzes the requirements and provides recommendations for implementing an indicator metric (`ibm_codeengine_instance_resources`) that Sysdig will use to automatically show built-in dashboards. + +## Requirements Analysis + +### Purpose +- Signal to Sysdig that Code Engine metrics are available +- Trigger automatic display of pre-configured dashboards +- Indicate the health/readiness status of the metrics collector + +### Sysdig Recommendation +- **Metric Type**: Gauge +- **Metric Name**: `ibm_codeengine_instance_resources` +- **Status Label**: Should indicate readiness (value: `ready`) + +## Design Recommendations + +### ✅ YES - This Metric Makes Sense + +**Reasons:** +1. 
**Dashboard Discovery**: Sysdig can detect the presence of Code Engine metrics and automatically enable relevant dashboards +2. **Health Indicator**: Provides a simple way to monitor if the metrics collector is functioning +3. **Low Cardinality**: When designed properly, it adds minimal overhead +4. **Industry Standard**: Similar patterns used by other monitoring integrations (e.g., `up` metric in Prometheus exporters) + +### Metric Design + +#### Recommended Approach: Single Instance Gauge + +```prometheus +# HELP ibm_codeengine_instance_resources Indicator metric for IBM Code Engine resource monitoring +# TYPE ibm_codeengine_instance_resources gauge +ibm_codeengine_instance_resources{status="ready"} 1 +``` + +**Characteristics:** +- **Value**: Always `1` when collector is running and healthy +- **Single time series**: Only one metric instance per collector +- **Minimal cardinality**: Only 1 time series regardless of number of pods/apps/jobs + +#### Alternative Approach: Per-Namespace Gauge (If Multiple Namespaces) + +```prometheus +# HELP ibm_codeengine_instance_resources Indicator metric for IBM Code Engine resource monitoring +# TYPE ibm_codeengine_instance_resources gauge +ibm_codeengine_instance_resources{namespace="my-project-namespace",status="ready"} 1 +``` + +**Use this if:** +- You plan to monitor multiple Code Engine projects from a single collector +- You need namespace-level visibility + +**Cardinality**: 1 time series per namespace (still very low) + +### Label Design + +#### Recommended Labels + +**Minimal (Recommended):** +```go +status="ready" // Indicates collector is operational +``` + +**Extended (Optional):** +```go +status="ready" // Collector status +namespace="my-project-namespace" // Code Engine project namespace +collector_version="1.0.0" // Collector version for troubleshooting +``` + +#### Status Values + +| Status | Value | Meaning | Use Case | +|--------|-------|---------|----------| +| Ready | `ready` | Collector is operational and 
collecting metrics | Normal operation | +| Degraded | `degraded` | Collector is running but experiencing issues | Partial failures | +| Starting | `starting` | Collector is initializing | Startup phase | + +**Recommendation**: Start with only `ready` status. Add others only if needed for operational visibility. + +### Cardinality Analysis + +#### Current Metrics Cardinality +Looking at your existing metrics: +``` +ibm_codeengine_instance_cpu_usage_millicores{ + ibm_codeengine_instance_name="pod-xyz", + ibm_codeengine_component_type="app", + ibm_codeengine_component_name="my-app" +} +``` + +**Cardinality**: N time series (where N = number of pods) +- If you have 100 pods, you have 100 time series per metric type +- Total: 100 pods × 4 metrics = 400 time series + +#### Indicator Metric Cardinality + +**Option 1: Single Instance (Recommended)** +``` +ibm_codeengine_instance_resources{status="ready"} 1 +``` +**Cardinality**: 1 time series (constant, regardless of pod count) +**Impact**: Negligible (~0.25% overhead for 100 pods) + +**Option 2: Per-Namespace** +``` +ibm_codeengine_instance_resources{namespace="ns1",status="ready"} 1 +ibm_codeengine_instance_resources{namespace="ns2",status="ready"} 1 +``` +**Cardinality**: M time series (where M = number of namespaces) +**Impact**: Still negligible for typical use cases (1-10 namespaces) + +**Option 3: Per-Component (NOT RECOMMENDED)** +``` +ibm_codeengine_instance_resources{component_name="app1",status="ready"} 1 +ibm_codeengine_instance_resources{component_name="app2",status="ready"} 1 +``` +**Cardinality**: K time series (where K = number of components) +**Impact**: Defeats the purpose of an indicator metric; use existing metrics instead + +### Gauge Value Recommendations + +#### Option 1: Binary Indicator (Recommended) +```go +Value: 1 // Collector is running +Value: 0 // Collector is stopped (metric disappears) +``` + +**Advantages:** +- Simple and clear +- Standard pattern (like Prometheus `up` metric) +- Easy to 
alert on: `ibm_codeengine_instance_resources == 0` + +#### Option 2: Timestamp +```go +Value: 1710950400 // Unix timestamp of last successful collection +``` + +**Advantages:** +- Can detect stale metrics +- Useful for debugging collection delays + +**Disadvantages:** +- More complex to interpret +- Not necessary if Prometheus already tracks metric timestamps + +#### Option 3: Count of Monitored Resources +```go +Value: 42 // Number of pods being monitored +``` + +**Advantages:** +- Provides additional context +- Can alert on unexpected changes + +**Disadvantages:** +- Adds complexity +- Information already available from other metrics + +**Recommendation**: Use **Option 1 (Binary Indicator)** with value `1` + +### Implementation Strategy + +#### When to Emit the Metric + +**Option A: Always Emit (Recommended)** +```go +// Emit on every metrics collection cycle +// Value: 1 (constant) +``` + +**Option B: Conditional Emit** +```go +// Only emit if collection was successful +// Value: 1 if success, 0 or absent if failure +``` + +**Recommendation**: Use Option A for simplicity. The metric's presence indicates the collector is running. 
+ +#### Where to Add in Code + +Add to [`formatPrometheusMetrics()`](main.go:81) function after existing metrics: + +```go +// After line 189 (after internal stats) +sb.WriteString("# HELP ibm_codeengine_instance_resources Indicator metric for IBM Code Engine resource monitoring\n") +sb.WriteString("# TYPE ibm_codeengine_instance_resources gauge\n") +sb.WriteString("ibm_codeengine_instance_resources{status=\"ready\"} 1\n") +sb.WriteString("\n") +``` + +### Complete Metric Specification + +```prometheus +# HELP ibm_codeengine_instance_resources Indicator metric for IBM Code Engine resource monitoring +# TYPE ibm_codeengine_instance_resources gauge +ibm_codeengine_instance_resources{status="ready"} 1 +``` + +**Metric Properties:** +- **Name**: `ibm_codeengine_instance_resources` +- **Type**: Gauge +- **Value**: `1` (constant) +- **Labels**: + - `status`: `"ready"` (indicates collector is operational) +- **Cardinality**: 1 time series per collector instance +- **Update Frequency**: Every metrics scrape (same as other metrics) + +### Alerting Examples + +Once implemented, you can create alerts: + +```promql +# Alert if collector is down +absent(ibm_codeengine_instance_resources{status="ready"}) + +# Alert if collector hasn't reported in 5 minutes +time() - timestamp(ibm_codeengine_instance_resources{status="ready"}) > 300 +``` + +### Dashboard Auto-Discovery + +Sysdig will use this metric to: +1. Detect Code Engine metrics are available +2. Automatically enable the "IBM Code Engine - Container Resource Overview" dashboard +3. 
Show the dashboard in the user's dashboard list + +**Detection Query Example:** +```promql +count(ibm_codeengine_instance_resources{status="ready"}) > 0 +``` + +## Summary + +### ✅ Recommended Implementation + +```go +// Add to formatPrometheusMetrics() function +sb.WriteString("# HELP ibm_codeengine_instance_resources Indicator metric for IBM Code Engine resource monitoring\n") +sb.WriteString("# TYPE ibm_codeengine_instance_resources gauge\n") +sb.WriteString("ibm_codeengine_instance_resources{status=\"ready\"} 1\n") +sb.WriteString("\n") +``` + +### Key Benefits + +1. **Minimal Cardinality**: Only 1 time series (0.25% overhead) +2. **Simple Design**: Binary indicator (value = 1) +3. **Standard Pattern**: Follows Prometheus exporter conventions +4. **Operational Value**: Can be used for health checks and alerting +5. **Dashboard Discovery**: Enables Sysdig auto-discovery feature + +### Cardinality Impact + +| Scenario | Existing Metrics | Indicator Metric | Total | Overhead | +|----------|------------------|------------------|-------|----------| +| 10 pods | 40 | 1 | 41 | 2.5% | +| 100 pods | 400 | 1 | 401 | 0.25% | +| 1000 pods | 4000 | 1 | 4001 | 0.025% | + +**Conclusion**: The indicator metric adds negligible cardinality overhead while providing significant operational value. + +## Next Steps + +1. Implement the metric in [`main.go`](main.go:81) +2. Test with Sysdig to verify dashboard auto-discovery +3. Update documentation with the new metric +4. 
Consider adding to the dashboard JSON if needed for visibility diff --git a/metrics-collector/docs/metrics-collector.drawio b/metrics-collector/docs/metrics-collector.drawio new file mode 100644 index 000000000..11852c3b5 --- /dev/null +++ b/metrics-collector/docs/metrics-collector.drawio @@ -0,0 +1,327 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/metrics-collector/go.mod b/metrics-collector/go.mod index b816d4e37..e358afe92 100644 --- a/metrics-collector/go.mod +++ b/metrics-collector/go.mod @@ -1,6 +1,6 @@ module metrics-collector -go 1.23.0 +go 1.25.0 require ( k8s.io/api v0.30.1 @@ -31,7 +31,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect golang.org/x/sys v0.31.0 // indirect golang.org/x/term v0.30.0 // indirect @@ -42,7 +42,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.120.1 // indirect - k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f // indirect + k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 // indirect sigs.k8s.io/json 
v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect diff --git a/metrics-collector/go.sum b/metrics-collector/go.sum index 6f6389a87..f2758c0dc 100644 --- a/metrics-collector/go.sum +++ b/metrics-collector/go.sum @@ -14,8 +14,7 @@ github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDsl github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= -github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -28,8 +27,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 h1:k7nVchz72niMH6YLQNvHSdIE7iqsQxK1P41mySCvssg= -github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod 
h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= @@ -58,10 +57,10 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.17.2 h1:7eMhcy3GimbsA3hEnVKdw/PQM9XN9krpKVXsZdph0/g= -github.com/onsi/ginkgo/v2 v2.17.2/go.mod h1:nP2DPOQoNsQmsVyv5rDA8JkXQoCs6goXIvr/PRJ1eCc= -github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= -github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY= +github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM= +github.com/onsi/gomega v1.31.0 h1:54UJxxj6cPInHS3a35wm6BK/F9nHYueZ1NVujHDrnXE= +github.com/onsi/gomega v1.31.0/go.mod h1:DW9aCi7U6Yi40wNVAvT6kzFnEVEI5n3DloYBiKiT6zk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= @@ -83,8 +82,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -133,8 +132,8 @@ k8s.io/client-go v0.30.1 h1:uC/Ir6A3R46wdkgCV3vbLyNOYyCJ8oZnjtJGKfytl/Q= k8s.io/client-go v0.30.1/go.mod h1:wrAqLNs2trwiCH/wxxmT/x3hKVH9PuV0GGW0oDoHVqc= k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f h1:0LQagt0gDpKqvIkAMPaRGcXawNMouPECM1+F9BVxEaM= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f/go.mod h1:S9tOR0FxgyusSNR+MboCuiDpVWkAifZvaYI1Q2ubgro= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/kubectl v0.30.1 h1:sHFIRI3oP0FFZmBAVEE8ErjnTyXDPkBcvO88mH9RjuY= k8s.io/kubectl v0.30.1/go.mod h1:7j+L0Cc38RYEcx+WH3y44jRBe1Q1jxdGPKkX0h4iDq0= k8s.io/metrics v0.30.1 h1:PeA9cP0kxVtaC8Wkzp4sTkr7YSkd9R0UYP6cCHOOY1M= diff --git a/metrics-collector/images/configure-plots.png b/metrics-collector/images/configure-plots.png deleted file mode 100644 index ceffd258b..000000000 Binary files a/metrics-collector/images/configure-plots.png and /dev/null differ diff --git 
a/metrics-collector/images/count-of-metrics-lines.png b/metrics-collector/images/count-of-metrics-lines.png deleted file mode 100644 index e274f48b1..000000000 Binary files a/metrics-collector/images/count-of-metrics-lines.png and /dev/null differ diff --git a/metrics-collector/images/cpu-utilization.png b/metrics-collector/images/cpu-utilization.png deleted file mode 100644 index 4329d5714..000000000 Binary files a/metrics-collector/images/cpu-utilization.png and /dev/null differ diff --git a/metrics-collector/images/ibm-cloud-logs--loglines.png b/metrics-collector/images/ibm-cloud-logs--loglines.png deleted file mode 100644 index e865c935c..000000000 Binary files a/metrics-collector/images/ibm-cloud-logs--loglines.png and /dev/null differ diff --git a/metrics-collector/images/memory-utilization.png b/metrics-collector/images/memory-utilization.png deleted file mode 100644 index d1b533810..000000000 Binary files a/metrics-collector/images/memory-utilization.png and /dev/null differ diff --git a/metrics-collector/images/metrics-collector.overview.png b/metrics-collector/images/metrics-collector.overview.png new file mode 100644 index 000000000..89535b90c Binary files /dev/null and b/metrics-collector/images/metrics-collector.overview.png differ diff --git a/metrics-collector/images/monitoring-dashboard-ce-component-resources.png b/metrics-collector/images/monitoring-dashboard-ce-component-resources.png new file mode 100644 index 000000000..c6b2a282f Binary files /dev/null and b/metrics-collector/images/monitoring-dashboard-ce-component-resources.png differ diff --git a/metrics-collector/images/new-board.png b/metrics-collector/images/new-board.png deleted file mode 100644 index b95e182e5..000000000 Binary files a/metrics-collector/images/new-board.png and /dev/null differ diff --git a/metrics-collector/images/resource-usage-graph.png b/metrics-collector/images/resource-usage-graph.png deleted file mode 100644 index 47eb80be4..000000000 Binary files 
a/metrics-collector/images/resource-usage-graph.png and /dev/null differ diff --git a/metrics-collector/main.go b/metrics-collector/main.go index 63a2ba867..4bf7d7792 100644 --- a/metrics-collector/main.go +++ b/metrics-collector/main.go @@ -5,10 +5,14 @@ import ( "context" "encoding/json" "fmt" + "net/http" "os" + "os/signal" "strconv" "strings" "sync" + "sync/atomic" + "syscall" "time" v1 "k8s.io/api/core/v1" @@ -23,13 +27,186 @@ import ( metricsv "k8s.io/metrics/pkg/client/clientset/versioned" ) -func main() { +// MetricsCache holds the latest collected metrics in a thread-safe manner +type MetricsCache struct { + mu sync.RWMutex + metrics []InstanceResourceStats + namespace string + lastUpdate time.Time + collectionCount int64 + errorCount int64 +} + +// CollectorStats tracks collector performance metrics +type CollectorStats struct { + lastCollectionDuration atomic.Int64 // in milliseconds + lastCollectionTime atomic.Int64 // unix timestamp + totalErrors atomic.Int64 +} + +var ( + metricsCache = &MetricsCache{} + collectorStats = &CollectorStats{} +) + +// setupHTTPHandlers configures the HTTP routes +func setupHTTPHandlers() http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/metrics", metricsHandler) + return mux +} + +// metricsHandler serves Prometheus-formatted metrics +func metricsHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + metricsCache.mu.RLock() + metrics := metricsCache.metrics + namespace := metricsCache.namespace + lastUpdate := metricsCache.lastUpdate + metricsCache.mu.RUnlock() + + // Set content type for Prometheus + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + + // Write Prometheus metrics + output := formatPrometheusMetrics(metrics, namespace, lastUpdate) + w.Write([]byte(output)) +} + +// formatPrometheusMetrics converts metrics to Prometheus text format +func 
formatPrometheusMetrics(metrics []InstanceResourceStats, namespace string, lastUpdate time.Time) string { + var sb strings.Builder + // Helper function to escape label values + escapeLabelValue := func(s string) string { + s = strings.ReplaceAll(s, "\\", "\\\\") + s = strings.ReplaceAll(s, "\"", "\\\"") + s = strings.ReplaceAll(s, "\n", "\\n") + return s + } + + // Write container CPU usage metrics + sb.WriteString("# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores\n") + sb.WriteString("# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge\n") + for _, m := range metrics { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_cpu_usage_millicores{%s} %d\n", labels, m.Cpu.Current)) + } + sb.WriteString("\n") + + // Write container CPU limit metrics + sb.WriteString("# HELP ibm_codeengine_instance_cpu_limit_millicores Configured CPU limit in millicores\n") + sb.WriteString("# TYPE ibm_codeengine_instance_cpu_limit_millicores gauge\n") + for _, m := range metrics { + if m.Cpu.Configured > 0 { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_cpu_limit_millicores{%s} %d\n", labels, m.Cpu.Configured)) + } + } + sb.WriteString("\n") + + // Write container memory usage metrics + sb.WriteString("# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_memory_usage_bytes gauge\n") + for _, m := range metrics { + labels := 
fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_memory_usage_bytes{%s} %d\n", labels, m.Memory.Current*1000*1000)) + } + sb.WriteString("\n") + + // Write container memory limit metrics + sb.WriteString("# HELP ibm_codeengine_instance_memory_limit_bytes Configured memory limit in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_memory_limit_bytes gauge\n") + for _, m := range metrics { + if m.Memory.Configured > 0 { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_memory_limit_bytes{%s} %d\n", labels, m.Memory.Configured*1000*1000)) + } + } + sb.WriteString("\n") + + // Write container ephemeral storage usage metrics (if available) + hasStorageMetrics := false + for _, m := range metrics { + if m.DiskUsage.Current > 0 { + hasStorageMetrics = true + break + } + } + + if hasStorageMetrics { + sb.WriteString("# HELP ibm_codeengine_instance_ephemeral_storage_usage_bytes Current ephemeral storage usage in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_ephemeral_storage_usage_bytes gauge\n") + for _, m := range metrics { + if m.DiskUsage.Current > 0 { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_ephemeral_storage_usage_bytes{%s} %d\n", labels, 
m.DiskUsage.Current*1000*1000)) + } + } + sb.WriteString("\n") + } + + if os.Getenv("METRICS_INTERNAL_STATS") == "true" { + // Write collector self-monitoring metrics + sb.WriteString("# HELP codeengine_collector_collection_duration_seconds Time taken to collect metrics in seconds\n") + sb.WriteString("# TYPE codeengine_collector_collection_duration_seconds gauge\n") + durationMs := collectorStats.lastCollectionDuration.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_collection_duration_seconds %.3f\n", float64(durationMs)/1000.0)) + sb.WriteString("\n") + + sb.WriteString("# HELP codeengine_collector_last_collection_timestamp_seconds Unix timestamp of last successful collection\n") + sb.WriteString("# TYPE codeengine_collector_last_collection_timestamp_seconds gauge\n") + lastCollectionTime := collectorStats.lastCollectionTime.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_last_collection_timestamp_seconds %d\n", lastCollectionTime)) + sb.WriteString("\n") + + sb.WriteString("# HELP codeengine_collector_collection_errors_total Total number of collection errors\n") + sb.WriteString("# TYPE codeengine_collector_collection_errors_total counter\n") + totalErrors := collectorStats.totalErrors.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_collection_errors_total %d\n", totalErrors)) + sb.WriteString("\n") + } + + // Add indicator metric for Sysdig dashboard discovery + sb.WriteString("# HELP ibm_codeengine_instance_resources Indicator metric for IBM Cloud Code Engine resource monitoring\n") + sb.WriteString("# TYPE ibm_codeengine_instance_resources gauge\n") + sb.WriteString("ibm_codeengine_instance_resources{status=\"ready\"} 1\n") + sb.WriteString("\n") + + return sb.String() +} + +func main() { jobMode := os.Getenv("JOB_MODE") // In task mode, collect the resource metrics once if jobMode == "task" { - collectInstanceMetrics() + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", 
err) + os.Exit(1) + } return } @@ -42,11 +219,105 @@ func main() { } } - // In daemon mode, collect resource metrics in an endless loop - for { - collectInstanceMetrics() - time.Sleep(time.Duration(sleepDuration) * time.Second) + // Check if HTTP metrics server should be enabled + metricsEnabled := os.Getenv("METRICS_ENABLED") == "true" + + // Get metrics port configuration + metricsPort := "9100" + if port := os.Getenv("METRICS_PORT"); port != "" { + metricsPort = port + } + + // Create context for graceful shutdown + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Setup signal handling for graceful shutdown + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // Start HTTP server only if METRICS_ENABLED=true + var server *http.Server + var serverErrors chan error + + if metricsEnabled { + server = &http.Server{ + Addr: ":" + metricsPort, + Handler: setupHTTPHandlers(), + } + + // Start HTTP server in a goroutine + serverErrors = make(chan error, 1) + go func() { + fmt.Printf("Starting HTTP metrics server on port %s\n", metricsPort) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + serverErrors <- fmt.Errorf("HTTP server error: %w", err) + } + }() + } else { + fmt.Println("HTTP metrics server disabled (METRICS_ENABLED not set to 'true')") } + + // Start metrics collection loop in a goroutine + collectionDone := make(chan struct{}) + go func() { + defer close(collectionDone) + ticker := time.NewTicker(time.Duration(sleepDuration) * time.Second) + defer ticker.Stop() + + // Collect metrics immediately on startup + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + collectorStats.totalErrors.Add(1) + } + + for { + select { + case <-ctx.Done(): + fmt.Println("Stopping metrics collection...") + return + case <-ticker.C: + if err := collectInstanceMetrics(metricsCache); err != nil { + 
fmt.Printf("Error collecting metrics: %v\n", err) + collectorStats.totalErrors.Add(1) + } + } + } + }() + + // Wait for shutdown signal or server error + if metricsEnabled { + select { + case sig := <-sigChan: + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + case err := <-serverErrors: + fmt.Printf("Server error: %v\n", err) + } + } else { + // If server is not running, just wait for signal + sig := <-sigChan + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + } + + // Cancel context to stop metrics collection + cancel() + + // Shutdown HTTP server with timeout (only if it was started) + if metricsEnabled && server != nil { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer shutdownCancel() + + if err := server.Shutdown(shutdownCtx); err != nil { + fmt.Printf("HTTP server shutdown error: %v\n", err) + } else { + fmt.Println("HTTP server stopped gracefully") + } + } + + // Wait for metrics collection to finish + <-collectionDone + fmt.Println("Metrics collection stopped") + fmt.Println("Shutdown complete") } type ComponentType int64 @@ -88,28 +359,129 @@ type InstanceResourceStats struct { Message string `json:"message"` } +// buildPodMap creates a map of pod names to pod objects for O(1) lookup +func buildPodMap(pods *[]v1.Pod) map[string]*v1.Pod { + podMap := make(map[string]*v1.Pod, len(*pods)) + for i := range *pods { + podMap[(*pods)[i].Name] = &(*pods)[i] + } + return podMap +} + +// extractComponentMetadata extracts component type, name, and parent from pod metric labels +func extractComponentMetadata(podMetric *v1beta1.PodMetrics) (componentType ComponentType, componentName, parent string) { + componentType = determineComponentType(podMetric) + + switch componentType { + case Job: + if jobName, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-definition-name"]; ok { + componentName = jobName + } else if jobRunName, ok := 
podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"]; ok { + componentName = jobRunName + } else { + componentName = "standalone" + } + parent = podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"] + case App: + componentName = podMetric.ObjectMeta.Labels["serving.knative.dev/service"] + parent = podMetric.ObjectMeta.Labels["serving.knative.dev/revision"] + case Build: + if buildName, ok := podMetric.ObjectMeta.Labels["build.shipwright.io/name"]; ok { + componentName = buildName + } else if buildRunName, ok := podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"]; ok { + componentName = buildRunName + } else { + componentName = "standalone" + } + parent = podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"] + default: + componentName = "unknown" + } + + return +} + +// processMetric processes a single pod metric and outputs the JSON log line +func processMetric( + podMetric *v1beta1.PodMetrics, + podMap map[string]*v1.Pod, + clientset *kubernetes.Clientset, + namespace string, + config *rest.Config, +) *InstanceResourceStats { + // Extract component metadata + componentType, componentName, parent := extractComponentMetadata(podMetric) + + // Determine the actual CPU and memory usage + cpuCurrent := podMetric.Containers[0].Usage.Cpu().ToDec().AsApproximateFloat64() * 1000 + memoryCurrent := podMetric.Containers[0].Usage.Memory().ToDec().AsApproximateFloat64() / 1000 / 1000 + + stats := InstanceResourceStats{ + Metric: "instance-resources", + Name: podMetric.Name, + Parent: parent, + ComponentType: componentType.String(), + ComponentName: componentName, + Cpu: ResourceStats{ + Current: int64(cpuCurrent), + }, + Memory: ResourceStats{ + Current: int64(memoryCurrent), + }, + } + + // Gather the configured resource limits and calculate the usage (in percent) + pod := podMap[podMetric.Name] + if pod != nil { + userContainerName := getUserContainerName(componentType, pod) + + // determine the actual disk usage + storageCurrent := 
obtainDiskUsage(clientset, namespace, podMetric.Name, userContainerName, config) + stats.DiskUsage.Current = int64(storageCurrent) + + // extract memory and cpu limits + cpu, memory := getCpuAndMemoryLimits(userContainerName, pod) + + cpuLimit := cpu.ToDec().AsApproximateFloat64() * 1000 + stats.Cpu.Configured = int64(cpuLimit) + stats.Cpu.Usage = int64((cpuCurrent / cpuLimit) * 100) + + memoryLimit := memory.ToDec().AsApproximateFloat64() / 1000 / 1000 + stats.Memory.Configured = int64(memoryLimit) + stats.Memory.Usage = int64(memoryCurrent / memoryLimit * 100) + } + + // Compose the log line message + stats.Message = "Captured metrics of " + stats.ComponentType + " instance '" + stats.Name + "': " + fmt.Sprintf("%d", stats.Cpu.Current) + "m vCPU, " + fmt.Sprintf("%d", stats.Memory.Current) + " MB memory, " + fmt.Sprintf("%d", stats.DiskUsage.Current) + " MB disk usage" + + // Write the stringified JSON struct and make use of IBM Cloud Logs built-in parsing mechanism, + // which allows to annotate log lines by providing a JSON object instead of a simple string + fmt.Println(ToJSONString(&stats)) + + return &stats +} + // Helper function that retrieves all pods and all pod metrics // this function creates a structured log line for each pod for which the kube metrics api provides a metric -func collectInstanceMetrics() { - +func collectInstanceMetrics(cache *MetricsCache) error { startTime := time.Now() fmt.Println("Start to capture pod metrics ...") config, err := rest.InClusterConfig() if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to get cluster config: %w", err) } // obtain the kube namespace related to this Code Engine project nsBytes, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to read namespace: %w", err) } namespace := string(nsBytes) coreClientset, err := kubernetes.NewForConfig(config) if err != nil { - panic(err.Error()) + return 
fmt.Errorf("failed to create clientset: %w", err) } // fetches all pods @@ -118,96 +490,53 @@ func collectInstanceMetrics() { // fetch all pod metrics podMetrics := getAllPodMetrics(namespace, config) + // Build pod map for O(1) lookup + podMap := buildPodMap(pods) + + // Collect metrics into a slice + var collectedMetrics []InstanceResourceStats + var metricsMu sync.Mutex + + // Use semaphore to limit concurrent goroutines + const maxConcurrency = 20 + sem := make(chan struct{}, maxConcurrency) var wg sync.WaitGroup for _, metric := range *podMetrics { wg.Add(1) + sem <- struct{}{} // Acquire semaphore go func(podMetric *v1beta1.PodMetrics) { defer wg.Done() + defer func() { <-sem }() // Release semaphore - // Determine the component type (either app, job, build or unknown) - componentType := determineComponentType(podMetric) - - // Determine the component name - var componentName string - var parent string - switch componentType { - case Job: - if val, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-definition-name"]; ok { - componentName = val - } else { - componentName = "standalone" - } - parent = podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"] - case App: - componentName = podMetric.ObjectMeta.Labels["serving.knative.dev/service"] - parent = podMetric.ObjectMeta.Labels["serving.knative.dev/revision"] - case Build: - if val, ok := podMetric.ObjectMeta.Labels["build.shipwright.io/name"]; ok { - componentName = val - } else { - componentName = "standalone" - } - - parent = podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"] - default: - componentName = "unknown" - } - - // Determine the actual CPU and memory usage - cpuCurrent := podMetric.Containers[0].Usage.Cpu().ToDec().AsApproximateFloat64() * 1000 - memoryCurrent := podMetric.Containers[0].Usage.Memory().ToDec().AsApproximateFloat64() / 1000 / 1000 - - stats := InstanceResourceStats{ - Metric: "instance-resources", - Name: podMetric.Name, - Parent: parent, - 
ComponentType: componentType.String(), - ComponentName: componentName, - Cpu: ResourceStats{ - Current: int64(cpuCurrent), - }, - Memory: ResourceStats{ - Current: int64(memoryCurrent), - }, - } - - // Gather the configured resource limits and calculate the usage (in percent) - pod := getPod(podMetric.Name, pods) - if pod != nil { - - userContainerName := getUserContainerName(componentType, pod) - - // determine the actual disk usage - storageCurrent := obtainDiskUsage(coreClientset, namespace, podMetric.Name, userContainerName, config) - stats.DiskUsage.Current = int64(storageCurrent) - - // extract memory and cpu limits - cpu, memory := getCpuAndMemoryLimits(userContainerName, pod) - - cpuLimit := cpu.ToDec().AsApproximateFloat64() * 1000 - stats.Cpu.Configured = int64(cpuLimit) - stats.Cpu.Usage = int64((cpuCurrent / cpuLimit) * 100) - - memoryLimit := memory.ToDec().AsApproximateFloat64() / 1000 / 1000 - stats.Memory.Configured = int64(memoryLimit) - stats.Memory.Usage = int64(memoryCurrent / memoryLimit * 100) + stats := processMetric(podMetric, podMap, coreClientset, namespace, config) + if stats != nil { + metricsMu.Lock() + collectedMetrics = append(collectedMetrics, *stats) + metricsMu.Unlock() } - - // Compose the log line message - stats.Message = "Captured metrics of " + stats.ComponentType + " instance '" + stats.Name + "': " + fmt.Sprintf("%d", stats.Cpu.Current) + "m vCPU, " + fmt.Sprintf("%d", stats.Memory.Current) + " MB memory, " + fmt.Sprintf("%d", stats.DiskUsage.Current) + " MB disk usage" - - // Write the stringified JSON struct and make use of IBM Cloud Logs built-in parsing mechanism, - // which allows to annotate log lines by providing a JSON object instead of a simple string - fmt.Println(ToJSONString(&stats)) - }(&metric) } wg.Wait() - fmt.Println("Captured pod metrics in " + strconv.FormatInt(time.Since(startTime).Milliseconds(), 10) + " ms") + duration := time.Since(startTime) + fmt.Println("Captured pod metrics in " + 
strconv.FormatInt(duration.Milliseconds(), 10) + " ms") + + // Update cache with collected metrics + cache.mu.Lock() + cache.metrics = collectedMetrics + cache.namespace = namespace + cache.lastUpdate = time.Now() + cache.collectionCount++ + cache.mu.Unlock() + + // Update collector statistics + collectorStats.lastCollectionDuration.Store(duration.Milliseconds()) + collectorStats.lastCollectionTime.Store(time.Now().Unix()) + + return nil } // Helper function to determine the component type @@ -224,16 +553,6 @@ func determineComponentType(podMetric *v1beta1.PodMetrics) ComponentType { return Unknown } -// Helper function to obtain a pod by its name from a slice of pods -func getPod(name string, pods *[]v1.Pod) *v1.Pod { - for _, pod := range *pods { - if pod.Name == name { - return &pod - } - } - return nil -} - // Helper function to retrieve all pods from the Kube API func getAllPods(coreClientset *kubernetes.Clientset, namespace string, config *rest.Config) *[]v1.Pod { @@ -374,10 +693,17 @@ func getUserContainerName(componentType ComponentType, pod *v1.Pod) string { return "user-container" } - if componentType == Job || componentType == Build { + if componentType == Job { return pod.Spec.Containers[0].Name } + // builds are using two containers: + // a quite small 'step-source-default' + // and the 'step-build-and-push' which does the heavy lifting + if componentType == Build && len(pod.Spec.Containers) > 1 { + return pod.Spec.Containers[1].Name + } + // for kube-native deployments, we pick the first container return pod.Spec.Containers[0].Name } diff --git a/metrics-collector/prometheus.yml.template b/metrics-collector/prometheus.yml.template new file mode 100644 index 000000000..f944024e1 --- /dev/null +++ b/metrics-collector/prometheus.yml.template @@ -0,0 +1,99 @@ +global: + scrape_interval: 30s + external_labels: + ibm_codeengine_project_name: '${CE_PROJECT_NAME}' + +scrape_configs: + - job_name: 'codeengine-metrics-project' + static_configs: + - targets: 
['localhost:9100'] + relabel_configs: + # Add project name label + - source_labels: [job] + action: replace + regex: (.+) + replacement: '${CE_PROJECT_NAME}' + target_label: ibm_codeengine_project_name + + - job_name: 'codeengine-metrics-user' + fallback_scrape_protocol: PrometheusText0.0.4 + kubernetes_sd_configs: + - api_server: 'https://172.21.0.1' + role: pod + namespaces: + names: + - ${CE_SUBDOMAIN} + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + # only scrape when annotation codeengine.cloud.ibm.com/userMetricsScrape: 'true' is set + - source_labels: [__meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsScrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsPath] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsPort] + action: replace + regex: (.+):(?:\d+);(\d+) + replacement: ${1}:${2} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + # rename important meta data labels + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: ibm_codeengine_namespace + - action: replace + replacement: '${CE_PROJECT_NAME}' + target_label: ibm_codeengine_project_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: ibm_codeengine_instance_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_service] + action: replace + target_label: ibm_codeengine_component_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_configuration] + action: replace + regex: (.+) + replacement: app + target_label: ibm_codeengine_component_type + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_revision] + action: replace 
+ target_label: ibm_codeengine_subcomponent_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_revisionUID] + action: replace + regex: (.+) + replacement: app_revision + target_label: ibm_codeengine_subcomponent_type + # drop codeengine, istio, and knative labels + - action: labeldrop + regex: "codeengine_cloud_ibm_com_(.+)" + - action: labeldrop + regex: "security_istio_io_(.+)" + - action: labeldrop + regex: "service_istio_io_(.+)" + - action: labeldrop + regex: "serving_knative_dev_(.+)" + # drop default prometheus labels + metric_relabel_configs: + - action: labeldrop + regex: "instance" + - action: labeldrop + regex: "pod_template_hash" + - action: labeldrop + regex: "app" +# +# Define IBM Cloud Monitoring as the remote write target +# +remote_write: +- url: https://${METRICS_REMOTE_WRITE_FQDN}/prometheus/remote/write + authorization: + credentials_file: "/etc/secrets/monitoring-apikey" + write_relabel_configs: + # Dropping scrape metrics (e.g. scrape_duration_seconds) + - source_labels: [__name__] + regex: 'scrape_duration_seconds|scrape_samples_scraped|scrape_series_added|scrape_samples_post_metric_relabeling' + action: drop \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/README.md b/metrics-collector/setup/ibm-cloud-monitoring/README.md new file mode 100644 index 000000000..6ce675b30 --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/README.md @@ -0,0 +1,343 @@ +# IBM Cloud Monitoring Dashboard Setup + +This directory contains tools and dashboards for IBM Cloud Monitoring (Sysdig) integration. + +## Files + +- **`monitoring-dashboard-manager.sh`**: Unified bash script for managing dashboards (recommended) +- **`code-engine-component-resource-overview.json`**: Dashboard configuration for Code Engine resource monitoring + +## Quick Start + +### Prerequisites + +1. **IBM Cloud CLI** installed and logged in +2. **jq** (JSON processor) installed +3. 
**curl** installed (usually pre-installed) +4. An IBM Cloud Monitoring (Sysdig) instance + +### Installation + +The bash script is ready to use. Make it executable if needed: + +```bash +chmod +x monitoring-dashboard-manager.sh +``` + +### Basic Usage + +```bash +# Show help +./monitoring-dashboard-manager.sh help + +# List all monitoring instances +./monitoring-dashboard-manager.sh list-instances + +# List dashboards in an instance +./monitoring-dashboard-manager.sh list-dashboards \ + --instance-id "YOUR_INSTANCE_ID" \ + --region us-south + +# Export a dashboard +./monitoring-dashboard-manager.sh export \ + --instance-id "YOUR_INSTANCE_ID" \ + --region us-south \ + --dashboard-id 12345 + +# Import/create a dashboard +./monitoring-dashboard-manager.sh import \ + --instance-id "YOUR_INSTANCE_ID" \ + --region us-south \ + --file code-engine-component-resource-overview.json +``` + +## Bash Script: monitoring-dashboard-manager.sh + +The unified bash script provides all dashboard management functionality using IBM Cloud CLI authentication. + +### Features + +- ✅ List monitoring instances (global and by region) +- ✅ List dashboards with name, ID, and last updated timestamp +- ✅ Export dashboards to JSON files +- ✅ Create new dashboards from JSON files +- ✅ Update existing dashboards +- ✅ Uses IBM Cloud CLI login context (no API key needed) +- ✅ Comprehensive error handling +- ✅ Table and JSON output formats +- ✅ Verbose mode for debugging + +### Authentication + +The script uses your current IBM Cloud CLI login session. 
Ensure you're logged in: + +```bash +ibmcloud login +``` + +The script automatically retrieves the IAM token using: +```bash +ibmcloud iam oauth-tokens --output JSON | jq -r '.iam_token' +``` + +### Commands + +#### List Monitoring Instances + +List all monitoring instances in your account: + +```bash +# List all instances +./monitoring-dashboard-manager.sh list-instances + +# List instances in a specific region +./monitoring-dashboard-manager.sh list-instances --region us-south + +# Output in JSON format +./monitoring-dashboard-manager.sh list-instances --format json +``` + +#### List Dashboards + +List all dashboards in a monitoring instance: + +```bash +./monitoring-dashboard-manager.sh list-dashboards \ + --instance-id "12345678-1234-1234-1234-123456789abc" \ + --region us-south + +# JSON format +./monitoring-dashboard-manager.sh list-dashboards \ + --instance-id "12345678-1234-1234-1234-123456789abc" \ + --region us-south \ + --format json +``` + +#### Export Dashboard + +Export a dashboard to a JSON file: + +```bash +# Export with auto-generated filename +./monitoring-dashboard-manager.sh export \ + --instance-id "12345678-1234-1234-1234-123456789abc" \ + --region us-south \ + --dashboard-id 12345 + +# Export with custom filename +./monitoring-dashboard-manager.sh export \ + --instance-id "12345678-1234-1234-1234-123456789abc" \ + --region us-south \ + --dashboard-id 12345 \ + --output my-dashboard.json +``` + +#### Import/Create Dashboard + +Import a dashboard from a JSON file: + +```bash +# Create new dashboard +./monitoring-dashboard-manager.sh import \ + --instance-id "12345678-1234-1234-1234-123456789abc" \ + --region us-south \ + --file code-engine-component-resource-overview.json + +# Update existing dashboard (by name) +./monitoring-dashboard-manager.sh import \ + --instance-id "12345678-1234-1234-1234-123456789abc" \ + --region us-south \ + --file code-engine-component-resource-overview.json \ + --update +``` + +### Environment Variables + +You can 
set default values using environment variables: + +```bash +export SYSDIG_INSTANCE_ID="12345678-1234-1234-1234-123456789abc" +export SYSDIG_REGION="us-south" + +# Now you can omit --instance-id and --region +./monitoring-dashboard-manager.sh list-dashboards +``` + +### Supported Regions + +- `us-south` - US South (Dallas) +- `us-east` - US East (Washington DC) +- `eu-de` - EU Central (Frankfurt) +- `eu-es` - EU Spain (Madrid) +- `eu-gb` - EU GB (London) +- `jp-tok` - Japan (Tokyo) +- `jp-osa` - Japan (Osaka) +- `au-syd` - Australia (Sydney) +- `ca-tor` - Canada (Toronto) +- `br-sao` - Brazil (São Paulo) + +### Verbose Mode + +Enable verbose output for debugging: + +```bash +./monitoring-dashboard-manager.sh list-instances --verbose +``` + +## Dashboard: Code Engine Container Resource Overview + +The `code-engine-component-resource-overview.json` dashboard provides comprehensive monitoring of Code Engine resources. + +### Panels + +1. **CPU Usage vs Limit (per Pod)** - Compares live CPU usage to configured limits +2. **CPU Utilization % (per App)** - CPU percentage by component +3. **Memory Usage vs Limit (per Pod)** - Compares memory usage to limits +4. **Memory Utilization % (per App)** - Memory percentage by component +5. **CPU Utilization % (per Namespace)** - Namespace-level CPU monitoring +6. **Memory Utilization % (per Namespace)** - Namespace-level memory monitoring +7. **CPU Utilization % (per Revision/Parent)** - Revision-level CPU tracking +8. **Memory Utilization % (per Revision/Parent)** - Revision-level memory tracking +9. **Top Pods by CPU** - Top 10 CPU consumers +10. **Top Pods by Memory** - Top 10 memory consumers +11. **Cluster CPU Utilization (%)** - Global CPU percentage +12. 
**Cluster Memory Utilization (%)** - Global memory percentage + +### Required Metrics + +The dashboard uses the following Prometheus metrics: + +- `codeengine_container_cpu_usage_millicores` +- `codeengine_container_cpu_limit_millicores` +- `codeengine_container_memory_usage_bytes` +- `codeengine_container_memory_limit_bytes` + +These metrics are exposed by the Code Engine metrics collector when running with `METRICS_ENABLED=true`. + +## Getting Your IBM Cloud Credentials + +### IBM Cloud IAM API Key (for Python scripts) + +1. Log in to [IBM Cloud Console](https://cloud.ibm.com) +2. Go to **Manage** > **Access (IAM)** > **API keys** +3. Click **Create an IBM Cloud API key** +4. Give it a name and description +5. Copy and save the API key securely + +### Monitoring Instance ID + +1. Navigate to your IBM Cloud Monitoring instance +2. Click on **Overview** or **Settings** +3. Copy the **Instance ID** (GUID format: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`) + +### Region + +Note the region where your Monitoring instance is deployed (e.g., `us-south`, `eu-de`) + +## Complete Example Workflow + +```bash +# 1. Ensure you're logged in to IBM Cloud +ibmcloud login + +# 2. List your monitoring instances to find the instance ID +./monitoring-dashboard-manager.sh list-instances + +# Example output: +# NAME INSTANCE_ID REGION STATE +# my-monitoring-instance 12345678-1234-1234-1234-123456789abc us-south active + +# 3. Set environment variables for convenience +export SYSDIG_INSTANCE_ID="12345678-1234-1234-1234-123456789abc" +export SYSDIG_REGION="us-south" + +# 4. List existing dashboards +./monitoring-dashboard-manager.sh list-dashboards + +# 5. 
Import the Code Engine dashboard +./monitoring-dashboard-manager.sh import \ + --file code-engine-component-resource-overview.json + +# Output: +# → Loading dashboard configuration from: code-engine-component-resource-overview.json +# → Dashboard name: IBM Code Engine - Container Resource Overview +# → Checking if dashboard already exists... +# → Dashboard does not exist. Creating new dashboard... +# ✓ Dashboard 'IBM Code Engine - Container Resource Overview' created successfully! +# +# Dashboard ID: 12345 +# Dashboard URL: https://us-south.monitoring.cloud.ibm.com/#/dashboards/12345 + +# 6. Later, update the dashboard with changes +./monitoring-dashboard-manager.sh import \ + --file code-engine-component-resource-overview.json \ + --update + +# 7. Export a dashboard for backup +./monitoring-dashboard-manager.sh export --dashboard-id 12345 +``` + +## Troubleshooting + +### Authentication Errors + +**Error: "Not logged in to IBM Cloud CLI"** +- Solution: Run `ibmcloud login` to authenticate + +**Error: "Failed to obtain IAM token"** +- Solution: Ensure you're logged in and your session hasn't expired +- Try: `ibmcloud iam oauth-tokens` to verify token generation + +### Missing Tools + +**Error: "Missing required tools: jq"** +- macOS: `brew install jq` +- Ubuntu/Debian: `sudo apt-get install jq` +- RHEL/CentOS: `sudo yum install jq` + +**Error: "Missing required tools: ibmcloud"** +- Install IBM Cloud CLI: https://cloud.ibm.com/docs/cli?topic=cli-install-ibmcloud-cli + +### Dashboard Not Showing Data + +If the dashboard shows no data: +- Verify the metrics collector is running with `METRICS_ENABLED=true` +- Check that metrics are being sent to IBM Cloud Monitoring +- Ensure the Prometheus remote write configuration is correct +- Wait a few minutes for data to appear (initial scrape interval) + +### Import Errors + +**Error: "Dashboard configuration must include a 'name' field"** +- Ensure your JSON file has a `name` field at the top level + +**Error: "Dashboard 'X' 
already exists"** +- Use the `--update` flag to update the existing dashboard +- Or rename the dashboard in the JSON file + +### API Errors + +**Error: "API request failed (HTTP 403)"** +- Verify you have access to the monitoring instance +- Check that the instance ID is correct +- Ensure your IBM Cloud account has the necessary permissions + +**Error: "API request failed (HTTP 404)"** +- Verify the instance ID and region are correct +- Check that the dashboard ID exists (for export operations) + +## Additional Resources + +- [IBM Cloud Monitoring Documentation](https://cloud.ibm.com/docs/monitoring) +- [Sysdig Dashboard API](https://docs.sysdig.com/en/docs/developer-tools/sysdig-rest-api-conventions/) +- [PromQL Query Language](https://prometheus.io/docs/prometheus/latest/querying/basics/) +- [IBM Cloud CLI Documentation](https://cloud.ibm.com/docs/cli) + +## Contributing + +When making changes to the dashboard or scripts: + +1. Test thoroughly with a real IBM Cloud Monitoring instance +2. Update this README with any new features or changes +3. Ensure backward compatibility where possible +4. 
Document any breaking changes clearly diff --git a/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json b/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json new file mode 100644 index 000000000..e7cfd2263 --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json @@ -0,0 +1,1069 @@ +{ + "name": "IBM Cloud Code Engine - Component Resource Overview", + "panels": [ + { + "id": 1, + "type": "text", + "name": "Dashboard Overview", + "description": "", + "nullValueDisplayText": null, + "links": null, + "markdownSource": "Monitor vCPU, and Memory usage across Code Engine components.\n\n**Use the scope filters above to narrow by:**\n- `ibm_codeengine_component_type` (app, job, build)\n- `ibm_codeengine_component_name` (specific app/job/build name)", + "transparentBackground": false, + "panelTitleVisible": true, + "textAutosized": false + }, + { + "id": 5, + "type": "advancedTimechart", + "name": "vCPU Utilization % (by Component)", + "description": "vCPU usage as percentage of limit, grouped by ibm_codeengine_component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_cpu_limit_millicores{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}", + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + 
"showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "%", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "%", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 0.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 6, + "type": "advancedTimechart", + "name": "Memory Utilization % (by Component)", + "description": "Memory usage as percentage of limit, grouped by ibm_codeengine_component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_memory_usage_bytes{$__scope}) / sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_memory_limit_bytes{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}", + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": 
true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 8, + "type": "advancedNumber", + "name": "Total vCPU Utilization in %", + "description": "Overall vCPU utilization across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum(ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / sum(ibm_codeengine_instance_cpu_limit_millicores{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "medium", + "value": 90.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "none", + "value": 1.0, + "inputFormat": "0-100", + 
"displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 9, + "type": "advancedNumber", + "name": "Total Memory Utilization in %", + "description": "Overall memory utilization across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum(ibm_codeengine_instance_memory_usage_bytes{$__scope}) / sum(ibm_codeengine_instance_memory_limit_bytes{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "medium", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "none", + "value": 1.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 11, + "type": "advancedNumber", + "name": "Total Available vCPUs", + "description": "Overall number of available vCPUs across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_cpu_limit_millicores{$__scope}) / 1000", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + 
"numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 12, + "type": "advancedNumber", + "name": "Total Available Memory", + "description": "Overall amount of available memory across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_memory_limit_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 13, + "type": "advancedNumber", + "name": "Total Used vCPUs", + "description": "Actual number of vCPUs that are used across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / 1000", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 14, + 
"type": "advancedNumber", + "name": "Total Used Memory", + "description": "Actual amount of memory used across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_memory_usage_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 2, + "type": "advancedTimechart", + "name": "vCPU Usage (per Instance)", + "description": "Current vCPU usage per instance", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "ibm_codeengine_instance_cpu_usage_millicores{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "CPU Usage", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}/{{ibm_codeengine_instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", 
+ "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 3, + "type": "advancedTimechart", + "name": "Memory Usage (per Instance)", + "description": "Current memory usage per instance", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "ibm_codeengine_instance_memory_usage_bytes{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Memory Usage", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}/{{ibm_codeengine_instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "B", + "maxInputFormat": "B", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": 
"none", + "displayText": "" + } + } + }, + { + "id": 15, + "type": "advancedNumber", + "name": "Components with Running Instances", + "description": "Number of Code Engine components that do have running instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "count(count(\n ibm_codeengine_instance_cpu_limit_millicores{$__scope}\n)by (ibm_codeengine_component_name))", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 16, + "type": "advancedNumber", + "name": "Running Instances", + "description": "Number of running Code Engine instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(count(\n ibm_codeengine_instance_cpu_limit_millicores{$__scope}\n)by (ibm_codeengine_component_name))", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + } + ], + "scopeExpressionList": [ + { + "operand": "ibm_codeengine_project_name", + "operator": "in", + 
"displayName": "", + "value": [], + "descriptor": { + "documentId": "ibm_codeengine_project_name", + "id": "ibm_codeengine_project_name", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "ibm_codeengine_project_name", + "description": "ibm_codeengine_project_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "ibm_codeengine_project_name", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1772465428162 + }, + "variable": true, + "isVariable": true + }, + { + "operand": "ibm_codeengine_component_type", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "ibm_codeengine_component_type", + "id": "ibm_codeengine_component_type", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "ibm_codeengine_component_type", + "description": "ibm_codeengine_component_type", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + 
"kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "ibm_codeengine_component_type", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1772465428162 + }, + "variable": true, + "isVariable": true + }, + { + "operand": "ibm_codeengine_component_name", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "ibm_codeengine_component_name", + "id": "ibm_codeengine_component_name", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "ibm_codeengine_component_name", + "description": "ibm_codeengine_component_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": 
false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "ibm_codeengine_component_name", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1772465428162 + }, + "variable": true, + "isVariable": true + } + ], + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "severities": [], + "alertStatuses": [], + "categories": [], + "filter": "", + "teamScope": false + } + }, + "shared": true, + "public": false, + "description": "Overview of Code Engine instance resource usage: CPU/memory current vs limits, with filtering by ibm_codeengine_component_type and ibm_codeengine_component_name", + "layout": [ + { + "panelId": 1, + "x": 0, + "y": 0, + "w": 7, + "h": 4 + }, + { + "panelId": 5, + "x": 0, + "y": 4, + "w": 12, + "h": 6 + }, + { + "panelId": 6, + "x": 0, + "y": 10, + "w": 12, + "h": 5 + }, + { + "panelId": 8, + "x": 21, + "y": 0, + "w": 3, + "h": 2 + }, + { + "panelId": 9, + "x": 21, + "y": 2, + "w": 3, + "h": 2 + }, + { + "panelId": 11, + "x": 14, + "y": 0, + "w": 4, + "h": 2 + }, + { + "panelId": 12, + "x": 14, + "y": 2, + "w": 4, + "h": 2 + }, + { + "panelId": 13, + "x": 18, + "y": 0, + "w": 3, + "h": 2 + }, + { + "panelId": 14, + "x": 18, + "y": 2, + "w": 3, + "h": 2 + }, + { + "panelId": 2, + "x": 12, + "y": 4, + "w": 12, + "h": 6 + }, + { + "panelId": 3, + "x": 12, + "y": 10, + "w": 12, + "h": 5 + }, + { + "panelId": 15, + "x": 7, + "y": 0, + "w": 4, + "h": 4 + }, + { + "panelId": 16, + "x": 11, + "y": 0, + "w": 3, + "h": 4 + } + ], + "schema": 3 +} \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/monitoring-dashboard-manager.sh b/metrics-collector/setup/ibm-cloud-monitoring/monitoring-dashboard-manager.sh new file mode 100755 index 000000000..20823e695 --- /dev/null +++ 
b/metrics-collector/setup/ibm-cloud-monitoring/monitoring-dashboard-manager.sh @@ -0,0 +1,822 @@ +#!/usr/bin/env bash + +# IBM Cloud Monitoring Dashboard Manager +# A unified tool for managing IBM Cloud Monitoring (Sysdig) dashboards +# +# This script provides functionality to: +# - List monitoring instances (global and by region) +# - List dashboards (name, id, last_updated) +# - Export dashboard by id into a file +# - Create/update dashboard from JSON file +# +# Authentication uses the current IBM Cloud CLI login context + +set -euo pipefail + +# Script metadata +readonly SCRIPT_VERSION="1.0.0" +readonly SCRIPT_NAME="$(basename "$0")" + +# Regional endpoints for IBM Cloud Monitoring +declare -A REGION_ENDPOINTS=( + ["us-south"]="https://us-south.monitoring.cloud.ibm.com" + ["us-east"]="https://us-east.monitoring.cloud.ibm.com" + ["eu-de"]="https://eu-de.monitoring.cloud.ibm.com" + ["eu-es"]="https://eu-es.monitoring.cloud.ibm.com" + ["eu-gb"]="https://eu-gb.monitoring.cloud.ibm.com" + ["jp-tok"]="https://jp-tok.monitoring.cloud.ibm.com" + ["au-syd"]="https://au-syd.monitoring.cloud.ibm.com" + ["jp-osa"]="https://jp-osa.monitoring.cloud.ibm.com" + ["ca-tor"]="https://ca-tor.monitoring.cloud.ibm.com" + ["br-sao"]="https://br-sao.monitoring.cloud.ibm.com" +) + +# IBM Cloud Resource Controller API +readonly RESOURCE_CONTROLLER_API="https://resource-controller.cloud.ibm.com/v2" + +# Default values +DEFAULT_OUTPUT_DIR="." 
+DEFAULT_FORMAT="table" + +# Global variables +IAM_TOKEN="" +VERBOSE=false + +# Color codes for output +readonly RED='\033[0;31m' +readonly GREEN='\033[0;32m' +readonly YELLOW='\033[1;33m' +readonly BLUE='\033[0;34m' +readonly NC='\033[0m' # No Color + +####################################### +# Print error message and exit +# Arguments: +# Error message +####################################### +error() { + echo -e "${RED}ERROR:${NC} $1" >&2 + exit 1 +} + +####################################### +# Print warning message +# Arguments: +# Warning message +####################################### +warn() { + echo -e "${YELLOW}WARNING:${NC} $1" >&2 +} + +####################################### +# Print info message +# Arguments: +# Info message +####################################### +info() { + echo -e "${BLUE}→${NC} $1" +} + +####################################### +# Print success message +# Arguments: +# Success message +####################################### +success() { + echo -e "${GREEN}✓${NC} $1" +} + +####################################### +# Print verbose message +# Arguments: +# Message +####################################### +verbose() { + if [[ "$VERBOSE" == true ]]; then + echo -e "${BLUE}[DEBUG]${NC} $1" >&2 + fi +} + +####################################### +# Check if required tools are installed +####################################### +check_prerequisites() { + local missing_tools=() + + # Check for required commands + for cmd in ibmcloud curl jq; do + if ! command -v "$cmd" &> /dev/null; then + missing_tools+=("$cmd") + fi + done + + if [[ ${#missing_tools[@]} -gt 0 ]]; then + error "Missing required tools: ${missing_tools[*]}\nPlease install them and try again." + fi + + verbose "All required tools are installed" +} + +####################################### +# Verify IBM Cloud CLI is logged in +####################################### +check_ibmcloud_login() { + verbose "Checking IBM Cloud CLI login status..." + + if ! 
ibmcloud account show &> /dev/null; then + error "Not logged in to IBM Cloud CLI. Please run 'ibmcloud login' first." + fi + + verbose "IBM Cloud CLI is logged in" +} + +####################################### +# Get IAM token from IBM Cloud CLI +# Returns: +# IAM token (Bearer token) +####################################### +get_iam_token() { + if [[ -n "$IAM_TOKEN" ]]; then + echo "$IAM_TOKEN" + return + fi + + verbose "Obtaining IAM token from IBM Cloud CLI..." + + local token + token=$(ibmcloud iam oauth-tokens --output JSON 2>/dev/null | jq -r '.iam_token' 2>/dev/null) + + if [[ -z "$token" || "$token" == "null" ]]; then + error "Failed to obtain IAM token. Please ensure you're logged in to IBM Cloud CLI." + fi + + if [[ ! "$token" =~ ^Bearer ]]; then + error "Invalid token format received from IBM Cloud CLI" + fi + + IAM_TOKEN="$token" + verbose "IAM token obtained successfully" + echo "$token" +} + +####################################### +# Get monitoring endpoint URL for region +# Arguments: +# Region name +# Returns: +# Endpoint URL +####################################### +get_region_endpoint() { + local region="$1" + + if [[ -z "${REGION_ENDPOINTS[$region]:-}" ]]; then + error "Unsupported region: $region\nSupported regions: ${!REGION_ENDPOINTS[*]}" + fi + + echo "${REGION_ENDPOINTS[$region]}" +} + +####################################### +# Make HTTP request with error handling +# Arguments: +# HTTP method +# URL +# Additional curl arguments (optional) +# Returns: +# Response body +####################################### +http_request() { + local method="$1" + local url="$2" + shift 2 + local curl_args=("$@") + + local response + local http_code + local temp_file + temp_file=$(mktemp) + + verbose "Making $method request to: $url" + + # Make request and capture both response and HTTP code + http_code=$(curl -s -w "%{http_code}" -o "$temp_file" \ + -X "$method" \ + "$url" \ + "${curl_args[@]}" \ + 2>/dev/null || echo "000") + + response=$(cat 
"$temp_file") + rm -f "$temp_file" + + verbose "HTTP response code: $http_code" + + # Check for HTTP errors + if [[ "$http_code" -ge 400 ]] || [[ "$http_code" == "000" ]]; then + local error_msg="API request failed (HTTP $http_code)" + if [[ -n "$response" ]]; then + local api_error + api_error=$(echo "$response" | jq -r '.message // .error // empty' 2>/dev/null || echo "") + if [[ -n "$api_error" ]]; then + error_msg="$error_msg: $api_error" + fi + fi + error "$error_msg" + fi + + echo "$response" +} + +####################################### +# List monitoring instances +# Arguments: +# Region filter (optional) +# Output format (table|json) +####################################### +list_monitoring_instances() { + local region_filter="${1:-}" + local format="${2:-table}" + + info "Fetching monitoring instances..." + + local token + token=$(get_iam_token) + + local response + response=$(http_request GET \ + "$RESOURCE_CONTROLLER_API/resource_instances?type=service_instance" \ + -H "Authorization: $token" \ + -H "Content-Type: application/json") + + # Filter for sysdig-monitor instances + local instances + instances=$(echo "$response" | jq -c '[.resources[] | select(.id | contains("sysdig-monitor"))]') + + # Apply region filter if specified + if [[ -n "$region_filter" ]]; then + instances=$(echo "$instances" | jq -c --arg region "$region_filter" '[.[] | select(.region_id == $region)]') + fi + + local count + count=$(echo "$instances" | jq 'length') + + if [[ "$count" -eq 0 ]]; then + warn "No monitoring instances found" + return + fi + + success "Found $count monitoring instance(s)" + echo + + if [[ "$format" == "json" ]]; then + echo "$instances" | jq '.' 
+ else + # Table format + echo "$instances" | jq -r '["NAME", "INSTANCE_ID", "REGION", "STATE"], + (.[] | [.name, .guid, .region_id, .state]) | @tsv' | column -t -s $'\t' + fi +} + +####################################### +# List dashboards in a monitoring instance +# Arguments: +# Instance ID +# Region +# Output format (table|json) +####################################### +list_dashboards() { + local instance_id="$1" + local region="$2" + local format="${3:-table}" + + info "Fetching dashboards from instance $instance_id in region $region..." + + local token + token=$(get_iam_token) + + local endpoint + endpoint=$(get_region_endpoint "$region") + + local response + response=$(http_request GET \ + "$endpoint/api/v3/dashboards" \ + -H "Authorization: $token" \ + -H "IBMInstanceID: $instance_id" \ + -H "Content-Type: application/json") + + local dashboards + dashboards=$(echo "$response" | jq -c '.dashboards // []') + + local count + count=$(echo "$dashboards" | jq 'length') + + if [[ "$count" -eq 0 ]]; then + warn "No dashboards found in this instance" + return + fi + + success "Found $count dashboard(s)" + echo + + if [[ "$format" == "json" ]]; then + echo "$dashboards" | jq '.' + else + # Table format with ID, Name, and Last Updated + echo "$dashboards" | jq -r '["ID", "NAME", "LAST_UPDATED"], + (.[] | [.id, .name, (.modifiedOn | . 
/ 1000 | strftime("%Y-%m-%dT%H:%M:%S %Z"))]) | @tsv' | column -t -s $'\t' + fi +} + +####################################### +# Find dashboard by name +# Arguments: +# Instance ID +# Region +# Dashboard name +# Returns: +# Dashboard ID or empty string +####################################### +find_dashboard_by_name() { + local instance_id="$1" + local region="$2" + local name="$3" + + verbose "Searching for dashboard: $name" + + local token + token=$(get_iam_token) + + local endpoint + endpoint=$(get_region_endpoint "$region") + + local response + response=$(http_request GET \ + "$endpoint/api/v3/dashboards" \ + -H "Authorization: $token" \ + -H "IBMInstanceID: $instance_id" \ + -H "Content-Type: application/json") + + local dashboard_id + dashboard_id=$(echo "$response" | jq -r --arg name "$name" '.dashboards[] | select(.name == $name) | .id') + + echo "$dashboard_id" +} + +####################################### +# Export dashboard to JSON file +# Arguments: +# Instance ID +# Region +# Dashboard ID +# Output file (optional) +####################################### +export_dashboard() { + local instance_id="$1" + local region="$2" + local dashboard_id="$3" + local output_file="${4:-}" + + info "Exporting dashboard $dashboard_id from instance $instance_id..." 
+ + local token + token=$(get_iam_token) + + local endpoint + endpoint=$(get_region_endpoint "$region") + + local response + response=$(http_request GET \ + "$endpoint/api/v3/dashboards/$dashboard_id" \ + -H "Authorization: $token" \ + -H "IBMInstanceID: $instance_id" \ + -H "Content-Type: application/json") + + # Extract dashboard object + local dashboard + dashboard=$(echo "$response" | jq '.dashboard') + + if [[ "$dashboard" == "null" ]]; then + error "Dashboard not found or invalid response" + fi + + # Generate output filename if not provided + if [[ -z "$output_file" ]]; then + local dashboard_name + dashboard_name=$(echo "$dashboard" | jq -r '.name // "dashboard"' | tr ' ' '_' | tr '[:upper:]' '[:lower:]') + local timestamp + timestamp=$(date -u +"%Y%m%d_%H%M%S") + output_file="${dashboard_name}_${timestamp}.json" + fi + + # Save to file with pretty printing + echo "$dashboard" | jq '.' > "$output_file" + + success "Dashboard exported successfully to: $output_file" + + # Show dashboard info + local name + name=$(echo "$dashboard" | jq -r '.name') + echo + echo "Dashboard: $name" + echo "ID: $dashboard_id" + echo "File: $output_file" +} + +####################################### +# Import or update dashboard from JSON file +# Arguments: +# Instance ID +# Region +# JSON file path +# Update mode (true|false) +####################################### +import_dashboard() { + local instance_id="$1" + local region="$2" + local file="$3" + local update_mode="${4:-false}" + + # Validate file exists + if [[ ! -f "$file" ]]; then + error "Dashboard file not found: $file" + fi + + info "Loading dashboard configuration from: $file" + + # Load and validate JSON + local dashboard_config + if ! dashboard_config=$(jq '.' 
"$file" 2>/dev/null); then + error "Invalid JSON in dashboard file: $file" + fi + + # Extract dashboard name + local dashboard_name + dashboard_name=$(echo "$dashboard_config" | jq -r '.name // empty') + + if [[ -z "$dashboard_name" ]]; then + error "Dashboard configuration must include a 'name' field" + fi + + info "Dashboard name: $dashboard_name" + + local token + token=$(get_iam_token) + + local endpoint + endpoint=$(get_region_endpoint "$region") + + # Check if dashboard already exists + info "Checking if dashboard already exists..." + local existing_id + existing_id=$(find_dashboard_by_name "$instance_id" "$region" "$dashboard_name") + + local response + local dashboard_id + local operation + + if [[ -n "$existing_id" ]]; then + if [[ "$update_mode" == "true" ]]; then + info "Dashboard exists (ID: $existing_id). Updating..." + operation="updated" + + # Update existing dashboard + response=$(http_request PUT \ + "$endpoint/api/v3/dashboards/$existing_id" \ + -H "Authorization: $token" \ + -H "IBMInstanceID: $instance_id" \ + -H "Content-Type: application/json" \ + -d "{\"dashboard\": $dashboard_config}") + + dashboard_id="$existing_id" + else + error "Dashboard '$dashboard_name' already exists (ID: $existing_id).\nUse --update flag to update the existing dashboard." + fi + else + info "Dashboard does not exist. Creating new dashboard..." + operation="created" + + # Create new dashboard + response=$(http_request POST \ + "$endpoint/api/v3/dashboards" \ + -H "Authorization: $token" \ + -H "IBMInstanceID: $instance_id" \ + -H "Content-Type: application/json" \ + -d "{\"dashboard\": $dashboard_config}") + + dashboard_id=$(echo "$response" | jq -r '.dashboard.id // empty') + + if [[ -z "$dashboard_id" ]]; then + error "Failed to create dashboard: No ID returned" + fi + fi + + success "Dashboard '$dashboard_name' $operation successfully!" 
+ echo + echo "Dashboard ID: $dashboard_id" + echo "Dashboard URL: $endpoint/#/dashboards/$dashboard_id" +} + +####################################### +# Show usage information +####################################### +show_usage() { + cat << EOF +IBM Cloud Monitoring Dashboard Manager v${SCRIPT_VERSION} + +USAGE: + $SCRIPT_NAME [options] + +COMMANDS: + list-instances List monitoring instances + list-dashboards List dashboards in an instance + export Export a dashboard to JSON file + import Import/create a dashboard from JSON file + help Show this help message + +OPTIONS: + --instance-id ID Monitoring instance ID (GUID) + --region REGION IBM Cloud region + --dashboard-id ID Dashboard ID (for export) + --file FILE JSON file path (for import) + --output FILE Output file path (for export) + --format FORMAT Output format: table or json (default: table) + --update Update existing dashboard (for import) + --verbose Enable verbose output + --version Show version information + +EXAMPLES: + # List all monitoring instances + $SCRIPT_NAME list-instances + + # List instances in a specific region + $SCRIPT_NAME list-instances --region us-south + + # List instances in JSON format + $SCRIPT_NAME list-instances --format json + + # List dashboards in an instance + $SCRIPT_NAME list-dashboards \\ + --instance-id "12345678-1234-1234-1234-123456789abc" \\ + --region us-south + + # Export a dashboard + $SCRIPT_NAME export \\ + --instance-id "12345678-1234-1234-1234-123456789abc" \\ + --region us-south \\ + --dashboard-id 12345 + + # Export with custom output file + $SCRIPT_NAME export \\ + --instance-id "12345678-1234-1234-1234-123456789abc" \\ + --region us-south \\ + --dashboard-id 12345 \\ + --output my-dashboard.json + + # Import a new dashboard + $SCRIPT_NAME import \\ + --instance-id "12345678-1234-1234-1234-123456789abc" \\ + --region us-south \\ + --file dashboard.json + + # Update an existing dashboard + $SCRIPT_NAME import \\ + --instance-id 
"12345678-1234-1234-1234-123456789abc" \\ + --region us-south \\ + --file dashboard.json \\ + --update + +SUPPORTED REGIONS: + ${!REGION_ENDPOINTS[*]} + +ENVIRONMENT VARIABLES: + SYSDIG_INSTANCE_ID Default instance ID + SYSDIG_REGION Default region + SYSDIG_OUTPUT_DIR Default output directory for exports + +AUTHENTICATION: + This script uses the IBM Cloud CLI login context. Ensure you are logged in: + ibmcloud login + +For more information, see the documentation in the setup/ibm-cloud-monitoring directory. +EOF +} + +####################################### +# Show version information +####################################### +show_version() { + echo "IBM Cloud Monitoring Dashboard Manager v${SCRIPT_VERSION}" +} + +####################################### +# Parse command line arguments +####################################### +parse_args() { + local command="${1:-}" + + if [[ -z "$command" ]]; then + show_usage + exit 0 + fi + + shift + + # Parse command + case "$command" in + list-instances) + local region_filter="" + local format="$DEFAULT_FORMAT" + + while [[ $# -gt 0 ]]; do + case "$1" in + --region) + region_filter="$2" + shift 2 + ;; + --format) + format="$2" + shift 2 + ;; + --verbose) + VERBOSE=true + shift + ;; + *) + error "Unknown option: $1" + ;; + esac + done + + check_prerequisites + check_ibmcloud_login + list_monitoring_instances "$region_filter" "$format" + ;; + + list-dashboards) + local instance_id="${SYSDIG_INSTANCE_ID:-}" + local region="${SYSDIG_REGION:-}" + local format="$DEFAULT_FORMAT" + + while [[ $# -gt 0 ]]; do + case "$1" in + --instance-id) + instance_id="$2" + shift 2 + ;; + --region) + region="$2" + shift 2 + ;; + --format) + format="$2" + shift 2 + ;; + --verbose) + VERBOSE=true + shift + ;; + *) + error "Unknown option: $1" + ;; + esac + done + + if [[ -z "$instance_id" ]]; then + error "Instance ID is required. Use --instance-id or set SYSDIG_INSTANCE_ID environment variable." 
+ fi + + if [[ -z "$region" ]]; then + error "Region is required. Use --region or set SYSDIG_REGION environment variable." + fi + + check_prerequisites + check_ibmcloud_login + list_dashboards "$instance_id" "$region" "$format" + ;; + + export) + local instance_id="${SYSDIG_INSTANCE_ID:-}" + local region="${SYSDIG_REGION:-}" + local dashboard_id="" + local output_file="" + + while [[ $# -gt 0 ]]; do + case "$1" in + --instance-id) + instance_id="$2" + shift 2 + ;; + --region) + region="$2" + shift 2 + ;; + --dashboard-id) + dashboard_id="$2" + shift 2 + ;; + --output) + output_file="$2" + shift 2 + ;; + --verbose) + VERBOSE=true + shift + ;; + *) + error "Unknown option: $1" + ;; + esac + done + + if [[ -z "$instance_id" ]]; then + error "Instance ID is required. Use --instance-id or set SYSDIG_INSTANCE_ID environment variable." + fi + + if [[ -z "$region" ]]; then + error "Region is required. Use --region or set SYSDIG_REGION environment variable." + fi + + if [[ -z "$dashboard_id" ]]; then + error "Dashboard ID is required. Use --dashboard-id." + fi + + check_prerequisites + check_ibmcloud_login + export_dashboard "$instance_id" "$region" "$dashboard_id" "$output_file" + ;; + + import) + local instance_id="${SYSDIG_INSTANCE_ID:-}" + local region="${SYSDIG_REGION:-}" + local file="" + local update_mode="false" + + while [[ $# -gt 0 ]]; do + case "$1" in + --instance-id) + instance_id="$2" + shift 2 + ;; + --region) + region="$2" + shift 2 + ;; + --file) + file="$2" + shift 2 + ;; + --update) + update_mode="true" + shift + ;; + --verbose) + VERBOSE=true + shift + ;; + *) + error "Unknown option: $1" + ;; + esac + done + + if [[ -z "$instance_id" ]]; then + error "Instance ID is required. Use --instance-id or set SYSDIG_INSTANCE_ID environment variable." + fi + + if [[ -z "$region" ]]; then + error "Region is required. Use --region or set SYSDIG_REGION environment variable." + fi + + if [[ -z "$file" ]]; then + error "File is required. Use --file." 
+ fi + + check_prerequisites + check_ibmcloud_login + import_dashboard "$instance_id" "$region" "$file" "$update_mode" + ;; + + help|--help|-h) + show_usage + ;; + + version|--version|-v) + show_version + ;; + + *) + error "Unknown command: $command\nRun '$SCRIPT_NAME help' for usage information." + ;; + esac +} + +####################################### +# Main entry point +####################################### +main() { + parse_args "$@" +} + +# Run main function +main "$@" + +# Made with Bob diff --git a/metrics-collector/start.sh b/metrics-collector/start.sh new file mode 100644 index 000000000..a0e9c4ed8 --- /dev/null +++ b/metrics-collector/start.sh @@ -0,0 +1,251 @@ +#!/bin/sh +set -e + +echo "Starting Code Engine Metrics Collector..." + +# Function to read container resource token from filesystem +read_container_token() { + local cr_token_filename="${CR_TOKEN_FILENAME:-/var/run/secrets/codeengine.cloud.ibm.com/compute-resource-token/token}" + + if [ ! -f "$cr_token_filename" ]; then + echo "ERROR: Container resource token file not found at $cr_token_filename" + return 1 + fi + + CR_TOKEN=$(cat "$cr_token_filename") + if [ -z "$CR_TOKEN" ]; then + echo "ERROR: Container resource token is empty" + return 1 + fi + + echo "Container resource token retrieved successfully" + return 0 +} + +# Function to obtain IAM token using container resource token +obtain_iam_token() { + local cr_token="$1" + local trusted_profile_name="$2" + + if [ -z "$cr_token" ] || [ -z "$trusted_profile_name" ]; then + echo "ERROR: Missing required parameters for IAM token retrieval" + return 1 + fi + + # Make the request to IAM token endpoint + local response + response=$(curl --silent --fail -X POST \ + "https://iam.cloud.ibm.com/identity/token" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -H "Accept: application/json" \ + --data-urlencode "grant_type=urn:ibm:params:oauth:grant-type:cr-token" \ + --data-urlencode "cr_token=$cr_token" \ + --data-urlencode 
"profile_name=$trusted_profile_name" 2>&1) + + if [ $? -ne 0 ]; then + echo "ERROR: Failed to obtain IAM token from IBM Cloud" + echo "Response: $response" + return 1 + fi + + # Extract access token from response + IBM_CLOUD_BEARER_TOKEN=$(echo "$response" | grep -o '"access_token":"[^"]*"' | cut -d'"' -f4) + + if [ -z "$IBM_CLOUD_BEARER_TOKEN" ]; then + echo "ERROR: Failed to extract access token from IAM response" + echo "Response: $response" + return 1 + fi + + echo "IAM token obtained successfully" + return 0 +} + +# Function to obtain Monitoring API key using IAM token +obtain_monitoring_apikey() { + local bearer_token="$1" + local monitoring_region="$2" + local monitoring_instance_guid="$3" + + if [ -z "$bearer_token" ] || [ -z "$monitoring_region" ] || [ -z "$monitoring_instance_guid" ]; then + echo "ERROR: Missing required parameters for monitoring API key retrieval" + return 1 + fi + + # Fetch monitoring API key + local response + response=$(curl --silent --fail -X GET \ + "https://${monitoring_region}.monitoring.cloud.ibm.com/api/token" \ + -H "Authorization: Bearer $bearer_token" \ + -H "IBMInstanceID: $monitoring_instance_guid" \ + -H "content-type: application/json" 2>&1) + + if [ $? -ne 0 ]; then + echo "ERROR: Failed to obtain Monitoring API key" + echo "Response: $response" + return 1 + fi + + # Extract the API key from response + MONITORING_API_KEY=$(echo "$response" | grep -o '"key":"[^"]*"' | cut -d'"' -f4) + + if [ -z "$MONITORING_API_KEY" ]; then + echo "ERROR: Failed to extract API key from Monitoring response" + echo "Response: $response" + return 1 + fi + + echo "Monitoring API key obtained successfully" + return 0 +} + +# Main authentication function with fallback logic +authenticate_monitoring() { + echo "Authenticating to IBM Cloud Monitoring..." 
+ + # Check if Trusted Profile authentication is configured + if [ -n "$MONITORING_INSTANCE_GUID" ] && [ -n "$MONITORING_REGION" ] && [ -n "$TRUSTED_PROFILE_NAME" ]; then + echo "Attempting Trusted Profile authentication..." + + # Step 1: Read container resource token + if ! read_container_token; then + echo "WARNING: Failed to read container resource token" + echo "Falling back to mounted secret..." + else + # Step 2: Obtain IAM token + if ! obtain_iam_token "$CR_TOKEN" "$TRUSTED_PROFILE_NAME"; then + echo "WARNING: Failed to obtain IAM token" + echo "Falling back to mounted secret..." + else + # Step 3: Obtain Monitoring API key + if ! obtain_monitoring_apikey "$IBM_CLOUD_BEARER_TOKEN" "$MONITORING_REGION" "$MONITORING_INSTANCE_GUID"; then + echo "WARNING: Failed to obtain Monitoring API key" + echo "Falling back to mounted secret..." + else + # Step 4: Write API key to file + echo "$MONITORING_API_KEY" > /etc/secrets/monitoring-apikey + if [ $? -eq 0 ]; then + echo "Monitoring API key written to /etc/secrets/monitoring-apikey" + return 0 + else + echo "ERROR: Failed to write Monitoring API key to file" + echo "Falling back to mounted secret..." 
+ fi + fi + fi + fi + fi + + # Fallback: Check for mounted secret + if [ -f "/etc/secrets/monitoring-apikey" ]; then + echo "Using mounted monitoring API key secret" + return 0 + fi + + # Neither method available + echo "ERROR: No valid authentication method available" + echo "" + echo "Please configure one of the following:" + echo "" + echo "Option 1: Trusted Profile Authentication (Recommended)" + echo " Set the following environment variables:" + echo " - MONITORING_INSTANCE_GUID" + echo " - MONITORING_REGION" + echo " - TRUSTED_PROFILE_NAME" + echo "" + echo "Option 2: Explicit API Key Secret" + echo " Mount a secret containing your Monitoring API key:" + echo " ibmcloud ce secret create --name monitoring-apikey --from-literal monitoring-apikey=YOUR_API_KEY" + echo " ibmcloud ce job update --name metrics-collector --mount-secret /etc/secrets=monitoring-apikey" + echo "" + return 1 +} + +# Check if METRICS_ENABLED is set to true +if [ "$METRICS_ENABLED" = "true" ]; then + echo "Prometheus metrics export enabled" + + # Authenticate to IBM Cloud Monitoring (Trusted Profile or mounted secret) + if ! authenticate_monitoring; then + exit 0 + fi + + # Check required environment variables + if [ -z "$CE_SUBDOMAIN" ]; then + echo "ERROR: CE_SUBDOMAIN environment variable is required when METRICS_ENABLED=true" + exit 1 + fi + + if [ -z "$METRICS_REMOTE_WRITE_FQDN" ]; then + echo "ERROR: METRICS_REMOTE_WRITE_FQDN environment variable is required when METRICS_ENABLED=true" + exit 1 + fi + + if [ -z "$CE_PROJECT_NAME" ]; then + CE_PROJECT_NAME="default" + fi + + # Generate prometheus.yml from template with environment variable substitution + echo "Generating Prometheus configuration..." + sed -e "s/\${CE_SUBDOMAIN}/$CE_SUBDOMAIN/g" \ + -e "s/\${CE_PROJECT_NAME}/$CE_PROJECT_NAME/g" \ + -e "s/\${METRICS_REMOTE_WRITE_FQDN}/$METRICS_REMOTE_WRITE_FQDN/g" \ + /etc/prometheus/prometheus.yml.template > /tmp/prometheus.yml + + echo "Starting Prometheus agent..." 
+    /bin/prometheus --config.file=/tmp/prometheus.yml --agent --storage.agent.path=/tmp/agent-data --log.level info --log.format json 2>&1 &
+    PROMETHEUS_PID=$!
+    echo "Prometheus agent started with PID $PROMETHEUS_PID"
+
+    # Give Prometheus a moment to start and check if it's actually running
+    sleep 2
+    if ! kill -0 "$PROMETHEUS_PID" 2>/dev/null; then
+        echo "ERROR: Prometheus agent failed to start"
+        exit 1
+    fi
+else
+    echo "Prometheus metrics export disabled (METRICS_ENABLED not set to 'true')"
+fi
+
+# Start the metrics collector
+echo "Starting metrics collector..."
+/app &
+APP_PID=$!
+echo "Metrics collector started with PID $APP_PID"
+
+# Function to handle shutdown
+shutdown() {
+    echo "Shutting down..."
+    if [ -n "$APP_PID" ]; then
+        kill -TERM "$APP_PID" 2>/dev/null || true
+    fi
+    if [ -n "$PROMETHEUS_PID" ]; then
+        kill -TERM "$PROMETHEUS_PID" 2>/dev/null || true
+    fi
+    wait
+    exit 0
+}
+
+# Trap signals
+trap shutdown SIGTERM SIGINT
+
+# Monitor processes
+while true; do
+    # Check if app is still running
+    if ! kill -0 "$APP_PID" 2>/dev/null; then
+        echo "ERROR: Metrics collector process died unexpectedly"
+        if [ -n "$PROMETHEUS_PID" ]; then
+            kill -TERM "$PROMETHEUS_PID" 2>/dev/null || true
+        fi
+        exit 1
+    fi
+
+    # Check if Prometheus is still running (only if it was started)
+    if [ "$METRICS_ENABLED" = "true" ] && !
 kill -0 "$PROMETHEUS_PID" 2>/dev/null; then
+        echo "ERROR: Prometheus agent process died unexpectedly"
+        kill -TERM "$APP_PID" 2>/dev/null || true
+        exit 1
+    fi
+
+    sleep 5
+done
diff --git a/metrics-examples/.ceignore b/metrics-examples/.ceignore
new file mode 100644
index 000000000..ef43a4df7
--- /dev/null
+++ b/metrics-examples/.ceignore
@@ -0,0 +1,3 @@
+node_modules/
+target/
+vendor/
\ No newline at end of file
diff --git a/metrics-examples/README.md b/metrics-examples/README.md
new file mode 100644
index 000000000..c5e7f45d0
--- /dev/null
+++ b/metrics-examples/README.md
@@ -0,0 +1,225 @@
+# Code Engine custom metrics examples
+
+The following samples demonstrate how to emit custom metrics in Code Engine jobs and apps across multiple programming languages.
+
+## Available Languages
+
+This repository provides identical metrics examples in four languages:
+
+- **[Node.js](node/README.md)** - Express + prom-client
+- **[Go](go/README.md)** - Gorilla Mux + prometheus/client_golang
+- **[Java](java/README.md)** - Spring Boot + Micrometer
+- **[Python](python/README.md)** - FastAPI + prometheus-client
+
+All implementations expose the same metrics with identical names and provide the same API endpoints.
+
+## Quick Start
+
+### Build All Languages
+
+```bash
+# Build all language implementations
+REGISTRY=<registry> ./build
+
+# Or build a specific language
+REGISTRY=<registry> LANGUAGE=go ./build
+REGISTRY=<registry> LANGUAGE=java ./build
+REGISTRY=<registry> LANGUAGE=python ./build
+REGISTRY=<registry> LANGUAGE=node ./build
+```
+
+### Deploy to Code Engine
+
+```bash
+ibmcloud ce project select --name <project-name>
+
+# Deploy a specific language
+ibmcloud ce application create \
+  --name metrics-example-app-go \
+  --src ./go \
+  --memory 0.5G \
+  --cpu 0.25 \
+  --port 8080
+
+# Or use the run script (if available)
+./run all
+```
+
+## Language Comparison
+
+| Feature | Node.js | Go | Java | Python |
+|---------|---------|-----|------|--------|
+| **Framework** | Express | Gorilla Mux | Spring Boot | FastAPI |
+| **Metrics Library** | prom-client | prometheus/client_golang | Micrometer | prometheus-client |
+| **Startup Time** | ~1s | <1s | ~5-10s | ~2-3s |
+| **Memory Footprint** | ~150-200 MB | ~20-30 MB | ~200-250 MB | ~100-150 MB |
+| **Image Size** | ~150-200 MB | ~20-30 MB | ~200-250 MB | ~100-150 MB |
+| **Concurrency Model** | Event loop | Goroutines | Threads | Async/await |
+| **Best For** | Rapid development | Performance & efficiency | Enterprise apps | Modern APIs |
+
+## Metrics
+
+All applications expose Prometheus metrics at `/metrics` (port 2112). All metric names are prefixed with a configurable value set via the `METRICS_NAME_PREFIX` environment variable (default: `mymetrics_`).
+ +Once custom metrics scraping is enabled (see asset [metrics-collector](../metrics-collector/README.md)), the following command can be used to import the "My custom Code Engine Metrics" dashboard into IBM Cloud Monitoring: + +```bash +REGION= +MONITORING_INSTANCE_GUID= + +# Load the custom metric dashboard configuration +CE_CUSTOM_METRICS_DASHBOARD=$(curl -sL https://raw.githubusercontent.com/IBM/CodeEngine/metric-collector-v2/metrics-examples/my-custom-code-engine-metrics-dashboard.json) + +# Import the dashboard +curl -X POST https://$REGION.monitoring.cloud.ibm.com/api/v3/dashboards \ + -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" \ + -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" \ + -H "Content-Type: application/json" \ + -d "{\"dashboard\": $CE_CUSTOM_METRICS_DASHBOARD}" +``` + +To customize the prefix, set the environment variable when starting the application: + +```bash +# Node.js +METRICS_NAME_PREFIX=myapp_ node app.mjs + +# Go +METRICS_NAME_PREFIX=myapp_ go run main.go + +# Java +METRICS_NAME_PREFIX=myapp_ java -jar target/metrics-example-1.0.0.jar + +# Python +METRICS_NAME_PREFIX=myapp_ python app.py +``` + +On Code Engine, set the environment variable in the application configuration: + +```bash +ibmcloud ce app update "metrics-example-app-" --env METRICS_NAME_PREFIX=myapp_ +``` + +Following metrics are emitted by all language implementations: + +**Request Metrics** +- `mymetrics_requests_total`: Total requests by method and path + +**Outbound Call Metrics** +- `mymetrics_outbound_request_duration_seconds`: Histogram of outbound request durations +- `mymetrics_outbound_requests_total`: Total outbound requests by target, method, and status + +**Database Metrics** +- `mymetrics_db_query_duration_seconds`: Histogram of query durations by operation and table +- `mymetrics_db_queries_total`: Total queries by operation, table, and status +- `mymetrics_db_connections_active`: Active database connections gauge + +**Compute 
Metrics** +- `mymetrics_compute_duration_seconds`: Histogram of compute operation durations + +**Language-Specific Runtime Metrics** +- **Node.js**: Event loop lag, heap usage, GC stats +- **Go**: Goroutines, memory stats, GC stats +- **Java**: JVM memory, threads, GC, class loading +- **Python**: Process stats, memory, CPU + +## Load Testing + +Generate test traffic using the included script: + +```bash +# Test a specific language locally +LANGUAGE=node ./load-test.sh +LANGUAGE=go ./load-test.sh +LANGUAGE=java ./load-test.sh +LANGUAGE=python ./load-test.sh + +# Test all languages sequentially +LANGUAGE=all ./load-test.sh + +# Test against a remote deployment +TARGET_URL=https://your-app.example.com LANGUAGE=node ./load-test.sh + +# Custom configuration +TARGET_URL=https://your-app.example.com LANGUAGE=go DURATION=120 CONCURRENT_REQUESTS=10 ./load-test.sh +``` + +Configuration options: +- `LANGUAGE`: Target language (node, go, java, python, or all) (default: node) +- `TARGET_URL`: Application endpoint (default: http://localhost:8080) +- `DURATION`: Test duration in seconds (default: 60) +- `CONCURRENT_REQUESTS`: Number of concurrent workers (default: 5) + + +### Deploying httpbin Backend + +To deploy your own httpbin instance on IBM Cloud Code Engine instead of using the public service, use the following command with an image from a registry other than docker.io: + +```bash +ibmcloud ce app create \ + --name httpbin \ + --src https://github.com/mark-sivill/httpbin \ + --memory 0.5G \ + --cpu 0.25 \ + --min-scale 0 \ + --max-scale 3 \ + --concurrency 100 \ + --port 9000 +``` + +After deployment, get the application URL: + +```bash +ibmcloud ce application get --name httpbin --output url +``` + +Then configure any metrics-example-app to use your httpbin instance: + +```bash +# For any language +ibmcloud ce application update \ + --name metrics-example-app- \ + --env HTTPBIN_BASE_URL=https://httpbin.your-project.us-south.codeengine.appdomain.cloud +``` + +## API 
Endpoints + +All language implementations provide identical endpoints: + +- `GET /` - Health check +- `GET /test-db` - Test PostgreSQL connectivity +- `GET /outbound/delay` - Outbound call with random delay (0-2s) and 5% error rate +- `GET /outbound/get` - Simple outbound GET request +- `POST /outbound/post` - Outbound POST request +- `GET /outbound/status/{code}` - Request specific HTTP status code + +Metrics endpoint (port 2112): +- `GET /metrics` - Prometheus metrics (all languages) +- `GET /prometheus` - Alternative metrics endpoint (Java only) + +## Project Structure + +``` +metrics-examples/ +├── node/ # Node.js implementation +├── go/ # Go implementation +├── java/ # Java implementation +├── python/ # Python implementation +├── build # Build script for all languages +├── load-test.sh # Load testing script +└── README.md # This file +``` + +## Contributing + +When adding new features or metrics: + +1. Implement the feature in all four languages +2. Use identical metric names across all implementations +3. Maintain consistent API endpoints +4. Update all language-specific README files +5. Test with the load-test.sh script + +## License + +See individual language directories for specific dependencies and licenses. diff --git a/metrics-examples/build b/metrics-examples/build new file mode 100644 index 000000000..c3a282f30 --- /dev/null +++ b/metrics-examples/build @@ -0,0 +1,55 @@ +#!/bin/bash + +# Env Vars: +# REGISTRY: name of the image registry/namespace to store the images +# NOCACHE: set this to "--no-cache" to turn off the Docker build cache +# LANGUAGE: specific language to build (node, go, java, python) or "all" for all languages +# +# NOTE: to run this you MUST set the REGISTRY environment variable to +# your own image registry/namespace otherwise the `docker push` commands +# will fail due to an auth failure. Which means, you also need to be logged +# into that registry before you run it. 
+ +set -ex +export REGISTRY=${REGISTRY:-icr.io/codeengine} +export LANGUAGE=${LANGUAGE:-all} + +# Function to build and push a language-specific image +build_language() { + local lang=$1 + echo "Building ${lang} application..." + cd ${lang} + docker build ${NOCACHE} -t ${REGISTRY}/metrics-example-app-${lang} -f Dockerfile . --platform linux/amd64 + docker push ${REGISTRY}/metrics-example-app-${lang} + cd .. + echo "✓ ${lang} application built and pushed successfully" +} + +# Build based on LANGUAGE parameter +case ${LANGUAGE} in + node) + build_language "node" + ;; + go) + build_language "go" + ;; + java) + build_language "java" + ;; + python) + build_language "python" + ;; + all) + echo "Building all language implementations..." + build_language "node" + build_language "go" + build_language "java" + build_language "python" + echo "✓ All applications built and pushed successfully" + ;; + *) + echo "Error: Unknown language '${LANGUAGE}'" + echo "Usage: LANGUAGE= ./build" + exit 1 + ;; +esac diff --git a/metrics-examples/go/Dockerfile b/metrics-examples/go/Dockerfile new file mode 100644 index 000000000..afcd58242 --- /dev/null +++ b/metrics-examples/go/Dockerfile @@ -0,0 +1,11 @@ +FROM quay.io/projectquay/golang:1.25 AS build-env +WORKDIR /go/src/app +COPY . . +RUN CGO_ENABLED=0 go build -o /go/bin/app main.go + +# Runtime stage using distroless +FROM gcr.io/distroless/static-debian12:nonroot +COPY --from=build-env /go/bin/app / +USER nonroot:nonroot +EXPOSE 8080 2112 +ENTRYPOINT ["/app"] diff --git a/metrics-examples/go/README.md b/metrics-examples/go/README.md new file mode 100644 index 000000000..671a92d3b --- /dev/null +++ b/metrics-examples/go/README.md @@ -0,0 +1,198 @@ +# Code Engine custom metrics examples for Go + +This application helps debug connectivity issues for IBM Cloud Services and provides comprehensive monitoring through Prometheus metrics. 
It includes outbound HTTP call simulation, database connectivity testing, and compute-intensive workload simulation. + +## Features + +- **Outbound HTTP Calls**: Configurable endpoints that simulate delays and error responses to httpbin.org-compatible backends +- **Database Testing**: PostgreSQL connectivity verification with instrumented queries +- **Prometheus Metrics**: Comprehensive instrumentation for requests, outbound calls, database operations, and compute workloads +- **Load Testing**: Included shell script for generating realistic traffic patterns +- **Graceful Shutdown**: Proper cleanup of database connections and HTTP servers + +## Quick Start + +### Deploy to IBM Cloud Code Engine + +Deploy the application with recommended resource settings: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-go \ + --src "." \ + --memory 0.5G \ + --cpu 0.25 \ + --port 8080 +``` + +To configure environment variables during deployment: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-go \ + --src "." 
\ + --memory 0.5G \ + --cpu 0.25 \ + --env HTTPBIN_BASE_URL=https://httpbin.org \ + --env METRICS_NAME_PREFIX=mymetrics_ \ + --env METRICS_COLLECT_GO_METRICS_ENABLED=true +``` + +Update an existing application: + +```bash +ibmcloud ce application update \ + --name metrics-example-app-go \ + --env HTTPBIN_BASE_URL=https://custom-backend.example.com \ + --env METRICS_COLLECT_GO_METRICS_ENABLED=true +``` + +### Run Locally + +Pull and run with Docker: +```bash +docker pull icr.io/codeengine/metrics-example-app-go +docker run -p 8080:8080 -p 2112:2112 icr.io/codeengine/metrics-example-app-go +``` + +Or run from source: +```bash +go mod download +go run main.go +``` + +The application exposes two servers: +- Main application: `http://localhost:8080` +- Metrics endpoint: `http://localhost:2112/metrics` + +## Configuration + +### Environment Variables + +- `PORT`: Application server port (default: 8080) +- `HTTPBIN_BASE_URL`: Backend URL for outbound calls (default: `https://httpbin.org`) +- `METRICS_NAME_PREFIX`: Prefix for all Prometheus metrics (default: `mymetrics_`) +- `METRICS_COLLECT_GO_METRICS_ENABLED`: Enable Go runtime and process metrics collection (default: false, set to `true` to enable) +- `DATABASES_FOR_POSTGRESQL_CONNECTION`: PostgreSQL connection credentials (JSON format) + +### Service Bindings + +For database connectivity, create a Code Engine service binding between your project and the IBM Cloud service. See [Working with service bindings](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding) for details. 
+ +## API Endpoints + +- `GET /` - Health check +- `GET /test-db` - Test PostgreSQL connectivity +- `GET /outbound/delay` - Outbound call with random delay (0-2s) and 5% error rate +- `GET /outbound/get` - Simple outbound GET request +- `POST /outbound/post` - Outbound POST request +- `GET /outbound/status/:code` - Request specific HTTP status code + +All outbound endpoints include simulated compute-intensive data processing (0-3s duration, 40-80% CPU intensity). + +## Metrics + +The application exposes Prometheus metrics at `/metrics` (port 2112). All metric names are prefixed with a configurable value set via the `METRICS_NAME_PREFIX` environment variable (default: `mymetrics_`). + +**Request Metrics** +- `mymetrics_requests_total`: Total requests by method and path + +**Outbound Call Metrics** +- `mymetrics_outbound_request_duration_seconds`: Histogram of outbound request durations +- `mymetrics_outbound_requests_total`: Total outbound requests by target, method, and status + +**Database Metrics** +- `mymetrics_db_query_duration_seconds`: Histogram of query durations by operation and table +- `mymetrics_db_queries_total`: Total queries by operation, table, and status +- `mymetrics_db_connections_active`: Active database connections gauge + +**Compute Metrics** +- `mymetrics_compute_duration_seconds`: Histogram of compute operation durations + +**Go Runtime Metrics** (collected when `METRICS_COLLECT_GO_METRICS_ENABLED=true`) +- `go_goroutines`: Number of goroutines that currently exist +- `go_threads`: Number of OS threads created +- `go_info`: Information about the Go environment +- `go_memstats_*`: Detailed memory statistics (alloc, heap, stack, GC, etc.) 
+- `go_gc_duration_seconds`: GC invocation durations + +**Process Metrics** (collected when `METRICS_COLLECT_GO_METRICS_ENABLED=true`) +- `process_cpu_seconds_total`: Total user and system CPU time spent in seconds +- `process_open_fds`: Number of open file descriptors +- `process_max_fds`: Maximum number of open file descriptors +- `process_virtual_memory_bytes`: Virtual memory size in bytes +- `process_resident_memory_bytes`: Resident memory size in bytes +- `process_start_time_seconds`: Start time of the process since unix epoch + +## Development + +### Prerequisites + +- Go 1.22 or later +- Docker (for containerized builds) + +### Building + +```bash +# Build binary +go build -o metrics-example-app + +# Build Docker image +docker build -t metrics-example-app-go . + +# Run tests +go test ./... +``` + +### Project Structure + +``` +go/ +├── main.go # Application entry point +├── go.mod # Go module definition +├── go.sum # Dependency checksums +├── Dockerfile # Multi-stage Docker build +├── README.md # This file +└── internal/ + ├── db/ + │ └── postgres.go # PostgreSQL connection handling + ├── handlers/ + │ └── handlers.go # HTTP request handlers + └── metrics/ + └── metrics.go # Prometheus metrics definitions +``` + +## Performance Characteristics + +- **Startup Time**: < 1 second +- **Memory Footprint**: ~20-30 MB +- **Image Size**: ~20-30 MB (distroless) +- **Concurrency**: Native goroutine-based, highly concurrent +- **CPU Efficiency**: Compiled binary, excellent performance + +## Troubleshooting + +### Database Connection Issues + +If you encounter database connection errors: + +1. Verify the `DATABASES_FOR_POSTGRESQL_CONNECTION` environment variable is set correctly +2. Check that the service binding is properly configured +3. Ensure the certificate is valid and properly base64 encoded +4. 
Verify network connectivity to the PostgreSQL instance + +### Import Errors + +If you see import errors during development: + +```bash +go mod tidy +go mod download +``` + +### Build Issues + +For build issues with CGO: + +```bash +CGO_ENABLED=0 go build diff --git a/metrics-examples/go/go.mod b/metrics-examples/go/go.mod new file mode 100644 index 000000000..fb2ad8137 --- /dev/null +++ b/metrics-examples/go/go.mod @@ -0,0 +1,25 @@ +module github.com/IBM/CodeEngine/metrics-examples/go + +go 1.25.0 + +require ( + github.com/gorilla/mux v1.8.1 + github.com/jackc/pgx/v5 v5.5.5 + github.com/prometheus/client_golang v1.19.0 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect + github.com/jackc/puddle/v2 v2.2.1 // indirect + github.com/prometheus/client_model v0.5.0 // indirect + github.com/prometheus/common v0.48.0 // indirect + github.com/prometheus/procfs v0.12.0 // indirect + golang.org/x/crypto v0.17.0 // indirect + golang.org/x/sync v0.3.0 // indirect + golang.org/x/sys v0.16.0 // indirect + golang.org/x/text v0.14.0 // indirect + google.golang.org/protobuf v1.32.0 // indirect +) diff --git a/metrics-examples/go/go.sum b/metrics-examples/go/go.sum new file mode 100644 index 000000000..f469bb0de --- /dev/null +++ b/metrics-examples/go/go.sum @@ -0,0 +1,48 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= +github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.5.5 h1:amBjrZVmksIdNjxGW/IiIMzxMKZFelXbUoPNb+8sjQw= +github.com/jackc/pgx/v5 v5.5.5/go.mod h1:ez9gk+OAat140fv9ErkZDYFWmXLfV+++K0uAOiwgm1A= +github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk= +github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.0 h1:ygXvpU1AoN1MhdzckN+PyD9QJOSD4x7kmXYlnfbA6JU= +github.com/prometheus/client_golang v1.19.0/go.mod h1:ZRM9uEAypZakd+q/x7+gmsvXdURP+DABIEIjnmDdp+k= +github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= +github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= +github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE= +github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc= +github.com/prometheus/procfs v0.12.0 
h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= +github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= +google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/metrics-examples/go/internal/db/postgres.go b/metrics-examples/go/internal/db/postgres.go new file mode 100644 index 000000000..687f2bb2a --- /dev/null +++ b/metrics-examples/go/internal/db/postgres.go @@ -0,0 +1,132 @@ 
+package db + +import ( + "context" + "crypto/tls" + "crypto/x509" + "encoding/base64" + "encoding/json" + "fmt" + "log" + "os" + "time" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" +) + +var dbPool *pgxpool.Pool + +// PostgresCredentials represents the structure of the service binding credentials +type PostgresCredentials struct { + CLI struct { + Environment struct { + PGPASSWORD string `json:"PGPASSWORD"` + } `json:"environment"` + } `json:"cli"` + Postgres struct { + Authentication struct { + Username string `json:"username"` + } `json:"authentication"` + Certificate struct { + CertificateBase64 string `json:"certificate_base64"` + } `json:"certificate"` + Database string `json:"database"` + Hosts []struct { + Hostname string `json:"hostname"` + Port int `json:"port"` + } `json:"hosts"` + } `json:"postgres"` +} + +// GetDBPool returns a connection pool to PostgreSQL +func GetDBPool(ctx context.Context) (*pgxpool.Pool, error) { + if dbPool != nil { + return dbPool, nil + } + + pgServiceCredentials := os.Getenv("DATABASES_FOR_POSTGRESQL_CONNECTION") + if pgServiceCredentials == "" { + return nil, fmt.Errorf("DATABASES_FOR_POSTGRESQL_CONNECTION not set") + } + + log.Println("Connecting to PostgreSQL instance...") + + var creds PostgresCredentials + if err := json.Unmarshal([]byte(pgServiceCredentials), &creds); err != nil { + return nil, fmt.Errorf("failed to parse credentials: %w", err) + } + + // Decode certificate + certBytes, err := base64.StdEncoding.DecodeString(creds.Postgres.Certificate.CertificateBase64) + if err != nil { + return nil, fmt.Errorf("failed to decode certificate: %w", err) + } + + // Create certificate pool + certPool := x509.NewCertPool() + if !certPool.AppendCertsFromPEM(certBytes) { + return nil, fmt.Errorf("failed to append certificate") + } + + // Build connection string + connString := fmt.Sprintf( + "postgres://%s:%s@%s:%d/%s?sslmode=require", + creds.Postgres.Authentication.Username, + 
creds.CLI.Environment.PGPASSWORD, + creds.Postgres.Hosts[0].Hostname, + creds.Postgres.Hosts[0].Port, + creds.Postgres.Database, + ) + + // Configure connection pool + config, err := pgxpool.ParseConfig(connString) + if err != nil { + return nil, fmt.Errorf("failed to parse config: %w", err) + } + + config.ConnConfig.TLSConfig = &tls.Config{ + RootCAs: certPool, + ServerName: creds.Postgres.Hosts[0].Hostname, + } + config.MaxConns = 10 + config.MinConns = 2 + config.MaxConnLifetime = time.Hour + config.MaxConnIdleTime = 30 * time.Minute + config.HealthCheckPeriod = time.Minute + + // Create pool + pool, err := pgxpool.NewWithConfig(ctx, config) + if err != nil { + return nil, fmt.Errorf("failed to create pool: %w", err) + } + + // Test connection + if err := pool.Ping(ctx); err != nil { + pool.Close() + return nil, fmt.Errorf("failed to ping database: %w", err) + } + + dbPool = pool + log.Println("Successfully connected to PostgreSQL") + return dbPool, nil +} + +// ExecuteQuery executes a query and returns the result +func ExecuteQuery(ctx context.Context, query string) (pgx.Rows, error) { + if dbPool == nil { + return nil, fmt.Errorf("database pool not initialized") + } + + return dbPool.Query(ctx, query) +} + +// Close closes the database connection pool +func Close() { + if dbPool != nil { + dbPool.Close() + log.Println("DB connection closed") + } +} + +// Made with Bob diff --git a/metrics-examples/go/internal/handlers/handlers.go b/metrics-examples/go/internal/handlers/handlers.go new file mode 100644 index 000000000..75781c514 --- /dev/null +++ b/metrics-examples/go/internal/handlers/handlers.go @@ -0,0 +1,262 @@ +package handlers + +import ( + "encoding/json" + "fmt" + "io" + "log" + "math" + "math/rand" + "net/http" + "os" + "strconv" + "time" + + "github.com/IBM/CodeEngine/metrics-examples/go/internal/db" + "github.com/IBM/CodeEngine/metrics-examples/go/internal/metrics" + "github.com/gorilla/mux" +) + +var httpbinBaseURL = getHTTPBinURL() + +func 
getHTTPBinURL() string { + url := os.Getenv("HTTPBIN_BASE_URL") + if url == "" { + url = "https://httpbin.org" + } + return url +} + +// SimulateCompute performs CPU-intensive work for the specified duration +func SimulateCompute(durationSeconds float64, cpuIntensity float64) { + startTime := time.Now() + endTime := startTime.Add(time.Duration(durationSeconds * float64(time.Second))) + + for time.Now().Before(endTime) { + // Perform CPU work + workIterations := int(cpuIntensity * 1000) + for i := 0; i < workIterations; i++ { + _ = math.Sqrt(rand.Float64() * 1000000) + } + + // Small sleep to control CPU usage + sleepTime := time.Duration((100-cpuIntensity)/10) * time.Millisecond + time.Sleep(sleepTime) + } +} + +// MakeOutboundCall makes an HTTP request and records metrics +func MakeOutboundCall(endpoint, method string) map[string]interface{} { + url := httpbinBaseURL + endpoint + startTime := time.Now() + + req, err := http.NewRequest(method, url, nil) + if err != nil { + duration := time.Since(startTime).Seconds() + metrics.OutboundRequestDuration.WithLabelValues(httpbinBaseURL, method, "error").Observe(duration) + metrics.OutboundRequestsTotal.WithLabelValues(httpbinBaseURL, method, "error").Inc() + return map[string]interface{}{ + "success": false, + "error": err.Error(), + "duration": duration, + } + } + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + duration := time.Since(startTime).Seconds() + metrics.OutboundRequestDuration.WithLabelValues(httpbinBaseURL, method, "error").Observe(duration) + metrics.OutboundRequestsTotal.WithLabelValues(httpbinBaseURL, method, "error").Inc() + return map[string]interface{}{ + "success": false, + "error": err.Error(), + "duration": duration, + } + } + defer resp.Body.Close() + + duration := time.Since(startTime).Seconds() + statusCode := strconv.Itoa(resp.StatusCode) + + body, _ := io.ReadAll(resp.Body) + + metrics.OutboundRequestDuration.WithLabelValues(httpbinBaseURL, 
method, statusCode).Observe(duration) + metrics.OutboundRequestsTotal.WithLabelValues(httpbinBaseURL, method, statusCode).Inc() + + return map[string]interface{}{ + "success": true, + "status": resp.StatusCode, + "duration": duration, + "data": string(body), + } +} + +// HealthHandler handles the root health check endpoint +func HealthHandler(w http.ResponseWriter, r *http.Request) { + appName := os.Getenv("CE_APP") + if appName == "" { + appName = "metrics-example-app" + } + fmt.Fprintf(w, "app '%s' is ready!", appName) +} + +// TestDBHandler tests database connectivity +func TestDBHandler(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + pool, err := db.GetDBPool(ctx) + if err != nil { + http.Error(w, fmt.Sprintf("Could not connect to postgres instance: %v", err), http.StatusInternalServerError) + return + } + + metrics.DBConnectionsActive.Inc() + defer metrics.DBConnectionsActive.Dec() + + // Execute query with metrics + startTime := time.Now() + status := "success" + + query := "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE'" + rows, err := pool.Query(ctx, query) + if err != nil { + status = "error" + duration := time.Since(startTime).Seconds() + metrics.DBQueryDuration.WithLabelValues("SELECT", "INFORMATION_SCHEMA.TABLES", status).Observe(duration) + metrics.DBQueriesTotal.WithLabelValues("SELECT", "INFORMATION_SCHEMA.TABLES", status).Inc() + http.Error(w, fmt.Sprintf("Could not connect to postgres instance: '%v'", err), http.StatusInternalServerError) + return + } + defer rows.Close() + + duration := time.Since(startTime).Seconds() + metrics.DBQueryDuration.WithLabelValues("SELECT", "INFORMATION_SCHEMA.TABLES", status).Observe(duration) + metrics.DBQueriesTotal.WithLabelValues("SELECT", "INFORMATION_SCHEMA.TABLES", status).Inc() + + log.Printf("Successfully queried database in %.3fs", duration) + w.WriteHeader(http.StatusOK) + fmt.Fprint(w, "Successfully connected to postgres instance") +} + +// 
OutboundDelayHandler handles requests with random delays and errors +func OutboundDelayHandler(w http.ResponseWriter, r *http.Request) { + // Random delay between 0-2 seconds + delay := rand.Float64() * 2 + + // 5% error rate + shouldError := rand.Float64() < 0.05 + + var result map[string]interface{} + if shouldError { + result = MakeOutboundCall("/status/500", "GET") + } else { + result = MakeOutboundCall(fmt.Sprintf("/delay/%.1f", delay), "GET") + } + + // Simulate compute-intensive data handling + computeStart := time.Now() + computeDuration := rand.Float64() * 3 // 0-3 seconds + cpuIntensity := 40 + rand.Float64()*40 // 40-80% + SimulateCompute(computeDuration, cpuIntensity) + actualComputeDuration := time.Since(computeStart).Seconds() + metrics.ComputeDuration.WithLabelValues("data_processing").Observe(actualComputeDuration) + + response := map[string]interface{}{ + "message": "Outbound call completed", + "delay": delay, + "outboundCall": result, + "computeTime": actualComputeDuration, + "cpuIntensity": fmt.Sprintf("%.1f%%", cpuIntensity), + } + + if shouldError { + response["message"] = "Simulated error response" + w.WriteHeader(http.StatusInternalServerError) + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) +} + +// OutboundGetHandler handles simple GET requests +func OutboundGetHandler(w http.ResponseWriter, r *http.Request) { + result := MakeOutboundCall("/get", "GET") + + // Simulate compute-intensive data handling + computeStart := time.Now() + computeDuration := rand.Float64() * 3 + cpuIntensity := 40 + rand.Float64()*40 + SimulateCompute(computeDuration, cpuIntensity) + actualComputeDuration := time.Since(computeStart).Seconds() + metrics.ComputeDuration.WithLabelValues("data_processing").Observe(actualComputeDuration) + + response := map[string]interface{}{ + "message": "Outbound GET call completed", + "outboundCall": result, + "computeTime": actualComputeDuration, + "cpuIntensity": 
fmt.Sprintf("%.1f%%", cpuIntensity), + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) +} + +// OutboundPostHandler handles POST requests +func OutboundPostHandler(w http.ResponseWriter, r *http.Request) { + result := MakeOutboundCall("/post", "POST") + + // Simulate compute-intensive data handling + computeStart := time.Now() + computeDuration := rand.Float64() * 3 + cpuIntensity := 40 + rand.Float64()*40 + SimulateCompute(computeDuration, cpuIntensity) + actualComputeDuration := time.Since(computeStart).Seconds() + metrics.ComputeDuration.WithLabelValues("data_processing").Observe(actualComputeDuration) + + response := map[string]interface{}{ + "message": "Outbound POST call completed", + "outboundCall": result, + "computeTime": actualComputeDuration, + "cpuIntensity": fmt.Sprintf("%.1f%%", cpuIntensity), + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) +} + +// OutboundStatusHandler handles requests for specific status codes +func OutboundStatusHandler(w http.ResponseWriter, r *http.Request) { + vars := mux.Vars(r) + statusCode := vars["code"] + + result := MakeOutboundCall("/status/"+statusCode, "GET") + + // Simulate compute-intensive data handling + computeStart := time.Now() + computeDuration := rand.Float64() * 3 + cpuIntensity := 40 + rand.Float64()*40 + SimulateCompute(computeDuration, cpuIntensity) + actualComputeDuration := time.Since(computeStart).Seconds() + metrics.ComputeDuration.WithLabelValues("data_processing").Observe(actualComputeDuration) + + response := map[string]interface{}{ + "message": "Outbound call completed", + "requestedStatus": statusCode, + "outboundCall": result, + "computeTime": actualComputeDuration, + "cpuIntensity": fmt.Sprintf("%.1f%%", cpuIntensity), + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) +} + +// MetricsMiddleware records request metrics +func MetricsMiddleware(next 
http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + metrics.RequestsTotal.WithLabelValues(r.Method, r.URL.Path).Inc() + next.ServeHTTP(w, r) + }) +} + +// Made with Bob diff --git a/metrics-examples/go/internal/metrics/metrics.go b/metrics-examples/go/internal/metrics/metrics.go new file mode 100644 index 000000000..4eedb698d --- /dev/null +++ b/metrics-examples/go/internal/metrics/metrics.go @@ -0,0 +1,117 @@ +package metrics + +import ( + "os" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + prefix = getMetricsPrefix() + + // RequestsTotal tracks total requests by method and path + RequestsTotal *prometheus.CounterVec + + // OutboundRequestDuration tracks duration of outbound HTTP requests + OutboundRequestDuration *prometheus.HistogramVec + + // OutboundRequestsTotal tracks total outbound HTTP requests + OutboundRequestsTotal *prometheus.CounterVec + + // DBQueryDuration tracks duration of database queries + DBQueryDuration *prometheus.HistogramVec + + // DBQueriesTotal tracks total database queries + DBQueriesTotal *prometheus.CounterVec + + // DBConnectionsActive tracks active database connections + DBConnectionsActive prometheus.Gauge + + // ComputeDuration tracks duration of compute-intensive operations + ComputeDuration *prometheus.HistogramVec +) + +func getMetricsPrefix() string { + prefix := os.Getenv("METRICS_NAME_PREFIX") + if prefix == "" { + prefix = "mymetrics_" + } + return prefix +} + +// RegisterMetrics registers all application-specific metrics with the provided registry +func RegisterMetrics(reg *prometheus.Registry) { + // RequestsTotal tracks total requests by method and path + RequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: prefix + "requests_total", + Help: "Total number of requests", + }, + []string{"method", "path"}, + ) + reg.MustRegister(RequestsTotal) + + // OutboundRequestDuration tracks duration of outbound HTTP requests + 
OutboundRequestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: prefix + "outbound_request_duration_seconds", + Help: "Duration of outbound HTTP requests in seconds", + Buckets: []float64{0.1, 0.5, 1, 2, 5, 10}, + }, + []string{"target", "method", "status_code"}, + ) + reg.MustRegister(OutboundRequestDuration) + + // OutboundRequestsTotal tracks total outbound HTTP requests + OutboundRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: prefix + "outbound_requests_total", + Help: "Total number of outbound HTTP requests", + }, + []string{"target", "method", "status_code"}, + ) + reg.MustRegister(OutboundRequestsTotal) + + // DBQueryDuration tracks duration of database queries + DBQueryDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: prefix + "db_query_duration_seconds", + Help: "Duration of database queries in seconds", + Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 2, 5}, + }, + []string{"operation", "table", "status"}, + ) + reg.MustRegister(DBQueryDuration) + + // DBQueriesTotal tracks total database queries + DBQueriesTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: prefix + "db_queries_total", + Help: "Total number of database queries", + }, + []string{"operation", "table", "status"}, + ) + reg.MustRegister(DBQueriesTotal) + + // DBConnectionsActive tracks active database connections + DBConnectionsActive = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: prefix + "db_connections_active", + Help: "Number of active database connections", + }, + ) + reg.MustRegister(DBConnectionsActive) + + // ComputeDuration tracks duration of compute-intensive operations + ComputeDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: prefix + "compute_duration_seconds", + Help: "Duration of compute-intensive operations in seconds", + Buckets: []float64{0.5, 1, 2, 3, 5}, + }, + []string{"operation"}, + ) + reg.MustRegister(ComputeDuration) +} + +// Made with Bob 
diff --git a/metrics-examples/go/main.go b/metrics-examples/go/main.go new file mode 100644 index 000000000..d9de0d827 --- /dev/null +++ b/metrics-examples/go/main.go @@ -0,0 +1,130 @@ +package main + +import ( + "context" + "log" + "net/http" + "os" + "os/signal" + "syscall" + "time" + + "github.com/IBM/CodeEngine/metrics-examples/go/internal/db" + "github.com/IBM/CodeEngine/metrics-examples/go/internal/handlers" + "github.com/IBM/CodeEngine/metrics-examples/go/internal/metrics" + "github.com/gorilla/mux" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +func main() { + // Get configuration from environment + port := os.Getenv("PORT") + if port == "" { + port = "8080" + } + metricsPort := "2112" + + httpbinURL := os.Getenv("HTTPBIN_BASE_URL") + if httpbinURL == "" { + httpbinURL = "https://httpbin.org" + } + + // Create main application router + router := mux.NewRouter() + router.Use(handlers.MetricsMiddleware) + + // Register application routes + router.HandleFunc("/", handlers.HealthHandler).Methods("GET") + router.HandleFunc("/test-db", handlers.TestDBHandler).Methods("GET") + router.HandleFunc("/outbound/delay", handlers.OutboundDelayHandler).Methods("GET") + router.HandleFunc("/outbound/get", handlers.OutboundGetHandler).Methods("GET") + router.HandleFunc("/outbound/post", handlers.OutboundPostHandler).Methods("POST") + router.HandleFunc("/outbound/status/{code}", handlers.OutboundStatusHandler).Methods("GET") + + // Create main application server + appServer := &http.Server{ + Addr: ":" + port, + Handler: router, + ReadTimeout: 30 * time.Second, + WriteTimeout: 30 * time.Second, + IdleTimeout: 120 * time.Second, + } + + // Create a new registry for metrics + reg := prometheus.NewRegistry() + + // Register default Go runtime and process metrics if enabled + if os.Getenv("METRICS_COLLECT_GO_METRICS_ENABLED") == "true" { + // Register 
default Go runtime metrics (memory, goroutines, GC, etc.) + reg.MustRegister(collectors.NewGoCollector()) + + // Register process metrics (CPU, memory, file descriptors, etc.) + reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + + log.Println("Go runtime and process metrics collection enabled") + } + + // Register application-specific metrics + metrics.RegisterMetrics(reg) + + // Create metrics server with custom registry + metricsRouter := mux.NewRouter() + metricsRouter.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{ + EnableOpenMetrics: true, + })) + metricsServer := &http.Server{ + Addr: ":" + metricsPort, + Handler: metricsRouter, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + IdleTimeout: 60 * time.Second, + } + + // Start servers in goroutines + go func() { + log.Printf("Application server is running at http://localhost:%s", port) + log.Printf("Configured httpbin backend: %s", httpbinURL) + if err := appServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("Application server error: %v", err) + } + }() + + go func() { + log.Printf("Metrics server is running at http://localhost:%s", metricsPort) + if err := metricsServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("Metrics server error: %v", err) + } + }() + + // Wait for interrupt signal to gracefully shutdown the servers + quit := make(chan os.Signal, 1) + signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) + <-quit + + log.Println("Shutting down servers...") + + // Create shutdown context with timeout + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // Close database connections + db.Close() + + // Shutdown metrics server + if err := metricsServer.Shutdown(ctx); err != nil { + log.Printf("Metrics server shutdown error: %v", err) + } else { + log.Println("Metrics server closed") + } + + // Shutdown application server + if err := 
appServer.Shutdown(ctx); err != nil { + log.Printf("Application server shutdown error: %v", err) + } else { + log.Println("Application server closed") + } + + log.Println("Servers exited") +} diff --git a/metrics-examples/java/Dockerfile b/metrics-examples/java/Dockerfile new file mode 100644 index 000000000..0d70af551 --- /dev/null +++ b/metrics-examples/java/Dockerfile @@ -0,0 +1,19 @@ +# Download dependencies and compile in builder stage +FROM registry.access.redhat.com/ubi9/openjdk-21 AS builder + +COPY --chown=${UID} . /src +WORKDIR /src +RUN mvn package -Dmaven.test.skip=true + +# Runtime stage using distroless +FROM gcr.io/distroless/java21-debian12:nonroot + +# Copy the JAR from builder +COPY --chown=1001:0 --from=builder /src/target/*.jar /app/app.jar + +USER 1001:0 +WORKDIR /app +EXPOSE 8080 2112 + +# Run the application +CMD ["app.jar"] diff --git a/metrics-examples/java/README.md b/metrics-examples/java/README.md new file mode 100644 index 000000000..49190584f --- /dev/null +++ b/metrics-examples/java/README.md @@ -0,0 +1,252 @@ +# Code Engine custom metrics examples for Java + +This application helps debug connectivity issues for IBM Cloud Services and provides comprehensive monitoring through Prometheus metrics. It includes outbound HTTP call simulation, database connectivity testing, and compute-intensive workload simulation. 
+ +## Features + +- **Outbound HTTP Calls**: Configurable endpoints that simulate delays and error responses to httpbin.org-compatible backends +- **Database Testing**: PostgreSQL connectivity verification with instrumented queries +- **Prometheus Metrics**: Comprehensive instrumentation for requests, outbound calls, database operations, and compute workloads using Micrometer +- **Load Testing**: Included shell script for generating realistic traffic patterns +- **Spring Boot**: Enterprise-grade framework with auto-configuration and dependency injection +- **Reactive HTTP Client**: Non-blocking WebClient for efficient outbound calls + +## Quick Start + +### Deploy to IBM Cloud Code Engine + +Deploy the application with recommended resource settings: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-java \ + --src "." \ + --memory 0.5G \ + --cpu 0.25 \ + --port 8080 +``` + +To configure environment variables during deployment: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-java \ + --src "." 
\ + --memory 0.5G \ + --cpu 0.25 \ + --env HTTPBIN_BASE_URL=https://httpbin.org \ + --env METRICS_NAME_PREFIX=mymetrics_ +``` + +Update an existing application: + +```bash +ibmcloud ce application update \ + --name metrics-example-app-java \ + --env HTTPBIN_BASE_URL=https://custom-backend.example.com +``` + +### Run Locally + +Pull and run with Docker: +```bash +docker pull icr.io/codeengine/metrics-example-app-java +docker run -p 8080:8080 -p 2112:2112 icr.io/codeengine/metrics-example-app-java +``` + +Or run from source: +```bash +mvn spring-boot:run +``` + +Or build and run the JAR: +```bash +mvn clean package +java -jar target/metrics-example-1.0.0.jar +``` + +The application exposes two servers: +- Main application: `http://localhost:8080` +- Metrics endpoint: `http://localhost:2112/prometheus` (or `/metrics`) +- Health check: `http://localhost:2112/health` + +## Configuration + +### Environment Variables + +- `PORT`: Application server port (default: 8080) +- `HTTPBIN_BASE_URL`: Backend URL for outbound calls (default: `https://httpbin.org`) +- `METRICS_NAME_PREFIX`: Prefix for all Prometheus metrics (default: `mymetrics_`) +- `DATABASES_FOR_POSTGRESQL_CONNECTION`: PostgreSQL connection credentials (JSON format) +- `CE_APP`: Application name for health check response + +### Service Bindings + +For database connectivity, create a Code Engine service binding between your project and the IBM Cloud service. See [Working with service bindings](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding) for details. 
+ +## API Endpoints + +- `GET /` - Health check +- `GET /test-db` - Test PostgreSQL connectivity +- `GET /outbound/delay` - Outbound call with random delay (0-2s) and 5% error rate +- `GET /outbound/get` - Simple outbound GET request +- `POST /outbound/post` - Outbound POST request +- `GET /outbound/status/{code}` - Request specific HTTP status code + +Management endpoints (port 2112): +- `GET /prometheus` - Prometheus metrics endpoint +- `GET /metrics` - Alternative metrics endpoint +- `GET /health` - Health check endpoint + +All outbound endpoints include simulated compute-intensive data processing (0-3s duration, 40-80% CPU intensity). + +## Metrics + +The application exposes Prometheus metrics at `/prometheus` (port 2112). All metric names are prefixed with a configurable value set via the `METRICS_NAME_PREFIX` environment variable (default: `mymetrics_`). + +**Request Metrics** +- `mymetrics_requests_total`: Total requests by method and path + +**Outbound Call Metrics** +- `mymetrics_outbound_request_duration_seconds`: Timer for outbound request durations +- `mymetrics_outbound_requests_total`: Total outbound requests by target, method, and status + +**Database Metrics** +- `mymetrics_db_query_duration_seconds`: Timer for query durations by operation and table +- `mymetrics_db_queries_total`: Total queries by operation, table, and status +- `mymetrics_db_connections_active`: Active database connections gauge + +**Compute Metrics** +- `mymetrics_compute_duration_seconds`: Timer for compute operation durations + +**JVM Metrics** (automatically collected by Micrometer) +- JVM memory, threads, GC, class loading, and more + +## Development + +### Prerequisites + +- Java 21 or later +- Maven 3.9 or later +- Docker (for containerized builds) + +### Building + +```bash +# Build the project +mvn clean package + +# Run the application +mvn spring-boot:run + +# Run with custom profile +mvn spring-boot:run -Dspring-boot.run.profiles=dev + +# Build Docker image +docker 
build -t metrics-example-app-java . + +# Run tests +mvn test +``` + +### Project Structure + +``` +java/ +├── pom.xml # Maven project configuration +├── Dockerfile # Multi-stage Docker build +├── README.md # This file +└── src/main/ + ├── java/com/ibm/codeengine/metrics/ + │ ├── MetricsApplication.java # Main application class + │ ├── config/ + │ │ ├── MetricsConfig.java # Metrics configuration + │ │ └── DatabaseConfig.java # Database configuration + │ ├── controller/ + │ │ └── MetricsController.java # REST endpoints + │ ├── service/ + │ │ ├── OutboundService.java # HTTP client service + │ │ ├── DatabaseService.java # Database service + │ │ └── ComputeService.java # Compute simulation + │ └── model/ + │ └── OutboundCallResult.java # Data model + └── resources/ + └── application.properties # Application configuration +``` + +## Performance Characteristics + +- **Startup Time**: ~5-10 seconds (JVM warmup) +- **Memory Footprint**: ~200-250 MB (JVM + application) +- **Image Size**: ~200-250 MB (distroless with JRE) +- **Concurrency**: Thread-based with reactive HTTP client +- **CPU Efficiency**: Good after JVM warmup and JIT compilation + +## Spring Boot Features + +This implementation uses Spring Boot 3.x, which provides: + +- **Auto-configuration**: Automatic setup of common components +- **Dependency Injection**: Clean, testable code architecture +- **Actuator**: Production-ready features like health checks and metrics +- **Micrometer**: Vendor-neutral metrics facade with Prometheus support +- **WebFlux**: Reactive, non-blocking HTTP client +- **HikariCP**: High-performance JDBC connection pooling + +## Troubleshooting + +### Database Connection Issues + +If you encounter database connection errors: + +1. Verify the `DATABASES_FOR_POSTGRESQL_CONNECTION` environment variable is set correctly +2. Check that the service binding is properly configured +3. Ensure the certificate is valid and properly base64 encoded +4. 
Verify network connectivity to the PostgreSQL instance +5. Check Spring Boot logs for detailed error messages + +### Build Issues + +If Maven build fails: + +```bash +# Clean and rebuild +mvn clean install -U + +# Skip tests if needed +mvn clean package -DskipTests +``` + +### Memory Issues + +For memory-constrained environments: + +```bash +# Set JVM memory limits +java -Xmx256m -Xms128m -jar target/metrics-example-1.0.0.jar + +# Or via environment variable +JAVA_OPTS="-Xmx256m -Xms128m" java -jar target/metrics-example-1.0.0.jar +``` + +### Slow Startup + +To improve startup time: + +```bash +# Use CDS (Class Data Sharing) +java -Xshare:on -jar target/metrics-example-1.0.0.jar + +# Reduce logging during startup +java -Dlogging.level.root=WARN -jar target/metrics-example-1.0.0.jar +``` + +## Micrometer Integration + +This application uses Micrometer for metrics, which provides: + +- **Multiple Backend Support**: Easy to switch from Prometheus to other systems +- **Dimensional Metrics**: Tags/labels for flexible querying +- **Timer Support**: Automatic percentiles and distribution summaries +- **JVM Metrics**: Comprehensive runtime monitoring +- **Spring Integration**: Seamless integration with Spring Boot Actuator diff --git a/metrics-examples/java/pom.xml b/metrics-examples/java/pom.xml new file mode 100644 index 000000000..1bbf49f7f --- /dev/null +++ b/metrics-examples/java/pom.xml @@ -0,0 +1,96 @@ + + 4.0.0 + + + org.springframework.boot + spring-boot-starter-parent + 3.2.2 + + + + com.ibm.codeengine + metrics-example + 1.0.0 + metrics-example + Code Engine custom metrics example for Java + + + 21 + 21 + 21 + UTF-8 + + + + + + org.springframework.boot + spring-boot-starter-web + + + + + org.springframework.boot + spring-boot-starter-actuator + + + + + io.micrometer + micrometer-registry-prometheus + + + + + org.springframework.boot + spring-boot-starter-webflux + + + + + org.postgresql + postgresql + runtime + + + + + org.springframework.boot + 
spring-boot-starter-data-jdbc + + + + + com.fasterxml.jackson.core + jackson-databind + + + + + org.projectlombok + lombok + true + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + + + org.projectlombok + lombok + + + + + + + diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/MetricsApplication.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/MetricsApplication.java new file mode 100644 index 000000000..6b6aaf82a --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/MetricsApplication.java @@ -0,0 +1,19 @@ +package com.ibm.codeengine.metrics; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.context.properties.ConfigurationPropertiesScan; + +/** + * Main application class for the Code Engine metrics example. + */ +@SpringBootApplication +@ConfigurationPropertiesScan +public class MetricsApplication { + + public static void main(String[] args) { + SpringApplication.run(MetricsApplication.class, args); + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/DatabaseConfig.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/DatabaseConfig.java new file mode 100644 index 000000000..9a6195e55 --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/DatabaseConfig.java @@ -0,0 +1,73 @@ +package com.ibm.codeengine.metrics.config; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + 
+import javax.sql.DataSource; +import java.util.Base64; + +/** + * Configuration for PostgreSQL database connection using service binding. + * Only activated when DATABASES_FOR_POSTGRESQL_CONNECTION environment variable is present. + */ +@Configuration +public class DatabaseConfig { + + private static final Logger logger = LoggerFactory.getLogger(DatabaseConfig.class); + + /** + * Create DataSource from service binding credentials. + * Only created when the environment variable is present. + */ + @Bean + @ConditionalOnProperty(name = "DATABASES_FOR_POSTGRESQL_CONNECTION") + public DataSource dataSource() { + String credentials = System.getenv("DATABASES_FOR_POSTGRESQL_CONNECTION"); + + try { + ObjectMapper mapper = new ObjectMapper(); + JsonNode root = mapper.readTree(credentials); + + // Extract connection details + String username = root.path("postgres").path("authentication").path("username").asText(); + String password = root.path("cli").path("environment").path("PGPASSWORD").asText(); + String hostname = root.path("postgres").path("hosts").get(0).path("hostname").asText(); + int port = root.path("postgres").path("hosts").get(0).path("port").asInt(); + String database = root.path("postgres").path("database").asText(); + + // Build JDBC URL with SSL + String jdbcUrl = String.format( + "jdbc:postgresql://%s:%d/%s?sslmode=require&ssl=true", + hostname, port, database + ); + + // Configure HikariCP + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(jdbcUrl); + config.setUsername(username); + config.setPassword(password); + config.setDriverClassName("org.postgresql.Driver"); + config.setMaximumPoolSize(10); + config.setMinimumIdle(2); + config.setConnectionTimeout(15000); + config.setIdleTimeout(600000); + config.setMaxLifetime(1800000); + + logger.info("PostgreSQL DataSource configured successfully"); + return new HikariDataSource(config); + + } catch (Exception e) { + logger.error("Failed to configure PostgreSQL DataSource", e); + throw new 
RuntimeException("Failed to configure PostgreSQL DataSource", e); + } + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/MetricsConfig.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/MetricsConfig.java new file mode 100644 index 000000000..41773f2ce --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/MetricsConfig.java @@ -0,0 +1,118 @@ +package com.ibm.codeengine.metrics.config; + +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.DistributionSummary; +import io.micrometer.core.instrument.Gauge; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Configuration for custom Prometheus metrics. + */ +@Configuration +public class MetricsConfig { + + @Value("${metrics.name.prefix:mymetrics_}") + private String metricsPrefix; + + private final AtomicInteger activeDbConnections = new AtomicInteger(0); + + @Bean + public String metricsPrefix() { + return metricsPrefix; + } + + @Bean + public AtomicInteger activeDbConnections() { + return activeDbConnections; + } + + /** + * Register custom metrics with the MeterRegistry. + * Returns a simple marker object to satisfy Spring's @Bean requirement. + */ + @Bean + public String registerCustomMetrics(MeterRegistry registry) { + // Register DB connections gauge + Gauge.builder(metricsPrefix + "db_connections_active", activeDbConnections, AtomicInteger::get) + .description("Number of active database connections") + .register(registry); + return "metricsRegistered"; + } + + /** + * Create a counter for requests. 
+ */ + public Counter requestsCounter(MeterRegistry registry, String method, String path) { + return Counter.builder(metricsPrefix + "requests_total") + .description("Total number of requests") + .tag("method", method) + .tag("path", path) + .register(registry); + } + + /** + * Create a timer for outbound requests. + */ + public Timer outboundRequestTimer(MeterRegistry registry, String target, String method, String statusCode) { + return Timer.builder(metricsPrefix + "outbound_request_duration_seconds") + .description("Duration of outbound HTTP requests in seconds") + .tag("target", target) + .tag("method", method) + .tag("status_code", statusCode) + .register(registry); + } + + /** + * Create a counter for outbound requests. + */ + public Counter outboundRequestsCounter(MeterRegistry registry, String target, String method, String statusCode) { + return Counter.builder(metricsPrefix + "outbound_requests_total") + .description("Total number of outbound HTTP requests") + .tag("target", target) + .tag("method", method) + .tag("status_code", statusCode) + .register(registry); + } + + /** + * Create a timer for database queries. + */ + public Timer dbQueryTimer(MeterRegistry registry, String operation, String table, String status) { + return Timer.builder(metricsPrefix + "db_query_duration_seconds") + .description("Duration of database queries in seconds") + .tag("operation", operation) + .tag("table", table) + .tag("status", status) + .register(registry); + } + + /** + * Create a counter for database queries. + */ + public Counter dbQueriesCounter(MeterRegistry registry, String operation, String table, String status) { + return Counter.builder(metricsPrefix + "db_queries_total") + .description("Total number of database queries") + .tag("operation", operation) + .tag("table", table) + .tag("status", status) + .register(registry); + } + + /** + * Create a timer for compute operations. 
+ */ + public Timer computeTimer(MeterRegistry registry, String operation) { + return Timer.builder(metricsPrefix + "compute_duration_seconds") + .description("Duration of compute-intensive operations in seconds") + .tag("operation", operation) + .register(registry); + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/controller/MetricsController.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/controller/MetricsController.java new file mode 100644 index 000000000..0be331dfc --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/controller/MetricsController.java @@ -0,0 +1,201 @@ +package com.ibm.codeengine.metrics.controller; + +import com.ibm.codeengine.metrics.config.MetricsConfig; +import com.ibm.codeengine.metrics.model.OutboundCallResult; +import com.ibm.codeengine.metrics.service.ComputeService; +import com.ibm.codeengine.metrics.service.DatabaseService; +import com.ibm.codeengine.metrics.service.OutboundService; +import io.micrometer.core.instrument.MeterRegistry; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; + +import java.time.Duration; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +/** + * REST controller for the metrics example application. 
+ */ +@RestController +public class MetricsController { + + private static final Logger logger = LoggerFactory.getLogger(MetricsController.class); + private static final Random random = new Random(); + + private final OutboundService outboundService; + private final DatabaseService databaseService; + private final ComputeService computeService; + private final MeterRegistry meterRegistry; + private final MetricsConfig metricsConfig; + private final String appName; + + public MetricsController( + OutboundService outboundService, + DatabaseService databaseService, + ComputeService computeService, + MeterRegistry meterRegistry, + MetricsConfig metricsConfig, + @Value("${CE_APP:metrics-example-app}") String appName) { + this.outboundService = outboundService; + this.databaseService = databaseService; + this.computeService = computeService; + this.meterRegistry = meterRegistry; + this.metricsConfig = metricsConfig; + this.appName = appName; + } + + /** + * Middleware to track requests - implemented via interceptor would be better + */ + private void recordRequest(String method, String path) { + metricsConfig.requestsCounter(meterRegistry, method, path).increment(); + } + + @GetMapping("/") + public String healthCheck() { + recordRequest("GET", "/"); + return String.format("app '%s' is ready!", appName); + } + + @GetMapping("/test-db") + public ResponseEntity testDb() { + recordRequest("GET", "/test-db"); + + if (!databaseService.isConfigured()) { + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR) + .body("Could not connect to postgres instance: no postgres instance configured"); + } + + try { + databaseService.testConnection(); + return ResponseEntity.ok("Successfully connected to postgres instance"); + } catch (Exception e) { + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR) + .body("Could not connect to postgres instance: '" + e.getMessage() + "'"); + } + } + + @GetMapping("/outbound/delay") + public ResponseEntity> outboundDelay() { + 
recordRequest("GET", "/outbound/delay"); + + // Random delay between 0-2 seconds + double delay = random.nextDouble() * 2; + + // 5% error rate + boolean shouldError = random.nextDouble() < 0.05; + + OutboundCallResult result; + if (shouldError) { + result = outboundService.makeOutboundCall("/status/500", "GET"); + } else { + result = outboundService.makeOutboundCall(String.format("/delay/%.1f", delay), "GET"); + } + + // Simulate compute-intensive data handling + long computeStart = System.currentTimeMillis(); + double computeDuration = random.nextDouble() * 3; // 0-3 seconds + double cpuIntensity = 40 + random.nextDouble() * 40; // 40-80% + computeService.simulateCompute(computeDuration, cpuIntensity); + double actualComputeDuration = (System.currentTimeMillis() - computeStart) / 1000.0; + + metricsConfig.computeTimer(meterRegistry, "data_processing") + .record(Duration.ofMillis((long)(actualComputeDuration * 1000))); + + Map response = new HashMap<>(); + response.put("message", shouldError ? 
"Simulated error response" : "Outbound call completed"); + response.put("delay", delay); + response.put("outboundCall", result); + response.put("computeTime", actualComputeDuration); + response.put("cpuIntensity", String.format("%.1f%%", cpuIntensity)); + + if (shouldError) { + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(response); + } + + return ResponseEntity.ok(response); + } + + @GetMapping("/outbound/get") + public ResponseEntity> outboundGet() { + recordRequest("GET", "/outbound/get"); + + OutboundCallResult result = outboundService.makeOutboundCall("/get", "GET"); + + // Simulate compute-intensive data handling + long computeStart = System.currentTimeMillis(); + double computeDuration = random.nextDouble() * 3; + double cpuIntensity = 40 + random.nextDouble() * 40; + computeService.simulateCompute(computeDuration, cpuIntensity); + double actualComputeDuration = (System.currentTimeMillis() - computeStart) / 1000.0; + + metricsConfig.computeTimer(meterRegistry, "data_processing") + .record(Duration.ofMillis((long)(actualComputeDuration * 1000))); + + Map response = new HashMap<>(); + response.put("message", "Outbound GET call completed"); + response.put("outboundCall", result); + response.put("computeTime", actualComputeDuration); + response.put("cpuIntensity", String.format("%.1f%%", cpuIntensity)); + + return ResponseEntity.ok(response); + } + + @PostMapping("/outbound/post") + public ResponseEntity> outboundPost() { + recordRequest("POST", "/outbound/post"); + + OutboundCallResult result = outboundService.makeOutboundCall("/post", "POST"); + + // Simulate compute-intensive data handling + long computeStart = System.currentTimeMillis(); + double computeDuration = random.nextDouble() * 3; + double cpuIntensity = 40 + random.nextDouble() * 40; + computeService.simulateCompute(computeDuration, cpuIntensity); + double actualComputeDuration = (System.currentTimeMillis() - computeStart) / 1000.0; + + 
metricsConfig.computeTimer(meterRegistry, "data_processing") + .record(Duration.ofMillis((long)(actualComputeDuration * 1000))); + + Map response = new HashMap<>(); + response.put("message", "Outbound POST call completed"); + response.put("outboundCall", result); + response.put("computeTime", actualComputeDuration); + response.put("cpuIntensity", String.format("%.1f%%", cpuIntensity)); + + return ResponseEntity.ok(response); + } + + @GetMapping("/outbound/status/{code}") + public ResponseEntity> outboundStatus(@PathVariable int code) { + recordRequest("GET", "/outbound/status/" + code); + + OutboundCallResult result = outboundService.makeOutboundCall("/status/" + code, "GET"); + + // Simulate compute-intensive data handling + long computeStart = System.currentTimeMillis(); + double computeDuration = random.nextDouble() * 3; + double cpuIntensity = 40 + random.nextDouble() * 40; + computeService.simulateCompute(computeDuration, cpuIntensity); + double actualComputeDuration = (System.currentTimeMillis() - computeStart) / 1000.0; + + metricsConfig.computeTimer(meterRegistry, "data_processing") + .record(Duration.ofMillis((long)(actualComputeDuration * 1000))); + + Map response = new HashMap<>(); + response.put("message", "Outbound call completed"); + response.put("requestedStatus", code); + response.put("outboundCall", result); + response.put("computeTime", actualComputeDuration); + response.put("cpuIntensity", String.format("%.1f%%", cpuIntensity)); + + return ResponseEntity.ok(response); + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/model/OutboundCallResult.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/model/OutboundCallResult.java new file mode 100644 index 000000000..40fb1425c --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/model/OutboundCallResult.java @@ -0,0 +1,19 @@ +package com.ibm.codeengine.metrics.model; + +import lombok.Builder; +import 
lombok.Data; + +/** + * Result of an outbound HTTP call. + */ +@Data +@Builder +public class OutboundCallResult { + private boolean success; + private Integer status; + private double duration; + private String data; + private String error; +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/ComputeService.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/ComputeService.java new file mode 100644 index 000000000..3a3d5229c --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/ComputeService.java @@ -0,0 +1,40 @@ +package com.ibm.codeengine.metrics.service; + +import org.springframework.stereotype.Service; + +/** + * Service for simulating compute-intensive operations. + */ +@Service +public class ComputeService { + + /** + * Simulate CPU-intensive work for the specified duration. + * + * @param durationSeconds How long to run the computation + * @param cpuIntensity CPU usage intensity (0-100) + */ + public void simulateCompute(double durationSeconds, double cpuIntensity) { + long startTime = System.currentTimeMillis(); + long endTime = startTime + (long) (durationSeconds * 1000); + + while (System.currentTimeMillis() < endTime) { + // Perform CPU work + int workIterations = (int) (cpuIntensity * 1000); + for (int i = 0; i < workIterations; i++) { + Math.sqrt(Math.random() * 1000000); + } + + // Small sleep to control CPU usage + try { + long sleepTime = (long) ((100 - cpuIntensity) / 10); + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/DatabaseService.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/DatabaseService.java new file mode 100644 index 000000000..64012b3ff --- /dev/null +++ 
b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/DatabaseService.java @@ -0,0 +1,103 @@ +package com.ibm.codeengine.metrics.service; + +import com.ibm.codeengine.metrics.config.MetricsConfig; +import io.micrometer.core.instrument.MeterRegistry; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.condition.ConditionalOnBean; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.stereotype.Service; + +import javax.sql.DataSource; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Service for database operations with metrics. + * Database support is optional - service works without a DataSource. + */ +@Service +public class DatabaseService { + + private static final Logger logger = LoggerFactory.getLogger(DatabaseService.class); + + private final JdbcTemplate jdbcTemplate; + private final MeterRegistry meterRegistry; + private final MetricsConfig metricsConfig; + private final AtomicInteger activeDbConnections; + + public DatabaseService( + @Autowired(required = false) DataSource dataSource, + MeterRegistry meterRegistry, + MetricsConfig metricsConfig, + AtomicInteger activeDbConnections) { + this.jdbcTemplate = dataSource != null ? new JdbcTemplate(dataSource) : null; + this.meterRegistry = meterRegistry; + this.metricsConfig = metricsConfig; + this.activeDbConnections = activeDbConnections; + + if (dataSource == null) { + logger.info("DatabaseService initialized without DataSource - database features disabled"); + } else { + logger.info("DatabaseService initialized with DataSource - database features enabled"); + } + } + + /** + * Test database connectivity by querying tables. 
+ */ + public List> testConnection() { + if (jdbcTemplate == null) { + throw new IllegalStateException("Database not configured"); + } + + activeDbConnections.incrementAndGet(); + long startTime = System.currentTimeMillis(); + String status = "success"; + + try { + String query = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE'"; + List> result = jdbcTemplate.queryForList(query); + + long duration = System.currentTimeMillis() - startTime; + + // Record metrics + metricsConfig.dbQueryTimer(meterRegistry, "SELECT", "INFORMATION_SCHEMA.TABLES", status) + .record(Duration.ofMillis(duration)); + metricsConfig.dbQueriesCounter(meterRegistry, "SELECT", "INFORMATION_SCHEMA.TABLES", status) + .increment(); + + logger.info("Successfully queried database in {}ms", duration); + return result; + + } catch (Exception e) { + status = "error"; + long duration = System.currentTimeMillis() - startTime; + + // Record error metrics + metricsConfig.dbQueryTimer(meterRegistry, "SELECT", "INFORMATION_SCHEMA.TABLES", status) + .record(Duration.ofMillis(duration)); + metricsConfig.dbQueriesCounter(meterRegistry, "SELECT", "INFORMATION_SCHEMA.TABLES", status) + .increment(); + + logger.error("Database query failed: {}", e.getMessage()); + throw new RuntimeException("Database query failed: " + e.getMessage(), e); + + } finally { + activeDbConnections.decrementAndGet(); + } + } + + /** + * Check if database is configured. 
+ */ + public boolean isConfigured() { + return jdbcTemplate != null; + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/OutboundService.java b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/OutboundService.java new file mode 100644 index 000000000..a7615c7e5 --- /dev/null +++ b/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/OutboundService.java @@ -0,0 +1,95 @@ +package com.ibm.codeengine.metrics.service; + +import java.time.Duration; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; +import org.springframework.web.reactive.function.client.WebClient; + +import com.ibm.codeengine.metrics.config.MetricsConfig; +import com.ibm.codeengine.metrics.model.OutboundCallResult; + +import io.micrometer.core.instrument.MeterRegistry; +import reactor.core.publisher.Mono; + +/** + * Service for making outbound HTTP calls with metrics. + */ +@Service +public class OutboundService { + + private static final Logger logger = LoggerFactory.getLogger(OutboundService.class); + + private final WebClient webClient; + private final MeterRegistry meterRegistry; + private final MetricsConfig metricsConfig; + private final String httpbinBaseUrl; + + public OutboundService( + WebClient.Builder webClientBuilder, + MeterRegistry meterRegistry, + MetricsConfig metricsConfig, + @Value("${httpbin.base.url}") String httpbinBaseUrl) { + this.webClient = webClientBuilder + .baseUrl(httpbinBaseUrl) + .build(); + this.meterRegistry = meterRegistry; + this.metricsConfig = metricsConfig; + this.httpbinBaseUrl = httpbinBaseUrl; + } + + /** + * Make an outbound HTTP call and record metrics. 
+ */ + public OutboundCallResult makeOutboundCall(String endpoint, String method) { + long startTime = System.currentTimeMillis(); + + try { + String response = webClient + .method(org.springframework.http.HttpMethod.valueOf(method)) + .uri(endpoint) + .retrieve() + .bodyToMono(String.class) + .timeout(Duration.ofSeconds(30)) + .onErrorResume(e -> Mono.just("Error: " + e.getMessage())) + .block(); + + long duration = System.currentTimeMillis() - startTime; + String statusCode = "200"; // Simplified - in real scenario, capture actual status + + // Record metrics + metricsConfig.outboundRequestTimer(meterRegistry, httpbinBaseUrl, method, statusCode) + .record(Duration.ofMillis(duration)); + metricsConfig.outboundRequestsCounter(meterRegistry, httpbinBaseUrl, method, statusCode) + .increment(); + + return OutboundCallResult.builder() + .success(true) + .status(200) + .duration(duration / 1000.0) + .data(response) + .build(); + + } catch (Exception e) { + long duration = System.currentTimeMillis() - startTime; + + // Record error metrics + metricsConfig.outboundRequestTimer(meterRegistry, httpbinBaseUrl, method, "error") + .record(Duration.ofMillis(duration)); + metricsConfig.outboundRequestsCounter(meterRegistry, httpbinBaseUrl, method, "error") + .increment(); + + logger.error("Outbound call failed: {}", e.getMessage()); + + return OutboundCallResult.builder() + .success(false) + .error(e.getMessage()) + .duration(duration / 1000.0) + .build(); + } + } +} + +// Made with Bob diff --git a/metrics-examples/java/src/main/resources/application.properties b/metrics-examples/java/src/main/resources/application.properties new file mode 100644 index 000000000..e44515cde --- /dev/null +++ b/metrics-examples/java/src/main/resources/application.properties @@ -0,0 +1,40 @@ +# Server Configuration +server.port=${PORT:8080} +server.shutdown=graceful +spring.lifecycle.timeout-per-shutdown-phase=30s + +# Application Name +spring.application.name=metrics-example-app + +# Actuator 
Configuration +management.server.port=2112 +management.endpoints.web.exposure.include=health,prometheus,metrics +management.endpoints.web.base-path=/ +management.endpoint.health.show-details=always +management.endpoint.prometheus.enabled=true +management.metrics.export.prometheus.enabled=true + +# Metrics Configuration +metrics.name.prefix=${METRICS_NAME_PREFIX:mymetrics_} + +# HTTP Client Configuration +httpbin.base.url=${HTTPBIN_BASE_URL:https://httpbin.org} + +# Disable auto-configuration of DataSource when not needed +spring.autoconfigure.exclude=org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration,\ + org.springframework.boot.autoconfigure.jdbc.DataSourceTransactionManagerAutoConfiguration,\ + org.springframework.boot.autoconfigure.data.jdbc.JdbcRepositoriesAutoConfiguration + +# Database Configuration (will be overridden by service binding if present) +spring.datasource.hikari.maximum-pool-size=10 +spring.datasource.hikari.minimum-idle=2 +spring.datasource.hikari.connection-timeout=15000 +spring.datasource.hikari.idle-timeout=600000 +spring.datasource.hikari.max-lifetime=1800000 + +# Logging +logging.level.root=INFO +logging.level.com.ibm.codeengine.metrics=INFO +logging.pattern.console=%level %logger{36} - %msg%n + +# Made with Bob diff --git a/metrics-examples/java/target/classes/application.properties b/metrics-examples/java/target/classes/application.properties new file mode 100644 index 000000000..e44515cde --- /dev/null +++ b/metrics-examples/java/target/classes/application.properties @@ -0,0 +1,40 @@ +# Server Configuration +server.port=${PORT:8080} +server.shutdown=graceful +spring.lifecycle.timeout-per-shutdown-phase=30s + +# Application Name +spring.application.name=metrics-example-app + +# Actuator Configuration +management.server.port=2112 +management.endpoints.web.exposure.include=health,prometheus,metrics +management.endpoints.web.base-path=/ +management.endpoint.health.show-details=always 
+management.endpoint.prometheus.enabled=true +management.metrics.export.prometheus.enabled=true + +# Metrics Configuration +metrics.name.prefix=${METRICS_NAME_PREFIX:mymetrics_} + +# HTTP Client Configuration +httpbin.base.url=${HTTPBIN_BASE_URL:https://httpbin.org} + +# Disable auto-configuration of DataSource when not needed +spring.autoconfigure.exclude=org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration,\ + org.springframework.boot.autoconfigure.jdbc.DataSourceTransactionManagerAutoConfiguration,\ + org.springframework.boot.autoconfigure.data.jdbc.JdbcRepositoriesAutoConfiguration + +# Database Configuration (will be overridden by service binding if present) +spring.datasource.hikari.maximum-pool-size=10 +spring.datasource.hikari.minimum-idle=2 +spring.datasource.hikari.connection-timeout=15000 +spring.datasource.hikari.idle-timeout=600000 +spring.datasource.hikari.max-lifetime=1800000 + +# Logging +logging.level.root=INFO +logging.level.com.ibm.codeengine.metrics=INFO +logging.pattern.console=%level %logger{36} - %msg%n + +# Made with Bob diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/MetricsApplication.class b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/MetricsApplication.class new file mode 100644 index 000000000..eb73e3e92 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/MetricsApplication.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/config/DatabaseConfig.class b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/config/DatabaseConfig.class new file mode 100644 index 000000000..bf6c44880 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/config/DatabaseConfig.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/config/MetricsConfig.class 
b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/config/MetricsConfig.class new file mode 100644 index 000000000..9a9d038b8 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/config/MetricsConfig.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/controller/MetricsController.class b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/controller/MetricsController.class new file mode 100644 index 000000000..2bfd9a2d6 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/controller/MetricsController.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/model/OutboundCallResult.class b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/model/OutboundCallResult.class new file mode 100644 index 000000000..77741c76c Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/model/OutboundCallResult.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/ComputeService.class b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/ComputeService.class new file mode 100644 index 000000000..e201f14f9 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/ComputeService.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/DatabaseService.class b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/DatabaseService.class new file mode 100644 index 000000000..259b8f2c1 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/DatabaseService.class differ diff --git a/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/OutboundService.class 
b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/OutboundService.class new file mode 100644 index 000000000..8b84798a1 Binary files /dev/null and b/metrics-examples/java/target/classes/com/ibm/codeengine/metrics/service/OutboundService.class differ diff --git a/metrics-examples/java/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/metrics-examples/java/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 000000000..42dd676fe --- /dev/null +++ b/metrics-examples/java/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1,4 @@ +com/ibm/codeengine/metrics/service/ComputeService.class +com/ibm/codeengine/metrics/config/MetricsConfig.class +com/ibm/codeengine/metrics/model/OutboundCallResult.class +com/ibm/codeengine/metrics/MetricsApplication.class diff --git a/metrics-examples/java/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/metrics-examples/java/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 000000000..045886af4 --- /dev/null +++ b/metrics-examples/java/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1,8 @@ +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/model/OutboundCallResult.java +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/OutboundService.java +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/ComputeService.java +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/MetricsApplication.java 
+/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/MetricsConfig.java +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/config/DatabaseConfig.java +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/service/DatabaseService.java +/Users/reggeenr/go/src/github.ibm.com/CodeEngine/metrics-examples/java/src/main/java/com/ibm/codeengine/metrics/controller/MetricsController.java diff --git a/metrics-examples/load-test.sh b/metrics-examples/load-test.sh new file mode 100755 index 000000000..0d45a32f8 --- /dev/null +++ b/metrics-examples/load-test.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Load testing script for network-test-app +# Generates random load to produce metric data +# Press Ctrl+C to abort the test + +# Configuration +TARGET_URL="${TARGET_URL:-http://localhost:8080}" +DURATION="${DURATION:-60}" +CONCURRENT_REQUESTS="${CONCURRENT_REQUESTS:-5}" + +# Track worker PIDs for cleanup +WORKER_PIDS=() +INTERRUPTED=false + +# Cleanup function +cleanup() { + if [ "$INTERRUPTED" = false ]; then + INTERRUPTED=true + echo "" + echo "Aborting load test..." + + # Kill all worker processes + for pid in "${WORKER_PIDS[@]}"; do + kill "$pid" 2>/dev/null + done + + # Wait for all processes to terminate + wait 2>/dev/null + + echo "Load test aborted!" 
+ echo "View metrics at: ${TARGET_URL%:*}:2112/metrics" + exit 0 + fi +} + +# Set up signal handlers for graceful shutdown +trap cleanup SIGINT SIGTERM + +echo "Load Testing Configuration:" +echo " Target URL: $TARGET_URL" +echo " Duration: ${DURATION}s" +echo " Concurrent requests: $CONCURRENT_REQUESTS" +echo "" +echo "Press Ctrl+C to abort the test" +echo "" + +# Endpoints to test +ENDPOINTS=( + "/" + "/outbound/delay" + "/outbound/get" + "/outbound/status/200" + "/outbound/status/404" + "/test-db" +) + +# Function to make a random request +make_request() { + local endpoint=${ENDPOINTS[$RANDOM % ${#ENDPOINTS[@]}]} + local method="GET" + + # POST endpoint + if [[ "$endpoint" == "/outbound/post" ]]; then + method="POST" + fi + + local start_time=$(date +%s%N) + local response_code=$(curl -s -o /dev/null -w "%{http_code}" -X "$method" "${TARGET_URL}${endpoint}" 2>/dev/null) + local end_time=$(date +%s%N) + local duration=$(( (end_time - start_time) / 1000000 )) + + echo "[$(date +%H:%M:%S)] $method $endpoint -> $response_code (${duration}ms)" +} + +# Function to run load test worker +run_worker() { + local worker_id=$1 + local end_time=$(($(date +%s) + DURATION)) + + while [ $(date +%s) -lt $end_time ] && [ "$INTERRUPTED" = false ]; do + make_request + # Random sleep between requests (0.5-2 seconds) + sleep $(awk -v min=0.5 -v max=2 'BEGIN{srand(); print min+rand()*(max-min)}') 2>/dev/null || break + done +} + +# Start concurrent workers +echo "Starting load test..." +for i in $(seq 1 $CONCURRENT_REQUESTS); do + run_worker $i & + WORKER_PIDS+=($!) +done + +# Wait for all workers to complete +wait + +# Check if we completed normally or were interrupted +if [ "$INTERRUPTED" = false ]; then + echo "" + echo "Load test completed!" 
+ echo "View metrics at: ${TARGET_URL%:*}:2112/metrics" +fi diff --git a/metrics-examples/my-custom-code-engine-metrics-dashboard.json b/metrics-examples/my-custom-code-engine-metrics-dashboard.json new file mode 100644 index 000000000..1e9f405ed --- /dev/null +++ b/metrics-examples/my-custom-code-engine-metrics-dashboard.json @@ -0,0 +1,792 @@ +{ + "name": "My custom Code Engine Metrics", + "panels": [ + { + "id": 1, + "type": "text", + "name": "Dashboard Overview", + "description": "", + "nullValueDisplayText": null, + "links": null, + "markdownSource": "Monitor custom application metrics for Code Engine applications.\n\n**Metrics included:**\n- Request rates by method and path\n- Outbound HTTP request performance\n- Database connection monitoring\n- Compute operation duration", + "transparentBackground": false, + "panelTitleVisible": true, + "textAutosized": false + }, + { + "id": 2, + "type": "advancedTimechart", + "name": "Request Rate (by Path)", + "description": "Rate of incoming requests grouped by path", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_requests_total{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Request Rate", + "timeSeriesDisplayNameTemplate": "{{method}} {{path}}", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 2, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, 
+ "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 3, + "type": "advancedNumber", + "name": "Total Requests", + "description": "Total number of requests received", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(mymetrics_requests_total{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Total Requests", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 4, + "type": "advancedTimechart", + "name": "Outbound Request Duration (P95)", + "description": "95th percentile of outbound HTTP request duration", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "histogram_quantile(0.95, rate(mymetrics_outbound_request_duration_seconds_bucket{$__scope}[5m]))", + "enabled": true, + "displayInfo": { + "displayName": "P95 Duration", + "timeSeriesDisplayNameTemplate": "{{target}} {{method}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": 
"day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 2.0, + "inputFormat": "s", + "displayText": "" + }, + { + "severity": "medium", + "value": 1.0, + "inputFormat": "s", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 5, + "type": "advancedTimechart", + "name": "Outbound Request Duration (Average)", + "description": "Average duration of outbound HTTP requests", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_outbound_request_duration_seconds_sum{$__scope}[5m]) / rate(mymetrics_outbound_request_duration_seconds_count{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Avg Duration", + "timeSeriesDisplayNameTemplate": "{{target}} {{method}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, 
+ "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 6, + "type": "advancedTimechart", + "name": "Outbound Request Rate", + "description": "Rate of outbound HTTP requests by target and status", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_outbound_requests_total{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Request Rate", + "timeSeriesDisplayNameTemplate": "{{target}} {{method}} ({{status_code}})", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 2, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + 
"unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 7, + "type": "advancedNumber", + "name": "Active DB Connections", + "description": "Current number of active database connections", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "mymetrics_db_connections_active{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Active Connections", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80, + "inputFormat": "1", + "displayText": "" + }, + { + "severity": "medium", + "value": 50, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 8, + "type": "advancedTimechart", + "name": "Database Connections Over Time", + "description": "Active database connections timeline", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "mymetrics_db_connections_active{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Active Connections", + "timeSeriesDisplayNameTemplate": "DB Connections", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" 
+ } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 9, + "type": "advancedTimechart", + "name": "Compute Operation Duration (P95)", + "description": "95th percentile of compute operation duration", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "histogram_quantile(0.95, rate(mymetrics_compute_duration_seconds_bucket{$__scope}[5m]))", + "enabled": true, + "displayInfo": { + "displayName": "P95 Duration", + "timeSeriesDisplayNameTemplate": "{{operation}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": 
true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 5.0, + "inputFormat": "s", + "displayText": "" + }, + { + "severity": "medium", + "value": 3.0, + "inputFormat": "s", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 10, + "type": "advancedTimechart", + "name": "Compute Operation Duration (Average)", + "description": "Average duration of compute operations", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_compute_duration_seconds_sum{$__scope}[5m]) / rate(mymetrics_compute_duration_seconds_count{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Avg Duration", + "timeSeriesDisplayNameTemplate": "{{operation}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + 
"scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + } + ], + "scopeExpressionList": [], + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "severities": [], + "alertStatuses": [], + "categories": [], + "filter": "", + "teamScope": false + } + }, + "shared": true, + "public": false, + "description": "Custom dashboard for monitoring application metrics including request rates, outbound HTTP performance, database connections, and compute operations", + "layout": [ + { + "panelId": 1, + "x": 0, + "y": 0, + "w": 12, + "h": 3 + }, + { + "panelId": 3, + "x": 12, + "y": 0, + "w": 6, + "h": 3 + }, + { + "panelId": 7, + "x": 18, + "y": 0, + "w": 6, + "h": 3 + }, + { + "panelId": 2, + "x": 0, + "y": 3, + "w": 12, + "h": 6 + }, + { + "panelId": 6, + "x": 12, + "y": 3, + "w": 12, + "h": 6 + }, + { + "panelId": 4, + "x": 0, + "y": 9, + "w": 12, + "h": 6 + }, + { + "panelId": 5, + "x": 12, + "y": 9, + "w": 12, + "h": 6 + }, + { + "panelId": 8, + "x": 0, + "y": 15, + "w": 12, + "h": 6 + }, + { + "panelId": 9, + "x": 0, + "y": 21, + "w": 12, + "h": 6 + }, + { + "panelId": 10, + "x": 12, + "y": 21, + "w": 12, + "h": 6 + } + ], + "schema": 3 +} diff --git a/metrics-examples/node/Dockerfile b/metrics-examples/node/Dockerfile new file mode 100644 index 000000000..f7224cc66 --- /dev/null +++ b/metrics-examples/node/Dockerfile @@ -0,0 +1,19 @@ +# Download dependencies in builder stage +FROM registry.access.redhat.com/ubi9/nodejs-24:latest AS builder + +COPY --chown=${CNB_USER_ID}:${CNB_GROUP_ID} package.json /app/ +WORKDIR /app +RUN npm i --omit=dev + + +# Use a small distroless image for as runtime image +FROM gcr.io/distroless/nodejs24 + +COPY 
--chown=1001:0 --from=builder /app/node_modules /app/node_modules +COPY --chown=1001:0 . /app/ + +USER 1001:0 +WORKDIR /app +EXPOSE 8080 + +CMD ["app.mjs"] diff --git a/metrics-examples/node/README.md b/metrics-examples/node/README.md new file mode 100644 index 000000000..0f6ed894d --- /dev/null +++ b/metrics-examples/node/README.md @@ -0,0 +1,90 @@ +# Code Engine custom metrics examples for Node.js + +This application helps debug connectivity issues for IBM Cloud Services and provides comprehensive monitoring through Prometheus metrics. It includes outbound HTTP call simulation, database connectivity testing, and compute-intensive workload simulation. + +## Features + +- **Outbound HTTP Calls**: Configurable endpoints that simulate delays and error responses to httpbin.org-compatible backends +- **Database Testing**: PostgreSQL connectivity verification with instrumented queries +- **Prometheus Metrics**: Comprehensive instrumentation for requests, outbound calls, database operations, and compute workloads +- **Load Testing**: Included shell script for generating realistic traffic patterns + +## Quick Start + +### Deploy to IBM Cloud Code Engine + +Deploy the application with recommended resource settings: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-node \ + --src "." \ + --memory 0.5G \ + --cpu 0.25 \ + --port 8080 +``` + +Consider adding `--concurrency 5` to limit each instance to a maximum of 5 concurrent requests, ensuring stable performance given the compute-intensive operations. + +To configure environment variables during deployment: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-node \ + --src "." 
\ + --memory 0.5G \ + --cpu 0.25 \ + --env HTTPBIN_BASE_URL=https://httpbin.org \ + --env METRICS_COLLECT_NODE_METRICS_ENABLED=true +``` + +Update an existing application: + +```bash +ibmcloud ce application update \ + --name metrics-example-app-node \ + --env HTTPBIN_BASE_URL=https://custom-backend.example.com +``` + +### Run Locally + +Pull and run with Docker: +```bash +docker pull icr.io/codeengine/metrics-example-app-node +docker run -p 8080:8080 -p 2112:2112 icr.io/codeengine/metrics-example-app-node +``` + +Or run from source: +```bash +npm install +node app.mjs +``` + +The application exposes two servers: +- Main application: `http://localhost:8080` +- Metrics endpoint: `http://localhost:2112/metrics` + +## Configuration + +### Environment Variables + +- `PORT`: Application server port (default: 8080) +- `HTTPBIN_BASE_URL`: Backend URL for outbound calls (default: `https://httpbin.org`) +- `METRICS_NAME_PREFIX`: Prefix for all Prometheus metrics (default: `mymetrics_`) +- `METRICS_COLLECT_NODE_METRICS_ENABLED`: Enable Node.js runtime metrics (set to "true") +- `DATABASES_FOR_POSTGRESQL_CONNECTION`: PostgreSQL connection credentials (JSON format) + +### Service Bindings + +For database connectivity, create a Code Engine service binding between your project and the IBM Cloud service. See [Working with service bindings](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding) for details. + +## API Endpoints + +- `GET /` - Health check +- `GET /test-db` - Test PostgreSQL connectivity +- `GET /outbound/delay` - Outbound call with random delay (0-2s) and 5% error rate +- `GET /outbound/get` - Simple outbound GET request +- `POST /outbound/post` - Outbound POST request +- `GET /outbound/status/:code` - Request specific HTTP status code + +All outbound endpoints include simulated compute-intensive data processing (0-3s duration, 40-80% CPU intensity). 
\ No newline at end of file diff --git a/metrics-examples/node/app.mjs b/metrics-examples/node/app.mjs new file mode 100644 index 000000000..5c5bd91cb --- /dev/null +++ b/metrics-examples/node/app.mjs @@ -0,0 +1,370 @@ +import express from "express"; +import promClient from "prom-client"; +import { closeDbClient, getDbClient } from "./utils/db.mjs"; + +// ==================================== +// Configuration +// ==================================== +const HTTPBIN_BASE_URL = process.env.HTTPBIN_BASE_URL || "https://httpbin.org"; + +// ==================================== +// Initialize Prometheus metrics +// ==================================== +const METRICS_NAME_PREFIX = process.env.METRICS_NAME_PREFIX || "mymetrics_"; +// Create a registry to register the metrics +const register = new promClient.Registry(); + +// Create a custom counter metric with path label +// Note: For high-cardinality paths, consider using a Histogram instead to track +// request duration distribution, or a Gauge to track active requests. 
+// Histogram example: new promClient.Histogram({ +// name: `${METRICS_NAME_PREFIX}request_duration_seconds`, +// help: "Request duration in seconds", +// labelNames: ["method", "path", "status_code"], +// buckets: [0.1, 0.5, 1, 2, 5] +// }); +const counter = new promClient.Counter({ + name: `${METRICS_NAME_PREFIX}requests_total`, + help: "Total number of requests", + labelNames: ["method", "path"], +}); +register.registerMetric(counter); + +// Outbound HTTP call metrics +const outboundCallDuration = new promClient.Histogram({ + name: `${METRICS_NAME_PREFIX}outbound_request_duration_seconds`, + help: "Duration of outbound HTTP requests in seconds", + labelNames: ["target", "method", "status_code"], + buckets: [0.1, 0.5, 1, 2, 5, 10], +}); +register.registerMetric(outboundCallDuration); + +const outboundCallTotal = new promClient.Counter({ + name: `${METRICS_NAME_PREFIX}outbound_requests_total`, + help: "Total number of outbound HTTP requests", + labelNames: ["target", "method", "status_code"], +}); +register.registerMetric(outboundCallTotal); + +// Database operation metrics +const dbQueryDuration = new promClient.Histogram({ + name: `${METRICS_NAME_PREFIX}db_query_duration_seconds`, + help: "Duration of database queries in seconds", + labelNames: ["operation", "table", "status"], + buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5], +}); +register.registerMetric(dbQueryDuration); + +const dbQueryTotal = new promClient.Counter({ + name: `${METRICS_NAME_PREFIX}db_queries_total`, + help: "Total number of database queries", + labelNames: ["operation", "table", "status"], +}); +register.registerMetric(dbQueryTotal); + +const dbConnectionsActive = new promClient.Gauge({ + name: `${METRICS_NAME_PREFIX}db_connections_active`, + help: "Number of active database connections", +}); +register.registerMetric(dbConnectionsActive); + +// Compute operation metrics +const computeDuration = new promClient.Histogram({ + name: `${METRICS_NAME_PREFIX}compute_duration_seconds`, + help: "Duration 
of compute-intensive operations in seconds", + labelNames: ["operation"], + buckets: [0.5, 1, 2, 3, 5], +}); +register.registerMetric(computeDuration); + +if (process.env.METRICS_COLLECT_NODE_METRICS_ENABLED === "true") { + promClient.collectDefaultMetrics({ register }); +} + +// ==================================== +// Helper Functions +// ==================================== + +// Simulate compute-intensive operation +function simulateCompute(durationSeconds, cpuIntensity) { + const startTime = Date.now(); + const endTime = startTime + durationSeconds * 1000; + + // CPU-intensive loop based on intensity (40-80%) + while (Date.now() < endTime) { + // Perform some CPU work + const workIterations = Math.floor(cpuIntensity * 1000); + for (let i = 0; i < workIterations; i++) { + Math.sqrt(Math.random() * 1000000); + } + // Small sleep to control CPU usage + const sleepTime = (100 - cpuIntensity) / 10; + const sleepEnd = Date.now() + sleepTime; + while (Date.now() < sleepEnd) { + // Busy wait for precise timing + } + } +} + +// Make outbound HTTP call with metrics +async function makeOutboundCall(endpoint, method = "GET") { + const url = `${HTTPBIN_BASE_URL}${endpoint}`; + const startTime = Date.now(); + + try { + const response = await fetch(url, { method }); + const duration = (Date.now() - startTime) / 1000; + const statusCode = response.status.toString(); + + // Record metrics + outboundCallDuration.observe({ target: HTTPBIN_BASE_URL, method, status_code: statusCode }, duration); + outboundCallTotal.inc({ target: HTTPBIN_BASE_URL, method, status_code: statusCode }); + + return { success: true, status: response.status, duration, data: await response.text() }; + } catch (error) { + const duration = (Date.now() - startTime) / 1000; + + // Record error metrics + outboundCallDuration.observe({ target: HTTPBIN_BASE_URL, method, status_code: "error" }, duration); + outboundCallTotal.inc({ target: HTTPBIN_BASE_URL, method, status_code: "error" }); + + return { success: 
false, error: error.message, duration }; + } +} + +// Instrumented DB query wrapper +async function executeDbQuery(dbClient, query, operation, table) { + const startTime = Date.now(); + let status = "success"; + + try { + const result = await dbClient.query(query); + const duration = (Date.now() - startTime) / 1000; + + dbQueryDuration.observe({ operation, table, status }, duration); + dbQueryTotal.inc({ operation, table, status }); + + return result; + } catch (error) { + status = "error"; + const duration = (Date.now() - startTime) / 1000; + + dbQueryDuration.observe({ operation, table, status }, duration); + dbQueryTotal.inc({ operation, table, status }); + + throw error; + } +} + +// ====================================== +// Initialize Express app +// ====================================== +const app = express(); +app.use(express.json()); +const router = express.Router(); +app.use("/", router); + +// Middleware to count requests with path +router.use((req, res, next) => { + counter.inc({ method: req.method, path: req.path }); + next(); +}); + +router.get("/", (req, res) => { + res.send(`app '${process.env.CE_APP || "network-test-app"}' is ready!`); +}); + +router.get("/test-db", async (request, response) => { + const dbClient = await getDbClient(); + if (!dbClient) { + return response.status(500).send("Could not connect to postgres instance: no postgres instance configured"); + } + + try { + // Update connection gauge + dbConnectionsActive.inc(); + + // Run a simple command to verify that we connected to the postgres instance + console.log("List tables"); + const result = await executeDbQuery( + dbClient, + "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE';", + "SELECT", + "INFORMATION_SCHEMA.TABLES", + ); + console.log(`Received the following query result: ${JSON.stringify(result)}`); + response.status(200).send("Successfully connected to postgres instance"); + } catch (err) { + console.error("Failed to connect to PostgreSQL 
instance", err); + response.status(500).send(`Could not connect to postgres instance: '${err.message}'`); + } finally { + dbConnectionsActive.dec(); + } +}); + +// ==================================== +// Outbound call endpoints +// ==================================== + +router.get("/outbound/delay", async (req, res) => { + try { + // Random delay between 0-2 seconds + const delay = Math.random() * 2; + + // 5% error rate + const shouldError = Math.random() < 0.05; + + if (shouldError) { + // Simulate error by calling status/500 + const result = await makeOutboundCall("/status/500", "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; // 0-3 seconds + const cpuIntensity = 40 + Math.random() * 40; // 40-80% + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + return res.status(500).json({ + message: "Simulated error response", + delay, + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } + + // Normal flow with delay + const result = await makeOutboundCall(`/delay/${delay.toFixed(1)}`, "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; // 0-3 seconds + const cpuIntensity = 40 + Math.random() * 40; // 40-80% + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound call completed", + delay, + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + 
+router.get("/outbound/status/:code", async (req, res) => { + try { + const statusCode = req.params.code; + const result = await makeOutboundCall(`/status/${statusCode}`, "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; + const cpuIntensity = 40 + Math.random() * 40; + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound call completed", + requestedStatus: statusCode, + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +router.get("/outbound/get", async (req, res) => { + try { + const result = await makeOutboundCall("/get", "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; + const cpuIntensity = 40 + Math.random() * 40; + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound GET call completed", + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +router.post("/outbound/post", async (req, res) => { + try { + const result = await makeOutboundCall("/post", "POST"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; + const cpuIntensity = 40 + Math.random() * 40; + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration 
= (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound POST call completed", + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +// ====================================== +// Start the http server +// ====================================== +const port = process.env.PORT || 8080; +const server = app.listen(port, async () => { + console.log(`Application server is running at http://localhost:${port}`); + console.log(`Configured httpbin backend: ${HTTPBIN_BASE_URL}`); +}); + +// ====================================== +// Metrics server +// ====================================== +const metricsApp = express(); +const metricsPort = 2112; +// Expose metrics endpoint +metricsApp.get("/metrics", async (req, res) => { + res.set("Content-Type", register.contentType); + res.end(await register.metrics()); +}); +// Start the metrics server +const metricsServer = metricsApp.listen(metricsPort, () => { + console.log(`Metrics server is running at http://localhost:${metricsPort}`); +}); + +// ====================================== +// Handle shutdown signals +// ====================================== +process.on("SIGTERM", async () => { + console.info("SIGTERM signal received."); + await closeDbClient(); + + metricsServer.close(() => { + console.log("Metrics server closed."); + }); + + server.close(() => { + console.log("Http server closed."); + }); +}); diff --git a/network-test-app/package-lock.json b/metrics-examples/node/package-lock.json similarity index 69% rename from network-test-app/package-lock.json rename to metrics-examples/node/package-lock.json index 46b4a8c16..d5302ca88 100644 --- a/network-test-app/package-lock.json +++ b/metrics-examples/node/package-lock.json @@ -9,51 +9,61 @@ "version": "1.0.0", "license": 
"ISC", "dependencies": { - "express": "^4.22.1", - "pg": "^8.13.0" + "express": "^5.2.1", + "pg": "^8.20.0", + "prom-client": "^15.1.3" + } + }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" } }, "node_modules/accepts": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", - "integrity": "sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz", + "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==", "license": "MIT", "dependencies": { - "mime-types": "~2.1.34", - "negotiator": "0.6.3" + "mime-types": "^3.0.0", + "negotiator": "^1.0.0" }, "engines": { "node": ">= 0.6" } }, - "node_modules/array-flatten": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", - "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==", + "node_modules/bintrees": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", + "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==", "license": "MIT" }, "node_modules/body-parser": { - "version": "1.20.3", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", - "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz", + "integrity": 
"sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==", "license": "MIT", "dependencies": { - "bytes": "3.1.2", - "content-type": "~1.0.5", - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "on-finished": "2.4.1", - "qs": "6.13.0", - "raw-body": "2.5.2", - "type-is": "~1.6.18", - "unpipe": "1.0.0" + "bytes": "^3.1.2", + "content-type": "^1.0.5", + "debug": "^4.4.3", + "http-errors": "^2.0.0", + "iconv-lite": "^0.7.0", + "on-finished": "^2.4.1", + "qs": "^6.14.1", + "raw-body": "^3.0.1", + "type-is": "^2.0.1" }, "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/bytes": { @@ -95,15 +105,16 @@ } }, "node_modules/content-disposition": { - "version": "0.5.4", - "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", - "integrity": "sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.1.tgz", + "integrity": "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q==", "license": "MIT", - "dependencies": { - "safe-buffer": "5.2.1" - }, "engines": { - "node": ">= 0.6" + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/content-type": { @@ -125,18 +136,29 @@ } }, "node_modules/cookie-signature": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", - "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==", - "license": "MIT" + "version": "1.2.2", + "resolved": 
"https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz", + "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==", + "license": "MIT", + "engines": { + "node": ">=6.6.0" + } }, "node_modules/debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", "license": "MIT", "dependencies": { - "ms": "2.0.0" + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } } }, "node_modules/depd": { @@ -148,16 +170,6 @@ "node": ">= 0.8" } }, - "node_modules/destroy": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", - "integrity": "sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==", - "license": "MIT", - "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" - } - }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -233,82 +245,67 @@ } }, "node_modules/express": { - "version": "4.22.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", - "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", - "license": "MIT", - "dependencies": { - "accepts": "~1.3.8", - "array-flatten": "1.1.1", - "body-parser": "~1.20.3", - "content-disposition": "~0.5.4", - "content-type": "~1.0.4", - "cookie": "~0.7.1", - "cookie-signature": "~1.0.6", - "debug": "2.6.9", - "depd": "2.0.0", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - 
"etag": "~1.8.1", - "finalhandler": "~1.3.1", - "fresh": "~0.5.2", - "http-errors": "~2.0.0", - "merge-descriptors": "1.0.3", - "methods": "~1.1.2", - "on-finished": "~2.4.1", - "parseurl": "~1.3.3", - "path-to-regexp": "~0.1.12", - "proxy-addr": "~2.0.7", - "qs": "~6.14.0", - "range-parser": "~1.2.1", - "safe-buffer": "5.2.1", - "send": "~0.19.0", - "serve-static": "~1.16.2", - "setprototypeof": "1.2.0", - "statuses": "~2.0.1", - "type-is": "~1.6.18", - "utils-merge": "1.0.1", - "vary": "~1.1.2" - }, - "engines": { - "node": ">= 0.10.0" + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", + "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", + "license": "MIT", + "dependencies": { + "accepts": "^2.0.0", + "body-parser": "^2.2.1", + "content-disposition": "^1.0.0", + "content-type": "^1.0.5", + "cookie": "^0.7.1", + "cookie-signature": "^1.2.1", + "debug": "^4.4.0", + "depd": "^2.0.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "finalhandler": "^2.1.0", + "fresh": "^2.0.0", + "http-errors": "^2.0.0", + "merge-descriptors": "^2.0.0", + "mime-types": "^3.0.0", + "on-finished": "^2.4.1", + "once": "^1.4.0", + "parseurl": "^1.3.3", + "proxy-addr": "^2.0.7", + "qs": "^6.14.0", + "range-parser": "^1.2.1", + "router": "^2.2.0", + "send": "^1.1.0", + "serve-static": "^2.2.0", + "statuses": "^2.0.1", + "type-is": "^2.0.1", + "vary": "^1.1.2" + }, + "engines": { + "node": ">= 18" }, "funding": { "type": "opencollective", "url": "https://opencollective.com/express" } }, - "node_modules/express/node_modules/qs": { - "version": "6.14.2", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.2.tgz", - "integrity": "sha512-V/yCWTTF7VJ9hIh18Ugr2zhJMP01MY7c5kh4J870L7imm6/DIzBsNLTXzMwUA3yZ5b/KBqLx8Kp3uRvd7xSe3Q==", - "license": "BSD-3-Clause", - "dependencies": { - "side-channel": "^1.1.0" - }, - "engines": { - "node": ">=0.6" - }, - "funding": { - 
"url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/finalhandler": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz", - "integrity": "sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==", + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz", + "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==", "license": "MIT", "dependencies": { - "debug": "2.6.9", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "on-finished": "2.4.1", - "parseurl": "~1.3.3", - "statuses": "2.0.1", - "unpipe": "~1.0.0" + "debug": "^4.4.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "on-finished": "^2.4.1", + "parseurl": "^1.3.3", + "statuses": "^2.0.1" }, "engines": { - "node": ">= 0.8" + "node": ">= 18.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/forwarded": { @@ -321,12 +318,12 @@ } }, "node_modules/fresh": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", - "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz", + "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==", "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 0.8" } }, "node_modules/function-bind": { @@ -412,31 +409,39 @@ } }, "node_modules/http-errors": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", - "integrity": "sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + 
"integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", "license": "MIT", "dependencies": { - "depd": "2.0.0", - "inherits": "2.0.4", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "toidentifier": "1.0.1" + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" }, "engines": { "node": ">= 0.8" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/iconv-lite": { - "version": "0.4.24", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", - "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", + "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", "license": "MIT", "dependencies": { - "safer-buffer": ">= 2.1.2 < 3" + "safer-buffer": ">= 2.1.2 < 3.0.0" }, "engines": { "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/inherits": { @@ -454,6 +459,12 @@ "node": ">= 0.10" } }, + "node_modules/is-promise": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", + "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==", + "license": "MIT" + }, "node_modules/math-intrinsics": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", @@ -464,75 +475,61 @@ } }, "node_modules/media-typer": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", - "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", + "version": "1.1.0", 
+ "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-1.1.0.tgz", + "integrity": "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==", "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 0.8" } }, "node_modules/merge-descriptors": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", - "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/methods": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", - "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz", + "integrity": "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==", "license": "MIT", "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", - "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", - "license": "MIT", - "bin": { - "mime": "cli.js" + "node": ">=18" }, - "engines": { - "node": ">=4" + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "version": "1.54.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", + "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==", 
"license": "MIT", "engines": { "node": ">= 0.6" } }, "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz", + "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==", "license": "MIT", "dependencies": { - "mime-db": "1.52.0" + "mime-db": "^1.54.0" }, "engines": { - "node": ">= 0.6" + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/ms": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, "node_modules/negotiator": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", - "integrity": "sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-1.0.0.tgz", + "integrity": "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==", "license": "MIT", "engines": { "node": ">= 0.6" @@ -562,6 +559,15 @@ "node": ">= 0.8" } }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "license": "ISC", + "dependencies": { + 
"wrappy": "1" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -572,28 +578,33 @@ } }, "node_modules/path-to-regexp": { - "version": "0.1.12", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz", - "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==", - "license": "MIT" + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.3.0.tgz", + "integrity": "sha512-7jdwVIRtsP8MYpdXSwOS0YdD0Du+qOoF/AEPIt88PcCFrZCzx41oxku1jD88hZBwbNUIEfpqvuhjFaMAqMTWnA==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } }, "node_modules/pg": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/pg/-/pg-8.13.0.tgz", - "integrity": "sha512-34wkUTh3SxTClfoHB3pQ7bIMvw9dpFU1audQQeZG837fmHfHpr14n/AELVDoOYVDW2h5RDWU78tFjkD+erSBsw==", + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/pg/-/pg-8.20.0.tgz", + "integrity": "sha512-ldhMxz2r8fl/6QkXnBD3CR9/xg694oT6DZQ2s6c/RI28OjtSOpxnPrUCGOBJ46RCUxcWdx3p6kw/xnDHjKvaRA==", "license": "MIT", + "peer": true, "dependencies": { - "pg-connection-string": "^2.7.0", - "pg-pool": "^3.7.0", - "pg-protocol": "^1.7.0", - "pg-types": "^2.1.0", - "pgpass": "1.x" + "pg-connection-string": "^2.12.0", + "pg-pool": "^3.13.0", + "pg-protocol": "^1.13.0", + "pg-types": "2.2.0", + "pgpass": "1.0.5" }, "engines": { - "node": ">= 8.0.0" + "node": ">= 16.0.0" }, "optionalDependencies": { - "pg-cloudflare": "^1.1.1" + "pg-cloudflare": "^1.3.0" }, "peerDependencies": { "pg-native": ">=3.0.1" @@ -605,16 +616,16 @@ } }, "node_modules/pg-cloudflare": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.1.1.tgz", - "integrity": "sha512-xWPagP/4B6BgFO+EKz3JONXv3YDgvkbVrGw2mTo3D6tVDQRh1e7cqVGvyR3BE+eQgAvx1XhW/iEASj4/jCWl3Q==", + "version": 
"1.3.0", + "resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.3.0.tgz", + "integrity": "sha512-6lswVVSztmHiRtD6I8hw4qP/nDm1EJbKMRhf3HCYaqud7frGysPv7FYJ5noZQdhQtN2xJnimfMtvQq21pdbzyQ==", "license": "MIT", "optional": true }, "node_modules/pg-connection-string": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.7.0.tgz", - "integrity": "sha512-PI2W9mv53rXJQEOb8xNR8lH7Hr+EKa6oJa38zsK0S/ky2er16ios1wLKhZyxzD7jUReiWokc9WK5nxSnC7W1TA==", + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.12.0.tgz", + "integrity": "sha512-U7qg+bpswf3Cs5xLzRqbXbQl85ng0mfSV/J0nnA31MCLgvEaAo7CIhmeyrmJpOr7o+zm0rXK+hNnT5l9RHkCkQ==", "license": "MIT" }, "node_modules/pg-int8": { @@ -627,18 +638,18 @@ } }, "node_modules/pg-pool": { - "version": "3.7.0", - "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.7.0.tgz", - "integrity": "sha512-ZOBQForurqh4zZWjrgSwwAtzJ7QiRX0ovFkZr2klsen3Nm0aoh33Ls0fzfv3imeH/nw/O27cjdz5kzYJfeGp/g==", + "version": "3.13.0", + "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.13.0.tgz", + "integrity": "sha512-gB+R+Xud1gLFuRD/QgOIgGOBE2KCQPaPwkzBBGC9oG69pHTkhQeIuejVIk3/cnDyX39av2AxomQiyPT13WKHQA==", "license": "MIT", "peerDependencies": { "pg": ">=8.0" } }, "node_modules/pg-protocol": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.7.0.tgz", - "integrity": "sha512-hTK/mE36i8fDDhgDFjy6xNOG+LCorxLG3WO17tku+ij6sVHXh1jQUJ8hYAnRhNla4QVD2H8er/FOjc/+EgC6yQ==", + "version": "1.13.0", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.13.0.tgz", + "integrity": "sha512-zzdvXfS6v89r6v7OcFCHfHlyG/wvry1ALxZo4LqgUoy7W9xhBDMaqOuMiF3qEV45VqsN6rdlcehHrfDtlCPc8w==", "license": "MIT" }, "node_modules/pg-types": { @@ -705,6 +716,19 @@ "node": ">=0.10.0" } }, + "node_modules/prom-client": { + "version": "15.1.3", + "resolved": 
"https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz", + "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.4.0", + "tdigest": "^0.1.1" + }, + "engines": { + "node": "^16 || ^18 || >=20" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -719,12 +743,12 @@ } }, "node_modules/qs": { - "version": "6.13.0", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", - "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.15.0.tgz", + "integrity": "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ==", "license": "BSD-3-Clause", "dependencies": { - "side-channel": "^1.0.6" + "side-channel": "^1.1.0" }, "engines": { "node": ">=0.6" @@ -743,39 +767,35 @@ } }, "node_modules/raw-body": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", - "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-3.0.2.tgz", + "integrity": "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==", "license": "MIT", "dependencies": { - "bytes": "3.1.2", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "unpipe": "1.0.0" + "bytes": "~3.1.2", + "http-errors": "~2.0.1", + "iconv-lite": "~0.7.0", + "unpipe": "~1.0.0" }, "engines": { - "node": ">= 0.8" + "node": ">= 0.10" } }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": 
"sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" + "node_modules/router": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz", + "integrity": "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.4.0", + "depd": "^2.0.0", + "is-promise": "^4.0.0", + "parseurl": "^1.3.3", + "path-to-regexp": "^8.0.0" + }, + "engines": { + "node": ">= 18" + } }, "node_modules/safer-buffer": { "version": "2.1.2", @@ -784,57 +804,48 @@ "license": "MIT" }, "node_modules/send": { - "version": "0.19.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", - "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/send/-/send-1.2.1.tgz", + "integrity": "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==", "license": "MIT", "dependencies": { - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "mime": "1.6.0", - "ms": "2.1.3", - "on-finished": "2.4.1", - "range-parser": "~1.2.1", - "statuses": "2.0.1" + "debug": "^4.4.3", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "fresh": "^2.0.0", + "http-errors": "^2.0.1", + "mime-types": "^3.0.2", + "ms": "^2.1.3", + "on-finished": "^2.4.1", + "range-parser": "^1.2.1", + "statuses": "^2.0.2" }, "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/send/node_modules/encodeurl": { 
- "version": "1.0.2", - "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", - "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==", - "license": "MIT", - "engines": { - "node": ">= 0.8" + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/send/node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "license": "MIT" - }, "node_modules/serve-static": { - "version": "1.16.2", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz", - "integrity": "sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==", + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-2.2.1.tgz", + "integrity": "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==", "license": "MIT", "dependencies": { - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "parseurl": "~1.3.3", - "send": "0.19.0" + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "parseurl": "^1.3.3", + "send": "^1.2.0" }, "engines": { - "node": ">= 0.8.0" + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/setprototypeof": { @@ -925,14 +936,23 @@ } }, "node_modules/statuses": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", - "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", + "integrity": 
"sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==", "license": "MIT", "engines": { "node": ">= 0.8" } }, + "node_modules/tdigest": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz", + "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==", + "license": "MIT", + "dependencies": { + "bintrees": "1.0.2" + } + }, "node_modules/toidentifier": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", @@ -943,13 +963,14 @@ } }, "node_modules/type-is": { - "version": "1.6.18", - "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", - "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz", + "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==", "license": "MIT", "dependencies": { - "media-typer": "0.3.0", - "mime-types": "~2.1.24" + "content-type": "^1.0.5", + "media-typer": "^1.1.0", + "mime-types": "^3.0.0" }, "engines": { "node": ">= 0.6" @@ -964,15 +985,6 @@ "node": ">= 0.8" } }, - "node_modules/utils-merge": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", - "integrity": "sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==", - "license": "MIT", - "engines": { - "node": ">= 0.4.0" - } - }, "node_modules/vary": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", @@ -982,6 +994,12 @@ "node": ">= 0.8" } }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": 
"sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "license": "ISC" + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", diff --git a/network-test-app/package.json b/metrics-examples/node/package.json similarity index 74% rename from network-test-app/package.json rename to metrics-examples/node/package.json index d56cac095..0d1897289 100644 --- a/network-test-app/package.json +++ b/metrics-examples/node/package.json @@ -1,8 +1,8 @@ { - "name": "network-test-app", + "name": "metrics-example", "version": "1.0.0", "description": "App to test connectivity to IBM Cloud Services", - "main": "app.js", + "main": "app.mjs", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, @@ -18,7 +18,8 @@ "author": "", "license": "ISC", "dependencies": { - "express": "^4.22.1", - "pg": "^8.13.0" + "express": "^5.2.1", + "pg": "^8.20.0", + "prom-client": "^15.1.3" } } diff --git a/metrics-examples/node/utils/db.mjs b/metrics-examples/node/utils/db.mjs new file mode 100644 index 000000000..f33008349 --- /dev/null +++ b/metrics-examples/node/utils/db.mjs @@ -0,0 +1,56 @@ +import { Client } from "pg"; + +const pgServiceCredentials = process.env.DATABASES_FOR_POSTGRESQL_CONNECTION; +const pgTimeoutMs = 15000; // timeout in 15 seconds +let _dbClient = null; + +export async function getDbClient() { + if (!pgServiceCredentials) { + return undefined; + } + + if (_dbClient) { + return _dbClient; + } + + // Use env variables loaded from service binding to connect to our postgres instance + console.log("Connecting to PostgreSQL instance..."); + postgresSetup = JSON.parse(pgServiceCredentials); + cli = postgresSetup.cli; + postgres = postgresSetup.postgres; + cert = Buffer.from(postgres.certificate.certificate_base64, "base64").toString("utf8"); + + // Define the client + const client = new Client({ + user: postgres.authentication.username, + password: 
cli.environment.PGPASSWORD, + host: postgres.hosts[0].hostname, + database: postgres.database, + port: postgres.hosts[0].port, + statement_timeout: pgTimeoutMs, + query_timeout: pgTimeoutMs, + lock_timeout: pgTimeoutMs, + application_name: "network-test-app", + connectionTimeoutMillis: pgTimeoutMs, + ssl: { + ca: cert, + rejectUnauthorized: true, + }, + }); + + // Initiate the connection + _dbClient = await client.connect(); + + return _dbClient; +} + +export async function closeDbClient() { + try { + if (_dbClient) { + await _dbClient.end(); + console.log("DB connection closed."); + } + } catch (e) { + console.error("Failed to close DB connection."); + } +} diff --git a/metrics-examples/python/Dockerfile b/metrics-examples/python/Dockerfile new file mode 100644 index 000000000..396e292e0 --- /dev/null +++ b/metrics-examples/python/Dockerfile @@ -0,0 +1,24 @@ +# Use Python slim image +FROM python:3.12-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Set environment +ENV PYTHONUNBUFFERED=1 + +# Create non-root user +RUN useradd -m -u 1001 appuser && chown -R appuser:appuser /app +USER appuser + +# Expose ports +EXPOSE 8080 2112 + +# Run the application with uvicorn +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/metrics-examples/python/README.md b/metrics-examples/python/README.md new file mode 100644 index 000000000..e832e6294 --- /dev/null +++ b/metrics-examples/python/README.md @@ -0,0 +1,221 @@ +# Code Engine custom metrics examples for Python + +This application helps debug connectivity issues for IBM Cloud Services and provides comprehensive monitoring through Prometheus metrics. It includes outbound HTTP call simulation, database connectivity testing, and compute-intensive workload simulation. 
+ +## Features + +- **Outbound HTTP Calls**: Configurable endpoints that simulate delays and error responses to httpbin.org-compatible backends +- **Database Testing**: PostgreSQL connectivity verification with instrumented queries +- **Prometheus Metrics**: Comprehensive instrumentation for requests, outbound calls, database operations, and compute workloads +- **Load Testing**: Included shell script for generating realistic traffic patterns +- **Async/Await**: Non-blocking I/O for high performance +- **FastAPI**: Modern, fast web framework with automatic OpenAPI documentation + +## Quick Start + +### Deploy to IBM Cloud Code Engine + +Deploy the application with recommended resource settings: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-python \ + --src "." \ + --memory 0.5G \ + --cpu 0.25 \ + --port 8080 +``` + +To configure environment variables during deployment: + +```bash +ibmcloud ce application create \ + --name metrics-example-app-python \ + --src "." 
\ + --memory 0.5G \ + --cpu 0.25 \ + --env HTTPBIN_BASE_URL=https://httpbin.org \ + --env METRICS_NAME_PREFIX=mymetrics_ +``` + +Update an existing application: + +```bash +ibmcloud ce application update \ + --name metrics-example-app-python \ + --env HTTPBIN_BASE_URL=https://custom-backend.example.com +``` + +### Run Locally + +Pull and run with Docker: +```bash +docker pull icr.io/codeengine/metrics-example-app-python +docker run -p 8080:8080 -p 2112:2112 icr.io/codeengine/metrics-example-app-python +``` + +Or run from source: +```bash +pip install -r requirements.txt +python app.py +``` + +Or with uvicorn directly: +```bash +pip install -r requirements.txt +uvicorn app:app --host 0.0.0.0 --port 8080 +``` + +The application exposes: +- Main application: `http://localhost:8080` +- Metrics endpoint: `http://localhost:8080/metrics` +- Interactive API docs: `http://localhost:8080/docs` +- Alternative API docs: `http://localhost:8080/redoc` + +## Configuration + +### Environment Variables + +- `PORT`: Application server port (default: 8080) +- `HTTPBIN_BASE_URL`: Backend URL for outbound calls (default: `https://httpbin.org`) +- `METRICS_NAME_PREFIX`: Prefix for all Prometheus metrics (default: `mymetrics_`) +- `DATABASES_FOR_POSTGRESQL_CONNECTION`: PostgreSQL connection credentials (JSON format) + +### Service Bindings + +For database connectivity, create a Code Engine service binding between your project and the IBM Cloud service. See [Working with service bindings](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding) for details. 
+ +## API Endpoints + +- `GET /` - Health check +- `GET /test-db` - Test PostgreSQL connectivity +- `GET /outbound/delay` - Outbound call with random delay (0-2s) and 5% error rate +- `GET /outbound/get` - Simple outbound GET request +- `POST /outbound/post` - Outbound POST request +- `GET /outbound/status/{code}` - Request specific HTTP status code +- `GET /metrics` - Prometheus metrics endpoint +- `GET /docs` - Interactive API documentation (Swagger UI) +- `GET /redoc` - Alternative API documentation (ReDoc) + +All outbound endpoints include simulated compute-intensive data processing (0-3s duration, 40-80% CPU intensity). + +## Metrics + +The application exposes Prometheus metrics at `/metrics`. All metric names are prefixed with a configurable value set via the `METRICS_NAME_PREFIX` environment variable (default: `mymetrics_`). + +**Request Metrics** +- `mymetrics_requests_total`: Total requests by method and path + +**Outbound Call Metrics** +- `mymetrics_outbound_request_duration_seconds`: Histogram of outbound request durations +- `mymetrics_outbound_requests_total`: Total outbound requests by target, method, and status + +**Database Metrics** +- `mymetrics_db_query_duration_seconds`: Histogram of query durations by operation and table +- `mymetrics_db_queries_total`: Total queries by operation, table, and status +- `mymetrics_db_connections_active`: Active database connections gauge + +**Compute Metrics** +- `mymetrics_compute_duration_seconds`: Histogram of compute operation durations + +**Python Runtime Metrics** (automatically collected by prometheus-client) +- Process CPU, memory, and other system metrics + +## Development + +### Prerequisites + +- Python 3.12 or later +- pip or poetry for package management +- Docker (for containerized builds) + +### Building + +```bash +# Install dependencies +pip install -r requirements.txt + +# Run the application +python app.py + +# Or with uvicorn for development (with auto-reload) +uvicorn app:app --reload --host 
0.0.0.0 --port 8080 + +# Build Docker image +docker build -t metrics-example-app-python . + +# Run tests (if you add them) +pytest +``` + +### Project Structure + +``` +python/ +├── app.py # FastAPI application entry point +├── requirements.txt # Python dependencies +├── Dockerfile # Single-stage Docker build +├── README.md # This file +└── utils/ + ├── __init__.py # Package initialization + ├── metrics.py # Prometheus metrics definitions + ├── db.py # PostgreSQL connection handling + └── compute.py # Compute simulation utilities +``` + +## Performance Characteristics + +- **Startup Time**: ~2-3 seconds +- **Memory Footprint**: ~100-150 MB +- **Image Size**: ~100-150 MB (python:3.12-slim base) +- **Concurrency**: Async/await based, handles many concurrent requests efficiently +- **CPU Efficiency**: Good performance with async I/O + +## Troubleshooting + +### Database Connection Issues + +If you encounter database connection errors: + +1. Verify the `DATABASES_FOR_POSTGRESQL_CONNECTION` environment variable is set correctly +2. Check that the service binding is properly configured +3. Ensure the certificate is valid and properly base64 encoded +4. Verify network connectivity to the PostgreSQL instance +5. 
Check asyncpg is properly installed: `pip install asyncpg` + +### Import Errors + +If you see import errors: + +```bash +pip install -r requirements.txt --upgrade +``` + +### SSL/TLS Issues + +For SSL certificate issues with PostgreSQL: + +```bash +# Ensure you have the latest version of asyncpg +pip install --upgrade asyncpg +``` + +### Performance Issues + +For better performance in production: + +```bash +# Use gunicorn with uvicorn workers +pip install gunicorn +gunicorn app:app -w 4 -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:8080 +``` + +## FastAPI Features + +This implementation uses FastAPI, which provides: + +- **Automatic API Documentation**: Visit `/docs` for Swagger UI or `/redoc` for ReDoc +- **Type Validation**: Automatic request/response validation using Pydantic +- **Async Support**: Native async/await for non-blocking I/O +- **High Performance**: One of the fastest Python frameworks available +- **Standards-based**: Based on OpenAPI and JSON Schema diff --git a/metrics-examples/python/app.py b/metrics-examples/python/app.py new file mode 100644 index 000000000..146bbdb7e --- /dev/null +++ b/metrics-examples/python/app.py @@ -0,0 +1,324 @@ +"""FastAPI application with Prometheus metrics for Code Engine.""" + +import os +import time +import random +import logging +from contextlib import asynccontextmanager +from typing import Dict, Any + +from fastapi import FastAPI, Response, HTTPException +from fastapi.responses import PlainTextResponse +import httpx +from prometheus_client import generate_latest, CONTENT_TYPE_LATEST + +from utils.db import get_db_pool, close_db_pool +from utils.compute import simulate_compute +from utils import metrics + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Configuration +HTTPBIN_BASE_URL = os.getenv("HTTPBIN_BASE_URL", "https://httpbin.org") +PORT = int(os.getenv("PORT", "8080")) + + 
+@asynccontextmanager +async def lifespan(app: FastAPI): + """Handle application startup and shutdown.""" + logger.info(f"Application server starting on port {PORT}") + logger.info(f"Configured httpbin backend: {HTTPBIN_BASE_URL}") + logger.info("Metrics server running on port 2112") + yield + # Cleanup + await close_db_pool() + logger.info("Application shutdown complete") + + +# Create FastAPI app +app = FastAPI( + title="Metrics Example App", + description="Code Engine custom metrics example for Python", + version="1.0.0", + lifespan=lifespan +) + + +# Middleware to track requests +@app.middleware("http") +async def metrics_middleware(request, call_next): + """Record request metrics.""" + metrics.requests_total.labels( + method=request.method, + path=request.url.path + ).inc() + response = await call_next(request) + return response + + +async def make_outbound_call(endpoint: str, method: str = "GET") -> Dict[str, Any]: + """Make an outbound HTTP call and record metrics.""" + url = f"{HTTPBIN_BASE_URL}{endpoint}" + start_time = time.time() + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + if method == "GET": + response = await client.get(url) + elif method == "POST": + response = await client.post(url) + else: + response = await client.request(method, url) + + duration = time.time() - start_time + status_code = str(response.status_code) + + # Record metrics + metrics.outbound_request_duration.labels( + target=HTTPBIN_BASE_URL, + method=method, + status_code=status_code + ).observe(duration) + + metrics.outbound_requests_total.labels( + target=HTTPBIN_BASE_URL, + method=method, + status_code=status_code + ).inc() + + return { + "success": True, + "status": response.status_code, + "duration": duration, + "data": response.text + } + + except Exception as e: + duration = time.time() - start_time + + # Record error metrics + metrics.outbound_request_duration.labels( + target=HTTPBIN_BASE_URL, + method=method, + status_code="error" + ).observe(duration) 
+ + metrics.outbound_requests_total.labels( + target=HTTPBIN_BASE_URL, + method=method, + status_code="error" + ).inc() + + return { + "success": False, + "error": str(e), + "duration": duration + } + + +@app.get("/", response_class=PlainTextResponse) +async def health_check(): + """Health check endpoint.""" + app_name = os.getenv("CE_APP", "metrics-example-app") + return f"app '{app_name}' is ready!" + + +@app.get("/test-db") +async def test_db(): + """Test database connectivity.""" + pool = await get_db_pool() + if pool is None: + raise HTTPException( + status_code=500, + detail="Could not connect to postgres instance: no postgres instance configured" + ) + + try: + metrics.db_connections_active.inc() + + # Execute query with metrics + start_time = time.time() + status = "success" + + query = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE'" + + try: + async with pool.acquire() as conn: + result = await conn.fetch(query) + + duration = time.time() - start_time + + metrics.db_query_duration.labels( + operation="SELECT", + table="INFORMATION_SCHEMA.TABLES", + status=status + ).observe(duration) + + metrics.db_queries_total.labels( + operation="SELECT", + table="INFORMATION_SCHEMA.TABLES", + status=status + ).inc() + + logger.info(f"Successfully queried database in {duration:.3f}s") + return {"message": "Successfully connected to postgres instance"} + + except Exception as e: + status = "error" + duration = time.time() - start_time + + metrics.db_query_duration.labels( + operation="SELECT", + table="INFORMATION_SCHEMA.TABLES", + status=status + ).observe(duration) + + metrics.db_queries_total.labels( + operation="SELECT", + table="INFORMATION_SCHEMA.TABLES", + status=status + ).inc() + + raise HTTPException( + status_code=500, + detail=f"Could not connect to postgres instance: '{str(e)}'" + ) + finally: + metrics.db_connections_active.dec() + + +@app.get("/outbound/delay") +async def outbound_delay(): + """Outbound call with random 
delay and error rate.""" + # Random delay between 0-2 seconds + delay = random.random() * 2 + + # 5% error rate + should_error = random.random() < 0.05 + + if should_error: + result = await make_outbound_call("/status/500", "GET") + else: + result = await make_outbound_call(f"/delay/{delay:.1f}", "GET") + + # Simulate compute-intensive data handling + compute_start = time.time() + compute_duration = random.random() * 3 # 0-3 seconds + cpu_intensity = 40 + random.random() * 40 # 40-80% + simulate_compute(compute_duration, cpu_intensity) + actual_compute_duration = time.time() - compute_start + + metrics.compute_duration.labels(operation="data_processing").observe(actual_compute_duration) + + response = { + "message": "Simulated error response" if should_error else "Outbound call completed", + "delay": delay, + "outboundCall": result, + "computeTime": actual_compute_duration, + "cpuIntensity": f"{cpu_intensity:.1f}%" + } + + if should_error: + return Response( + content=str(response), + status_code=500, + media_type="application/json" + ) + + return response + + +@app.get("/outbound/get") +async def outbound_get(): + """Simple outbound GET request.""" + result = await make_outbound_call("/get", "GET") + + # Simulate compute-intensive data handling + compute_start = time.time() + compute_duration = random.random() * 3 + cpu_intensity = 40 + random.random() * 40 + simulate_compute(compute_duration, cpu_intensity) + actual_compute_duration = time.time() - compute_start + + metrics.compute_duration.labels(operation="data_processing").observe(actual_compute_duration) + + return { + "message": "Outbound GET call completed", + "outboundCall": result, + "computeTime": actual_compute_duration, + "cpuIntensity": f"{cpu_intensity:.1f}%" + } + + +@app.post("/outbound/post") +async def outbound_post(): + """Outbound POST request.""" + result = await make_outbound_call("/post", "POST") + + # Simulate compute-intensive data handling + compute_start = time.time() + compute_duration 
= random.random() * 3 + cpu_intensity = 40 + random.random() * 40 + simulate_compute(compute_duration, cpu_intensity) + actual_compute_duration = time.time() - compute_start + + metrics.compute_duration.labels(operation="data_processing").observe(actual_compute_duration) + + return { + "message": "Outbound POST call completed", + "outboundCall": result, + "computeTime": actual_compute_duration, + "cpuIntensity": f"{cpu_intensity:.1f}%" + } + + +@app.get("/outbound/status/{code}") +async def outbound_status(code: int): + """Request specific HTTP status code.""" + result = await make_outbound_call(f"/status/{code}", "GET") + + # Simulate compute-intensive data handling + compute_start = time.time() + compute_duration = random.random() * 3 + cpu_intensity = 40 + random.random() * 40 + simulate_compute(compute_duration, cpu_intensity) + actual_compute_duration = time.time() - compute_start + + metrics.compute_duration.labels(operation="data_processing").observe(actual_compute_duration) + + return { + "message": "Outbound call completed", + "requestedStatus": code, + "outboundCall": result, + "computeTime": actual_compute_duration, + "cpuIntensity": f"{cpu_intensity:.1f}%" + } + + +@app.get("/metrics") +async def get_metrics(): + """Prometheus metrics endpoint.""" + return Response( + content=generate_latest(), + media_type=CONTENT_TYPE_LATEST + ) + + +if __name__ == "__main__": + import uvicorn + + # Run both app server and metrics server + # In production, use separate processes or the Dockerfile approach + uvicorn.run( + app, + host="0.0.0.0", + port=PORT, + log_level="info" + ) + +# Made with Bob diff --git a/metrics-examples/python/requirements.txt b/metrics-examples/python/requirements.txt new file mode 100644 index 000000000..6b82df4e0 --- /dev/null +++ b/metrics-examples/python/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.109.2 +uvicorn[standard]==0.27.1 +httpx==0.26.0 +asyncpg==0.29.0 +prometheus-client==0.19.0 +python-multipart==0.0.9 diff --git 
a/metrics-examples/python/utils/__init__.py b/metrics-examples/python/utils/__init__.py new file mode 100644 index 000000000..673b426b7 --- /dev/null +++ b/metrics-examples/python/utils/__init__.py @@ -0,0 +1,3 @@ +# Utils package for metrics example application + +# Made with Bob diff --git a/metrics-examples/python/utils/compute.py b/metrics-examples/python/utils/compute.py new file mode 100644 index 000000000..4c4a6b527 --- /dev/null +++ b/metrics-examples/python/utils/compute.py @@ -0,0 +1,29 @@ +"""Compute-intensive operation simulation.""" + +import time +import math +import random + + +def simulate_compute(duration_seconds: float, cpu_intensity: float) -> None: + """ + Simulate CPU-intensive work for the specified duration. + + Args: + duration_seconds: How long to run the computation + cpu_intensity: CPU usage intensity (0-100) + """ + start_time = time.time() + end_time = start_time + duration_seconds + + while time.time() < end_time: + # Perform CPU work + work_iterations = int(cpu_intensity * 1000) + for _ in range(work_iterations): + _ = math.sqrt(random.random() * 1000000) + + # Small sleep to control CPU usage + sleep_time = (100 - cpu_intensity) / 10000 # Convert to seconds + time.sleep(sleep_time) + +# Made with Bob diff --git a/metrics-examples/python/utils/db.py b/metrics-examples/python/utils/db.py new file mode 100644 index 000000000..af187d9ec --- /dev/null +++ b/metrics-examples/python/utils/db.py @@ -0,0 +1,85 @@ +"""PostgreSQL database connection handling.""" + +import os +import json +import base64 +import logging +from typing import Optional +import asyncpg + +logger = logging.getLogger(__name__) + +_db_pool: Optional[asyncpg.Pool] = None + + +async def get_db_pool() -> Optional[asyncpg.Pool]: + """Get or create a PostgreSQL connection pool.""" + global _db_pool + + if _db_pool is not None: + return _db_pool + + pg_credentials = os.getenv("DATABASES_FOR_POSTGRESQL_CONNECTION") + if not pg_credentials: + return None + + 
logger.info("Connecting to PostgreSQL instance...") + + try: + # Parse credentials + creds = json.loads(pg_credentials) + + # Extract connection details + username = creds["postgres"]["authentication"]["username"] + password = creds["cli"]["environment"]["PGPASSWORD"] + hostname = creds["postgres"]["hosts"][0]["hostname"] + port = creds["postgres"]["hosts"][0]["port"] + database = creds["postgres"]["database"] + + # Decode certificate + cert_base64 = creds["postgres"]["certificate"]["certificate_base64"] + cert = base64.b64decode(cert_base64).decode("utf-8") + + # Create connection pool with SSL + _db_pool = await asyncpg.create_pool( + user=username, + password=password, + host=hostname, + port=port, + database=database, + ssl="require", + server_settings={ + "application_name": "metrics-example-app" + }, + min_size=2, + max_size=10, + command_timeout=15, + timeout=15 + ) + + logger.info("Successfully connected to PostgreSQL") + return _db_pool + + except Exception as e: + logger.error(f"Failed to connect to PostgreSQL: {e}") + raise + + +async def execute_query(query: str): + """Execute a query and return results.""" + if _db_pool is None: + raise RuntimeError("Database pool not initialized") + + async with _db_pool.acquire() as conn: + return await conn.fetch(query) + + +async def close_db_pool(): + """Close the database connection pool.""" + global _db_pool + if _db_pool is not None: + await _db_pool.close() + _db_pool = None + logger.info("DB connection closed") + +# Made with Bob diff --git a/metrics-examples/python/utils/metrics.py b/metrics-examples/python/utils/metrics.py new file mode 100644 index 000000000..4fb947135 --- /dev/null +++ b/metrics-examples/python/utils/metrics.py @@ -0,0 +1,64 @@ +"""Prometheus metrics definitions for the application.""" + +import os +from prometheus_client import Counter, Histogram, Gauge, CollectorRegistry, REGISTRY + +# Get metrics prefix from environment +METRICS_PREFIX = os.getenv("METRICS_NAME_PREFIX", "mymetrics_") + 
+# Request metrics +requests_total = Counter( + f"{METRICS_PREFIX}requests_total", + "Total number of requests", + ["method", "path"], + registry=REGISTRY +) + +# Outbound HTTP call metrics +outbound_request_duration = Histogram( + f"{METRICS_PREFIX}outbound_request_duration_seconds", + "Duration of outbound HTTP requests in seconds", + ["target", "method", "status_code"], + buckets=[0.1, 0.5, 1, 2, 5, 10], + registry=REGISTRY +) + +outbound_requests_total = Counter( + f"{METRICS_PREFIX}outbound_requests_total", + "Total number of outbound HTTP requests", + ["target", "method", "status_code"], + registry=REGISTRY +) + +# Database operation metrics +db_query_duration = Histogram( + f"{METRICS_PREFIX}db_query_duration_seconds", + "Duration of database queries in seconds", + ["operation", "table", "status"], + buckets=[0.01, 0.05, 0.1, 0.5, 1, 2, 5], + registry=REGISTRY +) + +db_queries_total = Counter( + f"{METRICS_PREFIX}db_queries_total", + "Total number of database queries", + ["operation", "table", "status"], + registry=REGISTRY +) + +db_connections_active = Gauge( + f"{METRICS_PREFIX}db_connections_active", + "Number of active database connections", + registry=REGISTRY +) + +# Compute operation metrics +compute_duration = Histogram( + f"{METRICS_PREFIX}compute_duration_seconds", + "Duration of compute-intensive operations in seconds", + ["operation"], + buckets=[0.5, 1, 2, 3, 5], + registry=REGISTRY +) + +# Made with Bob diff --git a/metrics-examples/run b/metrics-examples/run new file mode 100755 index 000000000..70f057754 --- /dev/null +++ b/metrics-examples/run @@ -0,0 +1,56 @@ +#!/bin/bash +set -eo pipefail + +PREFIX="${PREFIX:=metrics-example}" +LANGUAGE="${LANGUAGE:=all}" + +# Create the Code Engine project if it does not exist, yet +if ! 
ibmcloud ce project get --name "$PREFIX" >/dev/null 2>&1;then + ibmcloud ce project create --name "$PREFIX" +fi + +# Select the Code Engine project +ibmcloud ce project select --name "$PREFIX" --kubecfg + +languages=( + go + java + node + python +) + +for i in "${languages[@]}"; do + + if [[ "$LANGUAGE" == "all" || "$LANGUAGE" == "$i" ]];then + echo "Deploying Code Engine app for $i ..." + app_name="${PREFIX}-app-$i" + + create_or_update=update + if ! ibmcloud ce app get --name $app_name >/dev/null 2>&1; then + echo -e "\nCreating the app '$app_name' ..." + create_or_update=create + else + echo -e "\nUpdating the app '$app_name' ..." + fi + + # Create or update the app + ibmcloud ce application $create_or_update \ + --name $app_name \ + --context-dir "$i/" \ + --src "." \ + --memory 0.5G \ + --cpu 0.25 \ + --env HTTPBIN_BASE_URL=https://httpbin.2690nz3tkq6c.eu-es.codeengine.appdomain.cloud + + # Annotate the app + kubectl patch kservice "$app_name" --type=json -p='[ + {"op":"add","path":"/spec/template/metadata/annotations/codeengine.cloud.ibm.com~1userMetricsScrape","value":"true"}, + {"op":"add","path":"/spec/template/metadata/annotations/codeengine.cloud.ibm.com~1userMetricsPath","value":"/metrics"}, + {"op":"add","path":"/spec/template/metadata/annotations/codeengine.cloud.ibm.com~1userMetricsPort","value":"2112"} + ]' + else + continue; + fi +done + +echo "Done" diff --git a/network-test-app/Dockerfile b/network-test-app/Dockerfile deleted file mode 100644 index de42fb623..000000000 --- a/network-test-app/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM icr.io/codeengine/node:22-alpine -RUN apk -U upgrade - -WORKDIR /network-test-app - -COPY *.js *.json /network-test-app/ - -RUN npm install - -ENTRYPOINT [ "node", "app.js" ] \ No newline at end of file diff --git a/network-test-app/README.md b/network-test-app/README.md deleted file mode 100644 index d1a421394..000000000 --- a/network-test-app/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Network Connectivity 
Test App - -This sample is intended to help users debug connectivity issues for IBM Cloud Services. You can use this app to help isolate network connection issues between your own code and a working app. - -- - - - -This sample includes a `build` script which will build the container image and push the image to `icr.io/codeengine/network-test-app`. The customer should: -- Pull the image `icr.io/codeengine/network-test-app` -- Deploy the image as an application -- Make an HTTP request to your application, and observe the response - -## Configuring the Service Credentials for the App - -This app works by attempting to connect your Code Engine project to another IBM Cloud service; in order to do this properly, it must consume service credentials that should be configured by creating a `service binding` between the customer's project and the service they wish to connect to. - -For more information about how to create a service binding, see [Working with service bindings to integrate IBM Cloud services with Code Engine](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding). - -### Example: Databases for PostgreSQL -If the app is attempting to connect to a postgres instance, then after creating a service binding for the instance the app will contain the credentials for the postgres instance in the form of an environment variable `DATABASES_FOR_POSTGRESQL_CONNECTION`. 
-- **Without this environment variable properly configured, the app will NOT be able to connect to postgres** \ No newline at end of file diff --git a/network-test-app/app.js b/network-test-app/app.js deleted file mode 100644 index b50427e90..000000000 --- a/network-test-app/app.js +++ /dev/null @@ -1,71 +0,0 @@ -const { Client } = require("pg"); -const express = require("express"); -const app = express() -const timeoutMs = 15000 // timeout in 15 seconds -const port = process.env.PORT; - -app.get("/", async (request, response) => { - pgServiceCredentials = process.env.DATABASES_FOR_POSTGRESQL_CONNECTION - if(!!pgServiceCredentials){ - /* - Postgres service credentials have been configured properly, - continue with attempting to connect to service - */ - try { - // Use env variables loaded from service binding to connect to our postgres instance - console.log("Connecting to PostgreSQL instance..."); - - postgresSetup = JSON.parse(pgServiceCredentials); - cli = postgresSetup.cli; - postgres = postgresSetup.postgres; - cert = Buffer.from(postgres.certificate.certificate_base64, 'base64').toString('utf8'); - - const client = new Client({ - user: postgres.authentication.username, - password: cli.environment.PGPASSWORD, - host: postgres.hosts[0].hostname, - database: postgres.database, - port: postgres.hosts[0].port, - statement_timeout: timeoutMs, - query_timeout: timeoutMs, - lock_timeout: timeoutMs, - application_name: "network-test-app", - connectionTimeoutMillis: timeoutMs, - ssl: { - ca: cert, - rejectUnauthorized: true, - }, - }); - await client.connect(); - - // Run a simple command to verify that we connected to the postgres instance - console.log("List tables"); - result = await client.query("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE';"); - console.log(result) - await client.end() - response.status(200).send("Successfully connected to postgres instance"); - } catch (err) { - console.error("Failed to connect to PostgreSQL 
instance", err); - response.status(500).send("Could not connect to postgres instance:", err); - } - } else { - response.status(500).send("Could not connect to postgres instance: no postgres instance configured"); - } - - -}) - -const server = app.listen(port, async () => { - console.log('listening on localhost', port) -}) - -process.on('SIGTERM', () => { - console.info('SIGTERM signal received.'); - server.close(() => { - console.log('Http server closed.'); - }); -}); - - - - diff --git a/network-test-app/build b/network-test-app/build deleted file mode 100755 index d14931bc4..000000000 --- a/network-test-app/build +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# Env Vars: -# REGISTRY: name of the image registry/namespace to store the images -# NOCACHE: set this to "--no-cache" to turn off the Docker build cache -# -# NOTE: to run this you MUST set the REGISTRY environment variable to -# your own image registry/namespace otherwise the `docker push` commands -# will fail due to an auth failure. Which means, you also need to be logged -# into that registry before you run it. - -set -ex -export REGISTRY=${REGISTRY:-icr.io/codeengine} - -# First build the app's image and push it -docker build ${NOCACHE} -t ${REGISTRY}/network-test-app -f Dockerfile . --platform linux/amd64 -docker push ${REGISTRY}/network-test-app \ No newline at end of file diff --git a/private-path-to-vpc-vsi/ce-app/Dockerfile b/private-path-to-vpc-vsi/ce-app/Dockerfile index 7ae1e0829..93565d9e5 100644 --- a/private-path-to-vpc-vsi/ce-app/Dockerfile +++ b/private-path-to-vpc-vsi/ce-app/Dockerfile @@ -1,10 +1,10 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +FROM quay.io/projectquay/golang:1.25 AS build-env WORKDIR /go/src/app COPY . . RUN CGO_ENABLED=0 go build -o /go/bin/app . 
# Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian13 COPY --from=build-env /go/bin/app / ENTRYPOINT ["/app"] diff --git a/private-path-to-vpc-vsi/ce-app/go.mod b/private-path-to-vpc-vsi/ce-app/go.mod index bff2499af..f65b725d1 100644 --- a/private-path-to-vpc-vsi/ce-app/go.mod +++ b/private-path-to-vpc-vsi/ce-app/go.mod @@ -1,5 +1,5 @@ module github.com/IBM/CodeEngine/ce-private-path -go 1.23.0 +go 1.25 require github.com/lib/pq v1.10.9 diff --git a/satellite-connector-to-vpc-vsi/ce-app/Dockerfile b/satellite-connector-to-vpc-vsi/ce-app/Dockerfile index 7ae1e0829..93565d9e5 100644 --- a/satellite-connector-to-vpc-vsi/ce-app/Dockerfile +++ b/satellite-connector-to-vpc-vsi/ce-app/Dockerfile @@ -1,10 +1,10 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +FROM quay.io/projectquay/golang:1.25 AS build-env WORKDIR /go/src/app COPY . . RUN CGO_ENABLED=0 go build -o /go/bin/app . # Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian13 COPY --from=build-env /go/bin/app / ENTRYPOINT ["/app"] diff --git a/satellite-connector-to-vpc-vsi/ce-app/go.mod b/satellite-connector-to-vpc-vsi/ce-app/go.mod index 06d27fb97..4511f6fe9 100644 --- a/satellite-connector-to-vpc-vsi/ce-app/go.mod +++ b/satellite-connector-to-vpc-vsi/ce-app/go.mod @@ -1,5 +1,5 @@ module github.com/IBM/CodeEngine/ce-satellite-connector -go 1.21.0 +go 1.25 require github.com/lib/pq v1.10.9 diff --git a/trusted-profiles/node/package-lock.json b/trusted-profiles/node/package-lock.json index f90b8d095..7d07c42dd 100644 --- a/trusted-profiles/node/package-lock.json +++ b/trusted-profiles/node/package-lock.json @@ -72,6 +72,7 @@ "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.5.tgz", "integrity": "sha512-cz4ur7Vb0xS4/KUN0tPWe44eqxrIu31me+fbang3ijiNscE129POzipJJA6zniq2C/Z6sJCjMimjS8Lc/GAs8Q==", "license": "MIT", + "peer": true, "dependencies": { "follow-redirects": 
"^1.15.11", "form-data": "^4.0.5",