MetricsEx

MetricsEx

CI Status | Hex.pm | Documentation | Elixir | License

Metrics aggregation with multiple exporters and intelligent alerting


Purpose

MetricsEx provides comprehensive metrics collection, aggregation, and querying for:

Features

Installation

Add metrics_ex to your list of dependencies in mix.exs:

def deps do
  [
    {:metrics_ex, "~> 0.1.0"}
  ]
end

Configuration

# config/config.exs
config :metrics_ex,
  retention_hours: 24,
  pubsub: MyApp.PubSub  # Optional: for real-time streaming

Quick Start

Recording Metrics

# Record experiment results
MetricsEx.record(:experiment_result, %{
  experiment_id: "exp_123",
  metric: :entailment_score,
  value: 0.75,
  tags: %{model: "llama-3.1", dataset: "scifact"}
})

# Increment counters
MetricsEx.increment(:jobs_completed, tags: %{tenant: "cns"})
MetricsEx.increment(:requests_total, 5, tags: %{endpoint: "/api"})

# Record gauge values (point-in-time)
MetricsEx.gauge(:queue_depth, 42, tags: %{queue: "sno_validation"})

# Record histogram values (distributions)
MetricsEx.histogram(:response_time, 123.45, tags: %{endpoint: "/api"})

# Measure execution time
result = MetricsEx.measure(:database_query, fn ->
  # expensive operation
  MyApp.run_query()
end, tags: %{query: "SELECT"})

Querying Metrics

# Get raw metrics
MetricsEx.get_metrics(name: :jobs_completed, limit: 100)
# => %{metrics: [...], count: 100, timestamp: "2025-12-06T..."}

# Aggregate with grouping
MetricsEx.query(:experiment_result,
  group_by: [:model],
  aggregation: :mean,
  window: :last_24h
)
# => [
#   %{model: "llama-3.1", mean: 0.72},
#   %{model: "qwen", mean: 0.68}
# ]

# Time series data
MetricsEx.time_series(:jobs_completed,
  interval: :hour,
  aggregation: :count,
  window: :last_24h
)
# => [
#   %{timestamp: ~U[2025-12-06 00:00:00Z], count: 45},
#   %{timestamp: ~U[2025-12-06 01:00:00Z], count: 52},
#   ...
# ]

# Pre-computed rollups for dashboards
MetricsEx.rollup(:experiment_result,
  group_by: [:model, :dataset],
  aggregations: [:mean, :count, :p95],
  window: :last_24h
)
# => %{
#   "llama-3.1/scifact" => %{mean: 0.72, count: 150, p95: 0.89},
#   "qwen/fever" => %{mean: 0.68, count: 200, p95: 0.85}
# }

Telemetry Integration

# Attach to telemetry events
MetricsEx.attach_telemetry([
  {[:work, :job, :completed], :counter},
  {[:work, :job, :duration], :histogram},
  {[:crucible, :experiment, :completed], :histogram},
  {[:queue, :depth], :gauge}
])

# Now telemetry events are automatically recorded
:telemetry.execute([:work, :job, :completed], %{count: 1}, %{tenant: "cns"})

Prometheus Export

# Export all metrics in Prometheus format
prometheus_text = MetricsEx.Storage.Prometheus.export()

# Export specific metric
prometheus_text = MetricsEx.Storage.Prometheus.export(:jobs_completed)

# Use in Phoenix controller
defmodule MyAppWeb.MetricsController do
  use MyAppWeb, :controller

  def prometheus(conn, _params) do
    metrics = MetricsEx.Storage.Prometheus.export()

    conn
    |> put_resp_content_type(MetricsEx.Storage.Prometheus.content_type())
    |> send_resp(200, metrics)
  end
end

Architecture

Supervision Tree

MetricsEx.Supervisor
├── Phoenix.PubSub (optional)
├── MetricsEx.Storage.ETS (storage backend)
└── MetricsEx.Recorder (recording coordinator)

Components

1. Metric Types (MetricsEx.Metric)

Defines three core metric types: counter, gauge, and histogram.

2. Storage Backend (MetricsEx.Storage.ETS)

3. Recorder (MetricsEx.Recorder)

4. Aggregator (MetricsEx.Aggregator)

5. Telemetry Handler (MetricsEx.TelemetryHandler)

6. API (MetricsEx.API)

Aggregation Functions

| Function | Description | Example |
|----------|-------------|---------|
| :count | Number of metrics | count: 150 |
| :sum | Total sum of values | sum: 1234 |
| :mean | Average value | mean: 0.72 |
| :min | Minimum value | min: 0.45 |
| :max | Maximum value | max: 0.95 |
| :p50 | 50th percentile (median) | p50: 0.70 |
| :p95 | 95th percentile | p95: 0.89 |
| :p99 | 99th percentile | p99: 0.93 |

Time Windows

| Window | Description |
|--------|-------------|
| :last_hour | Last 60 minutes |
| :last_24h | Last 24 hours |
| :last_7d | Last 7 days |
| :last_30d | Last 30 days |

Time Intervals

| Interval | Description |
|----------|-------------|
| :minute | 1-minute buckets |
| :hour | 1-hour buckets |
| :day | 1-day buckets |

System Statistics

MetricsEx.get_stats()
# => %{
#   storage: %{
#     total_metrics: 12345,
#     metrics_stored: 12345,
#     metrics_pruned: 567,
#     retention_hours: 24,
#     memory_bytes: 1048576
#   },
#   recorder: %{
#     metrics_recorded: 12345
#   },
#   timestamp: "2025-12-06T12:00:00Z"
# }

Integration Examples

With Crucible Experiments

# Record experiment metrics
MetricsEx.record(:crucible_experiment, %{
  value: entailment_score,
  tags: %{
    experiment_id: experiment.id,
    model: "llama-3.1",
    dataset: "scifact",
    stage: "validation"
  }
})

# Query experiment results
MetricsEx.query(:crucible_experiment,
  group_by: [:model, :dataset],
  aggregation: :mean,
  window: :last_7d
)

With CNS Agents

# Record agent performance
MetricsEx.record(:cns_agent_metric, %{
  value: beta1_score,
  tags: %{
    agent: "antagonist",
    sno_id: sno.id,
    iteration: 3
  }
})

# Track agent iterations
MetricsEx.time_series(:cns_agent_metric,
  interval: :hour,
  aggregation: :mean,
  tags: %{agent: "synthesizer"},
  window: :last_24h
)

With Work Job System

# Auto-record via telemetry
:telemetry.execute(
  [:work, :job, :completed],
  %{duration: job_duration_ms},
  %{tenant: tenant, queue: queue_name}
)

# Monitor job throughput
MetricsEx.rollup(:work_job_completed,
  group_by: [:tenant, :queue],
  aggregations: [:count, :mean, :p95],
  window: :last_hour
)

Testing

# Run all tests
mix test

# Run with coverage
mix test --cover

# Run specific test file
mix test test/metrics_ex/aggregator_test.exs

Performance

Roadmap

Contributing

This project is part of the North-Shore-AI monorepo. See the main repository for contribution guidelines.

License

MIT License - see LICENSE for details.

Related Projects