📊 Metrics & Monitoring
TaskRunna provides comprehensive Prometheus metrics out of the box for production observability.
🚀 Quick Start
Enable Metrics
import com.taskrunna.batch.metrics.PrometheusConfig
// Create metrics instance
val metrics = PrometheusConfig.createBatchMetrics("order_processor")
// Use with BatchJobProcessor
val processor = BatchJobProcessor(
iterator = OrderIterator(),
submitJob = ::processOrder,
metrics = metrics, // Enable metrics!
jobName = "order_processing"
)
Expose Metrics Endpoint
import io.ktor.server.application.*
import io.ktor.server.response.*
import io.ktor.server.routing.*
import io.micrometer.prometheus.PrometheusMeterRegistry
fun Application.configureRouting(registry: PrometheusMeterRegistry) {
routing {
get("/metrics") {
call.respondText(registry.scrape(), ContentType.Text.Plain)
}
}
}
📈 Available Metrics
TaskRunna automatically collects these metrics:
| Metric Name | Type | Description | Tags |
|---|---|---|---|
| `{prefix}_jobs_started_total` | Counter | Total batch jobs started | `job_name` |
| `{prefix}_jobs_completed_total` | Counter | Total batch jobs completed | `job_name`, `result` |
| `{prefix}_job_duration_seconds` | Timer | Time taken for complete jobs | `job_name`, `result` |
| `{prefix}_tasks_submitted_total` | Counter | Total tasks submitted | `job_name` |
| `{prefix}_tasks_completed_total` | Counter | Total tasks completed | `job_name`, `result`, `error_type` |
| `{prefix}_task_duration_seconds` | Timer | Time taken for individual tasks | `job_name`, `result` |
| `{prefix}_batches_processed_total` | Counter | Total batches processed | `job_name` |
| `{prefix}_items_processed_total` | Counter | Total items processed | `job_name` |
Metric Tags
`job_name`: The name you assign to your batch job
`result`: `success` or `failure`
`error_type`: The exception class name for failed tasks
Example Metrics Output
# HELP order_retry_jobs_started_total Total batch jobs started
# TYPE order_retry_jobs_started_total counter
order_retry_jobs_started_total{job_name="order_retry_job"} 1.0
# HELP order_retry_tasks_completed_total Total number of tasks completed
# TYPE order_retry_tasks_completed_total counter
order_retry_tasks_completed_total{job_name="order_retry_job",result="success"} 42.0
order_retry_tasks_completed_total{job_name="order_retry_job",result="failure",error_type="PaymentException"} 8.0
# HELP order_retry_task_duration_seconds Time taken to complete individual tasks
# TYPE order_retry_task_duration_seconds summary
order_retry_task_duration_seconds_count{job_name="order_retry_job",result="success"} 42.0
order_retry_task_duration_seconds_sum{job_name="order_retry_job",result="success"} 12.5
🔧 Configuration
Custom Metric Prefix
val metrics = MicrometerBatchMetrics(
meterRegistry = PrometheusMeterRegistry(PrometheusConfig.DEFAULT),
prefix = "my_app_batch" // Custom prefix
)
Auto-Detection
TaskRunna automatically detects if Prometheus is available:
// Automatically uses PrometheusMeterRegistry if available
val metrics = PrometheusConfig.createBatchMetrics("job_name")
// Falls back to SimpleMeterRegistry if Prometheus not in classpath
// Falls back to NoOpBatchMetrics if Micrometer not available
Disable Metrics
import com.taskrunna.batch.metrics.NoOpBatchMetrics
val processor = BatchJobProcessor(
iterator = OrderIterator(),
submitJob = ::processOrder,
metrics = NoOpBatchMetrics.INSTANCE // No metrics overhead
)
📊 Monitoring with PromQL
Success Rate
# Overall success rate
rate(order_retry_tasks_completed_total{result="success"}[5m]) /
rate(order_retry_tasks_completed_total[5m]) * 100
Error Rate by Type
# Errors by exception type
sum(rate(order_retry_tasks_completed_total{result="failure"}[5m])) by (error_type)
Average Processing Time
# Average task duration
rate(order_retry_task_duration_seconds_sum[5m]) /
rate(order_retry_task_duration_seconds_count[5m])
Throughput
# Tasks processed per second
rate(order_retry_tasks_completed_total[5m])
Job Completion Rate
# Jobs completed per hour
rate(order_retry_jobs_completed_total[1h]) * 3600
🚨 Alerting Rules
High Error Rate
groups:
- name: taskrunna.rules
rules:
- alert: TaskRunnaHighErrorRate
expr: |
(
rate(order_retry_tasks_completed_total{result="failure"}[5m]) /
rate(order_retry_tasks_completed_total[5m])
) > 0.1
for: 2m
labels:
severity: warning
annotations:
summary: "TaskRunna job has high error rate"
description: "Error rate is {{ $value | humanizePercentage }} for job {{ $labels.job_name }}"
Job Stalled
- alert: TaskRunnaJobStalled
expr: |
increase(order_retry_jobs_started_total[10m]) > 0 and
increase(order_retry_jobs_completed_total[10m]) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "TaskRunna job appears stalled"
description: "Job started but no completion in 10 minutes"
Slow Processing
- alert: TaskRunnaSlowProcessing
expr: |
rate(order_retry_task_duration_seconds_sum[5m]) /
rate(order_retry_task_duration_seconds_count[5m]) > 30
for: 3m
labels:
severity: warning
annotations:
summary: "TaskRunna job processing slowly"
description: "Average task duration is {{ $value }}s"
📊 Grafana Dashboard
Key Panels
1. Job Overview
# Jobs started vs completed
increase(order_retry_jobs_started_total[1h])
increase(order_retry_jobs_completed_total[1h])
2. Task Success Rate
# Success rate over time
rate(order_retry_tasks_completed_total{result="success"}[5m]) /
rate(order_retry_tasks_completed_total[5m]) * 100
3. Processing Time Distribution
# P50, P95, P99 latencies (requires percentile histograms enabled on the timer —
# otherwise the timer is exported as a summary with no _bucket series; see the example output above)
histogram_quantile(0.50, rate(order_retry_task_duration_seconds_bucket[5m]))
histogram_quantile(0.95, rate(order_retry_task_duration_seconds_bucket[5m]))
histogram_quantile(0.99, rate(order_retry_task_duration_seconds_bucket[5m]))
4. Error Breakdown
# Errors by type
sum(rate(order_retry_tasks_completed_total{result="failure"}[5m])) by (error_type)
5. Throughput
# Items processed per second
rate(order_retry_items_processed_total[5m])
Dashboard JSON
{
"dashboard": {
"title": "TaskRunna Batch Processing",
"panels": [
{
"title": "Success Rate",
"type": "stat",
"targets": [
{
"expr": "rate(order_retry_tasks_completed_total{result=\"success\"}[5m]) / rate(order_retry_tasks_completed_total[5m]) * 100"
}
]
}
]
}
}
🔍 Troubleshooting
No Metrics Appearing
Check Dependencies:
dependencies {
implementation("io.micrometer:micrometer-registry-prometheus:1.12.0")
}
Verify Metrics are Enabled:
val metrics = PrometheusConfig.createBatchMetrics("my_job")
// Should not be NoOpBatchMetrics.INSTANCE
Metrics Not Updating
Ensure Job Name is Set:
val processor = BatchJobProcessor(
// ... other parameters
metrics = metrics,
jobName = "order_processing" // Required for proper tagging
)
Memory Usage Concerns
Use Metric Filters:
val registry = PrometheusMeterRegistry(PrometheusConfig.DEFAULT)
registry.config().meterFilter(MeterFilter.denyNameStartsWith("jvm")) // Exclude JVM metrics
val metrics = MicrometerBatchMetrics(registry, "batch")
📋 Production Best Practices
1. Metric Retention
Configure appropriate retention in Prometheus:
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "taskrunna.rules.yml"
scrape_configs:
- job_name: 'taskrunna'
static_configs:
- targets: ['localhost:8080']
scrape_interval: 30s
metrics_path: /metrics
2. Cardinality Management
Be careful with high-cardinality tags:
// Good: Low cardinality
val processor = BatchJobProcessor(
// ...
jobName = "order_retry" // Same for all instances
)
// Bad: High cardinality
val processor = BatchJobProcessor(
// ...
jobName = "order_retry_${System.currentTimeMillis()}" // Unique per run
)
3. Resource Monitoring
Monitor TaskRunna's resource usage:
# Memory usage
process_resident_memory_bytes{job="taskrunna"}
# CPU usage
rate(process_cpu_seconds_total{job="taskrunna"}[5m])
# Thread count
process_threads{job="taskrunna"}
4. SLA Monitoring
Set up SLO/SLI tracking:
# SLI: 95% of tasks complete successfully
sli_success_rate: |
rate(order_retry_tasks_completed_total{result="success"}[5m]) /
rate(order_retry_tasks_completed_total[5m])
# SLO: Success rate > 95%
slo_target: 0.95
🔗 Integration Examples
Spring Boot Actuator
@Component
class TaskRunnaMetricsExporter {
@EventListener
fun onJobCompleted(event: BatchJobCompletedEvent) {
meterRegistry.counter("batch.jobs.completed",
"job_name", event.jobName,
"result", if (event.success) "success" else "failure"
).increment()
}
}
Custom Metrics
class CustomMetricsCollector(private val meterRegistry: MeterRegistry) {
private val businessMetrics = meterRegistry.counter("business.orders.processed")
fun onOrderProcessed(order: Order) {
businessMetrics.increment(
Tags.of(
"customer_tier", order.customerTier,
"region", order.region
)
)
}
}
Want to see metrics in action? Run the live example!