Skip to content

Monitor Application Health

Extract health metrics and track service behavior from JSON application logs.

Problem

You have JSON logs from microservices and need to monitor health, track errors, measure performance, and understand service behavior.

Solutions

Basic Health Check

Monitor overall service health:

kelora -j examples/simple_json.jsonl \
    -e 'track_count(e.level)' \
    -e 'track_count(e.service)' \
    --metrics
kelora -j examples/simple_json.jsonl \
    -e 'track_count(e.level)' \
    -e 'track_count(e.service)' \
    --metrics
timestamp='2024-01-15T10:00:00Z' level='INFO' message='Application started' service='api'
  version='1.2.3'
timestamp='2024-01-15T10:00:05Z' level='DEBUG' message='Loading configuration' service='api'
  config_file='/etc/app/config.yml'
timestamp='2024-01-15T10:00:10Z' level='INFO' message='Connection pool initialized'
  service='database' max_connections=50
timestamp='2024-01-15T10:01:00Z' level='WARN' message='High memory usage detected' service='api'
  memory_percent=85
timestamp='2024-01-15T10:01:30Z' level='ERROR' message='Query timeout' service='database'
  query='SELECT * FROM users' duration_ms=5000
timestamp='2024-01-15T10:02:00Z' level='INFO' message='Request received' service='api' method='GET'
  path='/api/users' user_id=123
timestamp='2024-01-15T10:02:15Z' level='DEBUG' message='Cache hit' service='cache' key='user:123'
  ttl=3600
timestamp='2024-01-15T10:02:30Z' level='INFO' message='Response sent' service='api' status=200
  duration_ms=45
timestamp='2024-01-15T10:03:00Z' level='WARN' message='Failed login attempt' service='auth'
  username='admin' ip='192.168.1.100'
timestamp='2024-01-15T10:03:30Z' level='ERROR' message='Account locked' service='auth'
  username='admin' attempts=5
timestamp='2024-01-15T10:04:00Z' level='INFO' message='Cron job started' service='scheduler'
  job='backup' schedule='0 2 * * *'
timestamp='2024-01-15T10:05:00Z' level='DEBUG' message='Running backup script' service='scheduler'
  script='/usr/local/bin/backup.sh'
timestamp='2024-01-15T10:10:00Z' level='INFO' message='Backup completed' service='scheduler'
  size_mb=1024 duration_ms=300000
timestamp='2024-01-15T10:15:00Z' level='CRITICAL' message='Disk space critical' service='disk'
  partition='/var' free_gb=0.5
timestamp='2024-01-15T10:16:00Z' level='ERROR' message='Service unavailable' service='api'
  reason='disk space'
timestamp='2024-01-15T10:17:00Z' level='WARN' severity='high' message='Alert sent'
  service='monitoring' channel='slack'
timestamp='2024-01-15T10:20:00Z' level='INFO' message='Disk cleanup initiated' service='admin'
  target='/var/log'
timestamp='2024-01-15T10:25:00Z' level='INFO' message='Cleanup completed' service='admin'
  freed_gb=10
timestamp='2024-01-15T10:26:00Z' level='INFO' message='Service resumed' service='api'
  downtime_seconds=600
timestamp='2024-01-15T10:30:00Z' level='DEBUG' message='Health check passed' service='health'
  endpoints=["api","database","cache"]

kelora: Tracked metrics:
CRITICAL     = 1
DEBUG        = 4
ERROR        = 3
INFO         = 9
WARN         = 3
admin        = 2
api          = 7
auth         = 2
cache        = 1
database     = 2
disk         = 1
health       = 1
monitoring   = 1
scheduler    = 3

Error Rate Monitoring

Count errors alongside total events so an error rate can be derived:

kelora -j app.log \
    -e 'if e.level == "ERROR" || e.level == "CRITICAL" { track_count("errors") }' \
    -e 'track_count("total")' \
    --metrics

Calculate the error percentage from the metrics output: errors / total * 100.

Service-Specific Health

Monitor individual service health:

kelora -j app.log \
    --filter 'e.service == "database"' \
    -e 'track_count(e.level)' \
    -e 'track_avg("duration", e.get_path("duration_ms", 0))' \
    --metrics

Response Time Monitoring

Track performance metrics:

kelora -j app.log \
    --filter 'e.has_path("duration_ms")' \
    -e 'track_avg("response_time", e.duration_ms)' \
    -e 'track_min("fastest", e.duration_ms)' \
    -e 'track_max("slowest", e.duration_ms)' \
    --metrics

Memory Usage Tracking

Monitor memory consumption:

kelora -j app.log \
    --filter 'e.has_path("memory_percent")' \
    -e 'track_avg("memory", e.memory_percent)' \
    -e 'track_max("peak_memory", e.memory_percent)' \
    --metrics

Endpoint Performance

Analyze API endpoint health:

kelora -j app.log \
    --filter 'e.has_path("path")' \
    -e 'track_count(e.path)' \
    -e 'track_avg(e.path, e.get_path("duration_ms", 0))' \
    --metrics

Real-World Examples

Service Status Dashboard

Generate a comprehensive health report:

kelora -j app.log \
    -e 'track_count(e.service)' \
    -e 'track_count(e.level)' \
    -e 'if e.level == "ERROR" { track_count(e.service + "_errors") }' \
    -e 'if e.has_path("duration_ms") { track_avg("avg_duration", e.duration_ms) }' \
    --metrics

Failed Operations

Track operations that fail:

kelora -j app.log \
    --filter 'e.get_path("status", "success") != "success"' \
    -e 'e.operation = e.get_path("operation", "unknown")' \
    -e 'track_count(e.operation)' \
    -k timestamp,service,operation,message \
    --metrics

Database Query Health

Monitor database performance:

kelora -j app.log \
    --filter 'e.service == "database"' \
    -e 'if e.get_path("duration_ms", 0) > 1000 { e.slow = true }' \
    -e 'track_count("queries")' \
    -e 'if e.slow { track_count("slow_queries") }' \
    -e 'track_avg("query_time", e.get_path("duration_ms", 0))' \
    --metrics

Authentication Failures

Track login and auth issues:

kelora -j app.log \
    --filter 'e.service == "auth"' \
    --filter 'e.message.contains("failed") || e.message.contains("locked")' \
    -e 'track_count(e.username)' \
    -e 'track_count(e.get_path("ip", "unknown"))' \
    -k timestamp,username,ip,message \
    --metrics

Cache Performance

Monitor cache hit rates:

kelora -j app.log \
    --filter 'e.service == "cache"' \
    -e 'if e.message.contains("hit") { track_count("cache_hits") }' \
    -e 'if e.message.contains("miss") { track_count("cache_misses") }' \
    -e 'track_count("cache_total")' \
    --metrics

Service Dependencies

Track which services are interacting:

kelora -j app.log \
    --filter 'e.has_path("downstream_service")' \
    -e 'e.call = e.service + " -> " + e.downstream_service' \
    -e 'track_count(e.call)' \
    --metrics

Hourly Health Report

Break down health by time:

kelora -j app.log \
    -e 'e.hour = e.timestamp.format("%Y-%m-%d %H:00")' \
    -e 'track_count(e.hour)' \
    -e 'if e.level == "ERROR" { track_count(e.hour + "_errors") }' \
    --metrics

Resource Exhaustion Detection

Find resource pressure points:

kelora -j app.log \
    --filter 'e.level == "WARN" || e.level == "ERROR"' \
    --filter 'e.message.contains("memory") || e.message.contains("disk") || e.message.contains("connection")' \
    -e 'track_count(e.service)' \
    -k timestamp,service,level,message

User Activity Tracking

Monitor user-facing operations:

kelora -j app.log \
    --filter 'e.has_path("user_id")' \
    -e 'track_unique("active_users", e.user_id)' \
    -e 'track_count(e.get_path("operation", "unknown"))' \
    --metrics

Time-Based Monitoring

Last Hour's Health

kelora -j app.log \
    --since "1 hour ago" \
    -e 'track_count(e.level)' \
    -e 'track_count(e.service)' \
    --metrics

Compare Time Periods

# Morning traffic
kelora -j app.log \
    --since "2024-01-15 06:00:00" \
    --until "2024-01-15 12:00:00" \
    -e 'track_count(e.level)' \
    --metrics

# Afternoon traffic
kelora -j app.log \
    --since "2024-01-15 12:00:00" \
    --until "2024-01-15 18:00:00" \
    -e 'track_count(e.level)' \
    --metrics

Real-Time Monitoring

# Unix shell (Linux/macOS)
tail -f /var/log/app.log | kelora -j \
    -e 'track_count(e.level)' \
    -e 'if e.level == "ERROR" { eprint("⚠️  ERROR in " + e.service) }' \
    --metrics
# PowerShell (Windows)
Get-Content -Wait app.log | kelora -j \
    -e 'track_count(e.level)' \
    -e 'if e.level == "ERROR" { eprint("⚠️  ERROR in " + e.service) }' \
    --metrics

Alerting Patterns

Critical Error Detection

kelora -j app.log \
    --filter 'e.level == "CRITICAL"' \
    -e 'eprint("🚨 CRITICAL: " + e.service + " - " + e.message)' \
    -qq

The -qq flag suppresses the normal event output, so only the alerts written by eprint() are shown.

Threshold Alerts

kelora -j app.log \
    -e 'if e.get_path("memory_percent", 0) > 90 { eprint("⚠️  High memory: " + e.service + " at " + e.memory_percent + "%") }' \
    -e 'if e.get_path("duration_ms", 0) > 5000 { eprint("⚠️  Slow request: " + e.get_path("path", "unknown") + " took " + e.duration_ms + "ms") }' \
    -qq

Service Down Detection

kelora -j app.log \
    --filter 'e.message.contains("unavailable") || e.message.contains("timeout") || e.message.contains("unreachable")' \
    -e 'eprint("🔴 Service issue: " + e.service + " - " + e.message)' \
    -k timestamp,service,message

Export for Monitoring Tools

Prometheus-Style Metrics

kelora -j app.log \
    -e 'track_count("http_requests_total")' \
    -e 'if e.status >= 500 { track_count("http_requests_errors") }' \
    -e 'track_avg("http_request_duration_ms", e.get_path("duration_ms", 0))' \
    --metrics

JSON Export for Dashboards

kelora -j app.log \
    --filter 'e.level == "ERROR"' \
    -e 'e.error_type = e.get_path("error.type", "unknown")' \
    -k timestamp,service,error_type,message \
    -J > errors.json

CSV for Spreadsheets

kelora -j app.log \
    -e 'e.hour = e.timestamp.format("%Y-%m-%d %H:00")' \
    -k hour,service,level,message \
    -F csv > health_report.csv

Performance Tips

Large Log Files:

kelora -j large-app.log.gz \
    --parallel \
    -e 'track_count(e.service)' \
    --metrics

Sampling for Quick Analysis:

kelora -j app.log \
    -e 'if e.user_id.bucket() % 10 == 0 { track_count("sampled") }' \
    --metrics

Focus on Recent Events:

kelora -j app.log \
    --since "30 minutes ago" \
    -e 'track_count(e.level)' \
    --metrics

Common Patterns

Service health summary:

kelora -j app.log \
    -e 'track_count(e.service + "_" + e.level)' \
    --metrics

Error rate calculation:

kelora -j app.log \
    -e 'track_count("total")' \
    -e 'if e.level == "ERROR" { track_count("errors") }' \
    --metrics
# Calculate: errors / total * 100

Unique users:

kelora -j app.log \
    -e 'track_unique("users", e.user_id)' \
    --metrics

Service call patterns:

kelora -j app.log \
    --filter 'e.has_path("operation")' \
    -e 'track_count(e.service + "::" + e.operation)' \
    --metrics

Troubleshooting

Missing fields:

# Use safe access with defaults
e.get_path("nested.field", "default_value")

Inconsistent log formats:

# Check if field exists before using
if e.has_path("duration_ms") {
    track_avg("duration", e.duration_ms)
}

Large numbers:

# Convert to human-readable
e.duration_s = e.duration_ms / 1000
e.memory_mb = e.memory_bytes / 1024 / 1024

See Also