chore: update otel example

This commit is contained in:
Vladimir Mihailenco 2022-11-09 15:16:13 +02:00
parent 1278a8094f
commit dd858eaf07
8 changed files with 164 additions and 22 deletions

View File

@ -40,9 +40,25 @@ UPTRACE_DSN=http://project2_secret_token@localhost:14317/2 go run client.go
trace: http://localhost:14318/traces/ee029d8782242c8ed38b16d961093b35
```
![Redis trace](./image/redis-trace.png)
You can also open Uptrace UI at [http://localhost:14318](http://localhost:14318) to view available
spans, logs, and metrics.
## Redis monitoring
You can also [monitor Redis performance](https://uptrace.dev/opentelemetry/redis-monitoring.html)
metrics by installing the OpenTelemetry Collector.
[OpenTelemetry Collector](https://uptrace.dev/opentelemetry/collector.html) is an agent that pulls
telemetry data from systems you want to monitor and sends it to APM tools using the OpenTelemetry
protocol (OTLP).
When telemetry data reaches Uptrace, it automatically generates a Redis dashboard from a pre-defined
template.
![Redis dashboard](./image/metrics.png)
## Links
- [Uptrace open-source APM](https://uptrace.dev/get/open-source-apm.html)

View File

@ -0,0 +1,53 @@
# See https://prometheus.io/docs/alerting/latest/configuration/ for details.

global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'mailhog:1025'
  smtp_from: 'alertmanager@example.com'
  smtp_require_tls: false

receivers:
  - name: 'team-X'
    email_configs:
      - to: 'some-receiver@example.com'
        send_resolved: true

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 3h

  # A default receiver
  receiver: team-X

  # All the above attributes are inherited by all child routes and can be
  # overwritten on each.

  # The child route trees.
  routes:
    # This route matches error alerts created from spans or logs.
    - matchers:
        - alert_kind="error"
      group_interval: 24h
      receiver: team-X

# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

View File

@ -18,7 +18,7 @@ services:
- '9000:9000'
uptrace:
image: 'uptrace/uptrace:1.1.0'
image: 'uptrace/uptrace:1.2.0'
#image: 'uptrace/uptrace-dev:latest'
restart: on-failure
volumes:
@ -36,11 +36,8 @@ services:
otel-collector:
image: otel/opentelemetry-collector-contrib:0.58.0
restart: on-failure
user: '0:0' # required for logs
volumes:
- ./otel-collector.yaml:/etc/otelcol-contrib/config.yaml
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/log:/var/log:ro
- ./config/otel-collector.yaml:/etc/otelcol-contrib/config.yaml
ports:
- '4317:4317'
- '4318:4318'
@ -48,7 +45,25 @@ services:
vector:
image: timberio/vector:0.24.X-alpine
volumes:
- ./vector.toml:/etc/vector/vector.toml:ro
- ./config/vector.toml:/etc/vector/vector.toml:ro
alertmanager:
image: prom/alertmanager:v0.24.0
restart: on-failure
volumes:
- ./config/alertmanager.yml:/etc/alertmanager/config.yml
- alertmanager_data:/alertmanager
ports:
- 9093:9093
command:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
mailhog:
image: mailhog/mailhog:v1.0.1
restart: on-failure
ports:
- '8025:8025'
redis-server:
image: redis

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

View File

@ -13,6 +13,16 @@
## foo: $$FOO_BAR
##
##
## ClickHouse database credentials.
##
ch:
# Connection string for ClickHouse database. For example:
# clickhouse://<user>:<password>@<host>:<port>/<database>?sslmode=disable
#
# See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options
dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable'
##
## A list of pre-configured projects. Each project is fully isolated.
##
@ -26,6 +36,10 @@ projects:
- service.name
- host.name
- deployment.environment
# Group spans by deployment.environment attribute.
group_by_env: false
# Group funcs spans by service.name attribute.
group_funcs_by_service: false
# Other projects can be used to monitor your applications.
# To monitor micro-services or multiple related services, use a single project.
@ -36,6 +50,49 @@ projects:
- service.name
- host.name
- deployment.environment
# Group spans by deployment.environment attribute.
group_by_env: false
# Group funcs spans by service.name attribute.
group_funcs_by_service: false
##
## Create metrics from spans and events.
##
metrics_from_spans:
- name: uptrace.tracing.spans_duration
description: Spans duration (excluding events)
instrument: histogram
unit: microseconds
value: span.duration / 1000
attrs:
- span.system as system
- service.name as service
- host.name as host
- span.status_code as status
where: not span.is_event
- name: uptrace.tracing.spans
description: Spans count (excluding events)
instrument: counter
unit: 1
value: span.count
attrs:
- span.system as system
- service.name as service
- host.name as host
- span.status_code as status
where: not span.is_event
- name: uptrace.tracing.events
description: Events count (excluding spans)
instrument: counter
unit: 1
value: span.count
attrs:
- span.system as system
- service.name as service
- host.name as host
where: span.is_event
##
## To require authentication, uncomment the following section.
@ -78,16 +135,6 @@ auth:
# # Defaults to 'preferred_username'.
# claim: preferred_username
##
## ClickHouse database credentials.
##
ch:
# Connection string for ClickHouse database. For example:
# clickhouse://<user>:<password>@<host>:<port>/<database>?sslmode=disable
#
# See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options
dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable'
##
## Alerting rules for monitoring metrics.
##
@ -102,8 +149,8 @@ alerting:
- $net_errors > 0 group by host.name
# for the last 5 minutes
for: 5m
# in the project id=1
projects: [1]
annotations:
summary: '{{ $labels.host_name }} has high number of net errors: {{ $values.net_errors }}'
- name: Filesystem usage >= 90%
metrics:
@ -114,7 +161,8 @@ alerting:
- where device !~ "loop"
- $fs_usage{state="used"} / $fs_usage >= 0.9
for: 5m
projects: [1]
annotations:
summary: '{{ $labels.host_name }} has high FS usage: {{ $values.fs_usage }}'
- name: Uptrace is dropping spans
metrics:
@ -122,7 +170,17 @@ alerting:
query:
- $spans{type=dropped} > 0
for: 1m
projects: [1]
annotations:
summary: 'Uptrace has dropped {{ $values.spans }} spans'
- name: Always firing (for fun and testing)
metrics:
- process.runtime.go.goroutines as $goroutines
query:
- $goroutines >= 0 group by host.name
for: 1m
annotations:
summary: '{{ $labels.host_name }} has high number of goroutines: {{ $values.goroutines }}'
# Create alerts from error logs and span events.
create_alerts_from_spans:
@ -139,8 +197,8 @@ alerting:
##
alertmanager_client:
# AlertManager API endpoints that Uptrace uses to manage alerts.
# urls:
# - 'http://alertmanager:9093/api/v2/alerts'
urls:
- 'http://alertmanager:9093/api/v2/alerts'
##
## Various options to tweak ClickHouse schema.