chore: update otel example

2022-11-09 15:16:13 +02:00 · 2022-11-09 15:16:13 +02:00 · dd858eaf07
parent 1278a8094f
commit dd858eaf07
8 changed files with 164 additions and 22 deletions
--- a/example/otel/README.md
+++ b/example/otel/README.md
@ -40,9 +40,25 @@ UPTRACE_DSN=http://project2_secret_token@localhost:14317/2 go run client.go
 trace: http://localhost:14318/traces/ee029d8782242c8ed38b16d961093b35
 ```
 ![Redis trace](./image/redis-trace.png)
 You can also open Uptrace UI at [http://localhost:14318](http://localhost:14318) to view available
 spans, logs, and metrics.
 ## Redis monitoring
 You can also [monitor Redis performance](https://uptrace.dev/opentelemetry/redis-monitoring.html)
 metrics by installing OpenTelemetry Collector.
 [OpenTelemetry Collector](https://uptrace.dev/opentelemetry/collector.html) is an agent that pulls
 telemetry data from systems you want to monitor and sends it to APM tools using the OpenTelemetry
 protocol (OTLP).
 When telemetry data reaches Uptrace, it automatically generates a Redis dashboard from a pre-defined
 template.
 ![Redis dashboard](./image/metrics.png)
 ## Links
 - [Uptrace open-source APM](https://uptrace.dev/get/open-source-apm.html)
--- a/example/otel/config/alertmanager.yml
+++ b/example/otel/config/alertmanager.yml
@ -0,0 +1,53 @@
 # See https://prometheus.io/docs/alerting/latest/configuration/ for details.
 global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'mailhog:1025'
  smtp_from: 'alertmanager@example.com'
  smtp_require_tls: false
 receivers:
  - name: 'team-X'
    email_configs:
      - to: 'some-receiver@example.com'
        send_resolved: true
 # The root route on which each incoming alert enters.
 route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']
  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first
  # notification.
  group_wait: 30s
  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m
  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend it.
  repeat_interval: 3h
  # A default receiver
  receiver: team-X
  # All the above attributes are inherited by all child routes and can
  # be overwritten on each.
  # The child route trees.
  routes:
    # This route matches error alerts created from spans or logs.
    - matchers:
        - alert_kind="error"
      group_interval: 24h
      receiver: team-X
 # The directory from which notification templates are read.
 templates:
  - '/etc/alertmanager/template/*.tmpl'
--- a/example/otel/config/otel-collector.yaml
+++ b/example/otel/config/otel-collector.yaml
--- a/example/otel/config/vector.toml
+++ b/example/otel/config/vector.toml
--- a/example/otel/docker-compose.yml
+++ b/example/otel/docker-compose.yml
@ -18,7 +18,7 @@ services:
      - '9000:9000'
  uptrace:
-    image: 'uptrace/uptrace:1.1.0'
+    image: 'uptrace/uptrace:1.2.0'
    #image: 'uptrace/uptrace-dev:latest'
    restart: on-failure
    volumes:
@ -36,11 +36,8 @@ services:
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.58.0
    restart: on-failure
    user: '0:0' # required for logs
    volumes:
-      - ./otel-collector.yaml:/etc/otelcol-contrib/config.yaml
+      - ./config/otel-collector.yaml:/etc/otelcol-contrib/config.yaml
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/log:/var/log:ro
    ports:
      - '4317:4317'
      - '4318:4318'
@ -48,7 +45,25 @@ services:
  vector:
    image: timberio/vector:0.24.X-alpine
    volumes:
-      - ./vector.toml:/etc/vector/vector.toml:ro
+      - ./config/vector.toml:/etc/vector/vector.toml:ro
  alertmanager:
    image: prom/alertmanager:v0.24.0
    restart: on-failure
    volumes:
      - ./config/alertmanager.yml:/etc/alertmanager/config.yml
      - alertmanager_data:/alertmanager
    ports:
      - 9093:9093
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
  mailhog:
    image: mailhog/mailhog:v1.0.1
    restart: on-failure
    ports:
      - '8025:8025'
  redis-server:
    image: redis
--- a/example/otel/image/metrics.png
+++ b/example/otel/image/metrics.png
--- a/example/otel/image/redis-trace.png
+++ b/example/otel/image/redis-trace.png
--- a/example/otel/uptrace.yml
+++ b/example/otel/uptrace.yml
@ -13,6 +13,16 @@
 ##   foo: $$FOO_BAR
 ##
 ##
 ## ClickHouse database credentials.
 ##
 ch:
  # Connection string for ClickHouse database. For example:
  # clickhouse://<user>:<password>@<host>:<port>/<database>?sslmode=disable
  #
  # See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options
  dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable'
 ##
 ## A list of pre-configured projects. Each project is fully isolated.
 ##
@ -26,6 +36,10 @@ projects:
      - service.name
      - host.name
      - deployment.environment
    # Group spans by deployment.environment attribute.
    group_by_env: false
    # Group funcs spans by service.name attribute.
    group_funcs_by_service: false
  # Other projects can be used to monitor your applications.
  # To monitor micro-services or multiple related services, use a single project.
@ -36,6 +50,49 @@ projects:
      - service.name
      - host.name
      - deployment.environment
    # Group spans by deployment.environment attribute.
    group_by_env: false
    # Group funcs spans by service.name attribute.
    group_funcs_by_service: false
 ##
 ## Create metrics from spans and events.
 ##
 metrics_from_spans:
  - name: uptrace.tracing.spans_duration
    description: Spans duration (excluding events)
    instrument: histogram
    unit: microseconds
    value: span.duration / 1000
    attrs:
      - span.system as system
      - service.name as service
      - host.name as host
      - span.status_code as status
    where: not span.is_event
  - name: uptrace.tracing.spans
    description: Spans count (excluding events)
    instrument: counter
    unit: 1
    value: span.count
    attrs:
      - span.system as system
      - service.name as service
      - host.name as host
      - span.status_code as status
    where: not span.is_event
  - name: uptrace.tracing.events
    description: Events count (excluding spans)
    instrument: counter
    unit: 1
    value: span.count
    attrs:
      - span.system as system
      - service.name as service
      - host.name as host
    where: span.is_event
 ##
 ## To require authentication, uncomment the following section.
@ -78,16 +135,6 @@ auth:
  #     # Defaults to 'preferred_username'.
  #     claim: preferred_username
 ##
 ## ClickHouse database credentials.
 ##
 ch:
  # Connection string for ClickHouse database. For example:
  # clickhouse://<user>:<password>@<host>:<port>/<database>?sslmode=disable
  #
  # See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options
  dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable'
 ##
 ## Alerting rules for monitoring metrics.
 ##
@ -102,8 +149,8 @@ alerting:
        - $net_errors > 0 group by host.name
      # for the last 5 minutes
      for: 5m
-      # in the project id=1
+      annotations:
-      projects: [1]
+        summary: '{{ $labels.host_name }} has high number of net errors: {{ $values.net_errors }}'
    - name: Filesystem usage >= 90%
      metrics:
@ -114,7 +161,8 @@ alerting:
        - where device !~ "loop"
        - $fs_usage{state="used"} / $fs_usage >= 0.9
      for: 5m
-      projects: [1]
+      annotations:
        summary: '{{ $labels.host_name }} has high FS usage: {{ $values.fs_usage }}'
    - name: Uptrace is dropping spans
      metrics:
@ -122,7 +170,17 @@ alerting:
      query:
        - $spans{type=dropped} > 0
      for: 1m
-      projects: [1]
+      annotations:
        summary: 'Uptrace has dropped {{ $values.spans }} spans'
    - name: Always firing (for fun and testing)
      metrics:
        - process.runtime.go.goroutines as $goroutines
      query:
        - $goroutines >= 0 group by host.name
      for: 1m
      annotations:
        summary: '{{ $labels.host_name }} has high number of goroutines: {{ $values.goroutines }}'
  # Create alerts from error logs and span events.
  create_alerts_from_spans:
@ -139,8 +197,8 @@ alerting:
 ##
 alertmanager_client:
  # AlertManager API endpoints that Uptrace uses to manage alerts.
-  # urls:
+  urls:
-  #   - 'http://alertmanager:9093/api/v2/alerts'
+    - 'http://alertmanager:9093/api/v2/alerts'
 ##
 ## Various options to tweak ClickHouse schema.