架設 open-telemetry 的一些紀錄

Hyman Chen
29 min read · Sep 21, 2023

--

這篇主要是紀錄我一些架設的方式,因為我覺得目前 open-telemetry 部署的方式文件很多很亂,在一連串的摸索中把我現階段的成果紀錄下來,若是之後新版本的部署方式比較簡單就可以不用參考這篇文章了。

目前我的部署方式是採用 docker-compose 部署至虛擬機內,有分成兩個 compose file ,兩個 compose之間是用 docker 網路橋接,這樣服務之間可以互通,並且確保 ES 的 9200 port 不會外露至 host 上。

docker-compose1.yml

# Copyright The OpenTelemetry Authors
# SPDX-License-Identifier: Apache-2.0

version: '3.9'

# Shared logging configuration, aliased into every service below.
x-default-logging: &logging
  driver: "json-file"
  options:
    max-size: "5m"
    max-file: "2"

services:
  # Jaeger all-in-one, storing spans in the external Elasticsearch cluster
  # (reachable as es01 via the shared external bridge network).
  jaeger:
    image: jaegertracing/all-in-one:1.49
    networks:
      - hyman-network
      - default
    command:
      - "--prometheus.server-url=http://prometheus:9090"
      - "--log-level=error"
      - "--es.server-urls=https://es01:9200"
      - "--es.password=XXXXXXXX"  # fixed: closing quote was missing
      - "--es.tls.enabled=true"
      - "--es.tls.skip-host-verify=true"  # NOTE(review): disables TLS hostname verification
      - "--es.username=elastic"
      - "--es.version=7"
    deploy:
      resources:
        limits:
          memory: 300M
    restart: unless-stopped
    ports:
      - "16686:16686"  # Jaeger UI
      - "4317"         # OTLP gRPC default port (container-only, not published to host)
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - METRICS_STORAGE_TYPE=prometheus
      - SPAN_STORAGE_TYPE=elasticsearch
    logging: *logging

  # OpenTelemetry Collector
  otelcol:
    image: otel/opentelemetry-collector-contrib:0.82.0
    deploy:
      resources:
        limits:
          memory: 125M
    restart: unless-stopped
    command:
      - "--config=/etc/otelcol-config.yml"
      - "--config=/etc/otelcol-observability.yml"
      - "--config=/etc/otelcol-config-extras.yml"
    volumes:
      - ./otelcollector/otelcol-config.yml:/etc/otelcol-config.yml
      - ./otelcollector/otelcol-observability.yml:/etc/otelcol-observability.yml
      - ./otelcollector/otelcol-config-extras.yml:/etc/otelcol-config-extras.yml
    ports:
      - "4317:4317"  # OTLP over gRPC receiver
      - "4318:4318"  # OTLP over HTTP receiver
      - "9464"       # Prometheus exporter
      - "8888"       # metrics endpoint
    depends_on:
      - jaeger
    logging: *logging

  # Prometheus
  prometheus:
    image: quay.io/prometheus/prometheus:v2.46.0
    command:
      - --web.console.templates=/etc/prometheus/consoles
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --storage.tsdb.retention.time=1h
      - --config.file=/etc/prometheus/prometheus-config.yaml
      - --storage.tsdb.path=/prometheus
      - --web.enable-lifecycle
      - --web.route-prefix=/
      - --enable-feature=exemplar-storage
    volumes:
      - ./prometheus/prometheus-config.yaml:/etc/prometheus/prometheus-config.yaml
    deploy:
      resources:
        limits:
          memory: 1300M
    ports:
      - "9090:9090"
    logging: *logging

# Pre-created bridge network shared with the Elasticsearch compose file.
networks:
  hyman-network:
    external: true

docker-compose2.yml

version: "2.2"

services:
  # One-shot helper: generates the CA and per-node TLS certificates, fixes
  # permissions, waits for es01, then sets the kibana_system password.
  setup:
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
    user: "0"
    command: >
      bash -c '
        if [ x${ELASTIC_PASSWORD} == x ]; then
          echo "Set the ELASTIC_PASSWORD environment variable in the .env file";
          exit 1;
        elif [ x${KIBANA_PASSWORD} == x ]; then
          echo "Set the KIBANA_PASSWORD environment variable in the .env file";
          exit 1;
        fi;
        if [ ! -f config/certs/ca.zip ]; then
          echo "Creating CA";
          bin/elasticsearch-certutil ca --silent --pem -out config/certs/ca.zip;
          unzip config/certs/ca.zip -d config/certs;
        fi;
        if [ ! -f config/certs/certs.zip ]; then
          echo "Creating certs";
          echo -ne \
          "instances:\n"\
          "  - name: es01\n"\
          "    dns:\n"\
          "      - es01\n"\
          "      - localhost\n"\
          "    ip:\n"\
          "      - 127.0.0.1\n"\
          "  - name: es02\n"\
          "    dns:\n"\
          "      - es02\n"\
          "      - localhost\n"\
          "    ip:\n"\
          "      - 127.0.0.1\n"\
          "  - name: es03\n"\
          "    dns:\n"\
          "      - es03\n"\
          "      - localhost\n"\
          "    ip:\n"\
          "      - 127.0.0.1\n"\
          > config/certs/instances.yml;
          bin/elasticsearch-certutil cert --silent --pem -out config/certs/certs.zip --in config/certs/instances.yml --ca-cert config/certs/ca/ca.crt --ca-key config/certs/ca/ca.key;
          unzip config/certs/certs.zip -d config/certs;
        fi;
        echo "Setting file permissions"
        chown -R root:root config/certs;
        find . -type d -exec chmod 750 \{\} \;;
        find . -type f -exec chmod 640 \{\} \;;
        echo "Waiting for Elasticsearch availability";
        until curl -s --cacert config/certs/ca/ca.crt https://es01:9200 | grep -q "missing authentication credentials"; do sleep 30; done;
        echo "Setting kibana_system password";
        until curl -s -X POST --cacert config/certs/ca/ca.crt -u "elastic:${ELASTIC_PASSWORD}" -H "Content-Type: application/json" https://es01:9200/_security/user/kibana_system/_password -d "{\"password\":\"${KIBANA_PASSWORD}\"}" | grep -q "^{}"; do sleep 10; done;
        echo "All done!";
      '
    healthcheck:
      test: ["CMD-SHELL", "[ -f config/certs/es01/es01.crt ]"]
      interval: 1s
      timeout: 5s
      retries: 120

  # First ES node; also joined to the external network so Jaeger/otelcol
  # from the other compose file can reach it as es01 (9200 is never
  # published to the host).
  es01:
    networks:
      - hyman-network
      - default
    depends_on:
      setup:
        condition: service_healthy
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
      - esdata01:/usr/share/elasticsearch/data
    environment:
      - node.name=es01
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es02,es03
      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
      - bootstrap.memory_lock=true
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=true
      - xpack.security.http.ssl.key=certs/es01/es01.key
      - xpack.security.http.ssl.certificate=certs/es01/es01.crt
      - xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.enabled=true
      - xpack.security.transport.ssl.key=certs/es01/es01.key
      - xpack.security.transport.ssl.certificate=certs/es01/es01.crt
      - xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.verification_mode=certificate
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  es02:
    networks:
      - hyman-network
      - default
    depends_on:
      - es01
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
      - esdata02:/usr/share/elasticsearch/data
    environment:
      - node.name=es02
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es03
      - bootstrap.memory_lock=true
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=true
      - xpack.security.http.ssl.key=certs/es02/es02.key
      - xpack.security.http.ssl.certificate=certs/es02/es02.crt
      - xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.enabled=true
      - xpack.security.transport.ssl.key=certs/es02/es02.key
      - xpack.security.transport.ssl.certificate=certs/es02/es02.crt
      - xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.verification_mode=certificate
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  es03:
    networks:
      - hyman-network
      - default
    depends_on:
      - es02
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
      - esdata03:/usr/share/elasticsearch/data
    environment:
      - node.name=es03
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es02
      - bootstrap.memory_lock=true
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=true
      - xpack.security.http.ssl.key=certs/es03/es03.key
      - xpack.security.http.ssl.certificate=certs/es03/es03.crt
      - xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.enabled=true
      - xpack.security.transport.ssl.key=certs/es03/es03.key
      - xpack.security.transport.ssl.certificate=certs/es03/es03.crt
      - xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.verification_mode=certificate
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  kibana:
    networks:
      - hyman-network
      - default
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
    volumes:
      - certs:/usr/share/kibana/config/certs
      - kibanadata:/usr/share/kibana/data
    ports:
      - ${KIBANA_PORT}:5601
    environment:
      - SERVERNAME=kibana
      - ELASTICSEARCH_HOSTS=https://es01:9200
      - ELASTICSEARCH_USERNAME=kibana_system
      - ELASTICSEARCH_PASSWORD=${KIBANA_PASSWORD}
      - ELASTICSEARCH_SSL_CERTIFICATEAUTHORITIES=config/certs/ca/ca.crt
    mem_limit: ${MEM_LIMIT}
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s -I http://localhost:5601 | grep -q 'HTTP/1.1 302 Found'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

volumes:
  certs:
    driver: local
  esdata01:
    driver: local
  esdata02:
    driver: local
  esdata03:
    driver: local
  kibanadata:
    driver: local

# Pre-created bridge network shared with the observability compose file.
networks:
  hyman-network:
    external: true

elastic .env

# Password for the 'elastic' user (at least 6 characters)
ELASTIC_PASSWORD=XXXXXXXX

# Password for the 'kibana_system' user (at least 6 characters)
KIBANA_PASSWORD=XXXXXXXX

# Version of Elastic products
STACK_VERSION=7.17.13

# Set the cluster name
CLUSTER_NAME=elastic-stack

# Set to 'basic' or 'trial' to automatically start the 30-day trial
LICENSE=basic
#LICENSE=trial

# Port to expose Elasticsearch HTTP API to the host
# NOTE(review): ES_PORT is not referenced by the compose file shown here —
# the es01 service publishes no host port (9200 stays internal by design);
# confirm before relying on this variable.
ES_PORT=9200
#ES_PORT=127.0.0.1:9200

# Port to expose Kibana to the host
KIBANA_PORT=5601
#KIBANA_PORT=80

# Increase or decrease based on the available host memory (in bytes)
MEM_LIMIT=1073741824

# Project namespace (defaults to the current folder name if not set)
#COMPOSE_PROJECT_NAME=myproject

為什麼使用 elastic-search ?

原本使用範例 jaeger all-in-one memory storage 可以正常跑,但是服務一多、資料量很大,很快就會壞掉了,所以把儲存拉出來外面,elastic-search 有 ILM（index lifecycle management）可以控管 span / service 的生命週期。

資料量很大,硬碟空間不夠?

在 open-telemetry 那邊可以去 filter span,我之前沒設定的時候一天資料量大約是 6–10 GB 後來去設定之後就減少很多資料量,每日大約是 100M,很多不必要被記錄的資料,例如我會對後端去掃 healthy,或是我的後端會去上傳 log 至其他 log-server。結果 open-telemetry 都會記錄下來,所以要想辦法設定 otelcol-config.yml 排除,有關process那邊要怎樣寫 filter ,可以參考我的範例,詳細還是要看官方文件。

# Copyright The OpenTelemetry Authors
# SPDX-License-Identifier: Apache-2.0

receivers:
  otlp:
    protocols:
      grpc:
      http:
        cors:
          allowed_origins:
            - "http://*"
            - "https://*"

exporters:
  # Emits telemetry to the collector's own stdout (debugging aid).
  logging:

processors:
  # Drop spans whose net.peer.name matches the telemetry endpoint itself,
  # so calls to the telemetry back-end are not recorded.
  filter/post-tele:
    spans:
      exclude:
        match_type: regexp
        attributes:
          - key: net.peer.name
            value: xxx.xxx.server.com
  # Drop spans produced by log uploads to the Loki log server.
  filter/post-loki:
    spans:
      exclude:
        match_type: regexp
        attributes:
          - key: net.peer.name
            value: ooo.ooo.server.com
  # Drop framework security-filter spans (auth chain, hash checks).
  filter/security:
    spans:
      exclude:
        match_type: regexp
        span_names:
          - (authorize request|security filterchain before|secured request|security filterchain after|getSystemHash|getHash)
  # Drop static-resource spans matched by route/target attribute patterns.
  filter/static:
    spans:
      exclude:
        match_type: regexp
        attributes:
          - key: http.route
            value: \/\S+\/\*\*\/\*
          - key: http.target
            value: (\.png|\.js)
  # Drop static-resource spans matched by span name.
  filter/static2:
    spans:
      exclude:
        match_type: regexp
        span_names:
          - http get \/\*\*\/\*
          - ParameterizableViewController.handleRequest
          - Render forward:\/index.html
          - ResourceHttpRequestHandler.handleRequest
          - ^GET \/\w+\/$

connectors:
  # Derives RED metrics from spans; fed into the metrics pipeline below.
  spanmetrics:

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors:
        - filter/post-tele
        - filter/post-loki
        - filter/security
        - filter/static2
        - filter/static
      exporters: [logging, spanmetrics]
    metrics:
      receivers: [otlp, spanmetrics]
      processors: []
      exporters: [logging]
    logs:
      receivers: [otlp]
      processors: []
      exporters: [logging]

elastic-search jaeger 的 ILM 怎樣設置?

這個部分我有嘗試過讓 jaeger 自己建立 alias 然後去用它官方的 rollover,但是很可惜並沒有建立成功,後來還是自己想辦法,因為 jaeger 如果沒有特別設定 alias 他的 index 會自動根據日期分割,我們只要把 lifecycle policy 串接上 index template 就可以達到效果。

要注意執行的步驟 ES deploy => setup ILM => open-telemetry deploy

  1. 建立 Index Lifecycle Policy,我的範例是過五天之後會自動刪除該筆 index,不設定 rollover,可以根據自己需要的情境去設定。
PUT _ilm/policy/jaeger-ilm-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "set_priority": {
            "priority": 100
          }
        }
      },
      "delete": {
        "min_age": "5d",
        "actions": {
          "delete": {
            "delete_searchable_snapshot": true
          }
        }
      }
    }
  }
}

2. 建立 index template

PUT _index_template/jaeger-span
{
  "index_patterns": [
    "*jaeger-span-*"
  ],
  "template": {
    "settings": {
      "index": {
        "lifecycle": {
          "name": "jaeger-ilm-policy"
        },
        "mapping": {
          "nested_fields": {
            "limit": "50"
          }
        },
        "requests": {
          "cache": {
            "enable": "true"
          }
        },
        "number_of_shards": "5",
        "number_of_replicas": "1"
      }
    },
    "mappings": {
      "dynamic_templates": [
        {
          "span_tags_map": {
            "path_match": "tag.*",
            "mapping": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        {
          "process_tags_map": {
            "path_match": "process.tag.*",
            "mapping": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        }
      ],
      "properties": {
        "traceID": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "process": {
          "type": "object",
          "properties": {
            "tag": {
              "type": "object"
            },
            "serviceName": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "tags": {
              "dynamic": false,
              "type": "nested",
              "properties": {
                "tagType": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "value": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "key": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            }
          }
        },
        "references": {
          "dynamic": false,
          "type": "nested",
          "properties": {
            "spanID": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "traceID": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "refType": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "startTimeMillis": {
          "format": "epoch_millis",
          "type": "date"
        },
        "flags": {
          "type": "integer"
        },
        "operationName": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "parentSpanID": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "tags": {
          "dynamic": false,
          "type": "nested",
          "properties": {
            "tagType": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "value": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "key": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "duration": {
          "type": "long"
        },
        "spanID": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "startTime": {
          "type": "long"
        },
        "tag": {
          "type": "object"
        },
        "logs": {
          "dynamic": false,
          "type": "nested",
          "properties": {
            "fields": {
              "dynamic": false,
              "type": "nested",
              "properties": {
                "tagType": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "value": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "key": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            },
            "timestamp": {
              "type": "long"
            }
          }
        }
      }
    }
  },
  "composed_of": [],
  "priority": 1,
  "version": 1
}

Grafana 要怎樣設定 jaeger?

grafana 的 jaeger 設定完 datasource 之後,如果要讓圖表可以連接到 loki 的 log 資訊還需要設定關聯,Query 欄位就可以定義 LogQL 的樣板,他會從 jaeger 的 spanId 跟 traceId 製造出 query 反查回 loki

當然後端也要設定 traceId 跟 spanId 也寫入 label 裡面,這樣 LogQL 才能搜尋到。

Grafana 要怎樣設定 loki 連接至 jaeger ?

如果需要看 log 然後反查 jaeger ,也是要在 datasource 內設定 loki Derived fields,這會在每一筆 log 裡面去搜尋出 traceId 關聯至 jaeger

總結

當服務發生錯誤,我可以在 Grafana 根據時間找到錯誤的 log ,並且可以追蹤錯誤沿途經過的微服務有哪些,經過微服務裡面的哪些功能,再反查回 log ,會降低查找錯誤的時間。

圖片來自網路https://devpress.csdn.net/cicd/62eb54b26484667128339eac.html

還有一個好處就是,我能夠知道這個 API 在每個步驟花多少時間,未來可以對效能不好的服務來做優化。

--

--