架設 open-telemetry 的一些紀錄

Hyman Chen
29 min read · Sep 21, 2023

--

這篇主要是紀錄我一些架設的方式,因為我覺得目前 open-telemetry 部署的方式文件很多很亂,在一連串的摸索中把我現階段的成果紀錄下來,若是之後新版本的部署方式比較簡單就可以不用參考這篇文章了。

目前我的部署方式是採用 docker-compose 部署至虛擬機內,有分成兩個 compose file ,兩個 compose之間是用 docker 網路橋接,這樣服務之間可以互通,並且確保 ES 的 9200 port 不會外露至 host 上。

docker-compose1.yml

# Copyright The OpenTelemetry Authors
# SPDX-License-Identifier: Apache-2.0

version: '3.9'

# Shared logging configuration, aliased into every service below.
x-default-logging: &logging
  driver: "json-file"
  options:
    max-size: "5m"
    max-file: "2"

services:
  # Jaeger all-in-one, storing spans in the external Elasticsearch cluster
  # (reachable as es01 via the shared external bridge network).
  jaeger:
    image: jaegertracing/all-in-one:1.49
    networks:
      - hyman-network
      - default
    command:
      - "--prometheus.server-url=http://prometheus:9090"
      - "--log-level=error"
      - "--es.server-urls=https://es01:9200"
      - "--es.password=XXXXXXXX"  # fixed: closing quote was missing
      - "--es.tls.enabled=true"
      - "--es.tls.skip-host-verify=true"  # NOTE(review): disables TLS hostname verification
      - "--es.username=elastic"
      - "--es.version=7"
    deploy:
      resources:
        limits:
          memory: 300M
    restart: unless-stopped
    ports:
      - "16686:16686"  # Jaeger UI
      - "4317"         # OTLP gRPC default port (container-only, not published to host)
    environment:
      - COLLECTOR_OTLP_ENABLED=true
      - METRICS_STORAGE_TYPE=prometheus
      - SPAN_STORAGE_TYPE=elasticsearch
    logging: *logging

  # OpenTelemetry Collector
  otelcol:
    image: otel/opentelemetry-collector-contrib:0.82.0
    deploy:
      resources:
        limits:
          memory: 125M
    restart: unless-stopped
    command:
      - "--config=/etc/otelcol-config.yml"
      - "--config=/etc/otelcol-observability.yml"
      - "--config=/etc/otelcol-config-extras.yml"
    volumes:
      - ./otelcollector/otelcol-config.yml:/etc/otelcol-config.yml
      - ./otelcollector/otelcol-observability.yml:/etc/otelcol-observability.yml
      - ./otelcollector/otelcol-config-extras.yml:/etc/otelcol-config-extras.yml
    ports:
      - "4317:4317"  # OTLP over gRPC receiver
      - "4318:4318"  # OTLP over HTTP receiver
      - "9464"       # Prometheus exporter
      - "8888"       # metrics endpoint
    depends_on:
      - jaeger
    logging: *logging

  # Prometheus
  prometheus:
    image: quay.io/prometheus/prometheus:v2.46.0
    command:
      - --web.console.templates=/etc/prometheus/consoles
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --storage.tsdb.retention.time=1h
      - --config.file=/etc/prometheus/prometheus-config.yaml
      - --storage.tsdb.path=/prometheus
      - --web.enable-lifecycle
      - --web.route-prefix=/
      - --enable-feature=exemplar-storage
    volumes:
      - ./prometheus/prometheus-config.yaml:/etc/prometheus/prometheus-config.yaml
    deploy:
      resources:
        limits:
          memory: 1300M
    ports:
      - "9090:9090"
    logging: *logging

# Pre-created bridge network shared with the Elasticsearch compose file.
networks:
  hyman-network:
    external: true

docker-compose2.yml

version: "2.2"

services:
  # One-shot helper: generates the CA and per-node TLS certificates, fixes
  # permissions, waits for es01, then sets the kibana_system password.
  setup:
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
    user: "0"
    command: >
      bash -c '
        if [ x${ELASTIC_PASSWORD} == x ]; then
          echo "Set the ELASTIC_PASSWORD environment variable in the .env file";
          exit 1;
        elif [ x${KIBANA_PASSWORD} == x ]; then
          echo "Set the KIBANA_PASSWORD environment variable in the .env file";
          exit 1;
        fi;
        if [ ! -f config/certs/ca.zip ]; then
          echo "Creating CA";
          bin/elasticsearch-certutil ca --silent --pem -out config/certs/ca.zip;
          unzip config/certs/ca.zip -d config/certs;
        fi;
        if [ ! -f config/certs/certs.zip ]; then
          echo "Creating certs";
          echo -ne \
          "instances:\n"\
          "  - name: es01\n"\
          "    dns:\n"\
          "      - es01\n"\
          "      - localhost\n"\
          "    ip:\n"\
          "      - 127.0.0.1\n"\
          "  - name: es02\n"\
          "    dns:\n"\
          "      - es02\n"\
          "      - localhost\n"\
          "    ip:\n"\
          "      - 127.0.0.1\n"\
          "  - name: es03\n"\
          "    dns:\n"\
          "      - es03\n"\
          "      - localhost\n"\
          "    ip:\n"\
          "      - 127.0.0.1\n"\
          > config/certs/instances.yml;
          bin/elasticsearch-certutil cert --silent --pem -out config/certs/certs.zip --in config/certs/instances.yml --ca-cert config/certs/ca/ca.crt --ca-key config/certs/ca/ca.key;
          unzip config/certs/certs.zip -d config/certs;
        fi;
        echo "Setting file permissions"
        chown -R root:root config/certs;
        find . -type d -exec chmod 750 \{\} \;;
        find . -type f -exec chmod 640 \{\} \;;
        echo "Waiting for Elasticsearch availability";
        until curl -s --cacert config/certs/ca/ca.crt https://es01:9200 | grep -q "missing authentication credentials"; do sleep 30; done;
        echo "Setting kibana_system password";
        until curl -s -X POST --cacert config/certs/ca/ca.crt -u "elastic:${ELASTIC_PASSWORD}" -H "Content-Type: application/json" https://es01:9200/_security/user/kibana_system/_password -d "{\"password\":\"${KIBANA_PASSWORD}\"}" | grep -q "^{}"; do sleep 10; done;
        echo "All done!";
      '
    healthcheck:
      test: ["CMD-SHELL", "[ -f config/certs/es01/es01.crt ]"]
      interval: 1s
      timeout: 5s
      retries: 120

  # First ES node; also joined to the external network so Jaeger/otelcol
  # from the other compose file can reach it as es01 (9200 is never
  # published to the host).
  es01:
    networks:
      - hyman-network
      - default
    depends_on:
      setup:
        condition: service_healthy
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
      - esdata01:/usr/share/elasticsearch/data
    environment:
      - node.name=es01
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es02,es03
      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
      - bootstrap.memory_lock=true
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=true
      - xpack.security.http.ssl.key=certs/es01/es01.key
      - xpack.security.http.ssl.certificate=certs/es01/es01.crt
      - xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.enabled=true
      - xpack.security.transport.ssl.key=certs/es01/es01.key
      - xpack.security.transport.ssl.certificate=certs/es01/es01.crt
      - xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.verification_mode=certificate
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  es02:
    networks:
      - hyman-network
      - default
    depends_on:
      - es01
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
      - esdata02:/usr/share/elasticsearch/data
    environment:
      - node.name=es02
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es03
      - bootstrap.memory_lock=true
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=true
      - xpack.security.http.ssl.key=certs/es02/es02.key
      - xpack.security.http.ssl.certificate=certs/es02/es02.crt
      - xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.enabled=true
      - xpack.security.transport.ssl.key=certs/es02/es02.key
      - xpack.security.transport.ssl.certificate=certs/es02/es02.crt
      - xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.verification_mode=certificate
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  es03:
    networks:
      - hyman-network
      - default
    depends_on:
      - es02
    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
    volumes:
      - certs:/usr/share/elasticsearch/config/certs
      - esdata03:/usr/share/elasticsearch/data
    environment:
      - node.name=es03
      - cluster.name=${CLUSTER_NAME}
      - cluster.initial_master_nodes=es01,es02,es03
      - discovery.seed_hosts=es01,es02
      - bootstrap.memory_lock=true
      - xpack.security.enabled=true
      - xpack.security.http.ssl.enabled=true
      - xpack.security.http.ssl.key=certs/es03/es03.key
      - xpack.security.http.ssl.certificate=certs/es03/es03.crt
      - xpack.security.http.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.enabled=true
      - xpack.security.transport.ssl.key=certs/es03/es03.key
      - xpack.security.transport.ssl.certificate=certs/es03/es03.crt
      - xpack.security.transport.ssl.certificate_authorities=certs/ca/ca.crt
      - xpack.security.transport.ssl.verification_mode=certificate
      - xpack.license.self_generated.type=${LICENSE}
    mem_limit: ${MEM_LIMIT}
    ulimits:
      memlock:
        soft: -1
        hard: -1
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s --cacert config/certs/ca/ca.crt https://localhost:9200 | grep -q 'missing authentication credentials'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

  kibana:
    networks:
      - hyman-network
      - default
    depends_on:
      es01:
        condition: service_healthy
      es02:
        condition: service_healthy
      es03:
        condition: service_healthy
    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
    volumes:
      - certs:/usr/share/kibana/config/certs
      - kibanadata:/usr/share/kibana/data
    ports:
      - ${KIBANA_PORT}:5601
    environment:
      - SERVERNAME=kibana
      - ELASTICSEARCH_HOSTS=https://es01:9200
      - ELASTICSEARCH_USERNAME=kibana_system
      - ELASTICSEARCH_PASSWORD=${KIBANA_PASSWORD}
      - ELASTICSEARCH_SSL_CERTIFICATEAUTHORITIES=config/certs/ca/ca.crt
    mem_limit: ${MEM_LIMIT}
    healthcheck:
      test:
        [
          "CMD-SHELL",
          "curl -s -I http://localhost:5601 | grep -q 'HTTP/1.1 302 Found'",
        ]
      interval: 10s
      timeout: 10s
      retries: 120

volumes:
  certs:
    driver: local
  esdata01:
    driver: local
  esdata02:
    driver: local
  esdata03:
    driver: local
  kibanadata:
    driver: local

# Pre-created bridge network shared with the observability compose file.
networks:
  hyman-network:
    external: true

elastic .env

# Password for the 'elastic' user (at least 6 characters)
ELASTIC_PASSWORD=XXXXXXXX

# Password for the 'kibana_system' user (at least 6 characters)
KIBANA_PASSWORD=XXXXXXXX

# Version of Elastic products
STACK_VERSION=7.17.13

# Set the cluster name
CLUSTER_NAME=elastic-stack

# Set to 'basic' or 'trial' to automatically start the 30-day trial
LICENSE=basic
#LICENSE=trial

# Port to expose Elasticsearch HTTP API to the host
# NOTE(review): ES_PORT is not referenced by the compose file shown here —
# the es01 service publishes no host port (9200 stays internal by design);
# confirm before relying on this variable.
ES_PORT=9200
#ES_PORT=127.0.0.1:9200

# Port to expose Kibana to the host
KIBANA_PORT=5601
#KIBANA_PORT=80

# Increase or decrease based on the available host memory (in bytes)
MEM_LIMIT=1073741824

# Project namespace (defaults to the current folder name if not set)
#COMPOSE_PROJECT_NAME=myproject

為什麼使用 elastic-search ?

原本使用範例 jaeger all-in-one memory storage 可以正常跑,但是服務一多、資料量很大,很快就會壞掉了,所以把儲存拉出來外面,elastic-search 有 ILM（index lifecycle management）可以控管 span / service 的生命週期。

資料量很大,硬碟空間不夠?

在 open-telemetry 那邊可以去 filter span,我之前沒設定的時候一天資料量大約是 6–10 GB 後來去設定之後就減少很多資料量,每日大約是 100M,很多不必要被記錄的資料,例如我會對後端去掃 healthy,或是我的後端會去上傳 log 至其他 log-server。結果 open-telemetry 都會記錄下來,所以要想辦法設定 otelcol-config.yml 排除,有關process那邊要怎樣寫 filter ,可以參考我的範例,詳細還是要看官方文件。

# Copyright The OpenTelemetry Authors
# SPDX-License-Identifier: Apache-2.0

receivers:
  otlp:
    protocols:
      grpc:
      http:
        cors:
          allowed_origins:
            - "http://*"
            - "https://*"

exporters:
  # Emits telemetry to the collector's own stdout (debugging aid).
  logging:

processors:
  # Drop spans whose net.peer.name matches the telemetry endpoint itself,
  # so calls to the telemetry back-end are not recorded.
  filter/post-tele:
    spans:
      exclude:
        match_type: regexp
        attributes:
          - key: net.peer.name
            value: xxx.xxx.server.com
  # Drop spans produced by log uploads to the Loki log server.
  filter/post-loki:
    spans:
      exclude:
        match_type: regexp
        attributes:
          - key: net.peer.name
            value: ooo.ooo.server.com
  # Drop framework security-filter spans (auth chain, hash checks).
  filter/security:
    spans:
      exclude:
        match_type: regexp
        span_names:
          - (authorize request|security filterchain before|secured request|security filterchain after|getSystemHash|getHash)
  # Drop static-resource spans matched by route/target attribute patterns.
  filter/static:
    spans:
      exclude:
        match_type: regexp
        attributes:
          - key: http.route
            value: \/\S+\/\*\*\/\*
          - key: http.target
            value: (\.png|\.js)
  # Drop static-resource spans matched by span name.
  filter/static2:
    spans:
      exclude:
        match_type: regexp
        span_names:
          - http get \/\*\*\/\*
          - ParameterizableViewController.handleRequest
          - Render forward:\/index.html
          - ResourceHttpRequestHandler.handleRequest
          - ^GET \/\w+\/$

connectors:
  # Derives RED metrics from spans; fed into the metrics pipeline below.
  spanmetrics:

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors:
        - filter/post-tele
        - filter/post-loki
        - filter/security
        - filter/static2
        - filter/static
      exporters: [logging, spanmetrics]
    metrics:
      receivers: [otlp, spanmetrics]
      processors: []
      exporters: [logging]
    logs:
      receivers: [otlp]
      processors: []
      exporters: [logging]

elastic-search jaeger 的 ILM 怎樣設置?

這個部分我有嘗試過讓 jaeger 自己建立 alias 然後去用它官方的 rollover,但是很可惜並沒有建立成功,後來還是自己想辦法,因為 jaeger 如果沒有特別設定 alias 他的 index 會自動根據日期分割,我們只要把 lifecycle policy 串接上 index template 就可以達到效果。

要注意執行的步驟 ES deploy => setup ILM => open-telemetry deploy

  1. 建立 Index Lifecycle Policy,我的範例是過五天之後會自動刪除該筆 index,不設定 rollover,可以根據自己需要的情境去設定。
PUT _ilm/policy/jaeger-ilm-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "set_priority": {
            "priority": 100
          }
        }
      },
      "delete": {
        "min_age": "5d",
        "actions": {
          "delete": {
            "delete_searchable_snapshot": true
          }
        }
      }
    }
  }
}

2. 建立 index template

PUT _index_template/jaeger-span
{
  "index_patterns": [
    "*jaeger-span-*"
  ],
  "template": {
    "settings": {
      "index": {
        "lifecycle": {
          "name": "jaeger-ilm-policy"
        },
        "mapping": {
          "nested_fields": {
            "limit": "50"
          }
        },
        "requests": {
          "cache": {
            "enable": "true"
          }
        },
        "number_of_shards": "5",
        "number_of_replicas": "1"
      }
    },
    "mappings": {
      "dynamic_templates": [
        {
          "span_tags_map": {
            "path_match": "tag.*",
            "mapping": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        {
          "process_tags_map": {
            "path_match": "process.tag.*",
            "mapping": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        }
      ],
      "properties": {
        "traceID": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "process": {
          "type": "object",
          "properties": {
            "tag": {
              "type": "object"
            },
            "serviceName": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "tags": {
              "dynamic": false,
              "type": "nested",
              "properties": {
                "tagType": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "value": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "key": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            }
          }
        },
        "references": {
          "dynamic": false,
          "type": "nested",
          "properties": {
            "spanID": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "traceID": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "refType": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "startTimeMillis": {
          "format": "epoch_millis",
          "type": "date"
        },
        "flags": {
          "type": "integer"
        },
        "operationName": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "parentSpanID": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "tags": {
          "dynamic": false,
          "type": "nested",
          "properties": {
            "tagType": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "value": {
              "ignore_above": 256,
              "type": "keyword"
            },
            "key": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "duration": {
          "type": "long"
        },
        "spanID": {
          "ignore_above": 256,
          "type": "keyword"
        },
        "startTime": {
          "type": "long"
        },
        "tag": {
          "type": "object"
        },
        "logs": {
          "dynamic": false,
          "type": "nested",
          "properties": {
            "fields": {
              "dynamic": false,
              "type": "nested",
              "properties": {
                "tagType": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "value": {
                  "ignore_above": 256,
                  "type": "keyword"
                },
                "key": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            },
            "timestamp": {
              "type": "long"
            }
          }
        }
      }
    }
  },
  "composed_of": [],
  "priority": 1,
  "version": 1
}

Grafana 要怎樣設定 jaeger?

grafana 的 jaeger 設定完 datasource 之後,如果要讓圖表可以連接到 loki 的 log 資訊還需要設定關聯,Query 欄位就可以定義 LogQL 的樣板,他會從 jaeger 的 spanId 跟 traceId 製造出 query 反查回 loki

當然後端也要設定 traceId 跟 spanId 也寫入 label 裡面,這樣 LogQL 才能搜尋到。

Grafana 要怎樣設定 loki 連接至 jaeger ?

如果需要看 log 然後反查 jaeger ,也是要在 datasource 內設定 loki Derived fields,這會在每一筆 log 裡面去搜尋出 traceId 關聯至 jaeger

總結

當服務發生錯誤,我可以在 Grafana 根據時間找到錯誤的 log ,並且可以追蹤錯誤沿途經過的微服務有哪些,經過微服務裡面的哪些功能,再反查回 log ,會降低查找錯誤的時間。

圖片來自網路https://devpress.csdn.net/cicd/62eb54b26484667128339eac.html

還有一個好處就是,我能夠知道這個 API 在每個步驟花多少時間,未來可以對效能不好的服務來做優化。

--

--