Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8f61046
Add otel telemetry to agent monitoring data
faec Sep 10, 2025
3cb7ad0
add remaining standard processors
faec Sep 10, 2025
a05982e
Handle custom telemetry endpoint configurations
faec Sep 12, 2025
64dab48
Merge branch 'main' of github.com:elastic/elastic-agent into otel-tel…
faec Sep 24, 2025
415aa0a
Merge branch 'main' of github.com:elastic/elastic-agent into otel-tel…
faec Sep 24, 2025
815e444
Testing remap script
faec Sep 25, 2025
9c95ee8
Merge branch 'otel-telemetry' of github.com:faec/elastic-agent into o…
faec Sep 25, 2025
9bb0b09
adjusting event fields
faec Sep 25, 2025
c591ccc
Update field names / semantics
faec Sep 29, 2025
a77e355
fix telemetry label scope
faec Sep 29, 2025
6c37528
add comment
faec Sep 29, 2025
3e8da6a
Fix remaining component / metricset fields
faec Sep 29, 2025
922979c
Mangle agent id to prevent rejection of metrics from different label …
faec Sep 29, 2025
80841a2
fix exporter label
faec Sep 29, 2025
940c046
Remove debug code
faec Sep 29, 2025
2772b3f
Merge branch 'main' of github.com:elastic/elastic-agent into otel-tel…
faec Sep 29, 2025
b633459
Rework otel config parsing to tolerate absent config
faec Sep 30, 2025
c8c8641
Check for nil otel config
faec Sep 30, 2025
a3ab177
make check
faec Sep 30, 2025
e681979
replace custom parsing with stock otel config struct
faec Sep 30, 2025
fdfd694
add changelog fragment
faec Sep 30, 2025
4257b29
mage check
faec Sep 30, 2025
c5a7db7
mage notice
faec Sep 30, 2025
085b371
update TestMonitoringFull golden file
faec Sep 30, 2025
ae03c29
integration test: there are now 4 monitoring components instead of 3
faec Sep 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
424 changes: 212 additions & 212 deletions NOTICE-fips.txt

Large diffs are not rendered by default.

424 changes: 212 additions & 212 deletions NOTICE.txt

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions changelog/fragments/1759257958-collector-telemetry-monitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: enhancement

# Change summary; a 80ish characters long description of the change.
summary: Include OTel Collector internal telemetry in Agent monitoring

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
#pr: https://github.com/owner/repo/1234

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ require (
go.opentelemetry.io/collector/processor/memorylimiterprocessor v0.135.0
go.opentelemetry.io/collector/receiver/nopreceiver v0.135.0
go.opentelemetry.io/collector/receiver/otlpreceiver v0.135.0
go.opentelemetry.io/collector/service v0.135.0
go.opentelemetry.io/ebpf-profiler v0.0.202536
go.uber.org/zap v1.27.0
go.yaml.in/yaml/v3 v3.0.4
Expand Down Expand Up @@ -719,7 +720,6 @@ require (
go.opentelemetry.io/collector/scraper v0.135.0 // indirect
go.opentelemetry.io/collector/scraper/scraperhelper v0.135.0 // indirect
go.opentelemetry.io/collector/semconv v0.128.1-0.20250610090210-188191247685 // indirect
go.opentelemetry.io/collector/service v0.135.0 // indirect
go.opentelemetry.io/collector/service/hostcapabilities v0.135.0 // indirect
go.opentelemetry.io/contrib/bridges/otelzap v0.12.0 // indirect
go.opentelemetry.io/contrib/detectors/gcp v1.36.0 // indirect
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/application/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func New(
if err != nil {
return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err)
}
monitor := componentmonitoring.New(isMonitoringSupported, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, agentInfo, isOtelExecModeSubprocess)
monitor := componentmonitoring.New(isMonitoringSupported, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, rawConfig.OTel, agentInfo, isOtelExecModeSubprocess)

runtime, err := runtime.NewManager(
log,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ type MonitorManager interface {
// Enabled when configured to collect metrics/logs.
Enabled() bool

// Reload reloads the configuration for the upgrade manager.
// Reload reloads the configuration for the monitoring manager.
Reload(rawConfig *config.Config) error

// MonitoringConfig injects monitoring configuration into resolved ast tree.
Expand Down
139 changes: 139 additions & 0 deletions internal/pkg/agent/application/monitoring/component/otel_remap.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// A script for use in the Beats script processor, to remap raw OTel telemetry
// from its prometheus endpoint to backwards-compatible Beats metrics fields
// that can be viewed in Agent dashboards.

function process(event) {
// This hard-coded exporter name will not work for the general
// (non-monitoring) use case.
var elastic_exporter = event.Get("prometheus.labels.exporter") == "elasticsearch/_agent-component/monitoring";
var elastic_scope = event.Get("prometheus.labels.otel_scope_name") == "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/elasticsearchexporter";

// We accept general collector fields that are scoped to the elasticsearch
// exporter (queue metrics, sent / error stats), or fields specifically
// scoped to the elasticsearch exporter (custom elastic metrics).
if (!elastic_exporter && !elastic_scope) {
event.Cancel();
return;
}

// Hack: if the scope is elastic-custom fields, deterministically mangle the
// agent.id. Since the label set is different, these are passed through in
// different events, and if we don't do this one of the events will be
// rejected as a duplicate since they have the same component id, agent id,
// and metricset.
var id = event.Get("agent.id");
if (id != null && id.length > 0) {
// Increment / wrap the last hex character of the uuid
var prefix = id.substring(0, id.length - 1);
var last = id.substring(id.length - 1);
var rotated = "0";
if (last < "f") {
rotated = String.fromCharCode(last.charCodeAt(0) + 1);
}
id = prefix + rotated;
event.Put("agent.id", id);
}

// The event will be discarded unless we find some valid metric to convert.
var keep_event = false;

var queue_size = event.Get("prometheus.metrics.otelcol_exporter_queue_size");
var queue_capacity = event.Get("prometheus.metrics.otelcol_exporter_queue_capacity");
if (queue_size != null) {
keep_event = true;
event.Put("beat.stats.libbeat.pipeline.queue.filled.events", queue_size);
}
if (queue_capacity != null) {
keep_event = true;
event.Put("beat.stats.libbeat.pipeline.queue.max_events", queue_capacity);
}
if (queue_size != null && queue_capacity != null) {
var queue_pct = queue_size / queue_capacity;
if (!isNaN(queue_pct)) {
event.Put("beat.stats.libbeat.pipeline.queue.filled.pct", queue_pct);
}
}

var total_sent = 0;
var total_sent_valid = false;
// Add send statistics from all source types
var sent_logs = event.Get("prometheus.metrics.otelcol_exporter_sent_log_records_total");
if (sent_logs != null) {
total_sent += sent_logs;
total_sent_valid = true;
}
var sent_spans = event.Get("prometheus.metrics.otelcol_exporter_sent_spans_total");
if (sent_spans != null) {
total_sent += sent_spans;
total_sent_valid = true;
}
var sent_metrics = event.Get("prometheus.metrics.otelcol_exporter_sent_metric_points_total");
if (sent_metrics != null) {
total_sent += sent_metrics;
total_sent_valid = true;
}
if (total_sent_valid) {
event.Put("beat.stats.libbeat.output.events.acked", total_sent);
keep_event = true;
}

var total_failed = 0;
var total_failed_valid = false;
// Add failed statistics from all source types
var failed_logs = event.Get("prometheus.metrics.otelcol_exporter_send_failed_log_records_total");
if (failed_logs != null) {
total_failed += failed_logs;
total_failed_valid = true;
}
var failed_spans = event.Get("prometheus.metrics.otelcol_exporter_send_failed_spans_total");
if (failed_spans != null) {
total_failed += failed_spans;
total_failed_valid = true;
}
var failed_metrics = event.Get("prometheus.metrics.otelcol_exporter_send_failed_metric_points_total");
if (failed_metrics != null) {
total_failed += failed_metrics;
total_failed_valid = true;
}
if (total_failed_valid) {
event.Put("beat.stats.libbeat.output.events.dropped", total_failed);
keep_event = true;
}

var flushed_bytes = event.Get("prometheus.metrics.otelcol_elasticsearch_flushed_bytes_total");
if (flushed_bytes != null) {
event.Put("beat.stats.libbeat.output.write.bytes", flushed_bytes);
keep_event = true;
}

var retried_docs = event.Get("prometheus.metrics.otelcol_elasticsearch_docs_retried_ratio_total");
if (retried_docs != null) {
// "failed" in the beats metric means an event failed to ingest but was
// not dropped, and will be retried.
event.Put("beat.stats.libbeat.output.events.failed", retried_docs);
keep_event = true;
}

var request_count = event.Get("prometheus.metrics.otelcol_elasticsearch_bulk_requests_count_ratio_total");
if (request_count != null) {
// This is not an exact semantic match for how Beats measures batch count,
// but it's close.
event.Put("beat.stats.libbeat.output.events.batches", request_count);
keep_event = true;
}

var processed_docs_count = event.Get("prometheus.metrics.otelcol_elasticsearch_docs_processed_ratio_total");
if (processed_docs_count != null) {
// Approximate semantic match: the otel metric counts all document
// ingestion attempts, including success, failure, and retries,
// which is a better match for the Beats definition of total events
// than otelcol_elasticsearch_docs_received_ratio_total which
// includes only unique events seen (regardless of retries etc).
event.Put("beat.stats.libbeat.output.events.total", processed_docs_count);
keep_event = true;
}

if (!keep_event) {
event.Cancel();
}
}
Loading