Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
429 changes: 429 additions & 0 deletions NOTICE-fips.txt

Large diffs are not rendered by default.

429 changes: 429 additions & 0 deletions NOTICE.txt

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions changelog/fragments/1759257958-collector-telemetry-monitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: enhancement

# Change summary; an 80ish-character-long description of the change.
summary: Include OTel Collector internal telemetry in Agent monitoring

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
#pr: https://github.com/owner/repo/1234

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
23 changes: 23 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,30 @@ require (
go.elastic.co/apm/v2 v2.7.1
go.elastic.co/ecszap v1.0.3
go.elastic.co/go-licence-detector v0.7.0
go.opentelemetry.io/collector/component/componentstatus v0.135.0
go.opentelemetry.io/collector/component/componenttest v0.135.0
go.opentelemetry.io/collector/connector/forwardconnector v0.135.0
go.opentelemetry.io/collector/exporter/debugexporter v0.135.0
go.opentelemetry.io/collector/exporter/nopexporter v0.135.0
go.opentelemetry.io/collector/exporter/otlpexporter v0.135.0
go.opentelemetry.io/collector/exporter/otlphttpexporter v0.135.0
go.opentelemetry.io/collector/extension/extensiontest v0.135.0
go.opentelemetry.io/collector/extension/memorylimiterextension v0.135.0
go.opentelemetry.io/collector/pipeline v1.41.0
go.opentelemetry.io/collector/processor/batchprocessor v0.135.0
go.opentelemetry.io/collector/processor/memorylimiterprocessor v0.135.0
go.opentelemetry.io/collector/receiver/nopreceiver v0.135.0
go.opentelemetry.io/collector/receiver/otlpreceiver v0.135.0
go.opentelemetry.io/collector/service v0.135.0
go.opentelemetry.io/ebpf-profiler v0.0.202536
go.uber.org/zap v1.27.0
golang.org/x/crypto v0.41.0
golang.org/x/exp v0.0.0-20250215185904-eff6e970281f
Expand Down Expand Up @@ -686,8 +705,12 @@ require (
go.opentelemetry.io/collector/scraper v0.132.0 // indirect
go.opentelemetry.io/collector/scraper/scraperhelper v0.132.0 // indirect
go.opentelemetry.io/collector/semconv v0.128.1-0.20250610090210-188191247685 // indirect
go.opentelemetry.io/collector/service/hostcapabilities v0.135.0 // indirect
go.opentelemetry.io/contrib/bridges/otelzap v0.12.0 // indirect
go.opentelemetry.io/contrib/detectors/gcp v1.36.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.62.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions internal/pkg/agent/application/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ func New(
if err != nil {
return nil, nil, nil, fmt.Errorf("failed to create upgrader: %w", err)
}
monitor := componentmonitoring.New(isMonitoringSupported, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, rawConfig.OTel, agentInfo, isOtelExecModeSubprocess)

runtime, err := runtime.NewManager(
log,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ type MonitorManager interface {
// Enabled when configured to collect metrics/logs.
Enabled() bool

// Reload reloads the configuration for the upgrade manager.
// Reload reloads the configuration for the monitoring manager.
Reload(rawConfig *config.Config) error

// MonitoringConfig injects monitoring configuration into resolved ast tree.
Expand Down
139 changes: 139 additions & 0 deletions internal/pkg/agent/application/monitoring/otel_remap.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// A script for use in the Beats script processor, to remap raw OTel telemetry
// from its prometheus endpoint to backwards-compatible Beats metrics fields
// that can be viewed in Agent dashboards.

// process remaps one raw OTel Collector prometheus metric event into the
// backwards-compatible Beats monitoring fields consumed by Agent dashboards.
// Events that are not scoped to the elasticsearch exporter, or that contain
// no convertible metric, are cancelled (dropped).
function process(event) {
    // This hard-coded exporter name will not work for the general
    // (non-monitoring) use case.
    var elastic_exporter = event.Get("prometheus.labels.exporter") == "elasticsearch/_agent-component/monitoring";
    var elastic_scope = event.Get("prometheus.labels.otel_scope_name") == "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/elasticsearchexporter";

    // We accept general collector fields that are scoped to the elasticsearch
    // exporter (queue metrics, sent / error stats), or fields specifically
    // scoped to the elasticsearch exporter (custom elastic metrics).
    if (!elastic_exporter && !elastic_scope) {
        event.Cancel();
        return;
    }

    // Hack: if the scope is elastic-custom fields, deterministically mangle the
    // agent.id. Since the label set is different, these are passed through in
    // different events, and if we don't do this one of the events will be
    // rejected as a duplicate since they have the same component id, agent id,
    // and metricset.
    var id = event.Get("agent.id");
    if (id != null && id.length > 0) {
        // Increment / wrap the last hex character of the uuid.
        // Fix: the previous char-code increment produced non-hex output for
        // "9" (":") and uppercase digits ("F" -> "G"); rotate within the hex
        // alphabet instead so the result is always a valid hex character.
        var prefix = id.substring(0, id.length - 1);
        var last = id.substring(id.length - 1).toLowerCase();
        var hexDigits = "0123456789abcdef";
        var pos = hexDigits.indexOf(last);
        // Non-hex trailing characters (shouldn't happen for a uuid) fall back
        // to "0", matching the previous wrap behavior for "f".
        var rotated = pos >= 0 ? hexDigits.charAt((pos + 1) % 16) : "0";
        event.Put("agent.id", prefix + rotated);
    }

    // The event will be discarded unless we find some valid metric to convert.
    var keep_event = false;

    // Queue metrics: map size/capacity onto the Beats pipeline queue fields,
    // and derive a fill percentage when both are present.
    var queue_size = event.Get("prometheus.metrics.otelcol_exporter_queue_size");
    var queue_capacity = event.Get("prometheus.metrics.otelcol_exporter_queue_capacity");
    if (queue_size != null) {
        keep_event = true;
        event.Put("beat.stats.libbeat.pipeline.queue.filled.events", queue_size);
    }
    if (queue_capacity != null) {
        keep_event = true;
        event.Put("beat.stats.libbeat.pipeline.queue.max_events", queue_capacity);
    }
    if (queue_size != null && queue_capacity != null) {
        var queue_pct = queue_size / queue_capacity;
        // Guard against capacity 0 producing NaN.
        if (!isNaN(queue_pct)) {
            event.Put("beat.stats.libbeat.pipeline.queue.filled.pct", queue_pct);
        }
    }

    var total_sent = 0;
    var total_sent_valid = false;
    // Add send statistics from all source types.
    var sent_logs = event.Get("prometheus.metrics.otelcol_exporter_sent_log_records_total");
    if (sent_logs != null) {
        total_sent += sent_logs;
        total_sent_valid = true;
    }
    var sent_spans = event.Get("prometheus.metrics.otelcol_exporter_sent_spans_total");
    if (sent_spans != null) {
        total_sent += sent_spans;
        total_sent_valid = true;
    }
    var sent_metrics = event.Get("prometheus.metrics.otelcol_exporter_sent_metric_points_total");
    if (sent_metrics != null) {
        total_sent += sent_metrics;
        total_sent_valid = true;
    }
    if (total_sent_valid) {
        event.Put("beat.stats.libbeat.output.events.acked", total_sent);
        keep_event = true;
    }

    var total_failed = 0;
    var total_failed_valid = false;
    // Add failed statistics from all source types.
    var failed_logs = event.Get("prometheus.metrics.otelcol_exporter_send_failed_log_records_total");
    if (failed_logs != null) {
        total_failed += failed_logs;
        total_failed_valid = true;
    }
    var failed_spans = event.Get("prometheus.metrics.otelcol_exporter_send_failed_spans_total");
    if (failed_spans != null) {
        total_failed += failed_spans;
        total_failed_valid = true;
    }
    var failed_metrics = event.Get("prometheus.metrics.otelcol_exporter_send_failed_metric_points_total");
    if (failed_metrics != null) {
        total_failed += failed_metrics;
        total_failed_valid = true;
    }
    if (total_failed_valid) {
        event.Put("beat.stats.libbeat.output.events.dropped", total_failed);
        keep_event = true;
    }

    var flushed_bytes = event.Get("prometheus.metrics.otelcol_elasticsearch_flushed_bytes_total");
    if (flushed_bytes != null) {
        event.Put("beat.stats.libbeat.output.write.bytes", flushed_bytes);
        keep_event = true;
    }

    var retried_docs = event.Get("prometheus.metrics.otelcol_elasticsearch_docs_retried_ratio_total");
    if (retried_docs != null) {
        // "failed" in the beats metric means an event failed to ingest but was
        // not dropped, and will be retried.
        event.Put("beat.stats.libbeat.output.events.failed", retried_docs);
        keep_event = true;
    }

    var request_count = event.Get("prometheus.metrics.otelcol_elasticsearch_bulk_requests_count_ratio_total");
    if (request_count != null) {
        // This is not an exact semantic match for how Beats measures batch count,
        // but it's close.
        event.Put("beat.stats.libbeat.output.events.batches", request_count);
        keep_event = true;
    }

    var processed_docs_count = event.Get("prometheus.metrics.otelcol_elasticsearch_docs_processed_ratio_total");
    if (processed_docs_count != null) {
        // Approximate semantic match: the otel metric counts all document
        // ingestion attempts, including success, failure, and retries,
        // which is a better match for the Beats definition of total events
        // than otelcol_elasticsearch_docs_received_ratio_total which
        // includes only unique events seen (regardless of retries etc).
        event.Put("beat.stats.libbeat.output.events.total", processed_docs_count);
        keep_event = true;
    }

    if (!keep_event) {
        event.Cancel();
    }
}
Loading
Loading