Skip to content

High network bandwidth usage by consul on port 8300 #22121

@Alexsandr-Random

Description

@Alexsandr-Random

Overview of the Issue

Recently we noticed high traffic rates in iptables counters on master servers. After investigation we found that the main part of traffic comes from consul agents on port 8300 (RPC).
This creates a big problem in environments with limited bandwidth or with a large number of nodes.
A little context. We have a cluster that consists of more than 100 nodes (nomad + consul) that are geographically located in different parts of the world and communicate via a VPN channel.
We noticed that the situation repeats on smaller clusters (6 nodes, but many services on each node): about 1-1.5 GB of traffic per day arrives at the master on port 8300 from a single agent.
Thus, this creates problems for large distributed clusters with 100+ nodes: we receive more than 100 GB of traffic on the master server every day from Consul alone, and as a result we have already started receiving notices about exceeding our traffic limit (20 TB / month).
Can you explain what could be the cause of such a volume of traffic or is this normal behavior?
I will provide the configuration of 1 master and 1 agent (some values are edited for security purposes).

I would also like to note that the Consul logs contain nothing beyond info-level messages (we also turned on debug logging, but there was nothing interesting there either). Consul synchronizes the traefik service quite often (more often than the others) — the service itself, not its check — but I doubt that this service is the cause of such high traffic consumption.


Reproduction Steps

Steps to reproduce this issue, eg:

  1. Create a cluster with 100+ client nodes and 3 or more server nodes in different subnets. A VPN connection is not necessary; it is enough for agents and masters to reach each other directly.
  2. Set up separate network counters for client and master traffic on port 8300 (you can also set up counters for the other Consul ports).
  3. Let it run for 1h and observe the high network usage (multiply by 24 to get the value for one day).

We tried to reduce consumption by adding the following parameters to the consul configuration (they can be ignored, the problem is reproducible without them)

    "cache": {
        "entry_fetch_max_burst": 3,
        "entry_fetch_rate": 0.333
    },
    "disable_update_check": true,
    "check_update_interval": "10m",
    "gossip_lan": {
       "probe_interval": "6s",
       "probe_timeout": "3s",
       "retransmit_mult": 3,
       "suspicion_mult": 5,
       "gossip_interval": "1s",
       "gossip_nodes": 3
   },

Consul info for both Client and Server

Client info
Output from client 'consul info' command here

agent:
        check_monitors = 0
        check_ttls = 2
        checks = 7
        services = 7
build:
        prerelease = 
        revision = 33e5727a
        version = 1.20.2
        version_metadata = 
consul:
        acl = enabled
        known_servers = 3
        server = false
runtime:
        arch = amd64
        cpu_count = 8
        goroutines = 75
        max_procs = 8
        os = linux
        version = go1.22.7
serf_lan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 351
        failed = 0
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 58820
        members = 79
        query_queue = 0
        query_time = 4

Client agent HCL config

{
  "bind_addr": "10.100.20.3",
  "client_addr": "127.0.0.1",
  "datacenter": "hel1",
  "primary_datacenter": "fsn1",
  "data_dir": "/var/local/consul",
  "domain": "consul",
  "enable_script_checks": true,
  "dns_config": {
    "enable_truncate": true,
    "only_passing": true
  },
  "node_name": "2559",
  "enable_syslog": true,
  "encrypt": "secret",
  "encrypt_verify_incoming": true,
  "encrypt_verify_outgoing": true,
  "leave_on_terminate": true,
  "log_level": "info",
  "rejoin_after_leave": true,
  "retry_join": [
    "10.100.10.11",
    "10.100.10.12",
    "10.100.10.13"
  ],
  "acl": {
    "enabled": true,
    "default_policy": "deny",
    "enable_token_persistence": true,
    "tokens": {
      "agent": "secret"
    }
  },
  "tls": {
    "defaults": {
      "verify_incoming": true,
      "verify_outgoing": true,
      "ca_file": "/usr/local/etc/ssl/consul-agent-ca.pem"
    },
    "internal_rpc": {
      "verify_server_hostname": true
    }
  },
  "auto_encrypt": {
    "tls": true
  },
  "ports": {
    "grpc": 8502
  },
  "cache": {
    "entry_fetch_max_burst": 3,
    "entry_fetch_rate": 0.333
  },
  "disable_update_check": true,
  "check_update_interval": "60m",
  "gossip_lan": {
    "probe_interval": "6s",
    "probe_timeout": "3s",
    "retransmit_mult": 3,
    "suspicion_mult": 5,
    "gossip_interval": "1s",
    "gossip_nodes": 3
  }
}
Server info
Output from server 'consul info' command here

agent:
        check_monitors = 0
        check_ttls = 0
        checks = 4
        services = 4
build:
        prerelease = 
        revision = 33e5727a
        version = 1.20.2
        version_metadata = 
consul:
        acl = enabled
        bootstrap = false
        known_datacenters = 2
        leader = true
        leader_addr = 10.100.10.13:8300
        server = true
raft:
        applied_index = 12409662
        commit_index = 12409662
        fsm_pending = 0
        last_contact = 0
        last_log_index = 12409662
        last_log_term = 1371
        last_snapshot_index = 12398574
        last_snapshot_term = 1371
        latest_configuration = [{Suffrage:Voter ID:f48b1b28-0af1-876b-f949-9ba9ccedde51 Address:10.100.10.13:8300} {Suffrage:Voter ID:1fac6648-f334-90c0-9657-368c9626fe00 Address:10.100.10.11:8300} {Suffrage:Voter ID:d368c927-8d7a-fb00-3571-f53b693c9394 Address:10.100.10.12:8300}]
        latest_configuration_index = 0
        num_peers = 2
        protocol_version = 3
        protocol_version_max = 3
        protocol_version_min = 0
        snapshot_version_max = 1
        snapshot_version_min = 0
        state = Leader
        term = 1371
runtime:
        arch = amd64
        cpu_count = 2
        goroutines = 1091
        max_procs = 2
        os = linux
        version = go1.22.7
serf_lan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 351
        failed = 1
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 58820
        members = 80
        query_queue = 0
        query_time = 4
serf_wan:
        coordinate_resets = 0
        encrypted = true
        event_queue = 0
        event_time = 1
        failed = 0
        health_score = 0
        intent_queue = 0
        left = 0
        member_time = 1913
        members = 6
        query_queue = 0
        query_time = 1

Server agent HCL config

{
    "bootstrap_expect": 3,
    "server": true,
    "advertise_addr": "10.100.10.13",
    "advertise_addr_wan": "10.44.10.3",
    "client_addr": "127.0.0.1",
    "datacenter": "hel1",
    "primary_datacenter": "fsn1",
    "data_dir": "/var/local/consul",
    "domain": "consul",
    "recursors": ["1.1.1.1", "8.8.8.8"],
    "enable_script_checks": true,
    "dns_config": {
        "enable_truncate": true,
        "only_passing": true
    },
    "performance": {
        "raft_multiplier": 1
    },
    "ports": {
        "dns": 53
    },
    "enable_syslog": true,
    "encrypt": "secret",
    "encrypt_verify_incoming": true,
    "encrypt_verify_outgoing": true,
    "leave_on_terminate": true,
    "log_level": "info",
    "rejoin_after_leave": true,
    "retry_join": ["10.44.10.1", "10.44.10.2", "10.44.10.3"],
    "retry_join_wan": ["10.44.0.2", "10.44.0.3", "10.44.0.4"],
    "acl": {
        "enabled": true,
        "default_policy": "deny",
        "down_policy": "extend-cache",
        "enable_token_persistence": true,
        "enable_token_replication": true
    },
    "cache": {
        "entry_fetch_max_burst": 3,
        "entry_fetch_rate": 0.333
    },
    "disable_update_check": true,
    "check_update_interval": "10m",
    "gossip_lan": {
       "probe_interval": "6s",
       "probe_timeout": "3s",
       "retransmit_mult": 3,
       "suspicion_mult": 5,
       "gossip_interval": "1s",
       "gossip_nodes": 3
   },
    "ui_config": {
        "enabled": true
    },
    "connect": {
        "enabled": true
    }
}

Operating system and Environment details

Running on Ubuntu 24.04 LTS

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions