Patch release v0.21.2 #2059

Merged
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
# Changelog

## [0.21.2]

### Fixed

- Fixed #1754 - net: traffic blocks when running ingress UDP performance tests
with very large buffers.

## [0.21.1]

### Fixed
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion src/api_server/swagger/firecracker.yaml
@@ -5,7 +5,7 @@ info:
The API is accessible through HTTP calls on specific URLs
carrying JSON modeled data.
The transport medium is a Unix Domain Socket.
version: 0.21.1
version: 0.21.2
termsOfService: ""
contact:
email: "[email protected]"
9 changes: 7 additions & 2 deletions src/devices/src/virtio/net.rs
@@ -552,7 +552,11 @@ impl EpollHandler for NetEpollHandler {
RX_TAP_EVENT => {
METRICS.net.rx_tap_event_count.inc();

if self.rx.queue.is_empty(&self.mem) {
// While there are no available RX queue buffers and there's a deferred
// frame, don't process any more incoming frames. Otherwise, start
// processing a frame; in the process, the deferred_frame flag will be
// set so that the RX queue doesn't freeze.
if self.rx.queue.is_empty(&self.mem) && self.rx.deferred_frame {
return Err(DeviceError::NoAvailBuffers);
}

@@ -1479,7 +1483,8 @@ mod tests {
let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap();
let (mut h, _txq, rxq) = default_test_netepollhandler(&mem, test_mutators);

// The RX queue is empty.
// The RX queue is empty and rx.deferred_frame flag is set.
h.rx.deferred_frame = true;
match h.handle_event(RX_TAP_EVENT, epoll::Events::EPOLLIN) {
Err(DeviceError::NoAvailBuffers) => (),
_ => panic!("invalid"),
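To make the guard's intent concrete, here is a minimal Python sketch of the decision the handler now makes on a tap event. The names are simplified stand-ins for the real Firecracker types, not the actual API.

```python
class NoAvailBuffers(Exception):
    """Stand-in for DeviceError::NoAvailBuffers."""


def handle_rx_tap_event(rx_queue_empty, deferred_frame, process_frame):
    # Old guard: bail out whenever the queue was empty. With no frame
    # deferred, nothing ever re-armed RX processing, so the queue could
    # freeze under heavy ingress traffic.
    # New guard: bail out only when a frame is already deferred.
    if rx_queue_empty and deferred_frame:
        raise NoAvailBuffers()
    # In the real device this path sets deferred_frame when buffers run
    # out, which keeps the RX queue alive.
    process_frame()


# Empty queue but no deferred frame: the new guard still processes,
# where the old one returned NoAvailBuffers and stalled RX.
handle_rx_tap_event(True, False, lambda: print("processing frame"))
```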
2 changes: 1 addition & 1 deletion src/firecracker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "firecracker"
version = "0.21.1"
version = "0.21.2"
authors = ["Amazon Firecracker team <[email protected]>"]

[dependencies]
2 changes: 1 addition & 1 deletion src/jailer/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "jailer"
version = "0.21.1"
version = "0.21.2"
authors = ["Amazon Firecracker team <[email protected]>"]

[dependencies]
26 changes: 26 additions & 0 deletions tests/framework/microvm.py
@@ -20,6 +20,7 @@
from retry import retry
from retry.api import retry_call

import host_tools.cpu_load as cpu_tools
import host_tools.memory as mem_tools
import host_tools.network as net_tools

@@ -31,6 +32,8 @@
MachineConfigure, Network, Vsock


# Too many public methods
# pylint: disable=R0904
class Microvm:
"""Class to represent a Firecracker microvm.

@@ -114,6 +117,10 @@ def __init__(
else:
self._memory_events_queue = None

# Cpu load monitoring has to be explicitly enabled using
# the `enable_cpu_load_monitor` method.
self._cpu_load_monitor = None

# External clone/exec tool, because Python can't into clone
self.bin_cloner_path = bin_cloner_path

@@ -133,6 +140,11 @@ def kill(self):
raise mem_tools.MemoryUsageExceededException(
self._memory_events_queue.get())

if self._cpu_load_monitor:
self._cpu_load_monitor.signal_stop()
self._cpu_load_monitor.join()
self._cpu_load_monitor.check_samples()

@property
def api_session(self):
"""Return the api session associated with this microVM."""
@@ -220,6 +232,20 @@ def memory_events_queue(self, queue):
"""Set the memory usage events queue."""
self._memory_events_queue = queue

def enable_cpu_load_monitor(self, threshold):
"""Enable the cpu load monitor."""
process_pid = self.jailer_clone_pid
# We want to monitor the emulation thread, which is currently
# the first one created.
# A possible improvement is to find it by name.
thread_pid = self.jailer_clone_pid
self._cpu_load_monitor = cpu_tools.CpuLoadMonitor(
process_pid,
thread_pid,
threshold
)
self._cpu_load_monitor.start()

def create_jailed_resource(self, path, create_jail=False):
"""Create a hard link to some resource inside this microvm."""
return self.jailer.jailed_path(path, create=True,
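As a hedged usage sketch (the test name and workload are illustrative, not from this diff), a test would arm the monitor before starting the workload and rely on `kill()` to join the monitor thread and surface any violations:

```python
# Hypothetical usage of the new monitor; only enable_cpu_load_monitor()
# and the kill() behavior come from this diff.
def test_with_cpu_load_check(test_microvm_with_ssh):
    test_microvm = test_microvm_with_ssh
    test_microvm.spawn()
    test_microvm.basic_config()

    # Samples above 20% will be recorded by the monitor thread.
    test_microvm.enable_cpu_load_monitor(20)

    test_microvm.start()
    # ... drive the workload under test here ...

    # kill() stops the monitor, joins it, and raises
    # CpuLoadExceededException if any sample exceeded the threshold.
    test_microvm.kill()
```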
126 changes: 126 additions & 0 deletions tests/host_tools/cpu_load.py
@@ -0,0 +1,126 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Utilities for measuring cpu utilisation for a process."""
import time

from subprocess import run, CalledProcessError, PIPE
from threading import Thread

# /proc/<pid>/stat output taken from
# https://www.man7.org/linux/man-pages/man5/proc.5.html
STAT_UTIME_IDX = 13
STAT_STIME_IDX = 14
STAT_STARTTIME_IDX = 21


class CpuLoadExceededException(Exception):
"""A custom exception containing details on excessive cpu load."""

def __init__(self, cpu_load_samples, threshold):
"""Compose the error message containing the cpu load details."""
super(CpuLoadExceededException, self).__init__(
'Cpu load samples {} exceeded maximum threshold {}.\n'
.format(cpu_load_samples, threshold)
)


class CpuLoadMonitor(Thread):
"""Class to represent a cpu load monitor for a thread."""

CPU_LOAD_SAMPLES_TIMEOUT_S = 1

def __init__(
self,
process_pid,
thread_pid,
threshold
):
"""Set up monitor attributes."""
Thread.__init__(self)
self._process_pid = process_pid
self._thread_pid = thread_pid
self._cpu_load_samples = []
self._threshold = threshold
self._should_stop = False

@property
def process_pid(self):
"""Get the process pid."""
return self._process_pid

@property
def thread_pid(self):
"""Get the thread pid."""
return self._thread_pid

@property
def threshold(self):
"""Get the cpu load threshold."""
return self._threshold

@property
def cpu_load_samples(self):
"""Get the cpu load samples."""
return self._cpu_load_samples

def signal_stop(self):
"""Signal that the thread should stop."""
self._should_stop = True

def run(self):
"""Thread for monitoring cpu load of some pid.

`/proc/<process pid>/task/<thread pid>/stat` is used to compute
the cpu load, which is then added to the list.
It is up to the caller to check the samples list.
"""
clock_ticks_cmd = 'getconf CLK_TCK'
try:
stdout = run(
clock_ticks_cmd,
shell=True,
check=True,
stdout=PIPE
).stdout.decode('utf-8')
except CalledProcessError:
return
try:
clock_ticks = int(stdout.strip("\n"))
except ValueError:
return

while not self._should_stop:
try:
with open('/proc/uptime') as uptime_file:
uptime = uptime_file.readline().strip("\n").split()[0]

with open('/proc/{pid}/task/{tid}/stat'.format(
pid=self.process_pid,
tid=self.thread_pid)
) as stat_file:
stat = stat_file.readline().strip("\n").split()
except IOError:
break

try:
uptime = float(uptime)
utime = int(stat[STAT_UTIME_IDX])
stime = int(stat[STAT_STIME_IDX])
starttime = int(stat[STAT_STARTTIME_IDX])
except ValueError:
break

total_time = utime + stime
seconds = uptime - starttime / clock_ticks
cpu_load = (total_time * 100 / clock_ticks) / seconds

if cpu_load > self.threshold:
self.cpu_load_samples.append(cpu_load)

time.sleep(self.CPU_LOAD_SAMPLES_TIMEOUT_S)

def check_samples(self):
"""Check that there are no samples above the threshold."""
if len(self.cpu_load_samples) > 0:
raise CpuLoadExceededException(
self._cpu_load_samples, self._threshold)
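As a quick sanity check on the load formula in `run()`, here is a worked example with made-up numbers (the tick counts and uptime below are illustrative):

```python
# Worked example of the formula in run(), using invented values.
clock_ticks = 100          # a common `getconf CLK_TCK` value on Linux
uptime = 500.0             # seconds since boot, from /proc/uptime
utime, stime = 2000, 1000  # user/kernel CPU ticks (stat fields 14 and 15)
starttime = 40000          # ticks after boot when the thread started (field 22)

total_time = utime + stime                  # 3000 ticks of CPU time
seconds = uptime - starttime / clock_ticks  # 500 - 400 = 100 s alive
cpu_load = (total_time * 100 / clock_ticks) / seconds
print(cpu_load)  # (3000 * 100 / 100) / 100 = 30.0, i.e. ~30% average load
```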
13 changes: 13 additions & 0 deletions tests/host_tools/network.py
@@ -320,6 +320,19 @@ def netns(self):
"""Return the network namespace of this tap."""
return self._netns

def set_tx_queue_len(self, tx_queue_len):
"""Set the length of the tap's TX queue."""
run(
'ip netns exec {} ip link set {} txqueuelen {}'.format(
self.netns,
self.name,
tx_queue_len
),
shell=True,
stderr=PIPE,
check=True
)

def __del__(self):
"""Destructor doing tap interface clean up."""
# pylint: disable=subprocess-run-check
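To verify the setting took effect, a hypothetical read-back helper could parse the `qlen` field that `ip link show` prints (standard iproute2 behavior; this helper is not part of the diff):

```python
# Hypothetical counterpart to set_tx_queue_len(); reads the queue
# length back from `ip link show` output for verification.
from subprocess import run, PIPE


def get_tx_queue_len(netns, tap_name):
    out = run(
        'ip netns exec {} ip link show {}'.format(netns, tap_name),
        shell=True, stdout=PIPE, check=True
    ).stdout.decode('utf-8')
    # The first line of output ends with e.g. "... qlen 5".
    return int(out.split('qlen')[1].split()[0])
```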
51 changes: 51 additions & 0 deletions tests/integration_tests/functional/test_net.py
@@ -0,0 +1,51 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""Tests for the net device."""
from subprocess import run, PIPE
import time

import host_tools.network as net_tools

# The iperf version to run these tests with
IPERF_BINARY = 'iperf3'


def test_high_ingress_traffic(test_microvm_with_ssh, network_config):
"""Run iperf rx with high UDP traffic."""
test_microvm = test_microvm_with_ssh
test_microvm.spawn()

test_microvm.basic_config()

# Create tap before configuring interface.
tap, _host_ip, guest_ip = test_microvm.ssh_network_config(
network_config,
'1'
)
# Set the tap's tx queue len to 5. This increases the probability
# of filling the tap under high ingress traffic.
tap.set_tx_queue_len(5)

# Start the microvm.
test_microvm.start()

# Start iperf3 server on the guest.
ssh_connection = net_tools.SSHConnection(test_microvm.ssh_config)
ssh_connection.execute_command('{} -sD\n'.format(IPERF_BINARY))
time.sleep(1)

# Start iperf3 client on the host. Send 1Gbps UDP traffic.
# If the net device breaks, iperf will freeze. We have to use a timeout.
run(
'timeout 30 {} {} -c {} -u -V -b 1000000000 -t 30'.format(
test_microvm.jailer.netns_cmd_prefix(),
IPERF_BINARY,
guest_ip,
), stdout=PIPE, shell=True, check=False
)

# Check if the high ingress traffic broke the net interface.
# If the net interface still works we should be able to execute
# ssh commands.
exit_code, _, _ = ssh_connection.execute_command('echo success\n')
assert exit_code == 0
42 changes: 42 additions & 0 deletions tests/integration_tests/functional/test_rate_limiter.py
@@ -149,6 +149,48 @@ def test_rx_rate_limiting(test_microvm_with_ssh, network_config):
_check_rx_rate_limit_patch(test_microvm, guest_ips)


def test_rx_rate_limiting_cpu_load(test_microvm_with_ssh, network_config):
"""Run iperf rx with rate limiting; verify cpu load is below threshold."""
test_microvm = test_microvm_with_ssh
test_microvm.spawn()

test_microvm.basic_config()

# Enable a monitor that checks whether the cpu load exceeds the
# threshold. Over multiple runs, the average cpu load was around 10%,
# so the threshold is set a little higher to avoid false positives.
threshold = 20
test_microvm.enable_cpu_load_monitor(threshold)

# Create interface with aggressive rate limiting enabled.
rx_rate_limiter_no_burst = {
'bandwidth': {
'size': 65536, # 64KBytes
'refill_time': 1000 # 1s
}
}
_tap, _host_ip, guest_ip = test_microvm.ssh_network_config(
network_config,
'1',
rx_rate_limiter=rx_rate_limiter_no_burst
)

test_microvm.start()

# Start iperf server on guest.
_start_iperf_on_guest(test_microvm, guest_ip)

# Run iperf client sending UDP traffic.
iperf_cmd = '{} {} -u -c {} -b 1000000000 -t{} -f KBytes'.format(
test_microvm.jailer.netns_cmd_prefix(),
IPERF_BINARY,
guest_ip,
IPERF_TRANSMIT_TIME * 5
)
_iperf_out = _run_local_iperf(iperf_cmd)


def _check_tx_rate_limiting(test_microvm, guest_ips, host_ips):
"""Check that the transmit rate is within expectations."""
# Start iperf on the host as this is the tx rate limiting test.