diff --git a/doc/Install-CentOS8 b/doc/Install-CentOS8 index dba933f..55240e0 100644 --- a/doc/Install-CentOS8 +++ b/doc/Install-CentOS8 @@ -37,6 +37,18 @@ It used the 2021-02-19 repo. You can adjust your commands based on running in a $ sudo dnf --disablerepo="*" --enablerepo="elrepo-kernel" list available | grep kernel-ml Install the latest mainline kernel $ sudo dnf --enablerepo=elrepo-kernel install kernel-ml + +2b. [required only for Example R] Configure Soft-RoCE + In order to configure Soft-RoCE follow the manuals: + - https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/networking_guide/sec-configuring_soft-_roce or/and + - https://support.mellanox.com/s/article/howto-configure-soft-roce + +Add also the following 4 lines to the end of the '/etc/security/limits.conf' file: +* soft memlock unlimited +* hard memlock unlimited +* soft nofile 1048000 +* hard nofile 1048000 +to set the values of maximum locked memory and maximum open files required by Soft-RoCE. 3. Install the basics you care about, something like: $ sudo yum -y install dnf git ipmctl ndctl vim-enhanced firewalld certbot @@ -121,6 +133,16 @@ Install the latest mainline kernel $ sudo ipmctl create -goal PersistentMemoryType=AppDirect $ sudo systemctl reboot +11a. [Example R] + + $ sudo ndctl create-namespace -f -e namespace0.0 --mode=devdax + $ sudo chown -R $(whoami):$(whoami) /dev/dax0.0 + $ sudo chmod a+rw /dev/dax0.0 + +Go to bullet 13. (Add a root cron job to expire old webhackathon sessions). + +11b. 
[examples other than Example R] + $ sudo ndctl create-namespace --mode fsdax --continue (note: this will fail if there is already an fsxdax on the memory, go to the next step) diff --git a/docker/Dockerfile b/docker/Dockerfile index b473808..cbd89c8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,7 +11,15 @@ FROM fedora:30 LABEL maintainer="andy.rudoff@intel.com" +ENV RPMA_DEPS "\ + iproute \ + iputils \ + librdmacm-utils \ + net-tools \ + rdma-core-devel" + RUN dnf update -y && dnf install -y\ + $RPMA_DEPS\ autoconf\ automake\ bash-completion\ @@ -103,7 +111,10 @@ RUN /pmemkv-ruby.sh COPY memkind.sh / RUN /memkind.sh +COPY librpma.sh / +RUN /librpma.sh + COPY tz.sh / RUN /tz.sh -RUN rm /pmdk.sh /valgrind.sh /pmemobj-cpp.sh /pmemkv.sh /setup-maven-settings.sh /pmemkv-java.sh /pmemkv-python.sh /pmemkv-nodejs.sh /pmemkv-ruby.sh /memkind.sh /tz.sh +RUN rm /pmdk.sh /valgrind.sh /pmemobj-cpp.sh /pmemkv.sh /setup-maven-settings.sh /pmemkv-java.sh /pmemkv-python.sh /pmemkv-nodejs.sh /pmemkv-ruby.sh /memkind.sh /librpma.sh /tz.sh diff --git a/docker/librpma.sh b/docker/librpma.sh new file mode 100755 index 0000000..69371fa --- /dev/null +++ b/docker/librpma.sh @@ -0,0 +1,19 @@ +#!/bin/bash -ex + +LIBRPMA_VERSION=1.1.0 +ZIP_FILE=rpma.zip + +# install librpma +wget -O $ZIP_FILE https://github.com/pmem/rpma/archive/${LIBRPMA_VERSION}.zip +unzip $ZIP_FILE +mkdir -p rpma-${LIBRPMA_VERSION}/build +pushd rpma-${LIBRPMA_VERSION}/build +cmake .. 
-DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DBUILD_DOC=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_TESTS=OFF +make -j$(nproc) +sudo make -j$(nproc) install +popd +rm -rf $ZIP_FILE rpma-${LIBRPMA_VERSION} diff --git a/img/examples/R/ecosystem.png b/img/examples/R/ecosystem.png new file mode 100644 index 0000000..93f8eb8 Binary files /dev/null and b/img/examples/R/ecosystem.png differ diff --git a/scripts/config_softroce.sh b/scripts/config_softroce.sh new file mode 100755 index 0000000..8303f0e --- /dev/null +++ b/scripts/config_softroce.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020-2022, Intel Corporation + +# +# config_softroce.sh - configure SoftRoCE +# +# This script configures a software-emulated RDMA network interface (SoftRoCE), +# which requires: +# - one machine with a regular Ethernet network adapter, +# - the libibverbs and librdmacm libraries (or the rdma-core package +# containing both of them) installed, +# - the 'rdma_rxe' kernel module loaded. 
+# +# Usage: config_softroce.sh [] +# +# Options: +# - configure SoftRoCE for the given +# or for the first active and up one if no argument +# is given +# + +NET_IF=$1 + +MODULE="rdma_rxe" +DIR="/lib/modules/$(uname -r)" +STATE_OK="state ACTIVE physical_state LINK_UP" + +function get_IP4() { + NET_IF=$1 + ip -4 -j -p a show $NET_IF | grep -e local | cut -d'"' -f4 +} + +function print_IP4_of() { + NET_IF=$1 + IP=$(get_IP4 $NET_IF) + if [ "$IP" != "" ]; then + echo -n "$IP" + else + echo "no IP address assigned" + exit 1 + fi +} + +function print_IP4_all() { + NET_IFS=$(rdma link show | grep -e "$STATE_OK" | cut -d' ' -f8) + for NET_IF in $NET_IFS; do + IP=$(get_IP4 $NET_IF) + [ "$IP" != "" ] && echo -n "$IP" + done +} + +if [ $(lsmod | grep -e $MODULE | wc -l) -lt 1 ]; then + N_MODULES=$(find $DIR -name "$MODULE.ko*" | wc -l) + if [ $N_MODULES -lt 1 ]; then + echo "Error: cannot find the '$MODULE' module in the '$DIR' directory" + exit 1 + fi + + if ! sudo modprobe $MODULE; then + echo "Error: cannot load the '$MODULE' module" + sudo modprobe -v $MODULE + exit 1 + fi +fi + +if ! which ip > /dev/null; then + echo "Error: cannot find the 'ip' command. Install the 'iproute/iproute2' package" + exit 1 +fi + +if ! which rdma > /dev/null; then + echo "Error: cannot find the 'rdma' command. Install the 'iproute/iproute2' package" + exit 1 +fi + +if ! rdma link show > /dev/null ; then + echo "Error: the 'rdma link show' command failed" + exit 1 +fi + +if [ "$NET_IF" == "" ]; then + RDMA_LINKS=$(rdma link show | grep -e "$STATE_OK" | wc -l) + if [ $RDMA_LINKS -gt 0 ]; then + print_IP4_all + exit 0 + fi + + # pick up the first 'up' network interface + NET_IF=$(ip link | grep -v -e "LOOPBACK" | grep -e "state UP" | head -n1 | cut -d: -f2 | cut -d' ' -f2) + if [ "$NET_IF" == "" ]; then + # + # Look for a USB Ethernet network interfaces, + # which may not have 'state UP', + # but only 'UP' and 'state UNKNOWN', for example: + # ... ... state UNKNOWN ... 
+ # + NET_IF=$(ip link | grep -v -e "LOOPBACK" | grep -e "UP" | grep -e "state UNKNOWN" | head -n1 | cut -d: -f2 | cut -d' ' -f2) + if [ "$NET_IF" == "" ]; then + echo "Error: cannot find an active and up network interface" + exit 1 + fi + fi +fi + +RXE_NAME="rxe_$NET_IF" +sudo rdma link add $RXE_NAME type rxe netdev $NET_IF +if [ $? -ne 0 ]; then + echo "Error: configuring SoftRoCE failed" + exit 1 +fi + +RDMA_LINKS=$(rdma link show | grep -e "$STATE_OK" | grep -e "$NET_IF" | wc -l) +if [ $RDMA_LINKS -lt 1 ]; then + echo "Error: configuring SoftRoCE for the '$NET_IF' network interface failed" + exit 1 +fi + +print_IP4_of $NET_IF diff --git a/scripts/enable_pmemusers b/scripts/enable_pmemusers index a9e5cb0..1eea6b2 100755 --- a/scripts/enable_pmemusers +++ b/scripts/enable_pmemusers @@ -85,6 +85,7 @@ die "$H/img directory not found\n" unless -d "$H/img"; die "$H/js directory not found\n" unless -d "$H/js"; die "$H/templates directory not found\n" unless -d "$H/templates"; die "$H/users directory not found\n" unless -d "$H/users"; +die "the jq command is missing\n" unless system("which jq > /dev/null") == 0; my $passwd = `grep '^pmemuser[0-9][0-9]*:' /etc/passwd`; my $nopasswd = $? 
>> 8; @@ -92,6 +93,25 @@ die "no pmemuserX accounts in /etc/passwd\n" if $nopasswd; my @pmem_mounts = ('/pmem0', '/pmem1'); +# set name of device DAX +my $dax_name = "dax0.0"; + +# check if the device DAX exists +my $dev_dax = "/dev/$dax_name"; +die "$dev_dax device DAX not found\n" unless -c "$dev_dax"; + +# look for device DAX in ndctl +my $exists = `ndctl list | grep -c -e "$dax_name"`; +chomp($exists); +die "$dev_dax device DAX not listed by ndctl\n" if ($exists != 1); + +# verify if the device DAX has the minimum required size of 800KiB (4KiB per user) +my $size = `ndctl list | jq '.[] | select(.chardev=="$dax_name")' | jq '.size'`; +die "size of device DAX is less than 800KiB ($size < 819200)\n" if ($size < 819200); + +# add RW permissions for all users to the device DAX +runy("chmod a+rw $dev_dax"); + for (my $id = $Lo; $id <= $Hi; $id++) { my $thisid = $BASEID + $id; my $user = "pmemuser$id"; @@ -105,7 +125,18 @@ for (my $id = $Lo; $id <= $Hi; $id++) { runy("echo '$user:$Pass' | chpasswd"); } - runy("docker run -t -d --name=$user -v $H/users/$user/shadow:/etc/shadow -v $H/users/$user/passwd:/etc/passwd -v $H/users/$user/group:/etc/group -v $H/users/$user/home:/home/$user -v $pmemdir:/pmem -u $thisid:$thisid --hostname=container$id --workdir=/home/$user --cap-add SYS_PTRACE --restart=unless-stopped $Image"); + runy("echo 'enable_pmemusers: PMEMUSER_ID=$id'"); + runy("echo 'enable_pmemusers: DEV_DAX=$dev_dax'"); + + my $ip = `./config_softroce.sh`; + if ($? 
>> 8) { + say("WARNING: config_softroce.sh failed, SoftRoCE is not configured, the Example R will not work!"); + $ip = "config_softroce.sh_failed"; + runy("docker run -t -d --name=$user --env RPMA_SOFT_ROCE_IP=$ip --env PMEMUSER_ID=$id --env DEV_DAX=$dev_dax -v $H/users/$user/shadow:/etc/shadow -v $H/users/$user/passwd:/etc/passwd -v $H/users/$user/group:/etc/group -v $H/users/$user/home:/home/$user -v $pmemdir:/pmem -u $thisid:$thisid --hostname=container$id --workdir=/home/$user --cap-add SYS_PTRACE --restart=unless-stopped $Image"); + } else { + runy("echo 'enable_pmemusers: RPMA_SOFT_ROCE_IP=$ip'"); + runy("docker run -t -d --name=$user --env RPMA_SOFT_ROCE_IP=$ip --env PMEMUSER_ID=$id --env DEV_DAX=$dev_dax --network host --device /dev/infiniband --device $dev_dax -v /sys/class/infiniband:/sys/class/infiniband -v /sys/class/infiniband_verbs:/sys/class/infiniband_verbs -v /sys/class/misc/rdma_cm:/sys/class/misc/rdma_cm -v $H/users/$user/shadow:/etc/shadow -v $H/users/$user/passwd:/etc/passwd -v $H/users/$user/group:/etc/group -v $H/users/$user/home:/home/$user -v $pmemdir:/pmem -u $thisid:$thisid --hostname=container$id --workdir=/home/$user --cap-add SYS_PTRACE --restart=unless-stopped $Image"); + } } say('done'); diff --git a/templates/examples/R/body.tmpl b/templates/examples/R/body.tmpl new file mode 100644 index 0000000..f54f5eb --- /dev/null +++ b/templates/examples/R/body.tmpl @@ -0,0 +1,247 @@ +{{top "Remote Persistent Memory"}} + +

+We assume you already know what Persistent Memory (or PMem for short) is and hopefully +you have learned many possible ways of benefiting from using it in your applications. +In this example we want to introduce a basic way of accessing the very same +Persistent Memory but when installed in a remote system. Remote Persistent Memory +(or RPMem for short) is a way of doing this via a network by making use +of the Remote Direct Memory Access (RDMA) technology. + +

+This example is an introduction to RPMem. We will guide you briefly through setting up +all required hardware and software components and verifying whether the connection +works properly. Having that, we will show you how to access Persistent Memory +on a remote system using the librpma library (where RPMA stands for +Remote Persistent Memory Access, makes sense right?). + +

+After completing this example you will know: +

    +
  • what RPMem is and what it is good for,
  • +
  • how RPMem differs from PMem,
  • +
  • what hardware and software components are required to start using RPMem,
  • +
  • how to verify whether the RDMA network works properly,
  • +
  • how to use the librpma API to: +
      +
    • establish a connection,
    • +
    • prepare memory for remote manipulation,
    • +
    • manipulate memory on the remote system,
    • +
    • ensure persistence of stores to the remote system.
    • +
    +
  • +
+ +

+Step-by-step you will: +

    +
  • test the connection on the basic level using ping,
  • +
  • test the connection RDMA capabilities using rping,
  • +
  • review an application focusing on establishing a connection,
  • +
  • review an application reading the remote system's memory and writing it back + in a persistent manner.
  • +
+ +{{step "Connectivity check"}} + +

+In order to have a real RDMA network you have to have: +

    +
  • two machines both equipped with RDMA-capable network adapters connected + to each other,
  • +
  • libibverbs and librdmacm libraries installed in both systems (or the rdma-core + package containing both of them).
  • +
+ +

+Alternatively, for development purposes, you can use a software-emulated RDMA network +interface (SoftRoCE) that can be set up using the following +script. + +

+All used network interfaces should be configured, up and running with an IP address +assigned. + +

+First, you will check the basic Ethernet network connectivity using the ping command +(see the ping(8) +manual for details). + +{{edit "run_test_ping.sh"}} + +{{run "./run_test_ping.sh"}} + +

+Next, you will check the RDMA connection with the RDMA ping-pong test using +the rping command (provided by the librdmacm-utils package). + +

+The rping command establishes a reliable RDMA connection between two nodes using +librdmacm and optionally performs RDMA transfers between the nodes, then disconnects. +When rping works, you can be sure that the RDMA connection is correctly configured +and works well (see the +rping(1) +manual for details). + +{{edit "run_test_rping.sh"}} + +{{run "./run_test_rping.sh"}} + +{{step "Establishing a connection"}} + +

+The Remote Persistent Memory Access library (librpma) is a C library to simplify accessing +PMem on remote systems over RDMA. For more information see +pmem.io. + +

+ + +

+Now you will implement both sides of an RDMA connection: +

+ +

+These applications intentionally do only connection and disconnection to familiarize you +with these processes. Building on this you will do more in the next step. + +{{edit "simple_client.c" "simple_server.c" "common.h" "common.c" "CMakeLists.txt" "build_simple.sh"}} + +{{build "./build_simple.sh"}} + +{{edit "run_simple.sh"}} + +{{run "./run_simple.sh"}} + +{{step "Remote Persistent Memory access"}} + +

+Now you will read and write back RPMem's content in the persistent way using +the following librpma functions: +

    +
  • rpma_read() - initiates transferring (reading) data from the remote + memory to the local memory (see the + rpma_read(3) + manual for details),
  • +
  • rpma_write() - initiates transferring (writing) data from the local + memory to the remote memory (see the + rpma_write(3) + manual for details),
  • +
  • rpma_flush() - initiates finalizing a transfer of data to the remote + memory, it flushes data down to the persistent domain (see the + rpma_flush(3) + manual for details).
  • +
+ +

+In this example, after having established a connection (see the previous step for +details), the client reads (using rpma_read()) an initial content +of the server's memory (the string: "The initial content of the server memory"), +then it writes (using rpma_write()) the new message ("Hello world!") +to the server's memory and finally it flushes the just written data down +to the persistent domain. + +

+Looking more deeply into the code it can be re-described in the following way: +

    +
  • The server prepares local persistent memory and exposes the memory description + along with other parameters required to perform RDMA read, write and flush + operations. After the connection is established, the server waits for the client + to disconnect.
  • +
  • The client allocates memory from DRAM and registers it as a reading destination + and writing source. After the connection is established the client receives + the server's memory regions registered as a reading source and a writing + destination. The client performs the RDMA read from the remote memory region + to the local memory region, then it writes new data to the local memory region + and performs the RDMA write from the local memory region to the remote memory + region followed by the RPMA flush.
  • +
+ +

+Note #1: The server requires a unique "user-id" argument in order to use +a different part of persistent memory that is shared by all the server instances +running on the same PMem device. The "user-id" argument is also used to pick +a unique TCP port. + +

+Note #2: For the sake of this example, the memory region being written to and +the server's peer configuration are transferred via the connection's private +data. In general, they can be transferred via an out-of-band or an in-band +channel. + +

+Note #3: This example uses exactly the same "common.h" and "common.c" +files copied from the previous step. + +{{edit "client.c" "server.c" "common.h" "build_main.sh"}} + +{{build "./build_main.sh"}} + +{{edit "run_main.sh"}} + +{{run "./run_main.sh"}} + +{{summary}} + +

+During this session, you have: +

    +
  • checked the basic network connectivity,
  • +
  • checked whether the network is RDMA-capable,
  • +
  • established a connection using the librpma library,
  • +
  • read and written back the RPMem's content in a persistent way.
  • +
+ +

+Takeaways: +

    +
  • Remote Persistent Memory (RPMem) is a way of accessing Persistent Memory when + it is installed in a remote system,
  • +
  • RPMem leverages RDMA capabilities to achieve e.g.: +
      +
    • zero-copy read of the contents of RPMem,
    • +
    • single-sided persistency of the PMem's contents in a remote system,
    • +
    +
  • +
  • RPMem is a way to seamlessly incorporate PMem into modern cloud architectures,
  • +
  • the librpma library is a ready-to-use simple API implementing the RPMem principles.
  • +
+ +

+If you want to learn more: +

    +
  • https://github.com/pmem/rpma + - where librpma is forged +
      +
    • We especially recommend getting familiar with + examples + that allow you to learn other supported use-cases.
    • +
    +
  • +
  • https://pmem.io/rpma/ + - where you can find the librpma library manuals, configuration guidelines + and performance reports.
  • +
  • It may also be useful to read our whitepaper: + Persistent Memory Replication Over Traditional RDMA. +
      +
    • Note it describes RPMem in the context of the already deprecated librpmem library + (not to be confused with librpma) which is a completely different story.
    • +
    +
  • +
+ +

+Thank you very much for your attention! + +

+Yours sincerely
+RPMem PMDK Squad + +{{bottom}} diff --git a/templates/examples/R/description.tmpl b/templates/examples/R/description.tmpl new file mode 100644 index 0000000..2a2cbcc --- /dev/null +++ b/templates/examples/R/description.tmpl @@ -0,0 +1,18 @@ +{{template "tocEntryStart" .}} +{{template "tocShortText" .}} +Remote Persistent Memory +{{template "tocLongText" .}} +{{template "tocRecommended" .}} +You already know how to access local Persistent Memory, +but Persistent Memory can be also accessed via network +based on Remote Direct Memory Access technology (RDMA). +Moreover, you can store data in remote Persistent Memory +(a Persistent Memory which is attached to another physical system) +and make this store persistent. +

+From this example you will learn how to establish an RDMA connection, +prepare your local Persistent Memory for remote manipulation, +and how to modify remote Persistent Memory using the librpma library. +

+Language: C +{{template "tocEntryEnd" .}}