From 6332a0e1efa77d85e8609202d06e1ad8048e4a71 Mon Sep 17 00:00:00 2001
From: Joshua Hursey
Date: Tue, 22 Feb 2022 15:26:42 -0500
Subject: [PATCH] Big Count Collectives Test Suite

* Run collectives with a `count` parameter as close to `INT_MAX` as possible.
  - This test suite often highlights cases where the underlying algorithm
    assumes that the payload (roughly `count x sizeof(datatype)`) is an
    `int` when it should be handled as a `size_t`.
* Includes:
  - Tests with the `int` and `double _Complex` primitive data types
  - Correctness checks
  - A mechanism to control (as best we can) the amount of memory
    consumed per node.
* Assumes:
  - Roughly the same amount of memory per node
  - The same number of processes per node
* See `README.md` for details

Signed-off-by: Joshua Hursey
---
 .gitignore                             |  12 +
 collective-big-count/Makefile          | 109 ++++++
 collective-big-count/README.md         | 115 ++++++
 collective-big-count/common.h          | 470 +++++++++++++++++++++++++
 collective-big-count/diagnostic.c      |  17 +
 collective-big-count/test_allgather.c  | 216 ++++++++++++
 collective-big-count/test_allgatherv.c | 388 ++++++++++++++++++++
 collective-big-count/test_allreduce.c  | 181 ++++++++++
 collective-big-count/test_alltoall.c   | 188 ++++++++++
 collective-big-count/test_bcast.c      | 154 ++++++++
 collective-big-count/test_gather.c     | 197 +++++++++++
 collective-big-count/test_gatherv.c    | 378 ++++++++++++++++++++
 collective-big-count/test_reduce.c     | 187 ++++++++++
 collective-big-count/test_scatter.c    | 200 +++++++++++
 collective-big-count/test_scatterv.c   | 396 +++++++++++++++++++++
 15 files changed, 3208 insertions(+)
 create mode 100644 collective-big-count/Makefile
 create mode 100644 collective-big-count/README.md
 create mode 100644 collective-big-count/common.h
 create mode 100644 collective-big-count/diagnostic.c
 create mode 100644 collective-big-count/test_allgather.c
 create mode 100644 collective-big-count/test_allgatherv.c
 create mode 100644 collective-big-count/test_allreduce.c
 create mode 100644 collective-big-count/test_alltoall.c
 create mode 100644 collective-big-count/test_bcast.c
 create mode 100644 collective-big-count/test_gather.c
 create mode 100644 collective-big-count/test_gatherv.c
 create mode 100644 collective-big-count/test_reduce.c
 create mode 100644 collective-big-count/test_scatter.c
 create mode 100644 collective-big-count/test_scatterv.c

diff --git a/.gitignore b/.gitignore
index 192e505..6c958f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,15 @@
 sessions/sessions_test7
 sessions/sessions_test8
 sessions/sessions_test9
+collective-big-count/diagnostic
+collective-big-count/test_allgather
+collective-big-count/test_allgatherv
+collective-big-count/test_allreduce
+collective-big-count/test_alltoall
+collective-big-count/test_bcast
+collective-big-count/test_gather
+collective-big-count/test_gatherv
+collective-big-count/test_reduce
+collective-big-count/test_scatter
+collective-big-count/test_scatterv
+collective-big-count/*_uniform_count

diff --git a/collective-big-count/Makefile b/collective-big-count/Makefile
new file mode 100644
index 0000000..761eb58
--- /dev/null
+++ b/collective-big-count/Makefile
@@ -0,0 +1,109 @@
+#
+# Copyright (c) 2021-2022 IBM Corporation. All rights reserved.
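+#
+# Note: TEST_UNIFORM_COUNT and TEST_PAYLOAD_SIZE below are ordinary make
+# variables, so a build can override them from the command line without
+# editing this file. For example (illustrative values, not part of the
+# original patch):
+#   make TEST_UNIFORM_COUNT=1073741823 all
+#   make TEST_PAYLOAD_SIZE=1073741824 all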
+# +# $COPYRIGHT$ +# + +###################################################################### +# Utilities +###################################################################### +.PHONY: default help + +CC = mpicc +F77 = mpif77 +F90 = mpif90 +MPIRUN = mpirun +RM = /bin/rm -f + +# GCC +CC_FLAGS = -g -O0 -Wall -Werror +# Clang +#CC_FLAGS = -g -O0 -Wall -Wshorten-64-to-32 -Werror +F90_FLAGS = +F77_FLAGS = $(F90_FLAGS) + +###################################################################### +# TEST_UNIFORM_COUNT: (Default defined here) +# The 'count' size to be used regardless of the datatype +# This should never exceed that of INT_MAX (2147483647) which +# is the maximum count allowed by the MPI Interface in MPI 3 +###################################################################### +# Test at the limit of INT_MAX : 2147483647 +TEST_UNIFORM_COUNT=2147483647 + +###################################################################### +# TEST_PAYLOAD_SIZE: (Default in common.h) +# This value is the total payload size the collective should perform. +# The 'count' is calculated as relative to the datatype size so +# as to target this payload size as closely as possible: +# count = TEST_PAYLOAD_SIZE / sizeof(datatype) +###################################################################### +# INT_MAX : == 2 GB so guard will not trip (INT_MAX == 2GB -1byte) +TEST_PAYLOAD_SIZE=2147483647 + +###################################################################### +# Binaries +###################################################################### +BINCC = \ + test_alltoall \ + test_allgather test_allgatherv \ + test_allreduce \ + test_bcast \ + test_gather test_gatherv \ + test_reduce \ + test_scatter test_scatterv \ + diagnostic + +BIN = $(BINCC) + +###################################################################### +# Targets +###################################################################### +all: $(BIN) + +clean: + $(RM) $(BIN) *.o *_uniform_count *_uniform_payload + +diagnostic: common.h diagnostic.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. diagnostic.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. diagnostic.c + +test_allgather: common.h test_allgather.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_allgather.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_allgather.c + +test_allgatherv: common.h test_allgatherv.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_allgatherv.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_allgatherv.c + +test_allreduce: common.h test_allreduce.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_allreduce.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_allreduce.c + +test_alltoall: common.h test_alltoall.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_alltoall.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_alltoall.c + +test_bcast: common.h test_bcast.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_bcast.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_bcast.c + +test_gather: common.h test_gather.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. 
test_gather.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_gather.c + +test_gatherv: common.h test_gatherv.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_gatherv.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_gatherv.c + +test_reduce: common.h test_reduce.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_reduce.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_reduce.c + +test_scatter: common.h test_scatter.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_scatter.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_scatter.c + +test_scatterv: common.h test_scatterv.c + $(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_scatterv.c + $(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_scatterv.c diff --git a/collective-big-count/README.md b/collective-big-count/README.md new file mode 100644 index 0000000..ca71709 --- /dev/null +++ b/collective-big-count/README.md @@ -0,0 +1,115 @@ +# Big Count Collectives Tests + +This test suite is for testing with large **count** payload operations. Large payload is defined as: + + > total payload size (count x sizeof(datatype)) is greater than UINT_MAX (4294967295 =~ 4 GB) + +| Test Suite | Count | Datatype | +| ---------- | ----- | -------- | +| N/A | small | small | +| BigCount | **LARGE** | small | +| [BigMPI](https://github.com/jeffhammond/BigMPI) | small | **LARGE** | + + * Assumes: + - Roughly the same amount of memory per node + - Same number of processes per node + +## Building + +``` +make clean +make all +``` + +## Running + +For each unit test two different binaries are generated: + * `test_FOO` : Run with a total payload size as close to `INT_MAX` as possible relative to the target datatype. + * `test_FOO_uniform_count` : Run with a uniform count regardless of the datatype. Default `count = 2147483647 (INT_MAX)` + +Currently, the unit tests use the `int` and `double _Complex` datatypes in the MPI collectives. + +``` +mpirun --np 8 --map-by ppr:2:node --host host01:2,host02:2,host03:2,host04:2 \ + -mca coll basic,inter,libnbc,self ./test_allreduce + +mpirun --np 8 --map-by ppr:2:node --host host01:2,host02:2,host03:2,host04:2 \ + -x BIGCOUNT_MEMORY_PERCENT=15 -x BIGCOUNT_MEMORY_DIFF=10 \ + --mca coll basic,inter,libnbc,self ./test_allreduce + +mpirun --np 8 --map-by ppr:2:node --host host01:2,host02:2,host03:2,host04:2 \ + -x BIGCOUNT_MEMORY_PERCENT=15 -x BIGCOUNT_MEMORY_DIFF=10 \ + --mca coll basic,inter,libnbc,self ./test_allreduce_uniform_count +``` + +Expected output will look something like the following. Notice that depending on the `BIGCOUNT_MEMORY_PERCENT` environment variable you might see the collective `Adjust count to fit in memory` message as the test harness is trying to honor that parameter. +``` +shell$ mpirun --np 4 --map-by ppr:1:node --host host01,host02,host03,host04 \ + -x BIGCOUNT_MEMORY_PERCENT=6 -x BIGCOUNT_MEMORY_DIFF=10 \ + --mca coll basic,inter,libnbc,self ./test_allreduce_uniform_count +----------------------:----------------------------------------- +Total Memory Avail. : 567 GB +Percent memory to use : 6 % +Tolerate diff. 
:   10 GB
Max memory to use     :   34 GB
----------------------:-----------------------------------------
INT_MAX               :           2147483647
UINT_MAX              :           4294967295
SIZE_MAX              : 18446744073709551615
----------------------:-----------------------------------------
                      : Count x Datatype size = Total Bytes
TEST_UNIFORM_COUNT    :           2147483647
V_SIZE_DOUBLE_COMPLEX :           2147483647 x 16 = 32.0 GB
V_SIZE_DOUBLE         :           2147483647 x  8 = 16.0 GB
V_SIZE_FLOAT_COMPLEX  :           2147483647 x  8 = 16.0 GB
V_SIZE_FLOAT          :           2147483647 x  4 =  8.0 GB
V_SIZE_INT            :           2147483647 x  4 =  8.0 GB
----------------------:-----------------------------------------
---------------------
Results from MPI_Allreduce(int x 2147483647 = 8589934588 or 8.0 GB):
Rank  3: PASSED
Rank  2: PASSED
Rank  1: PASSED
Rank  0: PASSED
--------------------- Adjust count to fit in memory: 2147483647 x  50.0% = 1073741823
Root  : payload    34359738336  32.0 GB =  16 dt x 1073741823 count x   2 peers x   1.0 inflation
Peer  : payload    34359738336  32.0 GB =  16 dt x 1073741823 count x   2 peers x   1.0 inflation
Total : payload    34359738336  32.0 GB = 32.0 GB root + 32.0 GB x   0 local peers
---------------------
Results from MPI_Allreduce(double _Complex x 1073741823 = 17179869168 or 16.0 GB):
Rank  3: PASSED
Rank  2: PASSED
Rank  0: PASSED
Rank  1: PASSED
---------------------
Results from MPI_Iallreduce(int x 2147483647 = 8589934588 or 8.0 GB):
Rank  2: PASSED
Rank  0: PASSED
Rank  3: PASSED
Rank  1: PASSED
--------------------- Adjust count to fit in memory: 2147483647 x  50.0% = 1073741823
Root  : payload    34359738336  32.0 GB =  16 dt x 1073741823 count x   2 peers x   1.0 inflation
Peer  : payload    34359738336  32.0 GB =  16 dt x 1073741823 count x   2 peers x   1.0 inflation
Total : payload    34359738336  32.0 GB = 32.0 GB root + 32.0 GB x   0 local peers
---------------------
Results from MPI_Iallreduce(double _Complex x 1073741823 = 17179869168 or 16.0 GB):
Rank  2: PASSED
Rank  0: PASSED
Rank  3: PASSED
Rank  1: PASSED
```

## Environment variables

 * `BIGCOUNT_MEMORY_DIFF` (Default: `0`): Maximum difference (as an integer, in GB) in total available memory between processes.
 * `BIGCOUNT_MEMORY_PERCENT` (Default: `80`): Maximum percentage (as an integer) of memory to consume.
 * `BIGCOUNT_ENABLE_NONBLOCKING` (Default: `1`): Enable/disable the nonblocking collective tests. `y`/`Y`/`1` enables; any other value disables.
 * `BIGCOUNT_ALG_INFLATION` (Default: `1.0`): Memory overhead multiplier for a given algorithm. Some algorithms use internal buffers sized relative to the payload and/or communicator size. This envar allows you to account for that overhead to help avoid Out-Of-Memory (OOM) scenarios.

## Missing Collectives (to-do list)

Collectives missing from this test suite:
 * Barrier (N/A)
 * Alltoallv
 * Reduce_scatter
 * Scan / Exscan

diff --git a/collective-big-count/common.h b/collective-big-count/common.h
new file mode 100644
index 0000000..b4df7f3
--- /dev/null
+++ b/collective-big-count/common.h
@@ -0,0 +1,470 @@
/*
 * Copyright (c) 2021-2022 IBM Corporation. All rights reserved.
 *
 * $COPYRIGHT$
 */

/*
 * Default: Testing (TEST_PAYLOAD_SIZE / sizeof(type)) + 1
 *
 * Adjust TEST_PAYLOAD_SIZE from the default value:
 *   -DTEST_PAYLOAD_SIZE=#
 * The TEST_PAYLOAD_SIZE value is used as the numerator for each datatype.
 * The count used for that datatype is calculated as:
 *   (1 + (TEST_PAYLOAD_SIZE / sizeof(datatype)))
 * Using this variable one can test any code that guards against
 * a payload size that is too large.
 *
 * Adjust an individual size:
 *   -DV_SIZE_DOUBLE_COMPLEX=123
 *
 * Set the same count for all types:
 *   -DTEST_UNIFORM_COUNT=123
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <complex.h>
#include <assert.h>
#include <errno.h>
#include <unistd.h>

/*
 * Debugging messages
 *   0 = off
 *   1 = show displacements at root
 *   2 = show all checks
 */
int debug = 0;

/*
 * Valid after MPI_Init
 */
#ifndef MPI_MAX_PROCESSOR_NAME
#define MPI_MAX_PROCESSOR_NAME 255
#endif
int world_size = 0, world_rank = 0, local_size = 0;
char my_hostname[MPI_MAX_PROCESSOR_NAME];

/*
 * Limit how much total memory a collective can take on the system
 * across all processes.
 */
int max_sys_mem_gb = 0;

/*
 * Total memory on the system (for reference)
 */
int total_sys_mem_gb = 0;

/*
 * Tolerate this number of GB difference between systems
 */
int mem_diff_tolerance = 0;

/*
 * Percent of memory to allocate
 */
int mem_percent = 0;

/*
 * Allow the nonblocking tests to run
 */
bool allow_nonblocked = true;

/*
 * Algorithm expected inflation multiplier
 */
double alg_inflation = 1.0;

/*
 * 'v' collectives have two modes
 *   Packed: contiguous packing of data
 *   Skip  : a 'disp_stride' is created between every rank's contribution
 */
enum {
    MODE_PACKED = 1,
    MODE_SKIP   = 2
};

/*
 * Displacement between elements when not PACKED
 */
int disp_stride = 2;

/*
 * Define count parameters to use in the tests
 */
// Default: UINT_MAX 4294967295
//          INT_MAX  2147483647
#ifndef TEST_PAYLOAD_SIZE
#define TEST_PAYLOAD_SIZE UINT_MAX
#endif

#ifndef TEST_UNIFORM_COUNT
#ifndef V_SIZE_DOUBLE_COMPLEX
// double _Complex = 16 bytes x 268435455.9375
#define V_SIZE_DOUBLE_COMPLEX (int)(TEST_PAYLOAD_SIZE / sizeof(double _Complex))
#endif

#ifndef V_SIZE_DOUBLE
// double = 8 bytes x 536870911.875
#define V_SIZE_DOUBLE (int)(TEST_PAYLOAD_SIZE / sizeof(double))
#endif

#ifndef V_SIZE_FLOAT_COMPLEX
// float _Complex = 8 bytes x 536870911.875
#define V_SIZE_FLOAT_COMPLEX (int)(TEST_PAYLOAD_SIZE / sizeof(float _Complex))
#endif

#ifndef V_SIZE_FLOAT
// float = 4 bytes x 1073741823.75
#define V_SIZE_FLOAT (int)(TEST_PAYLOAD_SIZE / sizeof(float))
#endif

#ifndef V_SIZE_INT
// int = 4 bytes x 1073741823.75
#define V_SIZE_INT (int)(TEST_PAYLOAD_SIZE / sizeof(int))
#endif

#else
#define V_SIZE_DOUBLE_COMPLEX TEST_UNIFORM_COUNT
#define V_SIZE_DOUBLE         TEST_UNIFORM_COUNT
#define V_SIZE_FLOAT_COMPLEX  TEST_UNIFORM_COUNT
#define V_SIZE_FLOAT          TEST_UNIFORM_COUNT
#define V_SIZE_INT            TEST_UNIFORM_COUNT
#define V_SIZE_CHAR           TEST_UNIFORM_COUNT
#endif

/*
 * Wrapper around 'malloc' that errors out if we cannot allocate the buffer.
 *
 * @param sz size of the buffer
 * @return pointer to the memory. Does not return on error.
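 * Note: 'sz' is a size_t, so callers may request buffers larger than
 * INT_MAX bytes (as the big-count tests routinely do); on failure the
 * error path aborts the job rather than returning NULL to the caller.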
+ */ +static inline void * safe_malloc(size_t sz) +{ + void * ptr = NULL; + ptr = malloc(sz); + if( NULL == ptr ) { + fprintf(stderr, "Rank %d on %s) Error: Failed to malloc(%zu)\n", world_rank, my_hostname, sz); +#ifdef MPI_VERSION + MPI_Abort(MPI_COMM_WORLD, 3); +#else + exit(ENOMEM); +#endif + } + return ptr; +} + +/* + * Convert a value in whole bytes to the abbreviated form + * + * @param value size in whole bytes + * @return static string representation of the whole bytes in the nearest abbreviated form + */ +static inline const char * human_bytes(size_t value) +{ + static char *suffix[] = {"B", "KB", "MB", "GB", "TB"}; + static int s_len = 5; + static char h_out[30]; + int s_idx = 0; + double d_value = value; + + if( value > 1024 ) { + for( s_idx = 0; s_idx < s_len && d_value > 1024; ++s_idx ) { + d_value = d_value / 1024.0; + } + } + + snprintf(h_out, 30, "%2.1f %s", d_value, suffix[s_idx]); + return h_out; +} + +/* + * Determine amount of memory to use, in GBytes as a percentage of total physical memory + * + * @return Amount of memory to use (in GB) + */ +static int get_max_memory(void) { + char *mem_percent_str; + char *endp; + FILE *meminfo_file; + char *proc_data; + char *token; + size_t bufsize; + int mem_to_use; + int rc; + + mem_percent_str = getenv("BIGCOUNT_MEMORY_PERCENT"); + if (NULL == mem_percent_str) { + mem_percent_str = "80"; + } + + mem_percent = strtol(mem_percent_str, &endp, 10); + if ('\0' != *endp) { + fprintf(stderr, "BIGCOUNT_MEMORY_PERCENT is not numeric\n"); + exit(1); + } + + meminfo_file = fopen("/proc/meminfo", "r"); + if (NULL == meminfo_file) { + fprintf(stderr, "Unable to open /proc/meminfo file: %s\n", strerror(errno)); + exit(1); + } + + bufsize = 0; + proc_data = NULL; + mem_to_use = 0; + rc = getline(&proc_data, &bufsize, meminfo_file); + while (rc > 0) { + token = strtok(proc_data, " "); + if (NULL != token) { + if (!strcmp(token, "MemTotal:")) { + token = strtok(NULL, " "); + total_sys_mem_gb = strtol(token, NULL, 10); + total_sys_mem_gb = (int)(total_sys_mem_gb / 1048576.0); + /* /proc/meminfo specifies memory in KBytes, convert to GBytes */ + mem_to_use = (int)(total_sys_mem_gb * (mem_percent / 100.0)); + break; + } + } + rc = getline(&proc_data, &bufsize, meminfo_file); + } + + if (0 == mem_to_use) { + fprintf(stderr, "Unable to determine memory to use\n"); + exit(1); + } + + free(proc_data); + fclose(meminfo_file); + return mem_to_use; +} + +/* + * Display a diagnostic table + */ +static inline void display_diagnostics(void) { + printf("----------------------:-----------------------------------------\n"); + printf("Total Memory Avail. : %4d GB\n", total_sys_mem_gb); + printf("Percent memory to use : %4d %%\n", mem_percent); + printf("Tolerate diff. 
: %4d GB\n", mem_diff_tolerance); + printf("Max memory to use : %4d GB\n", max_sys_mem_gb); + printf("----------------------:-----------------------------------------\n"); + printf("INT_MAX : %20zu\n", (size_t)INT_MAX); + printf("UINT_MAX : %20zu\n", (size_t)UINT_MAX); + printf("SIZE_MAX : %20zu\n", (size_t)SIZE_MAX); + printf("----------------------:-----------------------------------------\n"); + printf(" : Count x Datatype size = Total Bytes\n"); +#ifndef TEST_UNIFORM_COUNT + printf("TEST_PAYLOAD_SIZE : %20zu = %10s\n", (size_t)TEST_PAYLOAD_SIZE, human_bytes((size_t)TEST_PAYLOAD_SIZE)); +#else + printf("TEST_UNIFORM_COUNT : %20zu\n", (size_t)TEST_UNIFORM_COUNT); +#endif + printf("V_SIZE_DOUBLE_COMPLEX : %20zu x %3zu = %10s\n", (size_t)V_SIZE_DOUBLE_COMPLEX, sizeof(double _Complex), human_bytes(V_SIZE_DOUBLE_COMPLEX * sizeof(double _Complex))); + printf("V_SIZE_DOUBLE : %20zu x %3zu = %10s\n", (size_t)V_SIZE_DOUBLE, sizeof(double), human_bytes(V_SIZE_DOUBLE * sizeof(double))); + printf("V_SIZE_FLOAT_COMPLEX : %20zu x %3zu = %10s\n", (size_t)V_SIZE_FLOAT_COMPLEX, sizeof(float _Complex), human_bytes(V_SIZE_FLOAT_COMPLEX * sizeof(float _Complex))); + printf("V_SIZE_FLOAT : %20zu x %3zu = %10s\n", (size_t)V_SIZE_FLOAT, sizeof(float), human_bytes(V_SIZE_FLOAT * sizeof(float))); + printf("V_SIZE_INT : %20zu x %3zu = %10s\n", (size_t)V_SIZE_INT, sizeof(int), human_bytes(V_SIZE_INT * sizeof(int))); + printf("----------------------:-----------------------------------------\n"); +} + +/* + * Initialize the unit testing environment + * Note: Must be called after MPI_Init() + * + * @param argc Argument count + * @param argv Array of string arguments + * @return 0 on success + */ +int init_environment(int argc, char** argv) { + max_sys_mem_gb = get_max_memory(); + +#ifdef MPI_VERSION + int i; + int *per_local_sizes = NULL; + int *local_max_mem = NULL; + char *mem_diff_tolerance_str = NULL; + char *env_str = NULL; + int min_mem = 0; + + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + MPI_Get_processor_name(my_hostname, &i); + + if( NULL != getenv("OMPI_COMM_WORLD_LOCAL_SIZE") ) { + local_size = (int)strtol(getenv("OMPI_COMM_WORLD_LOCAL_SIZE"), NULL, 10); + } else { + local_size = world_size; + } + + if( NULL != (env_str = getenv("BIGCOUNT_ENABLE_NONBLOCKING")) ) { + if( 'y' == env_str[0] || 'Y' == env_str[0] || '1' == env_str[0] ) { + allow_nonblocked = true; + } else { + allow_nonblocked = false; + } + } + + if( NULL != (env_str = getenv("BIGCOUNT_ALG_INFLATION")) ) { + alg_inflation = strtod(env_str, NULL); + } + + // Make sure that the local size is uniform + if( 0 == world_rank ) { + per_local_sizes = (int*)safe_malloc(sizeof(int) * world_size); + } + + MPI_Gather(&local_size, 1, MPI_INT, per_local_sizes, 1, MPI_INT, 0, MPI_COMM_WORLD); + if( 0 == world_rank ) { + for(i = 0; i < world_size; ++i) { + if( local_size != per_local_sizes[i] ) { + printf("Error: Non-uniform local size at peer %d : actual %d vs expected %d\n", + i, per_local_sizes[i], local_size); + assert(local_size == per_local_sizes[i]); + } + } + free(per_local_sizes); + } + // Make sure max memory usage is the same for all tasks + if( 0 == world_rank ) { + local_max_mem = (int*)safe_malloc(sizeof(int) * world_size); + } + + mem_diff_tolerance_str = getenv("BIGCOUNT_MEMORY_DIFF"); + if (NULL != mem_diff_tolerance_str) { + mem_diff_tolerance = strtol(mem_diff_tolerance_str, NULL, 10); + } + + MPI_Gather(&max_sys_mem_gb, 1, MPI_INT, local_max_mem, 1, MPI_INT, 0, MPI_COMM_WORLD); + if( 0 == 
world_rank ) {
        min_mem = max_sys_mem_gb;
        for(i = 0; i < world_size; ++i) {
            if( max_sys_mem_gb != local_max_mem[i] ) {
                if( (max_sys_mem_gb < local_max_mem[i] && max_sys_mem_gb + mem_diff_tolerance < local_max_mem[i]) ||
                    (max_sys_mem_gb > local_max_mem[i] && max_sys_mem_gb > mem_diff_tolerance + local_max_mem[i]) ) {
                    printf("Error: Non-uniform max memory usage at peer %d : actual %d vs expected %d (+/- %d)\n",
                           i, local_max_mem[i], max_sys_mem_gb, mem_diff_tolerance);
                    assert(max_sys_mem_gb == local_max_mem[i]);
                }
                if( min_mem > local_max_mem[i] ) {
                    min_mem = local_max_mem[i];
                }
            }
        }
        free(local_max_mem);
        if( min_mem != max_sys_mem_gb ) {
            printf("Warning: Detected difference between local and remote available memory. Adjusting to: %d GB\n",
                   min_mem);
            max_sys_mem_gb = min_mem;
        }
    }

    // Agree on the max memory usage value to use across all processes
    MPI_Bcast(&max_sys_mem_gb, 1, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
#else
    world_size = world_rank = local_size = -1;
    snprintf(my_hostname, MPI_MAX_PROCESSOR_NAME, "localhost");
#endif

    if( 0 == world_rank || -1 == world_rank ) {
        display_diagnostics();
    }

    return 0;
}

/*
 * Calculate the uniform count for this collective given the datatype size,
 * number of processes (local and global), expected inflation in memory during
 * the collective, and the amount of memory we limit this test to consume
 * on the system.
 *
 * @param datatype_size size of the datatype
 * @param proposed_count the count that the caller wishes to use
 * @param mult_root memory multiplier at the root (useful in gather-like operations where the root gathers N times the count)
 * @param mult_peer memory multiplier at non-roots (useful in allgather-like operations where the buffer is N times the count)
 * @param alg_inflation inflation expected due to the algorithm we are expecting to call
 * @return proposed count to use in the collective
 */
size_t calc_uniform_count(size_t datatype_size, size_t proposed_count,
                          size_t mult_root, size_t mult_peer, double alg_inflation)
{
    size_t orig_proposed_count = proposed_count;
    size_t orig_mult_root = mult_root;
    size_t orig_mult_peer = mult_peer;
    size_t payload_size_root;
    size_t payload_size_peer;
    size_t payload_size_all;
    double perc = 1.0;
    char *cpy_root = NULL, *cpy_peer = NULL;

    mult_root = (size_t)(mult_root * alg_inflation);
    mult_peer = (size_t)(mult_peer * alg_inflation);

    payload_size_root = datatype_size * proposed_count * mult_root;
    payload_size_peer = datatype_size * proposed_count * mult_peer;
    payload_size_all  = payload_size_root + (payload_size_peer * (local_size-1));

    while( (payload_size_all / ((size_t)1024 * 1024 * 1024)) > max_sys_mem_gb ) {
        if( 2 == debug && 0 == world_rank ) {
            fprintf(stderr, "----DEBUG---- Adjusting count. Try count of %10zu (payload_size %4zu GB) to fit in %4d GB limit (perc %6.2f)\n",
                    proposed_count, payload_size_all/((size_t)1024 * 1024 * 1024), max_sys_mem_gb, perc);
        }

        perc -= 0.05;
        // It is possible that we are working with extremely limited memory
        // and the percentage dropped to (or below) 0. In that case derive
        // the count directly from the max_sys_mem_gb limit.
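        // Illustrative numbers (not from the original patch): with a 64 GB
        // cap, a 16-byte datatype, mult_root = mult_peer = 2, and
        // local_size = 4, the fallback count below becomes
        //   (64 << 30) / (16*2 + 16*2*3) = 536870912 elements.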
+ if( perc <= 0.0 ) { + proposed_count = (max_sys_mem_gb * (size_t)1024 * 1024 * 1024) / (datatype_size * mult_root + datatype_size * mult_peer * (local_size-1)); + perc = proposed_count / (double)orig_proposed_count; + if( 2 == debug && 0 == world_rank ) { + fprintf(stderr, "----DEBUG---- Adjusting count. Try count of %10zu (from %10zu) to fit in %4d GB limit (perc %6.9f) -- FINAL\n", + proposed_count, orig_proposed_count, max_sys_mem_gb, perc); + } + } else { + proposed_count = orig_proposed_count * perc; + } + assert(perc > 0.0); + + payload_size_root = datatype_size * proposed_count * mult_root; + payload_size_peer = datatype_size * proposed_count * mult_peer; + payload_size_all = payload_size_root + (payload_size_peer * (local_size-1)); + } + + if(proposed_count != orig_proposed_count ) { + if( 0 == world_rank ) { + + printf("--------------------- Adjust count to fit in memory: %10zu x %5.1f%% = %10zu\n", + orig_proposed_count, + (proposed_count / (double)orig_proposed_count)*100, + proposed_count); + + cpy_root = strdup(human_bytes(payload_size_root)); + printf("Root : payload %14zu %8s = %3zu dt x %10zu count x %3zu peers x %5.1f inflation\n", + payload_size_root, cpy_root, + datatype_size, proposed_count, orig_mult_root, alg_inflation); + + cpy_peer = strdup(human_bytes(payload_size_peer)); + printf("Peer : payload %14zu %8s = %3zu dt x %10zu count x %3zu peers x %5.1f inflation\n", + payload_size_peer, cpy_peer, + datatype_size, proposed_count, orig_mult_peer, alg_inflation); + + printf("Total : payload %14zu %8s = %8s root + %8s x %3d local peers\n", + payload_size_all, human_bytes(payload_size_all), + cpy_root, cpy_peer, local_size-1); + + free(cpy_root); + free(cpy_peer); + } + } + + return proposed_count; +} diff --git a/collective-big-count/diagnostic.c b/collective-big-count/diagnostic.c new file mode 100644 index 0000000..e69ba4a --- /dev/null +++ b/collective-big-count/diagnostic.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. + * + * $COPYRIGHT$ + */ +#include +#include +#include +#include +#include + +#include "common.h" + +int main(int argc, char** argv) { + init_environment(argc, argv); + return 0; +} diff --git a/collective-big-count/test_allgather.c b/collective-big-count/test_allgather.c new file mode 100644 index 0000000..49348f6 --- /dev/null +++ b/collective-big-count/test_allgather.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. 
+ * + * $COPYRIGHT$ + */ +#include +#include +#include + +#include +#include "common.h" + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool in_place, + bool blocking); + +int main(int argc, char** argv) { + /* + * Initialize the MPI environment + */ + int ret = 0; + + MPI_Init(NULL, NULL); + init_environment(argc, argv); + + // Run the tests +#ifndef TEST_UNIFORM_COUNT + // Each rank contribues: V_SIZE_INT / world_size elements + // Largest buffer is : V_SIZE_INT elements + ret += my_c_test_core(MPI_INT, V_SIZE_INT, false, true); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false, true); + if (allow_nonblocked) { + ret += my_c_test_core(MPI_INT, V_SIZE_INT, false, false); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false, false); + } +#else + size_t proposed_count; + + // Each rank contribues: TEST_UNIFORM_COUNT elements + // Largest buffer is : TEST_UNIFORM_COUNT x world_size + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, true, true); + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, true, true); + if (allow_nonblocked) { + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, true, false); + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, true, + false); + } +#endif + + /* + * All done + */ + MPI_Finalize(); + return ret; +} + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool in_place, bool blocking) +{ + int ret = 0; + size_t i; + + // Actual payload size as divisible by the sizeof(dt) + size_t payload_size_actual; + + /* + * Initialize vector + */ + int *my_int_send_vector = NULL; + int *my_int_recv_vector = NULL; + double _Complex *my_dc_send_vector = NULL; + double _Complex *my_dc_recv_vector = NULL; + size_t recv_count = 0; + size_t send_count = 0; + int exp; + size_t num_wrong = 0; + MPI_Request request; + char *mpi_function = blocking ? "MPI_Allgather" : "MPI_Iallgather"; + + assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype); + + send_count = recv_count = total_num_elements / (size_t)world_size; + // total_num_elements must be a multiple of world_size. 
Drop any remainder + total_num_elements = send_count * (size_t)world_size; + + if( MPI_INT == dtype ) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_recv_vector = (int*)safe_malloc(payload_size_actual); + if( !in_place ) { + my_int_send_vector = (int*)safe_malloc(send_count * sizeof(int)); + } + } else { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual); + if( !in_place ) { + my_dc_send_vector = (double _Complex*)safe_malloc(send_count * sizeof(double _Complex)); + } + } + + if( in_place ) { + for(i = 0; i < total_num_elements; ++i) { + exp = 1 + world_rank; + if( MPI_INT == dtype ) { + if( (i / (size_t)send_count) == world_rank ) { + my_int_recv_vector[i] = exp; + } else { + my_int_recv_vector[i] = -1; + } + } else { + if( (i / (size_t)send_count) == world_rank ) { + my_dc_recv_vector[i] = 1.0*exp - 1.0*exp*I; + } else { + my_dc_recv_vector[i] = 1.0 + 1.0*I; + } + } + } + } else { + for(i = 0; i < send_count; ++i) { + exp = 1 + world_rank; + if( MPI_INT == dtype ) { + my_int_send_vector[i] = exp; + } else { + my_dc_send_vector[i] = 1.0*exp - 1.0*exp*I; + } + } + for(i = 0; i < total_num_elements; ++i) { + if( MPI_INT == dtype ) { + my_int_recv_vector[i] = -1; + } else { + my_dc_recv_vector[i] = 1.0 + 1.0*I; + } + } + } + + if (world_rank == 0) { + printf("---------------------\nResults from %s(%s x %zu = %zu or %s): %s\n", + mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual), + ((in_place) ? " MPI_IN_PLACE" : "")); + } + if(!in_place) { + assert(send_count <= INT_MAX); + } + assert(recv_count <= INT_MAX); + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Allgather(in_place ? MPI_IN_PLACE : my_int_send_vector, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD); + } else { + MPI_Allgather(in_place ? MPI_IN_PLACE : my_dc_send_vector, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Iallgather(in_place ? MPI_IN_PLACE : my_int_send_vector, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD, &request); + } else { + MPI_Iallgather(in_place ? MPI_IN_PLACE : my_dc_send_vector, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. 
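     * After the allgather, slot i of the receive buffer should hold the
     * contribution of rank (i / recv_count), which this test defines as
     * rank+1 for int and (rank+1) - (rank+1)i for double _Complex.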
+ */ + exp = 0; + for(i = 0; i < total_num_elements; ++i) { + exp = (int)(1 + (i / (size_t)recv_count)); + if( MPI_INT == dtype ) { + if(my_int_recv_vector[i] != exp) { + ++num_wrong; + } + } else { + if(my_dc_recv_vector[i] != 1.0*exp - 1.0*exp*I) { + ++num_wrong; + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0); + ret = 1; + } + + if( NULL != my_int_send_vector ) { + free(my_int_send_vector); + } + if( NULL != my_int_recv_vector ){ + free(my_int_recv_vector); + } + if( NULL != my_dc_send_vector ) { + free(my_dc_send_vector); + } + if( NULL != my_dc_recv_vector ){ + free(my_dc_recv_vector); + } + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_allgatherv.c b/collective-big-count/test_allgatherv.c new file mode 100644 index 0000000..fd65ed9 --- /dev/null +++ b/collective-big-count/test_allgatherv.c @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. + * + * $COPYRIGHT$ + */ +#include +#include +#include + +#include +#include "common.h" + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, int mode, bool in_place, bool blocking); + +int main(int argc, char** argv) { + /* + * Initialize the MPI environment + */ + int ret = 0; + + MPI_Init(NULL, NULL); + init_environment(argc, argv); + + // Run the tests +#ifndef TEST_UNIFORM_COUNT + // Each rank contribues: V_SIZE_INT / world_size elements + // Largest buffer is : V_SIZE_INT elements + ret += my_c_test_core(MPI_INT, V_SIZE_INT, MODE_PACKED, true, true); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_INT, + (V_SIZE_INT - disp_stride*world_size), + MODE_SKIP, true, true); + + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, MODE_PACKED, true, true); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, + (V_SIZE_DOUBLE_COMPLEX - disp_stride*world_size), + MODE_SKIP, true, true); + if (allow_nonblocked) { + ret += my_c_test_core(MPI_INT, V_SIZE_INT, MODE_PACKED, true, false); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_INT, + (V_SIZE_INT - disp_stride*world_size), + MODE_SKIP, true, false); + + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, MODE_PACKED, true, false); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, + (V_SIZE_DOUBLE_COMPLEX - disp_stride*world_size), + MODE_SKIP, true, false); + } +#else + size_t proposed_count; + + // Each rank contribues: TEST_UNIFORM_COUNT elements + // Largest buffer is : TEST_UNIFORM_COUNT x world_size + + // Note: Displacement is an int, so the recv buffer cannot be too large as to overflow the int + // As such divide by the world_size + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT / (size_t)world_size, + (size_t)world_size, (size_t)world_size, 1.0); + ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, MODE_PACKED, true, true); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_INT, + (proposed_count - disp_stride*world_size) * (size_t)world_size, + MODE_SKIP, true, true); + + // Note: Displacement is an int, so the recv 
buffer cannot be too large as to overflow the int + // As such divide by the world_size + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT / (size_t)world_size, + (size_t)world_size, (size_t)world_size, 1.0); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, MODE_PACKED, true, + true); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, + (proposed_count - disp_stride*world_size) * (size_t)world_size, + MODE_SKIP, true, true); + if (allow_nonblocked) { + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT / (size_t)world_size, + (size_t)world_size, (size_t)world_size, 1.0); + ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, MODE_PACKED, true, + false); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_INT, + (proposed_count - disp_stride*world_size) * (size_t)world_size, + MODE_SKIP, true, false); + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT / (size_t)world_size, + (size_t)world_size, (size_t)world_size, 1.0); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, MODE_PACKED, + true, false); + // Adjust these to be V_SIZE_INT - displacement strides so it will pass + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, + (proposed_count - disp_stride*world_size) * (size_t)world_size, + MODE_SKIP, true, false); + } +#endif + + /* + * All done + */ + MPI_Finalize(); + return ret; +} + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, int mode, bool in_place, bool blocking) +{ + int ret = 0; + size_t i; + + // Actual payload size as divisible by the sizeof(dt) + size_t payload_size_actual; + + /* + * Initialize vector + */ + int *my_int_send_vector = NULL; + int *my_int_recv_vector = NULL; + int int_exp; + + double _Complex *my_dc_send_vector = NULL; + double _Complex *my_dc_recv_vector = NULL; + double _Complex dc_exp; + + int *my_recv_counts = NULL; + int *my_recv_disp = NULL; + int send_count = 0; + int d_idx, r_idx; + size_t last_disp, last_count; + size_t num_wrong = 0; + size_t v_size, v_rem; + MPI_Request request; + char *mpi_function = blocking ? 
"MPI_Allgatherv" : "MPI_Iallgatherv"; + + assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype); + + // total_num_elements = final recv count + // send_count = final send count + v_size = total_num_elements / world_size; + v_rem = total_num_elements % world_size; + assert(send_count <= INT_MAX); + send_count = (int)v_size; + if (0 != v_rem && world_rank == world_size-1) { + send_count += v_rem; + } + + if( MODE_PACKED == mode ) { + /* Strategy for testing: + * - Displacement should skip 0 elements producing a tightly packed buffer + * - Count will be the same at all ranks + * - buffer can be v_size elements in size + * + * NP = 4 and total_num_elements = 9 then the final buffer will be: + * [1, 1, 2, 2, 3, 3, 4, 4, 4] + */ + if( MPI_INT == dtype ) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_recv_vector = (int*)safe_malloc(payload_size_actual); + for(i = 0; i < total_num_elements; ++i) { + my_int_recv_vector[i] = -1; + } + } else { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual); + for(i = 0; i < total_num_elements; ++i) { + my_dc_recv_vector[i] = 1.0 - 1.0*I; + } + } + my_recv_counts = (int*)safe_malloc(sizeof(int) * world_size); + my_recv_disp = (int*)safe_malloc(sizeof(int) * world_size); + last_disp = 0; + last_count = v_size; + + for(d_idx = 0; d_idx < world_size; ++d_idx) { + if (0 != v_rem && d_idx == world_size-1) { + last_count += v_rem; + } + assert(last_count <= INT_MAX); + my_recv_counts[d_idx] = (int)last_count; + assert(last_disp <= INT_MAX); + my_recv_disp[d_idx] = (int)last_disp; + if( debug > 0 ) { + printf("d_idx %3d / last_disp %9d / last_count %9d | total_count %10zu / payload_size %10zu\n", + d_idx, (int)last_disp, (int)last_count, total_num_elements, payload_size_actual); + } + // Shift displacement by the count for tightly packed buffer + last_disp += last_count; + } + } else { + /* Strategy for testing: + * - Displacement should skip 2 elements before first element and between each peer making a small gap + * - Count will be the same at all ranks +/- and divisible by v_size + * - buffer can be v_size + gaps for displacements + * + * NP = 4 and total_num_elements = 9 (17 with stride) then the final buffer will be: + * [-1, -1, 1, 1, -1, -1, 2, 2, -1, -1, 3, 3, -1, -1, 4, 4, 4] + */ + total_num_elements += disp_stride * (size_t)world_size; + if( MPI_INT == dtype ) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_recv_vector = (int*)safe_malloc(payload_size_actual); + for(i = 0; i < total_num_elements; ++i) { + my_int_recv_vector[i] = -1; + } + } else { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual); + for(i = 0; i < total_num_elements; ++i) { + my_dc_recv_vector[i] = -1.0 - 1.0*I; + } + } + my_recv_counts = (int*)safe_malloc(sizeof(int) * world_size); + my_recv_disp = (int*)safe_malloc(sizeof(int) * world_size); + last_disp = disp_stride; + last_count = v_size; + + for(d_idx = 0; d_idx < world_size; ++d_idx) { + if (0 != v_rem && d_idx == world_size-1) { + last_count += v_rem; + } + assert(last_count <= INT_MAX); + my_recv_counts[d_idx] = (int)last_count; + assert(last_disp <= INT_MAX); + my_recv_disp[d_idx] = (int)last_disp; + if( debug > 0) { + printf("d_idx %3d / last_disp %9d / last_count %9d | total_count %10zu / payload_size %10zu\n", + d_idx, (int)last_disp, (int)last_count, total_num_elements, payload_size_actual); + } + // 
Shift displacement by the count for tightly packed buffer + last_disp += last_count + disp_stride; + } + } + + if( in_place ) { + if( MPI_INT == dtype ) { + for(i = 0; i < send_count; ++i) { + my_int_recv_vector[i+my_recv_disp[world_rank]] = 1 + world_rank; + } + } else { + for(i = 0; i < send_count; ++i) { + my_dc_recv_vector[i+my_recv_disp[world_rank]] = 1.0*(1+world_rank) + 1.0*(1+world_rank)*I; + } + } + } else { + if( MPI_INT == dtype ) { + my_int_send_vector = (int*)safe_malloc(sizeof(int) * send_count); + for(i = 0; i < send_count; ++i) { + my_int_send_vector[i] = 1 + world_rank; + } + } else { + my_dc_send_vector = (double _Complex*)safe_malloc(sizeof(double _Complex) * send_count); + for(i = 0; i < send_count; ++i) { + my_dc_send_vector[i] = 1.0*(1+world_rank) + 1.0*(1+world_rank)*I; + } + } + } + + if (world_rank == 0) { + printf("---------------------\nResults from %s(%s x %zu = %zu or %s): Mode: %s%s\n", + mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual), + ((MODE_PACKED == mode) ? "PACKED" : "SKIPPY"), + ((in_place) ? " MPI_IN_PLACE" : "")); + } + + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Allgatherv(in_place ? MPI_IN_PLACE : my_int_send_vector, send_count, dtype, + my_int_recv_vector, my_recv_counts, my_recv_disp, dtype, + MPI_COMM_WORLD); + } else { + MPI_Allgatherv(in_place ? MPI_IN_PLACE : my_dc_send_vector, send_count, dtype, + my_dc_recv_vector, my_recv_counts, my_recv_disp, dtype, + MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Iallgatherv(in_place ? MPI_IN_PLACE : my_int_send_vector, send_count, dtype, + my_int_recv_vector, my_recv_counts, my_recv_disp, dtype, + MPI_COMM_WORLD, &request); + } else { + MPI_Iallgatherv(in_place ? MPI_IN_PLACE : my_dc_send_vector, send_count, dtype, + my_dc_recv_vector, my_recv_counts, my_recv_disp, dtype, + MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. 
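     * PACKED mode: rank r's block of my_recv_counts[r] elements should hold
     * rank r's pattern (r+1, or (r+1)+(r+1)i for double _Complex) back to
     * back. SKIP mode: the same blocks apply, but each is preceded by
     * disp_stride gap elements that must still hold the untouched -1
     * sentinel (-1 - 1i for double _Complex).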
+ */
    int_exp = 0;
    d_idx = 0;
    r_idx = 0;

    if( world_size > 1 ) {
        last_disp = my_recv_counts[r_idx] + my_recv_disp[r_idx];
    } else {
        last_disp = 0;
    }

    if( MODE_PACKED == mode ) {
        for(i = 0; i < total_num_elements; ++i) {
            if( world_size > r_idx+1 && i == last_disp ) {
                ++r_idx;
                last_disp = my_recv_counts[r_idx] + my_recv_disp[r_idx];
            }
            int_exp = 1 + r_idx;
            if( MPI_INT == dtype ) {
                if( debug > 1) {
                    printf("CHECK: %2zu : %3d vs %3d [%3d : %3d + %3d = %3d]\n",
                           i, my_int_recv_vector[i], int_exp,
                           r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp);
                }
                if(my_int_recv_vector[i] != int_exp) {
                    ++num_wrong;
                }
            } else {
                dc_exp = 1.0*int_exp + 1.0*int_exp*I;
                if( debug > 1) {
                    printf("CHECK: %2zu : (%14.0f,%14.0fi) vs (%14.0f,%14.0fi) [%3d : %3d + %3d = %3d]\n",
                           i, creal(my_dc_recv_vector[i]), cimag(my_dc_recv_vector[i]), creal(dc_exp), cimag(dc_exp),
                           r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp);
                }
                if(my_dc_recv_vector[i] != dc_exp) {
                    ++num_wrong;
                }
            }
        }
    } else {
        for(i = 0; i < total_num_elements; ++i) {
            if( world_size > r_idx+1 && i == last_disp ) {
                ++r_idx;
                last_disp = my_recv_counts[r_idx] + my_recv_disp[r_idx];
            }
            if( i < my_recv_disp[r_idx] ) {
                int_exp = -1;
            } else {
                int_exp = 1 + r_idx;
            }
            if( MPI_INT == dtype ) {
                if( debug > 1) {
                    printf("CHECK: %2zu : %3d vs %3d [%3d : %3d + %3d = %3d]\n",
                           i, my_int_recv_vector[i], int_exp,
                           r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp);
                }
                if(my_int_recv_vector[i] != int_exp) {
                    ++num_wrong;
                }
            } else {
                dc_exp = 1.0*int_exp + 1.0*int_exp*I;
                if( debug > 1) {
                    printf("CHECK: %2zu : (%14.0f,%14.0fi) vs (%14.0f,%14.0fi) [%3d : %3d + %3d = %3d]\n",
                           i, creal(my_dc_recv_vector[i]), cimag(my_dc_recv_vector[i]), creal(dc_exp), cimag(dc_exp),
                           r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp);
                }
                if(my_dc_recv_vector[i] != dc_exp) {
                    ++num_wrong;
                }
            }
        }
    }

    if( 0 == num_wrong) {
        printf("Rank %2d: PASSED\n", world_rank);
    } else {
        printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank,
               num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0);
        ret = 1;
    }

    if( NULL != my_int_send_vector ) {
        free(my_int_send_vector);
    }
    if( NULL != my_int_recv_vector ){
        free(my_int_recv_vector);
    }
    if( NULL != my_dc_send_vector ) {
        free(my_dc_send_vector);
    }
    if( NULL != my_dc_recv_vector ){
        free(my_dc_recv_vector);
    }
    fflush(NULL);
    MPI_Barrier(MPI_COMM_WORLD);

    return ret;
}

diff --git a/collective-big-count/test_allreduce.c b/collective-big-count/test_allreduce.c
new file mode 100644
index 0000000..54d12c4
--- /dev/null
+++ b/collective-big-count/test_allreduce.c
@@ -0,0 +1,181 @@
/*
 * Copyright (c) 2021-2022 IBM Corporation. All rights reserved.
 *
 * $COPYRIGHT$
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <mpi.h>
#include "common.h"

int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking);

/**
 * This example shows that MPI_Allreduce can fail for large arrays when the
 * number of processes is large. On the other hand, the equivalent
 * MPI_Reduce + MPI_Bcast combination works.
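 *
 * Every element starts as 1 (int) or 1 - 1i (double _Complex), so after the
 * MPI_SUM reduction each slot must equal world_size (or world_size -
 * world_size*i). A minimal sketch of the in-place call being exercised here:
 *   MPI_Allreduce(MPI_IN_PLACE, buf, (int)count, dtype, MPI_SUM, MPI_COMM_WORLD);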
+ */ +int main(int argc, char** argv) { + /* + * Initialize the MPI environment + */ + int ret = 0; + + MPI_Init(NULL, NULL); + init_environment(argc, argv); + + // Run the tests +#ifndef TEST_UNIFORM_COUNT + // Each rank contribues: V_SIZE_INT elements + // Largest buffer is : V_SIZE_INT elements + ret += my_c_test_core(MPI_INT, V_SIZE_INT, true); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true); + if (allow_nonblocked) { + ret += my_c_test_core(MPI_INT, V_SIZE_INT, false); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false); + } +#else + size_t proposed_count; + + // Each rank contribues: TEST_UNIFORM_COUNT elements + // Largest buffer is : TEST_UNIFORM_COUNT elements + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT, + 2, 2, 1.0); // 1 send, 1 recv buffer each + ret += my_c_test_core(MPI_INT, proposed_count, true); + + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT, + 2, 2, 1.0); // 1 send, 1 recv buffer each + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count, true); + if (allow_nonblocked) { + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT, + 2, 2, 1.0); // 1 send, 1 recv buffer each + ret += my_c_test_core(MPI_INT, proposed_count, false); + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT, + 2, 2, 1.0); // 1 send, 1 recv buffer each + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count, false); + } +#endif + + /* + * All done + */ + MPI_Finalize(); + return ret; +} + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking) +{ + int ret = 0; + size_t i; + MPI_Request request; + + char *mpi_function = blocking ? "MPI_Allreduce" : "MPI_Iallreduce"; + // Actual payload size as divisible by the sizeof(dt) + size_t payload_size_actual; + + /* + * Initialize vector + */ + int *my_int_recv_vector = NULL; + int *my_int_send_vector = NULL; + double _Complex *my_dc_recv_vector = NULL; + double _Complex *my_dc_send_vector = NULL; + size_t num_wrong = 0; + + assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype); + + if( MPI_INT == dtype ) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_recv_vector = (int*)safe_malloc(payload_size_actual); + //my_int_send_vector = (int*)safe_malloc(payload_size_actual); + } else { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual); + //my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual); + } + + for(i = 0; i < total_num_elements; ++i) { + if( MPI_INT == dtype ) { + my_int_recv_vector[i] = 1; + } else { + my_dc_recv_vector[i] = 1.0 - 1.0*I; + } + } + + /* + * MPI_Allreduce fails when size of my_int_vector is large + */ + if (world_rank == 0) { + printf("---------------------\nResults from %s(%s x %zu = %zu or %s):\n", + mpi_function, (MPI_INT == dtype ? 
"int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual)); + } + assert(total_num_elements <= INT_MAX); + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Allreduce(MPI_IN_PLACE, my_int_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, MPI_COMM_WORLD); + } else { + MPI_Allreduce(MPI_IN_PLACE, my_dc_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Iallreduce(MPI_IN_PLACE, my_int_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, MPI_COMM_WORLD, &request); + } else { + MPI_Iallreduce(MPI_IN_PLACE, my_dc_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. + * The exact result = (size*number_of_processes, -size*number_of_processes) + */ + for(i = 0; i < total_num_elements; ++i) { + if( MPI_INT == dtype ) { + if(my_int_recv_vector[i] != world_size) { + ++num_wrong; + } + } else { + if(my_dc_recv_vector[i] != 1.0*world_size - 1.0*world_size*I) { + ++num_wrong; + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0); + ret = 1; + } + + if( NULL != my_int_send_vector ) { + free(my_int_send_vector); + } + if( NULL != my_int_recv_vector ){ + free(my_int_recv_vector); + } + if( NULL != my_dc_send_vector ) { + free(my_dc_send_vector); + } + if( NULL != my_dc_recv_vector ){ + free(my_dc_recv_vector); + } + + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_alltoall.c b/collective-big-count/test_alltoall.c new file mode 100644 index 0000000..4ab7567 --- /dev/null +++ b/collective-big-count/test_alltoall.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. + * + * $COPYRIGHT$ + */ +#include +#include +#include + +#include +#include "common.h" + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking); + +int main(int argc, char** argv) { + /* + * Initialize the MPI environment + */ + int ret = 0; + + MPI_Init(NULL, NULL); + init_environment(argc, argv); + +#ifndef TEST_UNIFORM_COUNT + // Buffer size: 2 GB + // V_SIZE_INT tells us how many elements are needed to reach 2GB payload + // Each rank will send/recv a count of V_SIZE_INT / world_size + // The function will try to get as close to that as possible. 
+ // + // Each rank contribues: V_SIZE_INT / world_size elements + // Largest buffer is : V_SIZE_INT elements + ret += my_c_test_core(MPI_INT, V_SIZE_INT, true); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true); + if (allow_nonblocked) { + ret += my_c_test_core(MPI_INT, V_SIZE_INT, false); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false); + } +#else + size_t proposed_count; + + // Each rank contribues: TEST_UNIFORM_COUNT elements + // Largest buffer is : TEST_UNIFORM_COUNT x world_size + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, true); + + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, true); + if (allow_nonblocked) { + proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, false); + proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT, + (size_t)world_size, (size_t)world_size, alg_inflation); + ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, false); + } +#endif + + /* + * All done + */ + MPI_Finalize(); + return ret; +} + +int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking) +{ + int ret = 0; + size_t i; + + // Actual payload size as divisible by the sizeof(dt) + size_t payload_size_actual; + + /* + * Initialize vector + */ + int *my_int_recv_vector = NULL; + int *my_int_send_vector = NULL; + double _Complex *my_dc_recv_vector = NULL; + double _Complex *my_dc_send_vector = NULL; + size_t recv_count = 0; + size_t send_count = 0; + int exp; + size_t num_wrong = 0; + MPI_Request request; + char *mpi_function = blocking ? "MPI_Alltoall" : "MPI_Ialltoall"; + + assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype); + + send_count = recv_count = total_num_elements / (size_t)world_size; + // total_num_elements must be a multiple of world_size. Drop any remainder + total_num_elements = send_count * (size_t)world_size; + + if( MPI_INT == dtype ) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_recv_vector = (int*)safe_malloc(payload_size_actual); + //my_int_send_vector = (int*)safe_malloc(payload_size_actual); + } else { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual); + //my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual); + } + + for(i = 0; i < total_num_elements; ++i) { + exp = (int)((i / (size_t)recv_count) + ((world_rank+1)*2) + (i % (size_t)recv_count)); + if( MPI_INT == dtype ) { + my_int_recv_vector[i] = exp; + } else { + my_dc_recv_vector[i] = 1.0*exp - 1.0*exp*I; + } + } + + if (world_rank == 0) { + printf("---------------------\nResults from %s(%s x %zu = %zu or %s): MPI_IN_PLACE\n", + mpi_function, (MPI_INT == dtype ? 
"int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual)); + } + assert(send_count <= INT_MAX); + assert(recv_count <= INT_MAX); + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Alltoall(MPI_IN_PLACE, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD); + } else { + MPI_Alltoall(MPI_IN_PLACE, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Ialltoall(MPI_IN_PLACE, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD, &request); + } else { + MPI_Ialltoall(MPI_IN_PLACE, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. + */ + exp = 0; + for(i = 0; i < total_num_elements; ++i) { + // Dest_Rank + Src_Rank + counter + exp = (int)( (((i / (size_t)recv_count)+1)*2) + world_rank + (i % (size_t)recv_count)); + if( MPI_INT == dtype ) { + if(my_int_recv_vector[i] != exp) { + ++num_wrong; + } + } else { + if(my_dc_recv_vector[i] != 1.0*exp - 1.0*exp*I) { + ++num_wrong; + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, recv_count, ((num_wrong * 1.0)/recv_count)*100.0); + ret = 1; + } + + if( NULL != my_int_send_vector ) { + free(my_int_send_vector); + } + if( NULL != my_int_recv_vector ){ + free(my_int_recv_vector); + } + if( NULL != my_dc_send_vector ) { + free(my_dc_send_vector); + } + if( NULL != my_dc_recv_vector ){ + free(my_dc_recv_vector); + } + + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_bcast.c b/collective-big-count/test_bcast.c new file mode 100644 index 0000000..ef1fd0f --- /dev/null +++ b/collective-big-count/test_bcast.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "common.h"
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking);
+
+int main(int argc, char** argv) {
+    /*
+     * Initialize the MPI environment
+     */
+    int ret = 0;
+
+    MPI_Init(NULL, NULL);
+    init_environment(argc, argv);
+
+#ifndef TEST_UNIFORM_COUNT
+    // Each rank contributes: V_SIZE_INT elements
+    // Largest buffer is    : V_SIZE_INT elements
+    ret += my_c_test_core(MPI_INT, V_SIZE_INT, true);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true);
+    if (allow_nonblocked) {
+        ret += my_c_test_core(MPI_INT, V_SIZE_INT, false);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false);
+    }
+#else
+    size_t proposed_count;
+
+    // Each rank contributes: TEST_UNIFORM_COUNT elements
+    // Largest buffer is    : TEST_UNIFORM_COUNT elements
+    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                        2, 2, 1.0); // 1 send, 1 recv buffer each
+    ret += my_c_test_core(MPI_INT, proposed_count, true);
+    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                        2, 2, 1.0); // 1 send, 1 recv buffer each
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count, true);
+    if (allow_nonblocked) {
+        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                            2, 2, 1.0); // 1 send, 1 recv buffer each
+        ret += my_c_test_core(MPI_INT, proposed_count, false);
+        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                            2, 2, 1.0); // 1 send, 1 recv buffer each
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count, false);
+    }
+#endif
+
+    /*
+     * All done
+     */
+    MPI_Finalize();
+    return ret;
+}
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking)
+{
+    int ret = 0;
+    size_t i;
+    MPI_Request request;
+    char *mpi_function = blocking ? "MPI_Bcast" : "MPI_Ibcast";
+
+    // Actual payload size as divisible by the sizeof(dt)
+    size_t payload_size_actual;
+
+    /*
+     * Initialize vector
+     */
+    int *my_int_vector = NULL;
+    double _Complex *my_dc_vector = NULL;
+    void *buff_ptr = NULL;
+    size_t num_wrong = 0;
+
+    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
+    assert(total_num_elements <= INT_MAX);
+
+    if( MPI_INT == dtype ) {
+        payload_size_actual = total_num_elements * sizeof(int);
+        my_int_vector = (int*)safe_malloc(payload_size_actual);
+        buff_ptr = my_int_vector;
+    } else {
+        payload_size_actual = total_num_elements * sizeof(double _Complex);
+        my_dc_vector = (double _Complex*)safe_malloc(payload_size_actual);
+        buff_ptr = my_dc_vector;
+    }
+
+    for(i = 0; i < total_num_elements; ++i) {
+        if( MPI_INT == dtype ) {
+            if (world_rank == 0) {
+                my_int_vector[i] = (int)i;
+            } else {
+                my_int_vector[i] = -1;
+            }
+        } else {
+            if (world_rank == 0) {
+                my_dc_vector[i] = 1.0*i - 1.0*i*I;
+            } else {
+                my_dc_vector[i] = -1.0 - 1.0*I;
+            }
+        }
+    }
+
+    if (world_rank == 0) {
+        printf("---------------------\nResults from %s(%s x %zu = %zu or %s):\n",
+               mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"),
+               total_num_elements, payload_size_actual, human_bytes(payload_size_actual));
+    }
+    if (blocking) {
+        MPI_Bcast(buff_ptr, (int)total_num_elements, dtype, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Ibcast(buff_ptr, (int)total_num_elements, dtype, 0, MPI_COMM_WORLD, &request);
+        MPI_Wait(&request, MPI_STATUS_IGNORE);
+    }
+
+    /*
+     * Check results.
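+     * After the broadcast, every rank should hold root's pattern:
+     * element i == i for int, or i - i*I for double _Complex.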
+     */
+    for(i = 0; i < total_num_elements; ++i) {
+        if( MPI_INT == dtype ) {
+            if(my_int_vector[i] != (int)i) {
+                ++num_wrong;
+            }
+        } else {
+            if(my_dc_vector[i] != 1.0*i - 1.0*i*I) {
+                ++num_wrong;
+            }
+        }
+    }
+
+    if( 0 == num_wrong) {
+        printf("Rank %2d: PASSED\n", world_rank);
+    } else {
+        printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank,
+               num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0);
+        ret = 1;
+    }
+
+    if(NULL != my_int_vector) {
+        free(my_int_vector);
+    }
+    if(NULL != my_dc_vector) {
+        free(my_dc_vector);
+    }
+    fflush(NULL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    return ret;
+}
diff --git a/collective-big-count/test_gather.c b/collective-big-count/test_gather.c
new file mode 100644
index 0000000..88863fe
--- /dev/null
+++ b/collective-big-count/test_gather.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021-2022 IBM Corporation. All rights reserved.
+ *
+ * $COPYRIGHT$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "common.h"
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking);
+
+int main(int argc, char** argv) {
+    /*
+     * Initialize the MPI environment
+     */
+    int ret = 0;
+
+    MPI_Init(NULL, NULL);
+    init_environment(argc, argv);
+
+#ifndef TEST_UNIFORM_COUNT
+    // Each rank contributes: V_SIZE_INT / world_size elements
+    // Largest buffer is    : V_SIZE_INT elements
+    ret += my_c_test_core(MPI_INT, V_SIZE_INT, true);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true);
+    if (allow_nonblocked) {
+        ret += my_c_test_core(MPI_INT, V_SIZE_INT, false);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false);
+    }
+#else
+    size_t proposed_count;
+
+    // Each rank contributes: TEST_UNIFORM_COUNT elements
+    // Largest buffer is    : TEST_UNIFORM_COUNT x world_size
+    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, true);
+
+    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, true);
+    if (allow_nonblocked) {
+        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, false);
+        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, false);
+    }
+#endif
+
+    /*
+     * All done
+     */
+    MPI_Finalize();
+    return ret;
+}
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking)
+{
+    int ret = 0;
+    size_t i;
+    MPI_Request request;
+    char *mpi_function = blocking ? "MPI_Gather" : "MPI_Igather";
+
+    // Actual payload size as divisible by the sizeof(dt)
+    size_t payload_size_actual;
+
+    /*
+     * Initialize vector
+     */
+    int *my_int_recv_vector = NULL;
+    int *my_int_send_vector = NULL;
+    double _Complex *my_dc_recv_vector = NULL;
+    double _Complex *my_dc_send_vector = NULL;
+    size_t recv_count = 0;
+    size_t send_count = 0;
+    int exp;
+    size_t num_wrong = 0;
+
+    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
+
+    send_count = total_num_elements / (size_t)world_size;
+    recv_count = total_num_elements / (size_t)world_size;
+    assert(send_count <= INT_MAX);
+    assert(recv_count <= INT_MAX);
+    // total_num_elements must be a multiple of world_size.
Drop any remainder + total_num_elements = send_count * (size_t)world_size; + + if( MPI_INT == dtype ) { + if (world_rank == 0) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_recv_vector = (int*)safe_malloc(payload_size_actual); + } + my_int_send_vector = (int*)safe_malloc(send_count * sizeof(int)); + } else { + if (world_rank == 0) { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual); + } + my_dc_send_vector = (double _Complex*)safe_malloc(send_count * sizeof(double _Complex)); + } + + for(i = 0; i < send_count; ++i) { + exp = 1 + world_rank; + if( MPI_INT == dtype ) { + my_int_send_vector[i] = exp; + } else { + my_dc_send_vector[i] = 1.0*exp - 1.0*exp*I; + } + } + if (world_rank == 0) { + for(i = 0; i < total_num_elements; ++i) { + if( MPI_INT == dtype ) { + my_int_recv_vector[i] = -1; + } else { + my_dc_recv_vector[i] = 1.0 + 1.0*I; + } + } + } + + if (world_rank == 0) { + printf("---------------------\nResults from %s(%s x %zu = %zu or %s):\n", + mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual)); + } + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Gather(my_int_send_vector, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD); + } else { + MPI_Gather(my_dc_send_vector, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Igather(my_int_send_vector, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD, &request); + } else { + MPI_Igather(my_dc_send_vector, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. + */ + if (world_rank == 0) { + exp = 0; + for(i = 0; i < total_num_elements; ++i) { + exp = (int)(1 + (i / (size_t)recv_count)); + if( MPI_INT == dtype ) { + if(my_int_recv_vector[i] != exp) { + ++num_wrong; + } + } else { + if(my_dc_recv_vector[i] != 1.0*exp - 1.0*exp*I) { + ++num_wrong; + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0); + ret = 1; + } + } + + if( NULL != my_int_send_vector ) { + free(my_int_send_vector); + } + if( NULL != my_int_recv_vector ){ + free(my_int_recv_vector); + } + if( NULL != my_dc_send_vector ) { + free(my_dc_send_vector); + } + if( NULL != my_dc_recv_vector ){ + free(my_dc_recv_vector); + } + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_gatherv.c b/collective-big-count/test_gatherv.c new file mode 100644 index 0000000..aca805a --- /dev/null +++ b/collective-big-count/test_gatherv.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "common.h"
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, int mode, bool blocking);
+
+int main(int argc, char** argv) {
+    /*
+     * Initialize the MPI environment
+     */
+    int ret = 0;
+
+    MPI_Init(NULL, NULL);
+    init_environment(argc, argv);
+
+#ifndef TEST_UNIFORM_COUNT
+    // Each rank contributes: V_SIZE_INT / world_size elements
+    // Largest buffer is    : V_SIZE_INT elements
+    ret += my_c_test_core(MPI_INT, V_SIZE_INT, MODE_PACKED, true);
+    // Adjust these to be V_SIZE_INT - displacement strides so it will pass
+    ret += my_c_test_core(MPI_INT,
+                          (V_SIZE_INT - disp_stride*world_size),
+                          MODE_SKIP, true);
+
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, MODE_PACKED, true);
+    // Adjust these to be V_SIZE_DOUBLE_COMPLEX - displacement strides so it will pass
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                          (V_SIZE_DOUBLE_COMPLEX - disp_stride*world_size),
+                          MODE_SKIP, true);
+    if (allow_nonblocked) {
+        ret += my_c_test_core(MPI_INT, V_SIZE_INT, MODE_PACKED, false);
+        // Adjust these to be V_SIZE_INT - displacement strides so it will pass
+        ret += my_c_test_core(MPI_INT,
+                              (V_SIZE_INT - disp_stride*world_size),
+                              MODE_SKIP, false);
+
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, MODE_PACKED, false);
+        // Adjust these to be V_SIZE_DOUBLE_COMPLEX - displacement strides so it will pass
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                              (V_SIZE_DOUBLE_COMPLEX - disp_stride*world_size),
+                              MODE_SKIP, false);
+    }
+#else
+    size_t proposed_count;
+
+    // Each rank contributes: TEST_UNIFORM_COUNT elements
+    // Largest buffer is    : TEST_UNIFORM_COUNT x world_size
+
+    // Note: Displacement is an int, so the recv buffer cannot be so large as to overflow the int
+    // As such divide by the world_size
+    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, MODE_PACKED, true);
+    // Adjust these to be proposed_count - displacement strides so it will pass
+    ret += my_c_test_core(MPI_INT,
+                          (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                          MODE_SKIP, true);
+
+    // Note: Displacement is an int, so the recv buffer cannot be so large as to overflow the int
+    // As such divide by the world_size
+    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, MODE_PACKED, true);
+    // Adjust these to be proposed_count - displacement strides so it will pass
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                          (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                          MODE_SKIP, true);
+    if (allow_nonblocked) {
+        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, MODE_PACKED, false);
+        // Adjust these to be proposed_count - displacement strides so it will pass
+        ret += my_c_test_core(MPI_INT,
+                              (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                              MODE_SKIP, false);
+        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, MODE_PACKED,
+                              false);
+        // Adjust these to be proposed_count - displacement strides so it will
pass
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                              (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                              MODE_SKIP, false);
+    }
+#endif
+
+    /*
+     * All done
+     */
+    MPI_Finalize();
+    return ret;
+}
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, int mode, bool blocking)
+{
+    int ret = 0;
+    size_t i;
+
+    // Actual payload size as divisible by the sizeof(dt)
+    size_t payload_size_actual;
+
+    /*
+     * Initialize vector
+     */
+    int *my_int_send_vector = NULL;
+    int *my_int_recv_vector = NULL;
+    int int_exp;
+
+    double _Complex *my_dc_send_vector = NULL;
+    double _Complex *my_dc_recv_vector = NULL;
+    double _Complex dc_exp;
+
+    int *my_recv_counts = NULL;
+    int *my_recv_disp = NULL;
+    int send_count = 0;
+    int d_idx, r_idx;
+    size_t last_disp, last_count;
+    size_t num_wrong = 0;
+    size_t v_size, v_rem;
+    MPI_Request request;
+    char *mpi_function = blocking ? "MPI_Gatherv" : "MPI_Igatherv";
+
+    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
+
+    // total_num_elements = final recv count (at root)
+    // send_count         = final send count
+    v_size = total_num_elements / world_size;
+    v_rem  = total_num_elements % world_size;
+    if (0 != v_rem && world_rank == world_size-1) {
+        v_size += v_rem;
+    }
+    assert(total_num_elements <= INT_MAX);
+    assert(v_size <= INT_MAX);
+    send_count = (int)v_size;
+
+    if (world_rank == 0) {
+        if( MODE_PACKED == mode ) {
+            /* Strategy for testing:
+             *  - Displacement should skip 0 elements producing a tightly packed buffer
+             *  - Count will be the same at all ranks
+             *  - buffer can be v_size elements in size
+             *
+             * NP = 4 and total_num_elements = 9 then the final buffer will be:
+             *   [1, 1, 2, 2, 3, 3, 4, 4, 4]
+             */
+            if( MPI_INT == dtype ) {
+                payload_size_actual = total_num_elements * sizeof(int);
+                my_int_recv_vector = (int*)safe_malloc(payload_size_actual);
+                for(i = 0; i < total_num_elements; ++i) {
+                    my_int_recv_vector[i] = -1;
+                }
+            } else {
+                payload_size_actual = total_num_elements * sizeof(double _Complex);
+                my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual);
+                for(i = 0; i < total_num_elements; ++i) {
+                    my_dc_recv_vector[i] = 1.0 - 1.0*I;
+                }
+            }
+            my_recv_counts = (int*)safe_malloc(sizeof(int) * world_size);
+            my_recv_disp = (int*)safe_malloc(sizeof(int) * world_size);
+            last_disp = 0;
+            last_count = v_size;
+
+            for(d_idx = 0; d_idx < world_size; ++d_idx) {
+                if (0 != v_rem && d_idx == world_size-1) {
+                    last_count += v_rem;
+                }
+                assert(last_count <= INT_MAX);
+                my_recv_counts[d_idx] = (int)last_count;
+                assert(last_disp <= INT_MAX);
+                my_recv_disp[d_idx] = (int)last_disp;
+                if( debug > 0 ) {
+                    printf("d_idx %3d / last_disp %9d / last_count %9d | total_count %10zu / payload_size %10zu\n",
+                           d_idx, (int)last_disp, (int)last_count, total_num_elements, payload_size_actual);
+                }
+                // Shift displacement by the count for a tightly packed buffer
+                last_disp += last_count;
+            }
+        } else {
+            /* Strategy for testing:
+             *  - Displacement should skip 2 elements before the first element and between each peer, making a small gap
+             *  - Count will be the same at all ranks (+ the remainder at the last rank)
+             *  - buffer must hold v_size elements plus the gaps for the displacements
+             *
+             * NP = 4 and total_num_elements = 9 (17 with stride) then the final buffer will be:
+             *   [-1, -1, 1, 1, -1, -1, 2, 2, -1, -1, 3, 3, -1, -1, 4, 4, 4]
+             */
+            total_num_elements += disp_stride * (size_t)world_size;
+            if( MPI_INT == dtype ) {
+                payload_size_actual = total_num_elements * sizeof(int);
+                my_int_recv_vector = (int*)safe_malloc(payload_size_actual);
+                for(i = 0; i <
total_num_elements; ++i) {
+                    my_int_recv_vector[i] = -1;
+                }
+            } else {
+                payload_size_actual = total_num_elements * sizeof(double _Complex);
+                my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual);
+                for(i = 0; i < total_num_elements; ++i) {
+                    my_dc_recv_vector[i] = -1.0 - 1.0*I;
+                }
+            }
+            my_recv_counts = (int*)safe_malloc(sizeof(int) * world_size);
+            my_recv_disp = (int*)safe_malloc(sizeof(int) * world_size);
+            last_disp = disp_stride;
+            last_count = v_size;
+
+            for(d_idx = 0; d_idx < world_size; ++d_idx) {
+                if (0 != v_rem && d_idx == world_size-1) {
+                    last_count += v_rem;
+                }
+                assert(last_count <= INT_MAX);
+                my_recv_counts[d_idx] = (int)last_count;
+                assert(last_disp <= INT_MAX);
+                my_recv_disp[d_idx] = (int)last_disp;
+                if( debug > 0) {
+                    printf("d_idx %3d / last_disp %9d / last_count %9d | total_count %10zu / payload_size %10zu\n",
+                           d_idx, (int)last_disp, (int)last_count, total_num_elements, payload_size_actual);
+                }
+                // Shift displacement by the count plus the stride gap
+                last_disp += last_count + disp_stride;
+            }
+        }
+    }
+
+    if( MPI_INT == dtype ) {
+        my_int_send_vector = (int*)safe_malloc(sizeof(int) * send_count);
+        for(i = 0; i < send_count; ++i) {
+            my_int_send_vector[i] = 1 + world_rank;
+        }
+    } else {
+        my_dc_send_vector = (double _Complex*)safe_malloc(sizeof(double _Complex) * send_count);
+        for(i = 0; i < send_count; ++i) {
+            my_dc_send_vector[i] = 1.0*(1+world_rank) + 1.0*(1+world_rank)*I;
+        }
+    }
+
+    if (world_rank == 0) {
+        printf("---------------------\nResults from %s(%s x %zu = %zu or %s): Mode: %s\n",
+               mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"),
+               total_num_elements, payload_size_actual, human_bytes(payload_size_actual),
+               (MODE_PACKED == mode) ? "PACKED" : "SKIPPY");
+    }
+
+    if (blocking) {
+        if( MPI_INT == dtype ) {
+            MPI_Gatherv(my_int_send_vector, send_count, dtype,
+                        my_int_recv_vector, my_recv_counts, my_recv_disp, dtype,
+                        0, MPI_COMM_WORLD);
+        } else {
+            MPI_Gatherv(my_dc_send_vector, send_count, dtype,
+                        my_dc_recv_vector, my_recv_counts, my_recv_disp, dtype,
+                        0, MPI_COMM_WORLD);
+        }
+    }
+    else {
+        if( MPI_INT == dtype ) {
+            MPI_Igatherv(my_int_send_vector, send_count, dtype,
+                         my_int_recv_vector, my_recv_counts, my_recv_disp, dtype,
+                         0, MPI_COMM_WORLD, &request);
+        } else {
+            MPI_Igatherv(my_dc_send_vector, send_count, dtype,
+                         my_dc_recv_vector, my_recv_counts, my_recv_disp, dtype,
+                         0, MPI_COMM_WORLD, &request);
+        }
+        MPI_Wait(&request, MPI_STATUS_IGNORE);
+    }
+
+    /*
+     * Check results.
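+     * Root walks the gathered buffer chunk by chunk: every element inside
+     * rank r's chunk should equal 1 + r, and in SKIP mode the displacement
+     * gaps should still hold the -1 fill values.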
+ */ + if (world_rank == 0) { + int_exp = 0; + d_idx = 0; + r_idx = 0; + + if( world_size > 1 ) { + last_disp = my_recv_counts[r_idx] + my_recv_disp[r_idx]; + } else { + last_disp = 0; + } + + if( MODE_PACKED == mode ) { + for(i = 0; i < total_num_elements; ++i) { + if( world_size > r_idx+1 && i == last_disp ) { + ++r_idx; + last_disp = my_recv_counts[r_idx] + my_recv_disp[r_idx]; + } + int_exp = 1 + r_idx; + if( MPI_INT == dtype ) { + if( debug > 1) { + printf("CHECK: %2zu : %3d vs %3d [%3d : %3d + %3d = %3d]\n", + i, my_int_recv_vector[i], int_exp, + r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp); + } + if(my_int_recv_vector[i] != int_exp) { + ++num_wrong; + } + } else { + dc_exp = 1.0*int_exp + 1.0*int_exp*I; + if( debug > 1) { + printf("CHECK: %2zu : (%14.0f,%14.0fi) vs (%14.0f,%14.0fi) [%3d : %3d + %3d = %3d]\n", + i, creal(my_dc_recv_vector[i]), cimag(my_dc_recv_vector[i]), creal(dc_exp), cimag(dc_exp), + r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp); + } + if(my_dc_recv_vector[i] != dc_exp) { + ++num_wrong; + } + } + } + } else { + for(i = 0; i < total_num_elements; ++i) { + if( world_size > r_idx+1 && i == last_disp ) { + ++r_idx; + last_disp = my_recv_counts[r_idx] + my_recv_disp[r_idx]; + } + if( i < my_recv_disp[r_idx] ) { + int_exp = -1; + } else { + int_exp = 1 + r_idx; + } + if( MPI_INT == dtype ) { + if( debug > 1) { + printf("CHECK: %2zu : %3d vs %3d [%3d : %3d + %3d = %3d]\n", + i, my_int_recv_vector[i], int_exp, + r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp); + } + if(my_int_recv_vector[i] != int_exp) { + ++num_wrong; + } + } else { + dc_exp = 1.0*int_exp + 1.0*int_exp*I; + if( debug > 1) { + printf("CHECK: %2zu : (%14.0f,%14.0fi) vs (%14.0f,%14.0fi) [%3d : %3d + %3d = %3d]\n", + i, creal(my_dc_recv_vector[i]), cimag(my_dc_recv_vector[i]), creal(dc_exp), cimag(dc_exp), + r_idx, my_recv_counts[r_idx], my_recv_disp[r_idx], (int)last_disp); + } + if(my_dc_recv_vector[i] != dc_exp) { + ++num_wrong; + } + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0); + ret = 1; + } + } + + if( NULL != my_int_send_vector ) { + free(my_int_send_vector); + } + if( NULL != my_int_recv_vector ){ + free(my_int_recv_vector); + } + if( NULL != my_dc_send_vector ) { + free(my_dc_send_vector); + } + if( NULL != my_dc_recv_vector ){ + free(my_dc_recv_vector); + } + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_reduce.c b/collective-big-count/test_reduce.c new file mode 100644 index 0000000..462eb59 --- /dev/null +++ b/collective-big-count/test_reduce.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "common.h"
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking);
+
+int main(int argc, char** argv) {
+    /*
+     * Initialize the MPI environment
+     */
+    int ret = 0;
+
+    MPI_Init(NULL, NULL);
+    init_environment(argc, argv);
+
+#ifndef TEST_UNIFORM_COUNT
+    // Each rank contributes: V_SIZE_INT elements
+    // Largest buffer is    : V_SIZE_INT elements
+    ret += my_c_test_core(MPI_INT, V_SIZE_INT, true);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true);
+    if (allow_nonblocked) {
+        ret += my_c_test_core(MPI_INT, V_SIZE_INT, false);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false);
+    }
+#else
+    size_t proposed_count;
+
+    // Each rank contributes: TEST_UNIFORM_COUNT elements
+    // Largest buffer is    : TEST_UNIFORM_COUNT elements
+    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                        2, 2, 1.0); // 1 send, 1 recv buffer each
+    ret += my_c_test_core(MPI_INT, proposed_count, true);
+
+    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                        2, 2, 1.0); // 1 send, 1 recv buffer each
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count, true);
+    if (allow_nonblocked) {
+        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                            2, 2, 1.0); // 1 send, 1 recv buffer each
+        ret += my_c_test_core(MPI_INT, proposed_count, false);
+        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                            2, 2, 1.0); // 1 send, 1 recv buffer each
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count, false);
+    }
+#endif
+
+    /*
+     * All done
+     */
+    MPI_Finalize();
+    return ret;
+}
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking)
+{
+    int ret = 0;
+    size_t i;
+    MPI_Request request;
+    char *mpi_function = blocking ? "MPI_Reduce" : "MPI_Ireduce";
+
+    // Actual payload size as divisible by the sizeof(dt)
+    size_t payload_size_actual;
+
+    /*
+     * Initialize vector
+     */
+    int *my_int_recv_vector = NULL;
+    int *my_int_send_vector = NULL;
+    double _Complex *my_dc_recv_vector = NULL;
+    double _Complex *my_dc_send_vector = NULL;
+    size_t num_wrong = 0;
+
+    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
+    assert(total_num_elements <= INT_MAX);
+
+    if( MPI_INT == dtype ) {
+        payload_size_actual = total_num_elements * sizeof(int);
+        if (world_rank == 0) {
+            my_int_recv_vector = (int*)safe_malloc(payload_size_actual);
+        }
+        my_int_send_vector = (int*)safe_malloc(payload_size_actual);
+    } else {
+        payload_size_actual = total_num_elements * sizeof(double _Complex);
+        if (world_rank == 0) {
+            my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual);
+        }
+        my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual);
+    }
+
+    for(i = 0; i < total_num_elements; ++i) {
+        if( MPI_INT == dtype ) {
+            my_int_send_vector[i] = 1;
+            if (world_rank == 0) {
+                my_int_recv_vector[i] = -1;
+            }
+        } else {
+            my_dc_send_vector[i] = 1.0 - 1.0*I;
+            if (world_rank == 0) {
+                my_dc_recv_vector[i] = -1.0 + 1.0*I;
+            }
+        }
+    }
+
+    if (world_rank == 0) {
+        printf("---------------------\nResults from %s(%s x %zu = %zu or %s):\n",
+               mpi_function, (MPI_INT == dtype ?
"int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual)); + } + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Reduce(my_int_send_vector, my_int_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, 0, MPI_COMM_WORLD); + } else { + MPI_Reduce(my_dc_send_vector, my_dc_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, 0, MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Ireduce(my_int_send_vector, my_int_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, 0, MPI_COMM_WORLD, &request); + } else { + MPI_Ireduce(my_dc_send_vector, my_dc_recv_vector, + (int)total_num_elements, dtype, + MPI_SUM, 0, MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. + * The exact result = (size*number_of_processes, -size*number_of_processes) + */ + if (world_rank == 0) { + for(i = 0; i < total_num_elements; ++i) { + if( MPI_INT == dtype ) { + if(my_int_recv_vector[i] != world_size) { + ++num_wrong; + } + } else { + if(my_dc_recv_vector[i] != 1.0*world_size - 1.0*world_size*I) { + ++num_wrong; + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, total_num_elements, ((num_wrong * 1.0)/total_num_elements)*100.0); + ret = 1; + } + } + + if( NULL != my_int_send_vector ) { + free(my_int_send_vector); + } + if( NULL != my_int_recv_vector ){ + free(my_int_recv_vector); + } + if( NULL != my_dc_send_vector ) { + free(my_dc_send_vector); + } + if( NULL != my_dc_recv_vector ){ + free(my_dc_recv_vector); + } + + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_scatter.c b/collective-big-count/test_scatter.c new file mode 100644 index 0000000..d97a377 --- /dev/null +++ b/collective-big-count/test_scatter.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "common.h"
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking);
+
+int main(int argc, char** argv) {
+    /*
+     * Initialize the MPI environment
+     */
+    int ret = 0;
+
+    MPI_Init(NULL, NULL);
+    init_environment(argc, argv);
+
+    // Run the tests
+#ifndef TEST_UNIFORM_COUNT
+    // Each rank contributes: V_SIZE_INT / world_size elements
+    // Largest buffer is    : V_SIZE_INT elements
+    ret += my_c_test_core(MPI_INT, V_SIZE_INT, true);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true);
+    if (allow_nonblocked) {
+        ret += my_c_test_core(MPI_INT, V_SIZE_INT, false);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false);
+    }
+#else
+    size_t proposed_count;
+
+    // Each rank contributes: TEST_UNIFORM_COUNT elements
+    // Largest buffer is    : TEST_UNIFORM_COUNT x world_size
+    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, true);
+
+    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size,
+                          true);
+    if (allow_nonblocked) {
+        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, false);
+        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size,
+                              false);
+    }
+#endif
+
+    /*
+     * All done
+     */
+    MPI_Finalize();
+    return ret;
+}
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking)
+{
+    int ret = 0;
+    size_t i;
+    MPI_Request request;
+    char *mpi_function = blocking ? "MPI_Scatter" : "MPI_Iscatter";
+
+    // Actual payload size as divisible by the sizeof(dt)
+    size_t payload_size_actual;
+
+    /*
+     * Initialize vector
+     */
+    int *my_int_recv_vector = NULL;
+    int *my_int_send_vector = NULL;
+    double _Complex *my_dc_recv_vector = NULL;
+    double _Complex *my_dc_send_vector = NULL;
+    size_t recv_count = 0;
+    size_t send_count = 0;
+    int exp;
+    size_t num_wrong = 0;
+
+    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
+
+    send_count = total_num_elements / (size_t)world_size;
+    recv_count = total_num_elements / (size_t)world_size;
+    assert(send_count <= INT_MAX);
+    assert(recv_count <= INT_MAX);
+    // total_num_elements must be a multiple of world_size.
Drop any remainder + total_num_elements = send_count * (size_t)world_size; + + if( MPI_INT == dtype ) { + if (world_rank == 0) { + payload_size_actual = total_num_elements * sizeof(int); + my_int_send_vector = (int*)safe_malloc(payload_size_actual); + } + my_int_recv_vector = (int*)safe_malloc(recv_count * sizeof(int)); + } else { + if (world_rank == 0) { + payload_size_actual = total_num_elements * sizeof(double _Complex); + my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual); + } + my_dc_recv_vector = (double _Complex*)safe_malloc(recv_count * sizeof(double _Complex)); + } + + for(i = 0; i < recv_count; ++i) { + if( MPI_INT == dtype ) { + my_int_recv_vector[i] = -1; + } else { + my_dc_recv_vector[i] = 1.0 + 1.0*I; + } + } + if (world_rank == 0) { + for(i = 0; i < total_num_elements; ++i) { + // Rank + counter + exp = (int)((i / (size_t)send_count) + (i % (size_t)send_count)); + if( MPI_INT == dtype ) { + my_int_send_vector[i] = exp; + } else { + my_dc_send_vector[i] = 1.0*exp - 1.0*exp*I; + } + } + } + + if (world_rank == 0) { + printf("---------------------\nResults from %s(%s x %zu = %zu or %s):\n", + mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"), + total_num_elements, payload_size_actual, human_bytes(payload_size_actual)); + } + if (blocking) { + if( MPI_INT == dtype ) { + MPI_Scatter(my_int_send_vector, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatter(my_dc_send_vector, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD); + } + } + else { + if( MPI_INT == dtype ) { + MPI_Iscatter(my_int_send_vector, (int)send_count, dtype, + my_int_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD, &request); + } else { + MPI_Iscatter(my_dc_send_vector, (int)send_count, dtype, + my_dc_recv_vector, (int)recv_count, dtype, + 0, MPI_COMM_WORLD, &request); + } + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + + /* + * Check results. + */ + exp = 0; + for(i = 0; i < recv_count; ++i) { + // Rank + counter + exp = (int)(world_rank + i); + if( MPI_INT == dtype ) { + if(my_int_recv_vector[i] != exp) { + ++num_wrong; + } + } else { + if(my_dc_recv_vector[i] != 1.0*exp - 1.0*exp*I) { + ++num_wrong; + } + } + } + + if( 0 == num_wrong) { + printf("Rank %2d: PASSED\n", world_rank); + } else { + printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank, + num_wrong, recv_count, ((num_wrong * 1.0)/recv_count)*100.0); + ret = 1; + } + + if( NULL != my_int_recv_vector ) { + free(my_int_recv_vector); + } + if( NULL != my_int_send_vector ){ + free(my_int_send_vector); + } + if( NULL != my_dc_recv_vector ) { + free(my_dc_recv_vector); + } + if( NULL != my_dc_send_vector ){ + free(my_dc_send_vector); + } + fflush(NULL); + MPI_Barrier(MPI_COMM_WORLD); + + return ret; +} diff --git a/collective-big-count/test_scatterv.c b/collective-big-count/test_scatterv.c new file mode 100644 index 0000000..9c18f8a --- /dev/null +++ b/collective-big-count/test_scatterv.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2021-2022 IBM Corporation. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <mpi.h>
+#include "common.h"
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, int mode, bool blocking);
+
+int main(int argc, char** argv) {
+    /*
+     * Initialize the MPI environment
+     */
+    int ret = 0;
+
+    MPI_Init(NULL, NULL);
+    init_environment(argc, argv);
+
+    // Run the tests
+#ifndef TEST_UNIFORM_COUNT
+    // Each rank contributes: V_SIZE_INT / world_size elements
+    // Largest buffer is    : V_SIZE_INT elements
+    ret += my_c_test_core(MPI_INT, V_SIZE_INT, MODE_PACKED, true);
+    // Adjust these to be V_SIZE_INT - displacement strides so it will pass
+    ret += my_c_test_core(MPI_INT,
+                          (V_SIZE_INT - disp_stride*world_size),
+                          MODE_SKIP, true);
+
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, MODE_PACKED,
+                          true);
+    // Adjust these to be V_SIZE_DOUBLE_COMPLEX - displacement strides so it will pass
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                          (V_SIZE_DOUBLE_COMPLEX - disp_stride*world_size),
+                          MODE_SKIP, true);
+    if (allow_nonblocked) {
+        ret += my_c_test_core(MPI_INT, V_SIZE_INT, MODE_PACKED, false);
+        // Adjust these to be V_SIZE_INT - displacement strides so it will pass
+        ret += my_c_test_core(MPI_INT,
+                              (V_SIZE_INT - disp_stride*world_size),
+                              MODE_SKIP, false);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, MODE_PACKED,
+                              false);
+        // Adjust these to be V_SIZE_DOUBLE_COMPLEX - displacement strides so it will pass
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                              (V_SIZE_DOUBLE_COMPLEX - disp_stride*world_size),
+                              MODE_SKIP, false);
+    }
+#else
+    size_t proposed_count;
+
+    // Each rank contributes: TEST_UNIFORM_COUNT elements
+    // Largest buffer is    : TEST_UNIFORM_COUNT x world_size
+
+    // Note: Displacement is an int, so the send buffer cannot be so large as to overflow the int
+    // As such divide by the world_size
+    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, MODE_PACKED, true);
+    // Adjust these to be proposed_count - displacement strides so it will pass
+    ret += my_c_test_core(MPI_INT,
+                          (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                          MODE_SKIP, true);
+
+    // Note: Displacement is an int, so the send buffer cannot be so large as to overflow the int
+    // As such divide by the world_size
+    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                        (size_t)world_size, 1, 1.0);
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, MODE_PACKED, true);
+    // Adjust these to be proposed_count - displacement strides so it will pass
+    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                          (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                          MODE_SKIP, true);
+    if (allow_nonblocked) {
+        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, MODE_PACKED, false);
+        // Adjust these to be proposed_count - displacement strides so it will pass
+        ret += my_c_test_core(MPI_INT,
+                              (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                              MODE_SKIP, false);
+        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT / (size_t)world_size,
+                                            (size_t)world_size, 1, 1.0);
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, MODE_PACKED,
+                              false);
+        // Adjust these to be proposed_count - displacement strides so it will pass
+        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX,
+                              (proposed_count - disp_stride*world_size) * (size_t)world_size,
+                              MODE_SKIP,
false);
+    }
+#endif
+
+    /*
+     * All done
+     */
+    MPI_Finalize();
+    return ret;
+}
+
+int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, int mode, bool blocking)
+{
+    int ret = 0;
+    size_t i;
+    MPI_Request request;
+    char *mpi_function = blocking ? "MPI_Scatterv" : "MPI_Iscatterv";
+
+    // Actual payload size as divisible by the sizeof(dt)
+    size_t payload_size_actual;
+
+    /*
+     * Initialize vector
+     */
+    int *my_int_send_vector = NULL;
+    int *my_int_recv_vector = NULL;
+    int int_exp;
+
+    double _Complex *my_dc_send_vector = NULL;
+    double _Complex *my_dc_recv_vector = NULL;
+    double _Complex dc_exp;
+
+    int *my_send_counts = NULL;
+    int *my_send_disp = NULL;
+    int recv_count = 0;
+    int d_idx, r_idx;
+    size_t last_disp, last_count;
+    size_t num_wrong = 0;
+    size_t v_size, v_rem;
+
+    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
+
+    // total_num_elements = total send count (at the root)
+    // v_size             = this rank's receive count
+    v_size = total_num_elements / world_size;
+    v_rem  = total_num_elements % world_size;
+    if (0 != v_rem && world_rank == world_size-1) {
+        v_size += v_rem;
+    }
+    assert(v_size <= INT_MAX);
+    recv_count = (int)v_size;
+
+    if (world_rank == 0) {
+        if( MODE_PACKED == mode ) {
+            /* Strategy for testing:
+             *  - Displacement should skip 0 elements producing a tightly packed buffer
+             *  - Count will be the same at all ranks
+             *  - buffer can be v_size elements in size
+             *
+             * NP = 4 and total_num_elements = 9 then the final buffer will be:
+             *   [1, 1, 2, 2, 3, 3, 4, 4, 4]
+             */
+            if( MPI_INT == dtype ) {
+                payload_size_actual = total_num_elements * sizeof(int);
+                my_int_send_vector = (int*)safe_malloc(payload_size_actual);
+            } else {
+                payload_size_actual = total_num_elements * sizeof(double _Complex);
+                my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual);
+            }
+            my_send_counts = (int*)safe_malloc(sizeof(int) * world_size);
+            my_send_disp = (int*)safe_malloc(sizeof(int) * world_size);
+            last_disp = 0;
+            last_count = v_size;
+
+            for(d_idx = 0; d_idx < world_size; ++d_idx) {
+                if (0 != v_rem && d_idx == world_size-1) {
+                    last_count += v_rem;
+                }
+                assert(last_count <= INT_MAX);
+                my_send_counts[d_idx] = (int)last_count;
+                assert(last_disp <= INT_MAX);
+                my_send_disp[d_idx] = (int)last_disp;
+                if( debug > 0 ) {
+                    printf("d_idx %3d / last_disp %9d / last_count %9d | total_count %10zu / payload_size %10zu\n",
+                           d_idx, (int)last_disp, (int)last_count, total_num_elements, payload_size_actual);
+                }
+                // Shift displacement by the count for a tightly packed buffer
+                last_disp += last_count;
+            }
+
+            r_idx = 0;
+            if( world_size > 1 ) {
+                last_disp = my_send_counts[r_idx] + my_send_disp[r_idx];
+            } else {
+                last_disp = 0;
+            }
+            if( MPI_INT == dtype ) {
+                for(i = 0; i < total_num_elements; ++i) {
+                    if( world_size > r_idx+1 && i == last_disp ) {
+                        ++r_idx;
+                        last_disp = my_send_counts[r_idx] + my_send_disp[r_idx];
+                    }
+                    my_int_send_vector[i] = 1 + r_idx;
+                }
+            } else {
+                for(i = 0; i < total_num_elements; ++i) {
+                    if( world_size > r_idx+1 && i == last_disp ) {
+                        ++r_idx;
+                        last_disp = my_send_counts[r_idx] + my_send_disp[r_idx];
+                    }
+                    my_dc_send_vector[i] = 1.0*(1+r_idx) + 1.0*(1+r_idx)*I;
+                }
+            }
+        } else {
+            /* Strategy for testing:
+             *  - Displacement should skip 2 elements before the first element and between each peer, making a small gap
+             *  - Count will be the same at all ranks (+ the remainder at the last rank)
+             *  - buffer must hold v_size elements plus the gaps for the displacements
+             *
+             * NP = 4 and total_num_elements = 9 (17 with stride) then the final buffer will be:
+             *   [-1, -1, 1, 1, -1, -1, 2,
2, -1, -1, 3, 3, -1, -1, 4, 4, 4]
+             */
+            total_num_elements += disp_stride * (size_t)world_size;
+
+            if( MPI_INT == dtype ) {
+                payload_size_actual = total_num_elements * sizeof(int);
+                my_int_send_vector = (int*)safe_malloc(payload_size_actual);
+            } else {
+                payload_size_actual = total_num_elements * sizeof(double _Complex);
+                my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual);
+            }
+            my_send_counts = (int*)safe_malloc(sizeof(int) * world_size);
+            my_send_disp = (int*)safe_malloc(sizeof(int) * world_size);
+            last_disp = disp_stride;
+            last_count = v_size;
+
+            for(d_idx = 0; d_idx < world_size; ++d_idx) {
+                if (0 != v_rem && d_idx == world_size-1) {
+                    last_count += v_rem;
+                }
+                assert(last_count <= INT_MAX);
+                my_send_counts[d_idx] = (int)last_count;
+                assert(last_disp <= INT_MAX);
+                my_send_disp[d_idx] = (int)last_disp;
+                if( debug > 0) {
+                    printf("d_idx %3d / last_disp %9d / last_count %9d | total_count %10zu / payload_size %10zu\n",
+                           d_idx, (int)last_disp, (int)last_count, total_num_elements, payload_size_actual);
+                }
+                // Shift displacement by the count plus the stride gap
+                last_disp += last_count + disp_stride;
+            }
+
+            r_idx = 0;
+            if( world_size > 1 ) {
+                last_disp = my_send_counts[r_idx] + my_send_disp[r_idx];
+            } else {
+                last_disp = 0;
+            }
+
+            if( MPI_INT == dtype ) {
+                for(i = 0; i < total_num_elements; ++i) {
+                    if( world_size > r_idx+1 && i == last_disp ) {
+                        ++r_idx;
+                        last_disp = my_send_counts[r_idx] + my_send_disp[r_idx];
+                    }
+                    if( i < my_send_disp[r_idx] ) {
+                        my_int_send_vector[i] = -1;
+                    } else {
+                        my_int_send_vector[i] = 1 + r_idx;
+                    }
+                }
+            } else {
+                for(i = 0; i < total_num_elements; ++i) {
+                    if( world_size > r_idx+1 && i == last_disp ) {
+                        ++r_idx;
+                        last_disp = my_send_counts[r_idx] + my_send_disp[r_idx];
+                    }
+                    if( i < my_send_disp[r_idx] ) {
+                        my_dc_send_vector[i] = -1.0 - 1.0*I;
+                    } else {
+                        my_dc_send_vector[i] = 1.0*(1+r_idx) + 1.0*(1+r_idx)*I;
+                    }
+                }
+            }
+        }
+    }
+
+    if( MPI_INT == dtype ) {
+        my_int_recv_vector = (int*)safe_malloc(sizeof(int) * recv_count);
+        for(i = 0; i < (size_t)recv_count; ++i) {
+            my_int_recv_vector[i] = -1;
+        }
+    } else {
+        my_dc_recv_vector = (double _Complex*)safe_malloc(sizeof(double _Complex) * recv_count);
+        for(i = 0; i < (size_t)recv_count; ++i) {
+            my_dc_recv_vector[i] = -1.0 - 1.0*I;
+        }
+    }
+
+    if (world_rank == 0) {
+        printf("---------------------\nResults from %s(%s x %zu = %zu or %s): Mode: %s\n",
+               mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"),
+               total_num_elements, payload_size_actual, human_bytes(payload_size_actual),
+               (MODE_PACKED == mode) ? "PACKED" : "SKIPPY");
+    }
+
+    if (blocking) {
+        if( MPI_INT == dtype ) {
+            MPI_Scatterv(my_int_send_vector, my_send_counts, my_send_disp, dtype,
+                         my_int_recv_vector, recv_count, dtype,
+                         0, MPI_COMM_WORLD);
+        } else {
+            MPI_Scatterv(my_dc_send_vector, my_send_counts, my_send_disp, dtype,
+                         my_dc_recv_vector, recv_count, dtype,
+                         0, MPI_COMM_WORLD);
+        }
+    }
+    else {
+        if( MPI_INT == dtype ) {
+            MPI_Iscatterv(my_int_send_vector, my_send_counts, my_send_disp, dtype,
+                          my_int_recv_vector, recv_count, dtype,
+                          0, MPI_COMM_WORLD, &request);
+        } else {
+            MPI_Iscatterv(my_dc_send_vector, my_send_counts, my_send_disp, dtype,
+                          my_dc_recv_vector, recv_count, dtype,
+                          0, MPI_COMM_WORLD, &request);
+        }
+        MPI_Wait(&request, MPI_STATUS_IGNORE);
+    }
+
+    /*
+     * Check results.
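+     * Each rank checks only its own recv_count chunk: every element should
+     * equal 1 + world_rank (int) or (1 + world_rank)*(1.0 + 1.0*I)
+     * (double _Complex), regardless of PACKED vs SKIP mode.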
+     */
+    int_exp = 1 + world_rank;
+    dc_exp  = 1.0*int_exp + 1.0*int_exp*I;
+    // Both PACKED and SKIP modes deliver the same local chunk, so a single loop covers both
+    for(i = 0; i < (size_t)recv_count; ++i) {
+        if( MPI_INT == dtype ) {
+            if( debug > 1) {
+                printf("%2d CHECK: %2zu : %3d vs %3d\n",
+                       world_rank, i, my_int_recv_vector[i], int_exp);
+            }
+            if(my_int_recv_vector[i] != int_exp) {
+                ++num_wrong;
+            }
+        } else {
+            if( debug > 1) {
+                printf("%2d CHECK: %2zu : (%14.0f,%14.0fi) vs (%14.0f,%14.0fi)\n",
+                       world_rank, i, creal(my_dc_recv_vector[i]), cimag(my_dc_recv_vector[i]), creal(dc_exp), cimag(dc_exp));
+            }
+            if(my_dc_recv_vector[i] != dc_exp) {
+                ++num_wrong;
+            }
+        }
+    }
+
+    if( 0 == num_wrong) {
+        printf("Rank %2d: PASSED\n", world_rank);
+    } else {
+        printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank,
+               num_wrong, (size_t)recv_count, ((num_wrong * 1.0)/recv_count)*100.0);
+        ret = 1;
+    }
+
+    if( NULL != my_int_send_vector ) {
+        free(my_int_send_vector);
+    }
+    if( NULL != my_int_recv_vector ){
+        free(my_int_recv_vector);
+    }
+    if( NULL != my_dc_send_vector ) {
+        free(my_dc_send_vector);
+    }
+    if( NULL != my_dc_recv_vector ){
+        free(my_dc_recv_vector);
+    }
+    fflush(NULL);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    return ret;
+}