Skip to content

Commit aaef492

Browse files
authored
Merge pull request #9303 from wckzhang/v5.0.x
v5.0.x common/ofi: Utilize new libfabric API to import memhooks monitor
2 parents 97a3e00 + 2697379 commit aaef492

File tree

5 files changed

+172
-6
lines changed

5 files changed

+172
-6
lines changed

config/opal_check_ofi.m4

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ dnl
33
dnl Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
44
dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
55
dnl reserved.
6+
dnl Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
7+
dnl reserved.
68
dnl $COPYRIGHT$
79
dnl
810
dnl Additional copyrights may follow
@@ -155,6 +157,18 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
155157
[AC_MSG_WARN([OFI libfabric support requested (via --with-ofi or --with-libfabric), but not found.])
156158
AC_MSG_ERROR([Cannot continue.])])
157159
])
160+
opal_ofi_import_monitor=no
161+
AS_IF([test $opal_ofi_happy = "yes"],
162+
[OPAL_CHECK_OFI_VERSION_GE([1,13],
163+
[opal_ofi_import_monitor=yes],
164+
[opal_ofi_import_monitor=no])])
165+
166+
167+
if test "$opal_ofi_import_monitor" = "yes"; then
168+
AC_DEFINE_UNQUOTED([OPAL_OFI_IMPORT_MONITOR_SUPPORT],1,
169+
[Whether libfabric supports monitor import])
170+
fi
171+
158172
])dnl
159173

160174

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
* Copyright (c) 2014-2021 Cisco Systems, Inc. All rights reserved
66
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
77
* reserved.
8-
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
9-
* Copyright (c) 2020 Triad National Security, LLC. All rights
8+
* Copyright (c) 2018-2021 Amazon.com, Inc. or its affiliates. All Rights reserved.
9+
* Copyright (c) 2020-2021 Triad National Security, LLC. All rights
1010
* reserved.
1111
* $COPYRIGHT$
1212
*
@@ -285,8 +285,7 @@ ompi_mtl_ofi_component_open(void)
285285
"provider_exclude")) {
286286
return OMPI_ERR_NOT_AVAILABLE;
287287
}
288-
289-
return OMPI_SUCCESS;
288+
return opal_common_ofi_init();
290289
}
291290

292291
static int
@@ -304,6 +303,7 @@ ompi_mtl_ofi_component_close(void)
304303
mca_common_cuda_fini();
305304
#endif
306305
opal_common_ofi_mca_deregister();
306+
opal_common_ofi_fini();
307307
return OMPI_SUCCESS;
308308
}
309309

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* reserved.
1515
* Copyright (c) 2018-2019 Intel, Inc. All rights reserved.
1616
*
17-
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
17+
* Copyright (c) 2018-2021 Amazon.com, Inc. or its affiliates. All Rights reserved.
1818
* Copyright (c) 2020 Triad National Security, LLC. All rights
1919
* reserved.
2020
* $COPYRIGHT$
@@ -199,7 +199,7 @@ static int mca_btl_ofi_component_register(void)
199199
static int mca_btl_ofi_component_open(void)
200200
{
201201
mca_btl_ofi_component.module_count = 0;
202-
return OPAL_SUCCESS;
202+
return opal_common_ofi_init();
203203
}
204204

205205
/*
@@ -208,6 +208,7 @@ static int mca_btl_ofi_component_open(void)
208208
static int mca_btl_ofi_component_close(void)
209209
{
210210
opal_common_ofi_mca_deregister();
211+
opal_common_ofi_fini();
211212
/* If we don't sleep, sockets provider freaks out. Ummm this is a scary comment */
212213
sleep(1);
213214
return OPAL_SUCCESS;

opal/mca/common/ofi/common_ofi.c

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* reserved.
77
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved
88
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
9+
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
10+
* reserved.
911
* $COPYRIGHT$
1012
*
1113
* Additional copyrights may follow
@@ -33,6 +35,120 @@ OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
3335

3436
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic";
3537
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
38+
static bool opal_common_ofi_initialized = false;
39+
static int opal_common_ofi_init_ref_cnt = 0;
40+
41+
#if OPAL_OFI_IMPORT_MONITOR_SUPPORT
42+
43+
/*
 * "start" entry of opal_common_ofi_export_ops.
 *
 * Performs no work and always reports success (0).
 */
static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor)
{
    (void) monitor; /* unused */

    return 0;
}
47+
/*
 * "stop" entry of opal_common_ofi_export_ops.
 *
 * Performs no work.
 */
static void opal_common_ofi_monitor_stop(struct fid_mem_monitor *monitor)
{
    (void) monitor; /* unused */
}
51+
/*
 * "subscribe" entry of opal_common_ofi_export_ops.
 *
 * The address range [addr, addr + len) is not recorded anywhere; the
 * function simply reports success (0).
 */
static int opal_common_ofi_monitor_subscribe(struct fid_mem_monitor *monitor,
                                             const void *addr, size_t len)
{
    (void) monitor; /* unused */
    (void) addr;    /* unused */
    (void) len;     /* unused */

    return 0;
}
56+
/*
 * "unsubscribe" entry of opal_common_ofi_export_ops.
 *
 * Performs no work; nothing was recorded by the subscribe entry.
 */
static void opal_common_ofi_monitor_unsubscribe(struct fid_mem_monitor *monitor,
                                                const void *addr, size_t len)
{
    (void) monitor; /* unused */
    (void) addr;    /* unused */
    (void) len;     /* unused */
}
61+
/*
 * "valid" entry of opal_common_ofi_export_ops.
 *
 * Unconditionally answers true for every (addr, len) range.
 */
static bool opal_common_ofi_monitor_valid(struct fid_mem_monitor *monitor,
                                          const void *addr, size_t len)
{
    (void) monitor; /* unused */
    (void) addr;    /* unused */
    (void) len;     /* unused */

    return true;
}
66+
67+
static struct fid_mem_monitor *opal_common_ofi_monitor;
68+
static struct fid *opal_common_ofi_cache_fid;
69+
static struct fi_ops_mem_monitor opal_common_ofi_export_ops = {
70+
.size = sizeof(struct fi_ops_mem_monitor),
71+
.start = opal_common_ofi_monitor_start,
72+
.stop = opal_common_ofi_monitor_stop,
73+
.subscribe = opal_common_ofi_monitor_subscribe,
74+
.unsubscribe = opal_common_ofi_monitor_unsubscribe,
75+
.valid = opal_common_ofi_monitor_valid,
76+
};
77+
78+
OPAL_DECLSPEC void opal_common_ofi_mem_release_cb(void *buf, size_t length,
79+
void *cbdata, bool from_alloc)
80+
{
81+
opal_common_ofi_monitor->import_ops->notify(opal_common_ofi_monitor,
82+
buf, length);
83+
}
84+
#endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */
85+
86+
OPAL_DECLSPEC int opal_common_ofi_init(void)
87+
{
88+
int ret;
89+
90+
opal_common_ofi_init_ref_cnt++;
91+
if (opal_common_ofi_initialized) {
92+
return OPAL_SUCCESS;
93+
}
94+
#if OPAL_OFI_IMPORT_MONITOR_SUPPORT
95+
96+
mca_base_framework_open(&opal_memory_base_framework, 0);
97+
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)
98+
!= (((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT))
99+
& opal_mem_hooks_support_level())) {
100+
return OPAL_SUCCESS;
101+
}
102+
103+
ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, &opal_common_ofi_cache_fid, NULL);
104+
if (ret) {
105+
goto err;
106+
}
107+
108+
opal_common_ofi_monitor = calloc(1, sizeof(*opal_common_ofi_monitor));
109+
if (!opal_common_ofi_monitor) {
110+
goto err;
111+
}
112+
113+
opal_common_ofi_monitor->fid.fclass = FI_CLASS_MEM_MONITOR;
114+
opal_common_ofi_monitor->export_ops = &opal_common_ofi_export_ops;
115+
ret = fi_import_fid(opal_common_ofi_cache_fid, &opal_common_ofi_monitor->fid, 0);
116+
if (ret) {
117+
goto err;
118+
}
119+
opal_mem_hooks_register_release(opal_common_ofi_mem_release_cb, NULL);
120+
opal_common_ofi_initialized = true;
121+
122+
return OPAL_SUCCESS;
123+
err:
124+
if (opal_common_ofi_cache_fid) {
125+
fi_close(opal_common_ofi_cache_fid);
126+
}
127+
if (opal_common_ofi_monitor) {
128+
free(opal_common_ofi_monitor);
129+
}
130+
131+
return OPAL_ERROR;
132+
#else
133+
opal_common_ofi_initialized = true;
134+
return OPAL_SUCCESS;
135+
#endif
136+
}
137+
138+
OPAL_DECLSPEC int opal_common_ofi_fini(void)
139+
{
140+
if (opal_common_ofi_initialized && !--opal_common_ofi_init_ref_cnt) {
141+
#if OPAL_OFI_IMPORT_MONITOR_SUPPORT
142+
opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb);
143+
fi_close(opal_common_ofi_cache_fid);
144+
fi_close(&opal_common_ofi_monitor->fid);
145+
free(opal_common_ofi_monitor);
146+
#endif
147+
opal_common_ofi_initialized = false;
148+
}
149+
150+
return OPAL_SUCCESS;
151+
}
36152

37153
OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item)
38154
{

opal/mca/common/ofi/common_ofi.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* reserved.
66
* Copyright (c) 2020 Triad National Security, LLC. All rights
77
* reserved.
8+
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
9+
* reserved.
810
*
911
* $COPYRIGHT$
1012
*
@@ -20,7 +22,11 @@
2022
#include "opal/mca/base/mca_base_framework.h"
2123
#include "opal/mca/base/mca_base_var.h"
2224
#include "opal/util/proc.h"
25+
#include "opal/memoryhooks/memory.h"
2326
#include <rdma/fabric.h>
27+
#if OPAL_OFI_IMPORT_MONITOR_SUPPORT
28+
#include <rdma/fi_ext.h>
29+
#endif
2430

2531
BEGIN_C_DECLS
2632

@@ -33,6 +39,7 @@ typedef struct opal_common_ofi_module {
3339
} opal_common_ofi_module_t;
3440

3541
extern opal_common_ofi_module_t opal_common_ofi;
42+
extern mca_base_framework_t opal_memory_base_framework;
3643

3744
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
3845
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
@@ -54,6 +61,34 @@ OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
5461
*/
5562
OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
5663

64+
#if OPAL_OFI_IMPORT_MONITOR_SUPPORT
65+
/*
66+
* @param buf (IN) Pointer to the start of the allocation
67+
* @param length (IN) Length of the allocation
68+
* @param cbdata (IN) Data passed to memory hooks when callback
69+
* was registered
70+
* @param from_alloc (IN) True if the callback is caused by a call to the
71+
* general allocation routines (malloc, calloc, free,
72+
* etc.) or directly from the user (mmap, munmap, etc.)
73+
*
74+
* Callback function triggered when memory is about to be freed.
75+
* The callback will be triggered according to
76+
* the note in opal_mem_hooks_register_release().
77+
*
78+
*/
79+
OPAL_DECLSPEC void opal_common_ofi_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc);
80+
#endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */
81+
82+
/*
83+
* Initializes common objects for libfabric
84+
*/
85+
OPAL_DECLSPEC int opal_common_ofi_init(void);
86+
87+
/*
88+
* Cleans up common objects for libfabric
89+
*/
90+
OPAL_DECLSPEC int opal_common_ofi_fini(void);
91+
5792
END_C_DECLS
5893

5994
struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list,

0 commit comments

Comments
 (0)