3
3
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
4
4
* University Research and Technology
5
5
* Corporation. All rights reserved.
6
- * Copyright (c) 2004-2016 The University of Tennessee and The University
6
+ * Copyright (c) 2004-2019 The University of Tennessee and The University
7
7
* of Tennessee Research Foundation. All rights
8
8
* reserved.
9
9
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
53
53
#endif /* defined(CHECKSUM) */
54
54
55
55
56
- #define IOVEC_MEM_LIMIT 8192
57
-
58
56
/* The contig versions do not use the stack. They can easily retrieve
59
57
* the status with just the information from pConvertor->bConverted.
60
58
*/
@@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
68
66
unsigned char * source_base = NULL ;
69
67
uint32_t iov_count ;
70
68
size_t length = pConv -> local_size - pConv -> bConverted , initial_amount = pConv -> bConverted ;
71
- ptrdiff_t initial_displ = pConv -> use_desc -> desc [pConv -> use_desc -> used ].end_loop .first_elem_disp ;
72
69
73
- source_base = (pConv -> pBaseBuf + initial_displ + pStack [0 ].disp + pStack [1 ].disp );
70
+ source_base = (pConv -> pBaseBuf + pConv -> pDesc -> true_lb + pStack [0 ].disp + pStack [1 ].disp );
74
71
75
72
/* There are some optimizations that can be done if the upper level
76
73
* does not provide a buffer.
@@ -111,155 +108,123 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
111
108
uint32_t * out_size ,
112
109
size_t * max_data )
113
110
{
111
+ size_t remaining , length , initial_bytes_converted = pConv -> bConverted ;
114
112
const opal_datatype_t * pData = pConv -> pDesc ;
115
113
dt_stack_t * stack = pConv -> pStack ;
114
+ ptrdiff_t extent = pData -> ub - pData -> lb ;
116
115
unsigned char * user_memory , * packed_buffer ;
117
- uint32_t iov_count , index ;
116
+ uint32_t idx = 0 ;
118
117
size_t i ;
119
- size_t bConverted , remaining , length , initial_bytes_converted = pConv -> bConverted ;
120
- ptrdiff_t extent = pData -> ub - pData -> lb ;
121
- ptrdiff_t initial_displ = pConv -> use_desc -> desc [pConv -> use_desc -> used ].end_loop .first_elem_disp ;
122
118
119
+ /* The memory layout is contiguous with gaps in the beginning and at the end. The datatype true_lb
120
+ * is the initial displacement, the size is the length of the contiguous area, and the extent represents
121
+ * how much we should jump between elements.
122
+ */
123
123
assert ( (pData -> flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) && ((ptrdiff_t )pData -> size != extent ) );
124
124
DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n" ,
125
125
(void * )pConv -> pBaseBuf , * out_size ); );
126
126
if ( stack [1 ].type != opal_datatype_uint1 .id ) {
127
127
stack [1 ].count *= opal_datatype_basicDatatypes [stack [1 ].type ]-> size ;
128
128
stack [1 ].type = opal_datatype_uint1 .id ;
129
129
}
130
+ /* We can provide directly the pointers in the user buffers (like the convertor_raw) */
131
+ if ( NULL == iov [0 ].iov_base ) {
132
+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
133
+ if ( stack [1 ].count != pData -> size ) {
134
+ iov [idx ].iov_base = user_memory ;
135
+ iov [idx ].iov_len = stack [1 ].count ;
136
+ COMPUTE_CSUM ( iov [idx ].iov_base , iov [idx ].iov_len , pConv );
137
+ stack [0 ].count -- ; /* update the first stack position */
138
+ stack [0 ].disp += extent ;
139
+ stack [1 ].count = pData -> size ; /* for safety */
140
+ stack [1 ].disp = 0 ;
141
+ idx ++ ; /* update next iovec */
142
+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp ;
143
+ pConv -> bConverted += stack [1 ].count ;
144
+ }
145
+ for ( ; (idx < (* out_size )) && stack [0 ].count ; idx ++ ) {
146
+ iov [idx ].iov_base = user_memory ;
147
+ iov [idx ].iov_len = pData -> size ;
148
+ COMPUTE_CSUM ( iov [idx ].iov_base , iov [idx ].iov_len , pConv );
149
+ stack [0 ].count -- ;
150
+ stack [0 ].disp += extent ;
151
+ user_memory += extent ;
152
+ pConv -> bConverted += pData -> size ;
153
+ }
154
+ goto update_status_and_return ;
155
+ }
130
156
131
- /* There are some optimizations that can be done if the upper level
132
- * does not provide a buffer.
133
- */
134
- for ( iov_count = 0 ; iov_count < (* out_size ); iov_count ++ ) {
157
+ for ( idx = 0 ; idx < (* out_size ); idx ++ ) {
135
158
/* Limit the amount of packed data to the data left over on this convertor */
136
159
remaining = pConv -> local_size - pConv -> bConverted ;
137
160
if ( 0 == remaining ) break ; /* we're done this time */
138
- if ( remaining > iov [iov_count ].iov_len )
139
- remaining = iov [iov_count ].iov_len ;
140
- packed_buffer = (unsigned char * )iov [iov_count ].iov_base ;
141
- bConverted = remaining ; /* how much will get unpacked this time */
142
- user_memory = pConv -> pBaseBuf + initial_displ + stack [0 ].disp + stack [1 ].disp ;
143
- i = pConv -> count - stack [0 ].count ; /* how many we already packed */
144
- assert (i == (pConv -> bConverted / pData -> size ));
145
-
146
- if ( packed_buffer == NULL ) {
147
- /* special case for small data. We avoid allocating memory if we
148
- * can fill the iovec directly with the address of the remaining
149
- * data.
150
- */
151
- if ( stack -> count < (size_t )((* out_size ) - iov_count ) ) {
152
- stack [1 ].count = pData -> size - (pConv -> bConverted % pData -> size );
153
- for ( index = iov_count ; i < pConv -> count ; i ++ , index ++ ) {
154
- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
155
- iov [index ].iov_len = stack [1 ].count ;
156
- stack [0 ].disp += extent ;
157
- pConv -> bConverted += stack [1 ].count ;
158
- stack [1 ].disp = 0 ; /* reset it for the next round */
159
- stack [1 ].count = pData -> size ;
160
- user_memory = pConv -> pBaseBuf + initial_displ + stack [0 ].disp ;
161
- COMPUTE_CSUM ( iov [index ].iov_base , iov [index ].iov_len , pConv );
162
- }
163
- * out_size = iov_count + index ;
164
- * max_data = (pConv -> bConverted - initial_bytes_converted );
165
- pConv -> flags |= CONVERTOR_COMPLETED ;
166
- return 1 ; /* we're done */
167
- }
168
- /* now special case for big contiguous data with gaps around */
169
- if ( pData -> size >= IOVEC_MEM_LIMIT ) {
170
- /* as we dont have to copy any data, we can simply fill the iovecs
171
- * with data from the user data description.
172
- */
173
- for ( index = iov_count ; (i < pConv -> count ) && (index < (* out_size ));
174
- i ++ , index ++ ) {
175
- if ( remaining < pData -> size ) {
176
- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
177
- iov [index ].iov_len = remaining ;
178
- remaining = 0 ;
179
- COMPUTE_CSUM ( iov [index ].iov_base , iov [index ].iov_len , pConv );
180
- break ;
181
- } else {
182
- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
183
- iov [index ].iov_len = pData -> size ;
184
- user_memory += extent ;
185
- COMPUTE_CSUM ( iov [index ].iov_base , (size_t )iov [index ].iov_len , pConv );
186
- }
187
- remaining -= iov [index ].iov_len ;
188
- pConv -> bConverted += iov [index ].iov_len ;
189
- }
190
- * out_size = index ;
191
- * max_data = (pConv -> bConverted - initial_bytes_converted );
192
- if ( pConv -> bConverted == pConv -> local_size ) {
193
- pConv -> flags |= CONVERTOR_COMPLETED ;
194
- return 1 ;
195
- }
196
- return 0 ;
161
+ if ( remaining > iov [idx ].iov_len )
162
+ remaining = iov [idx ].iov_len ;
163
+ packed_buffer = (unsigned char * )iov [idx ].iov_base ;
164
+ pConv -> bConverted += remaining ;
165
+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
166
+
167
+ DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n" ,
168
+ (void * )user_memory , (void * )packed_buffer , remaining ); );
169
+
170
+ length = (0 == pConv -> stack_pos ? 0 : stack [1 ].count ); /* left over from the last pack */
171
+ /* data left from last round and enough space in the buffer */
172
+ if ( (pData -> size != length ) && (length <= remaining )) {
173
+ /* copy the partial left-over from the previous round */
174
+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , length , pConv -> pBaseBuf ,
175
+ pData , pConv -> count );
176
+ DO_DEBUG ( opal_output ( 0 , "pack dest %p src %p length %" PRIsize_t " [prologue]\n" ,
177
+ (void * )user_memory , (void * )packed_buffer , length ); );
178
+ MEMCPY_CSUM ( packed_buffer , user_memory , length , pConv );
179
+ packed_buffer += length ;
180
+ remaining -= length ;
181
+ stack [1 ].count -= length ;
182
+ stack [1 ].disp += length ; /* just in case, we overwrite this below */
183
+ if ( 0 == stack [1 ].count ) { /* one completed element */
184
+ stack [0 ].count -- ;
185
+ stack [0 ].disp += extent ;
186
+ if ( 0 == stack [0 ].count ) /* not yet done */
187
+ break ;
188
+ stack [1 ].count = pData -> size ;
189
+ stack [1 ].disp = 0 ;
197
190
}
191
+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
198
192
}
199
193
200
- {
201
- DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n" ,
202
- (void * )user_memory , (void * )packed_buffer , (unsigned long )remaining ); );
203
-
204
- length = (0 == pConv -> stack_pos ? 0 : stack [1 ].count ); /* left over from the last pack */
205
- /* data left from last round and enough space in the buffer */
206
- if ( (0 != length ) && (length <= remaining )) {
207
- /* copy the partial left-over from the previous round */
208
- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , length , pConv -> pBaseBuf ,
209
- pData , pConv -> count );
210
- DO_DEBUG ( opal_output ( 0 , "2. pack dest %p src %p length %lu\n" ,
211
- (void * )user_memory , (void * )packed_buffer , (unsigned long )length ); );
212
- MEMCPY_CSUM ( packed_buffer , user_memory , length , pConv );
213
- packed_buffer += length ;
214
- user_memory += (extent - pData -> size + length );
215
- remaining -= length ;
216
- stack [1 ].count -= length ;
217
- if ( 0 == stack [1 ].count ) { /* one completed element */
218
- stack [0 ].count -- ;
219
- stack [0 ].disp += extent ;
220
- if ( 0 != stack [0 ].count ) { /* not yet done */
221
- stack [1 ].count = pData -> size ;
222
- stack [1 ].disp = 0 ;
223
- }
224
- }
225
- }
226
- for ( i = 0 ; pData -> size <= remaining ; i ++ ) {
227
- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , pData -> size , pConv -> pBaseBuf ,
228
- pData , pConv -> count );
229
- DO_DEBUG ( opal_output ( 0 , "3. pack dest %p src %p length %lu\n" ,
230
- (void * )user_memory , (void * )packed_buffer , (unsigned long )pData -> size ); );
231
- MEMCPY_CSUM ( packed_buffer , user_memory , pData -> size , pConv );
232
- packed_buffer += pData -> size ;
233
- user_memory += extent ;
234
- remaining -= pData -> size ;
235
- }
236
- stack [0 ].count -= i ; /* the filled up and the entire types */
237
- stack [0 ].disp += (i * extent );
238
- stack [1 ].disp += remaining ;
239
- /* Copy the last bits */
240
- if ( 0 != remaining ) {
241
- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , remaining , pConv -> pBaseBuf ,
242
- pData , pConv -> count );
243
- DO_DEBUG ( opal_output ( 0 , "4. pack dest %p src %p length %lu\n" ,
244
- (void * )user_memory , (void * )packed_buffer , (unsigned long )remaining ); );
245
- MEMCPY_CSUM ( packed_buffer , user_memory , remaining , pConv );
246
- user_memory += remaining ;
247
- stack [1 ].count -= remaining ;
248
- }
194
+ for ( i = 0 ; pData -> size <= remaining ; i ++ ) {
195
+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , pData -> size , pConv -> pBaseBuf ,
196
+ pData , pConv -> count );
197
+ DO_DEBUG ( opal_output ( 0 , "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n" ,
198
+ (void * )user_memory , (void * )packed_buffer , pData -> size , remaining , iov [idx ].iov_len ); );
199
+ MEMCPY_CSUM ( packed_buffer , user_memory , pData -> size , pConv );
200
+ packed_buffer += pData -> size ;
201
+ user_memory += extent ;
202
+ remaining -= pData -> size ;
203
+ }
204
+ stack [0 ].count -= i ; /* the entire datatype copied above */
205
+ stack [0 ].disp += (i * extent );
206
+
207
+ /* Copy the last bits */
208
+ if ( 0 != remaining ) {
209
+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , remaining , pConv -> pBaseBuf ,
210
+ pData , pConv -> count );
211
+ DO_DEBUG ( opal_output ( 0 , "4. pack dest %p src %p length %" PRIsize_t "\n" ,
212
+ (void * )user_memory , (void * )packed_buffer , remaining ); );
213
+ MEMCPY_CSUM ( packed_buffer , user_memory , remaining , pConv );
214
+ stack [1 ].count -= remaining ;
215
+ stack [1 ].disp += remaining ; /* keep the += in case we are copying less that the datatype size */
249
216
if ( 0 == stack [1 ].count ) { /* prepare for the next element */
250
217
stack [1 ].count = pData -> size ;
251
218
stack [1 ].disp = 0 ;
252
219
}
253
220
}
254
- pConv -> bConverted += bConverted ;
255
221
}
256
- * out_size = iov_count ;
257
- * max_data = (pConv -> bConverted - initial_bytes_converted );
258
- if ( pConv -> bConverted == pConv -> local_size ) {
259
- pConv -> flags |= CONVERTOR_COMPLETED ;
260
- return 1 ;
261
- }
262
- return 0 ;
222
+
223
+ update_status_and_return :
224
+ * out_size = idx ;
225
+ * max_data = pConv -> bConverted - initial_bytes_converted ;
226
+ if ( pConv -> bConverted == pConv -> local_size ) pConv -> flags |= CONVERTOR_COMPLETED ;
227
+ return !!(pConv -> flags & CONVERTOR_COMPLETED ); /* done or not */
263
228
}
264
229
265
230
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access
0 commit comments