33 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
44 * University Research and Technology
55 * Corporation. All rights reserved.
6- * Copyright (c) 2004-2016 The University of Tennessee and The University
6+ * Copyright (c) 2004-2019 The University of Tennessee and The University
77 * of Tennessee Research Foundation. All rights
88 * reserved.
99 * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
5353#endif /* defined(CHECKSUM) */
5454
5555
56- #define IOVEC_MEM_LIMIT 8192
57-
5856/* The contig versions do not use the stack. They can easily retrieve
5957 * the status with just the information from pConvertor->bConverted.
6058 */
@@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
6866 unsigned char * source_base = NULL ;
6967 uint32_t iov_count ;
7068 size_t length = pConv -> local_size - pConv -> bConverted , initial_amount = pConv -> bConverted ;
71- ptrdiff_t initial_displ = pConv -> use_desc -> desc [pConv -> use_desc -> used ].end_loop .first_elem_disp ;
7269
73- source_base = (pConv -> pBaseBuf + initial_displ + pStack [0 ].disp + pStack [1 ].disp );
70+ source_base = (pConv -> pBaseBuf + pConv -> pDesc -> true_lb + pStack [0 ].disp + pStack [1 ].disp );
7471
7572 /* There are some optimizations that can be done if the upper level
7673 * does not provide a buffer.
@@ -111,155 +108,123 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
111108 uint32_t * out_size ,
112109 size_t * max_data )
113110{
111+ size_t remaining , length , initial_bytes_converted = pConv -> bConverted ;
114112 const opal_datatype_t * pData = pConv -> pDesc ;
115113 dt_stack_t * stack = pConv -> pStack ;
114+ ptrdiff_t extent = pData -> ub - pData -> lb ;
116115 unsigned char * user_memory , * packed_buffer ;
117- uint32_t iov_count , index ;
116+ uint32_t idx = 0 ;
118117 size_t i ;
119- size_t bConverted , remaining , length , initial_bytes_converted = pConv -> bConverted ;
120- ptrdiff_t extent = pData -> ub - pData -> lb ;
121- ptrdiff_t initial_displ = pConv -> use_desc -> desc [pConv -> use_desc -> used ].end_loop .first_elem_disp ;
122118
119+ /* The memory layout is contiguous with gaps at the beginning and at the end. The datatype true_lb
120+ * is the initial displacement, the size is the length of the contiguous area, and the extent represents
121+ * how much we should jump between elements.
122+ */
123123 assert ( (pData -> flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) && ((ptrdiff_t )pData -> size != extent ) );
124124 DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n" ,
125125 (void * )pConv -> pBaseBuf , * out_size ); );
126126 if ( stack [1 ].type != opal_datatype_uint1 .id ) {
127127 stack [1 ].count *= opal_datatype_basicDatatypes [stack [1 ].type ]-> size ;
128128 stack [1 ].type = opal_datatype_uint1 .id ;
129129 }
130+ /* We can provide directly the pointers in the user buffers (like the convertor_raw) */
131+ if ( NULL == iov [0 ].iov_base ) {
132+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
133+ if ( stack [1 ].count != pData -> size ) {
134+ iov [idx ].iov_base = user_memory ;
135+ iov [idx ].iov_len = stack [1 ].count ;
136+ COMPUTE_CSUM ( iov [idx ].iov_base , iov [idx ].iov_len , pConv );
137+ stack [0 ].count -- ; /* update the first stack position */
138+ stack [0 ].disp += extent ;
139+ stack [1 ].count = pData -> size ; /* for safety */
140+ stack [1 ].disp = 0 ;
141+ idx ++ ; /* update next iovec */
142+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp ;
143+ pConv -> bConverted += stack [1 ].count ;
144+ }
145+ for ( ; (idx < (* out_size )) && stack [0 ].count ; idx ++ ) {
146+ iov [idx ].iov_base = user_memory ;
147+ iov [idx ].iov_len = pData -> size ;
148+ COMPUTE_CSUM ( iov [idx ].iov_base , iov [idx ].iov_len , pConv );
149+ stack [0 ].count -- ;
150+ stack [0 ].disp += extent ;
151+ user_memory += extent ;
152+ pConv -> bConverted += pData -> size ;
153+ }
154+ goto update_status_and_return ;
155+ }
130156
131- /* There are some optimizations that can be done if the upper level
132- * does not provide a buffer.
133- */
134- for ( iov_count = 0 ; iov_count < (* out_size ); iov_count ++ ) {
157+ for ( idx = 0 ; idx < (* out_size ); idx ++ ) {
135158 /* Limit the amount of packed data to the data left over on this convertor */
136159 remaining = pConv -> local_size - pConv -> bConverted ;
137160 if ( 0 == remaining ) break ; /* we're done this time */
138- if ( remaining > iov [iov_count ].iov_len )
139- remaining = iov [iov_count ].iov_len ;
140- packed_buffer = (unsigned char * )iov [iov_count ].iov_base ;
141- bConverted = remaining ; /* how much will get unpacked this time */
142- user_memory = pConv -> pBaseBuf + initial_displ + stack [0 ].disp + stack [1 ].disp ;
143- i = pConv -> count - stack [0 ].count ; /* how many we already packed */
144- assert (i == (pConv -> bConverted / pData -> size ));
145-
146- if ( packed_buffer == NULL ) {
147- /* special case for small data. We avoid allocating memory if we
148- * can fill the iovec directly with the address of the remaining
149- * data.
150- */
151- if ( stack -> count < (size_t )((* out_size ) - iov_count ) ) {
152- stack [1 ].count = pData -> size - (pConv -> bConverted % pData -> size );
153- for ( index = iov_count ; i < pConv -> count ; i ++ , index ++ ) {
154- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
155- iov [index ].iov_len = stack [1 ].count ;
156- stack [0 ].disp += extent ;
157- pConv -> bConverted += stack [1 ].count ;
158- stack [1 ].disp = 0 ; /* reset it for the next round */
159- stack [1 ].count = pData -> size ;
160- user_memory = pConv -> pBaseBuf + initial_displ + stack [0 ].disp ;
161- COMPUTE_CSUM ( iov [index ].iov_base , iov [index ].iov_len , pConv );
162- }
163- * out_size = iov_count + index ;
164- * max_data = (pConv -> bConverted - initial_bytes_converted );
165- pConv -> flags |= CONVERTOR_COMPLETED ;
166- return 1 ; /* we're done */
167- }
168- /* now special case for big contiguous data with gaps around */
169- if ( pData -> size >= IOVEC_MEM_LIMIT ) {
170- /* as we dont have to copy any data, we can simply fill the iovecs
171- * with data from the user data description.
172- */
173- for ( index = iov_count ; (i < pConv -> count ) && (index < (* out_size ));
174- i ++ , index ++ ) {
175- if ( remaining < pData -> size ) {
176- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
177- iov [index ].iov_len = remaining ;
178- remaining = 0 ;
179- COMPUTE_CSUM ( iov [index ].iov_base , iov [index ].iov_len , pConv );
180- break ;
181- } else {
182- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
183- iov [index ].iov_len = pData -> size ;
184- user_memory += extent ;
185- COMPUTE_CSUM ( iov [index ].iov_base , (size_t )iov [index ].iov_len , pConv );
186- }
187- remaining -= iov [index ].iov_len ;
188- pConv -> bConverted += iov [index ].iov_len ;
189- }
190- * out_size = index ;
191- * max_data = (pConv -> bConverted - initial_bytes_converted );
192- if ( pConv -> bConverted == pConv -> local_size ) {
193- pConv -> flags |= CONVERTOR_COMPLETED ;
194- return 1 ;
195- }
196- return 0 ;
161+ if ( remaining > iov [idx ].iov_len )
162+ remaining = iov [idx ].iov_len ;
163+ packed_buffer = (unsigned char * )iov [idx ].iov_base ;
164+ pConv -> bConverted += remaining ;
165+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
166+
167+ DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n" ,
168+ (void * )user_memory , (void * )packed_buffer , remaining ); );
169+
170+ length = (0 == pConv -> stack_pos ? 0 : stack [1 ].count ); /* left over from the last pack */
171+ /* data left from last round and enough space in the buffer */
172+ if ( (pData -> size != length ) && (length <= remaining )) {
173+ /* copy the partial left-over from the previous round */
174+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , length , pConv -> pBaseBuf ,
175+ pData , pConv -> count );
176+ DO_DEBUG ( opal_output ( 0 , "pack dest %p src %p length %" PRIsize_t " [prologue]\n" ,
177+ (void * )user_memory , (void * )packed_buffer , length ); );
178+ MEMCPY_CSUM ( packed_buffer , user_memory , length , pConv );
179+ packed_buffer += length ;
180+ remaining -= length ;
181+ stack [1 ].count -= length ;
182+ stack [1 ].disp += length ; /* just in case, we overwrite this below */
183+ if ( 0 == stack [1 ].count ) { /* one completed element */
184+ stack [0 ].count -- ;
185+ stack [0 ].disp += extent ;
186+ if ( 0 == stack [0 ].count ) /* no elements left: we are done */
187+ break ;
188+ stack [1 ].count = pData -> size ;
189+ stack [1 ].disp = 0 ;
197190 }
191+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
198192 }
199193
200- {
201- DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n" ,
202- (void * )user_memory , (void * )packed_buffer , (unsigned long )remaining ); );
203-
204- length = (0 == pConv -> stack_pos ? 0 : stack [1 ].count ); /* left over from the last pack */
205- /* data left from last round and enough space in the buffer */
206- if ( (0 != length ) && (length <= remaining )) {
207- /* copy the partial left-over from the previous round */
208- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , length , pConv -> pBaseBuf ,
209- pData , pConv -> count );
210- DO_DEBUG ( opal_output ( 0 , "2. pack dest %p src %p length %lu\n" ,
211- (void * )user_memory , (void * )packed_buffer , (unsigned long )length ); );
212- MEMCPY_CSUM ( packed_buffer , user_memory , length , pConv );
213- packed_buffer += length ;
214- user_memory += (extent - pData -> size + length );
215- remaining -= length ;
216- stack [1 ].count -= length ;
217- if ( 0 == stack [1 ].count ) { /* one completed element */
218- stack [0 ].count -- ;
219- stack [0 ].disp += extent ;
220- if ( 0 != stack [0 ].count ) { /* not yet done */
221- stack [1 ].count = pData -> size ;
222- stack [1 ].disp = 0 ;
223- }
224- }
225- }
226- for ( i = 0 ; pData -> size <= remaining ; i ++ ) {
227- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , pData -> size , pConv -> pBaseBuf ,
228- pData , pConv -> count );
229- DO_DEBUG ( opal_output ( 0 , "3. pack dest %p src %p length %lu\n" ,
230- (void * )user_memory , (void * )packed_buffer , (unsigned long )pData -> size ); );
231- MEMCPY_CSUM ( packed_buffer , user_memory , pData -> size , pConv );
232- packed_buffer += pData -> size ;
233- user_memory += extent ;
234- remaining -= pData -> size ;
235- }
236- stack [0 ].count -= i ; /* the filled up and the entire types */
237- stack [0 ].disp += (i * extent );
238- stack [1 ].disp += remaining ;
239- /* Copy the last bits */
240- if ( 0 != remaining ) {
241- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , remaining , pConv -> pBaseBuf ,
242- pData , pConv -> count );
243- DO_DEBUG ( opal_output ( 0 , "4. pack dest %p src %p length %lu\n" ,
244- (void * )user_memory , (void * )packed_buffer , (unsigned long )remaining ); );
245- MEMCPY_CSUM ( packed_buffer , user_memory , remaining , pConv );
246- user_memory += remaining ;
247- stack [1 ].count -= remaining ;
248- }
194+ for ( i = 0 ; pData -> size <= remaining ; i ++ ) {
195+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , pData -> size , pConv -> pBaseBuf ,
196+ pData , pConv -> count );
197+ DO_DEBUG ( opal_output ( 0 , "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n" ,
198+ (void * )user_memory , (void * )packed_buffer , pData -> size , remaining , iov [idx ].iov_len ); );
199+ MEMCPY_CSUM ( packed_buffer , user_memory , pData -> size , pConv );
200+ packed_buffer += pData -> size ;
201+ user_memory += extent ;
202+ remaining -= pData -> size ;
203+ }
204+ stack [0 ].count -= i ; /* the entire datatype copied above */
205+ stack [0 ].disp += (i * extent );
206+
207+ /* Copy the last bits */
208+ if ( 0 != remaining ) {
209+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , remaining , pConv -> pBaseBuf ,
210+ pData , pConv -> count );
211+ DO_DEBUG ( opal_output ( 0 , "4. pack dest %p src %p length %" PRIsize_t "\n" ,
212+ (void * )user_memory , (void * )packed_buffer , remaining ); );
213+ MEMCPY_CSUM ( packed_buffer , user_memory , remaining , pConv );
214+ stack [1 ].count -= remaining ;
215+ stack [1 ].disp += remaining ; /* keep the += in case we are copying less than the datatype size */
249216 if ( 0 == stack [1 ].count ) { /* prepare for the next element */
250217 stack [1 ].count = pData -> size ;
251218 stack [1 ].disp = 0 ;
252219 }
253220 }
254- pConv -> bConverted += bConverted ;
255221 }
256- * out_size = iov_count ;
257- * max_data = (pConv -> bConverted - initial_bytes_converted );
258- if ( pConv -> bConverted == pConv -> local_size ) {
259- pConv -> flags |= CONVERTOR_COMPLETED ;
260- return 1 ;
261- }
262- return 0 ;
222+
223+ update_status_and_return :
224+ * out_size = idx ;
225+ * max_data = pConv -> bConverted - initial_bytes_converted ;
226+ if ( pConv -> bConverted == pConv -> local_size ) pConv -> flags |= CONVERTOR_COMPLETED ;
227+ return !!(pConv -> flags & CONVERTOR_COMPLETED ); /* done or not */
263228}
264229
265230/* The pack/unpack functions need a cleanup. I have to create a proper interface to access
0 commit comments