@@ -672,34 +672,46 @@ static inline int
/*
 * Make one pass over the endpoints queued on ugni_module->ep_wait_list and
 * call mca_btl_ugni_progress_send_wait_list() on each of them.
 *
 * NOTE(review): this text is a unified diff with the old and new line numbers
 * fused into every line. Lines marked with a minus sign are the pre-change
 * code, lines marked with a plus sign are the post-change code. The comments
 * added here describe the post-change version only.
 *
 * Post-change flow:
 *   1. Return 0 right away if the wait list looks empty (size read without
 *      taking ep_wait_list_lock).
 *   2. Under ep_wait_list_lock, move the wait-list elements into a local
 *      temporary list, then release the lock.
 *   3. Process the temporary list without holding the lock. Endpoints whose
 *      helper call does not return OPAL_SUCCESS are appended back to the
 *      temporary list; the others get wait_listed cleared.
 *   4. If anything is left, re-take the lock and join the leftovers onto the
 *      end of the wait list.
 *
 * ugni_module: module whose ep_wait_list and ep_wait_list_lock are used.
 *
 * Returns 0 when the unlocked size check sees an empty list. Otherwise returns
 * rc, which holds the result of the last mca_btl_ugni_progress_send_wait_list()
 * call, or its initial value OPAL_SUCCESS if no endpoint was processed.
 * NOTE(review): results of earlier endpoints in the same pass are overwritten
 * and therefore not reflected in the return value - confirm this is intended.
 * NOTE(review): mca_btl_ugni_progress_send_wait_list() is not visible here;
 * presumably it retries the sends queued on the endpoint - verify.
 */
672672mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t * ugni_module )
673673{
674674 int rc = OPAL_SUCCESS ;
675+ opal_list_t tmplist ;
676+ opal_list_t * waitlist = & ugni_module -> ep_wait_list ;
675677 mca_btl_base_endpoint_t * endpoint = NULL ;
676678 int count ;
677679
678- if (0 == opal_list_get_size (& ugni_module -> ep_wait_list )) {
679- return 0 ;
680- }
681-
682680 /* check the count before taking the lock to avoid unnecessary locking */
683- count = opal_list_get_size (& ugni_module -> ep_wait_list );
681+ count = opal_list_get_size (waitlist );
684682 if (0 == count ) {
685683 return 0 ;
686684 }
687685
686+ /* Don't hold the wait-list lock while processing the list as that may lead
687+ * to a deadlock.
688+ * Instead, move the wait_list elements into a temporary list and work on that.*/
689+ OBJ_CONSTRUCT (& tmplist , opal_list_t );
688690 OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
689- count = opal_list_get_size (& ugni_module -> ep_wait_list );
691+ opal_list_join (& tmplist , opal_list_get_end (& tmplist ), waitlist );
692+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
/* Snapshot the number of endpoints taken over. The loop below decrements this
 * counter once per iteration, so it runs at most that many times and an
 * endpoint appended back to the tail is not retried within the same call. */
693+ count = opal_list_get_size (& tmplist );
/* do/while runs at least once; if the temporary list is empty here,
 * opal_list_remove_first yields NULL and the loop ends immediately. */
690694 do {
691- endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& ugni_module -> ep_wait_list );
695+ endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& tmplist );
692696 if (endpoint != NULL ) {
693697 rc = mca_btl_ugni_progress_send_wait_list (endpoint );
694698
695699 if (OPAL_SUCCESS != rc ) {
/* Not finished: keep the endpoint queued (post-change: on the temporary
 * list, which is merged back into the wait list after the loop). */
696- opal_list_append (& ugni_module -> ep_wait_list , & endpoint -> super );
700+ opal_list_append (& tmplist , & endpoint -> super );
697701 } else {
/* Finished: the endpoint is no longer on any wait list. */
698702 endpoint -> wait_listed = false;
699703 }
700704 }
701705 } while (endpoint != NULL && -- count > 0 ) ;
702- OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
706+
707+ /* reinsert unfinished elements into the wait-list */
708+ count = opal_list_get_size (& tmplist );
/* The lock is only re-taken when there is something to put back. */
709+ if (0 < count ) {
710+ OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
711+ opal_list_join (waitlist , opal_list_get_end (waitlist ), & tmplist );
712+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
713+ }
/* Pairs with the OBJ_CONSTRUCT above. */
714+ OBJ_DESTRUCT (& tmplist );
703715
/* rc holds only the result of the last helper call made in the loop. */
704716 return rc ;
705717}
0 commit comments