42
42
43
43
#include "orte/mca/errmgr/errmgr.h"
44
44
#include "orte/mca/rmaps/base/base.h"
45
+ #include "orte/mca/rml/base/rml_contact.h"
45
46
#include "orte/mca/state/state.h"
46
47
#include "orte/util/name_fns.h"
47
48
#include "orte/util/show_help.h"
@@ -537,7 +538,14 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
537
538
int rc , cnt ;
538
539
opal_pmix_pdata_t * pdat ;
539
540
orte_job_t * jdata ;
540
- opal_buffer_t buf ;
541
+ orte_node_t * node ;
542
+ orte_proc_t * proc ;
543
+ opal_buffer_t buf , bucket ;
544
+ opal_byte_object_t * bo ;
545
+ orte_process_name_t dmn , pname ;
546
+ char * uri ;
547
+ opal_value_t val ;
548
+ opal_list_t nodes ;
541
549
542
550
ORTE_ACQUIRE_OBJECT (cd );
543
551
@@ -554,6 +562,7 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
554
562
pdat = (opal_pmix_pdata_t * )opal_list_get_first (data );
555
563
if (OPAL_BYTE_OBJECT != pdat -> value .type ) {
556
564
rc = ORTE_ERR_BAD_PARAM ;
565
+ ORTE_ERROR_LOG (rc );
557
566
goto release ;
558
567
}
559
568
/* the data will consist of a packed buffer with the job data in it */
@@ -563,15 +572,107 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
563
572
pdat -> value .data .bo .size = 0 ;
564
573
cnt = 1 ;
565
574
if (OPAL_SUCCESS != (rc = opal_dss .unpack (& buf , & jdata , & cnt , ORTE_JOB ))) {
575
+ ORTE_ERROR_LOG (rc );
576
+ OBJ_DESTRUCT (& buf );
577
+ goto release ;
578
+ }
579
+
580
+ /* unpack the byte object containing the daemon URIs */
581
+ cnt = 1 ;
582
+ if (ORTE_SUCCESS != (rc = opal_dss .unpack (& buf , & bo , & cnt , OPAL_BYTE_OBJECT ))) {
583
+ ORTE_ERROR_LOG (rc );
566
584
OBJ_DESTRUCT (& buf );
567
585
goto release ;
568
586
}
587
+ /* load it into a buffer */
588
+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
589
+ opal_dss .load (& bucket , bo -> bytes , bo -> size );
590
+ bo -> bytes = NULL ;
591
+ free (bo );
592
+ /* prep a list to save the nodes */
593
+ OBJ_CONSTRUCT (& nodes , opal_list_t );
594
+ /* unpack and store the URIs */
595
+ cnt = 1 ;
596
+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & uri , & cnt , OPAL_STRING ))) {
597
+ rc = orte_rml_base_parse_uris (uri , & dmn , NULL );
598
+ if (ORTE_SUCCESS != rc ) {
599
+ OBJ_DESTRUCT (& buf );
600
+ OBJ_DESTRUCT (& bucket );
601
+ goto release ;
602
+ }
603
+ /* save a node object for this daemon */
604
+ node = OBJ_NEW (orte_node_t );
605
+ node -> daemon = OBJ_NEW (orte_proc_t );
606
+ memcpy (& node -> daemon -> name , & dmn , sizeof (orte_process_name_t ));
607
+ opal_list_append (& nodes , & node -> super );
608
+ /* register the URI */
609
+ OBJ_CONSTRUCT (& val , opal_value_t );
610
+ val .key = OPAL_PMIX_PROC_URI ;
611
+ val .type = OPAL_STRING ;
612
+ val .data .string = uri ;
613
+ if (OPAL_SUCCESS != (rc = opal_pmix .store_local (& dmn , & val ))) {
614
+ ORTE_ERROR_LOG (rc );
615
+ val .key = NULL ;
616
+ val .data .string = NULL ;
617
+ OBJ_DESTRUCT (& val );
618
+ OBJ_DESTRUCT (& buf );
619
+ OBJ_DESTRUCT (& bucket );
620
+ goto release ;
621
+ }
622
+ val .key = NULL ;
623
+ val .data .string = NULL ;
624
+ OBJ_DESTRUCT (& val );
625
+ cnt = 1 ;
626
+ }
627
+ OBJ_DESTRUCT (& bucket );
628
+
629
+ /* unpack the proc-to-daemon map */
630
+ cnt = 1 ;
631
+ if (ORTE_SUCCESS != (rc = opal_dss .unpack (& buf , & bo , & cnt , OPAL_BYTE_OBJECT ))) {
632
+ ORTE_ERROR_LOG (rc );
633
+ OBJ_DESTRUCT (& buf );
634
+ goto release ;
635
+ }
636
+ /* load it into a buffer */
637
+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
638
+ opal_dss .load (& bucket , bo -> bytes , bo -> size );
639
+ bo -> bytes = NULL ;
640
+ free (bo );
641
+ /* unpack and store the map */
642
+ cnt = 1 ;
643
+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & pname , & cnt , ORTE_NAME ))) {
644
+ /* get the name of the daemon hosting it */
645
+ if (OPAL_SUCCESS != (rc = opal_dss .unpack (& bucket , & dmn , & cnt , ORTE_NAME ))) {
646
+ OBJ_DESTRUCT (& buf );
647
+ OBJ_DESTRUCT (& bucket );
648
+ goto release ;
649
+ }
650
+ /* create the proc object */
651
+ proc = OBJ_NEW (orte_proc_t );
652
+ memcpy (& proc -> name , & pname , sizeof (orte_process_name_t ));
653
+ opal_pointer_array_set_item (jdata -> procs , pname .vpid , proc );
654
+ /* find the daemon */
655
+ OPAL_LIST_FOREACH (node , & nodes , orte_node_t ) {
656
+ if (node -> daemon -> name .vpid == dmn .vpid ) {
657
+ OBJ_RETAIN (node );
658
+ proc -> node = node ;
659
+ break ;
660
+ }
661
+ }
662
+ }
663
+ OBJ_DESTRUCT (& bucket );
664
+ OPAL_LIST_DESTRUCT (& nodes );
569
665
OBJ_DESTRUCT (& buf );
666
+
667
+ /* register the nspace */
570
668
if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata , true))) {
669
+ ORTE_ERROR_LOG (rc );
571
670
OBJ_RELEASE (jdata );
572
671
goto release ;
573
672
}
574
- OBJ_RELEASE (jdata ); // no reason to keep this around
673
+
674
+ /* save the job object so we don't endlessly cycle */
675
+ opal_hash_table_set_value_uint32 (orte_job_data , jdata -> jobid , jdata );
575
676
576
677
/* restart the cnct processor */
577
678
ORTE_PMIX_OPERATION (cd -> procs , cd -> info , _cnct , cd -> cbfunc , cd -> cbdata );
@@ -617,6 +718,7 @@ static void _cnct(int sd, short args, void *cbdata)
617
718
* out about it, and all we can do is return an error */
618
719
if (orte_pmix_server_globals .server .jobid == ORTE_PROC_MY_HNP -> jobid &&
619
720
orte_pmix_server_globals .server .vpid == ORTE_PROC_MY_HNP -> vpid ) {
721
+ ORTE_ERROR_LOG (ORTE_ERR_NOT_SUPPORTED );
620
722
rc = ORTE_ERR_NOT_SUPPORTED ;
621
723
goto release ;
622
724
}
@@ -632,6 +734,7 @@ static void _cnct(int sd, short args, void *cbdata)
632
734
kv -> data .uint32 = geteuid ();
633
735
opal_list_append (cd -> info , & kv -> super );
634
736
if (ORTE_SUCCESS != (rc = pmix_server_lookup_fn (& nm -> name , keys , cd -> info , _cnlk , cd ))) {
737
+ ORTE_ERROR_LOG (rc );
635
738
opal_argv_free (keys );
636
739
goto release ;
637
740
}
@@ -645,6 +748,7 @@ static void _cnct(int sd, short args, void *cbdata)
645
748
if (!orte_get_attribute (& jdata -> attributes , ORTE_JOB_NSPACE_REGISTERED , NULL , OPAL_BOOL )) {
646
749
/* it hasn't been registered yet, so register it now */
647
750
if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata , true))) {
751
+ ORTE_ERROR_LOG (rc );
648
752
goto release ;
649
753
}
650
754
}
0 commit comments