@@ -300,10 +300,27 @@ void CuptiActivityProfiler::logGpuVersions() {
300300#endif
301301}
302302
303+ namespace {
304+
305+ const std::unordered_set<std::string>& getLoggerMedataAllowList () {
306+ static const std::unordered_set<std::string> kLoggerMedataAllowList {
307+ " with_stack" , " with_modules" , " record_shapes" , " profile_memory" };
308+ return kLoggerMedataAllowList ;
309+ }
310+
311+ } // namespace
312+
303313void CuptiActivityProfiler::processTraceInternal (ActivityLogger& logger) {
304314 LOG (INFO) << " Processing " << traceBuffers_->cpu .size () << " CPU buffers" ;
305315 VLOG (0 ) << " Profile time range: " << captureWindowStartTime_ << " - "
306316 << captureWindowEndTime_;
317+
318+ // Pass metadata within the trace to the logger observer.
319+ for (const auto & pair : metadata_) {
320+ if (getLoggerMedataAllowList ().count (pair.first ) > 0 ) {
321+ LOGGER_OBSERVER_ADD_METADATA (pair.first , pair.second );
322+ }
323+ }
307324 for (auto & pair : versionMetadata_) {
308325 addMetadata (pair.first , pair.second );
309326 }
@@ -319,6 +336,9 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
319336 device_properties.push_back (props);
320337 }
321338 }
339+ for (const auto & [key, value] : session->getMetadata ()) {
340+ addMetadata (key, value);
341+ }
322342 }
323343 logger.handleTraceStart (
324344 metadata_, fmt::format (" {}" , fmt::join (device_properties, " ," )));
@@ -429,7 +449,7 @@ void CuptiActivityProfiler::processCpuTrace(
429449 return ;
430450 }
431451 setCpuActivityPresent (true );
432-
452+ bool warn_once = false ;
433453 CpuGpuSpanPair& span_pair =
434454 recordTraceSpan (cpuTrace.span , cpuTrace.gpuOpCount );
435455 TraceSpan& cpu_span = span_pair.first ;
@@ -442,18 +462,23 @@ void CuptiActivityProfiler::processCpuTrace(
442462 const std::unique_ptr<GenericTraceActivity>>::value,
443463 " handleActivity is unsafe and relies on the caller to maintain not "
444464 " only lifetime but also address stability." );
445- if (act->type () == ActivityType::USER_ANNOTATION &&
446- act->duration () <= 0 ) {
465+ if (act->duration () < 0 ) {
447466 act->endTime = captureWindowEndTime_;
448467 act->addMetadata (" finished" , " false" );
449- } else {
450- act->addMetadata (" finished" , " true" );
451468 }
452469 logger.handleActivity (*act);
453470 }
454471 clientActivityTraceMap_[act->correlationId ()] = &span_pair;
455472 activityMap_[act->correlationId ()] = act.get ();
456-
473+ if (act->deviceId () == 0 ) {
474+ if (!warn_once) {
475+ LOG (WARNING)
476+ << " CPU activity with pid 0 detected. This is likely due to the python stack"
477+ " tracer not being able to determine the pid for an event. Overriding pid to main thread pid" ;
478+ }
479+ act->setDevice (processId ());
480+ warn_once = true ;
481+ }
457482 recordThreadInfo (act->resourceId (), act->getThreadId (), act->deviceId ());
458483 }
459484 logger.handleTraceSpan (cpu_span);
@@ -564,8 +589,8 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) {
564589 if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 ||
565590 cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 ||
566591 cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 ||
567- // Support cudaEventRecord and cudaEventSynchronize, revisit if others are
568- // needed
592+ // Support cudaEventRecord and cudaEventSynchronize, revisit if others
593+ // are needed
569594 cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 ||
570595 cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 ||
571596 cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) {
@@ -1322,20 +1347,21 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
13221347 || cupti_.stopCollection
13231348#endif // HAS_CUPTI || HAS_ROCTRACER
13241349 ) {
1325- // Update runloop state first to prevent further updates to shared state
1350+ // Update runloop state first to prevent further updates to shared
1351+ // state
13261352 LOG (INFO) << " Tracing complete." ;
13271353 VLOG_IF (1 , currentIter >= 0 )
13281354 << " This state change was invoked by application's step() call" ;
13291355
13301356 // currentIter >= 0 means this is called from the step() api of
1331- // the profile in pytorch main thread, it should be executed in another
1332- // thread in case pytorch main thread is blocked
1357+ // the profile in pytorch main thread, it should be executed in
1358+ // another thread in case pytorch main thread is blocked
13331359 if (currentIter >= 0 ) {
13341360 // if collectTraceThread_ is already running, there's no need to
13351361 // execute collectTrace twice.
1336- // Do not call collectTrace when profilerThread_ is collecting Trace.
1337- // Otherwise, libkineto::api().client()->stop will be called twice,
1338- // which leads to an unrecoverable ::c10:Error at
1362+ // Do not call collectTrace when profilerThread_ is collecting
1363+ // Trace. Otherwise, libkineto::api().client()->stop will be called
1364+ // twice, which leads to an unrecoverable ::c10:Error at
13391365 // disableProfiler
13401366 if (!collectTraceThread_ && !getCollectTraceState ()) {
13411367 std::lock_guard<std::recursive_mutex> guard (mutex_);
@@ -1404,13 +1430,33 @@ void CuptiActivityProfiler::finalizeTrace(
14041430 iterationCountMap_.clear ();
14051431 }
14061432
1433+ // Thread & stream info
1434+ for (const auto & pair : resourceInfo_) {
1435+ const auto & resource = pair.second ;
1436+ logger.handleResourceInfo (resource, captureWindowStartTime_);
1437+ }
1438+
1439+ bool use_default_device_info = true ;
1440+ for (auto & session : sessions_) {
1441+ auto device_info = session->getDeviceInfo ();
1442+ if (device_info != nullptr ) {
1443+ use_default_device_info = false ;
1444+ logger.handleDeviceInfo (*device_info, captureWindowStartTime_);
1445+ }
1446+
1447+ auto resource_infos = session->getResourceInfos ();
1448+ for (const auto & resource_info : resource_infos) {
1449+ logger.handleResourceInfo (resource_info, captureWindowStartTime_);
1450+ }
1451+ }
1452+
14071453 // Process names
14081454 int32_t pid = processId ();
14091455 string process_name = processName (pid);
14101456 if (!process_name.empty ()) {
14111457 logger.handleDeviceInfo (
14121458 {pid, pid, process_name, " CPU" }, captureWindowStartTime_);
1413- if (!cpuOnly_) {
1459+ if (!cpuOnly_ && use_default_device_info ) {
14141460 // Usually, GPU events use device id as pid (0-7).
14151461 // In some cases, CPU sockets are numbered starting from 0.
14161462 // In the worst case, 8 CPU sockets + 8 GPUs, so the max GPU ID is 15.
@@ -1428,24 +1474,6 @@ void CuptiActivityProfiler::finalizeTrace(
14281474 }
14291475 }
14301476
1431- // Thread & stream info
1432- for (auto pair : resourceInfo_) {
1433- const auto & resource = pair.second ;
1434- logger.handleResourceInfo (resource, captureWindowStartTime_);
1435- }
1436-
1437- for (auto & session : sessions_) {
1438- auto device_info = session->getDeviceInfo ();
1439- if (device_info != nullptr ) {
1440- logger.handleDeviceInfo (*device_info, captureWindowStartTime_);
1441- }
1442-
1443- auto resource_infos = session->getResourceInfos ();
1444- for (auto resource_info : resource_infos) {
1445- logger.handleResourceInfo (resource_info, captureWindowStartTime_);
1446- }
1447- }
1448-
14491477 for (const auto & iterations : traceSpans_) {
14501478 for (const auto & span_pair : iterations.second ) {
14511479 const TraceSpan& gpu_span = span_pair.second ;
0 commit comments