|
| 1 | +Operation Name,Section 1 Calls,Section 2 Calls,Section 3 Calls,Section 4 Calls,Section 5 Calls,Total Calls |
| 2 | +urEnqueueKernelLaunch,1590,1590,1590,1590,1590,7950 |
| 3 | +"at::native::xpu::VectorizedElementwiseKernel<8, at::...",449,449,449,449,449,2245 |
| 4 | +at::native::xpu::ElementwiseGlobalRangeKernel<at::na...,390,390,390,390,390,1950 |
| 5 | +gemm_kernel,226,226,226,226,226,1130 |
| 6 | +"at::native::xpu::VectorizedElementwiseKernel<4, at::...",210,210,210,210,210,1050 |
| 7 | +urUSMDeviceAlloc,164,160,160,160,160,804 |
| 8 | +Memset (DEVICE),160,160,160,160,160,800 |
| 9 | +urEnqueueUSMFill,160,160,160,160,160,800 |
| 10 | +at::native::xpu::UnrolledElementwiseKernel<at::nativ...,138,138,138,138,138,690 |
| 11 | +"at::native::xpu::ReduceKernel<1, at::native::xpu::Re...",74,74,74,74,74,370 |
| 12 | +micro_sdpa,32,32,32,32,32,160 |
| 13 | +"at::native::xpu::VectorizedElementwiseKernel<2, at::...",16,16,16,16,16,80 |
| 14 | +"at::native::xpu::VectorizedElementwiseKernel<16, at:...",14,14,14,14,14,70 |
| 15 | +urEnqueueUSMMemcpy,10,10,10,10,10,50 |
| 16 | +at::native::xpu::SegmentedRadixSortPairsDownsweepFun...,8,8,8,8,8,40 |
| 17 | +at::native::xpu::SegmentedRadixSortPairsUpsweepFunct...,8,8,8,8,8,40 |
| 18 | +at::native::xpu::SegmentedRadixSortPairsScanFunctor<...,8,8,8,8,8,40 |
| 19 | +Memcpy D2M (DEVICE -> MEMORY(Unknown)),6,6,6,6,6,30 |
| 20 | +at::native::xpu::SegmentScanKernel<at::native::xpu::...,4,4,4,4,4,20 |
| 21 | +at::native::xpu::impl::SoftmaxForwardKernelFunctor<4...,2,2,2,2,2,10 |
| 22 | +Memcpy H2D (HOST -> DEVICE),2,2,2,2,2,10 |
| 23 | +at::native::xpu::CatArrayBatchedCopyKernelFunctor<lo...,2,2,2,2,2,10 |
| 24 | +at::native::xpu::AssertAsyncKernelFunctor1<bool>,2,2,2,2,2,10 |
| 25 | +Memcpy M2D (MEMORY(Unknown) -> DEVICE),2,2,2,2,2,10 |
| 26 | +detach_,2,2,2,2,2,10 |
| 27 | +at::native::xpu::SegmentedGroupRadixSelectPairsFunct...,1,1,1,1,1,5 |
| 28 | +at::native::xpu::SegmentedGroupRadixSortPairsFunctor...,1,1,1,1,1,5 |
| 29 | +at::native::xpu::ScatterGatherElementwiseKernelFunct...,1,1,1,1,1,5 |
| 30 | +"at::native::xpu::ReduceKernel<2, at::native::xpu::Re...",1,1,1,1,1,5 |
| 31 | +at::native::xpu::IndexKernel<at::native::xpu::IndexK...,1,1,1,1,1,5 |
| 32 | +at::native::xpu::DistributionElementwiseKernelFuncto...,1,1,1,1,1,5 |
| 33 | +at::native::xpu::AccumulateCarrierKernelFunctor<at::...,1,1,1,1,1,5 |
| 34 | +urUSMHostAlloc,1,0,0,0,0,1 |
0 commit comments