@@ -1716,68 +1716,234 @@ static bool should_migrate_to_smem(struct xe_bo *bo)
                bo->attr.atomic_access == DRM_XE_ATOMIC_CPU;
 }
 
-static vm_fault_t xe_gem_fault(struct vm_fault *vmf)
+/* Populate the bo if swapped out, or migrate if the access mode requires that. */
+static int xe_bo_fault_migrate(struct xe_bo *bo, struct ttm_operation_ctx *ctx,
+                               struct drm_exec *exec)
+{
+        struct ttm_buffer_object *tbo = &bo->ttm;
+        int err = 0;
+
+        if (ttm_manager_type(tbo->bdev, tbo->resource->mem_type)->use_tt) {
+                xe_assert(xe_bo_device(bo),
+                          dma_resv_test_signaled(tbo->base.resv, DMA_RESV_USAGE_KERNEL) ||
+                          (tbo->ttm && ttm_tt_is_populated(tbo->ttm)));
+                err = ttm_bo_populate(&bo->ttm, ctx);
+        } else if (should_migrate_to_smem(bo)) {
+                xe_assert(xe_bo_device(bo), bo->flags & XE_BO_FLAG_SYSTEM);
+                err = xe_bo_migrate(bo, XE_PL_TT, ctx, exec);
+        }
+
+        return err;
+}
+
+/* Call into TTM to populate PTEs, and register bo for PTE removal on runtime suspend. */
+static vm_fault_t __xe_bo_cpu_fault(struct vm_fault *vmf, struct xe_device *xe, struct xe_bo *bo)
+{
+        vm_fault_t ret;
+
+        trace_xe_bo_cpu_fault(bo);
+
+        ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
+                                       TTM_BO_VM_NUM_PREFAULT);
+        /*
+         * When TTM is actually called to insert PTEs, ensure no blocking conditions
+         * remain, in which case TTM may drop locks and return VM_FAULT_RETRY.
+         */
+        xe_assert(xe, ret != VM_FAULT_RETRY);
+
+        if (ret == VM_FAULT_NOPAGE &&
+            mem_type_is_vram(bo->ttm.resource->mem_type)) {
+                mutex_lock(&xe->mem_access.vram_userfault.lock);
+                if (list_empty(&bo->vram_userfault_link))
+                        list_add(&bo->vram_userfault_link,
+                                 &xe->mem_access.vram_userfault.list);
+                mutex_unlock(&xe->mem_access.vram_userfault.lock);
+        }
+
+        return ret;
+}
+
+static vm_fault_t xe_err_to_fault_t(int err)
+{
+        switch (err) {
+        case 0:
+        case -EINTR:
+        case -ERESTARTSYS:
+        case -EAGAIN:
+                return VM_FAULT_NOPAGE;
+        case -ENOMEM:
+        case -ENOSPC:
+                return VM_FAULT_OOM;
+        default:
+                break;
+        }
+        return VM_FAULT_SIGBUS;
+}
+
+static bool xe_ttm_bo_is_imported(struct ttm_buffer_object *tbo)
+{
+        dma_resv_assert_held(tbo->base.resv);
+
+        return tbo->ttm &&
+                (tbo->ttm->page_flags & (TTM_TT_FLAG_EXTERNAL | TTM_TT_FLAG_EXTERNAL_MAPPABLE)) ==
+                TTM_TT_FLAG_EXTERNAL;
+}
+
+static vm_fault_t xe_bo_cpu_fault_fastpath(struct vm_fault *vmf, struct xe_device *xe,
+                                           struct xe_bo *bo, bool needs_rpm)
+{
+        struct ttm_buffer_object *tbo = &bo->ttm;
+        vm_fault_t ret = VM_FAULT_RETRY;
+        struct xe_validation_ctx ctx;
+        struct ttm_operation_ctx tctx = {
+                .interruptible = true,
+                .no_wait_gpu = true,
+                .gfp_retry_mayfail = true,
+
+        };
+        int err;
+
+        if (needs_rpm && !xe_pm_runtime_get_if_active(xe))
+                return VM_FAULT_RETRY;
+
+        err = xe_validation_ctx_init(&ctx, &xe->val, NULL,
+                                     (struct xe_val_flags) {
+                                             .interruptible = true,
+                                             .no_block = true
+                                     });
+        if (err)
+                goto out_pm;
+
+        if (!dma_resv_trylock(tbo->base.resv))
+                goto out_validation;
+
+        if (xe_ttm_bo_is_imported(tbo)) {
+                ret = VM_FAULT_SIGBUS;
+                drm_dbg(&xe->drm, "CPU trying to access an imported buffer object.\n");
+                goto out_unlock;
+        }
+
+        err = xe_bo_fault_migrate(bo, &tctx, NULL);
+        if (err) {
+                /* Return VM_FAULT_RETRY on these errors. */
+                if (err != -ENOMEM && err != -ENOSPC && err != -EBUSY)
+                        ret = xe_err_to_fault_t(err);
+                goto out_unlock;
+        }
+
+        if (dma_resv_test_signaled(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL))
+                ret = __xe_bo_cpu_fault(vmf, xe, bo);
+
+out_unlock:
+        dma_resv_unlock(tbo->base.resv);
+out_validation:
+        xe_validation_ctx_fini(&ctx);
+out_pm:
+        if (needs_rpm)
+                xe_pm_runtime_put(xe);
+
+        return ret;
+}
+
+static vm_fault_t xe_bo_cpu_fault(struct vm_fault *vmf)
 {
         struct ttm_buffer_object *tbo = vmf->vma->vm_private_data;
         struct drm_device *ddev = tbo->base.dev;
         struct xe_device *xe = to_xe_device(ddev);
         struct xe_bo *bo = ttm_to_xe_bo(tbo);
         bool needs_rpm = bo->flags & XE_BO_FLAG_VRAM_MASK;
-        struct drm_exec *exec;
+        bool retry_after_wait = false;
+        struct xe_validation_ctx ctx;
+        struct drm_exec exec;
         vm_fault_t ret;
-        int idx, r = 0;
+        int err = 0;
+        int idx;
 
-        if (needs_rpm)
-                xe_pm_runtime_get(xe);
+        if (!drm_dev_enter(&xe->drm, &idx))
+                return ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
 
-        exec = XE_VALIDATION_UNIMPLEMENTED;
-        ret = ttm_bo_vm_reserve(tbo, vmf);
-        if (ret)
+        ret = xe_bo_cpu_fault_fastpath(vmf, xe, bo, needs_rpm);
+        if (ret != VM_FAULT_RETRY)
                 goto out;
 
-        if (drm_dev_enter(ddev, &idx)) {
-                trace_xe_bo_cpu_fault(bo);
+        if (fault_flag_allow_retry_first(vmf->flags)) {
+                if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+                        goto out;
+                retry_after_wait = true;
+                xe_bo_get(bo);
+                mmap_read_unlock(vmf->vma->vm_mm);
+        } else {
+                ret = VM_FAULT_NOPAGE;
+        }
 
-                xe_validation_assert_exec(xe, exec, &tbo->base);
-                if (should_migrate_to_smem(bo)) {
-                        xe_assert(xe, bo->flags & XE_BO_FLAG_SYSTEM);
+        /*
+         * The fastpath failed and we were not required to return and retry immediately.
+         * We're now running in one of two modes:
+         *
+         * 1) retry_after_wait == true: The mmap_read_lock() is dropped, and we're trying
+         * to resolve blocking waits. But we can't resolve the fault since the
+         * mmap_read_lock() is dropped. After retrying the fault, the aim is that the fastpath
+         * should succeed. But it may fail since we drop the bo lock.
+         *
+         * 2) retry_after_wait == false: The fastpath failed, typically even after
+         * a retry. Do whatever's necessary to resolve the fault.
+         *
+         * This construct is recommended to avoid excessive waits under the mmap_lock.
+         */
+
+        if (needs_rpm)
+                xe_pm_runtime_get(xe);
 
-                        r = xe_bo_migrate(bo, XE_PL_TT, exec);
-                        if (r == -EBUSY || r == -ERESTARTSYS || r == -EINTR)
-                                ret = VM_FAULT_NOPAGE;
-                        else if (r)
-                                ret = VM_FAULT_SIGBUS;
+        xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {.interruptible = true},
+                            err) {
+                struct ttm_operation_ctx tctx = {
+                        .interruptible = true,
+                        .no_wait_gpu = false,
+                        .gfp_retry_mayfail = retry_after_wait,
+                };
+                long lerr;
+
+                err = drm_exec_lock_obj(&exec, &tbo->base);
+                drm_exec_retry_on_contention(&exec);
+                if (err)
+                        break;
+
+                if (xe_ttm_bo_is_imported(tbo)) {
+                        err = -EFAULT;
+                        drm_dbg(&xe->drm, "CPU trying to access an imported buffer object.\n");
+                        break;
                 }
-                if (!ret)
-                        ret = ttm_bo_vm_fault_reserved(vmf,
-                                                       vmf->vma->vm_page_prot,
-                                                       TTM_BO_VM_NUM_PREFAULT);
-                drm_dev_exit(idx);
 
-                if (ret == VM_FAULT_RETRY &&
-                    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
-                        goto out;
+                err = xe_bo_fault_migrate(bo, &tctx, &exec);
+                if (err) {
+                        drm_exec_retry_on_contention(&exec);
+                        xe_validation_retry_on_oom(&ctx, &err);
+                        break;
+                }
 
-                /*
-                 * ttm_bo_vm_reserve() already has dma_resv_lock.
-                 */
-                if (ret == VM_FAULT_NOPAGE &&
-                    mem_type_is_vram(tbo->resource->mem_type)) {
-                        mutex_lock(&xe->mem_access.vram_userfault.lock);
-                        if (list_empty(&bo->vram_userfault_link))
-                                list_add(&bo->vram_userfault_link,
-                                         &xe->mem_access.vram_userfault.list);
-                        mutex_unlock(&xe->mem_access.vram_userfault.lock);
+                lerr = dma_resv_wait_timeout(tbo->base.resv,
+                                             DMA_RESV_USAGE_KERNEL, true,
+                                             MAX_SCHEDULE_TIMEOUT);
+                if (lerr < 0) {
+                        err = lerr;
+                        break;
                 }
-        } else {
-                ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
+
+                if (!retry_after_wait)
+                        ret = __xe_bo_cpu_fault(vmf, xe, bo);
         }
+        /* if retry_after_wait == true, we *must* return VM_FAULT_RETRY. */
+        if (err && !retry_after_wait)
+                ret = xe_err_to_fault_t(err);
 
-        dma_resv_unlock(tbo->base.resv);
-out:
         if (needs_rpm)
                 xe_pm_runtime_put(xe);
 
+        if (retry_after_wait)
+                xe_bo_put(bo);
+out:
+        drm_dev_exit(idx);
+
         return ret;
 }
 
@@ -1821,7 +1987,7 @@ int xe_bo_read(struct xe_bo *bo, u64 offset, void *dst, int size)
 }
 
 static const struct vm_operations_struct xe_gem_vm_ops = {
-        .fault = xe_gem_fault,
+        .fault = xe_bo_cpu_fault,
         .open = ttm_bo_vm_open,
         .close = ttm_bo_vm_close,
         .access = xe_bo_vm_access,
@@ -3057,6 +3223,8 @@ static void xe_place_from_ttm_type(u32 mem_type, struct ttm_place *place)
  * xe_bo_migrate - Migrate an object to the desired region id
  * @bo: The buffer object to migrate.
  * @mem_type: The TTM region type to migrate to.
+ * @tctx: A pointer to a struct ttm_operation_ctx or NULL if
+ *        a default interruptible ctx is to be used.
  * @exec: The drm_exec transaction to use for exhaustive eviction.
  *
  * Attempt to migrate the buffer object to the desired memory region. The
@@ -3069,7 +3237,8 @@ static void xe_place_from_ttm_type(u32 mem_type, struct ttm_place *place)
  * Return: 0 on success. Negative error code on failure. In particular may
  * return -EINTR or -ERESTARTSYS if signal pending.
  */
-int xe_bo_migrate(struct xe_bo *bo, u32 mem_type, struct drm_exec *exec)
+int xe_bo_migrate(struct xe_bo *bo, u32 mem_type, struct ttm_operation_ctx *tctx,
+                  struct drm_exec *exec)
 {
         struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev);
         struct ttm_operation_ctx ctx = {
@@ -3081,6 +3250,7 @@ int xe_bo_migrate(struct xe_bo *bo, u32 mem_type, struct drm_exec *exec)
         struct ttm_place requested;
 
         xe_bo_assert_held(bo);
+        tctx = tctx ? tctx : &ctx;
 
         if (bo->ttm.resource->mem_type == mem_type)
                 return 0;
@@ -3107,8 +3277,9 @@ int xe_bo_migrate(struct xe_bo *bo, u32 mem_type, struct drm_exec *exec)
                 add_vram(xe, bo, &requested, bo->flags, mem_type, &c);
         }
 
-        xe_validation_assert_exec(xe_bo_device(bo), exec, &bo->ttm.base);
-        return ttm_bo_validate(&bo->ttm, &placement, &ctx);
+        if (!tctx->no_wait_gpu)
+                xe_validation_assert_exec(xe_bo_device(bo), exec, &bo->ttm.base);
+        return ttm_bo_validate(&bo->ttm, &placement, tctx);
 }
 
 /**
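The xe_bo_migrate() change in the last hunk lets a caller pass its own struct ttm_operation_ctx, or NULL to get the default interruptible one. A hedged caller sketch of the two styles, mirroring how the fault paths above drive the function; the example_*() wrappers are hypothetical, and the caller is assumed to hold the bo's dma-resv lock as xe_bo_assert_held() requires.

/* Hypothetical wrappers; only xe_bo_migrate(), XE_PL_TT and the ctx fields come from the driver. */
#include <drm/drm_exec.h>
#include <drm/ttm/ttm_bo.h>
#include "xe_bo.h"

static int example_migrate_nonblocking(struct xe_bo *bo)
{
        struct ttm_operation_ctx tctx = {
                .interruptible = true,
                .no_wait_gpu = true,            /* fail fast instead of waiting for the GPU */
                .gfp_retry_mayfail = true,
        };

        /* NULL exec: the non-blocking fastpath does not take part in exhaustive eviction. */
        return xe_bo_migrate(bo, XE_PL_TT, &tctx, NULL);
}

static int example_migrate_default(struct xe_bo *bo, struct drm_exec *exec)
{
        /* NULL tctx: xe_bo_migrate() falls back to its default interruptible context. */
        return xe_bo_migrate(bo, XE_PL_TT, NULL, exec);
}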