
Commit 0aed55a

x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations
The pmem driver has a need to transfer data with a persistent memory
destination and be able to rely on the fact that the destination writes
are not cached. It is sufficient for the writes to be flushed to a
cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect
userspace to call fsync() to ensure data-writes have reached a
power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or
REQ_FLUSH to the pmem driver which will turn around and fence previous
writes with an "sfence".

Implement a __copy_from_user_flushcache, memcpy_page_flushcache, and
memcpy_flushcache, that guarantee that the destination buffer is not
dirty in the cpu cache on completion. The new copy_from_iter_flushcache
and sub-routines will be used to replace the "pmem api"
(include/linux/pmem.h + arch/x86/include/asm/pmem.h). The availability
of copy_from_iter_flushcache() and memcpy_flushcache() is gated by the
CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE config symbol; they fall back to
copy_from_iter_nocache() and plain memcpy() otherwise.

This is meant to satisfy the concern from Linus that if a driver wants
to do something beyond the normal nocache semantics it should be
something private to that driver [1], and Al's concern that anything
uaccess related belongs with the rest of the uaccess code [2].

The first consumer of this interface is a new 'copy_from_iter' dax
operation so that pmem can inject cache maintenance operations without
imposing this overhead on other dax-capable drivers.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html
[2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html

Cc: <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: Jeff Moyer <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Toshi Kani <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
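For illustration, a minimal sketch of the write-then-fence contract the
changelog describes, using the interfaces this patch introduces. This is
not part of the commit; it assumes a pmem-style driver with pmem_dst,
src, n, and nd_region in scope:

	/* data path: stores bypass / are flushed from the cpu cache,
	 * so the destination is not dirty in the cache on return */
	memcpy_flushcache(pmem_dst, src, n);

	/* flush path, later, when fsync() arrives as REQ_FUA/REQ_FLUSH:
	 * nvdimm_flush() begins with wmb(), which is 'sfence' on x86 and
	 * drains the store buffers so the prior non-temporal stores are
	 * visible to the platform buffer flush */
	nvdimm_flush(nd_region);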
1 parent 3c2993b commit 0aed55a

13 files changed, 209 insertions(+), 7 deletions(-)


arch/x86/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX

arch/x86/include/asm/string_64.h

Lines changed: 5 additions & 0 deletions
@@ -109,6 +109,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
+void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */

arch/x86/include/asm/uaccess_64.h

Lines changed: 11 additions & 0 deletions
@@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
+extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
+extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+			   size_t len);
+
 static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 				  unsigned size)
@@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+static inline int
+__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+	kasan_check_write(dst, size);
+	return __copy_user_flushcache(dst, src, size);
+}
+
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len);
arch/x86/lib/usercopy_64.c

Lines changed: 128 additions & 0 deletions
@@ -7,6 +7,7 @@
  */
 #include <linux/export.h>
 #include <linux/uaccess.h>
+#include <linux/highmem.h>
 
 /*
  * Zero Userspace
@@ -73,3 +74,130 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
 	clac();
 	return len;
 }
+
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+/**
+ * clean_cache_range - write back a cache range with CLWB
+ * @vaddr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
+ */
+static void clean_cache_range(void *addr, size_t size)
+{
+	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vend = addr + size;
+	void *p;
+
+	for (p = (void *)((unsigned long)addr & ~clflush_mask);
+	     p < vend; p += x86_clflush_size)
+		clwb(p);
+}
+
+long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+	unsigned long flushed, dest = (unsigned long) dst;
+	long rc = __copy_user_nocache(dst, src, size, 0);
+
+	/*
+	 * __copy_user_nocache() uses non-temporal stores for the bulk
+	 * of the transfer, but we need to manually flush if the
+	 * transfer is unaligned. A cached memory copy is used when
+	 * destination or size is not naturally aligned. That is:
+	 *   - Require 8-byte alignment when size is 8 bytes or larger.
+	 *   - Require 4-byte alignment when size is 4 bytes.
+	 */
+	if (size < 8) {
+		if (!IS_ALIGNED(dest, 4) || size != 4)
+			clean_cache_range(dst, 1);
+	} else {
+		if (!IS_ALIGNED(dest, 8)) {
+			dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+			clean_cache_range(dst, 1);
+		}
+
+		flushed = dest - (unsigned long) dst;
+		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
+			clean_cache_range(dst + size - 1, 1);
+	}
+
+	return rc;
+}
+
+void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+{
+	unsigned long dest = (unsigned long) _dst;
+	unsigned long source = (unsigned long) _src;
+
+	/* cache copy and flush to align dest */
+	if (!IS_ALIGNED(dest, 8)) {
+		unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+
+		memcpy((void *) dest, (void *) source, len);
+		clean_cache_range((void *) dest, len);
+		dest += len;
+		source += len;
+		size -= len;
+		if (!size)
+			return;
+	}
+
+	/* 4x8 movnti loop */
+	while (size >= 32) {
+		asm("movq    (%0), %%r8\n"
+		    "movq   8(%0), %%r9\n"
+		    "movq  16(%0), %%r10\n"
+		    "movq  24(%0), %%r11\n"
+		    "movnti  %%r8,   (%1)\n"
+		    "movnti  %%r9,  8(%1)\n"
+		    "movnti %%r10, 16(%1)\n"
+		    "movnti %%r11, 24(%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8", "r9", "r10", "r11");
+		dest += 32;
+		source += 32;
+		size -= 32;
+	}
+
+	/* 1x8 movnti loop */
+	while (size >= 8) {
+		asm("movq    (%0), %%r8\n"
+		    "movnti  %%r8,   (%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8");
+		dest += 8;
+		source += 8;
+		size -= 8;
+	}
+
+	/* 1x4 movnti loop */
+	while (size >= 4) {
+		asm("movl    (%0), %%r8d\n"
+		    "movnti  %%r8d,   (%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8");
+		dest += 4;
+		source += 4;
+		size -= 4;
+	}
+
+	/* cache copy for remaining bytes */
+	if (size) {
+		memcpy((void *) dest, (void *) source, size);
+		clean_cache_range((void *) dest, size);
+	}
+}
+EXPORT_SYMBOL_GPL(memcpy_flushcache);
+
+void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+			size_t len)
+{
+	char *from = kmap_atomic(page);
+
+	memcpy_flushcache(to, from + offset, len);
+	kunmap_atomic(from);
+}
+#endif
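To make the rounding in clean_cache_range() concrete, here is a small
userspace model. It is an illustration only: it assumes a 64-byte cache
line (the kernel reads boot_cpu_data.x86_clflush_size) and prints the
lines clwb would touch instead of executing it:

#include <stdio.h>
#include <stdint.h>

#define CACHELINE 64	/* assumed line size for this illustration */

/* model of clean_cache_range(): round the start down to a cache line,
 * then walk line by line until past the end of the range */
static void clean_cache_range_model(uintptr_t addr, size_t size)
{
	uintptr_t mask = CACHELINE - 1;
	uintptr_t end = addr + size;
	uintptr_t p;

	for (p = addr & ~mask; p < end; p += CACHELINE)
		printf("clwb line at 0x%lx\n", (unsigned long)p);
}

int main(void)
{
	/* a 1-byte flush at 0x1003 writes back the whole line at 0x1000;
	 * this is why clean_cache_range(dst, 1) suffices for the unaligned
	 * head/tail cases in __copy_user_flushcache() */
	clean_cache_range_model(0x1003, 1);

	/* a 16-byte range straddling 0x1040 writes back two lines */
	clean_cache_range_model(0x1038, 16);
	return 0;
}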

drivers/acpi/nfit/core.c

Lines changed: 1 addition & 2 deletions
@@ -1842,8 +1842,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 	}
 
 	if (rw)
-		memcpy_to_pmem(mmio->addr.aperture + offset,
-				iobuf + copied, c);
+		memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
 	else {
 		if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
 			mmio_flush_range((void __force *)

drivers/nvdimm/claim.c

Lines changed: 1 addition & 1 deletion
@@ -277,7 +277,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 			rc = -EIO;
 	}
 
-	memcpy_to_pmem(nsio->addr + offset, buf, size);
+	memcpy_flushcache(nsio->addr + offset, buf, size);
 	nvdimm_flush(to_nd_region(ndns->dev.parent));
 
 	return rc;

drivers/nvdimm/pmem.c

Lines changed: 11 additions & 2 deletions
@@ -29,6 +29,7 @@
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
+#include <linux/uio.h>
 #include <linux/dax.h>
 #include <linux/nd.h>
 #include "pmem.h"
@@ -80,7 +81,7 @@ static void write_pmem(void *pmem_addr, struct page *page,
 {
 	void *mem = kmap_atomic(page);
 
-	memcpy_to_pmem(pmem_addr, mem + off, len);
+	memcpy_flushcache(pmem_addr, mem + off, len);
 	kunmap_atomic(mem);
 }
 
@@ -235,8 +236,15 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 	return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
 }
 
+static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+		void *addr, size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter_flushcache(addr, bytes, i);
+}
+
 static const struct dax_operations pmem_dax_ops = {
 	.direct_access = pmem_dax_direct_access,
+	.copy_from_iter = pmem_copy_from_iter,
 };
 
 static void pmem_release_queue(void *q)
@@ -294,7 +302,8 @@ static int pmem_attach_disk(struct device *dev,
 	dev_set_drvdata(dev, pmem);
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
-	if (nvdimm_has_flush(nd_region) < 0)
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE)
+			|| nvdimm_has_flush(nd_region) < 0)
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	if (!devm_request_mem_region(dev, res->start, resource_size(res),

drivers/nvdimm/region_devs.c

Lines changed: 2 additions & 2 deletions
@@ -1015,8 +1015,8 @@ void nvdimm_flush(struct nd_region *nd_region)
 	 * The first wmb() is needed to 'sfence' all previous writes
 	 * such that they are architecturally visible for the platform
 	 * buffer flush. Note that we've already arranged for pmem
-	 * writes to avoid the cache via arch_memcpy_to_pmem(). The
-	 * final wmb() ensures ordering for the NVDIMM flush write.
+	 * writes to avoid the cache via memcpy_flushcache(). The final
+	 * wmb() ensures ordering for the NVDIMM flush write.
 	 */
 	wmb();
 	for (i = 0; i < nd_region->ndr_mappings; i++)
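For reference, the shape of the surrounding nvdimm_flush(), abridged to
the ordering that the comment describes. This is a sketch, not a verbatim
quote: flush_hint[i] stands in for the driver's per-DIMM flush hint
mappings:

	wmb();				/* sfence: drain prior movnt stores */
	for (i = 0; i < nd_region->ndr_mappings; i++)
		writeq(1, flush_hint[i]);	/* hypothetical hint mapping */
	wmb();				/* order the flush writes themselves */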

include/linux/dax.h

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,9 @@ struct dax_operations {
 	 */
 	long (*direct_access)(struct dax_device *, pgoff_t, long,
 			void **, pfn_t *);
+	/* copy_from_iter: dax-driver override for default copy_from_iter */
+	size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
+			struct iov_iter *);
 };
 
 #if IS_ENABLED(CONFIG_DAX)
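A hypothetical caller-side sketch (the generic dispatch wrapper is not
part of this commit): a helper with access to a device's dax_operations
would forward to the override and fall back to the plain nocache copy
when a driver does not set it:

static size_t dax_copy_from_iter_sketch(struct dax_device *dax_dev,
		const struct dax_operations *ops, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	if (ops->copy_from_iter)
		return ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
	return copy_from_iter_nocache(addr, bytes, i);	/* default */
}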

include/linux/string.h

Lines changed: 6 additions & 0 deletions
@@ -122,6 +122,12 @@ static inline __must_check int memcpy_mcsafe(void *dst, const void *src,
 	return 0;
 }
 #endif
+#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
+static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+	memcpy(dst, src, cnt);
+}
+#endif
 void *memchr_inv(const void *s, int c, size_t n);
 char *strreplace(char *s, char old, char new);

include/linux/uio.h

Lines changed: 15 additions & 0 deletions
@@ -95,6 +95,21 @@ size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
 bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
 size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+/*
+ * Note, users like pmem that depend on the stricter semantics of
+ * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for
+ * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
+ * destination is flushed from the cache on return.
+ */
+size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
+#else
+static inline size_t copy_from_iter_flushcache(void *addr, size_t bytes,
+				       struct iov_iter *i)
+{
+	return copy_from_iter_nocache(addr, bytes, i);
+}
+#endif
 bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i);
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
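As that comment warns, a successful return does not by itself imply the
destination was flushed from the cache; a caller must check the config
symbol. The pmem driver in this commit does exactly that at attach time:

	if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE)
			|| nvdimm_has_flush(nd_region) < 0)
		dev_warn(dev, "unable to guarantee persistence of writes\n");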

lib/Kconfig

Lines changed: 3 additions & 0 deletions
@@ -548,6 +548,9 @@ config ARCH_HAS_SG_CHAIN
 config ARCH_HAS_PMEM_API
 	bool
 
+config ARCH_HAS_UACCESS_FLUSHCACHE
+	bool
+
 config ARCH_HAS_MMIO_FLUSH
 	bool

lib/iov_iter.c

Lines changed: 22 additions & 0 deletions
@@ -615,6 +615,28 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(copy_from_iter_nocache);
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
+{
+	char *to = addr;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
+	iterate_and_advance(i, bytes, v,
+		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
+					 v.iov_base, v.iov_len),
+		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len),
+		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
+			 v.iov_len)
+	)
+
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(copy_from_iter_flushcache);
+#endif
+
 bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
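The "(to += v.iov_len) - v.iov_len" idiom above advances the destination
cursor while handing the pre-increment pointer to each copy, so each
iterator segment lands immediately after the previous one. A standalone
userspace illustration, with plain memcpy() standing in for the
flushcache helpers:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char dst[32];
	char *to = dst;

	/* each copy targets the old value of 'to'; afterwards 'to'
	 * points just past the bytes that were copied */
	memcpy((to += 16) - 16, "0123456789abcdef", 16);	/* dst + 0  */
	memcpy((to += 16) - 16, "ghijklmnopqrstuv", 16);	/* dst + 16 */
	printf("%.32s\n", dst);
	return 0;
}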
