2 * linux/fs/nfs/direct.c
4 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
6 * High-performance uncached I/O for the Linux NFS client
8 * There are important applications whose performance or correctness
9 * depends on uncached access to file data. Database clusters
10 * (multiple copies of the same instance running on separate hosts)
11 * implement their own cache coherency protocol that subsumes file
12 * system cache protocols. Applications that process datasets
13 * considerably larger than the client's memory do not always benefit
14 * from a local cache. A streaming video server, for instance, has no
15 * need to cache the contents of a file.
17 * When an application requests uncached I/O, all read and write requests
18 * are made directly to the server; data stored or fetched via these
19 * requests is not cached in the Linux page cache. The client does not
20 * correct unaligned requests from applications. All requested bytes are
21 * held on permanent storage before a direct write system call returns to the application.
24 * Solaris implements an uncached I/O facility called directio() that
25 * is used for backups and sequential I/O to very large files. Solaris
26 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
27 * an undocumented mount option.
29 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
30 * help from Andrew Morton.
32 * 18 Dec 2001 Initial implementation for 2.4 --cel
33 * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
34 * 08 Jun 2003 Port to 2.5 APIs --cel
38 #include <linux/config.h>
39 #include <linux/errno.h>
40 #include <linux/sched.h>
41 #include <linux/kernel.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/pagemap.h>
46 #include <linux/nfs_fs.h>
47 #include <linux/nfs_page.h>
48 #include <linux/sunrpc/clnt.h>
50 #include <asm/system.h>
51 #include <asm/uaccess.h>
/* NOTE(review): presumably selects the debug facility that gates the dprintk() calls in this file -- confirm against the sunrpc debug headers. */
53 #define NFSDBG_FACILITY NFSDBG_VFS
/* Number of bytes compared when two write verifiers are checked with memcmp(): two 32-bit words. */
54 #define VERF_SIZE (2 * sizeof(__u32))
/* Upper bound on the byte size of one segment accepted by nfs_get_user_pages(): 4096 shifted left by PAGE_SHIFT. */
55 #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT)
59 * nfs_get_user_pages - find and set up pages underlying user's buffer
60 * @rw: direction (read or write)
61 * @user_addr: starting address of this segment of user's buffer
62 * @size: size of this segment
63 * @pages: returned array of page struct pointers underlying user's buffer
/*
 * nfs_get_user_pages: look up the pages backing one segment of the
 * caller's buffer.
 *
 * NOTE(review): this listing is fragmentary -- the return type, the last
 * parameter, the braces and several statements are not visible -- so only
 * the visible lines are described here.
 *
 * rw:        compared against READ; the result of (rw == READ) is handed to
 *            get_user_pages() (presumably as its "write" flag -- confirm
 *            against the get_user_pages() prototype)
 * user_addr: starting address of the segment in the caller's address space
 * size:      byte length of the segment, checked against MAX_DIRECTIO_SIZE
 *
 * On the visible path *pages receives a kmalloc()ed array sized for
 * page_count page pointers, and result receives the value returned by
 * get_user_pages().
 */
66 nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
70 unsigned long page_count;
73 /* set an arbitrary limit to prevent arithmetic overflow */
74 if (size > MAX_DIRECTIO_SIZE)
/* pages spanned = (page index just past the last byte, rounded up) - (page index of the first byte) */
77 page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
78 page_count -= user_addr >> PAGE_SHIFT;
/* one struct page pointer per spanned page */
80 array_size = (page_count * sizeof(struct page *));
81 *pages = kmalloc(array_size, GFP_KERNEL);
/* hold the current task's mmap_sem for reading around the get_user_pages() call */
83 down_read(&current->mm->mmap_sem);
84 result = get_user_pages(current, current->mm, user_addr,
85 page_count, (rw == READ), 0,
87 up_read(&current->mm->mmap_sem);
93 * nfs_free_user_pages - tear down page struct array
94 * @pages: array of page struct pointers underlying target buffer
/*
 * nfs_free_user_pages: walk the first npages entries of the pages array
 * and release each one.
 *
 * NOTE(review): fragmentary listing -- the declaration of i, the braces and
 * whatever follows the loop are not visible. do_dirty presumably gates the
 * set_page_dirty_lock() call below; the conditional itself is missing from
 * this view, so confirm before relying on it.
 */
97 nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
100 for (i = 0; i < npages; i++) {
102 set_page_dirty_lock(pages[i]);
/* NOTE(review): presumably drops the reference obtained through get_user_pages() -- confirm */
103 page_cache_release(pages[i]);
109 * nfs_direct_read_seg - Read in one iov segment. Generate separate
110 * read RPCs for each "rsize" bytes.
111 * @inode: target inode
112 * @file: target file (may be NULL)
113 * @user_addr: starting address of this segment of user's buffer
114 * @count: size of this segment
115 * @file_offset: offset in file to begin the operation
116 * @pages: array of addresses of page structs defining user's buffer
117 * @nr_pages: size of pages array
/*
 * nfs_direct_read_seg: issue read requests for one iovec segment, each
 * request limited to the server's rsize.
 *
 * NOTE(review): fragmentary listing -- the return type, braces, the loop
 * opener, the error handling after the read call and the statements that
 * update count/tot_bytes are not visible. Comments describe only the
 * visible lines.
 */
120 nfs_direct_read_seg(struct inode *inode, struct file *file,
121 unsigned long user_addr, size_t count, loff_t file_offset,
122 struct page **pages, int nr_pages)
/* per-request byte cap taken from the server settings for this inode */
124 const unsigned int rsize = NFS_SERVER(inode)->rsize;
127 struct nfs_read_data rdata = {
131 .lockowner = current->files,
134 .fattr = &rdata.fattr,
/* initial position: the bits of user_addr selected by ~PAGE_MASK (offset inside the first page) and the caller's file offset */
138 rdata.args.pgbase = user_addr & ~PAGE_MASK;
139 rdata.args.offset = file_offset;
/* request the current value of count, clamped to rsize */
143 rdata.args.count = count;
144 if (rdata.args.count > rsize)
145 rdata.args.count = rsize;
/* this request starts at the page indexed by curpage */
146 rdata.args.pages = &pages[curpage];
148 dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
149 rdata.args.count, (long long) rdata.args.offset,
150 user_addr + tot_bytes, rdata.args.pgbase, curpage);
/* protocol-version-specific read operation */
153 result = NFS_PROTO(inode)->read(&rdata, file);
159 if (result == -EISDIR)
/* advance by the number of bytes returned: whole pages carry into curpage, the remainder stays in pgbase */
168 rdata.args.offset += result;
169 rdata.args.pgbase += result;
170 curpage += rdata.args.pgbase >> PAGE_SHIFT;
171 rdata.args.pgbase &= ~PAGE_MASK;
/* keep going until count reaches zero (the statement that reduces count is not visible here) */
173 } while (count != 0);
175 /* XXX: should we zero the rest of the user's buffer if we
182 * nfs_direct_read - For each iov segment, map the user's buffer
183 * then generate read RPCs.
184 * @inode: target inode
185 * @file: target file (may be NULL)
186 * @iov: array of vectors that define I/O buffer
187 * @file_offset: offset in file to begin the operation
188 * @nr_segs: size of iovec array
190 * generic_file_direct_IO has already pushed out any non-direct
191 * writes so that this read will see them when we read from the server.
/*
 * nfs_direct_read: walk the iovec array; for each segment, look up the
 * user pages, read into them with nfs_direct_read_seg(), then release them.
 *
 * NOTE(review): fragmentary listing -- the return type, local declarations
 * (result, page_count, pages), braces, the error branches and the final
 * return are not visible.
 */
195 nfs_direct_read(struct inode *inode, struct file *file,
196 const struct iovec *iov, loff_t file_offset,
197 unsigned long nr_segs)
199 ssize_t tot_bytes = 0;
200 unsigned long seg = 0;
/* stop after the last segment, or as soon as tot_bytes goes negative */
202 while ((seg < nr_segs) && (tot_bytes >= 0)) {
206 const struct iovec *vec = &iov[seg++];
207 unsigned long user_addr = (unsigned long) vec->iov_base;
208 size_t size = vec->iov_len;
210 page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
/* negative page_count: call the release helper with npages == 0 */
211 if (page_count < 0) {
212 nfs_free_user_pages(pages, 0, 0);
218 result = nfs_direct_read_seg(inode, file, user_addr, size,
219 file_offset, pages, page_count);
/* release with do_dirty == 1 */
221 nfs_free_user_pages(pages, page_count, 1);
/* the next segment continues where this one ended in the file */
229 file_offset += result;
238 * nfs_direct_write_seg - Write out one iov segment. Generate separate
239 * write RPCs for each "wsize" bytes, then commit.
240 * @inode: target inode
241 * @file: target file (may be NULL)
242 * @user_addr: starting address of this segment of user's buffer
243 * @count: size of this segment
244 * @file_offset: offset in file to begin the operation
245 * @pages: array of addresses of page structs defining user's buffer
246 * @nr_pages: size of pages array
/*
 * nfs_direct_write_seg: issue write requests for one iovec segment, each
 * limited to the server's wsize, then issue a commit for the bytes written.
 *
 * NOTE(review): fragmentary listing -- the return type, braces, labels,
 * the loop opener, the declarations of request/tot_bytes/curpage/result and
 * most branch bodies are not visible. Comments describe only the visible
 * lines; control flow between them is not shown.
 */
249 nfs_direct_write_seg(struct inode *inode, struct file *file,
250 unsigned long user_addr, size_t count, loff_t file_offset,
251 struct page **pages, int nr_pages)
/* per-request byte cap taken from the server settings for this inode */
253 const unsigned int wsize = NFS_SERVER(inode)->wsize;
/* holds a verifier copied from a write reply, for comparison against later replies */
258 struct nfs_writeverf first_verf;
259 struct nfs_write_data wdata = {
263 .lockowner = current->files,
266 .fattr = &wdata.fattr,
/* default to NFS_UNSTABLE; use NFS_FILE_SYNC for sync inodes, protocol version 2, or a segment no larger than wsize */
271 wdata.args.stable = NFS_UNSTABLE;
272 if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
273 wdata.args.stable = NFS_FILE_SYNC;
275 nfs_begin_data_update(inode);
/* initial position: the bits of user_addr selected by ~PAGE_MASK (offset inside the first page) and the caller's file offset */
281 wdata.args.pgbase = user_addr & ~PAGE_MASK;
282 wdata.args.offset = file_offset;
/* send the current value of request, clamped to wsize */
286 wdata.args.count = request;
287 if (wdata.args.count > wsize)
288 wdata.args.count = wsize;
/* this request starts at the page indexed by curpage */
289 wdata.args.pages = &pages[curpage];
291 dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
292 wdata.args.count, (long long) wdata.args.offset,
293 user_addr + tot_bytes, wdata.args.pgbase, curpage);
/* protocol-version-specific write operation */
296 result = NFS_PROTO(inode)->write(&wdata, file);
/* copy the reply's verifier into first_verf (the condition guarding this copy is not visible) */
306 memcpy(&first_verf.verifier, &wdata.verf.verifier,
/* reply not committed as NFS_FILE_SYNC: compare its verifier with the saved one */
308 if (wdata.verf.committed != NFS_FILE_SYNC) {
310 if (memcmp(&first_verf.verifier,
311 &wdata.verf.verifier, VERF_SIZE))
/* advance by the number of bytes written: whole pages carry into curpage, the remainder stays in pgbase */
316 wdata.args.offset += result;
317 wdata.args.pgbase += result;
318 curpage += wdata.args.pgbase >> PAGE_SHIFT;
319 wdata.args.pgbase &= ~PAGE_MASK;
/* keep going until request reaches zero (the statement that reduces request is not visible here) */
321 } while (request != 0);
324 * Commit data written so far, even in the event of an error
/* the commit covers tot_bytes bytes starting at the segment's original file offset */
329 wdata.args.count = tot_bytes;
330 wdata.args.offset = file_offset;
333 result = NFS_PROTO(inode)->commit(&wdata, file);
/* a failed commit or a verifier that differs from first_verf is tested here (the branch body is not visible) */
336 if (result < 0 || memcmp(&first_verf.verifier,
337 &wdata.verf.verifier,
341 nfs_end_data_update_defer(inode);
/* NOTE(review): presumably part of a retry path that resends with NFS_FILE_SYNC -- the surrounding label/goto is not visible; confirm */
346 wdata.args.stable = NFS_FILE_SYNC;
351 * nfs_direct_write - For each iov segment, map the user's buffer
352 * then generate write and commit RPCs.
353 * @inode: target inode
354 * @file: target file (may be NULL)
355 * @iov: array of vectors that define I/O buffer
356 * @file_offset: offset in file to begin the operation
357 * @nr_segs: size of iovec array
359 * Upon return, generic_file_direct_IO invalidates any cached pages
360 * that non-direct readers might access, so they will pick up these
361 * writes immediately.
/*
 * nfs_direct_write: walk the iovec array; for each segment, look up the
 * user pages, write them with nfs_direct_write_seg(), then release them.
 *
 * NOTE(review): fragmentary listing -- the return type, local declarations
 * (result, page_count, pages), braces, the error branches and the final
 * return are not visible.
 */
364 nfs_direct_write(struct inode *inode, struct file *file,
365 const struct iovec *iov, loff_t file_offset,
366 unsigned long nr_segs)
368 ssize_t tot_bytes = 0;
369 unsigned long seg = 0;
/* stop after the last segment, or as soon as tot_bytes goes negative */
371 while ((seg < nr_segs) && (tot_bytes >= 0)) {
375 const struct iovec *vec = &iov[seg++];
376 unsigned long user_addr = (unsigned long) vec->iov_base;
377 size_t size = vec->iov_len;
379 page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
/* negative page_count: call the release helper with npages == 0 */
380 if (page_count < 0) {
381 nfs_free_user_pages(pages, 0, 0);
387 result = nfs_direct_write_seg(inode, file, user_addr, size,
388 file_offset, pages, page_count);
/* release with do_dirty == 0 */
389 nfs_free_user_pages(pages, page_count, 0);
/* the next segment continues where this one ended in the file */
397 file_offset += result;
405 * nfs_direct_IO - NFS address space operation for direct I/O
406 * @rw: direction (read or write)
407 * @iocb: target I/O control block
408 * @iov: array of vectors that define I/O buffer
409 * @file_offset: offset in file to begin the operation
410 * @nr_segs: size of iovec array
412 * Usually a file system implements direct I/O by calling out to
413 * blockdev_direct_IO. The NFS client doesn't have a backing block
414 * device, so we do everything by hand instead.
416 * The inode's i_sem is no longer held by the VFS layer before it calls
417 * this function to do a write.
/*
 * nfs_direct_IO: entry point that resolves the file, dentry and inode from
 * the iocb, revalidates the inode, and hands the request to
 * nfs_direct_read() or nfs_direct_write().
 *
 * NOTE(review): fragmentary listing -- the return type, braces, the
 * construct that selects between the read and write paths (presumably keyed
 * on rw), the error exits and the final return are not visible.
 */
420 nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
421 loff_t file_offset, unsigned long nr_segs)
/* result starts out as -EINVAL */
423 ssize_t result = -EINVAL;
424 struct file *file = iocb->ki_filp;
425 struct dentry *dentry = file->f_dentry;
426 struct inode *inode = dentry->d_inode;
429 * No support for async yet
/* test for a non-synchronous iocb (the action taken is not visible) */
431 if (!is_sync_kiocb(iocb))
434 result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
/* read path: log the dentry name, offset and segment count, then read */
440 dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
441 dentry->d_name.name, file_offset, nr_segs);
443 result = nfs_direct_read(inode, file, iov,
444 file_offset, nr_segs);
/* write path: log the dentry name, offset and segment count, then write */
447 dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
448 dentry->d_name.name, file_offset, nr_segs);
450 result = nfs_direct_write(inode, file, iov,
451 file_offset, nr_segs);
458 dprintk("NFS: direct_IO result=%zd\n", result);