2 * linux/fs/nfs/direct.c
4 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
6 * High-performance uncached I/O for the Linux NFS client
8 * There are important applications whose performance or correctness
9 * depends on uncached access to file data. Database clusters
10 * (multiple copies of the same instance running on separate hosts)
11 * implement their own cache coherency protocol that subsumes file
12 * system cache protocols. Applications that process datasets
13 * considerably larger than the client's memory do not always benefit
14 * from a local cache. A streaming video server, for instance, has no
15 * need to cache the contents of a file.
17 * When an application requests uncached I/O, all read and write requests
18 * are made directly to the server; data stored or fetched via these
19 * requests is not cached in the Linux page cache. The client does not
20 * correct unaligned requests from applications. All requested bytes are
21 * held on permanent storage before a direct write system call returns to
24 * Solaris implements an uncached I/O facility called directio() that
25 * is used for backups and sequential I/O to very large files. Solaris
26 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
27 * an undocumented mount option.
29 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
30 * help from Andrew Morton.
32 * 18 Dec 2001 Initial implementation for 2.4 --cel
33 * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy
34 * 08 Jun 2003 Port to 2.5 APIs --cel
35 * 31 Mar 2004 Handle direct I/O without VFS support --cel
39 #include <linux/config.h>
40 #include <linux/errno.h>
41 #include <linux/sched.h>
42 #include <linux/kernel.h>
43 #include <linux/smp_lock.h>
44 #include <linux/file.h>
45 #include <linux/pagemap.h>
47 #include <linux/nfs_fs.h>
48 #include <linux/nfs_page.h>
49 #include <linux/sunrpc/clnt.h>
51 #include <asm/system.h>
52 #include <asm/uaccess.h>
/* Debug facility tag used by the dprintk()/dfprintk() calls in this file. */
54 #define NFSDBG_FACILITY NFSDBG_VFS
/* Size in bytes of an NFS write verifier: two 32-bit words. */
55 #define VERF_SIZE (2 * sizeof(__u32))
/* Arbitrary per-segment cap (4096 pages) so the page-count arithmetic in
 * nfs_get_user_pages() cannot overflow. */
56 #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT)
60 * nfs_get_user_pages - find and set up pages underlying user's buffer
61 * rw: direction (read or write)
62 * user_addr: starting address of this segment of user's buffer
63 * count: size of this segment
64 * @pages: returned array of page struct pointers underlying user's buffer
67 nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
71 unsigned long page_count;
74 /* set an arbitrary limit to prevent arithmetic overflow */
75 if (size > MAX_DIRECTIO_SIZE) {
80 page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
81 page_count -= user_addr >> PAGE_SHIFT;
83 array_size = (page_count * sizeof(struct page *));
84 *pages = kmalloc(array_size, GFP_KERNEL);
86 down_read(¤t->mm->mmap_sem);
87 result = get_user_pages(current, current->mm, user_addr,
88 page_count, (rw == READ), 0,
90 up_read(¤t->mm->mmap_sem);
96 * nfs_free_user_pages - tear down page struct array
97 * @pages: array of page struct pointers underlying target buffer
/*
 * @npages: number of entries in @pages to release
 * @do_dirty: nonzero when the pages received data (read direction) and
 *            must be marked dirty before release
 *
 * NOTE(review): this extract is elided - the "if (do_dirty)" guard
 * around set_page_dirty_lock() and the trailing kfree(pages) from the
 * canonical version are not visible here; confirm against the full file.
 */
100 nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
103 for (i = 0; i < npages; i++) {
/* tell the VM this page's contents were changed by direct I/O */
105 set_page_dirty_lock(pages[i]);
/* drop the reference taken by get_user_pages() */
106 page_cache_release(pages[i]);
112 * nfs_direct_read_seg - Read in one iov segment. Generate separate
113 * read RPCs for each "rsize" bytes.
114 * @inode: target inode
115 * @ctx: target file open context
116 * user_addr: starting address of this segment of user's buffer
117 * count: size of this segment
118 * file_offset: offset in file to begin the operation
119 * @pages: array of addresses of page structs defining user's buffer
120 * nr_pages: size of pages array
/*
 * Issues synchronous READ RPCs, at most rsize bytes each, walking the
 * pinned page array until the segment is exhausted.
 * NOTE(review): this extract is elided - the do-loop opening, the
 * result/tot_bytes bookkeeping and the error-exit paths are missing.
 */
123 nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx,
124 unsigned long user_addr, size_t count, loff_t file_offset,
125 struct page **pages, int nr_pages)
127 const unsigned int rsize = NFS_SERVER(inode)->rsize;
130 struct nfs_read_data rdata = {
/* fattr results land inside rdata itself */
138 .fattr = &rdata.fattr,
/* offset of the buffer within its first page */
142 rdata.args.pgbase = user_addr & ~PAGE_MASK;
143 rdata.args.offset = file_offset;
/* clamp each RPC to the server's advertised rsize */
147 rdata.args.count = count;
148 if (rdata.args.count > rsize)
149 rdata.args.count = rsize;
150 rdata.args.pages = &pages[curpage];
152 dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
153 rdata.args.count, (long long) rdata.args.offset,
154 user_addr + tot_bytes, rdata.args.pgbase, curpage);
/* synchronous READ RPC; blocks until the server replies */
157 result = NFS_PROTO(inode)->read(&rdata);
/* handling of the -EISDIR case is elided in this extract */
163 if (result == -EISDIR)
/* advance the cursor past the bytes just read; any pgbase overflow
 * into whole pages moves curpage forward */
172 rdata.args.offset += result;
173 rdata.args.pgbase += result;
174 curpage += rdata.args.pgbase >> PAGE_SHIFT;
175 rdata.args.pgbase &= ~PAGE_MASK;
177 } while (count != 0);
179 /* XXX: should we zero the rest of the user's buffer if we
186 * nfs_direct_read - For each iov segment, map the user's buffer
187 * then generate read RPCs.
188 * @inode: target inode
189 * @ctx: target file open context
190 * @iov: array of vectors that define I/O buffer
191 * file_offset: offset in file to begin the operation
192 * nr_segs: size of iovec array
194 * generic_file_direct_IO has already pushed out any non-direct
195 * writes so that this read will see them when we read from the
/*
 * Pins each iovec segment's pages, reads into them via
 * nfs_direct_read_seg(), then dirties and releases the pages.
 * NOTE(review): elided extract - declarations of pages/page_count/result,
 * loop braces and the short-read/return logic are not visible.
 */
199 nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
200 const struct iovec *iov, loff_t file_offset,
201 unsigned long nr_segs)
203 ssize_t tot_bytes = 0;
204 unsigned long seg = 0;
/* stop early once an error (negative tot_bytes) has been recorded */
206 while ((seg < nr_segs) && (tot_bytes >= 0)) {
210 const struct iovec *vec = &iov[seg++];
211 unsigned long user_addr = (unsigned long) vec->iov_base;
212 size_t size = vec->iov_len;
214 page_count = nfs_get_user_pages(READ, user_addr, size, &pages);
215 if (page_count < 0) {
/* nothing was pinned (npages == 0); this just disposes of the array */
216 nfs_free_user_pages(pages, 0, 0);
222 result = nfs_direct_read_seg(inode, ctx, user_addr, size,
223 file_offset, pages, page_count);
/* do_dirty = 1: the read wrote into these user pages */
225 nfs_free_user_pages(pages, page_count, 1);
/* advance the file position by the bytes consumed by this segment */
233 file_offset += result;
242 * nfs_direct_write_seg - Write out one iov segment. Generate separate
243 * write RPCs for each "wsize" bytes, then commit.
244 * @inode: target inode
245 * @ctx: target file open context
246 * user_addr: starting address of this segment of user's buffer
247 * count: size of this segment
248 * file_offset: offset in file to begin the operation
249 * @pages: array of addresses of page structs defining user's buffer
250 * nr_pages: size of pages array
/*
 * Issues synchronous WRITE RPCs, at most wsize bytes each, and follows
 * unstable writes with a COMMIT.  Write verifiers are compared across
 * RPCs to detect a server reboot mid-transfer.
 * NOTE(review): heavily elided extract - the do-loop opening, "request"
 * initialization, error paths, the retry label for a failed commit and
 * the final return are not visible here.
 */
253 nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
254 unsigned long user_addr, size_t count, loff_t file_offset,
255 struct page **pages, int nr_pages)
257 const unsigned int wsize = NFS_SERVER(inode)->wsize;
259 int curpage, need_commit, result, tot_bytes;
260 struct nfs_writeverf first_verf;
261 struct nfs_write_data wdata = {
269 .fattr = &wdata.fattr,
/* Default to UNSTABLE writes + later COMMIT; force FILE_SYNC when the
 * inode is O_SYNC, the server is NFSv2 (no UNSTABLE support), or the
 * whole segment fits in a single wsize RPC anyway. */
274 wdata.args.stable = NFS_UNSTABLE;
275 if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
276 wdata.args.stable = NFS_FILE_SYNC;
/* mark the inode as having data updates in flight */
278 nfs_begin_data_update(inode);
284 wdata.args.pgbase = user_addr & ~PAGE_MASK;
285 wdata.args.offset = file_offset;
/* clamp each RPC to the server's advertised wsize */
287 wdata.args.count = request;
288 if (wdata.args.count > wsize)
289 wdata.args.count = wsize;
290 wdata.args.pages = &pages[curpage];
292 dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
293 wdata.args.count, (long long) wdata.args.offset,
294 user_addr + tot_bytes, wdata.args.pgbase, curpage);
/* synchronous WRITE RPC */
297 result = NFS_PROTO(inode)->write(&wdata);
/* remember the first verifier (length argument elided in this extract,
 * presumably VERF_SIZE) so later replies can be compared against it */
307 memcpy(&first_verf.verifier, &wdata.verf.verifier,
309 if (wdata.verf.committed != NFS_FILE_SYNC) {
/* a verifier mismatch between unstable writes suggests the server
 * rebooted; the recovery branch is elided here */
311 if (memcmp(&first_verf.verifier,
312 &wdata.verf.verifier, VERF_SIZE))
/* advance the cursor past the bytes just written */
317 wdata.args.offset += result;
318 wdata.args.pgbase += result;
319 curpage += wdata.args.pgbase >> PAGE_SHIFT;
320 wdata.args.pgbase &= ~PAGE_MASK;
322 } while (request != 0);
325 * Commit data written so far, even in the event of an error
/* COMMIT covers the whole byte range written by this segment */
328 wdata.args.count = tot_bytes;
329 wdata.args.offset = file_offset;
332 result = NFS_PROTO(inode)->commit(&wdata);
/* commit failed or verifier changed: data may not be on stable storage */
335 if (result < 0 || memcmp(&first_verf.verifier,
336 &wdata.verf.verifier,
343 nfs_end_data_update_defer(inode);
/* presumably precedes a retry of the whole transfer with FILE_SYNC
 * writes after a failed commit - the retry jump is elided */
348 wdata.args.stable = NFS_FILE_SYNC;
353 * nfs_direct_write - For each iov segment, map the user's buffer
354 * then generate write and commit RPCs.
355 * @inode: target inode
356 * @ctx: target file open context
357 * @iov: array of vectors that define I/O buffer
358 * file_offset: offset in file to begin the operation
359 * nr_segs: size of iovec array
361 * Upon return, generic_file_direct_IO invalidates any cached pages
362 * that non-direct readers might access, so they will pick up these
363 * writes immediately.
/*
 * Pins each iovec segment's pages, writes them out via
 * nfs_direct_write_seg(), then releases the pages.
 * NOTE(review): elided extract - declarations of pages/page_count/result,
 * loop braces and the short-write/return logic are not visible.
 */
365 static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx,
366 const struct iovec *iov, loff_t file_offset,
367 unsigned long nr_segs)
369 ssize_t tot_bytes = 0;
370 unsigned long seg = 0;
/* stop early once an error (negative tot_bytes) has been recorded */
372 while ((seg < nr_segs) && (tot_bytes >= 0)) {
376 const struct iovec *vec = &iov[seg++];
377 unsigned long user_addr = (unsigned long) vec->iov_base;
378 size_t size = vec->iov_len;
380 page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages);
381 if (page_count < 0) {
/* nothing was pinned (npages == 0); this just disposes of the array */
382 nfs_free_user_pages(pages, 0, 0);
388 result = nfs_direct_write_seg(inode, ctx, user_addr, size,
389 file_offset, pages, page_count);
/* do_dirty = 0: the pages were only read from (source of the write) */
390 nfs_free_user_pages(pages, page_count, 0);
/* advance the file position by the bytes consumed by this segment */
398 file_offset += result;
406 * nfs_direct_IO - NFS address space operation for direct I/O
407 * rw: direction (read or write)
408 * @iocb: target I/O control block
409 * @iov: array of vectors that define I/O buffer
410 * file_offset: offset in file to begin the operation
411 * nr_segs: size of iovec array
/*
 * Address-space a_ops->direct_IO entry point: dispatches to
 * nfs_direct_read() or nfs_direct_write() based on rw.
 * NOTE(review): elided extract - the switch/case on rw and the final
 * return are not visible here; only the two dispatch arms remain.
 */
415 nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
416 loff_t file_offset, unsigned long nr_segs)
418 ssize_t result = -EINVAL;
419 struct file *file = iocb->ki_filp;
420 struct nfs_open_context *ctx;
421 struct dentry *dentry = file->f_dentry;
422 struct inode *inode = dentry->d_inode;
425 * No support for async yet
/* only synchronous kiocbs are handled (bail path elided) */
427 if (!is_sync_kiocb(iocb))
430 ctx = (struct nfs_open_context *)file->private_data;
433 dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
434 dentry->d_name.name, file_offset, nr_segs);
436 result = nfs_direct_read(inode, ctx, iov,
437 file_offset, nr_segs);
440 dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
441 dentry->d_name.name, file_offset, nr_segs);
443 result = nfs_direct_write(inode, ctx, iov,
444 file_offset, nr_segs);
453 * nfs_file_direct_read - file direct read operation for NFS files
454 * @iocb: target I/O control block
455 * @buf: user's buffer into which to read data
456 * count: number of bytes to read
457 * pos: byte offset in file where reading starts
459 * We use this function for direct reads instead of calling
460 * generic_file_aio_read() in order to avoid gfar's check to see if
461 * the request starts before the end of the file. For that check
462 * to work, we must generate a GETATTR before each direct read, and
463 * even then there is a window between the GETATTR and the subsequent
464 * READ where the file size could change. So our preference is simply
465 * to do all reads the application wants, and the server will take
466 * care of managing the end of file boundary.
468 * This function also eliminates unnecessarily updating the file's
469 * atime locally, as the NFS server sets the file's atime, and this
470 * client must read the updated atime from the server back into its
/*
 * NOTE(review): elided extract - the local "struct iovec iov"
 * declaration (referenced below), several early-return paths and the
 * final return are not visible here.
 */
474 nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
476 ssize_t retval = -EINVAL;
477 loff_t *ppos = &iocb->ki_pos;
478 struct file *file = iocb->ki_filp;
479 struct nfs_open_context *ctx =
480 (struct nfs_open_context *) file->private_data;
481 struct dentry *dentry = file->f_dentry;
482 struct address_space *mapping = file->f_mapping;
483 struct inode *inode = mapping->host;
489 dprintk("nfs: direct read(%s/%s, %lu@%lu)\n",
490 dentry->d_parent->d_name.name, dentry->d_name.name,
491 (unsigned long) count, (unsigned long) pos);
/* only synchronous kiocbs are handled (bail path elided) */
493 if (!is_sync_kiocb(iocb))
/* the read will write into the user buffer - verify it is writable */
498 if (!access_ok(VERIFY_WRITE, iov.iov_base, iov.iov_len))
/* flush any dirty cached pages so the direct read sees current data */
504 if (mapping->nrpages) {
505 retval = filemap_fdatawrite(mapping);
507 retval = filemap_fdatawait(mapping);
512 retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
/* advance the file position (the success guard is elided here) */
514 *ppos = pos + retval;
521 * nfs_file_direct_write - file direct write operation for NFS files
522 * @iocb: target I/O control block
523 * @buf: user's buffer from which to write data
524 * count: number of bytes to write
525 * pos: byte offset in file where writing starts
527 * We use this function for direct writes instead of calling
528 * generic_file_aio_write() in order to avoid taking the inode
529 * semaphore and updating the i_size. The NFS server will set
530 * the new i_size and this client must read the updated size
531 * back into its cache. We let the server do generic write
532 * parameter checking and report problems.
534 * We also avoid an unnecessary invocation of generic_osync_inode(),
535 * as it is fairly meaningless to sync the metadata of an NFS file.
537 * We eliminate local atime updates, see direct read above.
539 * We avoid unnecessary page cache invalidations for normal cached
540 * readers of this file.
542 * Note that O_APPEND is not supported for NFS direct writes, as there
543 * is no atomic O_APPEND write facility in the NFS protocol.
/*
 * NOTE(review): elided extract - the local "struct iovec iov"
 * initializer is only partially visible (.iov_base line below), and
 * several guard conditions / early returns are missing.
 */
546 nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
548 ssize_t retval = -EINVAL;
549 loff_t *ppos = &iocb->ki_pos;
/* per-process RLIMIT_FSIZE cap, enforced below */
550 unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
551 struct file *file = iocb->ki_filp;
552 struct nfs_open_context *ctx =
553 (struct nfs_open_context *) file->private_data;
554 struct dentry *dentry = file->f_dentry;
555 struct address_space *mapping = file->f_mapping;
556 struct inode *inode = mapping->host;
558 .iov_base = (char __user *)buf,
562 dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n",
563 dentry->d_parent->d_name.name, dentry->d_name.name,
564 inode->i_ino, (unsigned long) count, (unsigned long) pos);
/* only synchronous kiocbs are handled (bail path elided) */
566 if (!is_sync_kiocb(iocb))
/* the write reads from the user buffer - verify it is readable */
573 if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
/* report any previously recorded asynchronous write error */
576 retval = file->f_error;
/* enforce RLIMIT_FSIZE: raise SIGXFSZ when the start is past the limit
 * (that condition is elided), otherwise clamp count to fit */
581 if (limit != RLIM_INFINITY) {
583 send_sig(SIGXFSZ, current, 0);
586 if (count > limit - (unsigned long) pos)
587 count = limit - (unsigned long) pos;
/* flush dirty cached pages before overwriting them on the server */
593 if (mapping->nrpages) {
594 retval = filemap_fdatawrite(mapping);
596 retval = filemap_fdatawait(mapping);
601 retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
/* drop now-stale cached pages so cached readers see the new data */
602 if (mapping->nrpages)
603 invalidate_inode_pages2(mapping);
/* advance the file position (the success guard is elided here) */
605 *ppos = pos + retval;