X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmadvise.c;h=4e196155a0c3635801bd0f8dcff0b18ce1bbcf37;hb=34a75f0025b9cf803b6a88db032e6ad6950c9313;hp=81c4ea30c75e60b9001127697045d000ae075fd1;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git diff --git a/mm/madvise.c b/mm/madvise.c index 81c4ea30c..4e196155a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -7,52 +7,85 @@ #include #include - +#include +#include +#include /* * We can potentially split a vm area into separate * areas, each area with its own behavior. */ -static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, - unsigned long end, int behavior) +static long madvise_behavior(struct vm_area_struct * vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) { struct mm_struct * mm = vma->vm_mm; - int error; + int error = 0; + pgoff_t pgoff; + int new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + new_flags &= ~VM_DONTCOPY; + break; + } + + if (new_flags == vma->vm_flags) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; if (start != vma->vm_start) { error = split_vma(mm, vma, start, 1); if (error) - return -EAGAIN; + goto out; } if (end != vma->vm_end) { error = split_vma(mm, vma, end, 0); if (error) - return -EAGAIN; + goto out; } - spin_lock(&mm->page_table_lock); - VM_ClearReadHint(vma); - - switch (behavior) { - case MADV_SEQUENTIAL: - vma->vm_flags |= VM_SEQ_READ; - break; - case MADV_RANDOM: - vma->vm_flags |= VM_RAND_READ; - break; - default: - break; - } - spin_unlock(&mm->page_table_lock); +success: + /* + * vm_flags is protected by the mmap_sem held in write mode. + */ + vma->vm_flags = new_flags; - return 0; +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; } /* * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; @@ -60,6 +93,12 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; + if (file->f_mapping->a_ops->get_xip_page) { + /* no bad return value, but ignore advice */ + return 0; + } + + *prev = vma; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; @@ -90,49 +129,91 @@ static long madvise_willneed(struct vm_area_struct * vma, * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, unsigned long start, unsigned long end) { - struct zap_details details; - - if (vma->vm_flags & VM_LOCKED) + *prev = vma; + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - details.check_mapping = NULL; - details.nonlinear_vma = vma; - details.first_index = 0; - details.last_index = ULONG_MAX; + struct zap_details details = { + .nonlinear_vma = vma, + .last_index = ULONG_MAX, + }; zap_page_range(vma, start, end - start, &details); } else zap_page_range(vma, start, end - start, NULL); return 0; } -static long madvise_vma(struct vm_area_struct * vma, unsigned long start, - unsigned long end, int behavior) +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + if (!vma->vm_file || !vma->vm_file->f_mapping + || !vma->vm_file->f_mapping->host) { + return -EINVAL; + } + + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + return vmtruncate_range(mapping->host, offset, endoff); +} + +static long +madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) { - long error = -EBADF; + long error; switch (behavior) { + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + break; + } + case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: - error = madvise_behavior(vma, start, end, behavior); + error = madvise_behavior(vma, prev, start, end, behavior); + break; + case MADV_REMOVE: + error = madvise_remove(vma, start, end); break; case MADV_WILLNEED: - error = madvise_willneed(vma, start, end); + error = madvise_willneed(vma, prev, start, end); break; case MADV_DONTNEED: - error = madvise_dontneed(vma, start, end); + error = madvise_dontneed(vma, prev, start, end); break; default: error = -EINVAL; break; } - return error; } @@ -158,6 +239,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start, * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. * * return values: * zero - success @@ -170,18 +253,24 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) +asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) { - unsigned long end; - struct vm_area_struct * vma; + unsigned long end, tmp; + struct vm_area_struct * vma, *prev; int unmapped_error = 0; int error = -EINVAL; + size_t len; down_write(¤t->mm->mmap_sem); if (start & ~PAGE_MASK) goto out; - len = (len + ~PAGE_MASK) & PAGE_MASK; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + goto out; + end = start + len; if (end < start) goto out; @@ -193,40 +282,43 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. */ - vma = find_vma(current->mm, start); + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + for (;;) { /* Still start < end. */ error = -ENOMEM; if (!vma) goto out; - /* Here start < vma->vm_end. */ + /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; + if (start >= end) + goto out; } - /* Here vma->vm_start <= start < vma->vm_end. */ - if (end <= vma->vm_end) { - if (start < end) { - error = madvise_vma(vma, start, end, - behavior); - if (error) - goto out; - } - error = unmapped_error; - goto out; - } + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; - /* Here vma->vm_start <= start < vma->vm_end < end. */ - error = madvise_vma(vma, start, vma->vm_end, behavior); + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = madvise_vma(vma, &prev, start, tmp, behavior); if (error) goto out; - start = vma->vm_end; - vma = vma->vm_next; + start = tmp; + if (start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + goto out; + vma = prev->vm_next; } - out: up_write(¤t->mm->mmap_sem); return error;