[linux-2.6.git] mm/shmem.c
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *               2000 Transmeta Corp.
6  *               2000-2001 Christoph Rohland
7  *               2000-2001 SAP AG
8  *               2002 Red Hat Inc.
9  * Copyright (C) 2002-2003 Hugh Dickins.
10  * Copyright (C) 2002-2003 VERITAS Software Corporation.
11  * Copyright (C) 2004 Andi Kleen, SuSE Labs
12  *
13  * This file is released under the GPL.
14  */
15
16 /*
17  * This virtual memory filesystem is heavily based on the ramfs. It
18  * extends ramfs with the ability to use swap and to honor resource
19  * limits, which makes it a completely usable filesystem.
20  */
21
22 #include <linux/config.h>
23 #include <linux/module.h>
24 #include <linux/init.h>
25 #include <linux/devfs_fs_kernel.h>
26 #include <linux/fs.h>
27 #include <linux/mm.h>
28 #include <linux/mman.h>
29 #include <linux/file.h>
30 #include <linux/swap.h>
31 #include <linux/pagemap.h>
32 #include <linux/string.h>
33 #include <linux/slab.h>
34 #include <linux/backing-dev.h>
35 #include <linux/shmem_fs.h>
36 #include <linux/mount.h>
37 #include <linux/writeback.h>
38 #include <linux/vfs.h>
39 #include <linux/blkdev.h>
40 #include <linux/security.h>
41 #include <linux/swapops.h>
42 #include <linux/mempolicy.h>
43 #include <asm/uaccess.h>
44 #include <asm/div64.h>
45 #include <asm/pgtable.h>
46
47 /* This magic number is used in glibc for posix shared memory */
48 #define TMPFS_MAGIC     0x01021994
49
50 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
51 #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
52 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
53
54 #define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
55 #define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
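/*
 * Rough numbers, assuming 4K pages and SHMEM_NR_DIRECT == 16 (as in the
 * layout comment further down): with a 4-byte unsigned long,
 * ENTRIES_PER_PAGE is 1024, so SHMEM_MAX_INDEX = 16 + (1024*1024/2)*1025,
 * about 5.4e8 pages, and SHMEM_MAX_BYTES works out to just over 2TB; with
 * an 8-byte unsigned long the limit is roughly 256GB.
 */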
56
57 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
58
59 /* info->flags reuses VM_ flag bits to handle pagein/truncate races efficiently */
60 #define SHMEM_PAGEIN     VM_READ
61 #define SHMEM_TRUNCATE   VM_WRITE
62
63 /* Pretend that each entry is of this size in directory's i_size */
64 #define BOGO_DIRENT_SIZE 20
65
66 /* Keep swapped page count in private field of indirect struct page */
67 #define nr_swapped              private
68
69 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
70 enum sgp_type {
71         SGP_QUICK,      /* don't try more than file page cache lookup */
72         SGP_READ,       /* don't exceed i_size, don't allocate page */
73         SGP_CACHE,      /* don't exceed i_size, may allocate page */
74         SGP_WRITE,      /* may exceed i_size, may allocate page */
75 };
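/*
 * For orientation, the callers below pair up with these modes roughly as
 * follows: shmem_populate uses SGP_QUICK in its nonblocking case and
 * SGP_CACHE otherwise, shmem_nopage uses SGP_CACHE, do_shmem_file_read and
 * shmem_notify_change use SGP_READ, and the write paths
 * (shmem_prepare_write, shmem_file_write) use SGP_WRITE.
 */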
76
77 static int shmem_getpage(struct inode *inode, unsigned long idx,
78                          struct page **pagep, enum sgp_type sgp, int *type);
79
80 static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
81 {
82         /*
83          * The above definition of ENTRIES_PER_PAGE, and the use of
84          * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
85          * might be reconsidered if it ever diverges from PAGE_SIZE.
86          */
87         return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
88 }
89
90 static inline void shmem_dir_free(struct page *page)
91 {
92         __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
93 }
94
95 static struct page **shmem_dir_map(struct page *page)
96 {
97         return (struct page **)kmap_atomic(page, KM_USER0);
98 }
99
100 static inline void shmem_dir_unmap(struct page **dir)
101 {
102         kunmap_atomic(dir, KM_USER0);
103 }
104
105 static swp_entry_t *shmem_swp_map(struct page *page)
106 {
107         /*
108          * We have to avoid the unconditional inc_preempt_count()
109          * in kmap_atomic(), since shmem_swp_unmap() will also be
110          * applied to the low memory addresses within i_direct[].
111          * PageHighMem and high_memory tests are good for all arches
112          * and configs: highmem_start_page and FIXADDR_START are not.
113          */
114         return PageHighMem(page)?
115                 (swp_entry_t *)kmap_atomic(page, KM_USER1):
116                 (swp_entry_t *)page_address(page);
117 }
118
119 static inline void shmem_swp_unmap(swp_entry_t *entry)
120 {
121         if (entry >= (swp_entry_t *)high_memory)
122                 kunmap_atomic(entry, KM_USER1);
123 }
124
125 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
126 {
127         return sb->s_fs_info;
128 }
129
130 /*
131  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
132  * for shared memory and for shared anonymous (/dev/zero) mappings
133  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
134  * consistent with the pre-accounting of private mappings ...
135  */
136 static inline int shmem_acct_size(unsigned long flags, loff_t size)
137 {
138         return (flags & VM_ACCOUNT)?
139                 security_vm_enough_memory(VM_ACCT(size)): 0;
140 }
141
142 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
143 {
144         if (flags & VM_ACCOUNT)
145                 vm_unacct_memory(VM_ACCT(size));
146 }
147
148 /*
149  * ... whereas tmpfs objects are accounted incrementally as
150  * pages are allocated, in order to allow huge sparse files.
151  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
152  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
153  */
154 static inline int shmem_acct_block(unsigned long flags)
155 {
156         return (flags & VM_ACCOUNT)?
157                 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
158 }
159
160 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
161 {
162         if (!(flags & VM_ACCOUNT))
163                 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
164 }
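/*
 * To make the split concrete (assuming 4K pages): a 1MB object from
 * shmem_file_setup has VM_ACCOUNT set, so shmem_acct_size charges all
 * VM_ACCT(1MB) = 256 pages up front and shmem_acct_block then charges
 * nothing per page; a tmpfs file has VM_ACCOUNT clear, so nothing is
 * charged up front and shmem_acct_block charges one page at a time as
 * blocks are allocated, a failure which shmem_getpage reports as -ENOSPC
 * rather than -ENOMEM.
 */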
165
166 static struct super_operations shmem_ops;
167 static struct address_space_operations shmem_aops;
168 static struct file_operations shmem_file_operations;
169 static struct inode_operations shmem_inode_operations;
170 static struct inode_operations shmem_dir_inode_operations;
171 static struct vm_operations_struct shmem_vm_ops;
172
173 static struct backing_dev_info shmem_backing_dev_info = {
174         .ra_pages       = 0,    /* No readahead */
175         .memory_backed  = 1,    /* Does not contribute to dirty memory */
176         .unplug_io_fn = default_unplug_io_fn,
177 };
178
179 LIST_HEAD(shmem_inodes);
180 static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED;
181
182 static void shmem_free_block(struct inode *inode)
183 {
184         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
185         spin_lock(&sbinfo->stat_lock);
186         sbinfo->free_blocks++;
187         inode->i_blocks -= BLOCKS_PER_PAGE;
188         spin_unlock(&sbinfo->stat_lock);
189 }
190
191 /*
192  * shmem_recalc_inode - recalculate the size of an inode
193  *
194  * @inode: inode to recalc
195  *
196  * We have to calculate the free blocks since the mm can drop
197  * undirtied hole pages behind our back.
198  *
199  * Normally, info->alloced == inode->i_mapping->nrpages + info->swapped,
200  * so what the mm has freed is info->alloced - (nrpages + swapped).
201  *
202  * It has to be called with the spinlock held.
203  */
204 static void shmem_recalc_inode(struct inode *inode)
205 {
206         struct shmem_inode_info *info = SHMEM_I(inode);
207         long freed;
208
209         freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
210         if (freed > 0) {
211                 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
212                 info->alloced -= freed;
213                 spin_lock(&sbinfo->stat_lock);
214                 sbinfo->free_blocks += freed;
215                 inode->i_blocks -= freed*BLOCKS_PER_PAGE;
216                 spin_unlock(&sbinfo->stat_lock);
217                 shmem_unacct_blocks(info->flags, freed);
218         }
219 }
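/*
 * A quick numeric illustration of the recalculation above: if
 * info->alloced is 10 while nrpages is 5 and info->swapped is 2, the mm
 * must have reclaimed 3 clean hole pages behind our back, so 3 blocks go
 * back to sbinfo->free_blocks and are unaccounted.
 */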
220
221 /*
222  * shmem_swp_entry - find the swap vector position in the info structure
223  *
224  * @info:  info structure for the inode
225  * @index: index of the page to find
226  * @page:  optional page to add to the structure. Has to be preset to
227  *         all zeros
228  *
229  * If no space is allocated yet, it returns NULL when page is NULL;
230  * otherwise it uses the passed page for the needed block, setting
231  * *page to NULL on return to indicate that it has been consumed.
232  *
233  * The swap vector is organized the following way:
234  *
235  * There are SHMEM_NR_DIRECT entries directly stored in the
236  * shmem_inode_info structure. So small files do not need an additional
237  * allocation.
238  *
239  * For pages with index > SHMEM_NR_DIRECT there is the pointer
240  * i_indirect which points to a page which holds in the first half
241  * doubly indirect blocks, in the second half triple indirect blocks:
242  *
243  * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
244  * following layout (for SHMEM_NR_DIRECT == 16):
245  *
246  * i_indirect -> dir --> 16-19
247  *            |      +-> 20-23
248  *            |
249  *            +-->dir2 --> 24-27
250  *            |        +-> 28-31
251  *            |        +-> 32-35
252  *            |        +-> 36-39
253  *            |
254  *            +-->dir3 --> 40-43
255  *                     +-> 44-47
256  *                     +-> 48-51
257  *                     +-> 52-55
258  */
259 static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
260 {
261         unsigned long offset;
262         struct page **dir;
263         struct page *subdir;
264
265         if (index < SHMEM_NR_DIRECT)
266                 return info->i_direct+index;
267         if (!info->i_indirect) {
268                 if (page) {
269                         info->i_indirect = *page;
270                         *page = NULL;
271                 }
272                 return NULL;                    /* need another page */
273         }
274
275         index -= SHMEM_NR_DIRECT;
276         offset = index % ENTRIES_PER_PAGE;
277         index /= ENTRIES_PER_PAGE;
278         dir = shmem_dir_map(info->i_indirect);
279
280         if (index >= ENTRIES_PER_PAGE/2) {
281                 index -= ENTRIES_PER_PAGE/2;
282                 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
283                 index %= ENTRIES_PER_PAGE;
284                 subdir = *dir;
285                 if (!subdir) {
286                         if (page) {
287                                 *dir = *page;
288                                 *page = NULL;
289                         }
290                         shmem_dir_unmap(dir);
291                         return NULL;            /* need another page */
292                 }
293                 shmem_dir_unmap(dir);
294                 dir = shmem_dir_map(subdir);
295         }
296
297         dir += index;
298         subdir = *dir;
299         if (!subdir) {
300                 if (!page || !(subdir = *page)) {
301                         shmem_dir_unmap(dir);
302                         return NULL;            /* need a page */
303                 }
304                 *dir = subdir;
305                 *page = NULL;
306         }
307         shmem_dir_unmap(dir);
308
309         /*
310          * With apologies... caller shmem_swp_alloc passes non-NULL
311          * page (though perhaps NULL *page); and now we know that this
312          * indirect page has been allocated, we can shortcut the final
313          * kmap if we know it contains no swap entries, as is commonly
314          * the case: return pointer to a 0 which doesn't need kmapping.
315          */
316         return (page && !subdir->nr_swapped)?
317                 (swp_entry_t *)&subdir->nr_swapped:
318                 shmem_swp_map(subdir) + offset;
319 }
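#if 0	/*
	 * Not built: a minimal sketch of the lookup arithmetic above, with
	 * the artificial ENTRIES_PER_PAGE == 4 and SHMEM_NR_DIRECT == 16
	 * of the layout comment baked in.  The names are illustrative only.
	 */
static void swp_entry_path(unsigned long index, unsigned long *level,
			   unsigned long *dir_slot, unsigned long *offset)
{
	if (index < 16) {			/* entry is i_direct[index] */
		*level = 0;
		*dir_slot = index;
		*offset = 0;
		return;
	}
	index -= 16;
	*offset = index % 4;			/* slot within the entry page */
	index /= 4;
	if (index < 2) {			/* doubly indirect half */
		*level = 1;			/* page is at i_indirect[index] */
		*dir_slot = index;
	} else {				/* triply indirect half */
		index -= 2;
		*level = 2;			/* dir page is i_indirect[2 + index/4] */
		*dir_slot = index % 4;		/* slot within dir2/dir3 */
	}
}
/*
 * e.g. index 21 -> level 1, i_indirect[1], offset 1: the "20-23" block;
 *      index 30 -> level 2, dir2 slot 1, offset 2: the "28-31" block.
 */
#endif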
320
321 static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
322 {
323         long incdec = value? 1: -1;
324
325         entry->val = value;
326         info->swapped += incdec;
327         if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
328                 kmap_atomic_to_page(entry)->nr_swapped += incdec;
329 }
330
331 /*
332  * shmem_swp_alloc - get the position of the swap entry for the page.
333  *                   If it does not exist allocate the entry.
334  *
335  * @info:       info structure for the inode
336  * @index:      index of the page to find
337  * @sgp:        check and recheck i_size? skip allocation?
338  */
339 static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
340 {
341         struct inode *inode = &info->vfs_inode;
342         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
343         struct page *page = NULL;
344         swp_entry_t *entry;
345         static const swp_entry_t unswapped = { 0 };
346
347         if (sgp != SGP_WRITE &&
348             ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
349                 return ERR_PTR(-EINVAL);
350
351         while (!(entry = shmem_swp_entry(info, index, &page))) {
352                 if (sgp == SGP_READ)
353                         return (swp_entry_t *) &unswapped;
354                 /*
355                  * Test free_blocks against 1 not 0, since we have 1 data
356                  * page (and perhaps indirect index pages) yet to allocate:
357                  * a waste to allocate index if we cannot allocate data.
358                  */
359                 spin_lock(&sbinfo->stat_lock);
360                 if (sbinfo->free_blocks <= 1) {
361                         spin_unlock(&sbinfo->stat_lock);
362                         return ERR_PTR(-ENOSPC);
363                 }
364                 sbinfo->free_blocks--;
365                 inode->i_blocks += BLOCKS_PER_PAGE;
366                 spin_unlock(&sbinfo->stat_lock);
367
368                 spin_unlock(&info->lock);
369                 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
370                 if (page) {
371                         clear_highpage(page);
372                         page->nr_swapped = 0;
373                 }
374                 spin_lock(&info->lock);
375
376                 if (!page) {
377                         shmem_free_block(inode);
378                         return ERR_PTR(-ENOMEM);
379                 }
380                 if (sgp != SGP_WRITE &&
381                     ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
382                         entry = ERR_PTR(-EINVAL);
383                         break;
384                 }
385                 if (info->next_index <= index)
386                         info->next_index = index + 1;
387         }
388         if (page) {
389                 /* another task gave its page, or truncated the file */
390                 shmem_free_block(inode);
391                 shmem_dir_free(page);
392         }
393         if (info->next_index <= index && !IS_ERR(entry))
394                 info->next_index = index + 1;
395         return entry;
396 }
397
398 /*
399  * shmem_free_swp - free some swap entries in a directory
400  *
401  * @dir:   pointer to the directory
402  * @edir:  pointer after last entry of the directory
403  */
404 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
405 {
406         swp_entry_t *ptr;
407         int freed = 0;
408
409         for (ptr = dir; ptr < edir; ptr++) {
410                 if (ptr->val) {
411                         free_swap_and_cache(*ptr);
412                         *ptr = (swp_entry_t){0};
413                         freed++;
414                 }
415         }
416         return freed;
417 }
418
419 static void shmem_truncate(struct inode *inode)
420 {
421         struct shmem_inode_info *info = SHMEM_I(inode);
422         unsigned long idx;
423         unsigned long size;
424         unsigned long limit;
425         unsigned long stage;
426         struct page **dir;
427         struct page *subdir;
428         struct page *empty;
429         swp_entry_t *ptr;
430         int offset;
431         int freed;
432
433         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
434         idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
435         if (idx >= info->next_index)
436                 return;
437
438         spin_lock(&info->lock);
439         info->flags |= SHMEM_TRUNCATE;
440         limit = info->next_index;
441         info->next_index = idx;
442         if (info->swapped && idx < SHMEM_NR_DIRECT) {
443                 ptr = info->i_direct;
444                 size = limit;
445                 if (size > SHMEM_NR_DIRECT)
446                         size = SHMEM_NR_DIRECT;
447                 info->swapped -= shmem_free_swp(ptr+idx, ptr+size);
448         }
449         if (!info->i_indirect)
450                 goto done2;
451
452         BUG_ON(limit <= SHMEM_NR_DIRECT);
453         limit -= SHMEM_NR_DIRECT;
454         idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
455         offset = idx % ENTRIES_PER_PAGE;
456         idx -= offset;
457
458         empty = NULL;
459         dir = shmem_dir_map(info->i_indirect);
460         stage = ENTRIES_PER_PAGEPAGE/2;
461         if (idx < ENTRIES_PER_PAGEPAGE/2)
462                 dir += idx/ENTRIES_PER_PAGE;
463         else {
464                 dir += ENTRIES_PER_PAGE/2;
465                 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
466                 while (stage <= idx)
467                         stage += ENTRIES_PER_PAGEPAGE;
468                 if (*dir) {
469                         subdir = *dir;
470                         size = ((idx - ENTRIES_PER_PAGEPAGE/2) %
471                                 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
472                         if (!size && !offset) {
473                                 empty = subdir;
474                                 *dir = NULL;
475                         }
476                         shmem_dir_unmap(dir);
477                         dir = shmem_dir_map(subdir) + size;
478                 } else {
479                         offset = 0;
480                         idx = stage;
481                 }
482         }
483
484         for (; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
485                 if (unlikely(idx == stage)) {
486                         shmem_dir_unmap(dir-1);
487                         dir = shmem_dir_map(info->i_indirect) +
488                             ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
489                         while (!*dir) {
490                                 dir++;
491                                 idx += ENTRIES_PER_PAGEPAGE;
492                                 if (idx >= limit)
493                                         goto done1;
494                         }
495                         stage = idx + ENTRIES_PER_PAGEPAGE;
496                         subdir = *dir;
497                         *dir = NULL;
498                         shmem_dir_unmap(dir);
499                         if (empty) {
500                                 shmem_dir_free(empty);
501                                 shmem_free_block(inode);
502                         }
503                         empty = subdir;
504                         cond_resched_lock(&info->lock);
505                         dir = shmem_dir_map(subdir);
506                 }
507                 subdir = *dir;
508                 if (subdir && subdir->nr_swapped) {
509                         ptr = shmem_swp_map(subdir);
510                         size = limit - idx;
511                         if (size > ENTRIES_PER_PAGE)
512                                 size = ENTRIES_PER_PAGE;
513                         freed = shmem_free_swp(ptr+offset, ptr+size);
514                         shmem_swp_unmap(ptr);
515                         info->swapped -= freed;
516                         subdir->nr_swapped -= freed;
517                         BUG_ON(subdir->nr_swapped > offset);
518                 }
519                 if (offset)
520                         offset = 0;
521                 else if (subdir) {
522                         *dir = NULL;
523                         shmem_dir_free(subdir);
524                         shmem_free_block(inode);
525                 }
526         }
527 done1:
528         shmem_dir_unmap(dir-1);
529         if (empty) {
530                 shmem_dir_free(empty);
531                 shmem_free_block(inode);
532         }
533         if (info->next_index <= SHMEM_NR_DIRECT) {
534                 shmem_dir_free(info->i_indirect);
535                 info->i_indirect = NULL;
536                 shmem_free_block(inode);
537         }
538 done2:
539         BUG_ON(info->swapped > info->next_index);
540         if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
541                 /*
542                  * Call truncate_inode_pages again: racing shmem_unuse_inode
543                  * may have swizzled a page in from swap since vmtruncate or
544                  * generic_delete_inode did it, before we lowered next_index.
545                  * Also, though shmem_getpage checks i_size before adding to
546                  * cache, no recheck after: so fix the narrow window there too.
547                  */
548                 spin_unlock(&info->lock);
549                 truncate_inode_pages(inode->i_mapping, inode->i_size);
550                 spin_lock(&info->lock);
551         }
552         info->flags &= ~SHMEM_TRUNCATE;
553         shmem_recalc_inode(inode);
554         spin_unlock(&info->lock);
555 }
556
557 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
558 {
559         struct inode *inode = dentry->d_inode;
560         struct page *page = NULL;
561         int error;
562
563         if (attr->ia_valid & ATTR_SIZE) {
564                 if (attr->ia_size < inode->i_size) {
565                         /*
566                          * If truncating down to a partial page, then
567                          * if that page is already allocated, hold it
568                          * in memory until the truncation is over, so
569                          * truncate_partial_page cannot miss it if it
570                          * has been assigned to swap.
571                          */
572                         if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
573                                 (void) shmem_getpage(inode,
574                                         attr->ia_size>>PAGE_CACHE_SHIFT,
575                                                 &page, SGP_READ, NULL);
576                         }
577                         /*
578                          * Reset SHMEM_PAGEIN flag so that shmem_truncate can
579                          * detect if any pages might have been added to cache
580                          * after truncate_inode_pages.  But we needn't bother
581                          * if it's being fully truncated to zero-length: the
582                          * nrpages check is efficient enough in that case.
583                          */
584                         if (attr->ia_size) {
585                                 struct shmem_inode_info *info = SHMEM_I(inode);
586                                 spin_lock(&info->lock);
587                                 info->flags &= ~SHMEM_PAGEIN;
588                                 spin_unlock(&info->lock);
589                         }
590                 }
591         }
592
593         error = inode_change_ok(inode, attr);
594         if (!error)
595                 error = inode_setattr(inode, attr);
596         if (page)
597                 page_cache_release(page);
598         return error;
599 }
600
601 static void shmem_delete_inode(struct inode *inode)
602 {
603         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
604         struct shmem_inode_info *info = SHMEM_I(inode);
605
606         if (inode->i_op->truncate == shmem_truncate) {
607                 spin_lock(&shmem_ilock);
608                 list_del(&info->list);
609                 spin_unlock(&shmem_ilock);
610                 shmem_unacct_size(info->flags, inode->i_size);
611                 inode->i_size = 0;
612                 shmem_truncate(inode);
613         }
614         BUG_ON(inode->i_blocks);
615         spin_lock(&sbinfo->stat_lock);
616         sbinfo->free_inodes++;
617         spin_unlock(&sbinfo->stat_lock);
618         clear_inode(inode);
619 }
620
621 static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
622 {
623         swp_entry_t *ptr;
624
625         for (ptr = dir; ptr < edir; ptr++) {
626                 if (ptr->val == entry.val)
627                         return ptr - dir;
628         }
629         return -1;
630 }
631
632 static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
633 {
634         struct inode *inode;
635         unsigned long idx;
636         unsigned long size;
637         unsigned long limit;
638         unsigned long stage;
639         struct page **dir;
640         struct page *subdir;
641         swp_entry_t *ptr;
642         int offset;
643
644         idx = 0;
645         ptr = info->i_direct;
646         spin_lock(&info->lock);
647         limit = info->next_index;
648         size = limit;
649         if (size > SHMEM_NR_DIRECT)
650                 size = SHMEM_NR_DIRECT;
651         offset = shmem_find_swp(entry, ptr, ptr+size);
652         if (offset >= 0)
653                 goto found;
654         if (!info->i_indirect)
655                 goto lost2;
656         /* we might be racing with shmem_truncate */
657         if (limit <= SHMEM_NR_DIRECT)
658                 goto lost2;
659
660         dir = shmem_dir_map(info->i_indirect);
661         stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
662
663         for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
664                 if (unlikely(idx == stage)) {
665                         shmem_dir_unmap(dir-1);
666                         dir = shmem_dir_map(info->i_indirect) +
667                             ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
668                         while (!*dir) {
669                                 dir++;
670                                 idx += ENTRIES_PER_PAGEPAGE;
671                                 if (idx >= limit)
672                                         goto lost1;
673                         }
674                         stage = idx + ENTRIES_PER_PAGEPAGE;
675                         subdir = *dir;
676                         shmem_dir_unmap(dir);
677                         dir = shmem_dir_map(subdir);
678                 }
679                 subdir = *dir;
680                 if (subdir && subdir->nr_swapped) {
681                         ptr = shmem_swp_map(subdir);
682                         size = limit - idx;
683                         if (size > ENTRIES_PER_PAGE)
684                                 size = ENTRIES_PER_PAGE;
685                         offset = shmem_find_swp(entry, ptr, ptr+size);
686                         if (offset >= 0) {
687                                 shmem_dir_unmap(dir);
688                                 goto found;
689                         }
690                         shmem_swp_unmap(ptr);
691                 }
692         }
693 lost1:
694         shmem_dir_unmap(dir-1);
695 lost2:
696         spin_unlock(&info->lock);
697         return 0;
698 found:
699         idx += offset;
700         inode = &info->vfs_inode;
701         if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
702                 info->flags |= SHMEM_PAGEIN;
703                 shmem_swp_set(info, ptr + offset, 0);
704         }
705         shmem_swp_unmap(ptr);
706         spin_unlock(&info->lock);
707         /*
708          * Decrement swap count even when the entry is left behind:
709          * try_to_unuse will skip over mms, then reincrement count.
710          */
711         swap_free(entry);
712         return 1;
713 }
714
715 /*
716  * shmem_unuse() searches for a swapped-out shmem page matching this entry.
717  */
718 int shmem_unuse(swp_entry_t entry, struct page *page)
719 {
720         struct list_head *p;
721         struct shmem_inode_info *info;
722         int found = 0;
723
724         spin_lock(&shmem_ilock);
725         list_for_each(p, &shmem_inodes) {
726                 info = list_entry(p, struct shmem_inode_info, list);
727
728                 if (info->swapped && shmem_unuse_inode(info, entry, page)) {
729                         /* move head to start search for next from here */
730                         list_move_tail(&shmem_inodes, &info->list);
731                         found = 1;
732                         break;
733                 }
734         }
735         spin_unlock(&shmem_ilock);
736         return found;
737 }
738
739 /*
740  * Move the page from the page cache to the swap cache.
741  */
742 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
743 {
744         struct shmem_inode_info *info;
745         swp_entry_t *entry, swap;
746         struct address_space *mapping;
747         unsigned long index;
748         struct inode *inode;
749
750         BUG_ON(!PageLocked(page));
751         BUG_ON(page_mapped(page));
752
753         mapping = page->mapping;
754         index = page->index;
755         inode = mapping->host;
756         info = SHMEM_I(inode);
757         if (info->flags & VM_LOCKED)
758                 goto redirty;
759         swap = get_swap_page();
760         if (!swap.val)
761                 goto redirty;
762
763         spin_lock(&info->lock);
764         shmem_recalc_inode(inode);
765         if (index >= info->next_index) {
766                 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
767                 goto unlock;
768         }
769         entry = shmem_swp_entry(info, index, NULL);
770         BUG_ON(!entry);
771         BUG_ON(entry->val);
772
773         if (move_to_swap_cache(page, swap) == 0) {
774                 shmem_swp_set(info, entry, swap.val);
775                 shmem_swp_unmap(entry);
776                 spin_unlock(&info->lock);
777                 unlock_page(page);
778                 return 0;
779         }
780
781         shmem_swp_unmap(entry);
782 unlock:
783         spin_unlock(&info->lock);
784         swap_free(swap);
785 redirty:
786         set_page_dirty(page);
787         return WRITEPAGE_ACTIVATE;      /* Return with the page locked */
788 }
789
790 #ifdef CONFIG_NUMA
791 static struct page *shmem_swapin_async(struct shared_policy *p,
792                                        swp_entry_t entry, unsigned long idx)
793 {
794         struct page *page;
795         struct vm_area_struct pvma;
796
797         /* Create a pseudo vma that just contains the policy */
798         memset(&pvma, 0, sizeof(struct vm_area_struct));
799         pvma.vm_end = PAGE_SIZE;
800         pvma.vm_pgoff = idx;
801         pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
802         page = read_swap_cache_async(entry, &pvma, 0);
803         mpol_free(pvma.vm_policy);
804         return page;
805 }
806
807 struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
808                           unsigned long idx)
809 {
810         struct shared_policy *p = &info->policy;
811         int i, num;
812         struct page *page;
813         unsigned long offset;
814
815         num = valid_swaphandles(entry, &offset);
816         for (i = 0; i < num; offset++, i++) {
817                 page = shmem_swapin_async(p,
818                                 swp_entry(swp_type(entry), offset), idx);
819                 if (!page)
820                         break;
821                 page_cache_release(page);
822         }
823         lru_add_drain();        /* Push any new pages onto the LRU now */
824         return shmem_swapin_async(p, entry, idx);
825 }
826
827 static struct page *
828 shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
829                  unsigned long idx)
830 {
831         struct vm_area_struct pvma;
832         struct page *page;
833
834         memset(&pvma, 0, sizeof(struct vm_area_struct));
835         pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
836         pvma.vm_pgoff = idx;
837         pvma.vm_end = PAGE_SIZE;
838         page = alloc_page_vma(gfp, &pvma, 0);
839         mpol_free(pvma.vm_policy);
840         return page;
841 }
842 #else
843 static inline struct page *
844 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
845 {
846         swapin_readahead(entry, 0, NULL);
847         return read_swap_cache_async(entry, NULL, 0);
848 }
849
850 static inline struct page *
851 shmem_alloc_page(unsigned long gfp,struct shmem_inode_info *info,
852                                  unsigned long idx)
853 {
854         return alloc_page(gfp);
855 }
856 #endif
857
858 /*
859  * shmem_getpage - either get the page from swap or allocate a new one
860  *
861  * If we allocate a new one we do not mark it dirty. That's up to the
862  * vm. If we swap it in we mark it dirty, since we also free the swap
863  * entry: a page cannot live in both the swap cache and the page cache.
864  */
865 static int shmem_getpage(struct inode *inode, unsigned long idx,
866                         struct page **pagep, enum sgp_type sgp, int *type)
867 {
868         struct address_space *mapping = inode->i_mapping;
869         struct shmem_inode_info *info = SHMEM_I(inode);
870         struct shmem_sb_info *sbinfo;
871         struct page *filepage = *pagep;
872         struct page *swappage;
873         swp_entry_t *entry;
874         swp_entry_t swap;
875         int error, majmin = VM_FAULT_MINOR;
876
877         if (idx >= SHMEM_MAX_INDEX)
878                 return -EFBIG;
879         /*
880          * Normally, filepage is NULL on entry, and either found
881          * uptodate immediately, or allocated and zeroed, or read
882          * in under swappage, which is then assigned to filepage.
883          * But shmem_prepare_write passes in a locked filepage,
884          * which may be found not uptodate by other callers too,
885          * and may need to be copied from the swappage read in.
886          */
887 repeat:
888         if (!filepage)
889                 filepage = find_lock_page(mapping, idx);
890         if (filepage && PageUptodate(filepage))
891                 goto done;
892         error = 0;
893         if (sgp == SGP_QUICK)
894                 goto failed;
895
896         spin_lock(&info->lock);
897         shmem_recalc_inode(inode);
898         entry = shmem_swp_alloc(info, idx, sgp);
899         if (IS_ERR(entry)) {
900                 spin_unlock(&info->lock);
901                 error = PTR_ERR(entry);
902                 goto failed;
903         }
904         swap = *entry;
905
906         if (swap.val) {
907                 /* Look it up and read it in.. */
908                 swappage = lookup_swap_cache(swap);
909                 if (!swappage) {
910                         shmem_swp_unmap(entry);
911                         spin_unlock(&info->lock);
912                         /* here we actually do the io */
913                         if (majmin == VM_FAULT_MINOR && type)
914                                 inc_page_state(pgmajfault);
915                         majmin = VM_FAULT_MAJOR;
916                         swappage = shmem_swapin(info, swap, idx);
917                         if (!swappage) {
918                                 spin_lock(&info->lock);
919                                 entry = shmem_swp_alloc(info, idx, sgp);
920                                 if (IS_ERR(entry))
921                                         error = PTR_ERR(entry);
922                                 else {
923                                         if (entry->val == swap.val)
924                                                 error = -ENOMEM;
925                                         shmem_swp_unmap(entry);
926                                 }
927                                 spin_unlock(&info->lock);
928                                 if (error)
929                                         goto failed;
930                                 goto repeat;
931                         }
932                         wait_on_page_locked(swappage);
933                         page_cache_release(swappage);
934                         goto repeat;
935                 }
936
937                 /* We have to do this with page locked to prevent races */
938                 if (TestSetPageLocked(swappage)) {
939                         shmem_swp_unmap(entry);
940                         spin_unlock(&info->lock);
941                         wait_on_page_locked(swappage);
942                         page_cache_release(swappage);
943                         goto repeat;
944                 }
945                 if (PageWriteback(swappage)) {
946                         shmem_swp_unmap(entry);
947                         spin_unlock(&info->lock);
948                         wait_on_page_writeback(swappage);
949                         unlock_page(swappage);
950                         page_cache_release(swappage);
951                         goto repeat;
952                 }
953                 if (!PageUptodate(swappage)) {
954                         shmem_swp_unmap(entry);
955                         spin_unlock(&info->lock);
956                         unlock_page(swappage);
957                         page_cache_release(swappage);
958                         error = -EIO;
959                         goto failed;
960                 }
961
962                 if (filepage) {
963                         shmem_swp_set(info, entry, 0);
964                         shmem_swp_unmap(entry);
965                         delete_from_swap_cache(swappage);
966                         spin_unlock(&info->lock);
967                         copy_highpage(filepage, swappage);
968                         unlock_page(swappage);
969                         page_cache_release(swappage);
970                         flush_dcache_page(filepage);
971                         SetPageUptodate(filepage);
972                         set_page_dirty(filepage);
973                         swap_free(swap);
974                 } else if (!(error = move_from_swap_cache(
975                                 swappage, idx, mapping))) {
976                         info->flags |= SHMEM_PAGEIN;
977                         shmem_swp_set(info, entry, 0);
978                         shmem_swp_unmap(entry);
979                         spin_unlock(&info->lock);
980                         filepage = swappage;
981                         swap_free(swap);
982                 } else {
983                         shmem_swp_unmap(entry);
984                         spin_unlock(&info->lock);
985                         unlock_page(swappage);
986                         page_cache_release(swappage);
987                         if (error == -ENOMEM) {
988                                 /* let kswapd refresh zone for GFP_ATOMICs */
989                                 blk_congestion_wait(WRITE, HZ/50);
990                         }
991                         goto repeat;
992                 }
993         } else if (sgp == SGP_READ && !filepage) {
994                 shmem_swp_unmap(entry);
995                 filepage = find_get_page(mapping, idx);
996                 if (filepage &&
997                     (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
998                         spin_unlock(&info->lock);
999                         wait_on_page_locked(filepage);
1000                         page_cache_release(filepage);
1001                         filepage = NULL;
1002                         goto repeat;
1003                 }
1004                 spin_unlock(&info->lock);
1005         } else {
1006                 shmem_swp_unmap(entry);
1007                 sbinfo = SHMEM_SB(inode->i_sb);
1008                 spin_lock(&sbinfo->stat_lock);
1009                 if (sbinfo->free_blocks == 0 || shmem_acct_block(info->flags)) {
1010                         spin_unlock(&sbinfo->stat_lock);
1011                         spin_unlock(&info->lock);
1012                         error = -ENOSPC;
1013                         goto failed;
1014                 }
1015                 sbinfo->free_blocks--;
1016                 inode->i_blocks += BLOCKS_PER_PAGE;
1017                 spin_unlock(&sbinfo->stat_lock);
1018
1019                 if (!filepage) {
1020                         spin_unlock(&info->lock);
1021                         filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
1022                                                     info,
1023                                                     idx);
1024                         if (!filepage) {
1025                                 shmem_unacct_blocks(info->flags, 1);
1026                                 shmem_free_block(inode);
1027                                 error = -ENOMEM;
1028                                 goto failed;
1029                         }
1030
1031                         spin_lock(&info->lock);
1032                         entry = shmem_swp_alloc(info, idx, sgp);
1033                         if (IS_ERR(entry))
1034                                 error = PTR_ERR(entry);
1035                         else {
1036                                 swap = *entry;
1037                                 shmem_swp_unmap(entry);
1038                         }
1039                         if (error || swap.val || 0 != add_to_page_cache_lru(
1040                                         filepage, mapping, idx, GFP_ATOMIC)) {
1041                                 spin_unlock(&info->lock);
1042                                 page_cache_release(filepage);
1043                                 shmem_unacct_blocks(info->flags, 1);
1044                                 shmem_free_block(inode);
1045                                 filepage = NULL;
1046                                 if (error)
1047                                         goto failed;
1048                                 goto repeat;
1049                         }
1050                         info->flags |= SHMEM_PAGEIN;
1051                 }
1052
1053                 info->alloced++;
1054                 spin_unlock(&info->lock);
1055                 clear_highpage(filepage);
1056                 flush_dcache_page(filepage);
1057                 SetPageUptodate(filepage);
1058         }
1059 done:
1060         if (!*pagep) {
1061                 if (filepage) {
1062                         unlock_page(filepage);
1063                         *pagep = filepage;
1064                 } else
1065                         *pagep = ZERO_PAGE(0);
1066         }
1067         if (type)
1068                 *type = majmin;
1069         return 0;
1070
1071 failed:
1072         if (*pagep != filepage) {
1073                 unlock_page(filepage);
1074                 page_cache_release(filepage);
1075         }
1076         return error;
1077 }
1078
1079 struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1080 {
1081         struct inode *inode = vma->vm_file->f_dentry->d_inode;
1082         struct page *page = NULL;
1083         unsigned long idx;
1084         int error;
1085
1086         idx = (address - vma->vm_start) >> PAGE_SHIFT;
1087         idx += vma->vm_pgoff;
1088         idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1089
1090         error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
1091         if (error)
1092                 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1093
1094         mark_page_accessed(page);
1095         return page;
1096 }
1097
1098 static int shmem_populate(struct vm_area_struct *vma,
1099         unsigned long addr, unsigned long len,
1100         pgprot_t prot, unsigned long pgoff, int nonblock)
1101 {
1102         struct inode *inode = vma->vm_file->f_dentry->d_inode;
1103         struct mm_struct *mm = vma->vm_mm;
1104         enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1105         unsigned long size;
1106
1107         size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1108         if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
1109                 return -EINVAL;
1110
1111         while ((long) len > 0) {
1112                 struct page *page = NULL;
1113                 int err;
1114                 /*
1115                  * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
1116                  */
1117                 err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
1118                 if (err)
1119                         return err;
1120                 if (page) {
1121                         mark_page_accessed(page);
1122                         err = install_page(mm, vma, addr, page, prot);
1123                         if (err) {
1124                                 page_cache_release(page);
1125                                 return err;
1126                         }
1127                 } else if (nonblock) {
1128                         /*
1129                          * If a nonlinear mapping then store the file page
1130                          * offset in the pte.
1131                          */
1132                         if (pgoff != linear_page_index(vma, addr)) {
1133                                 err = install_file_pte(mm, vma, addr, pgoff, prot);
1134                                 if (err)
1135                                         return err;
1136                         }
1137                 }
1138
1139                 len -= PAGE_SIZE;
1140                 addr += PAGE_SIZE;
1141                 pgoff++;
1142         }
1143         return 0;
1144 }
1145
1146 #ifdef CONFIG_NUMA
1147 int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1148 {
1149         struct inode *i = vma->vm_file->f_dentry->d_inode;
1150         return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1151 }
1152
1153 struct mempolicy *
1154 shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1155 {
1156         struct inode *i = vma->vm_file->f_dentry->d_inode;
1157         unsigned long idx;
1158
1159         idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1160         return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1161 }
1162 #endif
1163
1164 int shmem_lock(struct file *file, int lock)
1165 {
1166         struct inode *inode = file->f_dentry->d_inode;
1167         struct shmem_inode_info *info = SHMEM_I(inode);
1168         struct mm_struct *mm = current->mm;
1169         unsigned long lock_limit, locked;
1170         int retval = -ENOMEM;
1171
1172         spin_lock(&info->lock);
1173         if (lock && !(info->flags & VM_LOCKED)) {
1174                 locked = inode->i_size >> PAGE_SHIFT;
1175                 locked += mm->locked_vm;
1176                 lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur;
1177                 lock_limit >>= PAGE_SHIFT;
1178                 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
1179                         goto out_nomem;
1180                 mm->locked_vm = locked;
1181         }
1182         if (!lock && (info->flags & VM_LOCKED) && mm) {
1183                 locked = inode->i_size >> PAGE_SHIFT;
1184                 mm->locked_vm -= locked;
1185         }
1186         if (lock)
1187                 info->flags |= VM_LOCKED;
1188         else
1189                 info->flags &= ~VM_LOCKED;
1190         retval = 0;
1191 out_nomem:
1192         spin_unlock(&info->lock);
1193         return retval;
1194 }
1195
1196 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1197 {
1198         file_accessed(file);
1199         vma->vm_ops = &shmem_vm_ops;
1200         return 0;
1201 }
1202
1203 static struct inode *
1204 shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1205 {
1206         struct inode *inode;
1207         struct shmem_inode_info *info;
1208         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1209
1210         spin_lock(&sbinfo->stat_lock);
1211         if (!sbinfo->free_inodes) {
1212                 spin_unlock(&sbinfo->stat_lock);
1213                 return NULL;
1214         }
1215         sbinfo->free_inodes--;
1216         spin_unlock(&sbinfo->stat_lock);
1217
1218         inode = new_inode(sb);
1219         if (inode) {
1220                 inode->i_mode = mode;
1221                 inode->i_uid = current->fsuid;
1222                 inode->i_gid = current->fsgid;
1223                 inode->i_blksize = PAGE_CACHE_SIZE;
1224                 inode->i_blocks = 0;
1225                 inode->i_mapping->a_ops = &shmem_aops;
1226                 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1227                 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1228                 info = SHMEM_I(inode);
1229                 memset(info, 0, (char *)inode - (char *)info);
1230                 spin_lock_init(&info->lock);
1231                 mpol_shared_policy_init(&info->policy);
1232                 switch (mode & S_IFMT) {
1233                 default:
1234                         init_special_inode(inode, mode, dev);
1235                         break;
1236                 case S_IFREG:
1237                         inode->i_op = &shmem_inode_operations;
1238                         inode->i_fop = &shmem_file_operations;
1239                         spin_lock(&shmem_ilock);
1240                         list_add_tail(&info->list, &shmem_inodes);
1241                         spin_unlock(&shmem_ilock);
1242                         break;
1243                 case S_IFDIR:
1244                         inode->i_nlink++;
1245                         /* Some things misbehave if size == 0 on a directory */
1246                         inode->i_size = 2 * BOGO_DIRENT_SIZE;
1247                         inode->i_op = &shmem_dir_inode_operations;
1248                         inode->i_fop = &simple_dir_operations;
1249                         break;
1250                 case S_IFLNK:
1251                         break;
1252                 }
1253         }
1254         return inode;
1255 }
1256
1257 static int shmem_set_size(struct shmem_sb_info *info,
1258                           unsigned long max_blocks, unsigned long max_inodes)
1259 {
1260         int error;
1261         unsigned long blocks, inodes;
1262
1263         spin_lock(&info->stat_lock);
1264         blocks = info->max_blocks - info->free_blocks;
1265         inodes = info->max_inodes - info->free_inodes;
1266         error = -EINVAL;
1267         if (max_blocks < blocks)
1268                 goto out;
1269         if (max_inodes < inodes)
1270                 goto out;
1271         error = 0;
1272         info->max_blocks  = max_blocks;
1273         info->free_blocks = max_blocks - blocks;
1274         info->max_inodes  = max_inodes;
1275         info->free_inodes = max_inodes - inodes;
1276 out:
1277         spin_unlock(&info->stat_lock);
1278         return error;
1279 }
1280
1281 #ifdef CONFIG_TMPFS
1282
1283 static struct inode_operations shmem_symlink_inode_operations;
1284 static struct inode_operations shmem_symlink_inline_operations;
1285
1286 /*
1287  * Normally tmpfs makes no use of shmem_prepare_write, but it
1288  * lets a tmpfs file be used read-write below the loop driver.
1289  */
1290 static int
1291 shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1292 {
1293         struct inode *inode = page->mapping->host;
1294         return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
1295 }
1296
1297 static ssize_t
1298 shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1299 {
1300         struct inode    *inode = file->f_dentry->d_inode;
1301         loff_t          pos;
1302         unsigned long   written;
1303         int             err;
1304
1305         if ((ssize_t) count < 0)
1306                 return -EINVAL;
1307
1308         if (!access_ok(VERIFY_READ, buf, count))
1309                 return -EFAULT;
1310
1311         down(&inode->i_sem);
1312
1313         pos = *ppos;
1314         written = 0;
1315
1316         err = generic_write_checks(file, &pos, &count, 0);
1317         if (err || !count)
1318                 goto out;
1319
1320         err = remove_suid(file->f_dentry);
1321         if (err)
1322                 goto out;
1323
1324         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1325
1326         do {
1327                 struct page *page = NULL;
1328                 unsigned long bytes, index, offset;
1329                 char *kaddr;
1330                 int left;
1331
1332                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1333                 index = pos >> PAGE_CACHE_SHIFT;
1334                 bytes = PAGE_CACHE_SIZE - offset;
1335                 if (bytes > count)
1336                         bytes = count;
1337
1338                 /*
1339                  * We don't hold page lock across copy from user -
1340                  * what would it guard against? - so no deadlock here.
1341                  * But it still may be a good idea to prefault below.
1342                  */
1343
1344                 err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1345                 if (err)
1346                         break;
1347
1348                 left = bytes;
1349                 if (PageHighMem(page)) {
1350                         volatile unsigned char dummy;
1351                         __get_user(dummy, buf);
1352                         __get_user(dummy, buf + bytes - 1);
1353
1354                         kaddr = kmap_atomic(page, KM_USER0);
1355                         left = __copy_from_user(kaddr + offset, buf, bytes);
1356                         kunmap_atomic(kaddr, KM_USER0);
1357                 }
1358                 if (left) {
1359                         kaddr = kmap(page);
1360                         left = __copy_from_user(kaddr + offset, buf, bytes);
1361                         kunmap(page);
1362                 }
1363
1364                 written += bytes;
1365                 count -= bytes;
1366                 pos += bytes;
1367                 buf += bytes;
1368                 if (pos > inode->i_size)
1369                         i_size_write(inode, pos);
1370
1371                 flush_dcache_page(page);
1372                 set_page_dirty(page);
1373                 mark_page_accessed(page);
1374                 page_cache_release(page);
1375
1376                 if (left) {
1377                         pos -= left;
1378                         written -= left;
1379                         err = -EFAULT;
1380                         break;
1381                 }
1382
1383                 /*
1384                  * Our dirty pages are not counted in nr_dirty,
1385                  * and we do not attempt to balance dirty pages.
1386                  */
1387
1388                 cond_resched();
1389         } while (count);
1390
1391         *ppos = pos;
1392         if (written)
1393                 err = written;
1394 out:
1395         up(&inode->i_sem);
1396         return err;
1397 }
1398
1399 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1400 {
1401         struct inode *inode = filp->f_dentry->d_inode;
1402         struct address_space *mapping = inode->i_mapping;
1403         unsigned long index, offset;
1404
1405         index = *ppos >> PAGE_CACHE_SHIFT;
1406         offset = *ppos & ~PAGE_CACHE_MASK;
1407
1408         for (;;) {
1409                 struct page *page = NULL;
1410                 unsigned long end_index, nr, ret;
1411                 loff_t i_size = i_size_read(inode);
1412
1413                 end_index = i_size >> PAGE_CACHE_SHIFT;
1414                 if (index > end_index)
1415                         break;
1416                 if (index == end_index) {
1417                         nr = i_size & ~PAGE_CACHE_MASK;
1418                         if (nr <= offset)
1419                                 break;
1420                 }
1421
1422                 desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
1423                 if (desc->error) {
1424                         if (desc->error == -EINVAL)
1425                                 desc->error = 0;
1426                         break;
1427                 }
1428
1429                 /*
1430                  * We must re-evaluate i_size after shmem_getpage, since reads
1431                  * (unlike writes) are called without i_sem protection against truncate
1432                  */
1433                 nr = PAGE_CACHE_SIZE;
1434                 i_size = i_size_read(inode);
1435                 end_index = i_size >> PAGE_CACHE_SHIFT;
1436                 if (index == end_index) {
1437                         nr = i_size & ~PAGE_CACHE_MASK;
1438                         if (nr <= offset) {
1439                                 page_cache_release(page);
1440                                 break;
1441                         }
1442                 }
1443                 nr -= offset;
1444
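                /*
                 * shmem_getpage may hand back ZERO_PAGE(0) for a hole when
                 * called with SGP_READ; the shared zero page needs neither a
                 * dcache flush nor an accessed mark.
                 */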
1445                 if (page != ZERO_PAGE(0)) {
1446                         /*
1447                          * If users can be writing to this page using arbitrary
1448                          * virtual addresses, take care about potential aliasing
1449                          * before reading the page on the kernel side.
1450                          */
1451                         if (mapping_writably_mapped(mapping))
1452                                 flush_dcache_page(page);
1453                         /*
1454                          * Mark the page accessed if we read the beginning.
1455                          */
1456                         if (!offset)
1457                                 mark_page_accessed(page);
1458                 }
1459
1460                 /*
1461                  * Ok, we have the page, and it's up-to-date, so
1462                  * now we can copy it to user space...
1463                  *
1464                  * The actor routine returns how many bytes were actually used.
1465                  * NOTE! This may not be the same as how much of a user buffer
1466                  * we filled up (we may be padding etc), so we can only update
1467                  * "pos" here (the actor routine has to update the user buffer
1468                  * pointers and the remaining count).
1469                  */
1470                 ret = actor(desc, page, offset, nr);
1471                 offset += ret;
1472                 index += offset >> PAGE_CACHE_SHIFT;
1473                 offset &= ~PAGE_CACHE_MASK;
1474
1475                 page_cache_release(page);
1476                 if (ret != nr || !desc->count)
1477                         break;
1478
1479                 cond_resched();
1480         }
1481
1482         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1483         file_accessed(filp);
1484 }
1485
1486 static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1487 {
1488         read_descriptor_t desc;
1489
1490         if ((ssize_t) count < 0)
1491                 return -EINVAL;
1492         if (!access_ok(VERIFY_WRITE, buf, count))
1493                 return -EFAULT;
1494         if (!count)
1495                 return 0;
1496
1497         desc.written = 0;
1498         desc.count = count;
1499         desc.buf = buf;
1500         desc.error = 0;
1501
1502         do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1503         if (desc.written)
1504                 return desc.written;
1505         return desc.error;
1506 }
1507
1508 static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1509                          size_t count, read_actor_t actor, void __user *target)
1510 {
1511         read_descriptor_t desc;
1512
1513         if (!count)
1514                 return 0;
1515
1516         desc.written = 0;
1517         desc.count = count;
1518         desc.buf = target;
1519         desc.error = 0;
1520
1521         do_shmem_file_read(in_file, ppos, &desc, actor);
1522         if (desc.written)
1523                 return desc.written;
1524         return desc.error;
1525 }
1526
1527 static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
1528 {
1529         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1530
1531         buf->f_type = TMPFS_MAGIC;
1532         buf->f_bsize = PAGE_CACHE_SIZE;
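        /* block counts below are in units of PAGE_CACHE_SIZE, i.e. f_bsize */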
1533         spin_lock(&sbinfo->stat_lock);
1534         buf->f_blocks = sbinfo->max_blocks;
1535         buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1536         buf->f_files = sbinfo->max_inodes;
1537         buf->f_ffree = sbinfo->free_inodes;
1538         spin_unlock(&sbinfo->stat_lock);
1539         buf->f_namelen = NAME_MAX;
1540         return 0;
1541 }
1542
1543 /*
1544  * File creation. Allocate an inode, and we're done.
1545  */
1546 static int
1547 shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1548 {
1549         struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1550         int error = -ENOSPC;
1551
1552         if (inode) {
1553                 if (dir->i_mode & S_ISGID) {
1554                         inode->i_gid = dir->i_gid;
1555                         if (S_ISDIR(mode))
1556                                 inode->i_mode |= S_ISGID;
1557                 }
1558                 dir->i_size += BOGO_DIRENT_SIZE;
1559                 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1560                 d_instantiate(dentry, inode);
1561                 dget(dentry); /* Extra count - pin the dentry in core */
1562                 error = 0;
1563         }
1564         return error;
1565 }
1566
1567 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1568 {
1569         int error;
1570
1571         if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1572                 return error;
1573         dir->i_nlink++;
1574         return 0;
1575 }
1576
1577 static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1578                 struct nameidata *nd)
1579 {
1580         return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1581 }
1582
1583 /*
1584  * Link a file.
1585  */
1586 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1587 {
1588         struct inode *inode = old_dentry->d_inode;
1589
1590         dir->i_size += BOGO_DIRENT_SIZE;
1591         inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1592         inode->i_nlink++;
1593         atomic_inc(&inode->i_count);    /* New dentry reference */
1594         dget(dentry);           /* Extra pinning count for the created dentry */
1595         d_instantiate(dentry, inode);
1596         return 0;
1597 }
1598
1599 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1600 {
1601         struct inode *inode = dentry->d_inode;
1602
1603         dir->i_size -= BOGO_DIRENT_SIZE;
1604         inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1605         inode->i_nlink--;
1606         dput(dentry);   /* Undo the count from "create" - this does all the work */
1607         return 0;
1608 }
1609
1610 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1611 {
1612         if (!simple_empty(dentry))
1613                 return -ENOTEMPTY;
1614
1615         dir->i_nlink--;
1616         return shmem_unlink(dir, dentry);
1617 }
1618
1619 /*
1620  * The VFS layer already does all the dentry stuff for rename,
1621  * we just have to decrement the usage count for the target if
1622  * it exists so that the VFS layer correctly frees it when it
1623  * gets overwritten.
1624  */
1625 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1626 {
1627         struct inode *inode = old_dentry->d_inode;
1628         int they_are_dirs = S_ISDIR(inode->i_mode);
1629
1630         if (!simple_empty(new_dentry))
1631                 return -ENOTEMPTY;
1632
1633         if (new_dentry->d_inode) {
1634                 (void) shmem_unlink(new_dir, new_dentry);
1635                 if (they_are_dirs)
1636                         old_dir->i_nlink--;
1637         } else if (they_are_dirs) {
1638                 old_dir->i_nlink--;
1639                 new_dir->i_nlink++;
1640         }
1641
1642         old_dir->i_size -= BOGO_DIRENT_SIZE;
1643         new_dir->i_size += BOGO_DIRENT_SIZE;
1644         old_dir->i_ctime = old_dir->i_mtime =
1645         new_dir->i_ctime = new_dir->i_mtime =
1646         inode->i_ctime = CURRENT_TIME;
1647         return 0;
1648 }
1649
1650 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1651 {
1652         int error;
1653         int len;
1654         struct inode *inode;
1655         struct page *page = NULL;
1656         char *kaddr;
1657         struct shmem_inode_info *info;
1658
1659         len = strlen(symname) + 1;
1660         if (len > PAGE_CACHE_SIZE)
1661                 return -ENAMETOOLONG;
1662
1663         inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1664         if (!inode)
1665                 return -ENOSPC;
1666
1667         info = SHMEM_I(inode);
1668         inode->i_size = len-1;
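        /*
         * A short target is stored inline, overlaying the shmem-specific
         * fields that precede the vfs_inode embedded in shmem_inode_info;
         * longer targets get a page of their own below.
         */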
1669         if (len <= (char *)inode - (char *)info) {
1670                 /* do it inline */
1671                 memcpy(info, symname, len);
1672                 inode->i_op = &shmem_symlink_inline_operations;
1673         } else {
1674                 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1675                 if (error) {
1676                         iput(inode);
1677                         return error;
1678                 }
1679                 inode->i_op = &shmem_symlink_inode_operations;
1680                 spin_lock(&shmem_ilock);
1681                 list_add_tail(&info->list, &shmem_inodes);
1682                 spin_unlock(&shmem_ilock);
1683                 kaddr = kmap_atomic(page, KM_USER0);
1684                 memcpy(kaddr, symname, len);
1685                 kunmap_atomic(kaddr, KM_USER0);
1686                 set_page_dirty(page);
1687                 page_cache_release(page);
1688         }
1689         if (dir->i_mode & S_ISGID)
1690                 inode->i_gid = dir->i_gid;
1691         dir->i_size += BOGO_DIRENT_SIZE;
1692         dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1693         d_instantiate(dentry, inode);
1694         dget(dentry);
1695         return 0;
1696 }
1697
1698 static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1699 {
1700         nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1701         return 0;
1702 }
1703
1704 static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1705 {
1706         struct page *page = NULL;
1707         int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1708         nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1709         return 0;
1710 }
1711
1712 static void shmem_put_link(struct dentry *dentry, struct nameidata *nd)
1713 {
1714         if (!IS_ERR(nd_get_link(nd))) {
1715                 struct page *page;
1716
1717                 page = find_get_page(dentry->d_inode->i_mapping, 0);
1718                 if (!page)
1719                         BUG();
1720                 kunmap(page);
1721                 mark_page_accessed(page);
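                /*
                 * Drop two references: the one find_get_page just took, and
                 * the one shmem_follow_link has held (with the page kmapped)
                 * since it looked the page up.
                 */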
1722                 page_cache_release(page);
1723                 page_cache_release(page);
1724         }
1725 }
1726
1727 static struct inode_operations shmem_symlink_inline_operations = {
1728         .readlink       = generic_readlink,
1729         .follow_link    = shmem_follow_link_inline,
1730 };
1731
1732 static struct inode_operations shmem_symlink_inode_operations = {
1733         .truncate       = shmem_truncate,
1734         .readlink       = generic_readlink,
1735         .follow_link    = shmem_follow_link,
1736         .put_link       = shmem_put_link,
1737 };
1738
1739 static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
1740 {
1741         char *this_char, *value, *rest;
1742
1743         while ((this_char = strsep(&options, ",")) != NULL) {
1744                 if (!*this_char)
1745                         continue;
1746                 if ((value = strchr(this_char,'=')) != NULL) {
1747                         *value++ = 0;
1748                 } else {
1749                         printk(KERN_ERR
1750                             "tmpfs: No value for mount option '%s'\n",
1751                             this_char);
1752                         return 1;
1753                 }
1754
1755                 if (!strcmp(this_char,"size")) {
1756                         unsigned long long size;
1757                         size = memparse(value,&rest);
1758                         if (*rest == '%') {
1759                                 size <<= PAGE_SHIFT;
1760                                 size *= totalram_pages;
1761                                 do_div(size, 100);
1762                                 rest++;
1763                         }
1764                         if (*rest)
1765                                 goto bad_val;
1766                         *blocks = size >> PAGE_CACHE_SHIFT;
1767                 } else if (!strcmp(this_char,"nr_blocks")) {
1768                         *blocks = memparse(value,&rest);
1769                         if (*rest)
1770                                 goto bad_val;
1771                 } else if (!strcmp(this_char,"nr_inodes")) {
1772                         *inodes = memparse(value,&rest);
1773                         if (*rest)
1774                                 goto bad_val;
1775                 } else if (!strcmp(this_char,"mode")) {
1776                         if (!mode)
1777                                 continue;
1778                         *mode = simple_strtoul(value,&rest,8);
1779                         if (*rest)
1780                                 goto bad_val;
1781                 } else if (!strcmp(this_char,"uid")) {
1782                         if (!uid)
1783                                 continue;
1784                         *uid = simple_strtoul(value,&rest,0);
1785                         if (*rest)
1786                                 goto bad_val;
1787                 } else if (!strcmp(this_char,"gid")) {
1788                         if (!gid)
1789                                 continue;
1790                         *gid = simple_strtoul(value,&rest,0);
1791                         if (*rest)
1792                                 goto bad_val;
1793                 } else {
1794                         printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1795                                this_char);
1796                         return 1;
1797                 }
1798         }
1799         return 0;
1800
1801 bad_val:
1802         printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
1803                value, this_char);
1804         return 1;
1805
1806 }
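/*
 * A usage sketch, illustrative rather than taken from this file: the
 * options parsed above correspond to mount invocations such as
 *
 *	mount -t tmpfs -o size=50%,nr_inodes=8192,mode=1777,uid=0,gid=0 tmpfs /mnt
 *
 * "size" takes the k/m/g suffixes handled by memparse(), or a trailing
 * '%' meaning a percentage of total RAM.
 */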
1807
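/*
 * Only the block and inode limits can be changed on remount: mode, uid
 * and gid are passed as NULL here and are therefore skipped by
 * shmem_parse_options().
 */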
1808 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1809 {
1810         struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1811         unsigned long max_blocks = sbinfo->max_blocks;
1812         unsigned long max_inodes = sbinfo->max_inodes;
1813
1814         if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
1815                 return -EINVAL;
1816         return shmem_set_size(sbinfo, max_blocks, max_inodes);
1817 }
1818 #endif
1819
1820 static int shmem_fill_super(struct super_block *sb,
1821                             void *data, int silent)
1822 {
1823         struct inode *inode;
1824         struct dentry *root;
1825         unsigned long blocks, inodes;
1826         int mode   = S_IRWXUGO | S_ISVTX;
1827         uid_t uid = current->fsuid;
1828         gid_t gid = current->fsgid;
1829         struct shmem_sb_info *sbinfo;
1830         int err = -ENOMEM;
1831
1832         sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
1833         if (!sbinfo)
1834                 return -ENOMEM;
1835         sb->s_fs_info = sbinfo;
1836         memset(sbinfo, 0, sizeof(struct shmem_sb_info));
1837
1838         /*
1839          * By default we only allow half of the physical RAM per
1840          * tmpfs instance
1841          */
1842         blocks = inodes = totalram_pages / 2;
1843
1844 #ifdef CONFIG_TMPFS
1845         if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, &inodes)) {
1846                 err = -EINVAL;
1847                 goto failed;
1848         }
1849 #else
1850         sb->s_flags |= MS_NOUSER;
1851 #endif
1852
1853         spin_lock_init(&sbinfo->stat_lock);
1854         sbinfo->max_blocks = blocks;
1855         sbinfo->free_blocks = blocks;
1856         sbinfo->max_inodes = inodes;
1857         sbinfo->free_inodes = inodes;
1858         sb->s_maxbytes = SHMEM_MAX_BYTES;
1859         sb->s_blocksize = PAGE_CACHE_SIZE;
1860         sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1861         sb->s_magic = TMPFS_MAGIC;
1862         sb->s_op = &shmem_ops;
1863         inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
1864         if (!inode)
1865                 goto failed;
1866         inode->i_uid = uid;
1867         inode->i_gid = gid;
1868         root = d_alloc_root(inode);
1869         if (!root)
1870                 goto failed_iput;
1871         sb->s_root = root;
1872         return 0;
1873
1874 failed_iput:
1875         iput(inode);
1876 failed:
1877         kfree(sbinfo);
1878         sb->s_fs_info = NULL;
1879         return err;
1880 }
1881
1882 static void shmem_put_super(struct super_block *sb)
1883 {
1884         kfree(sb->s_fs_info);
1885         sb->s_fs_info = NULL;
1886 }
1887
1888 static kmem_cache_t *shmem_inode_cachep;
1889
1890 static struct inode *shmem_alloc_inode(struct super_block *sb)
1891 {
1892         struct shmem_inode_info *p;
1893         p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
1894         if (!p)
1895                 return NULL;
1896         return &p->vfs_inode;
1897 }
1898
1899 static void shmem_destroy_inode(struct inode *inode)
1900 {
1901         mpol_free_shared_policy(&SHMEM_I(inode)->policy);
1902         kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
1903 }
1904
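/*
 * Slab constructor: initialize the embedded vfs_inode exactly once per
 * slab object; the flags test skips the work when the allocator is only
 * verifying an already-constructed object (SLAB_CTOR_VERIFY).
 */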
1905 static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
1906 {
1907         struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
1908
1909         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
1910             SLAB_CTOR_CONSTRUCTOR) {
1911                 inode_init_once(&p->vfs_inode);
1912         }
1913 }
1914
1915 static int init_inodecache(void)
1916 {
1917         shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
1918                                 sizeof(struct shmem_inode_info),
1919                                 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
1920                                 init_once, NULL);
1921         if (shmem_inode_cachep == NULL)
1922                 return -ENOMEM;
1923         return 0;
1924 }
1925
1926 static void destroy_inodecache(void)
1927 {
1928         if (kmem_cache_destroy(shmem_inode_cachep))
1929                 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
1930 }
1931
1932 static struct address_space_operations shmem_aops = {
1933         .writepage      = shmem_writepage,
1934         .set_page_dirty = __set_page_dirty_nobuffers,
1935 #ifdef CONFIG_TMPFS
1936         .prepare_write  = shmem_prepare_write,
1937         .commit_write   = simple_commit_write,
1938 #endif
1939 };
1940
1941 static struct file_operations shmem_file_operations = {
1942         .mmap           = shmem_mmap,
1943 #ifdef CONFIG_TMPFS
1944         .llseek         = generic_file_llseek,
1945         .read           = shmem_file_read,
1946         .write          = shmem_file_write,
1947         .fsync          = simple_sync_file,
1948         .sendfile       = shmem_file_sendfile,
1949 #endif
1950 };
1951
1952 static struct inode_operations shmem_inode_operations = {
1953         .truncate       = shmem_truncate,
1954         .setattr        = shmem_notify_change,
1955 };
1956
1957 static struct inode_operations shmem_dir_inode_operations = {
1958 #ifdef CONFIG_TMPFS
1959         .create         = shmem_create,
1960         .lookup         = simple_lookup,
1961         .link           = shmem_link,
1962         .unlink         = shmem_unlink,
1963         .symlink        = shmem_symlink,
1964         .mkdir          = shmem_mkdir,
1965         .rmdir          = shmem_rmdir,
1966         .mknod          = shmem_mknod,
1967         .rename         = shmem_rename,
1968 #endif
1969 };
1970
1971 static struct super_operations shmem_ops = {
1972         .alloc_inode    = shmem_alloc_inode,
1973         .destroy_inode  = shmem_destroy_inode,
1974 #ifdef CONFIG_TMPFS
1975         .statfs         = shmem_statfs,
1976         .remount_fs     = shmem_remount_fs,
1977 #endif
1978         .delete_inode   = shmem_delete_inode,
1979         .drop_inode     = generic_delete_inode,
1980         .put_super      = shmem_put_super,
1981 };
1982
1983 static struct vm_operations_struct shmem_vm_ops = {
1984         .nopage         = shmem_nopage,
1985         .populate       = shmem_populate,
1986 #ifdef CONFIG_NUMA
1987         .set_policy     = shmem_set_policy,
1988         .get_policy     = shmem_get_policy,
1989 #endif
1990 };
1991
1992 static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
1993         int flags, const char *dev_name, void *data)
1994 {
1995         return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
1996 }
1997
1998 static struct file_system_type tmpfs_fs_type = {
1999         .owner          = THIS_MODULE,
2000         .name           = "tmpfs",
2001         .get_sb         = shmem_get_sb,
2002         .kill_sb        = kill_litter_super,
2003 };
2004 static struct vfsmount *shm_mnt;
2005
2006 static int __init init_tmpfs(void)
2007 {
2008         int error;
2009
2010         error = init_inodecache();
2011         if (error)
2012                 goto out3;
2013
2014         error = register_filesystem(&tmpfs_fs_type);
2015         if (error) {
2016                 printk(KERN_ERR "Could not register tmpfs\n");
2017                 goto out2;
2018         }
2019 #ifdef CONFIG_TMPFS
2020         devfs_mk_dir("shm");
2021 #endif
2022         shm_mnt = kern_mount(&tmpfs_fs_type);
2023         if (IS_ERR(shm_mnt)) {
2024                 error = PTR_ERR(shm_mnt);
2025                 printk(KERN_ERR "Could not kern_mount tmpfs\n");
2026                 goto out1;
2027         }
2028
2029         /* The internal instance should not do size checking */
2030         shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
2031         return 0;
2032
2033 out1:
2034         unregister_filesystem(&tmpfs_fs_type);
2035 out2:
2036         destroy_inodecache();
2037 out3:
2038         shm_mnt = ERR_PTR(error);
2039         return error;
2040 }
2041 module_init(init_tmpfs)
2042
2043 /*
2044  * shmem_file_setup - get an unlinked file living in tmpfs
2045  *
2046  * @name: name for dentry (to be seen in /proc/<pid>/maps)
2047  * @size: size to be set for the file
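 * @flags: vm_flags for the mapping; VM_ACCOUNT here determines whether the
 *	size is charged up front by shmem_acct_size (see below)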
2048  *
2049  */
2050 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2051 {
2052         int error;
2053         struct file *file;
2054         struct inode *inode;
2055         struct dentry *dentry, *root;
2056         struct qstr this;
2057
2058         if (IS_ERR(shm_mnt))
2059                 return (void *)shm_mnt;
2060
2061         if (size > SHMEM_MAX_BYTES)
2062                 return ERR_PTR(-EINVAL);
2063
2064         if (shmem_acct_size(flags, size))
2065                 return ERR_PTR(-ENOMEM);
2066
2067         error = -ENOMEM;
2068         this.name = name;
2069         this.len = strlen(name);
2070         this.hash = 0; /* will go */
2071         root = shm_mnt->mnt_root;
2072         dentry = d_alloc(root, &this);
2073         if (!dentry)
2074                 goto put_memory;
2075
2076         error = -ENFILE;
2077         file = get_empty_filp();
2078         if (!file)
2079                 goto put_dentry;
2080
2081         error = -ENOSPC;
2082         inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2083         if (!inode)
2084                 goto close_file;
2085
2086         SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2087         d_instantiate(dentry, inode);
2088         inode->i_size = size;
2089         inode->i_nlink = 0;     /* It is unlinked */
2090         file->f_vfsmnt = mntget(shm_mnt);
2091         file->f_dentry = dentry;
2092         file->f_mapping = inode->i_mapping;
2093         file->f_op = &shmem_file_operations;
2094         file->f_mode = FMODE_WRITE | FMODE_READ;
2095         return file;
2096
2097 close_file:
2098         put_filp(file);
2099 put_dentry:
2100         dput(dentry);
2101 put_memory:
2102         shmem_unacct_size(flags, size);
2103         return ERR_PTR(error);
2104 }
2105
2106 /*
2107  * shmem_zero_setup - setup a shared anonymous mapping
2108  *
2109  * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff
2110  */
2111 int shmem_zero_setup(struct vm_area_struct *vma)
2112 {
2113         struct file *file;
2114         loff_t size = vma->vm_end - vma->vm_start;
2115
2116         file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2117         if (IS_ERR(file))
2118                 return PTR_ERR(file);
2119
2120         if (vma->vm_file)
2121                 fput(vma->vm_file);
2122         vma->vm_file = file;
2123         vma->vm_ops = &shmem_vm_ops;
2124         return 0;
2125 }
2126
2127 EXPORT_SYMBOL(shmem_file_setup);