From 6984c962b5265e56ece6ed6436f4cd540c7bc3cd Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Thu, 29 Jul 2004 20:30:16 +0000 Subject: [PATCH] Remove patch cruft --- arch/um/drivers/net_kern.c.orig | 870 ------------- arch/um/kernel/smp.c.orig | 302 ----- fs/Makefile.orig | 94 -- include/asm-um/pgtable.h.orig | 415 ------- include/asm-um/unistd.h.orig | 144 --- include/linux/gfp.h.orig | 128 -- include/linux/mm.h.orig | 729 ----------- mm/Makefile.orig | 17 - mm/mmap.c.orig | 1805 --------------------------- mm/mprotect.c.orig | 282 ----- mm/page_alloc.c.orig | 2013 ------------------------------- 11 files changed, 6799 deletions(-) delete mode 100644 arch/um/drivers/net_kern.c.orig delete mode 100644 arch/um/kernel/smp.c.orig delete mode 100644 fs/Makefile.orig delete mode 100644 include/asm-um/pgtable.h.orig delete mode 100644 include/asm-um/unistd.h.orig delete mode 100644 include/linux/gfp.h.orig delete mode 100644 include/linux/mm.h.orig delete mode 100644 mm/Makefile.orig delete mode 100644 mm/mmap.c.orig delete mode 100644 mm/mprotect.c.orig delete mode 100644 mm/page_alloc.c.orig diff --git a/arch/um/drivers/net_kern.c.orig b/arch/um/drivers/net_kern.c.orig deleted file mode 100644 index 4c2ee09b0..000000000 --- a/arch/um/drivers/net_kern.c.orig +++ /dev/null @@ -1,870 +0,0 @@ -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * Licensed under the GPL. - */ - -#include "linux/config.h" -#include "linux/kernel.h" -#include "linux/netdevice.h" -#include "linux/rtnetlink.h" -#include "linux/skbuff.h" -#include "linux/socket.h" -#include "linux/spinlock.h" -#include "linux/module.h" -#include "linux/init.h" -#include "linux/etherdevice.h" -#include "linux/list.h" -#include "linux/inetdevice.h" -#include "linux/ctype.h" -#include "linux/bootmem.h" -#include "user_util.h" -#include "kern_util.h" -#include "net_kern.h" -#include "net_user.h" -#include "mconsole_kern.h" -#include "init.h" -#include "irq_user.h" - -static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED; -LIST_HEAD(opened); - -static int uml_net_rx(struct net_device *dev) -{ - struct uml_net_private *lp = dev->priv; - int pkt_len; - struct sk_buff *skb; - - /* If we can't allocate memory, try again next round. 
*/ - if ((skb = dev_alloc_skb(dev->mtu)) == NULL) { - lp->stats.rx_dropped++; - return 0; - } - - skb->dev = dev; - skb_put(skb, dev->mtu); - skb->mac.raw = skb->data; - pkt_len = (*lp->read)(lp->fd, &skb, lp); - - if (pkt_len > 0) { - skb_trim(skb, pkt_len); - skb->protocol = (*lp->protocol)(skb); - netif_rx(skb); - - lp->stats.rx_bytes += skb->len; - lp->stats.rx_packets++; - return pkt_len; - } - - kfree_skb(skb); - return pkt_len; -} - -void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct net_device *dev = dev_id; - struct uml_net_private *lp = dev->priv; - int err; - - if(!netif_running(dev)) - return; - - spin_lock(&lp->lock); - while((err = uml_net_rx(dev)) > 0) ; - if(err < 0) { - printk(KERN_ERR - "Device '%s' read returned %d, shutting it down\n", - dev->name, err); - dev_close(dev); - goto out; - } - reactivate_fd(lp->fd, UM_ETH_IRQ); - - out: - spin_unlock(&lp->lock); -} - -static int uml_net_open(struct net_device *dev) -{ - struct uml_net_private *lp = dev->priv; - char addr[sizeof("255.255.255.255\0")]; - int err; - - spin_lock(&lp->lock); - - if(lp->fd >= 0){ - err = -ENXIO; - goto out; - } - - if(!lp->have_mac){ - dev_ip_addr(dev, addr, &lp->mac[2]); - set_ether_mac(dev, lp->mac); - } - - lp->fd = (*lp->open)(&lp->user); - if(lp->fd < 0){ - err = lp->fd; - goto out; - } - - err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt, - SA_INTERRUPT | SA_SHIRQ, dev->name, dev); - if(err != 0){ - printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err); - if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; - err = -ENETUNREACH; - } - - lp->tl.data = (unsigned long) &lp->user; - netif_start_queue(dev); - - spin_lock(&opened_lock); - list_add(&lp->list, &opened); - spin_unlock(&opened_lock); - MOD_INC_USE_COUNT; - out: - spin_unlock(&lp->lock); - return(err); -} - -static int uml_net_close(struct net_device *dev) -{ - struct uml_net_private *lp = dev->priv; - - netif_stop_queue(dev); - spin_lock(&lp->lock); - - free_irq(dev->irq, dev); - if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; - spin_lock(&opened_lock); - list_del(&lp->list); - spin_unlock(&opened_lock); - - MOD_DEC_USE_COUNT; - spin_unlock(&lp->lock); - return 0; -} - -static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct uml_net_private *lp = dev->priv; - unsigned long flags; - int len; - - netif_stop_queue(dev); - - spin_lock_irqsave(&lp->lock, flags); - - len = (*lp->write)(lp->fd, &skb, lp); - - if(len == skb->len) { - lp->stats.tx_packets++; - lp->stats.tx_bytes += skb->len; - dev->trans_start = jiffies; - netif_start_queue(dev); - - /* this is normally done in the interrupt when tx finishes */ - netif_wake_queue(dev); - } - else if(len == 0){ - netif_start_queue(dev); - lp->stats.tx_dropped++; - } - else { - netif_start_queue(dev); - printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len); - } - - spin_unlock_irqrestore(&lp->lock, flags); - - dev_kfree_skb(skb); - - return 0; -} - -static struct net_device_stats *uml_net_get_stats(struct net_device *dev) -{ - struct uml_net_private *lp = dev->priv; - return &lp->stats; -} - -static void uml_net_set_multicast_list(struct net_device *dev) -{ - if (dev->flags & IFF_PROMISC) return; - else if (dev->mc_count) dev->flags |= IFF_ALLMULTI; - else dev->flags &= ~IFF_ALLMULTI; -} - -static void uml_net_tx_timeout(struct net_device *dev) -{ - dev->trans_start = jiffies; - netif_wake_queue(dev); -} - -static int uml_net_set_mac(struct net_device *dev, void *addr) 
-{ - struct uml_net_private *lp = dev->priv; - struct sockaddr *hwaddr = addr; - - spin_lock(&lp->lock); - memcpy(dev->dev_addr, hwaddr->sa_data, ETH_ALEN); - spin_unlock(&lp->lock); - - return(0); -} - -static int uml_net_change_mtu(struct net_device *dev, int new_mtu) -{ - struct uml_net_private *lp = dev->priv; - int err = 0; - - spin_lock(&lp->lock); - - new_mtu = (*lp->set_mtu)(new_mtu, &lp->user); - if(new_mtu < 0){ - err = new_mtu; - goto out; - } - - dev->mtu = new_mtu; - - out: - spin_unlock(&lp->lock); - return err; -} - -static int uml_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - return(-EINVAL); -} - -void uml_net_user_timer_expire(unsigned long _conn) -{ -#ifdef undef - struct connection *conn = (struct connection *)_conn; - - dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn); - do_connect(conn); -#endif -} - -/* - * default do nothing hard header packet routines for struct net_device init. - * real ethernet transports will overwrite with real routines. - */ -static int uml_net_hard_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, void *daddr, void *saddr, unsigned len) -{ - return(0); /* no change */ -} - -static int uml_net_rebuild_header(struct sk_buff *skb) -{ - return(0); /* ignore */ -} - -static int uml_net_header_cache(struct neighbour *neigh, struct hh_cache *hh) -{ - return(-1); /* fail */ -} - -static void uml_net_header_cache_update(struct hh_cache *hh, - struct net_device *dev, unsigned char * haddr) -{ - /* ignore */ -} - -static int uml_net_header_parse(struct sk_buff *skb, unsigned char *haddr) -{ - return(0); /* nothing */ -} - -static spinlock_t devices_lock = SPIN_LOCK_UNLOCKED; -static struct list_head devices = LIST_HEAD_INIT(devices); - -static int eth_configure(int n, void *init, char *mac, - struct transport *transport) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - int err, size; - - size = transport->private_size + sizeof(struct uml_net_private) + - sizeof(((struct uml_net_private *) 0)->user); - - device = kmalloc(sizeof(*device), GFP_KERNEL); - if (device == NULL) { - printk(KERN_ERR "eth_configure failed to allocate uml_net\n"); - return(1); - } - - memset(device, 0, sizeof(*device)); - INIT_LIST_HEAD(&device->list); - device->index = n; - - spin_lock(&devices_lock); - list_add(&device->list, &devices); - spin_unlock(&devices_lock); - - if (setup_etheraddr(mac, device->mac)) - device->have_mac = 1; - - printk(KERN_INFO "Netdevice %d ", n); - if (device->have_mac) - printk("(%02x:%02x:%02x:%02x:%02x:%02x) ", - device->mac[0], device->mac[1], - device->mac[2], device->mac[3], - device->mac[4], device->mac[5]); - printk(": "); - dev = alloc_etherdev(size); - if (dev == NULL) { - printk(KERN_ERR "eth_configure: failed to allocate device\n"); - return 1; - } - - /* If this name ends up conflicting with an existing registered - * netdevice, that is OK, register_netdev{,ice}() will notice this - * and fail. 
- */ - snprintf(dev->name, sizeof(dev->name), "eth%d", n); - device->dev = dev; - - dev->hard_header = uml_net_hard_header; - dev->rebuild_header = uml_net_rebuild_header; - dev->hard_header_cache = uml_net_header_cache; - dev->header_cache_update= uml_net_header_cache_update; - dev->hard_header_parse = uml_net_header_parse; - - (*transport->kern->init)(dev, init); - - dev->mtu = transport->user->max_packet; - dev->open = uml_net_open; - dev->hard_start_xmit = uml_net_start_xmit; - dev->stop = uml_net_close; - dev->get_stats = uml_net_get_stats; - dev->set_multicast_list = uml_net_set_multicast_list; - dev->tx_timeout = uml_net_tx_timeout; - dev->set_mac_address = uml_net_set_mac; - dev->change_mtu = uml_net_change_mtu; - dev->do_ioctl = uml_net_ioctl; - dev->watchdog_timeo = (HZ >> 1); - dev->irq = UM_ETH_IRQ; - - rtnl_lock(); - err = register_netdevice(dev); - rtnl_unlock(); - if (err) { - device->dev = NULL; - /* XXX: should we call ->remove() here? */ - free_netdev(dev); - return 1; - } - lp = dev->priv; - - INIT_LIST_HEAD(&lp->list); - spin_lock_init(&lp->lock); - lp->dev = dev; - lp->fd = -1; - lp->mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0 }; - lp->have_mac = device->have_mac; - lp->protocol = transport->kern->protocol; - lp->open = transport->user->open; - lp->close = transport->user->close; - lp->remove = transport->user->remove; - lp->read = transport->kern->read; - lp->write = transport->kern->write; - lp->add_address = transport->user->add_address; - lp->delete_address = transport->user->delete_address; - lp->set_mtu = transport->user->set_mtu; - - init_timer(&lp->tl); - lp->tl.function = uml_net_user_timer_expire; - if (lp->have_mac) - memcpy(lp->mac, device->mac, sizeof(lp->mac)); - - if (transport->user->init) - (*transport->user->init)(&lp->user, dev); - - if (device->have_mac) - set_ether_mac(dev, device->mac); - return(0); -} - -static struct uml_net *find_device(int n) -{ - struct uml_net *device; - struct list_head *ele; - - spin_lock(&devices_lock); - list_for_each(ele, &devices){ - device = list_entry(ele, struct uml_net, list); - if(device->index == n) - goto out; - } - device = NULL; - out: - spin_unlock(&devices_lock); - return(device); -} - -static int eth_parse(char *str, int *index_out, char **str_out) -{ - char *end; - int n; - - n = simple_strtoul(str, &end, 0); - if(end == str){ - printk(KERN_ERR "eth_setup: Failed to parse '%s'\n", str); - return(1); - } - if(n < 0){ - printk(KERN_ERR "eth_setup: device %d is negative\n", n); - return(1); - } - str = end; - if(*str != '='){ - printk(KERN_ERR - "eth_setup: expected '=' after device number\n"); - return(1); - } - str++; - if(find_device(n)){ - printk(KERN_ERR "eth_setup: Device %d already configured\n", - n); - return(1); - } - if(index_out) *index_out = n; - *str_out = str; - return(0); -} - -struct eth_init { - struct list_head list; - char *init; - int index; -}; - -/* Filled in at boot time. Will need locking if the transports become - * modular. 
- */ -struct list_head transports = LIST_HEAD_INIT(transports); - -/* Filled in during early boot */ -struct list_head eth_cmd_line = LIST_HEAD_INIT(eth_cmd_line); - -static int check_transport(struct transport *transport, char *eth, int n, - void **init_out, char **mac_out) -{ - int len; - - len = strlen(transport->name); - if(strncmp(eth, transport->name, len)) - return(0); - - eth += len; - if(*eth == ',') - eth++; - else if(*eth != '\0') - return(0); - - *init_out = kmalloc(transport->setup_size, GFP_KERNEL); - if(*init_out == NULL) - return(1); - - if(!transport->setup(eth, mac_out, *init_out)){ - kfree(*init_out); - *init_out = NULL; - } - return(1); -} - -void register_transport(struct transport *new) -{ - struct list_head *ele, *next; - struct eth_init *eth; - void *init; - char *mac = NULL; - int match; - - list_add(&new->list, &transports); - - list_for_each_safe(ele, next, ð_cmd_line){ - eth = list_entry(ele, struct eth_init, list); - match = check_transport(new, eth->init, eth->index, &init, - &mac); - if(!match) - continue; - else if(init != NULL){ - eth_configure(eth->index, init, mac, new); - kfree(init); - } - list_del(ð->list); - } -} - -static int eth_setup_common(char *str, int index) -{ - struct list_head *ele; - struct transport *transport; - void *init; - char *mac = NULL; - - list_for_each(ele, &transports){ - transport = list_entry(ele, struct transport, list); - if(!check_transport(transport, str, index, &init, &mac)) - continue; - if(init != NULL){ - eth_configure(index, init, mac, transport); - kfree(init); - } - return(1); - } - return(0); -} - -static int eth_setup(char *str) -{ - struct eth_init *new; - int n, err; - - err = eth_parse(str, &n, &str); - if(err) return(1); - - new = alloc_bootmem(sizeof(new)); - if (new == NULL){ - printk("eth_init : alloc_bootmem failed\n"); - return(1); - } - - INIT_LIST_HEAD(&new->list); - new->index = n; - new->init = str; - - list_add_tail(&new->list, ð_cmd_line); - return(1); -} - -__setup("eth", eth_setup); -__uml_help(eth_setup, -"eth[0-9]+=,\n" -" Configure a network device.\n\n" -); - -static int eth_init(void) -{ - struct list_head *ele, *next; - struct eth_init *eth; - - list_for_each_safe(ele, next, ð_cmd_line){ - eth = list_entry(ele, struct eth_init, list); - - if(eth_setup_common(eth->init, eth->index)) - list_del(ð->list); - } - - return(1); -} - -__initcall(eth_init); - -static int net_config(char *str) -{ - int n, err; - - err = eth_parse(str, &n, &str); - if(err) return(err); - - str = uml_strdup(str); - if(str == NULL){ - printk(KERN_ERR "net_config failed to strdup string\n"); - return(-1); - } - err = !eth_setup_common(str, n); - if(err) - kfree(str); - return(err); -} - -static int net_remove(char *str) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - char *end; - int n; - - n = simple_strtoul(str, &end, 0); - if((*end != '\0') || (end == str)) - return(-1); - - device = find_device(n); - if(device == NULL) - return(0); - - dev = device->dev; - lp = dev->priv; - if(lp->fd > 0) return(-1); - if(lp->remove != NULL) (*lp->remove)(&lp->user); - unregister_netdev(dev); - - list_del(&device->list); - free_netdev(device); - return(0); -} - -static struct mc_device net_mc = { - .name = "eth", - .config = net_config, - .get_config = NULL, - .remove = net_remove, -}; - -static int uml_inetaddr_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct in_ifaddr *ifa = ptr; - u32 addr = ifa->ifa_address; - u32 netmask = ifa->ifa_mask; - struct net_device 
*dev = ifa->ifa_dev->dev; - struct uml_net_private *lp; - void (*proc)(unsigned char *, unsigned char *, void *); - unsigned char addr_buf[4], netmask_buf[4]; - - if(dev->open != uml_net_open) return(NOTIFY_DONE); - - lp = dev->priv; - - proc = NULL; - switch (event){ - case NETDEV_UP: - proc = lp->add_address; - break; - case NETDEV_DOWN: - proc = lp->delete_address; - break; - } - if(proc != NULL){ - addr_buf[0] = addr & 0xff; - addr_buf[1] = (addr >> 8) & 0xff; - addr_buf[2] = (addr >> 16) & 0xff; - addr_buf[3] = addr >> 24; - netmask_buf[0] = netmask & 0xff; - netmask_buf[1] = (netmask >> 8) & 0xff; - netmask_buf[2] = (netmask >> 16) & 0xff; - netmask_buf[3] = netmask >> 24; - (*proc)(addr_buf, netmask_buf, &lp->user); - } - return(NOTIFY_DONE); -} - -struct notifier_block uml_inetaddr_notifier = { - .notifier_call = uml_inetaddr_event, -}; - -static int uml_net_init(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - struct in_device *ip; - struct in_ifaddr *in; - - mconsole_register_dev(&net_mc); - register_inetaddr_notifier(¨_inetaddr_notifier); - - /* Devices may have been opened already, so the uml_inetaddr_notifier - * didn't get a chance to run for them. This fakes it so that - * addresses which have already been set up get handled properly. - */ - list_for_each(ele, &opened){ - lp = list_entry(ele, struct uml_net_private, list); - ip = lp->dev->ip_ptr; - if(ip == NULL) continue; - in = ip->ifa_list; - while(in != NULL){ - uml_inetaddr_event(NULL, NETDEV_UP, in); - in = in->ifa_next; - } - } - - return(0); -} - -__initcall(uml_net_init); - -static void close_devices(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - - list_for_each(ele, &opened){ - lp = list_entry(ele, struct uml_net_private, list); - if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); - if(lp->remove != NULL) (*lp->remove)(&lp->user); - } -} - -__uml_exitcall(close_devices); - -int setup_etheraddr(char *str, unsigned char *addr) -{ - char *end; - int i; - - if(str == NULL) - return(0); - for(i=0;i<6;i++){ - addr[i] = simple_strtoul(str, &end, 16); - if((end == str) || - ((*end != ':') && (*end != ',') && (*end != '\0'))){ - printk(KERN_ERR - "setup_etheraddr: failed to parse '%s' " - "as an ethernet address\n", str); - return(0); - } - str = end + 1; - } - if(addr[0] & 1){ - printk(KERN_ERR - "Attempt to assign a broadcast ethernet address to a " - "device disallowed\n"); - return(0); - } - return(1); -} - -void dev_ip_addr(void *d, char *buf, char *bin_buf) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - u32 addr; - - if((ip == NULL) || ((in = ip->ifa_list) == NULL)){ - printk(KERN_WARNING "dev_ip_addr - device not assigned an " - "IP address\n"); - return; - } - addr = in->ifa_address; - sprintf(buf, "%d.%d.%d.%d", addr & 0xff, (addr >> 8) & 0xff, - (addr >> 16) & 0xff, addr >> 24); - if(bin_buf){ - bin_buf[0] = addr & 0xff; - bin_buf[1] = (addr >> 8) & 0xff; - bin_buf[2] = (addr >> 16) & 0xff; - bin_buf[3] = addr >> 24; - } -} - -void set_ether_mac(void *d, unsigned char *addr) -{ - struct net_device *dev = d; - - memcpy(dev->dev_addr, addr, ETH_ALEN); -} - -struct sk_buff *ether_adjust_skb(struct sk_buff *skb, int extra) -{ - if((skb != NULL) && (skb_tailroom(skb) < extra)){ - struct sk_buff *skb2; - - skb2 = skb_copy_expand(skb, 0, extra, GFP_ATOMIC); - dev_kfree_skb(skb); - skb = skb2; - } - if(skb != NULL) skb_put(skb, extra); - return(skb); -} - -void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *, - 
void *), - void *arg) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - unsigned char address[4], netmask[4]; - - if(ip == NULL) return; - in = ip->ifa_list; - while(in != NULL){ - address[0] = in->ifa_address & 0xff; - address[1] = (in->ifa_address >> 8) & 0xff; - address[2] = (in->ifa_address >> 16) & 0xff; - address[3] = in->ifa_address >> 24; - netmask[0] = in->ifa_mask & 0xff; - netmask[1] = (in->ifa_mask >> 8) & 0xff; - netmask[2] = (in->ifa_mask >> 16) & 0xff; - netmask[3] = in->ifa_mask >> 24; - (*cb)(address, netmask, arg); - in = in->ifa_next; - } -} - -int dev_netmask(void *d, void *m) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - __u32 *mask_out = m; - - if(ip == NULL) - return(1); - - in = ip->ifa_list; - if(in == NULL) - return(1); - - *mask_out = in->ifa_mask; - return(0); -} - -void *get_output_buffer(int *len_out) -{ - void *ret; - - ret = (void *) __get_free_pages(GFP_KERNEL, 0); - if(ret) *len_out = PAGE_SIZE; - else *len_out = 0; - return(ret); -} - -void free_output_buffer(void *buffer) -{ - free_pages((unsigned long) buffer, 0); -} - -int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, - char **gate_addr) -{ - char *remain; - - remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL); - if(remain != NULL){ - printk("tap_setup_common - Extra garbage on specification : " - "'%s'\n", remain); - return(1); - } - - return(0); -} - -unsigned short eth_protocol(struct sk_buff *skb) -{ - return(eth_type_trans(skb, skb->dev)); -} - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/arch/um/kernel/smp.c.orig b/arch/um/kernel/smp.c.orig deleted file mode 100644 index 34f826c08..000000000 --- a/arch/um/kernel/smp.c.orig +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#include "linux/config.h" - -#ifdef CONFIG_SMP - -#include "linux/sched.h" -#include "linux/module.h" -#include "linux/threads.h" -#include "linux/interrupt.h" -#include "linux/err.h" -#include "asm/smp.h" -#include "asm/processor.h" -#include "asm/spinlock.h" -#include "asm/hardirq.h" -#include "user_util.h" -#include "kern_util.h" -#include "kern.h" -#include "irq_user.h" -#include "os.h" - -/* CPU online map, set by smp_boot_cpus */ -unsigned long cpu_online_map = cpumask_of_cpu(0); - -EXPORT_SYMBOL(cpu_online_map); - -/* Per CPU bogomips and other parameters - * The only piece used here is the ipi pipe, which is set before SMP is - * started and never changed. 
- */ -struct cpuinfo_um cpu_data[NR_CPUS]; - -spinlock_t um_bh_lock = SPIN_LOCK_UNLOCKED; - -atomic_t global_bh_count; - -/* Not used by UML */ -unsigned char global_irq_holder = NO_PROC_ID; -unsigned volatile long global_irq_lock; - -/* Set when the idlers are all forked */ -int smp_threads_ready = 0; - -/* A statistic, can be a little off */ -int num_reschedules_sent = 0; - -/* Small, random number, never changed */ -unsigned long cache_decay_ticks = 5; - -/* Not changed after boot */ -struct task_struct *idle_threads[NR_CPUS]; - -void smp_send_reschedule(int cpu) -{ - write(cpu_data[cpu].ipi_pipe[1], "R", 1); - num_reschedules_sent++; -} - -static void show(char * str) -{ - int cpu = smp_processor_id(); - - printk(KERN_INFO "\n%s, CPU %d:\n", str, cpu); -} - -#define MAXCOUNT 100000000 - -static inline void wait_on_bh(void) -{ - int count = MAXCOUNT; - do { - if (!--count) { - show("wait_on_bh"); - count = ~0; - } - /* nothing .. wait for the other bh's to go away */ - } while (atomic_read(&global_bh_count) != 0); -} - -/* - * This is called when we want to synchronize with - * bottom half handlers. We need to wait until - * no other CPU is executing any bottom half handler. - * - * Don't wait if we're already running in an interrupt - * context or are inside a bh handler. - */ -void synchronize_bh(void) -{ - if (atomic_read(&global_bh_count) && !in_interrupt()) - wait_on_bh(); -} - -void smp_send_stop(void) -{ - int i; - - printk(KERN_INFO "Stopping all CPUs..."); - for(i = 0; i < num_online_cpus(); i++){ - if(i == current->thread_info->cpu) - continue; - write(cpu_data[i].ipi_pipe[1], "S", 1); - } - printk("done\n"); -} - -static cpumask_t smp_commenced_mask; -static cpumask_t smp_callin_map = CPU_MASK_NONE; - -static int idle_proc(void *cpup) -{ - int cpu = (int) cpup, err; - - err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1); - if(err) - panic("CPU#%d failed to create IPI pipe, errno = %d", cpu, - -err); - - activate_ipi(cpu_data[cpu].ipi_pipe[0], - current->thread.mode.tt.extern_pid); - - wmb(); - if (cpu_test_and_set(cpu, &smp_callin_map)) { - printk("huh, CPU#%d already present??\n", cpu); - BUG(); - } - - while (!cpu_isset(cpu, &smp_commenced_mask)) - cpu_relax(); - - cpu_set(cpu, cpu_online_map); - default_idle(); - return(0); -} - -static struct task_struct *idle_thread(int cpu) -{ - struct task_struct *new_task; - unsigned char c; - - current->thread.request.u.thread.proc = idle_proc; - current->thread.request.u.thread.arg = (void *) cpu; - new_task = do_fork(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, NULL); - if(IS_ERR(new_task)) panic("do_fork failed in idle_thread"); - - cpu_tasks[cpu] = ((struct cpu_task) - { .pid = new_task->thread.mode.tt.extern_pid, - .task = new_task } ); - idle_threads[cpu] = new_task; - CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c, - sizeof(c)), - ({ panic("skas mode doesn't support SMP"); })); - return(new_task); -} - -void smp_prepare_cpus(unsigned int maxcpus) -{ - struct task_struct *idle; - unsigned long waittime; - int err, cpu; - - cpu_set(0, cpu_online_map); - cpu_set(0, smp_callin_map); - - err = os_pipe(cpu_data[0].ipi_pipe, 1, 1); - if(err) panic("CPU#0 failed to create IPI pipe, errno = %d", -err); - - activate_ipi(cpu_data[0].ipi_pipe[0], - current->thread.mode.tt.extern_pid); - - for(cpu = 1; cpu < ncpus; cpu++){ - printk("Booting processor %d...\n", cpu); - - idle = idle_thread(cpu); - - init_idle(idle, cpu); - unhash_process(idle); - - waittime = 200000000; - while (waittime-- && !cpu_isset(cpu, smp_callin_map)) - 
cpu_relax(); - - if (cpu_isset(cpu, smp_callin_map)) - printk("done\n"); - else printk("failed\n"); - } -} - -void smp_prepare_boot_cpu(void) -{ - cpu_set(smp_processor_id(), cpu_online_map); -} - -int __cpu_up(unsigned int cpu) -{ - cpu_set(cpu, smp_commenced_mask); - while (!cpu_isset(cpu, cpu_online_map)) - mb(); - return(0); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - printk(KERN_INFO "setup_profiling_timer\n"); - return(0); -} - -void smp_call_function_slave(int cpu); - -void IPI_handler(int cpu) -{ - unsigned char c; - int fd; - - fd = cpu_data[cpu].ipi_pipe[0]; - while (read(fd, &c, 1) == 1) { - switch (c) { - case 'C': - smp_call_function_slave(cpu); - break; - - case 'R': - set_tsk_need_resched(current); - break; - - case 'S': - printk("CPU#%d stopping\n", cpu); - while(1) - pause(); - break; - - default: - printk("CPU#%d received unknown IPI [%c]!\n", cpu, c); - break; - } - } -} - -int hard_smp_processor_id(void) -{ - return(pid_to_processor_id(os_getpid())); -} - -static spinlock_t call_lock = SPIN_LOCK_UNLOCKED; -static atomic_t scf_started; -static atomic_t scf_finished; -static void (*func)(void *info); -static void *info; - -void smp_call_function_slave(int cpu) -{ - atomic_inc(&scf_started); - (*func)(info); - atomic_inc(&scf_finished); -} - -int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, - int wait) -{ - int cpus = num_online_cpus() - 1; - int i; - - if (!cpus) - return 0; - - spin_lock_bh(&call_lock); - atomic_set(&scf_started, 0); - atomic_set(&scf_finished, 0); - func = _func; - info = _info; - - for (i=0;ithread_info->cpu) && - cpu_isset(i, cpu_online_map)) - write(cpu_data[i].ipi_pipe[1], "C", 1); - - while (atomic_read(&scf_started) != cpus) - barrier(); - - if (wait) - while (atomic_read(&scf_finished) != cpus) - barrier(); - - spin_unlock_bh(&call_lock); - return 0; -} - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/fs/Makefile.orig b/fs/Makefile.orig deleted file mode 100644 index a288c0cb3..000000000 --- a/fs/Makefile.orig +++ /dev/null @@ -1,94 +0,0 @@ -# -# Makefile for the Linux filesystems. -# -# 14 Sep 2000, Christoph Hellwig -# Rewritten to use lists instead of if-statements. 
-# - -obj-y := open.o read_write.o file_table.o buffer.o \ - bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ - namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ - dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \ - filesystems.o namespace.o seq_file.o xattr.o libfs.o \ - fs-writeback.o mpage.o direct-io.o aio.o - -obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_COMPAT) += compat.o - -nfsd-$(CONFIG_NFSD) := nfsctl.o -obj-y += $(nfsd-y) $(nfsd-m) - -obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o -obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o -obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o - -# binfmt_script is always there -obj-y += binfmt_script.o - -obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o -obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o -obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o - -obj-$(CONFIG_FS_MBCACHE) += mbcache.o -obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o - -obj-$(CONFIG_QUOTA) += dquot.o -obj-$(CONFIG_QFMT_V1) += quota_v1.o -obj-$(CONFIG_QFMT_V2) += quota_v2.o -obj-$(CONFIG_QUOTACTL) += quota.o - -obj-$(CONFIG_PROC_FS) += proc/ -obj-y += partitions/ -obj-$(CONFIG_SYSFS) += sysfs/ -obj-y += devpts/ - -obj-$(CONFIG_PROFILING) += dcookies.o - -# Do not add any filesystems before this line -obj-$(CONFIG_REISERFS_FS) += reiserfs/ -obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_JBD) += jbd/ -obj-$(CONFIG_EXT2_FS) += ext2/ -obj-$(CONFIG_CRAMFS) += cramfs/ -obj-$(CONFIG_RAMFS) += ramfs/ -obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ -obj-$(CONFIG_CODA_FS) += coda/ -obj-$(CONFIG_INTERMEZZO_FS) += intermezzo/ -obj-$(CONFIG_MINIX_FS) += minix/ -obj-$(CONFIG_FAT_FS) += fat/ -obj-$(CONFIG_UMSDOS_FS) += umsdos/ -obj-$(CONFIG_MSDOS_FS) += msdos/ -obj-$(CONFIG_VFAT_FS) += vfat/ -obj-$(CONFIG_BFS_FS) += bfs/ -obj-$(CONFIG_ISO9660_FS) += isofs/ -obj-$(CONFIG_DEVFS_FS) += devfs/ -obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ -obj-$(CONFIG_HFS_FS) += hfs/ -obj-$(CONFIG_VXFS_FS) += freevxfs/ -obj-$(CONFIG_NFS_FS) += nfs/ -obj-$(CONFIG_EXPORTFS) += exportfs/ -obj-$(CONFIG_NFSD) += nfsd/ -obj-$(CONFIG_LOCKD) += lockd/ -obj-$(CONFIG_NLS) += nls/ -obj-$(CONFIG_SYSV_FS) += sysv/ -obj-$(CONFIG_SMB_FS) += smbfs/ -obj-$(CONFIG_CIFS) += cifs/ -obj-$(CONFIG_NCP_FS) += ncpfs/ -obj-$(CONFIG_HPFS_FS) += hpfs/ -obj-$(CONFIG_NTFS_FS) += ntfs/ -obj-$(CONFIG_UFS_FS) += ufs/ -obj-$(CONFIG_EFS_FS) += efs/ -obj-$(CONFIG_JFFS_FS) += jffs/ -obj-$(CONFIG_JFFS2_FS) += jffs2/ -obj-$(CONFIG_AFFS_FS) += affs/ -obj-$(CONFIG_ROMFS_FS) += romfs/ -obj-$(CONFIG_QNX4FS_FS) += qnx4/ -obj-$(CONFIG_AUTOFS_FS) += autofs/ -obj-$(CONFIG_AUTOFS4_FS) += autofs4/ -obj-$(CONFIG_ADFS_FS) += adfs/ -obj-$(CONFIG_UDF_FS) += udf/ -obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ -obj-$(CONFIG_JFS_FS) += jfs/ -obj-$(CONFIG_XFS_FS) += xfs/ -obj-$(CONFIG_AFS_FS) += afs/ -obj-$(CONFIG_BEFS_FS) += befs/ diff --git a/include/asm-um/pgtable.h.orig b/include/asm-um/pgtable.h.orig deleted file mode 100644 index 148dd8e42..000000000 --- a/include/asm-um/pgtable.h.orig +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) - * Derived from include/asm-i386/pgtable.h - * Licensed under the GPL - */ - -#ifndef __UM_PGTABLE_H -#define __UM_PGTABLE_H - -#include "linux/sched.h" -#include "asm/processor.h" -#include "asm/page.h" -#include "asm/fixmap.h" - -extern pgd_t swapper_pg_dir[1024]; - -extern void *um_virt_to_phys(struct task_struct *task, unsigned long virt, - pte_t *pte_out); - -/* zero page used for uninitialized stuff */ -extern unsigned 
long *empty_zero_page; - -#define pgtable_cache_init() do ; while (0) - -/* PMD_SHIFT determines the size of the area a second-level page table can map */ -#define PMD_SHIFT 22 -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT 22 -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* - * entries per page directory level: the i386 is two-level, so - * we don't really have any PMD directory physically. - */ -#define PTRS_PER_PTE 1024 -#define PTRS_PER_PMD 1 -#define PTRS_PER_PGD 1024 -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) -#define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) - -/* - * pgd entries used up by user/kernel: - */ - -#define USER_PGD_PTRS (TASK_SIZE >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - -#ifndef __ASSEMBLY__ -/* Just any arbitrary offset to the start of the vmalloc VM area: the - * current 8MB value just means that there will be a 8MB "hole" after the - * physical memory until the kernel virtual memory starts. That means that - * any out-of-bounds memory accesses will hopefully be caught. - * The vmalloc() routines leaves a hole of 4kB between each vmalloced - * area for the same reason. ;) - */ - -extern unsigned long high_physmem; - -#define VMALLOC_OFFSET (__va_space) -#define VMALLOC_START (((unsigned long) high_physmem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) - -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif - -#define _PAGE_PRESENT 0x001 -#define _PAGE_NEWPAGE 0x002 -#define _PAGE_PROTNONE 0x004 /* If not present */ -#define _PAGE_RW 0x008 -#define _PAGE_USER 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_NEWPROT 0x080 - -#define REGION_MASK 0xf0000000 -#define REGION_SHIFT 28 - -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) - -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) -#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED) - -/* - * The i386 can't do page protection for execute, and considers that the same are read. - * Also, write permissions imply read permissions. This is the closest we can get.. 
- */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED - -/* - * Define this if things work differently on an i386 and an i486: - * it will (on an i486) warn about kernel memory accesses that are - * done without a 'verify_area(VERIFY_WRITE,..)' - */ -#undef TEST_VERIFY_AREA - -/* page table for 0-4MB for everybody */ -extern unsigned long pg0[1024]; - -/* - * BAD_PAGETABLE is used when we need a bogus page-table, while - * BAD_PAGE is used for a bogus page. - * - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern pte_t __bad_page(void); -extern pte_t * __bad_pagetable(void); - -#define BAD_PAGETABLE __bad_pagetable() -#define BAD_PAGE __bad_page() -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - -/* number of bits that fit into a memory pointer */ -#define BITS_PER_PTR (8*sizeof(unsigned long)) - -/* to align the pointer to a pointer address */ -#define PTR_MASK (~(sizeof(void*)-1)) - -/* sizeof(void*)==1<>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK) - -#define pte_none(x) !(pte_val(x) & ~_PAGE_NEWPAGE) -#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE)) - -#define pte_clear(xp) do { pte_val(*(xp)) = _PAGE_NEWPAGE; } while (0) - -#define phys_region_index(x) (((x) & REGION_MASK) >> REGION_SHIFT) -#define pte_region_index(x) phys_region_index(pte_val(x)) - -#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) -#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) - -#define pmd_newpage(x) (pmd_val(x) & _PAGE_NEWPAGE) -#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE) - -/* - * The "pgd_xxx()" functions here are trivial for a folded two-level - * setup: the pgd is never bad, and a pmd always exists (as it's folded - * into the pgd entry) - */ -static inline int pgd_none(pgd_t pgd) { return 0; } -static inline int pgd_bad(pgd_t pgd) { return 0; } -static inline int pgd_present(pgd_t pgd) { return 1; } -static inline void pgd_clear(pgd_t * pgdp) { } - - -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - -extern struct page *pte_mem_map(pte_t pte); -extern struct page *phys_mem_map(unsigned long phys); -extern unsigned long phys_to_pfn(unsigned long p); -extern unsigned long pfn_to_phys(unsigned long pfn); - -#define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_address(x) (__va(pte_val(x) & PAGE_MASK)) -#define mk_phys(a, r) ((a) + (r << REGION_SHIFT)) -#define phys_addr(p) ((p) & ~REGION_MASK) -#define phys_page(p) (phys_mem_map(p) + ((phys_addr(p)) >> PAGE_SHIFT)) -#define pte_pfn(x) phys_to_pfn(pte_val(x)) -#define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) - -static inline pte_t pte_mknewprot(pte_t pte) -{ - pte_val(pte) |= _PAGE_NEWPROT; - return(pte); -} - -static inline pte_t pte_mknewpage(pte_t pte) -{ - pte_val(pte) |= _PAGE_NEWPAGE; - return(pte); -} - -static inline void set_pte(pte_t *pteptr, pte_t pteval) -{ - /* If it's a swap entry, it needs 
to be marked _PAGE_NEWPAGE so - * fix_range knows to unmap it. _PAGE_NEWPROT is specific to - * mapped pages. - */ - *pteptr = pte_mknewpage(pteval); - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); -} - -/* - * (pmds are folded into pgds so this doesn't get actually called, - * but the define is needed for a generic inline function.) - */ -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) -#define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) - -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -static inline int pte_read(pte_t pte) -{ - return((pte_val(pte) & _PAGE_USER) && - !(pte_val(pte) & _PAGE_PROTNONE)); -} - -static inline int pte_exec(pte_t pte){ - return((pte_val(pte) & _PAGE_USER) && - !(pte_val(pte) & _PAGE_PROTNONE)); -} - -static inline int pte_write(pte_t pte) -{ - return((pte_val(pte) & _PAGE_RW) && - !(pte_val(pte) & _PAGE_PROTNONE)); -} - -static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; } -static inline int pte_newprot(pte_t pte) -{ - return(pte_present(pte) && (pte_val(pte) & _PAGE_NEWPROT)); -} - -static inline pte_t pte_rdprotect(pte_t pte) -{ - pte_val(pte) &= ~_PAGE_USER; - return(pte_mknewprot(pte)); -} - -static inline pte_t pte_exprotect(pte_t pte) -{ - pte_val(pte) &= ~_PAGE_USER; - return(pte_mknewprot(pte)); -} - -static inline pte_t pte_mkclean(pte_t pte) -{ - pte_val(pte) &= ~_PAGE_DIRTY; - return(pte); -} - -static inline pte_t pte_mkold(pte_t pte) -{ - pte_val(pte) &= ~_PAGE_ACCESSED; - return(pte); -} - -static inline pte_t pte_wrprotect(pte_t pte) -{ - pte_val(pte) &= ~_PAGE_RW; - return(pte_mknewprot(pte)); -} - -static inline pte_t pte_mkread(pte_t pte) -{ - pte_val(pte) |= _PAGE_USER; - return(pte_mknewprot(pte)); -} - -static inline pte_t pte_mkexec(pte_t pte) -{ - pte_val(pte) |= _PAGE_USER; - return(pte_mknewprot(pte)); -} - -static inline pte_t pte_mkdirty(pte_t pte) -{ - pte_val(pte) |= _PAGE_DIRTY; - return(pte); -} - -static inline pte_t pte_mkyoung(pte_t pte) -{ - pte_val(pte) |= _PAGE_ACCESSED; - return(pte); -} - -static inline pte_t pte_mkwrite(pte_t pte) -{ - pte_val(pte) |= _PAGE_RW; - return(pte_mknewprot(pte)); -} - -static inline pte_t pte_mkuptodate(pte_t pte) -{ - pte_val(pte) &= ~_PAGE_NEWPAGE; - if(pte_present(pte)) pte_val(pte) &= ~_PAGE_NEWPROT; - return(pte); -} - -extern unsigned long page_to_phys(struct page *page); - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ - -#define mk_pte(page, pgprot) \ -({ \ - pte_t __pte; \ - \ - pte_val(__pte) = page_to_phys(page) + pgprot_val(pgprot);\ - if(pte_present(__pte)) pte_mknewprot(pte_mknewpage(__pte)); \ - __pte; \ -}) - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot); - if(pte_present(pte)) pte = pte_mknewpage(pte_mknewprot(pte)); - return pte; -} - -#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) -#define pmd_page(pmd) (phys_mem_map(pmd_val(pmd) & PAGE_MASK) + \ - ((phys_addr(pmd_val(pmd)) >> PAGE_SHIFT))) - -/* to find an entry in a page-table-directory. 
*/ -#define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) - -/* to find an entry in a page-table-directory */ -#define pgd_offset(mm, address) \ -((mm)->pgd + ((address) >> PGDIR_SHIFT)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) - -#define pmd_index(address) \ - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) - -/* Find an entry in the second-level page table.. */ -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} - -/* Find an entry in the third-level page table.. */ -#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) \ - ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) -#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) - -#define update_mmu_cache(vma,address,pte) do ; while (0) - -/* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 3) & 0x7f) -#define __swp_offset(x) ((x).val >> 10) - -#define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 3) | ((offset) << 10) }) -#define __pte_to_swp_entry(pte) \ - ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -#define kern_addr_valid(addr) (1) - -#include - -#endif - -#endif -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/include/asm-um/unistd.h.orig b/include/asm-um/unistd.h.orig deleted file mode 100644 index 5850620bb..000000000 --- a/include/asm-um/unistd.h.orig +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) - * Licensed under the GPL - */ - -#ifndef _UM_UNISTD_H_ -#define _UM_UNISTD_H_ - -#include -#include "linux/resource.h" -#include "asm/uaccess.h" - -extern int um_execve(const char *file, char *const argv[], char *const env[]); - -#ifdef __KERNEL__ -#define __ARCH_WANT_IPC_PARSE_VERSION -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_STAT64 -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#endif - -#ifdef __KERNEL_SYSCALLS__ - -#include -#include - -#define KERNEL_CALL(ret_t, sys, args...) 
\ - mm_segment_t fs = get_fs(); \ - ret_t ret; \ - set_fs(KERNEL_DS); \ - ret = sys(args); \ - set_fs(fs); \ - return ret; - -static inline long open(const char *pathname, int flags, int mode) -{ - KERNEL_CALL(int, sys_open, pathname, flags, mode) -} - -static inline long dup(unsigned int fd) -{ - KERNEL_CALL(int, sys_dup, fd); -} - -static inline long close(unsigned int fd) -{ - KERNEL_CALL(int, sys_close, fd); -} - -static inline int execve(const char *filename, char *const argv[], - char *const envp[]) -{ - KERNEL_CALL(int, um_execve, filename, argv, envp); -} - -static inline long waitpid(pid_t pid, unsigned int *status, int options) -{ - KERNEL_CALL(pid_t, sys_wait4, pid, status, options, NULL) -} - -static inline pid_t setsid(void) -{ - KERNEL_CALL(pid_t, sys_setsid) -} - -static inline long lseek(unsigned int fd, off_t offset, unsigned int whence) -{ - KERNEL_CALL(long, sys_lseek, fd, offset, whence) -} - -static inline int read(unsigned int fd, char * buf, int len) -{ - KERNEL_CALL(int, sys_read, fd, buf, len) -} - -static inline int write(unsigned int fd, char * buf, int len) -{ - KERNEL_CALL(int, sys_write, fd, buf, len) -} - -long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -int sys_execve(char *file, char **argv, char **env); -long sys_clone(unsigned long clone_flags, unsigned long newsp, - int *parent_tid, int *child_tid); -long sys_fork(void); -long sys_vfork(void); -int sys_pipe(unsigned long *fildes); -int sys_ptrace(long request, long pid, long addr, long data); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif - -/* Save the value of __KERNEL_SYSCALLS__, undefine it, include the underlying - * arch's unistd.h for the system call numbers, and restore the old - * __KERNEL_SYSCALLS__. - */ - -#ifdef __KERNEL_SYSCALLS__ -#define __SAVE_KERNEL_SYSCALLS__ __KERNEL_SYSCALLS__ -#endif - -#undef __KERNEL_SYSCALLS__ -#include "asm/arch/unistd.h" - -#ifdef __KERNEL_SYSCALLS__ -#define __KERNEL_SYSCALLS__ __SAVE_KERNEL_SYSCALLS__ -#endif - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/include/linux/gfp.h.orig b/include/linux/gfp.h.orig deleted file mode 100644 index 8980d1fd7..000000000 --- a/include/linux/gfp.h.orig +++ /dev/null @@ -1,128 +0,0 @@ -#ifndef __LINUX_GFP_H -#define __LINUX_GFP_H - -#include -#include -#include -#include - -struct vm_area_struct; - -/* - * GFP bitmasks.. - */ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ -#define __GFP_DMA 0x01 -#define __GFP_HIGHMEM 0x02 - -/* - * Action modifiers - doesn't change the zoning - * - * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt - * _might_ fail. This depends upon the particular VM implementation. - * - * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. - * - * __GFP_NORETRY: The VM implementation must not retry indefinitely. - */ -#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20 /* Should access emergency pools? 
*/ -#define __GFP_IO 0x40 /* Can start physical IO? */ -#define __GFP_FS 0x80 /* Can call down to low-level FS? */ -#define __GFP_COLD 0x100 /* Cache-cold page required */ -#define __GFP_NOWARN 0x200 /* Suppress page allocation failure warning */ -#define __GFP_REPEAT 0x400 /* Retry the allocation. Might fail */ -#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ -#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ -#define __GFP_NO_GROW 0x2000 /* Slab internal usage */ -#define __GFP_COMP 0x4000 /* Add compound page metadata */ - -#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ -#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) - -/* if you forget to add the bitmask here kernel will crash, period */ -#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ - __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ - __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP) - -#define GFP_ATOMIC (__GFP_HIGH) -#define GFP_NOIO (__GFP_WAIT) -#define GFP_NOFS (__GFP_WAIT | __GFP_IO) -#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM) - -/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some - platforms, used as appropriate on others */ - -#define GFP_DMA __GFP_DMA - - -/* - * There is only one page-allocator function, and two main namespaces to - * it. The alloc_page*() variants return 'struct page *' and as such - * can allocate highmem pages, the *get*page*() variants return - * virtual kernel addresses to the allocated page(s). - */ - -/* - * We get the zone list from the current node and the gfp_mask. - * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. - * - * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets - * optimized to &contig_page_data at compile-time. 
- */ -extern struct page * -FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); - -static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, - unsigned int order) -{ - if (unlikely(order >= MAX_ORDER)) - return NULL; - - return __alloc_pages(gfp_mask, order, - NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); -} - -#ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); - -static inline struct page * -alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - if (unlikely(order >= MAX_ORDER)) - return NULL; - - return alloc_pages_current(gfp_mask, order); -} -extern struct page *alloc_page_vma(unsigned gfp_mask, - struct vm_area_struct *vma, unsigned long addr); -#else -#define alloc_pages(gfp_mask, order) \ - alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) -#endif -#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) - -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); - -#define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask),0) - -#define __get_dma_pages(gfp_mask, order) \ - __get_free_pages((gfp_mask) | GFP_DMA,(order)) - -extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); -extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); -extern void FASTCALL(free_hot_page(struct page *page)); -extern void FASTCALL(free_cold_page(struct page *page)); - -#define __free_page(page) __free_pages((page), 0) -#define free_page(addr) free_pages((addr),0) - -void page_alloc_init(void); - -#endif /* __LINUX_GFP_H */ diff --git a/include/linux/mm.h.orig b/include/linux/mm.h.orig deleted file mode 100644 index 8f8a8a3a3..000000000 --- a/include/linux/mm.h.orig +++ /dev/null @@ -1,729 +0,0 @@ -#ifndef _LINUX_MM_H -#define _LINUX_MM_H - -#include -#include - -#ifdef __KERNEL__ - -#include -#include -#include -#include -#include -#include -#include - -struct mempolicy; -struct anon_vma; - -#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; -#endif - -extern unsigned long num_physpages; -extern void * high_memory; -extern int page_cluster; - -#include -#include -#include -#include - -#ifndef MM_VM_SIZE -#define MM_VM_SIZE(mm) TASK_SIZE -#endif - -/* - * Linux kernel virtual memory manager primitives. - * The idea being to have a "virtual" mm in the same way - * we have a virtual fs - giving a cleaner interface to the - * mm details, and allowing different kinds of memory mappings - * (from shared memory to executable loading to arbitrary - * mmap() functions). - */ - -/* - * This struct defines a memory VMM memory area. There is one of these - * per VM-area/task. A VM area is any part of the process virtual memory - * space that has a special rule for the page-fault handlers (ie a shared - * library, the executable area etc). - */ -struct vm_area_struct { - struct mm_struct * vm_mm; /* The address space we belong to. */ - unsigned long vm_start; /* Our start address within vm_mm. */ - unsigned long vm_end; /* The first byte after our end address - within vm_mm. */ - - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next; - - pgprot_t vm_page_prot; /* Access permissions of this VMA. */ - unsigned long vm_flags; /* Flags, listed below. 
*/ - - struct rb_node vm_rb; - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap prio tree, or - * linkage to the list of like vmas hanging off its node, or - * linkage of vma in the address_space->i_mmap_nonlinear list. - */ - union { - struct { - struct list_head list; - void *parent; /* aligns with prio_tree_node parent */ - struct vm_area_struct *head; - } vm_set; - - struct prio_tree_node prio_tree_node; - } shared; - - /* - * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma - * list, after a COW of one of the file pages. A MAP_SHARED vma - * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack - * or brk vma (with NULL file) can only be in an anon_vma list. - */ - struct list_head anon_vma_node; /* Serialized by anon_vma->lock */ - struct anon_vma *anon_vma; /* Serialized by page_table_lock */ - - /* Function pointers to deal with this struct. */ - struct vm_operations_struct * vm_ops; - - /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE - units, *not* PAGE_CACHE_SIZE */ - struct file * vm_file; /* File we map to (can be NULL). */ - void * vm_private_data; /* was vm_pte (shared mem) */ - -#ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ -#endif -}; - -/* - * vm_flags.. - */ -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 - -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 - -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#define VM_GROWSUP 0x00000200 -#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ - -#define VM_EXECUTABLE 0x00001000 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ - -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#endif - -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#else -#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#endif - -#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) -#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK -#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) -#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) -#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) - -/* - * mapping from the currently active vm_flags protection bits (the - * low four bits) to a page protection mask.. 
- */ -extern pgprot_t protection_map[16]; - - -/* - * These are the virtual MM functions - opening of an area, closing and - * unmapping it (needed to keep files on disk up-to-date etc), pointer - * to the functions called when a no-page or a wp-page exception occurs. - */ -struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); - void (*close)(struct vm_area_struct * area); - struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); - int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); -#ifdef CONFIG_NUMA - int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); - struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); -#endif -}; - -struct mmu_gather; -struct inode; - -#ifdef ARCH_HAS_ATOMIC_UNSIGNED -typedef unsigned page_flags_t; -#else -typedef unsigned long page_flags_t; -#endif - -/* - * Each physical page in the system has a struct page associated with - * it to keep track of whatever it is we are using the page for at the - * moment. Note that we have no way to track which tasks are using - * a page. - */ -struct page { - page_flags_t flags; /* Atomic flags, some possibly - * updated asynchronously */ - atomic_t _count; /* Usage count, see below. */ - unsigned int mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches, - * protected by PG_maplock. - */ - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache - */ - struct address_space *mapping; /* If PG_anon clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, PG_anon is set, and - * it points to anon_vma object. - */ - pgoff_t index; /* Our offset within mapping. */ - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone->lru_lock ! - */ - /* - * On machines where all RAM is mapped into kernel address space, - * we can simply calculate the virtual address. On machines with - * highmem some memory is mapped into kernel virtual memory - * dynamically, so we need a place to store that address. - * Note that this field could be 16 bits on x86 ... ;) - * - * Architectures with slow multiplication can define - * WANT_PAGE_VIRTUAL in asm/page.h - */ -#if defined(WANT_PAGE_VIRTUAL) - void *virtual; /* Kernel virtual address (NULL if - not kmapped, ie. highmem) */ -#endif /* WANT_PAGE_VIRTUAL */ -}; - -/* - * FIXME: take this include out, include page-flags.h in - * files which need it (119 of them) - */ -#include - -/* - * Methods to modify the page usage count. - * - * What counts for a page usage: - * - cache mapping (page->mapping) - * - private data (page->private) - * - page mapped in a task's page tables, each mapping - * is counted separately - * - * Also, many kernel routines increase the page count before a critical - * routine so they can be sure the page doesn't go away from under them. - * - * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we - * can use atomic_add_negative(-1, page->_count) to detect when the page - * becomes free and so that we can also use atomic_inc_and_test to atomically - * detect when we just tried to grab a ref on a page which some other CPU has - * already deemed to be freeable. - * - * NO code should make assumptions about this internal detail! 
Use the provided - * macros which retain the old rules: page_count(page) == 0 is a free page. - */ - -/* - * Drop a ref, return true if the logical refcount fell to zero (the page has - * no users) - */ -#define put_page_testzero(p) \ - ({ \ - BUG_ON(page_count(p) == 0); \ - atomic_add_negative(-1, &(p)->_count); \ - }) - -/* - * Grab a ref, return true if the page previously had a logical refcount of - * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page - */ -#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) - -#define set_page_count(p,v) atomic_set(&(p)->_count, v - 1) -#define __put_page(p) atomic_dec(&(p)->_count) - -extern void FASTCALL(__page_cache_release(struct page *)); - -#ifdef CONFIG_HUGETLB_PAGE - -static inline int page_count(struct page *p) -{ - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; -} - -static inline void get_page(struct page *page) -{ - if (unlikely(PageCompound(page))) - page = (struct page *)page->private; - atomic_inc(&page->_count); -} - -void put_page(struct page *page); - -#else /* CONFIG_HUGETLB_PAGE */ - -#define page_count(p) (atomic_read(&(p)->_count) + 1) - -static inline void get_page(struct page *page) -{ - atomic_inc(&page->_count); -} - -static inline void put_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) - __page_cache_release(page); -} - -#endif /* CONFIG_HUGETLB_PAGE */ - -/* - * Multiple processes may "see" the same page. E.g. for untouched - * mappings of /dev/null, all processes see the same page full of - * zeroes, and text pages of executables and shared libraries have - * only one copy in memory, at most, normally. - * - * For the non-reserved pages, page_count(page) denotes a reference count. - * page_count() == 0 means the page is free. - * page_count() == 1 means the page is used for exactly one purpose - * (e.g. a private data page of one process). - * - * A page may be used for kmalloc() or anyone else who does a - * __get_free_page(). In this case the page_count() is at least 1, and - * all other fields are unused but should be 0 or NULL. The - * management of this page is the responsibility of the one who uses - * it. - * - * The other pages (we may call them "process pages") are completely - * managed by the Linux memory manager: I/O, buffers, swapping etc. - * The following discussion applies only to them. - * - * A page may belong to an inode's memory mapping. In this case, - * page->mapping is the pointer to the inode, and page->index is the - * file offset of the page, in units of PAGE_CACHE_SIZE. - * - * A page contains an opaque `private' member, which belongs to the - * page's address_space. Usually, this is the address of a circular - * list of the page's disk buffers. - * - * For pages belonging to inodes, the page_count() is the number of - * attaches, plus 1 if `private' contains something, plus one for - * the page cache itself. - * - * All pages belonging to an inode are in these doubly linked lists: - * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; - * using the page->list list_head. These fields are also used for - * freelist managemet (when page_count()==0). - * - * There is also a per-mapping radix tree mapping index to the page - * in memory if present. The tree is rooted at mapping->root. 
- * - * All process pages can do I/O: - * - inode pages may need to be read from disk, - * - inode pages which have been modified and are MAP_SHARED may need - * to be written to disk, - * - private pages which have been modified may need to be swapped out - * to swap space and (later) to be read back into memory. - */ - -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, - * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. - */ -#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -static inline unsigned long page_zonenum(struct page *page) -{ - return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); -} -static inline unsigned long page_to_nid(struct page *page) -{ - return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); -} - -struct zone; -extern struct zone *zone_table[]; - -static inline struct zone *page_zone(struct page *page) -{ - return zone_table[page->flags >> NODEZONE_SHIFT]; -} - -static inline void set_page_zone(struct page *page, unsigned long nodezone_num) -{ - page->flags &= ~(~0UL << NODEZONE_SHIFT); - page->flags |= nodezone_num << NODEZONE_SHIFT; -} - -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - -static inline void *lowmem_page_address(struct page *page) -{ - return __va(page_to_pfn(page) << PAGE_SHIFT); -} - -#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) -#define HASHED_PAGE_VIRTUAL -#endif - -#if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) -#define page_address_init() do { } while(0) -#endif - -#if defined(HASHED_PAGE_VIRTUAL) -void *page_address(struct page *page); -void set_page_address(struct page *page, void *virtual); -void page_address_init(void); -#endif - -#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) -#define page_address(page) lowmem_page_address(page) -#define set_page_address(page, address) do { } while(0) -#define page_address_init() do { } while(0) -#endif - -/* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space. - * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. - */ -extern struct address_space swapper_space; -static inline struct address_space *page_mapping(struct page *page) -{ - struct address_space *mapping = NULL; - - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else if (likely(!PageAnon(page))) - mapping = page->mapping; - return mapping; -} - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use ->private - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return page->private; - return page->index; -} - -/* - * Return true if this page is mapped into pagetables. 
- */ -static inline int page_mapped(struct page *page) -{ - return page->mapcount != 0; -} - -/* - * Error return values for the *_nopage functions - */ -#define NOPAGE_SIGBUS (NULL) -#define NOPAGE_OOM ((struct page *) (-1)) - -/* - * Different kinds of faults, as returned by handle_mm_fault(). - * Used to decide whether a process gets delivered SIGBUS or - * just gets major/minor fault counters bumped up. - */ -#define VM_FAULT_OOM (-1) -#define VM_FAULT_SIGBUS 0 -#define VM_FAULT_MINOR 1 -#define VM_FAULT_MAJOR 2 - -#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) - -extern void show_free_areas(void); - -struct page *shmem_nopage(struct vm_area_struct * vma, - unsigned long address, int *type); -int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); -struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr); -struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); -void shmem_lock(struct file * file, int lock); -int shmem_zero_setup(struct vm_area_struct *); - -/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ - struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ - int atomic; /* May not schedule() */ -}; - -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, - struct vm_area_struct *start_vma, unsigned long start_addr, - unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *); -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); -int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, - unsigned long size, pgprot_t prot); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); - -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - -extern int vmtruncate(struct inode * inode, loff_t offset); -extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); -extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); -extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); -extern int make_pages_present(unsigned long addr, unsigned long end); -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); - -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); - 
-int __set_page_dirty_buffers(struct page *page); -int __set_page_dirty_nobuffers(struct page *page); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); -int FASTCALL(set_page_dirty(struct page *page)); -int set_page_dirty_lock(struct page *page); -int clear_page_dirty_for_io(struct page *page); - -/* - * Prototype to add a shrinker callback for ageable caches. - * - * These functions are passed a count `nr_to_scan' and a gfpmask. They should - * scan `nr_to_scan' objects, attempting to free them. - * - * The callback must the number of objects which remain in the cache. - * - * The callback will be passes nr_to_scan == 0 when the VM is querying the - * cache size, so a fastpath for that case is appropriate. - */ -typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); - -/* - * Add an aging callback. The int is the number of 'seeks' it takes - * to recreate one of the objects that these functions age. - */ - -#define DEFAULT_SEEKS 2 -struct shrinker; -extern struct shrinker *set_shrinker(int, shrinker_t); -extern void remove_shrinker(struct shrinker *shrinker); - -/* - * On a two-level page table, this ends up being trivial. Thus the - * inlining and the symmetry break with pte_alloc_map() that does all - * of this out-of-line. - */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} - -extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, - unsigned long * zones_size, unsigned long zone_start_pfn, - unsigned long *zholes_size); -extern void memmap_init_zone(struct page *, unsigned long, int, - unsigned long, unsigned long); -extern void mem_init(void); -extern void show_mem(void); -extern void si_meminfo(struct sysinfo * val); -extern void si_meminfo_node(struct sysinfo *val, int nid); - -static inline void vma_prio_tree_init(struct vm_area_struct *vma) -{ - vma->shared.vm_set.list.next = NULL; - vma->shared.vm_set.list.prev = NULL; - vma->shared.vm_set.parent = NULL; - vma->shared.vm_set.head = NULL; -} - -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next( - struct vm_area_struct *, struct prio_tree_root *, - struct prio_tree_iter *, pgoff_t begin, pgoff_t end); - -/* mmap.c */ -extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); -extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff); -extern void exit_mmap(struct mm_struct *); - -extern unsigned long 
get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - -extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff); - -static inline unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret = -EINVAL; - if ((offset + PAGE_ALIGN(len)) < offset) - goto out; - if (!(offset & ~PAGE_MASK)) - ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -out: - return ret; -} - -extern int do_munmap(struct mm_struct *, unsigned long, size_t); - -extern unsigned long do_brk(unsigned long, unsigned long); - -/* filemap.c */ -extern unsigned long page_unuse(struct page *); -extern void truncate_inode_pages(struct address_space *, loff_t); - -/* generic vm_area_ops exported for stackable file systems */ -struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); - -/* mm/page-writeback.c */ -int write_one_page(struct page *page, int wait); - -/* readahead.c */ -#define VM_MAX_READAHEAD 128 /* kbytes */ -#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ - -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read); -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read); -void page_cache_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - unsigned long offset); -void handle_ra_miss(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset); -unsigned long max_sane_readahead(unsigned long nr); - -/* Do stack extension */ -extern int expand_stack(struct vm_area_struct * vma, unsigned long address); - -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, - struct vm_area_struct **pprev); - -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. 
*/ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) -{ - struct vm_area_struct * vma = find_vma(mm,start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} - -static inline unsigned long vma_pages(struct vm_area_struct *vma) -{ - return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; -} - -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern unsigned int nr_used_zone_pages(void); - -extern struct page * vmalloc_to_page(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, - unsigned long to, unsigned long size, pgprot_t prot); - -#ifndef CONFIG_DEBUG_PAGEALLOC -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ -} -#endif - -#ifndef CONFIG_ARCH_GATE_AREA -extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); -int in_gate_area(struct task_struct *task, unsigned long addr); -#endif - -#endif /* __KERNEL__ */ -#endif /* _LINUX_MM_H */ diff --git a/mm/Makefile.orig b/mm/Makefile.orig deleted file mode 100644 index d22feb38a..000000000 --- a/mm/Makefile.orig +++ /dev/null @@ -1,17 +0,0 @@ -# -# Makefile for the linux memory manager. -# - -mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - shmem.o vmalloc.o - -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - page_alloc.o page-writeback.o pdflush.o prio_tree.o \ - readahead.o slab.o swap.o truncate.o vmscan.o \ - $(mmu-y) - -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o -obj-$(CONFIG_HUGETLBFS) += hugetlb.o -obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/mmap.c.orig b/mm/mmap.c.orig deleted file mode 100644 index d6fd2fe13..000000000 --- a/mm/mmap.c.orig +++ /dev/null @@ -1,1805 +0,0 @@ -/* - * mm/mmap.c - * - * Written by obz. - * - * Address space accounting code - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - -/* description of effects of mapping type and prot in current implementation. - * this is due to the limited x86 page protection hardware. 
The expected - * behavior is in parens: - * - * map_type prot - * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC - * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (yes) yes w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (copy) copy w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - */ -pgprot_t protection_map[16] = { - __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, - __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 -}; - -int sysctl_overcommit_memory = 0; /* default is heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; -atomic_t vm_committed_space = ATOMIC_INIT(0); - -EXPORT_SYMBOL(sysctl_overcommit_memory); -EXPORT_SYMBOL(sysctl_overcommit_ratio); -EXPORT_SYMBOL(sysctl_max_map_count); -EXPORT_SYMBOL(vm_committed_space); - -/* - * Requires inode->i_mapping->i_mmap_lock - */ -static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct file *file, struct address_space *mapping) -{ - if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file->f_dentry->d_inode->i_writecount); - if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; - - flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); - else - vma_prio_tree_remove(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -/* - * Remove one vm structure and free it. - */ -static void remove_vm_struct(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - - if (file) { - struct address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); - __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); - } - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - if (file) - fput(file); - anon_vma_unlink(vma); - mpol_free(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); -} - -/* - * sys_brk() for the most part doesn't need the global kernel - * lock, except when an application is doing something nasty - * like trying to un-brk an area that has already been mapped - * to a regular file. in this case, the unmapping will need - * to invoke file system routines that need the global lock. - */ -asmlinkage unsigned long sys_brk(unsigned long brk) -{ - unsigned long rlim, retval; - unsigned long newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against rlimit.. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. 
*/ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); - return retval; -} - -#ifdef DEBUG_MM_RB -static int browse_rb(struct rb_root *root) -{ - int i = 0, j; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) - printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; - if (vma->vm_start < pend) - printk("vm_start %lx pend %lx\n", vma->vm_start, pend); - if (vma->vm_start > vma->vm_end) - printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); - i++; - pn = nd; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) { - j++; - } - if (i != j) - printk("backwards %d, forwards %d\n", j, i), i = 0; - return i; -} - -void validate_mm(struct mm_struct *mm) -{ - int bug = 0; - int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; - i++; - } - if (i != mm->map_count) - printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; - i = browse_rb(&mm->mm_rb); - if (i != mm->map_count) - printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; - if (bug) - BUG(); -} -#else -#define validate_mm(mm) do { } while (0) -#endif - -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) -{ - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; - - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; - vma = NULL; - - while (*__rb_link) { - struct vm_area_struct *vma_tmp; - - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - return vma; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } - - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return vma; -} - -static inline void -__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) -{ - if (prev) { - vma->vm_next = prev->vm_next; - prev->vm_next = vma; - } else { - mm->mmap = vma; - if (rb_parent) - vma->vm_next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - vma->vm_next = NULL; - } -} - -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) -{ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); -} - -static inline void __vma_link_file(struct vm_area_struct *vma) -{ - struct file * file; - - file = vma->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file->f_dentry->d_inode->i_writecount); - if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; - - flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_add_tail(&vma->shared.vm_set.list, - &mapping->i_mmap_nonlinear); - else - vma_prio_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - } -} - -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct 
rb_node **rb_link, - struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev, rb_parent); - __vma_link_rb(mm, vma, rb_link, rb_parent); - __anon_vma_link(vma); -} - -static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - struct address_space *mapping = NULL; - - if (vma->vm_file) - mapping = vma->vm_file->f_mapping; - - if (mapping) - spin_lock(&mapping->i_mmap_lock); - anon_vma_lock(vma); - - __vma_link(mm, vma, prev, rb_link, rb_parent); - __vma_link_file(vma); - - anon_vma_unlock(vma); - if (mapping) - spin_unlock(&mapping->i_mmap_lock); - - mark_mm_hugetlb(mm, vma); - mm->map_count++; - validate_mm(mm); -} - -/* - * Helper for vma_adjust in the split_vma insert case: - * insert vm structure into list and rbtree and anon_vma, - * but it has already been inserted into prio_tree earlier. - */ -static void -__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) -{ - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; - - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) - BUG(); - __vma_link(mm, vma, prev, rb_link, rb_parent); - mm->map_count++; -} - -static inline void -__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - prev->vm_next = vma->vm_next; - rb_erase(&vma->vm_rb, &mm->mm_rb); - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; -} - -/* - * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that - * is already present in an i_mmap tree without adjusting the tree. - * The following helper function should be used when such adjustments - * are necessary. The "insert" vma (if any) is to be inserted - * before we drop the necessary locks. - */ -void vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next; - struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; - struct file *file = vma->vm_file; - struct anon_vma *anon_vma = NULL; - long adjust_next = 0; - int remove_next = 0; - - if (next && !insert) { - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and - * perhaps the one after too (mprotect case 6). - */ -again: remove_next = 1 + (end > next->vm_end); - end = next->vm_end; - anon_vma = next->anon_vma; - } else if (end > next->vm_start) { - /* - * vma expands, overlapping part of the next: - * mprotect case 5 shifting the boundary up. - */ - adjust_next = (end - next->vm_start) >> PAGE_SHIFT; - anon_vma = next->anon_vma; - } else if (end < vma->vm_end) { - /* - * vma shrinks, and !insert tells it's not - * split_vma inserting another: so it must be - * mprotect case 4 shifting the boundary down. - */ - adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); - anon_vma = next->anon_vma; - } - } - - if (file) { - mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) - root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); - if (insert) { - /* - * Put into prio_tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. 
- */ - __vma_link_file(insert); - } - } - - /* - * When changing only vma->vm_end, we don't really need - * anon_vma lock: but is that case worth optimizing out? - */ - if (vma->anon_vma) - anon_vma = vma->anon_vma; - if (anon_vma) - spin_lock(&anon_vma->lock); - - if (root) { - flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); - if (adjust_next) - vma_prio_tree_remove(next, root); - } - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; - } - - if (root) { - if (adjust_next) { - vma_prio_tree_init(next); - vma_prio_tree_insert(next, root); - } - vma_prio_tree_init(vma); - vma_prio_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - */ - __vma_unlink(mm, next, vma); - if (file) - __remove_shared_vm_struct(next, file, mapping); - if (next->anon_vma) - __anon_vma_merge(vma, next); - } else if (insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - __insert_vm_struct(mm, insert); - } - - if (anon_vma) - spin_unlock(&anon_vma->lock); - if (mapping) - spin_unlock(&mapping->i_mmap_lock); - - if (remove_next) { - if (file) - fput(file); - mm->map_count--; - mpol_free(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); - /* - * In mprotect's case 6 (see comments on vma_merge), - * we must remove another next too. It would clutter - * up the code too much to do both in one go. - */ - if (remove_next == 2) { - next = vma->vm_next; - goto again; - } - } - - validate_mm(mm); -} - -/* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those. - */ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) - -static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) -{ - if (vma->vm_flags != vm_flags) - return 0; - if (vma->vm_file != file) - return 0; - if (vma->vm_ops && vma->vm_ops->close) - return 0; - return 1; -} - -static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2) -{ - return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * in front of (at a lower virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - * - * We don't check here for the merged mmap wrapping around the end of pagecache - * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which - * wrap, nor mmaps which cover the final page at index -1UL. - */ -static int -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) -{ - if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { - if (vma->vm_pgoff == vm_pgoff) - return 1; - } - return 0; -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * beyond (at a higher virtual address and file offset than) the vma. 
- * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - */ -static int -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) -{ - if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { - pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - if (vma->vm_pgoff + vm_pglen == vm_pgoff) - return 1; - } - return 0; -} - -/* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). - * - * In most cases - when called for mmap, brk or mremap - [addr,end) is - * certain not to be mapped by the time vma_merge is called; but when - * called for mprotect, it is certain to be already mapped (either at - * an offset within prev, or at the start of next), and the flags of - * this area are about to be changed to vm_flags - and the no-change - * case has already been eliminated. - * - * The following mprotect cases have to be considered, where AAAA is - * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: - * - * AAAA AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX - * cannot merge might become might become might become - * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or - * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPNNNNNNNN 8 - * AAAA - * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN - * might become case 1 below case 2 below case 3 below - * - * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: - * mprotect_fixup updates vm_flags & vm_page_prot on successful return. - */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) -{ - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. - */ - if (vm_flags & VM_SPECIAL) - return NULL; - - if (prev) - next = prev->vm_next; - else - next = mm->mmap; - area = next; - if (next && next->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; - - /* - * Can it merge with the predecessor? - */ - if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && - can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { - /* - * OK, it can. Can we now merge in the successor as well? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma)) { - /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); - } else /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); - return prev; - } - - /* - * Can this new request be merged in front of next? 
- */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { - if (prev && addr < prev->vm_end) /* case 4 */ - vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); - else /* cases 3, 8 */ - vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); - return area; - } - - return NULL; -} - -/* - * find_mergeable_anon_vma is used by anon_vma_prepare, to check - * neighbouring vmas for a suitable anon_vma, before it goes off - * to allocate a new anon_vma. It checks because a repetitive - * sequence of mprotects and faults may otherwise lead to distinct - * anon_vmas being allocated, preventing vma merge in subsequent - * mprotect. - */ -struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) -{ - struct vm_area_struct *near; - unsigned long vm_flags; - - near = vma->vm_next; - if (!near) - goto try_prev; - - /* - * Since only mprotect tries to remerge vmas, match flags - * which might be mprotected into each other later on. - * Neither mlock nor madvise tries to remerge at present, - * so leave their flags as obstructing a merge. - */ - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && vma->vm_end == near->vm_start && - mpol_equal(vma_policy(vma), vma_policy(near)) && - can_vma_merge_before(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) - return near->anon_vma; -try_prev: - /* - * It is potentially slow to have to call find_vma_prev here. - * But it's only on the first write fault on the vma, not - * every time, and we could devise a way to avoid it later - * (e.g. stash info in next's anon_vma_node when assigning - * an anon_vma, or when trying vma_merge). Another time. - */ - if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma) - BUG(); - if (!near) - goto none; - - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && near->vm_end == vma->vm_start && - mpol_equal(vma_policy(near), vma_policy(vma)) && - can_vma_merge_after(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff)) - return near->anon_vma; -none: - /* - * There's no absolute need to look only at touching neighbours: - * we could search further afield for "compatible" anon_vmas. - * But it would probably just be a waste of time searching, - * or lead to too many vmas hanging off the same anon_vma. - * We're trying to allow mprotect remerging later on, - * not trying to minimize memory used for anon_vmas. - */ - return NULL; -} - -/* - * The caller must hold down_write(current->mm->mmap_sem). - */ - -unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff) -{ - struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; - struct inode *inode; - unsigned int vm_flags; - int correct_wcount = 0; - int error; - struct rb_node ** rb_link, * rb_parent; - int accountable = 1; - unsigned long charged = 0; - - if (file) { - if (is_file_hugepages(file)) - accountable = 0; - - if (!file->f_op || !file->f_op->mmap) - return -ENODEV; - - if ((prot & PROT_EXEC) && - (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) - return -EPERM; - } - - if (!len) - return addr; - - /* Careful about overflows.. 
*/ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) - return -EINVAL; - - /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) - return -EINVAL; - - /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) - return addr; - - /* Do simple checking here so the lower-level routines won't have - * to. we assume access permissions have been handled by the open - * of the memory object, so we don't do any here. - */ - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - - if (flags & MAP_LOCKED) { - if (!capable(CAP_IPC_LOCK)) - return -EPERM; - vm_flags |= VM_LOCKED; - } - /* mlock MCL_FUTURE? */ - if (vm_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; - locked += len; - if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) - return -EAGAIN; - } - - inode = file ? file->f_dentry->d_inode : NULL; - - if (file) { - switch (flags & MAP_TYPE) { - case MAP_SHARED: - if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) - return -EACCES; - - /* - * Make sure we don't allow writing to an append-only - * file.. - */ - if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) - return -EACCES; - - /* - * Make sure there are no mandatory locks on the file. - */ - if (locks_verify_locked(inode)) - return -EAGAIN; - - vm_flags |= VM_SHARED | VM_MAYSHARE; - if (!(file->f_mode & FMODE_WRITE)) - vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - - /* fall through */ - case MAP_PRIVATE: - if (!(file->f_mode & FMODE_READ)) - return -EACCES; - break; - - default: - return -EINVAL; - } - } else { - switch (flags & MAP_TYPE) { - case MAP_SHARED: - vm_flags |= VM_SHARED | VM_MAYSHARE; - break; - case MAP_PRIVATE: - /* - * Set pgoff according to addr for anon_vma. - */ - pgoff = addr >> PAGE_SHIFT; - break; - default: - return -EINVAL; - } - } - - error = security_file_mmap(file, prot, flags); - if (error) - return error; - - /* Clear old maps */ - error = -ENOMEM; -munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { - if (do_munmap(mm, addr, len)) - return -ENOMEM; - goto munmap_back; - } - - /* Check against address space limit. */ - if ((mm->total_vm << PAGE_SHIFT) + len - > current->rlim[RLIMIT_AS].rlim_cur) - return -ENOMEM; - - if (accountable && (!(flags & MAP_NORESERVE) || - sysctl_overcommit_memory > 1)) { - if (vm_flags & VM_SHARED) { - /* Check memory availability in shmem_file_setup? */ - vm_flags |= VM_ACCOUNT; - } else if (vm_flags & VM_WRITE) { - /* - * Private writable mapping: check memory availability - */ - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - } - - /* - * Can we just expand an old private anonymous mapping? - * The VM_SHARED test is necessary because shmem_zero_setup - * will create the file object for a shared anonymous map below. - */ - if (!file && !(vm_flags & VM_SHARED) && - vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL)) - goto out; - - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. 
- */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - memset(vma, 0, sizeof(*vma)); - - vma->vm_mm = mm; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_flags = vm_flags; - vma->vm_page_prot = protection_map[vm_flags & 0x0f]; - vma->vm_pgoff = pgoff; - - if (file) { - error = -EINVAL; - if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) - goto free_vma; - if (vm_flags & VM_DENYWRITE) { - error = deny_write_access(file); - if (error) - goto free_vma; - correct_wcount = 1; - } - vma->vm_file = file; - get_file(file); - error = file->f_op->mmap(file, vma); - if (error) - goto unmap_and_free_vma; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; - } - - /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform - * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) - * that memory reservation must be checked; but that reservation - * belongs to shared memory object, not to vma: so now clear it. - */ - if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) - vma->vm_flags &= ~VM_ACCOUNT; - - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - addr = vma->vm_start; - - if (!file || !vma_merge(mm, prev, addr, vma->vm_end, - vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - vma_link(mm, vma, prev, rb_link, rb_parent); - if (correct_wcount) - atomic_inc(&inode->i_writecount); - } else { - if (file) { - if (correct_wcount) - atomic_inc(&inode->i_writecount); - fput(file); - } - mpol_free(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); - } -out: - mm->total_vm += len >> PAGE_SHIFT; - if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if (flags & MAP_POPULATE) { - up_write(&mm->mmap_sem); - sys_remap_file_pages(addr, len, 0, - pgoff, flags & MAP_NONBLOCK); - down_write(&mm->mmap_sem); - } - return addr; - -unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); - vma->vm_file = NULL; - fput(file); - - /* Undo any partial mapping done by a device driver. */ - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); -free_vma: - kmem_cache_free(vm_area_cachep, vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - return error; -} - -EXPORT_SYMBOL(do_mmap_pgoff); - -/* Get an address range which is currently unmapped. - * For shmat() with addr=0. - * - * Ugly calling convention alert: - * Return value with the low bits set means error value, - * ie - * if (ret & ~PAGE_MASK) - * error = ret; - * - * This function "knows" that -ENOMEM has the bits set. - */ -#ifndef HAVE_ARCH_UNMAPPED_AREA -static inline unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - - if (len > TASK_SIZE) - return -ENOMEM; - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) - return addr; - } - start_addr = addr = mm->free_area_cache; - -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. 
- */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - addr = vma->vm_end; - } -} -#else -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -#endif - -unsigned long -get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - if (flags & MAP_FIXED) { - unsigned long ret; - - if (addr > TASK_SIZE - len) - return -ENOMEM; - if (addr & ~PAGE_MASK) - return -EINVAL; - if (file && is_file_hugepages(file)) { - /* - * Check if the given range is hugepage aligned, and - * can be made suitable for hugepages. - */ - ret = prepare_hugepage_range(addr, len); - } else { - /* - * Ensure that a normal request is not falling in a - * reserved hugepage range. For some archs like IA-64, - * there is a separate region for hugepages. - */ - ret = is_hugepage_only_range(addr, len); - } - if (ret) - return -EINVAL; - return addr; - } - - if (file && file->f_op && file->f_op->get_unmapped_area) - return file->f_op->get_unmapped_area(file, addr, len, - pgoff, flags); - - return arch_get_unmapped_area(file, addr, len, pgoff, flags); -} - -EXPORT_SYMBOL(get_unmapped_area); - -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) -{ - struct vm_area_struct *vma = NULL; - - if (mm) { - /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node * rb_node; - - rb_node = mm->mm_rb.rb_node; - vma = NULL; - - while (rb_node) { - struct vm_area_struct * vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; - } - } - return vma; -} - -EXPORT_SYMBOL(find_vma); - -/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ -struct vm_area_struct * -find_vma_prev(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev) -{ - struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node * rb_node; - if (!mm) - goto out; - - /* Guard against addr being lower than the first VMA */ - vma = mm->mmap; - - /* Go through the RB tree quickly. */ - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *vma_tmp; - vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); - - if (addr < vma_tmp->vm_end) { - rb_node = rb_node->rb_left; - } else { - prev = vma_tmp; - if (!prev->vm_next || (addr < prev->vm_next->vm_end)) - break; - rb_node = rb_node->rb_right; - } - } - -out: - *pprev = prev; - return prev ? prev->vm_next : vma; -} - -#ifdef CONFIG_STACK_GROWSUP -/* - * vma is the first one with address > vma->vm_end. Have to extend vma. - */ -int expand_stack(struct vm_area_struct * vma, unsigned long address) -{ - unsigned long grow; - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. 
- */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - anon_vma_lock(vma); - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the - * anon_vma lock to serialize against concurrent expand_stacks. - */ - address += 4 + PAGE_SIZE - 1; - address &= PAGE_MASK; - grow = (address - vma->vm_end) >> PAGE_SHIFT; - - /* Overcommit.. */ - if (security_vm_enough_memory(grow)) { - anon_vma_unlock(vma); - return -ENOMEM; - } - - if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > - current->rlim[RLIMIT_AS].rlim_cur) { - anon_vma_unlock(vma); - vm_unacct_memory(grow); - return -ENOMEM; - } - vma->vm_end = address; - vma->vm_mm->total_vm += grow; - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - anon_vma_unlock(vma); - return 0; -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma, *prev; - - addr &= PAGE_MASK; - vma = find_vma_prev(mm, addr, &prev); - if (vma && (vma->vm_start <= addr)) - return vma; - if (!prev || expand_stack(prev, addr)) - return NULL; - if (prev->vm_flags & VM_LOCKED) { - make_pages_present(addr, prev->vm_end); - } - return prev; -} -#else -/* - * vma is the first one with address < vma->vm_start. Have to extend vma. - */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - unsigned long grow; - - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - anon_vma_lock(vma); - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the - * anon_vma lock to serialize against concurrent expand_stacks. - */ - address &= PAGE_MASK; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - - /* Overcommit.. */ - if (security_vm_enough_memory(grow)) { - anon_vma_unlock(vma); - return -ENOMEM; - } - - if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > - current->rlim[RLIMIT_AS].rlim_cur) { - anon_vma_unlock(vma); - vm_unacct_memory(grow); - return -ENOMEM; - } - vma->vm_start = address; - vma->vm_pgoff -= grow; - vma->vm_mm->total_vm += grow; - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - anon_vma_unlock(vma); - return 0; -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct * mm, unsigned long addr) -{ - struct vm_area_struct * vma; - unsigned long start; - - addr &= PAGE_MASK; - vma = find_vma(mm,addr); - if (!vma) - return NULL; - if (vma->vm_start <= addr) - return vma; - if (!(vma->vm_flags & VM_GROWSDOWN)) - return NULL; - start = vma->vm_start; - if (expand_stack(vma, addr)) - return NULL; - if (vma->vm_flags & VM_LOCKED) { - make_pages_present(addr, start); - } - return vma; -} -#endif - -/* - * Try to free as many page directory entries as we can, - * without having to work very hard at actually scanning - * the page tables themselves. - * - * Right now we try to free page tables if we have a nice - * PGDIR-aligned area that got free'd up. We could be more - * granular if we want to, but this is fast and simple, - * and covers the bad cases. - * - * "prev", if it exists, points to a vma before the one - * we just free'd - but there's no telling how much before. 
- */ -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, - unsigned long start, unsigned long end) -{ - unsigned long first = start & PGDIR_MASK; - unsigned long last = end + PGDIR_SIZE - 1; - unsigned long start_index, end_index; - struct mm_struct *mm = tlb->mm; - - if (!prev) { - prev = mm->mmap; - if (!prev) - goto no_mmaps; - if (prev->vm_end > start) { - if (last > prev->vm_start) - last = prev->vm_start; - goto no_mmaps; - } - } - for (;;) { - struct vm_area_struct *next = prev->vm_next; - - if (next) { - if (next->vm_start < start) { - prev = next; - continue; - } - if (last > next->vm_start) - last = next->vm_start; - } - if (prev->vm_end > first) - first = prev->vm_end + PGDIR_SIZE - 1; - break; - } -no_mmaps: - if (last < first) /* for arches with discontiguous pgd indices */ - return; - /* - * If the PGD bits are not consecutive in the virtual address, the - * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. - */ - start_index = pgd_index(first); - if (start_index < FIRST_USER_PGD_NR) - start_index = FIRST_USER_PGD_NR; - end_index = pgd_index(last); - if (end_index > start_index) { - clear_page_tables(tlb, start_index, end_index - start_index); - flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); - } -} - -/* Normal function to fix up a mapping - * This function is the default for when an area has no specific - * function. This may be used as part of a more specific routine. - * - * By the time this function is called, the area struct has been - * removed from the process mapping list. - */ -static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) -{ - size_t len = area->vm_end - area->vm_start; - - area->vm_mm->total_vm -= len >> PAGE_SHIFT; - if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - /* - * Is this a new hole at the lowest possible address? - */ - if (area->vm_start >= TASK_UNMAPPED_BASE && - area->vm_start < area->vm_mm->free_area_cache) - area->vm_mm->free_area_cache = area->vm_start; - - remove_vm_struct(area); -} - -/* - * Update the VMA and inode share lists. - * - * Ok - we have the memory areas we should free on the 'free' list, - * so release them, and do the vma updates. - */ -static void unmap_vma_list(struct mm_struct *mm, - struct vm_area_struct *mpnt) -{ - do { - struct vm_area_struct *next = mpnt->vm_next; - unmap_vma(mm, mpnt); - mpnt = next; - } while (mpnt != NULL); - validate_mm(mm); -} - -/* - * Get rid of page table information in the indicated region. - * - * Called with the page table lock held. - */ -static void unmap_region(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *prev, - unsigned long start, - unsigned long end) -{ - struct mmu_gather *tlb; - unsigned long nr_accounted = 0; - - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); - - if (is_hugepage_only_range(start, end - start)) - hugetlb_free_pgtables(tlb, prev, start, end); - else - free_pgtables(tlb, prev, start, end); - tlb_finish_mmu(tlb, start, end); -} - -/* - * Create a list of vma's touched by the unmap, removing them from the mm's - * vma list as we go.. - */ -static void -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? 
&prev->vm_next : &mm->mmap); - do { - rb_erase(&vma->vm_rb, &mm->mm_rb); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. */ -} - -/* - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the the tail. - */ -int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long addr, int new_below) -{ - struct mempolicy *pol; - struct vm_area_struct *new; - - if (mm->map_count >= sysctl_max_map_count) - return -ENOMEM; - - new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!new) - return -ENOMEM; - - /* most fields are the same, copy all, and then fixup */ - *new = *vma; - vma_prio_tree_init(new); - - if (new_below) - new->vm_end = addr; - else { - new->vm_start = addr; - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); - } - - pol = mpol_copy(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new); - return PTR_ERR(pol); - } - vma_set_policy(new, pol); - - if (new->vm_file) - get_file(new->vm_file); - - if (new->vm_ops && new->vm_ops->open) - new->vm_ops->open(new); - - if (new_below) - vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + - ((addr - new->vm_start) >> PAGE_SHIFT), new); - else - vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - - return 0; -} - -/* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. - * Jeremy Fitzhardinge - */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) -{ - unsigned long end; - struct vm_area_struct *mpnt, *prev, *last; - - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; - - if ((len = PAGE_ALIGN(len)) == 0) - return -EINVAL; - - /* Find the first overlapping VMA */ - mpnt = find_vma_prev(mm, start, &prev); - if (!mpnt) - return 0; - /* we have start < mpnt->vm_end */ - - if (is_vm_hugetlb_page(mpnt)) { - int ret = is_aligned_hugepage_range(start, len); - - if (ret) - return ret; - } - - /* if it doesn't overlap, we have nothing.. */ - end = start + len; - if (mpnt->vm_start >= end) - return 0; - - /* Something will probably happen, so notify. */ - if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) - profile_exec_unmap(mm); - - /* - * If we need to split any vma, do it now to save pain later. - * - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially - * unmapped vm_area_struct will remain in use: so lower split_vma - * places tmp vma above, and higher split_vma places tmp vma below. - */ - if (start > mpnt->vm_start) { - if (split_vma(mm, mpnt, start, 0)) - return -ENOMEM; - prev = mpnt; - } - - /* Does it split the last one? */ - last = find_vma(mm, end); - if (last && end > last->vm_start) { - if (split_vma(mm, last, end, 1)) - return -ENOMEM; - } - mpnt = prev? 
prev->vm_next: mm->mmap; - - /* - * Remove the vma's, and unmap the actual pages - */ - detach_vmas_to_be_unmapped(mm, mpnt, prev, end); - spin_lock(&mm->page_table_lock); - unmap_region(mm, mpnt, prev, start, end); - spin_unlock(&mm->page_table_lock); - - /* Fix up all other VM information */ - unmap_vma_list(mm, mpnt); - - return 0; -} - -EXPORT_SYMBOL(do_munmap); - -asmlinkage long sys_munmap(unsigned long addr, size_t len) -{ - int ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); - up_write(&mm->mmap_sem); - return ret; -} - -/* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. - */ -unsigned long do_brk(unsigned long addr, unsigned long len) -{ - struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; - unsigned long flags; - struct rb_node ** rb_link, * rb_parent; - pgoff_t pgoff = addr >> PAGE_SHIFT; - - len = PAGE_ALIGN(len); - if (!len) - return addr; - - if ((addr + len) > TASK_SIZE || (addr + len) < addr) - return -EINVAL; - - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; - locked += len; - if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) - return -EAGAIN; - } - - /* - * Clear old maps. this also does some error checking for us - */ - munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { - if (do_munmap(mm, addr, len)) - return -ENOMEM; - goto munmap_back; - } - - /* Check against address space limits *after* clearing old maps... */ - if ((mm->total_vm << PAGE_SHIFT) + len - > current->rlim[RLIMIT_AS].rlim_cur) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - if (security_vm_enough_memory(len >> PAGE_SHIFT)) - return -ENOMEM; - - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - - /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) - goto out; - - /* - * create a vma struct for an anonymous mapping - */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; - } - memset(vma, 0, sizeof(*vma)); - - vma->vm_mm = mm; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - vma->vm_flags = flags; - vma->vm_page_prot = protection_map[flags & 0x0f]; - vma_link(mm, vma, prev, rb_link, rb_parent); -out: - mm->total_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - return addr; -} - -EXPORT_SYMBOL(do_brk); - -/* Release all mmaps. 
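do_munmap() above handles a range that starts or ends inside a vma by calling split_vma() first, so that only whole vmas are detached and torn down by unmap_region(). The following simulation of that edge-splitting step is only an illustrative sketch over a plain interval list; the structures and values are invented:

/* Illustrative simulation of the split-then-detach step in do_munmap():
 * regions that straddle the unmap boundaries are split so that only
 * whole regions fall inside [ustart, uend).  All names are invented. */
#include <stdio.h>

struct region { unsigned long start, end; };

int main(void)
{
    struct region vmas[8] = { { 0x1000, 0x5000 }, { 0x6000, 0x9000 } };
    int n = 2;
    unsigned long ustart = 0x2000, uend = 0x7000;   /* munmap range */
    struct region out[8];
    int m = 0;

    for (int i = 0; i < n; i++) {
        struct region r = vmas[i];

        if (r.end <= ustart || r.start >= uend) {   /* no overlap: keep */
            out[m++] = r;
            continue;
        }
        if (r.start < ustart)                       /* split_vma() at ustart */
            out[m++] = (struct region){ r.start, ustart };
        if (r.end > uend)                           /* split_vma() at uend */
            out[m++] = (struct region){ uend, r.end };
        /* the middle piece is detached and unmapped */
    }
    for (int i = 0; i < m; i++)
        printf("kept [%#lx, %#lx)\n", out[i].start, out[i].end);
    return 0;
}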
*/ -void exit_mmap(struct mm_struct *mm) -{ - struct mmu_gather *tlb; - struct vm_area_struct *vma; - unsigned long nr_accounted = 0; - - profile_exit_mmap(mm); - - lru_add_drain(); - - spin_lock(&mm->page_table_lock); - - tlb = tlb_gather_mmu(mm, 1); - flush_cache_mm(mm); - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, - ~0UL, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); - BUG_ON(mm->map_count); /* This is just debugging */ - clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); - tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); - - vma = mm->mmap; - mm->mmap = mm->mmap_cache = NULL; - mm->mm_rb = RB_ROOT; - mm->rss = 0; - mm->total_vm = 0; - mm->locked_vm = 0; - - spin_unlock(&mm->page_table_lock); - - /* - * Walk the list again, actually closing and freeing it - * without holding any MM locks. - */ - while (vma) { - struct vm_area_struct *next = vma->vm_next; - remove_vm_struct(vma); - vma = next; - } -} - -/* Insert vm structure into process list sorted by address - * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. - */ -void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) -{ - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; - - /* - * The vm_pgoff of a purely anonymous vma should be irrelevant - * until its first write fault, when page's anon_vma and index - * are set. But now set the vm_pgoff it will almost certainly - * end up with (unless mremap moves it elsewhere before that - * first wfault), so /proc/pid/maps tells a consistent story. - * - * By setting it to reflect the virtual start address of the - * vma, merges and splits can happen in a seamless way, just - * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap_pgoff and in do_brk. - */ - if (!vma->vm_file) { - BUG_ON(vma->anon_vma); - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) - BUG(); - vma_link(mm, vma, prev, rb_link, rb_parent); -} - -/* - * Copy the vma structure to a new location in the same mm, - * prior to moving page table entries, to effect an mremap move. - */ -struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) -{ - struct vm_area_struct *vma = *vmap; - unsigned long vma_start = vma->vm_start; - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; - - /* - * If anonymous vma has not yet been faulted, update new pgoff - * to match new location, to increase its chance of merging. 
- */ - if (!vma->vm_file && !vma->anon_vma) - pgoff = addr >> PAGE_SHIFT; - - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); - if (new_vma) { - /* - * Source vma may have been merged into new_vma - */ - if (vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end) - *vmap = new_vma; - } else { - new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (new_vma) { - *new_vma = *vma; - vma_prio_tree_init(new_vma); - pol = mpol_copy(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new_vma); - return NULL; - } - vma_set_policy(new_vma, pol); - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - } - } - return new_vma; -} diff --git a/mm/mprotect.c.orig b/mm/mprotect.c.orig deleted file mode 100644 index 5b438e1a0..000000000 --- a/mm/mprotect.c.orig +++ /dev/null @@ -1,282 +0,0 @@ -/* - * mm/mprotect.c - * - * (C) Copyright 1994 Linus Torvalds - * (C) Copyright 2002 Christoph Hellwig - * - * Address space accounting code - * (C) Copyright 2002 Red Hat Inc, All Rights Reserved - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -static inline void -change_pte_range(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t newprot) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset_map(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - if (pte_present(*pte)) { - pte_t entry; - - /* Avoid an SMP race with hardware updated dirty/clean - * bits by wiping the pte and then setting the new pte - * into place. 
- */ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); - } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - pte_unmap(pte - 1); -} - -static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, - unsigned long size, pgprot_t newprot) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*pgd)) - return; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - change_pte_range(pmd, address, end - address, newprot); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); -} - -static void -change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot) -{ - pgd_t *dir; - unsigned long beg = start; - - dir = pgd_offset(current->mm, start); - flush_cache_range(vma, beg, end); - if (start >= end) - BUG(); - spin_lock(¤t->mm->page_table_lock); - do { - change_pmd_range(dir, start, end - start, newprot); - start = (start + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (start && (start < end)); - flush_tlb_range(vma, beg, end); - spin_unlock(¤t->mm->page_table_lock); - return; -} - -static int -mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned int newflags) -{ - struct mm_struct * mm = vma->vm_mm; - unsigned long charged = 0; - pgprot_t newprot; - pgoff_t pgoff; - int error; - - if (newflags == vma->vm_flags) { - *pprev = vma; - return 0; - } - - /* - * If we make a private mapping writable we increase our commit; - * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. - * - * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting - * a MAP_NORESERVE private mapping to writable will now reserve. - */ - if (newflags & VM_WRITE) { - if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { - charged = (end - start) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - return -ENOMEM; - newflags |= VM_ACCOUNT; - } - } - - newprot = protection_map[newflags & 0xf]; - - /* - * First try to merge with previous and/or next vma. - */ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); - if (*pprev) { - vma = *pprev; - goto success; - } - - if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) - goto fail; - } - /* - * Unless it returns an error, this function always sets *pprev to - * the first vma for which vma->vm_end >= end. - */ - *pprev = vma; - - if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) - goto fail; - } - -success: - /* - * vm_flags and vm_page_prot are protected by the mmap_sem - * held in write mode. 
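mprotect_fixup() above (like do_brk() earlier) derives the page protections with protection_map[newflags & 0xf]; that is, the low four vm_flags bits, VM_READ, VM_WRITE, VM_EXEC and VM_SHARED, index a 16-entry table. A small sketch of that indexing; the string table below only labels each flag combination and stands in for the real, architecture-specific pgprot entries:

/* Sketch of how the low four vm_flags bits index protection_map[]. */
#include <stdio.h>

#define VM_READ   0x1
#define VM_WRITE  0x2
#define VM_EXEC   0x4
#define VM_SHARED 0x8

static const char *protection_map_label[16] = {
    "---p", "r--p", "-w-p", "rw-p", "--xp", "r-xp", "-wxp", "rwxp",
    "---s", "r--s", "-w-s", "rw-s", "--xs", "r-xs", "-wxs", "rwxs",
};

int main(void)
{
    unsigned int newflags = VM_READ | VM_WRITE;          /* private rw */
    printf("index %u -> %s\n", newflags & 0xf,
           protection_map_label[newflags & 0xf]);

    newflags = VM_READ | VM_EXEC | VM_SHARED;            /* shared r-x */
    printf("index %u -> %s\n", newflags & 0xf,
           protection_map_label[newflags & 0xf]);
    return 0;
}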
- */ - vma->vm_flags = newflags; - vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); - return 0; - -fail: - vm_unacct_memory(charged); - return error; -} - -asmlinkage long -sys_mprotect(unsigned long start, size_t len, unsigned long prot) -{ - unsigned long vm_flags, nstart, end, tmp; - struct vm_area_struct *vma, *prev; - int error = -EINVAL; - const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); - prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); - if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ - return -EINVAL; - - if (start & ~PAGE_MASK) - return -EINVAL; - len = PAGE_ALIGN(len); - end = start + len; - if (end < start) - return -ENOMEM; - if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) - return -EINVAL; - if (end == start) - return 0; - - vm_flags = calc_vm_prot_bits(prot); - - down_write(¤t->mm->mmap_sem); - - vma = find_vma_prev(current->mm, start, &prev); - error = -ENOMEM; - if (!vma) - goto out; - if (unlikely(grows & PROT_GROWSDOWN)) { - if (vma->vm_start >= end) - goto out; - start = vma->vm_start; - error = -EINVAL; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - } - else { - if (vma->vm_start > start) - goto out; - if (unlikely(grows & PROT_GROWSUP)) { - end = vma->vm_end; - error = -EINVAL; - if (!(vma->vm_flags & VM_GROWSUP)) - goto out; - } - } - if (start > vma->vm_start) - prev = vma; - - for (nstart = start ; ; ) { - unsigned int newflags; - - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); - - if ((newflags & ~(newflags >> 4)) & 0xf) { - error = -EACCES; - goto out; - } - - error = security_file_mprotect(vma, prot); - if (error) - goto out; - - tmp = vma->vm_end; - if (tmp > end) - tmp = end; - error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); - if (error) - goto out; - nstart = tmp; - - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - goto out; - - vma = prev->vm_next; - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - goto out; - } - } -out: - up_write(¤t->mm->mmap_sem); - return error; -} diff --git a/mm/page_alloc.c.orig b/mm/page_alloc.c.orig deleted file mode 100644 index 3dbbeb2f3..000000000 --- a/mm/page_alloc.c.orig +++ /dev/null @@ -1,2013 +0,0 @@ -/* - * linux/mm/page_alloc.c - * - * Manages the free list, the system allocates free pages here. - * Note that kmalloc() lives in slab.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * Swap reorganised 29.12.95, Stephen Tweedie - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 - * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 - * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 - * Per cpu hot/cold page lists, bulk allocation, Martin J. 
Bligh, Sept 2002 - * (lots of bits borrowed from Ingo Molnar & Andrew Morton) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -DECLARE_BITMAP(node_online_map, MAX_NUMNODES); -struct pglist_data *pgdat_list; -unsigned long totalram_pages; -unsigned long totalhigh_pages; -int nr_swap_pages; -int numnodes = 1; -int sysctl_lower_zone_protection = 0; - -EXPORT_SYMBOL(totalram_pages); -EXPORT_SYMBOL(nr_swap_pages); - -/* - * Used by page_zone() to look up the address of the struct zone whose - * id is encoded in the upper bits of page->flags - */ -struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; -EXPORT_SYMBOL(zone_table); - -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -int min_free_kbytes = 1024; - -/* - * Temporary debugging check for pages not lying within a given zone. - */ -static int bad_range(struct zone *zone, struct page *page) -{ - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) - return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; - if (zone != page_zone(page)) - return 1; - return 0; -} - -static void bad_page(const char *function, struct page *page) -{ - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n", - (unsigned long)page->flags, page->mapping, - (int)page->mapcount, page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); - dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); - page->flags &= ~(1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_maplock | - 1 << PG_anon | - 1 << PG_swapcache | - 1 << PG_writeback); - set_page_count(page, 0); - page->mapping = NULL; - page->mapcount = 0; -} - -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else -/* - * Higher-order pages are called "compound pages". They are structured thusly: - * - * The first PAGE_SIZE page is called the "head page". - * - * The remaining PAGE_SIZE pages are called "tail pages". - * - * All pages have PG_compound set. All pages have their ->private pointing at - * the head page (even the head page has this). - * - * The first tail page's ->mapping, if non-zero, holds the address of the - * compound page's put_page() function. - * - * The order of the allocation is stored in the first tail page's ->index - * This is only for debug at present. This usage means that zero-order pages - * may not be compound. 
- */ -static void prep_compound_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - - page[1].mapping = 0; - page[1].index = order; - for (i = 0; i < nr_pages; i++) { - struct page *p = page + i; - - SetPageCompound(p); - p->private = (unsigned long)page; - } -} - -static void destroy_compound_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); - - for (i = 0; i < nr_pages; i++) { - struct page *p = page + i; - - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (p->private != (unsigned long)page) - bad_page(__FUNCTION__, page); - ClearPageCompound(p); - } -} -#endif /* CONFIG_HUGETLB_PAGE */ - -/* - * Freeing function for a buddy system allocator. - * - * The concept of a buddy system is to maintain direct-mapped table - * (containing bit values) for memory blocks of various "orders". - * The bottom level table contains the map for the smallest allocatable - * units of memory (here, pages), and each level above it describes - * pairs of units from the levels below, hence, "buddies". - * At a high level, all that happens here is marking the table entry - * at the bottom level available, and propagating the changes upward - * as necessary, plus some accounting needed to play nicely with other - * parts of the VM system. - * At each level, we keep one bit for each pair of blocks, which - * is set to 1 iff only one of the pair is allocated. So when we - * are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. - * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. - * - * -- wli - */ - -static inline void __free_pages_bulk (struct page *page, struct page *base, - struct zone *zone, struct free_area *area, unsigned long mask, - unsigned int order) -{ - unsigned long page_idx, index; - - if (order) - destroy_compound_page(page, order); - page_idx = page - base; - if (page_idx & ~mask) - BUG(); - index = page_idx >> (1 + order); - - zone->free_pages -= mask; - while (mask + (1 << (MAX_ORDER-1))) { - struct page *buddy1, *buddy2; - - BUG_ON(area >= zone->free_area + MAX_ORDER); - if (!__test_and_change_bit(index, area->map)) - /* - * the buddy page is still allocated. - */ - break; - /* - * Move the buddy up one level. - * This code is taking advantage of the identity: - * -mask = 1+~mask - */ - buddy1 = base + (page_idx ^ -mask); - buddy2 = base + page_idx; - BUG_ON(bad_range(zone, buddy1)); - BUG_ON(bad_range(zone, buddy2)); - list_del(&buddy1->lru); - mask <<= 1; - area++; - index >>= 1; - page_idx &= mask; - } - list_add(&(base + page_idx)->lru, &area->free_list); -} - -static inline void free_pages_check(const char *function, struct page *page) -{ - if ( page_mapped(page) || - page->mapping != NULL || - page_count(page) != 0 || - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_maplock | - 1 << PG_anon | - 1 << PG_swapcache | - 1 << PG_writeback ))) - bad_page(function, page); - if (PageDirty(page)) - ClearPageDirty(page); -} - -/* - * Frees a list of pages. - * Assumes all pages on list are in same zone, and of same order. - * count is the number of pages to free, or 0 for all on the list. 
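The __free_pages_bulk() comment above describes freeing in the buddy allocator: toggle the pair bit for the freed block and, while the buddy turns out to be free as well, merge and move up one order, locating the buddy with page_idx ^ -mask (using the identity -mask = 1 + ~mask, so -mask == 1UL << order when the caller passes mask = ~0UL << order). A standalone sketch of just that index arithmetic, with an invented page index and no real free lists or bitmaps:

/* Sketch of the buddy index identities used by __free_pages_bulk(). */
#include <stdio.h>

int main(void)
{
    for (unsigned int order = 0; order < 4; order++) {
        unsigned long mask = (~0UL) << order;
        unsigned long page_idx = 24;       /* stays aligned for orders 0..3 */

        unsigned long buddy_idx = page_idx ^ -mask;   /* -mask == 1UL << order */
        unsigned long pair_bit  = page_idx >> (1 + order);

        printf("order %u: block %lu, buddy %lu, pair bit %lu\n",
               order, page_idx, buddy_idx, pair_bit);
    }
    return 0;
}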
- * - * If the zone was previously in an "all pages pinned" state then look to - * see if this freeing clears that state. - * - * And clear the zone's pages_scanned counter, to hold off the "all pages are - * pinned" detection logic. - */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) -{ - unsigned long mask, flags; - struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; - - mask = (~0UL) << order; - base = zone->zone_mem_map; - area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); - zone->all_unreclaimable = 0; - zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->lru); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; - } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; -} - -void __free_pages_ok(struct page *page, unsigned int order) -{ - LIST_HEAD(list); - int i; - - mod_page_state(pgfree, 1 << order); - for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); - list_add(&page->lru, &list); - kernel_map_pages(page, 1<> (1+(order)), (area)->map) - -static inline struct page * -expand(struct zone *zone, struct page *page, - unsigned long index, int low, int high, struct free_area *area) -{ - unsigned long size = 1 << high; - - while (high > low) { - BUG_ON(bad_range(zone, page)); - area--; - high--; - size >>= 1; - list_add(&page->lru, &area->free_list); - MARK_USED(index, high, area); - index += size; - page += size; - } - return page; -} - -static inline void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page+i, 1); -#endif /* CONFIG_MMU */ -} - -/* - * This page is about to be returned from the page allocator - */ -static void prep_new_page(struct page *page, int order) -{ - if (page->mapping || page_mapped(page) || - (page->flags & ( - 1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_maplock | - 1 << PG_anon | - 1 << PG_swapcache | - 1 << PG_writeback ))) - bad_page(__FUNCTION__, page); - - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; - set_page_refs(page, order); -} - -/* - * Do the hard work of removing an element from the buddy allocator. - * Call me with the zone->lock already held. 
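expand() above hands part of a larger free block to the caller and relinks the remainder on lower-order free lists one level at a time, recording each split via MARK_USED (a pair-bit index of block_index >> (1 + order)). A sketch of which fragments get relinked when an order-4 block satisfies an order-1 request; the page indices are invented and no real lists or bitmaps are kept:

/* Sketch of the block splitting done by expand(): allocate order `low`
 * out of a free block of order `high`; each iteration puts the lower
 * half of what remains back on the next-lower free list. */
#include <stdio.h>

int main(void)
{
    unsigned int low = 1, high = 4;        /* want 2 pages from a 16-page block */
    unsigned long index = 32;              /* start of the order-4 block */
    unsigned long size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;
        printf("put pages [%lu, %lu) back on the order-%u free list "
               "(pair bit %lu)\n",
               index, index + size, high, index >> (1 + high));
        index += size;                     /* allocation continues from the upper half */
    }
    printf("return pages [%lu, %lu) to the caller (order %u)\n",
           index, index + (1UL << low), low);
    return 0;
}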
- */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) -{ - struct free_area * area; - unsigned int current_order; - struct page *page; - unsigned int index; - - for (current_order = order; current_order < MAX_ORDER; ++current_order) { - area = zone->free_area + current_order; - if (list_empty(&area->free_list)) - continue; - - page = list_entry(area->free_list.next, struct page, lru); - list_del(&page->lru); - index = page - zone->zone_mem_map; - if (current_order != MAX_ORDER-1) - MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; - return expand(zone, page, index, order, current_order, area); - } - - return NULL; -} - -/* - * Obtain a specified number of elements from the buddy allocator, all under - * a single hold of the lock, for efficiency. Add them to the supplied list. - * Returns the number of new pages which were placed at *list. - */ -static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) -{ - unsigned long flags; - int i; - int allocated = 0; - struct page *page; - - spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) - break; - allocated++; - list_add_tail(&page->lru, list); - } - spin_unlock_irqrestore(&zone->lock, flags); - return allocated; -} - -#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) -static void __drain_pages(unsigned int cpu) -{ - struct zone *zone; - int i; - - for_each_zone(zone) { - struct per_cpu_pageset *pset; - - pset = &zone->pageset[cpu]; - for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { - struct per_cpu_pages *pcp; - - pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); - } - } -} -#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_PM -int is_head_of_free_region(struct page *page) -{ - struct zone *zone = page_zone(page); - unsigned long flags; - int order; - struct list_head *curr; - - /* - * Should not matter as we need quiescent system for - * suspend anyway, but... - */ - spin_lock_irqsave(&zone->lock, flags); - for (order = MAX_ORDER - 1; order >= 0; --order) - list_for_each(curr, &zone->free_area[order].free_list) - if (page == list_entry(curr, struct page, lru)) { - spin_unlock_irqrestore(&zone->lock, flags); - return 1 << order; - } - spin_unlock_irqrestore(&zone->lock, flags); - return 0; -} - -/* - * Spill all of this CPU's per-cpu pages back into the buddy allocator. 
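rmqueue_bulk() above pulls a batch of pages from the buddy lists under a single lock hold, and __drain_pages() pushes a CPU's cached pages back again through free_pages_bulk(). An illustrative user-space model of that bulk refill/drain pattern; the batch size and page counts are invented:

/* Illustrative model of the bulk refill/drain pattern used by
 * rmqueue_bulk() and __drain_pages() above. */
#include <stdio.h>

#define BATCH 16

static int buddy_free = 1024;     /* pages on the buddy free lists */
static int pcp_count;             /* pages cached by this CPU */

static void refill(void)          /* rmqueue_bulk(): one lock hold, BATCH pages */
{
    int got = buddy_free < BATCH ? buddy_free : BATCH;

    buddy_free -= got;
    pcp_count += got;
}

static void drain(void)           /* __drain_pages(): give everything back */
{
    buddy_free += pcp_count;
    pcp_count = 0;
}

int main(void)
{
    refill();
    printf("after refill: pcp=%d buddy=%d\n", pcp_count, buddy_free);
    drain();
    printf("after drain:  pcp=%d buddy=%d\n", pcp_count, buddy_free);
    return 0;
}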
- */ -void drain_local_pages(void) -{ - unsigned long flags; - - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); -} -#endif /* CONFIG_PM */ - -static void zone_statistics(struct zonelist *zonelist, struct zone *z) -{ -#ifdef CONFIG_NUMA - unsigned long flags; - int cpu; - pg_data_t *pg = z->zone_pgdat; - pg_data_t *orig = zonelist->zones[0]->zone_pgdat; - struct per_cpu_pageset *p; - - local_irq_save(flags); - cpu = smp_processor_id(); - p = &z->pageset[cpu]; - if (pg == orig) { - z->pageset[cpu].numa_hit++; - } else { - p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; - } - if (pg == NODE_DATA(numa_node_id())) - p->local_node++; - else - p->other_node++; - local_irq_restore(flags); -#endif -} - -/* - * Free a 0-order page - */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); -static void fastcall free_hot_cold_page(struct page *page, int cold) -{ - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; - unsigned long flags; - - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); - free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; - local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->lru, &pcp->list); - pcp->count++; - local_irq_restore(flags); - put_cpu(); -} - -void fastcall free_hot_page(struct page *page) -{ - free_hot_cold_page(page, 0); -} - -void fastcall free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); -} - -/* - * Really, prep_compound_page() should be called from __rmqueue_bulk(). But - * we cheat by calling it from here, in the order > 0 path. Saves a branch - * or two. - */ - -static struct page * -buffered_rmqueue(struct zone *zone, int order, int gfp_flags) -{ - unsigned long flags; - struct page *page = NULL; - int cold = !!(gfp_flags & __GFP_COLD); - - if (order == 0) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[get_cpu()].pcp[cold]; - local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); - if (pcp->count) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; - } - local_irq_restore(flags); - put_cpu(); - } - - if (page == NULL) { - spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); - } - - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } - return page; -} - -/* - * This is the 'heart' of the zoned buddy allocator. - * - * Herein lies the mysterious "incremental min". That's the - * - * local_low = z->pages_low; - * min += local_low; - * - * thing. The intent here is to provide additional protection to low zones for - * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM - * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL - * request. This preserves additional space in those lower zones for requests - * which really do need memory from those zones. It means that on a decent - * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA - * zone untouched. 
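The comment above explains the "incremental min": each zone carries a per-allocation-type protection value so that a request which could have been satisfied from a higher zone cannot drain a lower one, and a zone is usable when free_pages >= (1 << order) + protection[alloc_type]. A sketch of that first-pass check over an invented three-zone zonelist; the kernel takes the first zone that passes, and the rt-task and __GFP_HIGH adjustments are omitted:

/* Sketch of the first-pass watermark check in __alloc_pages():
 * usable when free_pages >= (1 << order) + protection[alloc_type].
 * Zone names, sizes and protection values are invented. */
#include <stdio.h>

struct zone {
    const char   *name;
    unsigned long free_pages;
    unsigned long protection[3];   /* indexed by allocation type (DMA/Normal/High) */
};

int main(void)
{
    struct zone zones[] = {
        { "HighMem",   2, { 0,   0,   0 } },
        { "Normal",  300, { 0,   0, 256 } },
        { "DMA",     400, { 0, 128, 512 } },
    };
    unsigned int order = 2;
    int alloc_type = 2;            /* a GFP_HIGHMEM-style request */

    for (int i = 0; i < 3; i++) {
        unsigned long min = (1UL << order) + zones[i].protection[alloc_type];

        printf("%-7s free=%4lu min=%4lu -> %s\n",
               zones[i].name, zones[i].free_pages, min,
               zones[i].free_pages >= min ? "allocate here" : "skip");
    }
    return 0;
}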
- */ -struct page * fastcall -__alloc_pages(unsigned int gfp_mask, unsigned int order, - struct zonelist *zonelist) -{ - const int wait = gfp_mask & __GFP_WAIT; - unsigned long min; - struct zone **zones; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int i; - int alloc_type; - int do_retry; - - might_sleep_if(wait); - - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ - if (zones[0] == NULL) /* no zones in the zonelist */ - return NULL; - - alloc_type = zone_idx(zones[0]); - - /* Go through the zonelist once, looking for a zone with enough free */ - for (i = 0; zones[i] != NULL; i++) { - struct zone *z = zones[i]; - - min = (1<protection[alloc_type]; - - /* - * We let real-time tasks dip their real-time paws a little - * deeper into reserves. - */ - if (rt_task(p)) - min -= z->pages_low >> 1; - - if (z->free_pages >= min || - (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, gfp_mask); - if (page) { - zone_statistics(zonelist, z); - goto got_pg; - } - } - } - - /* we're somewhat low on memory, failed to find what we needed */ - for (i = 0; zones[i] != NULL; i++) - wakeup_kswapd(zones[i]); - - /* Go through the zonelist again, taking __GFP_HIGH into account */ - for (i = 0; zones[i] != NULL; i++) { - struct zone *z = zones[i]; - - min = (1<protection[alloc_type]; - - if (gfp_mask & __GFP_HIGH) - min -= z->pages_low >> 2; - if (rt_task(p)) - min -= z->pages_low >> 1; - - if (z->free_pages >= min || - (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, gfp_mask); - if (page) { - zone_statistics(zonelist, z); - goto got_pg; - } - } - } - - /* here we're in the low on memory slow path */ - -rebalance: - if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) { - /* go through the zonelist yet again, ignoring mins */ - for (i = 0; zones[i] != NULL; i++) { - struct zone *z = zones[i]; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) { - zone_statistics(zonelist, z); - goto got_pg; - } - } - goto nopage; - } - - /* Atomic allocations - we can't balance anything */ - if (!wait) - goto nopage; - - p->flags |= PF_MEMALLOC; - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - - try_to_free_pages(zones, gfp_mask, order); - - p->reclaim_state = NULL; - p->flags &= ~PF_MEMALLOC; - - /* go through the zonelist yet one more time */ - for (i = 0; zones[i] != NULL; i++) { - struct zone *z = zones[i]; - - min = (1UL << order) + z->protection[alloc_type]; - - if (z->free_pages >= min || - (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, gfp_mask); - if (page) { - zone_statistics(zonelist, z); - goto got_pg; - } - } - } - - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that - * may not be true in other implementations. - */ - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) - do_retry = 1; - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { - blk_congestion_wait(WRITE, HZ/50); - goto rebalance; - } - -nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." 
- " order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); - dump_stack(); - } - return NULL; -got_pg: - kernel_map_pages(page, 1 << order, 1); - return page; -} - -EXPORT_SYMBOL(__alloc_pages); - -#ifdef CONFIG_NUMA -/* Early boot: Everything is done by one cpu, but the data structures will be - * used by all cpus - spread them on all nodes. - */ -static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order) -{ -static int nodenr; - int i = nodenr; - struct page *page; - - for (;;) { - if (i > nodenr + numnodes) - return 0; - if (node_present_pages(i%numnodes)) { - struct zone **z; - /* The node contains memory. Check that there is - * memory in the intended zonelist. - */ - z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones; - while (*z) { - if ( (*z)->free_pages > (1UL<= 0) - free_hot_cold_page(pvec->pages[i], pvec->cold); -} - -fastcall void __free_pages(struct page *page, unsigned int order) -{ - if (!PageReserved(page) && put_page_testzero(page)) { - if (order == 0) - free_hot_page(page); - else - __free_pages_ok(page, order); - } -} - -EXPORT_SYMBOL(__free_pages); - -fastcall void free_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - BUG_ON(!virt_addr_valid(addr)); - __free_pages(virt_to_page(addr), order); - } -} - -EXPORT_SYMBOL(free_pages); - -/* - * Total amount of free (allocatable) RAM: - */ -unsigned int nr_free_pages(void) -{ - unsigned int sum = 0; - struct zone *zone; - - for_each_zone(zone) - sum += zone->free_pages; - - return sum; -} - -EXPORT_SYMBOL(nr_free_pages); - -unsigned int nr_used_zone_pages(void) -{ - unsigned int pages = 0; - struct zone *zone; - - for_each_zone(zone) - pages += zone->nr_active + zone->nr_inactive; - - return pages; -} - -#ifdef CONFIG_NUMA -unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) -{ - unsigned int i, sum = 0; - - for (i = 0; i < MAX_NR_ZONES; i++) - sum += pgdat->node_zones[i].free_pages; - - return sum; -} -#endif - -static unsigned int nr_free_zone_pages(int offset) -{ - pg_data_t *pgdat; - unsigned int sum = 0; - - for_each_pgdat(pgdat) { - struct zonelist *zonelist = pgdat->node_zonelists + offset; - struct zone **zonep = zonelist->zones; - struct zone *zone; - - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; - } - } - - return sum; -} - -/* - * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL - */ -unsigned int nr_free_buffer_pages(void) -{ - return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); -} - -/* - * Amount of free RAM allocatable within all zones - */ -unsigned int nr_free_pagecache_pages(void) -{ - return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); -} - -#ifdef CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) -{ - pg_data_t *pgdat; - unsigned int pages = 0; - - for_each_pgdat(pgdat) - pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - - return pages; -} -#endif - -#ifdef CONFIG_NUMA -static void show_node(struct zone *zone) -{ - printk("Node %d ", zone->zone_pgdat->node_id); -} -#else -#define show_node(zone) do { } while (0) -#endif - -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. 
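The comment above describes accumulating page_state across all CPUs: each CPU's structure is summed field by field, treated as a flat array of unsigned long, which is why the snapshot is only approximate. A user-space sketch of that pattern with an invented field set and CPU count:

/* Sketch of the per-CPU counter accumulation pattern used for
 * page_state snapshots; fields and CPU count are invented. */
#include <stdio.h>
#include <string.h>

#define NR_CPUS 4

struct page_state { unsigned long nr_dirty, nr_writeback, pgalloc, pgfree; };

static struct page_state per_cpu_state[NR_CPUS] = {
    { 3, 1, 100, 90 }, { 5, 0, 80, 75 }, { 2, 2, 120, 110 }, { 0, 0, 60, 61 },
};

int main(void)
{
    struct page_state ret;
    size_t nr = sizeof(ret) / sizeof(unsigned long);

    memset(&ret, 0, sizeof(ret));
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        /* walk the struct as a flat array, mirroring the field-by-field sum */
        unsigned long *in  = (unsigned long *)&per_cpu_state[cpu];
        unsigned long *out = (unsigned long *)&ret;

        for (size_t off = 0; off < nr; off++)
            out[off] += in[off];
    }
    printf("nr_dirty=%lu nr_writeback=%lu pgalloc=%lu pgfree=%lu\n",
           ret.nr_dirty, ret.nr_writeback, ret.pgalloc, ret.pgfree);
    return 0;
}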
- */ -DEFINE_PER_CPU(struct page_state, page_states) = {0}; -EXPORT_PER_CPU_SYMBOL(page_states); - -atomic_t nr_pagecache = ATOMIC_INIT(0); -EXPORT_SYMBOL(nr_pagecache); -#ifdef CONFIG_SMP -DEFINE_PER_CPU(long, nr_pagecache_local) = 0; -#endif - -void __get_page_state(struct page_state *ret, int nr) -{ - int cpu = 0; - - memset(ret, 0, sizeof(*ret)); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - if (!cpu_possible(cpu)) { - cpu++; - continue; - } - - in = (unsigned long *)&per_cpu(page_states, cpu); - cpu++; - if (cpu < NR_CPUS && cpu_possible(cpu)) - prefetch(&per_cpu(page_states, cpu)); - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_page_state(struct page_state *ret) -{ - int nr; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1); -} - -void get_full_page_state(struct page_state *ret) -{ - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); -} - -unsigned long __read_page_state(unsigned offset) -{ - unsigned long ret = 0; - int cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - unsigned long in; - - if (!cpu_possible(cpu)) - continue; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - -void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) -{ - struct zone *zone; - - *active = 0; - *inactive = 0; - *free = 0; - for_each_zone(zone) { - *active += zone->nr_active; - *inactive += zone->nr_inactive; - *free += zone->free_pages; - } -} - -void si_meminfo(struct sysinfo *val) -{ - val->totalram = totalram_pages; - val->sharedram = 0; - val->freeram = nr_free_pages(); - val->bufferram = nr_blockdev_pages(); -#ifdef CONFIG_HIGHMEM - val->totalhigh = totalhigh_pages; - val->freehigh = nr_free_highpages(); -#else - val->totalhigh = 0; - val->freehigh = 0; -#endif - val->mem_unit = PAGE_SIZE; -} - -EXPORT_SYMBOL(si_meminfo); - -#ifdef CONFIG_NUMA -void si_meminfo_node(struct sysinfo *val, int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - - val->totalram = pgdat->node_present_pages; - val->freeram = nr_free_pages_pgdat(pgdat); - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; - val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; - val->mem_unit = PAGE_SIZE; -} -#endif - -#define K(x) ((x) << (PAGE_SHIFT-10)) - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. - */ -void show_free_areas(void) -{ - struct page_state ps; - int cpu, temperature; - unsigned long active; - unsigned long inactive; - unsigned long free; - struct zone *zone; - - for_each_zone(zone) { - show_node(zone); - printk("%s per-cpu:", zone->name); - - if (!zone->present_pages) { - printk(" empty\n"); - continue; - } else - printk("\n"); - - for (cpu = 0; cpu < NR_CPUS; ++cpu) { - struct per_cpu_pageset *pageset; - - if (!cpu_possible(cpu)) - continue; - - pageset = zone->pageset + cpu; - - for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", - cpu, - temperature ? 
"cold" : "hot", - pageset->pcp[temperature].low, - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); - } - } - - get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); - - printk("\nFree pages: %11ukB (%ukB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " - "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", - active, - inactive, - ps.nr_dirty, - ps.nr_writeback, - ps.nr_unstable, - nr_free_pages(), - ps.nr_slab, - ps.nr_mapped, - ps.nr_page_table_pages); - - for_each_zone(zone) { - int i; - - show_node(zone); - printk("%s" - " free:%lukB" - " min:%lukB" - " low:%lukB" - " high:%lukB" - " active:%lukB" - " inactive:%lukB" - " present:%lukB" - "\n", - zone->name, - K(zone->free_pages), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), - K(zone->nr_active), - K(zone->nr_inactive), - K(zone->present_pages) - ); - printk("protections[]:"); - for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->protection[i]); - printk("\n"); - } - - for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; - - show_node(zone); - printk("%s: ", zone->name); - if (!zone->present_pages) { - printk("empty\n"); - continue; - } - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); - } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); - } - - show_swap_cache_info(); -} - -/* - * Builds allocation fallback zone lists. - */ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; - } - - return j; -} - -#ifdef CONFIG_NUMA -#define MAX_NODE_LOAD (numnodes) -static int __initdata node_load[MAX_NUMNODES]; -/** - * find_next_best_node - find the next node that should appear in a given - * node's fallback list - * @node: node whose fallback list we're appending - * @used_node_mask: pointer to the bitmap of already used nodes - * - * We use a number of factors to determine which is the next node that should - * appear on a given node's fallback list. The node should not have appeared - * already in @node's fallback list, and it should be the next closest node - * according to the distance array (which contains arbitrary distance values - * from each node to each node in the system), and should also prefer nodes - * with no CPUs, since presumably they'll have very little allocation pressure - * on them otherwise. - * It returns -1 if no node is found. 
- */ -static int __init find_next_best_node(int node, void *used_node_mask) -{ - int i, n, val; - int min_val = INT_MAX; - int best_node = -1; - - for (i = 0; i < numnodes; i++) { - cpumask_t tmp; - - /* Start from local node */ - n = (node+i)%numnodes; - - /* Don't want a node to appear more than once */ - if (test_bit(n, used_node_mask)) - continue; - - /* Use the distance array to find the distance */ - val = node_distance(node, n); - - /* Give preference to headless and unused nodes */ - tmp = node_to_cpumask(n); - if (!cpus_empty(tmp)) - val += PENALTY_FOR_NODE_WITH_CPUS; - - /* Slight preference for less loaded node */ - val *= (MAX_NODE_LOAD*MAX_NUMNODES); - val += node_load[n]; - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node >= 0) - set_bit(best_node, used_node_mask); - - return best_node; -} - -static void __init build_zonelists(pg_data_t *pgdat) -{ - int i, j, k, node, local_node; - int prev_node, load; - struct zonelist *zonelist; - DECLARE_BITMAP(used_mask, MAX_NUMNODES); - - /* initialize zonelists */ - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); - zonelist->zones[0] = NULL; - } - - /* NUMA-aware ordering of nodes */ - local_node = pgdat->node_id; - load = numnodes; - prev_node = local_node; - bitmap_zero(used_mask, MAX_NUMNODES); - while ((node = find_next_best_node(local_node, used_mask)) >= 0) { - /* - * We don't want to pressure a particular node. - * So adding penalty to the first node in same - * distance group to make it round-robin. - */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) - node_load[node] += load; - prev_node = node; - load--; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - for (j = 0; zonelist->zones[j] != NULL; j++); - - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - zonelist->zones[j] = NULL; - } - } -} - -#else /* CONFIG_NUMA */ - -static void __init build_zonelists(pg_data_t *pgdat) -{ - int i, j, k, node, local_node; - - local_node = pgdat->node_id; - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zonelist *zonelist; - - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); - - j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - j = build_zonelists_node(pgdat, zonelist, j, k); - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < numnodes; node++) - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - for (node = 0; node < local_node; node++) - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - - zonelist->zones[j] = NULL; - } -} - -#endif /* CONFIG_NUMA */ - -void __init build_all_zonelists(void) -{ - int i; - - for(i = 0 ; i < numnodes ; i++) - build_zonelists(NODE_DATA(i)); - printk("Built %i zonelists\n", numnodes); -} - -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. 
- * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -static inline unsigned long wait_table_size(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) - -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long realtotalpages, totalpages = 0; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - pgdat->node_present_pages = realtotalpages; - printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); -} - - -/* - * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is - * done. Non-atomic initialization, single-pass. - */ -void __init memmap_init_zone(struct page *start, unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) -{ - struct page *page; - - for (page = start; page < (start + size); page++) { - set_page_zone(page, NODEZONE(nid, zone)); - set_page_count(page, 0); - SetPageReserved(page); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. 
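wait_table_size() above picks the smallest power of two at least pages / PAGES_PER_WAITQUEUE, clamped to the range [4, 4096], and wait_table_bits() is the log2 of that size. A sketch evaluating both helpers for a few invented zone sizes:

/* Sketch of the waitqueue-hash sizing helpers shown above, assuming
 * PAGES_PER_WAITQUEUE = 256 as in the source. */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long wait_table_size(unsigned long pages)
{
    unsigned long size = 1;

    pages /= PAGES_PER_WAITQUEUE;
    while (size < pages)
        size <<= 1;
    if (size > 4096UL)                   /* min(size, 4096UL) */
        size = 4096UL;
    return size < 4UL ? 4UL : size;      /* max(size, 4UL) */
}

static unsigned long wait_table_bits(unsigned long size)
{
    unsigned long bits = 0;              /* log2 for a power-of-two size */

    while ((size >>= 1) != 0)
        bits++;
    return bits;
}

int main(void)
{
    unsigned long zone_pages[] = { 1000, 65536, 262144, 2097152 };

    for (int i = 0; i < 4; i++) {
        unsigned long sz = wait_table_size(zone_pages[i]);

        printf("%8lu pages -> table size %4lu (%lu bits)\n",
               zone_pages[i], sz, wait_table_bits(sz));
    }
    return 0;
}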
*/ - if (zone != ZONE_HIGHMEM) - set_page_address(page, __va(start_pfn << PAGE_SHIFT)); -#endif - start_pfn++; - } -} - -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(start, size, nid, zone, start_pfn) \ - memmap_init_zone((start), (size), (nid), (zone), (start_pfn)) -#endif - -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - */ -static void __init free_area_init_core(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - int cpu, nid = pgdat->node_id; - struct page *lmem_map = pgdat->node_mem_map; - unsigned long zone_start_pfn = pgdat->node_start_pfn; - - pgdat->nr_zones = 0; - init_waitqueue_head(&pgdat->kswapd_wait); - - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize; - unsigned long batch; - - zone_table[NODEZONE(nid, j)] = zone; - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; - - zone->spanned_pages = size; - zone->present_pages = realsize; - zone->name = zone_names[j]; - spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); - zone->zone_pgdat = pgdat; - zone->free_pages = 0; - - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - } - printk(" %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); - atomic_set(&zone->nr_scan_active, 0); - atomic_set(&zone->nr_scan_inactive, 0); - zone->nr_active = 0; - zone->nr_inactive = 0; - if (!size) - continue; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. 
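free_area_init_core() above sizes the per-cpu pools at roughly 1/1000th of the zone, caps the batch at a quarter megabyte, quarters it, and then runs the hot list between 2*batch and 6*batch pages. A sketch of that sizing for a few invented zone sizes, assuming 4 KiB pages:

/* Sketch of the per-cpu-pages batch sizing in free_area_init_core(),
 * assuming PAGE_SIZE = 4096. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
    unsigned long zone_pages[] = { 4096, 65536, 262144 };  /* 16MB, 256MB, 1GB zones */

    for (int i = 0; i < 3; i++) {
        unsigned long batch = zone_pages[i] / 1024;

        if (batch * PAGE_SIZE > 256 * 1024)
            batch = (256 * 1024) / PAGE_SIZE;
        batch /= 4;                    /* effectively multiplied back via low/high */
        if (batch < 1)
            batch = 1;
        printf("%7lu pages: batch=%2lu hot low=%3lu high=%3lu cold high=%3lu\n",
               zone_pages[i], batch, 2 * batch, 6 * batch, 2 * batch);
    }
    return 0;
}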
- */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = lmem_map; - zone->zone_start_pfn = zone_start_pfn; - - if ((zone_start_pfn) & (zone_required_alignment-1)) - printk("BUG: wrong zone alignment, it will crash\n"); - - memmap_init(lmem_map, size, nid, j, zone_start_pfn); - - zone_start_pfn += size; - lmem_map += size; - - for (i = 0; ; i++) { - unsigned long bitmap_size; - - INIT_LIST_HEAD(&zone->free_area[i].free_list); - if (i == MAX_ORDER-1) { - zone->free_area[i].map = NULL; - break; - } - - /* - * Page buddy system uses "index >> (i+1)", - * where "index" is at most "size-1". - * - * The extra "+3" is to round down to byte - * size (8 bits per byte assumption). Thus - * we get "(size-1) >> (i+4)" as the last byte - * we can access. - * - * The "+1" is because we want to round the - * byte allocation up rather than down. So - * we should have had a "+7" before we shifted - * down by three. Also, we have to add one as - * we actually _use_ the last bit (it's [0,n] - * inclusive, not [0,n[). - * - * So we actually had +7+1 before we shift - * down by 3. But (n+8) >> 3 == (n >> 3) + 1 - * (modulo overflows, which we do not have). - * - * Finally, we LONG_ALIGN because all bitmap - * operations are on longs. - */ - bitmap_size = (size-1) >> (i+4); - bitmap_size = LONG_ALIGN(bitmap_size+1); - zone->free_area[i].map = - (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); - } - } -} - -void __init free_area_init_node(int nid, struct pglist_data *pgdat, - struct page *node_mem_map, unsigned long *zones_size, - unsigned long node_start_pfn, unsigned long *zholes_size) -{ - unsigned long size; - - pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; - calculate_zone_totalpages(pgdat, zones_size, zholes_size); - if (!node_mem_map) { - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); - node_mem_map = alloc_bootmem_node(pgdat, size); - } - pgdat->node_mem_map = node_mem_map; - - free_area_init_core(pgdat, zones_size, zholes_size); -} - -#ifndef CONFIG_DISCONTIGMEM -static bootmem_data_t contig_bootmem_data; -struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; - -EXPORT_SYMBOL(contig_page_data); - -void __init free_area_init(unsigned long *zones_size) -{ - free_area_init_node(0, &contig_page_data, NULL, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); - mem_map = contig_page_data.node_mem_map; -} -#endif - -#ifdef CONFIG_PROC_FS - -#include - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return pgdat->pgdat_next; -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* - * This walks the freelist for each zone. 
Whilst this is slow, I'd rather - * be slow here than slow down the fast path by keeping stats - mjbligh - */ -static int frag_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - int order; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { - unsigned long nr_bufs = 0; - struct list_head *elem; - - list_for_each(elem, &(zone->free_area[order].free_list)) - ++nr_bufs; - seq_printf(m, "%6lu ", nr_bufs); - } - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations fragmentation_op = { - .start = frag_start, - .next = frag_next, - .stop = frag_stop, - .show = frag_show, -}; - -static char *vmstat_text[] = { - "nr_dirty", - "nr_writeback", - "nr_unstable", - "nr_page_table_pages", - "nr_mapped", - "nr_slab", - - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - "pgalloc_high", - - "pgalloc_normal", - "pgalloc_dma", - "pgfree", - "pgactivate", - "pgdeactivate", - - "pgfault", - "pgmajfault", - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma", - - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma", - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - - "pgscan_kswapd_dma", - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma", - "pginodesteal", - - "slabs_scanned", - "kswapd_steal", - "kswapd_inodesteal", - "pageoutrun", - "allocstall", - - "pgrotated", -}; - -static void *vmstat_start(struct seq_file *m, loff_t *pos) -{ - struct page_state *ps; - - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) - return ERR_PTR(-ENOMEM); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; -} - -static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -{ - (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - return (unsigned long *)m->private + *pos; -} - -static int vmstat_show(struct seq_file *m, void *arg) -{ - unsigned long *l = arg; - unsigned long off = l - (unsigned long *)m->private; - - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); - return 0; -} - -static void vmstat_stop(struct seq_file *m, void *arg) -{ - kfree(m->private); - m->private = NULL; -} - -struct seq_operations vmstat_op = { - .start = vmstat_start, - .next = vmstat_next, - .stop = vmstat_stop, - .show = vmstat_show, -}; - -#endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_HOTPLUG_CPU -static int page_alloc_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int cpu = (unsigned long)hcpu; - long *count; - - if (action == CPU_DEAD) { - /* Drain local pagecache count. 
*/ - count = &per_cpu(nr_pagecache_local, cpu); - atomic_add(*count, &nr_pagecache); - *count = 0; - local_irq_disable(); - __drain_pages(cpu); - local_irq_enable(); - } - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -void __init page_alloc_init(void) -{ - hotcpu_notifier(page_alloc_cpu_notify, 0); -} - -static unsigned long higherzone_val(struct zone *z, int max_zone, - int alloc_type) -{ - int z_idx = zone_idx(z); - struct zone *higherzone; - unsigned long pages; - - /* there is no higher zone to get a contribution from */ - if (z_idx == MAX_NR_ZONES-1) - return 0; - - higherzone = &z->zone_pgdat->node_zones[z_idx+1]; - - /* We always start with the higher zone's protection value */ - pages = higherzone->protection[alloc_type]; - - /* - * We get a lower-zone-protection contribution only if there are - * pages in the higher zone and if we're not the highest zone - * in the current zonelist. e.g., never happens for GFP_DMA. Happens - * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA - * and ZONE_NORMAL for a GFP_HIGHMEM allocation. - */ - if (higherzone->present_pages && z_idx < alloc_type) - pages += higherzone->pages_low * sysctl_lower_zone_protection; - - return pages; -} - -/* - * setup_per_zone_protection - called whenver min_free_kbytes or - * sysctl_lower_zone_protection changes. Ensures that each zone - * has a correct pages_protected value, so an adequate number of - * pages are left in the zone after a successful __alloc_pages(). - * - * This algorithm is way confusing. I tries to keep the same behavior - * as we had with the incremental min iterative algorithm. - */ -static void setup_per_zone_protection(void) -{ - struct pglist_data *pgdat; - struct zone *zones, *zone; - int max_zone; - int i, j; - - for_each_pgdat(pgdat) { - zones = pgdat->node_zones; - - for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++) - if (zones[i].present_pages) - max_zone = i; - - /* - * For each of the different allocation types: - * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM - */ - for (i = 0; i < MAX_NR_ZONES; i++) { - /* - * For each of the zones: - * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA - */ - for (j = MAX_NR_ZONES-1; j >= 0; j--) { - zone = &zones[j]; - - /* - * We never protect zones that don't have memory - * in them (j>max_zone) or zones that aren't in - * the zonelists for a certain type of - * allocation (j>i). We have to assign these to - * zero because the lower zones take - * contributions from the higher zones. - */ - if (j > max_zone || j > i) { - zone->protection[i] = 0; - continue; - } - /* - * The contribution of the next higher zone - */ - zone->protection[i] = higherzone_val(zone, - max_zone, i); - zone->protection[i] += zone->pages_low; - } - } - } -} - -/* - * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures - * that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. - */ -static void setup_per_zone_pages_min(void) -{ - unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); - unsigned long lowmem_pages = 0; - struct zone *zone; - unsigned long flags; - - /* Calculate total number of !ZONE_HIGHMEM pages */ - for_each_zone(zone) { - if (!is_highmem(zone)) - lowmem_pages += zone->present_pages; - } - - for_each_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - if (is_highmem(zone)) { - /* - * Often, highmem doesn't need to reserve any pages. 
- * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. - */ - int min_pages; - - min_pages = zone->present_pages / 1024; - if (min_pages < SWAP_CLUSTER_MAX) - min_pages = SWAP_CLUSTER_MAX; - if (min_pages > 128) - min_pages = 128; - zone->pages_min = min_pages; - } else { - /* if it's a lowmem zone, reserve a number of pages - * proportionate to the zone's size. - */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; - } - - zone->pages_low = zone->pages_min * 2; - zone->pages_high = zone->pages_min * 3; - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * Initialise min_free_kbytes. - * - * For small machines we want it small (128k min). For large machines - * we want it large (16MB max). But it is not linear, because network - * bandwidth does not increase linearly with machine size. We use - * - * min_free_kbytes = sqrt(lowmem_kbytes) - * - * which yields - * - * 16MB: 128k - * 32MB: 181k - * 64MB: 256k - * 128MB: 362k - * 256MB: 512k - * 512MB: 724k - * 1024MB: 1024k - * 2048MB: 1448k - * 4096MB: 2048k - * 8192MB: 2896k - * 16384MB: 4096k - */ -static int __init init_per_zone_pages_min(void) -{ - unsigned long lowmem_kbytes; - - lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 16384) - min_free_kbytes = 16384; - setup_per_zone_pages_min(); - setup_per_zone_protection(); - return 0; -} -module_init(init_per_zone_pages_min) - -/* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so - * that we can call two helper functions whenever min_free_kbytes - * changes. - */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length) -{ - proc_dointvec(table, write, file, buffer, length); - setup_per_zone_pages_min(); - setup_per_zone_protection(); - return 0; -} - -/* - * lower_zone_protection_sysctl_handler - just a wrapper around - * proc_dointvec() so that we can call setup_per_zone_protection() - * whenever sysctl_lower_zone_protection changes. - */ -int lower_zone_protection_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length) -{ - proc_dointvec_minmax(table, write, file, buffer, length); - setup_per_zone_protection(); - return 0; -} -- 2.47.0
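
A note on the hunks above: the long comment in the deleted free_area_init_core() derives the size of each per-order buddy bitmap as "(size-1) >> (i+4)" bytes, rounded up and long-aligned. Below is a minimal userspace sketch of that arithmetic; LONG_ALIGN is re-created from its usual kernel definition, and buddy_bitmap_bytes() is a made-up helper name, not a kernel function.

/*
 * Sketch of the buddy-bitmap sizing from the removed free_area_init_core().
 * LONG_ALIGN mirrors the usual kernel macro; buddy_bitmap_bytes() is an
 * illustrative name only.
 */
#include <stdio.h>

#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

/* Bytes needed for the order-i buddy bitmap of a zone spanning 'size' pages. */
static unsigned long buddy_bitmap_bytes(unsigned long size, int i)
{
	unsigned long bitmap_size;

	/*
	 * The highest bit index used is (size-1) >> (i+1); shifting by a
	 * further 3 turns bits into a byte index, and the +1 rounds the
	 * byte count up, as the removed comment works out step by step.
	 */
	bitmap_size = (size - 1) >> (i + 4);
	return LONG_ALIGN(bitmap_size + 1);
}

int main(void)
{
	unsigned long size = 4096;	/* e.g. a 16MB zone with 4KB pages */
	int i;

	for (i = 0; i < 4; i++)
		printf("order %d: %lu bytes\n", i, buddy_bitmap_bytes(size, i));
	return 0;
}

For a 4096-page zone this prints 256, 128, 64 and 32 bytes for orders 0 to 3, matching the one-bit-per-buddy-pair bookkeeping the comment describes.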
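
The deleted higherzone_val()/setup_per_zone_protection() pair builds each zone's protection[] by walking from the highest zone down and folding in the next higher zone's value. A self-contained sketch of that accumulation follows; the zone sizes, pages_low values and sysctl_lower_zone_protection = 1 are invented for illustration, only the accumulation rule is taken from the deleted code.

/*
 * Sketch of the lower-zone protection pass from the removed
 * setup_per_zone_protection()/higherzone_val().  Zone sizes, pages_low and
 * the sysctl value are invented example numbers.
 */
#include <stdio.h>

#define MAX_NR_ZONES 3		/* ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM */

static const char *names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static unsigned long present[MAX_NR_ZONES] = { 4096, 225280, 32768 };
static unsigned long pages_low[MAX_NR_ZONES] = { 42, 1024, 160 };
static unsigned long protection[MAX_NR_ZONES][MAX_NR_ZONES];
static unsigned long sysctl_lower_zone_protection = 1;

int main(void)
{
	int max_zone = 0, i, j;

	for (i = 0; i < MAX_NR_ZONES; i++)
		if (present[i])
			max_zone = i;

	/* i: allocation class (DMA -> KERNEL -> HIGHMEM); j: zone, walked
	 * from the highest zone down to ZONE_DMA. */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		for (j = MAX_NR_ZONES - 1; j >= 0; j--) {
			unsigned long higher = 0;

			/* zones above the highest populated zone, or above
			 * this class's zonelist, are never protected */
			if (j > max_zone || j > i) {
				protection[j][i] = 0;
				continue;
			}
			if (j < MAX_NR_ZONES - 1) {
				/* start from the next higher zone's value */
				higher = protection[j + 1][i];
				/* and take its scaled pages_low when it has
				 * memory and we sit below it in the zonelist */
				if (present[j + 1] && j < i)
					higher += pages_low[j + 1] *
						  sysctl_lower_zone_protection;
			}
			protection[j][i] = higher + pages_low[j];
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++)
		printf("%-8s protection: %lu %lu %lu\n", names[j],
		       protection[j][0], protection[j][1], protection[j][2]);
	return 0;
}

With these example numbers ZONE_DMA ends up protecting 42, 2090 and 2410 pages against DMA, kernel and highmem allocations respectively: its own pages_low, plus a contribution from each higher zone only when that zone appears in the allocation class's zonelist, which is the behaviour the removed comment describes.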
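
setup_per_zone_pages_min(), also deleted above, splits the global pages_min across the lowmem zones in proportion to their size, gives highmem only a small reclaim-batching reserve, and derives pages_low and pages_high as 2x and 3x pages_min. The sketch below reruns that arithmetic for an invented 512MB-lowmem layout; PAGE_SHIFT and SWAP_CLUSTER_MAX carry typical x86 values of that era.

/*
 * Sketch of the watermark arithmetic in the removed setup_per_zone_pages_min().
 * PAGE_SHIFT, SWAP_CLUSTER_MAX and the zone sizes are example values; the
 * formulas are the ones from the deleted code.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define SWAP_CLUSTER_MAX	32

int main(void)
{
	unsigned long min_free_kbytes = 724;	/* the 512MB-lowmem table entry */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long present[3] = { 4096, 126976, 98304 }; /* DMA, Normal, HighMem */
	int highmem[3] = { 0, 0, 1 };
	unsigned long lowmem_pages = 0;
	int i;

	for (i = 0; i < 3; i++)
		if (!highmem[i])
			lowmem_pages += present[i];

	for (i = 0; i < 3; i++) {
		unsigned long min;

		if (highmem[i]) {
			/* highmem only needs enough to batch reclaim sanely */
			min = present[i] / 1024;
			if (min < SWAP_CLUSTER_MAX)
				min = SWAP_CLUSTER_MAX;
			if (min > 128)
				min = 128;
		} else {
			/* lowmem: proportional share of the global minimum */
			min = pages_min * present[i] / lowmem_pages;
		}
		printf("zone %d: min=%lu low=%lu high=%lu\n",
		       i, min, min * 2, min * 3);
	}
	return 0;
}

The clamp to at least SWAP_CLUSTER_MAX pages matches the removed comment's point that these watermarks also pace page reclaim batching, not just allocation.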
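
Finally, the table quoted above init_per_zone_pages_min() follows from min_free_kbytes = sqrt(lowmem_kbytes), clamped to the 128..16384 range. The deleted code uses the kernel's int_sqrt(); the sketch below substitutes a naive integer square root (isqrt, a stand-in name) to reproduce the quoted values.

/*
 * Sketch of the min_free_kbytes sizing rule from the removed
 * init_per_zone_pages_min().  isqrt() stands in for the kernel's int_sqrt();
 * the clamp bounds match the deleted code.
 */
#include <stdio.h>

static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long mb;

	for (mb = 16; mb <= 16384; mb *= 2) {
		unsigned long lowmem_kbytes = mb * 1024;
		unsigned long min_free_kbytes = isqrt(lowmem_kbytes);

		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 16384)
			min_free_kbytes = 16384;
		printf("%6luMB lowmem -> min_free_kbytes = %lu\n",
		       mb, min_free_kbytes);
	}
	return 0;
}

Doubling lowmem therefore grows the reserve only by a factor of about 1.4, the sub-linear scaling the removed comment justifies by noting that network bandwidth does not grow linearly with machine size.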