diff -uNr linux-2.5.22/Makefile linux-2.5.22-rmap13b/Makefile --- linux-2.5.22/Makefile Tue Jun 18 13:41:43 2002 +++ linux-2.5.22-rmap13b/Makefile Tue Jun 18 13:48:41 2002 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 22 -EXTRAVERSION = +EXTRAVERSION = -rmap13b # We are using a recursive build, so we need to do a little thinking # to get the ordering right. diff -uNr linux-2.5.22/drivers/block/blkpg.c linux-2.5.22-rmap13b/drivers/block/blkpg.c --- linux-2.5.22/drivers/block/blkpg.c Tue Jun 18 13:41:52 2002 +++ linux-2.5.22-rmap13b/drivers/block/blkpg.c Tue Jun 18 13:47:38 2002 @@ -35,9 +35,9 @@ #include #include #include /* for EXPORT_SYMBOL */ +#include #include #include - #include /* @@ -218,6 +218,9 @@ request_queue_t *q; u64 ullval = 0; int intval; + blkelv_ioctl_arg_t elevator_arg; + elevator_t *e; + int ret; unsigned short usval; kdev_t dev = to_kdev_t(bdev->bd_dev); int holder; diff -uNr linux-2.5.22/drivers/block/elevator.c linux-2.5.22-rmap13b/drivers/block/elevator.c --- linux-2.5.22/drivers/block/elevator.c Thu May 2 17:22:56 2002 +++ linux-2.5.22-rmap13b/drivers/block/elevator.c Tue Jun 18 13:47:38 2002 @@ -162,32 +162,46 @@ int elevator_linus_merge(request_queue_t *q, struct request **req, struct bio *bio) { + int max_bomb_segments; + int *latency; struct list_head *entry; struct request *__rq; + int merge_only = 0; int ret; if ((ret = elv_try_last_merge(q, req, bio))) return ret; + latency = q->elevator.elevator_data; + max_bomb_segments = 0; + if (latency) + max_bomb_segments = latency[2]; + entry = &q->queue_head; ret = ELEVATOR_NO_MERGE; while ((entry = entry->prev) != &q->queue_head) { __rq = list_entry_rq(entry); - if (__rq->flags & (REQ_BARRIER | REQ_STARTED)) + if (__rq->flags & (REQ_BARRIER | REQ_STARTED)) { + max_bomb_segments = 0; /* No read promotions */ break; + } - /* - * simply "aging" of requests in queue - */ - if (elv_linus_sequence(__rq)-- <= 0) - break; + if (elv_linus_sequence(__rq)-- <= 0) { + /* + * OK, we've exceeded 
someone's latency limit. + * But we still continue to look for merges, + * because they're so much better than seeks. + */ + merge_only = 1; + } if (!(__rq->flags & REQ_CMD)) continue; if (elv_linus_sequence(__rq) < bio_sectors(bio)) - break; + merge_only = 1; - if (!*req && bio_rq_in_between(bio, __rq, &q->queue_head)) + if (!*req && !merge_only && + bio_rq_in_between(bio, __rq, &q->queue_head)) *req = __rq; if ((ret = elv_try_merge(__rq, bio))) { @@ -199,6 +213,49 @@ } } + /* + * If we failed to merge a read anywhere in the request + * queue, we really don't want to place it at the end + * of the list, behind lots of writes. So place it near + * the front. + * + * We don't want to place it in front of _all_ writes: that + * would create lots of seeking, and isn't tunable. + * We try to avoid promoting this read in front of existing + * reads. + * + * max_bomb_segments becomes the maximum number of write + * requests which we allow to remain in place in front of + * a newly introduced read. We weight things a little bit, + * so large writes are more expensive than small ones, but it's + * requests which count, not sectors. + */ + if (max_bomb_segments && bio_data_dir(bio) == READ && + ret == ELEVATOR_NO_MERGE) { + int cur_latency = 0; + struct request * const cur_request = *req; + + entry = q->queue_head.next; + while (entry != &q->queue_head) { + __rq = list_entry_rq(entry); + if (__rq == cur_request) { + /* + * This is where the old algorithm placed it. + * There's no point pushing it further back, + * so leave it here, in sorted order. 
+ */ + break; + } + if (rq_data_dir(__rq) == WRITE) { + cur_latency += 1 + __rq->nr_sectors / 64; + if (cur_latency >= max_bomb_segments) { + *req = __rq; + break; + } + } + entry = entry->next; + } + } return ret; } @@ -251,12 +308,13 @@ { int *latency; - latency = kmalloc(2 * sizeof(int), GFP_KERNEL); + latency = kmalloc(3 * sizeof(int), GFP_KERNEL); if (!latency) return -ENOMEM; latency[READ] = 8192; latency[WRITE] = 16384; + latency[2] = 6; /* max_bomb_segments */ e->elevator_data = latency; return 0; diff -uNr linux-2.5.22/drivers/block/ll_rw_blk.c linux-2.5.22-rmap13b/drivers/block/ll_rw_blk.c --- linux-2.5.22/drivers/block/ll_rw_blk.c Tue Jun 18 13:41:53 2002 +++ linux-2.5.22-rmap13b/drivers/block/ll_rw_blk.c Tue Jun 18 13:47:38 2002 @@ -2002,8 +2002,8 @@ queue_nr_requests = (total_ram >> 8) & ~15; /* One per quarter-megabyte */ if (queue_nr_requests < 32) queue_nr_requests = 32; - if (queue_nr_requests > 512) - queue_nr_requests = 512; + if (queue_nr_requests > 1024) + queue_nr_requests = 1024; /* * Batch frees according to queue length diff -uNr linux-2.5.22/drivers/pci/pci-driver.c linux-2.5.22-rmap13b/drivers/pci/pci-driver.c --- linux-2.5.22/drivers/pci/pci-driver.c Wed Jun 12 16:07:01 2002 +++ linux-2.5.22-rmap13b/drivers/pci/pci-driver.c Tue Jun 18 13:47:38 2002 @@ -210,3 +210,4 @@ EXPORT_SYMBOL(pci_register_driver); EXPORT_SYMBOL(pci_unregister_driver); EXPORT_SYMBOL(pci_dev_driver); +EXPORT_SYMBOL(pci_bus_type); diff -uNr linux-2.5.22/drivers/scsi/constants.c linux-2.5.22-rmap13b/drivers/scsi/constants.c --- linux-2.5.22/drivers/scsi/constants.c Tue Jun 18 13:42:02 2002 +++ linux-2.5.22-rmap13b/drivers/scsi/constants.c Tue Jun 18 13:47:38 2002 @@ -992,11 +992,14 @@ s = 4; } -#if !(CONSTANTS & CONST_SENSE) +#if !(CONSTANTS & CONST_SENSE) +{ + int i; printk("Raw sense data:"); for (i = 0; i < s; ++i) printk("0x%02x ", sense_buffer[i]); printk("\n"); +} #endif } diff -uNr linux-2.5.22/fs/buffer.c linux-2.5.22-rmap13b/fs/buffer.c --- 
linux-2.5.22/fs/buffer.c Tue Jun 18 13:42:06 2002 +++ linux-2.5.22-rmap13b/fs/buffer.c Tue Jun 18 13:47:38 2002 @@ -475,17 +475,13 @@ } /* - * FIXME: What is this function actually trying to do? Why "zones[0]"? + * FIXME: What is this function actually trying to do? * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER? */ static void free_more_memory(void) { - zone_t *zone; - - zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages(GFP_NOFS); blk_run_queues(); __set_current_state(TASK_RUNNING); yield(); diff -uNr linux-2.5.22/fs/dcache.c linux-2.5.22-rmap13b/fs/dcache.c --- linux-2.5.22/fs/dcache.c Wed Jun 12 16:07:07 2002 +++ linux-2.5.22-rmap13b/fs/dcache.c Tue Jun 18 13:47:38 2002 @@ -602,8 +602,7 @@ count = dentry_stat.nr_unused / priority; prune_dcache(count); - kmem_cache_shrink(dentry_cache); - return 0; + return kmem_cache_shrink(dentry_cache); } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) diff -uNr linux-2.5.22/fs/dquot.c linux-2.5.22-rmap13b/fs/dquot.c --- linux-2.5.22/fs/dquot.c Tue Jun 18 13:42:06 2002 +++ linux-2.5.22-rmap13b/fs/dquot.c Tue Jun 18 13:47:38 2002 @@ -498,8 +498,7 @@ count = dqstats.free_dquots / priority; prune_dqcache(count); unlock_kernel(); - kmem_cache_shrink(dquot_cachep); - return 0; + return kmem_cache_shrink(dquot_cachep); } /* diff -uNr linux-2.5.22/fs/exec.c linux-2.5.22-rmap13b/fs/exec.c --- linux-2.5.22/fs/exec.c Wed Jun 12 15:44:33 2002 +++ linux-2.5.22-rmap13b/fs/exec.c Tue Jun 18 13:47:38 2002 @@ -36,6 +36,7 @@ #include #include #include +#include #define __NO_VERSION__ #include #include @@ -283,6 +284,7 @@ flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + page_add_rmap(page, pte); pte_unmap(pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); diff -uNr linux-2.5.22/fs/inode.c linux-2.5.22-rmap13b/fs/inode.c --- 
linux-2.5.22/fs/inode.c Tue Jun 18 13:42:06 2002 +++ linux-2.5.22-rmap13b/fs/inode.c Tue Jun 18 13:47:38 2002 @@ -431,8 +431,7 @@ count = inodes_stat.nr_unused / priority; prune_icache(count); - kmem_cache_shrink(inode_cachep); - return 0; + return kmem_cache_shrink(inode_cachep); } /* diff -uNr linux-2.5.22/fs/mpage.c linux-2.5.22-rmap13b/fs/mpage.c --- linux-2.5.22/fs/mpage.c Tue Jun 18 13:42:06 2002 +++ linux-2.5.22-rmap13b/fs/mpage.c Tue Jun 18 13:47:38 2002 @@ -518,15 +518,6 @@ if (page->mapping && TestClearPageDirty(page) && !PageWriteback(page)) { - /* FIXME: batch this up */ - if (!PageActive(page) && PageLRU(page)) { - spin_lock(&pagemap_lru_lock); - if (!PageActive(page) && PageLRU(page)) { - list_del(&page->lru); - list_add(&page->lru, &inactive_list); - } - spin_unlock(&pagemap_lru_lock); - } bio = mpage_writepage(bio, page, get_block, &last_block_in_bio, &ret); if (ret || (nr_to_write && --(*nr_to_write) <= 0)) diff -uNr linux-2.5.22/fs/proc/proc_misc.c linux-2.5.22-rmap13b/fs/proc/proc_misc.c --- linux-2.5.22/fs/proc/proc_misc.c Wed Jun 12 15:44:33 2002 +++ linux-2.5.22-rmap13b/fs/proc/proc_misc.c Tue Jun 18 13:47:38 2002 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -150,7 +151,9 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "Inact_dirty: %8lu kB\n" + "Inact_clean: %8lu kB\n" + "Inact_target: %8lu kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -164,8 +167,10 @@ K(i.sharedram), K(ps.nr_pagecache-swapper_space.nrpages), K(swapper_space.nrpages), - K(ps.nr_active), - K(ps.nr_inactive), + K(ps.nr_active_pages), + K(ps.nr_inactive_dirty_pages), + K(ps.nr_inactive_clean_pages), + K(inactive_target()), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -uNr linux-2.5.22/include/asm-alpha/rmap.h linux-2.5.22-rmap13b/include/asm-alpha/rmap.h --- linux-2.5.22/include/asm-alpha/rmap.h Wed Dec 31 17:00:00 1969 +++ 
linux-2.5.22-rmap13b/include/asm-alpha/rmap.h Tue Jun 18 13:47:38 2002 @@ -0,0 +1,7 @@ +#ifndef _ALPHA_RMAP_H +#define _ALPHA_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-arm/proc-armv/rmap.h linux-2.5.22-rmap13b/include/asm-arm/proc-armv/rmap.h --- linux-2.5.22/include/asm-arm/proc-armv/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-arm/proc-armv/rmap.h Tue Jun 18 13:47:38 2002 @@ -0,0 +1,72 @@ +#ifndef _ARMV_RMAP_H +#define _ARMV_RMAP_H +/* + * linux/include/asm-arm/proc-armv/rmap.h + * + * Architecture dependent parts of the reverse mapping code. + * + * We use the struct page of the page table page to find a pointer + * to an array of two 'struct arm_rmap_info's, one for each of the + * two page tables in each page. + * + * - rmi->mm points to the process' mm_struct + * - rmi->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +struct arm_rmap_info { + struct mm_struct *mm; + unsigned long index; +}; + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = mm; + rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + rmi->mm = NULL; + rmi->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + + if (((unsigned long)ptep)&2048) + rmi++; + + return rmi->mm; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = 
virt_to_page(ptep); + struct arm_rmap_info *rmi = (void *)page->mapping; + unsigned long low_bits; + + if (((unsigned long)ptep)&2048) + rmi++; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return rmi->index + low_bits; +} + +#endif /* _ARMV_RMAP_H */ diff -uNr linux-2.5.22/include/asm-arm/rmap.h linux-2.5.22-rmap13b/include/asm-arm/rmap.h --- linux-2.5.22/include/asm-arm/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-arm/rmap.h Tue Jun 18 13:47:38 2002 @@ -0,0 +1,6 @@ +#ifndef _ARM_RMAP_H +#define _ARM_RMAP_H + +#include + +#endif /* _ARM_RMAP_H */ diff -uNr linux-2.5.22/include/asm-cris/rmap.h linux-2.5.22-rmap13b/include/asm-cris/rmap.h --- linux-2.5.22/include/asm-cris/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-cris/rmap.h Tue Jun 18 13:47:38 2002 @@ -0,0 +1,7 @@ +#ifndef _CRIS_RMAP_H +#define _CRIS_RMAP_H + +/* nothing to see, move along :) */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-generic/rmap.h linux-2.5.22-rmap13b/include/asm-generic/rmap.h --- linux-2.5.22/include/asm-generic/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-generic/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,66 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependent parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. 
+ * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) +{ +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_add_rmap_kernel(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); +#ifdef BROKEN_PPC_PTE_ALLOC_ONE + /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ + extern int mem_init_done; + + if (!mem_init_done) + return; +#endif + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(struct page * page) +{ + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + unsigned long low_bits; + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#endif /* _GENERIC_RMAP_H */ diff -uNr linux-2.5.22/include/asm-i386/rmap.h linux-2.5.22-rmap13b/include/asm-i386/rmap.h --- linux-2.5.22/include/asm-i386/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-i386/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to 
see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-ia64/rmap.h linux-2.5.22-rmap13b/include/asm-ia64/rmap.h --- linux-2.5.22/include/asm-ia64/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-ia64/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _IA64_RMAP_H +#define _IA64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-m68k/rmap.h linux-2.5.22-rmap13b/include/asm-m68k/rmap.h --- linux-2.5.22/include/asm-m68k/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-m68k/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _M68K_RMAP_H +#define _M68K_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-mips/rmap.h linux-2.5.22-rmap13b/include/asm-mips/rmap.h --- linux-2.5.22/include/asm-mips/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-mips/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _MIPS_RMAP_H +#define _MIPS_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-mips64/rmap.h linux-2.5.22-rmap13b/include/asm-mips64/rmap.h --- linux-2.5.22/include/asm-mips64/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-mips64/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _MIPS64_RMAP_H +#define _MIPS64_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-parisc/rmap.h linux-2.5.22-rmap13b/include/asm-parisc/rmap.h --- linux-2.5.22/include/asm-parisc/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-parisc/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _PARISC_RMAP_H +#define _PARISC_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-ppc/rmap.h linux-2.5.22-rmap13b/include/asm-ppc/rmap.h --- linux-2.5.22/include/asm-ppc/rmap.h Wed Dec 31 17:00:00 1969 +++ 
linux-2.5.22-rmap13b/include/asm-ppc/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,9 @@ +#ifndef _PPC_RMAP_H +#define _PPC_RMAP_H + +/* PPC calls pte_alloc() before mem_map[] is setup ... */ +#define BROKEN_PPC_PTE_ALLOC_ONE + +#include + +#endif diff -uNr linux-2.5.22/include/asm-s390/rmap.h linux-2.5.22-rmap13b/include/asm-s390/rmap.h --- linux-2.5.22/include/asm-s390/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-s390/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _S390_RMAP_H +#define _S390_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-s390x/rmap.h linux-2.5.22-rmap13b/include/asm-s390x/rmap.h --- linux-2.5.22/include/asm-s390x/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-s390x/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _S390X_RMAP_H +#define _S390X_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-sh/rmap.h linux-2.5.22-rmap13b/include/asm-sh/rmap.h --- linux-2.5.22/include/asm-sh/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-sh/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _SH_RMAP_H +#define _SH_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-sparc/rmap.h linux-2.5.22-rmap13b/include/asm-sparc/rmap.h --- linux-2.5.22/include/asm-sparc/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-sparc/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _SPARC_RMAP_H +#define _SPARC_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -uNr linux-2.5.22/include/asm-sparc64/rmap.h linux-2.5.22-rmap13b/include/asm-sparc64/rmap.h --- linux-2.5.22/include/asm-sparc64/rmap.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/asm-sparc64/rmap.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,7 @@ +#ifndef _SPARC64_RMAP_H +#define _SPARC64_RMAP_H + +/* nothing to see, move along */ +#include + 
+#endif diff -uNr linux-2.5.22/include/linux/elevator.h linux-2.5.22-rmap13b/include/linux/elevator.h --- linux-2.5.22/include/linux/elevator.h Thu May 2 17:22:39 2002 +++ linux-2.5.22-rmap13b/include/linux/elevator.h Tue Jun 18 13:47:39 2002 @@ -16,6 +16,8 @@ typedef int (elevator_init_fn) (request_queue_t *, elevator_t *); typedef void (elevator_exit_fn) (request_queue_t *, elevator_t *); +struct blkelv_ioctl_arg_s; +typedef int (elevator_ioctl_fn)(elevator_t *, int cmd, struct blkelv_ioctl_arg_s *); struct elevator_s { @@ -32,6 +34,8 @@ elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; + elevator_ioctl_fn *elevator_ioctl_fn; + void *elevator_data; }; @@ -68,8 +72,13 @@ int write_latency; int max_bomb_segments; } blkelv_ioctl_arg_t; -#define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t)) -#define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t)) +/* + * We used to have `sizeof(blkelv_ioctl_arg_t)' in here, but that + * was always wrong, and sizeof(sizeof(struct)) caused erratic behaviour + * from the compiler. Change it to `int'. 
- akpm + */ +#define BLKELVGET _IOR(0x12,106,int) +#define BLKELVSET _IOW(0x12,107,int) extern int elevator_init(request_queue_t *, elevator_t *, elevator_t); extern void elevator_exit(request_queue_t *, elevator_t *); diff -uNr linux-2.5.22/include/linux/init_task.h linux-2.5.22-rmap13b/include/linux/init_task.h --- linux-2.5.22/include/linux/init_task.h Wed May 29 04:39:33 2002 +++ linux-2.5.22-rmap13b/include/linux/init_task.h Tue Jun 18 13:47:39 2002 @@ -27,6 +27,7 @@ mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ page_table_lock: SPIN_LOCK_UNLOCKED, \ mmlist: LIST_HEAD_INIT(name.mmlist), \ + rlimit_rss: RLIM_INFINITY, \ } #define INIT_SIGNALS { \ diff -uNr linux-2.5.22/include/linux/mm.h linux-2.5.22-rmap13b/include/linux/mm.h --- linux-2.5.22/include/linux/mm.h Wed Jun 12 15:44:34 2002 +++ linux-2.5.22-rmap13b/include/linux/mm.h Tue Jun 18 13:47:39 2002 @@ -19,9 +19,6 @@ extern unsigned long num_physpages; extern void * high_memory; extern int page_cluster; -/* The inactive_clean lists are per zone. */ -extern struct list_head active_list; -extern struct list_head inactive_list; #include #include @@ -130,6 +127,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -154,6 +154,9 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ + unsigned char age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. 
+ * protected by PG_chainlock */ unsigned long private; /* mapping-private opaque data */ /* @@ -291,13 +294,17 @@ #define page_address(page) ((page)->virtual) -#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ +#elif defined(CONFIG_DISCONTIGMEM) + +extern unsigned long page_address(struct page * page); + +#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */ #define page_address(page) \ __va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT) \ + page_zone(page)->zone_start_paddr) -#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ +#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */ /* * Error return values for the *_nopage functions @@ -308,6 +315,7 @@ /* The array of struct pages */ extern struct page *mem_map; +extern void FASTCALL(fixup_freespace(struct zone_struct *, int)); extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); diff -uNr linux-2.5.22/include/linux/mm_inline.h linux-2.5.22-rmap13b/include/linux/mm_inline.h --- linux-2.5.22/include/linux/mm_inline.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/include/linux/mm_inline.h Tue Jun 18 13:47:39 2002 @@ -0,0 +1,276 @@ +#ifndef _LINUX_MM_INLINE_H +#define _LINUX_MM_INLINE_H + +#include + +/* + * These inline functions tend to need bits and pieces of all the + * other VM include files, meaning they cannot be defined inside + * one of the other VM include files. + * + * The include file mess really needs to be cleaned up... 
+ */ + +static inline void add_page_to_active_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActive(page); + list_add(&page->lru, &zone->active_list); + zone->active_pages++; + inc_page_state(nr_active_pages); +} + +static inline void add_page_to_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveDirty(page); + list_add(&page->lru, &zone->inactive_dirty_list); + zone->inactive_dirty_pages++; + inc_page_state(nr_inactive_dirty_pages); +} + +static inline void add_page_to_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveClean(page); + list_add(&page->lru, &zone->inactive_clean_list); + zone->inactive_clean_pages++; + inc_page_state(nr_inactive_clean_pages); +} + +static inline void del_page_from_active_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageActive(page); + dec_page_state(nr_active_pages); + zone->active_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveDirty(page); + dec_page_state(nr_inactive_dirty_pages); + zone->inactive_dirty_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveClean(page); + zone->inactive_clean_pages--; + dec_page_state(nr_inactive_clean_pages); + DEBUG_LRU_PAGE(page); +} + +/* + * Inline functions to control some balancing in the VM. + * + * Note that we do both global and per-zone balancing, with + * most of the balancing done globally. 
+ */ +#define PLENTY_FACTOR 2 +#define ALL_ZONES NULL +#define ANY_ZONE (struct zone_struct *)(~0UL) +#define INACTIVE_FACTOR 5 + +#define VM_MIN 0 +#define VM_LOW 1 +#define VM_HIGH 2 +#define VM_PLENTY 3 +static inline int zone_free_limit(struct zone_struct * zone, int limit) +{ + int free, target, delta; + + /* This is really nasty, but GCC should completely optimise it away. */ + if (limit == VM_MIN) + target = zone->pages_min; + else if (limit == VM_LOW) + target = zone->pages_low; + else if (limit == VM_HIGH) + target = zone->pages_high; + else + target = zone->pages_high * PLENTY_FACTOR; + + free = zone->free_pages + zone->inactive_clean_pages; + delta = target - free; + + return delta; +} + +static inline int free_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_free_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_free_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_free_limit(zone, limit); + } + + return shortage; +} + +/** + * free_min - test for critically low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a serious shortage of free and + * clean pages, zero or negative if there is no serious shortage. + */ +static inline int free_min(struct zone_struct * zone) +{ + return free_limit(zone, VM_MIN); +} + +/** + * free_low - test for low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a shortage of free and + * clean pages, zero or negative if there is no shortage. 
+ */ +static inline int free_low(struct zone_struct * zone) +{ + return free_limit(zone, VM_LOW); +} + +/** + * free_high - test if amount of free pages is less than ideal + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free and clean + * pages is below kswapd's target, zero or negative if we + * have more than enough free and clean pages. + */ +static inline int free_high(struct zone_struct * zone) +{ + return free_limit(zone, VM_HIGH); +} + +/** + * free_plenty - test if enough pages are freed + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free + clean pages + * in a zone is not yet excessive and kswapd is still allowed to + * free pages here, a negative value if kswapd should leave the + * zone alone. + */ +static inline int free_plenty(struct zone_struct * zone) +{ + return free_limit(zone, VM_PLENTY); +} + +/* + * The inactive page target is the free target + 20% of (active + inactive) + * pages. + */ +static inline int zone_inactive_limit(struct zone_struct * zone, int limit) +{ + int inactive, target, inactive_base; + + inactive_base = zone->active_pages + zone->inactive_dirty_pages; + inactive_base /= INACTIVE_FACTOR; + + /* GCC should optimise this away completely. 
*/ + if (limit == VM_MIN) + target = zone->pages_high + inactive_base / 2; + else if (limit == VM_LOW) + target = zone->pages_high + inactive_base; + else + target = zone->pages_high + inactive_base * 2; + + inactive = zone->free_pages + zone->inactive_clean_pages; + inactive += zone->inactive_dirty_pages; + + return target - inactive; +} + +static inline int inactive_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_inactive_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_inactive_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_inactive_limit(zone, limit); + } + + return shortage; +} + +/** + * inactive_min - test for serious shortage of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, zero or a negative number + * if we have no serious shortage of (free + inactive) pages + */ +static inline int inactive_min(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_MIN); +} + +/** + * inactive_low - test for shortage of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, zero or a negative number + * if we have no shortage of (free + inactive) pages + */ +static inline int inactive_low(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_LOW); +} + +/** + * inactive_high - less than ideal amount of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, zero or a negative number + * if we have more than enough (free + inactive) pages + */ +static inline int inactive_high(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_HIGH); +} + +/* + * inactive_target - number of inactive pages we ought to have. 
+ */ +static inline int inactive_target(void) +{ + struct page_state ps; + int target; + + get_page_state(&ps); + target = ps.nr_active_pages + ps.nr_inactive_dirty_pages + + ps.nr_inactive_clean_pages; + + target /= INACTIVE_FACTOR; + + return target; +} + +#endif /* _LINUX_MM_INLINE_H */ diff -uNr linux-2.5.22/include/linux/mmzone.h linux-2.5.22-rmap13b/include/linux/mmzone.h --- linux-2.5.22/include/linux/mmzone.h Wed Jun 12 16:07:12 2002 +++ linux-2.5.22-rmap13b/include/linux/mmzone.h Tue Jun 18 13:47:39 2002 @@ -25,6 +25,9 @@ } free_area_t; struct pglist_data; +struct pte_chain; + +#define MAX_CHUNKS_PER_NODE 8 /* * On machines where it is needed (eg PCs) we divide physical memory @@ -40,13 +43,20 @@ */ spinlock_t lock; unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; - int need_balance; + unsigned long active_pages; + unsigned long inactive_dirty_pages; + unsigned long inactive_clean_pages; + unsigned long pages_min, pages_low, pages_high, pages_plenty; /* * free areas of different sizes */ + struct list_head active_list; + struct list_head inactive_dirty_list; + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; + spinlock_t pte_chain_freelist_lock; + struct pte_chain *pte_chain_freelist; /* * wait_table -- the array holding the hash table @@ -81,6 +91,13 @@ */ struct pglist_data *zone_pgdat; struct page *zone_mem_map; + +#if defined(CONFIG_DISCONTIGMEM) && !defined(CONFIG_HIGHMEM) + struct page *zone_chunk_page_start[MAX_CHUNKS_PER_ZONE + 1]; + unsigned long zone_chunk_phys_start[MAX_CHUNKS_PER_ZONE]; + unsigned int zone_nr_chunks; +#endif + unsigned long zone_start_paddr; unsigned long zone_start_mapnr; @@ -142,15 +159,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; -static inline int memclass(zone_t *pgzone, zone_t *classzone) -{ - if (pgzone->zone_pgdat != classzone->zone_pgdat) - return 0; - if (pgzone > classzone) - return 0; - return 1; -} - /* * The following two are not meant for general usage. 
They are here as * prototypes for the discontig memory code. @@ -163,6 +171,60 @@ extern pg_data_t contig_page_data; +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat: pg_data_t * variable + * + * Meant to help with common loops of the form + * pgdat = pgdat_list; + * while(pgdat) { + * ... + * pgdat = pgdat->node_next; + * } + */ +#define for_each_pgdat(pgdat) \ + for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) + + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline zone_t *next_zone(zone_t *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone - pgdat->node_zones < MAX_NR_ZONES - 1) + zone++; + + else if (pgdat->node_next) { + pgdat = pgdat->node_next; + zone = pgdat->node_zones; + } else + zone = NULL; + + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone: zone_t * variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. This basically means for_each_zone() is an + * easier to read version of this piece of code: + * + * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) { + * for(i = 0; i < MAX_NR_ZONES; ++i) { + * zone_t * z = pgdat->node_zones + i; + * ... + * } + * } + */ +#define for_each_zone(zone) \ + for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) + + #ifndef CONFIG_DISCONTIGMEM #define NODE_DATA(nid) (&contig_page_data) diff -uNr linux-2.5.22/include/linux/page-flags.h linux-2.5.22-rmap13b/include/linux/page-flags.h --- linux-2.5.22/include/linux/page-flags.h Wed Jun 12 15:44:34 2002 +++ linux-2.5.22-rmap13b/include/linux/page-flags.h Tue Jun 18 13:52:52 2002 @@ -47,7 +47,7 @@ * locked- and dirty-page accounting. The top eight bits of page->flags are * used for page->zone, so putting flag bits there doesn't work. */ -#define PG_locked 0 /* Page is locked. Don't touch. */ +#define PG_locked 0 /* Page is locked. 
Don't touch. */ #define PG_error 1 #define PG_referenced 2 #define PG_uptodate 3 @@ -55,16 +55,19 @@ #define PG_dirty_dontuse 4 #define PG_lru 5 #define PG_active 6 -#define PG_slab 7 /* slab debug (Suparna wants this) */ - -#define PG_highmem 8 -#define PG_checked 9 /* kill me in 2.5.. */ -#define PG_arch_1 10 -#define PG_reserved 11 - -#define PG_private 12 /* Has something at ->private */ -#define PG_writeback 13 /* Page is under writeback */ -#define PG_nosave 15 /* Used for system suspend/resume */ +#define PG_inactive_clean 7 +#define PG_inactive_dirty 8 +#define PG_slab 9 /* slab debug (Suparna wants this) */ + +#define PG_highmem 10 +#define PG_checked 11 /* kill me in 2.5.. */ +#define PG_arch_1 12 +#define PG_reserved 13 + +#define PG_private 14 /* Has something at ->private */ +#define PG_writeback 15 /* Page is under writeback */ +#define PG_nosave 16 /* Used for system suspend/resume */ +#define PG_chainlock 17 /* lock bit for ->pte_chain */ /* * Global page accounting. One instance per CPU. 
@@ -73,8 +76,9 @@ unsigned long nr_dirty; unsigned long nr_writeback; unsigned long nr_pagecache; - unsigned long nr_active; /* on active_list LRU */ - unsigned long nr_inactive; /* on inactive_list LRU */ + unsigned long nr_active_pages; /* on active_list LRU */ + unsigned long nr_inactive_clean_pages; /* on inactive_clean_list LRU */ + unsigned long nr_inactive_dirty_pages; /* on inactive_dirty_list LRU */ } ____cacheline_aligned_in_smp page_states[NR_CPUS]; extern void get_page_state(struct page_state *ret); @@ -150,12 +154,22 @@ }) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) @@ -217,6 +231,30 @@ #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) /* + * inlines for acquisition and release of PG_chainlock + */ +static inline
void pte_chain_lock(struct page *page) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ + while (test_and_set_bit(PG_chainlock, &page->flags)) { + while (test_bit(PG_chainlock, &page->flags)) + cpu_relax(); + } +} + +static inline void pte_chain_unlock(struct page *page) +{ + smp_mb__before_clear_bit(); /* clear_bit() is not a barrier */ + clear_bit(PG_chainlock, &page->flags); +} + +/* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. */ diff -uNr linux-2.5.22/include/linux/sched.h linux-2.5.22-rmap13b/include/linux/sched.h --- linux-2.5.22/include/linux/sched.h Tue Jun 18 13:42:10 2002 +++ linux-2.5.22-rmap13b/include/linux/sched.h Tue Jun 18 13:47:39 2002 @@ -193,6 +193,7 @@ unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; + unsigned long rlimit_rss; unsigned dumpable:1; @@ -272,9 +273,6 @@ struct list_head tasks; struct mm_struct *mm, *active_mm; - struct list_head local_pages; - - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; diff -uNr linux-2.5.22/include/linux/swap.h linux-2.5.22-rmap13b/include/linux/swap.h --- linux-2.5.22/include/linux/swap.h Wed Jun 12 16:06:35 2002 +++ linux-2.5.22-rmap13b/include/linux/swap.h Tue Jun 18 13:47:39 2002 @@ -120,18 +120,38 @@ struct address_space; struct zone_t; +/* linux/mm/rmap.c */ +extern int FASTCALL(page_referenced(struct page *)); +extern void FASTCALL(page_add_rmap(struct page *, pte_t *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); +extern int FASTCALL(page_over_rsslimit(struct page *)); + +/* return values of try_to_unmap */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ extern void
FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(drop_page(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern void wakeup_kswapd(unsigned int); +extern void rss_free_pages(unsigned int); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -145,6 +165,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern int move_to_swap_cache(struct page *page, swp_entry_t entry); @@ -182,43 +203,26 @@ extern void FASTCALL(mark_page_accessed(struct page *)); /* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 + +/* * List add/del helper macros. These must be called * with the pagemap_lru_lock held! 
*/ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ - BUG(); \ if (PageActive(page)) \ BUG(); \ -} while (0) - -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - inc_page_state(nr_active); \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - inc_page_state(nr_inactive); \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - dec_page_state(nr_active); \ -} while (0) - -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - dec_page_state(nr_inactive); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ + BUG(); \ } while (0) extern spinlock_t swaplock; diff -uNr linux-2.5.22/kernel/fork.c linux-2.5.22-rmap13b/kernel/fork.c --- linux-2.5.22/kernel/fork.c Tue Jun 18 13:42:10 2002 +++ linux-2.5.22-rmap13b/kernel/fork.c Tue Jun 18 13:47:39 2002 @@ -189,7 +189,6 @@ mm->map_count = 0; mm->rss = 0; mm->cpu_vm_mask = 0; - mm->swap_address = 0; pprev = &mm->mmap; /* @@ -308,9 +307,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); @@ -703,8 +699,6 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; /* copy all the process information */ if (copy_semundo(clone_flags, p)) diff -uNr linux-2.5.22/kernel/sys.c linux-2.5.22-rmap13b/kernel/sys.c --- linux-2.5.22/kernel/sys.c Wed May 29 04:19:50 2002 +++ linux-2.5.22-rmap13b/kernel/sys.c Tue Jun 18 13:47:39 2002 @@ -1163,6 +1163,12 @@ if (resource == RLIMIT_NOFILE) { if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > 
NR_OPEN) return -EPERM; + } else if (resource == RLIMIT_RSS && current->mm) { + /* rlimit is specified in bytes, convert to pages */ + unsigned long pages = RLIM_INFINITY; + if (new_rlim.rlim_cur != RLIM_INFINITY) + pages = new_rlim.rlim_cur >> PAGE_SHIFT; + current->mm->rlimit_rss = pages; } *old_rlim = new_rlim; return 0; diff -uNr linux-2.5.22/mm/Makefile linux-2.5.22-rmap13b/mm/Makefile --- linux-2.5.22/mm/Makefile Thu May 2 17:22:54 2002 +++ linux-2.5.22-rmap13b/mm/Makefile Tue Jun 18 13:47:39 2002 @@ -16,6 +16,6 @@ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \ - pdflush.o page-writeback.o + pdflush.o page-writeback.o rmap.o include $(TOPDIR)/Rules.make diff -uNr linux-2.5.22/mm/bootmem.c linux-2.5.22-rmap13b/mm/bootmem.c --- linux-2.5.22/mm/bootmem.c Wed Jun 12 16:07:13 2002 +++ linux-2.5.22-rmap13b/mm/bootmem.c Tue Jun 18 13:47:39 2002 @@ -339,12 +339,11 @@ pg_data_t *pgdat = pgdat_list; void *ptr; - while (pgdat) { + for_each_pgdat(pgdat) if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal))) return(ptr); - pgdat = pgdat->node_next; - } + /* * Whoops, we cannot satisfy the allocation request. */ diff -uNr linux-2.5.22/mm/filemap.c linux-2.5.22-rmap13b/mm/filemap.c --- linux-2.5.22/mm/filemap.c Tue Jun 18 13:42:11 2002 +++ linux-2.5.22-rmap13b/mm/filemap.c Tue Jun 18 13:47:39 2002 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,10 @@ */ static void truncate_complete_page(struct page *page) { + /* Page has already been removed from processes, by vmtruncate() */ + if (page->pte_chain) + BUG(); + /* Leave it on the LRU if it gets converted into anonymous buffers */ if (!PagePrivate(page) || do_invalidatepage(page, 0)) lru_cache_del(page); @@ -643,7 +648,7 @@ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 
* * The first mb is necessary to safely close the critical section opened by the - * TryLockPage(), the second mb is necessary to enforce ordering between + * TestSetPageLocked(), the second mb is necessary to enforce ordering between * the clear_bit and the read of the waitqueue (to avoid SMP races with a * parallel wait_on_page_locked()). */ @@ -862,9 +867,7 @@ return find_or_create_page(mapping, index, mapping->gfp_mask); } - -/* - * Same as grab_cache_page, but do not wait if the page is unavailable. +/* Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -913,16 +916,23 @@ /* * Mark a page as having seen activity. * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. + * We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't modify the inactive dirty ones because we're never sure + * if those are freeable anyway. */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page)) { + if (PageInactiveClean(page)) { + struct zone_struct *zone = page_zone(page); + int free = zone->free_pages + zone->inactive_clean_pages; + activate_page(page); - ClearPageReferenced(page); + if (free < zone->pages_low) + wakeup_kswapd(GFP_NOIO); + if (zone->free_pages < zone->pages_min) + fixup_freespace(zone, 1); + return; } @@ -1429,7 +1439,7 @@ /* Limit it to a sane percentage of the inactive list.. 
*/ get_page_state(&ps); - max = ps.nr_inactive / 2; + max = ps.nr_inactive_clean_pages / 2; if (nr > max) nr = max; @@ -2227,16 +2237,18 @@ } do { - unsigned long index; - unsigned long offset; + unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -2286,8 +2298,11 @@ } } kunmap(page); - SetPageReferenced(page); unlock_page(page); + if (deactivate) + deactivate_page(page); + else + mark_page_accessed(page); page_cache_release(page); if (status < 0) break; diff -uNr linux-2.5.22/mm/memory.c linux-2.5.22-rmap13b/mm/memory.c --- linux-2.5.22/mm/memory.c Tue Jun 18 13:42:11 2002 +++ linux-2.5.22-rmap13b/mm/memory.c Tue Jun 18 13:47:39 2002 @@ -44,8 +44,10 @@ #include #include #include +#include #include +#include #include #include #include @@ -79,8 +81,7 @@ */ static inline void free_one_pmd(mmu_gather_t *tlb, pmd_t * dir) { - struct page *pte; - + struct page *page; if (pmd_none(*dir)) return; if (pmd_bad(*dir)) { @@ -88,9 +89,10 @@ pmd_clear(dir); return; } - pte = pmd_page(*dir); + page = pmd_page(*dir); pmd_clear(dir); - pte_free_tlb(tlb, pte); + pgtable_remove_rmap(page); + pte_free_tlb(tlb, page); } static inline void free_one_pgd(mmu_gather_t *tlb, pgd_t * dir) @@ -150,6 +152,7 @@ pte_free(new); goto out; } + pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } out: @@ -177,6 +180,7 @@ pte_free_kernel(new); goto out; } + pgtable_add_rmap_kernel(new, mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -260,10 +264,13 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. 
*/ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } + ptepage = pte_page(pte); pfn = pte_pfn(pte); if (!pfn_valid(pfn)) goto cont_copy_pte_range; @@ -272,7 +279,7 @@ goto cont_copy_pte_range; /* If it's a COW mapping, write protect it both in the parent and the child */ - if (cow && pte_write(pte)) { + if (cow) { ptep_set_wrprotect(src_pte); pte = *src_pte; } @@ -285,6 +292,7 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { pte_unmap_nested(src_pte); @@ -342,6 +350,7 @@ if (pte_dirty(pte)) set_page_dirty(page); tlb->freed++; + page_remove_rmap(page, ptep); tlb_remove_page(tlb, page); } } @@ -992,7 +1001,9 @@ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); lru_cache_add(new_page); /* Free the old page.. */ @@ -1110,6 +1121,10 @@ struct page *new_page; unsigned long offset; + /* Low on free memory ? Don't make things worse. */ + if (free_low(ALL_ZONES) < 0) + return; + /* * Get the number of handles we should do readahead io to. */ @@ -1192,6 +1207,7 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + page_add_rmap(page, page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1208,14 +1224,13 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. 
*/ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1241,6 +1256,7 @@ } set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ pte_unmap(page_table); /* No need to invalidate - it was non-present before */ @@ -1297,6 +1313,8 @@ new_page = page; } + mark_page_accessed(new_page); + spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); @@ -1319,7 +1337,9 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + page_add_rmap(new_page, page_table); pte_unmap(page_table); + } else { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); @@ -1398,6 +1418,14 @@ current->state = TASK_RUNNING; pgd = pgd_offset(mm, address); + /* + * If we are over our RSS limit and the system needs memory, + * we will free memory for the non-hogs and slow down a bit. + */ + if (mm->rlimit_rss && mm->rss > mm->rlimit_rss && + free_high(ALL_ZONES) > 0) + rss_free_pages(GFP_HIGHUSER); + /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. diff -uNr linux-2.5.22/mm/mremap.c linux-2.5.22-rmap13b/mm/mremap.c --- linux-2.5.22/mm/mremap.c Thu May 2 17:22:54 2002 +++ linux-2.5.22-rmap13b/mm/mremap.c Tue Jun 18 13:47:39 2002 @@ -68,8 +68,14 @@ { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if (!dst) { /* No dest? We must put it back. 
*/ @@ -77,6 +83,8 @@ error++; } set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); } return error; } diff -uNr linux-2.5.22/mm/numa.c linux-2.5.22-rmap13b/mm/numa.c --- linux-2.5.22/mm/numa.c Wed Jun 12 15:44:34 2002 +++ linux-2.5.22-rmap13b/mm/numa.c Tue Jun 18 13:47:39 2002 @@ -44,6 +44,57 @@ #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) +#ifndef CONFIG_HIGHMEM +unsigned long page_address(struct page * page) +{ + struct zone_struct * zone; + struct page * chunk_page_start; + unsigned long chunk_phys_addr; + int idx_chunk; + + zone = page_zone(page); + /* + * We have to check if the page is on + * a chunk that contains pages from 2 zones. + */ + if(!(page < zone->zone_chunk_page_start[0])) + goto known_zone; + /* + * We need to get the previous zone. + * If there is no such zone, we are in trouble. + */ + if(!page->zone) + BUG(); + + zone = zone_table[(page->zone) - 1]; + + if(zone->zone_pgdat->node_id == page_zone(page)->zone_pgdat->node_id) + goto known_zone; + /* + * Getting here means we have a chunk spread over 2 nodes. + * That shouldn't happen. + */ + BUG(); + + known_zone: + for(idx_chunk = 0 ; idx_chunk < MAX_CHUNKS_PER_ZONE ; idx_chunk++){ + if(page >= zone->zone_chunk_page_start[idx_chunk] && + page < zone->zone_chunk_page_start[idx_chunk + 1]) + break; + } + /* + * We know which chunk the page belongs to. 
+ */ + chunk_phys_addr = zone->zone_chunk_phys_start[idx_chunk]; + chunk_page_start = zone->zone_chunk_page_start[idx_chunk]; + return (unsigned long)__va(chunk_phys_addr + + ((page - chunk_page_start) << PAGE_SHIFT )); + + + +} +#endif + static spinlock_t node_lock = SPIN_LOCK_UNLOCKED; void show_free_areas_node(pg_data_t *pgdat) diff -uNr linux-2.5.22/mm/oom_kill.c linux-2.5.22-rmap13b/mm/oom_kill.c --- linux-2.5.22/mm/oom_kill.c Thu May 2 17:22:37 2002 +++ linux-2.5.22-rmap13b/mm/oom_kill.c Tue Jun 18 13:47:39 2002 @@ -168,7 +168,8 @@ static void oom_kill(void) { struct task_struct *p, *q; - + extern wait_queue_head_t kswapd_done; + read_lock(&tasklist_lock); p = select_bad_process(); @@ -182,6 +183,9 @@ } read_unlock(&tasklist_lock); + /* Chances are by this time our victim is sleeping on kswapd. */ + wake_up(&kswapd_done); + /* * Make kswapd go out of the way, so "p" has a good chance of * killing itself before someone else gets the chance to ask diff -uNr linux-2.5.22/mm/page-writeback.c linux-2.5.22-rmap13b/mm/page-writeback.c --- linux-2.5.22/mm/page-writeback.c Wed Jun 12 16:06:35 2002 +++ linux-2.5.22-rmap13b/mm/page-writeback.c Tue Jun 18 13:47:39 2002 @@ -258,7 +258,6 @@ int generic_vm_writeback(struct page *page, int *nr_to_write) { struct inode *inode = page->mapping->host; - /* * We don't own this inode, and we don't want the address_space * vanishing while writeback is walking its pages. @@ -320,7 +319,7 @@ * If a page is already under I/O, generic_writepages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarentee that all the data which was dirty at the time + * and msync() need to guarantee that all the data which was dirty at the time * the call was made get new I/O started against them. The way to do this is * to run filemap_fdatawait() before calling filemap_fdatawrite(). 
* @@ -363,15 +362,6 @@ /* It may have been removed from swapcache: check ->mapping */ if (page->mapping && TestClearPageDirty(page) && !PageWriteback(page)) { - /* FIXME: batch this up */ - if (!PageActive(page) && PageLRU(page)) { - spin_lock(&pagemap_lru_lock); - if (!PageActive(page) && PageLRU(page)) { - list_del(&page->lru); - list_add(&page->lru, &inactive_list); - } - spin_unlock(&pagemap_lru_lock); - } err = writepage(page); if (!ret) ret = err; diff -uNr linux-2.5.22/mm/page_alloc.c linux-2.5.22-rmap13b/mm/page_alloc.c --- linux-2.5.22/mm/page_alloc.c Wed Jun 12 16:07:13 2002 +++ linux-2.5.22-rmap13b/mm/page_alloc.c Tue Jun 18 13:56:16 2002 @@ -14,12 +14,11 @@ #include #include +#include #include -#include #include #include #include -#include #include #include #include @@ -27,8 +26,6 @@ unsigned long totalram_pages; unsigned long totalhigh_pages; int nr_swap_pages; -LIST_HEAD(active_list); -LIST_HEAD(inactive_list); pg_data_t *pgdat_list; /* @@ -42,6 +39,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, }; +static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, }; /* * Temporary debugging check. 
@@ -87,18 +86,19 @@ BUG(); if (PageLocked(page)) BUG(); - if (PageLRU(page)) - BUG(); if (PageActive(page)) BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte_chain) + BUG(); if (PageWriteback(page)) BUG(); ClearPageDirty(page); page->flags &= ~(1<<PG_referenced); - - if (current->flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: + page->age = PAGE_AGE_START; zone = page_zone(page); @@ -146,17 +146,6 @@ list_add(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, &current->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -215,10 +204,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -261,76 +247,83 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do this work ourselves, call kswapd.
+ */ +void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +void fixup_freespace(zone_t * zone, int direct_reclaim) +{ + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(GFP_ATOMIC); +} + +#define PAGES_KERNEL 0 +#define PAGES_MIN 1 +#define PAGES_LOW 2 +#define PAGES_HIGH 3 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; - - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); + for (;;) { + zone_t *z = *(zone++); - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = &current->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (PagePrivate(page)) - BUG(); - if (page->mapping) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); - if (PageWriteback(page)) - BUG(); + if (!z) + break; + if (!z->size) + BUG(); - break; - } - } while ((entry = entry->next) != local_pages); + /* + * We allocate if the
number of (free + inactive_clean) + * pages is above the watermark. + */ + switch (limit) { + case PAGES_KERNEL: + water_mark = z->pages_min / 2; + break; + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + default: + case PAGES_HIGH: + water_mark = z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. */ + return NULL; } /* @@ -338,107 +331,248 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later be tripped up by a schedule().) + * + * We fall back to lower-level zones if allocation + * in a higher zone fails. + */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data we would want to cache.
+ */ zone = zonelist->zones; - classzone = *zone; - if (classzone == NULL) + if (!*zone) return NULL; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); - min += z->pages_low; + min += z->pages_min; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Next, try to allocate a page from a zone with a HIGH + * amount of (free + inactive_clean) pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low of (free + inactive_clean) pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + wakeup_kswapd(gfp_mask); + + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. 
+ */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Kernel allocations can eat a few emergency pages. + * We should be able to run without this, find out why + * the SCSI layer isn't happy ... + */ + if (gfp_mask & __GFP_HIGH) { + page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim); + if (page) + return page; + } + + /* + * Oh well, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * If so, try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * If we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we need to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... + * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. + */ + if (gfp_mask & __GFP_WAIT) { + __set_current_state(TASK_RUNNING); + yield(); + if (!order || free_high(ALL_ZONES) >= 0) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail if no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... 
+ */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * death. + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages, and last we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int freed = 0; +defragment_again: zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); if (!z) break; - - page = rmqueue(z, order); - if (page) - return page; - } -nopage: - if (!(current->flags & PF_RADIX_TREE)) { - printk("%s: page allocation failure." - " order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. 
*/ + page = rmqueue(z, order); + if (page) + return page; + } } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - goto nopage; - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* XXX: do real defragmentation instead of calling launder ? */ + if (!freed & !(current->flags & PF_MEMALLOC)) { + freed = 1; + current->flags |= PF_MEMALLOC; + try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - goto nopage; - - /* Yield for kswapd, and try again */ - __set_current_state(TASK_RUNNING); - yield(); - goto rebalance; +out_failed: + /* No luck.. */ + printk(KERN_ERR "__alloc_pages: %u-order allocation failed.\n", order); + return NULL; } /* @@ -497,37 +631,30 @@ { unsigned int sum; zone_t *zone; - pg_data_t *pgdat = pgdat_list; sum = 0; - while (pgdat) { - for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) - sum += zone->free_pages; - pgdat = pgdat->node_next; - } + for_each_zone(zone) + sum += zone->free_pages; + return sum; } -static unsigned int nr_free_zone_pages(int offset) +static unsigned int nr_free_zone_pages (int offset) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; unsigned int sum = 0; - do { + for_each_pgdat(pgdat) { zonelist_t *zonelist = pgdat->node_zonelists + offset; zone_t **zonep = zonelist->zones; zone_t *zone; for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + sum += zone->free_pages; + sum += zone->inactive_clean_pages; + sum += zone->inactive_dirty_pages; } - - pgdat = pgdat->node_next; - 
} while (pgdat); - + } return sum; } @@ -550,13 +677,12 @@ #if CONFIG_HIGHMEM unsigned int nr_free_highpages (void) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; unsigned int pages = 0; - while (pgdat) { + for_each_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - pgdat = pgdat->node_next; - } + return pages; } #endif @@ -581,8 +707,9 @@ ret->nr_dirty += ps->nr_dirty; ret->nr_writeback += ps->nr_writeback; ret->nr_pagecache += ps->nr_pagecache; - ret->nr_active += ps->nr_active; - ret->nr_inactive += ps->nr_inactive; + ret->nr_active_pages += ps->nr_active_pages; + ret->nr_inactive_clean_pages += ps->nr_inactive_clean_pages; + ret->nr_inactive_dirty_pages += ps->nr_inactive_dirty_pages; } } @@ -645,12 +772,13 @@ tmpdat = tmpdat->node_next; } - printk("( Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u )\n", - ps.nr_active, - ps.nr_inactive, - ps.nr_dirty, - ps.nr_writeback, - nr_free_pages()); + printk("( Active:%lu inactive_dirty:%lu inactive_clean:%lu dirty:%lu writeback:%lu free:%u )\n", + ps.nr_active_pages, + ps.nr_inactive_dirty_pages, + ps.nr_inactive_clean_pages, + ps.nr_dirty, + ps.nr_writeback, + nr_free_pages()); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -786,6 +914,7 @@ * - mark all memory queues empty * - clear the memory bitmaps */ +extern unsigned int kswapd_minfree; void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size, struct page *lmem_map) @@ -832,7 +961,7 @@ offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; - unsigned long mask; + unsigned long mask, extrafree = 0; unsigned long size, realsize; zone_table[nid * MAX_NR_ZONES + j] = zone; @@ -846,7 +975,14 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->need_balance = 0; + zone->inactive_clean_pages = 0; + 
zone->inactive_dirty_pages = 0; + zone->pte_chain_freelist = NULL; + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_dirty_list); + INIT_LIST_HEAD(&zone->inactive_clean_list); + spin_lock_init(&zone->pte_chain_freelist_lock); + if (!size) continue; @@ -866,15 +1002,22 @@ pgdat->nr_zones = j+1; + /* + * On large memory machines we keep extra memory + * free for kernel allocations. + */ + if (zone_extrafree_ratio[j]) + extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]); + if (extrafree < zone_balance_max[j]) + extrafree = 0; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; - + zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]); + zone->pages_low = extrafree + mask*2; + zone->pages_high = extrafree + mask*3; + zone->pages_plenty = extrafree + mask*6; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -882,6 +1025,8 @@ if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); + kswapd_minfree += zone->pages_min; + /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is diff -uNr linux-2.5.22/mm/pdflush.c linux-2.5.22-rmap13b/mm/pdflush.c --- linux-2.5.22/mm/pdflush.c Wed Jun 12 15:44:34 2002 +++ linux-2.5.22-rmap13b/mm/pdflush.c Tue Jun 18 13:47:39 2002 @@ -16,7 +16,6 @@ #include #include - /* * Minimum and maximum number of pdflush instances */ @@ -97,7 +96,6 @@ spin_lock_irq(&pdflush_lock); nr_pdflush_threads++; -// printk("pdflush %d [%d] starts\n", nr_pdflush_threads, current->pid); for ( ; ; ) { struct pdflush_work *pdf; @@ -144,7 +142,6 @@ my_work->fn = NULL; } nr_pdflush_threads--; -// 
printk("pdflush %d [%d] ends\n", nr_pdflush_threads, current->pid); spin_unlock_irq(&pdflush_lock); return 0; } diff -uNr linux-2.5.22/mm/readahead.c linux-2.5.22-rmap13b/mm/readahead.c --- linux-2.5.22/mm/readahead.c Tue Jun 18 13:42:11 2002 +++ linux-2.5.22-rmap13b/mm/readahead.c Tue Jun 18 13:47:39 2002 @@ -174,6 +174,42 @@ } /* + * We combine this with readahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list (pages index-1 .. f_ra.start). + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + /* + * Go backwards from index-1 and drop all pages in the readahead + * window. The window may have grown since the last call, so we + * stop at the first missing or inactive page. "index-- > start" + * keeps the unsigned index from wrapping when start is 0. + */ + spin_lock(&pagemap_lru_lock); + while (index-- > file->f_ra.start) { + spin_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, index); + spin_unlock(&mapping->page_lock); + if (!page || !PageActive(page)) + break; + drop_page(page); + } + spin_unlock(&pagemap_lru_lock); +} + +/* * page_cache_readahead is the main function. If performs the adaptive * readahead window size management and submits the readahead I/O. */ @@ -288,6 +324,11 @@ } } out: + /* + * Move the pages that have already been passed + * to the inactive list.
+ */ + drop_behind(file, offset); return; } diff -uNr linux-2.5.22/mm/rmap.c linux-2.5.22-rmap13b/mm/rmap.c --- linux-2.5.22/mm/rmap.c Wed Dec 31 17:00:00 1969 +++ linux-2.5.22-rmap13b/mm/rmap.c Tue Jun 18 13:47:39 2002 @@ -0,0 +1,436 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. + */ + +/* + * Locking: + * - the page->pte_chain is protected by the PG_chainlock bit, + * which nests within the pagemap_lru_lock, then the + * mm->page_table_lock, and then the page lock. + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include + +#include +#include +#include +#include +#include + +/*#define DEBUG_RMAP */ + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * A singly linked list should be fine for most, if not all, workloads. + * On fork-after-exec the mapping we'll be removing will still be near + * the start of the list, on mixed application systems the short-lived + * processes will have their mappings near the start of the list and + * in systems with long-lived applications the relative overhead of + * exit() will be lower since the applications are long-lived. 
+ */ +struct pte_chain { + struct pte_chain * next; /* next mapping of the same page, NULL at the end */ + pte_t * ptep; /* page table entry mapping the page */ +}; + +static inline struct pte_chain * pte_chain_alloc(zone_t *); +static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, + struct page *, zone_t *); +static void alloc_new_pte_chains(zone_t *); + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Test-and-clear the page's referenced flag and the young bit of + * every pte on its pte_chain; returns how many of them were set. + * Caller needs to hold the pte_chain_lock. + */ +int page_referenced(struct page * page) +{ + struct pte_chain * pc; + int referenced = 0; + + if (TestClearPageReferenced(page)) + referenced++; + + /* Check all the page tables mapping this page. */ + for (pc = page->pte_chain; pc; pc = pc->next) { + if (ptep_test_and_clear_young(pc->ptep)) + referenced++; + } + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page (no-op for invalid pfns and reserved pages). + * The caller needs to hold the mm->page_table_lock. + */ +void page_add_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain; + unsigned long pfn = pte_pfn(*ptep); /* NOTE(review): reads *ptep before the DEBUG_RMAP NULL check below */ + +#ifdef DEBUG_RMAP + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!ptep_to_mm(ptep)) + BUG(); +#endif + + if (!pfn_valid(pfn) || PageReserved(page)) + return; + +#ifdef DEBUG_RMAP + pte_chain_lock(page); + { + struct pte_chain * pc; + for (pc = page->pte_chain; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); /* this pte is already on the chain */ + } + } + pte_chain_unlock(page); +#endif + + pte_chain = pte_chain_alloc(page_zone(page)); /* allocated before the pte_chain_lock is taken */ + + pte_chain_lock(page); + + /* Hook up the pte_chain to the page.
*/ + pte_chain->ptep = ptep; + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + pte_chain_unlock(page); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. Invalid pfns and reserved pages are ignored. + * Caller needs to hold the mm->page_table_lock. + */ +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + unsigned long pfn; + zone_t *zone; + + if (!page || !ptep) + BUG(); + pfn = pte_pfn(*ptep); /* read *ptep only after the NULL check above */ + if (!pfn_valid(pfn) || PageReserved(page)) + return; + + zone = page_zone(page); + + pte_chain_lock(page); + for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, page, zone); + goto out; + } + } +#ifdef DEBUG_RMAP + /* Not found. This should NEVER happen! */ + printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep); + printk(KERN_ERR "page_remove_rmap: only found: "); + for (pc = page->pte_chain; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n"); +#endif + +out: + pte_chain_unlock(page); + return; +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks.
+ * Locking: + * pagemap_lru_lock page_launder() + * page lock page_launder(), trylock + * pte_chain_lock page_launder() + * mm->page_table_lock try_to_unmap_one(), trylock + */ +static int FASTCALL(try_to_unmap_one(struct page *, pte_t *)); +static int try_to_unmap_one(struct page * page, pte_t * ptep) +{ + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_AGAIN; + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the pte: cache flush first (needs the mapping), TLB flush last. */ + flush_cache_page(vma, address); + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry; + entry.val = page->index; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold pagemap_lru_lock + * and the page lock.
Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - a mapping is mlock()ed or not inside a VMA + * SWAP_ERROR - an error occurred (passed through from try_to_unmap_one) + */ +int try_to_unmap(struct page * page) +{ + struct pte_chain * pc, * next_pc, * prev_pc = NULL; + zone_t *zone = page_zone(page); + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + if (PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. */ + if (!page->mapping) + BUG(); + + for (pc = page->pte_chain; pc; pc = next_pc) { + next_pc = pc->next; /* saved first: pc may be freed below */ + switch (try_to_unmap_one(page, pc->ptep)) { + case SWAP_SUCCESS: + /* Unlink and free the pte_chain struct; prev_pc stays put. */ + pte_chain_free(pc, prev_pc, page, zone); + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + prev_pc = pc; + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + return SWAP_FAIL; /* stop at once, remaining mappings are left alone */ + case SWAP_ERROR: + return SWAP_ERROR; + } + } + + return ret; +} + +/** + * page_over_rsslimit - test if the page is over its RSS limit + * @page: page to test + * + * This function returns true if the process owning this page + * is over its RSS (resident set size) limit. For shared pages + * we penalise it only if all processes using it are over their + * rss limits. Returns 0 for a page with an empty pte_chain. + * The caller needs to hold the page's pte_chain_lock. + */ +int page_over_rsslimit(struct page * page) +{ + struct pte_chain * pte_chain = page->pte_chain; + struct mm_struct * mm; + pte_t * ptep; + + /* No process is using the page. */ + if (!pte_chain) + return 0; + + do { + ptep = pte_chain->ptep; + mm = ptep_to_mm(ptep); + + /* + * If the process is under its RSS limit (or its rlimit_rss + * is 0), stop scanning and don't penalise the page. + */ + if(!mm->rlimit_rss || mm->rss <= mm->rlimit_rss) + return 0; + + pte_chain = pte_chain->next; + } while (pte_chain); + + return 1; +} + +/** + ** No more VM stuff below this comment, only pte_chain helper + ** functions.
+ **/ + +static inline void pte_chain_push(zone_t * zone, + struct pte_chain * pte_chain) +{ + pte_chain->ptep = NULL; /* entries on the freelist map nothing */ + pte_chain->next = zone->pte_chain_freelist; + zone->pte_chain_freelist = pte_chain; /* new head of the zone's freelist */ +} + +static inline struct pte_chain * pte_chain_pop(zone_t * zone) +{ + struct pte_chain *pte_chain; + + pte_chain = zone->pte_chain_freelist; /* no NULL check: freelist must be non-empty */ + zone->pte_chain_freelist = pte_chain->next; + pte_chain->next = NULL; + + return pte_chain; +} + +/** + * pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + * @prev_pte_chain: previous pte_chain on the list (may be NULL) + * @page: page this pte_chain hangs off (may be NULL) + * @zone: memory zone to free pte chain in + * + * This function unlinks pte_chain from the singly linked list it + * may be on and adds the pte_chain to the free list. May also be + * called for new pte_chain structures which aren't on any list yet. + * Caller needs to hold the pte_chain_lock if the page is non-NULL. + */ +static inline void pte_chain_free(struct pte_chain * pte_chain, + struct pte_chain * prev_pte_chain, struct page * page, + zone_t * zone) +{ + if (prev_pte_chain) + prev_pte_chain->next = pte_chain->next; + else if (page) + page->pte_chain = pte_chain->next; /* pte_chain was the head of the page's list */ + + spin_lock(&zone->pte_chain_freelist_lock); + pte_chain_push(zone, pte_chain); + spin_unlock(&zone->pte_chain_freelist_lock); +} + +/** + * pte_chain_alloc - allocate a pte_chain struct + * @zone: memory zone to allocate pte_chain for + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. Takes pte_chain_freelist_lock itself; + * page_add_rmap() calls this before it takes the page's pte_chain_lock. + */ +static inline struct pte_chain * pte_chain_alloc(zone_t * zone) +{ + struct pte_chain * pte_chain; + + spin_lock(&zone->pte_chain_freelist_lock); + + /* Allocate new pte_chain structs as needed. */ + if (!zone->pte_chain_freelist) + alloc_new_pte_chains(zone); + + /* Grab the first pte_chain from the freelist.
*/ + pte_chain = pte_chain_pop(zone); + + spin_unlock(&zone->pte_chain_freelist_lock); + + return pte_chain; +} + +/** + * alloc_new_pte_chains - convert a free page to pte_chain structures + * @zone: memory zone to allocate pte_chains for + * + * Grabs a free page and converts it to pte_chain structures. We really + * should pre-allocate these earlier in the pagefault path or come up + * with some other trick. Panics if no page can be allocated. + * + * Note that we cannot use the slab cache because the pte_chain structure + * is way smaller than the minimum size of a slab cache allocation. + * Caller needs to hold the zone->pte_chain_freelist_lock + */ +static void alloc_new_pte_chains(zone_t *zone) +{ + struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC); + int i = PAGE_SIZE / sizeof(struct pte_chain); /* number of structs that fit in one page */ + + if (pte_chain) { + for (; i-- > 0; pte_chain++) + pte_chain_push(zone, pte_chain); + } else { + /* Yeah yeah, I'll fix the pte_chain allocation ... */ + panic("Fix pte_chain allocation, you lazy bastard!\n"); + } +} diff -uNr linux-2.5.22/mm/swap.c linux-2.5.22-rmap13b/mm/swap.c --- linux-2.5.22/mm/swap.c Thu May 2 17:22:50 2002 +++ linux-2.5.22-rmap13b/mm/swap.c Tue Jun 18 13:55:21 2002 @@ -15,10 +15,11 @@ #include #include -#include #include #include #include +#include /* for try_to_release_page() */ +#include #include #include /* for copy_to/from_user */ @@ -33,15 +34,97 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * The _nolock variant is for callers already holding pagemap_lru_lock. + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is not moved. + */ +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list.
+ * (some pages aren't on any list at all); flag and age are reset anyway. + */ + ClearPageReferenced(page); + page->age = 0; + if (PageActive(page)) { + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void deactivate_page(struct page * page) /* locked wrapper around deactivate_page_nolock() */ +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * drop_page - like deactivate_page, but try inactive_clean list + * @page: the page to drop + * + * Try to move a page to the inactive_clean list, this succeeds if the + * page is clean and not in use by anybody. If the page cannot be placed + * on the inactive_clean list it is placed on the inactive_dirty list + * instead. + * + * Note: called with the pagemap_lru_lock held; it is dropped and retaken around try_to_release_page(). + */ +void drop_page(struct page * page) +{ + if (!TestSetPageLocked(page)) { + if (page->mapping && PagePrivate(page)) { + page_cache_get(page); /* hold a reference while the lru lock is dropped */ + spin_unlock(&pagemap_lru_lock); + try_to_release_page(page, GFP_NOIO); + spin_lock(&pagemap_lru_lock); + page_cache_release(page); + } + unlock_page(page); + } + + /* Make sure the page really is reclaimable. */ + pte_chain_lock(page); + if (!page->mapping || PageDirty(page) || page->pte_chain || + PagePrivate(page) || page_count(page) > 1) + deactivate_page_nolock(page); /* not reclaimable: inactive_dirty list */ + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + page->age = 0; + if (PageActive(page)) { + del_page_from_active_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } + } + pte_chain_unlock(page); +} + /* * Move an inactive page to the active list.
*/ -static inline void activate_page_nolock(struct page * page) +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); add_page_to_active_list(page); } + + /* Make sure the page gets a fair chance at staying active. */ + page->age = max((int)page->age, PAGE_AGE_START); } void activate_page(struct page * page) @@ -57,29 +140,31 @@ */ void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + SetPageLRU(page); + add_page_to_active_list(page); spin_unlock(&pagemap_lru_lock); } } /** * __lru_cache_del: remove a page from the page lists - * @page: the page to add + * @page: the page to remove * * This function is for when the caller already holds * the pagemap_lru_lock. */ void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } + ClearPageLRU(page); } /** diff -uNr linux-2.5.22/mm/swap_state.c linux-2.5.22-rmap13b/mm/swap_state.c --- linux-2.5.22/mm/swap_state.c Wed Jun 12 16:06:35 2002 +++ linux-2.5.22-rmap13b/mm/swap_state.c Tue Jun 18 13:47:39 2002 @@ -125,6 +125,63 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. 
+ */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + int flags; /* saved current->flags, restored on every exit path */ + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; /* no swap entry available */ + + /* Radix-tree node allocations are performing + * GFP_ATOMIC allocations under PF_MEMALLOC. + * They can completely exhaust the page allocator. + * + * So PF_MEMALLOC is dropped here. This causes the slab + * allocations to fail earlier, so radix-tree nodes will + * then be allocated from the mempool reserves. */ + + flags = current->flags; + current->flags &= ~PF_MEMALLOC; + current->flags |= PF_RADIX_TREE; + + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + switch (add_to_swap_cache(page, entry)) { + case 0: /* Success */ + current->flags = flags; + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); /* NOTE(review): presumably drops the get_swap_page() reference - confirm */ + return 1; + case -ENOMEM: /* radix-tree allocation */ + current->flags = flags; + swap_free(entry); + return 0; /* give up, the page was not added */ + default: /* ENOENT: raced */ + break; + } + /* Raced with "speculative" read_swap_cache_async: free this entry and retry with a new one */ + current->flags = flags; + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache.
diff -uNr linux-2.5.22/mm/swapfile.c linux-2.5.22-rmap13b/mm/swapfile.c --- linux-2.5.22/mm/swapfile.c Tue Jun 18 13:42:11 2002 +++ linux-2.5.22-rmap13b/mm/swapfile.c Tue Jun 18 13:47:39 2002 @@ -384,6 +384,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); swap_free(entry); ++vma->vm_mm->rss; } diff -uNr linux-2.5.22/mm/vmscan.c linux-2.5.22-rmap13b/mm/vmscan.c --- linux-2.5.22/mm/vmscan.c Wed Jun 12 16:06:36 2002 +++ linux-2.5.22-rmap13b/mm/vmscan.c Tue Jun 18 13:56:10 2002 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include /* for try_to_release_page() */ @@ -29,6 +30,8 @@ #include #include +static void refill_freelist(void); +static void wakeup_memwaiters(void); /* * The "priority" of VM scanning is how much of the queues we * will scan in one go. A value of 6 for DEF_PRIORITY implies @@ -37,432 +40,274 @@ */ #define DEF_PRIORITY (6) -static inline int is_page_cache_freeable(struct page * page) +static inline void age_page_up(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 1; + page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); } -/* - * On the swap_out path, the radix-tree node allocations are performing - * GFP_ATOMIC allocations under PF_MEMALLOC. They can completely - * exhaust the page allocator. This is bad; some pages should be left - * available for the I/O system to start sending the swapcache contents - * to disk. - * - * So PF_MEMALLOC is dropped here. This causes the slab allocations to fail - * earlier, so radix-tree nodes will then be allocated from the mempool - * reserves. 
- */ -static inline int -swap_out_add_to_swap_cache(struct page *page, swp_entry_t entry) +static inline void age_page_down(struct page *page) { - int flags = current->flags; - int ret; - - current->flags &= ~PF_MEMALLOC; - current->flags |= PF_RADIX_TREE; - ret = add_to_swap_cache(page, entry); - current->flags = flags; - return ret; + page->age -= min(PAGE_AGE_DECL, (int)page->age); } -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +/* Must be called with page's pte_chain_lock held. */ +static inline int page_mapping_inuse(struct page * page) { - pte_t pte; - swp_entry_t entry; + struct address_space *mapping = page->mapping; - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } - - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; - - /* Don't bother replenishing zones not under pressure.. */ - if (!memclass(page_zone(page), classzone)) - return 0; + /* Page is in somebody's page tables. */ + if (page->pte_chain) + return 1; - if (TestSetPageLocked(page)) + /* XXX: does this happen ? */ + if (!mapping) return 0; - if (PageWriteback(page)) - goto out_unlock; + /* File is mmap'd by somebody. */ + if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared)) + return 1; - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. 
This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); + return 0; +} - if (pte_dirty(pte)) - set_page_dirty(page); - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - unlock_page(page); - { - int freeable = page_count(page) - - !!PagePrivate(page) <= 2; - page_cache_release(page); - return freeable; - } - } +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. + */ +struct page * reclaim_page(zone_t * zone) +{ + struct address_space * mapping; + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. - * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; + * We need to hold the page_lock around all tests to make sure + * reclaim_page() cannot race with find_get_page() and friends. 
+ */ + spin_lock(&pagemap_lru_lock); + maxscan = zone->inactive_clean_pages; + while (maxscan-- && !list_empty(&zone->inactive_clean_list)) { + page_lru = zone->inactive_clean_list.prev; + page = list_entry(page_lru, struct page, lru); - /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. - */ - if (PagePrivate(page)) - goto preserve; + mapping = page->mapping; + spin_lock(&mapping->page_lock); - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - switch (swap_out_add_to_swap_cache(page, entry)) { - case 0: /* Success */ - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; - case -ENOMEM: /* radix-tree allocation */ - swap_free(entry); - goto preserve; - default: /* ENOENT: raced */ - break; + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageInactiveClean(page))) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + page_zone(page)->inactive_clean_pages--; + goto unlock; + } + + /* Page is being freed */ + if (unlikely(!page_count(page))) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + goto unlock; + } + + /* Page cannot be reclaimed ? Move to inactive_dirty list. 
*/ + pte_chain_lock(page); + if (unlikely(page->pte_chain || PagePrivate(page) || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TestSetPageLocked(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + pte_chain_unlock(page); + goto unlock; } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } - - /* No swap space left */ -preserve: - set_pte(page_table, pte); -out_unlock: - unlock_page(page); - return 0; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pte_t * pte; - unsigned long pmd_end; - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset_map(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* + * From here until reaching either the bottom of the loop + * or found_page: the pte_chain_lock is held. + */ - do { - if (pte_present(*pte)) { - unsigned long pfn = pte_pfn(*pte); - struct page *page = pfn_to_page(pfn); - - if (pfn_valid(pfn) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - pte++; - break; - } - } + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - pte_unmap(pte - 1); - mm->swap_address = address; - return count; -} -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; + } - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; + /* We should never ever get here. */ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + pte_chain_unlock(page); + unlock_page(page); +unlock: + spin_unlock(&mapping->page_lock); } + spin_unlock(&pagemap_lru_lock); + return NULL; - pmd = pmd_offset(dir, address); - - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; -} - -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) -{ - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; - - pgdir = pgd_offset(mm, address); - - end = vma->vm_end; - if (address >= end) - BUG(); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; +found_page: + __lru_cache_del(page); + pte_chain_unlock(page); + spin_unlock(&mapping->page_lock); + spin_unlock(&pagemap_lru_lock); + if (entry.val) + swap_free(entry); + unlock_page(page); + page->age = PAGE_AGE_START; + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * page_launder_zone - clean dirty inactive pages, move to inactive_clean list + * @zone: zone to free pages in + * @gfp_mask: what operations we are allowed to do + * + * This function is called when we are low on free / inactive_clean + * pages, its purpose is to refill the free/clean list as efficiently + * as possible. + * + * This means we do writes asynchronously as long as possible and will + * only sleep on IO when we don't have another option. Since writeouts + * cause disk seeks and make read IO slower, we skip writes altogether + * when the amount of dirty pages is small. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon.
*/ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) -{ - unsigned long address; - struct vm_area_struct* vma; - - /* - * Find the proper vm-area after freezing the vma chain - * and ptes. - */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; - } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; - } - } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; - -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} - -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; - - counter = mmlist_nr; - do { - if (need_resched()) { - __set_current_state(TASK_RUNNING); - schedule(); - } - - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; - } - - /* Make sure the mm doesn't disappear when we drop the lock.. 
*/ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); - - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); - - mmput(mm); - - if (!nr_pages) - return 1; - } while (--counter >= 0); - - return 0; - -empty: - spin_unlock(&mmlist_lock); - return 0; -} - -static int -shrink_cache(int nr_pages, zone_t *classzone, - unsigned int gfp_mask, int priority, int max_scan) +int page_launder_zone(zone_t * zone, int gfp_mask, int priority) { + int maxscan, cleaned_pages = 0, target = free_plenty(zone); struct list_head * entry; - struct address_space *mapping; - int max_mapped = nr_pages << (9 - priority); + struct address_space * mapping; + /* The main launder loop. */ spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && - (entry = inactive_list.prev) != &inactive_list) { + maxscan = zone->inactive_dirty_pages >> priority; + while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) { struct page * page; - + + /* Low latency reschedule point */ if (need_resched()) { spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); schedule(); spin_lock(&pagemap_lru_lock); continue; } + entry = zone->inactive_dirty_list.prev; page = list_entry(entry, struct page, lru); - if (unlikely(!PageLRU(page))) - BUG(); - if (unlikely(PageActive(page))) - BUG(); + if (cleaned_pages > target) + break; list_del(entry); - list_add(entry, &inactive_list); + list_add(entry, &zone->inactive_dirty_list); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveDirty(page)) { + printk("VM: page_launder, wrong page on list.\n"); + list_del(entry); + dec_page_state(nr_inactive_dirty_pages); + page_zone(page)->inactive_dirty_pages--; + continue; + } /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. + * Page is being freed, don't worry about it. 
*/ if (unlikely(!page_count(page))) continue; - if (!memclass(page_zone(page), classzone)) + if (unlikely(TestSetPageLocked(page))) continue; - /* Racy check to avoid trylocking when not worthwhile */ - if (!PagePrivate(page) && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; + if (PageWriteback(page)) { /* The non-racy check */ + unlock_page(page); + continue; + } /* - * IO in progress? Leave it at the back of the list. + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. */ - if (unlikely(PageWriteback(page))) { - if (gfp_mask & __GFP_FS) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - wait_on_page_writeback(page); + pte_chain_lock(page); + if (page_referenced(page) && page_mapping_inuse(page) && + !page_over_rsslimit(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + page->age = max((int)page->age, PAGE_AGE_START); + pte_chain_unlock(page); + unlock_page(page); + continue; + } + + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte_chain && !page->mapping && !PagePrivate(page)) { + page_cache_get(page); + pte_chain_unlock(page); + spin_unlock(&pagemap_lru_lock); + if (!add_to_swap(page)) { + activate_page(page); + unlock_page(page); page_cache_release(page); spin_lock(&pagemap_lru_lock); + continue; } - continue; + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + pte_chain_lock(page); } - if (TestSetPageLocked(page)) - continue; - - if (PageWriteback(page)) { /* The non-racy check */ - unlock_page(page); - continue; + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. 
+ */ + if (page->pte_chain) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + pte_chain_unlock(page); + unlock_page(page); + continue; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } } - + pte_chain_unlock(page); mapping = page->mapping; - if (PageDirty(page) && is_page_cache_freeable(page) && - page->mapping && (gfp_mask & __GFP_FS)) { + if (PageDirty(page) && mapping && (gfp_mask & __GFP_FS)) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer - * like O_DIRECT would set the page's dirty bitflag - * on the phisical page after having successfully + * like O_DIRECT would set the PG_dirty bitflag + * on the physical page after having successfully * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. */ int (*writeback)(struct page *, int *); - const int nr_pages = SWAP_CLUSTER_MAX; - int nr_to_write = nr_pages; + int nr_to_write = SWAP_CLUSTER_MAX; writeback = mapping->a_ops->vm_writeback; if (writeback == NULL) @@ -470,7 +315,6 @@ page_cache_get(page); spin_unlock(&pagemap_lru_lock); (*writeback)(page, &nr_to_write); - max_scan -= (nr_pages - nr_to_write); page_cache_release(page); spin_lock(&pagemap_lru_lock); continue; @@ -484,7 +328,7 @@ if (PagePrivate(page)) { spin_unlock(&pagemap_lru_lock); - /* avoid to free a locked page */ + /* To avoid freeing our page before we're done. */ page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { @@ -501,15 +345,14 @@ /* effectively free the page here */ page_cache_release(page); - - if (--nr_pages) - continue; - break; + cleaned_pages++; + continue; } else { /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. 
+ * We freed the buffers but may have + * slept; undo the stuff we did before + * try_to_release_page and fall through + * to the next step. */ page_cache_release(page); @@ -526,236 +369,285 @@ } /* - * This is the non-racy check for busy page. + * If the page is really freeable now, move it to the + * inactive_clean list. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. + * This test is not safe from races, but only the one + * in reclaim_page() needs to be. */ - if (mapping) { - write_lock(&mapping->page_lock); - if (is_page_cache_freeable(page)) - goto page_freeable; - write_unlock(&mapping->page_lock); + pte_chain_lock(page); + if (mapping && !PageDirty(page) && !page->pte_chain && + page_count(page) == 1) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + pte_chain_unlock(page); + unlock_page(page); + cleaned_pages++; + } else { + /* + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it to + * the active list. + */ +page_active: + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + pte_chain_unlock(page); + unlock_page(page); } - unlock_page(page); -page_mapped: - if (--max_mapped >= 0) - continue; + } + spin_unlock(&pagemap_lru_lock); - /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! - */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; + /* Return the number of pages moved to the inactive_clean list. */ + return cleaned_pages; +} + +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * + * This function iterates over all zones and calls page_launder_zone(), + * balancing still needs to be added... 
+ */ +int page_launder(int gfp_mask) +{ + int maxtry = 1 << DEF_PRIORITY; + struct zone_struct * zone; + int freed = 0; + + /* Global balancing while we have a global shortage. */ + while (maxtry-- && free_high(ALL_ZONES) >= 0) { + for_each_zone(zone) + if (free_plenty(zone) >= 0) + freed += page_launder_zone(zone, gfp_mask, 6); + } + + /* Clean up the remaining zones with a serious shortage, if any. */ + for_each_zone(zone) + if (free_min(zone) >= 0) + freed += page_launder_zone(zone, gfp_mask, 0); + + return freed; +} + +/** + * refill_inactive_zone - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan + * + * This function will scan a portion of the active list of a zone to find + * unused pages, those pages will then be moved to the inactive list. + */ +int refill_inactive_zone(struct zone_struct * zone, int priority) +{ + int maxscan = zone->active_pages >> priority; + int target = inactive_high(zone); + struct list_head * page_lru; + int nr_deactivated = 0; + struct page * page; + + /* Take the lock while messing with the list... */ + spin_lock(&pagemap_lru_lock); + while (maxscan-- && !list_empty(&zone->active_list)) { + page_lru = zone->active_list.prev; + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageActive(page))) { + printk("VM: refill_inactive, wrong page on list.\n"); + list_del(page_lru); + dec_page_state(nr_active_pages); + continue; + } + + /* Needed to follow page->mapping */ + if (TestSetPageLocked(page)) { + list_del(page_lru); + list_add(page_lru, &zone->active_list); + continue; + } -page_freeable: /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. + * If the object the page is in is not in use we don't + * bother with page aging. If the page is touched again + * while on the inactive_clean list it'll be reactivated. 
+ * From here until the end of the current iteration + * both PG_locked and the pte_chain_lock are held. */ - if (PageDirty(page)) { - write_unlock(&mapping->page_lock); + pte_chain_lock(page); + if (!page_mapping_inuse(page)) { + pte_chain_unlock(page); unlock_page(page); + drop_page(page); continue; } - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - write_unlock(&mapping->page_lock); + /* + * Do aging on the pages. + */ + if (page_referenced(page)) { + age_page_up(page); } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - write_unlock(&mapping->page_lock); - swap_free(swap); + age_page_down(page); } - __lru_cache_del(page); + /* + * If the page age is 'hot' and the process using the + * page doesn't exceed its RSS limit we keep the page. + * Otherwise we move it to the inactive_dirty list. + */ + if (page->age && !page_over_rsslimit(page)) { + list_del(page_lru); + list_add(page_lru, &zone->active_list); + } else { + deactivate_page_nolock(page); + if (++nr_deactivated > target) { + pte_chain_unlock(page); + unlock_page(page); + goto done; + } + } + pte_chain_unlock(page); unlock_page(page); - /* effectively free the page here */ - page_cache_release(page); - - if (--nr_pages) - continue; - break; + /* Low latency reschedule point */ + if (need_resched()) { + spin_unlock(&pagemap_lru_lock); + schedule(); + spin_lock(&pagemap_lru_lock); + } } - spin_unlock(&pagemap_lru_lock); - return nr_pages; +done: + spin_unlock(&pagemap_lru_lock); + return nr_deactivated; } -/* - * This moves pages from the active list to - * the inactive list. +/** + * refill_inactive - checks all zones and refills the inactive list as needed * - * We move them the other way when we see the - * reference bit on the page. + * This function tries to balance page eviction from all zones by aging + * the pages from each zone in the same ratio until the global inactive + * shortage is resolved. 
After that it does one last "clean-up" scan to + * fix up local inactive shortages. */ -static void refill_inactive(int nr_pages) +int refill_inactive(void) { - struct list_head * entry; - - spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { - struct page * page; + int maxtry = 1 << DEF_PRIORITY; + zone_t * zone; + int ret = 0; - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (TestClearPageReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); - continue; + /* Global balancing while we have a global shortage. */ + while (maxtry-- && inactive_low(ALL_ZONES) >= 0) { + for_each_zone(zone) { + if (inactive_high(zone) >= 0) + ret += refill_inactive_zone(zone, DEF_PRIORITY); } + } - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* Local balancing for zones which really need it. */ + for_each_zone(zone) { + if (inactive_min(zone) >= 0) + ret += refill_inactive_zone(zone, 0); } - spin_unlock(&pagemap_lru_lock); + + return ret; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/** + * background_aging - slow background aging of zones + * @priority: priority at which to scan + * + * When the VM load is low or nonexistent, this function is + * called once a second to "sort" the pages in the VM. This + * way we know which pages to evict once a load spike happens. + * The effects of this function are very slow, the CPU usage + * should be minimal to nonexistent under most loads.
+ */ +static inline void background_aging(int priority) { - int chunk_size = nr_pages; - unsigned long ratio; - struct page_state ps; - int max_scan; + struct zone_struct * zone; - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + for_each_zone(zone) + if (inactive_high(zone) > 0) + refill_inactive_zone(zone, priority); +} - nr_pages = chunk_size; +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 0; /* - * Try to keep the active list 2/3 of the size of the cache + * Eat memory from filesystem page cache, + * dentry, inode and filesystem quota caches. */ - get_page_state(&ps); - ratio = (unsigned long)nr_pages * ps.nr_active / - ((ps.nr_inactive | 1) * 2); - refill_inactive(ratio); - max_scan = ps.nr_inactive / priority; - nr_pages = shrink_cache(nr_pages, classzone, - gfp_mask, priority, max_scan); - if (nr_pages <= 0) - return 0; - - wakeup_bdflush(); - - shrink_dcache_memory(priority, gfp_mask); - - /* After shrinking the dcache, get rid of unused inodes too .. */ - shrink_icache_memory(1, gfp_mask); + ret += page_launder(gfp_mask); + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(1, gfp_mask); #ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); #endif + /* + * Move pages from the active list to the inactive list. + */ + refill_inactive(); - return nr_pages; -} + /* + * Reclaim unused slab cache memory. 
+ */ + ret += kmem_cache_reap(gfp_mask); -int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) -{ - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + refill_freelist(); - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + /* Start IO when needed. */ + if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0) + blk_run_queues(); /* * Hmm.. Cache shrink failed - time to kill something? * Mhwahahhaha! This is the part I really like. Giggle. */ - out_of_memory(); - return 0; -} - -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) -{ - zone_t * first_classzone; - - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; + if (!ret && free_min(ANY_ZONE) > 0) + out_of_memory(); + return ret; } -static int kswapd_balance_pgdat(pg_data_t * pgdat) +/** + * refill_freelist - move inactive_clean pages to free list if needed + * + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. This + * function really only does something when we don't have a + * userspace load on __alloc_pages(). + * + * We refill the freelist in a bump from pages_min to pages_min * 2 + * in order to give the buddy allocator something to play with. 
+ */ +static void refill_freelist(void) { - int need_more_balance = 0, i; + struct page * page; zone_t * zone; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - cond_resched(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + for_each_zone(zone) { + if (!zone->size || zone->free_pages >= zone->pages_min) continue; - } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } - - return need_more_balance; -} - -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); - } while (need_more_balance); -} - -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - zone_t * zone; - int i; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; + while (zone->free_pages < zone->pages_min * 2) { + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } } - - return 1; -} - -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; } /* @@ -774,7 +666,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -798,26 +689,151 @@ * Kswapd main loop. */ for (;;) { + static long recalc = 0; if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + /* + * We try to rebalance the VM either when we have a + * global shortage of free pages or when one particular + * zone is very short on free pages. 
+ */ + if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0) + do_try_to_free_pages(GFP_KSWAPD); - mb(); - if (kswapd_can_sleep()) - schedule(); + refill_freelist(); - __set_current_state(TASK_RUNNING); + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + background_aging(DEF_PRIORITY); + } + + wakeup_memwaiters(); + } +} + +static int kswapd_overloaded; +unsigned int kswapd_minfree; /* initialized in mm/page_alloc.c */ +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +DECLARE_WAIT_QUEUE_HEAD(kswapd_done); + +/** + * wakeup_kswapd - wake up the pageout daemon + * @gfp_mask: page freeing flags + * + * This function wakes up kswapd and can, under heavy VM pressure, + * put the calling task to sleep temporarily. + */ +void wakeup_kswapd(unsigned int gfp_mask) +{ + DECLARE_WAITQUEUE(wait, current); + + /* If we're in the memory freeing business ourself, don't sleep + * but just wake kswapd and go back to business. + */ + if (current->flags & PF_MEMALLOC) { + wake_up_interruptible(&kswapd_wait); + return; + } + + /* We need all of kswapd's GFP flags, otherwise we can't sleep on it. + * We still wake kswapd of course. + */ + if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) { + wake_up_interruptible(&kswapd_wait); + return; + } + + add_wait_queue(&kswapd_done, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Wake kswapd .... */ + wake_up_interruptible(&kswapd_wait); + + /* ... and check if we need to wait on it */ + if ((free_low(ALL_ZONES) > (kswapd_minfree / 2)) && !kswapd_overloaded) + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); +} + +static void wakeup_memwaiters(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&kswapd_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + /* Don't let the processes waiting on memory get stuck, ever. */ + wake_up(&kswapd_done); + + /* Enough free RAM, we can easily keep up with memory demand.
*/ + if (free_high(ALL_ZONES) <= 0) { + schedule_timeout(HZ); remove_wait_queue(&kswapd_wait, &wait); + return; + } + remove_wait_queue(&kswapd_wait, &wait); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. - */ - kswapd_balance(); - blk_run_queues(); + /* OK, the VM is very loaded. Sleep instead of using all CPU. */ + kswapd_overloaded = 1; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 4); + kswapd_overloaded = 0; + return; +} + +/** + * try_to_free_pages - run the pageout code ourselves + * @gfp_mask: mask of things the pageout code is allowed to do + * + * When the load on the system gets higher, it can happen + * that kswapd no longer manages to keep enough memory + * free. In those cases user programs allocating memory + * will call try_to_free_pages() and help the pageout code. + * This has the effects of freeing memory and slowing down + * the largest memory hogs a bit. + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + return ret; +} + +/** + * rss_free_pages - run part of the pageout code and slow down a bit + * @gfp_mask: mask of things the pageout code is allowed to do + * + * This function is called when a task is over its RSS limit and + * has a page fault. Its goal is to free some memory so non-hogs + * can run faster and slow down itself when needed so it won't eat + * the memory non-hogs can use.
+ */ +void rss_free_pages(unsigned int gfp_mask) +{ + long pause = 0; + if (current->flags & PF_MEMALLOC) + return; + current->flags |= PF_MEMALLOC; + + do { + page_launder(gfp_mask); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(pause); + set_current_state(TASK_RUNNING); + pause++; + } while (free_high(ALL_ZONES) >= 0); + + current->flags &= ~PF_MEMALLOC; + return; } static int __init kswapd_init(void)