diff -uNr linux-2.5.22/Makefile linux-2.5.22-rmap13b/Makefile
--- linux-2.5.22/Makefile	Tue Jun 18 13:41:43 2002
+++ linux-2.5.22-rmap13b/Makefile	Tue Jun 18 13:48:41 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 5
 SUBLEVEL = 22
-EXTRAVERSION =
+EXTRAVERSION = -rmap13b
 
 # We are using a recursive build, so we need to do a little thinking
 # to get the ordering right.
diff -uNr linux-2.5.22/drivers/block/blkpg.c linux-2.5.22-rmap13b/drivers/block/blkpg.c
--- linux-2.5.22/drivers/block/blkpg.c	Tue Jun 18 13:41:52 2002
+++ linux-2.5.22-rmap13b/drivers/block/blkpg.c	Tue Jun 18 13:47:38 2002
@@ -35,9 +35,9 @@
 #include <linux/blkpg.h>
 #include <linux/genhd.h>
 #include <linux/module.h>               /* for EXPORT_SYMBOL */
+#include <linux/elevator.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
-
 #include <asm/uaccess.h>
 
 /*
@@ -218,6 +218,9 @@
 	request_queue_t *q;
 	u64 ullval = 0;
 	int intval;
+	blkelv_ioctl_arg_t elevator_arg;
+	elevator_t *e;
+	int ret;
 	unsigned short usval;
 	kdev_t dev = to_kdev_t(bdev->bd_dev);
 	int holder;
diff -uNr linux-2.5.22/drivers/block/elevator.c linux-2.5.22-rmap13b/drivers/block/elevator.c
--- linux-2.5.22/drivers/block/elevator.c	Thu May  2 17:22:56 2002
+++ linux-2.5.22-rmap13b/drivers/block/elevator.c	Tue Jun 18 13:47:38 2002
@@ -162,32 +162,46 @@
 int elevator_linus_merge(request_queue_t *q, struct request **req,
 			 struct bio *bio)
 {
+	int max_bomb_segments;
+	int *latency;
 	struct list_head *entry;
 	struct request *__rq;
+	int merge_only = 0;
 	int ret;
 
 	if ((ret = elv_try_last_merge(q, req, bio)))
 		return ret;
 
+	latency = q->elevator.elevator_data;
+	max_bomb_segments = 0;
+	if (latency)
+		max_bomb_segments = latency[2];
+
 	entry = &q->queue_head;
 	ret = ELEVATOR_NO_MERGE;
 	while ((entry = entry->prev) != &q->queue_head) {
 		__rq = list_entry_rq(entry);
 
-		if (__rq->flags & (REQ_BARRIER | REQ_STARTED))
+		if (__rq->flags & (REQ_BARRIER | REQ_STARTED)) {
+			max_bomb_segments = 0;	/* No read promotions */
 			break;
+		}
 
-		/*
-		 * simply "aging" of requests in queue
-		 */
-		if (elv_linus_sequence(__rq)-- <= 0)
-			break;
+		if (elv_linus_sequence(__rq)-- <= 0) {
+			/*
+			 * OK, we've exceeded someone's latency limit.
+			 * But we still continue to look for merges,
+			 * because they're so much better than seeks.
+			 */
+			merge_only = 1;
+		}
 		if (!(__rq->flags & REQ_CMD))
 			continue;
 		if (elv_linus_sequence(__rq) < bio_sectors(bio))
-			break;
+			merge_only = 1;
 
-		if (!*req && bio_rq_in_between(bio, __rq, &q->queue_head))
+		if (!*req && !merge_only &&
+				bio_rq_in_between(bio, __rq, &q->queue_head))
 			*req = __rq;
 
 		if ((ret = elv_try_merge(__rq, bio))) {
@@ -199,6 +213,49 @@
 		}
 	}
 
+	/*
+	 * If we failed to merge a read anywhere in the request
+	 * queue, we really don't want to place it at the end
+	 * of the list, behind lots of writes.  So place it near
+	 * the front.
+	 *
+	 * We don't want to place it in front of _all_ writes: that
+	 * would create lots of seeking, and isn't tunable.
+	 * We try to avoid promoting this read in front of existing
+	 * reads.
+	 *
+	 * max_bomb_segments becomes the maximum number of write
+	 * requests which we allow to remain in place in front of
+	 * a newly introduced read.  We weight things a little bit,
+	 * so large writes are more expensive than small ones, but it's
+	 * requests which count, not sectors.
+	 */
+	if (max_bomb_segments && bio_data_dir(bio) == READ &&
+					ret == ELEVATOR_NO_MERGE) {
+		int cur_latency = 0;
+		struct request * const cur_request = *req;
+
+		entry = q->queue_head.next;
+		while (entry != &q->queue_head) {
+			__rq = list_entry_rq(entry);
+			if (__rq == cur_request) {
+				/*
+				 * This is where the old algorithm placed it.
+				 * There's no point pushing it further back,
+				 * so leave it here, in sorted order.
+				 */
+				break;
+			}
+			if (rq_data_dir(__rq) == WRITE) {
+				cur_latency += 1 + __rq->nr_sectors / 64;
+				if (cur_latency >= max_bomb_segments) {
+					*req = __rq;
+					break;
+				}
+			}
+			entry = entry->next;
+		}
+	}
 	return ret;
 }
 
@@ -251,12 +308,13 @@
 {
 	int *latency;
 
-	latency = kmalloc(2 * sizeof(int), GFP_KERNEL);
+	latency = kmalloc(3 * sizeof(int), GFP_KERNEL);
 	if (!latency)
 		return -ENOMEM;
 
 	latency[READ] = 8192;
 	latency[WRITE] = 16384;
+	latency[2] = 6;		/* max_bomb_segments */
 
 	e->elevator_data = latency;
 	return 0;
diff -uNr linux-2.5.22/drivers/block/ll_rw_blk.c linux-2.5.22-rmap13b/drivers/block/ll_rw_blk.c
--- linux-2.5.22/drivers/block/ll_rw_blk.c	Tue Jun 18 13:41:53 2002
+++ linux-2.5.22-rmap13b/drivers/block/ll_rw_blk.c	Tue Jun 18 13:47:38 2002
@@ -2002,8 +2002,8 @@
 	queue_nr_requests = (total_ram >> 8) & ~15;	/* One per quarter-megabyte */
 	if (queue_nr_requests < 32)
 		queue_nr_requests = 32;
-	if (queue_nr_requests > 512)
-		queue_nr_requests = 512;
+	if (queue_nr_requests > 1024)
+		queue_nr_requests = 1024;
 
 	/*
 	 * Batch frees according to queue length
diff -uNr linux-2.5.22/drivers/pci/pci-driver.c linux-2.5.22-rmap13b/drivers/pci/pci-driver.c
--- linux-2.5.22/drivers/pci/pci-driver.c	Wed Jun 12 16:07:01 2002
+++ linux-2.5.22-rmap13b/drivers/pci/pci-driver.c	Tue Jun 18 13:47:38 2002
@@ -210,3 +210,4 @@
 EXPORT_SYMBOL(pci_register_driver);
 EXPORT_SYMBOL(pci_unregister_driver);
 EXPORT_SYMBOL(pci_dev_driver);
+EXPORT_SYMBOL(pci_bus_type);
diff -uNr linux-2.5.22/drivers/scsi/constants.c linux-2.5.22-rmap13b/drivers/scsi/constants.c
--- linux-2.5.22/drivers/scsi/constants.c	Tue Jun 18 13:42:02 2002
+++ linux-2.5.22-rmap13b/drivers/scsi/constants.c	Tue Jun 18 13:47:38 2002
@@ -992,11 +992,14 @@
 		s = 4;
 	}
     
-#if !(CONSTANTS & CONST_SENSE)
+#if !(CONSTANTS & CONST_SENSE) 
+{
+	int i;
 	printk("Raw sense data:");
 	for (i = 0; i < s; ++i) 
 		printk("0x%02x ", sense_buffer[i]);
 	printk("\n");
+}
 #endif
 }
 
diff -uNr linux-2.5.22/fs/buffer.c linux-2.5.22-rmap13b/fs/buffer.c
--- linux-2.5.22/fs/buffer.c	Tue Jun 18 13:42:06 2002
+++ linux-2.5.22-rmap13b/fs/buffer.c	Tue Jun 18 13:47:38 2002
@@ -475,17 +475,13 @@
 }
 
 /*
- * FIXME: What is this function actually trying to do?  Why "zones[0]"?
+ * FIXME: What is this function actually trying to do? 
  * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
  */
 static void free_more_memory(void)
 {
-	zone_t *zone;
-
-	zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
-
 	wakeup_bdflush();
-	try_to_free_pages(zone, GFP_NOFS, 0);
+	try_to_free_pages(GFP_NOFS);
 	blk_run_queues();
 	__set_current_state(TASK_RUNNING);
 	yield();
diff -uNr linux-2.5.22/fs/dcache.c linux-2.5.22-rmap13b/fs/dcache.c
--- linux-2.5.22/fs/dcache.c	Wed Jun 12 16:07:07 2002
+++ linux-2.5.22-rmap13b/fs/dcache.c	Tue Jun 18 13:47:38 2002
@@ -602,8 +602,7 @@
 	count = dentry_stat.nr_unused / priority;
 
 	prune_dcache(count);
-	kmem_cache_shrink(dentry_cache);
-	return 0;
+	return kmem_cache_shrink(dentry_cache);
 }
 
 #define NAME_ALLOC_LEN(len)	((len+16) & ~15)
diff -uNr linux-2.5.22/fs/dquot.c linux-2.5.22-rmap13b/fs/dquot.c
--- linux-2.5.22/fs/dquot.c	Tue Jun 18 13:42:06 2002
+++ linux-2.5.22-rmap13b/fs/dquot.c	Tue Jun 18 13:47:38 2002
@@ -498,8 +498,7 @@
 	count = dqstats.free_dquots / priority;
 	prune_dqcache(count);
 	unlock_kernel();
-	kmem_cache_shrink(dquot_cachep);
-	return 0;
+	return kmem_cache_shrink(dquot_cachep);
 }
 
 /*
diff -uNr linux-2.5.22/fs/exec.c linux-2.5.22-rmap13b/fs/exec.c
--- linux-2.5.22/fs/exec.c	Wed Jun 12 15:44:33 2002
+++ linux-2.5.22-rmap13b/fs/exec.c	Tue Jun 18 13:47:38 2002
@@ -36,6 +36,7 @@
 #include <linux/spinlock.h>
 #include <linux/personality.h>
 #include <linux/binfmts.h>
+#include <linux/swap.h>
 #define __NO_VERSION__
 #include <linux/module.h>
 #include <linux/namei.h>
@@ -283,6 +284,7 @@
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+	page_add_rmap(page, pte);
 	pte_unmap(pte);
 	tsk->mm->rss++;
 	spin_unlock(&tsk->mm->page_table_lock);
diff -uNr linux-2.5.22/fs/inode.c linux-2.5.22-rmap13b/fs/inode.c
--- linux-2.5.22/fs/inode.c	Tue Jun 18 13:42:06 2002
+++ linux-2.5.22-rmap13b/fs/inode.c	Tue Jun 18 13:47:38 2002
@@ -431,8 +431,7 @@
 	count = inodes_stat.nr_unused / priority;
 
 	prune_icache(count);
-	kmem_cache_shrink(inode_cachep);
-	return 0;
+	return kmem_cache_shrink(inode_cachep);
 }
 
 /*
diff -uNr linux-2.5.22/fs/mpage.c linux-2.5.22-rmap13b/fs/mpage.c
--- linux-2.5.22/fs/mpage.c	Tue Jun 18 13:42:06 2002
+++ linux-2.5.22-rmap13b/fs/mpage.c	Tue Jun 18 13:47:38 2002
@@ -518,15 +518,6 @@
 
 		if (page->mapping && TestClearPageDirty(page) &&
 					!PageWriteback(page)) {
-			/* FIXME: batch this up */
-			if (!PageActive(page) && PageLRU(page)) {
-				spin_lock(&pagemap_lru_lock);
-				if (!PageActive(page) && PageLRU(page)) {
-					list_del(&page->lru);
-					list_add(&page->lru, &inactive_list);
-				}
-				spin_unlock(&pagemap_lru_lock);
-			}
 			bio = mpage_writepage(bio, page, get_block,
 					&last_block_in_bio, &ret);
 			if (ret || (nr_to_write && --(*nr_to_write) <= 0))
diff -uNr linux-2.5.22/fs/proc/proc_misc.c linux-2.5.22-rmap13b/fs/proc/proc_misc.c
--- linux-2.5.22/fs/proc/proc_misc.c	Wed Jun 12 15:44:33 2002
+++ linux-2.5.22-rmap13b/fs/proc/proc_misc.c	Tue Jun 18 13:47:38 2002
@@ -27,6 +27,7 @@
 #include <linux/ioport.h>
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
@@ -150,7 +151,9 @@
 		"Cached:       %8lu kB\n"
 		"SwapCached:   %8lu kB\n"
 		"Active:       %8lu kB\n"
-		"Inactive:     %8lu kB\n"
+		"Inact_dirty:  %8lu kB\n"
+		"Inact_clean:  %8lu kB\n"
+		"Inact_target: %8lu kB\n"
 		"HighTotal:    %8lu kB\n"
 		"HighFree:     %8lu kB\n"
 		"LowTotal:     %8lu kB\n"
@@ -164,8 +167,10 @@
 		K(i.sharedram),
 		K(ps.nr_pagecache-swapper_space.nrpages),
 		K(swapper_space.nrpages),
-		K(ps.nr_active),
-		K(ps.nr_inactive),
+		K(ps.nr_active_pages),
+		K(ps.nr_inactive_dirty_pages),
+		K(ps.nr_inactive_clean_pages),
+		K(inactive_target()),
 		K(i.totalhigh),
 		K(i.freehigh),
 		K(i.totalram-i.totalhigh),
diff -uNr linux-2.5.22/include/asm-alpha/rmap.h linux-2.5.22-rmap13b/include/asm-alpha/rmap.h
--- linux-2.5.22/include/asm-alpha/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-alpha/rmap.h	Tue Jun 18 13:47:38 2002
@@ -0,0 +1,7 @@
+#ifndef _ALPHA_RMAP_H
+#define _ALPHA_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-arm/proc-armv/rmap.h linux-2.5.22-rmap13b/include/asm-arm/proc-armv/rmap.h
--- linux-2.5.22/include/asm-arm/proc-armv/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-arm/proc-armv/rmap.h	Tue Jun 18 13:47:38 2002
@@ -0,0 +1,72 @@
+#ifndef _ARMV_RMAP_H
+#define _ARMV_RMAP_H
+/*
+ * linux/include/asm-arm/proc-armv/rmap.h
+ *
+ * Architecture dependant parts of the reverse mapping code,
+ *
+ * We use the struct page of the page table page to find a pointer
+ * to an array of two 'struct arm_rmap_info's, one for each of the
+ * two page tables in each page.
+ * 
+ * - rmi->mm points to the process' mm_struct
+ * - rmi->index has the high bits of the address
+ * - the lower bits of the address are calculated from the
+ *   offset of the page table entry within the page table page
+ */
+#include <linux/mm.h>
+
+struct arm_rmap_info {
+	struct mm_struct *mm;
+	unsigned long index;
+};
+
+static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address)
+{
+	struct page * page = virt_to_page(ptep);
+	struct arm_rmap_info *rmi = (void *)page->mapping;
+
+	if (((unsigned long)ptep)&2048)
+		rmi++;
+
+	rmi->mm = mm;
+	rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_remove_rmap(pte_t * ptep)
+{
+	struct page * page = virt_to_page(ptep);
+	struct arm_rmap_info *rmi = (void *)page->mapping;
+
+	if (((unsigned long)ptep)&2048)
+		rmi++;
+
+	rmi->mm = NULL;
+	rmi->index = 0;
+}
+
+static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
+{
+	struct page * page = virt_to_page(ptep);
+	struct arm_rmap_info *rmi = (void *)page->mapping;
+
+	if (((unsigned long)ptep)&2048)
+		rmi++;
+
+	return rmi->mm;
+}
+
+static inline unsigned long ptep_to_address(pte_t * ptep)
+{
+	struct page * page = virt_to_page(ptep);
+	struct arm_rmap_info *rmi = (void *)page->mapping;
+	unsigned long low_bits;
+
+	if (((unsigned long)ptep)&2048)
+		rmi++;
+
+	low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+	return rmi->index + low_bits;
+}
+
+#endif /* _ARMV_RMAP_H */
diff -uNr linux-2.5.22/include/asm-arm/rmap.h linux-2.5.22-rmap13b/include/asm-arm/rmap.h
--- linux-2.5.22/include/asm-arm/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-arm/rmap.h	Tue Jun 18 13:47:38 2002
@@ -0,0 +1,6 @@
+#ifndef _ARM_RMAP_H
+#define _ARM_RMAP_H
+
+#include <asm/proc/rmap.h>
+
+#endif /* _ARM_RMAP_H */
diff -uNr linux-2.5.22/include/asm-cris/rmap.h linux-2.5.22-rmap13b/include/asm-cris/rmap.h
--- linux-2.5.22/include/asm-cris/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-cris/rmap.h	Tue Jun 18 13:47:38 2002
@@ -0,0 +1,7 @@
+#ifndef _CRIS_RMAP_H
+#define _CRIS_RMAP_H
+
+/* nothing to see, move along :) */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-generic/rmap.h linux-2.5.22-rmap13b/include/asm-generic/rmap.h
--- linux-2.5.22/include/asm-generic/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-generic/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,66 @@
+#ifndef _GENERIC_RMAP_H
+#define _GENERIC_RMAP_H
+/*
+ * linux/include/asm-generic/rmap.h
+ *
+ * Architecture dependant parts of the reverse mapping code,
+ * this version should work for most architectures with a
+ * 'normal' page table layout.
+ *
+ * We use the struct page of the page table page to find out
+ * the process and full address of a page table entry:
+ * - page->mapping points to the process' mm_struct
+ * - page->index has the high bits of the address
+ * - the lower bits of the address are calculated from the
+ *   offset of the page table entry within the page table page
+ */
+#include <linux/mm.h>
+
+static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address)
+{
+#ifdef BROKEN_PPC_PTE_ALLOC_ONE
+	/* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
+	extern int mem_init_done;
+
+	if (!mem_init_done)
+		return;
+#endif
+	page->mapping = (void *)mm;
+	page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_add_rmap_kernel(pte_t * ptep, struct mm_struct * mm, unsigned long address)
+{
+	struct page * page = virt_to_page(ptep);
+#ifdef BROKEN_PPC_PTE_ALLOC_ONE
+	/* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
+	extern int mem_init_done;
+
+	if (!mem_init_done)
+		return;
+#endif
+	page->mapping = (void *)mm;
+	page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_remove_rmap(struct page * page)
+{
+	page->mapping = NULL;
+	page->index = 0;
+}
+
+static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
+{
+	struct page * page = virt_to_page(ptep);
+	return (struct mm_struct *) page->mapping;
+}
+
+static inline unsigned long ptep_to_address(pte_t * ptep)
+{
+	struct page * page = virt_to_page(ptep);
+	unsigned long low_bits;
+	low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+	return page->index + low_bits;
+}
+
+#endif /* _GENERIC_RMAP_H */
diff -uNr linux-2.5.22/include/asm-i386/rmap.h linux-2.5.22-rmap13b/include/asm-i386/rmap.h
--- linux-2.5.22/include/asm-i386/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-i386/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _I386_RMAP_H
+#define _I386_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-ia64/rmap.h linux-2.5.22-rmap13b/include/asm-ia64/rmap.h
--- linux-2.5.22/include/asm-ia64/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-ia64/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _IA64_RMAP_H
+#define _IA64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-m68k/rmap.h linux-2.5.22-rmap13b/include/asm-m68k/rmap.h
--- linux-2.5.22/include/asm-m68k/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-m68k/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _M68K_RMAP_H
+#define _M68K_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-mips/rmap.h linux-2.5.22-rmap13b/include/asm-mips/rmap.h
--- linux-2.5.22/include/asm-mips/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-mips/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _MIPS_RMAP_H
+#define _MIPS_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-mips64/rmap.h linux-2.5.22-rmap13b/include/asm-mips64/rmap.h
--- linux-2.5.22/include/asm-mips64/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-mips64/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _MIPS64_RMAP_H
+#define _MIPS64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-parisc/rmap.h linux-2.5.22-rmap13b/include/asm-parisc/rmap.h
--- linux-2.5.22/include/asm-parisc/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-parisc/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _PARISC_RMAP_H
+#define _PARISC_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-ppc/rmap.h linux-2.5.22-rmap13b/include/asm-ppc/rmap.h
--- linux-2.5.22/include/asm-ppc/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-ppc/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,9 @@
+#ifndef _PPC_RMAP_H
+#define _PPC_RMAP_H
+
+/* PPC calls pte_alloc() before mem_map[] is setup ... */
+#define BROKEN_PPC_PTE_ALLOC_ONE
+
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-s390/rmap.h linux-2.5.22-rmap13b/include/asm-s390/rmap.h
--- linux-2.5.22/include/asm-s390/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-s390/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _S390_RMAP_H
+#define _S390_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-s390x/rmap.h linux-2.5.22-rmap13b/include/asm-s390x/rmap.h
--- linux-2.5.22/include/asm-s390x/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-s390x/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _S390X_RMAP_H
+#define _S390X_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-sh/rmap.h linux-2.5.22-rmap13b/include/asm-sh/rmap.h
--- linux-2.5.22/include/asm-sh/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-sh/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _SH_RMAP_H
+#define _SH_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-sparc/rmap.h linux-2.5.22-rmap13b/include/asm-sparc/rmap.h
--- linux-2.5.22/include/asm-sparc/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-sparc/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _SPARC_RMAP_H
+#define _SPARC_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/asm-sparc64/rmap.h linux-2.5.22-rmap13b/include/asm-sparc64/rmap.h
--- linux-2.5.22/include/asm-sparc64/rmap.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/asm-sparc64/rmap.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,7 @@
+#ifndef _SPARC64_RMAP_H
+#define _SPARC64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -uNr linux-2.5.22/include/linux/elevator.h linux-2.5.22-rmap13b/include/linux/elevator.h
--- linux-2.5.22/include/linux/elevator.h	Thu May  2 17:22:39 2002
+++ linux-2.5.22-rmap13b/include/linux/elevator.h	Tue Jun 18 13:47:39 2002
@@ -16,6 +16,8 @@
 
 typedef int (elevator_init_fn) (request_queue_t *, elevator_t *);
 typedef void (elevator_exit_fn) (request_queue_t *, elevator_t *);
+struct blkelv_ioctl_arg_s;
+typedef int (elevator_ioctl_fn)(elevator_t *, int cmd, struct blkelv_ioctl_arg_s *);
 
 struct elevator_s
 {
@@ -32,6 +34,8 @@
 	elevator_init_fn *elevator_init_fn;
 	elevator_exit_fn *elevator_exit_fn;
 
+	elevator_ioctl_fn *elevator_ioctl_fn;
+
 	void *elevator_data;
 };
 
@@ -68,8 +72,13 @@
 	int write_latency;
 	int max_bomb_segments;
 } blkelv_ioctl_arg_t;
-#define BLKELVGET   _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
-#define BLKELVSET   _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
+/*
+ * We used to have `sizeof(blkelv_ioctl_arg_t)' in here, but that
+ * was always wrong, and sizeof(sizeof(struct)) caused erratic behaviour
+ * from the compiler.  Change it to `int'. - akpm
+ */
+#define BLKELVGET   _IOR(0x12,106,int)
+#define BLKELVSET   _IOW(0x12,107,int)
 
 extern int elevator_init(request_queue_t *, elevator_t *, elevator_t);
 extern void elevator_exit(request_queue_t *, elevator_t *);
diff -uNr linux-2.5.22/include/linux/init_task.h linux-2.5.22-rmap13b/include/linux/init_task.h
--- linux-2.5.22/include/linux/init_task.h	Wed May 29 04:39:33 2002
+++ linux-2.5.22-rmap13b/include/linux/init_task.h	Tue Jun 18 13:47:39 2002
@@ -27,6 +27,7 @@
 	mmap_sem:	__RWSEM_INITIALIZER(name.mmap_sem), \
 	page_table_lock: SPIN_LOCK_UNLOCKED, 		\
 	mmlist:		LIST_HEAD_INIT(name.mmlist),	\
+	rlimit_rss:	RLIM_INFINITY,			\
 }
 
 #define INIT_SIGNALS {	\
diff -uNr linux-2.5.22/include/linux/mm.h linux-2.5.22-rmap13b/include/linux/mm.h
--- linux-2.5.22/include/linux/mm.h	Wed Jun 12 15:44:34 2002
+++ linux-2.5.22-rmap13b/include/linux/mm.h	Tue Jun 18 13:47:39 2002
@@ -19,9 +19,6 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-/* The inactive_clean lists are per zone. */
-extern struct list_head active_list;
-extern struct list_head inactive_list;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -130,6 +127,9 @@
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
 };
 
+/* forward declaration; pte_chain is meant to be internal to rmap.c */
+struct pte_chain;
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -154,6 +154,9 @@
 					   updated asynchronously */
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by pagemap_lru_lock !! */
+	unsigned char age;		/* Page aging counter. */
+	struct pte_chain * pte_chain;	/* Reverse pte mapping pointer.
+					 * protected by PG_chainlock */
 	unsigned long private;		/* mapping-private opaque data */
 
 	/*
@@ -291,13 +294,17 @@
 
 #define page_address(page) ((page)->virtual)
 
-#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
+#elif defined(CONFIG_DISCONTIGMEM)
+
+extern unsigned long page_address(struct page * page);
+
+#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */
 
 #define page_address(page)						\
 	__va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT)	\
 			+ page_zone(page)->zone_start_paddr)
 
-#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
+#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */
 
 /*
  * Error return values for the *_nopage functions
@@ -308,6 +315,7 @@
 /* The array of struct pages */
 extern struct page *mem_map;
 
+extern void FASTCALL(fixup_freespace(struct zone_struct *, int));
 extern void show_free_areas(void);
 extern void show_free_areas_node(pg_data_t *pgdat);
 
diff -uNr linux-2.5.22/include/linux/mm_inline.h linux-2.5.22-rmap13b/include/linux/mm_inline.h
--- linux-2.5.22/include/linux/mm_inline.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/include/linux/mm_inline.h	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,276 @@
+#ifndef _LINUX_MM_INLINE_H
+#define _LINUX_MM_INLINE_H
+
+#include <linux/mm.h>
+
+/*
+ * These inline functions tend to need bits and pieces of all the
+ * other VM include files, meaning they cannot be defined inside
+ * one of the other VM include files.
+ *
+ * The include file mess really needs to be cleaned up...
+ */
+
+static inline void add_page_to_active_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageActive(page);
+	list_add(&page->lru, &zone->active_list);
+	zone->active_pages++;
+	inc_page_state(nr_active_pages);
+}
+
+static inline void add_page_to_inactive_dirty_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageInactiveDirty(page);
+	list_add(&page->lru, &zone->inactive_dirty_list);
+	zone->inactive_dirty_pages++;
+	inc_page_state(nr_inactive_dirty_pages);
+}
+
+static inline void add_page_to_inactive_clean_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageInactiveClean(page);
+	list_add(&page->lru, &zone->inactive_clean_list);
+	zone->inactive_clean_pages++;
+	inc_page_state(nr_inactive_clean_pages);
+}
+
+static inline void del_page_from_active_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageActive(page);
+	dec_page_state(nr_active_pages);
+	zone->active_pages--;
+	DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_dirty_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageInactiveDirty(page);
+	dec_page_state(nr_inactive_dirty_pages);
+	zone->inactive_dirty_pages--;
+	DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_clean_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageInactiveClean(page);
+	zone->inactive_clean_pages--;
+	dec_page_state(nr_inactive_clean_pages);
+	DEBUG_LRU_PAGE(page);
+}
+
+/*
+ * Inline functions to control some balancing in the VM.
+ *
+ * Note that we do both global and per-zone balancing, with
+ * most of the balancing done globally.
+ */
+#define	PLENTY_FACTOR	2
+#define	ALL_ZONES	NULL
+#define	ANY_ZONE	(struct zone_struct *)(~0UL)
+#define INACTIVE_FACTOR	5
+
+#define	VM_MIN	0
+#define	VM_LOW	1
+#define	VM_HIGH	2
+#define VM_PLENTY 3
+static inline int zone_free_limit(struct zone_struct * zone, int limit)
+{
+	int free, target, delta;
+
+	/* This is really nasty, but GCC should completely optimise it away. */
+	if (limit == VM_MIN)
+		target = zone->pages_min;
+	else if (limit == VM_LOW)
+		target = zone->pages_low;
+	else if (limit == VM_HIGH)
+		target = zone->pages_high;
+	else
+		target = zone->pages_high * PLENTY_FACTOR;
+
+	free = zone->free_pages + zone->inactive_clean_pages;
+	delta = target - free;
+
+	return delta;
+}
+
+static inline int free_limit(struct zone_struct * zone, int limit)
+{
+	int shortage = 0, local;
+
+	if (zone == ALL_ZONES) {
+		for_each_zone(zone)
+			shortage += zone_free_limit(zone, limit);
+	} else if (zone == ANY_ZONE) {
+		for_each_zone(zone) {
+			local = zone_free_limit(zone, limit);
+			shortage += max(local, 0);
+		}
+	} else {
+		shortage = zone_free_limit(zone, limit);
+	}
+
+	return shortage;
+}
+
+/**
+ * free_min - test for critically low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a serious shortage of free and
+ * clean pages, zero or negative if there is no serious shortage.
+ */
+static inline int free_min(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_MIN);
+}
+
+/**
+ * free_low - test for low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a shortage of free and
+ * clean pages, zero or negative if there is no shortage.
+ */
+static inline int free_low(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_LOW);
+}
+
+/**
+ * free_high - test if amount of free pages is less than ideal
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free and clean
+ * pages is below kswapd's target, zero or negative if we
+ * have more than enough free and clean pages.
+ */
+static inline int free_high(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_HIGH);
+}
+
+/**
+ * free_plenty - test if enough pages are freed
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free + clean pages
+ * in a zone is not yet excessive and kswapd is still allowed to
+ * free pages here, a negative value if kswapd should leave the
+ * zone alone.
+ */
+static inline int free_plenty(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_PLENTY);
+}
+
+/*
+ * The inactive page target is the free target + 20% of (active + inactive)
+ * pages. 
+ */
+static inline int zone_inactive_limit(struct zone_struct * zone, int limit)
+{
+	int inactive, target, inactive_base;
+
+	inactive_base = zone->active_pages + zone->inactive_dirty_pages;
+	inactive_base /= INACTIVE_FACTOR;
+
+	/* GCC should optimise this away completely. */
+	if (limit == VM_MIN)
+		target = zone->pages_high + inactive_base / 2;
+	else if (limit == VM_LOW)
+		target = zone->pages_high + inactive_base;
+	else
+		target = zone->pages_high + inactive_base * 2;
+
+	inactive = zone->free_pages + zone->inactive_clean_pages;
+	inactive += zone->inactive_dirty_pages;
+
+	return target - inactive;
+}
+
+static inline int inactive_limit(struct zone_struct * zone, int limit)
+{
+	int shortage = 0, local;
+
+	if (zone == ALL_ZONES) {
+		for_each_zone(zone)
+			shortage += zone_inactive_limit(zone, limit);
+	} else if (zone == ANY_ZONE) {
+		for_each_zone(zone) {
+			local = zone_inactive_limit(zone, limit);
+			shortage += max(local, 0);
+		}
+	} else {
+		shortage = zone_inactive_limit(zone, limit);
+	}
+
+	return shortage;
+}
+
+/**
+ * inactive_min - test for serious shortage of (free + inactive clean) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no serious shortage of (free + inactive clean) pages
+ */
+static inline int inactive_min(struct zone_struct * zone)
+{
+	return inactive_limit(zone, VM_MIN);
+}
+
+/**
+ * inactive_low - test for shortage of (free + inactive clean) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no shortage of (free + inactive clean) pages
+ */
+static inline int inactive_low(struct zone_struct * zone)
+{
+	return inactive_limit(zone, VM_LOW);
+}
+
+/**
+ * inactive_high - less than ideal amount of (free + inactive) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have more than enough (free + inactive) pages
+ */
+static inline int inactive_high(struct zone_struct * zone)
+{
+	return inactive_limit(zone, VM_HIGH);
+}
+
+/*
+ * inactive_target - number of inactive pages we ought to have.
+ */
+static inline int inactive_target(void)
+{
+	struct page_state ps;
+	int target;
+
+	get_page_state(&ps);
+	target = ps.nr_active_pages + ps.nr_inactive_dirty_pages
+			+ ps.nr_inactive_clean_pages;
+
+	target /= INACTIVE_FACTOR;
+
+	return target;
+}
+
+#endif /* _LINUX_MM_INLINE_H */
diff -uNr linux-2.5.22/include/linux/mmzone.h linux-2.5.22-rmap13b/include/linux/mmzone.h
--- linux-2.5.22/include/linux/mmzone.h	Wed Jun 12 16:07:12 2002
+++ linux-2.5.22-rmap13b/include/linux/mmzone.h	Tue Jun 18 13:47:39 2002
@@ -25,6 +25,9 @@
 } free_area_t;
 
 struct pglist_data;
+struct pte_chain;
+
+#define MAX_CHUNKS_PER_NODE 8
 
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
@@ -40,13 +43,20 @@
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
-	int			need_balance;
+	unsigned long		active_pages;
+	unsigned long		inactive_dirty_pages;
+	unsigned long		inactive_clean_pages;
+	unsigned long		pages_min, pages_low, pages_high, pages_plenty;
 
 	/*
 	 * free areas of different sizes
 	 */
+	struct list_head	active_list;
+	struct list_head	inactive_dirty_list;
+	struct list_head	inactive_clean_list;
 	free_area_t		free_area[MAX_ORDER];
+	spinlock_t		pte_chain_freelist_lock;
+	struct pte_chain	*pte_chain_freelist;
 
 	/*
 	 * wait_table		-- the array holding the hash table
@@ -81,6 +91,13 @@
 	 */
 	struct pglist_data	*zone_pgdat;
 	struct page		*zone_mem_map;
+
+#if defined(CONFIG_DISCONTIGMEM) && !defined(CONFIG_HIGHMEM)
+	struct page             *zone_chunk_page_start[MAX_CHUNKS_PER_ZONE + 1];
+	unsigned long           zone_chunk_phys_start[MAX_CHUNKS_PER_ZONE];
+	unsigned int            zone_nr_chunks;
+#endif
+
 	unsigned long		zone_start_paddr;
 	unsigned long		zone_start_mapnr;
 
@@ -142,15 +159,6 @@
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
-static inline int memclass(zone_t *pgzone, zone_t *classzone)
-{
-	if (pgzone->zone_pgdat != classzone->zone_pgdat)
-		return 0;
-	if (pgzone > classzone)
-		return 0;
-	return 1;
-}
-
 /*
  * The following two are not meant for general usage. They are here as
  * prototypes for the discontig memory code.
@@ -163,6 +171,60 @@
 
 extern pg_data_t contig_page_data;
 
+/**
+ * for_each_pgdat - helper macro to iterate over all nodes
+ * @pgdat - pg_data_t * variable
+ *
+ * Meant to help with common loops of the form
+ * pgdat = pgdat_list;
+ * while(pgdat) {
+ * 	...
+ * 	pgdat = pgdat->node_next;
+ * }
+ */
+#define for_each_pgdat(pgdat) \
+	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ * Thanks to William Lee Irwin III for this piece of ingenuity.
+ */
+static inline zone_t *next_zone(zone_t *zone)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
+		zone++;
+
+	else if (pgdat->node_next) {
+		pgdat = pgdat->node_next;
+		zone = pgdat->node_zones;
+	} else
+		zone = NULL;
+
+	return zone;
+}
+
+/**
+ * for_each_zone - helper macro to iterate over all memory zones
+ * @zone - zone_t * variable
+ *
+ * The user only needs to declare the zone variable, for_each_zone
+ * fills it in. This basically means for_each_zone() is an
+ * easier to read version of this piece of code:
+ *
+ * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+ * 	for(i = 0; i < MAX_NR_ZONES; ++i) {
+ * 		zone_t * z = pgdat->node_zones + i;
+ * 		...
+ * 	}
+ * }
+ */
+#define for_each_zone(zone) \
+	for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
+
+
 #ifndef CONFIG_DISCONTIGMEM
 
 #define NODE_DATA(nid)		(&contig_page_data)
diff -uNr linux-2.5.22/include/linux/page-flags.h linux-2.5.22-rmap13b/include/linux/page-flags.h
--- linux-2.5.22/include/linux/page-flags.h	Wed Jun 12 15:44:34 2002
+++ linux-2.5.22-rmap13b/include/linux/page-flags.h	Tue Jun 18 13:52:52 2002
@@ -47,7 +47,7 @@
  * locked- and dirty-page accounting.  The top eight bits of page->flags are
  * used for page->zone, so putting flag bits there doesn't work.
  */
-#define PG_locked	 0	/* Page is locked. Don't touch. */
+#define PG_locked		 0	/* Page is locked. Don't touch. */
 #define PG_error		 1
 #define PG_referenced		 2
 #define PG_uptodate		 3
@@ -55,16 +55,19 @@
 #define PG_dirty_dontuse	 4
 #define PG_lru			 5
 #define PG_active		 6
-#define PG_slab			 7	/* slab debug (Suparna wants this) */
-
-#define PG_highmem		 8
-#define PG_checked		 9	/* kill me in 2.5.<early>. */
-#define PG_arch_1		10
-#define PG_reserved		11
-
-#define PG_private		12	/* Has something at ->private */
-#define PG_writeback		13	/* Page is under writeback */
-#define PG_nosave		15	/* Used for system suspend/resume */
+#define PG_inactive_clean	 7
+#define PG_inactive_dirty	 8
+#define PG_slab			 9	/* slab debug (Suparna wants this) */
+
+#define PG_highmem		10
+#define PG_checked		11	/* kill me in 2.5.<early>. */
+#define PG_arch_1		12
+#define PG_reserved		13
+
+#define PG_private		14	/* Has something at ->private */
+#define PG_writeback		15	/* Page is under writeback */
+#define PG_nosave		16	/* Used for system suspend/resume */
+#define PG_chainlock		17	/* lock bit for ->pte_chain */
 
 /*
  * Global page accounting.  One instance per CPU.
@@ -73,8 +76,9 @@
 	unsigned long nr_dirty;
 	unsigned long nr_writeback;
 	unsigned long nr_pagecache;
-	unsigned long nr_active;	/* on active_list LRU */
-	unsigned long nr_inactive;	/* on inactive_list LRU */
+	unsigned long nr_active_pages;	/* on active_list LRU */
+	unsigned long nr_inactive_clean_pages; /* on inactive_clean_list LRU */
+	unsigned long nr_inactive_dirty_pages; /* on inactive_dirty_list LRU */
 } ____cacheline_aligned_in_smp page_states[NR_CPUS];
 
 extern void get_page_state(struct page_state *ret);
@@ -150,12 +154,22 @@
 	})
 
 #define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
-#define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
-#define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
+#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
+#define ClearPageLRU(page)	clear_bit(PG_lru, &(page)->flags)
 
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
+#define TestandSetPageActive(page)	test_and_set_bit(PG_active, &(page)->flags)
+#define TestandClearPageActive(page)	test_and_clear_bit(PG_active, &(page)->flags)
+
+#define PageInactiveDirty(page)	test_bit(PG_inactive_dirty, &(page)->flags)
+#define SetPageInactiveDirty(page)	set_bit(PG_inactive_dirty, &(page)->flags)
+#define ClearPageInactiveDirty(page)	clear_bit(PG_inactive_dirty, &(page)->flags)
+
+#define PageInactiveClean(page)	test_bit(PG_inactive_clean, &(page)->flags)
+#define SetPageInactiveClean(page)	set_bit(PG_inactive_clean, &(page)->flags)
+#define ClearPageInactiveClean(page)	clear_bit(PG_inactive_clean, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
@@ -217,6 +231,29 @@
 #define TestClearPageNosave(page)	test_and_clear_bit(PG_nosave, &(page)->flags)
 
 /*
+ * inlines for acquisition and release of PG_chainlock
+ */
+static inline void pte_chain_lock(struct page *page)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+	while (test_and_set_bit(PG_chainlock, &page->flags)) {
+		while (test_bit(PG_chainlock, &page->flags))
+			cpu_relax();
+	}
+}
+
+static inline void pte_chain_unlock(struct page *page)
+{
+	clear_bit(PG_chainlock, &page->flags);
+}
+
+/*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
  * but it may again do so one day.
  */
diff -uNr linux-2.5.22/include/linux/sched.h linux-2.5.22-rmap13b/include/linux/sched.h
--- linux-2.5.22/include/linux/sched.h	Tue Jun 18 13:42:10 2002
+++ linux-2.5.22-rmap13b/include/linux/sched.h	Tue Jun 18 13:47:39 2002
@@ -193,6 +193,7 @@
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
 	unsigned long swap_address;
+	unsigned long rlimit_rss;
 
 	unsigned dumpable:1;
 
@@ -272,9 +273,6 @@
 	struct list_head tasks;
 
 	struct mm_struct *mm, *active_mm;
-	struct list_head local_pages;
-
-	unsigned int allocation_order, nr_local_pages;
 
 /* task state */
 	struct linux_binfmt *binfmt;
diff -uNr linux-2.5.22/include/linux/swap.h linux-2.5.22-rmap13b/include/linux/swap.h
--- linux-2.5.22/include/linux/swap.h	Wed Jun 12 16:06:35 2002
+++ linux-2.5.22-rmap13b/include/linux/swap.h	Tue Jun 18 13:47:39 2002
@@ -120,18 +120,38 @@
 struct address_space;
 struct zone_t;
 
+/* linux/mm/rmap.c */
+extern int FASTCALL(page_referenced(struct page *));
+extern void FASTCALL(page_add_rmap(struct page *, pte_t *));
+extern void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+extern int FASTCALL(try_to_unmap(struct page *));
+extern int FASTCALL(page_over_rsslimit(struct page *));
+
+/* return values of try_to_unmap */
+#define	SWAP_SUCCESS	0
+#define	SWAP_AGAIN	1
+#define	SWAP_FAIL	2
+#define	SWAP_ERROR	3
+
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
 extern void FASTCALL(__lru_cache_del(struct page *));
 extern void FASTCALL(lru_cache_del(struct page *));
 
 extern void FASTCALL(activate_page(struct page *));
+extern void FASTCALL(activate_page_nolock(struct page *));
+extern void FASTCALL(deactivate_page(struct page *));
+extern void FASTCALL(deactivate_page_nolock(struct page *));
+extern void FASTCALL(drop_page(struct page *));
 
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
+extern struct page * FASTCALL(reclaim_page(zone_t *));
 extern wait_queue_head_t kswapd_wait;
-extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
+extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask));
+extern void wakeup_kswapd(unsigned int);
+extern void rss_free_pages(unsigned int);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *);
@@ -145,6 +165,7 @@
 extern void show_swap_cache_info(void);
 #endif
 extern int add_to_swap_cache(struct page *, swp_entry_t);
+extern int add_to_swap(struct page *);
 extern void __delete_from_swap_cache(struct page *page);
 extern void delete_from_swap_cache(struct page *page);
 extern int move_to_swap_cache(struct page *page, swp_entry_t entry);
@@ -182,43 +203,26 @@
 extern void FASTCALL(mark_page_accessed(struct page *));
 
 /*
+ * Page aging defines. These seem to work great in FreeBSD,
+ * no need to reinvent the wheel.
+ */
+#define PAGE_AGE_START 5
+#define PAGE_AGE_ADV 3
+#define PAGE_AGE_DECL 1
+#define PAGE_AGE_MAX 64
+
+/*
  * List add/del helper macros. These must be called
  * with the pagemap_lru_lock held!
  */
 #define DEBUG_LRU_PAGE(page)			\
 do {						\
-	if (!PageLRU(page))			\
-		BUG();				\
 	if (PageActive(page))			\
 		BUG();				\
-} while (0)
-
-#define add_page_to_active_list(page)		\
-do {						\
-	DEBUG_LRU_PAGE(page);			\
-	SetPageActive(page);			\
-	list_add(&(page)->lru, &active_list);	\
-	inc_page_state(nr_active);		\
-} while (0)
-
-#define add_page_to_inactive_list(page)		\
-do {						\
-	DEBUG_LRU_PAGE(page);			\
-	list_add(&(page)->lru, &inactive_list);	\
-	inc_page_state(nr_inactive);		\
-} while (0)
-
-#define del_page_from_active_list(page)		\
-do {						\
-	list_del(&(page)->lru);			\
-	ClearPageActive(page);			\
-	dec_page_state(nr_active);		\
-} while (0)
-
-#define del_page_from_inactive_list(page)	\
-do {						\
-	list_del(&(page)->lru);			\
-	dec_page_state(nr_inactive);		\
+	if (PageInactiveDirty(page))		\
+		BUG();				\
+	if (PageInactiveClean(page))		\
+		BUG();				\
 } while (0)
 
 extern spinlock_t swaplock;
diff -uNr linux-2.5.22/kernel/fork.c linux-2.5.22-rmap13b/kernel/fork.c
--- linux-2.5.22/kernel/fork.c	Tue Jun 18 13:42:10 2002
+++ linux-2.5.22-rmap13b/kernel/fork.c	Tue Jun 18 13:47:39 2002
@@ -189,7 +189,6 @@
 	mm->map_count = 0;
 	mm->rss = 0;
 	mm->cpu_vm_mask = 0;
-	mm->swap_address = 0;
 	pprev = &mm->mmap;
 
 	/*
@@ -308,9 +307,6 @@
 void mmput(struct mm_struct *mm)
 {
 	if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
-		extern struct mm_struct *swap_mm;
-		if (swap_mm == mm)
-			swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
 		list_del(&mm->mmlist);
 		mmlist_nr--;
 		spin_unlock(&mmlist_lock);
@@ -703,8 +699,6 @@
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
-	INIT_LIST_HEAD(&p->local_pages);
-
 	retval = -ENOMEM;
 	/* copy all the process information */
 	if (copy_semundo(clone_flags, p))
diff -uNr linux-2.5.22/kernel/sys.c linux-2.5.22-rmap13b/kernel/sys.c
--- linux-2.5.22/kernel/sys.c	Wed May 29 04:19:50 2002
+++ linux-2.5.22-rmap13b/kernel/sys.c	Tue Jun 18 13:47:39 2002
@@ -1163,6 +1163,12 @@
 	if (resource == RLIMIT_NOFILE) {
 		if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
 			return -EPERM;
+	} else if (resource == RLIMIT_RSS && current->mm) {
+		/* rlimit is specified in bytes, convert to pages */
+		unsigned long pages = RLIM_INFINITY;
+		if (new_rlim.rlim_cur != RLIM_INFINITY)
+			pages = new_rlim.rlim_cur >> PAGE_SHIFT;
+		current->mm->rlimit_rss = pages;
 	}
 	*old_rlim = new_rlim;
 	return 0;
diff -uNr linux-2.5.22/mm/Makefile linux-2.5.22-rmap13b/mm/Makefile
--- linux-2.5.22/mm/Makefile	Thu May  2 17:22:54 2002
+++ linux-2.5.22-rmap13b/mm/Makefile	Tue Jun 18 13:47:39 2002
@@ -16,6 +16,6 @@
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
 	    shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
-	    pdflush.o page-writeback.o
+	    pdflush.o page-writeback.o rmap.o
 
 include $(TOPDIR)/Rules.make
diff -uNr linux-2.5.22/mm/bootmem.c linux-2.5.22-rmap13b/mm/bootmem.c
--- linux-2.5.22/mm/bootmem.c	Wed Jun 12 16:07:13 2002
+++ linux-2.5.22-rmap13b/mm/bootmem.c	Tue Jun 18 13:47:39 2002
@@ -339,12 +339,11 @@
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
-	while (pgdat) {
+	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
 						align, goal)))
 			return(ptr);
-		pgdat = pgdat->node_next;
-	}
+
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
 	 */
diff -uNr linux-2.5.22/mm/filemap.c linux-2.5.22-rmap13b/mm/filemap.c
--- linux-2.5.22/mm/filemap.c	Tue Jun 18 13:42:11 2002
+++ linux-2.5.22-rmap13b/mm/filemap.c	Tue Jun 18 13:47:39 2002
@@ -14,6 +14,7 @@
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
@@ -176,6 +177,10 @@
  */
 static void truncate_complete_page(struct page *page)
 {
+	/* Page has already been removed from processes, by vmtruncate()  */
+	if (page->pte_chain)
+		BUG();
+
 	/* Leave it on the LRU if it gets converted into anonymous buffers */
 	if (!PagePrivate(page) || do_invalidatepage(page, 0))
 		lru_cache_del(page);
@@ -643,7 +648,7 @@
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
  * The first mb is necessary to safely close the critical section opened by the
- * TryLockPage(), the second mb is necessary to enforce ordering between
+ * TestSetPageLocked(), the second mb is necessary to enforce ordering between
  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
  * parallel wait_on_page_locked()).
  */
@@ -862,9 +867,7 @@
 	return find_or_create_page(mapping, index, mapping->gfp_mask);
 }
 
-
-/*
- * Same as grab_cache_page, but do not wait if the page is unavailable.
+/* Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
  * be safe to call while holding the lock for another page.
@@ -913,16 +916,23 @@
 /*
  * Mark a page as having seen activity.
  *
- * If it was already so marked, move it
- * to the active queue and drop the referenced
- * bit. Otherwise, just mark it for future
- * action..
+ * We immediately reclaim
+ * the inactive clean pages because those are counted as freeable.
+ * We don't modify the inactive dirty ones because we're never sure
+ * if those are freeable anyway.
  */
 void mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page)) {
+	if (PageInactiveClean(page)) {
+		struct zone_struct *zone = page_zone(page);
+		int free = zone->free_pages + zone->inactive_clean_pages;
+
 		activate_page(page);
-		ClearPageReferenced(page);
+		if (free < zone->pages_low)
+			wakeup_kswapd(GFP_NOIO);
+		if (zone->free_pages < zone->pages_min)
+			fixup_freespace(zone, 1);
+
 		return;
 	}
 
@@ -1429,7 +1439,7 @@
 
 	/* Limit it to a sane percentage of the inactive list.. */
 	get_page_state(&ps);
-	max = ps.nr_inactive / 2;
+	max = ps.nr_inactive_clean_pages / 2;
 	if (nr > max)
 		nr = max;
 
@@ -2227,16 +2237,18 @@
 	}
 
 	do {
-		unsigned long index;
-		unsigned long offset;
+		unsigned long index, offset;
 		long page_fault;
 		char *kaddr;
+		int deactivate = 1;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
+		if (bytes > count) {
 			bytes = count;
+			deactivate = 0;
+		}
 
 		/*
 		 * Bring in the user page that we will copy from _first_.
@@ -2286,8 +2298,11 @@
 			}
 		}
 		kunmap(page);
-		SetPageReferenced(page);
 		unlock_page(page);
+		if (deactivate)
+			deactivate_page(page);
+		else
+			mark_page_accessed(page);
 		page_cache_release(page);
 		if (status < 0)
 			break;
diff -uNr linux-2.5.22/mm/memory.c linux-2.5.22-rmap13b/mm/memory.c
--- linux-2.5.22/mm/memory.c	Tue Jun 18 13:42:11 2002
+++ linux-2.5.22-rmap13b/mm/memory.c	Tue Jun 18 13:47:39 2002
@@ -44,8 +44,10 @@
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/mm_inline.h>
 
 #include <asm/pgalloc.h>
+#include <asm/rmap.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
@@ -79,8 +81,7 @@
  */
 static inline void free_one_pmd(mmu_gather_t *tlb, pmd_t * dir)
 {
-	struct page *pte;
-
+	struct page *page;
 	if (pmd_none(*dir))
 		return;
 	if (pmd_bad(*dir)) {
@@ -88,9 +89,10 @@
 		pmd_clear(dir);
 		return;
 	}
-	pte = pmd_page(*dir);
+	page = pmd_page(*dir);
 	pmd_clear(dir);
-	pte_free_tlb(tlb, pte);
+	pgtable_remove_rmap(page); 
+	pte_free_tlb(tlb, page);
 }
 
 static inline void free_one_pgd(mmu_gather_t *tlb, pgd_t * dir)
@@ -150,6 +152,7 @@
 			pte_free(new);
 			goto out;
 		}
+		pgtable_add_rmap(new, mm, address);
 		pmd_populate(mm, pmd, new);
 	}
 out:
@@ -177,6 +180,7 @@
 			pte_free_kernel(new);
 			goto out;
 		}
+		pgtable_add_rmap_kernel(new, mm, address);
 		pmd_populate_kernel(mm, pmd, new);
 	}
 out:
@@ -260,10 +264,13 @@
 
 				if (pte_none(pte))
 					goto cont_copy_pte_range_noset;
+				/* pte contains position in swap, so copy. */
 				if (!pte_present(pte)) {
 					swap_duplicate(pte_to_swp_entry(pte));
-					goto cont_copy_pte_range;
+					set_pte(dst_pte, pte);
+					goto cont_copy_pte_range_noset;
 				}
+				ptepage = pte_page(pte);
 				pfn = pte_pfn(pte);
 				if (!pfn_valid(pfn))
 					goto cont_copy_pte_range;
@@ -272,7 +279,7 @@
 					goto cont_copy_pte_range;
 
 				/* If it's a COW mapping, write protect it both in the parent and the child */
-				if (cow && pte_write(pte)) {
+				if (cow) {
 					ptep_set_wrprotect(src_pte);
 					pte = *src_pte;
 				}
@@ -285,6 +292,7 @@
 				dst->rss++;
 
 cont_copy_pte_range:		set_pte(dst_pte, pte);
+				page_add_rmap(ptepage, dst_pte);
 cont_copy_pte_range_noset:	address += PAGE_SIZE;
 				if (address >= end) {
 					pte_unmap_nested(src_pte);
@@ -342,6 +350,7 @@
 					if (pte_dirty(pte))
 						set_page_dirty(page);
 					tlb->freed++;
+					page_remove_rmap(page, ptep);
 					tlb_remove_page(tlb, page);
 				}
 			}
@@ -992,7 +1001,9 @@
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
 			++mm->rss;
+		page_remove_rmap(old_page, page_table);
 		break_cow(vma, new_page, address, page_table);
+		page_add_rmap(new_page, page_table);
 		lru_cache_add(new_page);
 
 		/* Free the old page.. */
@@ -1110,6 +1121,10 @@
 	struct page *new_page;
 	unsigned long offset;
 
+	/* Low on free memory ?  Don't make things worse. */
+	if (free_low(ALL_ZONES) < 0)
+		return;
+
 	/*
 	 * Get the number of handles we should do readahead io to.
 	 */
@@ -1192,6 +1207,7 @@
 	flush_page_to_ram(page);
 	flush_icache_page(vma, page);
 	set_pte(page_table, pte);
+	page_add_rmap(page, page_table);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
@@ -1208,14 +1224,13 @@
 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, pmd_t *pmd, int write_access, unsigned long addr)
 {
 	pte_t entry;
+	struct page * page = ZERO_PAGE(addr);
 
 	/* Read-only mapping of ZERO_PAGE. */
 	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 
 	/* ..except if it's a write access */
 	if (write_access) {
-		struct page *page;
-
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
@@ -1241,6 +1256,7 @@
 	}
 
 	set_pte(page_table, entry);
+	page_add_rmap(page, page_table); /* ignores ZERO_PAGE */
 	pte_unmap(page_table);
 
 	/* No need to invalidate - it was non-present before */
@@ -1297,6 +1313,8 @@
 		new_page = page;
 	}
 
+	mark_page_accessed(new_page);
+
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 
@@ -1319,7 +1337,9 @@
 		if (write_access)
 			entry = pte_mkwrite(pte_mkdirty(entry));
 		set_pte(page_table, entry);
+		page_add_rmap(new_page, page_table);
 		pte_unmap(page_table);
+		
 	} else {
 		/* One of our sibling threads was faster, back out. */
 		pte_unmap(page_table);
@@ -1398,6 +1418,14 @@
 	current->state = TASK_RUNNING;
 	pgd = pgd_offset(mm, address);
 
+	/* 
+	 * If we are over our RSS limit and the system needs memory,
+	 * we will free memory for the non-hogs and slow down a bit.
+	 */
+	if (mm->rlimit_rss && mm->rss > mm->rlimit_rss &&
+					free_high(ALL_ZONES) > 0)
+		rss_free_pages(GFP_HIGHUSER);
+
 	/*
 	 * We need the page table lock to synchronize with kswapd
 	 * and the SMP-safe atomic PTE updates.
diff -uNr linux-2.5.22/mm/mremap.c linux-2.5.22-rmap13b/mm/mremap.c
--- linux-2.5.22/mm/mremap.c	Thu May  2 17:22:54 2002
+++ linux-2.5.22-rmap13b/mm/mremap.c	Tue Jun 18 13:47:39 2002
@@ -68,8 +68,14 @@
 {
 	int error = 0;
 	pte_t pte;
+	struct page * page = NULL;
+
+	if (pte_present(*src))
+		page = pte_page(*src);
 
 	if (!pte_none(*src)) {
+		if (page)
+			page_remove_rmap(page, src);
 		pte = ptep_get_and_clear(src);
 		if (!dst) {
 			/* No dest?  We must put it back. */
@@ -77,6 +83,8 @@
 			error++;
 		}
 		set_pte(dst, pte);
+		if (page)
+			page_add_rmap(page, dst);
 	}
 	return error;
 }
diff -uNr linux-2.5.22/mm/numa.c linux-2.5.22-rmap13b/mm/numa.c
--- linux-2.5.22/mm/numa.c	Wed Jun 12 15:44:34 2002
+++ linux-2.5.22-rmap13b/mm/numa.c	Tue Jun 18 13:47:39 2002
@@ -44,6 +44,57 @@
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
+#ifndef CONFIG_HIGHMEM
+unsigned long page_address(struct page * page)
+{
+	struct zone_struct * zone;
+	struct page * chunk_page_start;
+	unsigned long chunk_phys_addr;
+	int idx_chunk;
+
+	zone = page_zone(page);
+	/*
+	 * We have to check if the page is on
+	 * a chunk that contains pages from 2 zones.
+	 */
+	if(!(page < zone->zone_chunk_page_start[0]))
+		goto known_zone;
+	/*
+	 * We need to get the previous zone.
+	 * If there is no such zone, we are in trouble.
+	 */
+	if(!page->zone)
+		BUG();
+
+	zone = zone_table[(page->zone) - 1];
+
+	if(zone->zone_pgdat->node_id == page_zone(page)->zone_pgdat->node_id)
+		goto known_zone;
+	/*
+	 * Getting here means we have a chunk spread over 2 nodes.
+	 * That shouldn't happen.
+	 */
+	BUG();
+
+ known_zone:
+	for(idx_chunk = 0 ; idx_chunk < MAX_CHUNKS_PER_ZONE ; idx_chunk++){
+		if(page >= zone->zone_chunk_page_start[idx_chunk] &&
+		   page < zone->zone_chunk_page_start[idx_chunk + 1])
+			break;
+	}
+	/*
+	 * We know which chunk the page belongs to.
+	 */
+	chunk_phys_addr = zone->zone_chunk_phys_start[idx_chunk];
+	chunk_page_start = zone->zone_chunk_page_start[idx_chunk];
+	return (unsigned long)__va(chunk_phys_addr +
+				   ((page - chunk_page_start) << PAGE_SHIFT ));
+
+
+
+}
+#endif
+
 static spinlock_t node_lock = SPIN_LOCK_UNLOCKED;
 
 void show_free_areas_node(pg_data_t *pgdat)
diff -uNr linux-2.5.22/mm/oom_kill.c linux-2.5.22-rmap13b/mm/oom_kill.c
--- linux-2.5.22/mm/oom_kill.c	Thu May  2 17:22:37 2002
+++ linux-2.5.22-rmap13b/mm/oom_kill.c	Tue Jun 18 13:47:39 2002
@@ -168,7 +168,8 @@
 static void oom_kill(void)
 {
 	struct task_struct *p, *q;
-	
+	extern wait_queue_head_t kswapd_done;
+
 	read_lock(&tasklist_lock);
 	p = select_bad_process();
 
@@ -182,6 +183,9 @@
 	}
 	read_unlock(&tasklist_lock);
 
+	/* Chances are by this time our victim is sleeping on kswapd. */
+	wake_up(&kswapd_done);
+
 	/*
 	 * Make kswapd go out of the way, so "p" has a good chance of
 	 * killing itself before someone else gets the chance to ask
diff -uNr linux-2.5.22/mm/page-writeback.c linux-2.5.22-rmap13b/mm/page-writeback.c
--- linux-2.5.22/mm/page-writeback.c	Wed Jun 12 16:06:35 2002
+++ linux-2.5.22-rmap13b/mm/page-writeback.c	Tue Jun 18 13:47:39 2002
@@ -258,7 +258,6 @@
 int generic_vm_writeback(struct page *page, int *nr_to_write)
 {
 	struct inode *inode = page->mapping->host;
-
 	/*
 	 * We don't own this inode, and we don't want the address_space
 	 * vanishing while writeback is walking its pages.
@@ -320,7 +319,7 @@
  * If a page is already under I/O, generic_writepages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
- * and msync() need to guarentee that all the data which was dirty at the time
+ * and msync() need to guarantee that all the data which was dirty at the time
  * the call was made get new I/O started against them.  The way to do this is
  * to run filemap_fdatawait() before calling filemap_fdatawrite().
  *
@@ -363,15 +362,6 @@
 		/* It may have been removed from swapcache: check ->mapping */
 		if (page->mapping && TestClearPageDirty(page) &&
 					!PageWriteback(page)) {
-			/* FIXME: batch this up */
-			if (!PageActive(page) && PageLRU(page)) {
-				spin_lock(&pagemap_lru_lock);
-				if (!PageActive(page) && PageLRU(page)) {
-					list_del(&page->lru);
-					list_add(&page->lru, &inactive_list);
-				}
-				spin_unlock(&pagemap_lru_lock);
-			}
 			err = writepage(page);
 			if (!ret)
 				ret = err;
diff -uNr linux-2.5.22/mm/page_alloc.c linux-2.5.22-rmap13b/mm/page_alloc.c
--- linux-2.5.22/mm/page_alloc.c	Wed Jun 12 16:07:13 2002
+++ linux-2.5.22-rmap13b/mm/page_alloc.c	Tue Jun 18 13:56:16 2002
@@ -14,12 +14,11 @@
 
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/swap.h>
-#include <linux/swapctl.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
-#include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -27,8 +26,6 @@
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 int nr_swap_pages;
-LIST_HEAD(active_list);
-LIST_HEAD(inactive_list);
 pg_data_t *pgdat_list;
 
 /*
@@ -42,6 +39,8 @@
 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, };
+static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, };
 
 /*
  * Temporary debugging check.
@@ -87,18 +86,19 @@
 		BUG();
 	if (PageLocked(page))
 		BUG();
-	if (PageLRU(page))
-		BUG();
 	if (PageActive(page))
 		BUG();
+        if (PageInactiveDirty(page))
+                BUG();
+        if (PageInactiveClean(page))
+                BUG();
+        if (page->pte_chain)
+                BUG();
 	if (PageWriteback(page))
 		BUG();
 	ClearPageDirty(page);
 	page->flags &= ~(1<<PG_referenced);
-
-	if (current->flags & PF_FREE_PAGES)
-		goto local_freelist;
- back_local_freelist:
+	page->age = PAGE_AGE_START;
 
 	zone = page_zone(page);
 
@@ -146,17 +146,6 @@
 	list_add(&(base + page_idx)->list, &area->free_list);
 
 	spin_unlock_irqrestore(&zone->lock, flags);
-	return;
-
- local_freelist:
-	if (current->nr_local_pages)
-		goto back_local_freelist;
-	if (in_interrupt())
-		goto back_local_freelist;		
-
-	list_add(&page->list, &current->local_pages);
-	page->index = order;
-	current->nr_local_pages++;
 }
 
 #define MARK_USED(index, order, area) \
@@ -215,10 +204,7 @@
 			set_page_count(page, 1);
 			if (BAD_RANGE(zone,page))
 				BUG();
-			if (PageLRU(page))
-				BUG();
-			if (PageActive(page))
-				BUG();
+			DEBUG_LRU_PAGE(page);
 			return page;	
 		}
 		curr_order++;
@@ -261,76 +247,83 @@
 }
 #endif
 
-static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
-static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
+/*
+ * If we are able to directly reclaim pages, we move pages from the
+ * inactive_clean list onto the free list until the zone has enough
+ * free pages or until the inactive_clean pages are exhausted.
+ * If we cannot do this work ourselves, call kswapd.
+ */
+void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim));
+void fixup_freespace(zone_t * zone, int direct_reclaim)
+{
+	if (direct_reclaim) {
+		struct page * page;
+		do {
+			if ((page = reclaim_page(zone)))
+				__free_pages_ok(page, 0);
+		} while (page && zone->free_pages <= zone->pages_min);
+	} else
+		wakeup_kswapd(GFP_ATOMIC);
+}
+
+#define PAGES_KERNEL	0
+#define PAGES_MIN	1
+#define PAGES_LOW	2
+#define PAGES_HIGH	3
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+			unsigned long order, int limit, int direct_reclaim)
 {
-	struct page * page = NULL;
-	int __freed = 0;
-
-	if (!(gfp_mask & __GFP_WAIT))
-		goto out;
-	if (in_interrupt())
-		BUG();
-
-	current->allocation_order = order;
-	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
+	zone_t **zone = zonelist->zones;
+	unsigned long water_mark = 0;
 
-	__freed = try_to_free_pages(classzone, gfp_mask, order);
-
-	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
+	for (;;) {
+		zone_t *z = *(zone++);
 
-	if (current->nr_local_pages) {
-		struct list_head * entry, * local_pages;
-		struct page * tmp;
-		int nr_pages;
-
-		local_pages = &current->local_pages;
-
-		if (likely(__freed)) {
-			/* pick from the last inserted so we're lifo */
-			entry = local_pages->next;
-			do {
-				tmp = list_entry(entry, struct page, list);
-				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
-					list_del(entry);
-					current->nr_local_pages--;
-					set_page_count(tmp, 1);
-					page = tmp;
-
-					if (PagePrivate(page))
-						BUG();
-					if (page->mapping)
-						BUG();
-					if (PageLocked(page))
-						BUG();
-					if (PageLRU(page))
-						BUG();
-					if (PageActive(page))
-						BUG();
-					if (PageDirty(page))
-						BUG();
-					if (PageWriteback(page))
-						BUG();
+		if (!z)
+			break;
+		if (!z->size)
+			BUG();
 
-					break;
-				}
-			} while ((entry = entry->next) != local_pages);
+		/*
+		 * We allocate if the number of (free + inactive_clean)
+		 * pages is above the watermark.
+		 */
+		switch (limit) {
+			case PAGES_KERNEL:
+				water_mark = z->pages_min / 2;
+				break;
+			case PAGES_MIN:
+				water_mark = z->pages_min;
+				break;
+			case PAGES_LOW:
+				water_mark = z->pages_low;
+				break;
+			default:
+			case PAGES_HIGH:
+				water_mark = z->pages_high;
 		}
 
-		nr_pages = current->nr_local_pages;
-		/* free in reverse order so that the global order will be lifo */
-		while ((entry = local_pages->prev) != local_pages) {
-			list_del(entry);
-			tmp = list_entry(entry, struct page, list);
-			__free_pages_ok(tmp, tmp->index);
-			if (!nr_pages--)
-				BUG();
+		if (z->free_pages + z->inactive_clean_pages >= water_mark) {
+			struct page *page = NULL;
+			/* If possible, reclaim a page directly. */
+			if (direct_reclaim)
+				page = reclaim_page(z);
+			/* If that fails, fall back to rmqueue. */
+			if (!page)
+				page = rmqueue(z, order);
+			if (page)
+				return page;
 		}
-		current->nr_local_pages = 0;
 	}
- out:
-	*freed = __freed;
-	return page;
+
+	/* Found nothing. */
+	return NULL;
 }
 
 /*
@@ -338,107 +331,248 @@
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 {
-	unsigned long min;
-	zone_t **zone, * classzone;
+	zone_t **zone;
+	int min, direct_reclaim = 0;
 	struct page * page;
-	int freed;
 
+	/*
+	 * (If anyone calls gfp from interrupts nonatomically then it
+	 * will sooner or later tripped up by a schedule().)
+	 *
+	 * We fall back to lower-level zones if allocation
+	 * in a higher zone fails.
+	 */
+
+	/*
+	 * Can we take pages directly from the inactive_clean
+	 * list?
+	 */
+	if (order == 0 && (gfp_mask & __GFP_WAIT))
+		direct_reclaim = 1;
+
+try_again:
+	/*
+	 * First, see if we have any zones with lots of free memory.
+	 *
+	 * We allocate free memory first because it doesn't contain
+	 * any data we would want to cache.
+	 */
 	zone = zonelist->zones;
-	classzone = *zone;
-	if (classzone == NULL)
+	if (!*zone)
 		return NULL;
 	min = 1UL << order;
 	for (;;) {
 		zone_t *z = *(zone++);
 		if (!z)
 			break;
+		if (!z->size)
+			BUG();
 
-		min += z->pages_low;
+		min += z->pages_min;
 		if (z->free_pages > min) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
-		}
+		} else if (z->free_pages < z->pages_min)
+			fixup_freespace(z, direct_reclaim);
 	}
 
-	classzone->need_balance = 1;
-	mb();
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	/*
+	 * Next, try to allocate a page from a zone with a HIGH
+	 * amount of (free + inactive_clean) pages.
+	 *
+	 * If there is a lot of activity, inactive_target
+	 * will be high and we'll have a good chance of
+	 * finding a page using the HIGH limit.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+	if (page)
+		return page;
 
+	/*
+	 * Then try to allocate a page from a zone with more
+	 * than zone->pages_low of (free + inactive_clean) pages.
+	 *
+	 * When the working set is very large and VM activity
+	 * is low, we're most likely to have our allocation
+	 * succeed here.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+	if (page)
+		return page;
+
+	/*
+	 * OK, none of the zones on our zonelist has lots
+	 * of pages free.
+	 *
+	 * We wake up kswapd, in the hope that kswapd will
+	 * resolve this situation before memory gets tight.
+	 *
+	 * We'll also help a bit trying to free pages, this
+	 * way statistics will make sure really fast allocators
+	 * are slowed down more than slow allocators and other
+	 * programs in the system shouldn't be impacted as much
+	 * by the hogs.
+	 */
+	wakeup_kswapd(gfp_mask);
+
+	/*
+	 * After waking up kswapd, we try to allocate a page
+	 * from any zone which isn't critical yet.
+	 *
+	 * Kswapd should, in most situations, bring the situation
+	 * back to normal in no time.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+	if (page)
+		return page;
+
+	/*
+	 * Kernel allocations can eat a few emergency pages.
+	 * We should be able to run without this, find out why
+	 * the SCSI layer isn't happy ...
+	 */
+	if (gfp_mask & __GFP_HIGH) {
+		page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim);
+		if (page)
+			return page;
+	}
+
+	/*
+	 * Oh well, we didn't succeed.
+	 */
+	if (!(current->flags & PF_MEMALLOC)) {
+		/*
+		 * Are we dealing with a higher order allocation?
+		 *
+		 * If so, try to defragment some memory.
+		 */
+		if (order > 0 && (gfp_mask & __GFP_WAIT))
+			goto defragment;
+
+		/*
+		 * If we arrive here, we are really tight on memory.
+		 * Since kswapd didn't succeed in freeing pages for us,
+		 * we need to help it.
+		 *
+		 * Single page allocs loop until the allocation succeeds.
+		 * Multi-page allocs can fail due to memory fragmentation;
+		 * in that case we bail out to prevent infinite loops and
+		 * hanging device drivers ...
+		 *
+		 * Another issue are GFP_NOFS allocations; because they
+		 * do not have __GFP_FS set it's possible we cannot make
+		 * any progress freeing pages, in that case it's better
+		 * to give up than to deadlock the kernel looping here.
+		 *
+		 * NFS: we must yield the CPU (to rpciod) to avoid deadlock.
+		 */
+		if (gfp_mask & __GFP_WAIT) {
+			__set_current_state(TASK_RUNNING);
+			yield();
+			if (!order || free_high(ALL_ZONES) >= 0) {
+				int progress = try_to_free_pages(gfp_mask);
+				if (progress || (gfp_mask & __GFP_FS))
+					goto try_again;
+				/*
+				 * Fail if no progress was made and the
+				 * allocation may not be able to block on IO.
+				 */
+				return NULL;
+			}
+		}
+	}
+
+	/*
+	 * Final phase: allocate anything we can!
+	 *
+	 * Higher order allocations, GFP_ATOMIC allocations and
+	 * recursive allocations (PF_MEMALLOC) end up here.
+	 *
+	 * Only recursive allocations can use the very last pages
+	 * in the system, otherwise it would be just too easy to
+	 * deadlock the system...
+	 */
 	zone = zonelist->zones;
 	min = 1UL << order;
 	for (;;) {
-		unsigned long local_min;
 		zone_t *z = *(zone++);
+		struct page * page = NULL;
 		if (!z)
 			break;
 
-		local_min = z->pages_min;
-		if (!(gfp_mask & __GFP_WAIT))
-			local_min >>= 2;
-		min += local_min;
-		if (z->free_pages > min) {
+		/*
+		 * SUBTLE: direct_reclaim is only possible if the task
+		 * becomes PF_MEMALLOC while looping above. This will
+		 * happen when the OOM killer selects this task for
+		 * death.
+		 */
+		if (direct_reclaim) {
+			page = reclaim_page(z);
+			if (page)
+				return page;
+		}
+
+		/* XXX: is pages_min/4 a good amount to reserve for this? */
+		min += z->pages_min / 4;
+		if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
 		}
 	}
+	goto out_failed;
 
-	/* here we're in the low on memory slow path */
 
-rebalance:
-	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
+	/*
+	 * Naive "defragmentation" for higher-order allocations. First we
+	 * free the inactive_clean pages to see if we can allocate our
+	 * allocation, then we call page_launder() to clean some dirty
+	 * pages, and last we try once more.
+	 *
+	 * We might want to turn this into something which defragments
+	 * memory based on physical page, simply by looking for unmapped
+	 * pages next to pages on the free list...
+	 */
+defragment:
+	{
+		int freed = 0;
+defragment_again:
 		zone = zonelist->zones;
 		for (;;) {
 			zone_t *z = *(zone++);
 			if (!z)
 				break;
-
-			page = rmqueue(z, order);
-			if (page)
-				return page;
-		}
-nopage:
-		if (!(current->flags & PF_RADIX_TREE)) {
-			printk("%s: page allocation failure."
-				" order:%d, mode:0x%x\n",
-				current->comm, order, gfp_mask);
+			if (!z->size)
+				continue;
+			while (z->inactive_clean_pages) {
+				struct page * page;
+				/* Move one page to the free list. */
+				page = reclaim_page(z);
+				if (!page)
+					break;
+				__free_page(page);
+				/* Try if the allocation succeeds. */
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
 		}
-		return NULL;
-	}
-
-	/* Atomic allocations - we can't balance anything */
-	if (!(gfp_mask & __GFP_WAIT))
-		goto nopage;
 
-	page = balance_classzone(classzone, gfp_mask, order, &freed);
-	if (page)
-		return page;
-
-	zone = zonelist->zones;
-	min = 1UL << order;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-
-		min += z->pages_min;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
+		/* XXX: do real defragmentation instead of calling launder ? */
+		if (!freed & !(current->flags & PF_MEMALLOC)) {
+			freed = 1;
+			current->flags |= PF_MEMALLOC;
+			try_to_free_pages(gfp_mask);
+			current->flags &= ~PF_MEMALLOC;
+			goto defragment_again;
 		}
 	}
 
-	/* Don't let big-order allocations loop */
-	if (order > 3)
-		goto nopage;
-
-	/* Yield for kswapd, and try again */
-	__set_current_state(TASK_RUNNING);
-	yield();
-	goto rebalance;
+out_failed:
+	/* No luck.. */
+	printk(KERN_ERR "__alloc_pages: %u-order allocation failed.\n", order);
+	return NULL;
 }
 
 /*
@@ -497,37 +631,30 @@
 {
 	unsigned int sum;
 	zone_t *zone;
-	pg_data_t *pgdat = pgdat_list;
 
 	sum = 0;
-	while (pgdat) {
-		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
-			sum += zone->free_pages;
-		pgdat = pgdat->node_next;
-	}
+	for_each_zone(zone)
+		sum += zone->free_pages;
+	
 	return sum;
 }
 
-static unsigned int nr_free_zone_pages(int offset)
+static unsigned int nr_free_zone_pages (int offset)
 {
-	pg_data_t *pgdat = pgdat_list;
+	pg_data_t *pgdat;
 	unsigned int sum = 0;
 
-	do {
+	for_each_pgdat(pgdat) {
 		zonelist_t *zonelist = pgdat->node_zonelists + offset;
 		zone_t **zonep = zonelist->zones;
 		zone_t *zone;
 
 		for (zone = *zonep++; zone; zone = *zonep++) {
-			unsigned long size = zone->size;
-			unsigned long high = zone->pages_high;
-			if (size > high)
-				sum += size - high;
+			sum += zone->free_pages;
+			sum += zone->inactive_clean_pages;
+			sum += zone->inactive_dirty_pages;
 		}
-
-		pgdat = pgdat->node_next;
-	} while (pgdat);
-
+	}
 	return sum;
 }
 
@@ -550,13 +677,12 @@
 #if CONFIG_HIGHMEM
 unsigned int nr_free_highpages (void)
 {
-	pg_data_t *pgdat = pgdat_list;
+	pg_data_t *pgdat;
 	unsigned int pages = 0;
 
-	while (pgdat) {
+	for_each_pgdat(pgdat)
 		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
-		pgdat = pgdat->node_next;
-	}
+
 	return pages;
 }
 #endif
@@ -581,8 +707,9 @@
 		ret->nr_dirty += ps->nr_dirty;
 		ret->nr_writeback += ps->nr_writeback;
 		ret->nr_pagecache += ps->nr_pagecache;
-		ret->nr_active += ps->nr_active;
-		ret->nr_inactive += ps->nr_inactive;
+		ret->nr_active_pages += ps->nr_active_pages;
+		ret->nr_inactive_clean_pages += ps->nr_inactive_clean_pages;
+		ret->nr_inactive_dirty_pages += ps->nr_inactive_dirty_pages;
 	}
 }
 
@@ -645,12 +772,13 @@
 		tmpdat = tmpdat->node_next;
 	}
 
-	printk("( Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u )\n",
-		ps.nr_active,
-		ps.nr_inactive,
-		ps.nr_dirty,
-		ps.nr_writeback,
-		nr_free_pages());
+ 	printk("( Active:%lu inactive_dirty:%lu inactive_clean:%lu dirty:%lu writeback:%lu free:%u )\n",
+ 		ps.nr_active_pages,
+ 		ps.nr_inactive_dirty_pages,
+ 		ps.nr_inactive_clean_pages,
+ 		ps.nr_dirty,
+ 		ps.nr_writeback,
+ 		nr_free_pages());
 
 	for (type = 0; type < MAX_NR_ZONES; type++) {
 		struct list_head *head, *curr;
@@ -786,6 +914,7 @@
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
+extern unsigned int kswapd_minfree;
 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
 	unsigned long *zones_size, unsigned long zone_start_paddr, 
 	unsigned long *zholes_size, struct page *lmem_map)
@@ -832,7 +961,7 @@
 	offset = lmem_map - mem_map;	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
-		unsigned long mask;
+		unsigned long mask, extrafree = 0;
 		unsigned long size, realsize;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
@@ -846,7 +975,14 @@
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
-		zone->need_balance = 0;
+		zone->inactive_clean_pages = 0;
+		zone->inactive_dirty_pages = 0;
+		zone->pte_chain_freelist = NULL;
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_dirty_list);
+		INIT_LIST_HEAD(&zone->inactive_clean_list);
+		spin_lock_init(&zone->pte_chain_freelist_lock);
+
 		if (!size)
 			continue;
 
@@ -866,15 +1002,22 @@
 
 		pgdat->nr_zones = j+1;
 
+		/*
+		 * On large memory machines we keep extra memory
+		 * free for kernel allocations.
+		 */
+		if (zone_extrafree_ratio[j])
+			extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]);
+		if (extrafree < zone_balance_max[j])
+			extrafree = 0;
+
 		mask = (realsize / zone_balance_ratio[j]);
 		if (mask < zone_balance_min[j])
 			mask = zone_balance_min[j];
-		else if (mask > zone_balance_max[j])
-			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
-
+		zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]);
+		zone->pages_low = extrafree + mask*2;
+		zone->pages_high = extrafree + mask*3;
+		zone->pages_plenty = extrafree + mask*6;
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
 		zone->zone_start_paddr = zone_start_paddr;
@@ -882,6 +1025,8 @@
 		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
 			printk("BUG: wrong zone alignment, it will crash\n");
 
+		kswapd_minfree += zone->pages_min;
+
 		/*
 		 * Initially all pages are reserved - free ones are freed
 		 * up by free_all_bootmem() once the early boot process is
diff -uNr linux-2.5.22/mm/pdflush.c linux-2.5.22-rmap13b/mm/pdflush.c
--- linux-2.5.22/mm/pdflush.c	Wed Jun 12 15:44:34 2002
+++ linux-2.5.22-rmap13b/mm/pdflush.c	Tue Jun 18 13:47:39 2002
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/suspend.h>
 
-
 /*
  * Minimum and maximum number of pdflush instances
  */
@@ -97,7 +96,6 @@
 
 	spin_lock_irq(&pdflush_lock);
 	nr_pdflush_threads++;
-//	printk("pdflush %d [%d] starts\n", nr_pdflush_threads, current->pid);
 	for ( ; ; ) {
 		struct pdflush_work *pdf;
 
@@ -144,7 +142,6 @@
 		my_work->fn = NULL;
 	}
 	nr_pdflush_threads--;
-//	printk("pdflush %d [%d] ends\n", nr_pdflush_threads, current->pid);
 	spin_unlock_irq(&pdflush_lock);
 	return 0;
 }
diff -uNr linux-2.5.22/mm/readahead.c linux-2.5.22-rmap13b/mm/readahead.c
--- linux-2.5.22/mm/readahead.c	Tue Jun 18 13:42:11 2002
+++ linux-2.5.22-rmap13b/mm/readahead.c	Tue Jun 18 13:47:39 2002
@@ -174,6 +174,42 @@
 }
 
 /*
+ * We combine this with readahead to deactivate pages when we
+ * think there's sequential IO going on.  Note that this is
+ * harmless since we don't actually evict the pages from memory
+ * but just move them to the inactive list.
+ *
+ * Rik van Riel, 2000
+ */
+static void drop_behind(struct file * file, unsigned long index)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	/* Nothing to drop-behind if we're on the first page. */
+	if (!index)
+		return;
+
+	/*
+	 * Go backwards from index-1 and drop all pages in the
+	 * readahead window. Since the readahead window may have
+	 * been increased since the last time we were called, we
+	 * stop when the page isn't there.
+	 */
+	spin_lock(&pagemap_lru_lock);
+	while (--index >= file->f_ra.start) {
+		spin_lock(&mapping->page_lock);
+		page = radix_tree_lookup(&mapping->page_tree, index);
+		spin_unlock(&mapping->page_lock);
+		if (!page || !PageActive(page))
+			break;
+		drop_page(page);
+	}
+	spin_unlock(&pagemap_lru_lock);
+}
+
+/*
  * page_cache_readahead is the main function.  If performs the adaptive
  * readahead window size management and submits the readahead I/O.
  */
@@ -288,6 +324,11 @@
 		}
 	}
 out:
+	/*
+	 * Move the pages that have already been passed
+	 * to the inactive list.
+	 */
+	drop_behind(file, offset);
 	return;
 }
 
diff -uNr linux-2.5.22/mm/rmap.c linux-2.5.22-rmap13b/mm/rmap.c
--- linux-2.5.22/mm/rmap.c	Wed Dec 31 17:00:00 1969
+++ linux-2.5.22-rmap13b/mm/rmap.c	Tue Jun 18 13:47:39 2002
@@ -0,0 +1,436 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
+ * Released under the General Public License (GPL).
+ *
+ *
+ * Simple, low overhead pte-based reverse mapping scheme.
+ * This is kept modular because we may want to experiment
+ * with object-based reverse mapping schemes. Please try
+ * to keep this thing as modular as possible.
+ */
+
+/*
+ * Locking:
+ * - the page->pte_chain is protected by the PG_chainlock bit,
+ *   which nests within the pagemap_lru_lock, then the
+ *   mm->page_table_lock, and then the page lock.
+ * - because swapout locking is opposite to the locking order
+ *   in the page fault path, the swapout path uses trylocks
+ *   on the mm->page_table_lock
+ */
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+
+#include <asm/pgalloc.h>
+#include <asm/rmap.h>
+#include <asm/smplock.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+/*#define DEBUG_RMAP */
+
+/*
+ * Shared pages have a chain of pte_chain structures, used to locate
+ * all the mappings to this page. We only need a pointer to the pte
+ * here, the page struct for the page table page contains the process
+ * it belongs to and the offset within that process.
+ *
+ * A singly linked list should be fine for most, if not all, workloads.
+ * On fork-after-exec the mapping we'll be removing will still be near
+ * the start of the list, on mixed application systems the short-lived
+ * processes will have their mappings near the start of the list and
+ * in systems with long-lived applications the relative overhead of
+ * exit() will be lower since the applications are long-lived.
+ */
+struct pte_chain {
+	struct pte_chain * next;
+	pte_t * ptep;
+};
+
+static inline struct pte_chain * pte_chain_alloc(zone_t *);
+static inline void pte_chain_free(struct pte_chain *, struct pte_chain *,
+		struct page *, zone_t *);
+static void alloc_new_pte_chains(zone_t *);
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of processes which referenced the page.
+ * Caller needs to hold the pte_chain_lock.
+ */
+int page_referenced(struct page * page)
+{
+	struct pte_chain * pc;
+	int referenced = 0;
+
+	if (TestClearPageReferenced(page))
+		referenced++;
+
+	/* Check all the page tables mapping this page. */
+	for (pc = page->pte_chain; pc; pc = pc->next) {
+		if (ptep_test_and_clear_young(pc->ptep))
+			referenced++;
+	}
+	return referenced;
+}
+
+/**
+ * page_add_rmap - add reverse mapping entry to a page
+ * @page: the page to add the mapping to
+ * @ptep: the page table entry mapping this page
+ *
+ * Add a new pte reverse mapping to a page.
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_rmap(struct page * page, pte_t * ptep)
+{
+	struct pte_chain * pte_chain;
+	unsigned long pfn = pte_pfn(*ptep);
+
+#ifdef DEBUG_RMAP
+	if (!page || !ptep)
+		BUG();
+	if (!pte_present(*ptep))
+		BUG();
+	if (!ptep_to_mm(ptep))
+		BUG();
+#endif
+
+	if (!pfn_valid(pfn) || PageReserved(page))
+		return;
+
+#ifdef DEBUG_RMAP
+	pte_chain_lock(page);
+	{
+		struct pte_chain * pc;
+		for (pc = page->pte_chain; pc; pc = pc->next) {
+			if (pc->ptep == ptep)
+				BUG();
+		}
+	}
+	pte_chain_unlock(page);
+#endif
+
+	pte_chain = pte_chain_alloc(page_zone(page));
+
+	pte_chain_lock(page);
+
+	/* Hook up the pte_chain to the page. */
+	pte_chain->ptep = ptep;
+	pte_chain->next = page->pte_chain;
+	page->pte_chain = pte_chain;
+
+	pte_chain_unlock(page);
+}
+
+/**
+ * page_remove_rmap - take down reverse mapping to a page
+ * @page: page to remove mapping from
+ * @ptep: page table entry to remove
+ *
+ * Removes the reverse mapping from the pte_chain of the page,
+ * after that the caller can clear the page table entry and free
+ * the page.
+ * Caller needs to hold the mm->page_table_lock.
+ */
+void page_remove_rmap(struct page * page, pte_t * ptep)
+{
+	struct pte_chain * pc, * prev_pc = NULL;
+	unsigned long pfn = pte_pfn(*ptep);
+	zone_t *zone;
+
+	if (!page || !ptep)
+		BUG();
+	if (!pfn_valid(pfn) || PageReserved(page))
+		return;
+
+	zone = page_zone(page);
+
+	pte_chain_lock(page);
+	for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
+		if (pc->ptep == ptep) {
+			pte_chain_free(pc, prev_pc, page, zone);
+			goto out;
+		}
+	}
+#ifdef DEBUG_RMAP
+	/* Not found. This should NEVER happen! */
+	printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
+	printk(KERN_ERR "page_remove_rmap: only found: ");
+	for (pc = page->pte_chain; pc; pc = pc->next)
+		printk("%p ", pc->ptep);
+	printk("\n");
+	printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
+#endif
+
+out:
+	pte_chain_unlock(page);
+	return;
+			
+}
+
+/**
+ * try_to_unmap_one - worker function for try_to_unmap
+ * @page: page to unmap
+ * @ptep: page table entry to unmap from page
+ *
+ * Internal helper function for try_to_unmap, called for each page
+ * table entry mapping a page. Because locking order here is opposite
+ * to the locking order used by the page fault path, we use trylocks.
+ * Locking:
+ *	pagemap_lru_lock		page_launder()
+ *	    page lock			page_launder(), trylock
+ *		pte_chain_lock		page_launder()
+ *		    mm->page_table_lock	try_to_unmap_one(), trylock
+ */
+static int FASTCALL(try_to_unmap_one(struct page *, pte_t *));
+static int try_to_unmap_one(struct page * page, pte_t * ptep)
+{
+	unsigned long address = ptep_to_address(ptep);
+	struct mm_struct * mm = ptep_to_mm(ptep);
+	struct vm_area_struct * vma;
+	pte_t pte;
+	int ret;
+
+	if (!mm)
+		BUG();
+
+	/*
+	 * We need the page_table_lock to protect us from page faults,
+	 * munmap, fork, etc...
+	 */
+	if (!spin_trylock(&mm->page_table_lock))
+		return SWAP_AGAIN;
+
+	/* During mremap, it's possible pages are not in a VMA. */
+	vma = find_vma(mm, address);
+	if (!vma) {
+		ret = SWAP_FAIL;
+		goto out_unlock;
+	}
+
+	/* The page is mlock()d, we cannot swap it out. */
+	if (vma->vm_flags & VM_LOCKED) {
+		ret = SWAP_FAIL;
+		goto out_unlock;
+	}
+
+	/* Nuke the page table entry. */
+	pte = ptep_get_and_clear(ptep);
+	flush_tlb_page(vma, address);
+	flush_cache_page(vma, address);
+
+	/* Store the swap location in the pte. See handle_pte_fault() ... */
+	if (PageSwapCache(page)) {
+		swp_entry_t entry;
+		entry.val = page->index;
+		swap_duplicate(entry);
+		set_pte(ptep, swp_entry_to_pte(entry));
+	}
+
+	/* Move the dirty bit to the physical page now the pte is gone. */
+	if (pte_dirty(pte))
+		set_page_dirty(page);
+
+	mm->rss--;
+	page_cache_release(page);
+	ret = SWAP_SUCCESS;
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap - try to remove all page table mappings to a page
+ * @page: the page to get unmapped
+ *
+ * Tries to remove all the page table entries which are mapping this
+ * page, used in the pageout path.  Caller must hold pagemap_lru_lock
+ * and the page lock.  Return values are:
+ *
+ * SWAP_SUCCESS	- we succeeded in removing all mappings
+ * SWAP_AGAIN	- we missed a trylock, try again later
+ * SWAP_FAIL	- the page is unswappable
+ * SWAP_ERROR	- an error occurred
+ */
+int try_to_unmap(struct page * page)
+{
+	struct pte_chain * pc, * next_pc, * prev_pc = NULL;
+	zone_t *zone = page_zone(page);
+	int ret = SWAP_SUCCESS;
+
+	/* This page should not be on the pageout lists. */
+	if (PageReserved(page))
+		BUG();
+	if (!PageLocked(page))
+		BUG();
+	/* We need backing store to swap out a page. */
+	if (!page->mapping)
+		BUG();
+
+	for (pc = page->pte_chain; pc; pc = next_pc) {
+		next_pc = pc->next;
+		switch (try_to_unmap_one(page, pc->ptep)) {
+			case SWAP_SUCCESS:
+				/* Free the pte_chain struct. */
+				pte_chain_free(pc, prev_pc, page, zone);
+				break;
+			case SWAP_AGAIN:
+				/* Skip this pte, remembering status. */
+				prev_pc = pc;
+				ret = SWAP_AGAIN;
+				continue;
+			case SWAP_FAIL:
+				return SWAP_FAIL;
+			case SWAP_ERROR:
+				return SWAP_ERROR;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * page_over_rsslimit - test if the page is over its RSS limit
+ * @page - page to test
+ *
+ * This function returns true if the process owning this page
+ * is over its RSS (resident set size) limit.  For shared pages
+ * we penalise it only if all processes using it are over their
+ * rss limits.
+ * The caller needs to hold the page's pte_chain_lock.
+ */
+int page_over_rsslimit(struct page * page)
+{
+	struct pte_chain * pte_chain = page->pte_chain;
+	struct mm_struct * mm;
+	pte_t * ptep;
+
+	/* No process is using the page. */
+	if (!pte_chain)
+		return 0;
+
+	do {
+		ptep = pte_chain->ptep;
+		mm = ptep_to_mm(ptep);
+
+		/*
+		 * If the process is under its RSS limit, stop
+		 * scanning and don't penalise the page.
+		 */
+		if(!mm->rlimit_rss || mm->rss <= mm->rlimit_rss)
+			return 0;
+		
+		pte_chain = pte_chain->next;
+	} while (pte_chain);
+
+	return 1;
+}
+
+/**
+ ** No more VM stuff below this comment, only pte_chain helper
+ ** functions.
+ **/
+
+static inline void pte_chain_push(zone_t * zone,
+		struct pte_chain * pte_chain)
+{
+	pte_chain->ptep = NULL;
+	pte_chain->next = zone->pte_chain_freelist;
+	zone->pte_chain_freelist = pte_chain;
+}
+
+static inline struct pte_chain * pte_chain_pop(zone_t * zone)
+{
+	struct pte_chain *pte_chain;
+
+	pte_chain = zone->pte_chain_freelist;
+	zone->pte_chain_freelist = pte_chain->next;
+	pte_chain->next = NULL;
+
+	return pte_chain;
+}
+
+/**
+ * pte_chain_free - free pte_chain structure
+ * @pte_chain: pte_chain struct to free
+ * @prev_pte_chain: previous pte_chain on the list (may be NULL)
+ * @page: page this pte_chain hangs off (may be NULL)
+ * @zone: memory zone to free pte chain in
+ *
+ * This function unlinks pte_chain from the singly linked list it
+ * may be on and adds the pte_chain to the free list. May also be
+ * called for new pte_chain structures which aren't on any list yet.
+ * Caller needs to hold the pte_chain_lock if the page is non-NULL.
+ */
+static inline void pte_chain_free(struct pte_chain * pte_chain,
+		struct pte_chain * prev_pte_chain, struct page * page,
+		zone_t * zone)
+{
+	if (prev_pte_chain)
+		prev_pte_chain->next = pte_chain->next;
+	else if (page)
+		page->pte_chain = pte_chain->next;
+
+	spin_lock(&zone->pte_chain_freelist_lock);
+	pte_chain_push(zone, pte_chain);
+	spin_unlock(&zone->pte_chain_freelist_lock);
+}
+
+/**
+ * pte_chain_alloc - allocate a pte_chain struct
+ * @zone: memory zone to allocate pte_chain for
+ *
+ * Returns a pointer to a fresh pte_chain structure. Allocates new
+ * pte_chain structures as required.
+ * Caller needs to hold the page's pte_chain_lock.
+ */
+static inline struct pte_chain * pte_chain_alloc(zone_t * zone)
+{
+	struct pte_chain * pte_chain;
+
+	spin_lock(&zone->pte_chain_freelist_lock);
+
+	/* Allocate new pte_chain structs as needed. */
+	if (!zone->pte_chain_freelist)
+		alloc_new_pte_chains(zone);
+
+	/* Grab the first pte_chain from the freelist. */
+	pte_chain = pte_chain_pop(zone);
+
+	spin_unlock(&zone->pte_chain_freelist_lock);
+
+	return pte_chain;
+}
+
+/**
+ * alloc_new_pte_chains - convert a free page to pte_chain structures
+ * @zone: memory zone to allocate pte_chains for
+ *
+ * Grabs a free page and converts it to pte_chain structures. We really
+ * should pre-allocate these earlier in the pagefault path or come up
+ * with some other trick.
+ *
+ * Note that we cannot use the slab cache because the pte_chain structure
+ * is way smaller than the minimum size of a slab cache allocation.
+ * Caller needs to hold the zone->pte_chain_freelist_lock
+ */
+static void alloc_new_pte_chains(zone_t *zone)
+{
+	struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC);
+	int i = PAGE_SIZE / sizeof(struct pte_chain);
+
+	if (pte_chain) {
+		for (; i-- > 0; pte_chain++)
+			pte_chain_push(zone, pte_chain);
+	} else {
+		/* Yeah yeah, I'll fix the pte_chain allocation ... */
+		panic("Fix pte_chain allocation, you lazy bastard!\n");
+	}
+}
diff -uNr linux-2.5.22/mm/swap.c linux-2.5.22-rmap13b/mm/swap.c
--- linux-2.5.22/mm/swap.c	Thu May  2 17:22:50 2002
+++ linux-2.5.22-rmap13b/mm/swap.c	Tue Jun 18 13:55:21 2002
@@ -15,10 +15,11 @@
 
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
-#include <linux/swap.h>
 #include <linux/swapctl.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/buffer_head.h> /* for try_to_release_page() */
+#include <linux/mm_inline.h>
 
 #include <asm/dma.h>
 #include <asm/uaccess.h> /* for copy_to/from_user */
@@ -33,15 +34,97 @@
 	8,	/* do swap I/O in clusters of this size */
 };
 
+/**
+ * (de)activate_page - move pages from/to active and inactive lists
+ * @page: the page we want to move
+ * @nolock - are we already holding the pagemap_lru_lock?
+ *
+ * Deactivate_page will move an active page to the right
+ * inactive list, while activate_page will move a page back
+ * from one of the inactive lists to the active list. If
+ * called on a page which is not on any of the lists, the
+ * page is left alone.
+ */
+void deactivate_page_nolock(struct page * page)
+{
+	/*
+	 * Don't touch it if it's not on the active list.
+	 * (some pages aren't on any list at all)
+	 */
+	ClearPageReferenced(page);
+	page->age = 0;
+	if (PageActive(page)) {
+		del_page_from_active_list(page);
+		add_page_to_inactive_dirty_list(page);
+	}
+}	
+
+void deactivate_page(struct page * page)
+{
+	spin_lock(&pagemap_lru_lock);
+	deactivate_page_nolock(page);
+	spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * drop_page - like deactivate_page, but try inactive_clean list
+ * @page: the page to drop
+ *
+ * Try to move a page to the inactive_clean list, this succeeds if the
+ * page is clean and not in use by anybody. If the page cannot be placed
+ * on the inactive_clean list it is placed on the inactive_dirty list
+ * instead.
+ *
+ * Note: this function gets called with the pagemap_lru_lock held.
+ */
+void drop_page(struct page * page)
+{
+	if (!TestSetPageLocked(page)) {
+		if (page->mapping && PagePrivate(page)) {
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
+			try_to_release_page(page, GFP_NOIO);
+			spin_lock(&pagemap_lru_lock);
+			page_cache_release(page);
+		}
+		unlock_page(page);
+	}
+
+	/* Make sure the page really is reclaimable. */
+	pte_chain_lock(page);
+	if (!page->mapping || PageDirty(page) || page->pte_chain ||
+			PagePrivate(page) || page_count(page) > 1)
+		deactivate_page_nolock(page);
+
+	else if (page_count(page) == 1) {
+		ClearPageReferenced(page);
+		page->age = 0;
+		if (PageActive(page)) {
+			del_page_from_active_list(page);
+			add_page_to_inactive_clean_list(page);
+		} else if (PageInactiveDirty(page)) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_clean_list(page);
+		}
+	}
+	pte_chain_unlock(page);
+}
+
 /*
  * Move an inactive page to the active list.
  */
-static inline void activate_page_nolock(struct page * page)
+void activate_page_nolock(struct page * page)
 {
-	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(page);
+	if (PageInactiveDirty(page)) {
+		del_page_from_inactive_dirty_list(page);
+		add_page_to_active_list(page);
+	} else if (PageInactiveClean(page)) {
+		del_page_from_inactive_clean_list(page);
 		add_page_to_active_list(page);
 	}
+
+	/* Make sure the page gets a fair chance at staying active. */
+	page->age = max((int)page->age, PAGE_AGE_START);
 }
 
 void activate_page(struct page * page)
@@ -57,29 +140,31 @@
  */
 void lru_cache_add(struct page * page)
 {
-	if (!TestSetPageLRU(page)) {
+	if (!PageLRU(page)) {
 		spin_lock(&pagemap_lru_lock);
-		add_page_to_inactive_list(page);
+		SetPageLRU(page);
+		add_page_to_active_list(page);
 		spin_unlock(&pagemap_lru_lock);
 	}
 }
 
 /**
  * __lru_cache_del: remove a page from the page lists
- * @page: the page to add
+ * @page: the page to remove
  *
  * This function is for when the caller already holds
  * the pagemap_lru_lock.
  */
 void __lru_cache_del(struct page * page)
 {
-	if (TestClearPageLRU(page)) {
-		if (PageActive(page)) {
-			del_page_from_active_list(page);
-		} else {
-			del_page_from_inactive_list(page);
-		}
+	if (PageActive(page)) {
+		del_page_from_active_list(page);
+	} else if (PageInactiveDirty(page)) {
+		del_page_from_inactive_dirty_list(page);
+	} else if (PageInactiveClean(page)) {
+		del_page_from_inactive_clean_list(page);
 	}
+	ClearPageLRU(page);
 }
 
 /**
diff -uNr linux-2.5.22/mm/swap_state.c linux-2.5.22-rmap13b/mm/swap_state.c
--- linux-2.5.22/mm/swap_state.c	Wed Jun 12 16:06:35 2002
+++ linux-2.5.22-rmap13b/mm/swap_state.c	Tue Jun 18 13:47:39 2002
@@ -125,6 +125,63 @@
 	return 0;
 }
 
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ *
+ * Allocate swap space for the page and add the page to the
+ * swap cache.  Caller needs to hold the page lock. 
+ */
+int add_to_swap(struct page * page)
+{
+	swp_entry_t entry;
+	int flags;
+
+	if (!PageLocked(page))
+		BUG();
+
+	for (;;) {
+		entry = get_swap_page();
+		if (!entry.val)
+			return 0;
+
+		/* Radix-tree node allocations are performing
+		 * GFP_ATOMIC allocations under PF_MEMALLOC.  
+		 * They can completely exhaust the page allocator.  
+		 *
+		 * So PF_MEMALLOC is dropped here.  This causes the slab 
+		 * allocations to fail earlier, so radix-tree nodes will 
+		 * then be allocated from the mempool reserves. */
+
+		flags = current->flags;
+		current->flags &= ~PF_MEMALLOC;
+		current->flags |= PF_RADIX_TREE;
+
+		/*
+		 * Add it to the swap cache and mark it dirty
+		 * (adding to the page cache will clear the dirty
+		 * and uptodate bits, so we need to do it again)
+		 */
+		switch (add_to_swap_cache(page, entry)) {
+		case 0:				/* Success */
+			current->flags = flags;
+			SetPageUptodate(page);
+			set_page_dirty(page);
+			swap_free(entry);
+			return 1;
+		case -ENOMEM:			/* radix-tree allocation */
+			current->flags = flags;
+			swap_free(entry);
+			return 0;
+		default:			/* ENOENT: raced */
+			break;
+		}
+		/* Raced with "speculative" read_swap_cache_async */
+		current->flags = flags;
+		swap_free(entry);
+	}
+}
+
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
diff -uNr linux-2.5.22/mm/swapfile.c linux-2.5.22-rmap13b/mm/swapfile.c
--- linux-2.5.22/mm/swapfile.c	Tue Jun 18 13:42:11 2002
+++ linux-2.5.22-rmap13b/mm/swapfile.c	Tue Jun 18 13:47:39 2002
@@ -384,6 +384,7 @@
 		return;
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	page_add_rmap(page, dir);
 	swap_free(entry);
 	++vma->vm_mm->rss;
 }
diff -uNr linux-2.5.22/mm/vmscan.c linux-2.5.22-rmap13b/mm/vmscan.c
--- linux-2.5.22/mm/vmscan.c	Wed Jun 12 16:06:36 2002
+++ linux-2.5.22-rmap13b/mm/vmscan.c	Tue Jun 18 13:56:10 2002
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
+#include <linux/mm_inline.h>
 #include <linux/writeback.h>
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>		/* for try_to_release_page() */
@@ -29,6 +30,8 @@
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 
+static void refill_freelist(void);
+static void wakeup_memwaiters(void);
 /*
  * The "priority" of VM scanning is how much of the queues we
  * will scan in one go. A value of 6 for DEF_PRIORITY implies
@@ -37,432 +40,274 @@
  */
 #define DEF_PRIORITY (6)
 
-static inline int is_page_cache_freeable(struct page * page)
+static inline void age_page_up(struct page *page)
 {
-	return page_count(page) - !!PagePrivate(page) == 1;
+	page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); 
 }
 
-/*
- * On the swap_out path, the radix-tree node allocations are performing
- * GFP_ATOMIC allocations under PF_MEMALLOC.  They can completely
- * exhaust the page allocator.  This is bad; some pages should be left
- * available for the I/O system to start sending the swapcache contents
- * to disk.
- *
- * So PF_MEMALLOC is dropped here.  This causes the slab allocations to fail
- * earlier, so radix-tree nodes will then be allocated from the mempool
- * reserves.
- */
-static inline int
-swap_out_add_to_swap_cache(struct page *page, swp_entry_t entry)
+static inline void age_page_down(struct page *page)
 {
-	int flags = current->flags;
-	int ret;
-
-	current->flags &= ~PF_MEMALLOC;
-	current->flags |= PF_RADIX_TREE;
-	ret = add_to_swap_cache(page, entry);
-	current->flags = flags;
-	return ret;
+	page->age -= min(PAGE_AGE_DECL, (int)page->age);
 }
 
-/*
- * The swap-out function returns 1 if it successfully
- * scanned all the pages it was asked to (`count').
- * It returns zero if it couldn't do anything,
- *
- * rss may decrease because pages are shared, but this
- * doesn't count as having freed a page.
- */
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
+/* Must be called with page's pte_chain_lock held. */
+static inline int page_mapping_inuse(struct page * page)
 {
-	pte_t pte;
-	swp_entry_t entry;
+	struct address_space *mapping = page->mapping;
 
-	/* Don't look at this pte if it's been accessed recently. */
-	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
-		mark_page_accessed(page);
-		return 0;
-	}
-
-	/* Don't bother unmapping pages that are active */
-	if (PageActive(page))
-		return 0;
-
-	/* Don't bother replenishing zones not under pressure.. */
-	if (!memclass(page_zone(page), classzone))
-		return 0;
+	/* Page is in somebody's page tables. */
+	if (page->pte_chain)
+		return 1;
 
-	if (TestSetPageLocked(page))
+	/* XXX: does this happen ? */
+	if (!mapping)
 		return 0;
 
-	if (PageWriteback(page))
-		goto out_unlock;
+	/* File is mmap'd by somebody. */
+	if (!list_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_shared))
+		return 1;
 
-	/* From this point on, the odds are that we're going to
-	 * nuke this pte, so read and clear the pte.  This hook
-	 * is needed on CPUs which update the accessed and dirty
-	 * bits in hardware.
-	 */
-	flush_cache_page(vma, address);
-	pte = ptep_get_and_clear(page_table);
-	flush_tlb_page(vma, address);
+	return 0;
+}
 
-	if (pte_dirty(pte))
-		set_page_dirty(page);
 
-	/*
-	 * Is the page already in the swap cache? If so, then
-	 * we can just drop our reference to it without doing
-	 * any IO - it's already up-to-date on disk.
-	 */
-	if (PageSwapCache(page)) {
-		entry.val = page->index;
-		swap_duplicate(entry);
-set_swap_pte:
-		set_pte(page_table, swp_entry_to_pte(entry));
-drop_pte:
-		mm->rss--;
-		unlock_page(page);
-		{
-			int freeable = page_count(page) -
-				!!PagePrivate(page) <= 2;
-			page_cache_release(page);
-			return freeable;
-		}
-	}
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
+ */
+struct page * reclaim_page(zone_t * zone)
+{
+	struct address_space * mapping;
+	struct page * page = NULL;
+	struct list_head * page_lru;
+	swp_entry_t entry = {0};
+	int maxscan;
 
 	/*
-	 * Is it a clean page? Then it must be recoverable
-	 * by just paging it in again, and we can just drop
-	 * it..  or if it's dirty but has backing store,
-	 * just mark the page dirty and drop it.
-	 *
-	 * However, this won't actually free any real
-	 * memory, as the page will just be in the page cache
-	 * somewhere, and as such we should just continue
-	 * our scan.
-	 *
-	 * Basically, this just makes it possible for us to do
-	 * some real work in the future in "refill_inactive()".
-	 */
-	if (page->mapping)
-		goto drop_pte;
-	if (!PageDirty(page))
-		goto drop_pte;
+	 * We need to hold the page_lock around all tests to make sure
+ 	 * reclaim_page() cannot race with find_get_page() and friends.
+ 	 */
+	spin_lock(&pagemap_lru_lock);
+	maxscan = zone->inactive_clean_pages;
+	while (maxscan-- && !list_empty(&zone->inactive_clean_list)) {
+		page_lru = zone->inactive_clean_list.prev;
+		page = list_entry(page_lru, struct page, lru);
 
-	/*
-	 * Anonymous buffercache pages can be left behind by
-	 * concurrent truncate and pagefault.
-	 */
-	if (PagePrivate(page))
-		goto preserve;
+		mapping = page->mapping;
+		spin_lock(&mapping->page_lock);
 
-	/*
-	 * This is a dirty, swappable page.  First of all,
-	 * get a suitable swap entry for it, and make sure
-	 * we have the swap cache set up to associate the
-	 * page with that swap entry.
-	 */
-	for (;;) {
-		entry = get_swap_page();
-		if (!entry.val)
-			break;
-		/* Add it to the swap cache and mark it dirty
-		 * (adding to the page cache will clear the dirty
-		 * and uptodate bits, so we need to do it again)
-		 */
-		switch (swap_out_add_to_swap_cache(page, entry)) {
-		case 0:				/* Success */
-			SetPageUptodate(page);
-			set_page_dirty(page);
-			goto set_swap_pte;
-		case -ENOMEM:			/* radix-tree allocation */
-			swap_free(entry);
-			goto preserve;
-		default:			/* ENOENT: raced */
-			break;
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (unlikely(!PageInactiveClean(page))) {
+			printk("VM: reclaim_page, wrong page on list.\n");
+			list_del(page_lru);
+			page_zone(page)->inactive_clean_pages--;
+			goto unlock;
+		}
+
+		/* Page is being freed */
+		if (unlikely(!page_count(page))) {
+			list_del(page_lru);
+			list_add(page_lru, &zone->inactive_clean_list);
+			goto unlock;
+		}
+
+		/* Page cannot be reclaimed ?  Move to inactive_dirty list. */
+		pte_chain_lock(page);
+		if (unlikely(page->pte_chain || PagePrivate(page) ||
+				PageReferenced(page) || PageDirty(page) ||
+				page_count(page) > 1 || TestSetPageLocked(page))) {
+			del_page_from_inactive_clean_list(page);
+			add_page_to_inactive_dirty_list(page);
+			pte_chain_unlock(page);
+			goto unlock;
 		}
-		/* Raced with "speculative" read_swap_cache_async */
-		swap_free(entry);
-	}
-
-	/* No swap space left */
-preserve:
-	set_pte(page_table, pte);
-out_unlock:
-	unlock_page(page);
-	return 0;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
-{
-	pte_t * pte;
-	unsigned long pmd_end;
 
-	if (pmd_none(*dir))
-		return count;
-	if (pmd_bad(*dir)) {
-		pmd_ERROR(*dir);
-		pmd_clear(dir);
-		return count;
-	}
-	
-	pte = pte_offset_map(dir, address);
-	
-	pmd_end = (address + PMD_SIZE) & PMD_MASK;
-	if (end > pmd_end)
-		end = pmd_end;
+		/*
+		 * From here until reaching either the bottom of the loop
+		 * or found_page: the pte_chain_lock is held.
+		 */
 
-	do {
-		if (pte_present(*pte)) {
-			unsigned long pfn = pte_pfn(*pte);
-			struct page *page = pfn_to_page(pfn);
-
-			if (pfn_valid(pfn) && !PageReserved(page)) {
-				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
-				if (!count) {
-					address += PAGE_SIZE;
-					pte++;
-					break;
-				}
-			}
+		/* OK, remove the page from the caches. */
+                if (PageSwapCache(page)) {
+			entry.val = page->index;
+			__delete_from_swap_cache(page);
+			goto found_page;
 		}
-		address += PAGE_SIZE;
-		pte++;
-	} while (address && (address < end));
-	pte_unmap(pte - 1);
-	mm->swap_address = address;
-	return count;
-}
 
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
-{
-	pmd_t * pmd;
-	unsigned long pgd_end;
+		if (page->mapping) {
+			__remove_inode_page(page);
+			goto found_page;
+		}
 
-	if (pgd_none(*dir))
-		return count;
-	if (pgd_bad(*dir)) {
-		pgd_ERROR(*dir);
-		pgd_clear(dir);
-		return count;
+		/* We should never ever get here. */
+		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+		list_del(page_lru);
+		zone->inactive_clean_pages--;
+		pte_chain_unlock(page);
+		unlock_page(page);
+unlock:
+		spin_unlock(&mapping->page_lock);
 	}
+	spin_unlock(&pagemap_lru_lock);
+	return NULL;
 
-	pmd = pmd_offset(dir, address);
-
-	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;	
-	if (pgd_end && (end > pgd_end))
-		end = pgd_end;
-	
-	do {
-		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
-		if (!count)
-			break;
-		address = (address + PMD_SIZE) & PMD_MASK;
-		pmd++;
-	} while (address && (address < end));
-	return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
-{
-	pgd_t *pgdir;
-	unsigned long end;
-
-	/* Don't swap out areas which are reserved */
-	if (vma->vm_flags & VM_RESERVED)
-		return count;
-
-	pgdir = pgd_offset(mm, address);
-
-	end = vma->vm_end;
-	if (address >= end)
-		BUG();
-	do {
-		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
-		if (!count)
-			break;
-		address = (address + PGDIR_SIZE) & PGDIR_MASK;
-		pgdir++;
-	} while (address && (address < end));
-	return count;
+found_page:
+	__lru_cache_del(page);
+	pte_chain_unlock(page);
+	spin_unlock(&mapping->page_lock);
+	spin_unlock(&pagemap_lru_lock);
+	if (entry.val)
+		swap_free(entry);
+	unlock_page(page);
+	page->age = PAGE_AGE_START;
+	if (page_count(page) != 1)
+		printk("VM: reclaim_page, found page with count %d!\n",
+				page_count(page));
+	return page;
 }
 
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
 
-/*
- * Returns remaining count of pages to be swapped out by followup call.
+/**
+ * page_launder_zone - clean dirty inactive pages, move to inactive_clean list
+ * @zone: zone to free pages in
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function is called when we are low on free / inactive_clean
+ * pages, its purpose is to refill the free/clean list as efficiently
+ * as possible.
+ *
+ * This means we do writes asynchronously as long as possible and will
+ * only sleep on IO when we don't have another option. Since writeouts
+ * cause disk seeks and make read IO slower, we skip writes alltogether
+ * when the amount of dirty pages is small.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
  */
-static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
-{
-	unsigned long address;
-	struct vm_area_struct* vma;
-
-	/*
-	 * Find the proper vm-area after freezing the vma chain 
-	 * and ptes.
-	 */
-	spin_lock(&mm->page_table_lock);
-	address = mm->swap_address;
-	if (address == TASK_SIZE || swap_mm != mm) {
-		/* We raced: don't count this mm but try again */
-		++*mmcounter;
-		goto out_unlock;
-	}
-	vma = find_vma(mm, address);
-	if (vma) {
-		if (address < vma->vm_start)
-			address = vma->vm_start;
-
-		for (;;) {
-			count = swap_out_vma(mm, vma, address, count, classzone);
-			vma = vma->vm_next;
-			if (!vma)
-				break;
-			if (!count)
-				goto out_unlock;
-			address = vma->vm_start;
-		}
-	}
-	/* Indicate that we reached the end of address space */
-	mm->swap_address = TASK_SIZE;
-
-out_unlock:
-	spin_unlock(&mm->page_table_lock);
-	return count;
-}
-
-static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
-static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
-{
-	int counter, nr_pages = SWAP_CLUSTER_MAX;
-	struct mm_struct *mm;
-
-	counter = mmlist_nr;
-	do {
-		if (need_resched()) {
-			__set_current_state(TASK_RUNNING);
-			schedule();
-		}
-
-		spin_lock(&mmlist_lock);
-		mm = swap_mm;
-		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
-			mm->swap_address = 0;
-			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
-			if (mm == swap_mm)
-				goto empty;
-			swap_mm = mm;
-		}
-
-		/* Make sure the mm doesn't disappear when we drop the lock.. */
-		atomic_inc(&mm->mm_users);
-		spin_unlock(&mmlist_lock);
-
-		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
-
-		mmput(mm);
-
-		if (!nr_pages)
-			return 1;
-	} while (--counter >= 0);
-
-	return 0;
-
-empty:
-	spin_unlock(&mmlist_lock);
-	return 0;
-}
-
-static int
-shrink_cache(int nr_pages, zone_t *classzone,
-		unsigned int gfp_mask, int priority, int max_scan)
+int page_launder_zone(zone_t * zone, int gfp_mask, int priority)
 {
+	int maxscan, cleaned_pages = 0, target = free_plenty(zone);
 	struct list_head * entry;
-	struct address_space *mapping;
-	int max_mapped = nr_pages << (9 - priority);
+	struct address_space * mapping;
 
+	/* The main launder loop. */
 	spin_lock(&pagemap_lru_lock);
-	while (--max_scan >= 0 &&
-			(entry = inactive_list.prev) != &inactive_list) {
+	maxscan = zone->inactive_dirty_pages >> priority;
+	while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) {
 		struct page * page;
-
+		
+		/* Low latency reschedule point */
 		if (need_resched()) {
 			spin_unlock(&pagemap_lru_lock);
-			__set_current_state(TASK_RUNNING);
 			schedule();
 			spin_lock(&pagemap_lru_lock);
 			continue;
 		}
 
+		entry = zone->inactive_dirty_list.prev;
 		page = list_entry(entry, struct page, lru);
 
-		if (unlikely(!PageLRU(page)))
-			BUG();
-		if (unlikely(PageActive(page)))
-			BUG();
+		if (cleaned_pages > target)
+			break;
 
 		list_del(entry);
-		list_add(entry, &inactive_list);
+		list_add(entry, &zone->inactive_dirty_list);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageInactiveDirty(page)) {
+			printk("VM: page_launder, wrong page on list.\n");
+			list_del(entry);
+			dec_page_state(nr_inactive_dirty_pages);
+			page_zone(page)->inactive_dirty_pages--;
+			continue;
+		}
 
 		/*
-		 * Zero page counts can happen because we unlink the pages
-		 * _after_ decrementing the usage count..
+		 * Page is being freed, don't worry about it.
 		 */
 		if (unlikely(!page_count(page)))
 			continue;
 
-		if (!memclass(page_zone(page), classzone))
+		if (unlikely(TestSetPageLocked(page)))
 			continue;
 
-		/* Racy check to avoid trylocking when not worthwhile */
-		if (!PagePrivate(page) && (page_count(page) != 1 || !page->mapping))
-			goto page_mapped;
+		if (PageWriteback(page)) {	/* The non-racy check */
+			unlock_page(page);
+			continue;
+		}
 
 		/*
-		 * IO in progress? Leave it at the back of the list.
+		 * The page is in active use or really unfreeable. Move to
+		 * the active list and adjust the page age if needed.
 		 */
-		if (unlikely(PageWriteback(page))) {
-			if (gfp_mask & __GFP_FS) {
-				page_cache_get(page);
-				spin_unlock(&pagemap_lru_lock);
-				wait_on_page_writeback(page);
+		pte_chain_lock(page);
+		if (page_referenced(page) && page_mapping_inuse(page) &&
+				!page_over_rsslimit(page)) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			page->age = max((int)page->age, PAGE_AGE_START);
+			pte_chain_unlock(page);
+			unlock_page(page);
+			continue;
+		}
+
+		/*
+		 * Anonymous process memory without backing store. Try to
+		 * allocate it some swap space here.
+		 *
+		 * XXX: implement swap clustering ?
+		 */
+		if (page->pte_chain && !page->mapping && !PagePrivate(page)) {
+			page_cache_get(page);
+			pte_chain_unlock(page);
+			spin_unlock(&pagemap_lru_lock);
+			if (!add_to_swap(page)) {
+				activate_page(page);
+				unlock_page(page);
 				page_cache_release(page);
 				spin_lock(&pagemap_lru_lock);
+				continue;
 			}
-			continue;
+			page_cache_release(page);
+			spin_lock(&pagemap_lru_lock);
+			pte_chain_lock(page);
 		}
 
-		if (TestSetPageLocked(page))
-			continue;
-
-		if (PageWriteback(page)) {	/* The non-racy check */
-			unlock_page(page);
-			continue;
+		/*
+		 * The page is mapped into the page tables of one or more
+		 * processes. Try to unmap it here.
+		 */
+		if (page->pte_chain) {
+			switch (try_to_unmap(page)) {
+				case SWAP_ERROR:
+				case SWAP_FAIL:
+					goto page_active;
+				case SWAP_AGAIN:
+					pte_chain_unlock(page);
+					unlock_page(page);
+					continue;
+				case SWAP_SUCCESS:
+					; /* try to free the page below */
+			}
 		}
-
+		pte_chain_unlock(page);
 		mapping = page->mapping;
 
-		if (PageDirty(page) && is_page_cache_freeable(page) &&
-				page->mapping && (gfp_mask & __GFP_FS)) {
+		if (PageDirty(page) && mapping && (gfp_mask & __GFP_FS)) {
 			/*
 			 * It is not critical here to write it only if
 			 * the page is unmapped beause any direct writer
-			 * like O_DIRECT would set the page's dirty bitflag
-			 * on the phisical page after having successfully
+			 * like O_DIRECT would set the PG_dirty bitflag
+			 * on the physical page after having successfully
 			 * pinned it and after the I/O to the page is finished,
 			 * so the direct writes to the page cannot get lost.
 			 */
 			int (*writeback)(struct page *, int *);
-			const int nr_pages = SWAP_CLUSTER_MAX;
-			int nr_to_write = nr_pages;
+			int nr_to_write = SWAP_CLUSTER_MAX;
 
 			writeback = mapping->a_ops->vm_writeback;
 			if (writeback == NULL)
@@ -470,7 +315,6 @@
 			page_cache_get(page);
 			spin_unlock(&pagemap_lru_lock);
 			(*writeback)(page, &nr_to_write);
-			max_scan -= (nr_pages - nr_to_write);
 			page_cache_release(page);
 			spin_lock(&pagemap_lru_lock);
 			continue;
@@ -484,7 +328,7 @@
 		if (PagePrivate(page)) {
 			spin_unlock(&pagemap_lru_lock);
 
-			/* avoid to free a locked page */
+			/* To avoid freeing our page before we're done. */
 			page_cache_get(page);
 
 			if (try_to_release_page(page, gfp_mask)) {
@@ -501,15 +345,14 @@
 
 					/* effectively free the page here */
 					page_cache_release(page);
-
-					if (--nr_pages)
-						continue;
-					break;
+					cleaned_pages++;
+					continue;
 				} else {
 					/*
-					 * The page is still in pagecache so undo the stuff
-					 * before the try_to_release_page since we've not
-					 * finished and we can now try the next step.
+					 * We freed the buffers but may have
+					 * slept; undo the stuff we did before
+					 * try_to_release_page and fall through
+					 * to the next step.
 					 */
 					page_cache_release(page);
 
@@ -526,236 +369,285 @@
 		}
 
 		/*
-		 * This is the non-racy check for busy page.
+		 * If the page is really freeable now, move it to the
+		 * inactive_clean list.
+		 *
+		 * We re-test everything since the page could have been
+		 * used by somebody else while we waited on IO above.
+		 * This test is not safe from races, but only the one
+		 * in reclaim_page() needs to be.
 		 */
-		if (mapping) {
-			write_lock(&mapping->page_lock);
-			if (is_page_cache_freeable(page))
-				goto page_freeable;
-			write_unlock(&mapping->page_lock);
+		pte_chain_lock(page);
+		if (mapping && !PageDirty(page) && !page->pte_chain &&
+				page_count(page) == 1) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_clean_list(page);
+			pte_chain_unlock(page);
+			unlock_page(page);
+			cleaned_pages++;
+		} else {
+			/*
+			 * OK, we don't know what to do with the page.
+			 * It's no use keeping it here, so we move it to
+			 * the active list.
+			 */
+page_active:
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			pte_chain_unlock(page);
+			unlock_page(page);
 		}
-		unlock_page(page);
-page_mapped:
-		if (--max_mapped >= 0)
-			continue;
+	}
+	spin_unlock(&pagemap_lru_lock);
 
-		/*
-		 * Alert! We've found too many mapped pages on the
-		 * inactive list, so we start swapping out now!
-		 */
-		spin_unlock(&pagemap_lru_lock);
-		swap_out(priority, gfp_mask, classzone);
-		return nr_pages;
+	/* Return the number of pages moved to the inactive_clean list. */
+	return cleaned_pages;
+}
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function iterates over all zones and calls page_launder_zone(),
+ * balancing still needs to be added...
+ */
+int page_launder(int gfp_mask)
+{
+	int maxtry = 1 << DEF_PRIORITY;
+	struct zone_struct * zone;
+	int freed = 0;
+
+	/* Global balancing while we have a global shortage. */
+	while (maxtry-- && free_high(ALL_ZONES) >= 0) {
+		for_each_zone(zone)
+			if (free_plenty(zone) >= 0)
+				freed += page_launder_zone(zone, gfp_mask, 6);
+	}
+	
+	/* Clean up the remaining zones with a serious shortage, if any. */
+	for_each_zone(zone)
+		if (free_min(zone) >= 0)
+			freed += page_launder_zone(zone, gfp_mask, 0);
+
+	return freed;
+}
+
+/**
+ * refill_inactive_zone - scan the active list and find pages to deactivate
+ * @priority: how much are we allowed to scan
+ *
+ * This function will scan a portion of the active list of a zone to find
+ * unused pages, those pages will then be moved to the inactive list.
+ */
+int refill_inactive_zone(struct zone_struct * zone, int priority)
+{
+	int maxscan = zone->active_pages >> priority;
+	int target = inactive_high(zone);
+	struct list_head * page_lru;
+	int nr_deactivated = 0;
+	struct page * page;
+
+	/* Take the lock while messing with the list... */
+	spin_lock(&pagemap_lru_lock);
+	while (maxscan-- && !list_empty(&zone->active_list)) {
+		page_lru = zone->active_list.prev;
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (unlikely(!PageActive(page))) {
+			printk("VM: refill_inactive, wrong page on list.\n");
+			list_del(page_lru);
+			dec_page_state(nr_active_pages);
+			continue;
+		}
+		
+		/* Needed to follow page->mapping */
+		if (TestSetPageLocked(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &zone->active_list);
+			continue;
+		}
 
-page_freeable:
 		/*
-		 * It is critical to check PageDirty _after_ we made sure
-		 * the page is freeable* so not in use by anybody.
+		 * If the object the page is in is not in use we don't
+		 * bother with page aging.  If the page is touched again
+		 * while on the inactive_clean list it'll be reactivated.
+		 * From here until the end of the current iteration
+		 * both PG_locked and the pte_chain_lock are held.
 		 */
-		if (PageDirty(page)) {
-			write_unlock(&mapping->page_lock);
+		pte_chain_lock(page);
+		if (!page_mapping_inuse(page)) {
+			pte_chain_unlock(page);
 			unlock_page(page);
+			drop_page(page);
 			continue;
 		}
 
-		/* point of no return */
-		if (likely(!PageSwapCache(page))) {
-			__remove_inode_page(page);
-			write_unlock(&mapping->page_lock);
+		/*
+		 * Do aging on the pages.
+		 */
+		if (page_referenced(page)) {
+			age_page_up(page);
 		} else {
-			swp_entry_t swap;
-			swap.val = page->index;
-			__delete_from_swap_cache(page);
-			write_unlock(&mapping->page_lock);
-			swap_free(swap);
+			age_page_down(page);
 		}
 
-		__lru_cache_del(page);
+		/* 
+		 * If the page age is 'hot' and the process using the
+		 * page doesn't exceed its RSS limit we keep the page.
+		 * Otherwise we move it to the inactive_dirty list.
+		 */
+		if (page->age && !page_over_rsslimit(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &zone->active_list);
+		} else {
+			deactivate_page_nolock(page);
+			if (++nr_deactivated > target) {
+				pte_chain_unlock(page);
+				unlock_page(page);
+				goto done;
+			}
+		}
+		pte_chain_unlock(page);
 		unlock_page(page);
 
-		/* effectively free the page here */
-		page_cache_release(page);
-
-		if (--nr_pages)
-			continue;
-		break;
+		/* Low latency reschedule point */
+		if (need_resched()) {
+			spin_unlock(&pagemap_lru_lock);
+			schedule();
+			spin_lock(&pagemap_lru_lock);
+		}
 	}
-	spin_unlock(&pagemap_lru_lock);
 
-	return nr_pages;
+done:
+	spin_unlock(&pagemap_lru_lock);
+	return nr_deactivated;
 }
 
-/*
- * This moves pages from the active list to
- * the inactive list.
+/**
+ * refill_inactive - checks all zones and refills the inactive list as needed
  *
- * We move them the other way when we see the
- * reference bit on the page.
+ * This function tries to balance page eviction from all zones by aging
+ * the pages from each zone in the same ratio until the global inactive
+ * shortage is resolved. After that it does one last "clean-up" scan to
+ * fix up local inactive shortages.
  */
-static void refill_inactive(int nr_pages)
+int refill_inactive(void)
 {
-	struct list_head * entry;
-
-	spin_lock(&pagemap_lru_lock);
-	entry = active_list.prev;
-	while (nr_pages-- && entry != &active_list) {
-		struct page * page;
+	int maxtry = 1 << DEF_PRIORITY;
+	zone_t * zone;
+	int ret = 0;
 
-		page = list_entry(entry, struct page, lru);
-		entry = entry->prev;
-		if (TestClearPageReferenced(page)) {
-			list_del(&page->lru);
-			list_add(&page->lru, &active_list);
-			continue;
+	/* Global balancing while we have a global shortage. */
+	while (maxtry-- && inactive_low(ALL_ZONES) >= 0) {
+		for_each_zone(zone) {
+			if (inactive_high(zone) >= 0)
+				ret += refill_inactive_zone(zone, DEF_PRIORITY);
 		}
+	}
 
-		del_page_from_active_list(page);
-		add_page_to_inactive_list(page);
-		SetPageReferenced(page);
+	/* Local balancing for zones which really need it. */
+	for_each_zone(zone) {
+		if (inactive_min(zone) >= 0)
+			ret += refill_inactive_zone(zone, 0);
 	}
-	spin_unlock(&pagemap_lru_lock);
+
+	return ret;
 }
 
-static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
+/**
+ * background_aging - slow background aging of zones
+ * @priority: priority at which to scan
+ *
+ * When the VM load is low or nonexistant, this function is
+ * called once a second to "sort" the pages in the VM. This
+ * way we know which pages to evict once a load spike happens.
+ * The effects of this function are very slow, the CPU usage
+ * should be minimal to nonexistant under most loads.
+ */
+static inline void background_aging(int priority)
 {
-	int chunk_size = nr_pages;
-	unsigned long ratio;
-	struct page_state ps;
-	int max_scan;
+	struct zone_struct * zone;
 
-	nr_pages -= kmem_cache_reap(gfp_mask);
-	if (nr_pages <= 0)
-		return 0;
+	for_each_zone(zone)
+		if (inactive_high(zone) > 0)
+			refill_inactive_zone(zone, priority);
+}
 
-	nr_pages = chunk_size;
+/*
+ * Worker function for kswapd and try_to_free_pages, we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages(unsigned int gfp_mask)
+{
+	int ret = 0;
 
 	/*
-	 * Try to keep the active list 2/3 of the size of the cache
+	 * Eat memory from filesystem page cache, 
+	 * dentry, inode and filesystem quota caches.
 	 */
-	get_page_state(&ps);
-	ratio = (unsigned long)nr_pages * ps.nr_active /
-				((ps.nr_inactive | 1) * 2);
-	refill_inactive(ratio);
-	max_scan = ps.nr_inactive / priority;
-	nr_pages = shrink_cache(nr_pages, classzone,
-				gfp_mask, priority, max_scan);
-	if (nr_pages <= 0)
-		return 0;
-
-	wakeup_bdflush();
-
-	shrink_dcache_memory(priority, gfp_mask);
-
-	/* After shrinking the dcache, get rid of unused inodes too .. */
-	shrink_icache_memory(1, gfp_mask);
+	ret += page_launder(gfp_mask);
+	ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+	ret += shrink_icache_memory(1, gfp_mask);
 #ifdef CONFIG_QUOTA
-	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+	ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif
+	/*
+	 * Move pages from the active list to the inactive list.
+	 */
+	refill_inactive();
 
-	return nr_pages;
-}
+	/* 	
+	 * Reclaim unused slab cache memory.
+	 */
+	ret += kmem_cache_reap(gfp_mask);
 
-int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
-{
-	int priority = DEF_PRIORITY;
-	int nr_pages = SWAP_CLUSTER_MAX;
+	refill_freelist();
 
-	do {
-		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
-		if (nr_pages <= 0)
-			return 1;
-	} while (--priority);
+	/* Start IO when needed. */
+	if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
+		blk_run_queues();
 
 	/*
 	 * Hmm.. Cache shrink failed - time to kill something?
 	 * Mhwahahhaha! This is the part I really like. Giggle.
 	 */
-	out_of_memory();
-	return 0;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-
-static int check_classzone_need_balance(zone_t * classzone)
-{
-	zone_t * first_classzone;
-
-	first_classzone = classzone->zone_pgdat->node_zones;
-	while (classzone >= first_classzone) {
-		if (classzone->free_pages > classzone->pages_high)
-			return 0;
-		classzone--;
-	}
-	return 1;
+	if (!ret && free_min(ANY_ZONE) > 0)
+		out_of_memory();
+	return ret;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+/**
+ * refill_freelist - move inactive_clean pages to free list if needed
+ *
+ * Move some pages from the inactive_clean lists to the free
+ * lists so atomic allocations have pages to work from. This
+ * function really only does something when we don't have a 
+ * userspace load on __alloc_pages().
+ *
+ * We refill the freelist in a bump from pages_min to pages_min * 2
+ * in order to give the buddy allocator something to play with.
+ */
+static void refill_freelist(void)
 {
-	int need_more_balance = 0, i;
+	struct page * page;
 	zone_t * zone;
 
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		cond_resched();
-		if (!zone->need_balance)
-			continue;
-		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
-			zone->need_balance = 0;
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
+	for_each_zone(zone) {
+		if (!zone->size || zone->free_pages >= zone->pages_min)
 			continue;
-		}
-		if (check_classzone_need_balance(zone))
-			need_more_balance = 1;
-		else
-			zone->need_balance = 0;
-	}
-
-	return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
-	int need_more_balance;
-	pg_data_t * pgdat;
-
-	do {
-		need_more_balance = 0;
-		pgdat = pgdat_list;
-		do
-			need_more_balance |= kswapd_balance_pgdat(pgdat);
-		while ((pgdat = pgdat->node_next));
-	} while (need_more_balance);
-}
-
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-	zone_t * zone;
-	int i;
 
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		if (!zone->need_balance)
-			continue;
-		return 0;
+		while (zone->free_pages < zone->pages_min * 2) {
+			page = reclaim_page(zone);
+			if (!page)
+				break;
+			__free_page(page);
+		}
 	}
-
-	return 1;
-}
-
-static int kswapd_can_sleep(void)
-{
-	pg_data_t * pgdat;
-
-	pgdat = pgdat_list;
-	do {
-		if (kswapd_can_sleep_pgdat(pgdat))
-			continue;
-		return 0;
-	} while ((pgdat = pgdat->node_next));
-
-	return 1;
 }
 
 /*
@@ -774,7 +666,6 @@
 int kswapd(void *unused)
 {
 	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
 
 	daemonize();
 	strcpy(tsk->comm, "kswapd");
@@ -798,26 +689,151 @@
 	 * Kswapd main loop.
 	 */
 	for (;;) {
+		static long recalc = 0;
 		if (current->flags & PF_FREEZE)
 			refrigerator(PF_IOTHREAD);
-		__set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kswapd_wait, &wait);
+		/*
+		 * We try to rebalance the VM either when we have a
+		 * global shortage of free pages or when one particular
+		 * zone is very short on free pages.
+		 */
+		if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0)
+			do_try_to_free_pages(GFP_KSWAPD);
 
-		mb();
-		if (kswapd_can_sleep())
-			schedule();
+		refill_freelist();
 
-		__set_current_state(TASK_RUNNING);
+		/* Once a second ... */
+		if (time_after(jiffies, recalc + HZ)) {
+			recalc = jiffies;
+
+			/* Do background page aging. */
+			background_aging(DEF_PRIORITY);
+		}
+
+		wakeup_memwaiters();
+	}
+}
+
+static int kswapd_overloaded;
+unsigned int kswapd_minfree; /* initialized in mm/page_alloc.c */
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+
+/**
+ * wakeup_kswapd - wake up the pageout daemon
+ * gfp_mask: page freeing flags
+ *
+ * This function wakes up kswapd and can, under heavy VM pressure,
+ * put the calling task to sleep temporarily.
+ */
+void wakeup_kswapd(unsigned int gfp_mask)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	/* If we're in the memory freeing business ourself, don't sleep
+	 * but just wake kswapd and go back to businesss.
+	 */
+	if (current->flags & PF_MEMALLOC) {
+		wake_up_interruptible(&kswapd_wait);
+		return;
+	}
+
+	/* We need all of kswapd's GFP flags, otherwise we can't sleep on it.
+	 * We still wake kswapd of course.
+	 */
+	if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) {
+		wake_up_interruptible(&kswapd_wait);
+		return;
+	}
+	
+	add_wait_queue(&kswapd_done, &wait);
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        
+        /* Wake kswapd .... */
+        wake_up_interruptible(&kswapd_wait);
+        
+        /* ... and check if we need to wait on it */
+	if ((free_low(ALL_ZONES) > (kswapd_minfree / 2)) && !kswapd_overloaded)
+		schedule();
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&kswapd_done, &wait);
+}
+
+static void wakeup_memwaiters(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	        
+	add_wait_queue(&kswapd_wait, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* Don't let the processes waiting on memory get stuck, ever. */
+	wake_up(&kswapd_done);
+
+	/* Enough free RAM, we can easily keep up with memory demand. */
+	if (free_high(ALL_ZONES) <= 0) {
+		schedule_timeout(HZ);
 		remove_wait_queue(&kswapd_wait, &wait);
+		return;
+	}
+	remove_wait_queue(&kswapd_wait, &wait);
 
-		/*
-		 * If we actually get into a low-memory situation,
-		 * the processes needing more memory will wake us
-		 * up on a more timely basis.
-		 */
-		kswapd_balance();
-		blk_run_queues();
+	/* OK, the VM is very loaded. Sleep instead of using all CPU. */
+	kswapd_overloaded = 1;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(HZ / 4);
+	kswapd_overloaded = 0;
+	return;
+}
+
+/**
+ * try_to_free_pages - run the pageout code ourselves
+ * gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * When the load on the system gets higher, it can happen
+ * that kswapd no longer manages to keep enough memory
+ * free. In those cases user programs allocating memory
+ * will call try_to_free_pages() and help the pageout code.
+ * This has the effects of freeing memory and slowing down
+ * the largest memory hogs a bit.
+ */
+int try_to_free_pages(unsigned int gfp_mask)
+{
+	int ret = 1;
+	if (gfp_mask & __GFP_WAIT) {
+		current->flags |= PF_MEMALLOC;
+		ret = do_try_to_free_pages(gfp_mask);
+		current->flags &= ~PF_MEMALLOC;
 	}
+	return ret;
+}
+
+/**
+ * rss_free_pages - run part of the pageout code and slow down a bit
+ * @gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * This function is called when a task is over its RSS limit and
+ * has a page fault.  It's goal is to free some memory so non-hogs
+ * can run faster and slow down itself when needed so it won't eat
+ * the memory non-hogs can use.
+ */
+void rss_free_pages(unsigned int gfp_mask)
+{
+	long pause = 0;
+	if (current->flags & PF_MEMALLOC)
+		return;
+	current->flags |= PF_MEMALLOC;
+
+	do {
+		page_launder(gfp_mask);
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(pause);
+		set_current_state(TASK_RUNNING);
+		pause++;
+	} while (free_high(ALL_ZONES) >= 0);
+
+	current->flags &= ~PF_MEMALLOC;
+	return;
 }
 
 static int __init kswapd_init(void)