diff -uNr linux-2.5.28/Makefile linux-2.5.28-rmap/Makefile
--- linux-2.5.28/Makefile	Wed Jul 24 20:43:47 2002
+++ linux-2.5.28-rmap/Makefile	Wed Jul 24 20:47:13 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 5
 SUBLEVEL = 28
-EXTRAVERSION =
+EXTRAVERSION = -rmap
 
 # *DOCUMENTATION*
 # Too see a list of typical targets execute "make help"
diff -uNr linux-2.5.28/fs/buffer.c linux-2.5.28-rmap/fs/buffer.c
--- linux-2.5.28/fs/buffer.c	Wed Jul 24 20:43:53 2002
+++ linux-2.5.28-rmap/fs/buffer.c	Wed Jul 24 20:47:19 2002
@@ -463,17 +463,13 @@
 }
 
 /*
- * FIXME: What is this function actually trying to do?  Why "zones[0]"?
+ * FIXME: What is this function actually trying to do? 
  * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
  */
 static void free_more_memory(void)
 {
-	zone_t *zone;
-
-	zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
-
 	wakeup_bdflush();
-	try_to_free_pages(zone, GFP_NOFS, 0);
+	try_to_free_pages(GFP_NOFS);
 	blk_run_queues();
 	yield();
 }
diff -uNr linux-2.5.28/fs/dcache.c linux-2.5.28-rmap/fs/dcache.c
--- linux-2.5.28/fs/dcache.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/fs/dcache.c	Tue Jul 23 19:12:41 2002
@@ -603,8 +603,7 @@
 	count = dentry_stat.nr_unused / priority;
 
 	prune_dcache(count);
-	kmem_cache_shrink(dentry_cache);
-	return 0;
+	return kmem_cache_shrink(dentry_cache);
 }
 
 #define NAME_ALLOC_LEN(len)	((len+16) & ~15)
diff -uNr linux-2.5.28/fs/dquot.c linux-2.5.28-rmap/fs/dquot.c
--- linux-2.5.28/fs/dquot.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/fs/dquot.c	Tue Jul 23 19:12:41 2002
@@ -498,8 +498,7 @@
 	count = dqstats.free_dquots / priority;
 	prune_dqcache(count);
 	unlock_kernel();
-	kmem_cache_shrink(dquot_cachep);
-	return 0;
+	return kmem_cache_shrink(dquot_cachep);
 }
 
 /*
diff -uNr linux-2.5.28/fs/inode.c linux-2.5.28-rmap/fs/inode.c
--- linux-2.5.28/fs/inode.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/fs/inode.c	Tue Jul 23 19:12:41 2002
@@ -431,8 +431,7 @@
 	count = inodes_stat.nr_unused / priority;
 
 	prune_icache(count);
-	kmem_cache_shrink(inode_cachep);
-	return 0;
+	return kmem_cache_shrink(inode_cachep);
 }
 
 /*
diff -uNr linux-2.5.28/fs/mpage.c linux-2.5.28-rmap/fs/mpage.c
--- linux-2.5.28/fs/mpage.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/fs/mpage.c	Tue Jul 23 19:11:09 2002
@@ -557,12 +557,11 @@
 
 		if (page->mapping && !PageWriteback(page) &&
 					TestClearPageDirty(page)) {
-			/* FIXME: batch this up */
-			if (!PageActive(page) && PageLRU(page)) {
+			if (PageInactiveDirty(page)) {
 				spin_lock(&pagemap_lru_lock);
-				if (!PageActive(page) && PageLRU(page)) {
+				if (PageInactiveDirty(page)) {
 					list_del(&page->lru);
-					list_add(&page->lru, &inactive_list);
+					list_add(&page->lru, &page_zone(page)->inactive_dirty_list);
 				}
 				spin_unlock(&pagemap_lru_lock);
 			}
diff -uNr linux-2.5.28/fs/proc/proc_misc.c linux-2.5.28-rmap/fs/proc/proc_misc.c
--- linux-2.5.28/fs/proc/proc_misc.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/fs/proc/proc_misc.c	Wed Jul 24 21:02:05 2002
@@ -27,6 +27,7 @@
 #include <linux/ioport.h>
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
@@ -151,7 +152,9 @@
 		"Cached:       %8lu kB\n"
 		"SwapCached:   %8lu kB\n"
 		"Active:       %8lu kB\n"
-		"Inactive:     %8lu kB\n"
+		"Inact_dirty:  %8lu kB\n"
+		"Inact_clean:  %8lu kB\n"
+		"Inact_target: %8lu kB\n"
 		"HighTotal:    %8lu kB\n"
 		"HighFree:     %8lu kB\n"
 		"LowTotal:     %8lu kB\n"
@@ -161,15 +164,16 @@
 		"Dirty:        %8lu kB\n"
 		"Writeback:    %8lu kB\n"
 		"PageTables:   %8lu kB\n"
-		"PteChainTot:  %8lu kB\n"
-		"PteChainUsed: %8lu kB\n",
+		"ReverseMaps:  %8lu\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.sharedram),
 		K(ps.nr_pagecache-swapper_space.nrpages),
 		K(swapper_space.nrpages),
-		K(ps.nr_active),
-		K(ps.nr_inactive),
+		K(ps.nr_active_pages),
+		K(ps.nr_inactive_dirty_pages),
+		K(ps.nr_inactive_clean_pages),
+		K(inactive_target()),
 		K(i.totalhigh),
 		K(i.freehigh),
 		K(i.totalram-i.totalhigh),
@@ -179,8 +183,7 @@
 		K(ps.nr_dirty),
 		K(ps.nr_writeback),
 		K(ps.nr_page_table_pages),
-		K(ps.nr_pte_chain_pages),
-		ps.used_pte_chains_bytes >> 10
+		ps.nr_reverse_maps
 		);
 
 	return proc_calc_metrics(page, start, off, count, eof, len);
diff -uNr linux-2.5.28/include/linux/init_task.h linux-2.5.28-rmap/include/linux/init_task.h
--- linux-2.5.28/include/linux/init_task.h	Wed Jul 24 20:43:54 2002
+++ linux-2.5.28-rmap/include/linux/init_task.h	Wed Jul 24 20:47:21 2002
@@ -27,6 +27,7 @@
 	mmap_sem:	__RWSEM_INITIALIZER(name.mmap_sem), \
 	page_table_lock: SPIN_LOCK_UNLOCKED, 		\
 	mmlist:		LIST_HEAD_INIT(name.mmlist),	\
+	rlimit_rss:	RLIM_INFINITY,			\
 }
 
 #define INIT_SIGNALS {	\
diff -uNr linux-2.5.28/include/linux/mm.h linux-2.5.28-rmap/include/linux/mm.h
--- linux-2.5.28/include/linux/mm.h	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/include/linux/mm.h	Tue Jul 23 19:09:15 2002
@@ -19,9 +19,6 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-/* The inactive_clean lists are per zone. */
-extern struct list_head active_list;
-extern struct list_head inactive_list;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -157,10 +154,10 @@
 					   updated asynchronously */
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by pagemap_lru_lock !! */
-	union {
-		struct pte_chain * chain;	/* Reverse pte mapping pointer.
-					 * protected by PG_chainlock */
-		pte_t		 * direct;
+	unsigned char age;		/* Page aging counter. */
+	union {				/* Reverse pte mapping pointer, */
+		struct pte_chain * chain;	/* protected by PG_chainlock */
+		pte_t		 * direct;	
 	} pte;
 	unsigned long private;		/* mapping-private opaque data */
 
@@ -299,13 +296,17 @@
 
 #define page_address(page) ((page)->virtual)
 
-#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
+#elif defined(CONFIG_DISCONTIGMEM)
+
+extern unsigned long page_address(struct page * page);
+
+#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */
 
 #define page_address(page)						\
 	__va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT)	\
 			+ page_zone(page)->zone_start_paddr)
 
-#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
+#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */
 
 /*
  * Error return values for the *_nopage functions
@@ -326,6 +327,7 @@
 /* The array of struct pages */
 extern struct page *mem_map;
 
+extern void FASTCALL(fixup_freespace(struct zone_struct *, int));
 extern void show_free_areas(void);
 extern void show_free_areas_node(pg_data_t *pgdat);
 
diff -uNr linux-2.5.28/include/linux/mm_inline.h linux-2.5.28-rmap/include/linux/mm_inline.h
--- linux-2.5.28/include/linux/mm_inline.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.28-rmap/include/linux/mm_inline.h	Tue Jul 23 19:09:19 2002
@@ -0,0 +1,278 @@
+#ifndef _LINUX_MM_INLINE_H
+#define _LINUX_MM_INLINE_H
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+
+/*
+ * These inline functions tend to need bits and pieces of all the
+ * other VM include files, meaning they cannot be defined inside
+ * one of the other VM include files.
+ *
+ * The include file mess really needs to be cleaned up...
+ */
+
+static inline void add_page_to_active_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageActive(page);
+	list_add(&page->lru, &zone->active_list);
+	zone->active_pages++;
+	inc_page_state(nr_active_pages);
+}
+
+static inline void add_page_to_inactive_dirty_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageInactiveDirty(page);
+	list_add(&page->lru, &zone->inactive_dirty_list);
+	zone->inactive_dirty_pages++;
+	inc_page_state(nr_inactive_dirty_pages);
+}
+
+static inline void add_page_to_inactive_clean_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	DEBUG_LRU_PAGE(page);
+	SetPageInactiveClean(page);
+	list_add(&page->lru, &zone->inactive_clean_list);
+	zone->inactive_clean_pages++;
+	inc_page_state(nr_inactive_clean_pages);
+}
+
+static inline void del_page_from_active_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageActive(page);
+	dec_page_state(nr_active_pages);
+	zone->active_pages--;
+	KERNEL_STAT_INC(pgdeactivate);
+	DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_dirty_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageInactiveDirty(page);
+	dec_page_state(nr_inactive_dirty_pages);
+	zone->inactive_dirty_pages--;
+	DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_clean_list(struct page * page)
+{
+	struct zone_struct * zone = page_zone(page);
+	list_del(&page->lru);
+	ClearPageInactiveClean(page);
+	zone->inactive_clean_pages--;
+	dec_page_state(nr_inactive_clean_pages);
+	DEBUG_LRU_PAGE(page);
+}
+
+/*
+ * Inline functions to control some balancing in the VM.
+ *
+ * Note that we do both global and per-zone balancing, with
+ * most of the balancing done globally.
+ */
+#define	PLENTY_FACTOR	2
+#define	ALL_ZONES	NULL
+#define	ANY_ZONE	(struct zone_struct *)(~0UL)
+#define INACTIVE_FACTOR	5
+
+#define	VM_MIN	0
+#define	VM_LOW	1
+#define	VM_HIGH	2
+#define VM_PLENTY 3
+static inline int zone_free_limit(struct zone_struct * zone, int limit)
+{
+	int free, target, delta;
+
+	/* This is really nasty, but GCC should completely optimise it away. */
+	if (limit == VM_MIN)
+		target = zone->pages_min;
+	else if (limit == VM_LOW)
+		target = zone->pages_low;
+	else if (limit == VM_HIGH)
+		target = zone->pages_high;
+	else
+		target = zone->pages_high * PLENTY_FACTOR;
+
+	free = zone->free_pages + zone->inactive_clean_pages;
+	delta = target - free;
+
+	return delta;
+}
+
+static inline int free_limit(struct zone_struct * zone, int limit)
+{
+	int shortage = 0, local;
+
+	if (zone == ALL_ZONES) {
+		for_each_zone(zone)
+			shortage += zone_free_limit(zone, limit);
+	} else if (zone == ANY_ZONE) {
+		for_each_zone(zone) {
+			local = zone_free_limit(zone, limit);
+			shortage += max(local, 0);
+		}
+	} else {
+		shortage = zone_free_limit(zone, limit);
+	}
+
+	return shortage;
+}
+
+/**
+ * free_min - test for critically low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a serious shortage of free and
+ * clean pages, zero or negative if there is no serious shortage.
+ */
+static inline int free_min(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_MIN);
+}
+
+/**
+ * free_low - test for low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a shortage of free and
+ * clean pages, zero or negative if there is no shortage.
+ */
+static inline int free_low(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_LOW);
+}
+
+/**
+ * free_high - test if amount of free pages is less than ideal
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free and clean
+ * pages is below kswapd's target, zero or negative if we
+ * have more than enough free and clean pages.
+ */
+static inline int free_high(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_HIGH);
+}
+
+/**
+ * free_plenty - test if enough pages are freed
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free + clean pages
+ * in a zone is not yet excessive and kswapd is still allowed to
+ * free pages here, a negative value if kswapd should leave the
+ * zone alone.
+ */
+static inline int free_plenty(struct zone_struct * zone)
+{
+	return free_limit(zone, VM_PLENTY);
+}
+
+/*
+ * The inactive page target is pages_high plus a share of the (active +
+ * inactive dirty) pages: 10% for VM_MIN, 20% for VM_LOW, 40% otherwise.
+ */
+static inline int zone_inactive_limit(struct zone_struct * zone, int limit)
+{
+	int inactive, target, inactive_base;
+
+	inactive_base = zone->active_pages + zone->inactive_dirty_pages;
+	inactive_base /= INACTIVE_FACTOR;
+
+	/* GCC should optimise this away completely. */
+	if (limit == VM_MIN)
+		target = zone->pages_high + inactive_base / 2;
+	else if (limit == VM_LOW)
+		target = zone->pages_high + inactive_base;
+	else
+		target = zone->pages_high + inactive_base * 2;
+
+	inactive = zone->free_pages + zone->inactive_clean_pages;
+	inactive += zone->inactive_dirty_pages;
+
+	return target - inactive;
+}
+
+static inline int inactive_limit(struct zone_struct * zone, int limit)
+{
+	int shortage = 0, local;
+
+	if (zone == ALL_ZONES) {
+		for_each_zone(zone)
+			shortage += zone_inactive_limit(zone, limit);
+	} else if (zone == ANY_ZONE) {
+		for_each_zone(zone) {
+			local = zone_inactive_limit(zone, limit);
+			shortage += max(local, 0);
+		}
+	} else {
+		shortage = zone_inactive_limit(zone, limit);
+	}
+
+	return shortage;
+}
+
+/**
+ * inactive_min - test for serious shortage of (free + inactive) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no serious shortage of (free + inactive) pages
+ */
+static inline int inactive_min(struct zone_struct * zone)
+{
+	return inactive_limit(zone, VM_MIN);
+}
+
+/**
+ * inactive_low - test for shortage of (free + inactive) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no shortage of (free + inactive) pages
+ */
+static inline int inactive_low(struct zone_struct * zone)
+{
+	return inactive_limit(zone, VM_LOW);
+}
+
+/**
+ * inactive_high - less than ideal amount of (free + inactive) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have more than enough (free + inactive) pages
+ */
+static inline int inactive_high(struct zone_struct * zone)
+{
+	return inactive_limit(zone, VM_HIGH);
+}
+
+/*
+ * inactive_target - number of inactive pages we ought to have.
+ */
+static inline unsigned long inactive_target(void)
+{
+	struct page_state ps;
+	int target;
+
+	get_page_state(&ps);
+	target = ps.nr_active_pages + ps.nr_inactive_dirty_pages
+			+ ps.nr_inactive_clean_pages;
+
+	target /= INACTIVE_FACTOR;
+
+	return target;
+}
+
+#endif /* _LINUX_MM_INLINE_H */
diff -uNr linux-2.5.28/include/linux/mmzone.h linux-2.5.28-rmap/include/linux/mmzone.h
--- linux-2.5.28/include/linux/mmzone.h	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/include/linux/mmzone.h	Tue Jul 23 19:09:15 2002
@@ -26,6 +26,8 @@
 
 struct pglist_data;
 
+#define MAX_CHUNKS_PER_NODE 8
+#define MAX_CHUNKS_PER_ZONE MAX_CHUNKS_PER_NODE	/* sizes the per-zone chunk tables */
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -40,12 +42,17 @@
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
-	int			need_balance;
+	unsigned long		active_pages;
+	unsigned long		inactive_dirty_pages;
+	unsigned long		inactive_clean_pages;
+	unsigned long		pages_min, pages_low, pages_high, pages_plenty;
 
 	/*
 	 * free areas of different sizes
 	 */
+	struct list_head	active_list;
+	struct list_head	inactive_dirty_list;
+	struct list_head	inactive_clean_list;
 	free_area_t		free_area[MAX_ORDER];
 
 	/*
@@ -81,6 +88,13 @@
 	 */
 	struct pglist_data	*zone_pgdat;
 	struct page		*zone_mem_map;
+
+#if defined(CONFIG_DISCONTIGMEM) && !defined(CONFIG_HIGHMEM)
+	struct page             *zone_chunk_page_start[MAX_CHUNKS_PER_ZONE + 1];
+	unsigned long           zone_chunk_phys_start[MAX_CHUNKS_PER_ZONE];
+	unsigned int            zone_nr_chunks;
+#endif
+
 	unsigned long		zone_start_paddr;
 	unsigned long		zone_start_mapnr;
 
@@ -142,15 +156,6 @@
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
-static inline int memclass(zone_t *pgzone, zone_t *classzone)
-{
-	if (pgzone->zone_pgdat != classzone->zone_pgdat)
-		return 0;
-	if (pgzone > classzone)
-		return 0;
-	return 1;
-}
-
 /*
  * The following two are not meant for general usage. They are here as
  * prototypes for the discontig memory code.
@@ -163,6 +168,60 @@
 
 extern pg_data_t contig_page_data;
 
+/**
+ * for_each_pgdat - helper macro to iterate over all nodes
+ * @pgdat: pg_data_t * variable
+ *
+ * Meant to help with common loops of the form
+ * pgdat = pgdat_list;
+ * while(pgdat) {
+ * 	...
+ * 	pgdat = pgdat->node_next;
+ * }
+ */
+#define for_each_pgdat(pgdat) \
+	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ * Thanks to William Lee Irwin III for this piece of ingenuity.
+ */
+static inline zone_t *next_zone(zone_t *zone)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
+		zone++;
+
+	else if (pgdat->node_next) {
+		pgdat = pgdat->node_next;
+		zone = pgdat->node_zones;
+	} else
+		zone = NULL;
+
+	return zone;
+}
+
+/**
+ * for_each_zone - helper macro to iterate over all memory zones
+ * @zone: zone_t * variable
+ *
+ * The user only needs to declare the zone variable, for_each_zone
+ * fills it in. This basically means for_each_zone() is an
+ * easier to read version of this piece of code:
+ *
+ * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) {
+ * 	for(i = 0; i < MAX_NR_ZONES; ++i) {
+ * 		zone_t * z = pgdat->node_zones + i;
+ * 		...
+ * 	}
+ * }
+ */
+#define for_each_zone(zone) \
+	for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
+
+
 #ifndef CONFIG_DISCONTIGMEM
 
 #define NODE_DATA(nid)		(&contig_page_data)
diff -uNr linux-2.5.28/include/linux/page-flags.h linux-2.5.28-rmap/include/linux/page-flags.h
--- linux-2.5.28/include/linux/page-flags.h	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/include/linux/page-flags.h	Tue Jul 23 19:09:16 2002
@@ -47,7 +47,7 @@
  * locked- and dirty-page accounting.  The top eight bits of page->flags are
  * used for page->zone, so putting flag bits there doesn't work.
  */
-#define PG_locked	 	 0	/* Page is locked. Don't touch. */
+#define PG_locked		 0	/* Page is locked. Don't touch. */
 #define PG_error		 1
 #define PG_referenced		 2
 #define PG_uptodate		 3
@@ -55,20 +55,20 @@
 #define PG_dirty_dontuse	 4
 #define PG_lru			 5
 #define PG_active		 6
-#define PG_slab			 7	/* slab debug (Suparna wants this) */
-
-#define PG_highmem		 8
-#define PG_checked		 9	/* kill me in 2.5.<early>. */
-#define PG_arch_1		10
-#define PG_reserved		11
-
-#define PG_private		12	/* Has something at ->private */
-#define PG_writeback		13	/* Page is under writeback */
-#define PG_nosave		14	/* Used for system suspend/resume */
-#define PG_chainlock		15	/* lock bit for ->pte_chain */
-
-#define PG_direct		16	/* ->pte_chain points directly at pte */
-
+#define PG_inactive_clean	 7
+#define PG_inactive_dirty	 8
+#define PG_slab			 9	/* slab debug (Suparna wants this) */
+
+#define PG_highmem		10
+#define PG_checked		11	/* kill me in 2.5.<early>. */
+#define PG_arch_1		12
+#define PG_reserved		13
+
+#define PG_private		14	/* Has something at ->private */
+#define PG_writeback		15	/* Page is under writeback */
+#define PG_nosave		16	/* Used for system suspend/resume */
+#define PG_chainlock		17	/* lock bit for ->pte_chain */
+#define PG_direct		18	/* ->pte_chain points directly at pte */
 /*
  * Global page accounting.  One instance per CPU.
  */
@@ -76,11 +76,11 @@
 	unsigned long nr_dirty;
 	unsigned long nr_writeback;
 	unsigned long nr_pagecache;
-	unsigned long nr_active;	/* on active_list LRU */
-	unsigned long nr_inactive;	/* on inactive_list LRU */
+	unsigned long nr_active_pages;	/* on active_list LRU */
+	unsigned long nr_inactive_clean_pages; /* on inactive_clean_list LRU */
+	unsigned long nr_inactive_dirty_pages; /* on inactive_dirty_list LRU */
 	unsigned long nr_page_table_pages;
-	unsigned long nr_pte_chain_pages;
-	unsigned long used_pte_chains_bytes;
+	unsigned long nr_reverse_maps;
 } ____cacheline_aligned_in_smp page_states[NR_CPUS];
 
 extern void get_page_state(struct page_state *ret);
@@ -156,12 +156,22 @@
 	})
 
 #define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
-#define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
-#define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
+#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
+#define ClearPageLRU(page)	clear_bit(PG_lru, &(page)->flags)
 
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
+#define TestandSetPageActive(page)	test_and_set_bit(PG_active, &(page)->flags)
+#define TestandClearPageActive(page)	test_and_clear_bit(PG_active, &(page)->flags)
+
+#define PageInactiveDirty(page)	test_bit(PG_inactive_dirty, &(page)->flags)
+#define SetPageInactiveDirty(page)	set_bit(PG_inactive_dirty, &(page)->flags)
+#define ClearPageInactiveDirty(page)	clear_bit(PG_inactive_dirty, &(page)->flags)
+
+#define PageInactiveClean(page)	test_bit(PG_inactive_clean, &(page)->flags)
+#define SetPageInactiveClean(page)	set_bit(PG_inactive_clean, &(page)->flags)
+#define ClearPageInactiveClean(page)	clear_bit(PG_inactive_clean, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
diff -uNr linux-2.5.28/include/linux/sched.h linux-2.5.28-rmap/include/linux/sched.h
--- linux-2.5.28/include/linux/sched.h	Wed Jul 24 20:43:54 2002
+++ linux-2.5.28-rmap/include/linux/sched.h	Wed Jul 24 20:47:21 2002
@@ -190,6 +190,7 @@
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
 	unsigned long swap_address;
+	unsigned long rlimit_rss;
 
 	unsigned dumpable:1;
 
@@ -269,9 +270,6 @@
 	struct list_head tasks;
 
 	struct mm_struct *mm, *active_mm;
-	struct list_head local_pages;
-
-	unsigned int allocation_order, nr_local_pages;
 
 /* task state */
 	struct linux_binfmt *binfmt;
diff -uNr linux-2.5.28/include/linux/swap.h linux-2.5.28-rmap/include/linux/swap.h
--- linux-2.5.28/include/linux/swap.h	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/include/linux/swap.h	Tue Jul 23 19:09:15 2002
@@ -161,12 +161,19 @@
 extern void FASTCALL(lru_cache_del(struct page *));
 
 extern void FASTCALL(activate_page(struct page *));
+extern void FASTCALL(activate_page_nolock(struct page *));
+extern void FASTCALL(deactivate_page(struct page *));
+extern void FASTCALL(deactivate_page_nolock(struct page *));
+extern void FASTCALL(drop_page(struct page *));
 
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
+extern struct page * FASTCALL(reclaim_page(zone_t *));
 extern wait_queue_head_t kswapd_wait;
-extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
+extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask));
+extern void wakeup_kswapd(unsigned int);
+extern void rss_free_pages(unsigned int);
 
 /* linux/mm/page_io.c */
 int swap_readpage(struct file *file, struct page *page);
@@ -220,43 +227,26 @@
 extern void FASTCALL(mark_page_accessed(struct page *));
 
 /*
+ * Page aging defines. These seem to work great in FreeBSD,
+ * no need to reinvent the wheel.
+ */
+#define PAGE_AGE_START 5
+#define PAGE_AGE_ADV 3
+#define PAGE_AGE_DECL 1
+#define PAGE_AGE_MAX 64
+
+/*
  * List add/del helper macros. These must be called
  * with the pagemap_lru_lock held!
  */
 #define DEBUG_LRU_PAGE(page)			\
 do {						\
-	if (!PageLRU(page))			\
-		BUG();				\
 	if (PageActive(page))			\
 		BUG();				\
-} while (0)
-
-#define add_page_to_active_list(page)		\
-do {						\
-	DEBUG_LRU_PAGE(page);			\
-	SetPageActive(page);			\
-	list_add(&(page)->lru, &active_list);	\
-	inc_page_state(nr_active);		\
-} while (0)
-
-#define add_page_to_inactive_list(page)		\
-do {						\
-	DEBUG_LRU_PAGE(page);			\
-	list_add(&(page)->lru, &inactive_list);	\
-	inc_page_state(nr_inactive);		\
-} while (0)
-
-#define del_page_from_active_list(page)		\
-do {						\
-	list_del(&(page)->lru);			\
-	ClearPageActive(page);			\
-	dec_page_state(nr_active);		\
-} while (0)
-
-#define del_page_from_inactive_list(page)	\
-do {						\
-	list_del(&(page)->lru);			\
-	dec_page_state(nr_inactive);		\
+	if (PageInactiveDirty(page))		\
+		BUG();				\
+	if (PageInactiveClean(page))		\
+		BUG();				\
 } while (0)
 
 extern spinlock_t swaplock;
diff -uNr linux-2.5.28/init/main.c linux-2.5.28-rmap/init/main.c
--- linux-2.5.28/init/main.c	Wed Jul 24 20:43:54 2002
+++ linux-2.5.28-rmap/init/main.c	Wed Jul 24 20:47:21 2002
@@ -70,7 +70,7 @@
 extern void sysctl_init(void);
 extern void signals_init(void);
 extern void buffer_init(void);
-
+extern void pte_chain_init(void);
 extern void radix_tree_init(void);
 extern void free_initmem(void);
 
@@ -386,7 +386,7 @@
 	mem_init();
 	kmem_cache_sizes_init();
 	pgtable_cache_init();
-
+	pte_chain_init();
 	mempages = num_physpages;
 
 	fork_init(mempages);
diff -uNr linux-2.5.28/kernel/fork.c linux-2.5.28-rmap/kernel/fork.c
--- linux-2.5.28/kernel/fork.c	Wed Jul 24 20:43:54 2002
+++ linux-2.5.28-rmap/kernel/fork.c	Wed Jul 24 20:47:21 2002
@@ -703,8 +703,6 @@
 	p->start_time = jiffies;
 	p->security = NULL;
 
-	INIT_LIST_HEAD(&p->local_pages);
-
 	retval = -ENOMEM;
 	if (security_ops->task_alloc_security(p))
 		goto bad_fork_cleanup;
diff -uNr linux-2.5.28/kernel/suspend.c linux-2.5.28-rmap/kernel/suspend.c
--- linux-2.5.28/kernel/suspend.c	Wed Jul 24 20:08:57 2002
+++ linux-2.5.28-rmap/kernel/suspend.c	Wed Jul 24 21:17:06 2002
@@ -611,7 +611,7 @@
 static void free_some_memory(void)
 {
 	printk("Freeing memory: ");
-	while (try_to_free_pages(&contig_page_data.node_zones[ZONE_HIGHMEM], GFP_KSWAPD, 0))
+	while (try_to_free_pages(GFP_KSWAPD))
 		printk(".");
 	printk("|\n");
 }
diff -uNr linux-2.5.28/kernel/sys.c linux-2.5.28-rmap/kernel/sys.c
--- linux-2.5.28/kernel/sys.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/kernel/sys.c	Tue Jul 23 19:11:14 2002
@@ -1166,6 +1166,12 @@
 	if (resource == RLIMIT_NOFILE) {
 		if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
 			return -EPERM;
+	} else if (resource == RLIMIT_RSS && current->mm) {
+		/* rlimit is specified in bytes, convert to pages */
+		unsigned long pages = RLIM_INFINITY;
+		if (new_rlim.rlim_cur != RLIM_INFINITY)
+			pages = new_rlim.rlim_cur >> PAGE_SHIFT;
+		current->mm->rlimit_rss = pages;
 	}
 
 	retval = security_ops->task_setrlimit(resource, &new_rlim);
diff -uNr linux-2.5.28/mm/bootmem.c linux-2.5.28-rmap/mm/bootmem.c
--- linux-2.5.28/mm/bootmem.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/bootmem.c	Tue Jul 23 19:11:14 2002
@@ -339,12 +339,11 @@
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
-	while (pgdat) {
+	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
 						align, goal)))
 			return(ptr);
-		pgdat = pgdat->node_next;
-	}
+
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
 	 */
diff -uNr linux-2.5.28/mm/filemap.c linux-2.5.28-rmap/mm/filemap.c
--- linux-2.5.28/mm/filemap.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/filemap.c	Tue Jul 23 19:11:14 2002
@@ -13,8 +13,8 @@
 #include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
-#include <linux/kernel_stat.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
@@ -529,7 +529,7 @@
  *
  * In the case of swapcache, try_to_swap_out() has already locked the page, so
  * SetPageLocked() is ugly-but-OK there too.  The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * set up by add_to_swap().
  */
 int add_to_page_cache(struct page *page,
 		struct address_space *mapping, unsigned long offset)
@@ -848,15 +848,23 @@
 /*
  * Mark a page as having seen activity.
  *
- * inactive,unreferenced	->	inactive,referenced
- * inactive,referenced		->	active,unreferenced
- * active,unreferenced		->	active,referenced
+ * We immediately reactivate
+ * the inactive clean pages because those are counted as freeable.
+ * We don't modify the inactive dirty ones because we're never sure
+ * if those are freeable anyway.
  */
 void mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page)) {
+	if (PageInactiveClean(page)) {
+		struct zone_struct *zone = page_zone(page);
+		int free = zone->free_pages + zone->inactive_clean_pages;
+
 		activate_page(page);
-		ClearPageReferenced(page);
+		if (free < zone->pages_low)
+			wakeup_kswapd(GFP_NOIO);
+		if (zone->free_pages < zone->pages_min)
+			fixup_freespace(zone, 1);
+
 		return;
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
@@ -1245,7 +1253,7 @@
 
 	/* Limit it to a sane percentage of the inactive list.. */
 	get_page_state(&ps);
-	max = ps.nr_inactive / 2;
+	max = ps.nr_inactive_clean_pages / 2;
 	if (nr > max)
 		nr = max;
 
@@ -2060,16 +2068,18 @@
 	}
 
 	do {
-		unsigned long index;
-		unsigned long offset;
+		unsigned long index, offset;
 		long page_fault;
 		char *kaddr;
+		int deactivate = 1;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
+		if (bytes > count) {
 			bytes = count;
+			deactivate = 0;
+		}
 
 		/*
 		 * Bring in the user page that we will copy from _first_.
@@ -2119,9 +2129,11 @@
 			}
 		}
 		kunmap(page);
-		if (!PageReferenced(page))
-			SetPageReferenced(page);
 		unlock_page(page);
+		if (deactivate)
+			deactivate_page(page);
+		else
+			mark_page_accessed(page);
 		page_cache_release(page);
 		if (status < 0)
 			break;
diff -uNr linux-2.5.28/mm/memory.c linux-2.5.28-rmap/mm/memory.c
--- linux-2.5.28/mm/memory.c	Wed Jul 24 20:43:54 2002
+++ linux-2.5.28-rmap/mm/memory.c	Wed Jul 24 20:47:21 2002
@@ -36,7 +36,6 @@
  *		(Gerhard.Wichert@pdb.siemens.de)
  */
 
-#include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
@@ -45,6 +44,7 @@
 #include <linux/iobuf.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/mm_inline.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -1127,6 +1127,10 @@
 	struct page *new_page;
 	unsigned long offset;
 
+	/* free_low() > 0 means a shortage: don't make things worse. */
+	if (free_low(ALL_ZONES) > 0)
+		return;
+
 	/*
 	 * Get the number of handles we should do readahead io to.
 	 */
@@ -1319,6 +1323,8 @@
 		new_page = page;
 	}
 
+	mark_page_accessed(new_page);
+
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 
@@ -1421,6 +1427,14 @@
 	current->state = TASK_RUNNING;
 	pgd = pgd_offset(mm, address);
 
+	/* 
+	 * If we are over our RSS limit and the system needs memory,
+	 * we will free memory for the non-hogs and slow down a bit.
+	 */
+	if (mm->rlimit_rss && mm->rss > mm->rlimit_rss &&
+					free_high(ALL_ZONES) > 0)
+		rss_free_pages(GFP_HIGHUSER);
+
 	KERNEL_STAT_INC(pgfault);
 	/*
 	 * We need the page table lock to synchronize with kswapd
@@ -1457,6 +1471,7 @@
 	if (!new)
 		return NULL;
 
+	KERNEL_STAT_INC(pgfault);
 	/*
 	 * Because we dropped the lock, we should re-check the
 	 * entry, as somebody else could have populated it..
diff -uNr linux-2.5.28/mm/numa.c linux-2.5.28-rmap/mm/numa.c
--- linux-2.5.28/mm/numa.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/numa.c	Tue Jul 23 19:11:14 2002
@@ -44,6 +44,57 @@
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
+#ifndef CONFIG_HIGHMEM
+/* Translate a struct page into its kernel virtual address via the zone chunk tables. */
+unsigned long page_address(struct page * page)
+{
+	struct zone_struct * zone;
+	struct page * chunk_page_start;
+	unsigned long chunk_phys_addr;
+	int idx_chunk;
+
+	zone = page_zone(page);
+	/*
+	 * We have to check if the page is on
+	 * a chunk that contains pages from 2 zones.
+	 */
+	if(!(page < zone->zone_chunk_page_start[0]))
+		goto known_zone;
+	/*
+	 * We need to get the previous zone.
+	 * If there is no such zone, we are in trouble.
+	 */
+	if(!page->zone)
+		BUG();
+
+	zone = zone_table[(page->zone) - 1];
+
+	if(zone->zone_pgdat->node_id == page_zone(page)->zone_pgdat->node_id)
+		goto known_zone;
+	/*
+	 * Getting here means we have a chunk spread over 2 nodes.
+	 * That shouldn't happen.
+	 */
+	BUG();
+
+ known_zone:
+	for(idx_chunk = 0 ; idx_chunk < MAX_CHUNKS_PER_ZONE ; idx_chunk++){
+		if(page >= zone->zone_chunk_page_start[idx_chunk] &&
+		   page < zone->zone_chunk_page_start[idx_chunk + 1])
+			break;
+	}
+	/* No chunk of this zone holds the page: don't index past the tables. */
+	if(idx_chunk == MAX_CHUNKS_PER_ZONE)
+		BUG();
+
+	/* We know which chunk the page belongs to. */
+	chunk_phys_addr = zone->zone_chunk_phys_start[idx_chunk];
+	chunk_page_start = zone->zone_chunk_page_start[idx_chunk];
+	return (unsigned long)__va(chunk_phys_addr +
+				   ((page - chunk_page_start) << PAGE_SHIFT ));
+}
+#endif
+
 static spinlock_t node_lock = SPIN_LOCK_UNLOCKED;
 
 void show_free_areas_node(pg_data_t *pgdat)
diff -uNr linux-2.5.28/mm/oom_kill.c linux-2.5.28-rmap/mm/oom_kill.c
--- linux-2.5.28/mm/oom_kill.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/oom_kill.c	Tue Jul 23 19:11:10 2002
@@ -168,7 +168,8 @@
 static void oom_kill(void)
 {
 	struct task_struct *p, *q;
-	
+	extern wait_queue_head_t kswapd_done;
+
 	read_lock(&tasklist_lock);
 	p = select_bad_process();
 
@@ -182,6 +183,9 @@
 	}
 	read_unlock(&tasklist_lock);
 
+	/* Chances are by this time our victim is sleeping on kswapd. */
+	wake_up(&kswapd_done);
+
 	/*
 	 * Make kswapd go out of the way, so "p" has a good chance of
 	 * killing itself before someone else gets the chance to ask
diff -uNr linux-2.5.28/mm/page_alloc.c linux-2.5.28-rmap/mm/page_alloc.c
--- linux-2.5.28/mm/page_alloc.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/page_alloc.c	Tue Jul 23 19:11:14 2002
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
@@ -26,8 +27,6 @@
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 int nr_swap_pages;
-LIST_HEAD(active_list);
-LIST_HEAD(inactive_list);
 pg_data_t *pgdat_list;
 
 /*
@@ -41,6 +40,8 @@
 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, };
+static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, };
 
 /*
  * Temporary debugging check for pages not lying within a given zone.
@@ -89,33 +90,21 @@
 
 	KERNEL_STAT_ADD(pgfree, 1<<order);
 
+	DEBUG_LRU_PAGE(page);
 	BUG_ON(PagePrivate(page));
 	BUG_ON(page->mapping != NULL);
 	BUG_ON(PageLocked(page));
-	BUG_ON(PageLRU(page));
-	BUG_ON(PageActive(page));
 	BUG_ON(PageWriteback(page));
 	BUG_ON(page->pte.chain != NULL);
 	if (PageDirty(page))
 		ClearPageDirty(page);
-	BUG_ON(page_count(page) != 0);
-
-	if (unlikely(current->flags & PF_FREE_PAGES)) {
-		if (!current->nr_local_pages && !in_interrupt()) {
-			list_add(&page->list, &current->local_pages);
-			page->index = order;
-			current->nr_local_pages++;
-			goto out;
-		}
-	}
 
 	zone = page_zone(page);
 
 	mask = (~0UL) << order;
 	base = zone->zone_mem_map;
 	page_idx = page - base;
-	if (page_idx & ~mask)
-		BUG();
+	BUG_ON(page_idx & ~mask);
 	index = page_idx >> (1 + order);
 	area = zone->free_area + order;
 
@@ -147,8 +136,6 @@
 	}
 	list_add(&(base + page_idx)->list, &area->free_list);
 	spin_unlock_irqrestore(&zone->lock, flags);
-out:
-	return;
 }
 
 #define MARK_USED(index, order, area) \
@@ -173,24 +160,6 @@
 	return page;
 }
 
-/*
- * This page is about to be returned from the page allocator
- */
-static inline void prep_new_page(struct page *page)
-{
-	BUG_ON(page->mapping);
-	BUG_ON(PagePrivate(page));
-	BUG_ON(PageLocked(page));
-	BUG_ON(PageLRU(page));
-	BUG_ON(PageActive(page));
-	BUG_ON(PageDirty(page));
-	BUG_ON(PageWriteback(page));
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
-			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_checked);
-	set_page_count(page, 1);
-}
-
 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
 static struct page * rmqueue(zone_t *zone, unsigned int order)
 {
@@ -215,20 +184,23 @@
 			if (curr_order != MAX_ORDER-1)
 				MARK_USED(index, curr_order, area);
 			zone->free_pages -= 1UL << order;
-
 			page = expand(zone, page, index, order, curr_order, area);
 			spin_unlock_irqrestore(&zone->lock, flags);
+			DEBUG_LRU_PAGE(page);
+			BUG_ON(bad_range(zone, page));
 
-			if (bad_range(zone, page))
-				BUG();
-			prep_new_page(page);
-			return page;	
+			/* prepare new page for use */
+			set_page_count(page, 1);
+			page->age = PAGE_AGE_START;
+			page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+					 1 << PG_referenced | 1 << PG_arch_1 |
+					 1 << PG_checked);
+			return page;
 		}
 		curr_order++;
 		area++;
 	} while (curr_order < MAX_ORDER);
 	spin_unlock_irqrestore(&zone->lock, flags);
-
 	return NULL;
 }
 
@@ -264,57 +236,82 @@
 }
 #endif
 
-static /* inline */ struct page *
-balance_classzone(zone_t * classzone, unsigned int gfp_mask,
-			unsigned int order, int * freed)
-{
-	struct page * page = NULL;
-	int __freed = 0;
-
-	BUG_ON(in_interrupt());
-
-	current->allocation_order = order;
-	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
-
-	__freed = try_to_free_pages(classzone, gfp_mask, order);
-
-	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
-
-	if (current->nr_local_pages) {
-		struct list_head * entry, * local_pages;
-		struct page * tmp;
-		int nr_pages;
-
-		local_pages = &current->local_pages;
-
-		if (likely(__freed)) {
-			/* pick from the last inserted so we're lifo */
-			entry = local_pages->next;
-			do {
-				tmp = list_entry(entry, struct page, list);
-				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
-					list_del(entry);
-					page = tmp;
-					current->nr_local_pages--;
-					prep_new_page(page);
-					break;
-				}
-			} while ((entry = entry->next) != local_pages);
+/*
+ * If we are able to directly reclaim pages, we move pages from the
+ * inactive_clean list onto the free list until the zone has enough
+ * free pages or until the inactive_clean pages are exhausted.
+ * If we cannot do this work ourselves, call kswapd.
+ */
+void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim));
+void fixup_freespace(zone_t * zone, int direct_reclaim)
+{
+	if (direct_reclaim) {
+		struct page * page;
+		do {
+			if ((page = reclaim_page(zone)))
+				__free_pages(page, 0);
+		} while (page && zone->free_pages <= zone->pages_min);
+	} else
+		wakeup_kswapd(GFP_ATOMIC);
+}
+
+#define PAGES_KERNEL	0
+#define PAGES_MIN	1
+#define PAGES_LOW	2
+#define PAGES_HIGH	3
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+			unsigned long order, int limit, int direct_reclaim)
+{
+	zone_t **zone = zonelist->zones;
+	unsigned long water_mark = 0;
+
+	for (;;) {
+		zone_t *z = *(zone++);
+
+		if (!z)
+			break;
+		BUG_ON(!z->size);
+
+		/*
+		 * We allocate if the number of (free + inactive_clean)
+		 * pages is above the watermark.
+		 */
+		switch (limit) {
+			case PAGES_KERNEL:
+				water_mark = z->pages_min / 2;
+				break;
+			case PAGES_MIN:
+				water_mark = z->pages_min;
+				break;
+			case PAGES_LOW:
+				water_mark = z->pages_low;
+				break;
+			default:
+			case PAGES_HIGH:
+				water_mark = z->pages_high;
 		}
 
-		nr_pages = current->nr_local_pages;
-		/* free in reverse order so that the global order will be lifo */
-		while ((entry = local_pages->prev) != local_pages) {
-			list_del(entry);
-			tmp = list_entry(entry, struct page, list);
-			__free_pages_ok(tmp, tmp->index);
-			if (!nr_pages--)
-				BUG();
+		if (z->free_pages + z->inactive_clean_pages >= water_mark) {
+			struct page *page = NULL;
+			/* If possible, reclaim a page directly. */
+			if (direct_reclaim)
+				page = reclaim_page(z);
+			/* If that fails, fall back to rmqueue. */
+			if (!page)
+				page = rmqueue(z, order);
+			if (page)
+				return page;
 		}
-		current->nr_local_pages = 0;
 	}
-	*freed = __freed;
-	return page;
+
+	/* Found nothing. */
+	return NULL;
 }
 
 /*
@@ -322,105 +319,249 @@
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 {
-	unsigned long min;
-	zone_t **zones, *classzone;
+	zone_t **zone;
+	int min, direct_reclaim = 0;
 	struct page * page;
-	int freed, i;
 
 	KERNEL_STAT_ADD(pgalloc, 1<<order);
 
-	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-	classzone = zones[0]; 
-	if (classzone == NULL)    /* no zones in the zonelist */
-		return NULL;
+	/*
+	 * (If anyone calls gfp from interrupts nonatomically then it
+	 * will sooner or later be tripped up by a schedule().)
+	 *
+	 * We fall back to lower-level zones if allocation
+	 * in a higher zone fails.
+	 */
+
+	/*
+	 * Can we take pages directly from the inactive_clean
+	 * list?
+	 */
+	if (order == 0 && (gfp_mask & __GFP_WAIT))
+		direct_reclaim = 1;
 
-	/* Go through the zonelist once, looking for a zone with enough free */
+try_again:
+	/*
+	 * First, see if we have any zones with lots of free memory.
+	 *
+	 * We allocate free memory first because it doesn't contain
+	 * any data we would want to cache.
+	 */
+	zone = zonelist->zones;
+	if (!*zone)
+		return NULL;
 	min = 1UL << order;
-	for (i = 0; zones[i] != NULL; i++) {
-		zone_t *z = zones[i];
+	for (;;) {
+		zone_t *z = *(zone++);
+		if (!z)
+			break;
+		BUG_ON(!z->size);
 
-		/* the incremental min is allegedly to discourage fallback */
-		min += z->pages_low;
+		min += z->pages_min;
 		if (z->free_pages > min) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
-		}
+		} else if (z->free_pages < z->pages_min)
+			fixup_freespace(z, direct_reclaim);
 	}
 
-	classzone->need_balance = 1;
-	mb();
-	/* we're somewhat low on memory, failed to find what we needed */
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	/*
+	 * Next, try to allocate a page from a zone with a HIGH
+	 * amount of (free + inactive_clean) pages.
+	 *
+	 * If there is a lot of activity, inactive_target
+	 * will be high and we'll have a good chance of
+	 * finding a page using the HIGH limit.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+	if (page)
+		return page;
 
-	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	min = 1UL << order;
-	for (i = 0; zones[i] != NULL; i++) {
-		unsigned long local_min;
-		zone_t *z = zones[i];
-
-		local_min = z->pages_min;
-		if (gfp_mask & __GFP_HIGH)
-			local_min >>= 2;
-		min += local_min;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
+	/*
+	 * Then try to allocate a page from a zone with more
+	 * than zone->pages_low of (free + inactive_clean) pages.
+	 *
+	 * When the working set is very large and VM activity
+	 * is low, we're most likely to have our allocation
+	 * succeed here.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+	if (page)
+		return page;
+
+	/*
+	 * OK, none of the zones on our zonelist has lots
+	 * of pages free.
+	 *
+	 * We wake up kswapd, in the hope that kswapd will
+	 * resolve this situation before memory gets tight.
+	 *
+	 * We'll also help a bit trying to free pages, this
+	 * way statistics will make sure really fast allocators
+	 * are slowed down more than slow allocators and other
+	 * programs in the system shouldn't be impacted as much
+	 * by the hogs.
+	 */
+	wakeup_kswapd(gfp_mask);
+
+	/*
+	 * After waking up kswapd, we try to allocate a page
+	 * from any zone which isn't critical yet.
+	 *
+	 * Kswapd should, in most situations, bring the situation
+	 * back to normal in no time.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+	if (page)
+		return page;
+
+	/*
+	 * Kernel allocations can eat a few emergency pages.
+	 * We should be able to run without this, find out why
+	 * the SCSI layer isn't happy ...
+	 */
+	if (gfp_mask & __GFP_HIGH) {
+		page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim);
+		if (page)
+			return page;
+	}
+
+	/*
+	 * Oh well, we didn't succeed.
+	 */
+	KERNEL_STAT_INC(allocstall);
+	if (!(current->flags & PF_MEMALLOC)) {
+		/*
+		 * Are we dealing with a higher order allocation?
+		 *
+		 * If so, try to defragment some memory.
+		 */
+		if (order > 0 && (gfp_mask & __GFP_WAIT))
+			goto defragment;
+
+		/*
+		 * If we arrive here, we are really tight on memory.
+		 * Since kswapd didn't succeed in freeing pages for us,
+		 * we need to help it.
+		 *
+		 * Single page allocs loop until the allocation succeeds.
+		 * Multi-page allocs can fail due to memory fragmentation;
+		 * in that case we bail out to prevent infinite loops and
+		 * hanging device drivers ...
+		 *
+		 * Another issue is GFP_NOFS allocations; because they
+		 * do not have __GFP_FS set it's possible we cannot make
+		 * any progress freeing pages, in that case it's better
+		 * to give up than to deadlock the kernel looping here.
+		 *
+		 * NFS: we must yield the CPU (to rpciod) to avoid deadlock.
+		 */
+		if (gfp_mask & __GFP_WAIT) {
+			yield();
+			if (!order || free_high(ALL_ZONES) >= 0) {
+				int progress = try_to_free_pages(gfp_mask);
+				if (progress || (gfp_mask & __GFP_FS))
+					goto try_again;
+				/*
+				 * Fail if no progress was made and the
+				 * allocation may not be able to block on IO.
+				 */
+				return NULL;
+			}
 		}
 	}
 
-	/* here we're in the low on memory slow path */
+	/*
+	 * Final phase: allocate anything we can!
+	 *
+	 * Higher order allocations, GFP_ATOMIC allocations and
+	 * recursive allocations (PF_MEMALLOC) end up here.
+	 *
+	 * Only recursive allocations can use the very last pages
+	 * in the system, otherwise it would be just too easy to
+	 * deadlock the system...
+	 */
+	zone = zonelist->zones;
+	min = 1UL << order;
+	for (;;) {
+		zone_t *z = *(zone++);
+		struct page * page = NULL;
+		if (!z)
+			break;
 
-rebalance:
-	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
-		/* go through the zonelist yet again, ignoring mins */
-		for (i = 0; zones[i] != NULL; i++) {
-			zone_t *z = zones[i];
+		/*
+		 * SUBTLE: direct_reclaim is only possible if the task
+		 * becomes PF_MEMALLOC while looping above. This will
+		 * happen when the OOM killer selects this task for
+		 * death.
+		 */
+		if (direct_reclaim) {
+			page = reclaim_page(z);
+			if (page)
+				return page;
+		}
 
+		/* XXX: is pages_min/4 a good amount to reserve for this? */
+		min += z->pages_min / 4;
+		if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
 		}
-nopage:
-		if (!(current->flags & PF_NOWARN)) {
-			printk("%s: page allocation failure."
-				" order:%d, mode:0x%x\n",
-				current->comm, order, gfp_mask);
-		}
-		return NULL;
 	}
+	goto out_failed;
 
-	/* Atomic allocations - we can't balance anything */
-	if (!(gfp_mask & __GFP_WAIT))
-		goto nopage;
 
-	KERNEL_STAT_INC(allocstall);
-	page = balance_classzone(classzone, gfp_mask, order, &freed);
-	if (page)
-		return page;
-
-	/* go through the zonelist yet one more time */
-	min = 1UL << order;
-	for (i = 0; zones[i] != NULL; i++) {
-		zone_t *z = zones[i];
+	/*
+	 * Naive "defragmentation" for higher-order allocations. First we
+	 * free the inactive_clean pages to see if we can allocate our
+	 * allocation, then we call try_to_free_pages() once to free
+	 * some more pages, and last we try once more.
+	 *
+	 * We might want to turn this into something which defragments
+	 * memory based on physical page, simply by looking for unmapped
+	 * pages next to pages on the free list...
+	 */
+defragment:
+	{
+		int freed = 0;
+defragment_again:
+		zone = zonelist->zones;
+		for (;;) {
+			zone_t *z = *(zone++);
+			if (!z)
+				break;
+			if (!z->size)
+				continue;
+			while (z->inactive_clean_pages) {
+				struct page * page;
+				/* Move one page to the free list. */
+				page = reclaim_page(z);
+				if (!page)
+					break;
+				__free_page(page);
+				/* Try if the allocation succeeds. */
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
+		}
 
-		min += z->pages_min;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
+		/* XXX: do real defragmentation instead of calling launder ? */
+		if (!freed && !(current->flags & PF_MEMALLOC)) {
+			freed = 1;
+			current->flags |= PF_MEMALLOC;
+			try_to_free_pages(gfp_mask);
+			current->flags &= ~PF_MEMALLOC;
+			goto defragment_again;
 		}
 	}
 
-	/* Don't let big-order allocations loop */
-	if (order > 3)
-		goto nopage;
-
-	/* Yield for kswapd, and try again */
-	yield();
-	goto rebalance;
+out_failed:
+	/* No luck.. */
+	printk(KERN_ERR "__alloc_pages: %u-order allocation failed.\n", order);
+	return NULL;
 }
 
 /*
@@ -477,36 +618,32 @@
  */
 unsigned int nr_free_pages(void)
 {
-	unsigned int i, sum = 0;
-	pg_data_t *pgdat;
-
-	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
-		for (i = 0; i < MAX_NR_ZONES; ++i)
-			sum += pgdat->node_zones[i].free_pages;
+	unsigned int sum;
+	zone_t *zone;
 
+	sum = 0;
+	for_each_zone(zone)
+		sum += zone->free_pages;
+	
 	return sum;
 }
 
-static unsigned int nr_free_zone_pages(int offset)
+static unsigned int nr_free_zone_pages (int offset)
 {
-	pg_data_t *pgdat = pgdat_list;
+	pg_data_t *pgdat;
 	unsigned int sum = 0;
 
-	do {
+	for_each_pgdat(pgdat) {
 		zonelist_t *zonelist = pgdat->node_zonelists + offset;
 		zone_t **zonep = zonelist->zones;
 		zone_t *zone;
 
 		for (zone = *zonep++; zone; zone = *zonep++) {
-			unsigned long size = zone->size;
-			unsigned long high = zone->pages_high;
-			if (size > high)
-				sum += size - high;
+			sum += zone->free_pages;
+			sum += zone->inactive_clean_pages;
+			sum += zone->inactive_dirty_pages;
 		}
-
-		pgdat = pgdat->node_next;
-	} while (pgdat);
-
+	}
 	return sum;
 }
 
@@ -529,13 +666,12 @@
 #if CONFIG_HIGHMEM
 unsigned int nr_free_highpages (void)
 {
-	pg_data_t *pgdat = pgdat_list;
+	pg_data_t *pgdat;
 	unsigned int pages = 0;
 
-	while (pgdat) {
+	for_each_pgdat(pgdat)
 		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
-		pgdat = pgdat->node_next;
-	}
+
 	return pages;
 }
 #endif
@@ -563,11 +699,11 @@
 		ret->nr_dirty += ps->nr_dirty;
 		ret->nr_writeback += ps->nr_writeback;
 		ret->nr_pagecache += ps->nr_pagecache;
-		ret->nr_active += ps->nr_active;
-		ret->nr_inactive += ps->nr_inactive;
+		ret->nr_active_pages += ps->nr_active_pages;
+		ret->nr_inactive_clean_pages += ps->nr_inactive_clean_pages;
+		ret->nr_inactive_dirty_pages += ps->nr_inactive_dirty_pages;
 		ret->nr_page_table_pages += ps->nr_page_table_pages;
-		ret->nr_pte_chain_pages += ps->nr_pte_chain_pages;
-		ret->used_pte_chains_bytes += ps->used_pte_chains_bytes;
+		ret->nr_reverse_maps += ps->nr_reverse_maps;
 	}
 }
 
@@ -630,12 +766,13 @@
 		tmpdat = tmpdat->node_next;
 	}
 
-	printk("( Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u )\n",
-		ps.nr_active,
-		ps.nr_inactive,
-		ps.nr_dirty,
-		ps.nr_writeback,
-		nr_free_pages());
+ 	printk("( Active:%lu inactive_dirty:%lu inactive_clean:%lu dirty:%lu writeback:%lu free:%u )\n",
+ 		ps.nr_active_pages,
+ 		ps.nr_inactive_dirty_pages,
+ 		ps.nr_inactive_clean_pages,
+ 		ps.nr_dirty,
+ 		ps.nr_writeback,
+ 		nr_free_pages());
 
 	for (type = 0; type < MAX_NR_ZONES; type++) {
 		struct list_head *head, *curr;
@@ -771,6 +908,7 @@
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
+extern unsigned int kswapd_minfree;
 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
 	unsigned long *zones_size, unsigned long zone_start_paddr, 
 	unsigned long *zholes_size, struct page *lmem_map)
@@ -816,7 +954,7 @@
 	offset = lmem_map - mem_map;	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
-		unsigned long mask;
+		unsigned long mask, extrafree = 0;
 		unsigned long size, realsize;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
@@ -830,7 +968,12 @@
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
-		zone->need_balance = 0;
+		zone->inactive_clean_pages = 0;
+		zone->inactive_dirty_pages = 0;
+		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->inactive_dirty_list);
+		INIT_LIST_HEAD(&zone->inactive_clean_list);
+
 		if (!size)
 			continue;
 
@@ -850,15 +993,22 @@
 
 		pgdat->nr_zones = j+1;
 
+		/*
+		 * On large memory machines we keep extra memory
+		 * free for kernel allocations.
+		 */
+		if (zone_extrafree_ratio[j])
+			extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]);
+		if (extrafree < zone_balance_max[j])
+			extrafree = 0;
+
 		mask = (realsize / zone_balance_ratio[j]);
 		if (mask < zone_balance_min[j])
 			mask = zone_balance_min[j];
-		else if (mask > zone_balance_max[j])
-			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
-
+		zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]);
+		zone->pages_low = extrafree + mask*2;
+		zone->pages_high = extrafree + mask*3;
+		zone->pages_plenty = extrafree + mask*6;
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
 		zone->zone_start_paddr = zone_start_paddr;
@@ -866,6 +1016,8 @@
 		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
 			printk("BUG: wrong zone alignment, it will crash\n");
 
+		kswapd_minfree += zone->pages_min;
+
 		/*
 		 * Initially all pages are reserved - free ones are freed
 		 * up by free_all_bootmem() once the early boot process is
diff -uNr linux-2.5.28/mm/readahead.c linux-2.5.28-rmap/mm/readahead.c
--- linux-2.5.28/mm/readahead.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/readahead.c	Tue Jul 23 19:11:14 2002
@@ -204,6 +204,42 @@
 }
 
 /*
+ * We combine this with readahead to deactivate pages when we
+ * think there's sequential IO going on.  Note that this is
+ * harmless since we don't actually evict the pages from memory
+ * but just move them to the inactive list.
+ *
+ * Rik van Riel, 2000
+ */
+static void drop_behind(struct file * file, unsigned long index)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	/* Nothing to drop-behind if we're on the first page. */
+	if (!index)
+		return;
+
+	/*
+	 * Go backwards from index-1 and drop all pages in the
+	 * readahead window, stopping at the first page that is
+	 * missing or not active.  Compare before decrementing so
+	 * the unsigned index cannot wrap when f_ra.start is 0.
+	 */
+	spin_lock(&pagemap_lru_lock);
+	while (index-- > file->f_ra.start) {
+		read_lock(&mapping->page_lock);
+		page = radix_tree_lookup(&mapping->page_tree, index);
+		read_unlock(&mapping->page_lock);
+		if (!page || !PageActive(page))
+			break;
+		drop_page(page);
+	}
+	spin_unlock(&pagemap_lru_lock);
+}
+
+/*
  * page_cache_readahead is the main function.  If performs the adaptive
  * readahead window size management and submits the readahead I/O.
  */
@@ -325,6 +361,11 @@
 		}
 	}
 out:
+	/*
+	 * Move the pages that have already been passed
+	 * to the inactive list.
+	 */
+	drop_behind(file, offset);
 	return;
 }
 
diff -uNr linux-2.5.28/mm/rmap.c linux-2.5.28-rmap/mm/rmap.c
--- linux-2.5.28/mm/rmap.c	Wed Jul 24 20:43:54 2002
+++ linux-2.5.28-rmap/mm/rmap.c	Wed Jul 24 21:02:05 2002
@@ -23,6 +23,8 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swapops.h>
+#include <linux/slab.h>
+#include <linux/init.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -50,10 +52,10 @@
 	pte_t * ptep;
 };
 
+static kmem_cache_t *pte_chain_cache;
 static inline struct pte_chain * pte_chain_alloc(void);
 static inline void pte_chain_free(struct pte_chain *, struct pte_chain *,
 		struct page *);
-static void alloc_new_pte_chains(void);
 
 /**
  * page_referenced - test if the page was referenced
@@ -98,12 +100,9 @@
 	unsigned long pfn = pte_pfn(*ptep);
 
 #ifdef DEBUG_RMAP
-	if (!page || !ptep)
-		BUG();
-	if (!pte_present(*ptep))
-		BUG();
-	if (!ptep_to_mm(ptep))
-		BUG();
+	BUG_ON(!page || !ptep);
+	BUG_ON(!pte_present(*ptep));
+	BUG_ON(!ptep_to_mm(ptep));
 #endif
 
 	if (!pfn_valid(pfn) || PageReserved(page))
@@ -114,12 +113,10 @@
 	{
 		struct pte_chain * pc;
 		if (PageDirect(page)) {
-			if (page->pte.direct == ptep)
-				BUG();
+			BUG_ON(page->pte.direct == ptep);
 		} else {
 			for (pc = page->pte.chain; pc; pc = pc->next) {
-				if (pc->ptep == ptep)
-					BUG();
+				BUG_ON(pc->ptep == ptep);
 			}
 		}
 	}
@@ -148,6 +145,7 @@
 	}
 
 	pte_chain_unlock(page);
+	inc_page_state(nr_reverse_maps);
 }
 
 /**
@@ -165,8 +163,7 @@
 	struct pte_chain * pc, * prev_pc = NULL;
 	unsigned long pfn = page_to_pfn(page);
 
-	if (!page || !ptep)
-		BUG();
+	BUG_ON(!page || !ptep);
 	if (!pfn_valid(pfn) || PageReserved(page))
 		return;
 
@@ -208,9 +205,9 @@
 #endif
 
 out:
+	dec_page_state(nr_reverse_maps);
 	pte_chain_unlock(page);
-	return;
-			
+	return;			
 }
 
 /**
@@ -236,8 +233,7 @@
 	pte_t pte;
 	int ret;
 
-	if (!mm)
-		BUG();
+	BUG_ON(!mm);
 
 	/*
 	 * We need the page_table_lock to protect us from page faults,
@@ -304,13 +300,10 @@
 	int ret = SWAP_SUCCESS;
 
 	/* This page should not be on the pageout lists. */
-	if (PageReserved(page))
-		BUG();
-	if (!PageLocked(page))
-		BUG();
+	BUG_ON(PageReserved(page));
+	BUG_ON(!PageLocked(page));
 	/* We need backing store to swap out a page. */
-	if (!page->mapping)
-		BUG();
+	BUG_ON(!page->mapping);
 
 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
@@ -318,7 +311,7 @@
 			page->pte.direct = NULL;
 			ClearPageDirect(page);
 		}
-	} else {		
+	} else {
 		for (pc = page->pte.chain; pc; pc = next_pc) {
 			next_pc = pc->next;
 			switch (try_to_unmap_one(page, pc->ptep)) {
@@ -351,33 +344,53 @@
 }
 
 /**
- ** No more VM stuff below this comment, only pte_chain helper
- ** functions.
- **/
-
-struct pte_chain * pte_chain_freelist;
-spinlock_t pte_chain_freelist_lock = SPIN_LOCK_UNLOCKED;
-
-/* Maybe we should have standard ops for singly linked lists ... - Rik */
-static inline void pte_chain_push(struct pte_chain * pte_chain)
+ * page_over_rsslimit - test if the page is over its RSS limit
+ * @page - page to test
+ *
+ * This function returns true if the process owning this page
+ * is over its RSS (resident set size) limit.  For shared pages
+ * we penalise it only if all processes using it are over their
+ * rss limits.
+ * The caller needs to hold the page's pte_chain_lock.
+ */
+int page_over_rsslimit(struct page * page)
 {
-	pte_chain->ptep = NULL;
-	pte_chain->next = pte_chain_freelist;
-	pte_chain_freelist = pte_chain;
-}
+	struct pte_chain * pc;
+	struct mm_struct * mm;
 
-static inline struct pte_chain * pte_chain_pop(void)
-{
-	struct pte_chain *pte_chain;
+	/* No process is using the page. */
+	if (!page->pte.chain)
+		return 0;
 
-	pte_chain = pte_chain_freelist;
-	pte_chain_freelist = pte_chain->next;
-	pte_chain->next = NULL;
+	if (PageDirect(page)) {
+		mm = ptep_to_mm(page->pte.direct);
+		if (!mm->rlimit_rss || mm->rss <= mm->rlimit_rss)
+			return 0;
+	} else {
+		/* Walk with a local cursor so the page's chain stays intact. */
+		pc = page->pte.chain;
+		do {
+			mm = ptep_to_mm(pc->ptep);
+			/*
+			 * If the process is under its RSS limit, stop
+			 * scanning and don't penalise the page.
+			 */
+			if (!mm->rlimit_rss || mm->rss <= mm->rlimit_rss)
+				return 0;
 
-	return pte_chain;
+			pc = pc->next;
+		} while (pc);
+	}
+	return 1;
 }
 
 /**
+ ** No more VM stuff below this comment, only pte_chain helper
+ ** functions.
+ **/
+
+
+/**
  * pte_chain_free - free pte_chain structure
  * @pte_chain: pte_chain struct to free
  * @prev_pte_chain: previous pte_chain on the list (may be NULL)
@@ -391,15 +404,12 @@
 static inline void pte_chain_free(struct pte_chain * pte_chain,
 		struct pte_chain * prev_pte_chain, struct page * page)
 {
-	mod_page_state(used_pte_chains_bytes, -sizeof(struct pte_chain));
 	if (prev_pte_chain)
 		prev_pte_chain->next = pte_chain->next;
 	else if (page)
 		page->pte.chain = pte_chain->next;
 
-	spin_lock(&pte_chain_freelist_lock);
-	pte_chain_push(pte_chain);
-	spin_unlock(&pte_chain_freelist_lock);
+	kmem_cache_free(pte_chain_cache, pte_chain);
 }
 
 /**
@@ -409,47 +419,20 @@
  * pte_chain structures as required.
  * Caller needs to hold the page's pte_chain_lock.
  */
-static inline struct pte_chain * pte_chain_alloc()
+static inline struct pte_chain *pte_chain_alloc(void)
 {
-	struct pte_chain * pte_chain;
-
-	spin_lock(&pte_chain_freelist_lock);
-
-	/* Allocate new pte_chain structs as needed. */
-	if (!pte_chain_freelist)
-		alloc_new_pte_chains();
-
-	/* Grab the first pte_chain from the freelist. */
-	pte_chain = pte_chain_pop();
-
-	spin_unlock(&pte_chain_freelist_lock);
-
-	mod_page_state(used_pte_chains_bytes, sizeof(struct pte_chain));
-	return pte_chain;
+	return kmem_cache_alloc(pte_chain_cache, GFP_ATOMIC);
 }
 
-/**
- * alloc_new_pte_chains - convert a free page to pte_chain structures
- *
- * Grabs a free page and converts it to pte_chain structures. We really
- * should pre-allocate these earlier in the pagefault path or come up
- * with some other trick.
- *
- * Note that we cannot use the slab cache because the pte_chain structure
- * is way smaller than the minimum size of a slab cache allocation.
- * Caller needs to hold the pte_chain_freelist_lock
- */
-static void alloc_new_pte_chains()
+void __init pte_chain_init(void)
 {
-	struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC);
-	int i = PAGE_SIZE / sizeof(struct pte_chain);
+	pte_chain_cache = kmem_cache_create( "pte_chain", 
+					     sizeof(struct pte_chain),
+					     0,
+					     0,
+					     NULL,
+					     NULL);
 
-	if (pte_chain) {
-		inc_page_state(nr_pte_chain_pages);
-		for (; i-- > 0; pte_chain++)
-			pte_chain_push(pte_chain);
-	} else {
-		/* Yeah yeah, I'll fix the pte_chain allocation ... */
-		panic("Fix pte_chain allocation, you lazy bastard!\n");
-	}
+	if (!pte_chain_cache)
+		panic("failed to create pte_chain cache!\n");
 }
diff -uNr linux-2.5.28/mm/swap.c linux-2.5.28-rmap/mm/swap.c
--- linux-2.5.28/mm/swap.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/swap.c	Tue Jul 23 19:11:14 2002
@@ -14,11 +14,11 @@
  */
 
 #include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/swap.h>
 #include <linux/swapctl.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/buffer_head.h> /* for try_to_release_page() */
+#include <linux/mm_inline.h>
 
 #include <asm/dma.h>
 #include <asm/uaccess.h> /* for copy_to/from_user */
@@ -33,16 +33,99 @@
 	8,	/* do swap I/O in clusters of this size */
 };
 
+/**
+ * (de)activate_page - move pages from/to active and inactive lists
+ * @page: the page we want to move
+ * The *_nolock variants expect the caller to hold pagemap_lru_lock.
+ *
+ * Deactivate_page will move an active page to the right
+ * inactive list, while activate_page will move a page back
+ * from one of the inactive lists to the active list. If
+ * called on a page which is not on any of the lists, the
+ * page is left alone.
+ */
+void deactivate_page_nolock(struct page * page)
+{
+	/*
+	 * Always reset the referenced bit and the age, but only move
+	 * the page if it is on the active list (some pages aren't on any).
+	 */
+	ClearPageReferenced(page);
+	page->age = 0;
+	if (PageActive(page)) {
+		del_page_from_active_list(page);
+		add_page_to_inactive_dirty_list(page);
+	}
+}	
+
+void deactivate_page(struct page * page)
+{
+	spin_lock(&pagemap_lru_lock);
+	deactivate_page_nolock(page);
+	spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * drop_page - like deactivate_page, but try inactive_clean list
+ * @page: the page to drop
+ *
+ * Try to move a page to the inactive_clean list, this succeeds if the
+ * page is clean and not in use by anybody. If the page cannot be placed
+ * on the inactive_clean list it is placed on the inactive_dirty list
+ * instead.
+ *
+ * Note: this function gets called with the pagemap_lru_lock held.
+ */
+void drop_page(struct page * page)
+{
+	if (!TestSetPageLocked(page)) {
+		if (page->mapping && PagePrivate(page)) {
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
+			try_to_release_page(page, GFP_NOIO);
+			spin_lock(&pagemap_lru_lock);
+			page_cache_release(page);
+		}
+		unlock_page(page);
+	}
+
+	/* Make sure the page really is reclaimable. */
+	pte_chain_lock(page);
+	if (!page->mapping || PageDirty(page) || page->pte.chain ||
+			PagePrivate(page) || page_count(page) > 1)
+		deactivate_page_nolock(page);
+
+	else if (page_count(page) == 1) {
+		ClearPageReferenced(page);
+		page->age = 0;
+		if (PageActive(page)) {
+			del_page_from_active_list(page);
+			add_page_to_inactive_clean_list(page);
+		} else if (PageInactiveDirty(page)) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_clean_list(page);
+		}
+	}
+	pte_chain_unlock(page);
+}
+
 /*
  * Move an inactive page to the active list.
  */
-static inline void activate_page_nolock(struct page * page)
+void activate_page_nolock(struct page * page)
 {
-	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(page);
+	if (PageInactiveDirty(page)) {
+		del_page_from_inactive_dirty_list(page);
+		add_page_to_active_list(page);
+		KERNEL_STAT_INC(pgactivate);
+	} else if (PageInactiveClean(page)) {
+		del_page_from_inactive_clean_list(page);
 		add_page_to_active_list(page);
 		KERNEL_STAT_INC(pgactivate);
 	}
+
+	/* Make sure the page gets a fair chance at staying active. */
+	page->age = max((int)page->age, PAGE_AGE_START);
 }
 
 void activate_page(struct page * page)
@@ -58,29 +141,31 @@
  */
 void lru_cache_add(struct page * page)
 {
-	if (!TestSetPageLRU(page)) {
+	if (!PageLRU(page)) {
 		spin_lock(&pagemap_lru_lock);
-		add_page_to_inactive_list(page);
+		SetPageLRU(page);
+		add_page_to_active_list(page);
 		spin_unlock(&pagemap_lru_lock);
 	}
 }
 
 /**
  * __lru_cache_del: remove a page from the page lists
- * @page: the page to add
+ * @page: the page to remove
  *
  * This function is for when the caller already holds
  * the pagemap_lru_lock.
  */
 void __lru_cache_del(struct page * page)
 {
-	if (TestClearPageLRU(page)) {
-		if (PageActive(page)) {
-			del_page_from_active_list(page);
-		} else {
-			del_page_from_inactive_list(page);
-		}
+	if (PageActive(page)) {
+		del_page_from_active_list(page);
+	} else if (PageInactiveDirty(page)) {
+		del_page_from_inactive_dirty_list(page);
+	} else if (PageInactiveClean(page)) {
+		del_page_from_inactive_clean_list(page);
 	}
+	ClearPageLRU(page);
 }
 
 /**
diff -uNr linux-2.5.28/mm/vmscan.c linux-2.5.28-rmap/mm/vmscan.c
--- linux-2.5.28/mm/vmscan.c	Wed Jul 24 20:39:57 2002
+++ linux-2.5.28-rmap/mm/vmscan.c	Tue Jul 23 19:12:42 2002
@@ -13,7 +13,6 @@
 
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/swapctl.h>
 #include <linux/smp_lock.h>
@@ -21,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
+#include <linux/mm_inline.h>
 #include <linux/writeback.h>
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>		/* for try_to_release_page() */
@@ -29,6 +29,8 @@
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 
+static void refill_freelist(void);
+static void wakeup_memwaiters(void);
 /*
  * The "priority" of VM scanning is how much of the queues we
  * will scan in one go. A value of 6 for DEF_PRIORITY implies
@@ -37,9 +39,14 @@
  */
 #define DEF_PRIORITY (6)
 
-static inline int is_page_cache_freeable(struct page * page)
+static inline void age_page_up(struct page *page)
 {
-	return page_count(page) - !!PagePrivate(page) == 1;
+	page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); 
+}
+
+static inline void age_page_down(struct page *page)
+{
+	page->age -= min(PAGE_AGE_DECL, (int)page->age);
 }
 
 /* Must be called with page's pte_chain_lock held. */
@@ -62,70 +69,175 @@
 	return 0;
 }
 
-static int
-shrink_cache(int nr_pages, zone_t *classzone,
-		unsigned int gfp_mask, int priority, int max_scan)
+
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
+ */
+struct page * reclaim_page(zone_t * zone)
 {
+	struct address_space * mapping;
+	struct page * page = NULL;
+	struct list_head * page_lru;
+	swp_entry_t entry = {0};
+	int maxscan;
+
+	/*
+	 * We need to hold the page_lock around all tests to make sure
+ 	 * reclaim_page() cannot race with find_get_page() and friends.
+ 	 */
+	spin_lock(&pagemap_lru_lock);
+	maxscan = zone->inactive_clean_pages;
+	while (maxscan-- && !list_empty(&zone->inactive_clean_list)) {
+		page_lru = zone->inactive_clean_list.prev;
+		page = list_entry(page_lru, struct page, lru);
+
+		mapping = page->mapping;
+		write_lock(&mapping->page_lock);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (unlikely(!PageInactiveClean(page))) {
+			printk("VM: reclaim_page, wrong page on list.\n");
+			list_del(page_lru);
+			page_zone(page)->inactive_clean_pages--;
+			goto unlock;
+		}
+
+		/* Page is being freed */
+		if (unlikely(!page_count(page))) {
+			list_del(page_lru);
+			list_add(page_lru, &zone->inactive_clean_list);
+			goto unlock;
+		}
+
+		/* Page cannot be reclaimed ?  Move to inactive_dirty list. */
+		pte_chain_lock(page);
+		if (unlikely(page->pte.chain || PagePrivate(page) ||
+				PageReferenced(page) || PageDirty(page) ||
+				page_count(page) > 1 || TestSetPageLocked(page))) {
+			del_page_from_inactive_clean_list(page);
+			add_page_to_inactive_dirty_list(page);
+			pte_chain_unlock(page);
+			goto unlock;
+		}
+
+		/*
+		 * From here until reaching either the bottom of the loop
+		 * or found_page: the pte_chain_lock is held.
+		 */
+
+		/* OK, remove the page from the caches. */
+                if (PageSwapCache(page)) {
+			entry.val = page->index;
+			__delete_from_swap_cache(page);
+			goto found_page;
+		}
+
+		if (page->mapping) {
+			__remove_inode_page(page);
+			goto found_page;
+		}
+
+		/* We should never ever get here. */
+		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+		list_del(page_lru);
+		zone->inactive_clean_pages--;
+		pte_chain_unlock(page);
+		unlock_page(page);
+unlock:
+		write_unlock(&mapping->page_lock);
+	}
+	spin_unlock(&pagemap_lru_lock);
+	return NULL;
+
+found_page:
+	__lru_cache_del(page);
+	pte_chain_unlock(page);
+	write_unlock(&mapping->page_lock);
+	spin_unlock(&pagemap_lru_lock);
+	if (entry.val)
+		swap_free(entry);
+	unlock_page(page);
+	/* initialize page flags */
+	page->age = PAGE_AGE_START;
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+			1 << PG_referenced | 1 << PG_arch_1 |
+			1 << PG_checked);
+	if (page_count(page) != 1)
+		printk("VM: reclaim_page, found page with count %d!\n",
+				page_count(page));
+	return page;
+}
+
+
+/**
+ * page_launder_zone - clean dirty inactive pages, move to inactive_clean list
+ * @zone: zone to free pages in
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function is called when we are low on free / inactive_clean
+ * pages, its purpose is to refill the free/clean list as efficiently
+ * as possible.
+ *
+ * This means we do writes asynchronously as long as possible and will
+ * only sleep on IO when we don't have another option. Since writeouts
+ * cause disk seeks and make read IO slower, we skip writes altogether
+ * when the amount of dirty pages is small.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+int page_launder_zone(zone_t * zone, int gfp_mask, int priority)
+{
+	int maxscan, cleaned_pages = 0, target = free_plenty(zone);
 	struct list_head * entry;
-	struct address_space *mapping;
+	struct address_space * mapping;
 
+	/* The main launder loop. */
 	spin_lock(&pagemap_lru_lock);
-	while (--max_scan >= 0 &&
-			(entry = inactive_list.prev) != &inactive_list) {
+	maxscan = zone->inactive_dirty_pages >> priority;
+	while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) {
 		struct page *page;
 		int may_enter_fs;
 
+		/* Low latency reschedule point */
 		if (need_resched()) {
 			spin_unlock(&pagemap_lru_lock);
-			__set_current_state(TASK_RUNNING);
 			schedule();
 			spin_lock(&pagemap_lru_lock);
 			continue;
 		}
 
+		entry = zone->inactive_dirty_list.prev;
 		page = list_entry(entry, struct page, lru);
 
-		if (unlikely(!PageLRU(page)))
-			BUG();
-		if (unlikely(PageActive(page)))
-			BUG();
+		if (cleaned_pages > target)
+			break;
 
 		list_del(entry);
-		list_add(entry, &inactive_list);
+		list_add(entry, &zone->inactive_dirty_list);
 		KERNEL_STAT_INC(pgscan);
 
-		/*
-		 * Zero page counts can happen because we unlink the pages
-		 * _after_ decrementing the usage count..
-		 */
-		if (unlikely(!page_count(page)))
-			continue;
-
-		if (!memclass(page_zone(page), classzone))
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageInactiveDirty(page)) {
+			printk("VM: page_launder, wrong page on list.\n");
+			list_del(entry);
+			dec_page_state(nr_inactive_dirty_pages);
+			page_zone(page)->inactive_dirty_pages--;
 			continue;
+		}
 
 		/*
-		 * swap activity never enters the filesystem and is safe
-		 * for GFP_NOFS allocations.
-		 */
-		may_enter_fs = (gfp_mask & __GFP_FS) ||
-				(PageSwapCache(page) && (gfp_mask & __GFP_IO));
-
-		/*
-		 * IO in progress? Leave it at the back of the list.
+		 * Page is being freed, don't worry about it.
 		 */
-		if (unlikely(PageWriteback(page))) {
-			if (may_enter_fs) {
-				page_cache_get(page);
-				spin_unlock(&pagemap_lru_lock);
-				wait_on_page_writeback(page);
-				page_cache_release(page);
-				spin_lock(&pagemap_lru_lock);
-			}
+		if (unlikely(!page_count(page)))
 			continue;
-		}
 
-		if (TestSetPageLocked(page))
+		if (unlikely(TestSetPageLocked(page)))
 			continue;
 
 		if (PageWriteback(page)) {	/* The non-racy check */
@@ -135,12 +247,14 @@
 
 		/*
 		 * The page is in active use or really unfreeable. Move to
-		 * the active list.
+		 * the active list and adjust the page age if needed.
 		 */
 		pte_chain_lock(page);
-		if (page_referenced(page) && page_mapping_inuse(page)) {
-			del_page_from_inactive_list(page);
+		if (page_referenced(page) && page_mapping_inuse(page) &&
+				!page_over_rsslimit(page)) {
+			del_page_from_inactive_dirty_list(page);
 			add_page_to_active_list(page);
+			page->age = max((int)page->age, PAGE_AGE_START);
 			pte_chain_unlock(page);
 			unlock_page(page);
 			KERNEL_STAT_INC(pgactivate);
@@ -189,19 +303,24 @@
 		pte_chain_unlock(page);
 		mapping = page->mapping;
 
-		if (PageDirty(page) && is_page_cache_freeable(page) &&
-				page->mapping && may_enter_fs) {
+		/*
+		 * swap activity never enters the filesystem and is safe
+		 * for GFP_NOFS allocations.
+		 */
+		may_enter_fs = (gfp_mask & __GFP_FS) ||
+				(PageSwapCache(page) && (gfp_mask & __GFP_IO));
+
+		if (PageDirty(page) && mapping && may_enter_fs) {
 			/*
 			 * It is not critical here to write it only if
 			 * the page is unmapped beause any direct writer
-			 * like O_DIRECT would set the page's dirty bitflag
+			 * like O_DIRECT would set the PG_dirty bitflag
 			 * on the physical page after having successfully
 			 * pinned it and after the I/O to the page is finished,
 			 * so the direct writes to the page cannot get lost.
 			 */
 			int (*writeback)(struct page *, int *);
-			const int nr_pages = SWAP_CLUSTER_MAX;
-			int nr_to_write = nr_pages;
+			int nr_to_write = SWAP_CLUSTER_MAX;
 
 			writeback = mapping->a_ops->vm_writeback;
 			if (writeback == NULL)
@@ -209,7 +328,6 @@
 			page_cache_get(page);
 			spin_unlock(&pagemap_lru_lock);
 			(*writeback)(page, &nr_to_write);
-			max_scan -= (nr_pages - nr_to_write);
 			page_cache_release(page);
 			spin_lock(&pagemap_lru_lock);
 			continue;
@@ -232,7 +350,7 @@
 		if (PagePrivate(page)) {
 			spin_unlock(&pagemap_lru_lock);
 
-			/* avoid to free a locked page */
+			/* To avoid freeing our page before we're done. */
 			page_cache_get(page);
 
 			if (try_to_release_page(page, gfp_mask)) {
@@ -240,268 +358,311 @@
 					/* effectively free the page here */
 					unlock_page(page);
 					page_cache_release(page);
-
+					KERNEL_STAT_INC(pgsteal);
 					spin_lock(&pagemap_lru_lock);
-					if (--nr_pages)
-						continue;
-					break;
+					cleaned_pages++;
+					continue;
 				} else {
 					/*
-					 * The page is still in pagecache so undo the stuff
-					 * before the try_to_release_page since we've not
-					 * finished and we can now try the next step.
+					 * We freed the buffers but may have
+					 * slept; undo the stuff we did before
+					 * try_to_release_page and fall through
+					 * to the next step.
 					 */
 					page_cache_release(page);
-
 					spin_lock(&pagemap_lru_lock);
 				}
 			} else {
 				/* failed to drop the buffers so stop here */
 				unlock_page(page);
 				page_cache_release(page);
-
 				spin_lock(&pagemap_lru_lock);
 				continue;
 			}
 		}
 
 		/*
-		 * This is the non-racy check for busy page.
-		 */
-		if (mapping) {
-			write_lock(&mapping->page_lock);
-			if (is_page_cache_freeable(page))
-				goto page_freeable;
-			write_unlock(&mapping->page_lock);
-		}
-		unlock_page(page);
-		continue;
-page_freeable:
-		/*
-		 * It is critical to check PageDirty _after_ we made sure
-		 * the page is freeable* so not in use by anybody.
+		 * If the page is really freeable now, move it to the
+		 * inactive_clean list.
+		 *
+		 * We re-test everything since the page could have been
+		 * used by somebody else while we waited on IO above.
+		 * This test is not safe from races, but only the one
+		 * in reclaim_page() needs to be.
 		 */
-		if (PageDirty(page)) {
-			write_unlock(&mapping->page_lock);
+		pte_chain_lock(page);
+		if (mapping && !PageDirty(page) && !page->pte.chain &&
+				page_count(page) == 1) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_clean_list(page);
+			pte_chain_unlock(page);
 			unlock_page(page);
-			continue;
-		}
-
-		/* point of no return */
-		if (likely(!PageSwapCache(page))) {
-			__remove_inode_page(page);
-			write_unlock(&mapping->page_lock);
+			cleaned_pages++;
 		} else {
-			swp_entry_t swap;
-			swap.val = page->index;
-			__delete_from_swap_cache(page);
-			write_unlock(&mapping->page_lock);
-			swap_free(swap);
+			/*
+			 * OK, we don't know what to do with the page.
+			 * It's no use keeping it here, so we move it to
+			 * the active list.
+			 */
+page_active:
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			pte_chain_unlock(page);
+			unlock_page(page);
+			KERNEL_STAT_INC(pgactivate);
 		}
+	}
+	spin_unlock(&pagemap_lru_lock);
 
-		__lru_cache_del(page);
-		unlock_page(page);
+	/* Return the number of pages moved to the inactive_clean list. */
+	return cleaned_pages;
+}
 
-		/* effectively free the page here */
-		page_cache_release(page);
-		KERNEL_STAT_INC(pgsteal);
-		if (--nr_pages)
-			continue;
-		goto out;
-page_active:
-		/*
-		 * OK, we don't know what to do with the page.
-		 * It's no use keeping it here, so we move it to
-		 * the active list.
-		 */
-		del_page_from_inactive_list(page);
-		add_page_to_active_list(page);
-		pte_chain_unlock(page);
-		unlock_page(page);
-		KERNEL_STAT_INC(pgactivate);
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function iterates over all zones and calls page_launder_zone(),
+ * balancing still needs to be added...
+ */
+int page_launder(int gfp_mask)
+{
+	int maxtry = 1 << DEF_PRIORITY;
+	struct zone_struct * zone;
+	int freed = 0;
+
+	/* Global balancing while we have a global shortage. */
+	while (maxtry-- && free_high(ALL_ZONES) >= 0) {
+		for_each_zone(zone)
+			if (free_plenty(zone) >= 0)
+				freed += page_launder_zone(zone, gfp_mask, 6);
 	}
-out:	spin_unlock(&pagemap_lru_lock);
-	return nr_pages;
+	
+	/* Clean up the remaining zones with a serious shortage, if any. */
+	for_each_zone(zone)
+		if (free_min(zone) >= 0)
+			freed += page_launder_zone(zone, gfp_mask, 0);
+
+	return freed;
 }
 
-/*
- * This moves pages from the active list to
- * the inactive list.
+/**
+ * refill_inactive_zone - scan the active list and find pages to deactivate
+ * @priority: how much are we allowed to scan
  *
- * We move them the other way if the page is 
- * referenced by one or more processes, from rmap
+ * This function will scan a portion of the active list of a zone to find
+ * unused pages, those pages will then be moved to the inactive list.
  */
-static void refill_inactive(int nr_pages)
+int refill_inactive_zone(struct zone_struct * zone, int priority)
 {
-	struct list_head * entry;
+	int maxscan = zone->active_pages >> priority;
+	int target = inactive_high(zone);
+	struct list_head * page_lru;
+	int nr_deactivated = 0;
+	struct page * page;
 
+	/* Take the lock while messing with the list... */
 	spin_lock(&pagemap_lru_lock);
-	entry = active_list.prev;
-	while (nr_pages-- && entry != &active_list) {
-		struct page * page;
-
-		page = list_entry(entry, struct page, lru);
-		entry = entry->prev;
+	while (maxscan-- && !list_empty(&zone->active_list)) {
+		page_lru = zone->active_list.prev;
+		page = list_entry(page_lru, struct page, lru);
 
-		KERNEL_STAT_INC(pgscan);
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (unlikely(!PageActive(page))) {
+			printk("VM: refill_inactive, wrong page on list.\n");
+			list_del(page_lru);
+			dec_page_state(nr_active_pages);
+			continue;
+		}
+		
+		/* Needed to follow page->mapping */
+		if (TestSetPageLocked(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &zone->active_list);
+			KERNEL_STAT_INC(pgscan);
+			continue;
+		}
 
+		/*
+		 * If the object the page is in is not in use we don't
+		 * bother with page aging.  If the page is touched again
+		 * while on the inactive_clean list it'll be reactivated.
+		 * From here until the end of the current iteration
+		 * both PG_locked and the pte_chain_lock are held.
+		 */
 		pte_chain_lock(page);
-		if (page->pte.chain && page_referenced(page)) {
-			list_del(&page->lru);
-			list_add(&page->lru, &active_list);
+		if (!page_mapping_inuse(page)) {
 			pte_chain_unlock(page);
+			unlock_page(page);
+			drop_page(page);
 			continue;
 		}
-		del_page_from_active_list(page);
-		add_page_to_inactive_list(page);
+
+		/*
+		 * Do aging on the pages.
+		 */
+		if (page_referenced(page)) {
+			age_page_up(page);
+		} else {
+			age_page_down(page);
+		}
+
+		/* 
+		 * If the page age is 'hot' and the process using the
+		 * page doesn't exceed its RSS limit we keep the page.
+		 * Otherwise we move it to the inactive_dirty list.
+		 */
+		if (page->age && !page_over_rsslimit(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &zone->active_list);
+		} else {
+			deactivate_page_nolock(page);
+			if (++nr_deactivated > target) {
+				pte_chain_unlock(page);
+				unlock_page(page);
+				goto done;
+			}
+		}
 		pte_chain_unlock(page);
-		KERNEL_STAT_INC(pgdeactivate);
+		unlock_page(page);
+
+		/* Low latency reschedule point */
+		if (need_resched()) {
+			spin_unlock(&pagemap_lru_lock);
+			schedule();
+			spin_lock(&pagemap_lru_lock);
+		}
 	}
+
+done:
 	spin_unlock(&pagemap_lru_lock);
+	return nr_deactivated;
 }
 
-static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
+/**
+ * refill_inactive - checks all zones and refills the inactive list as needed
+ *
+ * This function tries to balance page eviction from all zones by aging
+ * the pages from each zone in the same ratio until the global inactive
+ * shortage is resolved. After that it does one last "clean-up" scan to
+ * fix up local inactive shortages.
+ */
+int refill_inactive(void)
 {
-	int chunk_size = nr_pages;
-	unsigned long ratio;
-	struct page_state ps;
-	int max_scan;
+	int maxtry = 1 << DEF_PRIORITY;
+	zone_t * zone;
+	int ret = 0;
 
-	nr_pages -= kmem_cache_reap(gfp_mask);
-	if (nr_pages <= 0)
-		return 0;
+	/* Global balancing while we have a global shortage. */
+	while (maxtry-- && inactive_low(ALL_ZONES) >= 0) {
+		for_each_zone(zone) {
+			if (inactive_high(zone) >= 0)
+				ret += refill_inactive_zone(zone, DEF_PRIORITY);
+		}
+	}
 
-	nr_pages = chunk_size;
+	/* Local balancing for zones which really need it. */
+	for_each_zone(zone) {
+		if (inactive_min(zone) >= 0)
+			ret += refill_inactive_zone(zone, 0);
+	}
 
-	/*
-	 * Try to keep the active list 2/3 of the size of the cache
-	 */
-	get_page_state(&ps);
-	ratio = (unsigned long)nr_pages * ps.nr_active /
-				((ps.nr_inactive | 1) * 2);
-	refill_inactive(ratio);
-	max_scan = ps.nr_inactive / priority;
-	nr_pages = shrink_cache(nr_pages, classzone,
-				gfp_mask, priority, max_scan);
-	if (nr_pages <= 0)
-		return 0;
+	return ret;
+}
 
-	wakeup_bdflush();
+/**
+ * background_aging - slow background aging of zones
+ * @priority: priority at which to scan
+ *
+ * When the VM load is low or nonexistent, this function is
+ * called once a second to "sort" the pages in the VM. This
+ * way we know which pages to evict once a load spike happens.
+ * The effects of this function are very slow, the CPU usage
+ * should be minimal to nonexistent under most loads.
+ */
+static inline void background_aging(int priority)
+{
+	struct zone_struct * zone;
 
-	shrink_dcache_memory(priority, gfp_mask);
+	for_each_zone(zone)
+		if (inactive_high(zone) > 0)
+			refill_inactive_zone(zone, priority);
+}
 
-	/* After shrinking the dcache, get rid of unused inodes too .. */
-	shrink_icache_memory(1, gfp_mask);
+/*
+ * Worker function for kswapd and try_to_free_pages, we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages(unsigned int gfp_mask)
+{
+	int ret = 0;
+
+	/*
+	 * Eat memory from filesystem page cache, 
+	 * dentry, inode and filesystem quota caches.
+	 */
+	ret += page_launder(gfp_mask);
+	ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+	ret += shrink_icache_memory(1, gfp_mask);
 #ifdef CONFIG_QUOTA
-	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+	ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif
+	/*
+	 * Move pages from the active list to the inactive list.
+	 */
+	refill_inactive();
 
-	return nr_pages;
-}
-
-int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
-{
-	int priority = DEF_PRIORITY;
-	int nr_pages = SWAP_CLUSTER_MAX;
+	/* 	
+	 * Reclaim unused slab cache memory.
+	 */
+	ret += kmem_cache_reap(gfp_mask);
 
-	KERNEL_STAT_INC(pageoutrun);
+	refill_freelist();
 
-	do {
-		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
-		if (nr_pages <= 0)
-			return 1;
-	} while (--priority);
+	/* Start IO when needed. */
+	if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
+		blk_run_queues();
 
 	/*
 	 * Hmm.. Cache shrink failed - time to kill something?
 	 * Mhwahahhaha! This is the part I really like. Giggle.
 	 */
-	out_of_memory();
-	return 0;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-
-static int check_classzone_need_balance(zone_t * classzone)
-{
-	zone_t * first_classzone;
-
-	first_classzone = classzone->zone_pgdat->node_zones;
-	while (classzone >= first_classzone) {
-		if (classzone->free_pages > classzone->pages_high)
-			return 0;
-		classzone--;
-	}
-	return 1;
+	if (!ret && free_min(ANY_ZONE) > 0)
+		out_of_memory();
+	return ret;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+/**
+ * refill_freelist - move inactive_clean pages to free list if needed
+ *
+ * Move some pages from the inactive_clean lists to the free
+ * lists so atomic allocations have pages to work from. This
+ * function really only does something when we don't have a 
+ * userspace load on __alloc_pages().
+ *
+ * We refill the freelist in a bump from pages_min to pages_min * 2
+ * in order to give the buddy allocator something to play with.
+ */
+static void refill_freelist(void)
 {
-	int need_more_balance = 0, i;
+	struct page * page;
 	zone_t * zone;
 
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		cond_resched();
-		if (!zone->need_balance)
-			continue;
-		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
-			zone->need_balance = 0;
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
+	for_each_zone(zone) {
+		if (!zone->size || zone->free_pages >= zone->pages_min)
 			continue;
-		}
-		if (check_classzone_need_balance(zone))
-			need_more_balance = 1;
-		else
-			zone->need_balance = 0;
-	}
 
-	return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
-	int need_more_balance;
-	pg_data_t * pgdat;
-
-	do {
-		need_more_balance = 0;
-		pgdat = pgdat_list;
-		do
-			need_more_balance |= kswapd_balance_pgdat(pgdat);
-		while ((pgdat = pgdat->node_next));
-	} while (need_more_balance);
-}
-
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-	zone_t * zone;
-	int i;
-
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		if (!zone->need_balance)
-			continue;
-		return 0;
+		while (zone->free_pages < zone->pages_min * 2) {
+			page = reclaim_page(zone);
+			if (!page)
+				break;
+			__free_page(page);
+		}
 	}
-
-	return 1;
-}
-
-static int kswapd_can_sleep(void)
-{
-	pg_data_t * pgdat;
-
-	pgdat = pgdat_list;
-	do {
-		if (kswapd_can_sleep_pgdat(pgdat))
-			continue;
-		return 0;
-	} while ((pgdat = pgdat->node_next));
-
-	return 1;
 }
 
 /*
@@ -520,7 +681,6 @@
 int kswapd(void *unused)
 {
 	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
 
 	daemonize();
 	strcpy(tsk->comm, "kswapd");
@@ -544,26 +704,152 @@
 	 * Kswapd main loop.
 	 */
 	for (;;) {
+		static long recalc = 0;
 		if (current->flags & PF_FREEZE)
 			refrigerator(PF_IOTHREAD);
-		__set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kswapd_wait, &wait);
+		/*
+		 * We try to rebalance the VM either when we have a
+		 * global shortage of free pages or when one particular
+		 * zone is very short on free pages.
+		 */
+		if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0)
+			do_try_to_free_pages(GFP_KSWAPD);
 
-		mb();
-		if (kswapd_can_sleep())
-			schedule();
+		refill_freelist();
+
+		/* Once a second ... */
+		if (time_after(jiffies, recalc + HZ)) {
+			recalc = jiffies;
+
+			/* Do background page aging. */
+			background_aging(DEF_PRIORITY);
+		}
+
+		wakeup_memwaiters();
+	}
+}
 
-		__set_current_state(TASK_RUNNING);
+static int kswapd_overloaded;
+unsigned int kswapd_minfree; /* initialized in mm/page_alloc.c */
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+
+/**
+ * wakeup_kswapd - wake up the pageout daemon
+ * @gfp_mask: page freeing flags
+ *
+ * This function wakes up kswapd and can, under heavy VM pressure,
+ * put the calling task to sleep temporarily.
+ */
+void wakeup_kswapd(unsigned int gfp_mask)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	/* If we're in the memory freeing business ourselves, don't sleep
+	 * but just wake kswapd and go back to business.
+	 */
+	if (current->flags & PF_MEMALLOC) {
+		wake_up_interruptible(&kswapd_wait);
+		return;
+	}
+
+	/* We need all of kswapd's GFP flags, otherwise we can't sleep on it.
+	 * We still wake kswapd of course.
+	 */
+	if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) {
+		wake_up_interruptible(&kswapd_wait);
+		return;
+	}
+	
+	add_wait_queue(&kswapd_done, &wait);
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        
+        /* Wake kswapd .... */
+        wake_up_interruptible(&kswapd_wait);
+        
+        /* ... and check if we need to wait on it */
+	if ((free_low(ALL_ZONES) > (kswapd_minfree / 2)) && !kswapd_overloaded)
+		schedule();
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&kswapd_done, &wait);
+}
+
+static void wakeup_memwaiters(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	        
+	add_wait_queue(&kswapd_wait, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* Don't let the processes waiting on memory get stuck, ever. */
+	wake_up(&kswapd_done);
+
+	/* Enough free RAM, we can easily keep up with memory demand. */
+	if (free_high(ALL_ZONES) <= 0) {
+		schedule_timeout(HZ);
 		remove_wait_queue(&kswapd_wait, &wait);
+		return;
+	}
+	remove_wait_queue(&kswapd_wait, &wait);
 
-		/*
-		 * If we actually get into a low-memory situation,
-		 * the processes needing more memory will wake us
-		 * up on a more timely basis.
-		 */
-		kswapd_balance();
-		blk_run_queues();
+	/* OK, the VM is very loaded. Sleep instead of using all CPU. */
+	kswapd_overloaded = 1;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(HZ / 4);
+	kswapd_overloaded = 0;
+	return;
+}
+
+/**
+ * try_to_free_pages - run the pageout code ourselves
+ * @gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * When the load on the system gets higher, it can happen
+ * that kswapd no longer manages to keep enough memory
+ * free. In those cases user programs allocating memory
+ * will call try_to_free_pages() and help the pageout code.
+ * This has the effects of freeing memory and slowing down
+ * the largest memory hogs a bit.
+ */
+int try_to_free_pages(unsigned int gfp_mask)
+{
+	int ret = 1;
+	if (gfp_mask & __GFP_WAIT) {
+		KERNEL_STAT_INC(pageoutrun);
+		current->flags |= PF_MEMALLOC;
+		ret = do_try_to_free_pages(gfp_mask);
+		current->flags &= ~PF_MEMALLOC;
 	}
+	return ret;
+}
+
+/**
+ * rss_free_pages - run part of the pageout code and slow down a bit
+ * @gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * This function is called when a task is over its RSS limit and
+ * has a page fault.  Its goal is to free some memory so non-hogs
+ * can run faster and slow down itself when needed so it won't eat
+ * the memory non-hogs can use.
+ */
+void rss_free_pages(unsigned int gfp_mask)
+{
+	long pause = 0;
+	if (current->flags & PF_MEMALLOC)
+		return;
+	current->flags |= PF_MEMALLOC;
+
+	do {
+		page_launder(gfp_mask);
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(pause);
+		set_current_state(TASK_RUNNING);
+		pause++;
+	} while (free_high(ALL_ZONES) >= 0);
+
+	current->flags &= ~PF_MEMALLOC;
+	return;
 }
 
 static int __init kswapd_init(void)