diff -uNr linux-2.5.28/Makefile linux-2.5.28-rmap/Makefile --- linux-2.5.28/Makefile Wed Jul 24 20:43:47 2002 +++ linux-2.5.28-rmap/Makefile Wed Jul 24 20:47:13 2002 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 5 SUBLEVEL = 28 -EXTRAVERSION = +EXTRAVERSION = -rmap # *DOCUMENTATION* # Too see a list of typical targets execute "make help" diff -uNr linux-2.5.28/fs/buffer.c linux-2.5.28-rmap/fs/buffer.c --- linux-2.5.28/fs/buffer.c Wed Jul 24 20:43:53 2002 +++ linux-2.5.28-rmap/fs/buffer.c Wed Jul 24 20:47:19 2002 @@ -463,17 +463,13 @@ } /* - * FIXME: What is this function actually trying to do? Why "zones[0]"? + * FIXME: What is this function actually trying to do? * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER? */ static void free_more_memory(void) { - zone_t *zone; - - zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages(GFP_NOFS); blk_run_queues(); yield(); } diff -uNr linux-2.5.28/fs/dcache.c linux-2.5.28-rmap/fs/dcache.c --- linux-2.5.28/fs/dcache.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/fs/dcache.c Tue Jul 23 19:12:41 2002 @@ -603,8 +603,7 @@ count = dentry_stat.nr_unused / priority; prune_dcache(count); - kmem_cache_shrink(dentry_cache); - return 0; + return kmem_cache_shrink(dentry_cache); } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) diff -uNr linux-2.5.28/fs/dquot.c linux-2.5.28-rmap/fs/dquot.c --- linux-2.5.28/fs/dquot.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/fs/dquot.c Tue Jul 23 19:12:41 2002 @@ -498,8 +498,7 @@ count = dqstats.free_dquots / priority; prune_dqcache(count); unlock_kernel(); - kmem_cache_shrink(dquot_cachep); - return 0; + return kmem_cache_shrink(dquot_cachep); } /* diff -uNr linux-2.5.28/fs/inode.c linux-2.5.28-rmap/fs/inode.c --- linux-2.5.28/fs/inode.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/fs/inode.c Tue Jul 23 19:12:41 2002 @@ -431,8 +431,7 @@ count = inodes_stat.nr_unused / priority; 
prune_icache(count); - kmem_cache_shrink(inode_cachep); - return 0; + return kmem_cache_shrink(inode_cachep); } /* diff -uNr linux-2.5.28/fs/mpage.c linux-2.5.28-rmap/fs/mpage.c --- linux-2.5.28/fs/mpage.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/fs/mpage.c Tue Jul 23 19:11:09 2002 @@ -557,12 +557,11 @@ if (page->mapping && !PageWriteback(page) && TestClearPageDirty(page)) { - /* FIXME: batch this up */ - if (!PageActive(page) && PageLRU(page)) { + if (PageInactiveDirty(page)) { spin_lock(&pagemap_lru_lock); - if (!PageActive(page) && PageLRU(page)) { + if (PageInactiveDirty(page)) { list_del(&page->lru); - list_add(&page->lru, &inactive_list); + list_add(&page->lru, &page_zone(page)->inactive_dirty_list); } spin_unlock(&pagemap_lru_lock); } diff -uNr linux-2.5.28/fs/proc/proc_misc.c linux-2.5.28-rmap/fs/proc/proc_misc.c --- linux-2.5.28/fs/proc/proc_misc.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/fs/proc/proc_misc.c Wed Jul 24 21:02:05 2002 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -151,7 +152,9 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "Inact_dirty: %8lu kB\n" + "Inact_clean: %8lu kB\n" + "Inact_target: %8lu kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -161,15 +164,16 @@ "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" "PageTables: %8lu kB\n" - "PteChainTot: %8lu kB\n" - "PteChainUsed: %8lu kB\n", + "ReverseMaps: %8lu\n", K(i.totalram), K(i.freeram), K(i.sharedram), K(ps.nr_pagecache-swapper_space.nrpages), K(swapper_space.nrpages), - K(ps.nr_active), - K(ps.nr_inactive), + K(ps.nr_active_pages), + K(ps.nr_inactive_dirty_pages), + K(ps.nr_inactive_clean_pages), + K(inactive_target()), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), @@ -179,8 +183,7 @@ K(ps.nr_dirty), K(ps.nr_writeback), K(ps.nr_page_table_pages), - K(ps.nr_pte_chain_pages), - ps.used_pte_chains_bytes >> 10 + ps.nr_reverse_maps ); return 
proc_calc_metrics(page, start, off, count, eof, len); diff -uNr linux-2.5.28/include/linux/init_task.h linux-2.5.28-rmap/include/linux/init_task.h --- linux-2.5.28/include/linux/init_task.h Wed Jul 24 20:43:54 2002 +++ linux-2.5.28-rmap/include/linux/init_task.h Wed Jul 24 20:47:21 2002 @@ -27,6 +27,7 @@ mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ page_table_lock: SPIN_LOCK_UNLOCKED, \ mmlist: LIST_HEAD_INIT(name.mmlist), \ + rlimit_rss: RLIM_INFINITY, \ } #define INIT_SIGNALS { \ diff -uNr linux-2.5.28/include/linux/mm.h linux-2.5.28-rmap/include/linux/mm.h --- linux-2.5.28/include/linux/mm.h Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/include/linux/mm.h Tue Jul 23 19:09:15 2002 @@ -19,9 +19,6 @@ extern unsigned long num_physpages; extern void * high_memory; extern int page_cluster; -/* The inactive_clean lists are per zone. */ -extern struct list_head active_list; -extern struct list_head inactive_list; #include #include @@ -157,10 +154,10 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ - union { - struct pte_chain * chain; /* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_t * direct; + unsigned char age; /* Page aging counter. 
*/ + union { /* Reverse pte mapping pointer, */ + struct pte_chain * chain; /* protected by PG_chainlock */ + pte_t * direct; } pte; unsigned long private; /* mapping-private opaque data */ @@ -299,13 +296,17 @@ #define page_address(page) ((page)->virtual) -#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ +#elif defined(CONFIG_DISCONTIGMEM) + +extern unsigned long page_address(struct page * page); + +#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */ #define page_address(page) \ __va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT) \ + page_zone(page)->zone_start_paddr) -#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ +#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL && !CONFIG_DISCONTIGMEM */ /* * Error return values for the *_nopage functions @@ -326,6 +327,7 @@ /* The array of struct pages */ extern struct page *mem_map; +extern void FASTCALL(fixup_freespace(struct zone_struct *, int)); extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); diff -uNr linux-2.5.28/include/linux/mm_inline.h linux-2.5.28-rmap/include/linux/mm_inline.h --- linux-2.5.28/include/linux/mm_inline.h Wed Dec 31 17:00:00 1969 +++ linux-2.5.28-rmap/include/linux/mm_inline.h Tue Jul 23 19:09:19 2002 @@ -0,0 +1,278 @@ +#ifndef _LINUX_MM_INLINE_H +#define _LINUX_MM_INLINE_H + +#include +#include + +/* + * These inline functions tend to need bits and pieces of all the + * other VM include files, meaning they cannot be defined inside + * one of the other VM include files. + * + * The include file mess really needs to be cleaned up... 
+ */ + +static inline void add_page_to_active_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageActive(page); + list_add(&page->lru, &zone->active_list); + zone->active_pages++; + inc_page_state(nr_active_pages); +} + +static inline void add_page_to_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveDirty(page); + list_add(&page->lru, &zone->inactive_dirty_list); + zone->inactive_dirty_pages++; + inc_page_state(nr_inactive_dirty_pages); +} + +static inline void add_page_to_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + DEBUG_LRU_PAGE(page); + SetPageInactiveClean(page); + list_add(&page->lru, &zone->inactive_clean_list); + zone->inactive_clean_pages++; + inc_page_state(nr_inactive_clean_pages); +} + +static inline void del_page_from_active_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageActive(page); + dec_page_state(nr_active_pages); + zone->active_pages--; + KERNEL_STAT_INC(pgdeactivate); + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_dirty_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveDirty(page); + dec_page_state(nr_inactive_dirty_pages); + zone->inactive_dirty_pages--; + DEBUG_LRU_PAGE(page); +} + +static inline void del_page_from_inactive_clean_list(struct page * page) +{ + struct zone_struct * zone = page_zone(page); + list_del(&page->lru); + ClearPageInactiveClean(page); + zone->inactive_clean_pages--; + dec_page_state(nr_inactive_clean_pages); + DEBUG_LRU_PAGE(page); +} + +/* + * Inline functions to control some balancing in the VM. + * + * Note that we do both global and per-zone balancing, with + * most of the balancing done globally. 
+ */ +#define PLENTY_FACTOR 2 +#define ALL_ZONES NULL +#define ANY_ZONE (struct zone_struct *)(~0UL) +#define INACTIVE_FACTOR 5 + +#define VM_MIN 0 +#define VM_LOW 1 +#define VM_HIGH 2 +#define VM_PLENTY 3 +static inline int zone_free_limit(struct zone_struct * zone, int limit) +{ + int free, target, delta; + + /* This is really nasty, but GCC should completely optimise it away. */ + if (limit == VM_MIN) + target = zone->pages_min; + else if (limit == VM_LOW) + target = zone->pages_low; + else if (limit == VM_HIGH) + target = zone->pages_high; + else + target = zone->pages_high * PLENTY_FACTOR; + + free = zone->free_pages + zone->inactive_clean_pages; + delta = target - free; + + return delta; +} + +static inline int free_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_free_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_free_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_free_limit(zone, limit); + } + + return shortage; +} + +/** + * free_min - test for critically low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a serious shortage of free and + * clean pages, zero or negative if there is no serious shortage. + */ +static inline int free_min(struct zone_struct * zone) +{ + return free_limit(zone, VM_MIN); +} + +/** + * free_low - test for low amount of free pages + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if we have a shortage of free and + * clean pages, zero or negative if there is no shortage. 
+ */ +static inline int free_low(struct zone_struct * zone) +{ + return free_limit(zone, VM_LOW); +} + +/** + * free_high - test if amount of free pages is less than ideal + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free and clean + * pages is below kswapd's target, zero or negative if we + * have more than enough free and clean pages. + */ +static inline int free_high(struct zone_struct * zone) +{ + return free_limit(zone, VM_HIGH); +} + +/** + * free_plenty - test if enough pages are freed + * @zone: zone to test, ALL_ZONES to test memory globally + * + * Returns a positive value if the number of free + clean pages + * in a zone is not yet excessive and kswapd is still allowed to + * free pages here, a negative value if kswapd should leave the + * zone alone. + */ +static inline int free_plenty(struct zone_struct * zone) +{ + return free_limit(zone, VM_PLENTY); +} + +/* + * The inactive page target is the free target + 20% of (active + inactive) + * pages. + */ +static inline int zone_inactive_limit(struct zone_struct * zone, int limit) +{ + int inactive, target, inactive_base; + + inactive_base = zone->active_pages + zone->inactive_dirty_pages; + inactive_base /= INACTIVE_FACTOR; + + /* GCC should optimise this away completely. 
*/ + if (limit == VM_MIN) + target = zone->pages_high + inactive_base / 2; + else if (limit == VM_LOW) + target = zone->pages_high + inactive_base; + else + target = zone->pages_high + inactive_base * 2; + + inactive = zone->free_pages + zone->inactive_clean_pages; + inactive += zone->inactive_dirty_pages; + + return target - inactive; +} + +static inline int inactive_limit(struct zone_struct * zone, int limit) +{ + int shortage = 0, local; + + if (zone == ALL_ZONES) { + for_each_zone(zone) + shortage += zone_inactive_limit(zone, limit); + } else if (zone == ANY_ZONE) { + for_each_zone(zone) { + local = zone_inactive_limit(zone, limit); + shortage += max(local, 0); + } + } else { + shortage = zone_inactive_limit(zone, limit); + } + + return shortage; +} + +/** + * inactive_min - test for serious shortage of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no serious shortage of (free + inactive) pages + */ +static inline int inactive_min(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_MIN); +} + +/** + * inactive_low - test for shortage of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have no shortage of (free + inactive) pages + */ +static inline int inactive_low(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_LOW); +} + +/** + * inactive_high - less than ideal amount of (free + inactive) pages + * @zone: zone to test, ALL_ZONES for global testing + * + * Returns the shortage as a positive number, a negative number + * if we have more than enough (free + inactive) pages + */ +static inline int inactive_high(struct zone_struct * zone) +{ + return inactive_limit(zone, VM_HIGH); +} + +/* + * inactive_target - number of inactive pages we ought to have. 
+ */ +static inline unsigned long inactive_target(void) +{ + struct page_state ps; + unsigned long target; + + get_page_state(&ps); + target = ps.nr_active_pages + ps.nr_inactive_dirty_pages + + ps.nr_inactive_clean_pages; + + target /= INACTIVE_FACTOR; + + return target; +} + +#endif /* _LINUX_MM_INLINE_H */ diff -uNr linux-2.5.28/include/linux/mmzone.h linux-2.5.28-rmap/include/linux/mmzone.h --- linux-2.5.28/include/linux/mmzone.h Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/include/linux/mmzone.h Tue Jul 23 19:09:15 2002 @@ -26,6 +26,8 @@ struct pglist_data; +#define MAX_CHUNKS_PER_ZONE 8 + /* * On machines where it is needed (eg PCs) we divide physical memory * into multiple physical zones. On a PC we have 3 zones: @@ -40,12 +42,17 @@ */ spinlock_t lock; unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; - int need_balance; + unsigned long active_pages; + unsigned long inactive_dirty_pages; + unsigned long inactive_clean_pages; + unsigned long pages_min, pages_low, pages_high, pages_plenty; /* * free areas of different sizes */ + struct list_head active_list; + struct list_head inactive_dirty_list; + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -81,6 +88,13 @@ */ struct pglist_data *zone_pgdat; struct page *zone_mem_map; + +#if defined(CONFIG_DISCONTIGMEM) && !defined(CONFIG_HIGHMEM) + struct page *zone_chunk_page_start[MAX_CHUNKS_PER_ZONE + 1]; + unsigned long zone_chunk_phys_start[MAX_CHUNKS_PER_ZONE]; + unsigned int zone_nr_chunks; +#endif + unsigned long zone_start_paddr; unsigned long zone_start_mapnr; @@ -142,15 +156,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; -static inline int memclass(zone_t *pgzone, zone_t *classzone) -{ - if (pgzone->zone_pgdat != classzone->zone_pgdat) - return 0; - if (pgzone > classzone) - return 0; - return 1; -} - /* * The following two are not meant for general usage. They are here as * prototypes for the discontig memory code. 
@@ -163,6 +168,60 @@ extern pg_data_t contig_page_data; +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat: pg_data_t * variable + * + * Meant to help with common loops of the form + * pgdat = pgdat_list; + * while(pgdat) { + * ... + * pgdat = pgdat->node_next; + * } + */ +#define for_each_pgdat(pgdat) \ + for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) + + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline zone_t *next_zone(zone_t *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone - pgdat->node_zones < MAX_NR_ZONES - 1) + zone++; + + else if (pgdat->node_next) { + pgdat = pgdat->node_next; + zone = pgdat->node_zones; + } else + zone = NULL; + + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone: zone_t * variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. This basically means for_each_zone() is an + * easier to read version of this piece of code: + * + * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) { + * for(i = 0; i < MAX_NR_ZONES; ++i) { + * zone_t * z = pgdat->node_zones + i; + * ... + * } + * } + */ +#define for_each_zone(zone) \ + for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) + + #ifndef CONFIG_DISCONTIGMEM #define NODE_DATA(nid) (&contig_page_data) diff -uNr linux-2.5.28/include/linux/page-flags.h linux-2.5.28-rmap/include/linux/page-flags.h --- linux-2.5.28/include/linux/page-flags.h Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/include/linux/page-flags.h Tue Jul 23 19:09:16 2002 @@ -47,7 +47,7 @@ * locked- and dirty-page accounting. The top eight bits of page->flags are * used for page->zone, so putting flag bits there doesn't work. */ -#define PG_locked 0 /* Page is locked. Don't touch. */ +#define PG_locked 0 /* Page is locked. Don't touch. 
*/ #define PG_error 1 #define PG_referenced 2 #define PG_uptodate 3 @@ -55,20 +55,20 @@ #define PG_dirty_dontuse 4 #define PG_lru 5 #define PG_active 6 -#define PG_slab 7 /* slab debug (Suparna wants this) */ - -#define PG_highmem 8 -#define PG_checked 9 /* kill me in 2.5.. */ -#define PG_arch_1 10 -#define PG_reserved 11 - -#define PG_private 12 /* Has something at ->private */ -#define PG_writeback 13 /* Page is under writeback */ -#define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ - -#define PG_direct 16 /* ->pte_chain points directly at pte */ - +#define PG_inactive_clean 7 +#define PG_inactive_dirty 8 +#define PG_slab 9 /* slab debug (Suparna wants this) */ + +#define PG_highmem 10 +#define PG_checked 11 /* kill me in 2.5.. */ +#define PG_arch_1 12 +#define PG_reserved 13 + +#define PG_private 14 /* Has something at ->private */ +#define PG_writeback 15 /* Page is under writeback */ +#define PG_nosave 16 /* Used for system suspend/resume */ +#define PG_chainlock 17 /* lock bit for ->pte_chain */ +#define PG_direct 18 /* ->pte_chain points directly at pte */ /* * Global page accounting. One instance per CPU. 
*/ @@ -76,11 +76,11 @@ unsigned long nr_dirty; unsigned long nr_writeback; unsigned long nr_pagecache; - unsigned long nr_active; /* on active_list LRU */ - unsigned long nr_inactive; /* on inactive_list LRU */ + unsigned long nr_active_pages; /* on active_list LRU */ + unsigned long nr_inactive_clean_pages; /* on inactive_clean_list LRU */ + unsigned long nr_inactive_dirty_pages; /* on inactive_dirty_list LRU */ unsigned long nr_page_table_pages; - unsigned long nr_pte_chain_pages; - unsigned long used_pte_chains_bytes; + unsigned long nr_reverse_maps; } ____cacheline_aligned_in_smp page_states[NR_CPUS]; extern void get_page_state(struct page_state *ret); @@ -156,12 +156,22 @@ }) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) + +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) diff -uNr 
linux-2.5.28/include/linux/sched.h linux-2.5.28-rmap/include/linux/sched.h --- linux-2.5.28/include/linux/sched.h Wed Jul 24 20:43:54 2002 +++ linux-2.5.28-rmap/include/linux/sched.h Wed Jul 24 20:47:21 2002 @@ -190,6 +190,7 @@ unsigned long def_flags; unsigned long cpu_vm_mask; unsigned long swap_address; + unsigned long rlimit_rss; unsigned dumpable:1; @@ -269,9 +270,6 @@ struct list_head tasks; struct mm_struct *mm, *active_mm; - struct list_head local_pages; - - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; diff -uNr linux-2.5.28/include/linux/swap.h linux-2.5.28-rmap/include/linux/swap.h --- linux-2.5.28/include/linux/swap.h Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/include/linux/swap.h Tue Jul 23 19:09:15 2002 @@ -161,12 +161,19 @@ extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(drop_page(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern void wakeup_kswapd(unsigned int); +extern void rss_free_pages(unsigned int); /* linux/mm/page_io.c */ int swap_readpage(struct file *file, struct page *page); @@ -220,43 +227,26 @@ extern void FASTCALL(mark_page_accessed(struct page *)); /* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 + +/* * List add/del helper macros. These must be called * with the pagemap_lru_lock held! 
*/ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ - BUG(); \ if (PageActive(page)) \ BUG(); \ -} while (0) - -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - inc_page_state(nr_active); \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - inc_page_state(nr_inactive); \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - dec_page_state(nr_active); \ -} while (0) - -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - dec_page_state(nr_inactive); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ + BUG(); \ } while (0) extern spinlock_t swaplock; diff -uNr linux-2.5.28/init/main.c linux-2.5.28-rmap/init/main.c --- linux-2.5.28/init/main.c Wed Jul 24 20:43:54 2002 +++ linux-2.5.28-rmap/init/main.c Wed Jul 24 20:47:21 2002 @@ -70,7 +70,7 @@ extern void sysctl_init(void); extern void signals_init(void); extern void buffer_init(void); - +extern void pte_chain_init(void); extern void radix_tree_init(void); extern void free_initmem(void); @@ -386,7 +386,7 @@ mem_init(); kmem_cache_sizes_init(); pgtable_cache_init(); - + pte_chain_init(); mempages = num_physpages; fork_init(mempages); diff -uNr linux-2.5.28/kernel/fork.c linux-2.5.28-rmap/kernel/fork.c --- linux-2.5.28/kernel/fork.c Wed Jul 24 20:43:54 2002 +++ linux-2.5.28-rmap/kernel/fork.c Wed Jul 24 20:47:21 2002 @@ -703,8 +703,6 @@ p->start_time = jiffies; p->security = NULL; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; if (security_ops->task_alloc_security(p)) goto bad_fork_cleanup; diff -uNr linux-2.5.28/kernel/suspend.c linux-2.5.28-rmap/kernel/suspend.c --- linux-2.5.28/kernel/suspend.c Wed Jul 24 20:08:57 2002 +++ linux-2.5.28-rmap/kernel/suspend.c Wed Jul 24 21:17:06 2002 @@ -611,7 
+611,7 @@ static void free_some_memory(void) { printk("Freeing memory: "); - while (try_to_free_pages(&contig_page_data.node_zones[ZONE_HIGHMEM], GFP_KSWAPD, 0)) + while (try_to_free_pages(GFP_KSWAPD)) printk("."); printk("|\n"); } diff -uNr linux-2.5.28/kernel/sys.c linux-2.5.28-rmap/kernel/sys.c --- linux-2.5.28/kernel/sys.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/kernel/sys.c Tue Jul 23 19:11:14 2002 @@ -1166,6 +1166,12 @@ if (resource == RLIMIT_NOFILE) { if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) return -EPERM; + } else if (resource == RLIMIT_RSS && current->mm) { + /* rlimit is specified in bytes, convert to pages */ + unsigned long pages = RLIM_INFINITY; + if (new_rlim.rlim_cur != RLIM_INFINITY) + pages = new_rlim.rlim_cur >> PAGE_SHIFT; + current->mm->rlimit_rss = pages; } retval = security_ops->task_setrlimit(resource, &new_rlim); diff -uNr linux-2.5.28/mm/bootmem.c linux-2.5.28-rmap/mm/bootmem.c --- linux-2.5.28/mm/bootmem.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/bootmem.c Tue Jul 23 19:11:14 2002 @@ -339,12 +339,11 @@ pg_data_t *pgdat = pgdat_list; void *ptr; - while (pgdat) { + for_each_pgdat(pgdat) if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal))) return(ptr); - pgdat = pgdat->node_next; - } + /* * Whoops, we cannot satisfy the allocation request. */ diff -uNr linux-2.5.28/mm/filemap.c linux-2.5.28-rmap/mm/filemap.c --- linux-2.5.28/mm/filemap.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/filemap.c Tue Jul 23 19:11:14 2002 @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -529,7 +529,7 @@ * * In the case of swapcache, try_to_swap_out() has already locked the page, so * SetPageLocked() is ugly-but-OK there too. The required page state has been - * set up by swap_out_add_to_swap_cache(). + * set up by add_to_swap(). 
*/ int add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long offset) @@ -848,15 +848,23 @@ /* * Mark a page as having seen activity. * - * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced + * We immediately reclaim + * the inactive clean pages because those are counted as freeable. + * We don't modify the inactive dirty ones because we're never sure + * if those are freeable anyway. */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page)) { + if (PageInactiveClean(page)) { + struct zone_struct *zone = page_zone(page); + int free = zone->free_pages + zone->inactive_clean_pages; + activate_page(page); - ClearPageReferenced(page); + if (free < zone->pages_low) + wakeup_kswapd(GFP_NOIO); + if (zone->free_pages < zone->pages_min) + fixup_freespace(zone, 1); + return; } else if (!PageReferenced(page)) { SetPageReferenced(page); @@ -1245,7 +1253,7 @@ /* Limit it to a sane percentage of the inactive list.. */ get_page_state(&ps); - max = ps.nr_inactive / 2; + max = ps.nr_inactive_clean_pages / 2; if (nr > max) nr = max; @@ -2060,16 +2068,18 @@ } do { - unsigned long index; - unsigned long offset; + unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. 
@@ -2119,9 +2129,11 @@ } } kunmap(page); - if (!PageReferenced(page)) - SetPageReferenced(page); unlock_page(page); + if (deactivate) + deactivate_page(page); + else + mark_page_accessed(page); page_cache_release(page); if (status < 0) break; diff -uNr linux-2.5.28/mm/memory.c linux-2.5.28-rmap/mm/memory.c --- linux-2.5.28/mm/memory.c Wed Jul 24 20:43:54 2002 +++ linux-2.5.28-rmap/mm/memory.c Wed Jul 24 20:47:21 2002 @@ -36,7 +36,6 @@ * (Gerhard.Wichert@pdb.siemens.de) */ -#include #include #include #include @@ -45,6 +44,7 @@ #include #include #include +#include #include #include @@ -1127,6 +1127,10 @@ struct page *new_page; unsigned long offset; + /* Low on free memory (free_low() > 0 means shortage)? Don't make things worse. */ + if (free_low(ALL_ZONES) > 0) + return; + /* * Get the number of handles we should do readahead io to. */ @@ -1319,6 +1323,8 @@ new_page = page; } + mark_page_accessed(new_page); + spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); @@ -1421,6 +1427,14 @@ current->state = TASK_RUNNING; pgd = pgd_offset(mm, address); + /* + * If we are over our RSS limit and the system needs memory, + * we will free memory for the non-hogs and slow down a bit. + */ + if (mm->rlimit_rss && mm->rss > mm->rlimit_rss && + free_high(ALL_ZONES) > 0) + rss_free_pages(GFP_HIGHUSER); + KERNEL_STAT_INC(pgfault); /* * We need the page table lock to synchronize with kswapd @@ -1457,6 +1471,7 @@ if (!new) return NULL; + KERNEL_STAT_INC(pgfault); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. 
diff -uNr linux-2.5.28/mm/numa.c linux-2.5.28-rmap/mm/numa.c --- linux-2.5.28/mm/numa.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/numa.c Tue Jul 23 19:11:14 2002 @@ -44,6 +44,57 @@ #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) +#ifndef CONFIG_HIGHMEM +unsigned long page_address(struct page * page) +{ + struct zone_struct * zone; + struct page * chunk_page_start; + unsigned long chunk_phys_addr; + int idx_chunk; + + zone = page_zone(page); + /* + * We have to check if the page is on + * a chunk that contains pages from 2 zones. + */ + if(!(page < zone->zone_chunk_page_start[0])) + goto known_zone; + /* + * We need to get the previous zone. + * If there is no such zone, we are in trouble. + */ + if(!page->zone) + BUG(); + + zone = zone_table[(page->zone) - 1]; + + if(zone->zone_pgdat->node_id == page_zone(page)->zone_pgdat->node_id) + goto known_zone; + /* + * Getting here means we have a chunk spread over 2 nodes. + * That shouldn't happen. + */ + BUG(); + + known_zone: + for(idx_chunk = 0 ; idx_chunk < MAX_CHUNKS_PER_ZONE ; idx_chunk++){ + if(page >= zone->zone_chunk_page_start[idx_chunk] && + page < zone->zone_chunk_page_start[idx_chunk + 1]) + break; + } + /* No chunk matched the page: never index past the chunk tables. */ + if(idx_chunk == MAX_CHUNKS_PER_ZONE) 
+ BUG(); + chunk_phys_addr = zone->zone_chunk_phys_start[idx_chunk]; + chunk_page_start = zone->zone_chunk_page_start[idx_chunk]; + return (unsigned long)__va(chunk_phys_addr + + ((page - chunk_page_start) << PAGE_SHIFT )); + + + +} +#endif + static spinlock_t node_lock = SPIN_LOCK_UNLOCKED; void show_free_areas_node(pg_data_t *pgdat) diff -uNr linux-2.5.28/mm/oom_kill.c linux-2.5.28-rmap/mm/oom_kill.c --- linux-2.5.28/mm/oom_kill.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/oom_kill.c Tue Jul 23 19:11:10 2002 @@ -168,7 +168,8 @@ static void oom_kill(void) { struct task_struct *p, *q; - + extern wait_queue_head_t kswapd_done; + read_lock(&tasklist_lock); p = select_bad_process(); @@ -182,6 +183,9 @@ } read_unlock(&tasklist_lock); + /* Chances are by this time our victim is sleeping on kswapd. */ + wake_up(&kswapd_done); + /* * Make kswapd go out of the way, so "p" has a good chance of * killing itself before someone else gets the chance to ask diff -uNr linux-2.5.28/mm/page_alloc.c linux-2.5.28-rmap/mm/page_alloc.c --- linux-2.5.28/mm/page_alloc.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/page_alloc.c Tue Jul 23 19:11:14 2002 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -26,8 +27,6 @@ unsigned long totalram_pages; unsigned long totalhigh_pages; int nr_swap_pages; -LIST_HEAD(active_list); -LIST_HEAD(inactive_list); pg_data_t *pgdat_list; /* @@ -41,6 +40,8 @@ static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, }; +static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, }; /* * Temporary debugging check for pages not lying within a given zone. 
@@ -89,33 +90,21 @@ KERNEL_STAT_ADD(pgfree, 1<mapping != NULL); BUG_ON(PageLocked(page)); - BUG_ON(PageLRU(page)); - BUG_ON(PageActive(page)); BUG_ON(PageWriteback(page)); BUG_ON(page->pte.chain != NULL); if (PageDirty(page)) ClearPageDirty(page); - BUG_ON(page_count(page) != 0); - - if (unlikely(current->flags & PF_FREE_PAGES)) { - if (!current->nr_local_pages && !in_interrupt()) { - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; - goto out; - } - } zone = page_zone(page); mask = (~0UL) << order; base = zone->zone_mem_map; page_idx = page - base; - if (page_idx & ~mask) - BUG(); + BUG_ON(page_idx & ~mask); index = page_idx >> (1 + order); area = zone->free_area + order; @@ -147,8 +136,6 @@ } list_add(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); -out: - return; } #define MARK_USED(index, order, area) \ @@ -173,24 +160,6 @@ return page; } -/* - * This page is about to be returned from the page allocator - */ -static inline void prep_new_page(struct page *page) -{ - BUG_ON(page->mapping); - BUG_ON(PagePrivate(page)); - BUG_ON(PageLocked(page)); - BUG_ON(PageLRU(page)); - BUG_ON(PageActive(page)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked); - set_page_count(page, 1); -} - static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); static struct page * rmqueue(zone_t *zone, unsigned int order) { @@ -215,20 +184,23 @@ if (curr_order != MAX_ORDER-1) MARK_USED(index, curr_order, area); zone->free_pages -= 1UL << order; - page = expand(zone, page, index, order, curr_order, area); spin_unlock_irqrestore(&zone->lock, flags); + DEBUG_LRU_PAGE(page); + BUG_ON(bad_range(zone, page)); - if (bad_range(zone, page)) - BUG(); - prep_new_page(page); - return page; + /* prepare new page for use */ + set_page_count(page, 1); + page->age = PAGE_AGE_START; + 
page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked); + return page; } curr_order++; area++; } while (curr_order < MAX_ORDER); spin_unlock_irqrestore(&zone->lock, flags); - return NULL; } @@ -264,57 +236,82 @@ } #endif -static /* inline */ struct page * -balance_classzone(zone_t * classzone, unsigned int gfp_mask, - unsigned int order, int * freed) -{ - struct page * page = NULL; - int __freed = 0; - - BUG_ON(in_interrupt()); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - page = tmp; - current->nr_local_pages--; - prep_new_page(page); - break; - } - } while ((entry = entry->next) != local_pages); +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do this work ourselves, call kswapd. 
+ */ +void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +void fixup_freespace(zone_t * zone, int direct_reclaim) +{ + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(GFP_ATOMIC); +} + +#define PAGES_KERNEL 0 +#define PAGES_MIN 1 +#define PAGES_LOW 2 +#define PAGES_HIGH 3 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. + * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) +{ + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; + + for (;;) { + zone_t *z = *(zone++); + + if (!z) + break; + BUG_ON(!z->size); + + /* + * We allocate if the number of (free + inactive_clean) + * pages is above the watermark. + */ + switch (limit) { + case PAGES_KERNEL: + water_mark = z->pages_min / 2; + break; + case PAGES_MIN: + water_mark = z->pages_min; + break; + case PAGES_LOW: + water_mark = z->pages_low; + break; + default: + case PAGES_HIGH: + water_mark = z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; } - current->nr_local_pages = 0; } - *freed = __freed; - return page; + + /* Found nothing. 
*/ + return NULL; } /* @@ -322,105 +319,249 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zones, *classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed, i; KERNEL_STAT_ADD(pgalloc, 1<zones; /* the list of zones suitable for gfp_mask */ - classzone = zones[0]; - if (classzone == NULL) /* no zones in the zonelist */ - return NULL; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We fall back to lower-level zones if allocation + * in a higher zone fails. + */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; - /* Go through the zonelist once, looking for a zone with enough free */ +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data we would want to cache. + */ + zone = zonelist->zones; + if (!*zone) + return NULL; min = 1UL << order; - for (i = 0; zones[i] != NULL; i++) { - zone_t *z = zones[i]; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + BUG_ON(!z->size); - /* the incremental min is allegedly to discourage fallback */ - min += z->pages_low; + min += z->pages_min; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - /* we're somewhat low on memory, failed to find what we needed */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Next, try to allocate a page from a zone with a HIGH + * amount of (free + inactive_clean) pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. 
+ */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; - /* Go through the zonelist again, taking __GFP_HIGH into account */ - min = 1UL << order; - for (i = 0; zones[i] != NULL; i++) { - unsigned long local_min; - zone_t *z = zones[i]; - - local_min = z->pages_min; - if (gfp_mask & __GFP_HIGH) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low of (free + inactive_clean) pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + wakeup_kswapd(gfp_mask); + + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Kernel allocations can eat a few emergency pages. + * We should be able to run without this, find out why + * the SCSI layer isn't happy ... + */ + if (gfp_mask & __GFP_HIGH) { + page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim); + if (page) + return page; + } + + /* + * Oh well, we didn't succeed. 
+ */ + KERNEL_STAT_INC(allocstall); + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * If so, try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * If we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we need to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... + * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. + */ + if (gfp_mask & __GFP_WAIT) { + yield(); + if (!order || free_high(ALL_ZONES) >= 0) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail if no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } } } - /* here we're in the low on memory slow path */ + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. + * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... 
+ */ + zone = zonelist->zones; + min = 1UL << order; + for (;;) { + zone_t *z = *(zone++); + struct page * page = NULL; + if (!z) + break; -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { - /* go through the zonelist yet again, ignoring mins */ - for (i = 0; zones[i] != NULL; i++) { - zone_t *z = zones[i]; + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * death. + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } -nopage: - if (!(current->flags & PF_NOWARN)) { - printk("%s: page allocation failure." - " order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); - } - return NULL; } + goto out_failed; - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - goto nopage; - KERNEL_STAT_INC(allocstall); - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - /* go through the zonelist yet one more time */ - min = 1UL << order; - for (i = 0; zones[i] != NULL; i++) { - zone_t *z = zones[i]; + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages, and last we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... 
+ */ +defragment: + { + int freed = 0; +defragment_again: + zone = zonelist->zones; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } + } - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* XXX: do real defragmentation instead of calling launder ? */ + if (!freed & !(current->flags & PF_MEMALLOC)) { + freed = 1; + current->flags |= PF_MEMALLOC; + try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - goto nopage; - - /* Yield for kswapd, and try again */ - yield(); - goto rebalance; +out_failed: + /* No luck.. */ + printk(KERN_ERR "__alloc_pages: %u-order allocation failed.\n", order); + return NULL; } /* @@ -477,36 +618,32 @@ */ unsigned int nr_free_pages(void) { - unsigned int i, sum = 0; - pg_data_t *pgdat; - - for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) - for (i = 0; i < MAX_NR_ZONES; ++i) - sum += pgdat->node_zones[i].free_pages; + unsigned int sum; + zone_t *zone; + sum = 0; + for_each_zone(zone) + sum += zone->free_pages; + return sum; } -static unsigned int nr_free_zone_pages(int offset) +static unsigned int nr_free_zone_pages (int offset) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; unsigned int sum = 0; - do { + for_each_pgdat(pgdat) { zonelist_t *zonelist = pgdat->node_zonelists + offset; zone_t **zonep = zonelist->zones; zone_t *zone; for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + sum += zone->free_pages; + sum += zone->inactive_clean_pages; + sum += 
zone->inactive_dirty_pages; } - - pgdat = pgdat->node_next; - } while (pgdat); - + } return sum; } @@ -529,13 +666,12 @@ #if CONFIG_HIGHMEM unsigned int nr_free_highpages (void) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; unsigned int pages = 0; - while (pgdat) { + for_each_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - pgdat = pgdat->node_next; - } + return pages; } #endif @@ -563,11 +699,11 @@ ret->nr_dirty += ps->nr_dirty; ret->nr_writeback += ps->nr_writeback; ret->nr_pagecache += ps->nr_pagecache; - ret->nr_active += ps->nr_active; - ret->nr_inactive += ps->nr_inactive; + ret->nr_active_pages += ps->nr_active_pages; + ret->nr_inactive_clean_pages += ps->nr_inactive_clean_pages; + ret->nr_inactive_dirty_pages += ps->nr_inactive_dirty_pages; ret->nr_page_table_pages += ps->nr_page_table_pages; - ret->nr_pte_chain_pages += ps->nr_pte_chain_pages; - ret->used_pte_chains_bytes += ps->used_pte_chains_bytes; + ret->nr_reverse_maps += ps->nr_reverse_maps; } } @@ -630,12 +766,13 @@ tmpdat = tmpdat->node_next; } - printk("( Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u )\n", - ps.nr_active, - ps.nr_inactive, - ps.nr_dirty, - ps.nr_writeback, - nr_free_pages()); + printk("( Active:%lu inactive_dirty:%lu inactive_clean:%lu dirty:%lu writeback:%lu free:%u )\n", + ps.nr_active_pages, + ps.nr_inactive_dirty_pages, + ps.nr_inactive_clean_pages, + ps.nr_dirty, + ps.nr_writeback, + nr_free_pages()); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -771,6 +908,7 @@ * - mark all memory queues empty * - clear the memory bitmaps */ +extern unsigned int kswapd_minfree; void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, unsigned long *zones_size, unsigned long zone_start_paddr, unsigned long *zholes_size, struct page *lmem_map) @@ -816,7 +954,7 @@ offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; - unsigned long mask; + unsigned 
long mask, extrafree = 0; unsigned long size, realsize; zone_table[nid * MAX_NR_ZONES + j] = zone; @@ -830,7 +968,12 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->need_balance = 0; + zone->inactive_clean_pages = 0; + zone->inactive_dirty_pages = 0; + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_dirty_list); + INIT_LIST_HEAD(&zone->inactive_clean_list); + if (!size) continue; @@ -850,15 +993,22 @@ pgdat->nr_zones = j+1; + /* + * On large memory machines we keep extra memory + * free for kernel allocations. + */ + if (zone_extrafree_ratio[j]) + extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]); + if (extrafree < zone_balance_max[j]) + extrafree = 0; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; - + zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]); + zone->pages_low = extrafree + mask*2; + zone->pages_high = extrafree + mask*3; + zone->pages_plenty = extrafree + mask*6; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -866,6 +1016,8 @@ if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); + kswapd_minfree += zone->pages_min; + /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is diff -uNr linux-2.5.28/mm/readahead.c linux-2.5.28-rmap/mm/readahead.c --- linux-2.5.28/mm/readahead.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/readahead.c Tue Jul 23 19:11:14 2002 @@ -204,6 +204,42 @@ } /* + * We combine this with readahead to deactivate pages when we + * think there's sequential IO going on. 
Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + while (--index >= file->f_ra.start) { + read_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, index); + read_unlock(&mapping->page_lock); + if (!page || !PageActive(page)) + break; + drop_page(page); + } + spin_unlock(&pagemap_lru_lock); +} + +/* * page_cache_readahead is the main function. If performs the adaptive * readahead window size management and submits the readahead I/O. */ @@ -325,6 +361,11 @@ } } out: + /* + * Move the pages that have already been passed + * to the inactive list. 
+ */ + drop_behind(file, offset); return; } diff -uNr linux-2.5.28/mm/rmap.c linux-2.5.28-rmap/mm/rmap.c --- linux-2.5.28/mm/rmap.c Wed Jul 24 20:43:54 2002 +++ linux-2.5.28-rmap/mm/rmap.c Wed Jul 24 21:02:05 2002 @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include @@ -50,10 +52,10 @@ pte_t * ptep; }; +static kmem_cache_t *pte_chain_cache; static inline struct pte_chain * pte_chain_alloc(void); static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *); -static void alloc_new_pte_chains(void); /** * page_referenced - test if the page was referenced @@ -98,12 +100,9 @@ unsigned long pfn = pte_pfn(*ptep); #ifdef DEBUG_RMAP - if (!page || !ptep) - BUG(); - if (!pte_present(*ptep)) - BUG(); - if (!ptep_to_mm(ptep)) - BUG(); + BUG_ON(!page || !ptep); + BUG_ON(!pte_present(*ptep)); + BUG_ON(!ptep_to_mm(ptep)); #endif if (!pfn_valid(pfn) || PageReserved(page)) @@ -114,12 +113,10 @@ { struct pte_chain * pc; if (PageDirect(page)) { - if (page->pte.direct == ptep) - BUG(); + BUG_ON(page->pte.direct == ptep); } else { for (pc = page->pte.chain; pc; pc = pc->next) { - if (pc->ptep == ptep) - BUG(); + BUG_ON(pc->ptep == ptep); } } } @@ -148,6 +145,7 @@ } pte_chain_unlock(page); + inc_page_state(nr_reverse_maps); } /** @@ -165,8 +163,7 @@ struct pte_chain * pc, * prev_pc = NULL; unsigned long pfn = page_to_pfn(page); - if (!page || !ptep) - BUG(); + BUG_ON(!page || !ptep); if (!pfn_valid(pfn) || PageReserved(page)) return; @@ -208,9 +205,9 @@ #endif out: + dec_page_state(nr_reverse_maps); pte_chain_unlock(page); - return; - + return; } /** @@ -236,8 +233,7 @@ pte_t pte; int ret; - if (!mm) - BUG(); + BUG_ON(!mm); /* * We need the page_table_lock to protect us from page faults, @@ -304,13 +300,10 @@ int ret = SWAP_SUCCESS; /* This page should not be on the pageout lists. 
*/ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); /* We need backing store to swap out a page. */ - if (!page->mapping) - BUG(); + BUG_ON(!page->mapping); if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); @@ -318,7 +311,7 @@ page->pte.direct = NULL; ClearPageDirect(page); } - } else { + } else { for (pc = page->pte.chain; pc; pc = next_pc) { next_pc = pc->next; switch (try_to_unmap_one(page, pc->ptep)) { @@ -351,33 +344,53 @@ } /** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -struct pte_chain * pte_chain_freelist; -spinlock_t pte_chain_freelist_lock = SPIN_LOCK_UNLOCKED; - -/* Maybe we should have standard ops for singly linked lists ... - Rik */ -static inline void pte_chain_push(struct pte_chain * pte_chain) + * page_over_rsslimit - test if the page is over its RSS limit + * @page - page to test + * + * This function returns true if the process owning this page + * is over its RSS (resident set size) limit. For shared pages + * we penalise it only if all processes using it are over their + * rss limits. + * The caller needs to hold the page's pte_chain_lock. + */ +int page_over_rsslimit(struct page * page) { - pte_chain->ptep = NULL; - pte_chain->next = pte_chain_freelist; - pte_chain_freelist = pte_chain; -} + struct mm_struct * mm; + pte_t * ptep; -static inline struct pte_chain * pte_chain_pop(void) -{ - struct pte_chain *pte_chain; + /* No process is using the page. */ + if (!page->pte.chain) + return 0; - pte_chain = pte_chain_freelist; - pte_chain_freelist = pte_chain->next; - pte_chain->next = NULL; + if (PageDirect(page)) { + ptep = page->pte.direct; + mm = ptep_to_mm(ptep); + if(!mm->rlimit_rss || mm->rss <= mm->rlimit_rss) + return 0; + } else { + do { + ptep = page->pte.chain->ptep; + mm = ptep_to_mm(ptep); + /* + * If the process is under its RSS limit, stop + * scanning and don't penalise the page. 
+ */ + if(!mm->rlimit_rss || mm->rss <= mm->rlimit_rss) + return 0; - return pte_chain; + page->pte.chain = page->pte.chain->next; + } while (page->pte.chain); + } + return 1; } /** + ** No more VM stuff below this comment, only pte_chain helper + ** functions. + **/ + + +/** * pte_chain_free - free pte_chain structure * @pte_chain: pte_chain struct to free * @prev_pte_chain: previous pte_chain on the list (may be NULL) @@ -391,15 +404,12 @@ static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page) { - mod_page_state(used_pte_chains_bytes, -sizeof(struct pte_chain)); if (prev_pte_chain) prev_pte_chain->next = pte_chain->next; else if (page) page->pte.chain = pte_chain->next; - spin_lock(&pte_chain_freelist_lock); - pte_chain_push(pte_chain); - spin_unlock(&pte_chain_freelist_lock); + kmem_cache_free(pte_chain_cache, pte_chain); } /** @@ -409,47 +419,20 @@ * pte_chain structures as required. * Caller needs to hold the page's pte_chain_lock. */ -static inline struct pte_chain * pte_chain_alloc() +static inline struct pte_chain *pte_chain_alloc(void) { - struct pte_chain * pte_chain; - - spin_lock(&pte_chain_freelist_lock); - - /* Allocate new pte_chain structs as needed. */ - if (!pte_chain_freelist) - alloc_new_pte_chains(); - - /* Grab the first pte_chain from the freelist. */ - pte_chain = pte_chain_pop(); - - spin_unlock(&pte_chain_freelist_lock); - - mod_page_state(used_pte_chains_bytes, sizeof(struct pte_chain)); - return pte_chain; + return kmem_cache_alloc(pte_chain_cache, GFP_ATOMIC); } -/** - * alloc_new_pte_chains - convert a free page to pte_chain structures - * - * Grabs a free page and converts it to pte_chain structures. We really - * should pre-allocate these earlier in the pagefault path or come up - * with some other trick. - * - * Note that we cannot use the slab cache because the pte_chain structure - * is way smaller than the minimum size of a slab cache allocation. 
- * Caller needs to hold the pte_chain_freelist_lock - */ -static void alloc_new_pte_chains() +void __init pte_chain_init(void) { - struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC); - int i = PAGE_SIZE / sizeof(struct pte_chain); + pte_chain_cache = kmem_cache_create( "pte_chain", + sizeof(struct pte_chain), + 0, + 0, + NULL, + NULL); - if (pte_chain) { - inc_page_state(nr_pte_chain_pages); - for (; i-- > 0; pte_chain++) - pte_chain_push(pte_chain); - } else { - /* Yeah yeah, I'll fix the pte_chain allocation ... */ - panic("Fix pte_chain allocation, you lazy bastard!\n"); - } + if (!pte_chain_cache) + panic("failed to create pte_chain cache!\n"); } diff -uNr linux-2.5.28/mm/swap.c linux-2.5.28-rmap/mm/swap.c --- linux-2.5.28/mm/swap.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/swap.c Tue Jul 23 19:11:14 2002 @@ -14,11 +14,11 @@ */ #include -#include -#include #include #include #include +#include /* for try_to_release_page() */ +#include #include #include /* for copy_to/from_user */ @@ -33,16 +33,99 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * @nolock - are we already holding the pagemap_lru_lock? + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. 
+ * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + page->age = 0; + if (PageActive(page)) { + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void deactivate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + +/** + * drop_page - like deactivate_page, but try inactive_clean list + * @page: the page to drop + * + * Try to move a page to the inactive_clean list, this succeeds if the + * page is clean and not in use by anybody. If the page cannot be placed + * on the inactive_clean list it is placed on the inactive_dirty list + * instead. + * + * Note: this function gets called with the pagemap_lru_lock held. + */ +void drop_page(struct page * page) +{ + if (!TestSetPageLocked(page)) { + if (page->mapping && PagePrivate(page)) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + try_to_release_page(page, GFP_NOIO); + spin_lock(&pagemap_lru_lock); + page_cache_release(page); + } + unlock_page(page); + } + + /* Make sure the page really is reclaimable. */ + pte_chain_lock(page); + if (!page->mapping || PageDirty(page) || page->pte.chain || + PagePrivate(page) || page_count(page) > 1) + deactivate_page_nolock(page); + + else if (page_count(page) == 1) { + ClearPageReferenced(page); + page->age = 0; + if (PageActive(page)) { + del_page_from_active_list(page); + add_page_to_inactive_clean_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + } + } + pte_chain_unlock(page); +} + /* * Move an inactive page to the active list. 
*/ -static inline void activate_page_nolock(struct page * page) +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + KERNEL_STAT_INC(pgactivate); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); add_page_to_active_list(page); KERNEL_STAT_INC(pgactivate); } + + /* Make sure the page gets a fair chance at staying active. */ + page->age = max((int)page->age, PAGE_AGE_START); } void activate_page(struct page * page) @@ -58,29 +141,31 @@ */ void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + SetPageLRU(page); + add_page_to_active_list(page); spin_unlock(&pagemap_lru_lock); } } /** * __lru_cache_del: remove a page from the page lists - * @page: the page to add + * @page: the page to remove * * This function is for when the caller already holds * the pagemap_lru_lock. 
*/ void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } + ClearPageLRU(page); } /** diff -uNr linux-2.5.28/mm/vmscan.c linux-2.5.28-rmap/mm/vmscan.c --- linux-2.5.28/mm/vmscan.c Wed Jul 24 20:39:57 2002 +++ linux-2.5.28-rmap/mm/vmscan.c Tue Jul 23 19:12:42 2002 @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include /* for try_to_release_page() */ @@ -29,6 +29,8 @@ #include #include +static void refill_freelist(void); +static void wakeup_memwaiters(void); /* * The "priority" of VM scanning is how much of the queues we * will scan in one go. A value of 6 for DEF_PRIORITY implies @@ -37,9 +39,14 @@ */ #define DEF_PRIORITY (6) -static inline int is_page_cache_freeable(struct page * page) +static inline void age_page_up(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 1; + page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); +} + +static inline void age_page_down(struct page *page) +{ + page->age -= min(PAGE_AGE_DECL, (int)page->age); } /* Must be called with page's pte_chain_lock held. */ @@ -62,70 +69,175 @@ return 0; } -static int -shrink_cache(int nr_pages, zone_t *classzone, - unsigned int gfp_mask, int priority, int max_scan) + +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. 
+ */ +struct page * reclaim_page(zone_t * zone) { + struct address_space * mapping; + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; + + /* + * We need to hold the page_lock around all tests to make sure + * reclaim_page() cannot race with find_get_page() and friends. + */ + spin_lock(&pagemap_lru_lock); + maxscan = zone->inactive_clean_pages; + while (maxscan-- && !list_empty(&zone->inactive_clean_list)) { + page_lru = zone->inactive_clean_list.prev; + page = list_entry(page_lru, struct page, lru); + + mapping = page->mapping; + write_lock(&mapping->page_lock); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageInactiveClean(page))) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + page_zone(page)->inactive_clean_pages--; + goto unlock; + } + + /* Page is being freed */ + if (unlikely(!page_count(page))) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + goto unlock; + } + + /* Page cannot be reclaimed ? Move to inactive_dirty list. */ + pte_chain_lock(page); + if (unlikely(page->pte.chain || PagePrivate(page) || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TestSetPageLocked(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + pte_chain_unlock(page); + goto unlock; + } + + /* + * From here until reaching either the bottom of the loop + * or found_page: the pte_chain_lock is held. + */ + + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; + } + + if (page->mapping) { + __remove_inode_page(page); + goto found_page; + } + + /* We should never ever get here. 
*/ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + pte_chain_unlock(page); + unlock_page(page); +unlock: + write_unlock(&mapping->page_lock); + } + spin_unlock(&pagemap_lru_lock); + return NULL; + +found_page: + __lru_cache_del(page); + pte_chain_unlock(page); + write_unlock(&mapping->page_lock); + spin_unlock(&pagemap_lru_lock); + if (entry.val) + swap_free(entry); + unlock_page(page); + /* initialize page flags */ + page->age = PAGE_AGE_START; + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked); + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; +} + + +/** + * page_launder_zone - clean dirty inactive pages, move to inactive_clean list + * @zone: zone to free pages in + * @gfp_mask: what operations we are allowed to do + * + * This function is called when we are low on free / inactive_clean + * pages, its purpose is to refill the free/clean list as efficiently + * as possible. + * + * This means we do writes asynchronously as long as possible and will + * only sleep on IO when we don't have another option. Since writeouts + * cause disk seeks and make read IO slower, we skip writes altogether + * when the amount of dirty pages is small. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon. + */ +int page_launder_zone(zone_t * zone, int gfp_mask, int priority) +{ + int maxscan, cleaned_pages = 0, target = free_plenty(zone); struct list_head * entry; - struct address_space *mapping; + struct address_space * mapping; + /* The main launder loop.
*/ spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && - (entry = inactive_list.prev) != &inactive_list) { + maxscan = zone->inactive_dirty_pages >> priority; + while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) { struct page *page; int may_enter_fs; + /* Low latency reschedule point */ if (need_resched()) { spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); schedule(); spin_lock(&pagemap_lru_lock); continue; } + entry = zone->inactive_dirty_list.prev; page = list_entry(entry, struct page, lru); - if (unlikely(!PageLRU(page))) - BUG(); - if (unlikely(PageActive(page))) - BUG(); + if (cleaned_pages > target) + break; list_del(entry); - list_add(entry, &inactive_list); + list_add(entry, &zone->inactive_dirty_list); KERNEL_STAT_INC(pgscan); - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; - - if (!memclass(page_zone(page), classzone)) + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveDirty(page)) { + printk("VM: page_launder, wrong page on list.\n"); + list_del(entry); + dec_page_state(nr_inactive_dirty_pages); + page_zone(page)->inactive_dirty_pages--; continue; + } /* - * swap activity never enters the filesystem and is safe - * for GFP_NOFS allocations. - */ - may_enter_fs = (gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (gfp_mask & __GFP_IO)); - - /* - * IO in progress? Leave it at the back of the list. + * Page is being freed, don't worry about it. 
*/ - if (unlikely(PageWriteback(page))) { - if (may_enter_fs) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - wait_on_page_writeback(page); - page_cache_release(page); - spin_lock(&pagemap_lru_lock); - } + if (unlikely(!page_count(page))) continue; - } - if (TestSetPageLocked(page)) + if (unlikely(TestSetPageLocked(page))) continue; if (PageWriteback(page)) { /* The non-racy check */ @@ -135,12 +247,14 @@ /* * The page is in active use or really unfreeable. Move to - * the active list. + * the active list and adjust the page age if needed. */ pte_chain_lock(page); - if (page_referenced(page) && page_mapping_inuse(page)) { - del_page_from_inactive_list(page); + if (page_referenced(page) && page_mapping_inuse(page) && + !page_over_rsslimit(page)) { + del_page_from_inactive_dirty_list(page); add_page_to_active_list(page); + page->age = max((int)page->age, PAGE_AGE_START); pte_chain_unlock(page); unlock_page(page); KERNEL_STAT_INC(pgactivate); @@ -189,19 +303,24 @@ pte_chain_unlock(page); mapping = page->mapping; - if (PageDirty(page) && is_page_cache_freeable(page) && - page->mapping && may_enter_fs) { + /* + * swap activity never enters the filesystem and is safe + * for GFP_NOFS allocations. + */ + may_enter_fs = (gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (gfp_mask & __GFP_IO)); + + if (PageDirty(page) && mapping && may_enter_fs) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer - * like O_DIRECT would set the page's dirty bitflag + * like O_DIRECT would set the PG_dirty bitflag * on the physical page after having successfully * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. 
*/ int (*writeback)(struct page *, int *); - const int nr_pages = SWAP_CLUSTER_MAX; - int nr_to_write = nr_pages; + int nr_to_write = SWAP_CLUSTER_MAX; writeback = mapping->a_ops->vm_writeback; if (writeback == NULL) @@ -209,7 +328,6 @@ page_cache_get(page); spin_unlock(&pagemap_lru_lock); (*writeback)(page, &nr_to_write); - max_scan -= (nr_pages - nr_to_write); page_cache_release(page); spin_lock(&pagemap_lru_lock); continue; @@ -232,7 +350,7 @@ if (PagePrivate(page)) { spin_unlock(&pagemap_lru_lock); - /* avoid to free a locked page */ + /* To avoid freeing our page before we're done. */ page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { @@ -240,268 +358,311 @@ /* effectively free the page here */ unlock_page(page); page_cache_release(page); - + KERNEL_STAT_INC(pgsteal); spin_lock(&pagemap_lru_lock); - if (--nr_pages) - continue; - break; + cleaned_pages++; + continue; } else { /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. + * We freed the buffers but may have + * slept; undo the stuff we did before + * try_to_release_page and fall through + * to the next step. */ page_cache_release(page); - spin_lock(&pagemap_lru_lock); } } else { /* failed to drop the buffers so stop here */ unlock_page(page); page_cache_release(page); - spin_lock(&pagemap_lru_lock); continue; } } /* - * This is the non-racy check for busy page. - */ - if (mapping) { - write_lock(&mapping->page_lock); - if (is_page_cache_freeable(page)) - goto page_freeable; - write_unlock(&mapping->page_lock); - } - unlock_page(page); - continue; -page_freeable: - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. + * If the page is really freeable now, move it to the + * inactive_clean list. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. 
+ * This test is not safe from races, but only the one + * in reclaim_page() needs to be. */ - if (PageDirty(page)) { - write_unlock(&mapping->page_lock); + pte_chain_lock(page); + if (mapping && !PageDirty(page) && !page->pte.chain && + page_count(page) == 1) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); + pte_chain_unlock(page); unlock_page(page); - continue; - } - - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - write_unlock(&mapping->page_lock); + cleaned_pages++; } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - write_unlock(&mapping->page_lock); - swap_free(swap); + /* + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it to + * the active list. + */ +page_active: + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + pte_chain_unlock(page); + unlock_page(page); + KERNEL_STAT_INC(pgactivate); } + } + spin_unlock(&pagemap_lru_lock); - __lru_cache_del(page); - unlock_page(page); + /* Return the number of pages moved to the inactive_clean list. */ + return cleaned_pages; +} - /* effectively free the page here */ - page_cache_release(page); - KERNEL_STAT_INC(pgsteal); - if (--nr_pages) - continue; - goto out; -page_active: - /* - * OK, we don't know what to do with the page. - * It's no use keeping it here, so we move it to - * the active list. - */ - del_page_from_inactive_list(page); - add_page_to_active_list(page); - pte_chain_unlock(page); - unlock_page(page); - KERNEL_STAT_INC(pgactivate); +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * + * This function iterates over all zones and calls page_launder_zone(), + * balancing still needs to be added... 
+ */ +int page_launder(int gfp_mask) +{ + int maxtry = 1 << DEF_PRIORITY; + struct zone_struct * zone; + int freed = 0; + + /* Global balancing while we have a global shortage. */ + while (maxtry-- && free_high(ALL_ZONES) >= 0) { + for_each_zone(zone) + if (free_plenty(zone) >= 0) + freed += page_launder_zone(zone, gfp_mask, 6); } -out: spin_unlock(&pagemap_lru_lock); - return nr_pages; + + /* Clean up the remaining zones with a serious shortage, if any. */ + for_each_zone(zone) + if (free_min(zone) >= 0) + freed += page_launder_zone(zone, gfp_mask, 0); + + return freed; } -/* - * This moves pages from the active list to - * the inactive list. +/** + * refill_inactive_zone - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan * - * We move them the other way if the page is - * referenced by one or more processes, from rmap + * This function will scan a portion of the active list of a zone to find + * unused pages, those pages will then be moved to the inactive list. */ -static void refill_inactive(int nr_pages) +int refill_inactive_zone(struct zone_struct * zone, int priority) { - struct list_head * entry; + int maxscan = zone->active_pages >> priority; + int target = inactive_high(zone); + struct list_head * page_lru; + int nr_deactivated = 0; + struct page * page; + /* Take the lock while messing with the list... */ spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { - struct page * page; - - page = list_entry(entry, struct page, lru); - entry = entry->prev; + while (maxscan-- && !list_empty(&zone->active_list)) { + page_lru = zone->active_list.prev; + page = list_entry(page_lru, struct page, lru); - KERNEL_STAT_INC(pgscan); + /* Wrong page on list?! 
(list corruption, should not happen) */ + if (unlikely(!PageActive(page))) { + printk("VM: refill_inactive, wrong page on list.\n"); + list_del(page_lru); + dec_page_state(nr_active_pages); + continue; + } + + /* Needed to follow page->mapping */ + if (TestSetPageLocked(page)) { + list_del(page_lru); + list_add(page_lru, &zone->active_list); + KERNEL_STAT_INC(pgscan); + continue; + } + /* + * If the object the page is in is not in use we don't + * bother with page aging. If the page is touched again + * while on the inactive_clean list it'll be reactivated. + * From here until the end of the current iteration + * both PG_locked and the pte_chain_lock are held. + */ pte_chain_lock(page); - if (page->pte.chain && page_referenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); + if (!page_mapping_inuse(page)) { pte_chain_unlock(page); + unlock_page(page); + drop_page(page); continue; } - del_page_from_active_list(page); - add_page_to_inactive_list(page); + + /* + * Do aging on the pages. + */ + if (page_referenced(page)) { + age_page_up(page); + } else { + age_page_down(page); + } + + /* + * If the page age is 'hot' and the process using the + * page doesn't exceed its RSS limit we keep the page. + * Otherwise we move it to the inactive_dirty list. 
+ */ + if (page->age && !page_over_rsslimit(page)) { + list_del(page_lru); + list_add(page_lru, &zone->active_list); + } else { + deactivate_page_nolock(page); + if (++nr_deactivated > target) { + pte_chain_unlock(page); + unlock_page(page); + goto done; + } + } pte_chain_unlock(page); - KERNEL_STAT_INC(pgdeactivate); + unlock_page(page); + + /* Low latency reschedule point */ + if (need_resched()) { + spin_unlock(&pagemap_lru_lock); + schedule(); + spin_lock(&pagemap_lru_lock); + } } + +done: spin_unlock(&pagemap_lru_lock); + return nr_deactivated; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/** + * refill_inactive - checks all zones and refills the inactive list as needed + * + * This function tries to balance page eviction from all zones by aging + * the pages from each zone in the same ratio until the global inactive + * shortage is resolved. After that it does one last "clean-up" scan to + * fix up local inactive shortages. + */ +int refill_inactive(void) { - int chunk_size = nr_pages; - unsigned long ratio; - struct page_state ps; - int max_scan; + int maxtry = 1 << DEF_PRIORITY; + zone_t * zone; + int ret = 0; - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + /* Global balancing while we have a global shortage. */ + while (maxtry-- && inactive_low(ALL_ZONES) >= 0) { + for_each_zone(zone) { + if (inactive_high(zone) >= 0) + ret += refill_inactive_zone(zone, DEF_PRIORITY); + } + } - nr_pages = chunk_size; + /* Local balancing for zones which really need it. 
*/ + for_each_zone(zone) { + if (inactive_min(zone) >= 0) + ret += refill_inactive_zone(zone, 0); + } - /* - * Try to keep the active list 2/3 of the size of the cache - */ - get_page_state(&ps); - ratio = (unsigned long)nr_pages * ps.nr_active / - ((ps.nr_inactive | 1) * 2); - refill_inactive(ratio); - max_scan = ps.nr_inactive / priority; - nr_pages = shrink_cache(nr_pages, classzone, - gfp_mask, priority, max_scan); - if (nr_pages <= 0) - return 0; + return ret; +} - wakeup_bdflush(); +/** + * background_aging - slow background aging of zones + * @priority: priority at which to scan + * + * When the VM load is low or nonexistent, this function is + * called once a second to "sort" the pages in the VM. This + * way we know which pages to evict once a load spike happens. + * The effects of this function are very slow, the CPU usage + * should be minimal to nonexistent under most loads. + */ +static inline void background_aging(int priority) +{ + struct zone_struct * zone; - shrink_dcache_memory(priority, gfp_mask); + for_each_zone(zone) + if (inactive_high(zone) > 0) + refill_inactive_zone(zone, priority); +} - /* After shrinking the dcache, get rid of unused inodes too .. */ - shrink_icache_memory(1, gfp_mask); +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 0; + + /* + * Eat memory from filesystem page cache, + * dentry, inode and filesystem quota caches. + */ + ret += page_launder(gfp_mask); + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(1, gfp_mask); #ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); #endif + /* + * Move pages from the active list to the inactive list.
+ */ + refill_inactive(); - return nr_pages; -} - -int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) -{ - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + /* + * Reclaim unused slab cache memory. + */ + ret += kmem_cache_reap(gfp_mask); - KERNEL_STAT_INC(pageoutrun); + refill_freelist(); - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + /* Start IO when needed. */ + if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0) + blk_run_queues(); /* * Hmm.. Cache shrink failed - time to kill something? * Mhwahahhaha! This is the part I really like. Giggle. */ - out_of_memory(); - return 0; -} - -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) -{ - zone_t * first_classzone; - - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; + if (!ret && free_min(ANY_ZONE) > 0) + out_of_memory(); + return ret; } -static int kswapd_balance_pgdat(pg_data_t * pgdat) +/** + * refill_freelist - move inactive_clean pages to free list if needed + * + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. This + * function really only does something when we don't have a + * userspace load on __alloc_pages(). + * + * We refill the freelist in a bump from pages_min to pages_min * 2 + * in order to give the buddy allocator something to play with. 
+ */ +static void refill_freelist(void) { - int need_more_balance = 0, i; + struct page * page; zone_t * zone; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - cond_resched(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + for_each_zone(zone) { + if (!zone->size || zone->free_pages >= zone->pages_min) continue; - } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } - return need_more_balance; -} - -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); - } while (need_more_balance); -} - -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - zone_t * zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; + while (zone->free_pages < zone->pages_min * 2) { + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } } - - return 1; -} - -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; } /* @@ -520,7 +681,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -544,26 +704,152 @@ * Kswapd main loop. */ for (;;) { + static long recalc = 0; if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + /* + * We try to rebalance the VM either when we have a + * global shortage of free pages or when one particular + * zone is very short on free pages. 
+ */ + if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0) + do_try_to_free_pages(GFP_KSWAPD); - mb(); - if (kswapd_can_sleep()) - schedule(); + refill_freelist(); + + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + background_aging(DEF_PRIORITY); + } + + wakeup_memwaiters(); + } +} - __set_current_state(TASK_RUNNING); +static int kswapd_overloaded; +unsigned int kswapd_minfree; /* initialized in mm/page_alloc.c */ +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +DECLARE_WAIT_QUEUE_HEAD(kswapd_done); + +/** + * wakeup_kswapd - wake up the pageout daemon + * @gfp_mask: page freeing flags + * + * This function wakes up kswapd and can, under heavy VM pressure, + * put the calling task to sleep temporarily. + */ +void wakeup_kswapd(unsigned int gfp_mask) +{ + DECLARE_WAITQUEUE(wait, current); + + /* If we're in the memory freeing business ourselves, don't sleep + * but just wake kswapd and go back to business. + */ + if (current->flags & PF_MEMALLOC) { + wake_up_interruptible(&kswapd_wait); + return; + } + + /* We need all of kswapd's GFP flags, otherwise we can't sleep on it. + * We still wake kswapd of course. + */ + if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) { + wake_up_interruptible(&kswapd_wait); + return; + } + + add_wait_queue(&kswapd_done, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Wake kswapd .... */ + wake_up_interruptible(&kswapd_wait); + + /* ... and check if we need to wait on it */ + if ((free_low(ALL_ZONES) > (kswapd_minfree / 2)) && !kswapd_overloaded) + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); +} + +static void wakeup_memwaiters(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&kswapd_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + /* Don't let the processes waiting on memory get stuck, ever. */ + wake_up(&kswapd_done); + + /* Enough free RAM, we can easily keep up with memory demand.
*/ + if (free_high(ALL_ZONES) <= 0) { + schedule_timeout(HZ); remove_wait_queue(&kswapd_wait, &wait); + return; + } + remove_wait_queue(&kswapd_wait, &wait); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. - */ - kswapd_balance(); - blk_run_queues(); + /* OK, the VM is very loaded. Sleep instead of using all CPU. */ + kswapd_overloaded = 1; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ / 4); + kswapd_overloaded = 0; + return; +} + +/** + * try_to_free_pages - run the pageout code ourselves + * @gfp_mask: mask of things the pageout code is allowed to do + * + * When the load on the system gets higher, it can happen + * that kswapd no longer manages to keep enough memory + * free. In those cases user programs allocating memory + * will call try_to_free_pages() and help the pageout code. + * This has the effect of freeing memory and slowing down + * the largest memory hogs a bit. + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + if (gfp_mask & __GFP_WAIT) { + KERNEL_STAT_INC(pageoutrun); + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + return ret; +} + +/** + * rss_free_pages - run part of the pageout code and slow down a bit + * @gfp_mask: mask of things the pageout code is allowed to do + * + * This function is called when a task is over its RSS limit and + * has a page fault. Its goal is to free some memory so non-hogs + * can run faster and slow down itself when needed so it won't eat + * the memory non-hogs can use.
+ */ +void rss_free_pages(unsigned int gfp_mask) +{ + long pause = 0; + if (current->flags & PF_MEMALLOC) + return; + current->flags |= PF_MEMALLOC; + + do { + page_launder(gfp_mask); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(pause); + set_current_state(TASK_RUNNING); + pause++; + } while (free_high(ALL_ZONES) >= 0); + + current->flags &= ~PF_MEMALLOC; + return; } static int __init kswapd_init(void)