/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 * 47 * Each cache has a short per-cpu head array, most allocs 48 * and frees go into that array, and if that array overflows, then 1/2 49 * of the entries in the array are given back into the global cache. 50 * The head array is strictly LIFO and should improve the cache hit rates. 51 * On SMP, it additionally reduces the spinlock operations. 52 * 53 * The c_cpuarray may not be read with enabled local interrupts - 54 * it's changed with a smp_call_function(). 55 * 56 * SMP synchronization: 57 * constructors and destructors are called without any locking. 58 * Several members in kmem_cache_t and struct slab never change, they 59 * are accessed without any locking. 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 61 * and local interrupts are disabled so slab code is preempt-safe. 62 * The non-constant members are protected with a per-cache irq spinlock. 63 * 64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch 65 * in 2000 - many ideas in the current implementation are derived from 66 * his patch. 67 * 68 * Further notes from the original documentation: 69 * 70 * 11 April '97. Started multi-threading - markhe 71 * The global cache-chain is protected by the semaphore 'cache_chain_sem'. 72 * The sem is only needed when accessing/extending the cache-chain, which 73 * can never happen inside an interrupt (kmem_cache_create(), 74 * kmem_cache_shrink() and kmem_cache_reap()). 75 * 76 * At present, each engine can be growing a cache. This should be blocked. 
77 * 78 */ 79 80 #include <linux/config.h> 81 #include <linux/slab.h> 82 #include <linux/mm.h> 83 #include <linux/swap.h> 84 #include <linux/cache.h> 85 #include <linux/interrupt.h> 86 #include <linux/init.h> 87 #include <linux/compiler.h> 88 #include <linux/seq_file.h> 89 #include <linux/notifier.h> 90 #include <linux/kallsyms.h> 91 #include <linux/cpu.h> 92 #include <linux/sysctl.h> 93 #include <linux/module.h> 94 95 #include <asm/uaccess.h> 96 #include <asm/cacheflush.h> 97 #include <asm/tlbflush.h> 98 99 /* 100 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, 101 * SLAB_RED_ZONE & SLAB_POISON. 102 * 0 for faster, smaller code (especially in the critical paths). 103 * 104 * STATS - 1 to collect stats for /proc/slabinfo. 105 * 0 for faster, smaller code (especially in the critical paths). 106 * 107 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 108 */ 109 110 #ifdef CONFIG_DEBUG_SLAB 111 #define DEBUG 1 112 #define STATS 1 113 #define FORCED_DEBUG 1 114 #else 115 #define DEBUG 0 116 #define STATS 0 117 #define FORCED_DEBUG 0 118 #endif 119 120 121 /* Shouldn't this be in a header file somewhere? */ 122 #define BYTES_PER_WORD sizeof(void *) 123 124 #ifndef cache_line_size 125 #define cache_line_size() L1_CACHE_BYTES 126 #endif 127 128 #ifndef ARCH_KMALLOC_MINALIGN 129 #define ARCH_KMALLOC_MINALIGN 0 130 #endif 131 132 #ifndef ARCH_KMALLOC_FLAGS 133 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 134 #endif 135 136 /* Legal flag mask for kmem_cache_create(). 
*/ 137 #if DEBUG 138 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 139 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 140 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 141 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 142 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC) 143 #else 144 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 145 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 146 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC) 147 #endif 148 149 /* 150 * kmem_bufctl_t: 151 * 152 * Bufctl's are used for linking objs within a slab 153 * linked offsets. 154 * 155 * This implementation relies on "struct page" for locating the cache & 156 * slab an object belongs to. 157 * This allows the bufctl structure to be small (one int), but limits 158 * the number of objects a slab (not a cache) can contain when off-slab 159 * bufctls are used. The limit is the size of the largest general cache 160 * that does not use off-slab slabs. 161 * For 32bit archs with 4 kB pages, is this 56. 162 * This is not serious, as it is only for large objects, when it is unwise 163 * to have too many per slab. 164 * Note: This limit can be raised by introducing a general cache whose size 165 * is less than 512 (PAGE_SIZE<<3), but greater than 256. 166 */ 167 168 #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 169 #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 170 #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) 171 172 /* Max number of objs-per-slab for caches which use off-slab slabs. 173 * Needed to avoid a possible looping condition in cache_grow(). 174 */ 175 static unsigned long offslab_limit; 176 177 /* 178 * struct slab 179 * 180 * Manages the objs in a slab. Placed either at the beginning of mem allocated 181 * for a slab, or allocated from an general cache. 182 * Slabs are chained into three list: fully used, partial, fully free slabs. 
183 */ 184 struct slab { 185 struct list_head list; 186 unsigned long colouroff; 187 void *s_mem; /* including colour offset */ 188 unsigned int inuse; /* num of objs active in slab */ 189 kmem_bufctl_t free; 190 }; 191 192 /* 193 * struct array_cache 194 * 195 * Per cpu structures 196 * Purpose: 197 * - LIFO ordering, to hand out cache-warm objects from _alloc 198 * - reduce the number of linked list operations 199 * - reduce spinlock operations 200 * 201 * The limit is stored in the per-cpu structure to reduce the data cache 202 * footprint. 203 * 204 */ 205 struct array_cache { 206 unsigned int avail; 207 unsigned int limit; 208 unsigned int batchcount; 209 unsigned int touched; 210 }; 211 212 /* bootstrap: The caches do not work without cpuarrays anymore, 213 * but the cpuarrays are allocated from the generic caches... 214 */ 215 #define BOOT_CPUCACHE_ENTRIES 1 216 struct arraycache_init { 217 struct array_cache cache; 218 void * entries[BOOT_CPUCACHE_ENTRIES]; 219 }; 220 221 /* 222 * The slab lists of all objects. 223 * Hopefully reduce the internal fragmentation 224 * NUMA: The spinlock could be moved from the kmem_cache_t 225 * into this structure, too. Figure out what causes 226 * fewer cross-node spinlock operations. 227 */ 228 struct kmem_list3 { 229 struct list_head slabs_partial; /* partial list first, better asm code */ 230 struct list_head slabs_full; 231 struct list_head slabs_free; 232 unsigned long free_objects; 233 int free_touched; 234 unsigned long next_reap; 235 struct array_cache *shared; 236 }; 237 238 #define LIST3_INIT(parent) \ 239 { \ 240 .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ 241 .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ 242 .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ 243 } 244 #define list3_data(cachep) \ 245 (&(cachep)->lists) 246 247 /* NUMA: per-node */ 248 #define list3_data_ptr(cachep, ptr) \ 249 list3_data(cachep) 250 251 /* 252 * kmem_cache_t 253 * 254 * manages a cache. 
255 */ 256 257 struct kmem_cache_s { 258 /* 1) per-cpu data, touched during every alloc/free */ 259 struct array_cache *array[NR_CPUS]; 260 unsigned int batchcount; 261 unsigned int limit; 262 /* 2) touched by every alloc & free from the backend */ 263 struct kmem_list3 lists; 264 /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */ 265 unsigned int objsize; 266 unsigned int flags; /* constant flags */ 267 unsigned int num; /* # of objs per slab */ 268 unsigned int free_limit; /* upper limit of objects in the lists */ 269 spinlock_t spinlock; 270 271 /* 3) cache_grow/shrink */ 272 /* order of pgs per slab (2^n) */ 273 unsigned int gfporder; 274 275 /* force GFP flags, e.g. GFP_DMA */ 276 unsigned int gfpflags; 277 278 size_t colour; /* cache colouring range */ 279 unsigned int colour_off; /* colour offset */ 280 unsigned int colour_next; /* cache colouring */ 281 kmem_cache_t *slabp_cache; 282 unsigned int slab_size; 283 unsigned int dflags; /* dynamic flags */ 284 285 /* constructor func */ 286 void (*ctor)(void *, kmem_cache_t *, unsigned long); 287 288 /* de-constructor func */ 289 void (*dtor)(void *, kmem_cache_t *, unsigned long); 290 291 /* 4) cache creation/removal */ 292 const char *name; 293 struct list_head next; 294 295 /* 5) statistics */ 296 #if STATS 297 unsigned long num_active; 298 unsigned long num_allocations; 299 unsigned long high_mark; 300 unsigned long grown; 301 unsigned long reaped; 302 unsigned long errors; 303 unsigned long max_freeable; 304 atomic_t allochit; 305 atomic_t allocmiss; 306 atomic_t freehit; 307 atomic_t freemiss; 308 #endif 309 #if DEBUG 310 int dbghead; 311 int reallen; 312 #endif 313 }; 314 315 #define CFLGS_OFF_SLAB (0x80000000UL) 316 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 317 318 #define BATCHREFILL_LIMIT 16 319 /* Optimization question: fewer reaps means less 320 * probability for unnessary cpucache drain/refill cycles. 
321 * 322 * OTHO the cpuarrays can contain lots of objects, 323 * which could lock up otherwise freeable slabs. 324 */ 325 #define REAPTIMEOUT_CPUC (2*HZ) 326 #define REAPTIMEOUT_LIST3 (4*HZ) 327 328 #if STATS 329 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 330 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 331 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 332 #define STATS_INC_GROWN(x) ((x)->grown++) 333 #define STATS_INC_REAPED(x) ((x)->reaped++) 334 #define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 335 (x)->high_mark = (x)->num_active; \ 336 } while (0) 337 #define STATS_INC_ERR(x) ((x)->errors++) 338 #define STATS_SET_FREEABLE(x, i) \ 339 do { if ((x)->max_freeable < i) \ 340 (x)->max_freeable = i; \ 341 } while (0) 342 343 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 344 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 345 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 346 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 347 #else 348 #define STATS_INC_ACTIVE(x) do { } while (0) 349 #define STATS_DEC_ACTIVE(x) do { } while (0) 350 #define STATS_INC_ALLOCED(x) do { } while (0) 351 #define STATS_INC_GROWN(x) do { } while (0) 352 #define STATS_INC_REAPED(x) do { } while (0) 353 #define STATS_SET_HIGH(x) do { } while (0) 354 #define STATS_INC_ERR(x) do { } while (0) 355 #define STATS_SET_FREEABLE(x, i) \ 356 do { } while (0) 357 358 #define STATS_INC_ALLOCHIT(x) do { } while (0) 359 #define STATS_INC_ALLOCMISS(x) do { } while (0) 360 #define STATS_INC_FREEHIT(x) do { } while (0) 361 #define STATS_INC_FREEMISS(x) do { } while (0) 362 #endif 363 364 #if DEBUG 365 /* Magic nums for obj red zoning. 366 * Placed in the first word before and the first word after an obj. 
367 */ 368 #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 369 #define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ 370 371 /* ...and for poisoning */ 372 #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ 373 #define POISON_FREE 0x6b /* for use-after-free poisoning */ 374 #define POISON_END 0xa5 /* end-byte of poisoning */ 375 376 /* memory layout of objects: 377 * 0 : objp 378 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that 379 * the end of an object is aligned with the end of the real 380 * allocation. Catches writes behind the end of the allocation. 381 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: 382 * redzone word. 383 * cachep->dbghead: The real object. 384 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 385 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 386 */ 387 static int obj_dbghead(kmem_cache_t *cachep) 388 { 389 return cachep->dbghead; 390 } 391 392 static int obj_reallen(kmem_cache_t *cachep) 393 { 394 return cachep->reallen; 395 } 396 397 static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) 398 { 399 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 400 return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); 401 } 402 403 static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) 404 { 405 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 406 if (cachep->flags & SLAB_STORE_USER) 407 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 408 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 409 } 410 411 static void **dbg_userword(kmem_cache_t *cachep, void *objp) 412 { 413 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 414 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 415 } 416 417 #else 418 419 #define obj_dbghead(x) 0 420 #define obj_reallen(cachep) (cachep->objsize) 421 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 422 #define dbg_redzone2(cachep, 
objp) ({BUG(); (unsigned long *)NULL;}) 423 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 424 425 #endif 426 427 /* 428 * Maximum size of an obj (in 2^order pages) 429 * and absolute limit for the gfp order. 430 */ 431 #if defined(CONFIG_LARGE_ALLOCS) 432 #define MAX_OBJ_ORDER 13 /* up to 32Mb */ 433 #define MAX_GFP_ORDER 13 /* up to 32Mb */ 434 #elif defined(CONFIG_MMU) 435 #define MAX_OBJ_ORDER 5 /* 32 pages */ 436 #define MAX_GFP_ORDER 5 /* 32 pages */ 437 #else 438 #define MAX_OBJ_ORDER 8 /* up to 1Mb */ 439 #define MAX_GFP_ORDER 8 /* up to 1Mb */ 440 #endif 441 442 /* 443 * Do not go above this order unless 0 objects fit into the slab. 444 */ 445 #define BREAK_GFP_ORDER_HI 1 446 #define BREAK_GFP_ORDER_LO 0 447 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 448 449 /* Macros for storing/retrieving the cachep and or slab from the 450 * global 'mem_map'. These are used to find the slab an obj belongs to. 451 * With kfree(), these are used to find the cache which an obj belongs to. 452 */ 453 #define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) 454 #define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) 455 #define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) 456 #define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) 457 458 /* These are the default caches for kmalloc. Custom caches can have other sizes. */ 459 struct cache_sizes malloc_sizes[] = { 460 #define CACHE(x) { .cs_size = (x) }, 461 #include <linux/kmalloc_sizes.h> 462 { 0, } 463 #undef CACHE 464 }; 465 466 EXPORT_SYMBOL(malloc_sizes); 467 468 /* Must match cache_sizes above. Out of line to keep cache footprint low. 
*/ 469 struct cache_names { 470 char *name; 471 char *name_dma; 472 }; 473 474 static struct cache_names __initdata cache_names[] = { 475 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 476 #include <linux/kmalloc_sizes.h> 477 { NULL, } 478 #undef CACHE 479 }; 480 481 struct arraycache_init initarray_cache __initdata = { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 482 struct arraycache_init initarray_generic __initdata = { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 483 484 /* internal cache of cache description objs */ 485 static kmem_cache_t cache_cache = { 486 .lists = LIST3_INIT(cache_cache.lists), 487 .batchcount = 1, 488 .limit = BOOT_CPUCACHE_ENTRIES, 489 .objsize = sizeof(kmem_cache_t), 490 .flags = SLAB_NO_REAP, 491 .spinlock = SPIN_LOCK_UNLOCKED, 492 .name = "kmem_cache", 493 #if DEBUG 494 .reallen = sizeof(kmem_cache_t), 495 #endif 496 }; 497 498 /* Guard access to the cache-chain. */ 499 static struct semaphore cache_chain_sem; 500 501 struct list_head cache_chain; 502 503 /* 504 * vm_enough_memory() looks at this to determine how many 505 * slab-allocated pages are possibly freeable under pressure 506 * 507 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 508 */ 509 atomic_t slab_reclaim_pages; 510 EXPORT_SYMBOL(slab_reclaim_pages); 511 512 /* 513 * chicken and egg problem: delay the per-cpu array allocation 514 * until the general caches are up. 
515 */ 516 enum { 517 NONE, 518 PARTIAL, 519 FULL 520 } g_cpucache_up; 521 522 static DEFINE_PER_CPU(struct timer_list, reap_timers); 523 524 static void reap_timer_fnc(unsigned long data); 525 static void free_block(kmem_cache_t* cachep, void** objpp, int len); 526 static void enable_cpucache (kmem_cache_t *cachep); 527 528 static inline void ** ac_entry(struct array_cache *ac) 529 { 530 return (void**)(ac+1); 531 } 532 533 static inline struct array_cache *ac_data(kmem_cache_t *cachep) 534 { 535 return cachep->array[smp_processor_id()]; 536 } 537 538 /* Cal the num objs, wastage, and bytes left over for a given slab size. */ 539 static void cache_estimate (unsigned long gfporder, size_t size, size_t align, 540 int flags, size_t *left_over, unsigned int *num) 541 { 542 int i; 543 size_t wastage = PAGE_SIZE<<gfporder; 544 size_t extra = 0; 545 size_t base = 0; 546 547 if (!(flags & CFLGS_OFF_SLAB)) { 548 base = sizeof(struct slab); 549 extra = sizeof(kmem_bufctl_t); 550 } 551 i = 0; 552 while (i*size + ALIGN(base+i*extra, align) <= wastage) 553 i++; 554 if (i > 0) 555 i--; 556 557 if (i > SLAB_LIMIT) 558 i = SLAB_LIMIT; 559 560 *num = i; 561 wastage -= i*size; 562 wastage -= ALIGN(base+i*extra, align); 563 *left_over = wastage; 564 } 565 566 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 567 568 static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 569 { 570 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 571 function, cachep->name, msg); 572 dump_stack(); 573 } 574 575 /* 576 * Start the reap timer running on the target CPU. We run at around 1 to 2Hz. 577 * Add the CPU number into the expiry time to minimize the possibility of the 578 * CPUs getting into lockstep and contending for the global cache chain lock. 
579 */ 580 static void __devinit start_cpu_timer(int cpu) 581 { 582 struct timer_list *rt = &per_cpu(reap_timers, cpu); 583 584 if (rt->function == NULL) { 585 init_timer(rt); 586 rt->expires = jiffies + HZ + 3*cpu; 587 rt->data = cpu; 588 rt->function = reap_timer_fnc; 589 add_timer_on(rt, cpu); 590 } 591 } 592 593 #ifdef CONFIG_HOTPLUG_CPU 594 static void stop_cpu_timer(int cpu) 595 { 596 struct timer_list *rt = &per_cpu(reap_timers, cpu); 597 598 if (rt->function) { 599 del_timer_sync(rt); 600 WARN_ON(timer_pending(rt)); 601 rt->function = NULL; 602 } 603 } 604 #endif 605 606 static struct array_cache *alloc_arraycache(int cpu, int entries, int batchcount) 607 { 608 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 609 struct array_cache *nc = NULL; 610 611 if (cpu != -1) { 612 nc = kmem_cache_alloc_node(kmem_find_general_cachep(memsize, 613 GFP_KERNEL), cpu_to_node(cpu)); 614 } 615 if (!nc) 616 nc = kmalloc(memsize, GFP_KERNEL); 617 if (nc) { 618 nc->avail = 0; 619 nc->limit = entries; 620 nc->batchcount = batchcount; 621 nc->touched = 0; 622 } 623 return nc; 624 } 625 626 static int __devinit cpuup_callback(struct notifier_block *nfb, 627 unsigned long action, 628 void *hcpu) 629 { 630 long cpu = (long)hcpu; 631 kmem_cache_t* cachep; 632 633 switch (action) { 634 case CPU_UP_PREPARE: 635 down(&cache_chain_sem); 636 list_for_each_entry(cachep, &cache_chain, next) { 637 struct array_cache *nc; 638 639 nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount); 640 if (!nc) 641 goto bad; 642 643 spin_lock_irq(&cachep->spinlock); 644 cachep->array[cpu] = nc; 645 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount 646 + cachep->num; 647 spin_unlock_irq(&cachep->spinlock); 648 649 } 650 up(&cache_chain_sem); 651 break; 652 case CPU_ONLINE: 653 start_cpu_timer(cpu); 654 break; 655 #ifdef CONFIG_HOTPLUG_CPU 656 case CPU_DEAD: 657 stop_cpu_timer(cpu); 658 /* fall thru */ 659 case CPU_UP_CANCELED: 660 down(&cache_chain_sem); 661 662 
list_for_each_entry(cachep, &cache_chain, next) { 663 struct array_cache *nc; 664 665 spin_lock_irq(&cachep->spinlock); 666 /* cpu is dead; no one can alloc from it. */ 667 nc = cachep->array[cpu]; 668 cachep->array[cpu] = NULL; 669 cachep->free_limit -= cachep->batchcount; 670 free_block(cachep, ac_entry(nc), nc->avail); 671 spin_unlock_irq(&cachep->spinlock); 672 kfree(nc); 673 } 674 up(&cache_chain_sem); 675 break; 676 #endif 677 } 678 return NOTIFY_OK; 679 bad: 680 up(&cache_chain_sem); 681 return NOTIFY_BAD; 682 } 683 684 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 685 686 /* Initialisation. 687 * Called after the gfp() functions have been enabled, and before smp_init(). 688 */ 689 void __init kmem_cache_init(void) 690 { 691 size_t left_over; 692 struct cache_sizes *sizes; 693 struct cache_names *names; 694 695 /* 696 * Fragmentation resistance on low memory - only use bigger 697 * page orders on machines with more than 32MB of memory. 698 */ 699 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 700 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 701 702 703 /* Bootstrap is tricky, because several objects are allocated 704 * from caches that do not exist yet: 705 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 706 * structures of all caches, except cache_cache itself: cache_cache 707 * is statically allocated. 708 * Initially an __init data area is used for the head array, it's 709 * replaced with a kmalloc allocated array at the end of the bootstrap. 710 * 2) Create the first kmalloc cache. 711 * The kmem_cache_t for the new cache is allocated normally. An __init 712 * data area is used for the head array. 713 * 3) Create the remaining kmalloc caches, with minimally sized head arrays. 714 * 4) Replace the __init data head arrays for cache_cache and the first 715 * kmalloc cache with kmalloc allocated arrays. 716 * 5) Resize the head arrays of the kmalloc caches to their final sizes. 
717 */ 718 719 /* 1) create the cache_cache */ 720 init_MUTEX(&cache_chain_sem); 721 INIT_LIST_HEAD(&cache_chain); 722 list_add(&cache_cache.next, &cache_chain); 723 cache_cache.colour_off = cache_line_size(); 724 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 725 726 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 727 728 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 729 &left_over, &cache_cache.num); 730 if (!cache_cache.num) 731 BUG(); 732 733 cache_cache.colour = left_over/cache_cache.colour_off; 734 cache_cache.colour_next = 0; 735 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 736 sizeof(struct slab), cache_line_size()); 737 738 /* 2+3) create the kmalloc caches */ 739 sizes = malloc_sizes; 740 names = cache_names; 741 742 while (sizes->cs_size) { 743 /* For performance, all the general caches are L1 aligned. 744 * This should be particularly beneficial on SMP boxes, as it 745 * eliminates "false sharing". 746 * Note for systems short on memory removing the alignment will 747 * allow tighter packing of the smaller caches. */ 748 sizes->cs_cachep = kmem_cache_create(names->name, 749 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 750 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 751 752 /* Inc off-slab bufctl limit until the ceiling is hit. 
*/ 753 if (!(OFF_SLAB(sizes->cs_cachep))) { 754 offslab_limit = sizes->cs_size-sizeof(struct slab); 755 offslab_limit /= sizeof(kmem_bufctl_t); 756 } 757 758 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 759 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 760 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 761 NULL, NULL); 762 763 sizes++; 764 names++; 765 } 766 /* 4) Replace the bootstrap head arrays */ 767 { 768 void * ptr; 769 770 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 771 local_irq_disable(); 772 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 773 memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); 774 cache_cache.array[smp_processor_id()] = ptr; 775 local_irq_enable(); 776 777 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 778 local_irq_disable(); 779 BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); 780 memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), 781 sizeof(struct arraycache_init)); 782 malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; 783 local_irq_enable(); 784 } 785 786 /* 5) resize the head arrays to their final sizes */ 787 { 788 kmem_cache_t *cachep; 789 down(&cache_chain_sem); 790 list_for_each_entry(cachep, &cache_chain, next) 791 enable_cpucache(cachep); 792 up(&cache_chain_sem); 793 } 794 795 /* Done! */ 796 g_cpucache_up = FULL; 797 798 /* Register a cpu startup notifier callback 799 * that initializes ac_data for all new cpus 800 */ 801 register_cpu_notifier(&cpucache_notifier); 802 803 804 /* The reap timers are started later, with a module init call: 805 * That part of the kernel is not yet operational. 806 */ 807 } 808 809 int __init cpucache_init(void) 810 { 811 int cpu; 812 813 /* 814 * Register the timers that return unneeded 815 * pages to gfp. 
816 */ 817 for (cpu = 0; cpu < NR_CPUS; cpu++) { 818 if (cpu_online(cpu)) 819 start_cpu_timer(cpu); 820 } 821 822 return 0; 823 } 824 825 __initcall(cpucache_init); 826 827 /* 828 * Interface to system's page allocator. No need to hold the cache-lock. 829 * 830 * If we requested dmaable memory, we will get it. Even if we 831 * did not request dmaable memory, we might get it, but that 832 * would be relatively rare and ignorable. 833 */ 834 static void *kmem_getpages(kmem_cache_t *cachep, int flags, int nodeid) 835 { 836 struct page *page; 837 void *addr; 838 int i; 839 840 flags |= cachep->gfpflags; 841 if (likely(nodeid == -1)) { 842 addr = (void*)__get_free_pages(flags, cachep->gfporder); 843 if (!addr) 844 return NULL; 845 page = virt_to_page(addr); 846 } else { 847 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 848 if (!page) 849 return NULL; 850 addr = page_address(page); 851 } 852 853 i = (1 << cachep->gfporder); 854 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 855 atomic_add(i, &slab_reclaim_pages); 856 add_page_state(nr_slab, i); 857 while (i--) { 858 SetPageSlab(page); 859 page++; 860 } 861 return addr; 862 } 863 864 /* 865 * Interface to system's page release. 
866 */ 867 static void kmem_freepages(kmem_cache_t *cachep, void *addr) 868 { 869 unsigned long i = (1<<cachep->gfporder); 870 struct page *page = virt_to_page(addr); 871 const unsigned long nr_freed = i; 872 873 while (i--) { 874 if (!TestClearPageSlab(page)) 875 BUG(); 876 page++; 877 } 878 sub_page_state(nr_slab, nr_freed); 879 if (current->reclaim_state) 880 current->reclaim_state->reclaimed_slab += nr_freed; 881 free_pages((unsigned long)addr, cachep->gfporder); 882 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 883 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 884 } 885 886 #if DEBUG 887 888 #ifdef CONFIG_DEBUG_PAGEALLOC 889 static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller) 890 { 891 int size = obj_reallen(cachep); 892 893 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 894 895 if (size < 5*sizeof(unsigned long)) 896 return; 897 898 *addr++=0x12345678; 899 *addr++=caller; 900 *addr++=smp_processor_id(); 901 size -= 3*sizeof(unsigned long); 902 { 903 unsigned long *sptr = &caller; 904 unsigned long svalue; 905 906 while (!kstack_end(sptr)) { 907 svalue = *sptr++; 908 if (kernel_text_address(svalue)) { 909 *addr++=svalue; 910 size -= sizeof(unsigned long); 911 if (size <= sizeof(unsigned long)) 912 break; 913 } 914 } 915 916 } 917 *addr++=0x87654321; 918 } 919 #endif 920 921 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 922 { 923 int size = obj_reallen(cachep); 924 addr = &((char*)addr)[obj_dbghead(cachep)]; 925 926 memset(addr, val, size); 927 *(unsigned char *)(addr+size-1) = POISON_END; 928 } 929 930 static void dump_line(char *data, int offset, int limit) 931 { 932 int i; 933 printk(KERN_ERR "%03x:", offset); 934 for (i=0;i<limit;i++) { 935 printk(" %02x", (unsigned char)data[offset+i]); 936 } 937 printk("\n"); 938 } 939 #endif 940 941 static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) 942 { 943 #if DEBUG 944 int i, size; 945 char *realobj; 946 947 
if (cachep->flags & SLAB_RED_ZONE) { 948 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 949 *dbg_redzone1(cachep, objp), 950 *dbg_redzone2(cachep, objp)); 951 } 952 953 if (cachep->flags & SLAB_STORE_USER) { 954 printk(KERN_ERR "Last user: [<%p>]", *dbg_userword(cachep, objp)); 955 print_symbol("(%s)", (unsigned long)*dbg_userword(cachep, objp)); 956 printk("\n"); 957 } 958 realobj = (char*)objp+obj_dbghead(cachep); 959 size = obj_reallen(cachep); 960 for (i=0; i<size && lines;i+=16, lines--) { 961 int limit; 962 limit = 16; 963 if (i+limit > size) 964 limit = size-i; 965 dump_line(realobj, i, limit); 966 } 967 #endif 968 } 969 970 #if DEBUG 971 972 static void check_poison_obj(kmem_cache_t *cachep, void *objp) 973 { 974 char *realobj; 975 int size, i; 976 int lines = 0; 977 978 realobj = (char*)objp+obj_dbghead(cachep); 979 size = obj_reallen(cachep); 980 981 for (i=0;i<size;i++) { 982 char exp = POISON_FREE; 983 if (i == size-1) 984 exp = POISON_END; 985 if (realobj[i] != exp) { 986 int limit; 987 /* Mismatch ! 
*/ 988 /* Print header */ 989 if (lines == 0) { 990 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 991 realobj, size); 992 print_objinfo(cachep, objp, 0); 993 } 994 /* Hexdump the affected line */ 995 i = (i/16)*16; 996 limit = 16; 997 if (i+limit > size) 998 limit = size-i; 999 dump_line(realobj, i, limit); 1000 i += 16; 1001 lines++; 1002 /* Limit to 5 lines */ 1003 if (lines > 5) 1004 break; 1005 } 1006 } 1007 if (lines != 0) { 1008 /* Print some data about the neighboring objects, if they 1009 * exist: 1010 */ 1011 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); 1012 int objnr; 1013 1014 objnr = (objp-slabp->s_mem)/cachep->objsize; 1015 if (objnr) { 1016 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1017 realobj = (char*)objp+obj_dbghead(cachep); 1018 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1019 realobj, size); 1020 print_objinfo(cachep, objp, 2); 1021 } 1022 if (objnr+1 < cachep->num) { 1023 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1024 realobj = (char*)objp+obj_dbghead(cachep); 1025 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1026 realobj, size); 1027 print_objinfo(cachep, objp, 2); 1028 } 1029 } 1030 } 1031 #endif 1032 1033 /* Destroy all the objs in a slab, and release the mem back to the system. 1034 * Before calling the slab must have been unlinked from the cache. 1035 * The cache-lock is not held/needed. 
1036 */ 1037 static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1038 { 1039 #if DEBUG 1040 int i; 1041 for (i = 0; i < cachep->num; i++) { 1042 void *objp = slabp->s_mem + cachep->objsize * i; 1043 1044 if (cachep->flags & SLAB_POISON) { 1045 #ifdef CONFIG_DEBUG_PAGEALLOC 1046 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1047 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1048 else 1049 check_poison_obj(cachep, objp); 1050 #else 1051 check_poison_obj(cachep, objp); 1052 #endif 1053 } 1054 if (cachep->flags & SLAB_RED_ZONE) { 1055 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1056 slab_error(cachep, "start of a freed object " 1057 "was overwritten"); 1058 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1059 slab_error(cachep, "end of a freed object " 1060 "was overwritten"); 1061 } 1062 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1063 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1064 } 1065 #else 1066 if (cachep->dtor) { 1067 int i; 1068 for (i = 0; i < cachep->num; i++) { 1069 void* objp = slabp->s_mem+cachep->objsize*i; 1070 (cachep->dtor)(objp, cachep, 0); 1071 } 1072 } 1073 #endif 1074 1075 kmem_freepages(cachep, slabp->s_mem-slabp->colouroff); 1076 if (OFF_SLAB(cachep)) 1077 kmem_cache_free(cachep->slabp_cache, slabp); 1078 } 1079 1080 /** 1081 * kmem_cache_create - Create a cache. 1082 * @name: A string which is used in /proc/slabinfo to identify this cache. 1083 * @size: The size of objects to be created in this cache. 1084 * @align: The required alignment for the objects. 1085 * @flags: SLAB flags 1086 * @ctor: A constructor for the objects. 1087 * @dtor: A destructor for the objects. 1088 * 1089 * Returns a ptr to the cache on success, NULL on failure. 1090 * Cannot be called within a int, but can be interrupted. 1091 * The @ctor is run when new pages are allocated by the cache 1092 * and the @dtor is run before the pages are handed back. 
1093 * 1094 * @name must be valid until the cache is destroyed. This implies that 1095 * the module calling this has to destroy the cache before getting 1096 * unloaded. 1097 * 1098 * The flags are 1099 * 1100 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1101 * to catch references to uninitialised memory. 1102 * 1103 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1104 * for buffer overruns. 1105 * 1106 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under 1107 * memory pressure. 1108 * 1109 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1110 * cacheline. This can be beneficial if you're counting cycles as closely 1111 * as davem. 1112 */ 1113 kmem_cache_t * 1114 kmem_cache_create (const char *name, size_t size, size_t align, 1115 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), 1116 void (*dtor)(void*, kmem_cache_t *, unsigned long)) 1117 { 1118 size_t left_over, slab_size; 1119 kmem_cache_t *cachep = NULL; 1120 1121 /* 1122 * Sanity checks... these are all serious usage bugs. 
1123 */ 1124 if ((!name) || 1125 in_interrupt() || 1126 (size < BYTES_PER_WORD) || 1127 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1128 (dtor && !ctor)) { 1129 printk(KERN_ERR "%s: Early error in slab %s\n", 1130 __FUNCTION__, name); 1131 BUG(); 1132 } 1133 1134 #if DEBUG 1135 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 1136 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1137 /* No constructor, but inital state check requested */ 1138 printk(KERN_ERR "%s: No con, but init state check " 1139 "requested - %s\n", __FUNCTION__, name); 1140 flags &= ~SLAB_DEBUG_INITIAL; 1141 } 1142 1143 #if FORCED_DEBUG 1144 /* 1145 * Enable redzoning and last user accounting, except for caches with 1146 * large objects, if the increased size would increase the object size 1147 * above the next power of two: caches with object sizes just above a 1148 * power of two have a significant amount of internal fragmentation. 1149 */ 1150 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1151 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1152 flags |= SLAB_POISON; 1153 #endif 1154 #endif 1155 /* 1156 * Always checks flags, a caller might be expecting debug 1157 * support which isn't available. 1158 */ 1159 if (flags & ~CREATE_MASK) 1160 BUG(); 1161 1162 if (align) { 1163 /* combinations of forced alignment and advanced debugging is 1164 * not yet implemented. 1165 */ 1166 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1167 } else { 1168 if (flags & SLAB_HWCACHE_ALIGN) { 1169 /* Default alignment: as specified by the arch code. 1170 * Except if an object is really small, then squeeze multiple 1171 * into one cacheline. 1172 */ 1173 align = cache_line_size(); 1174 while (size <= align/2) 1175 align /= 2; 1176 } else { 1177 align = BYTES_PER_WORD; 1178 } 1179 } 1180 1181 /* Get cache's description obj. 
*/ 1182 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 1183 if (!cachep) 1184 goto opps; 1185 memset(cachep, 0, sizeof(kmem_cache_t)); 1186 1187 /* Check that size is in terms of words. This is needed to avoid 1188 * unaligned accesses for some archs when redzoning is used, and makes 1189 * sure any on-slab bufctl's are also correctly aligned. 1190 */ 1191 if (size & (BYTES_PER_WORD-1)) { 1192 size += (BYTES_PER_WORD-1); 1193 size &= ~(BYTES_PER_WORD-1); 1194 } 1195 1196 #if DEBUG 1197 cachep->reallen = size; 1198 1199 if (flags & SLAB_RED_ZONE) { 1200 /* redzoning only works with word aligned caches */ 1201 align = BYTES_PER_WORD; 1202 1203 /* add space for red zone words */ 1204 cachep->dbghead += BYTES_PER_WORD; 1205 size += 2*BYTES_PER_WORD; 1206 } 1207 if (flags & SLAB_STORE_USER) { 1208 /* user store requires word alignment and 1209 * one word storage behind the end of the real 1210 * object. 1211 */ 1212 align = BYTES_PER_WORD; 1213 size += BYTES_PER_WORD; 1214 } 1215 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1216 if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1217 cachep->dbghead += PAGE_SIZE - size; 1218 size = PAGE_SIZE; 1219 } 1220 #endif 1221 #endif 1222 1223 /* Determine if the slab management is 'on' or 'off' slab. */ 1224 if (size >= (PAGE_SIZE>>3)) 1225 /* 1226 * Size is large, assume best to place the slab management obj 1227 * off-slab (should allow better packing of objs). 1228 */ 1229 flags |= CFLGS_OFF_SLAB; 1230 1231 size = ALIGN(size, align); 1232 1233 if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { 1234 /* 1235 * A VFS-reclaimable slab tends to have most allocations 1236 * as GFP_NOFS and we really don't want to have to be allocating 1237 * higher-order pages when we are unable to shrink dcache. 
1238 */ 1239 cachep->gfporder = 0; 1240 cache_estimate(cachep->gfporder, size, align, flags, 1241 &left_over, &cachep->num); 1242 } else { 1243 /* 1244 * Calculate size (in pages) of slabs, and the num of objs per 1245 * slab. This could be made much more intelligent. For now, 1246 * try to avoid using high page-orders for slabs. When the 1247 * gfp() funcs are more friendly towards high-order requests, 1248 * this should be changed. 1249 */ 1250 do { 1251 unsigned int break_flag = 0; 1252 cal_wastage: 1253 cache_estimate(cachep->gfporder, size, align, flags, 1254 &left_over, &cachep->num); 1255 if (break_flag) 1256 break; 1257 if (cachep->gfporder >= MAX_GFP_ORDER) 1258 break; 1259 if (!cachep->num) 1260 goto next; 1261 if (flags & CFLGS_OFF_SLAB && 1262 cachep->num > offslab_limit) { 1263 /* This num of objs will cause problems. */ 1264 cachep->gfporder--; 1265 break_flag++; 1266 goto cal_wastage; 1267 } 1268 1269 /* 1270 * Large num of objs is good, but v. large slabs are 1271 * currently bad for the gfp()s. 1272 */ 1273 if (cachep->gfporder >= slab_break_gfp_order) 1274 break; 1275 1276 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) 1277 break; /* Acceptable internal fragmentation. */ 1278 next: 1279 cachep->gfporder++; 1280 } while (1); 1281 } 1282 1283 if (!cachep->num) { 1284 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1285 kmem_cache_free(&cache_cache, cachep); 1286 cachep = NULL; 1287 goto opps; 1288 } 1289 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1290 + sizeof(struct slab), align); 1291 1292 /* 1293 * If the slab has been placed off-slab, and we have enough space then 1294 * move it on-slab. This is at the expense of any extra colouring. 1295 */ 1296 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 1297 flags &= ~CFLGS_OFF_SLAB; 1298 left_over -= slab_size; 1299 } 1300 1301 if (flags & CFLGS_OFF_SLAB) { 1302 /* really off slab. 
No need for manual alignment */ 1303 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1304 } 1305 1306 cachep->colour_off = cache_line_size(); 1307 /* Offset must be a multiple of the alignment. */ 1308 if (cachep->colour_off < align) 1309 cachep->colour_off = align; 1310 cachep->colour = left_over/cachep->colour_off; 1311 cachep->slab_size = slab_size; 1312 cachep->flags = flags; 1313 cachep->gfpflags = 0; 1314 if (flags & SLAB_CACHE_DMA) 1315 cachep->gfpflags |= GFP_DMA; 1316 spin_lock_init(&cachep->spinlock); 1317 cachep->objsize = size; 1318 /* NUMA */ 1319 INIT_LIST_HEAD(&cachep->lists.slabs_full); 1320 INIT_LIST_HEAD(&cachep->lists.slabs_partial); 1321 INIT_LIST_HEAD(&cachep->lists.slabs_free); 1322 1323 if (flags & CFLGS_OFF_SLAB) 1324 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); 1325 cachep->ctor = ctor; 1326 cachep->dtor = dtor; 1327 cachep->name = name; 1328 1329 /* Don't let CPUs to come and go */ 1330 lock_cpu_hotplug(); 1331 1332 if (g_cpucache_up == FULL) { 1333 enable_cpucache(cachep); 1334 } else { 1335 if (g_cpucache_up == NONE) { 1336 /* Note: the first kmem_cache_create must create 1337 * the cache that's used by kmalloc(24), otherwise 1338 * the creation of further caches will BUG(). 1339 */ 1340 cachep->array[smp_processor_id()] = &initarray_generic.cache; 1341 g_cpucache_up = PARTIAL; 1342 } else { 1343 cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); 1344 } 1345 BUG_ON(!ac_data(cachep)); 1346 ac_data(cachep)->avail = 0; 1347 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1348 ac_data(cachep)->batchcount = 1; 1349 ac_data(cachep)->touched = 0; 1350 cachep->batchcount = 1; 1351 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1352 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount 1353 + cachep->num; 1354 } 1355 1356 cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 + 1357 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1358 1359 /* Need the semaphore to access the chain. 
*/ 1360 down(&cache_chain_sem); 1361 { 1362 struct list_head *p; 1363 mm_segment_t old_fs; 1364 1365 old_fs = get_fs(); 1366 set_fs(KERNEL_DS); 1367 list_for_each(p, &cache_chain) { 1368 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); 1369 char tmp; 1370 /* This happens when the module gets unloaded and doesn't 1371 destroy its slab cache and noone else reuses the vmalloc 1372 area of the module. Print a warning. */ 1373 if (__get_user(tmp,pc->name)) { 1374 printk("SLAB: cache with size %d has lost its name\n", 1375 pc->objsize); 1376 continue; 1377 } 1378 if (!strcmp(pc->name,name)) { 1379 printk("kmem_cache_create: duplicate cache %s\n",name); 1380 up(&cache_chain_sem); 1381 unlock_cpu_hotplug(); 1382 BUG(); 1383 } 1384 } 1385 set_fs(old_fs); 1386 } 1387 1388 /* cache setup completed, link it into the list */ 1389 list_add(&cachep->next, &cache_chain); 1390 up(&cache_chain_sem); 1391 unlock_cpu_hotplug(); 1392 opps: 1393 if (!cachep && (flags & SLAB_PANIC)) 1394 panic("kmem_cache_create(): failed to create slab `%s'\n", 1395 name); 1396 return cachep; 1397 } 1398 EXPORT_SYMBOL(kmem_cache_create); 1399 1400 #if DEBUG 1401 static void check_irq_off(void) 1402 { 1403 BUG_ON(!irqs_disabled()); 1404 } 1405 1406 static void check_irq_on(void) 1407 { 1408 BUG_ON(irqs_disabled()); 1409 } 1410 1411 static void check_spinlock_acquired(kmem_cache_t *cachep) 1412 { 1413 #ifdef CONFIG_SMP 1414 check_irq_off(); 1415 BUG_ON(spin_trylock(&cachep->spinlock)); 1416 #endif 1417 } 1418 #else 1419 #define check_irq_off() do { } while(0) 1420 #define check_irq_on() do { } while(0) 1421 #define check_spinlock_acquired(x) do { } while(0) 1422 #endif 1423 1424 /* 1425 * Waits for all CPUs to execute func(). 
1426 */ 1427 static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1428 { 1429 check_irq_on(); 1430 preempt_disable(); 1431 1432 local_irq_disable(); 1433 func(arg); 1434 local_irq_enable(); 1435 1436 if (smp_call_function(func, arg, 1, 1)) 1437 BUG(); 1438 1439 preempt_enable(); 1440 } 1441 1442 static void drain_array_locked(kmem_cache_t* cachep, 1443 struct array_cache *ac, int force); 1444 1445 static void do_drain(void *arg) 1446 { 1447 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1448 struct array_cache *ac; 1449 1450 check_irq_off(); 1451 ac = ac_data(cachep); 1452 spin_lock(&cachep->spinlock); 1453 free_block(cachep, &ac_entry(ac)[0], ac->avail); 1454 spin_unlock(&cachep->spinlock); 1455 ac->avail = 0; 1456 } 1457 1458 static void drain_cpu_caches(kmem_cache_t *cachep) 1459 { 1460 smp_call_function_all_cpus(do_drain, cachep); 1461 check_irq_on(); 1462 spin_lock_irq(&cachep->spinlock); 1463 if (cachep->lists.shared) 1464 drain_array_locked(cachep, cachep->lists.shared, 1); 1465 spin_unlock_irq(&cachep->spinlock); 1466 } 1467 1468 1469 /* NUMA shrink all list3s */ 1470 static int __cache_shrink(kmem_cache_t *cachep) 1471 { 1472 struct slab *slabp; 1473 int ret; 1474 1475 drain_cpu_caches(cachep); 1476 1477 check_irq_on(); 1478 spin_lock_irq(&cachep->spinlock); 1479 1480 for(;;) { 1481 struct list_head *p; 1482 1483 p = cachep->lists.slabs_free.prev; 1484 if (p == &cachep->lists.slabs_free) 1485 break; 1486 1487 slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list); 1488 #if DEBUG 1489 if (slabp->inuse) 1490 BUG(); 1491 #endif 1492 list_del(&slabp->list); 1493 1494 cachep->lists.free_objects -= cachep->num; 1495 spin_unlock_irq(&cachep->spinlock); 1496 slab_destroy(cachep, slabp); 1497 spin_lock_irq(&cachep->spinlock); 1498 } 1499 ret = !list_empty(&cachep->lists.slabs_full) || 1500 !list_empty(&cachep->lists.slabs_partial); 1501 spin_unlock_irq(&cachep->spinlock); 1502 return ret; 1503 } 1504 1505 /** 1506 * 
kmem_cache_shrink - Shrink a cache. 1507 * @cachep: The cache to shrink. 1508 * 1509 * Releases as many slabs as possible for a cache. 1510 * To help debugging, a zero exit status indicates all slabs were released. 1511 */ 1512 int kmem_cache_shrink(kmem_cache_t *cachep) 1513 { 1514 if (!cachep || in_interrupt()) 1515 BUG(); 1516 1517 return __cache_shrink(cachep); 1518 } 1519 1520 EXPORT_SYMBOL(kmem_cache_shrink); 1521 1522 /** 1523 * kmem_cache_destroy - delete a cache 1524 * @cachep: the cache to destroy 1525 * 1526 * Remove a kmem_cache_t object from the slab cache. 1527 * Returns 0 on success. 1528 * 1529 * It is expected this function will be called by a module when it is 1530 * unloaded. This will remove the cache completely, and avoid a duplicate 1531 * cache being allocated each time a module is loaded and unloaded, if the 1532 * module doesn't have persistent in-kernel storage across loads and unloads. 1533 * 1534 * The cache must be empty before calling this function. 1535 * 1536 * The caller must guarantee that noone will allocate memory from the cache 1537 * during the kmem_cache_destroy(). 1538 */ 1539 int kmem_cache_destroy (kmem_cache_t * cachep) 1540 { 1541 int i; 1542 1543 if (!cachep || in_interrupt()) 1544 BUG(); 1545 1546 /* Don't let CPUs to come and go */ 1547 lock_cpu_hotplug(); 1548 1549 /* Find the cache in the chain of caches. */ 1550 down(&cache_chain_sem); 1551 /* 1552 * the chain is never empty, cache_cache is never destroyed 1553 */ 1554 list_del(&cachep->next); 1555 up(&cache_chain_sem); 1556 1557 if (__cache_shrink(cachep)) { 1558 slab_error(cachep, "Can't free all objects"); 1559 down(&cache_chain_sem); 1560 list_add(&cachep->next,&cache_chain); 1561 up(&cache_chain_sem); 1562 unlock_cpu_hotplug(); 1563 return 1; 1564 } 1565 1566 /* no cpu_online check required here since we clear the percpu 1567 * array on cpu offline and set this to NULL. 
1568 */ 1569 for (i = 0; i < NR_CPUS; i++) 1570 kfree(cachep->array[i]); 1571 1572 /* NUMA: free the list3 structures */ 1573 kfree(cachep->lists.shared); 1574 cachep->lists.shared = NULL; 1575 kmem_cache_free(&cache_cache, cachep); 1576 1577 unlock_cpu_hotplug(); 1578 1579 return 0; 1580 } 1581 1582 EXPORT_SYMBOL(kmem_cache_destroy); 1583 1584 /* Get the memory for a slab management obj. */ 1585 static struct slab* alloc_slabmgmt (kmem_cache_t *cachep, 1586 void *objp, int colour_off, int local_flags) 1587 { 1588 struct slab *slabp; 1589 1590 if (OFF_SLAB(cachep)) { 1591 /* Slab management obj is off-slab. */ 1592 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 1593 if (!slabp) 1594 return NULL; 1595 } else { 1596 slabp = objp+colour_off; 1597 colour_off += cachep->slab_size; 1598 } 1599 slabp->inuse = 0; 1600 slabp->colouroff = colour_off; 1601 slabp->s_mem = objp+colour_off; 1602 1603 return slabp; 1604 } 1605 1606 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 1607 { 1608 return (kmem_bufctl_t *)(slabp+1); 1609 } 1610 1611 static void cache_init_objs (kmem_cache_t * cachep, 1612 struct slab * slabp, unsigned long ctor_flags) 1613 { 1614 int i; 1615 1616 for (i = 0; i < cachep->num; i++) { 1617 void* objp = slabp->s_mem+cachep->objsize*i; 1618 #if DEBUG 1619 /* need to poison the objs? */ 1620 if (cachep->flags & SLAB_POISON) 1621 poison_obj(cachep, objp, POISON_FREE); 1622 if (cachep->flags & SLAB_STORE_USER) 1623 *dbg_userword(cachep, objp) = NULL; 1624 1625 if (cachep->flags & SLAB_RED_ZONE) { 1626 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 1627 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 1628 } 1629 /* 1630 * Constructors are not allowed to allocate memory from 1631 * the same cache which they are a constructor for. 1632 * Otherwise, deadlock. They must also be threaded. 
1633 */ 1634 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 1635 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 1636 1637 if (cachep->flags & SLAB_RED_ZONE) { 1638 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1639 slab_error(cachep, "constructor overwrote the" 1640 " end of an object"); 1641 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1642 slab_error(cachep, "constructor overwrote the" 1643 " start of an object"); 1644 } 1645 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 1646 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 1647 #else 1648 if (cachep->ctor) 1649 cachep->ctor(objp, cachep, ctor_flags); 1650 #endif 1651 slab_bufctl(slabp)[i] = i+1; 1652 } 1653 slab_bufctl(slabp)[i-1] = BUFCTL_END; 1654 slabp->free = 0; 1655 } 1656 1657 static void kmem_flagcheck(kmem_cache_t *cachep, int flags) 1658 { 1659 if (flags & SLAB_DMA) { 1660 if (!(cachep->gfpflags & GFP_DMA)) 1661 BUG(); 1662 } else { 1663 if (cachep->gfpflags & GFP_DMA) 1664 BUG(); 1665 } 1666 } 1667 1668 static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 1669 { 1670 int i; 1671 struct page *page; 1672 1673 /* Nasty!!!!!! I hope this is OK. */ 1674 i = 1 << cachep->gfporder; 1675 page = virt_to_page(objp); 1676 do { 1677 SET_PAGE_CACHE(page, cachep); 1678 SET_PAGE_SLAB(page, slabp); 1679 page++; 1680 } while (--i); 1681 } 1682 1683 /* 1684 * Grow (by 1) the number of slabs within a cache. This is called by 1685 * kmem_cache_alloc() when there are no active objs left in a cache. 1686 */ 1687 static int cache_grow (kmem_cache_t * cachep, int flags) 1688 { 1689 struct slab *slabp; 1690 void *objp; 1691 size_t offset; 1692 int local_flags; 1693 unsigned long ctor_flags; 1694 1695 /* Be lazy and only check for valid flags here, 1696 * keeping it out of the critical path in kmem_cache_alloc(). 
1697 */ 1698 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 1699 BUG(); 1700 if (flags & SLAB_NO_GROW) 1701 return 0; 1702 1703 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 1704 local_flags = (flags & SLAB_LEVEL_MASK); 1705 if (!(local_flags & __GFP_WAIT)) 1706 /* 1707 * Not allowed to sleep. Need to tell a constructor about 1708 * this - it might need to know... 1709 */ 1710 ctor_flags |= SLAB_CTOR_ATOMIC; 1711 1712 /* About to mess with non-constant members - lock. */ 1713 check_irq_off(); 1714 spin_lock(&cachep->spinlock); 1715 1716 /* Get colour for the slab, and cal the next value. */ 1717 offset = cachep->colour_next; 1718 cachep->colour_next++; 1719 if (cachep->colour_next >= cachep->colour) 1720 cachep->colour_next = 0; 1721 offset *= cachep->colour_off; 1722 1723 spin_unlock(&cachep->spinlock); 1724 1725 if (local_flags & __GFP_WAIT) 1726 local_irq_enable(); 1727 1728 /* 1729 * The test for missing atomic flag is performed here, rather than 1730 * the more obvious place, simply to reduce the critical path length 1731 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 1732 * will eventually be caught here (where it matters). 1733 */ 1734 kmem_flagcheck(cachep, flags); 1735 1736 1737 /* Get mem for the objs. */ 1738 if (!(objp = kmem_getpages(cachep, flags, -1))) 1739 goto failed; 1740 1741 /* Get slab management. */ 1742 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 1743 goto opps1; 1744 1745 set_slab_attr(cachep, slabp, objp); 1746 1747 cache_init_objs(cachep, slabp, ctor_flags); 1748 1749 if (local_flags & __GFP_WAIT) 1750 local_irq_disable(); 1751 check_irq_off(); 1752 spin_lock(&cachep->spinlock); 1753 1754 /* Make slab active. 
*/ 1755 list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); 1756 STATS_INC_GROWN(cachep); 1757 list3_data(cachep)->free_objects += cachep->num; 1758 spin_unlock(&cachep->spinlock); 1759 return 1; 1760 opps1: 1761 kmem_freepages(cachep, objp); 1762 failed: 1763 if (local_flags & __GFP_WAIT) 1764 local_irq_disable(); 1765 return 0; 1766 } 1767 1768 #if DEBUG 1769 1770 /* 1771 * Perform extra freeing checks: 1772 * - detect bad pointers. 1773 * - POISON/RED_ZONE checking 1774 * - destructor calls, for caches with POISON+dtor 1775 */ 1776 static void kfree_debugcheck(const void *objp) 1777 { 1778 struct page *page; 1779 1780 if (!virt_addr_valid(objp)) { 1781 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 1782 (unsigned long)objp); 1783 BUG(); 1784 } 1785 page = virt_to_page(objp); 1786 if (!PageSlab(page)) { 1787 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 1788 BUG(); 1789 } 1790 } 1791 1792 static void *cache_free_debugcheck (kmem_cache_t * cachep, void * objp, void *caller) 1793 { 1794 struct page *page; 1795 unsigned int objnr; 1796 struct slab *slabp; 1797 1798 objp -= obj_dbghead(cachep); 1799 kfree_debugcheck(objp); 1800 page = virt_to_page(objp); 1801 1802 if (GET_PAGE_CACHE(page) != cachep) { 1803 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 1804 GET_PAGE_CACHE(page),cachep); 1805 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 1806 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 1807 WARN_ON(1); 1808 } 1809 slabp = GET_PAGE_SLAB(page); 1810 1811 if (cachep->flags & SLAB_RED_ZONE) { 1812 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 1813 slab_error(cachep, "double free, or memory outside" 1814 " object was overwritten"); 1815 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 1816 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 1817 } 1818 
*dbg_redzone1(cachep, objp) = RED_INACTIVE; 1819 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 1820 } 1821 if (cachep->flags & SLAB_STORE_USER) 1822 *dbg_userword(cachep, objp) = caller; 1823 1824 objnr = (objp-slabp->s_mem)/cachep->objsize; 1825 1826 BUG_ON(objnr >= cachep->num); 1827 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 1828 1829 if (cachep->flags & SLAB_DEBUG_INITIAL) { 1830 /* Need to call the slab's constructor so the 1831 * caller can perform a verify of its state (debugging). 1832 * Called without the cache-lock held. 1833 */ 1834 cachep->ctor(objp+obj_dbghead(cachep), 1835 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 1836 } 1837 if (cachep->flags & SLAB_POISON && cachep->dtor) { 1838 /* we want to cache poison the object, 1839 * call the destruction callback 1840 */ 1841 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 1842 } 1843 if (cachep->flags & SLAB_POISON) { 1844 #ifdef CONFIG_DEBUG_PAGEALLOC 1845 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 1846 store_stackinfo(cachep, objp, (unsigned long)caller); 1847 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 1848 } else { 1849 poison_obj(cachep, objp, POISON_FREE); 1850 } 1851 #else 1852 poison_obj(cachep, objp, POISON_FREE); 1853 #endif 1854 } 1855 return objp; 1856 } 1857 1858 static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 1859 { 1860 int i; 1861 int entries = 0; 1862 1863 check_spinlock_acquired(cachep); 1864 /* Check slab's freelist to see if this obj is there. */ 1865 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 1866 entries++; 1867 if (entries > cachep->num || i < 0 || i >= cachep->num) 1868 goto bad; 1869 } 1870 if (entries != cachep->num - slabp->inuse) { 1871 int i; 1872 bad: 1873 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", 1874 cachep->name, cachep->num, slabp, slabp->inuse); 1875 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 1876 if ((i%16)==0) 1877 printk("\n%03x:", i); 1878 printk(" %02x", ((unsigned char*)slabp)[i]); 1879 } 1880 printk("\n"); 1881 BUG(); 1882 } 1883 } 1884 #else 1885 #define kfree_debugcheck(x) do { } while(0) 1886 #define cache_free_debugcheck(x,objp,z) (objp) 1887 #define check_slabp(x,y) do { } while(0) 1888 #endif 1889 1890 static void* cache_alloc_refill(kmem_cache_t* cachep, int flags) 1891 { 1892 int batchcount; 1893 struct kmem_list3 *l3; 1894 struct array_cache *ac; 1895 1896 check_irq_off(); 1897 ac = ac_data(cachep); 1898 retry: 1899 batchcount = ac->batchcount; 1900 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 1901 /* if there was little recent activity on this 1902 * cache, then perform only a partial refill. 1903 * Otherwise we could generate refill bouncing. 1904 */ 1905 batchcount = BATCHREFILL_LIMIT; 1906 } 1907 l3 = list3_data(cachep); 1908 1909 BUG_ON(ac->avail > 0); 1910 spin_lock(&cachep->spinlock); 1911 if (l3->shared) { 1912 struct array_cache *shared_array = l3->shared; 1913 if (shared_array->avail) { 1914 if (batchcount > shared_array->avail) 1915 batchcount = shared_array->avail; 1916 shared_array->avail -= batchcount; 1917 ac->avail = batchcount; 1918 memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail], 1919 sizeof(void*)*batchcount); 1920 shared_array->touched = 1; 1921 goto alloc_done; 1922 } 1923 } 1924 while (batchcount > 0) { 1925 struct list_head *entry; 1926 struct slab *slabp; 1927 /* Get slab alloc is to come from. 
*/ 1928 entry = l3->slabs_partial.next; 1929 if (entry == &l3->slabs_partial) { 1930 l3->free_touched = 1; 1931 entry = l3->slabs_free.next; 1932 if (entry == &l3->slabs_free) 1933 goto must_grow; 1934 } 1935 1936 slabp = list_entry(entry, struct slab, list); 1937 check_slabp(cachep, slabp); 1938 check_spinlock_acquired(cachep); 1939 while (slabp->inuse < cachep->num && batchcount--) { 1940 kmem_bufctl_t next; 1941 STATS_INC_ALLOCED(cachep); 1942 STATS_INC_ACTIVE(cachep); 1943 STATS_SET_HIGH(cachep); 1944 1945 /* get obj pointer */ 1946 ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize; 1947 1948 slabp->inuse++; 1949 next = slab_bufctl(slabp)[slabp->free]; 1950 #if DEBUG 1951 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 1952 #endif 1953 slabp->free = next; 1954 } 1955 check_slabp(cachep, slabp); 1956 1957 /* move slabp to correct slabp list: */ 1958 list_del(&slabp->list); 1959 if (slabp->free == BUFCTL_END) 1960 list_add(&slabp->list, &l3->slabs_full); 1961 else 1962 list_add(&slabp->list, &l3->slabs_partial); 1963 } 1964 1965 must_grow: 1966 l3->free_objects -= ac->avail; 1967 alloc_done: 1968 spin_unlock(&cachep->spinlock); 1969 1970 if (unlikely(!ac->avail)) { 1971 int x; 1972 x = cache_grow(cachep, flags); 1973 1974 // cache_grow can reenable interrupts, then ac could change. 1975 ac = ac_data(cachep); 1976 if (!x && ac->avail == 0) // no objects in sight? abort 1977 return NULL; 1978 1979 if (!ac->avail) // objects refilled by interrupt? 
1980 goto retry; 1981 } 1982 ac->touched = 1; 1983 return ac_entry(ac)[--ac->avail]; 1984 } 1985 1986 static inline void 1987 cache_alloc_debugcheck_before(kmem_cache_t *cachep, int flags) 1988 { 1989 might_sleep_if(flags & __GFP_WAIT); 1990 #if DEBUG 1991 kmem_flagcheck(cachep, flags); 1992 #endif 1993 } 1994 1995 #if DEBUG 1996 static void * 1997 cache_alloc_debugcheck_after(kmem_cache_t *cachep, 1998 unsigned long flags, void *objp, void *caller) 1999 { 2000 if (!objp) 2001 return objp; 2002 if (cachep->flags & SLAB_POISON) { 2003 #ifdef CONFIG_DEBUG_PAGEALLOC 2004 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2005 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2006 else 2007 check_poison_obj(cachep, objp); 2008 #else 2009 check_poison_obj(cachep, objp); 2010 #endif 2011 poison_obj(cachep, objp, POISON_INUSE); 2012 } 2013 if (cachep->flags & SLAB_STORE_USER) 2014 *dbg_userword(cachep, objp) = caller; 2015 2016 if (cachep->flags & SLAB_RED_ZONE) { 2017 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2018 slab_error(cachep, "double free, or memory outside" 2019 " object was overwritten"); 2020 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2021 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2022 } 2023 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2024 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2025 } 2026 objp += obj_dbghead(cachep); 2027 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2028 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2029 2030 if (!(flags & __GFP_WAIT)) 2031 ctor_flags |= SLAB_CTOR_ATOMIC; 2032 2033 cachep->ctor(objp, cachep, ctor_flags); 2034 } 2035 return objp; 2036 } 2037 #else 2038 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2039 #endif 2040 2041 2042 static inline void * __cache_alloc (kmem_cache_t *cachep, int flags) 2043 { 2044 unsigned long save_flags; 2045 void* objp; 2046 struct array_cache *ac; 2047 2048 
cache_alloc_debugcheck_before(cachep, flags); 2049 2050 local_irq_save(save_flags); 2051 ac = ac_data(cachep); 2052 if (likely(ac->avail)) { 2053 STATS_INC_ALLOCHIT(cachep); 2054 ac->touched = 1; 2055 objp = ac_entry(ac)[--ac->avail]; 2056 } else { 2057 STATS_INC_ALLOCMISS(cachep); 2058 objp = cache_alloc_refill(cachep, flags); 2059 } 2060 local_irq_restore(save_flags); 2061 objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); 2062 return objp; 2063 } 2064 2065 /* 2066 * NUMA: different approach needed if the spinlock is moved into 2067 * the l3 structure 2068 */ 2069 2070 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) 2071 { 2072 int i; 2073 2074 check_spinlock_acquired(cachep); 2075 2076 /* NUMA: move add into loop */ 2077 cachep->lists.free_objects += nr_objects; 2078 2079 for (i = 0; i < nr_objects; i++) { 2080 void *objp = objpp[i]; 2081 struct slab *slabp; 2082 unsigned int objnr; 2083 2084 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2085 list_del(&slabp->list); 2086 objnr = (objp - slabp->s_mem) / cachep->objsize; 2087 check_slabp(cachep, slabp); 2088 #if DEBUG 2089 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2090 printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n", 2091 cachep->name, objp); 2092 BUG(); 2093 } 2094 #endif 2095 slab_bufctl(slabp)[objnr] = slabp->free; 2096 slabp->free = objnr; 2097 STATS_DEC_ACTIVE(cachep); 2098 slabp->inuse--; 2099 check_slabp(cachep, slabp); 2100 2101 /* fixup slab chains */ 2102 if (slabp->inuse == 0) { 2103 if (cachep->lists.free_objects > cachep->free_limit) { 2104 cachep->lists.free_objects -= cachep->num; 2105 slab_destroy(cachep, slabp); 2106 } else { 2107 list_add(&slabp->list, 2108 &list3_data_ptr(cachep, objp)->slabs_free); 2109 } 2110 } else { 2111 /* Unconditionally move a slab to the end of the 2112 * partial list on free - maximum time for the 2113 * other objects to be freed, too. 
2114 */ 2115 list_add_tail(&slabp->list, 2116 &list3_data_ptr(cachep, objp)->slabs_partial); 2117 } 2118 } 2119 } 2120 2121 static void cache_flusharray (kmem_cache_t* cachep, struct array_cache *ac) 2122 { 2123 int batchcount; 2124 2125 batchcount = ac->batchcount; 2126 #if DEBUG 2127 BUG_ON(!batchcount || batchcount > ac->avail); 2128 #endif 2129 check_irq_off(); 2130 spin_lock(&cachep->spinlock); 2131 if (cachep->lists.shared) { 2132 struct array_cache *shared_array = cachep->lists.shared; 2133 int max = shared_array->limit-shared_array->avail; 2134 if (max) { 2135 if (batchcount > max) 2136 batchcount = max; 2137 memcpy(&ac_entry(shared_array)[shared_array->avail], 2138 &ac_entry(ac)[0], 2139 sizeof(void*)*batchcount); 2140 shared_array->avail += batchcount; 2141 goto free_done; 2142 } 2143 } 2144 2145 free_block(cachep, &ac_entry(ac)[0], batchcount); 2146 free_done: 2147 #if STATS 2148 { 2149 int i = 0; 2150 struct list_head *p; 2151 2152 p = list3_data(cachep)->slabs_free.next; 2153 while (p != &(list3_data(cachep)->slabs_free)) { 2154 struct slab *slabp; 2155 2156 slabp = list_entry(p, struct slab, list); 2157 BUG_ON(slabp->inuse); 2158 2159 i++; 2160 p = p->next; 2161 } 2162 STATS_SET_FREEABLE(cachep, i); 2163 } 2164 #endif 2165 spin_unlock(&cachep->spinlock); 2166 ac->avail -= batchcount; 2167 memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], 2168 sizeof(void*)*ac->avail); 2169 } 2170 2171 /* 2172 * __cache_free 2173 * Release an obj back to its cache. If the obj has a constructed 2174 * state, it must be in this state _before_ it is released. 2175 * 2176 * Called with disabled ints. 
2177 */ 2178 static inline void __cache_free (kmem_cache_t *cachep, void* objp) 2179 { 2180 struct array_cache *ac = ac_data(cachep); 2181 2182 check_irq_off(); 2183 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2184 2185 if (likely(ac->avail < ac->limit)) { 2186 STATS_INC_FREEHIT(cachep); 2187 ac_entry(ac)[ac->avail++] = objp; 2188 return; 2189 } else { 2190 STATS_INC_FREEMISS(cachep); 2191 cache_flusharray(cachep, ac); 2192 ac_entry(ac)[ac->avail++] = objp; 2193 } 2194 } 2195 2196 /** 2197 * kmem_cache_alloc - Allocate an object 2198 * @cachep: The cache to allocate from. 2199 * @flags: See kmalloc(). 2200 * 2201 * Allocate an object from this cache. The flags are only relevant 2202 * if the cache has no available objects. 2203 */ 2204 void * kmem_cache_alloc (kmem_cache_t *cachep, int flags) 2205 { 2206 return __cache_alloc(cachep, flags); 2207 } 2208 2209 EXPORT_SYMBOL(kmem_cache_alloc); 2210 2211 /** 2212 * kmem_ptr_validate - check if an untrusted pointer might 2213 * be a slab entry. 2214 * @cachep: the cache we're checking against 2215 * @ptr: pointer to validate 2216 * 2217 * This verifies that the untrusted pointer looks sane: 2218 * it is _not_ a guarantee that the pointer is actually 2219 * part of the slab cache in question, but it at least 2220 * validates that the pointer can be dereferenced and 2221 * looks half-way sane. 2222 * 2223 * Currently only used for dentry validation. 
2224 */ 2225 int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2226 { 2227 unsigned long addr = (unsigned long) ptr; 2228 unsigned long min_addr = PAGE_OFFSET; 2229 unsigned long align_mask = BYTES_PER_WORD-1; 2230 unsigned long size = cachep->objsize; 2231 struct page *page; 2232 2233 if (unlikely(addr < min_addr)) 2234 goto out; 2235 if (unlikely(addr > (unsigned long)high_memory - size)) 2236 goto out; 2237 if (unlikely(addr & align_mask)) 2238 goto out; 2239 if (unlikely(!kern_addr_valid(addr))) 2240 goto out; 2241 if (unlikely(!kern_addr_valid(addr + size - 1))) 2242 goto out; 2243 page = virt_to_page(ptr); 2244 if (unlikely(!PageSlab(page))) 2245 goto out; 2246 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2247 goto out; 2248 return 1; 2249 out: 2250 return 0; 2251 } 2252 2253 /** 2254 * kmem_cache_alloc_node - Allocate an object on the specified node 2255 * @cachep: The cache to allocate from. 2256 * @flags: See kmalloc(). 2257 * @nodeid: node number of the target node. 2258 * 2259 * Identical to kmem_cache_alloc, except that this function is slow 2260 * and can sleep. And it will allocate memory on the given node, which 2261 * can improve the performance for cpu bound structures. 2262 */ 2263 void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) 2264 { 2265 size_t offset; 2266 void *objp; 2267 struct slab *slabp; 2268 kmem_bufctl_t next; 2269 2270 /* The main algorithms are not node aware, thus we have to cheat: 2271 * We bypass all caches and allocate a new slab. 2272 * The following code is a streamlined copy of cache_grow(). 2273 */ 2274 2275 /* Get colour for the slab, and update the next value. */ 2276 spin_lock_irq(&cachep->spinlock); 2277 offset = cachep->colour_next; 2278 cachep->colour_next++; 2279 if (cachep->colour_next >= cachep->colour) 2280 cachep->colour_next = 0; 2281 offset *= cachep->colour_off; 2282 spin_unlock_irq(&cachep->spinlock); 2283 2284 /* Get mem for the objs. 
*/ 2285 if (!(objp = kmem_getpages(cachep, GFP_KERNEL, nodeid))) 2286 goto failed; 2287 2288 /* Get slab management. */ 2289 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, GFP_KERNEL))) 2290 goto opps1; 2291 2292 set_slab_attr(cachep, slabp, objp); 2293 cache_init_objs(cachep, slabp, SLAB_CTOR_CONSTRUCTOR); 2294 2295 /* The first object is ours: */ 2296 objp = slabp->s_mem + slabp->free*cachep->objsize; 2297 slabp->inuse++; 2298 next = slab_bufctl(slabp)[slabp->free]; 2299 #if DEBUG 2300 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2301 #endif 2302 slabp->free = next; 2303 2304 /* add the remaining objects into the cache */ 2305 spin_lock_irq(&cachep->spinlock); 2306 check_slabp(cachep, slabp); 2307 STATS_INC_GROWN(cachep); 2308 /* Make slab active. */ 2309 if (slabp->free == BUFCTL_END) { 2310 list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_full)); 2311 } else { 2312 list_add_tail(&slabp->list, 2313 &(list3_data(cachep)->slabs_partial)); 2314 list3_data(cachep)->free_objects += cachep->num-1; 2315 } 2316 spin_unlock_irq(&cachep->spinlock); 2317 objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp, 2318 __builtin_return_address(0)); 2319 return objp; 2320 opps1: 2321 kmem_freepages(cachep, objp); 2322 failed: 2323 return NULL; 2324 2325 } 2326 EXPORT_SYMBOL(kmem_cache_alloc_node); 2327 2328 /** 2329 * kmalloc - allocate memory 2330 * @size: how many bytes of memory are required. 2331 * @flags: the type of memory to allocate. 2332 * 2333 * kmalloc is the normal method of allocating memory 2334 * in the kernel. 2335 * 2336 * The @flags argument may be one of: 2337 * 2338 * %GFP_USER - Allocate memory on behalf of user. May sleep. 2339 * 2340 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 2341 * 2342 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. 2343 * 2344 * Additionally, the %GFP_DMA flag may be set to indicate the memory 2345 * must be suitable for DMA. 
This can mean different things on different 2346 * platforms. For example, on i386, it means that the memory must come 2347 * from the first 16MB. 2348 */ 2349 void * __kmalloc (size_t size, int flags) 2350 { 2351 struct cache_sizes *csizep = malloc_sizes; 2352 2353 for (; csizep->cs_size; csizep++) { 2354 if (size > csizep->cs_size) 2355 continue; 2356 #if DEBUG 2357 /* This happens if someone tries to call 2358 * kmem_cache_create(), or kmalloc(), before 2359 * the generic caches are initialized. 2360 */ 2361 BUG_ON(csizep->cs_cachep == NULL); 2362 #endif 2363 return __cache_alloc(flags & GFP_DMA ? 2364 csizep->cs_dmacachep : csizep->cs_cachep, flags); 2365 } 2366 return NULL; 2367 } 2368 2369 EXPORT_SYMBOL(__kmalloc); 2370 2371 #ifdef CONFIG_SMP 2372 /** 2373 * __alloc_percpu - allocate one copy of the object for every present 2374 * cpu in the system, zeroing them. 2375 * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr 2376 * macros only. 2377 * 2378 * @size: how many bytes of memory are required. 2379 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. 
2380 */ 2381 void *__alloc_percpu(size_t size, size_t align) 2382 { 2383 int i; 2384 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2385 2386 if (!pdata) 2387 return NULL; 2388 2389 for (i = 0; i < NR_CPUS; i++) { 2390 if (!cpu_possible(i)) 2391 continue; 2392 pdata->ptrs[i] = kmem_cache_alloc_node( 2393 kmem_find_general_cachep(size, GFP_KERNEL), 2394 cpu_to_node(i)); 2395 2396 if (!pdata->ptrs[i]) 2397 goto unwind_oom; 2398 memset(pdata->ptrs[i], 0, size); 2399 } 2400 2401 /* Catch derefs w/o wrappers */ 2402 return (void *) (~(unsigned long) pdata); 2403 2404 unwind_oom: 2405 while (--i >= 0) { 2406 if (!cpu_possible(i)) 2407 continue; 2408 kfree(pdata->ptrs[i]); 2409 } 2410 kfree(pdata); 2411 return NULL; 2412 } 2413 2414 EXPORT_SYMBOL(__alloc_percpu); 2415 #endif 2416 2417 /** 2418 * kmem_cache_free - Deallocate an object 2419 * @cachep: The cache the allocation was from. 2420 * @objp: The previously allocated object. 2421 * 2422 * Free an object which was previously allocated from this 2423 * cache. 2424 */ 2425 void kmem_cache_free (kmem_cache_t *cachep, void *objp) 2426 { 2427 unsigned long flags; 2428 2429 local_irq_save(flags); 2430 __cache_free(cachep, objp); 2431 local_irq_restore(flags); 2432 } 2433 2434 EXPORT_SYMBOL(kmem_cache_free); 2435 2436 /** 2437 * kfree - free previously allocated memory 2438 * @objp: pointer returned by kmalloc. 2439 * 2440 * Don't free memory not originally allocated by kmalloc() 2441 * or you will run into trouble. 2442 */ 2443 void kfree (const void *objp) 2444 { 2445 kmem_cache_t *c; 2446 unsigned long flags; 2447 2448 if (!objp) 2449 return; 2450 local_irq_save(flags); 2451 kfree_debugcheck(objp); 2452 c = GET_PAGE_CACHE(virt_to_page(objp)); 2453 __cache_free(c, (void*)objp); 2454 local_irq_restore(flags); 2455 } 2456 2457 EXPORT_SYMBOL(kfree); 2458 2459 #ifdef CONFIG_SMP 2460 /** 2461 * free_percpu - free previously allocated percpu memory 2462 * @objp: pointer returned by alloc_percpu. 
2463 * 2464 * Don't free memory not originally allocated by alloc_percpu() 2465 * The complemented objp is to check for that. 2466 */ 2467 void 2468 free_percpu(const void *objp) 2469 { 2470 int i; 2471 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 2472 2473 for (i = 0; i < NR_CPUS; i++) { 2474 if (!cpu_possible(i)) 2475 continue; 2476 kfree(p->ptrs[i]); 2477 } 2478 } 2479 2480 EXPORT_SYMBOL(free_percpu); 2481 #endif 2482 2483 unsigned int kmem_cache_size(kmem_cache_t *cachep) 2484 { 2485 return obj_reallen(cachep); 2486 } 2487 2488 EXPORT_SYMBOL(kmem_cache_size); 2489 2490 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags) 2491 { 2492 struct cache_sizes *csizep = malloc_sizes; 2493 2494 /* This function could be moved to the header file, and 2495 * made inline so consumers can quickly determine what 2496 * cache pointer they require. 2497 */ 2498 for ( ; csizep->cs_size; csizep++) { 2499 if (size > csizep->cs_size) 2500 continue; 2501 break; 2502 } 2503 return (gfpflags & GFP_DMA) ? 
csizep->cs_dmacachep : csizep->cs_cachep; 2504 } 2505 2506 EXPORT_SYMBOL(kmem_find_general_cachep); 2507 2508 struct ccupdate_struct { 2509 kmem_cache_t *cachep; 2510 struct array_cache *new[NR_CPUS]; 2511 }; 2512 2513 static void do_ccupdate_local(void *info) 2514 { 2515 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 2516 struct array_cache *old; 2517 2518 check_irq_off(); 2519 old = ac_data(new->cachep); 2520 2521 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 2522 new->new[smp_processor_id()] = old; 2523 } 2524 2525 2526 static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared) 2527 { 2528 struct ccupdate_struct new; 2529 struct array_cache *new_shared; 2530 int i; 2531 2532 memset(&new.new,0,sizeof(new.new)); 2533 for (i = 0; i < NR_CPUS; i++) { 2534 if (cpu_online(i)) { 2535 new.new[i] = alloc_arraycache(i, limit, batchcount); 2536 if (!new.new[i]) { 2537 for (i--; i >= 0; i--) kfree(new.new[i]); 2538 return -ENOMEM; 2539 } 2540 } else { 2541 new.new[i] = NULL; 2542 } 2543 } 2544 new.cachep = cachep; 2545 2546 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 2547 2548 check_irq_on(); 2549 spin_lock_irq(&cachep->spinlock); 2550 cachep->batchcount = batchcount; 2551 cachep->limit = limit; 2552 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; 2553 spin_unlock_irq(&cachep->spinlock); 2554 2555 for (i = 0; i < NR_CPUS; i++) { 2556 struct array_cache *ccold = new.new[i]; 2557 if (!ccold) 2558 continue; 2559 spin_lock_irq(&cachep->spinlock); 2560 free_block(cachep, ac_entry(ccold), ccold->avail); 2561 spin_unlock_irq(&cachep->spinlock); 2562 kfree(ccold); 2563 } 2564 new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d); 2565 if (new_shared) { 2566 struct array_cache *old; 2567 2568 spin_lock_irq(&cachep->spinlock); 2569 old = cachep->lists.shared; 2570 cachep->lists.shared = new_shared; 2571 if (old) 2572 free_block(cachep, ac_entry(old), 
old->avail); 2573 spin_unlock_irq(&cachep->spinlock); 2574 kfree(old); 2575 } 2576 2577 return 0; 2578 } 2579 2580 2581 static void enable_cpucache (kmem_cache_t *cachep) 2582 { 2583 int err; 2584 int limit, shared; 2585 2586 /* The head array serves three purposes: 2587 * - create a LIFO ordering, i.e. return objects that are cache-warm 2588 * - reduce the number of spinlock operations. 2589 * - reduce the number of linked list operations on the slab and 2590 * bufctl chains: array operations are cheaper. 2591 * The numbers are guessed, we should auto-tune as described by 2592 * Bonwick. 2593 */ 2594 if (cachep->objsize > 131072) 2595 limit = 1; 2596 else if (cachep->objsize > PAGE_SIZE) 2597 limit = 8; 2598 else if (cachep->objsize > 1024) 2599 limit = 24; 2600 else if (cachep->objsize > 256) 2601 limit = 54; 2602 else 2603 limit = 120; 2604 2605 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 2606 * allocation behaviour: Most allocs on one cpu, most free operations 2607 * on another cpu. For these cases, an efficient object passing between 2608 * cpus is necessary. This is provided by a shared array. The array 2609 * replaces Bonwick's magazine layer. 2610 * On uniprocessor, it's functionally equivalent (but less efficient) 2611 * to a larger limit. Thus disabled by default. 2612 */ 2613 shared = 0; 2614 #ifdef CONFIG_SMP 2615 if (cachep->objsize <= PAGE_SIZE) 2616 shared = 8; 2617 #endif 2618 2619 #if DEBUG 2620 /* With debugging enabled, large batchcount lead to excessively 2621 * long periods with disabled local interrupts. 
Limit the 2622 * batchcount 2623 */ 2624 if (limit > 32) 2625 limit = 32; 2626 #endif 2627 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 2628 if (err) 2629 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 2630 cachep->name, -err); 2631 } 2632 2633 static void drain_array(kmem_cache_t *cachep, struct array_cache *ac) 2634 { 2635 int tofree; 2636 2637 check_irq_off(); 2638 if (ac->touched) { 2639 ac->touched = 0; 2640 } else if (ac->avail) { 2641 tofree = (ac->limit+4)/5; 2642 if (tofree > ac->avail) { 2643 tofree = (ac->avail+1)/2; 2644 } 2645 spin_lock(&cachep->spinlock); 2646 free_block(cachep, ac_entry(ac), tofree); 2647 spin_unlock(&cachep->spinlock); 2648 ac->avail -= tofree; 2649 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree], 2650 sizeof(void*)*ac->avail); 2651 } 2652 } 2653 2654 static void drain_array_locked(kmem_cache_t *cachep, 2655 struct array_cache *ac, int force) 2656 { 2657 int tofree; 2658 2659 check_spinlock_acquired(cachep); 2660 if (ac->touched && !force) { 2661 ac->touched = 0; 2662 } else if (ac->avail) { 2663 tofree = force ? ac->avail : (ac->limit+4)/5; 2664 if (tofree > ac->avail) { 2665 tofree = (ac->avail+1)/2; 2666 } 2667 free_block(cachep, ac_entry(ac), tofree); 2668 ac->avail -= tofree; 2669 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree], 2670 sizeof(void*)*ac->avail); 2671 } 2672 } 2673 2674 /** 2675 * cache_reap - Reclaim memory from caches. 2676 * 2677 * Called from a timer, every few seconds 2678 * Purpose: 2679 * - clear the per-cpu caches for this CPU. 2680 * - return freeable pages to the main free memory pool. 2681 * 2682 * If we cannot acquire the cache chain semaphore then just give up - we'll 2683 * try again next timer interrupt. 
2684 */ 2685 static void cache_reap (void) 2686 { 2687 struct list_head *walk; 2688 2689 #if DEBUG 2690 BUG_ON(!in_interrupt()); 2691 BUG_ON(in_irq()); 2692 #endif 2693 if (down_trylock(&cache_chain_sem)) 2694 return; 2695 2696 list_for_each(walk, &cache_chain) { 2697 kmem_cache_t *searchp; 2698 struct list_head* p; 2699 int tofree; 2700 struct slab *slabp; 2701 2702 searchp = list_entry(walk, kmem_cache_t, next); 2703 2704 if (searchp->flags & SLAB_NO_REAP) 2705 goto next; 2706 2707 check_irq_on(); 2708 local_irq_disable(); 2709 drain_array(searchp, ac_data(searchp)); 2710 2711 if(time_after(searchp->lists.next_reap, jiffies)) 2712 goto next_irqon; 2713 2714 spin_lock(&searchp->spinlock); 2715 if(time_after(searchp->lists.next_reap, jiffies)) { 2716 goto next_unlock; 2717 } 2718 searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; 2719 2720 if (searchp->lists.shared) 2721 drain_array_locked(searchp, searchp->lists.shared, 0); 2722 2723 if (searchp->lists.free_touched) { 2724 searchp->lists.free_touched = 0; 2725 goto next_unlock; 2726 } 2727 2728 tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num); 2729 do { 2730 p = list3_data(searchp)->slabs_free.next; 2731 if (p == &(list3_data(searchp)->slabs_free)) 2732 break; 2733 2734 slabp = list_entry(p, struct slab, list); 2735 BUG_ON(slabp->inuse); 2736 list_del(&slabp->list); 2737 STATS_INC_REAPED(searchp); 2738 2739 /* Safe to drop the lock. The slab is no longer 2740 * linked to the cache. 2741 * searchp cannot disappear, we hold 2742 * cache_chain_lock 2743 */ 2744 searchp->lists.free_objects -= searchp->num; 2745 spin_unlock_irq(&searchp->spinlock); 2746 slab_destroy(searchp, slabp); 2747 spin_lock_irq(&searchp->spinlock); 2748 } while(--tofree > 0); 2749 next_unlock: 2750 spin_unlock(&searchp->spinlock); 2751 next_irqon: 2752 local_irq_enable(); 2753 next: 2754 ; 2755 } 2756 check_irq_on(); 2757 up(&cache_chain_sem); 2758 } 2759 2760 /* 2761 * This is a timer handler. There is one per CPU. 
It is called periodially 2762 * to shrink this CPU's caches. Otherwise there could be memory tied up 2763 * for long periods (or for ever) due to load changes. 2764 */ 2765 static void reap_timer_fnc(unsigned long cpu) 2766 { 2767 struct timer_list *rt = &__get_cpu_var(reap_timers); 2768 2769 /* CPU hotplug can drag us off cpu: don't run on wrong CPU */ 2770 if (!cpu_is_offline(cpu)) { 2771 cache_reap(); 2772 mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu); 2773 } 2774 } 2775 2776 #ifdef CONFIG_PROC_FS 2777 2778 static void *s_start(struct seq_file *m, loff_t *pos) 2779 { 2780 loff_t n = *pos; 2781 struct list_head *p; 2782 2783 down(&cache_chain_sem); 2784 if (!n) { 2785 /* 2786 * Output format version, so at least we can change it 2787 * without _too_ many complaints. 2788 */ 2789 #if STATS 2790 seq_puts(m, "slabinfo - version: 2.0 (statistics)\n"); 2791 #else 2792 seq_puts(m, "slabinfo - version: 2.0\n"); 2793 #endif 2794 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 2795 seq_puts(m, " : tunables <batchcount> <limit> <sharedfactor>"); 2796 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 2797 #if STATS 2798 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <freelimit>"); 2799 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 2800 #endif 2801 seq_putc(m, '\n'); 2802 } 2803 p = cache_chain.next; 2804 while (n--) { 2805 p = p->next; 2806 if (p == &cache_chain) 2807 return NULL; 2808 } 2809 return list_entry(p, kmem_cache_t, next); 2810 } 2811 2812 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2813 { 2814 kmem_cache_t *cachep = p; 2815 ++*pos; 2816 return cachep->next.next == &cache_chain ? 
NULL 2817 : list_entry(cachep->next.next, kmem_cache_t, next); 2818 } 2819 2820 static void s_stop(struct seq_file *m, void *p) 2821 { 2822 up(&cache_chain_sem); 2823 } 2824 2825 static int s_show(struct seq_file *m, void *p) 2826 { 2827 kmem_cache_t *cachep = p; 2828 struct list_head *q; 2829 struct slab *slabp; 2830 unsigned long active_objs; 2831 unsigned long num_objs; 2832 unsigned long active_slabs = 0; 2833 unsigned long num_slabs; 2834 const char *name; 2835 char *error = NULL; 2836 2837 check_irq_on(); 2838 spin_lock_irq(&cachep->spinlock); 2839 active_objs = 0; 2840 num_slabs = 0; 2841 list_for_each(q,&cachep->lists.slabs_full) { 2842 slabp = list_entry(q, struct slab, list); 2843 if (slabp->inuse != cachep->num && !error) 2844 error = "slabs_full accounting error"; 2845 active_objs += cachep->num; 2846 active_slabs++; 2847 } 2848 list_for_each(q,&cachep->lists.slabs_partial) { 2849 slabp = list_entry(q, struct slab, list); 2850 if (slabp->inuse == cachep->num && !error) 2851 error = "slabs_partial inuse accounting error"; 2852 if (!slabp->inuse && !error) 2853 error = "slabs_partial/inuse accounting error"; 2854 active_objs += slabp->inuse; 2855 active_slabs++; 2856 } 2857 list_for_each(q,&cachep->lists.slabs_free) { 2858 slabp = list_entry(q, struct slab, list); 2859 if (slabp->inuse && !error) 2860 error = "slabs_free/inuse accounting error"; 2861 num_slabs++; 2862 } 2863 num_slabs+=active_slabs; 2864 num_objs = num_slabs*cachep->num; 2865 if (num_objs - active_objs != cachep->lists.free_objects && !error) 2866 error = "free_objects accounting error"; 2867 2868 name = cachep->name; 2869 if (error) 2870 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 2871 2872 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 2873 name, active_objs, num_objs, cachep->objsize, 2874 cachep->num, (1<<cachep->gfporder)); 2875 seq_printf(m, " : tunables %4u %4u %4u", 2876 cachep->limit, cachep->batchcount, 2877 cachep->lists.shared->limit/cachep->batchcount); 2878 
seq_printf(m, " : slabdata %6lu %6lu %6u", 2879 active_slabs, num_slabs, cachep->lists.shared->avail); 2880 #if STATS 2881 { /* list3 stats */ 2882 unsigned long high = cachep->high_mark; 2883 unsigned long allocs = cachep->num_allocations; 2884 unsigned long grown = cachep->grown; 2885 unsigned long reaped = cachep->reaped; 2886 unsigned long errors = cachep->errors; 2887 unsigned long max_freeable = cachep->max_freeable; 2888 unsigned long free_limit = cachep->free_limit; 2889 2890 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu", 2891 allocs, high, grown, reaped, errors, 2892 max_freeable, free_limit); 2893 } 2894 /* cpu stats */ 2895 { 2896 unsigned long allochit = atomic_read(&cachep->allochit); 2897 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 2898 unsigned long freehit = atomic_read(&cachep->freehit); 2899 unsigned long freemiss = atomic_read(&cachep->freemiss); 2900 2901 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 2902 allochit, allocmiss, freehit, freemiss); 2903 } 2904 #endif 2905 seq_putc(m, '\n'); 2906 spin_unlock_irq(&cachep->spinlock); 2907 return 0; 2908 } 2909 2910 /* 2911 * slabinfo_op - iterator that generates /proc/slabinfo 2912 * 2913 * Output layout: 2914 * cache-name 2915 * num-active-objs 2916 * total-objs 2917 * object size 2918 * num-active-slabs 2919 * total-slabs 2920 * num-pages-per-slab 2921 * + further values on SMP and with statistics enabled 2922 */ 2923 2924 struct seq_operations slabinfo_op = { 2925 .start = s_start, 2926 .next = s_next, 2927 .stop = s_stop, 2928 .show = s_show, 2929 }; 2930 2931 #define MAX_SLABINFO_WRITE 128 2932 /** 2933 * slabinfo_write - Tuning for the slab allocator 2934 * @file: unused 2935 * @buffer: user buffer 2936 * @count: data length 2937 * @ppos: unused 2938 */ 2939 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 2940 size_t count, loff_t *ppos) 2941 { 2942 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 2943 int limit, batchcount, shared, res; 2944 struct 
list_head *p; 2945 2946 if (count > MAX_SLABINFO_WRITE) 2947 return -EINVAL; 2948 if (copy_from_user(&kbuf, buffer, count)) 2949 return -EFAULT; 2950 kbuf[MAX_SLABINFO_WRITE] = '\0'; 2951 2952 tmp = strchr(kbuf, ' '); 2953 if (!tmp) 2954 return -EINVAL; 2955 *tmp = '\0'; 2956 tmp++; 2957 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 2958 return -EINVAL; 2959 2960 /* Find the cache in the chain of caches. */ 2961 down(&cache_chain_sem); 2962 res = -EINVAL; 2963 list_for_each(p,&cache_chain) { 2964 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 2965 2966 if (!strcmp(cachep->name, kbuf)) { 2967 if (limit < 1 || 2968 batchcount < 1 || 2969 batchcount > limit || 2970 shared < 0) { 2971 res = -EINVAL; 2972 } else { 2973 res = do_tune_cpucache(cachep, limit, batchcount, shared); 2974 } 2975 break; 2976 } 2977 } 2978 up(&cache_chain_sem); 2979 if (res >= 0) 2980 res = count; 2981 return res; 2982 } 2983 #endif 2984 2985 unsigned int ksize(const void *objp) 2986 { 2987 kmem_cache_t *c; 2988 unsigned long flags; 2989 unsigned int size = 0; 2990 2991 if (likely(objp != NULL)) { 2992 local_irq_save(flags); 2993 c = GET_PAGE_CACHE(virt_to_page(objp)); 2994 size = kmem_cache_size(c); 2995 local_irq_restore(flags); 2996 } 2997 2998 return size; 2999 } 3000 3001 void ptrinfo(unsigned long addr) 3002 { 3003 struct page *page; 3004 3005 printk("Dumping data about address %p.\n", (void*)addr); 3006 if (!virt_addr_valid((void*)addr)) { 3007 printk("virt addr invalid.\n"); 3008 return; 3009 } 3010 #ifdef CONFIG_MMU 3011 do { 3012 pgd_t *pgd = pgd_offset_k(addr); 3013 pmd_t *pmd; 3014 if (pgd_none(*pgd)) { 3015 printk("No pgd.\n"); 3016 break; 3017 } 3018 pmd = pmd_offset(pgd, addr); 3019 if (pmd_none(*pmd)) { 3020 printk("No pmd.\n"); 3021 break; 3022 } 3023 #ifdef CONFIG_X86 3024 if (pmd_large(*pmd)) { 3025 printk("Large page.\n"); 3026 break; 3027 } 3028 #endif 3029 printk("normal page, pte_val 0x%llx\n", 3030 (unsigned long 
long)pte_val(*pte_offset_kernel(pmd, addr))); 3031 } while(0); 3032 #endif 3033 3034 page = virt_to_page((void*)addr); 3035 printk("struct page at %p, flags %08lx\n", 3036 page, (unsigned long)page->flags); 3037 if (PageSlab(page)) { 3038 kmem_cache_t *c; 3039 struct slab *s; 3040 unsigned long flags; 3041 int objnr; 3042 void *objp; 3043 3044 c = GET_PAGE_CACHE(page); 3045 printk("belongs to cache %s.\n",c->name); 3046 3047 spin_lock_irqsave(&c->spinlock, flags); 3048 s = GET_PAGE_SLAB(page); 3049 printk("slabp %p with %d inuse objects (from %d).\n", 3050 s, s->inuse, c->num); 3051 check_slabp(c,s); 3052 3053 objnr = (addr-(unsigned long)s->s_mem)/c->objsize; 3054 objp = s->s_mem+c->objsize*objnr; 3055 printk("points into object no %d, starting at %p, len %d.\n", 3056 objnr, objp, c->objsize); 3057 if (objnr >= c->num) { 3058 printk("Bad obj number.\n"); 3059 } else { 3060 kernel_map_pages(virt_to_page(objp), 3061 c->objsize/PAGE_SIZE, 1); 3062 3063 print_objinfo(c, objp, 2); 3064 } 3065 spin_unlock_irqrestore(&c->spinlock, flags); 3066 3067 } 3068 } 3069
This page was automatically generated by LXR 0.3.1. • Linux is a registered trademark of Linus Torvalds