1 /* 2 * Simple NUMA memory policy for the Linux kernel. 3 * 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 5 * Subject to the GNU Public License, version 2. 6 * 7 * NUMA policy allows the user to give hints in which node(s) memory should 8 * be allocated. 9 * 10 * Support four policies per VMA and per process: 11 * 12 * The VMA policy has priority over the process policy for a page fault. 13 * 14 * interleave Allocate memory interleaved over a set of nodes, 15 * with normal fallback if it fails. 16 * For VMA based allocations this interleaves based on the 17 * offset into the backing object or offset into the mapping 18 * for anonymous memory. For process policy an process counter 19 * is used. 20 * bind Only allocate memory on a specific set of nodes, 21 * no fallback. 22 * preferred Try a specific node first before normal fallback. 23 * As a special case node -1 here means do the allocation 24 * on the local CPU. This is normally identical to default, 25 * but useful to set in a VMA when you have a non default 26 * process policy. 27 * default Allocate on the local node first, or when on a VMA 28 * use the process policy. This is what Linux always did 29 * in a NUMA aware kernel and still does by, ahem, default. 30 * 31 * The process policy is applied for most non interrupt memory allocations 32 * in that process' context. Interrupts ignore the policies and always 33 * try to allocate on the local CPU. The VMA policy is only applied for memory 34 * allocations for a VMA in the VM. 35 * 36 * Currently there are a few corner cases in swapping where the policy 37 * is not applied, but the majority should be handled. When process policy 38 * is used it is not remembered over swap outs/swap ins. 39 * 40 * Only the highest zone in the zone hierarchy gets policied. Allocations 41 * requesting a lower zone just use default policy. This implies that 42 * on systems with highmem kernel lowmem allocation don't get policied. 43 * Same with GFP_DMA allocations. 
44 * 45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 46 * all users and remembered even when nobody has memory mapped. 47 */ 48 49 /* Notebook: 50 fix mmap readahead to honour policy and enable policy for any page cache 51 object 52 statistics for bigpages 53 global policy for page cache? currently it uses process policy. Requires 54 first item above. 55 handle mremap for shared memory (currently ignored for the policy) 56 grows down? 57 make bind policy root only? It can trigger oom much faster and the 58 kernel is not always grateful with that. 59 could replace all the switch()es with a mempolicy_ops structure. 60 */ 61 62 #include <linux/mempolicy.h> 63 #include <linux/mm.h> 64 #include <linux/highmem.h> 65 #include <linux/hugetlb.h> 66 #include <linux/kernel.h> 67 #include <linux/sched.h> 68 #include <linux/mm.h> 69 #include <linux/gfp.h> 70 #include <linux/slab.h> 71 #include <linux/string.h> 72 #include <linux/module.h> 73 #include <linux/interrupt.h> 74 #include <linux/init.h> 75 #include <linux/compat.h> 76 #include <linux/mempolicy.h> 77 #include <asm/uaccess.h> 78 79 static kmem_cache_t *policy_cache; 80 static kmem_cache_t *sn_cache; 81 82 #define PDprintk(fmt...) 83 84 /* Highest zone. An specific allocation for a zone below that is not 85 policied. 
*/ 86 static int policy_zone; 87 88 static struct mempolicy default_policy = { 89 .refcnt = ATOMIC_INIT(1), /* never free it */ 90 .policy = MPOL_DEFAULT, 91 }; 92 93 /* Check if all specified nodes are online */ 94 static int nodes_online(unsigned long *nodes) 95 { 96 DECLARE_BITMAP(online2, MAX_NUMNODES); 97 98 bitmap_copy(online2, node_online_map, MAX_NUMNODES); 99 if (bitmap_empty(online2, MAX_NUMNODES)) 100 set_bit(0, online2); 101 if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) 102 return -EINVAL; 103 return 0; 104 } 105 106 /* Do sanity checking on a policy */ 107 static int mpol_check_policy(int mode, unsigned long *nodes) 108 { 109 int empty = bitmap_empty(nodes, MAX_NUMNODES); 110 111 switch (mode) { 112 case MPOL_DEFAULT: 113 if (!empty) 114 return -EINVAL; 115 break; 116 case MPOL_BIND: 117 case MPOL_INTERLEAVE: 118 /* Preferred will only use the first bit, but allow 119 more for now. */ 120 if (empty) 121 return -EINVAL; 122 break; 123 } 124 return nodes_online(nodes); 125 } 126 127 /* Copy a node mask from user space. */ 128 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, 129 unsigned long maxnode, int mode) 130 { 131 unsigned long k; 132 unsigned long nlongs; 133 unsigned long endmask; 134 135 --maxnode; 136 bitmap_zero(nodes, MAX_NUMNODES); 137 if (maxnode == 0 || !nmask) 138 return 0; 139 140 nlongs = BITS_TO_LONGS(maxnode); 141 if ((maxnode % BITS_PER_LONG) == 0) 142 endmask = ~0UL; 143 else 144 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 145 146 /* When the user specified more nodes than supported just check 147 if the non supported part is all zero. 
*/ 148 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 149 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 150 unsigned long t; 151 if (get_user(t, nmask + k)) 152 return -EFAULT; 153 if (k == nlongs - 1) { 154 if (t & endmask) 155 return -EINVAL; 156 } else if (t) 157 return -EINVAL; 158 } 159 nlongs = BITS_TO_LONGS(MAX_NUMNODES); 160 endmask = ~0UL; 161 } 162 163 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) 164 return -EFAULT; 165 nodes[nlongs-1] &= endmask; 166 return mpol_check_policy(mode, nodes); 167 } 168 169 /* Generate a custom zonelist for the BIND policy. */ 170 static struct zonelist *bind_zonelist(unsigned long *nodes) 171 { 172 struct zonelist *zl; 173 int num, max, nd; 174 175 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); 176 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); 177 if (!zl) 178 return NULL; 179 num = 0; 180 for (nd = find_first_bit(nodes, MAX_NUMNODES); 181 nd < MAX_NUMNODES; 182 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { 183 int k; 184 for (k = MAX_NR_ZONES-1; k >= 0; k--) { 185 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 186 if (!z->present_pages) 187 continue; 188 zl->zones[num++] = z; 189 if (k > policy_zone) 190 policy_zone = k; 191 } 192 } 193 BUG_ON(num >= max); 194 zl->zones[num] = NULL; 195 return zl; 196 } 197 198 /* Create a new policy */ 199 static struct mempolicy *mpol_new(int mode, unsigned long *nodes) 200 { 201 struct mempolicy *policy; 202 203 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); 204 if (mode == MPOL_DEFAULT) 205 return NULL; 206 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 207 if (!policy) 208 return ERR_PTR(-ENOMEM); 209 atomic_set(&policy->refcnt, 1); 210 switch (mode) { 211 case MPOL_INTERLEAVE: 212 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); 213 break; 214 case MPOL_PREFERRED: 215 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); 216 if (policy->v.preferred_node >= MAX_NUMNODES) 217 policy->v.preferred_node = -1; 218 
break; 219 case MPOL_BIND: 220 policy->v.zonelist = bind_zonelist(nodes); 221 if (policy->v.zonelist == NULL) { 222 kmem_cache_free(policy_cache, policy); 223 return ERR_PTR(-ENOMEM); 224 } 225 break; 226 } 227 policy->policy = mode; 228 return policy; 229 } 230 231 /* Ensure all existing pages follow the policy. */ 232 static int 233 verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes) 234 { 235 while (addr < end) { 236 struct page *p; 237 pte_t *pte; 238 pmd_t *pmd; 239 pgd_t *pgd = pgd_offset_k(addr); 240 if (pgd_none(*pgd)) { 241 addr = (addr + PGDIR_SIZE) & PGDIR_MASK; 242 continue; 243 } 244 pmd = pmd_offset(pgd, addr); 245 if (pmd_none(*pmd)) { 246 addr = (addr + PMD_SIZE) & PMD_MASK; 247 continue; 248 } 249 p = NULL; 250 pte = pte_offset_map(pmd, addr); 251 if (pte_present(*pte)) 252 p = pte_page(*pte); 253 pte_unmap(pte); 254 if (p) { 255 unsigned nid = page_to_nid(p); 256 if (!test_bit(nid, nodes)) 257 return -EIO; 258 } 259 addr += PAGE_SIZE; 260 } 261 return 0; 262 } 263 264 /* Step 1: check the range */ 265 static struct vm_area_struct * 266 check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 267 unsigned long *nodes, unsigned long flags) 268 { 269 int err; 270 struct vm_area_struct *first, *vma, *prev; 271 272 first = find_vma(mm, start); 273 if (!first) 274 return ERR_PTR(-EFAULT); 275 prev = NULL; 276 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 277 if (!vma->vm_next && vma->vm_end < end) 278 return ERR_PTR(-EFAULT); 279 if (prev && prev->vm_end < vma->vm_start) 280 return ERR_PTR(-EFAULT); 281 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 282 err = verify_pages(vma->vm_start, vma->vm_end, nodes); 283 if (err) { 284 first = ERR_PTR(err); 285 break; 286 } 287 } 288 prev = vma; 289 } 290 return first; 291 } 292 293 /* Apply policy to a single VMA */ 294 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) 295 { 296 int err = 0; 297 struct mempolicy *old 
= vma->vm_policy; 298 299 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 300 vma->vm_start, vma->vm_end, vma->vm_pgoff, 301 vma->vm_ops, vma->vm_file, 302 vma->vm_ops ? vma->vm_ops->set_policy : NULL); 303 304 if (vma->vm_ops && vma->vm_ops->set_policy) 305 err = vma->vm_ops->set_policy(vma, new); 306 if (!err) { 307 mpol_get(new); 308 vma->vm_policy = new; 309 mpol_free(old); 310 } 311 return err; 312 } 313 314 /* Step 2: apply policy to a range and do splits. */ 315 static int mbind_range(struct vm_area_struct *vma, unsigned long start, 316 unsigned long end, struct mempolicy *new) 317 { 318 struct vm_area_struct *next; 319 int err; 320 321 err = 0; 322 for (; vma && vma->vm_start < end; vma = next) { 323 next = vma->vm_next; 324 if (vma->vm_start < start) 325 err = split_vma(vma->vm_mm, vma, start, 1); 326 if (!err && vma->vm_end > end) 327 err = split_vma(vma->vm_mm, vma, end, 0); 328 if (!err) 329 err = policy_vma(vma, new); 330 if (err) 331 break; 332 } 333 return err; 334 } 335 336 /* Change policy for a memory range */ 337 asmlinkage long sys_mbind(unsigned long start, unsigned long len, 338 unsigned long mode, 339 unsigned long __user *nmask, unsigned long maxnode, 340 unsigned flags) 341 { 342 struct vm_area_struct *vma; 343 struct mm_struct *mm = current->mm; 344 struct mempolicy *new; 345 unsigned long end; 346 DECLARE_BITMAP(nodes, MAX_NUMNODES); 347 int err; 348 349 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) 350 return -EINVAL; 351 if (start & ~PAGE_MASK) 352 return -EINVAL; 353 if (mode == MPOL_DEFAULT) 354 flags &= ~MPOL_MF_STRICT; 355 len = (len + PAGE_SIZE - 1) & PAGE_MASK; 356 end = start + len; 357 if (end < start) 358 return -EINVAL; 359 if (end == start) 360 return 0; 361 362 err = get_nodes(nodes, nmask, maxnode, mode); 363 if (err) 364 return err; 365 366 new = mpol_new(mode, nodes); 367 if (IS_ERR(new)) 368 return PTR_ERR(new); 369 370 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 
371 mode,nodes[0]); 372 373 down_write(&mm->mmap_sem); 374 vma = check_range(mm, start, end, nodes, flags); 375 err = PTR_ERR(vma); 376 if (!IS_ERR(vma)) 377 err = mbind_range(vma, start, end, new); 378 up_write(&mm->mmap_sem); 379 mpol_free(new); 380 return err; 381 } 382 383 /* Set the process memory policy */ 384 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 385 unsigned long maxnode) 386 { 387 int err; 388 struct mempolicy *new; 389 DECLARE_BITMAP(nodes, MAX_NUMNODES); 390 391 if (mode > MPOL_MAX) 392 return -EINVAL; 393 err = get_nodes(nodes, nmask, maxnode, mode); 394 if (err) 395 return err; 396 new = mpol_new(mode, nodes); 397 if (IS_ERR(new)) 398 return PTR_ERR(new); 399 mpol_free(current->mempolicy); 400 current->mempolicy = new; 401 if (new && new->policy == MPOL_INTERLEAVE) 402 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); 403 return 0; 404 } 405 406 /* Fill a zone bitmap for a policy */ 407 static void get_zonemask(struct mempolicy *p, unsigned long *nodes) 408 { 409 int i; 410 411 bitmap_zero(nodes, MAX_NUMNODES); 412 switch (p->policy) { 413 case MPOL_BIND: 414 for (i = 0; p->v.zonelist->zones[i]; i++) 415 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); 416 break; 417 case MPOL_DEFAULT: 418 break; 419 case MPOL_INTERLEAVE: 420 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); 421 break; 422 case MPOL_PREFERRED: 423 /* or use current node instead of online map? 
*/ 424 if (p->v.preferred_node < 0) 425 bitmap_copy(nodes, node_online_map, MAX_NUMNODES); 426 else 427 __set_bit(p->v.preferred_node, nodes); 428 break; 429 default: 430 BUG(); 431 } 432 } 433 434 static int lookup_node(struct mm_struct *mm, unsigned long addr) 435 { 436 struct page *p; 437 int err; 438 439 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 440 if (err >= 0) { 441 err = page_zone(p)->zone_pgdat->node_id; 442 put_page(p); 443 } 444 return err; 445 } 446 447 /* Copy a kernel node mask to user space */ 448 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 449 void *nodes, unsigned nbytes) 450 { 451 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 452 453 if (copy > nbytes) { 454 if (copy > PAGE_SIZE) 455 return -EINVAL; 456 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 457 return -EFAULT; 458 copy = nbytes; 459 } 460 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; 461 } 462 463 /* Retrieve NUMA policy */ 464 asmlinkage long sys_get_mempolicy(int __user *policy, 465 unsigned long __user *nmask, 466 unsigned long maxnode, 467 unsigned long addr, unsigned long flags) 468 { 469 int err, pval; 470 struct mm_struct *mm = current->mm; 471 struct vm_area_struct *vma = NULL; 472 struct mempolicy *pol = current->mempolicy; 473 474 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 475 return -EINVAL; 476 if (nmask != NULL && maxnode < numnodes) 477 return -EINVAL; 478 if (flags & MPOL_F_ADDR) { 479 down_read(&mm->mmap_sem); 480 vma = find_vma_intersection(mm, addr, addr+1); 481 if (!vma) { 482 up_read(&mm->mmap_sem); 483 return -EFAULT; 484 } 485 if (vma->vm_ops && vma->vm_ops->get_policy) 486 pol = vma->vm_ops->get_policy(vma, addr); 487 else 488 pol = vma->vm_policy; 489 } else if (addr) 490 return -EINVAL; 491 492 if (!pol) 493 pol = &default_policy; 494 495 if (flags & MPOL_F_NODE) { 496 if (flags & MPOL_F_ADDR) { 497 err = lookup_node(mm, addr); 498 if (err < 0) 499 goto out; 500 
pval = err; 501 } else if (pol == current->mempolicy && 502 pol->policy == MPOL_INTERLEAVE) { 503 pval = current->il_next; 504 } else { 505 err = -EINVAL; 506 goto out; 507 } 508 } else 509 pval = pol->policy; 510 511 if (vma) { 512 up_read(¤t->mm->mmap_sem); 513 vma = NULL; 514 } 515 516 if (policy && put_user(pval, policy)) 517 return -EFAULT; 518 519 err = 0; 520 if (nmask) { 521 DECLARE_BITMAP(nodes, MAX_NUMNODES); 522 get_zonemask(pol, nodes); 523 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); 524 } 525 526 out: 527 if (vma) 528 up_read(¤t->mm->mmap_sem); 529 return err; 530 } 531 532 #ifdef CONFIG_COMPAT 533 /* The other functions are compatible */ 534 asmlinkage long compat_get_mempolicy(int __user *policy, 535 unsigned __user *nmask, unsigned maxnode, 536 unsigned addr, unsigned flags) 537 { 538 long err; 539 unsigned long __user *nm = NULL; 540 if (nmask) 541 nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8); 542 err = sys_get_mempolicy(policy, nm, maxnode, addr, flags); 543 if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8)) 544 err = -EFAULT; 545 return err; 546 } 547 #endif 548 549 /* Return effective policy for a VMA */ 550 static struct mempolicy * 551 get_vma_policy(struct vm_area_struct *vma, unsigned long addr) 552 { 553 struct mempolicy *pol = current->mempolicy; 554 555 if (vma) { 556 if (vma->vm_ops && vma->vm_ops->get_policy) 557 pol = vma->vm_ops->get_policy(vma, addr); 558 else if (vma->vm_policy && 559 vma->vm_policy->policy != MPOL_DEFAULT) 560 pol = vma->vm_policy; 561 } 562 if (!pol) 563 pol = &default_policy; 564 return pol; 565 } 566 567 /* Return a zonelist representing a mempolicy */ 568 static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy) 569 { 570 int nd; 571 572 switch (policy->policy) { 573 case MPOL_PREFERRED: 574 nd = policy->v.preferred_node; 575 if (nd < 0) 576 nd = numa_node_id(); 577 break; 578 case MPOL_BIND: 579 /* Lower zones don't get a policy applied */ 580 if 
(gfp >= policy_zone) 581 return policy->v.zonelist; 582 /*FALL THROUGH*/ 583 case MPOL_INTERLEAVE: /* should not happen */ 584 case MPOL_DEFAULT: 585 nd = numa_node_id(); 586 break; 587 default: 588 nd = 0; 589 BUG(); 590 } 591 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); 592 } 593 594 /* Do dynamic interleaving for a process */ 595 static unsigned interleave_nodes(struct mempolicy *policy) 596 { 597 unsigned nid, next; 598 struct task_struct *me = current; 599 600 nid = me->il_next; 601 BUG_ON(nid >= MAX_NUMNODES); 602 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); 603 if (next >= MAX_NUMNODES) 604 next = find_first_bit(policy->v.nodes, MAX_NUMNODES); 605 me->il_next = next; 606 return nid; 607 } 608 609 /* Do static interleaving for a VMA with known offset. */ 610 static unsigned offset_il_node(struct mempolicy *pol, 611 struct vm_area_struct *vma, unsigned long off) 612 { 613 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); 614 unsigned target = (unsigned)off % nnodes; 615 int c; 616 int nid = -1; 617 618 c = 0; 619 do { 620 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); 621 c++; 622 } while (c <= target); 623 BUG_ON(nid >= MAX_NUMNODES); 624 BUG_ON(!test_bit(nid, pol->v.nodes)); 625 return nid; 626 } 627 628 /* Allocate a page in interleaved policy. 629 Own path because it needs to do special accounting. */ 630 static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid) 631 { 632 struct zonelist *zl; 633 struct page *page; 634 635 BUG_ON(!test_bit(nid, node_online_map)); 636 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); 637 page = __alloc_pages(gfp, order, zl); 638 if (page && page_zone(page) == zl->zones[0]) { 639 zl->zones[0]->pageset[get_cpu()].interleave_hit++; 640 put_cpu(); 641 } 642 return page; 643 } 644 645 /** 646 * alloc_page_vma - Allocate a page for a VMA. 647 * 648 * @gfp: 649 * %GFP_USER user allocation. 
650 * %GFP_KERNEL kernel allocations, 651 * %GFP_HIGHMEM highmem/user allocations, 652 * %GFP_FS allocation should not call back into a file system. 653 * %GFP_ATOMIC don't sleep. 654 * 655 * @vma: Pointer to VMA or NULL if not available. 656 * @addr: Virtual Address of the allocation. Must be inside the VMA. 657 * 658 * This function allocates a page from the kernel page pool and applies 659 * a NUMA policy associated with the VMA or the current process. 660 * When VMA is not NULL caller must hold down_read on the mmap_sem of the 661 * mm_struct of the VMA to prevent it from going away. Should be used for 662 * all allocations for pages that will be mapped into 663 * user space. Returns NULL when no page can be allocated. 664 * 665 * Should be called with the mm_sem of the vma hold. 666 */ 667 struct page * 668 alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr) 669 { 670 struct mempolicy *pol = get_vma_policy(vma, addr); 671 672 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 673 unsigned nid; 674 if (vma) { 675 unsigned long off; 676 BUG_ON(addr >= vma->vm_end); 677 BUG_ON(addr < vma->vm_start); 678 off = vma->vm_pgoff; 679 off += (addr - vma->vm_start) >> PAGE_SHIFT; 680 nid = offset_il_node(pol, vma, off); 681 } else { 682 /* fall back to process interleaving */ 683 nid = interleave_nodes(pol); 684 } 685 return alloc_page_interleave(gfp, 0, nid); 686 } 687 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 688 } 689 690 /** 691 * alloc_pages_current - Allocate pages. 692 * 693 * @gfp: 694 * %GFP_USER user allocation, 695 * %GFP_KERNEL kernel allocation, 696 * %GFP_HIGHMEM highmem allocation, 697 * %GFP_FS don't call back into a file system. 698 * %GFP_ATOMIC don't sleep. 699 * @order: Power of two of allocation size in pages. 0 is a single page. 700 * 701 * Allocate a page from the kernel page pool. When not in 702 * interrupt context and apply the current process NUMA policy. 703 * Returns NULL when no page can be allocated. 
704 */ 705 struct page *alloc_pages_current(unsigned gfp, unsigned order) 706 { 707 struct mempolicy *pol = current->mempolicy; 708 709 if (!pol || in_interrupt()) 710 pol = &default_policy; 711 if (pol->policy == MPOL_INTERLEAVE) 712 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 713 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); 714 } 715 EXPORT_SYMBOL(alloc_pages_current); 716 717 /* Slow path of a mempolicy copy */ 718 struct mempolicy *__mpol_copy(struct mempolicy *old) 719 { 720 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 721 722 if (!new) 723 return ERR_PTR(-ENOMEM); 724 *new = *old; 725 atomic_set(&new->refcnt, 1); 726 if (new->policy == MPOL_BIND) { 727 int sz = ksize(old->v.zonelist); 728 new->v.zonelist = kmalloc(sz, SLAB_KERNEL); 729 if (!new->v.zonelist) { 730 kmem_cache_free(policy_cache, new); 731 return ERR_PTR(-ENOMEM); 732 } 733 memcpy(new->v.zonelist, old->v.zonelist, sz); 734 } 735 return new; 736 } 737 738 /* Slow path of a mempolicy comparison */ 739 int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 740 { 741 if (!a || !b) 742 return 0; 743 if (a->policy != b->policy) 744 return 0; 745 switch (a->policy) { 746 case MPOL_DEFAULT: 747 return 1; 748 case MPOL_INTERLEAVE: 749 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); 750 case MPOL_PREFERRED: 751 return a->v.preferred_node == b->v.preferred_node; 752 case MPOL_BIND: { 753 int i; 754 for (i = 0; a->v.zonelist->zones[i]; i++) 755 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) 756 return 0; 757 return b->v.zonelist->zones[i] == NULL; 758 } 759 default: 760 BUG(); 761 return 0; 762 } 763 } 764 765 /* Slow path of a mpol destructor. */ 766 void __mpol_free(struct mempolicy *p) 767 { 768 if (!atomic_dec_and_test(&p->refcnt)) 769 return; 770 if (p->policy == MPOL_BIND) 771 kfree(p->v.zonelist); 772 p->policy = MPOL_DEFAULT; 773 kmem_cache_free(policy_cache, p); 774 } 775 776 /* 777 * Hugetlb policy. 
Same as above, just works with node numbers instead of 778 * zonelists. 779 */ 780 781 /* Find first node suitable for an allocation */ 782 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) 783 { 784 struct mempolicy *pol = get_vma_policy(vma, addr); 785 786 switch (pol->policy) { 787 case MPOL_DEFAULT: 788 return numa_node_id(); 789 case MPOL_BIND: 790 return pol->v.zonelist->zones[0]->zone_pgdat->node_id; 791 case MPOL_INTERLEAVE: 792 return interleave_nodes(pol); 793 case MPOL_PREFERRED: 794 return pol->v.preferred_node >= 0 ? 795 pol->v.preferred_node : numa_node_id(); 796 } 797 BUG(); 798 return 0; 799 } 800 801 /* Find secondary valid nodes for an allocation */ 802 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) 803 { 804 struct mempolicy *pol = get_vma_policy(vma, addr); 805 806 switch (pol->policy) { 807 case MPOL_PREFERRED: 808 case MPOL_DEFAULT: 809 case MPOL_INTERLEAVE: 810 return 1; 811 case MPOL_BIND: { 812 struct zone **z; 813 for (z = pol->v.zonelist->zones; *z; z++) 814 if ((*z)->zone_pgdat->node_id == nid) 815 return 1; 816 return 0; 817 } 818 default: 819 BUG(); 820 return 0; 821 } 822 } 823 824 /* 825 * Shared memory backing store policy support. 826 * 827 * Remember policies even when nobody has shared memory mapped. 828 * The policies are kept in Red-Black tree linked from the inode. 829 * They are protected by the sp->sem semaphore, which should be held 830 * for any accesses to the tree. 
831 */ 832 833 /* lookup first element intersecting start-end */ 834 /* Caller holds sp->sem */ 835 static struct sp_node * 836 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 837 { 838 struct rb_node *n = sp->root.rb_node; 839 840 while (n) { 841 struct sp_node *p = rb_entry(n, struct sp_node, nd); 842 if (start >= p->end) { 843 n = n->rb_right; 844 } else if (end < p->start) { 845 n = n->rb_left; 846 } else { 847 break; 848 } 849 } 850 if (!n) 851 return NULL; 852 for (;;) { 853 struct sp_node *w = NULL; 854 struct rb_node *prev = rb_prev(n); 855 if (!prev) 856 break; 857 w = rb_entry(prev, struct sp_node, nd); 858 if (w->end <= start) 859 break; 860 n = prev; 861 } 862 return rb_entry(n, struct sp_node, nd); 863 } 864 865 /* Insert a new shared policy into the list. */ 866 /* Caller holds sp->sem */ 867 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 868 { 869 struct rb_node **p = &sp->root.rb_node; 870 struct rb_node *parent = NULL; 871 struct sp_node *nd; 872 873 while (*p) { 874 parent = *p; 875 nd = rb_entry(parent, struct sp_node, nd); 876 if (new->start < nd->start) 877 p = &(*p)->rb_left; 878 else if (new->end > nd->end) 879 p = &(*p)->rb_right; 880 else 881 BUG(); 882 } 883 rb_link_node(&new->nd, parent, p); 884 rb_insert_color(&new->nd, &sp->root); 885 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, 886 new->policy ? 
new->policy->policy : 0); 887 } 888 889 /* Find shared policy intersecting idx */ 890 struct mempolicy * 891 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 892 { 893 struct mempolicy *pol = NULL; 894 struct sp_node *sn; 895 896 down(&sp->sem); 897 sn = sp_lookup(sp, idx, idx+1); 898 if (sn) { 899 mpol_get(sn->policy); 900 pol = sn->policy; 901 } 902 up(&sp->sem); 903 return pol; 904 } 905 906 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 907 { 908 PDprintk("deleting %lx-l%x\n", n->start, n->end); 909 rb_erase(&n->nd, &sp->root); 910 mpol_free(n->policy); 911 kmem_cache_free(sn_cache, n); 912 } 913 914 struct sp_node * 915 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) 916 { 917 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 918 919 if (!n) 920 return NULL; 921 n->start = start; 922 n->end = end; 923 mpol_get(pol); 924 n->policy = pol; 925 return n; 926 } 927 928 /* Replace a policy range. */ 929 static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 930 unsigned long end, struct sp_node *new) 931 { 932 struct sp_node *n, *new2; 933 934 down(&sp->sem); 935 n = sp_lookup(sp, start, end); 936 /* Take care of old policies in the same range. */ 937 while (n && n->start < end) { 938 struct rb_node *next = rb_next(&n->nd); 939 if (n->start >= start) { 940 if (n->end <= end) 941 sp_delete(sp, n); 942 else 943 n->start = end; 944 } else { 945 /* Old policy spanning whole new range. 
*/ 946 if (n->end > end) { 947 new2 = sp_alloc(end, n->end, n->policy); 948 if (!new2) { 949 up(&sp->sem); 950 return -ENOMEM; 951 } 952 n->end = end; 953 sp_insert(sp, new2); 954 } 955 /* Old crossing beginning, but not end (easy) */ 956 if (n->start < start && n->end > start) 957 n->end = start; 958 } 959 if (!next) 960 break; 961 n = rb_entry(next, struct sp_node, nd); 962 } 963 if (new) 964 sp_insert(sp, new); 965 up(&sp->sem); 966 return 0; 967 } 968 969 int mpol_set_shared_policy(struct shared_policy *info, 970 struct vm_area_struct *vma, struct mempolicy *npol) 971 { 972 int err; 973 struct sp_node *new = NULL; 974 unsigned long sz = vma_pages(vma); 975 976 PDprintk("set_shared_policy %lx sz %lu %d %lx\n", 977 vma->vm_pgoff, 978 sz, npol? npol->policy : -1, 979 npol ? npol->v.nodes[0] : -1); 980 981 if (npol) { 982 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 983 if (!new) 984 return -ENOMEM; 985 } 986 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 987 if (err && new) 988 kmem_cache_free(sn_cache, new); 989 return err; 990 } 991 992 /* Free a backing policy store on inode delete. */ 993 void mpol_free_shared_policy(struct shared_policy *p) 994 { 995 struct sp_node *n; 996 struct rb_node *next; 997 998 down(&p->sem); 999 next = rb_first(&p->root); 1000 while (next) { 1001 n = rb_entry(next, struct sp_node, nd); 1002 next = rb_next(&n->nd); 1003 rb_erase(&n->nd, &p->root); 1004 mpol_free(n->policy); 1005 kmem_cache_free(sn_cache, n); 1006 } 1007 up(&p->sem); 1008 } 1009 1010 /* assumes fs == KERNEL_DS */ 1011 void __init numa_policy_init(void) 1012 { 1013 policy_cache = kmem_cache_create("numa_policy", 1014 sizeof(struct mempolicy), 1015 0, SLAB_PANIC, NULL, NULL); 1016 1017 sn_cache = kmem_cache_create("shared_policy_node", 1018 sizeof(struct sp_node), 1019 0, SLAB_PANIC, NULL, NULL); 1020 1021 /* Set interleaving policy for system init. 
This way not all 1022 the data structures allocated at system boot end up in node zero. */ 1023 1024 if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0) 1025 printk("numa_policy_init: interleaving failed\n"); 1026 } 1027 1028 /* Reset policy of current process to default. 1029 * Assumes fs == KERNEL_DS */ 1030 void numa_default_policy(void) 1031 { 1032 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); 1033 } 1034
/* This page was automatically generated by LXR 0.3.1. - Linux is a registered trademark of Linus Torvalds */