• source navigation  • diff markup  • identifier search  • freetext search  • file search  • 


Version: 2.6.8   2.6.16   2.6.25   2.6.30   2.6.34  

Architecture: i386   arm   mips   ppc   alpha   m68k   sparc   sparc64  

linux/mm/mempolicy.c


  1 /*
  2  * Simple NUMA memory policy for the Linux kernel.
  3  *
  4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  5  * Subject to the GNU Public License, version 2.
  6  *
  7  * NUMA policy allows the user to give hints in which node(s) memory should
  8  * be allocated.
  9  *
 10  * Support four policies per VMA and per process:
 11  *
 12  * The VMA policy has priority over the process policy for a page fault.
 13  *
 14  * interleave     Allocate memory interleaved over a set of nodes,
 15  *                with normal fallback if it fails.
 16  *                For VMA based allocations this interleaves based on the
 17  *                offset into the backing object or offset into the mapping
 18  *                for anonymous memory. For process policy a process counter
 19  *                is used.
 20  * bind           Only allocate memory on a specific set of nodes,
 21  *                no fallback.
 22  * preferred       Try a specific node first before normal fallback.
 23  *                As a special case node -1 here means do the allocation
 24  *                on the local CPU. This is normally identical to default,
 25  *                but useful to set in a VMA when you have a non default
 26  *                process policy.
 27  * default        Allocate on the local node first, or when on a VMA
 28  *                use the process policy. This is what Linux always did
 29  *                in a NUMA aware kernel and still does by, ahem, default.
 30  *
 31  * The process policy is applied for most non interrupt memory allocations
 32  * in that process' context. Interrupts ignore the policies and always
 33  * try to allocate on the local CPU. The VMA policy is only applied for memory
 34  * allocations for a VMA in the VM.
 35  *
 36  * Currently there are a few corner cases in swapping where the policy
 37  * is not applied, but the majority should be handled. When process policy
 38  * is used it is not remembered over swap outs/swap ins.
 39  *
 40  * Only the highest zone in the zone hierarchy gets policied. Allocations
 41  * requesting a lower zone just use default policy. This implies that
 42  * on systems with highmem kernel lowmem allocations don't get policied.
 43  * Same with GFP_DMA allocations.
 44  *
 45  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 46  * all users and remembered even when nobody has memory mapped.
 47  */
 48 
 49 /* Notebook:
 50    fix mmap readahead to honour policy and enable policy for any page cache
 51    object
 52    statistics for bigpages
 53    global policy for page cache? currently it uses process policy. Requires
 54    first item above.
 55    handle mremap for shared memory (currently ignored for the policy)
 56    grows down?
 57    make bind policy root only? It can trigger oom much faster and the
 58    kernel is not always graceful about that.
 59    could replace all the switch()es with a mempolicy_ops structure.
 60 */
 61 
 62 #include <linux/mempolicy.h>
 63 #include <linux/mm.h>
 64 #include <linux/highmem.h>
 65 #include <linux/hugetlb.h>
 66 #include <linux/kernel.h>
 67 #include <linux/sched.h>
 68 #include <linux/mm.h>
 69 #include <linux/gfp.h>
 70 #include <linux/slab.h>
 71 #include <linux/string.h>
 72 #include <linux/module.h>
 73 #include <linux/interrupt.h>
 74 #include <linux/init.h>
 75 #include <linux/compat.h>
 76 #include <linux/mempolicy.h>
 77 #include <asm/uaccess.h>
 78 
 79 static kmem_cache_t *policy_cache; /* slab cache that struct mempolicy objects in this file are allocated from */
 80 static kmem_cache_t *sn_cache; /* NOTE(review): not referenced in this part of the file; presumably the cache for shared-policy nodes - confirm */
 81 /* Debug trace hook; as defined here it expands to nothing. */
 82 #define PDprintk(fmt...)
 83 
 84 /* Highest zone. A specific allocation for a zone below that is not
 85    policied. Raised by bind_zonelist() whenever it sees a populated zone
    with a higher index. */
 86 static int policy_zone;
 87 /* Fallback policy used whenever neither the VMA nor the task supplies one. */
 88 static struct mempolicy default_policy = {
 89         .refcnt = ATOMIC_INIT(1), /* never free it */
 90         .policy = MPOL_DEFAULT,
 91 };
 92 
 93 /* Check if all specified nodes are online */
 94 static int nodes_online(unsigned long *nodes)
 95 {
 96         DECLARE_BITMAP(online2, MAX_NUMNODES);
 97 
 98         bitmap_copy(online2, node_online_map, MAX_NUMNODES);
 99         if (bitmap_empty(online2, MAX_NUMNODES))
100                 set_bit(0, online2);
101         if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
102                 return -EINVAL;
103         return 0;
104 }
105 
106 /* Do sanity checking on a policy */
107 static int mpol_check_policy(int mode, unsigned long *nodes)
108 {
109         int empty = bitmap_empty(nodes, MAX_NUMNODES);
110 
111         switch (mode) {
112         case MPOL_DEFAULT:
113                 if (!empty)
114                         return -EINVAL;
115                 break;
116         case MPOL_BIND:
117         case MPOL_INTERLEAVE:
118                 /* Preferred will only use the first bit, but allow
119                    more for now. */
120                 if (empty)
121                         return -EINVAL;
122                 break;
123         }
124         return nodes_online(nodes);
125 }
126 
127 /* Copy a node mask from user space. */
128 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
129                      unsigned long maxnode, int mode)
130 {
131         unsigned long k;
132         unsigned long nlongs;
133         unsigned long endmask;
134 
135         --maxnode;
136         bitmap_zero(nodes, MAX_NUMNODES);
137         if (maxnode == 0 || !nmask)
138                 return 0;
139 
140         nlongs = BITS_TO_LONGS(maxnode);
141         if ((maxnode % BITS_PER_LONG) == 0)
142                 endmask = ~0UL;
143         else
144                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
145 
146         /* When the user specified more nodes than supported just check
147            if the non supported part is all zero. */
148         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
149                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
150                         unsigned long t;
151                         if (get_user(t,  nmask + k))
152                                 return -EFAULT;
153                         if (k == nlongs - 1) {
154                                 if (t & endmask)
155                                         return -EINVAL;
156                         } else if (t)
157                                 return -EINVAL;
158                 }
159                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
160                 endmask = ~0UL;
161         }
162 
163         if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
164                 return -EFAULT;
165         nodes[nlongs-1] &= endmask;
166         return mpol_check_policy(mode, nodes);
167 }
168 
169 /* Generate a custom zonelist for the BIND policy. */
170 static struct zonelist *bind_zonelist(unsigned long *nodes)
171 {
172         struct zonelist *zl;
173         int num, max, nd;
174 
175         max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
176         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
177         if (!zl)
178                 return NULL;
179         num = 0;
180         for (nd = find_first_bit(nodes, MAX_NUMNODES);
181              nd < MAX_NUMNODES;
182              nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
183                 int k;
184                 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
185                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
186                         if (!z->present_pages)
187                                 continue;
188                         zl->zones[num++] = z;
189                         if (k > policy_zone)
190                                 policy_zone = k;
191                 }
192         }
193         BUG_ON(num >= max);
194         zl->zones[num] = NULL;
195         return zl;
196 }
197 
198 /* Create a new policy */
199 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
200 {
201         struct mempolicy *policy;
202 
203         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
204         if (mode == MPOL_DEFAULT)
205                 return NULL;
206         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
207         if (!policy)
208                 return ERR_PTR(-ENOMEM);
209         atomic_set(&policy->refcnt, 1);
210         switch (mode) {
211         case MPOL_INTERLEAVE:
212                 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
213                 break;
214         case MPOL_PREFERRED:
215                 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
216                 if (policy->v.preferred_node >= MAX_NUMNODES)
217                         policy->v.preferred_node = -1;
218                 break;
219         case MPOL_BIND:
220                 policy->v.zonelist = bind_zonelist(nodes);
221                 if (policy->v.zonelist == NULL) {
222                         kmem_cache_free(policy_cache, policy);
223                         return ERR_PTR(-ENOMEM);
224                 }
225                 break;
226         }
227         policy->policy = mode;
228         return policy;
229 }
230 
231 /* Ensure all existing pages follow the policy. */
232 static int
233 verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
234 {
235         while (addr < end) {
236                 struct page *p;
237                 pte_t *pte;
238                 pmd_t *pmd;
239                 pgd_t *pgd = pgd_offset_k(addr);
240                 if (pgd_none(*pgd)) {
241                         addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
242                         continue;
243                 }
244                 pmd = pmd_offset(pgd, addr);
245                 if (pmd_none(*pmd)) {
246                         addr = (addr + PMD_SIZE) & PMD_MASK;
247                         continue;
248                 }
249                 p = NULL;
250                 pte = pte_offset_map(pmd, addr);
251                 if (pte_present(*pte))
252                         p = pte_page(*pte);
253                 pte_unmap(pte);
254                 if (p) {
255                         unsigned nid = page_to_nid(p);
256                         if (!test_bit(nid, nodes))
257                                 return -EIO;
258                 }
259                 addr += PAGE_SIZE;
260         }
261         return 0;
262 }
263 
264 /* Step 1: check the range */
265 static struct vm_area_struct *
266 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
267             unsigned long *nodes, unsigned long flags)
268 {
269         int err;
270         struct vm_area_struct *first, *vma, *prev;
271 
272         first = find_vma(mm, start);
273         if (!first)
274                 return ERR_PTR(-EFAULT);
275         prev = NULL;
276         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
277                 if (!vma->vm_next && vma->vm_end < end)
278                         return ERR_PTR(-EFAULT);
279                 if (prev && prev->vm_end < vma->vm_start)
280                         return ERR_PTR(-EFAULT);
281                 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
282                         err = verify_pages(vma->vm_start, vma->vm_end, nodes);
283                         if (err) {
284                                 first = ERR_PTR(err);
285                                 break;
286                         }
287                 }
288                 prev = vma;
289         }
290         return first;
291 }
292 
293 /* Apply policy to a single VMA */
294 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
295 {
296         int err = 0;
297         struct mempolicy *old = vma->vm_policy;
298 
299         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
300                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
301                  vma->vm_ops, vma->vm_file,
302                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
303 
304         if (vma->vm_ops && vma->vm_ops->set_policy)
305                 err = vma->vm_ops->set_policy(vma, new);
306         if (!err) {
307                 mpol_get(new);
308                 vma->vm_policy = new;
309                 mpol_free(old);
310         }
311         return err;
312 }
313 
314 /* Step 2: apply policy to a range and do splits. */
315 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
316                        unsigned long end, struct mempolicy *new)
317 {
318         struct vm_area_struct *next;
319         int err;
320 
321         err = 0;
322         for (; vma && vma->vm_start < end; vma = next) {
323                 next = vma->vm_next;
324                 if (vma->vm_start < start)
325                         err = split_vma(vma->vm_mm, vma, start, 1);
326                 if (!err && vma->vm_end > end)
327                         err = split_vma(vma->vm_mm, vma, end, 0);
328                 if (!err)
329                         err = policy_vma(vma, new);
330                 if (err)
331                         break;
332         }
333         return err;
334 }
335 
336 /* Change policy for a memory range */
337 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
338                           unsigned long mode,
339                           unsigned long __user *nmask, unsigned long maxnode,
340                           unsigned flags)
341 {
342         struct vm_area_struct *vma;
343         struct mm_struct *mm = current->mm;
344         struct mempolicy *new;
345         unsigned long end;
346         DECLARE_BITMAP(nodes, MAX_NUMNODES);
347         int err;
348 
349         if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
350                 return -EINVAL;
351         if (start & ~PAGE_MASK)
352                 return -EINVAL;
353         if (mode == MPOL_DEFAULT)
354                 flags &= ~MPOL_MF_STRICT;
355         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
356         end = start + len;
357         if (end < start)
358                 return -EINVAL;
359         if (end == start)
360                 return 0;
361 
362         err = get_nodes(nodes, nmask, maxnode, mode);
363         if (err)
364                 return err;
365 
366         new = mpol_new(mode, nodes);
367         if (IS_ERR(new))
368                 return PTR_ERR(new);
369 
370         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
371                         mode,nodes[0]);
372 
373         down_write(&mm->mmap_sem);
374         vma = check_range(mm, start, end, nodes, flags);
375         err = PTR_ERR(vma);
376         if (!IS_ERR(vma))
377                 err = mbind_range(vma, start, end, new);
378         up_write(&mm->mmap_sem);
379         mpol_free(new);
380         return err;
381 }
382 
383 /* Set the process memory policy */
384 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
385                                    unsigned long maxnode)
386 {
387         int err;
388         struct mempolicy *new;
389         DECLARE_BITMAP(nodes, MAX_NUMNODES);
390 
391         if (mode > MPOL_MAX)
392                 return -EINVAL;
393         err = get_nodes(nodes, nmask, maxnode, mode);
394         if (err)
395                 return err;
396         new = mpol_new(mode, nodes);
397         if (IS_ERR(new))
398                 return PTR_ERR(new);
399         mpol_free(current->mempolicy);
400         current->mempolicy = new;
401         if (new && new->policy == MPOL_INTERLEAVE)
402                 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
403         return 0;
404 }
405 
406 /* Fill a zone bitmap for a policy */
407 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
408 {
409         int i;
410 
411         bitmap_zero(nodes, MAX_NUMNODES);
412         switch (p->policy) {
413         case MPOL_BIND:
414                 for (i = 0; p->v.zonelist->zones[i]; i++)
415                         __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
416                 break;
417         case MPOL_DEFAULT:
418                 break;
419         case MPOL_INTERLEAVE:
420                 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
421                 break;
422         case MPOL_PREFERRED:
423                 /* or use current node instead of online map? */
424                 if (p->v.preferred_node < 0)
425                         bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
426                 else
427                         __set_bit(p->v.preferred_node, nodes);
428                 break;
429         default:
430                 BUG();
431         }
432 }
433 
434 static int lookup_node(struct mm_struct *mm, unsigned long addr)
435 {
436         struct page *p;
437         int err;
438 
439         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
440         if (err >= 0) {
441                 err = page_zone(p)->zone_pgdat->node_id;
442                 put_page(p);
443         }
444         return err;
445 }
446 
447 /* Copy a kernel node mask to user space */
448 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
449                               void *nodes, unsigned nbytes)
450 {
451         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
452 
453         if (copy > nbytes) {
454                 if (copy > PAGE_SIZE)
455                         return -EINVAL;
456                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
457                         return -EFAULT;
458                 copy = nbytes;
459         }
460         return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
461 }
462 
463 /* Retrieve NUMA policy */
464 asmlinkage long sys_get_mempolicy(int __user *policy,
465                                   unsigned long __user *nmask,
466                                   unsigned long maxnode,
467                                   unsigned long addr, unsigned long flags)
468 {
469         int err, pval;
470         struct mm_struct *mm = current->mm;
471         struct vm_area_struct *vma = NULL;
472         struct mempolicy *pol = current->mempolicy;
473 
474         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
475                 return -EINVAL;
476         if (nmask != NULL && maxnode < numnodes)
477                 return -EINVAL;
478         if (flags & MPOL_F_ADDR) {
479                 down_read(&mm->mmap_sem);
480                 vma = find_vma_intersection(mm, addr, addr+1);
481                 if (!vma) {
482                         up_read(&mm->mmap_sem);
483                         return -EFAULT;
484                 }
485                 if (vma->vm_ops && vma->vm_ops->get_policy)
486                         pol = vma->vm_ops->get_policy(vma, addr);
487                 else
488                         pol = vma->vm_policy;
489         } else if (addr)
490                 return -EINVAL;
491 
492         if (!pol)
493                 pol = &default_policy;
494 
495         if (flags & MPOL_F_NODE) {
496                 if (flags & MPOL_F_ADDR) {
497                         err = lookup_node(mm, addr);
498                         if (err < 0)
499                                 goto out;
500                         pval = err;
501                 } else if (pol == current->mempolicy &&
502                                 pol->policy == MPOL_INTERLEAVE) {
503                         pval = current->il_next;
504                 } else {
505                         err = -EINVAL;
506                         goto out;
507                 }
508         } else
509                 pval = pol->policy;
510 
511         if (vma) {
512                 up_read(&current->mm->mmap_sem);
513                 vma = NULL;
514         }
515 
516         if (policy && put_user(pval, policy))
517                 return -EFAULT;
518 
519         err = 0;
520         if (nmask) {
521                 DECLARE_BITMAP(nodes, MAX_NUMNODES);
522                 get_zonemask(pol, nodes);
523                 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
524         }
525 
526  out:
527         if (vma)
528                 up_read(&current->mm->mmap_sem);
529         return err;
530 }
531 
532 #ifdef CONFIG_COMPAT
533 /* The other functions are compatible */
534 asmlinkage long compat_get_mempolicy(int __user *policy,
535                                   unsigned __user *nmask, unsigned  maxnode,
536                                   unsigned addr, unsigned  flags)
537 {
538         long err;
539         unsigned long __user *nm = NULL;
540         if (nmask)
541                 nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8);
542         err = sys_get_mempolicy(policy, nm, maxnode, addr, flags);
543         if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8))
544                 err = -EFAULT;
545         return err;
546 }
547 #endif
548 
549 /* Return effective policy for a VMA */
550 static struct mempolicy *
551 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
552 {
553         struct mempolicy *pol = current->mempolicy;
554 
555         if (vma) {
556                 if (vma->vm_ops && vma->vm_ops->get_policy)
557                         pol = vma->vm_ops->get_policy(vma, addr);
558                 else if (vma->vm_policy &&
559                                 vma->vm_policy->policy != MPOL_DEFAULT)
560                         pol = vma->vm_policy;
561         }
562         if (!pol)
563                 pol = &default_policy;
564         return pol;
565 }
566 
567 /* Return a zonelist representing a mempolicy */
568 static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
569 {
570         int nd;
571 
572         switch (policy->policy) {
573         case MPOL_PREFERRED:
574                 nd = policy->v.preferred_node;
575                 if (nd < 0)
576                         nd = numa_node_id();
577                 break;
578         case MPOL_BIND:
579                 /* Lower zones don't get a policy applied */
580                 if (gfp >= policy_zone)
581                         return policy->v.zonelist;
582                 /*FALL THROUGH*/
583         case MPOL_INTERLEAVE: /* should not happen */
584         case MPOL_DEFAULT:
585                 nd = numa_node_id();
586                 break;
587         default:
588                 nd = 0;
589                 BUG();
590         }
591         return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
592 }
593 
594 /* Do dynamic interleaving for a process */
595 static unsigned interleave_nodes(struct mempolicy *policy)
596 {
597         unsigned nid, next;
598         struct task_struct *me = current;
599 
600         nid = me->il_next;
601         BUG_ON(nid >= MAX_NUMNODES);
602         next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
603         if (next >= MAX_NUMNODES)
604                 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
605         me->il_next = next;
606         return nid;
607 }
608 
609 /* Do static interleaving for a VMA with known offset. */
610 static unsigned offset_il_node(struct mempolicy *pol,
611                 struct vm_area_struct *vma, unsigned long off)
612 {
613         unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
614         unsigned target = (unsigned)off % nnodes;
615         int c;
616         int nid = -1;
617 
618         c = 0;
619         do {
620                 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
621                 c++;
622         } while (c <= target);
623         BUG_ON(nid >= MAX_NUMNODES);
624         BUG_ON(!test_bit(nid, pol->v.nodes));
625         return nid;
626 }
627 
628 /* Allocate a page in interleaved policy.
629    Own path because it needs to do special accounting. */
630 static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
631 {
632         struct zonelist *zl;
633         struct page *page;
634 
635         BUG_ON(!test_bit(nid, node_online_map));
636         zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
637         page = __alloc_pages(gfp, order, zl);
638         if (page && page_zone(page) == zl->zones[0]) {
639                 zl->zones[0]->pageset[get_cpu()].interleave_hit++;
640                 put_cpu();
641         }
642         return page;
643 }
644 
645 /**
646  *      alloc_page_vma  - Allocate a page for a VMA.
647  *
648  *      @gfp:
649  *      %GFP_USER    user allocation.
650  *      %GFP_KERNEL  kernel allocations,
651  *      %GFP_HIGHMEM highmem/user allocations,
652  *      %GFP_FS      allocation should not call back into a file system.
653  *      %GFP_ATOMIC  don't sleep.
654  *
655  *      @vma:  Pointer to VMA or NULL if not available.
656  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
657  *
658  *      This function allocates a page from the kernel page pool and applies
659  *      a NUMA policy associated with the VMA or the current process.
660  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
661  *      mm_struct of the VMA to prevent it from going away. Should be used for
662  *      all allocations for pages that will be mapped into
663  *      user space. Returns NULL when no page can be allocated.
664  *
665  *      Should be called with the mm_sem of the vma hold.
666  */
667 struct page *
668 alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
669 {
670         struct mempolicy *pol = get_vma_policy(vma, addr);
671 
672         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
673                 unsigned nid;
674                 if (vma) {
675                         unsigned long off;
676                         BUG_ON(addr >= vma->vm_end);
677                         BUG_ON(addr < vma->vm_start);
678                         off = vma->vm_pgoff;
679                         off += (addr - vma->vm_start) >> PAGE_SHIFT;
680                         nid = offset_il_node(pol, vma, off);
681                 } else {
682                         /* fall back to process interleaving */
683                         nid = interleave_nodes(pol);
684                 }
685                 return alloc_page_interleave(gfp, 0, nid);
686         }
687         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
688 }
689 
690 /**
691  *      alloc_pages_current - Allocate pages.
692  *
693  *      @gfp:
694  *              %GFP_USER   user allocation,
695  *              %GFP_KERNEL kernel allocation,
696  *              %GFP_HIGHMEM highmem allocation,
697  *              %GFP_FS     don't call back into a file system.
698  *              %GFP_ATOMIC don't sleep.
699  *      @order: Power of two of allocation size in pages. 0 is a single page.
700  *
701  *      Allocate a page from the kernel page pool.  When not in
702  *      interrupt context and apply the current process NUMA policy.
703  *      Returns NULL when no page can be allocated.
704  */
705 struct page *alloc_pages_current(unsigned gfp, unsigned order)
706 {
707         struct mempolicy *pol = current->mempolicy;
708 
709         if (!pol || in_interrupt())
710                 pol = &default_policy;
711         if (pol->policy == MPOL_INTERLEAVE)
712                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
713         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
714 }
715 EXPORT_SYMBOL(alloc_pages_current);
716 
717 /* Slow path of a mempolicy copy */
718 struct mempolicy *__mpol_copy(struct mempolicy *old)
719 {
720         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
721 
722         if (!new)
723                 return ERR_PTR(-ENOMEM);
724         *new = *old;
725         atomic_set(&new->refcnt, 1);
726         if (new->policy == MPOL_BIND) {
727                 int sz = ksize(old->v.zonelist);
728                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
729                 if (!new->v.zonelist) {
730                         kmem_cache_free(policy_cache, new);
731                         return ERR_PTR(-ENOMEM);
732                 }
733                 memcpy(new->v.zonelist, old->v.zonelist, sz);
734         }
735         return new;
736 }
737 
738 /* Slow path of a mempolicy comparison */
739 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
740 {
741         if (!a || !b)
742                 return 0;
743         if (a->policy != b->policy)
744                 return 0;
745         switch (a->policy) {
746         case MPOL_DEFAULT:
747                 return 1;
748         case MPOL_INTERLEAVE:
749                 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
750         case MPOL_PREFERRED:
751                 return a->v.preferred_node == b->v.preferred_node;
752         case MPOL_BIND: {
753                 int i;
754                 for (i = 0; a->v.zonelist->zones[i]; i++)
755                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
756                                 return 0;
757                 return b->v.zonelist->zones[i] == NULL;
758         }
759         default:
760                 BUG();
761                 return 0;
762         }
763 }
764 
765 /* Slow path of a mpol destructor. */
766 void __mpol_free(struct mempolicy *p)
767 {
768         if (!atomic_dec_and_test(&p->refcnt))
769                 return;
770         if (p->policy == MPOL_BIND)
771                 kfree(p->v.zonelist);
772         p->policy = MPOL_DEFAULT;
773         kmem_cache_free(policy_cache, p);
774 }
775 
776 /*
777  * Hugetlb policy. Same as above, just works with node numbers instead of
778  * zonelists.
779  */
780 
781 /* Find first node suitable for an allocation */
782 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
783 {
784         struct mempolicy *pol = get_vma_policy(vma, addr);
785 
786         switch (pol->policy) {
787         case MPOL_DEFAULT:
788                 return numa_node_id();
789         case MPOL_BIND:
790                 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
791         case MPOL_INTERLEAVE:
792                 return interleave_nodes(pol);
793         case MPOL_PREFERRED:
794                 return pol->v.preferred_node >= 0 ?
795                                 pol->v.preferred_node : numa_node_id();
796         }
797         BUG();
798         return 0;
799 }
800 
801 /* Find secondary valid nodes for an allocation */
802 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
803 {
804         struct mempolicy *pol = get_vma_policy(vma, addr);
805 
806         switch (pol->policy) {
807         case MPOL_PREFERRED:
808         case MPOL_DEFAULT:
809         case MPOL_INTERLEAVE:
810                 return 1;
811         case MPOL_BIND: {
812                 struct zone **z;
813                 for (z = pol->v.zonelist->zones; *z; z++)
814                         if ((*z)->zone_pgdat->node_id == nid)
815                                 return 1;
816                 return 0;
817         }
818         default:
819                 BUG();
820                 return 0;
821         }
822 }
823 
824 /*
825  * Shared memory backing store policy support.
826  *
827  * Remember policies even when nobody has shared memory mapped.
828  * The policies are kept in Red-Black tree linked from the inode.
829  * They are protected by the sp->sem semaphore, which should be held
830  * for any accesses to the tree.
831  */
832 
833 /* lookup first element intersecting start-end */
834 /* Caller holds sp->sem */
835 static struct sp_node *
836 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
837 {
838         struct rb_node *n = sp->root.rb_node;
839 
840         while (n) {
841                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
842                 if (start >= p->end) {
843                         n = n->rb_right;
844                 } else if (end < p->start) {
845                         n = n->rb_left;
846                 } else {
847                         break;
848                 }
849         }
850         if (!n)
851                 return NULL;
852         for (;;) {
853                 struct sp_node *w = NULL;
854                 struct rb_node *prev = rb_prev(n);
855                 if (!prev)
856                         break;
857                 w = rb_entry(prev, struct sp_node, nd);
858                 if (w->end <= start)
859                         break;
860                 n = prev;
861         }
862         return rb_entry(n, struct sp_node, nd);
863 }
864 
865 /* Insert a new shared policy into the list. */
866 /* Caller holds sp->sem */
867 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
868 {
869         struct rb_node **p = &sp->root.rb_node;
870         struct rb_node *parent = NULL;
871         struct sp_node *nd;
872 
873         while (*p) {
874                 parent = *p;
875                 nd = rb_entry(parent, struct sp_node, nd);
876                 if (new->start < nd->start)
877                         p = &(*p)->rb_left;
878                 else if (new->end > nd->end)
879                         p = &(*p)->rb_right;
880                 else
881                         BUG();
882         }
883         rb_link_node(&new->nd, parent, p);
884         rb_insert_color(&new->nd, &sp->root);
885         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
886                  new->policy ? new->policy->policy : 0);
887 }
888 
889 /* Find shared policy intersecting idx */
890 struct mempolicy *
891 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
892 {
893         struct mempolicy *pol = NULL;
894         struct sp_node *sn;
895 
896         down(&sp->sem);
897         sn = sp_lookup(sp, idx, idx+1);
898         if (sn) {
899                 mpol_get(sn->policy);
900                 pol = sn->policy;
901         }
902         up(&sp->sem);
903         return pol;
904 }
905 
906 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
907 {
908         PDprintk("deleting %lx-l%x\n", n->start, n->end);
909         rb_erase(&n->nd, &sp->root);
910         mpol_free(n->policy);
911         kmem_cache_free(sn_cache, n);
912 }
913 
914 struct sp_node *
915 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
916 {
917         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
918 
919         if (!n)
920                 return NULL;
921         n->start = start;
922         n->end = end;
923         mpol_get(pol);
924         n->policy = pol;
925         return n;
926 }
927 
928 /* Replace a policy range. */
929 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
930                                  unsigned long end, struct sp_node *new)
931 {
932         struct sp_node *n, *new2;
933 
934         down(&sp->sem);
935         n = sp_lookup(sp, start, end);
936         /* Take care of old policies in the same range. */
937         while (n && n->start < end) {
938                 struct rb_node *next = rb_next(&n->nd);
939                 if (n->start >= start) {
940                         if (n->end <= end)
941                                 sp_delete(sp, n);
942                         else
943                                 n->start = end;
944                 } else {
945                         /* Old policy spanning whole new range. */
946                         if (n->end > end) {
947                                 new2 = sp_alloc(end, n->end, n->policy);
948                                 if (!new2) {
949                                         up(&sp->sem);
950                                         return -ENOMEM;
951                                 }
952                                 n->end = end;
953                                 sp_insert(sp, new2);
954                         }
955                         /* Old crossing beginning, but not end (easy) */
956                         if (n->start < start && n->end > start)
957                                 n->end = start;
958                 }
959                 if (!next)
960                         break;
961                 n = rb_entry(next, struct sp_node, nd);
962         }
963         if (new)
964                 sp_insert(sp, new);
965         up(&sp->sem);
966         return 0;
967 }
968 
969 int mpol_set_shared_policy(struct shared_policy *info,
970                         struct vm_area_struct *vma, struct mempolicy *npol)
971 {
972         int err;
973         struct sp_node *new = NULL;
974         unsigned long sz = vma_pages(vma);
975 
976         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
977                  vma->vm_pgoff,
978                  sz, npol? npol->policy : -1,
979                 npol ? npol->v.nodes[0] : -1);
980 
981         if (npol) {
982                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
983                 if (!new)
984                         return -ENOMEM;
985         }
986         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
987         if (err && new)
988                 kmem_cache_free(sn_cache, new);
989         return err;
990 }
991 
992 /* Free a backing policy store on inode delete. */
993 void mpol_free_shared_policy(struct shared_policy *p)
994 {
995         struct sp_node *n;
996         struct rb_node *next;
997 
998         down(&p->sem);
999         next = rb_first(&p->root);
1000         while (next) {
1001                 n = rb_entry(next, struct sp_node, nd);
1002                 next = rb_next(&n->nd);
1003                 rb_erase(&n->nd, &p->root);
1004                 mpol_free(n->policy);
1005                 kmem_cache_free(sn_cache, n);
1006         }
1007         up(&p->sem);
1008 }
1009 
1010 /* assumes fs == KERNEL_DS */
1011 void __init numa_policy_init(void)
1012 {
1013         policy_cache = kmem_cache_create("numa_policy",
1014                                          sizeof(struct mempolicy),
1015                                          0, SLAB_PANIC, NULL, NULL);
1016 
1017         sn_cache = kmem_cache_create("shared_policy_node",
1018                                      sizeof(struct sp_node),
1019                                      0, SLAB_PANIC, NULL, NULL);
1020 
1021         /* Set interleaving policy for system init. This way not all
1022            the data structures allocated at system boot end up in node zero. */
1023 
1024         if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0)
1025                 printk("numa_policy_init: interleaving failed\n");
1026 }
1027 
1028 /* Reset policy of current process to default.
1029  * Assumes fs == KERNEL_DS */
1030 void numa_default_policy(void)
1031 {
1032         sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1033 }
1034 

This page was automatically generated by LXR 0.3.1. •  Linux is a registered trademark of Linus Torvalds