1 /* 2 * mm/prio_tree.c - priority search tree for mapping->i_mmap 3 * 4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> 5 * 6 * This file is released under the GPL v2. 7 * 8 * Based on the radix priority search tree proposed by Edward M. McCreight 9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 10 * 11 * 02Feb2004 Initial version 12 */ 13 14 #include <linux/init.h> 15 #include <linux/module.h> 16 #include <linux/mm.h> 17 #include <linux/prio_tree.h> 18 19 /* 20 * A clever mix of heap and radix trees forms a radix priority search tree (PST) 21 * which is useful for storing intervals, e.g, we can consider a vma as a closed 22 * interval of file pages [offset_begin, offset_end], and store all vmas that 23 * map a file in a PST. Then, using the PST, we can answer a stabbing query, 24 * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a 25 * given input interval X (a set of consecutive file pages), in "O(log n + m)" 26 * time where 'log n' is the height of the PST, and 'm' is the number of stored 27 * intervals (vmas) that overlap (map) with the input interval X (the set of 28 * consecutive file pages). 29 * 30 * In our implementation, we store closed intervals of the form [radix_index, 31 * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST 32 * is designed for storing intervals with unique radix indices, i.e., each 33 * interval have different radix_index. However, this limitation can be easily 34 * overcome by using the size, i.e., heap_index - radix_index, as part of the 35 * index, so we index the tree using [(radix_index,size), heap_index]. 36 * 37 * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit 38 * machine, the maximum height of a PST can be 64. We can use a balanced version 39 * of the priority search tree to optimize the tree height, but the balanced 40 * tree proposed by McCreight is too complex and memory-hungry for our purpose. 41 */ 42 43 /* 44 * The following macros are used for implementing prio_tree for i_mmap 45 */ 46 47 #define RADIX_INDEX(vma) ((vma)->vm_pgoff) 48 #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) 49 /* avoid overflow */ 50 #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) 51 52 #define GET_INDEX_VMA(vma, radix, heap) \ 53 do { \ 54 radix = RADIX_INDEX(vma); \ 55 heap = HEAP_INDEX(vma); \ 56 } while (0) 57 58 #define GET_INDEX(node, radix, heap) \ 59 do { \ 60 struct vm_area_struct *__tmp = \ 61 prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\ 62 GET_INDEX_VMA(__tmp, radix, heap); \ 63 } while (0) 64 65 static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; 66 67 void __init prio_tree_init(void) 68 { 69 unsigned int i; 70 71 for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) 72 index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; 73 index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; 74 } 75 76 /* 77 * Maximum heap_index that can be stored in a PST with index_bits bits 78 */ 79 static inline unsigned long prio_tree_maxindex(unsigned int bits) 80 { 81 return index_bits_to_maxindex[bits - 1]; 82 } 83 84 /* 85 * Extend a priority search tree so that it can store a node with heap_index 86 * max_heap_index. In the worst case, this algorithm takes O((log n)^2). 87 * However, this function is used rarely and the common case performance is 88 * not bad. 89 */ 90 static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, 91 struct prio_tree_node *node, unsigned long max_heap_index) 92 { 93 static void prio_tree_remove(struct prio_tree_root *, 94 struct prio_tree_node *); 95 struct prio_tree_node *first = NULL, *prev, *last = NULL; 96 97 if (max_heap_index > prio_tree_maxindex(root->index_bits)) 98 root->index_bits++; 99 100 while (max_heap_index > prio_tree_maxindex(root->index_bits)) { 101 root->index_bits++; 102 103 if (prio_tree_empty(root)) 104 continue; 105 106 if (first == NULL) { 107 first = root->prio_tree_node; 108 prio_tree_remove(root, root->prio_tree_node); 109 INIT_PRIO_TREE_NODE(first); 110 last = first; 111 } else { 112 prev = last; 113 last = root->prio_tree_node; 114 prio_tree_remove(root, root->prio_tree_node); 115 INIT_PRIO_TREE_NODE(last); 116 prev->left = last; 117 last->parent = prev; 118 } 119 } 120 121 INIT_PRIO_TREE_NODE(node); 122 123 if (first) { 124 node->left = first; 125 first->parent = node; 126 } else 127 last = node; 128 129 if (!prio_tree_empty(root)) { 130 last->left = root->prio_tree_node; 131 last->left->parent = last; 132 } 133 134 root->prio_tree_node = node; 135 return node; 136 } 137 138 /* 139 * Replace a prio_tree_node with a new node and return the old node 140 */ 141 static struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root, 142 struct prio_tree_node *old, struct prio_tree_node *node) 143 { 144 INIT_PRIO_TREE_NODE(node); 145 146 if (prio_tree_root(old)) { 147 BUG_ON(root->prio_tree_node != old); 148 /* 149 * We can reduce root->index_bits here. However, it is complex 150 * and does not help much to improve performance (IMO). 151 */ 152 node->parent = node; 153 root->prio_tree_node = node; 154 } else { 155 node->parent = old->parent; 156 if (old->parent->left == old) 157 old->parent->left = node; 158 else 159 old->parent->right = node; 160 } 161 162 if (!prio_tree_left_empty(old)) { 163 node->left = old->left; 164 old->left->parent = node; 165 } 166 167 if (!prio_tree_right_empty(old)) { 168 node->right = old->right; 169 old->right->parent = node; 170 } 171 172 return old; 173 } 174 175 /* 176 * Insert a prio_tree_node @node into a radix priority search tree @root. The 177 * algorithm typically takes O(log n) time where 'log n' is the number of bits 178 * required to represent the maximum heap_index. In the worst case, the algo 179 * can take O((log n)^2) - check prio_tree_expand. 180 * 181 * If a prior node with same radix_index and heap_index is already found in 182 * the tree, then returns the address of the prior node. Otherwise, inserts 183 * @node into the tree and returns @node. 184 */ 185 static struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, 186 struct prio_tree_node *node) 187 { 188 struct prio_tree_node *cur, *res = node; 189 unsigned long radix_index, heap_index; 190 unsigned long r_index, h_index, index, mask; 191 int size_flag = 0; 192 193 GET_INDEX(node, radix_index, heap_index); 194 195 if (prio_tree_empty(root) || 196 heap_index > prio_tree_maxindex(root->index_bits)) 197 return prio_tree_expand(root, node, heap_index); 198 199 cur = root->prio_tree_node; 200 mask = 1UL << (root->index_bits - 1); 201 202 while (mask) { 203 GET_INDEX(cur, r_index, h_index); 204 205 if (r_index == radix_index && h_index == heap_index) 206 return cur; 207 208 if (h_index < heap_index || 209 (h_index == heap_index && r_index > radix_index)) { 210 struct prio_tree_node *tmp = node; 211 node = prio_tree_replace(root, cur, node); 212 cur = tmp; 213 /* swap indices */ 214 index = r_index; 215 r_index = radix_index; 216 radix_index = index; 217 index = h_index; 218 h_index = heap_index; 219 heap_index = index; 220 } 221 222 if (size_flag) 223 index = heap_index - radix_index; 224 else 225 index = radix_index; 226 227 if (index & mask) { 228 if (prio_tree_right_empty(cur)) { 229 INIT_PRIO_TREE_NODE(node); 230 cur->right = node; 231 node->parent = cur; 232 return res; 233 } else 234 cur = cur->right; 235 } else { 236 if (prio_tree_left_empty(cur)) { 237 INIT_PRIO_TREE_NODE(node); 238 cur->left = node; 239 node->parent = cur; 240 return res; 241 } else 242 cur = cur->left; 243 } 244 245 mask >>= 1; 246 247 if (!mask) { 248 mask = 1UL << (root->index_bits - 1); 249 size_flag = 1; 250 } 251 } 252 /* Should not reach here */ 253 BUG(); 254 return NULL; 255 } 256 257 /* 258 * Remove a prio_tree_node @node from a radix priority search tree @root. The 259 * algorithm takes O(log n) time where 'log n' is the number of bits required 260 * to represent the maximum heap_index. 261 */ 262 static void prio_tree_remove(struct prio_tree_root *root, 263 struct prio_tree_node *node) 264 { 265 struct prio_tree_node *cur; 266 unsigned long r_index, h_index_right, h_index_left; 267 268 cur = node; 269 270 while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { 271 if (!prio_tree_left_empty(cur)) 272 GET_INDEX(cur->left, r_index, h_index_left); 273 else { 274 cur = cur->right; 275 continue; 276 } 277 278 if (!prio_tree_right_empty(cur)) 279 GET_INDEX(cur->right, r_index, h_index_right); 280 else { 281 cur = cur->left; 282 continue; 283 } 284 285 /* both h_index_left and h_index_right cannot be 0 */ 286 if (h_index_left >= h_index_right) 287 cur = cur->left; 288 else 289 cur = cur->right; 290 } 291 292 if (prio_tree_root(cur)) { 293 BUG_ON(root->prio_tree_node != cur); 294 INIT_PRIO_TREE_ROOT(root); 295 return; 296 } 297 298 if (cur->parent->right == cur) 299 cur->parent->right = cur->parent; 300 else 301 cur->parent->left = cur->parent; 302 303 while (cur != node) 304 cur = prio_tree_replace(root, cur->parent, cur); 305 } 306 307 /* 308 * Following functions help to enumerate all prio_tree_nodes in the tree that 309 * overlap with the input interval X [radix_index, heap_index]. The enumeration 310 * takes O(log n + m) time where 'log n' is the height of the tree (which is 311 * proportional to # of bits required to represent the maximum heap_index) and 312 * 'm' is the number of prio_tree_nodes that overlap the interval X. 313 */ 314 315 static struct prio_tree_node *prio_tree_left( 316 struct prio_tree_root *root, struct prio_tree_iter *iter, 317 unsigned long radix_index, unsigned long heap_index, 318 unsigned long *r_index, unsigned long *h_index) 319 { 320 if (prio_tree_left_empty(iter->cur)) 321 return NULL; 322 323 GET_INDEX(iter->cur->left, *r_index, *h_index); 324 325 if (radix_index <= *h_index) { 326 iter->cur = iter->cur->left; 327 iter->mask >>= 1; 328 if (iter->mask) { 329 if (iter->size_level) 330 iter->size_level++; 331 } else { 332 if (iter->size_level) { 333 BUG_ON(!prio_tree_left_empty(iter->cur)); 334 BUG_ON(!prio_tree_right_empty(iter->cur)); 335 iter->size_level++; 336 iter->mask = ULONG_MAX; 337 } else { 338 iter->size_level = 1; 339 iter->mask = 1UL << (root->index_bits - 1); 340 } 341 } 342 return iter->cur; 343 } 344 345 return NULL; 346 } 347 348 static struct prio_tree_node *prio_tree_right( 349 struct prio_tree_root *root, struct prio_tree_iter *iter, 350 unsigned long radix_index, unsigned long heap_index, 351 unsigned long *r_index, unsigned long *h_index) 352 { 353 unsigned long value; 354 355 if (prio_tree_right_empty(iter->cur)) 356 return NULL; 357 358 if (iter->size_level) 359 value = iter->value; 360 else 361 value = iter->value | iter->mask; 362 363 if (heap_index < value) 364 return NULL; 365 366 GET_INDEX(iter->cur->right, *r_index, *h_index); 367 368 if (radix_index <= *h_index) { 369 iter->cur = iter->cur->right; 370 iter->mask >>= 1; 371 iter->value = value; 372 if (iter->mask) { 373 if (iter->size_level) 374 iter->size_level++; 375 } else { 376 if (iter->size_level) { 377 BUG_ON(!prio_tree_left_empty(iter->cur)); 378 BUG_ON(!prio_tree_right_empty(iter->cur)); 379 iter->size_level++; 380 iter->mask = ULONG_MAX; 381 } else { 382 iter->size_level = 1; 383 iter->mask = 1UL << (root->index_bits - 1); 384 } 385 } 386 return iter->cur; 387 } 388 389 return NULL; 390 } 391 392 static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter) 393 { 394 iter->cur = iter->cur->parent; 395 if (iter->mask == ULONG_MAX) 396 iter->mask = 1UL; 397 else if (iter->size_level == 1) 398 iter->mask = 1UL; 399 else 400 iter->mask <<= 1; 401 if (iter->size_level) 402 iter->size_level--; 403 if (!iter->size_level && (iter->value & iter->mask)) 404 iter->value ^= iter->mask; 405 return iter->cur; 406 } 407 408 static inline int overlap(unsigned long radix_index, unsigned long heap_index, 409 unsigned long r_index, unsigned long h_index) 410 { 411 return heap_index >= r_index && radix_index <= h_index; 412 } 413 414 /* 415 * prio_tree_first: 416 * 417 * Get the first prio_tree_node that overlaps with the interval [radix_index, 418 * heap_index]. Note that always radix_index <= heap_index. We do a pre-order 419 * traversal of the tree. 420 */ 421 static struct prio_tree_node *prio_tree_first(struct prio_tree_root *root, 422 struct prio_tree_iter *iter, unsigned long radix_index, 423 unsigned long heap_index) 424 { 425 unsigned long r_index, h_index; 426 427 INIT_PRIO_TREE_ITER(iter); 428 429 if (prio_tree_empty(root)) 430 return NULL; 431 432 GET_INDEX(root->prio_tree_node, r_index, h_index); 433 434 if (radix_index > h_index) 435 return NULL; 436 437 iter->mask = 1UL << (root->index_bits - 1); 438 iter->cur = root->prio_tree_node; 439 440 while (1) { 441 if (overlap(radix_index, heap_index, r_index, h_index)) 442 return iter->cur; 443 444 if (prio_tree_left(root, iter, radix_index, heap_index, 445 &r_index, &h_index)) 446 continue; 447 448 if (prio_tree_right(root, iter, radix_index, heap_index, 449 &r_index, &h_index)) 450 continue; 451 452 break; 453 } 454 return NULL; 455 } 456 457 /* 458 * prio_tree_next: 459 * 460 * Get the next prio_tree_node that overlaps with the input interval in iter 461 */ 462 static struct prio_tree_node *prio_tree_next(struct prio_tree_root *root, 463 struct prio_tree_iter *iter, unsigned long radix_index, 464 unsigned long heap_index) 465 { 466 unsigned long r_index, h_index; 467 468 repeat: 469 while (prio_tree_left(root, iter, radix_index, 470 heap_index, &r_index, &h_index)) { 471 if (overlap(radix_index, heap_index, r_index, h_index)) 472 return iter->cur; 473 } 474 475 while (!prio_tree_right(root, iter, radix_index, 476 heap_index, &r_index, &h_index)) { 477 while (!prio_tree_root(iter->cur) && 478 iter->cur->parent->right == iter->cur) 479 prio_tree_parent(iter); 480 481 if (prio_tree_root(iter->cur)) 482 return NULL; 483 484 prio_tree_parent(iter); 485 } 486 487 if (overlap(radix_index, heap_index, r_index, h_index)) 488 return iter->cur; 489 490 goto repeat; 491 } 492 493 /* 494 * Radix priority search tree for address_space->i_mmap 495 * 496 * For each vma that map a unique set of file pages i.e., unique [radix_index, 497 * heap_index] value, we have a corresponing priority search tree node. If 498 * multiple vmas have identical [radix_index, heap_index] value, then one of 499 * them is used as a tree node and others are stored in a vm_set list. The tree 500 * node points to the first vma (head) of the list using vm_set.head. 501 * 502 * prio_tree_root 503 * | 504 * A vm_set.head 505 * / \ / 506 * L R -> H-I-J-K-M-N-O-P-Q-S 507 * ^ ^ <-- vm_set.list --> 508 * tree nodes 509 * 510 * We need some way to identify whether a vma is a tree node, head of a vm_set 511 * list, or just a member of a vm_set list. We cannot use vm_flags to store 512 * such information. The reason is, in the above figure, it is possible that 513 * vm_flags' of R and H are covered by the different mmap_sems. When R is 514 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold 515 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. 516 * That's why some trick involving shared.vm_set.parent is used for identifying 517 * tree nodes and list head nodes. 518 * 519 * vma radix priority search tree node rules: 520 * 521 * vma->shared.vm_set.parent != NULL ==> a tree node 522 * vma->shared.vm_set.head != NULL ==> list of others mapping same range 523 * vma->shared.vm_set.head == NULL ==> no others map the same range 524 * 525 * vma->shared.vm_set.parent == NULL 526 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range 527 * vma->shared.vm_set.head == NULL ==> a list node 528 */ 529 530 /* 531 * Add a new vma known to map the same set of pages as the old vma: 532 * useful for fork's dup_mmap as well as vma_prio_tree_insert below. 533 * Note that it just happens to work correctly on i_mmap_nonlinear too. 534 */ 535 void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) 536 { 537 /* Leave these BUG_ONs till prio_tree patch stabilizes */ 538 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); 539 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); 540 541 if (!old->shared.vm_set.parent) 542 list_add(&vma->shared.vm_set.list, 543 &old->shared.vm_set.list); 544 else if (old->shared.vm_set.head) 545 list_add_tail(&vma->shared.vm_set.list, 546 &old->shared.vm_set.head->shared.vm_set.list); 547 else { 548 INIT_LIST_HEAD(&vma->shared.vm_set.list); 549 vma->shared.vm_set.head = old; 550 old->shared.vm_set.head = vma; 551 } 552 } 553 554 void vma_prio_tree_insert(struct vm_area_struct *vma, 555 struct prio_tree_root *root) 556 { 557 struct prio_tree_node *ptr; 558 struct vm_area_struct *old; 559 560 ptr = prio_tree_insert(root, &vma->shared.prio_tree_node); 561 if (ptr != &vma->shared.prio_tree_node) { 562 old = prio_tree_entry(ptr, struct vm_area_struct, 563 shared.prio_tree_node); 564 vma_prio_tree_add(vma, old); 565 } 566 } 567 568 void vma_prio_tree_remove(struct vm_area_struct *vma, 569 struct prio_tree_root *root) 570 { 571 struct vm_area_struct *node, *head, *new_head; 572 573 if (!vma->shared.vm_set.head) { 574 if (!vma->shared.vm_set.parent) 575 list_del_init(&vma->shared.vm_set.list); 576 else 577 prio_tree_remove(root, &vma->shared.prio_tree_node); 578 } else { 579 /* Leave this BUG_ON till prio_tree patch stabilizes */ 580 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); 581 if (vma->shared.vm_set.parent) { 582 head = vma->shared.vm_set.head; 583 if (!list_empty(&head->shared.vm_set.list)) { 584 new_head = list_entry( 585 head->shared.vm_set.list.next, 586 struct vm_area_struct, 587 shared.vm_set.list); 588 list_del_init(&head->shared.vm_set.list); 589 } else 590 new_head = NULL; 591 592 prio_tree_replace(root, &vma->shared.prio_tree_node, 593 &head->shared.prio_tree_node); 594 head->shared.vm_set.head = new_head; 595 if (new_head) 596 new_head->shared.vm_set.head = head; 597 598 } else { 599 node = vma->shared.vm_set.head; 600 if (!list_empty(&vma->shared.vm_set.list)) { 601 new_head = list_entry( 602 vma->shared.vm_set.list.next, 603 struct vm_area_struct, 604 shared.vm_set.list); 605 list_del_init(&vma->shared.vm_set.list); 606 node->shared.vm_set.head = new_head; 607 new_head->shared.vm_set.head = node; 608 } else 609 node->shared.vm_set.head = NULL; 610 } 611 } 612 } 613 614 /* 615 * Helper function to enumerate vmas that map a given file page or a set of 616 * contiguous file pages. The function returns vmas that at least map a single 617 * page in the given range of contiguous file pages. 618 */ 619 struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, 620 struct prio_tree_root *root, struct prio_tree_iter *iter, 621 pgoff_t begin, pgoff_t end) 622 { 623 struct prio_tree_node *ptr; 624 struct vm_area_struct *next; 625 626 if (!vma) { 627 /* 628 * First call is with NULL vma 629 */ 630 ptr = prio_tree_first(root, iter, begin, end); 631 if (ptr) { 632 next = prio_tree_entry(ptr, struct vm_area_struct, 633 shared.prio_tree_node); 634 prefetch(next->shared.vm_set.head); 635 return next; 636 } else 637 return NULL; 638 } 639 640 if (vma->shared.vm_set.parent) { 641 if (vma->shared.vm_set.head) { 642 next = vma->shared.vm_set.head; 643 prefetch(next->shared.vm_set.list.next); 644 return next; 645 } 646 } else { 647 next = list_entry(vma->shared.vm_set.list.next, 648 struct vm_area_struct, shared.vm_set.list); 649 if (!next->shared.vm_set.head) { 650 prefetch(next->shared.vm_set.list.next); 651 return next; 652 } 653 } 654 655 ptr = prio_tree_next(root, iter, begin, end); 656 if (ptr) { 657 next = prio_tree_entry(ptr, struct vm_area_struct, 658 shared.prio_tree_node); 659 prefetch(next->shared.vm_set.head); 660 return next; 661 } else 662 return NULL; 663 } 664
This page was automatically generated by LXR 0.3.1. • Linux is a registered trademark of Linus Torvalds