1 /* 2 * linux/mm/swapfile.c 3 * 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5 * Swap reorganised 29.12.95, Stephen Tweedie 6 */ 7 8 #include <linux/config.h> 9 #include <linux/mm.h> 10 #include <linux/hugetlb.h> 11 #include <linux/mman.h> 12 #include <linux/slab.h> 13 #include <linux/kernel_stat.h> 14 #include <linux/swap.h> 15 #include <linux/vmalloc.h> 16 #include <linux/pagemap.h> 17 #include <linux/namei.h> 18 #include <linux/shm.h> 19 #include <linux/blkdev.h> 20 #include <linux/writeback.h> 21 #include <linux/proc_fs.h> 22 #include <linux/seq_file.h> 23 #include <linux/init.h> 24 #include <linux/module.h> 25 #include <linux/rmap.h> 26 #include <linux/security.h> 27 #include <linux/backing-dev.h> 28 29 #include <asm/pgtable.h> 30 #include <asm/tlbflush.h> 31 #include <linux/swapops.h> 32 33 spinlock_t swaplock = SPIN_LOCK_UNLOCKED; 34 unsigned int nr_swapfiles; 35 long total_swap_pages; 36 static int swap_overflow; 37 38 EXPORT_SYMBOL(total_swap_pages); 39 40 static const char Bad_file[] = "Bad swap file entry "; 41 static const char Unused_file[] = "Unused swap file entry "; 42 static const char Bad_offset[] = "Bad swap offset entry "; 43 static const char Unused_offset[] = "Unused swap offset entry "; 44 45 struct swap_list_t swap_list = {-1, -1}; 46 47 struct swap_info_struct swap_info[MAX_SWAPFILES]; 48 49 static DECLARE_MUTEX(swapon_sem); 50 51 /* 52 * We need this because the bdev->unplug_fn can sleep and we cannot 53 * hold swap_list_lock while calling the unplug_fn. And swap_list_lock 54 * cannot be turned into a semaphore. 55 */ 56 static DECLARE_RWSEM(swap_unplug_sem); 57 58 #define SWAPFILE_CLUSTER 256 59 60 void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) 61 { 62 swp_entry_t entry; 63 64 down_read(&swap_unplug_sem); 65 entry.val = page->private; 66 if (PageSwapCache(page)) { 67 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 68 struct backing_dev_info *bdi; 69 70 /* 71 * If the page is removed from swapcache from under us (with a 72 * racy try_to_unuse/swapoff) we need an additional reference 73 * count to avoid reading garbage from page->private above. If 74 * the WARN_ON triggers during a swapoff it maybe the race 75 * condition and it's harmless. However if it triggers without 76 * swapoff it signals a problem. 77 */ 78 WARN_ON(page_count(page) <= 1); 79 80 bdi = bdev->bd_inode->i_mapping->backing_dev_info; 81 bdi->unplug_io_fn(bdi, page); 82 } 83 up_read(&swap_unplug_sem); 84 } 85 86 static inline int scan_swap_map(struct swap_info_struct *si) 87 { 88 unsigned long offset; 89 /* 90 * We try to cluster swap pages by allocating them 91 * sequentially in swap. Once we've allocated 92 * SWAPFILE_CLUSTER pages this way, however, we resort to 93 * first-free allocation, starting a new cluster. This 94 * prevents us from scattering swap pages all over the entire 95 * swap partition, so that we reduce overall disk seek times 96 * between swap pages. -- sct */ 97 if (si->cluster_nr) { 98 while (si->cluster_next <= si->highest_bit) { 99 offset = si->cluster_next++; 100 if (si->swap_map[offset]) 101 continue; 102 si->cluster_nr--; 103 goto got_page; 104 } 105 } 106 si->cluster_nr = SWAPFILE_CLUSTER; 107 108 /* try to find an empty (even not aligned) cluster. */ 109 offset = si->lowest_bit; 110 check_next_cluster: 111 if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) 112 { 113 unsigned long nr; 114 for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) 115 if (si->swap_map[nr]) 116 { 117 offset = nr+1; 118 goto check_next_cluster; 119 } 120 /* We found a completly empty cluster, so start 121 * using it. 122 */ 123 goto got_page; 124 } 125 /* No luck, so now go finegrined as usual. -Andrea */ 126 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { 127 if (si->swap_map[offset]) 128 continue; 129 si->lowest_bit = offset+1; 130 got_page: 131 if (offset == si->lowest_bit) 132 si->lowest_bit++; 133 if (offset == si->highest_bit) 134 si->highest_bit--; 135 if (si->lowest_bit > si->highest_bit) { 136 si->lowest_bit = si->max; 137 si->highest_bit = 0; 138 } 139 si->swap_map[offset] = 1; 140 si->inuse_pages++; 141 nr_swap_pages--; 142 si->cluster_next = offset+1; 143 return offset; 144 } 145 si->lowest_bit = si->max; 146 si->highest_bit = 0; 147 return 0; 148 } 149 150 swp_entry_t get_swap_page(void) 151 { 152 struct swap_info_struct * p; 153 unsigned long offset; 154 swp_entry_t entry; 155 int type, wrapped = 0; 156 157 entry.val = 0; /* Out of memory */ 158 swap_list_lock(); 159 type = swap_list.next; 160 if (type < 0) 161 goto out; 162 if (nr_swap_pages <= 0) 163 goto out; 164 165 while (1) { 166 p = &swap_info[type]; 167 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 168 swap_device_lock(p); 169 offset = scan_swap_map(p); 170 swap_device_unlock(p); 171 if (offset) { 172 entry = swp_entry(type,offset); 173 type = swap_info[type].next; 174 if (type < 0 || 175 p->prio != swap_info[type].prio) { 176 swap_list.next = swap_list.head; 177 } else { 178 swap_list.next = type; 179 } 180 goto out; 181 } 182 } 183 type = p->next; 184 if (!wrapped) { 185 if (type < 0 || p->prio != swap_info[type].prio) { 186 type = swap_list.head; 187 wrapped = 1; 188 } 189 } else 190 if (type < 0) 191 goto out; /* out of swap space */ 192 } 193 out: 194 swap_list_unlock(); 195 return entry; 196 } 197 198 static struct swap_info_struct * swap_info_get(swp_entry_t entry) 199 { 200 struct swap_info_struct * p; 201 unsigned long offset, type; 202 203 if (!entry.val) 204 goto out; 205 type = swp_type(entry); 206 if (type >= nr_swapfiles) 207 goto bad_nofile; 208 p = & swap_info[type]; 209 if (!(p->flags & SWP_USED)) 210 goto bad_device; 211 offset = swp_offset(entry); 212 if (offset >= p->max) 213 goto bad_offset; 214 if (!p->swap_map[offset]) 215 goto bad_free; 216 swap_list_lock(); 217 if (p->prio > swap_info[swap_list.next].prio) 218 swap_list.next = type; 219 swap_device_lock(p); 220 return p; 221 222 bad_free: 223 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 224 goto out; 225 bad_offset: 226 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 227 goto out; 228 bad_device: 229 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 230 goto out; 231 bad_nofile: 232 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 233 out: 234 return NULL; 235 } 236 237 static void swap_info_put(struct swap_info_struct * p) 238 { 239 swap_device_unlock(p); 240 swap_list_unlock(); 241 } 242 243 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 244 { 245 int count = p->swap_map[offset]; 246 247 if (count < SWAP_MAP_MAX) { 248 count--; 249 p->swap_map[offset] = count; 250 if (!count) { 251 if (offset < p->lowest_bit) 252 p->lowest_bit = offset; 253 if (offset > p->highest_bit) 254 p->highest_bit = offset; 255 nr_swap_pages++; 256 p->inuse_pages--; 257 } 258 } 259 return count; 260 } 261 262 /* 263 * Caller has made sure that the swapdevice corresponding to entry 264 * is still around or has not been recycled. 265 */ 266 void swap_free(swp_entry_t entry) 267 { 268 struct swap_info_struct * p; 269 270 p = swap_info_get(entry); 271 if (p) { 272 swap_entry_free(p, swp_offset(entry)); 273 swap_info_put(p); 274 } 275 } 276 277 /* 278 * Check if we're the only user of a swap page, 279 * when the page is locked. 280 */ 281 static int exclusive_swap_page(struct page *page) 282 { 283 int retval = 0; 284 struct swap_info_struct * p; 285 swp_entry_t entry; 286 287 entry.val = page->private; 288 p = swap_info_get(entry); 289 if (p) { 290 /* Is the only swap cache user the cache itself? */ 291 if (p->swap_map[swp_offset(entry)] == 1) { 292 /* Recheck the page count with the swapcache lock held.. */ 293 spin_lock_irq(&swapper_space.tree_lock); 294 if (page_count(page) == 2) 295 retval = 1; 296 spin_unlock_irq(&swapper_space.tree_lock); 297 } 298 swap_info_put(p); 299 } 300 return retval; 301 } 302 303 /* 304 * We can use this swap cache entry directly 305 * if there are no other references to it. 306 * 307 * Here "exclusive_swap_page()" does the real 308 * work, but we opportunistically check whether 309 * we need to get all the locks first.. 310 */ 311 int can_share_swap_page(struct page *page) 312 { 313 int retval = 0; 314 315 if (!PageLocked(page)) 316 BUG(); 317 switch (page_count(page)) { 318 case 3: 319 if (!PagePrivate(page)) 320 break; 321 /* Fallthrough */ 322 case 2: 323 if (!PageSwapCache(page)) 324 break; 325 retval = exclusive_swap_page(page); 326 break; 327 case 1: 328 if (PageReserved(page)) 329 break; 330 retval = 1; 331 } 332 return retval; 333 } 334 335 /* 336 * Work out if there are any other processes sharing this 337 * swap cache page. Free it if you can. Return success. 338 */ 339 int remove_exclusive_swap_page(struct page *page) 340 { 341 int retval; 342 struct swap_info_struct * p; 343 swp_entry_t entry; 344 345 BUG_ON(PagePrivate(page)); 346 BUG_ON(!PageLocked(page)); 347 348 if (!PageSwapCache(page)) 349 return 0; 350 if (PageWriteback(page)) 351 return 0; 352 if (page_count(page) != 2) /* 2: us + cache */ 353 return 0; 354 355 entry.val = page->private; 356 p = swap_info_get(entry); 357 if (!p) 358 return 0; 359 360 /* Is the only swap cache user the cache itself? */ 361 retval = 0; 362 if (p->swap_map[swp_offset(entry)] == 1) { 363 /* Recheck the page count with the swapcache lock held.. */ 364 spin_lock_irq(&swapper_space.tree_lock); 365 if ((page_count(page) == 2) && !PageWriteback(page)) { 366 __delete_from_swap_cache(page); 367 SetPageDirty(page); 368 retval = 1; 369 } 370 spin_unlock_irq(&swapper_space.tree_lock); 371 } 372 swap_info_put(p); 373 374 if (retval) { 375 swap_free(entry); 376 page_cache_release(page); 377 } 378 379 return retval; 380 } 381 382 /* 383 * Free the swap entry like above, but also try to 384 * free the page cache entry if it is the last user. 385 */ 386 void free_swap_and_cache(swp_entry_t entry) 387 { 388 struct swap_info_struct * p; 389 struct page *page = NULL; 390 391 p = swap_info_get(entry); 392 if (p) { 393 if (swap_entry_free(p, swp_offset(entry)) == 1) { 394 spin_lock_irq(&swapper_space.tree_lock); 395 page = radix_tree_lookup(&swapper_space.page_tree, 396 entry.val); 397 if (page && TestSetPageLocked(page)) 398 page = NULL; 399 spin_unlock_irq(&swapper_space.tree_lock); 400 } 401 swap_info_put(p); 402 } 403 if (page) { 404 int one_user; 405 406 BUG_ON(PagePrivate(page)); 407 page_cache_get(page); 408 one_user = (page_count(page) == 2); 409 /* Only cache user (+us), or swap space full? Free it! */ 410 if (!PageWriteback(page) && (one_user || vm_swap_full())) { 411 delete_from_swap_cache(page); 412 SetPageDirty(page); 413 } 414 unlock_page(page); 415 page_cache_release(page); 416 } 417 } 418 419 /* 420 * The swap entry has been read in advance, and we return 1 to indicate 421 * that the page has been used or is no longer needed. 422 * 423 * Always set the resulting pte to be nowrite (the same as COW pages 424 * after one process has exited). We don't know just how many PTEs will 425 * share this swap entry, so be cautious and let do_wp_page work out 426 * what to do if a write is requested later. 427 */ 428 /* vma->vm_mm->page_table_lock is held */ 429 static void 430 unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, 431 swp_entry_t entry, struct page *page) 432 { 433 vma->vm_mm->rss++; 434 get_page(page); 435 set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); 436 page_add_anon_rmap(page, vma, address); 437 swap_free(entry); 438 } 439 440 /* vma->vm_mm->page_table_lock is held */ 441 static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, 442 unsigned long address, unsigned long size, unsigned long offset, 443 swp_entry_t entry, struct page *page) 444 { 445 pte_t * pte; 446 unsigned long end; 447 pte_t swp_pte = swp_entry_to_pte(entry); 448 449 if (pmd_none(*dir)) 450 return 0; 451 if (pmd_bad(*dir)) { 452 pmd_ERROR(*dir); 453 pmd_clear(dir); 454 return 0; 455 } 456 pte = pte_offset_map(dir, address); 457 offset += address & PMD_MASK; 458 address &= ~PMD_MASK; 459 end = address + size; 460 if (end > PMD_SIZE) 461 end = PMD_SIZE; 462 do { 463 /* 464 * swapoff spends a _lot_ of time in this loop! 465 * Test inline before going to call unuse_pte. 466 */ 467 if (unlikely(pte_same(*pte, swp_pte))) { 468 unuse_pte(vma, offset + address, pte, entry, page); 469 pte_unmap(pte); 470 471 /* 472 * Move the page to the active list so it is not 473 * immediately swapped out again after swapon. 474 */ 475 activate_page(page); 476 477 /* add 1 since address may be 0 */ 478 return 1 + offset + address; 479 } 480 address += PAGE_SIZE; 481 pte++; 482 } while (address && (address < end)); 483 pte_unmap(pte - 1); 484 return 0; 485 } 486 487 /* vma->vm_mm->page_table_lock is held */ 488 static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, 489 unsigned long address, unsigned long size, 490 swp_entry_t entry, struct page *page) 491 { 492 pmd_t * pmd; 493 unsigned long offset, end; 494 unsigned long foundaddr; 495 496 if (pgd_none(*dir)) 497 return 0; 498 if (pgd_bad(*dir)) { 499 pgd_ERROR(*dir); 500 pgd_clear(dir); 501 return 0; 502 } 503 pmd = pmd_offset(dir, address); 504 offset = address & PGDIR_MASK; 505 address &= ~PGDIR_MASK; 506 end = address + size; 507 if (end > PGDIR_SIZE) 508 end = PGDIR_SIZE; 509 if (address >= end) 510 BUG(); 511 do { 512 foundaddr = unuse_pmd(vma, pmd, address, end - address, 513 offset, entry, page); 514 if (foundaddr) 515 return foundaddr; 516 address = (address + PMD_SIZE) & PMD_MASK; 517 pmd++; 518 } while (address && (address < end)); 519 return 0; 520 } 521 522 /* vma->vm_mm->page_table_lock is held */ 523 static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, 524 swp_entry_t entry, struct page *page) 525 { 526 unsigned long start = vma->vm_start, end = vma->vm_end; 527 unsigned long foundaddr; 528 529 if (start >= end) 530 BUG(); 531 do { 532 foundaddr = unuse_pgd(vma, pgdir, start, end - start, 533 entry, page); 534 if (foundaddr) 535 return foundaddr; 536 start = (start + PGDIR_SIZE) & PGDIR_MASK; 537 pgdir++; 538 } while (start && (start < end)); 539 return 0; 540 } 541 542 static int unuse_process(struct mm_struct * mm, 543 swp_entry_t entry, struct page* page) 544 { 545 struct vm_area_struct* vma; 546 unsigned long foundaddr = 0; 547 548 /* 549 * Go through process' page directory. 550 */ 551 if (!down_read_trylock(&mm->mmap_sem)) { 552 /* 553 * Our reference to the page stops try_to_unmap_one from 554 * unmapping its ptes, so swapoff can make progress. 555 */ 556 unlock_page(page); 557 down_read(&mm->mmap_sem); 558 lock_page(page); 559 } 560 spin_lock(&mm->page_table_lock); 561 for (vma = mm->mmap; vma; vma = vma->vm_next) { 562 if (!is_vm_hugetlb_page(vma)) { 563 pgd_t * pgd = pgd_offset(mm, vma->vm_start); 564 foundaddr = unuse_vma(vma, pgd, entry, page); 565 if (foundaddr) 566 break; 567 } 568 } 569 spin_unlock(&mm->page_table_lock); 570 up_read(&mm->mmap_sem); 571 /* 572 * Currently unuse_process cannot fail, but leave error handling 573 * at call sites for now, since we change it from time to time. 574 */ 575 return 0; 576 } 577 578 /* 579 * Scan swap_map from current position to next entry still in use. 580 * Recycle to start on reaching the end, returning 0 when empty. 581 */ 582 static int find_next_to_unuse(struct swap_info_struct *si, int prev) 583 { 584 int max = si->max; 585 int i = prev; 586 int count; 587 588 /* 589 * No need for swap_device_lock(si) here: we're just looking 590 * for whether an entry is in use, not modifying it; false 591 * hits are okay, and sys_swapoff() has already prevented new 592 * allocations from this area (while holding swap_list_lock()). 593 */ 594 for (;;) { 595 if (++i >= max) { 596 if (!prev) { 597 i = 0; 598 break; 599 } 600 /* 601 * No entries in use at top of swap_map, 602 * loop back to start and recheck there. 603 */ 604 max = prev + 1; 605 prev = 0; 606 i = 1; 607 } 608 count = si->swap_map[i]; 609 if (count && count != SWAP_MAP_BAD) 610 break; 611 } 612 return i; 613 } 614 615 /* 616 * We completely avoid races by reading each swap page in advance, 617 * and then search for the process using it. All the necessary 618 * page table adjustments can then be made atomically. 619 */ 620 static int try_to_unuse(unsigned int type) 621 { 622 struct swap_info_struct * si = &swap_info[type]; 623 struct mm_struct *start_mm; 624 unsigned short *swap_map; 625 unsigned short swcount; 626 struct page *page; 627 swp_entry_t entry; 628 int i = 0; 629 int retval = 0; 630 int reset_overflow = 0; 631 int shmem; 632 633 /* 634 * When searching mms for an entry, a good strategy is to 635 * start at the first mm we freed the previous entry from 636 * (though actually we don't notice whether we or coincidence 637 * freed the entry). Initialize this start_mm with a hold. 638 * 639 * A simpler strategy would be to start at the last mm we 640 * freed the previous entry from; but that would take less 641 * advantage of mmlist ordering (now preserved by swap_out()), 642 * which clusters forked address spaces together, most recent 643 * child immediately after parent. If we race with dup_mmap(), 644 * we very much want to resolve parent before child, otherwise 645 * we may miss some entries: using last mm would invert that. 646 */ 647 start_mm = &init_mm; 648 atomic_inc(&init_mm.mm_users); 649 650 /* 651 * Keep on scanning until all entries have gone. Usually, 652 * one pass through swap_map is enough, but not necessarily: 653 * mmput() removes mm from mmlist before exit_mmap() and its 654 * zap_page_range(). That's not too bad, those entries are 655 * on their way out, and handled faster there than here. 656 * do_munmap() behaves similarly, taking the range out of mm's 657 * vma list before zap_page_range(). But unfortunately, when 658 * unmapping a part of a vma, it takes the whole out first, 659 * then reinserts what's left after (might even reschedule if 660 * open() method called) - so swap entries may be invisible 661 * to swapoff for a while, then reappear - but that is rare. 662 */ 663 while ((i = find_next_to_unuse(si, i)) != 0) { 664 if (signal_pending(current)) { 665 retval = -EINTR; 666 break; 667 } 668 669 /* 670 * Get a page for the entry, using the existing swap 671 * cache page if there is one. Otherwise, get a clean 672 * page and read the swap into it. 673 */ 674 swap_map = &si->swap_map[i]; 675 entry = swp_entry(type, i); 676 page = read_swap_cache_async(entry, NULL, 0); 677 if (!page) { 678 /* 679 * Either swap_duplicate() failed because entry 680 * has been freed independently, and will not be 681 * reused since sys_swapoff() already disabled 682 * allocation from here, or alloc_page() failed. 683 */ 684 if (!*swap_map) 685 continue; 686 retval = -ENOMEM; 687 break; 688 } 689 690 /* 691 * Don't hold on to start_mm if it looks like exiting. 692 */ 693 if (atomic_read(&start_mm->mm_users) == 1) { 694 mmput(start_mm); 695 start_mm = &init_mm; 696 atomic_inc(&init_mm.mm_users); 697 } 698 699 /* 700 * Wait for and lock page. When do_swap_page races with 701 * try_to_unuse, do_swap_page can handle the fault much 702 * faster than try_to_unuse can locate the entry. This 703 * apparently redundant "wait_on_page_locked" lets try_to_unuse 704 * defer to do_swap_page in such a case - in some tests, 705 * do_swap_page and try_to_unuse repeatedly compete. 706 */ 707 wait_on_page_locked(page); 708 wait_on_page_writeback(page); 709 lock_page(page); 710 wait_on_page_writeback(page); 711 712 /* 713 * Remove all references to entry, without blocking. 714 * Whenever we reach init_mm, there's no address space 715 * to search, but use it as a reminder to search shmem. 716 */ 717 shmem = 0; 718 swcount = *swap_map; 719 if (swcount > 1) { 720 if (start_mm == &init_mm) 721 shmem = shmem_unuse(entry, page); 722 else 723 retval = unuse_process(start_mm, entry, page); 724 } 725 if (*swap_map > 1) { 726 int set_start_mm = (*swap_map >= swcount); 727 struct list_head *p = &start_mm->mmlist; 728 struct mm_struct *new_start_mm = start_mm; 729 struct mm_struct *prev_mm = start_mm; 730 struct mm_struct *mm; 731 732 atomic_inc(&new_start_mm->mm_users); 733 atomic_inc(&prev_mm->mm_users); 734 spin_lock(&mmlist_lock); 735 while (*swap_map > 1 && !retval && 736 (p = p->next) != &start_mm->mmlist) { 737 mm = list_entry(p, struct mm_struct, mmlist); 738 atomic_inc(&mm->mm_users); 739 spin_unlock(&mmlist_lock); 740 mmput(prev_mm); 741 prev_mm = mm; 742 743 cond_resched(); 744 745 swcount = *swap_map; 746 if (swcount <= 1) 747 ; 748 else if (mm == &init_mm) { 749 set_start_mm = 1; 750 shmem = shmem_unuse(entry, page); 751 } else 752 retval = unuse_process(mm, entry, page); 753 if (set_start_mm && *swap_map < swcount) { 754 mmput(new_start_mm); 755 atomic_inc(&mm->mm_users); 756 new_start_mm = mm; 757 set_start_mm = 0; 758 } 759 spin_lock(&mmlist_lock); 760 } 761 spin_unlock(&mmlist_lock); 762 mmput(prev_mm); 763 mmput(start_mm); 764 start_mm = new_start_mm; 765 } 766 if (retval) { 767 unlock_page(page); 768 page_cache_release(page); 769 break; 770 } 771 772 /* 773 * How could swap count reach 0x7fff when the maximum 774 * pid is 0x7fff, and there's no way to repeat a swap 775 * page within an mm (except in shmem, where it's the 776 * shared object which takes the reference count)? 777 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 778 * 779 * If that's wrong, then we should worry more about 780 * exit_mmap() and do_munmap() cases described above: 781 * we might be resetting SWAP_MAP_MAX too early here. 782 * We know "Undead"s can happen, they're okay, so don't 783 * report them; but do report if we reset SWAP_MAP_MAX. 784 */ 785 if (*swap_map == SWAP_MAP_MAX) { 786 swap_device_lock(si); 787 *swap_map = 1; 788 swap_device_unlock(si); 789 reset_overflow = 1; 790 } 791 792 /* 793 * If a reference remains (rare), we would like to leave 794 * the page in the swap cache; but try_to_unmap could 795 * then re-duplicate the entry once we drop page lock, 796 * so we might loop indefinitely; also, that page could 797 * not be swapped out to other storage meanwhile. So: 798 * delete from cache even if there's another reference, 799 * after ensuring that the data has been saved to disk - 800 * since if the reference remains (rarer), it will be 801 * read from disk into another page. Splitting into two 802 * pages would be incorrect if swap supported "shared 803 * private" pages, but they are handled by tmpfs files. 804 * 805 * Note shmem_unuse already deleted a swappage from 806 * the swap cache, unless the move to filepage failed: 807 * in which case it left swappage in cache, lowered its 808 * swap count to pass quickly through the loops above, 809 * and now we must reincrement count to try again later. 810 */ 811 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 812 struct writeback_control wbc = { 813 .sync_mode = WB_SYNC_NONE, 814 }; 815 816 swap_writepage(page, &wbc); 817 lock_page(page); 818 wait_on_page_writeback(page); 819 } 820 if (PageSwapCache(page)) { 821 if (shmem) 822 swap_duplicate(entry); 823 else 824 delete_from_swap_cache(page); 825 } 826 827 /* 828 * So we could skip searching mms once swap count went 829 * to 1, we did not mark any present ptes as dirty: must 830 * mark page dirty so shrink_list will preserve it. 831 */ 832 SetPageDirty(page); 833 unlock_page(page); 834 page_cache_release(page); 835 836 /* 837 * Make sure that we aren't completely killing 838 * interactive performance. 839 */ 840 cond_resched(); 841 } 842 843 mmput(start_mm); 844 if (reset_overflow) { 845 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); 846 swap_overflow = 0; 847 } 848 return retval; 849 } 850 851 /* 852 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 853 * corresponds to page offset `offset'. 854 */ 855 sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 856 { 857 struct swap_extent *se = sis->curr_swap_extent; 858 struct swap_extent *start_se = se; 859 860 for ( ; ; ) { 861 struct list_head *lh; 862 863 if (se->start_page <= offset && 864 offset < (se->start_page + se->nr_pages)) { 865 return se->start_block + (offset - se->start_page); 866 } 867 lh = se->list.prev; 868 if (lh == &sis->extent_list) 869 lh = lh->prev; 870 se = list_entry(lh, struct swap_extent, list); 871 sis->curr_swap_extent = se; 872 BUG_ON(se == start_se); /* It *must* be present */ 873 } 874 } 875 876 /* 877 * Free all of a swapdev's extent information 878 */ 879 static void destroy_swap_extents(struct swap_info_struct *sis) 880 { 881 while (!list_empty(&sis->extent_list)) { 882 struct swap_extent *se; 883 884 se = list_entry(sis->extent_list.next, 885 struct swap_extent, list); 886 list_del(&se->list); 887 kfree(se); 888 } 889 sis->nr_extents = 0; 890 } 891 892 /* 893 * Add a block range (and the corresponding page range) into this swapdev's 894 * extent list. The extent list is kept sorted in block order. 895 * 896 * This function rather assumes that it is called in ascending sector_t order. 897 * It doesn't look for extent coalescing opportunities. 898 */ 899 static int 900 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 901 unsigned long nr_pages, sector_t start_block) 902 { 903 struct swap_extent *se; 904 struct swap_extent *new_se; 905 struct list_head *lh; 906 907 lh = sis->extent_list.next; /* The highest-addressed block */ 908 while (lh != &sis->extent_list) { 909 se = list_entry(lh, struct swap_extent, list); 910 if (se->start_block + se->nr_pages == start_block && 911 se->start_page + se->nr_pages == start_page) { 912 /* Merge it */ 913 se->nr_pages += nr_pages; 914 return 0; 915 } 916 lh = lh->next; 917 } 918 919 /* 920 * No merge. Insert a new extent, preserving ordering. 921 */ 922 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 923 if (new_se == NULL) 924 return -ENOMEM; 925 new_se->start_page = start_page; 926 new_se->nr_pages = nr_pages; 927 new_se->start_block = start_block; 928 929 lh = sis->extent_list.prev; /* The lowest block */ 930 while (lh != &sis->extent_list) { 931 se = list_entry(lh, struct swap_extent, list); 932 if (se->start_block > start_block) 933 break; 934 lh = lh->prev; 935 } 936 list_add_tail(&new_se->list, lh); 937 sis->nr_extents++; 938 return 0; 939 } 940 941 /* 942 * A `swap extent' is a simple thing which maps a contiguous range of pages 943 * onto a contiguous range of disk blocks. An ordered list of swap extents 944 * is built at swapon time and is then used at swap_writepage/swap_readpage 945 * time for locating where on disk a page belongs. 946 * 947 * If the swapfile is an S_ISBLK block device, a single extent is installed. 948 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 949 * swap files identically. 950 * 951 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 952 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 953 * swapfiles are handled *identically* after swapon time. 954 * 955 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 956 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 957 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 958 * requirements, they are simply tossed out - we will never use those blocks 959 * for swapping. 960 * 961 * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This 962 * prevents root from shooting her foot off by ftruncating an in-use swapfile, 963 * which will scribble on the fs. 964 * 965 * The amount of disk space which a single swap extent represents varies. 966 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 967 * extents in the list. To avoid much list walking, we cache the previous 968 * search location in `curr_swap_extent', and start new searches from there. 969 * This is extremely effective. The average number of iterations in 970 * map_swap_page() has been measured at about 0.3 per page. - akpm. 971 */ 972 static int setup_swap_extents(struct swap_info_struct *sis) 973 { 974 struct inode *inode; 975 unsigned blocks_per_page; 976 unsigned long page_no; 977 unsigned blkbits; 978 sector_t probe_block; 979 sector_t last_block; 980 int ret; 981 982 inode = sis->swap_file->f_mapping->host; 983 if (S_ISBLK(inode->i_mode)) { 984 ret = add_swap_extent(sis, 0, sis->max, 0); 985 goto done; 986 } 987 988 blkbits = inode->i_blkbits; 989 blocks_per_page = PAGE_SIZE >> blkbits; 990 991 /* 992 * Map all the blocks into the extent list. This code doesn't try 993 * to be very smart. 994 */ 995 probe_block = 0; 996 page_no = 0; 997 last_block = i_size_read(inode) >> blkbits; 998 while ((probe_block + blocks_per_page) <= last_block && 999 page_no < sis->max) { 1000 unsigned block_in_page; 1001 sector_t first_block; 1002 1003 first_block = bmap(inode, probe_block); 1004 if (first_block == 0) 1005 goto bad_bmap; 1006 1007 /* 1008 * It must be PAGE_SIZE aligned on-disk 1009 */ 1010 if (first_block & (blocks_per_page - 1)) { 1011 probe_block++; 1012 goto reprobe; 1013 } 1014 1015 for (block_in_page = 1; block_in_page < blocks_per_page; 1016 block_in_page++) { 1017 sector_t block; 1018 1019 block = bmap(inode, probe_block + block_in_page); 1020 if (block == 0) 1021 goto bad_bmap; 1022 if (block != first_block + block_in_page) { 1023 /* Discontiguity */ 1024 probe_block++; 1025 goto reprobe; 1026 } 1027 } 1028 1029 /* 1030 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks 1031 */ 1032 ret = add_swap_extent(sis, page_no, 1, 1033 first_block >> (PAGE_SHIFT - blkbits)); 1034 if (ret) 1035 goto out; 1036 page_no++; 1037 probe_block += blocks_per_page; 1038 reprobe: 1039 continue; 1040 } 1041 ret = 0; 1042 if (page_no == 0) 1043 ret = -EINVAL; 1044 sis->max = page_no; 1045 sis->highest_bit = page_no - 1; 1046 done: 1047 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1048 struct swap_extent, list); 1049 goto out; 1050 bad_bmap: 1051 printk(KERN_ERR "swapon: swapfile has holes\n"); 1052 ret = -EINVAL; 1053 out: 1054 return ret; 1055 } 1056 1057 #if 0 /* We don't need this yet */ 1058 #include <linux/backing-dev.h> 1059 int page_queue_congested(struct page *page) 1060 { 1061 struct backing_dev_info *bdi; 1062 1063 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ 1064 1065 if (PageSwapCache(page)) { 1066 swp_entry_t entry = { .val = page->private }; 1067 struct swap_info_struct *sis; 1068 1069 sis = get_swap_info_struct(swp_type(entry)); 1070 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; 1071 } else 1072 bdi = page->mapping->backing_dev_info; 1073 return bdi_write_congested(bdi); 1074 } 1075 #endif 1076 1077 asmlinkage long sys_swapoff(const char __user * specialfile) 1078 { 1079 struct swap_info_struct * p = NULL; 1080 unsigned short *swap_map; 1081 struct file *swap_file, *victim; 1082 struct address_space *mapping; 1083 struct inode *inode; 1084 char * pathname; 1085 int i, type, prev; 1086 int err; 1087 1088 if (!capable(CAP_SYS_ADMIN)) 1089 return -EPERM; 1090 1091 pathname = getname(specialfile); 1092 err = PTR_ERR(pathname); 1093 if (IS_ERR(pathname)) 1094 goto out; 1095 1096 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1097 putname(pathname); 1098 err = PTR_ERR(victim); 1099 if (IS_ERR(victim)) 1100 goto out; 1101 1102 mapping = victim->f_mapping; 1103 prev = -1; 1104 swap_list_lock(); 1105 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1106 p = swap_info + type; 1107 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1108 if (p->swap_file->f_mapping == mapping) 1109 break; 1110 } 1111 prev = type; 1112 } 1113 if (type < 0) { 1114 err = -EINVAL; 1115 swap_list_unlock(); 1116 goto out_dput; 1117 } 1118 if (!security_vm_enough_memory(p->pages)) 1119 vm_unacct_memory(p->pages); 1120 else { 1121 err = -ENOMEM; 1122 swap_list_unlock(); 1123 goto out_dput; 1124 } 1125 if (prev < 0) { 1126 swap_list.head = p->next; 1127 } else { 1128 swap_info[prev].next = p->next; 1129 } 1130 if (type == swap_list.next) { 1131 /* just pick something that's safe... */ 1132 swap_list.next = swap_list.head; 1133 } 1134 nr_swap_pages -= p->pages; 1135 total_swap_pages -= p->pages; 1136 p->flags &= ~SWP_WRITEOK; 1137 swap_list_unlock(); 1138 current->flags |= PF_SWAPOFF; 1139 err = try_to_unuse(type); 1140 current->flags &= ~PF_SWAPOFF; 1141 1142 /* wait for any unplug function to finish */ 1143 down_write(&swap_unplug_sem); 1144 up_write(&swap_unplug_sem); 1145 1146 if (err) { 1147 /* re-insert swap space back into swap_list */ 1148 swap_list_lock(); 1149 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1150 if (p->prio >= swap_info[i].prio) 1151 break; 1152 p->next = i; 1153 if (prev < 0) 1154 swap_list.head = swap_list.next = p - swap_info; 1155 else 1156 swap_info[prev].next = p - swap_info; 1157 nr_swap_pages += p->pages; 1158 total_swap_pages += p->pages; 1159 p->flags |= SWP_WRITEOK; 1160 swap_list_unlock(); 1161 goto out_dput; 1162 } 1163 down(&swapon_sem); 1164 swap_list_lock(); 1165 swap_device_lock(p); 1166 swap_file = p->swap_file; 1167 p->swap_file = NULL; 1168 p->max = 0; 1169 swap_map = p->swap_map; 1170 p->swap_map = NULL; 1171 p->flags = 0; 1172 destroy_swap_extents(p); 1173 swap_device_unlock(p); 1174 swap_list_unlock(); 1175 up(&swapon_sem); 1176 vfree(swap_map); 1177 inode = mapping->host; 1178 if (S_ISBLK(inode->i_mode)) { 1179 struct block_device *bdev = I_BDEV(inode); 1180 set_blocksize(bdev, p->old_block_size); 1181 bd_release(bdev); 1182 } else { 1183 down(&inode->i_sem); 1184 inode->i_flags &= ~S_SWAPFILE; 1185 up(&inode->i_sem); 1186 } 1187 filp_close(swap_file, NULL); 1188 err = 0; 1189 1190 out_dput: 1191 filp_close(victim, NULL); 1192 out: 1193 return err; 1194 } 1195 1196 #ifdef CONFIG_PROC_FS 1197 /* iterator */ 1198 static void *swap_start(struct seq_file *swap, loff_t *pos) 1199 { 1200 struct swap_info_struct *ptr = swap_info; 1201 int i; 1202 loff_t l = *pos; 1203 1204 down(&swapon_sem); 1205 1206 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1207 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1208 continue; 1209 if (!l--) 1210 return ptr; 1211 } 1212 1213 return NULL; 1214 } 1215 1216 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1217 { 1218 struct swap_info_struct *ptr = v; 1219 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1220 1221 for (++ptr; ptr < endptr; ptr++) { 1222 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1223 continue; 1224 ++*pos; 1225 return ptr; 1226 } 1227 1228 return NULL; 1229 } 1230 1231 static void swap_stop(struct seq_file *swap, void *v) 1232 { 1233 up(&swapon_sem); 1234 } 1235 1236 static int swap_show(struct seq_file *swap, void *v) 1237 { 1238 struct swap_info_struct *ptr = v; 1239 struct file *file; 1240 int len; 1241 1242 if (v == swap_info) 1243 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1244 1245 file = ptr->swap_file; 1246 len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); 1247 seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n", 1248 len < 40 ? 40 - len : 1, " ", 1249 S_ISBLK(file->f_dentry->d_inode->i_mode) ? 1250 "partition" : "file\t", 1251 ptr->pages << (PAGE_SHIFT - 10), 1252 ptr->inuse_pages << (PAGE_SHIFT - 10), 1253 ptr->prio); 1254 return 0; 1255 } 1256 1257 static struct seq_operations swaps_op = { 1258 .start = swap_start, 1259 .next = swap_next, 1260 .stop = swap_stop, 1261 .show = swap_show 1262 }; 1263 1264 static int swaps_open(struct inode *inode, struct file *file) 1265 { 1266 return seq_open(file, &swaps_op); 1267 } 1268 1269 static struct file_operations proc_swaps_operations = { 1270 .open = swaps_open, 1271 .read = seq_read, 1272 .llseek = seq_lseek, 1273 .release = seq_release, 1274 }; 1275 1276 static int __init procswaps_init(void) 1277 { 1278 struct proc_dir_entry *entry; 1279 1280 entry = create_proc_entry("swaps", 0, NULL); 1281 if (entry) 1282 entry->proc_fops = &proc_swaps_operations; 1283 return 0; 1284 } 1285 __initcall(procswaps_init); 1286 #endif /* CONFIG_PROC_FS */ 1287 1288 /* 1289 * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 1290 * 1291 * The swapon system call 1292 */ 1293 asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) 1294 { 1295 struct swap_info_struct * p; 1296 char *name = NULL; 1297 struct block_device *bdev = NULL; 1298 struct file *swap_file = NULL; 1299 struct address_space *mapping; 1300 unsigned int type; 1301 int i, prev; 1302 int error; 1303 static int least_priority; 1304 union swap_header *swap_header = NULL; 1305 int swap_header_version; 1306 int nr_good_pages = 0; 1307 unsigned long maxpages = 1; 1308 int swapfilesize; 1309 unsigned short *swap_map; 1310 struct page *page = NULL; 1311 struct inode *inode = NULL; 1312 int did_down = 0; 1313 1314 if (!capable(CAP_SYS_ADMIN)) 1315 return -EPERM; 1316 swap_list_lock(); 1317 p = swap_info; 1318 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1319 if (!(p->flags & SWP_USED)) 1320 break; 1321 error = -EPERM; 1322 /* 1323 * Test if adding another swap device is possible. There are 1324 * two limiting factors: 1) the number of bits for the swap 1325 * type swp_entry_t definition and 2) the number of bits for 1326 * the swap type in the swap ptes as defined by the different 1327 * architectures. To honor both limitations a swap entry 1328 * with swap offset 0 and swap type ~0UL is created, encoded 1329 * to a swap pte, decoded to a swp_entry_t again and finally 1330 * the swap type part is extracted. This will mask all bits 1331 * from the initial ~0UL that can't be encoded in either the 1332 * swp_entry_t or the architecture definition of a swap pte. 1333 */ 1334 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { 1335 swap_list_unlock(); 1336 goto out; 1337 } 1338 if (type >= nr_swapfiles) 1339 nr_swapfiles = type+1; 1340 INIT_LIST_HEAD(&p->extent_list); 1341 p->flags = SWP_USED; 1342 p->nr_extents = 0; 1343 p->swap_file = NULL; 1344 p->old_block_size = 0; 1345 p->swap_map = NULL; 1346 p->lowest_bit = 0; 1347 p->highest_bit = 0; 1348 p->cluster_nr = 0; 1349 p->inuse_pages = 0; 1350 p->sdev_lock = SPIN_LOCK_UNLOCKED; 1351 p->next = -1; 1352 if (swap_flags & SWAP_FLAG_PREFER) { 1353 p->prio = 1354 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; 1355 } else { 1356 p->prio = --least_priority; 1357 } 1358 swap_list_unlock(); 1359 name = getname(specialfile); 1360 error = PTR_ERR(name); 1361 if (IS_ERR(name)) { 1362 name = NULL; 1363 goto bad_swap_2; 1364 } 1365 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 1366 error = PTR_ERR(swap_file); 1367 if (IS_ERR(swap_file)) { 1368 swap_file = NULL; 1369 goto bad_swap_2; 1370 } 1371 1372 p->swap_file = swap_file; 1373 mapping = swap_file->f_mapping; 1374 inode = mapping->host; 1375 1376 error = -EBUSY; 1377 for (i = 0; i < nr_swapfiles; i++) { 1378 struct swap_info_struct *q = &swap_info[i]; 1379 1380 if (i == type || !q->swap_file) 1381 continue; 1382 if (mapping == q->swap_file->f_mapping) 1383 goto bad_swap; 1384 } 1385 1386 error = -EINVAL; 1387 if (S_ISBLK(inode->i_mode)) { 1388 bdev = I_BDEV(inode); 1389 error = bd_claim(bdev, sys_swapon); 1390 if (error < 0) { 1391 bdev = NULL; 1392 goto bad_swap; 1393 } 1394 p->old_block_size = block_size(bdev); 1395 error = set_blocksize(bdev, PAGE_SIZE); 1396 if (error < 0) 1397 goto bad_swap; 1398 p->bdev = bdev; 1399 } else if (S_ISREG(inode->i_mode)) { 1400 p->bdev = inode->i_sb->s_bdev; 1401 down(&inode->i_sem); 1402 did_down = 1; 1403 if (IS_SWAPFILE(inode)) { 1404 error = -EBUSY; 1405 goto bad_swap; 1406 } 1407 } else { 1408 goto bad_swap; 1409 } 1410 1411 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1412 1413 /* 1414 * Read the swap header. 1415 */ 1416 if (!mapping->a_ops->readpage) { 1417 error = -EINVAL; 1418 goto bad_swap; 1419 } 1420 page = read_cache_page(mapping, 0, 1421 (filler_t *)mapping->a_ops->readpage, swap_file); 1422 if (IS_ERR(page)) { 1423 error = PTR_ERR(page); 1424 goto bad_swap; 1425 } 1426 wait_on_page_locked(page); 1427 if (!PageUptodate(page)) 1428 goto bad_swap; 1429 kmap(page); 1430 swap_header = page_address(page); 1431 1432 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1433 swap_header_version = 1; 1434 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 1435 swap_header_version = 2; 1436 else { 1437 printk("Unable to find swap-space signature\n"); 1438 error = -EINVAL; 1439 goto bad_swap; 1440 } 1441 1442 switch (swap_header_version) { 1443 case 1: 1444 printk(KERN_ERR "version 0 swap is no longer supported. " 1445 "Use mkswap -v1 %s\n", name); 1446 error = -EINVAL; 1447 goto bad_swap; 1448 case 2: 1449 /* Check the swap header's sub-version and the size of 1450 the swap file and bad block lists */ 1451 if (swap_header->info.version != 1) { 1452 printk(KERN_WARNING 1453 "Unable to handle swap header version %d\n", 1454 swap_header->info.version); 1455 error = -EINVAL; 1456 goto bad_swap; 1457 } 1458 1459 p->lowest_bit = 1; 1460 /* 1461 * Find out how many pages are allowed for a single swap 1462 * device. There are two limiting factors: 1) the number of 1463 * bits for the swap offset in the swp_entry_t type and 1464 * 2) the number of bits in the a swap pte as defined by 1465 * the different architectures. In order to find the 1466 * largest possible bit mask a swap entry with swap type 0 1467 * and swap offset ~0UL is created, encoded to a swap pte, 1468 * decoded to a swp_entry_t again and finally the swap 1469 * offset is extracted. This will mask all the bits from 1470 * the initial ~0UL mask that can't be encoded in either 1471 * the swp_entry_t or the architecture definition of a 1472 * swap pte. 1473 */ 1474 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1475 if (maxpages > swap_header->info.last_page) 1476 maxpages = swap_header->info.last_page; 1477 p->highest_bit = maxpages - 1; 1478 1479 error = -EINVAL; 1480 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1481 goto bad_swap; 1482 1483 /* OK, set up the swap map and apply the bad block list */ 1484 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1485 error = -ENOMEM; 1486 goto bad_swap; 1487 } 1488 1489 error = 0; 1490 memset(p->swap_map, 0, maxpages * sizeof(short)); 1491 for (i=0; i<swap_header->info.nr_badpages; i++) { 1492 int page = swap_header->info.badpages[i]; 1493 if (page <= 0 || page >= swap_header->info.last_page) 1494 error = -EINVAL; 1495 else 1496 p->swap_map[page] = SWAP_MAP_BAD; 1497 } 1498 nr_good_pages = swap_header->info.last_page - 1499 swap_header->info.nr_badpages - 1500 1 /* header page */; 1501 if (error) 1502 goto bad_swap; 1503 } 1504 1505 if (swapfilesize && maxpages > swapfilesize) { 1506 printk(KERN_WARNING 1507 "Swap area shorter than signature indicates\n"); 1508 error = -EINVAL; 1509 goto bad_swap; 1510 } 1511 if (!nr_good_pages) { 1512 printk(KERN_WARNING "Empty swap-file\n"); 1513 error = -EINVAL; 1514 goto bad_swap; 1515 } 1516 p->swap_map[0] = SWAP_MAP_BAD; 1517 p->max = maxpages; 1518 p->pages = nr_good_pages; 1519 1520 error = setup_swap_extents(p); 1521 if (error) 1522 goto bad_swap; 1523 1524 down(&swapon_sem); 1525 swap_list_lock(); 1526 swap_device_lock(p); 1527 p->flags = SWP_ACTIVE; 1528 nr_swap_pages += nr_good_pages; 1529 total_swap_pages += nr_good_pages; 1530 printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n", 1531 nr_good_pages<<(PAGE_SHIFT-10), name, 1532 p->prio, p->nr_extents); 1533 1534 /* insert swap space into swap_list: */ 1535 prev = -1; 1536 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1537 if (p->prio >= swap_info[i].prio) { 1538 break; 1539 } 1540 prev = i; 1541 } 1542 p->next = i; 1543 if (prev < 0) { 1544 swap_list.head = swap_list.next = p - swap_info; 1545 } else { 1546 swap_info[prev].next = p - swap_info; 1547 } 1548 swap_device_unlock(p); 1549 swap_list_unlock(); 1550 up(&swapon_sem); 1551 error = 0; 1552 goto out; 1553 bad_swap: 1554 if (bdev) { 1555 set_blocksize(bdev, p->old_block_size); 1556 bd_release(bdev); 1557 } 1558 bad_swap_2: 1559 swap_list_lock(); 1560 swap_map = p->swap_map; 1561 p->swap_file = NULL; 1562 p->swap_map = NULL; 1563 p->flags = 0; 1564 if (!(swap_flags & SWAP_FLAG_PREFER)) 1565 ++least_priority; 1566 swap_list_unlock(); 1567 destroy_swap_extents(p); 1568 if (swap_map) 1569 vfree(swap_map); 1570 if (swap_file) 1571 filp_close(swap_file, NULL); 1572 out: 1573 if (page && !IS_ERR(page)) { 1574 kunmap(page); 1575 page_cache_release(page); 1576 } 1577 if (name) 1578 putname(name); 1579 if (did_down) { 1580 if (!error) 1581 inode->i_flags |= S_SWAPFILE; 1582 up(&inode->i_sem); 1583 } 1584 return error; 1585 } 1586 1587 void si_swapinfo(struct sysinfo *val) 1588 { 1589 unsigned int i; 1590 unsigned long nr_to_be_unused = 0; 1591 1592 swap_list_lock(); 1593 for (i = 0; i < nr_swapfiles; i++) { 1594 if (!(swap_info[i].flags & SWP_USED) || 1595 (swap_info[i].flags & SWP_WRITEOK)) 1596 continue; 1597 nr_to_be_unused += swap_info[i].inuse_pages; 1598 } 1599 val->freeswap = nr_swap_pages + nr_to_be_unused; 1600 val->totalswap = total_swap_pages + nr_to_be_unused; 1601 swap_list_unlock(); 1602 } 1603 1604 /* 1605 * Verify that a swap entry is valid and increment its swap map count. 1606 * 1607 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 1608 * "permanent", but will be reclaimed by the next swapoff. 1609 */ 1610 int swap_duplicate(swp_entry_t entry) 1611 { 1612 struct swap_info_struct * p; 1613 unsigned long offset, type; 1614 int result = 0; 1615 1616 type = swp_type(entry); 1617 if (type >= nr_swapfiles) 1618 goto bad_file; 1619 p = type + swap_info; 1620 offset = swp_offset(entry); 1621 1622 swap_device_lock(p); 1623 if (offset < p->max && p->swap_map[offset]) { 1624 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 1625 p->swap_map[offset]++; 1626 result = 1; 1627 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 1628 if (swap_overflow++ < 5) 1629 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 1630 p->swap_map[offset] = SWAP_MAP_MAX; 1631 result = 1; 1632 } 1633 } 1634 swap_device_unlock(p); 1635 out: 1636 return result; 1637 1638 bad_file: 1639 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 1640 goto out; 1641 } 1642 1643 struct swap_info_struct * 1644 get_swap_info_struct(unsigned type) 1645 { 1646 return &swap_info[type]; 1647 } 1648 1649 /* 1650 * swap_device_lock prevents swap_map being freed. Don't grab an extra 1651 * reference on the swaphandle, it doesn't matter if it becomes unused. 1652 */ 1653 int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1654 { 1655 int ret = 0, i = 1 << page_cluster; 1656 unsigned long toff; 1657 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1658 1659 if (!page_cluster) /* no readahead */ 1660 return 0; 1661 toff = (swp_offset(entry) >> page_cluster) << page_cluster; 1662 if (!toff) /* first page is swap header */ 1663 toff++, i--; 1664 *offset = toff; 1665 1666 swap_device_lock(swapdev); 1667 do { 1668 /* Don't read-ahead past the end of the swap area */ 1669 if (toff >= swapdev->max) 1670 break; 1671 /* Don't read in free or bad pages */ 1672 if (!swapdev->swap_map[toff]) 1673 break; 1674 if (swapdev->swap_map[toff] == SWAP_MAP_BAD) 1675 break; 1676 toff++; 1677 ret++; 1678 } while (--i); 1679 swap_device_unlock(swapdev); 1680 return ret; 1681 } 1682
This page was automatically generated by LXR 0.3.1. • Linux is a registered trademark of Linus Torvalds