1 /* 2 * mm/readahead.c - address_space-level file readahead. 3 * 4 * Copyright (C) 2002, Linus Torvalds 5 * 6 * 09Apr2002 akpm@zip.com.au 7 * Initial version. 8 */ 9 10 #include <linux/kernel.h> 11 #include <linux/fs.h> 12 #include <linux/mm.h> 13 #include <linux/module.h> 14 #include <linux/blkdev.h> 15 #include <linux/backing-dev.h> 16 #include <linux/pagevec.h> 17 18 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 19 { 20 } 21 EXPORT_SYMBOL(default_unplug_io_fn); 22 23 struct backing_dev_info default_backing_dev_info = { 24 .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, 25 .state = 0, 26 .unplug_io_fn = default_unplug_io_fn, 27 }; 28 EXPORT_SYMBOL_GPL(default_backing_dev_info); 29 30 /* 31 * Initialise a struct file's readahead state 32 */ 33 void 34 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) 35 { 36 memset(ra, 0, sizeof(*ra)); 37 ra->ra_pages = mapping->backing_dev_info->ra_pages; 38 ra->average = ra->ra_pages / 2; 39 } 40 EXPORT_SYMBOL(file_ra_state_init); 41 42 /* 43 * Return max readahead size for this inode in number-of-pages. 44 */ 45 static inline unsigned long get_max_readahead(struct file_ra_state *ra) 46 { 47 return ra->ra_pages; 48 } 49 50 static inline unsigned long get_min_readahead(struct file_ra_state *ra) 51 { 52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; 53 } 54 55 #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 56 57 /** 58 * read_cache_pages - populate an address space with some pages, and 59 * start reads against them. 60 * @mapping: the address_space 61 * @pages: The address of a list_head which contains the target pages. These 62 * pages have their ->index populated and are otherwise uninitialised. 63 * @filler: callback routine for filling a single page. 64 * @data: private data for the callback routine. 65 * 66 * Hides the details of the LRU cache etc from the filesystems. 67 */ 68 int read_cache_pages(struct address_space *mapping, struct list_head *pages, 69 int (*filler)(void *, struct page *), void *data) 70 { 71 struct page *page; 72 struct pagevec lru_pvec; 73 int ret = 0; 74 75 pagevec_init(&lru_pvec, 0); 76 77 while (!list_empty(pages)) { 78 page = list_to_page(pages); 79 list_del(&page->lru); 80 if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { 81 page_cache_release(page); 82 continue; 83 } 84 ret = filler(data, page); 85 if (!pagevec_add(&lru_pvec, page)) 86 __pagevec_lru_add(&lru_pvec); 87 if (ret) { 88 while (!list_empty(pages)) { 89 struct page *victim; 90 91 victim = list_to_page(pages); 92 list_del(&victim->lru); 93 page_cache_release(victim); 94 } 95 break; 96 } 97 } 98 pagevec_lru_add(&lru_pvec); 99 return ret; 100 } 101 102 EXPORT_SYMBOL(read_cache_pages); 103 104 static int read_pages(struct address_space *mapping, struct file *filp, 105 struct list_head *pages, unsigned nr_pages) 106 { 107 unsigned page_idx; 108 struct pagevec lru_pvec; 109 int ret = 0; 110 111 if (mapping->a_ops->readpages) { 112 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 113 goto out; 114 } 115 116 pagevec_init(&lru_pvec, 0); 117 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 118 struct page *page = list_to_page(pages); 119 list_del(&page->lru); 120 if (!add_to_page_cache(page, mapping, 121 page->index, GFP_KERNEL)) { 122 mapping->a_ops->readpage(filp, page); 123 if (!pagevec_add(&lru_pvec, page)) 124 __pagevec_lru_add(&lru_pvec); 125 } else { 126 page_cache_release(page); 127 } 128 } 129 pagevec_lru_add(&lru_pvec); 130 out: 131 return ret; 132 } 133 134 /* 135 * Readahead design. 136 * 137 * The fields in struct file_ra_state represent the most-recently-executed 138 * readahead attempt: 139 * 140 * start: Page index at which we started the readahead 141 * size: Number of pages in that read 142 * Together, these form the "current window". 143 * Together, start and size represent the `readahead window'. 144 * next_size: The number of pages to read on the next readahead miss. 145 * Has the magical value -1UL if readahead has been disabled. 146 * prev_page: The page which the readahead algorithm most-recently inspected. 147 * prev_page is mainly an optimisation: if page_cache_readahead 148 * sees that it is again being called for a page which it just 149 * looked at, it can return immediately without making any state 150 * changes. 151 * ahead_start, 152 * ahead_size: Together, these form the "ahead window". 153 * ra_pages: The externally controlled max readahead for this fd. 154 * 155 * When readahead is in the "maximally shrunk" state (next_size == -1UL), 156 * readahead is disabled. In this state, prev_page and size are used, inside 157 * handle_ra_miss(), to detect the resumption of sequential I/O. Once there 158 * has been a decent run of sequential I/O (defined by get_min_readahead), 159 * readahead is reenabled. 160 * 161 * The readahead code manages two windows - the "current" and the "ahead" 162 * windows. The intent is that while the application is walking the pages 163 * in the current window, I/O is underway on the ahead window. When the 164 * current window is fully traversed, it is replaced by the ahead window 165 * and the ahead window is invalidated. When this copying happens, the 166 * new current window's pages are probably still locked. When I/O has 167 * completed, we submit a new batch of I/O, creating a new ahead window. 168 * 169 * So: 170 * 171 * ----|----------------|----------------|----- 172 * ^start ^start+size 173 * ^ahead_start ^ahead_start+ahead_size 174 * 175 * ^ When this page is read, we submit I/O for the 176 * ahead window. 177 * 178 * A `readahead hit' occurs when a read request is made against a page which is 179 * inside the current window. Hits are good, and the window size (next_size) 180 * is grown aggressively when hits occur. Two pages are added to the next 181 * window size on each hit, which will end up doubling the next window size by 182 * the time I/O is submitted for it. 183 * 184 * If readahead hits are more sparse (say, the application is only reading 185 * every second page) then the window will build more slowly. 186 * 187 * On a readahead miss (the application seeked away) the readahead window is 188 * shrunk by 25%. We don't want to drop it too aggressively, because it is a 189 * good assumption that an application which has built a good readahead window 190 * will continue to perform linear reads. Either at the new file position, or 191 * at the old one after another seek. 192 * 193 * After enough misses, readahead is fully disabled. (next_size = -1UL). 194 * 195 * There is a special-case: if the first page which the application tries to 196 * read happens to be the first page of the file, it is assumed that a linear 197 * read is about to happen and the window is immediately set to half of the 198 * device maximum. 199 * 200 * A page request at (start + size) is not a miss at all - it's just a part of 201 * sequential file reading. 202 * 203 * This function is to be called for every page which is read, rather than when 204 * it is time to perform readahead. This is so the readahead algorithm can 205 * centrally work out the access patterns. This could be costly with many tiny 206 * read()s, so we specifically optimise for that case with prev_page. 207 */ 208 209 /* 210 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 211 * the pages first, then submits them all for I/O. This avoids the very bad 212 * behaviour which would occur if page allocations are causing VM writeback. 213 * We really don't want to intermingle reads and writes like that. 214 * 215 * Returns the number of pages which actually had IO started against them. 216 */ 217 static inline int 218 __do_page_cache_readahead(struct address_space *mapping, struct file *filp, 219 unsigned long offset, unsigned long nr_to_read) 220 { 221 struct inode *inode = mapping->host; 222 struct page *page; 223 unsigned long end_index; /* The last page we want to read */ 224 LIST_HEAD(page_pool); 225 int page_idx; 226 int ret = 0; 227 loff_t isize = i_size_read(inode); 228 229 if (isize == 0) 230 goto out; 231 232 end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); 233 234 /* 235 * Preallocate as many pages as we will need. 236 */ 237 spin_lock_irq(&mapping->tree_lock); 238 for (page_idx = 0; page_idx < nr_to_read; page_idx++) { 239 unsigned long page_offset = offset + page_idx; 240 241 if (page_offset > end_index) 242 break; 243 244 page = radix_tree_lookup(&mapping->page_tree, page_offset); 245 if (page) 246 continue; 247 248 spin_unlock_irq(&mapping->tree_lock); 249 page = page_cache_alloc_cold(mapping); 250 spin_lock_irq(&mapping->tree_lock); 251 if (!page) 252 break; 253 page->index = page_offset; 254 list_add(&page->lru, &page_pool); 255 ret++; 256 } 257 spin_unlock_irq(&mapping->tree_lock); 258 259 /* 260 * Now start the IO. We ignore I/O errors - if the page is not 261 * uptodate then the caller will launch readpage again, and 262 * will then handle the error. 263 */ 264 if (ret) 265 read_pages(mapping, filp, &page_pool, ret); 266 BUG_ON(!list_empty(&page_pool)); 267 out: 268 return ret; 269 } 270 271 /* 272 * Chunk the readahead into 2 megabyte units, so that we don't pin too much 273 * memory at once. 274 */ 275 int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 276 unsigned long offset, unsigned long nr_to_read) 277 { 278 int ret = 0; 279 280 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 281 return -EINVAL; 282 283 while (nr_to_read) { 284 int err; 285 286 unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; 287 288 if (this_chunk > nr_to_read) 289 this_chunk = nr_to_read; 290 err = __do_page_cache_readahead(mapping, filp, 291 offset, this_chunk); 292 if (err < 0) { 293 ret = err; 294 break; 295 } 296 ret += err; 297 offset += this_chunk; 298 nr_to_read -= this_chunk; 299 } 300 return ret; 301 } 302 303 /* 304 * This version skips the IO if the queue is read-congested, and will tell the 305 * block layer to abandon the readahead if request allocation would block. 306 * 307 * force_page_cache_readahead() will ignore queue congestion and will block on 308 * request queues. 309 */ 310 int do_page_cache_readahead(struct address_space *mapping, struct file *filp, 311 unsigned long offset, unsigned long nr_to_read) 312 { 313 if (!bdi_read_congested(mapping->backing_dev_info)) 314 return __do_page_cache_readahead(mapping, filp, 315 offset, nr_to_read); 316 return 0; 317 } 318 319 /* 320 * Check how effective readahead is being. If the amount of started IO is 321 * less than expected then the file is partly or fully in pagecache and 322 * readahead isn't helping. Shrink the window. 323 * 324 * But don't shrink it too much - the application may read the same page 325 * occasionally. 326 */ 327 static inline void 328 check_ra_success(struct file_ra_state *ra, pgoff_t attempt, 329 pgoff_t actual, pgoff_t orig_next_size) 330 { 331 if (actual == 0) { 332 if (orig_next_size > 1) { 333 ra->next_size = orig_next_size - 1; 334 if (ra->ahead_size) 335 ra->ahead_size = ra->next_size; 336 } else { 337 ra->next_size = -1UL; 338 ra->size = 0; 339 } 340 } 341 } 342 343 /* 344 * page_cache_readahead is the main function. If performs the adaptive 345 * readahead window size management and submits the readahead I/O. 346 */ 347 void 348 page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, 349 struct file *filp, unsigned long offset) 350 { 351 unsigned max; 352 unsigned orig_next_size; 353 unsigned actual; 354 int first_access=0; 355 unsigned long average; 356 357 /* 358 * Here we detect the case where the application is performing 359 * sub-page sized reads. We avoid doing extra work and bogusly 360 * perturbing the readahead window expansion logic. 361 * If next_size is zero, this is the very first read for this 362 * file handle, or the window is maximally shrunk. 363 */ 364 if (offset == ra->prev_page) { 365 if (ra->next_size != 0) 366 goto out; 367 } 368 369 if (ra->next_size == -1UL) 370 goto out; /* Maximally shrunk */ 371 372 max = get_max_readahead(ra); 373 if (max == 0) 374 goto out; /* No readahead */ 375 376 orig_next_size = ra->next_size; 377 378 if (ra->next_size == 0) { 379 /* 380 * Special case - first read. 381 * We'll assume it's a whole-file read, and 382 * grow the window fast. 383 */ 384 first_access=1; 385 ra->next_size = max / 2; 386 ra->prev_page = offset; 387 ra->serial_cnt++; 388 goto do_io; 389 } 390 391 if (offset == ra->prev_page + 1) { 392 if (ra->serial_cnt <= (max * 2)) 393 ra->serial_cnt++; 394 } else { 395 /* 396 * to avoid rounding errors, ensure that 'average' 397 * tends towards the value of ra->serial_cnt. 398 */ 399 average = ra->average; 400 if (average < ra->serial_cnt) { 401 average++; 402 } 403 ra->average = (average + ra->serial_cnt) / 2; 404 ra->serial_cnt = 1; 405 } 406 ra->prev_page = offset; 407 408 if (offset >= ra->start && offset <= (ra->start + ra->size)) { 409 /* 410 * A readahead hit. Either inside the window, or one 411 * page beyond the end. Expand the next readahead size. 412 */ 413 ra->next_size += 2; 414 } else { 415 /* 416 * A miss - lseek, pagefault, pread, etc. Shrink the readahead 417 * window. 418 */ 419 ra->next_size -= 2; 420 } 421 422 if ((long)ra->next_size > (long)max) 423 ra->next_size = max; 424 if ((long)ra->next_size <= 0L) { 425 ra->next_size = -1UL; 426 ra->size = 0; 427 goto out; /* Readahead is off */ 428 } 429 430 /* 431 * Is this request outside the current window? 432 */ 433 if (offset < ra->start || offset >= (ra->start + ra->size)) { 434 /* 435 * A miss against the current window. Have we merely 436 * advanced into the ahead window? 437 */ 438 if (offset == ra->ahead_start) { 439 /* 440 * Yes, we have. The ahead window now becomes 441 * the current window. 442 */ 443 ra->start = ra->ahead_start; 444 ra->size = ra->ahead_size; 445 ra->prev_page = ra->start; 446 ra->ahead_start = 0; 447 ra->ahead_size = 0; 448 449 /* 450 * Control now returns, probably to sleep until I/O 451 * completes against the first ahead page. 452 * When the second page in the old ahead window is 453 * requested, control will return here and more I/O 454 * will be submitted to build the new ahead window. 455 */ 456 goto out; 457 } 458 do_io: 459 /* 460 * This is the "unusual" path. We come here during 461 * startup or after an lseek. We invalidate the 462 * ahead window and get some I/O underway for the new 463 * current window. 464 */ 465 if (!first_access) { 466 /* Heuristic: there is a high probability 467 * that around ra->average number of 468 * pages shall be accessed in the next 469 * current window. 470 */ 471 average = ra->average; 472 if (ra->serial_cnt > average) 473 average = (ra->serial_cnt + ra->average + 1) / 2; 474 475 ra->next_size = min(average , (unsigned long)max); 476 } 477 ra->start = offset; 478 ra->size = ra->next_size; 479 ra->ahead_start = 0; /* Invalidate these */ 480 ra->ahead_size = 0; 481 actual = do_page_cache_readahead(mapping, filp, offset, 482 ra->size); 483 if(!first_access) { 484 /* 485 * do not adjust the readahead window size the first 486 * time, the ahead window might get closed if all 487 * the pages are already in the cache. 488 */ 489 check_ra_success(ra, ra->size, actual, orig_next_size); 490 } 491 } else { 492 /* 493 * This read request is within the current window. It may be 494 * time to submit I/O for the ahead window while the 495 * application is about to step into the ahead window. 496 */ 497 if (ra->ahead_start == 0) { 498 /* 499 * If the average io-size is more than maximum 500 * readahead size of the file the io pattern is 501 * sequential. Hence bring in the readahead window 502 * immediately. 503 * If the average io-size is less than maximum 504 * readahead size of the file the io pattern is 505 * random. Hence don't bother to readahead. 506 */ 507 average = ra->average; 508 if (ra->serial_cnt > average) 509 average = (ra->serial_cnt + ra->average + 1) / 2; 510 511 if (average > max) { 512 ra->ahead_start = ra->start + ra->size; 513 ra->ahead_size = ra->next_size; 514 actual = do_page_cache_readahead(mapping, filp, 515 ra->ahead_start, ra->ahead_size); 516 check_ra_success(ra, ra->ahead_size, 517 actual, orig_next_size); 518 } 519 } 520 } 521 out: 522 return; 523 } 524 525 526 /* 527 * handle_ra_miss() is called when it is known that a page which should have 528 * been present in the pagecache (we just did some readahead there) was in fact 529 * not found. This will happen if it was evicted by the VM (readahead 530 * thrashing) or if the readahead window is maximally shrunk. 531 * 532 * If the window has been maximally shrunk (next_size == -1UL) then look to see 533 * if we are getting misses against sequential file offsets. If so, and this 534 * persists then resume readahead. 535 * 536 * Otherwise we're thrashing, so shrink the readahead window by three pages. 537 * This is because it is grown by two pages on a readahead hit. Theory being 538 * that the readahead window size will stabilise around the maximum level at 539 * which there is no thrashing. 540 */ 541 void handle_ra_miss(struct address_space *mapping, 542 struct file_ra_state *ra, pgoff_t offset) 543 { 544 if (ra->next_size == -1UL) { 545 const unsigned long max = get_max_readahead(ra); 546 547 if (offset != ra->prev_page + 1) { 548 ra->size = ra->size?ra->size-1:0; /* Not sequential */ 549 } else { 550 ra->size++; /* A sequential read */ 551 if (ra->size >= max) { /* Resume readahead */ 552 ra->start = offset - max; 553 ra->next_size = max; 554 ra->size = max; 555 ra->ahead_start = 0; 556 ra->ahead_size = 0; 557 ra->average = max / 2; 558 } 559 } 560 ra->prev_page = offset; 561 } else { 562 const unsigned long min = get_min_readahead(ra); 563 564 ra->next_size -= 3; 565 if (ra->next_size < min) 566 ra->next_size = min; 567 } 568 } 569 570 /* 571 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 572 * sensible upper limit. 573 */ 574 unsigned long max_sane_readahead(unsigned long nr) 575 { 576 unsigned long active; 577 unsigned long inactive; 578 unsigned long free; 579 580 get_zone_counts(&active, &inactive, &free); 581 return min(nr, (inactive + free) / 2); 582 } 583
This page was automatically generated by LXR 0.3.1. • Linux is a registered trademark of Linus Torvalds