/*
 * Mach Operating System
 * Copyright (c) 1994,1990,1989,1988,1987 Carnegie Mellon University.
 * Copyright (c) 1993,1994 The University of Utah and
 * the Computer Systems Laboratory (CSL).
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
 * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
 * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
 * THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	File:	vm_fault.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Page fault handling module.
 */

#include <kern/printf.h>
#include <vm/vm_fault.h>
#include <mach/kern_return.h>
#include <mach/message.h>	/* for error codes */
#include <kern/counters.h>
#include <kern/debug.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <mach/vm_statistics.h>
#include <vm/vm_pageout.h>
#include <mach/vm_param.h>
#include <mach/memory_object.h>
#include <vm/memory_object_user.user.h>
				/* For memory_object_data_{request,unlock} */
#include <kern/macros.h>
#include <kern/slab.h>

#if	MACH_PCSAMPLE
#include <kern/pc_sample.h>
#endif

/*
 *	State needed by vm_fault_continue.
 *	This is a little hefty to drop directly
 *	into the thread structure.
 */
typedef struct vm_fault_state {
	struct vm_map		*vmf_map;
	vm_offset_t		vmf_vaddr;
	vm_prot_t		vmf_fault_type;
	boolean_t		vmf_change_wiring;
	void			(*vmf_continuation)();
	vm_map_version_t	vmf_version;
	boolean_t		vmf_wired;
	struct vm_object	*vmf_object;
	vm_offset_t		vmf_offset;
	vm_prot_t		vmf_prot;

	boolean_t		vmfp_backoff;
	struct vm_object	*vmfp_object;
	vm_offset_t		vmfp_offset;
	struct vm_page		*vmfp_first_m;
	vm_prot_t		vmfp_access;
} vm_fault_state_t;

struct kmem_cache	vm_fault_state_cache;

int		vm_object_absent_max = 50;

boolean_t	vm_fault_dirty_handling = FALSE;
boolean_t	vm_fault_interruptible = TRUE;

boolean_t	software_reference_bits = TRUE;

#if	MACH_KDB
extern struct db_watchpoint *db_watchpoint_list;
#endif	/* MACH_KDB */

/*
 *	Routine:	vm_fault_init
 *	Purpose:
 *		Initialize our private data structures.
 */
void vm_fault_init(void)
{
	kmem_cache_init(&vm_fault_state_cache, "vm_fault_state",
			sizeof(vm_fault_state_t), 0, NULL, NULL, NULL, 0);
}

/*
 *	Routine:	vm_fault_cleanup
 *	Purpose:
 *		Clean up the result of vm_fault_page.
 *	Results:
 *		The paging reference for "object" is released.
 *		"object" is unlocked.
 *		If "top_page" is not null, "top_page" is
 *		freed and the paging reference for the object
 *		containing it is released.
 *
 *	In/out conditions:
 *		"object" must be locked.
 */
void
vm_fault_cleanup(
	vm_object_t	object,
	vm_page_t	top_page)
{
	assert(have_vm_object_lock(object));

	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		object = top_page->object;
		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}

#if	MACH_PCSAMPLE
/*
 *	Do PC sampling on current thread, assuming
 *	that it is the thread taking this page fault.
 *
 *	Must check for THREAD_NULL, since faults
 *	can occur before threads are running.
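 *
 *	For instance, the fault paths below record samples with
 *	vm_stat_sample(SAMPLED_PC_VM_FAULTS_ANY) or
 *	vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); when MACH_PCSAMPLE
 *	is not configured, the macro expands to nothing.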
*/ #define vm_stat_sample(flavor) \ MACRO_BEGIN \ thread_t _thread_ = current_thread(); \ \ if (_thread_ != THREAD_NULL) \ take_pc_sample_macro(_thread_, (flavor)); \ MACRO_END #else #define vm_stat_sample(x) #endif /* MACH_PCSAMPLE */ /* * Routine: vm_fault_page * Purpose: * Find the resident page for the virtual memory * specified by the given virtual memory object * and offset. * Additional arguments: * The required permissions for the page is given * in "fault_type". Desired permissions are included * in "protection". * * If the desired page is known to be resident (for * example, because it was previously wired down), asserting * the "unwiring" parameter will speed the search. * * If the operation can be interrupted (by thread_abort * or thread_terminate), then the "interruptible" * parameter should be asserted. * * Results: * The page containing the proper data is returned * in "result_page". * * In/out conditions: * The source object must be locked and referenced, * and must donate one paging reference. The reference * is not affected. The paging reference and lock are * consumed. * * If the call succeeds, the object in which "result_page" * resides is left locked and holding a paging reference. * If this is not the original object, a busy page in the * original object is returned in "top_page", to prevent other * callers from pursuing this same data, along with a paging * reference for the original object. The "top_page" should * be destroyed when this guarantee is no longer required. * The "result_page" is also left busy. It is not removed * from the pageout queues. */ vm_fault_return_t vm_fault_page( /* Arguments: */ vm_object_t first_object, /* Object to begin search */ vm_offset_t first_offset, /* Offset into object */ vm_prot_t fault_type, /* What access is requested */ boolean_t must_be_resident,/* Must page be resident? */ boolean_t interruptible, /* May fault be interrupted? */ /* Modifies in place: */ vm_prot_t *protection, /* Protection for mapping */ /* Returns: */ vm_page_t *result_page, /* Page found, if successful */ vm_page_t *top_page, /* Page in top object, if * not result_page. */ /* More arguments: */ boolean_t resume, /* We are restarting. */ void (*continuation)()) /* Continuation for blocking. */ { vm_page_t m; vm_object_t object; vm_offset_t offset; vm_page_t first_m; vm_object_t next_object; vm_object_t copy_object; boolean_t look_for_page; vm_prot_t access_required; /* We need to unlock an object before making requests to a memory manager. We use this object to temporarily store object attributes needed for the request to avoid accessing the object while it is unlocked. */ struct { struct ipc_port * pager; pager_request_t pager_request; vm_offset_t paging_offset; } obj; if (resume) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; if (state->vmfp_backoff) goto after_block_and_backoff; object = state->vmfp_object; offset = state->vmfp_offset; first_m = state->vmfp_first_m; access_required = state->vmfp_access; goto after_thread_block; } assert(have_vm_object_lock(first_object)); vm_stat_sample(SAMPLED_PC_VM_FAULTS_ANY); vm_stat.faults++; /* needs lock XXX */ current_task()->faults++; /* * Recovery actions */ #define RELEASE_PAGE(m) \ MACRO_BEGIN \ PAGE_WAKEUP_DONE(m); \ vm_page_lock_queues(); \ if (!m->active && !m->inactive) \ vm_page_activate(m); \ vm_page_unlock_queues(); \ MACRO_END if (vm_fault_dirty_handling #if MACH_KDB /* * If there are watchpoints set, then * we don't want to give away write permission * on a read fault. 
Make the task write fault, * so that the watchpoint code notices the access. */ || db_watchpoint_list #endif /* MACH_KDB */ ) { /* * If we aren't asking for write permission, * then don't give it away. We're using write * faults to set the dirty bit. */ if (!(fault_type & VM_PROT_WRITE)) *protection &= ~VM_PROT_WRITE; } if (!vm_fault_interruptible) interruptible = FALSE; /* * INVARIANTS (through entire routine): * * 1) At all times, we must either have the object * lock or a busy page in some object to prevent * some other thread from trying to bring in * the same page. * * Note that we cannot hold any locks during the * pager access or when waiting for memory, so * we use a busy page then. * * Note also that we aren't as concerned about more than * one thread attempting to memory_object_data_unlock * the same page at once, so we don't hold the page * as busy then, but do record the highest unlock * value so far. [Unlock requests may also be delivered * out of order.] * * 2) To prevent another thread from racing us down the * shadow chain and entering a new page in the top * object before we do, we must keep a busy page in * the top object while following the shadow chain. * * 3) We must increment paging_in_progress on any object * for which we have a busy page, to prevent * vm_object_collapse from removing the busy page * without our noticing. * * 4) We leave busy pages on the pageout queues. * If the pageout daemon comes across a busy page, * it will remove the page from the pageout queues. */ /* * Search for the page at object/offset. */ object = first_object; offset = first_offset; first_m = VM_PAGE_NULL; access_required = fault_type; /* * See whether this page is resident */ while (TRUE) { m = vm_page_lookup(object, offset); if (m != VM_PAGE_NULL) { /* * If the page is being brought in, * wait for it and then retry. * * A possible optimization: if the page * is known to be resident, we can ignore * pages that are absent (regardless of * whether they're busy). */ if (m->busy) { kern_return_t wait_result; PAGE_ASSERT_WAIT(m, interruptible); vm_object_unlock(object); if (continuation != (void (*)()) 0) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; /* * Save variables in case * thread_block discards * our kernel stack. */ state->vmfp_backoff = FALSE; state->vmfp_object = object; state->vmfp_offset = offset; state->vmfp_first_m = first_m; state->vmfp_access = access_required; state->vmf_prot = *protection; counter(c_vm_fault_page_block_busy_user++); thread_block(continuation); } else { counter(c_vm_fault_page_block_busy_kernel++); thread_block((void (*)()) 0); } after_thread_block: wait_result = current_thread()->wait_result; vm_object_lock(object); if (wait_result != THREAD_AWAKENED) { vm_fault_cleanup(object, first_m); if (wait_result == THREAD_RESTART) return(VM_FAULT_RETRY); else return(VM_FAULT_INTERRUPTED); } continue; } /* * If the page is in error, give up now. */ if (m->error) { VM_PAGE_FREE(m); vm_fault_cleanup(object, first_m); return(VM_FAULT_MEMORY_ERROR); } /* * If the page isn't busy, but is absent, * then it was deemed "unavailable". */ if (m->absent) { /* * Remove the non-existent page (unless it's * in the top object) and move on down to the * next object (if there is one). 
*/ offset += object->shadow_offset; access_required = VM_PROT_READ; next_object = object->shadow; if (next_object == VM_OBJECT_NULL) { vm_page_t real_m; assert(!must_be_resident); /* * Absent page at bottom of shadow * chain; zero fill the page we left * busy in the first object, and flush * the absent page. But first we * need to allocate a real page. */ real_m = vm_page_grab(!object->internal); if (real_m == VM_PAGE_NULL) { vm_fault_cleanup(object, first_m); return(VM_FAULT_MEMORY_SHORTAGE); } if (object != first_object) { VM_PAGE_FREE(m); vm_object_paging_end(object); vm_object_unlock(object); object = first_object; offset = first_offset; m = first_m; first_m = VM_PAGE_NULL; vm_object_lock(object); } VM_PAGE_FREE(m); assert(real_m->busy); vm_page_lock_queues(); vm_page_insert(real_m, object, offset); vm_page_unlock_queues(); m = real_m; /* * Zero fill the page. Then * break because this is the * page we wanted. Checking * the page lock is a waste * of time; this page was * either absent or newly * allocated -- in both cases * it can't be page locked by * a pager. */ vm_page_zero_fill(m); vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); vm_stat.zero_fill_count++; current_task()->zero_fills++; pmap_clear_modify(m->phys_addr); break; } else { if (must_be_resident) { vm_object_paging_end(object); } else if (object != first_object) { vm_object_paging_end(object); VM_PAGE_FREE(m); } else { first_m = m; m->absent = FALSE; vm_object_absent_release(object); m->busy = TRUE; vm_page_lock_queues(); VM_PAGE_QUEUES_REMOVE(m); vm_page_unlock_queues(); } vm_object_lock(next_object); vm_object_unlock(object); object = next_object; vm_object_paging_begin(object); continue; } } /* * If the desired access to this page has * been locked out, request that it be unlocked. */ if (access_required & m->page_lock) { if ((access_required & m->unlock_request) != access_required) { vm_prot_t new_unlock_request; kern_return_t rc; if (!object->pager_ready) { vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); goto block_and_backoff; } new_unlock_request = m->unlock_request = (access_required | m->unlock_request); obj.pager = object->pager; obj.pager_request = object->pager_request; obj.paging_offset = object->paging_offset; vm_object_unlock(object); if ((rc = memory_object_data_unlock( obj.pager, obj.pager_request, offset + obj.paging_offset, PAGE_SIZE, new_unlock_request)) != KERN_SUCCESS) { printf("vm_fault: memory_object_data_unlock failed\n"); vm_object_lock(object); vm_fault_cleanup(object, first_m); return((rc == MACH_SEND_INTERRUPTED) ? VM_FAULT_INTERRUPTED : VM_FAULT_MEMORY_ERROR); } vm_object_lock(object); continue; } PAGE_ASSERT_WAIT(m, interruptible); goto block_and_backoff; } /* * We mark the page busy and leave it on * the pageout queues. If the pageout * deamon comes across it, then it will * remove the page. */ if (!software_reference_bits) { vm_page_lock_queues(); if (m->inactive) { vm_stat_sample(SAMPLED_PC_VM_REACTIVATION_FAULTS); vm_stat.reactivations++; current_task()->reactivations++; } VM_PAGE_QUEUES_REMOVE(m); vm_page_unlock_queues(); } assert(!m->busy); m->busy = TRUE; assert(!m->absent); break; } look_for_page = (object->pager_created) #if MACH_PAGEMAP && (vm_external_state_get(object->existence_info, offset + object->paging_offset) != VM_EXTERNAL_STATE_ABSENT) #endif /* MACH_PAGEMAP */ ; if ((look_for_page || (object == first_object)) && !must_be_resident) { /* * Allocate a new page for this object/offset * pair. 
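 *
 *	(The page grabbed just below is fictitious: it has no physical
 *	memory behind it yet and only serves as the busy placeholder
 *	for this object/offset.  The zero-fill and default-pager paths
 *	further down convert it to a real page with vm_page_convert
 *	before any data is placed in it.)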
*/ m = vm_page_grab_fictitious(); if (m == VM_PAGE_NULL) { vm_fault_cleanup(object, first_m); return(VM_FAULT_FICTITIOUS_SHORTAGE); } vm_page_lock_queues(); vm_page_insert(m, object, offset); vm_page_unlock_queues(); } if (look_for_page && !must_be_resident) { kern_return_t rc; /* * If the memory manager is not ready, we * cannot make requests. */ if (!object->pager_ready) { vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); VM_PAGE_FREE(m); goto block_and_backoff; } if (object->internal) { /* * Requests to the default pager * must reserve a real page in advance, * because the pager's data-provided * won't block for pages. */ if (m->fictitious && !vm_page_convert(m, FALSE)) { VM_PAGE_FREE(m); vm_fault_cleanup(object, first_m); return(VM_FAULT_MEMORY_SHORTAGE); } } else if (object->absent_count > vm_object_absent_max) { /* * If there are too many outstanding page * requests pending on this object, we * wait for them to be resolved now. */ vm_object_absent_assert_wait(object, interruptible); VM_PAGE_FREE(m); goto block_and_backoff; } /* * Indicate that the page is waiting for data * from the memory manager. */ m->absent = TRUE; object->absent_count++; /* Save attributes for the request. */ obj.pager = object->pager; obj.pager_request = object->pager_request; obj.paging_offset = object->paging_offset; /* * We have a busy page, so we can * release the object lock. */ vm_object_unlock(object); /* * Call the memory manager to retrieve the data. */ vm_stat.pageins++; vm_stat_sample(SAMPLED_PC_VM_PAGEIN_FAULTS); current_task()->pageins++; if ((rc = memory_object_data_request(obj.pager, obj.pager_request, m->offset + obj.paging_offset, PAGE_SIZE, access_required)) != KERN_SUCCESS) { if (rc != MACH_SEND_INTERRUPTED) printf("%s(0x%p, 0x%p, 0x%lx, 0x%x, 0x%x) failed, %x\n", "memory_object_data_request", obj.pager, obj.pager_request, m->offset + obj.paging_offset, PAGE_SIZE, access_required, rc); /* * Don't want to leave a busy page around, * but the data request may have blocked, * so check if it's still there and busy. */ vm_object_lock(object); if (m == vm_page_lookup(object,offset) && m->absent && m->busy) VM_PAGE_FREE(m); vm_fault_cleanup(object, first_m); return((rc == MACH_SEND_INTERRUPTED) ? VM_FAULT_INTERRUPTED : VM_FAULT_MEMORY_ERROR); } /* * Retry with same object/offset, since new data may * be in a different page (i.e., m is meaningless at * this point). */ vm_object_lock(object); continue; } /* * For the XP system, the only case in which we get here is if * object has no pager (or unwiring). If the pager doesn't * have the page this is handled in the m->absent case above * (and if you change things here you should look above). */ if (object == first_object) first_m = m; else { assert(m == VM_PAGE_NULL); } /* * Move on to the next object. Lock the next * object before unlocking the current one. */ access_required = VM_PROT_READ; offset += object->shadow_offset; next_object = object->shadow; if (next_object == VM_OBJECT_NULL) { assert(!must_be_resident); /* * If there's no object left, fill the page * in the top object with zeros. But first we * need to allocate a real page. 
*/ if (object != first_object) { vm_object_paging_end(object); vm_object_unlock(object); object = first_object; offset = first_offset; vm_object_lock(object); } m = first_m; assert(m->object == object); first_m = VM_PAGE_NULL; if (m->fictitious && !vm_page_convert(m, !object->internal)) { VM_PAGE_FREE(m); vm_fault_cleanup(object, VM_PAGE_NULL); return(VM_FAULT_MEMORY_SHORTAGE); } vm_page_zero_fill(m); vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); vm_stat.zero_fill_count++; current_task()->zero_fills++; pmap_clear_modify(m->phys_addr); break; } else { vm_object_lock(next_object); if ((object != first_object) || must_be_resident) vm_object_paging_end(object); vm_object_unlock(object); object = next_object; vm_object_paging_begin(object); } } /* * PAGE HAS BEEN FOUND. * * This page (m) is: * busy, so that we can play with it; * not absent, so that nobody else will fill it; * possibly eligible for pageout; * * The top-level page (first_m) is: * VM_PAGE_NULL if the page was found in the * top-level object; * busy, not absent, and ineligible for pageout. * * The current object (object) is locked. A paging * reference is held for the current and top-level * objects. */ assert(m->busy && !m->absent); assert((first_m == VM_PAGE_NULL) || (first_m->busy && !first_m->absent && !first_m->active && !first_m->inactive)); /* * If the page is being written, but isn't * already owned by the top-level object, * we have to copy it into a new page owned * by the top-level object. */ if (object != first_object) { /* * We only really need to copy if we * want to write it. */ if (fault_type & VM_PROT_WRITE) { vm_page_t copy_m; assert(!must_be_resident); /* * If we try to collapse first_object at this * point, we may deadlock when we try to get * the lock on an intermediate object (since we * have the bottom object locked). We can't * unlock the bottom object, because the page * we found may move (by collapse) if we do. * * Instead, we first copy the page. Then, when * we have no more use for the bottom object, * we unlock it and try to collapse. * * Note that we copy the page even if we didn't * need to... that's the breaks. */ /* * Allocate a page for the copy */ copy_m = vm_page_grab(!first_object->internal); if (copy_m == VM_PAGE_NULL) { RELEASE_PAGE(m); vm_fault_cleanup(object, first_m); return(VM_FAULT_MEMORY_SHORTAGE); } vm_page_copy(m, copy_m); /* * If another map is truly sharing this * page with us, we have to flush all * uses of the original page, since we * can't distinguish those which want the * original from those which need the * new copy. * * XXXO If we know that only one map has * access to this page, then we could * avoid the pmap_page_protect() call. */ vm_page_lock_queues(); vm_page_deactivate(m); pmap_page_protect(m->phys_addr, VM_PROT_NONE); vm_page_unlock_queues(); /* * We no longer need the old page or object. */ PAGE_WAKEUP_DONE(m); vm_object_paging_end(object); vm_object_unlock(object); vm_stat.cow_faults++; vm_stat_sample(SAMPLED_PC_VM_COW_FAULTS); current_task()->cow_faults++; object = first_object; offset = first_offset; vm_object_lock(object); VM_PAGE_FREE(first_m); first_m = VM_PAGE_NULL; assert(copy_m->busy); vm_page_lock_queues(); vm_page_insert(copy_m, object, offset); vm_page_unlock_queues(); m = copy_m; /* * Now that we've gotten the copy out of the * way, let's try to collapse the top object. * But we have to play ugly games with * paging_in_progress to do that... 
*/ vm_object_paging_end(object); vm_object_collapse(object); vm_object_paging_begin(object); } else { *protection &= (~VM_PROT_WRITE); } } /* * Now check whether the page needs to be pushed into the * copy object. The use of asymmetric copy on write for * shared temporary objects means that we may do two copies to * satisfy the fault; one above to get the page from a * shadowed object, and one here to push it into the copy. */ while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { vm_offset_t copy_offset; vm_page_t copy_m; /* * If the page is being written, but hasn't been * copied to the copy-object, we have to copy it there. */ if ((fault_type & VM_PROT_WRITE) == 0) { *protection &= ~VM_PROT_WRITE; break; } /* * If the page was guaranteed to be resident, * we must have already performed the copy. */ if (must_be_resident) break; /* * Try to get the lock on the copy_object. */ if (!vm_object_lock_try(copy_object)) { vm_object_unlock(object); simple_lock_pause(); /* wait a bit */ vm_object_lock(object); continue; } /* * Make another reference to the copy-object, * to keep it from disappearing during the * copy. */ assert(copy_object->ref_count > 0); copy_object->ref_count++; /* * Does the page exist in the copy? */ copy_offset = first_offset - copy_object->shadow_offset; copy_m = vm_page_lookup(copy_object, copy_offset); if (copy_m != VM_PAGE_NULL) { if (copy_m->busy) { /* * If the page is being brought * in, wait for it and then retry. */ PAGE_ASSERT_WAIT(copy_m, interruptible); RELEASE_PAGE(m); copy_object->ref_count--; assert(copy_object->ref_count > 0); vm_object_unlock(copy_object); goto block_and_backoff; } } else { /* * Allocate a page for the copy */ copy_m = vm_page_alloc(copy_object, copy_offset); if (copy_m == VM_PAGE_NULL) { RELEASE_PAGE(m); copy_object->ref_count--; assert(copy_object->ref_count > 0); vm_object_unlock(copy_object); vm_fault_cleanup(object, first_m); return(VM_FAULT_MEMORY_SHORTAGE); } /* * Must copy page into copy-object. */ vm_page_copy(m, copy_m); /* * If the old page was in use by any users * of the copy-object, it must be removed * from all pmaps. (We can't know which * pmaps use it.) */ vm_page_lock_queues(); pmap_page_protect(m->phys_addr, VM_PROT_NONE); copy_m->dirty = TRUE; vm_page_unlock_queues(); /* * If there's a pager, then immediately * page out this page, using the "initialize" * option. Else, we use the copy. */ if (!copy_object->pager_created) { vm_page_lock_queues(); vm_page_activate(copy_m); vm_page_unlock_queues(); PAGE_WAKEUP_DONE(copy_m); } else { /* * The page is already ready for pageout: * not on pageout queues and busy. * Unlock everything except the * copy_object itself. */ vm_object_unlock(object); /* * Write the page to the copy-object, * flushing it from the kernel. */ vm_pageout_page(copy_m, TRUE, TRUE); /* * Since the pageout may have * temporarily dropped the * copy_object's lock, we * check whether we'll have * to deallocate the hard way. */ if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { vm_object_unlock(copy_object); vm_object_deallocate(copy_object); vm_object_lock(object); continue; } /* * Pick back up the old object's * lock. [It is safe to do so, * since it must be deeper in the * object tree.] */ vm_object_lock(object); } /* * Because we're pushing a page upward * in the object tree, we must restart * any faults that are waiting here. * [Note that this is an expansion of * PAGE_WAKEUP that uses the THREAD_RESTART * wait result]. 
Can't turn off the page's * busy bit because we're not done with it. */ if (m->wanted) { m->wanted = FALSE; thread_wakeup_with_result((event_t) m, THREAD_RESTART); } } /* * The reference count on copy_object must be * at least 2: one for our extra reference, * and at least one from the outside world * (we checked that when we last locked * copy_object). */ copy_object->ref_count--; assert(copy_object->ref_count > 0); vm_object_unlock(copy_object); break; } *result_page = m; *top_page = first_m; /* * If the page can be written, assume that it will be. * [Earlier, we restrict the permission to allow write * access only if the fault so required, so we don't * mark read-only data as dirty.] */ if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE)) m->dirty = TRUE; return(VM_FAULT_SUCCESS); block_and_backoff: vm_fault_cleanup(object, first_m); if (continuation != (void (*)()) 0) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; /* * Save variables in case we must restart. */ state->vmfp_backoff = TRUE; state->vmf_prot = *protection; counter(c_vm_fault_page_block_backoff_user++); thread_block(continuation); } else { counter(c_vm_fault_page_block_backoff_kernel++); thread_block((void (*)()) 0); } after_block_and_backoff: if (current_thread()->wait_result == THREAD_AWAKENED) return VM_FAULT_RETRY; else return VM_FAULT_INTERRUPTED; #undef RELEASE_PAGE } /* * Routine: vm_fault * Purpose: * Handle page faults, including pseudo-faults * used to change the wiring status of pages. * Returns: * If an explicit (expression) continuation is supplied, * then we call the continuation instead of returning. * Implementation: * Explicit continuations make this a little icky, * because it hasn't been rewritten to embrace CPS. * Instead, we have resume arguments for vm_fault and * vm_fault_page, to let continue the fault computation. * * vm_fault and vm_fault_page save mucho state * in the moral equivalent of a closure. The state * structure is allocated when first entering vm_fault * and deallocated when leaving vm_fault. */ void vm_fault_continue(void) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; (void) vm_fault(state->vmf_map, state->vmf_vaddr, state->vmf_fault_type, state->vmf_change_wiring, TRUE, state->vmf_continuation); /*NOTREACHED*/ } kern_return_t vm_fault( vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, boolean_t change_wiring, boolean_t resume, void (*continuation)()) { vm_map_version_t version; /* Map version for verificiation */ boolean_t wired; /* Should mapping be wired down? */ vm_object_t object; /* Top-level object */ vm_offset_t offset; /* Top-level offset */ vm_prot_t prot; /* Protection for mapping */ vm_object_t old_copy_object; /* Saved copy object */ vm_page_t result_page; /* Result of vm_fault_page */ vm_page_t top_page; /* Placeholder page */ kern_return_t kr; vm_page_t m; /* Fast access to result_page */ if (resume) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; /* * Retrieve cached variables and * continue vm_fault_page. */ object = state->vmf_object; if (object == VM_OBJECT_NULL) goto RetryFault; version = state->vmf_version; wired = state->vmf_wired; offset = state->vmf_offset; prot = state->vmf_prot; kr = vm_fault_page(object, offset, fault_type, (change_wiring && !wired), !change_wiring, &prot, &result_page, &top_page, TRUE, vm_fault_continue); goto after_vm_fault_page; } if (continuation != (void (*)()) 0) { /* * We will probably need to save state. 
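 *
 *	(The state is a vm_fault_state_t allocated from
 *	vm_fault_state_cache rather than embedded in the thread
 *	structure; as noted at the top of this file, it is a little
 *	too hefty to keep there permanently.)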
*/ char * state; /* * if this assignment stmt is written as * 'active_threads[cpu_number()] = kmem_cache_alloc()', * cpu_number may be evaluated before kmem_cache_alloc; * if kmem_cache_alloc blocks, cpu_number will be wrong */ state = (char *) kmem_cache_alloc(&vm_fault_state_cache); current_thread()->ith_other = state; } RetryFault: ; /* * Find the backing store object and offset into * it to begin the search. */ if ((kr = vm_map_lookup(&map, vaddr, fault_type, &version, &object, &offset, &prot, &wired)) != KERN_SUCCESS) { goto done; } /* * If the page is wired, we must fault for the current protection * value, to avoid further faults. */ if (wired) fault_type = prot; /* * Make a reference to this object to * prevent its disposal while we are messing with * it. Once we have the reference, the map is free * to be diddled. Since objects reference their * shadows (and copies), they will stay around as well. */ assert(object->ref_count > 0); object->ref_count++; vm_object_paging_begin(object); if (continuation != (void (*)()) 0) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; /* * Save variables, in case vm_fault_page discards * our kernel stack and we have to restart. */ state->vmf_map = map; state->vmf_vaddr = vaddr; state->vmf_fault_type = fault_type; state->vmf_change_wiring = change_wiring; state->vmf_continuation = continuation; state->vmf_version = version; state->vmf_wired = wired; state->vmf_object = object; state->vmf_offset = offset; state->vmf_prot = prot; kr = vm_fault_page(object, offset, fault_type, (change_wiring && !wired), !change_wiring, &prot, &result_page, &top_page, FALSE, vm_fault_continue); } else { kr = vm_fault_page(object, offset, fault_type, (change_wiring && !wired), !change_wiring, &prot, &result_page, &top_page, FALSE, (void (*)()) 0); } after_vm_fault_page: /* * If we didn't succeed, lose the object reference immediately. */ if (kr != VM_FAULT_SUCCESS) vm_object_deallocate(object); /* * See why we failed, and take corrective action. */ switch (kr) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: goto RetryFault; case VM_FAULT_INTERRUPTED: kr = KERN_SUCCESS; goto done; case VM_FAULT_MEMORY_SHORTAGE: if (continuation != (void (*)()) 0) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; /* * Save variables in case VM_PAGE_WAIT * discards our kernel stack. */ state->vmf_map = map; state->vmf_vaddr = vaddr; state->vmf_fault_type = fault_type; state->vmf_change_wiring = change_wiring; state->vmf_continuation = continuation; state->vmf_object = VM_OBJECT_NULL; VM_PAGE_WAIT(vm_fault_continue); } else VM_PAGE_WAIT((void (*)()) 0); goto RetryFault; case VM_FAULT_FICTITIOUS_SHORTAGE: vm_page_more_fictitious(); goto RetryFault; case VM_FAULT_MEMORY_ERROR: kr = KERN_MEMORY_ERROR; goto done; } m = result_page; assert((change_wiring && !wired) ? (top_page == VM_PAGE_NULL) : ((top_page == VM_PAGE_NULL) == (m->object == object))); /* * How to clean up the result of vm_fault_page. This * happens whether the mapping is entered or not. 
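 *
 *	(UNLOCK_AND_DEALLOCATE pairs the vm_fault_cleanup of the result
 *	page's object and top_page with dropping the reference that was
 *	taken on "object" before vm_fault_page was called.)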
*/ #define UNLOCK_AND_DEALLOCATE \ MACRO_BEGIN \ vm_fault_cleanup(m->object, top_page); \ vm_object_deallocate(object); \ MACRO_END /* * What to do with the resulting page from vm_fault_page * if it doesn't get entered into the physical map: */ #define RELEASE_PAGE(m) \ MACRO_BEGIN \ PAGE_WAKEUP_DONE(m); \ vm_page_lock_queues(); \ if (!m->active && !m->inactive) \ vm_page_activate(m); \ vm_page_unlock_queues(); \ MACRO_END /* * We must verify that the maps have not changed * since our last lookup. */ old_copy_object = m->object->copy; vm_object_unlock(m->object); while (!vm_map_verify(map, &version)) { vm_object_t retry_object; vm_offset_t retry_offset; vm_prot_t retry_prot; /* * To avoid trying to write_lock the map while another * thread has it read_locked (in vm_map_pageable), we * do not try for write permission. If the page is * still writable, we will get write permission. If it * is not, or has been marked needs_copy, we enter the * mapping without write permission, and will merely * take another fault. */ kr = vm_map_lookup(&map, vaddr, fault_type & ~VM_PROT_WRITE, &version, &retry_object, &retry_offset, &retry_prot, &wired); if (kr != KERN_SUCCESS) { vm_object_lock(m->object); RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto done; } vm_object_unlock(retry_object); vm_object_lock(m->object); if ((retry_object != object) || (retry_offset != offset)) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto RetryFault; } /* * Check whether the protection has changed or the object * has been copied while we left the map unlocked. */ prot &= retry_prot; vm_object_unlock(m->object); } vm_object_lock(m->object); /* * If the copy object changed while the top-level object * was unlocked, then we must take away write permission. */ if (m->object->copy != old_copy_object) prot &= ~VM_PROT_WRITE; /* * If we want to wire down this page, but no longer have * adequate permissions, we must start all over. */ if (wired && (prot != fault_type)) { vm_map_verify_done(map, &version); RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto RetryFault; } /* * It's critically important that a wired-down page be faulted * only once in each map for which it is wired. */ vm_object_unlock(m->object); /* * Put this page into the physical map. * We had to do the unlock above because pmap_enter * may cause other faults. The page may be on * the pageout queues. If the pageout daemon comes * across the page, it will remove it from the queues. */ PMAP_ENTER(map->pmap, vaddr, m, prot, wired); /* * If the page is not wired down and isn't already * on a pageout queue, then put it where the * pageout daemon can find it. */ vm_object_lock(m->object); vm_page_lock_queues(); if (change_wiring) { if (wired) vm_page_wire(m); else vm_page_unwire(m); } else if (software_reference_bits) { if (!m->active && !m->inactive) vm_page_activate(m); m->reference = TRUE; } else { vm_page_activate(m); } vm_page_unlock_queues(); /* * Unlock everything, and return */ vm_map_verify_done(map, &version); PAGE_WAKEUP_DONE(m); kr = KERN_SUCCESS; UNLOCK_AND_DEALLOCATE; #undef UNLOCK_AND_DEALLOCATE #undef RELEASE_PAGE done: if (continuation != (void (*)()) 0) { vm_fault_state_t *state = (vm_fault_state_t *) current_thread()->ith_other; kmem_cache_free(&vm_fault_state_cache, (vm_offset_t) state); (*continuation)(kr); /*NOTREACHED*/ } return(kr); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. 
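 *
 *	Illustrative sketch of the expected caller (the real caller is
 *	the map wiring code, which also maintains each entry's wired
 *	count while holding the map lock):
 *
 *		for (entry = vm_map_first_entry(map);
 *		     entry != vm_map_to_entry(map);
 *		     entry = entry->vme_next)
 *			vm_fault_wire(map, entry);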
*/ void vm_fault_wire( vm_map_t map, vm_map_entry_t entry) { vm_offset_t va; pmap_t pmap; vm_offset_t end_addr = entry->vme_end; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the * range of addresses may not fault, so that * page tables and such can be locked down as well. */ pmap_pageable(pmap, entry->vme_start, end_addr, FALSE); /* * We simulate a fault to get the page and enter it * in the physical map. */ for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { if (vm_fault_wire_fast(map, va, entry) != KERN_SUCCESS) (void) vm_fault(map, va, VM_PROT_NONE, TRUE, FALSE, (void (*)()) 0); } } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. */ void vm_fault_unwire( vm_map_t map, vm_map_entry_t entry) { vm_offset_t va; pmap_t pmap; vm_offset_t end_addr = entry->vme_end; vm_object_t object; pmap = vm_map_pmap(map); object = (entry->is_sub_map) ? VM_OBJECT_NULL : entry->object.vm_object; /* * Since the pages are wired down, we must be able to * get their mappings from the physical map system. */ for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { pmap_change_wiring(pmap, va, FALSE); if (object == VM_OBJECT_NULL) { vm_map_lock_set_recursive(map); (void) vm_fault(map, va, VM_PROT_NONE, TRUE, FALSE, (void (*)()) 0); vm_map_lock_clear_recursive(map); } else { vm_prot_t prot; vm_page_t result_page; vm_page_t top_page; vm_fault_return_t result; do { prot = VM_PROT_NONE; vm_object_lock(object); vm_object_paging_begin(object); result = vm_fault_page(object, entry->offset + (va - entry->vme_start), VM_PROT_NONE, TRUE, FALSE, &prot, &result_page, &top_page, FALSE, (void (*)()) 0); } while (result == VM_FAULT_RETRY); if (result != VM_FAULT_SUCCESS) panic("vm_fault_unwire: failure"); vm_page_lock_queues(); vm_page_unwire(result_page); vm_page_unlock_queues(); PAGE_WAKEUP_DONE(result_page); vm_fault_cleanup(result_page->object, top_page); } } /* * Inform the physical mapping system that the range * of addresses may fault, so that page tables and * such may be unwired themselves. */ pmap_pageable(pmap, entry->vme_start, end_addr, TRUE); } /* * vm_fault_wire_fast: * * Handle common case of a wire down page fault at the given address. * If successful, the page is inserted into the associated physical map. * The map entry is passed in to avoid the overhead of a map lookup. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller has a read lock on the map. * * This is a stripped version of vm_fault() for wiring pages. Anything * other than the common case will return KERN_FAILURE, and the caller * is expected to call vm_fault(). */ kern_return_t vm_fault_wire_fast( vm_map_t map, vm_offset_t va, vm_map_entry_t entry) { vm_object_t object; vm_offset_t offset; vm_page_t m; vm_prot_t prot; vm_stat.faults++; /* needs lock XXX */ current_task()->faults++; /* * Recovery actions */ #undef RELEASE_PAGE #define RELEASE_PAGE(m) { \ PAGE_WAKEUP_DONE(m); \ vm_page_lock_queues(); \ vm_page_unwire(m); \ vm_page_unlock_queues(); \ } #undef UNLOCK_THINGS #define UNLOCK_THINGS { \ object->paging_in_progress--; \ vm_object_unlock(object); \ } #undef UNLOCK_AND_DEALLOCATE #define UNLOCK_AND_DEALLOCATE { \ UNLOCK_THINGS; \ vm_object_deallocate(object); \ } /* * Give up and have caller do things the hard way. 
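 *
 *	(GIVE_UP unlocks, drops the object reference, and returns
 *	KERN_FAILURE, at which point vm_fault_wire above falls back to
 *	the full vm_fault path for this address.)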
*/ #define GIVE_UP { \ UNLOCK_AND_DEALLOCATE; \ return(KERN_FAILURE); \ } /* * If this entry is not directly to a vm_object, bail out. */ if (entry->is_sub_map) return(KERN_FAILURE); /* * Find the backing store object and offset into it. */ object = entry->object.vm_object; offset = (va - entry->vme_start) + entry->offset; prot = entry->protection; /* * Make a reference to this object to prevent its * disposal while we are messing with it. */ vm_object_lock(object); assert(object->ref_count > 0); object->ref_count++; object->paging_in_progress++; /* * INVARIANTS (through entire routine): * * 1) At all times, we must either have the object * lock or a busy page in some object to prevent * some other thread from trying to bring in * the same page. * * 2) Once we have a busy page, we must remove it from * the pageout queues, so that the pageout daemon * will not grab it away. * */ /* * Look for page in top-level object. If it's not there or * there's something going on, give up. */ m = vm_page_lookup(object, offset); if ((m == VM_PAGE_NULL) || (m->error) || (m->busy) || (m->absent) || (prot & m->page_lock)) { GIVE_UP; } /* * Wire the page down now. All bail outs beyond this * point must unwire the page. */ vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); /* * Mark page busy for other threads. */ assert(!m->busy); m->busy = TRUE; assert(!m->absent); /* * Give up if the page is being written and there's a copy object */ if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { RELEASE_PAGE(m); GIVE_UP; } /* * Put this page into the physical map. * We have to unlock the object because pmap_enter * may cause other faults. */ vm_object_unlock(object); PMAP_ENTER(map->pmap, va, m, prot, TRUE); /* * Must relock object so that paging_in_progress can be cleared. */ vm_object_lock(object); /* * Unlock everything, and return */ PAGE_WAKEUP_DONE(m); UNLOCK_AND_DEALLOCATE; return(KERN_SUCCESS); } /* * Routine: vm_fault_copy_cleanup * Purpose: * Release a page used by vm_fault_copy. */ void vm_fault_copy_cleanup( vm_page_t page, vm_page_t top_page) { vm_object_t object = page->object; vm_object_lock(object); PAGE_WAKEUP_DONE(page); vm_page_lock_queues(); if (!page->active && !page->inactive) vm_page_activate(page); vm_page_unlock_queues(); vm_fault_cleanup(object, top_page); } /* * Routine: vm_fault_copy * * Purpose: * Copy pages from one virtual memory object to another -- * neither the source nor destination pages need be resident. * * Before actually copying a page, the version associated with * the destination address map wil be verified. * * In/out conditions: * The caller must hold a reference, but not a lock, to * each of the source and destination objects and to the * destination map. * * Results: * Returns KERN_SUCCESS if no errors were encountered in * reading or writing the data. Returns KERN_INTERRUPTED if * the operation was interrupted (only possible if the * "interruptible" argument is asserted). Other return values * indicate a permanent error in copying the data. * * The actual amount of data copied will be returned in the * "copy_size" argument. In the event that the destination map * verification failed, this amount may be less than the amount * requested. 
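 *
 *	Illustrative calling sketch (not the authoritative caller, which
 *	lives in the vm_map copy/overwrite code).  Here "version" is the
 *	vm_map_version_t obtained when dst_map was last looked up, and
 *	"bytes_wanted" simply stands in for the amount to be copied:
 *
 *		vm_size_t	size = bytes_wanted;
 *		kern_return_t	kr;
 *
 *		kr = vm_fault_copy(src_object, src_offset, &size,
 *				   dst_object, dst_offset,
 *				   dst_map, &version, FALSE);
 *
 *	On return, "size" holds the number of bytes actually copied,
 *	which may be less than requested if the destination map changed.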
*/ kern_return_t vm_fault_copy( vm_object_t src_object, vm_offset_t src_offset, vm_size_t *src_size, /* INOUT */ vm_object_t dst_object, vm_offset_t dst_offset, vm_map_t dst_map, vm_map_version_t *dst_version, boolean_t interruptible) { vm_page_t result_page; vm_prot_t prot; vm_page_t src_page; vm_page_t src_top_page; vm_page_t dst_page; vm_page_t dst_top_page; vm_size_t amount_done; vm_object_t old_copy_object; #define RETURN(x) \ MACRO_BEGIN \ *src_size = amount_done; \ MACRO_RETURN(x); \ MACRO_END amount_done = 0; do { /* while (amount_done != *src_size) */ RetrySourceFault: ; if (src_object == VM_OBJECT_NULL) { /* * No source object. We will just * zero-fill the page in dst_object. */ src_page = VM_PAGE_NULL; } else { prot = VM_PROT_READ; vm_object_lock(src_object); vm_object_paging_begin(src_object); switch (vm_fault_page(src_object, src_offset, VM_PROT_READ, FALSE, interruptible, &prot, &result_page, &src_top_page, FALSE, (void (*)()) 0)) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: goto RetrySourceFault; case VM_FAULT_INTERRUPTED: RETURN(MACH_SEND_INTERRUPTED); case VM_FAULT_MEMORY_SHORTAGE: VM_PAGE_WAIT((void (*)()) 0); goto RetrySourceFault; case VM_FAULT_FICTITIOUS_SHORTAGE: vm_page_more_fictitious(); goto RetrySourceFault; case VM_FAULT_MEMORY_ERROR: return(KERN_MEMORY_ERROR); } src_page = result_page; assert((src_top_page == VM_PAGE_NULL) == (src_page->object == src_object)); assert ((prot & VM_PROT_READ) != VM_PROT_NONE); vm_object_unlock(src_page->object); } RetryDestinationFault: ; prot = VM_PROT_WRITE; vm_object_lock(dst_object); vm_object_paging_begin(dst_object); switch (vm_fault_page(dst_object, dst_offset, VM_PROT_WRITE, FALSE, FALSE /* interruptible */, &prot, &result_page, &dst_top_page, FALSE, (void (*)()) 0)) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: goto RetryDestinationFault; case VM_FAULT_INTERRUPTED: if (src_page != VM_PAGE_NULL) vm_fault_copy_cleanup(src_page, src_top_page); RETURN(MACH_SEND_INTERRUPTED); case VM_FAULT_MEMORY_SHORTAGE: VM_PAGE_WAIT((void (*)()) 0); goto RetryDestinationFault; case VM_FAULT_FICTITIOUS_SHORTAGE: vm_page_more_fictitious(); goto RetryDestinationFault; case VM_FAULT_MEMORY_ERROR: if (src_page != VM_PAGE_NULL) vm_fault_copy_cleanup(src_page, src_top_page); return(KERN_MEMORY_ERROR); } assert ((prot & VM_PROT_WRITE) != VM_PROT_NONE); dst_page = result_page; old_copy_object = dst_page->object->copy; vm_object_unlock(dst_page->object); if (!vm_map_verify(dst_map, dst_version)) { BailOut: ; if (src_page != VM_PAGE_NULL) vm_fault_copy_cleanup(src_page, src_top_page); vm_fault_copy_cleanup(dst_page, dst_top_page); break; } vm_object_lock(dst_page->object); if (dst_page->object->copy != old_copy_object) { vm_object_unlock(dst_page->object); vm_map_verify_done(dst_map, dst_version); goto BailOut; } vm_object_unlock(dst_page->object); /* * Copy the page, and note that it is dirty * immediately. 
*/ if (src_page == VM_PAGE_NULL) vm_page_zero_fill(dst_page); else vm_page_copy(src_page, dst_page); dst_page->dirty = TRUE; /* * Unlock everything, and return */ vm_map_verify_done(dst_map, dst_version); if (src_page != VM_PAGE_NULL) vm_fault_copy_cleanup(src_page, src_top_page); vm_fault_copy_cleanup(dst_page, dst_top_page); amount_done += PAGE_SIZE; src_offset += PAGE_SIZE; dst_offset += PAGE_SIZE; } while (amount_done != *src_size); RETURN(KERN_SUCCESS); #undef RETURN /*NOTREACHED*/ } #ifdef notdef /* * Routine: vm_fault_page_overwrite * * Description: * A form of vm_fault_page that assumes that the * resulting page will be overwritten in its entirety, * making it unnecessary to obtain the correct *contents* * of the page. * * Implementation: * XXX Untested. Also unused. Eventually, this technology * could be used in vm_fault_copy() to advantage. */ vm_fault_return_t vm_fault_page_overwrite( vm_object_t dst_object, vm_offset_t dst_offset, vm_page_t *result_page) /* OUT */ { vm_page_t dst_page; #define interruptible FALSE /* XXX */ while (TRUE) { /* * Look for a page at this offset */ while ((dst_page = vm_page_lookup(dst_object, dst_offset)) == VM_PAGE_NULL) { /* * No page, no problem... just allocate one. */ dst_page = vm_page_alloc(dst_object, dst_offset); if (dst_page == VM_PAGE_NULL) { vm_object_unlock(dst_object); VM_PAGE_WAIT((void (*)()) 0); vm_object_lock(dst_object); continue; } /* * Pretend that the memory manager * write-protected the page. * * Note that we will be asking for write * permission without asking for the data * first. */ dst_page->overwriting = TRUE; dst_page->page_lock = VM_PROT_WRITE; dst_page->absent = TRUE; dst_object->absent_count++; break; /* * When we bail out, we might have to throw * away the page created here. */ #define DISCARD_PAGE \ MACRO_BEGIN \ vm_object_lock(dst_object); \ dst_page = vm_page_lookup(dst_object, dst_offset); \ if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \ VM_PAGE_FREE(dst_page); \ vm_object_unlock(dst_object); \ MACRO_END } /* * If the page is write-protected... */ if (dst_page->page_lock & VM_PROT_WRITE) { /* * ... and an unlock request hasn't been sent */ if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) { vm_prot_t u; kern_return_t rc; /* * ... then send one now. */ if (!dst_object->pager_ready) { vm_object_assert_wait(dst_object, VM_OBJECT_EVENT_PAGER_READY, interruptible); vm_object_unlock(dst_object); thread_block((void (*)()) 0); if (current_thread()->wait_result != THREAD_AWAKENED) { DISCARD_PAGE; return(VM_FAULT_INTERRUPTED); } continue; } u = dst_page->unlock_request |= VM_PROT_WRITE; vm_object_unlock(dst_object); if ((rc = memory_object_data_unlock( dst_object->pager, dst_object->pager_request, dst_offset + dst_object->paging_offset, PAGE_SIZE, u)) != KERN_SUCCESS) { printf("vm_object_overwrite: memory_object_data_unlock failed\n"); DISCARD_PAGE; return((rc == MACH_SEND_INTERRUPTED) ? VM_FAULT_INTERRUPTED : VM_FAULT_MEMORY_ERROR); } vm_object_lock(dst_object); continue; } /* ... fall through to wait below */ } else { /* * If the page isn't being used for other * purposes, then we're done. */ if ( ! (dst_page->busy || dst_page->absent || dst_page->error) ) break; } PAGE_ASSERT_WAIT(dst_page, interruptible); vm_object_unlock(dst_object); thread_block((void (*)()) 0); if (current_thread()->wait_result != THREAD_AWAKENED) { DISCARD_PAGE; return(VM_FAULT_INTERRUPTED); } } *result_page = dst_page; return(VM_FAULT_SUCCESS); #undef interruptible #undef DISCARD_PAGE } #endif /* notdef */
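
/*
 *	Illustrative sketch (not the authoritative caller) of how the
 *	machine-dependent trap code is expected to drive this module for
 *	a user page fault.  "fault_address" and "write_fault" stand in
 *	for whatever the trap frame provides; the FALSE arguments say
 *	that this is neither a wiring change nor a resumption, and no
 *	continuation is supplied:
 *
 *		kern_return_t	kr;
 *		vm_map_t	map = current_thread()->task->map;
 *
 *		kr = vm_fault(map, trunc_page(fault_address),
 *			      write_fault ? (VM_PROT_READ | VM_PROT_WRITE)
 *					  : VM_PROT_READ,
 *			      FALSE, FALSE, (void (*)()) 0);
 *
 *	Anything other than KERN_SUCCESS is normally reported back to
 *	the faulting thread as an exception.
 */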