Diffstat (limited to 'ufs/pager.c')
-rw-r--r-- | ufs/pager.c | 806
1 file changed, 806 insertions, 0 deletions
diff --git a/ufs/pager.c b/ufs/pager.c
new file mode 100644
index 00000000..3038932d
--- /dev/null
+++ b/ufs/pager.c
@@ -0,0 +1,806 @@
/* Pager for ufs
   Copyright (C) 1994, 1995, 1996, 1997, 1999 Free Software Foundation

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2, or (at
   your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

#include "ufs.h"
#include <strings.h>
#include <stdio.h>
#include <unistd.h>
#include <hurd/store.h>

spin_lock_t node2pagelock = SPIN_LOCK_INITIALIZER;

spin_lock_t unlocked_pagein_lock = SPIN_LOCK_INITIALIZER;

#ifdef DONT_CACHE_MEMORY_OBJECTS
#define MAY_CACHE 0
#else
#define MAY_CACHE 1
#endif

struct port_bucket *pager_bucket;

/* Mapped image of the disk. */
void *disk_image;

/* Find the location on disk of page OFFSET in pager UPI.  Return the
   disk address (in disk block) in *ADDR.  If *NPLOCK is set on
   return, then release that mutex after I/O on the data has
   completed.  Set DISKSIZE to be the amount of valid data on disk.
   (If this is an unallocated block, then set *ADDR to zero.)
   ISREAD is non-zero iff this is for a pagein. */
static error_t
find_address (struct user_pager_info *upi,
              vm_address_t offset,
              daddr_t *addr,
              int *disksize,
              struct rwlock **nplock,
              int isread)
{
  error_t err;
  struct rwlock *lock;

  assert (upi->type == DISK || upi->type == FILE_DATA);

  if (upi->type == DISK)
    {
      *disksize = __vm_page_size;
      *addr = offset / DEV_BSIZE;
      *nplock = 0;
      return 0;
    }
  else
    {
      struct iblock_spec indirs[NIADDR + 1];
      struct node *np;

      np = upi->np;

      if (isread)
        {
        try_again:

          /* If we should allow an unlocked pagein, do so.  (This
             still has a slight race; there could be a pageout in progress
             which is blocked on NP->np->allocptrlock itself.  In that
             case the pagein that should proceed unimpeded is blocked
             in the pager library waiting for the pageout to complete.
             I think this is sufficiently rare to put it off for the time
             being.) */

          spin_lock (&unlocked_pagein_lock);
          if (offset >= upi->allow_unlocked_pagein
              && (offset + vm_page_size
                  <= upi->allow_unlocked_pagein + upi->unlocked_pagein_length))
            {
              spin_unlock (&unlocked_pagein_lock);
              *nplock = 0;
              goto have_lock;
            }
          spin_unlock (&unlocked_pagein_lock);

          /* Block on the rwlock if necessary; but when we wake up,
             don't acquire it; check again from the top.
             This is mutated inline from rwlock.h.
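             (Concretely: if a writer holds the lock or writers are
             waiting, sleep on the wakeup condition and retry from
             try_again rather than taking the read lock; otherwise take
             a reader reference and hand the lock back in *NPLOCK.)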
             */
          lock = &np->dn->allocptrlock;
          mutex_lock (&lock->master);
          if (lock->readers == -1 || lock->writers_waiting)
            {
              lock->readers_waiting++;
              condition_wait (&lock->wakeup, &lock->master);
              lock->readers_waiting--;
              mutex_unlock (&lock->master);
              goto try_again;
            }
          lock->readers++;
          mutex_unlock (&lock->master);
          *nplock = lock;
        }
      else
        {
          rwlock_reader_lock (&np->dn->allocptrlock);
          *nplock = &np->dn->allocptrlock;
        }

    have_lock:

      if (offset >= np->allocsize)
        {
          if (*nplock)
            rwlock_reader_unlock (*nplock);
          if (isread)
            return EIO;
          else
            {
              *addr = 0;
              *disksize = 0;
              return 0;
            }
        }

      if (offset + __vm_page_size > np->allocsize)
        *disksize = np->allocsize - offset;
      else
        *disksize = __vm_page_size;

      err = fetch_indir_spec (np, lblkno (sblock, offset), indirs);
      if (err && *nplock)
        rwlock_reader_unlock (*nplock);
      else
        {
          if (indirs[0].bno)
            *addr = (fsbtodb (sblock, indirs[0].bno)
                     + blkoff (sblock, offset) / DEV_BSIZE);
          else
            *addr = 0;
        }

      return err;
    }
}


/* Implement the pager_read_page callback from the pager library.  See
   <hurd/pager.h> for the interface description. */
error_t
pager_read_page (struct user_pager_info *pager,
                 vm_offset_t page,
                 vm_address_t *buf,
                 int *writelock)
{
  error_t err;
  struct rwlock *nplock;
  daddr_t addr;
  int disksize;

  err = find_address (pager, page, &addr, &disksize, &nplock, 1);
  if (err)
    return err;

  if (addr)
    {
      size_t read = 0;
      err = store_read (store, addr << log2_dev_blocks_per_dev_bsize,
                        disksize, (void **)buf, &read);
      if (read != disksize)
        err = EIO;
      if (!err && disksize != __vm_page_size)
        bzero ((void *)(*buf + disksize), __vm_page_size - disksize);
      *writelock = 0;
    }
  else
    {
#if 0
      printf ("Write-locked pagein Object %#x\tOffset %#x\n", pager, page);
      fflush (stdout);
#endif
      *buf = (vm_address_t) mmap (0, vm_page_size, PROT_READ|PROT_WRITE,
                                  MAP_ANON, 0, 0);
      *writelock = 1;
    }

  if (nplock)
    rwlock_reader_unlock (nplock);

  return err;
}

/* Implement the pager_write_page callback from the pager library.  See
   <hurd/pager.h> for the interface description. */
error_t
pager_write_page (struct user_pager_info *pager,
                  vm_offset_t page,
                  vm_address_t buf)
{
  daddr_t addr;
  int disksize;
  struct rwlock *nplock;
  error_t err;

  err = find_address (pager, page, &addr, &disksize, &nplock, 0);
  if (err)
    return err;

  if (addr)
    {
      size_t wrote;
      err = store_write (store, addr << log2_dev_blocks_per_dev_bsize,
                         (void *)buf, disksize, &wrote);
      if (wrote != disksize)
        err = EIO;
    }
  else
    err = 0;

  if (nplock)
    rwlock_reader_unlock (nplock);

  return err;
}

/* Implement the pager_unlock_page callback from the pager library.  See
   <hurd/pager.h> for the interface description. */
error_t
pager_unlock_page (struct user_pager_info *pager,
                   vm_offset_t address)
{
  struct node *np;
  error_t err;
  struct iblock_spec indirs[NIADDR + 1];
  daddr_t bno;
  struct disknode *dn;
  struct dinode *di;

  /* Zero an sblock->fs_bsize piece of disk starting at BNO,
     synchronously.  We do this on newly allocated indirect
     blocks before setting the pointer to them to ensure that an
     indirect block absolutely never points to garbage. */
  void zero_disk_block (int bno)
    {
      bzero (indir_block (bno), sblock->fs_bsize);
      sync_disk_blocks (bno, sblock->fs_bsize, 1);
    };

  /* Problem--where to get cred values for allocation here?
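     (The ffs_alloc calls below simply pass a null cred.)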
     */

#if 0
  printf ("Unlock page request, Object %#x\tOffset %#x...", pager, address);
  fflush (stdout);
#endif

  if (pager->type == DISK)
    return 0;

  np = pager->np;
  dn = np->dn;
  di = dino (dn->number);

  rwlock_writer_lock (&dn->allocptrlock);

  /* If this is the last block, we don't let it get unlocked. */
  if (address + __vm_page_size
      > blkroundup (sblock, np->allocsize) - sblock->fs_bsize)
    {
      printf ("attempt to unlock at last block denied\n");
      fflush (stdout);
      rwlock_writer_unlock (&dn->allocptrlock);
      return EIO;
    }

  err = fetch_indir_spec (np, lblkno (sblock, address), indirs);
  if (err)
    {
      rwlock_writer_unlock (&dn->allocptrlock);
      return EIO;
    }

  err = diskfs_catch_exception ();
  if (err)
    {
      rwlock_writer_unlock (&dn->allocptrlock);
      return EIO;
    }

  /* See if we need a triple indirect block; fail if we do. */
  assert (indirs[0].offset == -1
          || indirs[1].offset == -1
          || indirs[2].offset == -1);

  /* Check to see if this block is allocated. */
  if (indirs[0].bno == 0)
    {
      size_t wrote;

      if (indirs[0].offset == -1)
        {
          err = ffs_alloc (np, lblkno (sblock, address),
                           ffs_blkpref (np, lblkno (sblock, address),
                                        lblkno (sblock, address), di->di_db),
                           sblock->fs_bsize, &bno, 0);
          if (err)
            goto out;

          assert (lblkno (sblock, address) < NDADDR);
          err = store_write (store,
                             fsbtodb (sblock, bno)
                               << log2_dev_blocks_per_dev_bsize,
                             zeroblock, sblock->fs_bsize, &wrote);
          if (!err && wrote != sblock->fs_bsize)
            err = EIO;
          if (err)
            goto out;

          indirs[0].bno = bno;
          write_disk_entry (di->di_db[lblkno (sblock, address)], bno);
          record_poke (di, sizeof (struct dinode));
        }
      else
        {
          daddr_t *siblock;

          /* We need to set siblock to the single indirect block
             array; see if the single indirect block is allocated. */
          if (indirs[1].bno == 0)
            {
              if (indirs[1].offset == -1)
                {
                  err = ffs_alloc (np, lblkno (sblock, address),
                                   ffs_blkpref (np, lblkno (sblock, address),
                                                INDIR_SINGLE, di->di_ib),
                                   sblock->fs_bsize, &bno, 0);
                  if (err)
                    goto out;
                  zero_disk_block (bno);
                  indirs[1].bno = bno;
                  write_disk_entry (di->di_ib[INDIR_SINGLE], bno);
                  record_poke (di, sizeof (struct dinode));
                }
              else
                {
                  daddr_t *diblock;

                  /* We need to set diblock to the double indirect
                     block array; see if the double indirect block is
                     allocated. */
                  if (indirs[2].bno == 0)
                    {
                      /* This assert because triple indirection is
                         not supported. */
                      assert (indirs[2].offset == -1);

                      err = ffs_alloc (np, lblkno (sblock, address),
                                       ffs_blkpref (np, lblkno (sblock,
                                                                address),
                                                    INDIR_DOUBLE, di->di_ib),
                                       sblock->fs_bsize, &bno, 0);
                      if (err)
                        goto out;
                      zero_disk_block (bno);
                      indirs[2].bno = bno;
                      write_disk_entry (di->di_ib[INDIR_DOUBLE], bno);
                      record_poke (di, sizeof (struct dinode));
                    }

                  diblock = indir_block (indirs[2].bno);
                  mark_indir_dirty (np, indirs[2].bno);

                  /* Now we can allocate the single indirect block */

                  err = ffs_alloc (np, lblkno (sblock, address),
                                   ffs_blkpref (np, lblkno (sblock, address),
                                                indirs[1].offset, diblock),
                                   sblock->fs_bsize, &bno, 0);
                  if (err)
                    goto out;
                  zero_disk_block (bno);
                  indirs[1].bno = bno;
                  write_disk_entry (diblock[indirs[1].offset], bno);
                  record_poke (diblock, sblock->fs_bsize);
                }
            }

          siblock = indir_block (indirs[1].bno);
          mark_indir_dirty (np, indirs[1].bno);

          /* Now we can allocate the data block.
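             SIBLOCK now points at the in-core single indirect block
             (freshly zeroed if it was only just allocated), so the new
             data block's address can be recorded in it below.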
             */

          err = ffs_alloc (np, lblkno (sblock, address),
                           ffs_blkpref (np, lblkno (sblock, address),
                                        indirs[0].offset, siblock),
                           sblock->fs_bsize, &bno, 0);
          if (err)
            goto out;

          err = store_write (store,
                             fsbtodb (sblock, bno)
                               << log2_dev_blocks_per_dev_bsize,
                             zeroblock, sblock->fs_bsize, &wrote);
          if (!err && wrote != sblock->fs_bsize)
            err = EIO;
          if (err)
            goto out;

          indirs[0].bno = bno;
          write_disk_entry (siblock[indirs[0].offset], bno);
          record_poke (siblock, sblock->fs_bsize);
        }
    }

 out:
  diskfs_end_catch_exception ();
  rwlock_writer_unlock (&dn->allocptrlock);
  return err;
}

/* Implement the pager_report_extent callback from the pager library.  See
   <hurd/pager.h> for the interface description. */
inline error_t
pager_report_extent (struct user_pager_info *pager,
                     vm_address_t *offset,
                     vm_size_t *size)
{
  assert (pager->type == DISK || pager->type == FILE_DATA);

  *offset = 0;

  if (pager->type == DISK)
    *size = store->size;
  else
    *size = pager->np->allocsize;

  return 0;
}

/* Implement the pager_clear_user_data callback from the pager library.
   See <hurd/pager.h> for the interface description. */
void
pager_clear_user_data (struct user_pager_info *upi)
{
  /* XXX Do the right thing for the disk pager here too. */
  if (upi->type == FILE_DATA)
    {
      spin_lock (&node2pagelock);
      if (upi->np->dn->fileinfo == upi)
        upi->np->dn->fileinfo = 0;
      spin_unlock (&node2pagelock);
      diskfs_nrele_light (upi->np);
    }
  free (upi);
}

void
pager_dropweak (struct user_pager_info *upi __attribute__ ((unused)))
{
}


/* Create the DISK pager. */
void
create_disk_pager (void)
{
  struct user_pager_info *upi = malloc (sizeof (struct user_pager_info));

  upi->type = DISK;
  upi->np = 0;
  pager_bucket = ports_create_bucket ();
  diskfs_start_disk_pager (upi, pager_bucket, MAY_CACHE, store->size,
                           &disk_image);
  upi->p = diskfs_disk_pager;
}

/* This syncs a single file (NP) to disk.  Wait for all I/O to complete
   if WAIT is set.  NP->lock must be held. */
void
diskfs_file_update (struct node *np,
                    int wait)
{
  struct dirty_indir *d, *tmp;
  struct user_pager_info *upi;

  spin_lock (&node2pagelock);
  upi = np->dn->fileinfo;
  if (upi)
    ports_port_ref (upi->p);
  spin_unlock (&node2pagelock);

  if (upi)
    {
      pager_sync (upi->p, wait);
      ports_port_deref (upi->p);
    }

  for (d = np->dn->dirty; d; d = tmp)
    {
      sync_disk_blocks (d->bno, sblock->fs_bsize, wait);
      tmp = d->next;
      free (d);
    }
  np->dn->dirty = 0;

  diskfs_node_update (np, wait);
}

/* Invalidate any pager data associated with NODE. */
void
flush_node_pager (struct node *node)
{
  struct user_pager_info *upi;
  struct disknode *dn = node->dn;
  struct dirty_indir *dirty = dn->dirty;

  spin_lock (&node2pagelock);
  upi = dn->fileinfo;
  if (upi)
    ports_port_ref (upi->p);
  spin_unlock (&node2pagelock);

  if (upi)
    {
      pager_flush (upi->p, 1);
      ports_port_deref (upi->p);
    }

  dn->dirty = 0;

  while (dirty)
    {
      struct dirty_indir *next = dirty->next;
      free (dirty);
      dirty = next;
    }
}

/* Call this to create a FILE_DATA pager and return a send right.
   NP must be locked.  PROT is the max protection desired.
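   Returns MACH_PORT_NULL if the pager cannot be created.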
   */
mach_port_t
diskfs_get_filemap (struct node *np, vm_prot_t prot)
{
  struct user_pager_info *upi;
  mach_port_t right;

  assert (S_ISDIR (np->dn_stat.st_mode)
          || S_ISREG (np->dn_stat.st_mode)
          || (S_ISLNK (np->dn_stat.st_mode)
              && (!direct_symlink_extension
                  || np->dn_stat.st_size >= sblock->fs_maxsymlinklen)));

  spin_lock (&node2pagelock);
  do
    if (!np->dn->fileinfo)
      {
        upi = malloc (sizeof (struct user_pager_info));
        upi->type = FILE_DATA;
        upi->np = np;
        upi->max_prot = prot;
        upi->allow_unlocked_pagein = 0;
        upi->unlocked_pagein_length = 0;
        diskfs_nref_light (np);
        upi->p = pager_create (upi, pager_bucket,
                               MAY_CACHE, MEMORY_OBJECT_COPY_DELAY);
        if (upi->p == 0)
          {
            diskfs_nrele_light (np);
            free (upi);
            spin_unlock (&node2pagelock);
            return MACH_PORT_NULL;
          }
        np->dn->fileinfo = upi;
        right = pager_get_port (np->dn->fileinfo->p);
        ports_port_deref (np->dn->fileinfo->p);
      }
    else
      {
        np->dn->fileinfo->max_prot |= prot;

        /* Because NP->dn->fileinfo->p is not a real reference,
           this might be nearly deallocated.  If that's so, then
           the port right will be null.  In that case, clear here
           and loop.  The deallocation will complete separately. */
        right = pager_get_port (np->dn->fileinfo->p);
        if (right == MACH_PORT_NULL)
          np->dn->fileinfo = 0;
      }
  while (right == MACH_PORT_NULL);

  spin_unlock (&node2pagelock);

  mach_port_insert_right (mach_task_self (), right, right,
                          MACH_MSG_TYPE_MAKE_SEND);

  return right;
}

/* Call this when we should turn off caching so that unused memory object
   ports get freed. */
void
drop_pager_softrefs (struct node *np)
{
  struct user_pager_info *upi;

  spin_lock (&node2pagelock);
  upi = np->dn->fileinfo;
  if (upi)
    ports_port_ref (upi->p);
  spin_unlock (&node2pagelock);

  if (MAY_CACHE && upi)
    pager_change_attributes (upi->p, 0, MEMORY_OBJECT_COPY_DELAY, 0);
  if (upi)
    ports_port_deref (upi->p);
}

/* Call this when we should turn on caching because it's no longer
   important for unused memory object ports to get freed. */
void
allow_pager_softrefs (struct node *np)
{
  struct user_pager_info *upi;

  spin_lock (&node2pagelock);
  upi = np->dn->fileinfo;
  if (upi)
    ports_port_ref (upi->p);
  spin_unlock (&node2pagelock);

  if (MAY_CACHE && upi)
    pager_change_attributes (upi->p, 1, MEMORY_OBJECT_COPY_DELAY, 0);
  if (upi)
    ports_port_deref (upi->p);
}

static void
block_caching ()
{
  error_t block_cache (void *arg)
    {
      struct pager *p = arg;

      pager_change_attributes (p, 0, MEMORY_OBJECT_COPY_DELAY, 1);
      return 0;
    }

  /* Loop through the pagers and turn off caching one by one,
     synchronously.  That should cause termination of each pager. */
  ports_bucket_iterate (pager_bucket, block_cache);
}

static void
enable_caching ()
{
  error_t enable_cache (void *arg)
    {
      struct pager *p = arg;
      struct user_pager_info *upi = pager_get_upi (p);

      pager_change_attributes (p, 1, MEMORY_OBJECT_COPY_DELAY, 0);

      /* It's possible that we didn't have caching on before, because
         the user here is the only reference to the underlying node
         (actually, that's quite likely inside this particular
         routine), and if that node has no links.  So dinkle the node
         ref counting scheme here, which will cause caching to be
         turned off, if that's really necessary.
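         (The diskfs_nref/diskfs_nrele pair below is that dinkle:
         taking and dropping a hard reference forces the re-check.)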
         */
      if (upi->type == FILE_DATA)
        {
          diskfs_nref (upi->np);
          diskfs_nrele (upi->np);
        }

      return 0;
    }

  ports_bucket_iterate (pager_bucket, enable_cache);
}

/* Tell diskfs if there are pagers exported, and if none, then
   prevent any new ones from showing up. */
int
diskfs_pager_users ()
{
  int npagers = ports_count_bucket (pager_bucket);

  if (npagers <= 1)
    return 0;

  if (MAY_CACHE)
    {
      block_caching ();

      /* Give it a second; the kernel doesn't actually shutdown
         immediately.  XXX */
      sleep (1);

      npagers = ports_count_bucket (pager_bucket);
      if (npagers <= 1)
        return 0;

      /* Darn, there are actual honest users.  Turn caching back on,
         and return failure. */
      enable_caching ();
    }

  ports_enable_bucket (pager_bucket);

  return 1;
}

/* Return the bitwise or of the maximum prot parameter (the second arg to
   diskfs_get_filemap) for all active user pagers. */
vm_prot_t
diskfs_max_user_pager_prot ()
{
  vm_prot_t max_prot = 0;
  int npagers = ports_count_bucket (pager_bucket);

  if (npagers > 1)
    /* More than just the disk pager. */
    {
      error_t add_pager_max_prot (void *v_p)
        {
          struct pager *p = v_p;
          struct user_pager_info *upi = pager_get_upi (p);
          if (upi->type == FILE_DATA)
            max_prot |= upi->max_prot;
          /* Stop iterating if MAX_PROT is as filled as it's going to get. */
          return max_prot == (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
        }

      block_caching ();         /* Make any silly pagers go away. */

      /* Give it a second; the kernel doesn't actually shutdown
         immediately.  XXX */
      sleep (1);

      ports_bucket_iterate (pager_bucket, add_pager_max_prot);

      enable_caching ();
    }

  ports_enable_bucket (pager_bucket);

  return max_prot;
}

/* Call this to find out the struct pager * corresponding to the
   FILE_DATA pager of inode IP.  This should be used *only* as a subsequent
   argument to register_memory_fault_area, and will be deleted when
   the kernel interface is fixed.  NP must be locked. */
struct pager *
diskfs_get_filemap_pager_struct (struct node *np)
{
  /* This is safe because fileinfo can't be cleared; there must be
     an active mapping for this to be called. */
  return np->dn->fileinfo->p;
}

/* Shutdown all the pagers. */
void
diskfs_shutdown_pager ()
{
  error_t shutdown_one (void *arg)
    {
      struct pager *p = arg;
      /* Don't ever shut down the disk pager. */
      if (p != diskfs_disk_pager)
        pager_shutdown (p);
      return 0;
    }

  copy_sblock ();
  write_all_disknodes ();
  ports_bucket_iterate (pager_bucket, shutdown_one);
  sync_disk (1);
}

/* Sync all the pagers. */
void
diskfs_sync_everything (int wait)
{
  error_t sync_one (void *arg)
    {
      struct pager *p = arg;
      /* Make sure the disk pager is done last. */
      if (p != diskfs_disk_pager)
        pager_sync (p, wait);
      return 0;
    }

  copy_sblock ();
  write_all_disknodes ();
  ports_bucket_iterate (pager_bucket, sync_one);
  sync_disk (wait);
}