/* Inode management routines

   Copyright (C) 1994,95,96,97,98,99 Free Software Foundation, Inc.

   Converted for ext2fs by Miles Bader <miles@gnu.org>

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2, or (at
   your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

#include "ext2fs.h"
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/statvfs.h>

/* these flags aren't actually defined by a header file yet, so temporarily
   disable them if necessary.  */
#ifndef UF_APPEND
#define UF_APPEND 0
#endif
#ifndef UF_NODUMP
#define UF_NODUMP 0
#endif
#ifndef UF_IMMUTABLE
#define UF_IMMUTABLE 0
#endif

#define	INOHSZ	512
#if	((INOHSZ&(INOHSZ-1)) == 0)
#define	INOHASH(ino)	((ino)&(INOHSZ-1))
#else
#define	INOHASH(ino)	(((unsigned)(ino))%INOHSZ)
#endif

static struct node *nodehash[INOHSZ];

static error_t read_node (struct node *np);

spin_lock_t generation_lock = SPIN_LOCK_INITIALIZER;

/* Initialize the inode hash table. */
void
inode_init ()
{
  int n;
  for (n = 0; n < INOHSZ; n++)
    nodehash[n] = 0;
}

/* Fetch inode INUM, set *NPP to the node structure;
   gain one user reference and lock the node.  */
error_t
diskfs_cached_lookup (int inum, struct node **npp)
{
  error_t err;
  struct node *np;
  struct disknode *dn;

  spin_lock (&diskfs_node_refcnt_lock);
  for (np = nodehash[INOHASH(inum)]; np; np = np->dn->hnext)
    if (np->cache_id == inum)
      {
	np->references++;
	spin_unlock (&diskfs_node_refcnt_lock);
	mutex_lock (&np->lock);
	*npp = np;
	return 0;
      }

  /* Format specific data for the new node.  */
  dn = malloc (sizeof (struct disknode));
  if (! dn)
    {
      spin_unlock (&diskfs_node_refcnt_lock);
      return ENOMEM;
    }
  dn->dirents = 0;
  dn->dir_idx = 0;
  dn->pager = 0;
  rwlock_init (&dn->alloc_lock);
  pokel_init (&dn->indir_pokel, diskfs_disk_pager, disk_image);

  /* Create the new node.  */
  np = diskfs_make_node (dn);
  np->cache_id = inum;

  mutex_lock (&np->lock);

  /* Put NP in NODEHASH.  */
  dn->hnext = nodehash[INOHASH(inum)];
  if (dn->hnext)
    dn->hnext->dn->hprevp = &dn->hnext;
  dn->hprevp = &nodehash[INOHASH(inum)];
  nodehash[INOHASH(inum)] = np;

  spin_unlock (&diskfs_node_refcnt_lock);

  /* Get the contents of NP off disk.  */
  err = read_node (np);

  if (!diskfs_check_readonly () && !np->dn_stat.st_gen)
    {
      spin_lock (&generation_lock);
      if (++next_generation < diskfs_mtime->seconds)
	next_generation = diskfs_mtime->seconds;
      np->dn_stat.st_gen = next_generation;
      spin_unlock (&generation_lock);
      np->dn_set_ctime = 1;
    }

  if (err)
    return err;
  else
    {
      *npp = np;
      return 0;
    }
}

/* Lookup node INUM (which must have a reference already) and return it
   without allocating any new references. */
struct node *
ifind (ino_t inum)
{
  struct node *np;

  spin_lock (&diskfs_node_refcnt_lock);
  for (np = nodehash[INOHASH(inum)]; np; np = np->dn->hnext)
    {
      if (np->cache_id != inum)
	continue;

      assert (np->references);
      spin_unlock (&diskfs_node_refcnt_lock);
      return np;
    }
  assert (0);
}

/* The last reference to a node has gone away; drop
   it from the hash table and clean all state in the dn structure. */
void
diskfs_node_norefs (struct node *np)
{
  *np->dn->hprevp = np->dn->hnext;
  if (np->dn->hnext)
    np->dn->hnext->dn->hprevp = np->dn->hprevp;

  if (np->dn->dirents)
    free (np->dn->dirents);
  assert (!np->dn->pager);

  /* Move any pending writes of indirect blocks.  */
  pokel_inherit (&global_pokel, &np->dn->indir_pokel);
  pokel_finalize (&np->dn->indir_pokel);

  free (np->dn);
  free (np);
}

/* The last hard reference to a node has gone away; arrange to have
   all the weak references dropped that can be. */
void
diskfs_try_dropping_softrefs (struct node *np)
{
  drop_pager_softrefs (np);
}

/* The last hard reference to a node has gone away. */
void
diskfs_lost_hardrefs (struct node *np)
{
}

/* A new hard reference to a node has been created; it's now OK to
   have unused weak references. */
void
diskfs_new_hardrefs (struct node *np)
{
  allow_pager_softrefs (np);
}

/* Read stat information out of the ext2_inode. */
static error_t
read_node (struct node *np)
{
  error_t err;
  static int fsid, fsidset;
  struct stat *st = &np->dn_stat;
  struct disknode *dn = np->dn;
  struct ext2_inode *di = dino (np->cache_id);
  struct ext2_inode_info *info = &dn->info;

  err = diskfs_catch_exception ();
  if (err)
    return err;

  if (!fsidset)
    {
      fsid = getpid ();
      fsidset = 1;
    }

  st->st_fstype = FSTYPE_EXT2FS;
  st->st_fsid = fsid;
  st->st_ino = np->cache_id;
  st->st_blksize = vm_page_size * 2;

  st->st_nlink = di->i_links_count;
  st->st_size = di->i_size;
  st->st_gen = di->i_generation;

  st->st_atime = di->i_atime;
  st->st_mtime = di->i_mtime;
  st->st_ctime = di->i_ctime;

#ifdef XXX
  st->st_atime_usec = di->i_atime.ts_nsec / 1000;
  st->st_mtime_usec = di->i_mtime.ts_nsec / 1000;
  st->st_ctime_usec = di->i_ctime.ts_nsec / 1000;
#endif

  st->st_blocks = di->i_blocks;

  st->st_flags = 0;
  if (di->i_flags & EXT2_APPEND_FL)
    st->st_flags |= UF_APPEND;
  if (di->i_flags & EXT2_NODUMP_FL)
    st->st_flags |= UF_NODUMP;
  if (di->i_flags & EXT2_IMMUTABLE_FL)
    st->st_flags |= UF_IMMUTABLE;

  if (sblock->s_creator_os == EXT2_OS_HURD)
    {
      st->st_mode = di->i_mode | (di->i_mode_high << 16);
      st->st_mode &= ~S_ITRANS;
      if (di->i_translator)
	st->st_mode |= S_IPTRANS;

      st->st_uid = di->i_uid | (di->i_uid_high << 16);
      st->st_gid = di->i_gid | (di->i_gid_high << 16);

      st->st_author = di->i_author;
      if (st->st_author == -1)
	st->st_author = st->st_uid;
    }
  else
    {
      st->st_mode = di->i_mode & ~S_ITRANS;
      st->st_uid = di->i_uid;
      st->st_gid = di->i_gid;
      st->st_author = st->st_uid;
      np->author_tracks_uid = 1;
    }

  /* Setup the ext2fs auxiliary inode info.  */
  info->i_dtime = di->i_dtime;
  info->i_flags = di->i_flags;
  info->i_faddr = di->i_faddr;
  info->i_frag_no = di->i_frag;
  info->i_frag_size = di->i_fsize;
  info->i_osync = 0;
  info->i_file_acl = di->i_file_acl;
  if (S_ISDIR (st->st_mode))
    info->i_dir_acl = di->i_dir_acl;
  else
    {
      info->i_dir_acl = 0;
      info->i_high_size = di->i_size_high;
      if (info->i_high_size)	/* XXX */
	{
	  ext2_warning ("cannot handle large file inode %d", np->cache_id);
	  return EFBIG;
	}
    }
  info->i_block_group = inode_group_num (np->cache_id);
  info->i_next_alloc_block = 0;
  info->i_next_alloc_goal = 0;
  info->i_prealloc_count = 0;

  /* Set to a conservative value.  */
  dn->last_page_partially_writable = 0;

  if (S_ISCHR (st->st_mode) || S_ISBLK (st->st_mode))
    st->st_rdev = di->i_block[0];
  else
    {
      memcpy (info->i_data, di->i_block,
	      EXT2_N_BLOCKS * sizeof info->i_data[0]);
      st->st_rdev = 0;
    }

  diskfs_end_catch_exception ();

  if (S_ISREG (st->st_mode) || S_ISDIR (st->st_mode)
      || (S_ISLNK (st->st_mode) && st->st_blocks))
    {
      unsigned offset;

      np->allocsize = np->dn_stat.st_size;

      /* Round up to a block multiple.  */
      offset = np->allocsize & ((1 << log2_block_size) - 1);
      if (offset > 0)
	np->allocsize += block_size - offset;
    }
  else
    /* Allocsize should be zero for anything except directories, files, and
       long symlinks.  These are the only things allowed to have any blocks
       allocated as well, although st_size may be zero for any type (cases
       where st_blocks=0 and st_size>0 include fast symlinks, and, under
       linux, some devices).  */
    np->allocsize = 0;

  return 0;
}

/* Return EINVAL if this is not a hurd filesystem and any bits are set in L
   except the low 16 bits, else 0.  */
static inline error_t
check_high_bits (struct node *np, long l)
{
  if (sblock->s_creator_os == EXT2_OS_HURD)
    return 0;
  else
    return ((l & ~0xFFFF) == 0) ? 0 : EINVAL;
}

/* Return 0 if NP's owner can be changed to UID; otherwise return an error
   code. */
error_t
diskfs_validate_owner_change (struct node *np, uid_t uid)
{
  return check_high_bits (np, uid);
}

/* Return 0 if NP's group can be changed to GID; otherwise return an error
   code. */
error_t
diskfs_validate_group_change (struct node *np, gid_t gid)
{
  return check_high_bits (np, gid);
}

/* Return 0 if NP's mode can be changed to MODE; otherwise return an error
   code.  It must always be possible to clear the mode; diskfs will not ask
   for permission before doing so.  */
error_t
diskfs_validate_mode_change (struct node *np, mode_t mode)
{
  return check_high_bits (np, mode);
}

/* Return 0 if NP's author can be changed to AUTHOR; otherwise return an
   error code. */
error_t
diskfs_validate_author_change (struct node *np, uid_t author)
{
  if (sblock->s_creator_os == EXT2_OS_HURD)
    return 0;
  else
    /* For non-hurd filesystems, the author & owner are the same.  */
    return (author == np->dn_stat.st_uid) ? 0 : EINVAL;
}

/* The user may define this function.  Return 0 if NP's flags can be
   changed to FLAGS; otherwise return an error code.  It must always
   be possible to clear the flags.   */
error_t
diskfs_validate_flags_change (struct node *np, int flags)
{
  if (flags & ~(UF_NODUMP | UF_IMMUTABLE | UF_APPEND))
    return EINVAL;
  else
    return 0;
}

/* Writes everything from NP's inode to the disk image, and returns a pointer
   to it, or NULL if nothing need be done.  */
static struct ext2_inode *
write_node (struct node *np)
{
  error_t err;
  struct stat *st = &np->dn_stat;
  struct ext2_inode *di = dino (np->cache_id);

  if (np->dn->info.i_prealloc_count)
    ext2_discard_prealloc (np);

  assert (!np->dn_set_ctime && !np->dn_set_atime && !np->dn_set_mtime);
  if (np->dn_stat_dirty)
    {
      struct ext2_inode_info *info = &np->dn->info;

      assert (!diskfs_readonly);

      ext2_debug ("writing inode %d to disk", np->cache_id);

      err = diskfs_catch_exception ();
      if (err)
	return NULL;

      di->i_generation = st->st_gen;

      /* We happen to know that the stat mode bits are the same
	 as the ext2fs mode bits. */
      /* XXX? */

      /* Only the low 16 bits of these fields are standard across all ext2
	 implementations.  */
      di->i_mode = st->st_mode & 0xFFFF & ~S_ITRANS;
      di->i_uid = st->st_uid & 0xFFFF;
      di->i_gid = st->st_gid & 0xFFFF;

      if (sblock->s_creator_os == EXT2_OS_HURD)
	/* If this is a hurd-compatible filesystem, write the high bits too. */
	{
	  di->i_mode_high = (st->st_mode >> 16) & 0xffff & ~S_ITRANS;
	  di->i_uid_high = st->st_uid >> 16;
	  di->i_gid_high = st->st_gid >> 16;
	  di->i_author = st->st_author;
	}
      else
	/* No hurd extensions should be turned on.  */
	{
	  assert ((st->st_uid & ~0xFFFF) == 0);
	  assert ((st->st_gid & ~0xFFFF) == 0);
	  assert ((st->st_mode & ~0xFFFF) == 0);
	  assert (np->author_tracks_uid && st->st_author == st->st_uid);
	}

      di->i_links_count = st->st_nlink;
      di->i_size = st->st_size;

      di->i_atime = st->st_atime;
      di->i_mtime = st->st_mtime;
      di->i_ctime = st->st_ctime;
#ifdef XXX
      di->i_atime.ts_nsec = st->st_atime_usec * 1000;
      di->i_mtime.ts_nsec = st->st_mtime_usec * 1000;
      di->i_ctime.ts_nsec = st->st_ctime_usec * 1000;
#endif

      di->i_blocks = st->st_blocks;

      /* Convert generic flags in ST->st_flags to ext2-specific flags in DI
         (but don't mess with ext2 flags we don't know about).  The original
	 set was copied from DI into INFO by read_node, but might have been
	 modified for ext2fs-specific reasons; so we use INFO->i_flags
	 to start with, and then apply the flags in ST->st_flags.  */
      info->i_flags &= ~(EXT2_APPEND_FL | EXT2_NODUMP_FL | EXT2_IMMUTABLE_FL);
      if (st->st_flags & UF_APPEND)
	info->i_flags |= EXT2_APPEND_FL;
      if (st->st_flags & UF_NODUMP)
	info->i_flags |= EXT2_NODUMP_FL;
      if (st->st_flags & UF_IMMUTABLE)
	info->i_flags |= EXT2_IMMUTABLE_FL;
      di->i_flags = info->i_flags;

      if (!(st->st_mode & S_IPTRANS) && sblock->s_creator_os == EXT2_OS_HURD)
	di->i_translator = 0;

      /* Set dtime non-zero to indicate a deleted file.  */
      di->i_dtime = (st->st_mode ? 0 : di->i_mtime);

      if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode))
	di->i_block[0] = st->st_rdev;
      else
	{
	  int block;
	  for (block = 0; block < EXT2_N_BLOCKS; block++)
	    di->i_block[block] = np->dn->info.i_data[block];
	}

      diskfs_end_catch_exception ();
      np->dn_stat_dirty = 0;

      return di;
    }
  else
    return NULL;
}

/* Reload all data specific to NODE from disk, without writing anything.
   Always called with DISKFS_READONLY true.  */
error_t
diskfs_node_reload (struct node *node)
{
  struct disknode *dn = node->dn;

  if (dn->dirents)
    {
      free (dn->dirents);
      dn->dirents = 0;
    }
  pokel_flush (&dn->indir_pokel);
  flush_node_pager (node);
  read_node (node);

  return 0;
}

/* For each active node, call FUN.  The node is to be locked around the call
   to FUN.  If FUN returns non-zero for any node, then immediately stop, and
   return that value.  */
error_t
diskfs_node_iterate (error_t (*fun)(struct node *))
{
  error_t err = 0;
  int n, num_nodes = 0;
  struct node *node, **node_list, **p;

  spin_lock (&diskfs_node_refcnt_lock);

  /* We must copy everything from the hash table into another data structure
     to avoid running into any problems with the hash-table being modified
     during processing (normally we delegate access to hash-table with
     diskfs_node_refcnt_lock, but we can't hold this while locking the
     individual node locks).  */

  for (n = 0; n < INOHSZ; n++)
    for (node = nodehash[n]; node; node = node->dn->hnext)
      num_nodes++;

  node_list = alloca (num_nodes * sizeof (struct node *));
  p = node_list;
  for (n = 0; n < INOHSZ; n++)
    for (node = nodehash[n]; node; node = node->dn->hnext)
      {
	*p++ = node;
	node->references++;
      }

  spin_unlock (&diskfs_node_refcnt_lock);

  p = node_list;
  while (num_nodes-- > 0)
    {
      node = *p++;
      if (!err)
	{
	  mutex_lock (&node->lock);
	  err = (*fun)(node);
	  mutex_unlock (&node->lock);
	}
      diskfs_nrele (node);
    }

  return err;
}

/* Write all active disknodes into the ext2_inode pager. */
void
write_all_disknodes ()
{
  error_t write_one_disknode (struct node *node)
    {
      struct ext2_inode *di;

      diskfs_set_node_times (node);

      /* Sync the indirect blocks here; they'll all be done before any
	 inodes.  Waiting for them shouldn't be too bad.  */
      pokel_sync (&node->dn->indir_pokel, 1);

      /* Update the inode image.  */
      di = write_node (node);
      if (di)
	record_global_poke (di);

      return 0;
    }

  diskfs_node_iterate (write_one_disknode);
}

/* Sync the info in NP->dn_stat and any associated format-specific
   information to disk.  If WAIT is true, then return only after the
   physicial media has been completely updated.  */
void
diskfs_write_disknode (struct node *np, int wait)
{
  struct ext2_inode *di = write_node (np);
  if (di)
    {
      if (wait)
	sync_global_ptr (di, 1);
      else
	record_global_poke (di);
    }
}

/* Set *ST with appropriate values to reflect the current state of the
   filesystem.  */
error_t
diskfs_set_statfs (struct statfs *st)
{
  st->f_type = FSTYPE_EXT2FS;
  st->f_bsize = block_size;
  st->f_blocks = sblock->s_blocks_count;
  st->f_bfree = sblock->s_free_blocks_count;
  st->f_bavail = st->f_bfree - sblock->s_r_blocks_count;
  st->f_files = sblock->s_inodes_count;
  st->f_ffree = sblock->s_free_inodes_count;
  st->f_fsid = getpid ();
  st->f_namelen = 0;
  st->f_favail = st->f_ffree;
  st->f_frsize = frag_size;
  return 0;
}

/* Implement the diskfs_set_translator callback from the diskfs
   library; see <hurd/diskfs.h> for the interface description. */
error_t
diskfs_set_translator (struct node *np, const char *name, unsigned namelen,
		       struct protid *cred)
{
  daddr_t blkno;
  error_t err;
  char buf[block_size];
  struct ext2_inode *di;

  assert (!diskfs_readonly);

  if (sblock->s_creator_os != EXT2_OS_HURD)
    return EOPNOTSUPP;

  if (namelen + 2 > block_size)
    return ENAMETOOLONG;

  err = diskfs_catch_exception ();
  if (err)
    return err;

  di = dino (np->cache_id);
  blkno = di->i_translator;

  if (namelen && !blkno)
    {
      /* Allocate block for translator */
      blkno =
	ext2_new_block ((np->dn->info.i_block_group
			 * EXT2_BLOCKS_PER_GROUP (sblock))
			+ sblock->s_first_data_block,
			0, 0, 0);
      if (blkno == 0)
	{
	  diskfs_end_catch_exception ();
	  return ENOSPC;
	}

      di->i_translator = blkno;
      record_global_poke (di);

      np->dn_stat.st_blocks += 1 << log2_stat_blocks_per_fs_block;
      np->dn_set_ctime = 1;
    }
  else if (!namelen && blkno)
    {
      /* Clear block for translator going away. */
      di->i_translator = 0;
      record_global_poke (di);
      ext2_free_blocks (blkno, 1);

      np->dn_stat.st_blocks -= 1 << log2_stat_blocks_per_fs_block;
      np->dn_stat.st_mode &= ~S_IPTRANS;
      np->dn_set_ctime = 1;
    }

  if (namelen)
    {
      buf[0] = namelen & 0xFF;
      buf[1] = (namelen >> 8) & 0xFF;
      bcopy (name, buf + 2, namelen);

      bcopy (buf, bptr (blkno), block_size);
      record_global_poke (bptr (blkno));

      np->dn_stat.st_mode |= S_IPTRANS;
      np->dn_set_ctime = 1;
    }

  diskfs_end_catch_exception ();
  return err;
}

/* Implement the diskfs_get_translator callback from the diskfs library.
   See <hurd/diskfs.h> for the interface description. */
error_t
diskfs_get_translator (struct node *np, char **namep, unsigned *namelen)
{
  error_t err;
  daddr_t blkno;
  unsigned datalen;
  void *transloc;

  assert (sblock->s_creator_os == EXT2_OS_HURD);

  err = diskfs_catch_exception ();
  if (err)
    return err;

  blkno = (dino (np->cache_id))->i_translator;
  assert (blkno);
  transloc = bptr (blkno);

  datalen =
    ((unsigned char *)transloc)[0] + (((unsigned char *)transloc)[1] << 8);
  *namep = malloc (datalen);
  bcopy (transloc + 2, *namep, datalen);

  diskfs_end_catch_exception ();

  *namelen = datalen;
  return 0;
}

/* The maximum size of a symlink store in the inode (including '\0').  */
#define MAX_INODE_SYMLINK \
  (EXT2_N_BLOCKS * sizeof (((struct ext2_inode *)0)->i_block[0]))

/* Write an in-inode symlink, or return EINVAL if we can't.  */
static error_t
write_symlink (struct node *node, const char *target)
{
  size_t len = strlen (target) + 1;

  if (len > MAX_INODE_SYMLINK)
    return EINVAL;

  assert (node->dn_stat.st_blocks == 0);

  bcopy (target, node->dn->info.i_data, len);
  node->dn_stat.st_size = len - 1;
  node->dn_set_ctime = 1;
  node->dn_set_mtime = 1;

  return 0;
}

/* Read an in-inode symlink, or return EINVAL if we can't.  */
static error_t
read_symlink (struct node *node, char *target)
{
  if (node->dn_stat.st_blocks)
    return EINVAL;

  assert (node->dn_stat.st_size < MAX_INODE_SYMLINK);

  bcopy (node->dn->info.i_data, target, node->dn_stat.st_size);
  return 0;
}

/* If this function is nonzero (and diskfs_shortcut_symlink is set) it
   is called to set a symlink.  If it returns EINVAL or isn't set,
   then the normal method (writing the contents into the file data) is
   used.  If it returns any other error, it is returned to the user.  */
error_t (*diskfs_create_symlink_hook)(struct node *np, const char *target) =
  write_symlink;

/* If this function is nonzero (and diskfs_shortcut_symlink is set) it
   is called to read the contents of a symlink.  If it returns EINVAL or
   isn't set, then the normal method (reading from the file data) is
   used.  If it returns any other error, it is returned to the user. */
error_t (*diskfs_read_symlink_hook)(struct node *np, char *target) =
  read_symlink;

/* Called when all hard ports have gone away. */
void
diskfs_shutdown_soft_ports ()
{
  /* Should initiate termination of internally held pager ports
     (the only things that should be soft) XXX */
}