/* Pager for everything UFS pages
   Copyright (C) 1991, 1992 Free Software Foundation

This file is part of the GNU Hurd.

The GNU Hurd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

The GNU Hurd is distributed in the hope that it will be useful, 
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with the GNU Hurd; see the file COPYING.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */

/* Written by Michael I. Bushnell.  */

#include "ufs.h"
#include "fs.h"
#include "inode.h"
#include "dinode.h"
#include "memory_object.h"
#include <mach/vm_prot.h>
#include <mach/mig_errors.h>
#include <mach/message.h>
#include <mach/notify.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>

/* These are structs of various sorts that the pager needs.  */

/* Per-pager state.  One of these hangs off each memory object port;
   the address of the structure doubles as the name of the port (see
   cialloc and pager_cvt), which is why PORTTYPE must be the first
   member.  */
struct controlinfo 
{
  int porttype;			/* PT_PAGER while live; pager_release
				   sets PT_NONE when the last reference
				   goes away */
  enum pager_type
    {
      DINODE,			/* disk inodes */
      CG,			/* cylinder group headers */
      DINDIR,			/* double indirect blocks */
      SINDIR,			/* single indirect blocks */
      FILE_DATA,			/* read/write file contents */
      FILE_DATA_RO,			/* read only file contents */
    } pager_type;		/* also indexes pg_types for printing */
  enum
    {
      NOTINIT,			/* before memory_object_init */
      NORMAL,			/* while running */
      TERMINATING,		/* after shutdown starts */
      TERMINATED,		/* after m_o_terminate, before nosenders */
    } pager_state;
  
  struct mutex interlock;	/* modification to this struct and pagemap */
  struct condition wakeup;	/* when we are pausing for some reason
				   (seqno ordering, sync completion) */
  
  int refcnt;			/* count of references; protected by
				   refcntlock, not by interlock */

  /* Interface ports */
  memory_object_t memobj;	/* receive right */
  memory_object_control_t memobjcntl; /* send right, set by m_o_init */
  memory_object_name_t memobjname; /* send right, set by m_o_init */

  int seqno;			/* sequence number of last request */
  int waitingforseqno;		/* nonzero if some thread wants a
				   broadcast on WAKEUP when the current
				   request is done */
  
  int vsize;			/* virtual size in bytes, invalid for
				   file pagers */
  
  struct inode *ip;		/* associated inode, SINDIR and FILE_DATA only */

  /* Each pager is kept on a list of pagers of its type or on a free list.
     PPREV points at whatever pointer currently points at this entry.  */
  struct controlinfo *next, **pprev;
  
  int synccount;		/* number of syncs in progress */
  int writecount;		/* number of syncing pages left to write */

  /* This table has one element per page (only sized as needed),
     indicating the current state of that page.  Call pagemap_resize
     with INTERLOCK held before indexing it.  */
  struct pagemap *pagemap;
  int pagemapsize;		/* nonzero once PAGEMAP is allocated; it is
				   the size handed to vm_deallocate */

  struct controlinfo *opager;	/* FILE_DATA_RO->FILE_DATA, FILE_DATA->FILE_DATA_RO */
};

/* Printable names of the pager types, in the same order as the
   constants of enum pager_type so the enum value can be used as the
   index.  */
char *pg_types[] =
{
  "DINODE",
  "CG",
  "DINDIR",
  "SINDIR",
  "FILE_DATA",
  "FILE_DATA_RO"
};

/* This records the state of a single page.  The fields add up to
   eight bits, so the intent is one byte per page.
   NOTE(review): whether the enum bit-fields really pack into the same
   byte as the u_char flags is compiler-dependent -- confirm.  */
struct pagemap
{
  u_char
    pagingout:1,		/* being written to disk */
    pageinwait:1,		/* provide data back when write done */
    syncing:1,			/* decrement writecount when write done */
    invalid:1;			/* data on disk is irrevocably wrong */

  enum page_errors
    {
      PAGE_NOERR,
      PAGE_ENOSPC,
      PAGE_EIO,
      PAGE_EDQUOT,
    } next_error: 2;		/* issue this error on next data_request,
				   but only if it asks for write access.
				   The value indexes the page_errors table. */
  
  enum page_errors error:2;	/* doesn't belong here XXX */
};

/* Error codes reported to the kernel, indexed by enum page_errors.  */
static int page_errors[4] =
{
  KERN_SUCCESS,			/* PAGE_NOERR */
  ENOSPC,			/* PAGE_ENOSPC */
  EIO,				/* PAGE_EIO */
  EDQUOT			/* PAGE_EDQUOT */
};
  

/* Global variables for this file */

/* Filesystem blocks of inodes per cylinder group.  The DINODE pager
   uses it to turn a pager block number into a cylinder group and a
   block within that group's inode area.  */
static int infsb_pcg;

/* These are the pagers that exist only once that we create at start time:
   cylinder group headers, disk inodes, and double indirect blocks.  */
struct controlinfo *cginfo, *inodeinfo, *dininfo;

/* Lock of all controlinfo refcounts (and of the porttype field that
   pager_cvt inspects). */
static struct mutex refcntlock;

/* This is a filesystem block full of zeroes.  */
static char *zeroblock;

/* Lists of controlinfo structs */
static struct controlinfo *cifreelist; /* free structures.  */
static struct controlinfo *silist; /* SINDIR pagers */
static struct controlinfo *filelist; /* FILE_DATA pagers */
static struct mutex cilistlock;	/* locks above vars */

/* Forward declarations of the helpers private to this file.  They are
   declared static here, so the later definitions have internal linkage
   even where the definition itself omits the keyword.  */
static struct controlinfo *pager_cvt (mach_port_t);
static void wait_for_seqno (struct controlinfo *, int);
static void pagemap_resize (struct controlinfo *, int);
static void mark_object_error (struct controlinfo *, int, int, int);
static daddr_t indir_alloc (struct inode *, int, int);
static void mark_next_request_error (struct controlinfo *, int, int, int);
static struct controlinfo *cialloc (enum pager_type);
static int pager_release (struct controlinfo *);
static void cienqueue (struct controlinfo *);
static void dindir_drop (struct inode *);
static void sindir_drop (struct inode *, int, int);


/* Paging in and out: */

/* Called by the kernel when data is needed upon page fault.

   OBJECT is the memory object port, SEQNO the sequence number of this
   message, CONTROL the kernel's control port, and OFFSET/LENGTH the
   region wanted with ACCESS protection.  Only single, page-aligned
   page requests are handled; anything else is logged and ignored.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers and 0
   otherwise.  Paging failures are not returned; they are reported to
   the kernel with memory_object_data_error and recorded with
   mark_object_error.

   While the disk address is looked up and the page read, the inode's
   i_shrinkcount is held raised.  Every way out of the function after
   that point goes through the `out' label so that the count is always
   dropped again and i_shrinkwait is broadcast when it reaches zero.  */
kern_return_t
seqnos_memory_object_data_request (mach_port_t object, 
				   mach_port_seqno_t seqno,
				   mach_port_t control,
				   vm_offset_t offset,
				   vm_size_t length,
				   vm_prot_t access)
{
  struct controlinfo *ci;
  struct pagemap *entry;
  /* Shrink-hold bookkeeping; all null until a hold is actually taken.
     NOTE(review): these are volatile presumably because
     catch_exception can return a second time, setjmp-style --
     confirm against its definition.  */
  struct mutex *volatile slp = 0;
  int *volatile slip = 0;
  struct condition *volatile slc = 0;
  int doread, doerror;
  int daddr;
  int fsbaddr;
  int vblkno;
  error_t err;
  void *page;
  volatile int zero_len = 0;

  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;
  
  /* sanity checks -- we don't do multi-page requests yet.  */
  if (control != ci->memobjcntl)
    {
      printf ("incg data request: wrong control port");
      return 0;
    }
  if (length != __vm_page_size)
    {
      printf ("incg data request: bad length size");
      return 0;
    }
  if (offset % __vm_page_size)
    {
      printf ("incg data request: misaligned request");
      return 0;
    }

  /* Acquire the right to meddle with the pagemap */
  mutex_lock (&ci->interlock);
  wait_for_seqno (ci, seqno);

  if (ci->pager_state != NORMAL)
    {
      printf ("pager in wrong state for read\n");
      if (ci->waitingforseqno)
	{
	  ci->waitingforseqno = 0;
	  condition_broadcast (&ci->wakeup);
	}
      mutex_unlock (&ci->interlock);
      return 0;
    }

  pagemap_resize (ci, offset + length);

  /* If someone is paging this out right now, the disk contents are
     unreliable, so we have to wait.  It is too expensive (right now) to
     find the data and return it, and then interrupt the write, so we just
     mark the page and have the writing thread do m_o_data_ready when it
     gets around to it.  */
  entry = &ci->pagemap[offset / __vm_page_size];
  if (entry->pagingout)
    {
      doread = 0;
      entry->pageinwait = 1;
    }
  else
    doread = 1;

  /* A page whose last write failed can only be answered with an error.  */
  doerror = entry->invalid ? 1 : 0;

  /* An allocation failure recorded by data_unlock is delivered on the
     next request that wants write access.  */
  if (entry->next_error != PAGE_NOERR && (access & VM_PROT_WRITE))
    {
      memory_object_data_error (control, offset, length, 
				page_errors[entry->next_error]);
      mark_object_error (ci, offset, length, page_errors[entry->next_error]);
      entry->next_error = 0;
      doread = 0;
    }

  /* Let someone else in.  */
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);

  if (!doread)
    return 0;
  if (doerror)
    goto error_read;		/* no shrink hold taken yet; slp is null */
  
  /* Now we have to find the data on disk, and then bring it in.  */

  vblkno = lblkno (offset);

  switch (ci->pager_type)
    {
    case CG:
      fsbaddr = cgstart (vblkno);
      break;

    case DINODE:
      fsbaddr = cgimin (vblkno / infsb_pcg) + blkstofrags (vblkno % infsb_pcg);
      break;
      
    case DINDIR:
      {
	/* The block number within the DINDIR pager is the inode number.  */
	struct inode *ip = ifind (vblkno);
	
	mutex_lock (&ip->i_shrinklock);
	slp = &ip->i_shrinklock;
	slip = &ip->i_shrinkcount;
	slc = &ip->i_shrinkwait;
	ip->i_shrinkcount++;
	mutex_unlock (&ip->i_shrinklock);
	
	mutex_lock (&ip->i_dinlock);
	if (catch_exception ())
	  {
	    mutex_unlock (&ip->i_dinlock);
	    goto error_read;
	  }

	fsbaddr = ip->di->di_ib[INDIR_DOUBLE];
	mutex_unlock (&ip->i_dinlock);

	end_catch_exception ();
      }
      break;
      
    case SINDIR:
      {
	struct inode *ip = ci->ip;
	
	mutex_lock (&ip->i_shrinklock);
	slp = &ip->i_shrinklock;
	slip = &ip->i_shrinkcount;
	slc = &ip->i_shrinkwait;
	ip->i_shrinkcount++;
	mutex_unlock (&ip->i_shrinklock);
	
	mutex_lock (&ip->i_sinlock);
	if (catch_exception ())
	  {
	    mutex_unlock (&ip->i_sinlock);
	    goto error_read;
	  }

	/* Block 0 of the SINDIR pager is the inode's single indirect
	   block; the rest are found through the double indirect block.  */
	if (vblkno == 0)
	  fsbaddr = ip->di->di_ib[INDIR_SINGLE];
	else
	  {
	    if (!ip->i_dinloc)
	      din_map (ip);
	    fsbaddr = ip->i_dinloc[vblkno - 1];
	  }
	
	mutex_unlock (&ip->i_sinlock);
	end_catch_exception ();
      }
      break;
      
    case FILE_DATA:
      {
	struct inode *ip = ci->ip;
	int file_size = get_inode_vsize (ip);
	
	mutex_lock (&ip->i_shrinklock);
	slp = &ip->i_shrinklock;
	slip = &ip->i_shrinkcount;
	slc = &ip->i_shrinkwait;
	ip->i_shrinkcount++;
	mutex_unlock (&ip->i_shrinklock);

	mutex_lock (&ip->i_datalock);
	
	if (offset >= file_size)
	  {
	    /* Don't leave i_datalock held on the way out.  */
	    mutex_unlock (&ip->i_datalock);
	    goto error_read;
	  }
	/* The tail of a page straddling end of file is zeroed below.  */
	if (offset + length > file_size)
	  zero_len = (offset + length) - file_size;

	if (catch_exception ())
	  {
	    mutex_unlock (&ip->i_datalock);
	    goto error_read;
	  }
	
	if (vblkno < NDADDR)
	  fsbaddr = ip->di->di_db[vblkno];
	else
	  {
	    if (!ip->i_sinloc)
	      sin_map (ip);
	    fsbaddr = ip->i_sinloc[vblkno - NDADDR];
	  }
	mutex_unlock (&ip->i_datalock);
	
	end_catch_exception ();
      }
      break;
      
    case FILE_DATA_RO:
      panic ("FILE_DATA_RO read");
    default:
      panic ("m_o_d_r: unknown pager type");
    }
  
  if (!fsbaddr)
    {
      /* Note that we need to prevent writes so that an alloc can
	 occur synchronously with the write, instead of having to 
	 wait until pageout.  Also note that vm_allocate provides
	 zero-filled memory automatically.  */
      void *zeropage;
      vm_allocate (mach_task_self (), (u_int *)&zeropage, length, 1);
      memory_object_data_supply (ci->memobjcntl, offset, (u_int) zeropage,
				 length, 1,  VM_PROT_WRITE, 0, 
				 MACH_PORT_NULL);
      mark_object_error (ci, offset, length, 0);
      goto out;
    }
  
  daddr = fsbtodb (fsbaddr) + blkoff (offset) / DEV_BSIZE;
  
  err = dev_read_sync (daddr, &page, __vm_page_size);
  if (err)
    goto error_read;
  
  if (zero_len)
    bzero ((char *) page + __vm_page_size - zero_len, zero_len);
  
  memory_object_data_supply (ci->memobjcntl, offset, (u_int) page, length, 1,
			     VM_PROT_NONE, 0, MACH_PORT_NULL);
  mark_object_error (ci, offset, length, 0);
  goto out;
  
 error_read:
  memory_object_data_error (ci->memobjcntl, offset, length, EIO);
  mark_object_error (ci, offset, length, EIO);

 out:
  /* Now it is OK for the file size to change, so we can release our
     hold (if we ever took one).  */
  if (slp)
    {
      mutex_lock (slp);
      if (!--(*slip))
	condition_broadcast (slc);
      mutex_unlock (slp);
    }
  return 0;
}

/* Called by the kernel to write data to the backing store.

   OBJECT, SEQNO and CONTROL are as for data_request.  DATA holds
   LENGTH bytes belonging at OFFSET in the object; DIRTY says whether
   the page was modified.  KCOPY is not used here.  Only single,
   page-aligned pages are handled, and clean pages are ignored.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers and 0
   otherwise.  A failed write is not returned to the caller; instead
   the page is marked invalid in the pagemap so later reads get an
   error.

   NOTE(review): the early returns (sanity checks, !DIRTY) neither
   deallocate DATA nor take part in sequence-number ordering; whether
   that is a leak depends on the caller's conventions -- confirm.  */
kern_return_t
seqnos_memory_object_data_return (mach_port_t object, 
				  mach_port_seqno_t seqno,
				  mach_port_t control,
				  vm_offset_t offset,
				  pointer_t data,
				  vm_size_t length,
				  int dirty,
				  int kcopy)
{
  struct controlinfo *ci;
  struct pagemap *entry;
  struct mutex *volatile slp;
  int *volatile slip;
  struct condition *volatile slc;
  volatile int short_len;	/* bytes of this page lying beyond EOF */
  error_t err;
  int vblkno, fsbaddr, daddr;
  int write_size;

  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;
  
  /* sanity checks -- we don't do multi-page requests yet.  */
  if (control != ci->memobjcntl)
    {
      printf ("incg data request: wrong control port");
      return 0;
    }
  if (length != __vm_page_size)
    {
      printf ("incg data request: bad length size");
      return 0;
    }
  if (offset % __vm_page_size)
    {
      printf ("incg data request: misaligned request");
      return 0;
    }

  if (!dirty)
    return 0;

  /* Acquire the right to meddle with the pagemap */
  mutex_lock (&ci->interlock);
  wait_for_seqno (ci, seqno);

  if (ci->pager_state != NORMAL)
    {
      printf ("pager in wrong state for write\n");
      if (ci->waitingforseqno)
	{
	  ci->waitingforseqno = 0;
	  condition_broadcast (&ci->wakeup);
	}
      mutex_unlock (&ci->interlock);
      return 0;
    }

  pagemap_resize (ci, offset + length);

  entry = &ci->pagemap[offset / __vm_page_size];

  if (entry->pagingout || entry->syncing)
    panic ("double pageout");
   
  /* Mark this page as being paged out.  */
  entry->pagingout = 1;

  /* If this is written as part of a sync, do bookkeeping so we know when the
     sync is done.  */
  if (ci->synccount)
    {
      entry->syncing = 1;
      ci->writecount++;
    }

  /* Let someone else in. */
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);

  /* Now we have to find out where to write the data.  */
  vblkno = lblkno (offset);
  short_len = 0;
  slp = 0;
  
  switch (ci->pager_type)
    {
    case CG:
      fsbaddr = cgstart (vblkno);
      break;

    case DINODE:
      fsbaddr = cgimin (vblkno / infsb_pcg) + blkstofrags (vblkno % infsb_pcg);
      break;
      
    case DINDIR:
      {
	/* The block number within the DINDIR pager is the inode number.  */
	struct inode *ip = ifind (vblkno);
	
	mutex_lock (&ip->i_shrinklock);
	slp = &ip->i_shrinklock;
	slip = &ip->i_shrinkcount;
	slc = &ip->i_shrinkwait;
	ip->i_shrinkcount++;
	mutex_unlock (&ip->i_shrinklock);

	mutex_lock (&ip->i_dinlock);
	
	if (catch_exception ())
	  {
	    mutex_unlock (&ip->i_dinlock);
	    err = EIO;
	    goto out;
	  }

	fsbaddr = ip->di->di_ib[INDIR_DOUBLE];
	mutex_unlock (&ip->i_dinlock);

	end_catch_exception ();
      }
      break;
      
    case SINDIR:
      {
	struct inode *ip = ci->ip;
	
	mutex_lock (&ip->i_shrinklock);
	slp = &ip->i_shrinklock;
	slip = &ip->i_shrinkcount;
	slc = &ip->i_shrinkwait;
	ip->i_shrinkcount++;
	mutex_unlock (&ip->i_shrinklock);

	mutex_lock (&ip->i_sinlock);

	if (catch_exception ())
	  {
	    mutex_unlock (&ip->i_sinlock);
	    err = EIO;
	    goto out;
	  }

	if (vblkno == 0)
	  fsbaddr = ip->di->di_ib[INDIR_SINGLE];
	else
	  {
	    if (!ip->i_dinloc)
	      din_map (ip);
	    fsbaddr = ip->i_dinloc[vblkno - 1];
	  }
	
	mutex_unlock (&ip->i_sinlock);

	end_catch_exception ();
      }
      break;
      
    case FILE_DATA:
      {
	struct inode *ip = ci->ip;
	int file_size = get_inode_vsize (ip);
	
	mutex_lock (&ip->i_shrinklock);
	slp = &ip->i_shrinklock;
	slip = &ip->i_shrinkcount;
	slc = &ip->i_shrinkwait;
	ip->i_shrinkcount++;
	mutex_unlock (&ip->i_shrinklock);

	if (offset >= file_size)
	  {
	    printf ("Excess file write: uh oh...\n");
	    err = EIO;
	    goto out;
	  }

	/* For a page straddling end of file, count the bytes past EOF
	   so that they are not written.  This must be positive: it is
	   subtracted from the page size below.  */
	if (offset + length > file_size)
	  short_len = (offset + length) - file_size;

	mutex_lock (&ip->i_datalock);

	if (catch_exception ())
	  {
	    mutex_unlock (&ip->i_datalock);
	    err = EIO;
	    goto out;
	  }
	
	if (vblkno < NDADDR)
	  fsbaddr = ip->di->di_db[vblkno];
	else
	  {
	    if (!ip->i_sinloc)
	      sin_map (ip);
	    fsbaddr = ip->i_sinloc[vblkno - NDADDR];
	  }
	mutex_unlock (&ip->i_datalock);
	
	end_catch_exception ();
      }
      break;
      
    case FILE_DATA_RO:
      panic ("FILE_DATA_RO write");
    default:
      panic ("m_o_d_w: unknown pager type");
    }

  if (!fsbaddr)
    {
      printf ("Attempt to write unallocated disk; ignored.\n");
      err = 0;
      goto out;
    }
  
  daddr = fsbtodb (fsbaddr) + blkoff (offset) / DEV_BSIZE;
  
  /* Write only the part of the page inside the file, rounded up to a
     whole device block.  The rounding pulls in short_len % DEV_BSIZE
     bytes from beyond EOF; zero those so stale data never reaches the
     disk.  */
  write_size = __vm_page_size - short_len;
  if (write_size % DEV_BSIZE)
    {
      write_size += DEV_BSIZE - (write_size % DEV_BSIZE);
      bzero ((char *) data + write_size - (short_len % DEV_BSIZE), 
	     short_len % DEV_BSIZE);
    }
  
  err = dev_write_sync (daddr, (void *) data, write_size);
  
 out:
      
  /* Acquire the right to meddle with the pagemap */
  mutex_lock (&ci->interlock);
  pagemap_resize (ci, offset + length);
  entry = &ci->pagemap[offset / __vm_page_size];

  if (err && !entry->pageinwait)
    /* The only thing we can do here is mark the page, and give errors 
       from now on when it is to be read.  This is imperfect, because 
       if all users go away, the pagemap will be freed, and this information
       lost.  Oh well.  It's still better than Un*x.  Of course, if we 
       are about to hand this data to the kernel, the error isn't a problem,
       hence the check for pageinwait.  */
    entry->invalid = 1;

  /* A data_request arrived while we were writing: give the page
     straight back to the kernel.  Otherwise we are done with it.  */
  if (entry->pageinwait)
    memory_object_data_supply (ci->memobjcntl, offset, data, length, 1,
			       VM_PROT_NONE, 0, MACH_PORT_NULL);
  else
    vm_deallocate (mach_task_self (), data, length);

  if (entry->syncing)
    ci->writecount--;

  /* inode_update needs wakeups on each write just in case.  */
  if (ci->writecount == 0 || (ci == inodeinfo && entry->syncing))
    condition_broadcast (&ci->wakeup);

  entry->syncing = 0;
  entry->pagingout = 0;
  entry->pageinwait = 0;

  mutex_unlock (&ci->interlock);

  /* Now it is OK for the file size to change, so we can release our lock.  */
  if (slp)
    {
      mutex_lock (slp);
      if (!--(*slip))
	condition_broadcast (slc);
      mutex_unlock (slp);
    }

  return 0;
}

/* We provide all data read-write except for "holes".  Therefore,
   lock requests should only happen for holes, and this routine does the
   allocation.

   OBJECT, SEQNO and CONTROL are as for data_request; OFFSET/LENGTH
   name the single page to unlock and ACCESS must include
   VM_PROT_WRITE.  On success the kernel is told to lift the write
   lock.  On failure the page is flushed and the error is remembered
   with mark_next_request_error so that the next write fault on it
   fails.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers, else 0.  */
kern_return_t
seqnos_memory_object_data_unlock (mach_port_t object, 
			   mach_port_seqno_t seqno,
			   mach_port_t control,
			   vm_offset_t offset,
			   vm_size_t length,
			   vm_prot_t access)
{
  struct controlinfo *ci;
  volatile int err;
  struct inode *volatile ip;
  daddr_t newblk;
  daddr_t vblkno;
  daddr_t *slot;
  daddr_t *table;
  
  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;

  /* Take our turn in sequence-number order, then let the next
     request in right away; the pagemap is not touched here.  */
  mutex_lock (&ci->interlock);

  wait_for_seqno (ci, seqno);

  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);

  if (ci->pager_state != NORMAL)
    {
      printf ("pager in wrong state for unlock\n");
      return 0;
    }

  if (control != ci->memobjcntl)
    {
      printf ("incg data unlock: wrong control port");
      return 0;
    }
  /* The only thing we ever block is writes */
  if ((access & VM_PROT_WRITE) == 0)
    {
      printf ("incg data unlock: not unlock writes");
      return 0;
    }
  if (offset % __vm_page_size)
    {
      printf ("incg data unlock: misaligned request");
      return 0;
    }
  if (length != __vm_page_size)
    {
      printf ("incg data request: bad length size");
      return 0;
    }

  err = 0;
  vblkno = offset / sblock->fs_bsize;
  ip = ci->ip;

  switch (ci->pager_type)
    {
    case DINDIR:
      /* The block number within the DINDIR pager is the inode number.  */
      ip = ifind (vblkno);
      
      mutex_lock (&ip->i_dinlock);
      if (catch_exception ())
	err = EIO;
      else
	{
	  if (!ip->di->di_ib[INDIR_DOUBLE])
	    {
	      newblk = indir_alloc (ip, INDIR_DOUBLE, 0);
	      if (newblk)
		ip->di->di_ib[INDIR_DOUBLE] = newblk;
	      else
		err = ENOSPC;
	    }
	  end_catch_exception ();
	}
      mutex_unlock (&ip->i_dinlock);
      break;
    
    case SINDIR:
      mutex_lock (&ip->i_sinlock);
      
      if (catch_exception ())
	err = EIO;
      else
	{
	  if (vblkno == 0)
	    slot = &ip->di->di_ib[INDIR_SINGLE];
	  else
	    {
	      if (!ip->i_dinloc)
		din_map (ip);
	      slot = &ip->i_dinloc[vblkno - 1];
	    }
	  
	  if (!*slot)
	    {
	      newblk = indir_alloc (ip, INDIR_SINGLE, vblkno);
	      if (newblk)
		*slot = newblk;
	      else
		err = ENOSPC;
	    }

	  end_catch_exception ();
	}
      mutex_unlock (&ip->i_sinlock);
      break;
      
    case FILE_DATA:
      mutex_lock (&ip->i_datalock);
      
      /* Make sure this is before the last block of the file.  */
      if (offset + length
	  > blkroundup (get_inode_vsize (ip)) - sblock->fs_bsize)
	{
/*	  printf ("attempt to unlock at last block\n");*/
	  mutex_unlock (&ip->i_datalock);
	  return 0;
	}
      
      if (catch_exception ())
	err = EIO;
      else
	{
	  if (vblkno < NDADDR)
	    {
	      slot = &ip->di->di_db[vblkno];
	      table = ip->di->di_db;
	    }
	  else
	    {
	      if (!ip->i_sinloc)
		sin_map (ip);
	      slot = &ip->i_sinloc[vblkno - NDADDR];
	      table = ip->i_sinloc;
	    }

	  if (!*slot)
	    {
	      /* Start from "nothing allocated" so that an alloc which
		 fails without storing a result is seen as ENOSPC.  */
	      newblk = 0;
	      alloc (ip, vblkno, 
		     blkpref (ip, vblkno, slot - table, table),
		     sblock->fs_bsize, &newblk, 0);
	      if (newblk)
		*slot = newblk;
	      else
		err = ENOSPC;
	    }

	  end_catch_exception ();
	}
      mutex_unlock (&ip->i_datalock);
      /* Without this break every successful FILE_DATA unlock fell
	 into the panic below.  */
      break;
      
    default:
      panic ("lock request");
    }
  
  if (!err)
    /* We can go ahead and release the lock.  */
    memory_object_lock_request (control, offset, length, 
				MEMORY_OBJECT_RETURN_NONE, 0, 
				VM_PROT_NONE, MACH_PORT_NULL);
  else
    {
      /* Flush the page, and set a bit so that m_o_data_request knows
	 to issue an error.  */
      memory_object_lock_request (control, offset, length, 
				  MEMORY_OBJECT_RETURN_NONE, 1, 
				  VM_PROT_WRITE, MACH_PORT_NULL);
      mark_next_request_error (ci, offset, length, err);
    }
  return 0;
}




/* Data sync:  */

/* Sync is done by asking the kernel to write modified pages back to
   disk.  We keep two counts to assist.  Synccount is the number of
   sync calls still waiting for the kernel to finish issuing write
   requests.  Writecount is the number of synced pages that have yet
   to be written.  Each page that affects writecount gets the syncing
   bit set (this is handled in seqnos_memory_object_data_return
   above).  */

/* Ask the kernel to return the modified pages of pager CI so that
   they get written to disk.  For a FILE_DATA pager the range covers
   the inode's current size, for every other pager CI->vsize.  If WAIT
   is nonzero, block until no sync is outstanding on CI and no synced
   page is left to write.  */
static void
pager_sync (struct controlinfo *ci,
	    int wait)
{
  mutex_lock (&ci->interlock);

  /* Account for the lock_completed message this request will produce.  */
  ci->synccount++;
  
  /* We have to prohibit all writes (LOSE) and then reenable them
     on each fault until this interface is fixed.  */
  memory_object_lock_request (ci->memobjcntl, 0, 
			      (ci->pager_type != FILE_DATA
			       ? ci->vsize
			       : get_inode_vsize (ci->ip)), 
			      1, 0, VM_PROT_WRITE, ci->memobj);

  /* condition_wait is handed the interlock, so the counts are
     rechecked after every wakeup.  */
  while (wait && (ci->synccount || ci->writecount))
    condition_wait (&ci->wakeup, &ci->interlock);

  mutex_unlock (&ci->interlock);
}

/* Called by the kernel when a lock request that named a reply port
   has finished, i.e. after it has issued the writes for all the pages
   of one of our syncs.  Drops one from the pager's synccount and wakes
   anybody waiting on the pager.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers, else 0.
   Messages with the wrong control port or a range that is not page
   aligned are logged and otherwise ignored.  */
kern_return_t
seqnos_memory_object_lock_completed (mach_port_t object,
				     mach_port_seqno_t seqno,
				     mach_port_t control,
				     vm_offset_t offset,
				     vm_size_t length)
{
  struct controlinfo *ci;
  int wake;

  ci = pager_cvt (object);
  if (!ci)
    return EOPNOTSUPP;

  if (control != ci->memobjcntl)
    {
      printf ("incg lock completed: wrong control port");
      return 0;
    }
  if (offset % __vm_page_size)
    {
      printf ("incg lock completed: misaligned request");
      return 0;
    }
  if (length % __vm_page_size)
    {
      printf ("incg lock completed: bad length size");
      return 0;
    }

  mutex_lock(&ci->interlock);
  wait_for_seqno (ci, seqno);

  /* Every completion must match a sync that counted itself.  */
  assert (ci->synccount);
  ci->synccount--;

  /* One broadcast serves both kinds of sleeper: threads waiting for
     their sequence number and threads waiting for the sync to end.  */
  wake = ci->waitingforseqno || ci->synccount == 0;
  ci->waitingforseqno = 0;
  if (wake)
    condition_broadcast (&ci->wakeup);

  mutex_unlock (&ci->interlock);
  return 0;
}

/* Write a single inode to disk.  This might sync more than actually
   necessary; it's really just an attempt to avoid syncing all the
   inodes.

   The page of the inode pager holding IP's disk inode is handed to
   the kernel for return.  If WAIT is nonzero, block until no sync is
   outstanding on the inode pager and that page is no longer marked as
   syncing.  */
void
inode_update (struct inode *ip,
	      int wait)
{
  vm_offset_t offset, offsetpg;
  
  /* Round the inode's byte offset down to the start of its page.  */
  offset = ip->i_number * sizeof (struct dinode);
  offsetpg = offset / __vm_page_size;
  offset = offsetpg * __vm_page_size;
  
  mutex_lock (&inodeinfo->interlock);
  inodeinfo->synccount++;

  memory_object_lock_request (inodeinfo->memobjcntl, offset, __vm_page_size,
			      1, 0, VM_PROT_NONE, inodeinfo->memobj);
  
  if (wait)
    {
      /* The pagemap is only sized as needed, so make sure it covers
	 this page before looking at its entry, as every other user of
	 the pagemap does.  The entry is re-fetched through
	 inodeinfo->pagemap on each pass in case the map is moved while
	 we sleep.  */
      pagemap_resize (inodeinfo, offset + __vm_page_size);

      while (inodeinfo->synccount || inodeinfo->pagemap[offsetpg].syncing)
	condition_wait (&inodeinfo->wakeup, &inodeinfo->interlock);
    }
  mutex_unlock (&inodeinfo->interlock);
}

/* Sync everything belonging to the file IP: its data pager and its
   single indirect pager (when they exist), the double indirect pager,
   and finally the disk inode itself.  WAIT is handed on to each step.

   NOTE(review): pager_sync can sleep with i_datalock / i_sinlock held,
   while the pageout path takes those same locks to find the disk
   address -- confirm this cannot deadlock when WAIT is nonzero.  */
void
file_update (struct inode *ip,
	     int wait)
{
  struct controlinfo *pager;

  /* File contents.  */
  mutex_lock (&ip->i_datalock);
  pager = ip->i_fileinfo;
  if (pager)
    pager_sync (pager, wait);
  mutex_unlock (&ip->i_datalock);
  
  /* Single indirect blocks.  */
  mutex_lock (&ip->i_sinlock);
  pager = ip->i_sininfo;
  if (pager)
    pager_sync (pager, wait);
  mutex_unlock (&ip->i_sinlock);
  
  /* This is overkill, but I don't want to deal with the better way now. */
  pager_sync (dininfo, wait);
  
  inode_update (ip, wait);
}

/* Sync the entire filesystem: every file data pager, every single
   indirect pager, and then the three pagers that exist only once.
   WAIT is handed on to pager_sync for each of them.  */
void 
sync_everything (int wait)
{
  struct controlinfo *ci;
  
  /* The per-file pager lists must not change while we walk them.  */
  mutex_lock (&cilistlock);

  ci = filelist;
  while (ci)
    {
      pager_sync (ci, wait);
      ci = ci->next;
    }

  ci = silist;
  while (ci)
    {
      pager_sync (ci, wait);
      ci = ci->next;
    }

  mutex_unlock (&cilistlock);
  
  pager_sync (dininfo, wait);
  pager_sync (inodeinfo, wait);
  pager_sync (cginfo, wait);
}



/* Pager creation, initialization, and destruction:  */

/* Return the memory object port of the FILE_DATA pager for IP,
   creating the pager on first use.  A newly created pager takes a
   reference on the inode.  A send right for the port is made on each
   call.  */
mach_port_t
get_filemap (struct inode *ip)
{
  struct controlinfo *ci;
  mach_port_t ret;
  
  mutex_lock (&ip->i_datalock);
  
  ci = ip->i_fileinfo;
  if (!ci)
    {
      /* First request for this file: make a pager and tie it to the
	 inode in both directions.  */
      ci = cialloc (FILE_DATA);
      ip->i_refcnt++;
      ip->i_fileinfo = ci;
      ci->ip = ip;
    }
  ret = ci->memobj;

  mach_port_insert_right (mach_task_self (), ret, ret,
			  MACH_MSG_TYPE_MAKE_SEND);

  mutex_unlock (&ip->i_datalock);
  return ret;
}

/* This is called by the FILE_DATA pager to map the single indirect data 
   (and create a pager).  */
void
sin_map (struct inode *ip)
{
  int err;
  struct controlinfo *ci;
  int size;
  
  if (ip->i_sinloc)
    panic ("sin_map");

  if (ip->i_sininfo)
    ci = ip->i_sininfo;
  else
    {
      ci = cialloc (SINDIR);
      
      ip->i_refcnt++;
      ci->ip = ip;
      ip->i_sininfo = ci;
     
      size = get_inode_vsize (ip);
      
      /* Compute size in fsblocks */
      ci->vsize = (size + sblock->fs_bsize - 1) / sblock->fs_bsize;
      
      /* Subtract off direct blocks */
      ci->vsize -= NDADDR;
      
      /* It takes four bytes for each entry */
      ci->vsize *= sizeof (daddr_t);
      
      /* Round up to an integral number of indirect blocks */
      ci->vsize = (ci->vsize + sblock->fs_bsize - 1) / sblock->fs_bsize;
    }
  
  mach_port_insert_right (mach_task_self (), ci->memobj, ci->memobj,
			  MACH_MSG_TYPE_MAKE_SEND);
  err = vm_map (mach_task_self (), (u_int *)&ip->i_sinloc, ci->vsize, 0, 1,
		ci->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE,
		VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  mach_port_deallocate (mach_task_self (), ci->memobj);
  
  if (err)
    panic_with_error ("sin_map mapping", err);
  
  register_memory_fault_area (ip->i_sinloc, ci->vsize);
}

/* This is called when a file grows to see if the single indirect
   mapping needs to grow too.  */
void
sin_remap (struct inode *ip,
	   int newsize)
{
  struct controlinfo *ci;
  int err;
  
  ci = ip->i_sininfo;
  
  /* This is the same calculation as in sin_map.  */
  newsize = (newsize + sblock->fs_bsize - 1) / sblock->fs_bsize;
  newsize -= NDADDR;
  newsize *= sizeof (daddr_t);
  newsize = (newsize + sblock->fs_bsize - 1) / sblock->fs_bsize;
 
  if (newsize < ci->vsize)
    panic ("sin_remap size shrink");
  if (newsize != ci->vsize)
    {
      unregister_memory_fault_area (ip->i_sinloc, ci->vsize);
      vm_deallocate (mach_task_self (), (u_int) ip->i_sinloc, ci->vsize);
      
      ci->vsize = newsize;
      
      err = vm_map (mach_task_self (), (u_int *)&ip->i_sinloc, ci->vsize,
		    0, 1, ci->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE,
		    VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
      if (err)
	panic_with_error ("sin_remap mapping", err);
      register_memory_fault_area (ip->i_sinloc, ci->vsize);
    }
}

/* Map the double indirect block of the file IP, leaving its address
   in IP->i_dinloc.  Called by the sindir pager.  Each inode owns one
   filesystem block of the DINDIR pager, found at the inode number
   times the block size.  The mapping must not exist already.  Panics
   if the mapping cannot be made.  */
void
din_map (struct inode *ip)
{
  vm_offset_t where;
  int maperr;
  
  if (ip->i_dinloc)
    panic ("din_map");

  where = ip->i_number * sblock->fs_bsize;

  maperr = vm_map (mach_task_self (), (u_int *)&ip->i_dinloc,
		   sblock->fs_bsize, 0, 1, dininfo->memobj, where, 0,
		   VM_PROT_READ|VM_PROT_WRITE, VM_PROT_READ|VM_PROT_WRITE,
		   VM_INHERIT_NONE);
  if (maperr)
    panic_with_error ("din_map", maperr);

  /* Register the range with the fault handling code, as is done for
     the single indirect mapping.  */
  register_memory_fault_area (ip->i_dinloc, sblock->fs_bsize);
}

/* This is called by a kernel to initialize the memory object.
   CONTROL and NAME are the ports the kernel will use for this object;
   they are recorded in the pager, the pagemap is reset to empty, and
   the pager goes from NOTINIT to NORMAL once the kernel has been told
   that we are ready.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers, else 0.  A
   second init, or an init with a page size other than ours, is logged
   and ignored.  */
kern_return_t
seqnos_memory_object_init (mach_port_t object, 
			   mach_port_seqno_t seqno,
			   mach_port_t control,
			   mach_port_t name,
			   vm_size_t pagesize)
{
  struct controlinfo *ci;

  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;

  mutex_lock (&ci->interlock);

  /* Both rejections below must give the interlock back, or every
     later request on this pager would block forever.  */
  if (ci->pager_state != NOTINIT)
    {
      printf ("pager dup init");
      mutex_unlock (&ci->interlock);
      return 0;
    }
  if (pagesize != __vm_page_size)
    {
      printf ("incg init: bad page size");
      mutex_unlock (&ci->interlock);
      return 0;
    }

  ci->memobjcntl = control;
  ci->memobjname = name;
  ci->pagemapsize = 0;
  /* Sequence numbering for this pager starts with this message.  */
  ci->seqno = seqno;

  /* Tell the kernel we're ready */
  memory_object_ready (control, 1, MEMORY_OBJECT_COPY_NONE);

  ci->pager_state = NORMAL;
  mutex_unlock (&ci->interlock);
  return 0;
}

/* Deallocation is done on a weird as-needed demand-driven basis.  
   Memory_object_terminate causes a deallocation back to the NOTINIT
   state we were at before the init happened.  Nosenders notifications
   cause a complete clean up of everything related to the pager.  */


/* Called by the kernel when a shutdown has finished.  Only FILE_DATA
   and SINDIR pagers are expected to be terminated.  The kernel's
   CONTROL and NAME rights are released, the pagemap is freed, the
   inode's link to this pager and the mapping made through it are
   undone, and the pager goes back to NOTINIT.

   Returns EOPNOTSUPP if OBJECT is not one of our pagers, else 0.
   Mismatched ports and unexpected pager types are logged and
   ignored.  */
kern_return_t
seqnos_memory_object_terminate (mach_port_t object, 
				mach_port_seqno_t seqno,
				mach_port_t control,
				mach_port_t name)
{
  struct controlinfo *ci;
  struct inode *ip;
  
  if (!(ci = pager_cvt (object)))
    return EOPNOTSUPP;
  
  if (control != ci->memobjcntl)
    {
      printf ("incg terminate: wrong control port");
      return 0;
    }
  if (name != ci->memobjname)
    {
      printf ("incg terminate: wrong name port");
      return 0;
    }

  if (ci->pager_type != FILE_DATA && ci->pager_type != SINDIR)
    {
      printf ("unexpected m_o_terminate\n");
      return 0;
    }      

  mutex_lock (&ci->interlock);

  wait_for_seqno (ci, seqno);

  mach_port_deallocate (mach_task_self (), control);
  mach_port_deallocate (mach_task_self (), name);

  /* Free the pagemap */
  if (ci->pagemapsize)
    vm_deallocate (mach_task_self (), (u_int)ci->pagemap, ci->pagemapsize);
  ip = ci->ip;
      
  switch (ci->pager_type)
    {
    case FILE_DATA:
      ip->i_fileinfo = 0;
      /* NOTE(review): this assumes i_sininfo is still set whenever
	 i_sinloc is mapped -- confirm that the SINDIR pager cannot be
	 terminated first.  */
      if (ip->i_sinloc)
	{
	  unregister_memory_fault_area (ip->i_sinloc, ip->i_sininfo->vsize);
	  vm_deallocate (mach_task_self (), (u_int) ip->i_sinloc,
			 ip->i_sininfo->vsize);
	}
      ip->i_sinloc = 0;
      /* A file that never needed indirect blocks has no SINDIR pager.  */
      if (ip->i_sininfo)
	mach_port_deallocate (mach_task_self (), ip->i_sininfo->memobj);
      break;

    case SINDIR:
      ip->i_sininfo = 0;
      if (ip->i_dinloc)
	{
	  unregister_memory_fault_area (ip->i_dinloc, sblock->fs_bsize);
	  vm_deallocate (mach_task_self (), (u_int) ip->i_dinloc, 
			 sblock->fs_bsize);
	}
      ip->i_dinloc = 0;
      break;
      
    default:
      panic ("memory_object_terminate");
    }
  ci->pager_state = NOTINIT;

  /* Let someone else in, as the other handlers do, and give the
     interlock back; it used to be left held on return.  */
  if (ci->waitingforseqno)
    {
      ci->waitingforseqno = 0;
      condition_broadcast (&ci->wakeup);
    }
  mutex_unlock (&ci->interlock);
  return 0;
}

/* Convert PORT, the name of a port we hold the receive right for,
   into the controlinfo structure stored at that address.  Returns 0
   if the structure there is not marked PT_PAGER; otherwise takes a
   new reference on the structure and returns it.  The caller drops
   that reference with pager_release.  */
struct controlinfo *
pager_cvt (mach_port_t port)
{
  struct controlinfo *ci;

  /* This works because the receive right holds a reference, and it
     can't delete the last reference (and thus make the port type wrong)
     without claiming refcntlock.  */

  mutex_lock (&refcntlock);
  if (*(int *)port != PT_PAGER)
    {
      /* Don't leave the global lock held on the failure path.  */
      mutex_unlock (&refcntlock);
      return 0;
    }

  ci = (struct controlinfo *) port;
  ci->refcnt++;
  mutex_unlock (&refcntlock);
  return ci;
}

/* Handle a no-senders notification for PORT that arrived with
   sequence number SEQNO.  Does nothing if PORT is not a pager.
   Otherwise consumes the sequence number, drops the reference taken
   here, and, if the pager is still alive, wakes any thread waiting
   for a later sequence number.

   NOTE(review): the only reference released here is the one
   pager_cvt just took, so this call by itself never drops the count
   to zero; confirm where the pager's original reference is meant to
   be released.  */
void
pager_nosenders (mach_port_t port, int seqno)
{
  struct controlinfo *ci;
  int lastref;
  
  if (!(ci = pager_cvt (port)))
    return;
  
  /* wait_for_seqno sleeps on ci->wakeup using ci->interlock, so the
     lock must be held around it.  */
  mutex_lock (&ci->interlock);
  wait_for_seqno (ci, seqno);
  mutex_unlock (&ci->interlock);
  
  lastref = pager_release (ci);
  
  /* If that was the last reference, CI is on the free list now and
     must not be touched.  */
  if (!lastref)
    {
      mutex_lock (&ci->interlock);
      if (ci->waitingforseqno)
	{
	  ci->waitingforseqno = 0;
	  condition_broadcast (&ci->wakeup);
	}
      mutex_unlock (&ci->interlock);
    }
}

int
pager_release (struct controlinfo *ci)
{
  mutex_lock (&refcntlock);
  if (--ci->refcnt)
    {
      mutex_unlock (&refcntlock);
      return 0;
    }

  mach_port_move_member (mach_task_self (), ci->memobj, MACH_PORT_NULL);
  ci->porttype = PT_NONE;
  mutex_unlock (&refcntlock);

  irele (ci->ip);

  mutex_lock (&cilistlock);
  *ci->pprev = ci->next;
  if (ci->next)
    ci->next->pprev = ci->pprev;
  ci->next = cifreelist;
  cifreelist = ci;
  mutex_unlock (&cilistlock);

  return 1;
}

/* Allocate and initialize a pager of type TYPE.  A structure is taken
   from the free list if there is one; otherwise a new one is
   malloced and a receive right is created whose name is the address
   of the structure.  The result has one reference, is in the NOTINIT
   state with an empty pagemap, has a no-senders notification
   requested on its port, is on the pager list for its type (if that
   type has one), and is a member of ufs_portset.  */
struct controlinfo *
cialloc (enum pager_type type)
{
  struct controlinfo *ci;
  error_t err;
  mach_port_t foo;

  mutex_lock (&cilistlock);
  if (cifreelist)
    {
      ci = cifreelist;
      cifreelist = ci->next;
      mutex_unlock (&cilistlock);
    }
  else
    {
      mutex_unlock (&cilistlock);
      ci = malloc (sizeof (struct controlinfo));
      if (!ci)
	panic ("cialloc: out of memory");
      err = mach_port_allocate_name (mach_task_self (), 
				     MACH_PORT_RIGHT_RECEIVE,
				     (mach_port_t) ci);
      if (err == KERN_NAME_EXISTS)
	{
	  /* Isn't this ugly?  The address is already in use as a port
	     name, so get another structure.  CI is freed only
	     afterwards so that malloc cannot hand the same address
	     back.  The recursive call returns a completely set up
	     pager; returning it directly keeps it from being
	     initialized and enqueued a second time.  */
	  struct controlinfo *newci = cialloc (type);
	  free (ci);
	  return newci;
	}
      if (err)
	panic_with_error ("cialloc port", err);

      /* Sequence number state for a brand new port.  A structure from
	 the free list keeps its port, and so keeps its state.
	 NOTE(review): -1 assumes the first message on a new port has
	 sequence number 0; confirm against the kernel interface.  */
      ci->seqno = -1;
      ci->waitingforseqno = 0;
    }
  
  ci->porttype = PT_PAGER;
  ci->pager_type = type;
  ci->refcnt = 1;
  ci->pager_state = NOTINIT;
  ci->memobj = (mach_port_t) ci;
  mutex_init (&ci->interlock);
  condition_init (&ci->wakeup);
  ci->pagemapsize = ci->synccount = ci->writecount = 0;
  ci->pagemap = 0;
  mach_port_request_notification (mach_task_self (), (mach_port_t) ci,
				  MACH_NOTIFY_NO_SENDERS, 1,
				  (mach_port_t) ci,
				  MACH_MSG_TYPE_MAKE_SEND_ONCE, &foo);
  cienqueue (ci);
  mach_port_move_member (mach_task_self (), (mach_port_t) ci, ufs_portset);
  return ci;
}
  
/* Put CI at the front of the list that tracks pagers of its type:
   filelist for FILE_DATA and silist for SINDIR.  Pagers of any other
   type are not kept on a list, and nothing is done for them.  */
void
cienqueue (struct controlinfo *ci)
{
  struct controlinfo **head;
  
  if (ci->pager_type == FILE_DATA)
    head = &filelist;
  else if (ci->pager_type == SINDIR)
    head = &silist;
  else
    return;
  
  mutex_lock (&cilistlock);
  
  /* PPREV always points at whatever pointer points at us, which for
     the first element is the list head itself.  */
  ci->next = *head;
  ci->pprev = head;
  if (*head)
    (*head)->pprev = &ci->next;
  *head = ci;
  
  mutex_unlock (&cilistlock);
}


/* Utility */

/* Block until SEQNO is the next sequence number expected on CI, that
   is, one more than the last one recorded, and then record SEQNO as
   handled.  While blocked, waitingforseqno is set so that whoever
   finishes an earlier message knows to broadcast on CI->wakeup.
   The wait is done with CI->interlock, so the caller is expected to
   hold that lock.  */
void
wait_for_seqno (struct controlinfo *ci,
		int seqno)
{
  for (;;)
    {
      if (seqno == ci->seqno + 1)
	break;
      ci->waitingforseqno = 1;
      condition_wait (&ci->wakeup, &ci->interlock);
    }
  ci->seqno = seqno;
}

/* Allocate an indirect block */
static daddr_t
indir_alloc (struct inode *ip,
	     int type,		/* INDIR_DOUBLE or INDIR_SINGLE */
	     int ind)		/* which in the series? */
{
  daddr_t bn;
  daddr_t lbn;
  int error;

  switch (type)
    {
    case INDIR_DOUBLE:
      lbn = NDADDR + sblock->fs_bsize / sizeof (daddr_t);
      break;
    case INDIR_SINGLE:
      if (ind == 0)
	lbn = NDADDR;
      else
	lbn = NDADDR + ind * sblock->fs_bsize / sizeof (daddr_t);
      break;
    default:
      panic ("indir_alloc type");
    }
  
  if (error = alloc (ip, NDADDR,
		     blkpref (ip, lbn, 0, (daddr_t *)0),
		     sblock->fs_bsize, &bn, 0))
    return 0;

  /* We do this write synchronously so that the inode never
     points at an indirect block full of garbage */
  if (dev_write_sync (fsbtodb (bn), zeroblock, sblock->fs_bsize))
    {
      blkfree (bn, sblock->fs_bsize);
      return 0;
    }
  else
    return bn;
}

/* Record ERROR as the `next_error' of every page of CI's pagemap in
   the range that starts at byte OFFSET and is LENGTH bytes long.
   Both are converted to whole pages by division, so a partial page
   at the end is not marked.  ERROR must be 0, ENOSPC, EIO or EDQUOT;
   anything else panics.  The pagemap is not grown here; the caller
   must have made it large enough.  */
void
mark_next_request_error(struct controlinfo *ci,
			int offset,
			int length,
			error_t error)
{
  int page_error = 0;
  struct pagemap *p, *end;

  offset /= __vm_page_size;
  length /= __vm_page_size;
  
  switch (error)
    {
    case 0:
      page_error = PAGE_NOERR;
      break;
    case ENOSPC:
      page_error = PAGE_ENOSPC;
      break;
    case EIO:
      page_error = PAGE_EIO;
      break;
    case EDQUOT:
      page_error = PAGE_EDQUOT;
      break;
    default:
      panic ("mark_next_request_error");
      break;
    }
  
  /* Start at the page OFFSET names, not at the start of the map.  */
  end = ci->pagemap + offset + length;
  for (p = ci->pagemap + offset; p < end; p++)
    p->next_error = page_error;
}

/* Record ERROR as the `error' of every page of CI's pagemap in the
   range that starts at byte OFFSET and is LENGTH bytes long.  Both
   are converted to whole pages by division, so a partial page at the
   end is not marked.  ERROR must be 0, ENOSPC, EIO or EDQUOT;
   anything else panics.  The pagemap is not grown here; the caller
   must have made it large enough.  */
void
mark_object_error(struct controlinfo *ci,
		  int offset,
		  int length,
		  error_t error)
{
  int page_error = 0;
  struct pagemap *p, *end;

  offset /= __vm_page_size;
  length /= __vm_page_size;
  
  switch (error)
    {
    case 0:
      page_error = PAGE_NOERR;
      break;
    case ENOSPC:
      page_error = PAGE_ENOSPC;
      break;
    case EIO:
      page_error = PAGE_EIO;
      break;
    case EDQUOT:
      page_error = PAGE_EDQUOT;
      break;
    default:
      panic ("mark_object_error");
      break;
    }
  
  /* Start at the page OFFSET names, not at the start of the map.  */
  end = ci->pagemap + offset + length;
  for (p = ci->pagemap + offset; p < end; p++)
    p->error = page_error;
}

  
/* Make sure the pagemap can deal with address off */
void
pagemap_resize (struct controlinfo *ci,
		int off)
{
  void *newaddr;
  int newsize;
  
  off /= __vm_page_size;
  if (ci->pagemapsize >= off)
    return;
  
  newsize = round_page (off);
  vm_allocate (mach_task_self (), (u_int *)&newaddr, newsize, 1);
  bcopy (ci->pagemap, newaddr, ci->pagemapsize);
  vm_deallocate (mach_task_self (), (u_int)ci->pagemap, ci->pagemapsize);
  ci->pagemap = newaddr;
  ci->pagemapsize = newsize;
}


/* Truncation of inodes.  This is a pain.  Sigh.

   Shrink the file IP to LENGTH bytes.  Nothing is done unless LENGTH
   is smaller than the current size.  Otherwise the tail of the new
   last block is zeroed, the size in the disk inode is updated, the
   cached pages past the new end are flushed, every block past the
   new end is freed (indirect blocks included), the new last block is
   trimmed if it is a direct block, and the inode is written with
   file_update.  */
void
inode_truncate (struct inode *ip,
		off_t length)
{
  daddr_t lastblock, olastblock, bn;
  off_t osize;
  int bsize, idx;
  int lastiblock, olastiblock;

  /* Only shrinking is handled here.  */
  osize = ip->di->di_size;
  if (length >= osize)
    return;

  /* Calculate block number of last block; -1 means no blocks.  */
  lastblock = lblkno (length + sblock->fs_bsize - 1) - 1;
  olastblock = lblkno (osize + sblock->fs_bsize - 1) - 1;

  /* If the prune is not to a block boundary, zero the bit up to the
     next block boundary. */
  if (blkoff (length))
    fs_rdwr (ip, zeroblock, length, 
	     blksize (ip, lastblock) - blkoff (length), 1, 0);

  /* Wait until nobody is depending on the file not shrinking.  */
  mutex_lock (&ip->i_shrinklock);
  while (ip->i_shrinkcount)
    condition_wait (&ip->i_shrinkwait, &ip->i_shrinklock);

  mutex_lock (&ip->i_datalock);

  /* Update the size now.  If we crash, fsck can finish freeing the
     blocks. */
  ip->di->di_size = length;

  /* Flush the old data.  This operation is bad if people do
     copy-on-write sharing of files.  As a result, the FILE_DATA pager has its
     copy strategy set to MEMORY_OBJECT_COPY_NONE.  At some point this will
     have to be changed to MEMORY_OBJECT_COPY_CALL, but I'm not up for it
     yet.  */
  if (ip->i_fileinfo)
    memory_object_lock_request (ip->i_fileinfo->memobjcntl, 
				((lastblock == -1 ? 0 : lastblock)
				 * sblock->fs_bsize),
				(olastblock - lastblock) * sblock->fs_bsize,
				0, 1, VM_PROT_NONE, MACH_PORT_NULL);

  /* Drop data blocks mapped by indirect blocks.  Logical block NDADDR
     is the first such block.  */
  if (olastblock >= NDADDR)
    {
      if (!ip->i_sinloc)
	sin_map (ip);

      /* Slot K of the single indirect mapping describes logical block
	 NDADDR + K; that is the layout the arguments to sindir_drop
	 below assume.  Blocks below NDADDR are handled with the
	 direct blocks further down.  */
      idx = lastblock + 1;
      if (idx < NDADDR)
	idx = NDADDR;
      for (; idx <= olastblock; idx ++)
	{
	  if (ip->i_sinloc[idx - NDADDR])
	    {
	      blkfree (ip->i_sinloc[idx - NDADDR], sblock->fs_bsize);
	      ip->i_sinloc[idx - NDADDR] = 0;
	    }
	}

      /* Prune the block pointer handled by the sindir pager.  This will
	 free all the indirect blocks and such as necessary.  If no
	 indirectly mapped block survives, say so with -1; computing
	 it from a negative block number would go through unsigned
	 arithmetic and never come out negative.  */
      if (lastblock >= NDADDR)
	lastiblock = lblkno ((lastblock - NDADDR) * sizeof (daddr_t));
      else
	lastiblock = -1;
      olastiblock = lblkno ((olastblock - NDADDR) * sizeof (daddr_t));
      sindir_drop (ip, lastiblock, olastiblock);

      /* Unmap the old sindir mapping */
      unregister_memory_fault_area (ip->i_sinloc, ip->i_sininfo->vsize);
      vm_deallocate (mach_task_self (), (u_int)ip->i_sinloc,
		     ip->i_sininfo->vsize);
      sin_map (ip);
    }

  /* Prune the blocks mapped directly from the inode */
  for (idx = lastblock + 1; idx < NDADDR; idx++)
    {
      bn = ip->di->di_db[idx];
      if (bn)
	{
	  ip->di->di_db[idx] = 0;
	  /* A block allocated past the old end of file means the
	     inode was inconsistent to begin with.  */
	  if (idx > olastblock)
	    panic ("inode_truncate 1");
	  if (idx == olastblock)
	    {
	      /* The old last block may have been only a fragment.
		 NOTE(review): this relies on blksize not being
		 derived from di_size, which has already been
		 changed above; confirm.  */
	      bsize = blksize (ip, idx);
	      blkfree (bn, bsize);
	    }
	  else
	    blkfree (bn, sblock->fs_bsize);
	}
    }
  
  if (lastblock >= 0 && lastblock < NDADDR)
    {
      /* Look for a change in the size of the last direct block */
      bn = ip->di->di_db[lastblock];
      if (bn)
	{
	  off_t oldspace, newspace;
	  
	  oldspace = blksize (ip, lastblock);
	  /* A length on a block boundary leaves the last block full;
	     otherwise only whole fragments up to LENGTH are kept.  */
	  if (blkoff (length))
	    newspace = fragroundup (blkoff (length));
	  else
	    newspace = sblock->fs_bsize;
	  if (oldspace > newspace)
	    {
	      bn += numfrags (newspace);
	      blkfree (bn, oldspace - newspace);
	    }
	}
    }

  /* NOTE(review): the first branch rounds to a full block although
     the last direct block may just have been trimmed to fragments,
     and the second yields a count of blocks rather than bytes.
     Confirm what unit i_allocsize is kept in before relying on
     either.  */
  if (lastblock < NDADDR)
    ip->i_allocsize = blkroundup (length);
  else
    ip->i_allocsize = fragstoblks (fragroundup (length));

  mutex_unlock (&ip->i_datalock);
  mutex_unlock (&ip->i_shrinklock);
  file_update (ip, 1);
}  

/* This drops the double indirect block */
static void
dindir_drop (struct inode *ip)
{
  mutex_lock (&ip->i_dinlock);
  
  memory_object_lock_request (dininfo->memobjcntl, 
			      ip->i_number * sblock->fs_bsize,
			      sblock->fs_bsize, 0, 1, VM_PROT_NONE,
			      MACH_PORT_NULL);

  if (ip->di->di_ib[INDIR_DOUBLE])
    {
      blkfree (ip->di->di_ib[INDIR_DOUBLE], sblock->fs_bsize);
      ip->di->di_ib[INDIR_DOUBLE] = 0;
    }

  mutex_unlock (&ip->i_dinlock);
}
  

/* This hideous routine is used by inode_truncate to clean the info in
   sin_loc.

   LASTIBLOCK is the last block of the single indirect object that is
   still wanted (negative if none is), and OLASTIBLOCK is the last one
   that was in use before.  The pages for the blocks in between are
   flushed from the sindir pager, the indirect blocks recorded in the
   double indirect block are freed, the double indirect block itself
   is dropped when LASTIBLOCK is 1 or less, and the inode's single
   indirect block is freed when LASTIBLOCK is negative.  All of it is
   done holding IP->i_sinlock.  */
static void
sindir_drop (struct inode *ip,
	     int lastiblock,
	     int olastiblock)
{
  int idx;
  
  mutex_lock (&ip->i_sinlock);
  
  memory_object_lock_request (ip->i_sininfo->memobjcntl,
			      (lastiblock + 1) * sblock->fs_bsize,
			      (olastiblock - lastiblock) * sblock->fs_bsize,
			      0, 1, VM_PROT_NONE, MACH_PORT_NULL);

  /* Drop indirect blocks found in the double indirect block */
  if (olastiblock > 1)
    {
      if (!ip->i_dinloc)
	din_map (ip);
      /* Walk every block past the last one kept, up to and including
	 the old last one.  */
      for (idx = lastiblock + 1; idx <= olastiblock; idx++)
	{
	  if (ip->i_dinloc[idx])
	    {
	      blkfree (ip->i_dinloc[idx], sblock->fs_bsize);
	      ip->i_dinloc[idx] = 0;
	    }
	}
      
      /* If we no longer need the double indirect block, drop it. */
      if (lastiblock <= 1)
	{
	  dindir_drop (ip);
	  unregister_memory_fault_area (ip->i_dinloc, sblock->fs_bsize);
	  vm_deallocate (mach_task_self (), (u_int)ip->i_dinloc,
			 sblock->fs_bsize);
	  ip->i_dinloc = 0;
	}
    }
  
  /* Drop the block from the inode if we don't need it any more */
  if (lastiblock < 0 && ip->di->di_ib[INDIR_SINGLE])
    {
      blkfree (ip->di->di_ib[INDIR_SINGLE], sblock->fs_bsize);
      ip->di->di_ib[INDIR_SINGLE] = 0;
    }
  mutex_unlock (&ip->i_sinlock);
}


/* Initialize the pager system */
void
pager_init ()
{
  int err;

  /* firewalls: */
  if (DEV_BSIZE % sizeof (struct dinode))
    panic ("inode size wrong");
  if (__vm_page_size % DEV_BSIZE)
    panic ("page size unusable");
  if (sblock->fs_bsize % DEV_BSIZE)
    panic ("nonintegral filesystem block size");
  if (sblock->fs_ipg % sblock->fs_inopb)
    panic ("nonintegral number of inodes per cylinder group");
  if (__vm_page_size > sblock->fs_bsize)
    panic ("page size larger than block size");

  mutex_init (&cilistlock);
  mutex_init (&refcntlock);
  cifreelist = silist = filelist = 0;

  inodeinfo = cialloc (DINODE);
  cginfo = cialloc (CG);
  dininfo = cialloc (DINDIR);

  mach_port_insert_right (mach_task_self (), inodeinfo->memobj,
			  inodeinfo->memobj, MACH_MSG_TYPE_MAKE_SEND);
  mach_port_insert_right (mach_task_self (), cginfo->memobj,
			  cginfo->memobj, MACH_MSG_TYPE_MAKE_SEND);
  mach_port_insert_right (mach_task_self (), dininfo->memobj,
			  dininfo->memobj, MACH_MSG_TYPE_MAKE_SEND);

  err = vm_allocate (mach_task_self (), (u_int *)&zeroblock,
		     sblock->fs_bsize, 1);
  if (err)
    panic_with_error ("incg zeroblock", err);
  
  /* Find out how much memory we need */
  infsb_pcg = sblock->fs_ipg / sblock->fs_inopb;
  inodeinfo->vsize = round_page (sblock->fs_ipg * sblock->fs_ncg 
				 * sizeof (struct dinode));

  cginfo->vsize = round_page (sblock->fs_bsize * sblock->fs_ncg);

  dininfo->vsize = round_page (sblock->fs_ipg * sblock->fs_ncg
			      * sblock->fs_bsize);
  
  /* Do the mapping of the inodes and cylinder groups */

  /* Note that this will fail when vm_map is made synchronous with
     pagers.  At that time, this will have to occur after the request
     threads are started.  */
  err = vm_map (mach_task_self (), (u_int *)&dinodes, inodeinfo->vsize,
		0, 1, inodeinfo->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE,
		VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (!err)
    err = vm_map (mach_task_self (), (u_int *)&cgs, cginfo->vsize, 0, 1,
		  cginfo->memobj, 0, 0, VM_PROT_READ|VM_PROT_WRITE,
		  VM_PROT_READ|VM_PROT_WRITE, VM_INHERIT_NONE);
  if (err)
    panic_with_error ("inode_pager_init map", err);
  
  register_memory_fault_area (dinodes, inodeinfo->vsize);
  register_memory_fault_area (cgs, cginfo->vsize);
}



/* Unused stubs. */
/* memory_object_copy is not implemented by this pager.  The call is
   logged on standard output and refused with EOPNOTSUPP; none of the
   arguments is examined.  */
error_t
seqnos_memory_object_copy (mach_port_t old,
			   mach_port_seqno_t seq,
			   memory_object_control_t old_ctl,
			   vm_offset_t off,
			   vm_size_t len,
			   mach_port_t new)
{
  fputs ("m_o_copy called\n", stdout);
  return EOPNOTSUPP;
}

/* memory_object_data_write is not implemented by this pager.  The
   call is logged on standard output and refused with EOPNOTSUPP;
   none of the arguments is examined.  */
error_t
seqnos_memory_object_data_write (mach_port_t old,
				 mach_port_seqno_t seq,
				 mach_port_t ctl,
				 vm_offset_t off,
				 pointer_t data,
				 vm_size_t data_cnt)
{
  fputs ("m_o_data_write called\n", stdout);
  return EOPNOTSUPP;
}

/* memory_object_supply_completed is not implemented by this pager.
   The call is logged on standard output and refused with EOPNOTSUPP;
   none of the arguments is examined.  */
error_t
seqnos_memory_object_supply_completed (mach_port_t obj,
				       mach_port_seqno_t seq,
				       mach_port_t ctl,
				       vm_offset_t off,
				       vm_size_t len,
				       error_t result,
				       vm_offset_t err_off)
{
  fputs ("m_o_supply_completed called\n", stdout);
  return EOPNOTSUPP;
}

/* memory_object_change_completed is not implemented by this pager.
   The call is logged on standard output and refused with EOPNOTSUPP;
   none of the arguments is examined.  */
error_t
seqnos_memory_object_change_completed (mach_port_t obj,
				       mach_port_seqno_t seq,
				       boolean_t maycache,
				       memory_object_copy_strategy_t strat)
{
  fputs ("m_o_change_completed called\n", stdout);
  return EOPNOTSUPP;
}

