/* wordtable.c -- Copyright 1989, 1990 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file ../COPYRIGHT for full details.
 */

/* Symbol Table Interface to text retrieval database.
 * Handles both the internal and external indexes.
 *
 * This originally used a linked list.  Converting to a hash table reduced
 * the time to index comp.os.vms from nearly an hour to one and a half
 * minutes...
 *
 * Liam Quin, 1989
 */

/* 
 * $Id: wordtable.c,v 2.16 92/03/30 21:45:59 lee Exp $
 */

#ifndef lint
 static char *Rcs = "$Id: wordtable.c,v 2.16 92/03/30 21:45:59 lee Exp $";
#endif

#include <stdio.h>
#include <malloc.h>
#include <ctype.h>
#include <sys/types.h>
#include <fcntl.h> /* for O_RDWR wtc */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include "smalldb.h"
#include "fileinfo.h"
#include "wordinfo.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"

extern t_WID GetNextWID();

static void NewEntry();
static void UpdateEntry();

/* Largest number of word places for which NewEntry() and UpdateEntry()
 * will still call MkWIB().  Can be overridden by defining it before this
 * point (e.g. on the compiler command line).
 *
 * NOTE(review): as parenthesised, only (MinWordLength+2) is divided by 3,
 * i.e. the value is WIDBLOCKSIZE - ((MinWordLength+2)/3).  Confirm that
 * ((WIDBLOCKSIZE-(MinWordLength+2))/3) was not what was intended.
 */
#ifndef MaxWordPlacesInAWordBlock
# define MaxWordPlacesInAWordBlock ((WIDBLOCKSIZE-(MinWordLength+2)/3))
#endif

#ifndef HASHSIZ
# define HASHSIZ 32768 /* MUST be a power of two */
#endif /*!HASHSIZ*/

#ifndef MAXWORDSINCACHE
# define MAXWORDSINCACHE  (HASHSIZ * 10)
#endif

int MaxWordsInCache = MAXWORDSINCACHE;

extern int AsciiTrace;

/** System calls and library functions used in this file: **/

/** Lqtext calls */
extern unsigned int Putpblock();
extern void DeleteWordPlaces();
extern t_WordInfo *MakeWordInfo();

/** System calls: */

/** Library Functions: */
extern char *strncpy();
extern void perror();
extern void exit();

#ifdef ASCIITRACE
extern void fprintWordInfo();
#endif
/**/

extern char *progname;
static int HashSize = HASHSIZ; /* MUST be a power of two */

#define NPLACES 4
#define NPLACESBIGINCR 16
#define NPLACESHUGEINCR 128
/* This is small to optimise the common case -- by far the majority of
 * words are used fewer than 10 times.  In the cases where we've gone
 * wrong, well, there'll be a few thousand.  We add slowly (NPLACES at a
 * time) until we get past NPLACES * 3, then we go up in NPLACESBIGINCR
 * lumps, and after that in NPLACESHUGEINCR lumps.
 */

/* One entry of the in-memory symbol table: a word together with the
 * word places (occurrences) collected for it since the last DumpCache().
 */
typedef struct s_HashEl {
    /* Private copy of the word's text, allocated in AddWord() */
    char *Word;
    /* Value returned by Word2WID() when the element was created;
     * DumpCache() treats 0 as "no index entry yet" and fetches a new WID.
     */
    t_WID WID;
    /* Number of entries of Places[] currently in use */
    int PlacesUsed;
    /* Number of entries Places[] currently has room for */
    int PlacesAllocated;
    /* Array of occurrences, grown with erealloc() in AddWord() */
    t_WordPlace *Places;
} t_HashEl;

/* The hash table itself: HashSize slots allocated by InitHash(), each
 * either null (empty) or pointing at an element.
 */
static t_HashEl **SymbolTable;
/* Zero until InitHash() has run, then equal to HashSize.  AddWord() tests
 * it to see whether the table exists; DumpCache() uses it as its loop bound.
 */
static int LastEl = 0;
/* Running tally maintained by AddWord() and reduced by DumpCache();
 * AddWord() compares it with MaxWordsInCache to decide when to dump.
 */
static int WordsInCache = 0;

/* InitHash -- choose the size of the in-memory symbol table and allocate it.
 *
 * If MaxWordsInCache is non-zero, HashSize is recomputed as a power of two:
 * it starts at a fixed minimum and is doubled until it is at least
 * MaxWordsInCache / 10.  If MaxWordsInCache is zero, the compile-time
 * default (HASHSIZ) is kept.
 *
 * The table is allocated with ecalloc(), and LastEl is set to HashSize;
 * a non-zero LastEl is also what tells AddWord() that the table exists.
 */
static void
InitHash()
{
    if (MaxWordsInCache) {
#ifdef ASCIITRACE
	HashSize = 1; /* no minimum when debugging */
#else
	HashSize = 1024; /* silently enforced minimum... */
#endif
	/* Doubling cannot overflow here: the loop only doubles a value
	 * that is below MaxWordsInCache / 10, so the result is always
	 * less than MaxWordsInCache / 5.
	 */
	while (HashSize < MaxWordsInCache / 10) {
	    HashSize <<= 1;
	}
    }

    if (HashSize < 1) {
	Error(E_FATAL, "InitHash: hash size (%d/%d) is too small!\n",
				HashSize, MaxWordsInCache);
    }

    SymbolTable = (t_HashEl **) ecalloc(HashSize, sizeof(t_HashEl *));
    LastEl = HashSize; /* Used as a sentinel */
#ifdef ASCIITRACE
    if (AsciiTrace > 2) {
	/* Both values are plain ints, so they must be printed with %d */
	fprintf(stderr, "%s: allocated %d hash slots for up to %d words\n",
	    progname,
	    HashSize,
	    MaxWordsInCache
	);
    }
#endif
}

/* An all-zero element; SetElEmpty() copies it by structure assignment to
 * clear a freshly allocated element before filling in its fields.
 */
static t_HashEl ZeroEl = {
    0,
};

/* SetElEmpty -- put a newly allocated hash element into its initial state.
 *
 * Every field is cleared, and the element is then given an empty Places
 * array with room for NPLACES entries.  The Word and WID fields are left
 * zeroed for the caller to fill in.
 */
static void
SetElEmpty(El)
    t_HashEl *El;
{
    /* clear everything first (structure assignment) */
    *El = ZeroEl;

    /* nothing stored yet, but there is room for NPLACES occurrences */
    El->PlacesUsed = 0;
    El->PlacesAllocated = NPLACES;
    El->Places = (t_WordPlace *) emalloc(NPLACES * sizeof(t_WordPlace));
}

#ifndef Hash
/* Hash -- map a word onto a slot number in the range 0 .. HashSize - 1.
 *
 * The first WordInfo->Length characters of WordInfo->Word are folded into
 * an unsigned long with n = c + 65599 * n, and the result is reduced
 * modulo HashSize.  A Length of zero or less hashes to slot 0.
 *
 * Characters are read through a plain "char *", so the contribution of
 * bytes with the top bit set depends on whether char is signed on the
 * platform.
 *
 * The whole function is skipped if Hash is already defined as a macro.
 * Defining NODUFF selects the simple loop instead of the unrolled one;
 * both compute the same value.
 */
INLINE
int
Hash(WordInfo)
    t_WordInfo *WordInfo;
{
    register unsigned long n = 0;
    register int len = WordInfo->Length;
    register char *str = WordInfo->Word;

#ifndef NODUFF /* clever stuff for speedup... dmr-approved!... */

/* One hashing step: fold the next character into n and advance str. */
#define HASHC	n = *str++ + 65599 * n

    if (len > 0) {
	/* Number of passes through the do-loop, rounding len / 8 upwards */
	register int loop = (len + 8 - 1) >> 3;

	/* Duff's device: the switch jumps into the middle of the
	 * eight-way unrolled loop so that the first pass performs only
	 * len % 8 steps (or all eight when len is a multiple of 8);
	 * every later pass performs the full eight.
	 */
	switch(len & (8 - 1)) {
	case 0:	do {
		HASHC;	case 7:	HASHC;
	case 6:	HASHC;	case 5:	HASHC;
	case 4:	HASHC;	case 3:	HASHC;
	case 2:	HASHC;	case 1:	HASHC;
		} while (--loop);
	}

    }
#else /* NODUFF */
    /* Straightforward version: one step per character */
    while (len--)
	n = *str++ + 65599 * n;
#endif /* NODUFF */
    /* The masking form below is only valid when HashSize is a power of
     * two; the modulo that is actually used works for any positive size.
     */
    /**
    return n & (HashSize - 1);
    **/
    return n % HashSize;
}
#endif /* Hash */

void DumpCache();

void
AddWord(WordInfo)
    t_WordInfo *WordInfo;
{
    register t_HashEl *HashEl;
    int Slot, FirstSlot;

    if (!WordInfo || !WordInfo->Word || !WordInfo->Word[0]) {
	Error(E_WARN, "Null word in AddWord(0x%x)", WordInfo);
	return;
    }

    if (!LastEl) {
	InitHash();
    }
    
    ++WordsInCache;

    if (MaxWordsInCache && WordsInCache > MaxWordsInCache) {
	DumpCache(1);
    }

#if 0 /* this has moved to Common.c */
    if (WordInfo->Word[0] == 'q') {
	register char *xp;

	for (xp = &WordInfo->Word[1]; *xp; xp++) {
	    if (*xp != 'x') break;
	}
	if (!*xp) {
	    return;
	}
    }
#endif

    if (WordInfo->WordPlace.FID == 0) {
	Error(E_BUG, "AddWord: FID 0 for \"%s\"", WordInfo->Word);
    }

    FirstSlot = Slot = Hash(WordInfo);

    for (;;) {
	if (SymbolTable[Slot] == (t_HashEl *) NULL) {
	    extern char *strcpy();
	    extern t_WID Word2WID();

	    /* make a new element */
	    HashEl = SymbolTable[Slot] = (t_HashEl *) emalloc(sizeof(t_HashEl));
	    SetElEmpty(HashEl);
	    HashEl->Word = emalloc(WordInfo->Length + 1);
	    (void) strcpy(HashEl->Word, WordInfo->Word);
	    HashEl->WID = Word2WID(HashEl->Word, WordInfo->Length);
	    break;
	} else if (STREQ(SymbolTable[Slot]->Word, WordInfo->Word)) {
	    HashEl = SymbolTable[Slot];
	    break;
	}

	if (++Slot >= HashSize) Slot = 0;

	if (Slot == FirstSlot) {
	    /* We need to dump the cache and start again */
	    DumpCache(1);
	}
    }

    /* If we get here, all we need to do is add the WordPlace */

    if (HashEl->PlacesAllocated - HashEl->PlacesUsed <= 0) {
	if (HashEl->PlacesAllocated <= NPLACES * 3) {
	    HashEl->PlacesAllocated += NPLACES;
	} else if (HashEl->PlacesAllocated <= NPLACESBIGINCR) {
	    HashEl->PlacesAllocated += NPLACESBIGINCR;
	} else {
	    HashEl->PlacesAllocated += NPLACESHUGEINCR;
	}
	HashEl->Places = (t_WordPlace *) erealloc(
	    (char *) HashEl->Places,
	    sizeof(t_WordPlace) * HashEl->PlacesAllocated
	);
    }

    HashEl->Places[HashEl->PlacesUsed++] = WordInfo->WordPlace;
    WordsInCache++;
#ifdef ASCIITRACE
    if (AsciiTrace > 9) {
	fprintf(stderr, "Slot %d Word %s len %d places %d\n",
		Slot, SymbolTable[Slot]->Word,
		WordInfo->Length, SymbolTable[Slot]->PlacesUsed);
    }
#endif
    return;
}

/* DumpCache -- write every cached word out to the database and empty
 * the cache.
 *
 * Each occupied slot is handed to NewEntry() if the word has no WID yet
 * (a fresh one is obtained with GetNextWID()), or to UpdateEntry() if it
 * has.  The slot is then cleared.  If CallFree is non-zero the element
 * and the storage it owns are released with efree(); if it is zero that
 * storage is simply abandoned.
 *
 * The scan stops early once WordsInCache has dropped to zero or below.
 * Afterwards WordsInCache is reset to zero and FlushCache(0) is called.
 *
 * When AsciiTrace is greater than 1, a progress character is written to
 * stderr roughly every sixteenth of the way through the table.
 *
 * It is safe to call this before the table has been created: LastEl is
 * then zero, so the loop body never runs.
 */
void
DumpCache(CallFree)
    int CallFree;
{
    extern void FlushCache();
    /* One character per sixteenth of the table, plus two spares */
    static char ProgressMarks[] = "0123456789ABCDEF?!";
    register int i;
    register t_HashEl *HashEl;
    int Progress = 0;

    for (i = 0; i != LastEl; i++) {
	if (SymbolTable[i]) {
	    unsigned len;

	    HashEl = SymbolTable[i];

	    /* We are going to make a new index entry for the word.
	     * There are two cases -- depending on whether the word
	     * is already indexed or not.
	     * In the former case we must merge the new information.
	     * In the latter case we don't have to read the old info,
	     * but we must make a new entry in the WID Index.
	     */

	    len = strlen(HashEl->Word);

	    if (HashEl->WID == (t_WID) 0) {
		HashEl->WID = GetNextWID(0); /* 0 = don't bother to write */
		NewEntry(HashEl, len);
	    } else {
		UpdateEntry(HashEl, len);
	    }

	    WordsInCache -= HashEl->PlacesUsed;

	    /* Reclaim storage */
	    if (CallFree) {
		extern void SlayWordInfo();

		efree(HashEl->Word);
		efree((char *) HashEl->Places);
		efree((char *) HashEl);
	    }
	    SymbolTable[i] = 0;
	}

	if (AsciiTrace > 1) {
	    if (i >= Progress * (HashSize / 16)) {
		/* Never index past the end of the marker string, which
		 * could otherwise happen if HashSize is not a multiple
		 * of sixteen.
		 */
		if (Progress < (int) sizeof(ProgressMarks) - 1) {
		    fputc(ProgressMarks[Progress], stderr);
		}
		++Progress;
	    }
	}

	if (WordsInCache <= 0) break;
    }
    WordsInCache = 0;
    FlushCache(0); /* the number is non-zero if we only need to clear 1 slot */
}

static void
NewEntry(HashEl, Length)
    t_HashEl *HashEl;
    int Length;
{
    t_pblock *pblock;
    register int i;
    t_WordInfo *WordInfo;

    /** make a WIDIndex entry and mark it as invalid (NOTDONE) */

    /* In order to do this, we must make a "pblock", a structure that
     * reflects the physical database.  This is fairly low-level stuff
     * for efficiency's sake...
     */

    /* allocate a pblock structure.  These are rather devious things, a
     * structure with an array tacked onto the end.
     */
    pblock = (t_pblock *) emalloc(sizeof(t_pblock) +
			HashEl->PlacesUsed * sizeof(t_WordPlace));
    
    pblock->WID = HashEl->WID;
    pblock->ChainStart = 0L; /* address on disk -- not there yet, so 0! */
    pblock->NumberOfWordPlaces = HashEl->PlacesUsed;

    /* fill in the WordPlaces */
    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[i] = HashEl->Places[i]; /* struct copy */
	/* TODO: call qcmp to see if we need a sort */
    }

    /* Now fill in enough of WordInfo to let us use the low-level routines: */
    WordInfo = MakeWordInfo(HashEl->WID, Length, HashEl->Word);
    WordInfo->Offset = 0L;

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */

    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) Putpblock(WordInfo, pblock);
    }
    if (PutWordInfoIntoIndex(WordInfo, pblock->ChainStart) < 0) {
	Error(E_SYS|E_FATAL,
	    "NewEntry: Couldn't insert \"%s\" in database at 0x%lx",
			    WordInfo->Word, pblock->ChainStart);
    }

    /** reclaim storage */
    if (pblock) {
	(void) efree((char *) pblock);
    }
    SlayWordInfo(WordInfo);
}

static void
UpdateEntry(HashEl, Length)
    t_HashEl *HashEl;
    int Length;
{
    extern t_pblock *Getpblock();
    extern t_WordInfo *WID2WordInfo();
    register int i;
    t_pblock *pblock;
    t_WordInfo *WordInfo;
    int MightNeedToSort = 0;

    /** get the old entry */

#ifdef ASCIITRACE
    if (AsciiTrace > 4) {
	fprintf(stderr, "UpdateEntry(%s/WID %ld, wordlen %d)\n",
			HashEl->Word, HashEl->WID, Length);
    }
#endif

    WordInfo = WID2WordInfo(HashEl->WID);
    if (WordInfo == (t_WordInfo *) 0) {
	/* someone else has just deleted it! */
#ifdef ASCIITRACE
	if (AsciiTrace > 2) {
	    fprintf(stderr, "%s: Word %s was deleted!  Making a new entry\n",
		progname, WordInfo->Word);
	}
#endif
	HashEl->WID = GetNextWID(0);
	NewEntry(HashEl, Length);
	return;
    }
    /* It would be best if we could append to the old entry... which is what
     * I had in mind when I designed the disk storage stuff... but you can't.
     */
#ifdef ASCIITRACE
    if (AsciiTrace & 32) {
	fprintWordInfo(stderr, WordInfo, "UpdateEntry");
    }
#endif

    if (WordInfo->WordPlacesInHere == WordInfo->NumberOfWordPlaces) {
	pblock = (t_pblock *) 0;
    } else {
	pblock = Getpblock(WordInfo);
    }

    if (pblock) {
	pblock = (t_pblock *) erealloc((char *) pblock, sizeof(t_pblock) +
	     (pblock->NumberOfWordPlaces +
				    HashEl->PlacesUsed) * sizeof(t_WordPlace));

    } else {
	/* All of the entries used to fit in the index block. */

	pblock = (t_pblock *) emalloc(sizeof(t_pblock) +
			(WordInfo->WordPlacesInHere + HashEl->PlacesUsed) *
			sizeof(t_WordPlace));
	pblock->NumberOfWordPlaces = 0;
        if (WordInfo->WordPlacesInHere < WordInfo->NumberOfWordPlaces) {
	    extern t_WordPlace *GetWordPlaces();

	    if (WordInfo->WordPlaceStart) {
		WordInfo->WordPlaces = GetWordPlaces(
		    WordInfo->WID,
		    WordInfo->WordPlaceStart,
		    WIDBLOCKSIZE - (WordInfo->WordPlaceStart - WordInfo->DataBlock),
		    0L,
		    WordInfo->NumberOfWordPlaces
		);
	    }
	}

	/* Assert: the wordplaces in WordInfo are sorted */
	for (i = 0; i < WordInfo->NumberOfWordPlaces; i++) {
	    pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
				WordInfo->WordPlaces[i]; /* structure copy */

	}
    }

    /* delete the old entry from disk */
    if (WordInfo->Offset) {
	/* Remove the old information from disk.
	 * This isn't as bad as it sounds, as it will be at the start
	 * of the freelist, so when we write it out again it will be
	 * in the buffer cache...  But it would still be faster to append.
	 */
	DeleteWordPlaces(WordInfo->Offset, WordInfo->WID);
    }

    pblock->WID = HashEl->WID;
    WordInfo->Offset = pblock->ChainStart = 0L; /* it's invalid now... */

    /* Merge the WordPlaces */

    /* Assert: we need only compare the last old entry and the
     * first new one to see if we might need a sort.  Note that
     * there must _be_ entries in pblock, as otherwise we'd have called
     * NewEntry() and not UpdateEntry().
     */

    if (pblock->WordPlaces[pblock->NumberOfWordPlaces - 1].FID >=
				HashEl->Places[0].FID) {
	MightNeedToSort = 1;
    }

    for (i = 0; i < HashEl->PlacesUsed; i++) {
	pblock->WordPlaces[pblock->NumberOfWordPlaces++] =
				HashEl->Places[i]; /* copy the struct: */
	/* TODO: call qcmp to check for sorting (actually only need to
	 * check the FIDs of the new entries)
	 */
    }
    
    if (MightNeedToSort) {
	SortWordPlaces(pblock->NumberOfWordPlaces, pblock->WordPlaces);
    }

    WordInfo->NumberOfWordPlaces = pblock->NumberOfWordPlaces;

    /* First, let's make an index entry: */
    if (pblock->NumberOfWordPlaces <= MaxWordPlacesInAWordBlock) {
	(void) MkWIB(WordInfo, pblock);
    }

    /** write out the new entry */
    if (WordInfo->WordPlacesInHere == pblock->NumberOfWordPlaces) {
	/* In this case it all fits into the WID index */
	pblock->ChainStart = 0L;
    } else {
	(void) Putpblock(WordInfo, pblock);
    }
    if (PutWordInfoIntoIndex(WordInfo, pblock->ChainStart) < 0) {
	Error(E_FATAL|E_SYS,
	    "UpdateEntry: Couldn't update \"%s\" in database at 0x%lx",
			    WordInfo->Word, pblock->ChainStart
	);
    }


    /** reclaim storage */
    if (pblock) {
	(void) efree((char *)pblock);
    }
    (void) SlayWordInfo(WordInfo);
}


/*
 * $Log:	wordtable.c,v $
 * Revision 2.16  92/03/30  21:45:59  lee
 * Fixes to trace & progress reports.
 * 
 * Revision 2.15  92/02/16  20:57:11  lee
 * Added argument to FlushCache.
 * 
 * Revision 2.14  92/02/15  06:49:01  lee
 * gcc -Wall fixes.
 * 
 * Revision 2.13  92/02/02  05:23:53  lee
 * New symbol table.
 * 
 * Revision 2.12  92/02/01  23:46:22  lee
 * made Duff's device the default.
 * 
 * Revision 2.11  91/02/20  19:07:37  lee
 * The qxxx fix only worked if ASCIITRACE was defined!
 * 
 * Revision 2.10  90/10/06  00:51:05  lee
 * Prepared for first beta release.
 * 
 * Revision 2.9  90/10/05  23:44:30  lee
 * Major experimentation with new symbol table failed...
 * 
 * Revision 2.8  90/09/26  19:45:02  lee
 * Added call to mallocmap() in ifdef MALLTRACE.
 * 
 * Revision 2.7  90/09/20  18:58:25  lee
 * Added some comments, and deleted a needless test.  Reordered a loop
 * in the (probably vain) hope of a speed-up in the face of paging...
 * 
 * Revision 2.6  90/09/19  20:25:44  lee
 * Don't index "qxxxxxxxx" words (this is a hook for filters...)
 * 
 * Revision 2.5  90/08/29  21:46:11  lee
 * Alpha release
 * 
 * Revision 2.4  90/08/09  19:17:37  lee
 * BSD lint and Saber
 * 
 * Revision 2.3  90/03/21  17:32:31  lee
 * new hashing function, masses, masses better -- the old one only ever
 * used about 6% of the available values!
 * 
 * Revision 2.2  89/10/08  20:47:47  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 * Revision 2.1  89/10/02  01:16:22  lee
 * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
 * 
 * Revision 1.3  89/09/17  23:05:15  lee
 * Various fixes; NumberInBlock now a short...
 * 
 * Revision 1.2  89/09/16  21:18:55  lee
 * First demonstratable version.
 * 
 * Revision 1.1  89/09/07  21:06:20  lee
 * Initial revision
 * 
 */
