/* lqword.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/* lqword -- simple program to print information about individual words.
 *
 * $Id: lqword.c,v 2.12 92/02/15 06:50:17 lee Exp $
 */

#include "globals.h" /* defines and declarations for database filenames */
#include "error.h"

#include <stdio.h>
#include <sys/types.h>
#include <malloc.h>
#include <fcntl.h> /* for fileinfo.h */
#include <ctype.h>

#ifdef SYSV
# include <limits.h>
  /* for USI_MAX, the largest unsigned integer.
   * 4.3 BSD doesn't seem to have this.  I don't know how to get this
   * on BSD systems.
   */
#endif

#ifndef USI_MAX
# define USI_MAX ((unsigned int) -1)
#endif

#include "fileinfo.h"
#include "wordinfo.h"
#include "smalldb.h"
#include "pblock.h"
#include "wordrules.h"
#include "emalloc.h"

/*** Declarations: ***/
/** System calls and library routines: **/
extern void exit();

/** System calls: **/

/** Unix Library Functions: **/
extern char *strncpy();
#ifndef tolower
 /* Only declare tolower() as a function when <ctype.h> has not already
  * provided it as a macro.
  */
 extern int tolower();
#endif

/** lqtext library functions: **/
extern char *UnFlag();
    /* Rebuilds the original form of a word from its canonical form and
     * a set of flags -- see the comment in ShowWordList().
     */
extern t_WordInfo *WID2WordInfo();
    /* Returns the index entry for a WID, or a null pointer if none */
extern int TooCommon();
    /* Non-zero if the word is in the common word list */
extern void cleanupdb();
    /* Closes the database files; called at the end of main() */
extern void SetDefaults();
    /* Handles the arguments understood by all lqtext programs */
extern void DefaultUsage();
    /* Prints the list of the standard options */
extern void DeleteWord();
    /* Removes all entries for a word from the database (the -D option) */

/** functions defined within this file: */
void PrintWordInfo(), AllWordInfo();
void Display(), ShowWordList();
void dbmmarch();

/** Macros and variable definitions **/

#define DISPLAY_ALL 1
#define DISPLAY_NAME 2
    /* These are the possible DisplayMode values -- see main().
     * -A selects DISPLAY_ALL and -a selects DISPLAY_NAME; Display()
     * prints the list of matches when it is given DISPLAY_ALL.
     */

char *progname = 0;
    /* Used for error messages */

int SilentMode = 0;
    /* Set if we were invoked with the -s option.  In this mode, we behave
     * like grep -s, and exit with a zero exit status if one or more of
     * the words were found in the database.
     */

int ListMode = 0;
    /* Set if we are to provide a terser output format suitable for use
     * with lqshow(1L).
     */

int AsciiTrace = 0;
    /* If this is non-zero, we provide debugging information.  The lqtext
     * library also uses this variable.  Setting it to values greater
     * than 1 or 2 will generally provide large amounts of debugging
     * information.  If the library was compiled with -UASCIITRACE,
     * however, there will be much less diagnostic output at higher
     * levels.
     */

int DoNames = 1;
    /* Cleared by the -N option.  When non-zero, ShowWordList() looks up
     * and prints the name of each file; when zero it prints the numeric
     * FID (and, in list mode, the WID) instead.
     */

int CountFiles = 0;
    /* Set by the -C option.  Display() and ShowWordList() then print one
     * summary row per word (WID, word, matches, files) instead of the
     * individual matches.
     */

static char *Revision = "lqword 2.2";
    /* Printed on stderr by the -V option */

/** end of declarations... **/


/* PrintHeaders() -- write the column headings that precede the word
 * listings.  Nothing is written in silent mode or in list mode.  The
 * layout depends on whether -C (CountFiles) was given.
 */
static void
PrintHeaders()
{
    if (SilentMode || ListMode) {
        return;
    }

    if (CountFiles) {
        printf("WID\t%-*s\tmatches\tfiles\n", MaxWordLength + 2, "Word");
        return;
    }

    printf("       WID | Where   | Total   | Word\n");
    puts(
"===========|=========|=========|============================================");
}

/* main() -- parse the options, then either report on each word named on
 * the command line or, when no words are given, walk the whole database.
 *
 * Exit status: 1 after a usage error, or in silent mode when no word was
 * given or none matched; otherwise 0.  (In silent mode PrintWordInfo()
 * exits with status 0 itself as soon as a match is found.)
 */
int
main(argc, argv)
    int argc;
    char *argv[];
{
    extern int optind, getopt();  /* For getopt(3) */
    extern char *optarg;          /* For getopt(3) */
    extern void lqWriteAccess();  /* needed before DeleteWord() for -D */
    int Option;                   /* the option letter from getopt(3) */
    int BadUsage = 0;             /* non-zero means print usage and quit */
    int DisplayMode = 0;          /* 0, or one of the DISPLAY_* constants */

    /* The full path is kept on purpose; lqaddfile(1L) and lqphrase(1L)
     * use only the last component of the command name.
     */
    progname = argv[0];

    /* Let the library handle the arguments common to all lqtext programs */
    SetDefaults(argc, argv);

    for (;;) {
        Option = getopt(argc, argv, "aACD:lNsVxZz:");
        if (Option == EOF) {
            break;
        }

        switch (Option) {
        case 'a':
            DisplayMode = DISPLAY_NAME;
            break;
        case 'A':
            DisplayMode = DISPLAY_ALL;
            break;
        case 'C':
            CountFiles = 1;
            break;
        case 'D':
            /* Removes every entry for the given word from the database,
             * straight away, while the options are still being parsed.
             * Write permission is needed.  (MISFEATURE)
             */
            lqWriteAccess();
            DeleteWord(optarg);
            break;
        case 'l':
            ListMode = 1;
            break;
        case 'N':
            DoNames = 0;
            break;
        case 's':
            SilentMode = 1;
            break;
        case 'V':
            fprintf(stderr, "%s version %s\n", progname, Revision);
            break;
        case 'x':   /* explicit request for the usage message */
        case '?':   /* getopt(3) did not recognise the option */
            BadUsage++;
            break;
        case 'z':
        case 'Z':
            /* already dealt with by SetDefaults() */
            break;
        default:
            break;
        }
    }

    /* The error handling package is not ready to ship, so the usage
     * message is written by hand.
     */
    if (BadUsage) {
        fprintf(stderr, "%s: options are:\n", progname);
        fputs("\
	-D Word -- delete the named word (DANGEROUS!)\n\
	-l	-- list mode, for use with lqshow\n\
	-s	-- silent mode (like grep -s)\n", stderr);
        DefaultUsage();    /* the list of the standard options */
        fputs("\n\
In addition, if no words are given, the following are understood:\n\
	-a	-- print all words\n\
	-A	-- print all matches to all words\n", stderr);
        exit(1);
    }

    if (optind < argc) {
        /* One or more words were given: report on each in turn. */
        PrintHeaders();
        while (optind < argc) {
            PrintWordInfo(argv[optind++]);
        }
    } else if (SilentMode) {
        /* No words were given, so none of them matched.  It could be
         * argued that this case should be an error.
         */
        exit(1);
    } else if (DisplayMode) {
        /* -a or -A without any words: walk every WID in the database. */
        PrintHeaders();
        AllWordInfo(DisplayMode);
    } else {
        /* Neither words nor a display mode: print every word in the
         * database.  This is probably bogus behaviour; it exists so
         * that "lqword | grep" can be used to find words matching a
         * pattern.
         */
        dbmmarch();
    }

    /* Close the database files.  This matters most when the database
     * has been updated by -D, and should probably be done by liblqtext
     * itself.
     */
    cleanupdb();

    /* Reaching this point in silent mode means that nothing matched, so
     * SilentMode doubles as the exit status (0 or 1).
     */
    exit(SilentMode);

    /*NOTREACHED*/
    return 1;
        /* For versions of lint and gcc that do not understand that
         * exit() does not return.
         */
}

/* PrintWordInfo() -- look up a single word and print its index entry.
 *
 * Word is folded to lower case IN PLACE (so the caller's string is
 * modified) and reduced to its canonical form by WordRoot() before it is
 * looked up.
 *
 * Unless SilentMode is set, a warning is issued through Error() when the
 * word is too common to be indexed or cannot be found; otherwise the
 * entry is printed by Display().
 *
 * In SilentMode nothing is printed at all: the program exits with
 * status 0 as soon as a word with at least one match is found, and the
 * function simply returns for any other word.
 */
void
PrintWordInfo(Word)
    char *Word;
{
    extern t_WID Word2WID();
    extern char *WordRoot();

    register char *p;
    t_WordInfo *WordInfo;
    t_WID WID;
    t_WordInfo Root;

    Root.WordPlace.Flags = 0;

    /** Find the canonical form of the word, with plurals reduced to the
     ** singular and letters folded into lower case.
     **/

    /* First, remember if the word originally started with an upper case
     * letter.  The <ctype.h> routines are only defined for values that
     * fit in an unsigned char (and EOF), so plain char must be converted
     * before it is passed to them.
     */
    if (isupper((unsigned char) *Word)) {
        Root.WordPlace.Flags |= WPF_UPPERCASE;
    }

    /* now convert to lower case and measure its length at the same time: */
    for (p = Word; *p; p++) {
        if (isupper((unsigned char) *p)) {
            *p = tolower((unsigned char) *p);
        }
    }

    Root.Length = p - Word;
    Root.Word = Word;

    /* Now call WordRoot() to find the canonical form: */
    Word = WordRoot(&Root);

    /** Now see if the canonical word is too common to list: **/

    if (TooCommon(&Root)) {
        /* It is listed in the common word list, so don't bother looking
         * it up at all
         */
        if (!SilentMode) {
            Error(E_WARN, "No index information for: %s (too common)", Word);
        }
        return;
    }

    /** It is not too common, so look it up: **/

    if (((WID = Word2WID(Word, Root.Length)) == (t_WID) 0) ||
        (WordInfo = WID2WordInfo(WID)) == (t_WordInfo *) 0) {
        if (!SilentMode) {
            if (WID) {
                /* In this case the word is in the database (since it has
                 * a non-zero WID), but not in the word index.  This might
                 * happen if the word is being deleted (or added) by someone
                 * else at this very moment, or if the database is corrupt.
                 */
                Error(E_WARN, "No index information for: %s (WID %lu)",
                                                Word, (unsigned long) WID);
            } else {
                /* In this case the word is neither listed as common nor
                 * found in the database.  Either it was spelt differently
                 * there or it isn't there at all.
                 */
                Error(E_WARN, "No index information for: %s", Word);
            }
        }
        return;
    }

    if (SilentMode) {
        if (WordInfo->NumberOfWordPlaces > 0) {
            /* We found something, so there is no point looking further --
             * we already know enough to exit.  If a lot of words are
             * given, this could be a big efficiency win.  Close the
             * database first, as main() does before it exits.
             */
            cleanupdb();
            exit(0);
        }

        /* The word is in the index but has no matches.  Silent mode must
         * not produce any output, so just release the entry.
         */
        SlayWordInfo(WordInfo);
        return;
    }

    /** Now we have the database entry for the word, so let's print it!
     **/
    Display(WordInfo, DISPLAY_ALL);

    /** Now return the storage used...
     **/
    SlayWordInfo(WordInfo);

    /** All done for this word.
     **/
}

/* Display() -- print information about a single word.
 *
 * WordInfo is the index entry to print.
 * Verbose is one of the DISPLAY_* values; DISPLAY_ALL asks for the list
 * of matches as well as the summary line.
 *
 * With CountFiles set, one tab-separated row is written: WID, word,
 * number of matches, and then the number of files (written by
 * ShowWordList(), or "0" here when the word has no matches).
 * Otherwise, unless ListMode is set, a one-line summary is written.
 * The individual matches are printed by ShowWordList() when CountFiles
 * or ListMode is set or Verbose is DISPLAY_ALL, provided the word has
 * at least one match.
 */
void
Display(WordInfo, Verbose)
    t_WordInfo *WordInfo;
    int Verbose;
{
    char *Buf = emalloc(WordInfo->Length + 1);

    /* Words in a t_WordInfo might not be null terminated, since the
     * storage overhead and the work of putting the nulls there turns out to
     * be significant...  So make a terminated copy, and always print the
     * copy rather than WordInfo->Word.
     */
    (void) strncpy(Buf, WordInfo->Word, WordInfo->Length);
    Buf[WordInfo->Length] = '\0';

    if (CountFiles) {
        /* No newline here: the files column completes the row. */
        printf("%lu\t%-*s\t%lu\t",
                            (unsigned long) WordInfo->WID,
                             MaxWordLength + 2,
                             Buf,
                            (unsigned long) WordInfo->NumberOfWordPlaces
        );
    } else if (!ListMode) {
        /* Print a little header for the word, unless we were asked not to */
        printf("%10lu | %7lu | %7lu | %s\n",
                            (unsigned long) WordInfo->WID,
                            (unsigned long) WordInfo->Offset,
                            (unsigned long) WordInfo->NumberOfWordPlaces,
                            Buf
        );
    }

    if (WordInfo->NumberOfWordPlaces) {
        /* There are occurrences in the database (there might not be if
         * the word has been deleted, or has only just been added); if we
         * want all the matches, print them in the appropriate format:
         */
        if (CountFiles || ListMode || Verbose == DISPLAY_ALL) {
            ShowWordList(WordInfo);
        }
    } else if (CountFiles) {
        /* ShowWordList() normally writes the files column and ends the
         * row.  It is not called for a word with no matches, so finish
         * the row here.
         */
        printf("0\n");
    }

    (void) efree(Buf); /* reclaim storage */
}

void
ShowWordList(WordInfo)
    t_WordInfo *WordInfo;
{
    extern t_pblock *Getpblock();
    t_FileInfo *GetFileInfo();

    long FilesWithThisWord = 0;
    static t_FileInfo *FileInfo = (t_FileInfo *) 0;
    t_pblock *pblock = (t_pblock *) 0;
    t_WordPlace *PP = (t_WordPlace *) 0;
    int Place;
    char *LastRoot = "[internal error lqword.c 392]";
	/* the message is in case I make a coding error!.  The number
	 * was once the line number of the message, but it only needs to
	 * be a distinct enough message to search for.
	 */

    if (WordInfo->WordPlacesInHere >= WordInfo->NumberOfWordPlaces) {
	/* In this case, the match info all fits in the index, so it
	 * does not matter if automatic pre-fetching from the overflow
	 * file "data" happens or not (i.e. if we are using Lazy Evaluation,
	 * it doesn't happen, but it makes no difference in this case).
	 */
	PP = WordInfo->WordPlaces;
    } else if ((pblock = Getpblock(WordInfo)) != (t_pblock *) 0) {
	PP = pblock->WordPlaces;
	/* If Lazy Evaluation is enabled, liblqtext might not have fetched
	 * all of the match information from the overflow database, in
	 * which case we must do it now.
	 */
    }

    if (PP) {
	t_FID LastFID = USI_MAX;
	    /* This is not a plausible FID (File IDentifier), so it
	     * will force a call to GetFileInfo() in the loop below.
	     */
	unsigned int LastFlags = 256 * 2;
	    /* Similarly, this is an impossible flag value, since the
	     * flags are constrained to fit in a single byte.
	     */

	/* cycle through the Place... */
	for (Place = 0; Place < WordInfo->NumberOfWordPlaces; Place++) {

	    char BIF[100]; char WIB[100];
	    register char *p;
	    char *Bp, *Wp;
	    long l;

	    if (LastFlags != PP[Place].Flags) {
		LastFlags = PP[Place].Flags;
		LastRoot = UnFlag(WordInfo, LastFlags);
		    /* UnFlag() takes a canonical (singular, lower-case)
		     * word and a set of flags, and reverses the
		     * transformations implied by the flags.  For example,
		     * if WordInfo->Word is "boy" and flags contain the
		     * Plural flag, you should get "boys" returned.
		     * Since we don't remember whether a word was in all
		     * caps or had only the first letter capitalised (at
		     * the moment, anyway), the routine will return Boys
		     * even if the input was BOYS or BoYs.
		     * Possessives (the boy's books) may also be indicated.
		     */
	    }

	    if (LastFID != PP[Place].FID || FileInfo == (t_FileInfo *) 0) {
		/* The first part of the test means we don't call the
		 * function to retrieve the file name lots of times if
		 * there are multiple matches in the same data file. 
		 * This turns out to be a common case.
		 */

		/* Reclaim storage */
		if (FileInfo) {
		    if (FileInfo->Name) {
			(void) efree(FileInfo->Name);
		    }
		    (void) efree(FileInfo);
		    FileInfo = 0;
		}

		/* Find the file name from the FID.  This routine should
		 * be called FID2FileName(), and may in fact be renamed
		 * in the future.
		 */
		if (DoNames) {
		    if ((FileInfo = GetFileInfo(LastFID = PP[Place].FID)) ==
						    (t_FileInfo *) 0) {
			/* No filename information available.  This sometimes
			 * happens if you rin lqword diring an lqaddfile
			 * session and match a word in one of the new files.
			 * Note that if the output is for reuse, we don't
			 * want to include references to files whose names
			 * we don't have!
			 */
			if (!ListMode) {
			    printf("%20s | %-.5lu/%-.3lu | [FID %d]\n",
				LastRoot,
				PP[Place].BlockInFile,
				PP[Place].WordInBlock,
				PP[Place].FID);
			}
			continue;
		    }
		}
		++FilesWithThisWord;
	    }

	    if (CountFiles) continue;

	    /* This is an inline printf, because otherwise this call
	     * to printf takes over 20% of the execution time, and nearly
	     * 40% for a frequent word (e.g. over 1000 places) !!
	     */
	    p = &BIF[sizeof(BIF) - 1];
	    *p = '\0';
	    if (PP[Place].BlockInFile == 0) {
		*--p = '0';
	    } else for (l = PP[Place].BlockInFile; l; l /= 10) {
		*--p = "0123456789"[l % 10];
	    }
	    Bp = p;

	    p = &WIB[sizeof(WIB) - 1];
	    *p = '\0';
	    {
		register int i = PP[Place].WordInBlock;
		if (i == 0) {
		    *--p = '0';
		} else for (; i; i /= 10) {
		    *--p = "0123456789"[i % 10];
		}
		Wp = p;
	    }

  	    if (ListMode) {
		if (!DoNames) {
		    printf("%ld ", WordInfo->WID);
		}

		while (*Bp) {
		    putchar(*Bp);
		    Bp++;
		}
		putchar(' ');
		while (*Wp) {
		    putchar(*Wp);
		    Wp++;
		}
		putchar(' ');
		if (DoNames) {
		    puts(FileInfo->Name);
		} else {
		    printf("%ld\n", PP[Place].FID);
		}
  	    } else {
		/* Well, if we are not reusing the output, maybe the speed
		 * is not quite so critical...
		 */
  		printf("%20s | %5lu/%3lu F=%3u S=%3u | ",
		    LastRoot,
		    PP[Place].BlockInFile,
		    PP[Place].WordInBlock,
		    PP[Place].Flags, /* XXX */
		    PP[Place].StuffBefore
		);
		if (DoNames) {
		    puts(FileInfo->Name);
		} else {
		    printf("%ld\n", PP[Place].FID);
		}
	    }
	}
    }

    if (CountFiles) {
	printf("%ld\n", FilesWithThisWord);
    }

    if (pblock) {
	/* If we had to go and get the matches ourselves, we had better
	 * release the storage.
	 * Actually we should also be freeing the FileInfo and possibly
	 * the WordInfo as well, but the pblock is the biggest... and I
	 * am only adding comments today, not fixing code (I hope)...
	 * NOTDONE FIXME
	 */
	(void) efree(pblock);
    }
}

/* AllWordInfo() -- print the index entry for every word in the database.
 *
 * Verbose is one of the DISPLAY_* values and is passed to Display()
 * unchanged.
 *
 * Every WID from 1 up to the value returned by GetMaxWID() is tried;
 * WIDs for which WID2WordInfo() returns a null pointer are skipped
 * silently.  Unless ListMode is set, the maximum WID is reported at
 * the end.
 */
void
AllWordInfo(Verbose)
    int Verbose;
{
    extern t_WID GetMaxWID();

    t_WID i;
    t_WID MaxWid = GetMaxWID();
    t_WordInfo *WordInfo;

    /* Loop over all possible WID numbers and print information
     * for each of them.
     */
    for (i = (t_WID) 1; i <= MaxWid; i++) {
        if ((WordInfo = WID2WordInfo(i)) != (t_WordInfo *) 0) {
            Display(WordInfo, Verbose);
            SlayWordInfo(WordInfo);
        }
    } /* for each WID */

    if (!ListMode) {
        /* The cast makes the argument match %lu whatever t_WID is. */
        printf("Maximum WID is %lu\n", (unsigned long) MaxWid);
    }
}

/* dbmmarch() -- write every key of the word index to standard output,
 * one per line.  Keys may be missed if the database is being updated
 * by someone else while we walk it.
 */
void
dbmmarch()
{
    DBM *Database;
    datum Key;

    /* WordIndex is the list of words, defined in "globals.h".  Failure
     * to open it usually means that the user has not set $LQTEXTDIR and
     * did not give the -d database-dir option that is handled by
     * SetDefaults(), called from main().
     */
    Database = startdb(WordIndex);
    if (Database == (DBM *) 0) {
        /* NOTE(review): presumably Error() does not return for
         * E_FATAL -- confirm, since Database is used below regardless.
         */
        Error(E_FATAL, "Can't open database file \"%s\"", WordIndex);
    }

    /* Each key in this database is a word; the stored content is the
     * corresponding WID, which is not needed here.
     */
    Key = dbm_firstkey(Database);
    while (Key.dptr != (char *) 0 && Key.dsize != 0) {
        register long Index;

        /* IMPORTANT NOTE:
         * The keys are not nul-terminated in the database, so they are
         * written one byte at a time, under the control of dsize,
         * instead of with printf() or puts().
         */
        for (Index = 0; Index < Key.dsize; Index++) {
            putchar(Key.dptr[Index]);
        }
        putchar('\n');

        Key = dbm_nextkey(Database);
    }

    enddb(Database);
}

/*
 * $Log:	lqword.c,v $
 * Revision 2.12  92/02/15  06:50:17  lee
 * generate return 1 at the end of main in all cases, for gcc.
 * 
 * Revision 2.11  92/02/15  05:17:03  lee
 * Added call to lqWriteAccess for -D; might not work, though.
 * 
 * Revision 2.8  90/10/06  00:51:00  lee
 * Prepared for first beta release.
 * 
 * Revision 2.7  90/08/29  21:45:37  lee
 * Alpha release
 * 
 * Revision 2.6  90/08/08  22:22:53  lee
 * Added heavy comments.  Cleaned up dbmmarch() and made some other
 * minor fixes.
 * 
 * Revision 2.5  90/08/08  21:06:21  lee
 * Added -x option; removed rude message about getpts bugs.
 * 
 * Revision 2.4  90/04/21  18:50:38  lee
 * fixed a serious bug in the -l mode -- now prints the entire match!
 * 
 * Revision 2.3  90/03/27  13:20:57  lee
 * now passes gcc -Wall
 * 
 * Revision 2.2  89/10/08  20:47:23  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 * Revision 2.1  89/10/02  01:16:10  lee
 * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
 * 
 * Revision 1.3  89/09/17  23:04:42  lee
 * Various fixes; NumberInBlock now a short...
 * 
 * Revision 1.2  89/09/16  21:18:50  lee
 * First demonstratable version.
 * 
 * Revision 1.1  89/09/07  21:06:14  lee
 * Initial revision
 * 
 */
