/* wordrules.h -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/* $Id: wordrules.h,v 1.2 90/10/06 02:18:39 lee Rel1-10 $
 *
 */

/* Rules for determining what an indexable word looks like;
 * These are implemented by the various filters, as well as by
 * the indexing software itself.  This means that the filters
 * don't need to keep track of word lengths, as addfile will do this,
 * but that they should not emit non-word stuff if they can help it,
 * turning it into the equivalent amount (in bytes) of white-space
 * instead.
 * They should also turn words they don't want indexed into "qxxx",
 * with the right number of x's (e.g. "bare" --> "qxxx").
 */

/* A "word" is a letter followed by any combination of
 * letters, digits or '_'.  An embedded (not trailing) ' is also allowed
 * (_ is allowed so that one can index programming languages; strictly
 * speaking, a lot of languages allow _ at the start too, but I don't
 * want to get confused by nroff output etc., which contains lines of
 * underscores)
 *
 * This scheme currently excludes numbers...
 * 31, 31.4 and 31.9e4 will all be ignored.  So will 1987.
 */

/* Character-class predicates implementing the word rules:
 *   StartsWord(ch) -- ch may begin a word (a letter);
 *   WithinWord(ch) -- ch may appear inside a word (a letter, a digit,
 *                     '_' or an apostrophe);
 *   EndsWord(ch)   -- ch may be the last character of a word (a letter
 *                     or a digit).
 * Each yields non-zero when the test succeeds and zero otherwise.
 *
 * The expansions call isalpha()/isalnum(), so <ctype.h> must already have
 * been included, and ch must be a value those routines accept (something
 * representable as an unsigned char, or EOF).
 *
 * The argument and every expansion are parenthesised so that an argument
 * such as (c & 0177) groups as intended; without that, == would bind more
 * tightly than & and the test would silently give the wrong answer.
 * WithinWord may evaluate its argument up to three times, so do not pass
 * an expression with side effects (e.g. getc(fp) or *p++).
 */
#define StartsWord(ch) (isalpha(ch))
#define WithinWord(ch) (isalnum(ch) || ((ch) == '_') || ((ch) == '\''))
#define EndsWord(ch) (isalnum(ch))

/* Word length limits.
 * A word shorter than MinWordLength characters is not indexed at all;
 * a word longer than MaxWordLength characters is truncated to that length.
 */
#define MinWordLength 3
#define MaxWordLength 18

/* Size in bytes of one file block, as used for *.WordPlace.BlockInFile.
 *
 * Every indexed word is at least MinWordLength characters long and must be
 * separated from the next word by at least one character, so each word
 * accounts for at least (MinWordLength + 1) bytes of the block.
 * Seven bits give 128 distinct values (0..127), so a block holding at
 * most 128 words is 128 * (MinWordLength + 1) bytes long.
 */
#define FileBlockSize ((MinWordLength + 1) * (1 << 7))

/* WordPlace Flags:
 * When a plural word is found, or a possessive word, it is reduced to
 * being singular, and flags are set appropriately.
 * Also, a flag is set to say if the word started with a Capital Letter.
 * This puts Window, windows, and Window's all together, but enables them
 * to be differentiated for searching if required.
 * These flags are implemented by WordInfo and addfile, not by the various
 * filters, but the filters must preserve capitalisation of the first letter
 * in each word, and pass through apostrophes within words (like this's).
 */

/* One bit per flag; together they fill exactly eight bits (0x01..0x80). */
#define WPF_WASPLURAL		0x01 /* word ended in "s" */
#define WPF_UPPERCASE		0x02 /* word began with a capital letter */
#define WPF_POSSESSIVE		0x04 /* word ended in "'s" */
#define WPF_ENDEDINING		0x08 /* word ended in "ing" */
#define WPF_LASTWASCOMMON	0x10 /* the preceding word was a common word */
#define WPF_LASTHADLETTERS	0x20 /* letters were skipped to reach this word */
#define WPF_HASSTUFFBEFORE	0x40 /* preceded by other than 1 byte of garbage */
#define WPF_LASTINBLOCK		0x80 /* this is the last word in its block */

/* new note (jan 90):
 * You can't currently have both plural and possessive in the most common case
 * of the boys' muddy feet (for example), as the trailing ' gets deleted.
 * this doesn't matter, but perhaps that combination should be reserved for
 * had-another-standard-ending??? e.g. -ed or -ing, that isn't often followed by
 * -s or -'s...
 *
 * Also, ENDEDINING (ended in "ing") is currently unused entirely.
 * Perhaps if it is set, the plural and possessive bits should index which of
 * four endings was found, although this would preclude special-casing of the
 * s's combination.  Probably better that way.
 *
 * I should very much like to have another flag or two, perhaps embedded in
 * one of the other fields.  This might be feasible if there is a pre-scan
 * when the index is written to determine the most common (modal) flags and
 * distance (currently I assume 1) and to omit these whenever they are the default.
 * In this case, the fact that every occurrence of Jesus starts with a capital
 * letter (and ends in -s, *blush*), can still lead to most of the flags being
 * omitted.
 *
 * The next revision will separate the list of FIDs from the rest of the information,
 * in which case the embedding of the flags becomes a little trickier.  This
 * belongs in the TODO file now, sorry.
 *
 * Liam Quin, January 22nd 1990, at home in Warrington, England (ugh)
 *
 */

/*
 * $Log:	wordrules.h,v $
 * Revision 1.2  90/10/06  02:18:39  lee
 * Prepared for first beta release.
 * 
 * Revision 1.1  90/08/09  19:16:05  lee
 * Initial revision
 * 
 * Revision 2.2  89/10/08  20:47:35  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 * Revision 2.1  89/10/02  01:16:19  lee
 * New index format, with Block/WordInBlock/Flags/BytesSkipped info.
 * 
 * Revision 1.2  89/09/16  21:15:52  lee
 * First demonstratable version.
 * 
 * Revision 1.1  89/09/07  21:06:17  lee
 * Initial revision
 * 
 */
