/* FilterType.c -- Copyright 1989 Liam R. Quin.  All Rights Reserved.
 * This code is NOT in the public domain.
 * See the file COPYRIGHT for full details.
 */

/* FilterType -- determine how to deal with a given file.
 * Part of Liam Quin's NX-Text text retrieval package.
 *
 * $Id: FilterType.c,v 1.10 92/01/31 00:08:08 lee Exp $
 *
 * $Log:	FilterType.c,v $
 * Revision 1.10  92/01/31  00:08:08  lee
 * deleted unused variable AsciiCount.
 * 
 * Revision 1.9  91/06/13  20:27:49  lee
 * fixed a bug in the heuristics...
 * 
 * Revision 1.8  91/03/21  23:36:08  lee
 * Hmmm... didn't think about that change enough!
 * 
 * Revision 1.7  91/03/21  23:29:53  lee
 * oops -- should be using read(), not fread()!!!
 * 
 * Revision 1.6  90/10/06  00:11:56  lee
 * Prepared for first beta release.
 * 
 * Revision 1.5  90/09/24  21:20:31  lee
 * changed a free() to an efree() -- the last one!
 * 
 * Revision 1.4  90/09/20  20:07:35  lee
 * fixed a tiny memory hole...
 * 
 * Revision 1.3  90/08/29  21:46:35  lee
 * Alpha release.
 * 
 * Revision 1.2  90/08/09  19:16:18  lee
 * BSD lint and fixes...
 * 
 * Revision 2.2  89/10/08  20:44:34  lee
 * Working version of nx-text engine.  Addfile and wordinfo work OK.
 * 
 *
 */

#include <stdio.h>
#include <malloc.h>
#include "emalloc.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <ctype.h>

#include "fileinfo.h"
#define FILTERDEF /* see filter.h */
#include "filter.h"
#include "wordrules.h" /* for min word length -- don't index files shorter */

/* Prefix(pref, str) -- non-zero if the string str begins with pref.
 *
 * The first characters are compared before strncmp() is called, as a
 * cheap way to reject most mismatches.  A side effect of that test is
 * that an empty pref only matches an empty str.
 *
 * Both arguments are evaluated more than once, so they must not have
 * side effects.  Every use is parenthesised so that expression
 * arguments (e.g. Buffer + 1) expand correctly.
 */
#define Prefix(pref,str) \
    ((*(pref) == *(str)) && !strncmp((pref), (str), strlen(pref)))

extern int open(), close();
extern int read();
extern int strcontains();

/* The current filter types are:
 * FTYPE_NEWS  1
 * FTYPE_MAIL  2
 * FTYPE_CDMS  3
 * FTYPE_MOSTLYASCII 4
 * FTYPE_C_SOURCE 5
 */

/* InitFilterTable might one day be called from Defaults.c....
 * At which point, it will read an ascii file that describes the
 * various filters, I suppose.
 *
 * For now, it does nothing.  It is only called once, and should return 0
 * for success or -1 for failure.
 */
int
InitFilterTable()
{
    /* There is no filter table to load yet, so there is nothing that
     * can go wrong: always report success (0).
     */
    int Status = 0;

    return Status;
}

int
GetFilterType(FileInfo, StatBuf)
    t_FileInfo *FileInfo;
    struct stat *StatBuf;
{
    int Type = MaxFilterType + 1;
    char Buffer[1024];
    int fd = (-1); /* initialised for lint */
    int AmountRead = 0; /* initialised for lint */
    int ch;
    int Length;
    FILE *fp = (FILE *) 0;

    /* GetFilterType() is called to determine which input filter (if any)
     * should be used to read a given file.
     * This routine should know about compressed files.
     *
     * It currently knows about mail, news and C files.
     * There are also hooks for CDMS files (a word-processing package).
     *
     * If the file should not be indexed at all (e.g. it's a core dump),
     * we return -1.
     */

    if (!FileInfo || !FileInfo->Name || !*(FileInfo->Name)) return (-1);

    if (StatBuf->st_size < MinWordLength) return (-1);

    Length = strlen(FileInfo->Name);

    if (FileInfo->Name[Length - 1] == 'Z' && Length > 2 &&
					FileInfo->Name[Length - 2] == '.') {
	char *Buf = emalloc(Length + 10);

	(void) sprintf(Buf, "zcat < \"%s\"", FileInfo->Name);

	fp = popen(Buf, "r");
	(void) efree(Buf);
	if (fp == (FILE *) 0) {
	    return (-1);
	}
    }

    if (fp) {
	/* Stdio popen() read: */
	if ((AmountRead = fread(Buffer, sizeof Buffer, 1,fp)) < MinWordLength) {
	    (void) pclose(fp);
	    fp = (FILE *) 0; /* try again with read() */
	}
    }

    if (!fp) {
	/* Unix read(2) system call: */
	if ((fd = open(FileInfo->Name, O_RDONLY, 0)) < 0) {
	    return -1;
	}
	if ((AmountRead = read(fd, Buffer, sizeof(Buffer)-1)) < MinWordLength) {
	    (void) close(fd);
	    return -1;
	}
    }
    if (fp) {
	(void) pclose(fp);
    } else {
	(void) close(fd);
    }

    /* Check the magic table for CDMS: */
    if ((unsigned char) Buffer[0] == 128 && Buffer[1] == 'M') {
	if (AmountRead > 35) { /* size of CDMS file header */
	    Type = FTYPE_CDMS;
	    return (FileInfo->FilterType = Type);
	}
    }
     
    if (AmountRead < 30) {
	register char *p = Buffer;

	/* who cares if it's this small? */
	for (; p - Buffer < AmountRead; p++) {
	    if (!isascii(*p)) {
		return (-1);
	    }
	}
	return 0;
    }

    /* Not cdms -- try news/mail;
     * mail files start with From;
     * news starts with From, Path or Relay-Version
     */
    if (isupper(Buffer[0])) {
	Buffer[AmountRead] = '\0';
	AmountRead--;
	if (Prefix("Xref: ", Buffer)) {
	    return (FileInfo->FilterType = FTYPE_NEWS);
	} else if (Prefix("Newsgroups: ", Buffer)) {
	    return (FileInfo->FilterType = FTYPE_NEWS);
	} else if (Prefix("Relay-Version: ", Buffer)) {
	    return (FileInfo->FilterType = FTYPE_NEWS);
	} else if (Prefix("From", Buffer)) {
	    if (strcontains("\nPath: ", Buffer)) {
		/* bug: should only check header, not body! */
		return FTYPE_NEWS;
	    } else {
		return FTYPE_MAIL;
	    }
	} else if (Prefix("Path: ", Buffer)) {
	    if (strcontains("\nNewsgroups: ", Buffer)) {
		return FTYPE_NEWS;
	    } else {
		return FTYPE_MAIL;
	    }
	} else if (Prefix("Return-Path: ", Buffer)) {
	    return FTYPE_MAIL; /* MH-style mail */
	}
    }

    /* look for C, trying not to get muddled up with shell scripts */
    ch = FileInfo->Name[Length - 1];

    if ((ch == 'c' || ch == 'h') && (Length > 2) &&
			    FileInfo->Name[Length - 2] == '.') {
	/* We could require one of
	 * . a comment
	 * . a #[ ^i]*(include|define|ifn?def|if)[ ^i]+
	 * . main[ ^i\n]*(
	 * . a declaration -- int, char, long, unsigned, static
	 * in the first block of the file.
	 * Can't be bothered today.
	 */
	if (strcontains("#line", Buffer)) {
	    return (-1); /* preprocessed already, index the original! */
	    /* we ought to say why we are not indexing it! */
	}

	/* we are very predisposed to thinking of this as C... */
	if (Prefix("#include", Buffer)		||
		strcontains("/*", Buffer)		||
		strcontains("#define", Buffer)	||
		strcontains("argc", Buffer)		||
		strcontains("()", Buffer)		||
		strcontains("#include", Buffer)) {
	    return FTYPE_C_SOURCE;
	}
    }

    /* if still not done, choose between Don't Index and Ascii Filter
     * (which simply strips non-ascii characters).
     */
    if (Type >= MaxFilterType) {
	register char *p;
	int OtherCount = 0;

	for (p = Buffer; p - Buffer < AmountRead; p++) {
	    if (!*p) {
		/* If it has nulls in it, it isn't a normal file,
		 * and we have no idea what to do with it!
		 * (if we did know, it would have had a magic number,
		 * so we wouldn't have got here)
		 */
		Type = (-1);
		break;
	    }
	    if (!isascii(*p)) OtherCount++;
	}
	if (Type > 0) {
	    if (OtherCount < (p - Buffer) / 5) {
		Type = (OtherCount) ? FTYPE_MOSTLYASCII : 0;
	    } else {
		Type = (-1); /* too much garbage */
	    }
	}
    }

    if (Type > MaxFilterType) Type = -1; /* don't index */
    return Type;
}
