/*
$Id: html.c,v 1.71 2005/05/12 15:41:05 karman Exp $
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
This file is part of Swish-e.
Swish-e is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Swish-e is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Swish-e; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
See the COPYING file that accompanies the Swish-e distribution for details
of the GNU GPL and the special exception available for linking against
the Swish-e library.
** Mon May 9 15:51:39 CDT 2005
** added GPL
**---------------------------------------------------------
** ** ** PATCHED 5/13/96, CJC
** Added MatchAndChange for regex in replace rule G.Hill 2/10/98
**
** change sprintf to snprintf to avoid corruption
** added safestrcpy() macro to avoid corruption from strcpy overflow
** SRE 11/17/99
**
** fixed cast to int problems pointed out by "gcc -Wall"
** SRE 2/22/00
**
** 2001-03-17 rasc save real_filename as title (instead full real_path)
** was: compatibility issue to v 1.x.x
**
** 2001-05-09 rasc entities completly rewritten (new module)
** small fix in parseHTMLsummary
**
**
*/
#include "swish.h"
#include "mem.h"
#include "swstring.h"
#include "index.h"
#include "compress.h"
#include "merge.h"
#include "search.h"
#include "docprop.h"
#include "metanames.h"
#include "html.h"
#include "entities.h"
#include "fs.h"
#include "error.h"
/* #### */
/* Forward declaration: parsetag() is defined near the end of this file but is
** called earlier by parseHTMLtitle() and parseHtmlSummary(). */
static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive);
/*
** Extracts the meta name that follows a NAME attribute inside an HTML tag,
** lowercases it, and looks it up in the index header.
**
**   indexf     - index whose header holds the known meta names
**   tag        - text of the tag; searched for "NAME" when `name` is NULL
**   sw         - swish handle (UndefinedMetaTags, verbose, per-doc zone)
**   name       - pointer to the "NAME" attribute inside the tag, or NULL
**   parsed_tag - out: the lowercased meta name.  It is allocated from the
**                per-document temporary zone so it stays valid after this
**                function returns (callers use it for property lookups).
**                Set to NULL when no NAME attribute is found.
**   filename   - only used in diagnostic messages
**
** Returns the matching metaEntry, a newly added one when UndefinedMetaTags
** is UNDEF_META_AUTO and the name is not empty, or NULL otherwise.
** With UNDEF_META_ERROR an unknown name is reported through progerr().
*/
static struct metaEntry *getHTMLMeta(IndexFILE * indexf, char *tag, SWISH *sw, char *name,
                                     char **parsed_tag, char *filename)
{
    char   *temp;
    char   *start;
    char   *word;
    size_t  len;
    struct metaEntry *e = NULL;

    *parsed_tag = NULL;

    if (!name)
    {
        if (!(temp = (char *) lstrstr((char *) tag, (char *) "NAME")))
            return NULL;
    }
    else
        temp = name;

    temp += 4;                  /* strlen("NAME") */

    /* Get past the '=' sign, disregarding any other character */
    while (*temp && *temp != '=')
        temp++;
    if (*temp)
        temp++;

    /* Get to the beginning of the word, disregarding blanks and quotes */
    while (*temp == ' ' || *temp == '"')
        temp++;

    /* The word runs up to the next blank, quote or the end of the tag */
    start = temp;
    while (*temp && *temp != ' ' && *temp != '"')
        temp++;
    len = (size_t) (temp - start);

    /* Copy into zone memory: the result must outlive this stack frame, and
     * sizing it exactly removes the need for any grow/realloc logic. */
    word = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, len + 1);
    memcpy(word, start, len);
    word[len] = '\0';

    /* Use Rainer's routine */
    strtolower(word);

    *parsed_tag = word;

    if ((e = getMetaNameByName(&indexf->header, word)))
        return e;

    if ((sw->UndefinedMetaTags == UNDEF_META_AUTO) && *word)
    {
        if (sw->verbose)
            printf("Adding automatic MetaName '%s' found in file '%s'\n", word, filename);

        return addMetaEntry(&indexf->header, word, META_INDEX, 0);
    }

    /* If it is ok not to have the name listed, just index as no-name */
    if (sw->UndefinedMetaTags == UNDEF_META_ERROR)
        progerr("UndefinedMetaNames=error. Found meta name '%s' in file '%s', not listed as a MetaNames in config", word, filename);

    return NULL;
}
/*
** Indexes the CONTENT value of a META tag and, when the meta name is also a
** property name, stores the value as a document property.
**
** `name` and `content` point at the NAME and CONTENT attributes inside `tag`.
** The value is taken between the first '"' after CONTENT and the next '"'
** that is not preceded by a backslash; the closing quote is overwritten with
** a NUL while the value is processed and restored before returning.
**
** Returns the word count reported by indexstring(), or 0 when nothing was
** indexed.
*/
static int parseMetaData(SWISH * sw, IndexFILE * indexf, char *tag, int filenum, int structure, char *name, char *content, FileRec *thisFileEntry,
                         int *position, char *filename)
{
    struct metaEntry *meta;
    struct metaEntry *prop;
    char   *open_quote;
    char   *value;
    char   *close_quote;
    char   *text;
    char   *parsed_tag;
    int     meta_id;
    int     words = 0;

    /* Lookup (or add if "auto") meta name for tag */
    meta = getHTMLMeta(indexf, tag, sw, name, &parsed_tag, filename);
    meta_id = meta ? meta->metaID : 1;

    /* Find the opening quote after "CONTENT" (7 characters) */
    open_quote = strchr(content + 7, '\"');
    if (!open_quote)
        return words;

    structure |= IN_META;
    value = open_quote + 1;

    /* Find the closing quote, jumping over escaped \" */
    for (close_quote = strchr(value, '\"');
         close_quote && *(close_quote - 1) == '\\';
         close_quote = strchr(close_quote + 1, '\"'))
        ;

    if (close_quote)
        *close_quote = '\0';    /* terminate CONTENT, temporarily */

    /* Convert entities, if requested, and remove newlines */
    text = (char *) sw_ConvHTMLEntities2ISO(sw, (unsigned char *) value);
    remove_newlines(text);

    /* Index only if a metaEntry was found, or undefined names are not ignored */
    if (meta || sw->UndefinedMetaTags != UNDEF_META_IGNORE)
    {
        /* The position is bumped before and after the content unless the
         * meta name is listed in the corresponding "don't bump" list.
         * An unknown meta name always bumps. */
        if (!meta || !isDontBumpMetaName(sw->dontbumpstarttagslist, meta->metaName))
            position[0]++;

        words = indexstring(sw, text, filenum, structure, 1, &meta_id, position);

        if (!meta || !isDontBumpMetaName(sw->dontbumpendtagslist, meta->metaName))
            position[0]++;
    }

    /* If it is a property store it */
    prop = getPropNameByName(&indexf->header, parsed_tag);
    if (prop)
    {
        if (!addDocProperty(&thisFileEntry->docProperties, prop, (unsigned char *) text, strlen(text), 0))
            progwarn("property '%s' not added for document '%s'\n", prop->metaName, filename);
    }

    if (close_quote)
        *close_quote = '\"';    /* restore string */

    return words;
}
/* Extracts the text between <title> tags of an HTML buffer and returns it.
** Only the first TITLETOPLINES lines are scanned, case-insensitively.
** When there is no buffer, or parsetag() finds no title, an empty string
** allocated from the per-document zone is returned instead (never NULL).
*/
char *parseHTMLtitle(SWISH *sw, char *buffer)
{
    char   *fallback = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, 1);
    char   *found = NULL;

    fallback[0] = '\0';

    if (buffer)
        found = parsetag(sw, "title", buffer, TITLETOPLINES, CASE_SENSITIVE_OFF);

    return found ? found : fallback;
}
/* Decides whether a document should be indexed based on its title.
** The title is matched against the "FileRules title" regex list from the
** configuration: returns 0 when a rule matches (ignore the file) and
** 1 otherwise.
*/
int isoktitle(SWISH * sw, char *title)
{
    struct MOD_FS *fs = sw->FS;

    if (match_regex_list(title, fs->filerules.title, "FileRules title"))
        return 0;

    return 1;
}
/* Updates the structure bit mask according to the HTML tag just seen.
**
** `tag` is the tag text without the surrounding angle brackets.  Only the
** element name (everything before the first whitespace) is looked at; the
** tag is cut there temporarily and restored before returning.  An opening
** tag sets the corresponding IN_* bit and a closing tag ("/name") clears it:
**
**   head -> IN_HEAD, title -> IN_TITLE, body -> IN_BODY,
**   h<digit> -> IN_HEADER, em/strong/b/i -> IN_EMPHASIZED
**
** Any other tag leaves `structure` unchanged.  Returns the new mask.
*/
static int getstructure(char *tag, int structure)
{
    const struct
    {
        const char *name;
        int     flag;
    }
    elements[] =
    {
        { "head", IN_HEAD },
        { "title", IN_TITLE },
        { "body", IN_BODY },
        { "em", IN_EMPHASIZED },
        { "strong", IN_EMPHASIZED },
        { "b", IN_EMPHASIZED },
        { "i", IN_EMPHASIZED }
    };
    char   *cut;
    char    saved = 0;
    const char *name;
    int     closing;
    int     flag = 0;           /* 0 == tag does not affect the structure */
    size_t  k;

    /* Cut the tag at the first whitespace so attributes are ignored */
    for (cut = tag; *cut && !isspace((int) ((unsigned char) *cut)); cut++)
        ;
    if (*cut)
    {
        saved = *cut;
        *cut = '\0';
    }
    else
        cut = NULL;

    /* A leading '/' marks a closing tag */
    closing = (tag[0] == '/');
    name = closing ? tag + 1 : tag;

    if (tolower((int) ((unsigned char) name[0])) == 'h' && isdigit((int) ((unsigned char) name[1])))
    {
        /* H1, H2, H3, H4, H5, H6 */
        flag = IN_HEADER;
    }
    else
    {
        for (k = 0; k < sizeof(elements) / sizeof(elements[0]); k++)
        {
            if (strcasecmp(name, elements[k].name) == 0)
            {
                flag = elements[k].flag;
                break;
            }
        }
    }

    if (closing)
        structure &= ~flag;     /* Out */
    else
        structure |= flag;      /* In */

    /* Put back the character that was overwritten above */
    if (cut != NULL)
        *cut = saved;

    return structure;
}
/* Get the MetaData index when the whole tag is passed */
/* Patch by Tom Brown */
/* TAB, this routine is/was somewhat pathetic... but it was pathetic in
1.2.4 too ... someone needed a course in defensive programming... there are
lots of tests below for temp != NULL, but what is desired is *temp != '\0'
(e.g. simply *temp) ... I'm going to remove some strncmp(temp,constant,1)
which are much faster as *temp != constant ...
Anyhow, the test case I've got that's core dumping is:
no trailing quote, no trailing space... and with the missing/broken check for
end of string it scribbles over the stack...
NOTE(review): this historical note talks about checks on `temp`, which matches
getHTMLMeta() rather than parseHtmlSummary() below -- confirm before relying on it.
*/
/* Returns the position just past the first "</title>" in buffer
** (case-insensitive search), or buffer itself when there is none. */
static char *summary_after_title(char *buffer)
{
    char   *p = lstrstr(buffer, "</title>");

    return p ? p + 8 : buffer;  /* 8 is strlen("</title>") */
}

/*
** Builds the description (summary) text for an HTML document.
**
**   buffer - the NUL-terminated document; tags are cut with temporary NULs
**            while scanning and restored afterwards
**   field  - meta name / tag name to take the summary from, or NULL
**   size   - maximum summary length in characters; 0 means no limit
**   sw     - swish handle (per-document zone, entity conversion)
**
** Sources are tried in this order:
**   1. no field but a size: everything after </title>
**   2. text between "<!... META START NAME=field ...>" and the matching
**      "<!... META END ...>" tag
**   3. the CONTENT value of a "<META NAME=field CONTENT=...>" tag
**   4. the content of a <field> ... </field> element (via parsetag())
**   5. everything after </title> and, when present, from the end of the
**      <body ...> tag
**
** The chosen text has newlines and tags removed, entities converted, and is
** truncated to `size`.  The result lives in the per-document zone.  NULL is
** returned only when buffer is NULL.
*/
static char *parseHtmlSummary(char *buffer, char *field, int size, SWISH * sw)
{
    char   *p,
           *q,
           *tag,
           *endtag,
            c = '\0';
    char   *summary = NULL,
           *beginsum = NULL,
           *endsum = NULL,
           *tmp,
           *tmp2,
           *tmp3;
    int     found = 0,
            lensummary;

    if (!buffer)
        return NULL;

    /* Get the summary if no metaname/field is given */
    if (!field && size)
    {
        /* Jump title if it exists; the copy starts right after it */
        p = summary_after_title(buffer);

        summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, strlen(p) + 1);
        strcpy(summary, p);
        remove_newlines(summary);

        /* $$$$ Todo: remove tag and content of scripts, css, java, embeddedobjects, comments, etc */
        remove_tags(summary);
        summary = (char *) sw_ConvHTMLEntities2ISO(sw, (unsigned char *) summary);

        /* 2001-03-13 rasc  keep only <size> bytes of the string */
        if ((int) strlen(summary) > size)
            summary[size] = '\0';
        return summary;
    }

    /* Scan the tags for a meta entry named <field>.  Without a field there
     * is nothing to match against, so the scan is skipped altogether. */
    for (p = field ? buffer : NULL; p && *p;)
    {
        /* Look for non escaped '<'; stop when there is none */
        tag = strchr(p, '<');
        if (!tag || (tag != p && *(tag - 1) == '\\'))
            break;
        tag++;

        /* Look for the matching non escaped '>' */
        for (endtag = tag; (endtag = strchr(endtag, '>')) && *(endtag - 1) == '\\'; endtag++)
            ;
        if (!endtag)
            break;              /* tag not closed -> END */

        /* Terminate the tag text at '>' while it is inspected */
        c = *endtag;
        *endtag++ = '\0';

        if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
        {
            /* META TAG TYPE 1: START/END markers around the summary text */
            if (lstrstr(tag, "START"))
            {
                if ((tmp = lstrstr(tag, "NAME")))
                {
                    tmp += 4;   /* strlen("NAME") */
                    if (lstrstr(tmp, field))
                    {
                        beginsum = endtag;
                        found = 1;
                    }
                }
            }
            else if (found)
            {
                /* END marker after a matching START: the summary stops at
                 * the '<' that opens this tag */
                endsum = tag - 1;
                *(endtag - 1) = c;
                break;
            }
        }
        else if ((tag[0] != '!') && lstrstr(tag, "META") && (tmp = lstrstr(tag, "NAME")) && (tmp2 = lstrstr(tag, "CONTENT")))
        {
            /* META TAG TYPE 2: NAME=field must appear before CONTENT */
            tmp += 4;           /* strlen("NAME") */
            tmp3 = lstrstr(tmp, field);
            if (tmp3 && tmp3 < tmp2)
            {
                tmp2 += 7;      /* strlen("CONTENT") */
                if ((tmp = strchr(tmp2, '=')))
                {
                    for (++tmp; isspace((int) ((unsigned char) *tmp)); tmp++)
                        ;
                    if (*tmp == '\"')
                    {
                        /* Quoted value: ends at the last '"' in the tag, or
                         * at the end of the tag when there is no closing quote */
                        beginsum = tmp + 1;
                        for (tmp = endtag - 1; tmp > beginsum; tmp--)
                            if (*tmp == '\"')
                                break;
                        if (tmp == beginsum)
                            endsum = endtag - 1;
                        else
                            endsum = tmp;
                    }
                    else
                    {
                        /* Unquoted value: runs to the end of the tag */
                        beginsum = tmp;
                        endsum = endtag - 1;
                    }
                    found = 1;
                    *(endtag - 1) = c;
                    break;
                }
            }
        }

        /* Restore the '>' and continue after this tag */
        *(endtag - 1) = c;
        p = endtag;
    }

    if (found && beginsum && endsum && endsum > beginsum)
    {
        lensummary = endsum - beginsum;
        summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, lensummary + 1);
        memcpy(summary, beginsum, lensummary);
        summary[lensummary] = '\0';
    }

    /* If field is set and no metaname is found, let us search */
    /* for something like <field>bla bla</field> */
    if (!summary && field)
    {
        summary = parsetag(sw, field, buffer, 0, CASE_SENSITIVE_OFF);
    }

    /* Finally check for something after title (if exists) and */
    /* after <body> (if exists) */
    if (!summary)
    {
        /* Jump title if it exists */
        p = summary_after_title(buffer);

        /* Let us try to find <body>.  The copy starts at the '>' that closes
         * the body tag, as before; when that '>' is missing we fall back to
         * the post-title text instead of dereferencing NULL. */
        q = p;
        if ((tmp = lstrstr(p, "<body")) && (tmp = strchr(tmp, '>')))
            q = tmp;

        summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, strlen(q) + 1);
        strcpy(summary, q);
    }

    if (summary)
    {
        remove_newlines(summary);
        remove_tags(summary);
        summary = (char *) sw_ConvHTMLEntities2ISO(sw, (unsigned char *) summary);
    }

    if (summary && size && ((int) strlen(summary)) > size)
        summary[size] = '\0';

    return summary;
}
#define NO_TAG 0
#define TAG_CLOSE 1
#define TAG_FOUND 2

/*
** Gets the content between "<parsetag>" and "</parsetag>" from buffer,
** limiting the scan to the first max_lines lines (0 means all lines).
**
**   sw             - swish handle; all scratch memory comes from its
**                    per-document zone
**   parsetag       - element name without angle brackets, e.g. "title"
**   buffer         - NUL-terminated document text (not modified)
**   max_lines      - stop after this many newlines; 0 scans everything
**   case_sensitive - non-zero: match tags with strstr(), else lstrstr()
**
** Only text outside of any tag is collected once the opening tag was seen.
** In the result newlines become spaces, leading/trailing whitespace and
** double quotes are stripped, and remaining double quotes become single
** quotes.
**
** Returns the cleaned content, or NULL when buffer is NULL, the closing tag
** is not found before the end of the buffer or the line limit, or the
** content is empty.
*/
static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive)
{
    char   *begintag;
    char   *endtag;
    char   *tag;
    char   *content;
    char   *newbuf;
    char   *hit;
    char   *p;
    char   *r;
    int     c;
    int     namelen;
    int     tagbuflen = MAXSTRLEN;
    int     totaltaglen;
    int     lencontent = MAXSTRLEN;
    int     curlencontent;
    int     lines = 0;
    int     status = NO_TAG;
    int     i,
            j;

    if (!buffer)
        return NULL;

    namelen = (int) strlen(parsetag);

    /* "<name>" needs namelen + 3 bytes, "</name>" needs namelen + 4 */
    begintag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, namelen + 3);
    endtag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, namelen + 4);
    sprintf(begintag, "<%s>", parsetag);
    sprintf(endtag, "</%s>", parsetag);

    /* One tag buffer is reused for every tag in the document; zone memory
     * cannot be given back, so allocating per tag would only pile up. */
    tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 1);
    tag[0] = '\0';

    content = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 1);
    p = content;
    *p = '\0';

    for (r = buffer;;)
    {
        c = *r++;

        if (c == '\n')
        {
            lines++;
            if (max_lines && lines == max_lines)
                break;
        }

        if (!c)
            return NULL;

        if (c != '<')
        {
            /* Plain text: keep it only while inside the wanted element */
            if (status == TAG_FOUND)
            {
                curlencontent = p - content;
                if (curlencontent == lencontent)
                {
                    /* content buffer is full: double it */
                    newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (lencontent * 2) + 1);
                    memcpy(newbuf, content, curlencontent);
                    lencontent *= 2;
                    content = newbuf;
                    p = content + curlencontent;
                }
                *p++ = c;
            }
            continue;
        }

        /* Collect the whole tag, up to and including '>' */
        totaltaglen = 0;
        tag[totaltaglen++] = '<';
        for (;;)
        {
            c = *r++;
            if (!c)
                return NULL;

            if (totaltaglen == tagbuflen)
            {
                newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 200 + 1);
                memcpy(newbuf, tag, totaltaglen);
                tag = newbuf;
                tagbuflen += 200;
            }

            tag[totaltaglen++] = c;
            if (c == '>')
            {
                tag[totaltaglen] = '\0';
                break;
            }
        }

        /* The closing tag is tested first, as before */
        hit = case_sensitive ? strstr(tag, endtag) : lstrstr(tag, endtag);
        if (hit)
        {
            *p = '\0';

            /* newlines to spaces */
            for (i = 0; content[i]; i++)
                if (content[i] == '\n')
                    content[i] = ' ';

            /* skip over initial spaces and quotes */
            for (i = 0; isspace((int) ((unsigned char) content[i])) || content[i] == '\"'; i++)
                ;

            /* shift buffer to left */
            for (j = 0; content[i]; j++)
                content[j] = content[i++];
            content[j] = '\0';

            /* remove trailing spaces and quotes */
            for (j = (int) strlen(content) - 1; (j >= 0) && (isspace((int) ((unsigned char) content[j])) || content[j] == '\"'); j--)
                content[j] = '\0';

            /* replace double quotes with single quotes -- why? */
            for (j = 0; content[j]; j++)
                if (content[j] == '\"')
                    content[j] = '\'';

            return *content ? content : NULL;
        }

        hit = case_sensitive ? strstr(tag, begintag) : lstrstr(tag, begintag);
        if (hit)
            status = TAG_FOUND;
    }

    /* line limit reached before the closing tag */
    return NULL;
}
/* Indexes the words of an HTML comment.
** `tag` is the comment text starting at its first character, which is
** skipped; the words are indexed with the IN_COMMENTS bit added to
** `structure`.  Returns the count reported by indexstring().
*/
int parsecomment(SWISH * sw, char *tag, int filenum, int structure, int metaID, int *position)
{
    char   *text = tag + 1;
    int     flags = structure | IN_COMMENTS;

    return indexstring(sw, text, filenum, flags, 1, &metaID, position);
}
/* Indexes all the words in a html file and adds the appropriate information
** to the appropriate structures.
**
** sw     - swish handle
** fprop  - file properties; stordesc (if set) selects the summary source and
**          real_path is used in messages
** fi     - file record that receives the document properties
** buffer - the document text; it is cut with NULs in place while scanning
**
** Returns the sum of the word counts reported by indexstring() and its
** wrappers, or -2 when the title is rejected by isoktitle().
*/
int countwords_HTML(SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
{
int ftotalwords;
int *metaID;
int metaIDlen;
int position; /* Position of word in file */
int currentmetanames;
char *p,
*newp,
*tag,
*endtag;
int structure;
FileRec *thisFileEntry = fi;
struct metaEntry *metaNameEntry;
IndexFILE *indexf = sw->indexlist;
struct MOD_Index *idx = sw->Index;
char *Content = NULL,
*Name = NULL,
*summary = NULL;
/* Title with entities converted; parseHTMLtitle() never returns NULL */
char *title = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)parseHTMLtitle(sw, buffer));
/* Skip the whole document when the title matches a "FileRules title" rule */
if (!isoktitle(sw, title))
return -2;
/* A summary is only built when a description is configured for this file */
if (fprop->stordesc)
summary = parseHtmlSummary(buffer, fprop->stordesc->field, fprop->stordesc->size, sw);
addCommonProperties( sw, fprop, fi, title, summary, 0 );
/* Init meta info: metaID[] is the stack of currently open meta names, */
/* starting with room for 16 entries; entry 0 defaults to meta ID 1 */
metaID = (int *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,(metaIDlen = 16) * sizeof(int));
currentmetanames = 0;
ftotalwords = 0;
structure = IN_FILE;
metaID[0] = 1;
position = 1;
for (p = buffer; p && *p;)
{
/* Look for non escaped '<' */
if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
{
/* Index up to the tag */
*tag++ = '\0';
newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
/* indexstring() is always given at least one meta ID (metaID[0]) */
if ( ! currentmetanames )
currentmetanames++;
ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);
/* Now let us look for a not escaped '>' */
for (endtag = tag;;)
if ((endtag = strchr(endtag, '>')))
{
if (*(endtag - 1) != '\\')
break;
else
endtag++;
}
else
break;
if (endtag)
{
/* Terminate the tag text; endtag now points just past the '>' */
*endtag++ = '\0';
if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
{
/* Check for META TAG TYPE 1 */
structure |= IN_META;
if (lstrstr(tag, "START"))
{
char *parsed_tag;
if (
(metaNameEntry =
getHTMLMeta(indexf, tag, sw, NULL, &parsed_tag, fprop->real_path)))
{
/* grow the meta ID stack if needed (zone memory: copy into a larger block) */
if (currentmetanames == metaIDlen)
{
int *newbuf = (int *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, metaIDlen * 2 * sizeof(int));
memcpy((char *)newbuf,(char *)metaID,metaIDlen * sizeof(int));
metaID = newbuf;
metaIDlen *= 2;
}
/* add metaname to array of current metanames */
metaID[currentmetanames] = metaNameEntry->metaID;
/* Bump position for all metanames unless metaname in dontbumppositionOnmetatags */
if (!isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
position++;
currentmetanames++;
p = endtag;
/* If it is also a property store it until a < is found */
if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
{
/* endtag is reused here to mark the next '<', which is cut and restored below */
if ((endtag = strchr(p, '<')))
*endtag = '\0';
p = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
remove_newlines(p); /** why isn't this just done for the entire doc? */
if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char *)p, strlen(p), 0))
progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, fprop->real_path);
if (endtag)
*endtag = '<';
/* resume the main loop at the property text (p), skipping the p = endtag below */
continue;
}
}
}
else if (lstrstr(tag, "END"))
{
/* this will close the last metaname */
if (currentmetanames)
{
currentmetanames--;
if (!currentmetanames)
metaID[0] = 1;
}
}
p = endtag;
}
/* Check for META TAG TYPE 2 */
else if ((tag[0] != '!') && lstrstr(tag, "META") && (Name = lstrstr(tag, "NAME")) && (Content = lstrstr(tag, "CONTENT")))
{
ftotalwords += parseMetaData(sw, indexf, tag, idx->filenum, structure, Name, Content, thisFileEntry, &position, fprop->real_path);
p = endtag;
} /* Check for COMMENT */
else if ((tag[0] == '!') && sw->indexComments)
{
ftotalwords += parsecomment(sw, tag, idx->filenum, structure, 1, &position);
p = endtag;
} /* Default: Continue */
else
{
/* Any other tag only updates the structure bits (head, title, body, ...) */
structure = getstructure(tag, structure);
p = endtag;
}
}
else
p = tag; /* tag not closed: continue */
}
else
{ /* No more '<' (or the next one is escaped): index the rest and stop */
newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);
if ( ! currentmetanames )
currentmetanames++;
ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);
p = NULL;
}
}
return ftotalwords;
}