/*
$Id: index.c,v 1.249 2007/01/05 22:14:10 whmoseley Exp $
**
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**

    This file is part of Swish-e.

    Swish-e is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Swish-e is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Swish-e; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    See the COPYING file that accompanies the Swish-e distribution for details
    of the GNU GPL and the special exception available for linking against
    the Swish-e library.

** Mon May  9 15:51:39 CDT 2005
** added GPL

**--------------------------------------------------------------------
**
**
** PATCHED 5/13/96, CJC
**
** Added code to countwords and countwordstr to disregard the last char
** if required by the config.h
** G. Hill  3/12/97  ghill@library.berkeley.edu
**
** Changed addentry, countwords, countwordstr, parsecomment, rintindex
** added createMetaEntryList, getMeta, parseMetaData
** to support METADATA
** G. Hill 3/18/97 ghill@library.berkeley.edu
**
** Changed removestops to support printing of stop words
** G. Hill 4/7/97
**
** Changed countwords, countwordstr, and parseMetaData to disregard the
** first char if required by the config.h
** G.Hill 10/16/97  ghill@library.berkeley.edu
**
** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively
** remove all ignore characters from the end of each word.
** P. Bergner  10/5/97  bergner@lcse.umn.edu
**
** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of
** the ignore first chars iterative.
** G. Hill 11/19/97 ghill@library.berkeley.edu
**
** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata
** G. Hill 1/14/98
**
** Added regex for replace rule G.Hill 1/98
**
** REQMETANAME - don't index meta tags not specified in MetaNames
** 10/11/99 - Bill Moseley
**
** change sprintf to snprintf to avoid corruption, use MAXPROPLEN instead of literal "20",
** added include of merge.h - missing declaration caused compile error in prototypes,
** added word length arg to Stem() call for strcat overflow checking in stemmer.c
** added safestrcpy() macro to avoid corruption from strcpy overflow
** SRE 11/17/99
**
** fixed misc problems pointed out by "gcc -Wall"
** SRE 2/22/00
**
** Added code for storing word positions in index file
** Jose Ruiz 3/00 jmruiz@boe.es
**
** 04/00 - Jose Ruiz
** Added code for a hash table in index file for searching words
** via getfileinfo in search.c (Lots of addons). Better performance
** with big databases and/or heavy searches (a* or b* or c*)
**
** 04/00 - Jose Ruiz
** Improved number compression function (compress)
** New number decompress function
** Both converted into macros for better performance
**
** 07/00 and 08/00 - Jose Ruiz
** Many modifications to make some functions thread safe
**
** 08/00 - Jose Ruiz
** New function indexstring. Up to now there were 4 functions doing almost
** the same thing: countwords, countwordstr, parseMetaData and parsecomment
** From now on, these 4 functions call indexstring which is the common part
** to all of them. In fact, countwordstr, parseMetaData and parsecomment
** are now simple frontends to indexstring
**
** 2000-11 - rasc
** some redesign, place common index code into a common routine
** FileProp structures, routines
**
** --
** TODO
** $$ there has still to be some redesign to be done.
** $$ swish-e was originally designed to index html only. So the routines ** $$ are for historically reasons scattered ** $$ (e.g. isoktitle (), is ishtml() etc.) ** ** 2000-12 Jose Ruiz ** obsolete routine ishtml removed ** isoktitle moved to html.c ** ** 2001-03-02 rasc Header: write translatecharacters ** 2001-03-14 rasc resultHeaderOutput -H n ** 2001-03-24 rasc timeroutines rearranged ** 2001-06-08 wsm Store word after ENTRY to save memory ** 2001-08 jmruiz All locations stuff rewritten to save memory ** */ #include "swish.h" #include "mem.h" #include "swstring.h" #include "index.h" #include "hash.h" #include "check.h" #include "search.h" #include "merge.h" #include "docprop.h" #include "stemmer.h" #include "double_metaphone.h" #include "error.h" #include "file.h" #include "compress.h" /* Removed due to problems with patents #include "deflate.h" */ #include "html.h" #include "xml.h" #include "parser.h" #include "txt.h" #include "metanames.h" #include "result_sort.h" #include "result_output.h" #include "filter.h" #include "date_time.h" #include "db.h" #include "dump.h" #include "swish_qsort.h" #include "swish_words.h" #include "list.h" static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties ); static void SwapLocData(SWISH *,ENTRY *,unsigned char *,int); static void unSwapLocData(SWISH *,int, ENTRY *); static void sortSwapLocData(ENTRY *); /* -- init structures for this module */ void initModule_Index (SWISH *sw) { int i; struct MOD_Index *idx; idx = (struct MOD_Index *) emalloc(sizeof(struct MOD_Index)); memset( idx, 0, sizeof( struct MOD_Index ) ); sw->Index = idx; idx->filenum = 0; idx->entryArray = NULL; idx->len_compression_buffer = MAXSTRLEN; /* For example */ idx->compression_buffer=(unsigned char *)emalloc(idx->len_compression_buffer); idx->len_worddata_buffer = MAXSTRLEN; /* For example */ idx->worddata_buffer=(unsigned char *)emalloc(idx->len_worddata_buffer); 
idx->sz_worddata_buffer = 0; /* Init entries hash table */ for (i=0; ihashentries[i] = NULL; idx->hashentriesdirty[i] = 0; } /* Economic flag and temp files*/ idx->swap_locdata = SWAP_LOC_DEFAULT; for(i=0;iinode_hash[i]=NULL; /* initialize buffers used by indexstring */ idx->word = (char *) emalloc((idx->lenword = MAXWORDLEN) + 1); idx->swishword = (char *) emalloc((idx->lenswishword = MAXWORDLEN) + 1); idx->plimit=PLIMIT; idx->flimit=FLIMIT; idx->nIgnoreLimitWords = 0; idx->IgnoreLimitPositionsArray = NULL; /* Swapping access file functions */ idx->swap_tell = sw_ftell; idx->swap_write = sw_fwrite; idx->swap_close = sw_fclose; idx->swap_seek = sw_fseek; idx->swap_read = sw_fread; idx->swap_getc = sw_fgetc; idx->swap_putc = sw_fputc; for( i = 0; i swap_location_name[i] = NULL; idx->fp_loc_write[i] = NULL; idx->fp_loc_read[i] = NULL; } /* Index in blocks of chunk_size documents */ idx->chunk_size = INDEX_DEFAULT_CHUNK_SIZE; /* Use this value to avoid using big zones just as a temporary location storage */ idx->optimalChunkLocZoneSize = INDEX_DEFAULT_OPTIMAL_CHUNK_ZONE_SIZE_FOR_LOCATIONS; idx->freeLocMemChain = NULL; /* memory zones for common structures */ idx->perDocTmpZone = Mem_ZoneCreate("Per Doc Temporal Zone", 0, 0); idx->currentChunkLocZone = Mem_ZoneCreate("Current Chunk Locators", 0, 0); idx->totalLocZone = Mem_ZoneCreate("All Locators", 0, 0); idx->entryZone = Mem_ZoneCreate("struct ENTRY", 0, 0); /* table for storing which metaIDs to index */ idx->metaIDtable.max = 200; /* totally random guess */ idx->metaIDtable.num = 0; idx->metaIDtable.array = (int *)emalloc( idx->metaIDtable.max * sizeof(int) ); idx->metaIDtable.defaultID = -1; /* $$$ this is only a fix while http.c and httpserver.c still exist */ idx->tmpdir = estrdup("."); /* By default, we are not in update mode */ idx->update_mode = MODE_INDEX; return; } /* -- release all wired memory for this module -- 2001-04-11 rasc */ void freeModule_Index (SWISH *sw) { struct MOD_Index *idx = sw->Index; int 
i; /* we need to call the real free here */ for( i = 0; i < MAX_LOC_SWAP_FILES ; i++) { if (idx->swap_location_name[i] && isfile(idx->swap_location_name[i])) { if (idx->fp_loc_read[i]) idx->swap_close(idx->fp_loc_read[i]); if (idx->fp_loc_write[i]) idx->swap_close(idx->fp_loc_write[i]); remove(idx->swap_location_name[i]); } if (idx->swap_location_name[i]) efree(idx->swap_location_name[i]); } if(idx->tmpdir) efree(idx->tmpdir); /* Free compression buffer */ efree(idx->compression_buffer); /* free worddata buffer */ efree(idx->worddata_buffer); /* free word buffers used by indexstring */ efree(idx->word); efree(idx->swishword); /* free IgnoreLimit stuff */ if(idx->IgnoreLimitPositionsArray) { for(i=0; iindexlist->header.totalfiles; i++) { if(idx->IgnoreLimitPositionsArray[i]) { efree(idx->IgnoreLimitPositionsArray[i]->pos); efree(idx->IgnoreLimitPositionsArray[i]); } } efree(idx->IgnoreLimitPositionsArray); } /* should be free by now!!! But just in case... */ if (idx->entryZone) Mem_ZoneFree(&idx->entryZone); if (idx->totalLocZone) Mem_ZoneFree(&idx->totalLocZone); if (idx->currentChunkLocZone) Mem_ZoneFree(&idx->currentChunkLocZone); if (idx->perDocTmpZone) Mem_ZoneFree(&idx->perDocTmpZone); if ( idx->entryArray ) efree( idx->entryArray); efree( idx->metaIDtable.array ); /* free module data */ efree (idx); sw->Index = NULL; return; } /* ** ---------------------------------------------- ** ** Module config code starts here ** ** ---------------------------------------------- */ /* -- Config Directives -- Configuration directives for this Module -- return: 0/1 = none/config applied */ int configModule_Index (SWISH *sw, StringList *sl) { struct MOD_Index *idx = sw->Index; char *w0 = sl->word[0]; int retval = 1; char *env_tmp = NULL; if (strcasecmp(w0, "tmpdir") == 0) { if (sl->n == 2) { idx->tmpdir = erealloc( idx->tmpdir, strlen( sl->word[1] ) + 1 ); strcpy( idx->tmpdir, sl->word[1] ); normalize_path( idx->tmpdir ); if (!isdirectory(idx->tmpdir)) progerr("%s: %s is 
not a directory", w0, idx->tmpdir); if ( !( env_tmp = getenv("TMPDIR")) ) if ( !(env_tmp = getenv("TMP")) ) env_tmp = getenv("TEMP"); if ( env_tmp ) progwarn("Configuration setting for TmpDir '%s' will be overridden by environment setting '%s'", idx->tmpdir, env_tmp ); } else progerr("%s: requires one value", w0); } else if (strcasecmp(w0, "IgnoreLimit") == 0) { if (sl->n == 3) { idx->plimit = atol(sl->word[1]); idx->flimit = atol(sl->word[2]); } else progerr("%s: requires two values", w0); } else { retval = 0; /* not a module directive */ } return retval; } /************************************************************************** * Remove a file from the index. Used when the parser aborts * while indexing. Typically because of FileRules. * **************************************************************************/ static void remove_last_file_from_list(SWISH * sw, IndexFILE * indexf) { struct MOD_Index *idx = sw->Index; int i; ENTRY *ep, *prev_ep; LOCATION *l; /* Should be removed */ if(idx->filenum == 0 || indexf->header.totalfiles == 0) progerr("Internal error in remove_last_file_from_list"); /* walk the hash list to remove words */ for (i = 0; i < VERYBIGHASHSIZE; i++) { if (idx->hashentriesdirty[i]) { idx->hashentriesdirty[i] = 0; for (ep = idx->hashentries[i], prev_ep =NULL; ep; ep = ep->next) { if(ep->currentChunkLocationList && (ep->currentChunkLocationList != ep->currentlocation)) { if(ep->currentChunkLocationList->filenum == idx->filenum) { /* Now remove locations */ /* Go until filenum changes or reach the compressed ** area (currentlocation) */ for(l = ep->currentChunkLocationList; l; l = l->next) { if(ep->currentlocation == l || (l->filenum != idx->filenum)) break; } /* Adjust tfrequency if entry is in file */ if(l != ep->currentChunkLocationList) { /* Remove last filenum chunks */ ep->currentChunkLocationList = l; /* Decrease word frequency */ ep->tfrequency--; /* Reset last_filenum. 
At this moment ** ep->u1.last_filenum point contains idx->filenum ** and, after removing the last file it should ** point to the previous one but we do not know ** its value because the previous chunks can bei ** compressed. Fortunately, we are only using ** this value in add_entry routine to ensure ** that we are in a new file. So, resetting ** the value to 0 should be enough */ ep->u1.last_filenum = 0; } } /* If there is no locations we must also remove the word */ /* Do not call efree to remove the entry, entries use ** a MemZone (perDocTmpZone) - Will be freed later */ if(!ep->currentChunkLocationList) { if(!ep->allLocationList) { if(!prev_ep) { idx->hashentries[i] = ep->next; } else { prev_ep->next = ep->next; } /* Adjust word counters */ idx->entryArray->numWords--; indexf->header.totalwords--; } } } else { prev_ep = ep; } } } } /* Decrease index filenum iand totalfiles counter*/ idx->filenum--; indexf->header.totalfiles--; } /************************************************************************** * Index just the file name (or the title) for NoContents files * $$$ this can be removed if libxml2 is used full time **************************************************************************/ static int index_no_content(SWISH * sw, FileProp * fprop, FileRec *fi, char *buffer) { struct MOD_Index *idx = sw->Index; char *title = ""; int n; int position = 1; /* Position of word */ int metaID = 1; /* THIS ASSUMES that that's the default ID number */ /* Look for title if HTML document */ if (fprop->doctype == HTML) { title = parseHTMLtitle( sw , buffer ); if (!isoktitle(sw, title)) return -2; /* skipped because of title */ } #ifdef HAVE_LIBXML2 if (fprop->doctype == HTML2 || !fprop->doctype) return parse_HTML( sw, fprop, fi, buffer ); #endif addCommonProperties( sw, fprop, fi, title, NULL, 0 ); n = indexstring( sw, *title == '\0' ? fprop->real_path : title , idx->filenum, IN_FILE, 1, &metaID, &position); /** ??? 
$$$ doesn't look right -- check this ***/ if ( *title != '\0' ) efree( title ); return n; } /********************************************************************* ** 2001-08 jmruiz - A couple of specialized routines to be used with ** locations and MemZones. The main goal is avoid malloc/realloc/free ** wich produces a lot of fragmentation ** ** The memory will be allocated in blocks of 64 bytes inside a zone. ** (I have tried both 32 and 64. 32 looks fine ** In this way, there is some overhead because when a new block is ** requested from the MemZone, the space is not recovered. But this ** only true for the current document because the MemZone is reset ** onces the document is processed. Then, the space is recovered ** after a MemZoneReset is issued ** ** 2001-09 jmruiz Improved. Now unused space is recovered when asking ** for space. Free nlocks are maintained using a linked list ********************************************************************/ #define LOC_BLOCK_SIZE 32 /* Must be greater than sizeof(LOCATION) and a power of 2 */ #define LOC_MIN_SIZE ((sizeof(LOCATION) + LOC_BLOCK_SIZE - 1) & (~(LOC_BLOCK_SIZE - 1))) struct loc_chain { struct loc_chain *next; int size; }; /******************************************************************** ** 2001-08 jmruiz ** Routine to allocate memory inside a zone for a plain LOCATION ** (frequency is 1). Since we are asking for LOC_BLOCK_SIZE bytes, we ** are loosing some of the space. ** The advantage is that we do not need to call realloc so often. In ** fact, most realloc function work this way. They asks for more memory ** to avoid the overhead of the sequence malloc, memcpy, free. 
********************************************************************/ LOCATION *alloc_location(struct MOD_Index *idx,int size) { struct loc_chain *tmp = (struct loc_chain *) idx->freeLocMemChain; struct loc_chain *big = NULL; LOCATION *tmp2 = NULL; int avail = 0; struct loc_chain *p_avail = NULL; /* Search for a previously freed location of the same size */ while(tmp) { if(tmp->size == size) { if(!tmp2) idx->freeLocMemChain = (LOCATION *)tmp->next; else tmp2->next = (LOCATION *)tmp->next; return (LOCATION *)tmp; } else if(tmp->size > size) { /* Just reserve it to be used if we do not find a match */ big = tmp; } else { p_avail = tmp; avail = tmp->size; /* Check consecutive for consecutive blocks */ while(((unsigned char *)tmp + tmp->size) == (unsigned char *)tmp->next) { avail += tmp->next->size; if(avail == size) { if(!tmp2) idx->freeLocMemChain = (LOCATION *)tmp->next->next; else tmp2->next = (LOCATION *)tmp->next->next; return (LOCATION *)p_avail; } else if(avail > size) { break; } else { tmp = tmp->next; } } } tmp2 = (LOCATION *)tmp; tmp = tmp->next; } /* Perhaps we have a block with greater size */ if(big) { /* Split it */ while(big->size > size) { big->size >>= 1; tmp = (struct loc_chain *) ((unsigned char *)big + big->size); tmp->next = big->next; tmp->size = big->size; if(tmp->size == size) return (LOCATION *)tmp; big->next = tmp; big = tmp; } } /* NO memory in free chain of the same size - Asks for size */ return (LOCATION *)Mem_ZoneAlloc(idx->currentChunkLocZone, size); } LOCATION *new_location(struct MOD_Index *idx) { return (LOCATION *)alloc_location(idx, LOC_MIN_SIZE); } int is_location_full(int size) { int i; /* Fast test. Since LOC_BLOCK_SIZE is the minimum size ... */ if(size % LOC_BLOCK_SIZE) return 0; /* it is not a power of two */ /* Check if size is a power of 2 (32,64,128,256,...) in binary ..000100... 
*/ for(i=LOC_BLOCK_SIZE;;i <<= 1) { if(size>i) { continue; } if((size & i) == size) { return 1; } else { break; } } return 0; } /******************************************************************** ** 2001-08 jmruiz ** Routine to reallocate memory inside a zone for a previous allocated ** LOCATION (frequency > 1). ** A new block is allocated only if the previous becomes full ********************************************************************/ LOCATION *add_position_location(void *oldp, struct MOD_Index *idx, int frequency) { LOCATION *newp = NULL; struct loc_chain *tmp = NULL; int oldsize; oldsize = sizeof(LOCATION) + (frequency - 1) * sizeof(int); /* Check for available size in block */ if(is_location_full(oldsize)) { /* Not enough size - Allocate a new block. Size rounded to LOC_BLOCK_SIZE */ newp = (LOCATION *)alloc_location(idx,oldsize << 1); memcpy((void *)newp,(void *)oldp,oldsize); /* Add old zone to the free chain of blocks */ tmp = (struct loc_chain *)oldp; tmp->next = (struct loc_chain *)idx->freeLocMemChain; tmp->size = oldsize; idx->freeLocMemChain = (LOCATION *) tmp; } else /* Enough size */ newp = oldp; return newp; } #ifdef USE_BTREE /********************************************************************** * file_is_newer_than_existing * - returns true if the new file (based on fprop) is newer than the existing * * Call with: * fprop - file prop for the new file * sw - current SWISH * existing_number - file number * * Returns: * true if new file is newer than existing and existing should be deleted * ***********************************************************************/ static int file_is_newer_than_existing( SWISH *sw, FileProp * fprop, int existing_filenum) { IndexFILE *indexf = sw->indexlist; int ret; propEntry *existing_prop; propEntry *new_prop; FileRec fi; int error_flag; unsigned long tmp; /* Get the date from the existing file */ /* Fetch metaEntry */ if(!indexf->modified_meta) indexf->modified_meta = getPropNameByName( &indexf->header, 
AUTOPROPERTY_LASTMODIFIED ); memset(&fi, 0, sizeof( FileRec )); fi.filenum = existing_filenum; existing_prop = ReadSingleDocPropertiesFromDisk(indexf, &fi, indexf->modified_meta->metaID, 0 ); /* Don't need the fi struct any more, but can't call freefileinfo() */ if ( fi.prop_index ) efree( fi.prop_index ); /* Create a property for the new file */ if ( !fprop->mtime ) /* was a time provided? */ new_prop = NULL; /* $$$ should be MISSING or NO_PROPERTY */ else /* create a property */ { tmp = PACKLONG(fprop->mtime); new_prop = CreateProperty( indexf->modified_meta, (unsigned char *)&tmp, sizeof( tmp ), 1, &error_flag ); } /* -1 = existing < new_prop | +1 existing > new_prop */ ret = Compare_Properties( indexf->modified_meta, existing_prop, new_prop ); /* Skip new file is older or same age as existing file */ /* and at least one of the files has a date */ /* if neither file has a date then the new one will replace existing */ if (ret >= 0 && ( existing_prop || new_prop ) ) { if (sw->verbose >= 3) { if ( new_prop ) printf(" - Update mode - File '%s' same or older than existing filenum: %d - (Skipping it)\n", fprop->real_path, existing_filenum); else printf(" - Update mode - File '%s' exists with date, but new files does not have a date. Keeping existing file. filenum: %d\n", fprop->real_path, existing_filenum); } freeProperty( existing_prop ); freeProperty( new_prop ); return 0; } /* new file is newer so replace old file */ if (sw->verbose >= 3) printf(" - Update mode - File '%s' replaced existing file number %d because %s\n", fprop->real_path, existing_filenum, existing_prop && new_prop ? "it is newer than the existing file" : existing_prop || new_prop ? 
"only the new file has a date" : "neither file has a date" ); freeProperty( existing_prop ); freeProperty( new_prop ); return 1; /* go ahead and remove */ } #endif /*************************************************************************** * check_for_replace - Tests if replacing a file in update mode * Call with: * SWISH *sw - the swish object * FileProp *fprop - file properties * * Returns: * true if ok to index new file * false means skip indexing this file * ****************************************************************************/ static int check_for_replace( SWISH *sw, FileProp * fprop ) { #ifndef USE_BTREE return 1; #else int existing_filenum; int existing_is_deleted; int existing_word_count = 0; IndexFILE *indexf = sw->indexlist; int update_mode = sw->Index->update_mode; char *update_string; int return_value; int delete_existing = 0; /* flag to delete existing file */ /* Return true if not in update or remove mode */ if ( MODE_UPDATE != update_mode && MODE_REMOVE != update_mode ) return 1; update_string = MODE_UPDATE == update_mode ? "Update" : "Remove"; /* Always skip in remove mode. In update, skip index if doesn't exist as below */ return_value = MODE_UPDATE == update_mode ? 1 : 0; /* how to exit based on mode */ existing_filenum = DB_ReadFileNum(sw, fprop->real_path, indexf->DB); /* If the file already exists then likely will remove the file, so lookup the old word count */ /* Assumes that zero words indicates that a file was already deleted */ if ( existing_filenum ) existing_word_count = getTotalWordsInFile( indexf, existing_filenum ); existing_is_deleted = existing_filenum && ( 0 == existing_word_count); /* just to make it clear */ if ( sw->verbose >= 5 ) printf("\nFile %s. Existing filenum: %d. 
Existing is deleted: %d Existing wordcount: %d\n", fprop->real_path, existing_filenum, existing_is_deleted, existing_word_count ); /* Skip if file does not already exist or has not already been deleted */ if ( !existing_filenum ) { if ( sw->verbose >= 4 ) printf(" - %s Mode - File '%s' is a new file and does not exist in the index\n", update_string, fprop->real_path ); return return_value; } if ( existing_is_deleted ) { if ( sw->verbose >= 4 ) printf(" - %s Mode - Existing file '%s' has already been deleted from the index\n", update_string, fprop->real_path ); return return_value; } /* At this point we have an existing file */ /* MODE_REMOVE is always deleted. In update mode, delete if current file is newer than existing */ delete_existing = MODE_REMOVE == update_mode ? 1 : file_is_newer_than_existing( sw, fprop, existing_filenum ); if ( delete_existing ) { DB_RemoveFileNum(sw,existing_filenum,indexf->DB); indexf->header.removedfiles++; indexf->header.removed_word_positions += existing_word_count; if ( sw->verbose >= 3 ) printf(" - %s Mode - Removed existing file '%s' (#%d) from index\n", update_string, fprop->real_path, existing_filenum ); return return_value; /* keep indexing in update mode, other wise don't index */ } return 0; /* don't index the new file -- we are either keeping or are in remove mode */ #endif } /*********************************************************************** -- Start the real indexing process for a file. -- This routine will be called by the different indexing methods -- (httpd, filesystem, etc.) 
-- The indexed file may be the -- - real file on filesystem -- - tmpfile or work file (shadow of the real file) -- Checks if file has to be send thru filter (file stream) -- 2000-11-19 rasc ***********************************************************************/ void do_index_file(SWISH * sw, FileProp * fprop) { int (*countwords)(SWISH *sw,FileProp *fprop, FileRec *fi, char *buffer); IndexFILE *indexf = sw->indexlist; int wordcount; char *rd_buffer = NULL; /* complete file read into buffer */ struct MOD_Index *idx = sw->Index; char strType[30]; int i; FileRec fi; /* place to hold doc properties */ memset( &fi, 0, sizeof( FileRec ) ); wordcount = -1; /* skip file is the last_mod date is newer than the check date */ if (sw->mtime_limit && fprop->mtime < sw->mtime_limit) { if (sw->verbose >= 3) progwarn("Skipping %s: last_mod date is too old\n", fprop->real_path); /* external program must seek past this data (fseek fails) */ if (fprop->fp) flush_stream( fprop ); return; } /* Upon entry, if fprop->fp is non-NULL then it's already opened and ready to be read from. This is the case with "prog" external programs, *except* when a filter is selected for the file type. If a filter is used with "prog" a temporary file was created (fprop->work_file), and fprop->fp will be NULL (as is with http and fs access methods). 
2001-05-13 moseley */ /* Get input file handle */ if (fprop->hasfilter) { /* This checks for a non-null file handle -- but may return true with popen regardless */ if ( !FilterOpen(fprop) ) progerr("Failed to open filter for file '%s'",fprop->real_path); } else if ( !fprop->fp ) { fprop->fp = sw_fopen(fprop->work_path, F_READ_TEXT ); if ( !fprop->fp ) { progwarnno("Failed to open: '%s': ", fprop->work_path); return; } } else /* Already open - flag to prevent closing the stream used with "prog" */ fprop->external_program++; /** Replace the path for ReplaceRules **/ if ( sw->replaceRegexps ) { int matched = 0; fprop->real_path = process_regex_list( fprop->real_path, sw->replaceRegexps, &matched ); } /* Check for the need to remove an existing file first */ if ( !check_for_replace( sw, fprop ) ) { /* report file skipped if not in remove mode * all files are skipped in remove mode */ if ( sw->verbose >= 1 && MODE_REMOVE != sw->Index->update_mode ) printf("Document '%s' not added to index.\n\n", fprop->real_path); if (fprop->fp) flush_stream( fprop ); return; } /** Read the buffer, if not a stream parser **/ #ifdef HAVE_LIBXML2 if ( !fprop->doctype || fprop->doctype == HTML2 || fprop->doctype == XML2 || fprop->doctype == TXT2 ) rd_buffer = NULL; else #endif /* -- Read all data, last 1 is flag that we are expecting text only */ rd_buffer = read_stream(sw, fprop, 1); /* just for fun so we can show total bytes shown */ sw->indexlist->total_bytes += fprop->fsize; /* Set which parser to use */ switch (fprop->doctype) { case TXT: strcpy(strType,"TXT"); countwords = countwords_TXT; break; case HTML: strcpy(strType,"HTML"); countwords = countwords_HTML; break; case XML: strcpy(strType,"XML"); countwords = countwords_XML; break; case WML: strcpy(strType,"WML"); countwords = countwords_HTML; break; #ifdef HAVE_LIBXML2 case XML2: strcpy(strType,"XML2"); countwords = parse_XML; break; case HTML2: strcpy(strType,"HTML2"); countwords = parse_HTML; break; case TXT2: 
strcpy(strType,"TXT2"); countwords = parse_TXT; break; default: strcpy(strType,"DEFAULT (HTML2)"); countwords = parse_HTML; break; #else /* Default if libxml not installed */ default: strcpy(strType,"DEFAULT (HTML)"); countwords = countwords_HTML; break; #endif } if (sw->verbose >= 3) printf(" - Using %s parser - ",strType); /* Check for NoContents flag and just save the path name */ /* $$$ Note, really need to only read_stream if reading from a pipe. */ /* $$$ waste of disk IO and memory if reading from file system */ if (fprop->index_no_content) countwords = index_no_content; /* Make sure all meta flags are cleared (incase a parser aborts) */ ClearInMetaFlags( &indexf->header ); /* Now bump the file counter */ idx->filenum++; indexf->header.totalfiles++; fi.filenum = idx->filenum; /** PARSE **/ wordcount = countwords(sw, fprop, &fi, rd_buffer); if (!fprop->external_program) /* external_program is not set if a filter is in use */ { if (fprop->hasfilter) FilterClose(fprop); /* close filter pipe - should the filter be flushed? */ else sw_fclose(fprop->fp); /* close file */ } /* Else, it's -S prog so make sure we read all the bytes we are suppose to read! 
*/ /* Can remove the check for fprop->bytes_read once read_stream is no longer used */ else if ( fprop->bytes_read && fprop->bytes_read < fprop->fsize ) flush_stream( fprop ); if (sw->verbose >= 3) { if (wordcount > 0) printf(" (%d words)\n", wordcount); else if (wordcount == 0) printf(" (no words indexed)\n"); else if (wordcount == -1) printf(" (not opened)\n"); else if (wordcount == -2) printf(" (Skipped due to 'FileRules title' setting)\n"); else if (wordcount == -3) printf(" (Skipped due to Robots Exclusion Rule in meta tag)\n"); fflush(stdout); } /* If indexing aborted, remove the last file entry */ if ( wordcount == -3 || wordcount == -2 ) { remove_last_file_from_list( sw, indexf ); return; } /* Continue if a file was not indexed */ if ( wordcount < 0 ) return; if ( DEBUG_MASK & DEBUG_PROPERTIES ) dump_file_properties( indexf, &fi ); /* write properties to disk, and release docprop array (and the prop index array) */ /* Currently this just passes sw, and assumes only one index file when indexing */ WritePropertiesToDisk( sw , &fi ); #ifdef USE_BTREE /* Add the value pair (real_path, filenum) to the database */ DB_WriteFileNum(sw,fi.filenum,fprop->real_path,strlen(fprop->real_path),indexf->DB); /* We always need this value in USE_BTREE mode */ setTotalWordsPerFile(indexf, fi.filenum - 1,wordcount); #else /* Save total words per file */ if ( !indexf->header.ignoreTotalWordCountWhenRanking ) { setTotalWordsPerFile(indexf, fi.filenum - 1,wordcount); } #endif /* Compress the entries */ { ENTRY *ep; /* walk the hash list, and compress entries */ for (i = 0; i < VERYBIGHASHSIZE; i++) { if (idx->hashentriesdirty[i]) { idx->hashentriesdirty[i] = 0; for (ep = idx->hashentries[i]; ep; ep = ep->next) CompressCurrentLocEntry(sw, ep); } } /* Coalesce word positions int a more optimal schema to avoid maintain the location data contiguous */ if(idx->filenum && ((!(idx->filenum % idx->chunk_size)) || (Mem_ZoneSize(idx->currentChunkLocZone) > idx->optimalChunkLocZoneSize))) { 
for (i = 0; i < VERYBIGHASHSIZE; i++) for (ep = idx->hashentries[i]; ep; ep = ep->next) coalesce_word_locations(sw, ep); /* Make zone available for reuse */ Mem_ZoneReset(idx->currentChunkLocZone); idx->freeLocMemChain = NULL; } } /* Make zone available for reuse */ Mem_ZoneReset(idx->perDocTmpZone); return; } ENTRY *getentry(SWISH * sw, char *word) { IndexFILE *indexf = sw->indexlist; struct MOD_Index *idx = sw->Index; int hashval; ENTRY *e; if (!idx->entryArray) { idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY)); idx->entryArray->numWords = 0; idx->entryArray->elist = NULL; } /* Compute hash value of word */ hashval = verybighash(word); /* Look for the word in the hash array */ for (e = idx->hashentries[hashval]; e; e = e->next) if (strcmp(e->word, word) == 0) break; /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */ idx->hashentriesdirty[hashval] = 1; /* Word found, return it */ if (e) return e; /* Word not found, so create a new word */ e = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word)); strcpy(e->word, word); e->next = idx->hashentries[hashval]; idx->hashentries[hashval] = e; /* Init values */ e->tfrequency = 0; e->u1.last_filenum = 0; e->currentlocation = NULL; e->currentChunkLocationList = NULL; e->allLocationList = NULL; idx->entryArray->numWords++; indexf->header.totalwords++; return e; } /* Adds a word to the master index tree. */ void addentry(SWISH * sw, ENTRY *e, int filenum, int structure, int metaID, int position) { int found; LOCATION *tp, *newtp, *prevtp; IndexFILE *indexf = sw->indexlist; struct MOD_Index *idx = sw->Index; indexf->total_word_positions_cur_run++; if ( DEBUG_MASK & DEBUG_WORDS ) { struct metaEntry *m = getMetaNameByID(&indexf->header, metaID); printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? 
m->metaName : "PROP_UNKNOWN", metaID, e->word, position, structure); if ( structure & IN_EMPHASIZED ) printf(" EM"); if ( structure & IN_HEADER ) printf(" HEADING"); if ( structure & IN_COMMENTS ) printf(" COMMENT"); if ( structure & IN_META ) printf(" META"); if ( structure & IN_BODY ) printf(" BODY"); if ( structure & IN_HEAD ) printf(" HEAD"); if ( structure & IN_TITLE ) printf(" TITLE"); if ( structure & IN_FILE ) printf(" FILE"); printf(" )\n"); } /* Check for first time */ if(!e->tfrequency) { /* create a location record */ tp = (LOCATION *) new_location(idx); tp->filenum = filenum; tp->frequency = 1; tp->metaID = metaID; tp->posdata[0] = SET_POSDATA(position,structure); tp->next = NULL; e->currentChunkLocationList = tp; e->tfrequency = 1; e->u1.last_filenum = filenum; return; } /* Word found -- look for same metaID and filename */ /* $$$ To do it right, should probably compare the structure, too */ /* Note: filename not needed due to compress we are only looking at the current file */ /* Oct 18, 2001 -- filename is needed since merge adds words in non-filenum order */ tp = e->currentChunkLocationList; found = 0; while (tp != e->currentlocation) { if(tp->metaID == metaID && tp->filenum == filenum ) { found =1; break; } tp = tp->next; } /* matching metaID NOT found. 
So, add a new LOCATION record onto the word */ /* This expands the size of the location array for this word by one */ if(!found) { /* create the new LOCATION entry */ tp = (LOCATION *) new_location(idx); tp->filenum = filenum; tp->frequency = 1; /* count of times this word in this file:metaID */ tp->metaID = metaID; tp->posdata[0] = SET_POSDATA(position,structure); /* add the new LOCATION onto the array */ tp->next = e->currentChunkLocationList; e->currentChunkLocationList = tp; /* Count number of different files that this word is used in */ if ( e->u1.last_filenum != filenum ) { e->tfrequency++; e->u1.last_filenum = filenum; } return; /* all done */ } /* Otherwise, found matching LOCATION record (matches filenum and metaID) */ /* Just add the position number onto the end by expanding the size of the LOCATION record */ /* 2001/08 jmruiz - Much better memory usage occurs if we use MemZones */ /* MemZone will be reset when the doc is completely proccesed */ newtp = add_position_location(tp, idx, tp->frequency); if(newtp != tp) { if(e->currentChunkLocationList == tp) e->currentChunkLocationList = newtp; else for(prevtp = e->currentChunkLocationList;;prevtp = prevtp->next) { if(prevtp->next == tp) { prevtp->next = newtp; break; } } tp = newtp; } tp->posdata[tp->frequency++] = SET_POSDATA(position,structure); } /******************************************************************* * Adds common file properties to the last entry in the file array * (which should be the current one) * * * Call with: * *SWISH - need for indexing words * *fprop * *fi * *summary - document summary (why here?) 
* start - start position of a sub-document * size - size in bytes of document * * Returns: * void * * Note: * Uses cached meta entries (created in metanames.c) to save the * metaEntry lookup by name costs * ********************************************************************/ void addCommonProperties( SWISH *sw, FileProp *fprop, FileRec *fi, char *title, char *summary, int start ) { struct metaEntry *q; docProperties **properties = &fi->docProperties; unsigned long tmp; int metaID; INDEXDATAHEADER *header = &sw->indexlist->header; char *filename = fprop->real_path; /* should always have a path */ int filenum = fi->filenum; /* Check if filename is internal swish metadata -- should be! */ if ((q = getPropNameByName(header, AUTOPROPERTY_DOCPATH))) addDocProperty( properties, q, (unsigned char *)filename, strlen(filename),0); /* Perhaps we want it to be indexed ... */ if ((q = getMetaNameByName(header, AUTOPROPERTY_DOCPATH))) { int metaID, positionMeta; metaID = q->metaID; positionMeta = 1; indexstring(sw, filename, filenum, IN_FILE, 1, &metaID, &positionMeta); } /* This allows extracting out parts of a path and indexing as a separate meta name */ if ( sw->pathExtractList ) index_path_parts( sw, fprop->orig_path, sw->pathExtractList, header, properties ); /* Check if title is internal swish metadata */ if ( title ) { if ( (q = getPropNameByName(header, AUTOPROPERTY_TITLE))) addDocProperty(properties, q, (unsigned char *)title, strlen(title),0); /* Perhaps we want it to be indexed ... 
*/ if ( (q = getMetaNameByName(header, AUTOPROPERTY_TITLE))) { int positionMeta; metaID = q->metaID; positionMeta = 1; indexstring(sw, title, filenum, IN_FILE, 1, &metaID, &positionMeta); } } if ( summary ) { if ( (q = getPropNameByName(header, AUTOPROPERTY_SUMMARY))) addDocProperty(properties, q, (unsigned char *)summary, strlen(summary),0); if ( (q = getMetaNameByName(header, AUTOPROPERTY_SUMMARY))) { int metaID, positionMeta; metaID = q->metaID; positionMeta = 1; indexstring(sw, summary, filenum, IN_FILE, 1, &metaID, &positionMeta); } } /* Currently don't allow indexing by date or size or position */ /* mtime is a time_t, but we don't have an entry for NOT A TIME. Does anyone care about the first second of 1970? */ if ( fprop->mtime && (q = getPropNameByName(header, AUTOPROPERTY_LASTMODIFIED))) { tmp = (unsigned long) fprop->mtime; tmp = PACKLONG(tmp); /* make it portable */ addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1); } if ( (q = getPropNameByName(header, AUTOPROPERTY_DOCSIZE))) { /* Use the disk size, if available */ tmp = (unsigned long) ( fprop->source_size ? 
fprop->source_size : fprop->fsize); tmp = PACKLONG(tmp); /* make it portable */ addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1); } if ( (q = getPropNameByName(header, AUTOPROPERTY_STARTPOS))) { tmp = (unsigned long) start; tmp = PACKLONG(tmp); /* make it portable */ addDocProperty(properties, q, (unsigned char *) &tmp, sizeof(tmp),1); } } /******************************************************************* * extracts out parts from a path name and indexes that part * ********************************************************************/ static void index_path_parts( SWISH *sw, char *path, path_extract_list *list, INDEXDATAHEADER *header, docProperties **properties ) { int metaID; int positionMeta = 1; int matched = 0; /* flag if any patterns matched */ while ( list ) { char *str = process_regex_list( estrdup(path), list->regex, &matched ); if ( !matched ) { /* use default? */ if ( list->meta_entry->extractpath_default ) { metaID = list->meta_entry->metaID; indexstring(sw, list->meta_entry->extractpath_default, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta); } } else { struct metaEntry *q; metaID = list->meta_entry->metaID; indexstring(sw, str, sw->Index->filenum, IN_FILE, 1, &metaID, &positionMeta); if ((q = getPropNameByName(header, list->meta_entry->metaName ))) addDocProperty( properties, q, (unsigned char *)str, strlen(str),0); efree( str ); } matched = 0; list = list->next; } } /* Just goes through the master list of files and ** counts 'em. */ int getfilecount(IndexFILE * indexf) { return indexf->header.totalfiles; } /* Removes words that occur in over _plimit_ percent of the files and ** that occur in over _flimit_ files (marks them as stopwords, that is). */ /* 05/00 Jose Ruiz ** Recompute positions when a stopword is removed from lists ** This piece of code is terrorific because the first goal ** was getting the best possible performace. So, the code is not ** very clear. 
** The main problem is to recalculate word positions for all ** the words after removing the automatic stop words. This means ** looking at all word's positions for each automatic stop word ** and decrement its position */ /* 2001/02 jmruiz - rewritten - all the proccess is made in one pass to achieve better performance */ /* 2001-08 jmruiz - rewritten - adapted to new locations and zone schema */ /* 2002-07 jmruiz - rewritten - adapted to new -e schema */ int getNumberOfIgnoreLimitWords(SWISH *sw) { return sw->Index->nIgnoreLimitWords; } void getPositionsFromIgnoreLimitWords(SWISH * sw) { int i, k, m, stopwords, percent, chunk_size, metaID, frequency, tmpval, filenum; unsigned int *posdata; unsigned int local_posdata[MAX_STACK_POSITIONS]; LOCATION *l, *next; ENTRY *ep, *ep2; ENTRY **estop = NULL; int estopsz = 0, estopmsz = 0; int totalwords; IndexFILE *indexf = sw->indexlist; int totalfiles = getfilecount(indexf); struct IgnoreLimitPositions **filepos = NULL; struct IgnoreLimitPositions *fpos; struct MOD_Index *idx = sw->Index; unsigned char *p, *q, *compressed_data, flag; int last_loc_swap; stopwords = 0; totalwords = indexf->header.totalwords; idx->nIgnoreLimitWords = 0; idx->IgnoreLimitPositionsArray = NULL; if (!totalwords || idx->plimit >= NO_PLIMIT) return; if (sw->verbose) { printf("\r Getting IgnoreLimit stopwords: ..."); fflush(stdout); } if (!estopmsz) { estopmsz = 1; estop = (ENTRY **) emalloc(estopmsz * sizeof(ENTRY *)); } /* this is the easy part: Remove the automatic stopwords from the hash array */ /* Builds a list estop[] of ENTRY's that need to be removed */ for (i = 0; i < VERYBIGHASHSIZE; i++) { for (ep2 = NULL, ep = sw->Index->hashentries[i]; ep; ep = ep->next) { percent = (ep->tfrequency * 100) / totalfiles; if (percent >= idx->plimit && ep->tfrequency >= idx->flimit) { add_word_to_hash_table( &indexf->header.hashstoplist, ep->word, HASHSIZE); /* for printing words removed at the end */ idx->IgnoreLimitWords = addswline( 
idx->IgnoreLimitWords, ep->word); stopwords++; /* unlink the ENTRY from the hash */ if (ep2) ep2->next = ep->next; else sw->Index->hashentries[i] = ep->next; totalwords--; sw->Index->entryArray->numWords--; indexf->header.totalwords--; /* Reallocte if more space is needed */ if (estopsz == estopmsz) { estopmsz *= 2; estop = (ENTRY **) erealloc(estop, estopmsz * sizeof(ENTRY *)); } /* estop is an array of ENTRY's that need to be removed */ estop[estopsz++] = ep; } else ep2 = ep; } } /* If we have automatic stopwords we have to recalculate word positions */ if (estopsz) { /* Build an array with all the files positions to be removed */ filepos = (struct IgnoreLimitPositions **) emalloc(totalfiles * sizeof(struct IgnoreLimitPositions *)); for (i = 0; i < totalfiles; i++) filepos[i] = NULL; /* Process each automatic stop word */ for (i = 0; i < estopsz; i++) { ep = estop[i]; if (sw->verbose) { printf("\r Getting IgnoreLimit stopwords: %25s",ep->word); fflush(stdout); } if(sw->Index->swap_locdata) { /* jmruiz - Be careful with this lines!!!! If we have a lot of words, ** probably this code can be very slow and may be rethought. 
** Fortunately, only a few words must usually raise a IgnoreLimit option */ last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1); unSwapLocData(sw, last_loc_swap, ep ); } /* Run through location list to get positions */ for(l=ep->allLocationList;l;) { compressed_data = (unsigned char *) l; /* Preserve next element */ next = *(LOCATION **)compressed_data; /* Jump pointer to next element */ p = compressed_data + sizeof(LOCATION *); metaID = uncompress2(&p); memcpy((char *)&chunk_size,(char *)p,sizeof(chunk_size)); p += sizeof(chunk_size); filenum = 0; while(chunk_size) { /* Read on all items */ q = p; uncompress_location_values(&p,&flag,&tmpval,&frequency); filenum += tmpval; if(frequency > MAX_STACK_POSITIONS) posdata = (unsigned int *) emalloc(frequency * sizeof(int)); else posdata = local_posdata; uncompress_location_positions(&p,flag,frequency,posdata); chunk_size -= (p-q); /* Now build the list by filenum of meta/position info */ if (!filepos[filenum - 1]) { fpos = (struct IgnoreLimitPositions *) emalloc(sizeof(struct IgnoreLimitPositions)); fpos->pos = (int *) emalloc(frequency * 2 * sizeof(int)); fpos->n = 0; filepos[filenum - 1] = fpos; } else /* file exists in array. 
just append the meta and position data */ { fpos = filepos[filenum - 1]; fpos->pos = (int *) erealloc(fpos->pos, (fpos->n + frequency) * 2 * sizeof(int)); } for (m = fpos->n * 2, k = 0; k < frequency; k++) { fpos->pos[m++] = metaID; fpos->pos[m++] = GET_POSITION(posdata[k]); } fpos->n += frequency; if(posdata != local_posdata) efree(posdata); } l = next; } if(sw->Index->swap_locdata) Mem_ZoneReset(idx->totalLocZone); } /* sort each file sort entries by metaname/position */ for (i = 0; i < totalfiles; i++) { if (filepos[i]) swish_qsort(filepos[i]->pos, filepos[i]->n, 2 * sizeof(int), &icomp2); } } idx->nIgnoreLimitWords = estopsz; idx->IgnoreLimitPositionsArray = filepos; if (sw->verbose) { printf("\r Getting IgnoreLimit stopwords: Complete \n"); fflush(stdout); } } /* 2001-08 jmruiz - Adjust positions if there was IgnoreLimit stopwords ** In all cases, removes null end of chunk marks */ void adjustWordPositions(unsigned char *worddata, int *sz_worddata, int n_files, struct IgnoreLimitPositions **ilp) { int frequency, metaID, tmpval, r_filenum, w_filenum; unsigned int *posdata; int i,j,k; unsigned long r_nextposmeta; unsigned char *w_nextposmeta; unsigned int local_posdata[MAX_STACK_POSITIONS]; unsigned char r_flag, *w_flag; unsigned char *p, *q; /* TODO total_word_pos count for index header -- should be adjusted to reflect any positions +/- */ p = worddata; tmpval = uncompress2(&p); /* tfrequency */ metaID = uncompress2(&p); /* metaID */ r_nextposmeta = UNPACKLONG2(p); w_nextposmeta = p; p += sizeof(long); q = p; r_filenum = w_filenum = 0; while(1) { /* Read on all items */ uncompress_location_values(&p,&r_flag,&tmpval,&frequency); r_filenum += tmpval; if(frequency <= MAX_STACK_POSITIONS) posdata = local_posdata; else posdata = (unsigned int *) emalloc(frequency * sizeof(int)); uncompress_location_positions(&p,r_flag,frequency,posdata); if(n_files && ilp && ilp[r_filenum - 1]) { for(i = 0; i < ilp[r_filenum - 1]->n; i++) { tmpval = ilp[r_filenum - 1]->pos[2 * i]; 
if( tmpval >= metaID) break; } if(tmpval == metaID) { for(j = 0; j < frequency ; j++) { for(k = i; k < ilp[r_filenum - 1]->n ; k++) { if(ilp[r_filenum - 1]->pos[2 * k] != metaID || ilp[r_filenum - 1]->pos[2 * k + 1] > GET_POSITION(posdata[j])) break; /* End */ } posdata[j] = SET_POSDATA(GET_POSITION(posdata[j]) - (k-i), GET_STRUCTURE(posdata[j])); } } } /* Store the filenum incrementally to save space */ compress_location_values(&q,&w_flag,r_filenum - w_filenum,frequency, posdata); w_filenum = r_filenum; /* store positions */ compress_location_positions(&q,w_flag,frequency,posdata); if(posdata != local_posdata) efree(posdata); if(!p[0]) /* End of chunk mark */ { r_filenum = 0; /* reset filenum */ p++; } if ((p - worddata) == *sz_worddata) break; /* End of worddata */ if ((unsigned long)(p - worddata) == r_nextposmeta) { if(q != p) PACKLONG2(q - worddata, w_nextposmeta); metaID = uncompress2(&p); q = compress3(metaID,q); r_nextposmeta = UNPACKLONG2(p); p += sizeof(long); w_nextposmeta = q; q += sizeof(long); w_filenum = 0; } } *sz_worddata = q - worddata; PACKLONG2(*sz_worddata, w_nextposmeta); } /* ** This is an all new ranking algorithm. I can't say it is based on anything, ** but it does seem to be better than what was used before! 
** 2001/05 wsm ** ** Parameters: ** sw ** Pointer to SWISH structure ** ** freq ** Number of times this word appeared in this file ** ** tfreq ** Number of files this word appeared in this index (not used for ranking) ** ** words ** Number of owrds in this file ** ** structure ** Bit mask of context where this word appeared ** ** ignoreTotalWordCount ** Ignore total word count when ranking (config file parameter) */ int entrystructcmp(const void *e1, const void *e2) { const ENTRY *ep1 = *(ENTRY * const *) e1; const ENTRY *ep2 = *(ENTRY * const *) e2; return (strcmp(ep1->word, ep2->word)); } /* Sorts the words */ void sort_words(SWISH * sw) { int i, j; ENTRY *e; if (!sw->Index->entryArray || !sw->Index->entryArray->numWords) return; if (sw->verbose) { printf("Sorting %s words alphabetically\n", comma_long( sw->Index->entryArray->numWords ) ); fflush(stdout); } /* Build the array with the pointers to the entries */ sw->Index->entryArray->elist = (ENTRY **) emalloc(sw->Index->entryArray->numWords * sizeof(ENTRY *)); /* Fill the array with all the entries */ for (i = 0, j = 0; i < VERYBIGHASHSIZE; i++) for (e = sw->Index->hashentries[i]; e; e = e->next) sw->Index->entryArray->elist[j++] = e; /* Sort them */ swish_qsort(sw->Index->entryArray->elist, sw->Index->entryArray->numWords, sizeof(ENTRY *), &entrystructcmp); } /* Sort chunk locations of entry e by metaID, filenum */ static void sortChunkLocations(ENTRY * e) { int i, j, k, filenum,metaID,frequency; unsigned char flag; unsigned char *ptmp, *ptmp2, *compressed_data; int *pi = NULL; LOCATION *l, *prev = NULL, **lp; /* Very trivial case */ if (!e) return; if(!e->currentChunkLocationList) return; /* Get the number of locations in chunk */ for(i = 0, l = e->currentChunkLocationList; l; i++) l=*(LOCATION **)l; /* Get next location */ /* Compute array wide */ j = 2 * sizeof(int) + sizeof(void *); /* Compute array size */ ptmp = (void *) emalloc(j * i); /* Build an array with the elements to compare and pointers to data 
*/ for(l = e->currentChunkLocationList, ptmp2 = ptmp; l; ) { pi = (int *) ptmp2; compressed_data = (unsigned char *)l; /* Jump next offset */ compressed_data += sizeof(LOCATION *); metaID = uncompress2(&compressed_data); uncompress_location_values(&compressed_data,&flag,&filenum,&frequency); pi[0] = metaID; pi[1] = filenum; ptmp2 += 2 * sizeof(int); lp = (LOCATION **)ptmp2; *lp = l; ptmp2 += sizeof(void *); /* Get next location */ l=*(LOCATION **)l; /* Get next location */ } /* Sort them */ swish_qsort(ptmp, i, j, &icomp2); /* Store results */ for (k = 0, ptmp2 = ptmp; k < i; k++) { ptmp2 += 2 * sizeof(int); l = *(LOCATION **)ptmp2; if(!k) e->currentChunkLocationList = l; else prev->next =l; ptmp2 += sizeof(void *); prev = l; } l->next =NULL; /* Free the memory of the array */ efree(ptmp); } void coalesce_all_word_locations(SWISH * sw, IndexFILE * indexf) { int i; ENTRY *epi; for (i = 0; i < VERYBIGHASHSIZE; i++) { if ((epi = sw->Index->hashentries[i])) { while (epi) { coalesce_word_locations(sw, epi); epi = epi->next; } } } } /* Write the index entries that hold the word, rank, and other information. */ #ifndef USE_BTREE void write_index(SWISH * sw, IndexFILE * indexf) { int i; ENTRYARRAY *ep; ENTRY *epi; int totalwords; int percent, lastPercent, n; int last_loc_swap; #define DELTA 10 if ( !(ep = sw->Index->entryArray )) return; /* nothing to do */ totalwords = ep->numWords; DB_InitWriteWords(sw, indexf->DB); if (sw->verbose) { printf(" Writing word text: ..."); fflush(stdout); } /* This is not longer needed. So free it as soon as possible */ Mem_ZoneFree(&sw->Index->perDocTmpZone); /* This is not longer needed. 
So free it as soon as possible */ Mem_ZoneFree(&sw->Index->currentChunkLocZone); /* If we are swaping locs to file, reset memory zone */ if(sw->Index->swap_locdata) Mem_ZoneReset(sw->Index->totalLocZone); n = lastPercent = 0; for (i = 0; i < totalwords; i++) { if ( sw->verbose && totalwords > 10000 ) // just some random guess { n++; percent = (n * 100)/totalwords; if (percent - lastPercent >= DELTA ) { printf("\r Writing word text: %3d%%", percent ); fflush(stdout); lastPercent = percent; } } epi = ep->elist[i]; /* why check for stopwords here? removestopwords could have remove them */ if ( !is_word_in_hash_table( indexf->header.hashstoplist, epi->word ) ) { /* Write word to index file */ write_word(sw, epi, indexf); } else epi->u1.wordID = (sw_off_t)-1; /* flag as a stop word */ } if (sw->verbose) { printf("\r Writing word text: Complete\n" ); printf(" Writing word hash: ..."); fflush(stdout); } n = lastPercent = 0; for (i = 0; i < VERYBIGHASHSIZE; i++) { if ( sw->verbose ) { n++; percent = (n * 100)/VERYBIGHASHSIZE; if (percent - lastPercent >= DELTA ) { printf("\r Writing word hash: %3d%%", percent ); fflush(stdout); lastPercent = percent; } } if ((epi = sw->Index->hashentries[i])) { while (epi) { /* If it is not a stopword write it */ if (epi->u1.wordID > (sw_off_t)0) DB_WriteWordHash(sw, epi->word,epi->u1.wordID,indexf->DB); epi = epi->next; } } } if (sw->verbose) { printf("\r Writing word hash: Complete\n" ); printf(" Writing word data: ..."); fflush(stdout); } n = lastPercent = last_loc_swap = -1; for (i = 0; i < VERYBIGHASHSIZE; i++) { /* If we are in economic mode -e restore locations */ if(sw->Index->swap_locdata) { if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap) { /* Free not longer needed memory */ Mem_ZoneReset(sw->Index->totalLocZone); last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1); unSwapLocData(sw, last_loc_swap, NULL ); } } if ((epi = sw->Index->hashentries[i])) { while (epi) { /* If we are in 
economic mode -e we must sort locations by metaID, filenum */ if(sw->Index->swap_locdata) { sortSwapLocData(epi); } if ( sw->verbose && totalwords > 10000 ) // just some random guess { n++; percent = (n * 100)/totalwords; if (percent - lastPercent >= DELTA ) { printf("\r Writing word data: %3d%%", percent ); fflush(stdout); lastPercent = percent; } } if (epi->u1.wordID > (sw_off_t)0) /* Not a stopword */ { build_worddata(sw, epi); write_worddata(sw, epi, indexf); } epi = epi->next; } } } if (sw->verbose) printf("\r Writing word data: Complete\n" ); DB_EndWriteWords(sw, indexf->DB); /* free all ENTRY structs at once */ Mem_ZoneFree(&sw->Index->entryZone); /* free all location compressed data */ Mem_ZoneFree(&sw->Index->totalLocZone); efree(ep->elist); } #else void write_index(SWISH * sw, IndexFILE * indexf) { int i; ENTRYARRAY *ep; ENTRY *epi; int totalwords; int percent, lastPercent, n; int last_loc_swap; long old_wordid; unsigned char *buffer =NULL; int sz_buffer = 0; #define DELTA 10 if ( !(ep = sw->Index->entryArray )) return; /* nothing to do */ totalwords = ep->numWords; /* Write words */ DB_InitWriteWords(sw, indexf->DB); if (sw->verbose) { printf(" Writing word text: ..."); fflush(stdout); } /* This is not longer needed. So free it as soon as possible */ Mem_ZoneFree(&sw->Index->perDocTmpZone); /* This is not longer needed. 
So free it as soon as possible */ Mem_ZoneFree(&sw->Index->currentChunkLocZone); /* If we are swaping locs to file, reset memory zone */ if(sw->Index->swap_locdata) Mem_ZoneReset(sw->Index->totalLocZone); n = lastPercent = last_loc_swap = -1; for (i = 0; i < VERYBIGHASHSIZE; i++) { /* If we are in economic mode -e restore locations */ if(sw->Index->swap_locdata) { if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap) { /* Free not longer needed memory */ Mem_ZoneReset(sw->Index->totalLocZone); last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1); unSwapLocData(sw, last_loc_swap, NULL ); } } if ((epi = sw->Index->hashentries[i])) { while (epi) { /* If we are in economic mode -e we must sort locations by metaID, filenum */ if(sw->Index->swap_locdata) { sortSwapLocData(epi); } if ( sw->verbose && totalwords > 10000 ) // just some random guess { n++; percent = (n * 100)/totalwords; if (percent - lastPercent >= DELTA ) { printf("\r Writing word data: %3d%%", percent ); fflush(stdout); lastPercent = percent; } } /* why check for stopwords here? removestopwords could have remove them */ if ( !is_word_in_hash_table( indexf->header.hashstoplist, epi->word ) ) /* Not a stopword */ { /* Build worddata buffer */ build_worddata(sw, epi); /* let's see if word is already in the index */ old_wordid = read_worddata(sw, epi, indexf, &buffer, &sz_buffer); /* If exists, we have to add the new worddata buffer to the old one */ if(old_wordid) { add_worddata(sw, buffer, sz_buffer); efree(buffer); buffer = NULL; sz_buffer = 0; delete_worddata(sw, old_wordid, indexf); write_worddata(sw, epi, indexf); update_wordID(sw, epi, indexf); } else { /* Reset last error. 
It was set in read_worddata if ** word was not found */ sw->lasterror = RC_OK; /* Write word to index file */ write_worddata(sw, epi, indexf); write_word(sw, epi, indexf); } } epi = epi->next; } } } if (sw->verbose) { printf("\r Writing word text: Complete\n" ); fflush(stdout); } DB_EndWriteWords(sw, indexf->DB); /* free all ENTRY structs at once */ Mem_ZoneFree(&sw->Index->entryZone); /* free all location compressed data */ Mem_ZoneFree(&sw->Index->totalLocZone); efree(ep->elist); } #endif static void addword( char *word, SWISH * sw, int filenum, int structure, int numMetaNames, int *metaID, int *word_position) { int i; /* Add the word for each nested metaname. */ for (i = 0; i < numMetaNames; i++) (void) addentry(sw, getentry(sw,word), filenum, structure, metaID[i], *word_position); (*word_position)++; } /* Gets the next white-space delimited word */ int next_word( char **buf, char **word, int *lenword ) { int i; /* skip any whitespace */ while ( **buf && isspace( (unsigned char) **buf) ) (*buf)++; i = 0; while ( **buf && !isspace( (unsigned char) **buf) ) { /* reallocate buffer, if needed */ if ( i == *lenword ) { *lenword *= 2; *word = erealloc(*word, *lenword + 1); } (*word)[i++] = **buf; (*buf)++; } if ( i ) { (*word)[i] = '\0'; return 1; } return 0; } /* Gets the next non WordChars delimited word */ /* Bumps position if needed */ int next_swish_word(SWISH * sw, char **buf, char **word, int *lenword, int *word_position ) { int i; IndexFILE *indexf = sw->indexlist; int bump_flag = 0; /* skip non-wordchars and check for bump chars */ while ( **buf && !iswordchar(indexf->header, **buf ) ) { if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf)) bump_flag++; (*buf)++; } i = 0; while ( **buf && iswordchar(indexf->header, **buf) ) { /* It doesn't really make sense to have a WordChar that's also a bump char */ if (!bump_flag && isBumpPositionCounterChar(&indexf->header, (int) **buf)) bump_flag++; /* reallocate buffer, if needed */ if ( i == 
*lenword ) { *lenword *= 2; *word = erealloc(*word, *lenword + 1); } (*word)[i++] = **buf; (*buf)++; } /* If any bump chars were found then bump to prevent phrase matching */ if ( bump_flag ) (*word_position)++; if ( i ) { (*word)[i] = '\0'; stripIgnoreLastChars(&indexf->header, *word); stripIgnoreFirstChars(&indexf->header, *word); return *word ? 1 : 0; } return 0; } /****************************************************************** * Build the list of metaIDs that need to be indexed * * Returns number of IDs found * * ******************************************************************/ static int build_metaID_list( SWISH *sw ) { struct MOD_Index *idx = sw->Index; METAIDTABLE *metas = &idx->metaIDtable; IndexFILE *indexf = sw->indexlist; INDEXDATAHEADER *header = &indexf->header; struct metaEntry *m; int i; /* cache the default metaID for speed */ if ( metas->defaultID == -1 ) { m = getMetaNameByName( header, AUTOPROPERTY_DEFAULT ); metas->defaultID = m ? m->metaID : 0; } metas->num = 0; /* Would be smart to track number of metas flagged so not to loop through all for every lookup */ for ( i = 0; i < header->metaCounter; i++) { m = header->metaEntryArray[i]; if ( (m->metaType & META_INDEX) && m->in_tag ) { if ( ++metas->num > metas->max ) metas->array = (int *)erealloc( metas->array, (metas->max = metas->num + 200) ); metas->array[metas->num - 1] = m->metaID; } } /* If no metas found to index, then add default metaID */ if ( !metas->num && metas->defaultID ) metas->array[metas->num++] = metas->defaultID; return metas->num; } /****************************************************************** * Index a string * * ******************************************************************/ /* 05/2001 Jose Ruiz - Changed word and swishword buffers to make this routine ** thread safe */ int indexstring(SWISH * sw, char *s, int filenum, int structure, int numMetaNames, int *metaID, int *position) { int wordcount = 0; IndexFILE *indexf = sw->indexlist; char *buf_pos; /* pointer 
to current position */ char *cur_pos; /* pointer to position with a word */ struct MOD_Index *idx = sw->Index; /* Assign word buffers */ char *word = idx->word; int lenword = idx->lenword; char *swishword = idx->swishword; int lenswishword = idx->lenswishword; struct swline *sp_stem = NULL; int not_fuzzy = FUZZY_NONE == indexf->header.fuzzy_data->stemmer->fuzzy_mode; /* Generate list of metaIDs to index unless passed in */ if ( !metaID ) { if ( !(numMetaNames = build_metaID_list( sw )) ) return 0; else metaID = idx->metaIDtable.array; } /* current pointer into buffer */ buf_pos = s; /* get the next word as defined by whitespace */ while ( next_word( &buf_pos, &word, &lenword ) ) { if ( DEBUG_MASK & DEBUG_PARSED_WORDS ) printf("White-space found word '%s'\n", word ); strtolower(word); /* is this a useful feature? */ if ( indexf->header.hashuselist.count ) { if ( is_word_in_hash_table( indexf->header.hashuselist, word ) ) { addword(word, sw, filenum, structure, numMetaNames, metaID, position ); wordcount++; } continue; } /* Check for buzzwords */ if ( indexf->header.hashbuzzwordlist.count ) { /* only strip when buzzwords are being used since stripped again as a "swish word" */ stripIgnoreLastChars(&indexf->header, word); stripIgnoreFirstChars(&indexf->header, word); if ( !*word ) /* stripped clean? 
*/ continue; if ( is_word_in_hash_table( indexf->header.hashbuzzwordlist, word ) ) { addword(word, sw, filenum, structure, numMetaNames, metaID, position ); wordcount++; continue; } } /* Translate chars */ TranslateChars(indexf->header.translatecharslookuptable, (unsigned char *)word); cur_pos = word; /* Now split the word up into "swish words" */ while ( next_swish_word( sw, &cur_pos, &swishword, &lenswishword, position ) ) { /* Weed out Numbers - or anything that's all the listed chars */ if ( indexf->header.numberchars_used_flag ) { unsigned char *c = (unsigned char *)swishword; /* look for any char that's NOT in the lookup table */ while ( *c ) { if ( !indexf->header.numbercharslookuptable[(int) *c ] ) break; c++; } /* if got all the way through the string then it's only those chars */ if ( !*c ) continue; /* skip this word */ } /* Check Begin & EndCharacters */ if (!indexf->header.begincharslookuptable[(int) ((unsigned char) swishword[0])]) continue; if (!indexf->header.endcharslookuptable[(int) ((unsigned char) swishword[strlen(swishword) - 1])]) continue; /* limit by stopwords, min/max length, max number of digits, ... */ if (!isokword(sw, swishword, indexf)) continue; /* Now translate word if fuzzy mode */ if ( not_fuzzy ) { addword(swishword, sw, filenum, structure, numMetaNames, metaID, position ); wordcount++; } else { char **current_word; int not_first = 0; FUZZY_WORD *fw; sp_stem = NULL; /* pointer to cached stemmed word */ #ifndef NOSTEMCACHE /* 2003/08 jmruiz */ /* Snowball's stemming is very slow. So, let's try some */ /* caching. */ /* Cache for stemming */ /* Only use it if we are not in economic mode (-e) */ /* Note that double-metaphone may produce two values, so don't cache * or figure a way to invalidate the cache entry if there's two words. */ if( !idx->swap_locdata ) { if((sp_stem = is_word_in_hash_table( indexf->hashstemcache, swishword))) { /* cache hit! 
*/ if ( sp_stem->other.data ) /* might be null if the stemmer returned more than one value */ { addword(sp_stem->other.data, sw, filenum, structure, numMetaNames, metaID, position ); wordcount++; continue; /* done, continue on */ } } else { sp_stem = add_word_to_hash_table( &indexf->hashstemcache, swishword, VERYBIGHASHSIZE); sp_stem->other.data = NULL; } } #endif /* NOSTEMCACHE */ /* Convert the word */ fw = fuzzy_convert( indexf->header.fuzzy_data, swishword ); current_word = fw->word_list; while ( *current_word ) { /* when a word stems to more than one word all words should have the same position number */ /* currently, that's only a rare case with double metaphone */ if ( not_first++ ) (*position)--; addword(*current_word, sw, filenum, structure, numMetaNames, metaID, position ); wordcount++; /* If using the stem cache, add reference to the word just entered in the index */ /* but since the "data" payload currently can hold only one value, only cache if one stem */ if ( sp_stem && !sp_stem->other.data && ( 1 == fw->list_size) ) sp_stem->other.data = (getentry(sw,*current_word))->word; current_word++; /* move to next word in list */ } fuzzy_free_word( fw ); } } } /* Buffers can be reallocated - So, reasign them */ idx->word = word; idx->lenword = lenword; idx->swishword = swishword; idx->lenswishword = lenswishword; return wordcount; } /* Coalesce word current word location into the linked list */ void add_coalesced(SWISH *sw, ENTRY *e, unsigned char *coalesced, int sz_coalesced, int metaID) { int tmp; LOCATION *tloc, *tprev; LOCATION **tmploc, **tmploc2; unsigned char *tp; /* Check for economic mode (-e) and swap data to disk */ if(sw->Index->swap_locdata) { tmploc = (LOCATION **)coalesced; *tmploc = (LOCATION *)e; /* Preserve e in buffer */ /* The cast is for avoiding the warning */ SwapLocData(sw, e, coalesced, sz_coalesced); return; } /* Add to the linked list keeping the data sorted by metaname, filenum */ for(tprev =NULL, tloc = e->allLocationList; tloc; ) 
{ tp = (unsigned char *)tloc + sizeof(void *); tmp = uncompress2(&tp); /* Read metaID */ if(tmp > metaID) break; tprev = tloc; tmploc = (LOCATION **)tloc; tloc = *tmploc; } if(! tprev) { tmploc = (LOCATION **)coalesced; *tmploc = e->allLocationList; e->allLocationList = (LOCATION *)coalesced; } else { tmploc = (LOCATION **)coalesced; tmploc2 = (LOCATION **)tprev; *tmploc = *tmploc2; *tmploc2 = (LOCATION *)coalesced; } } void coalesce_word_locations(SWISH * sw, ENTRY *e) { int curmetaID, metaID, curfilenum, filenum, frequency, num_locs, worst_case_size; unsigned int tmp; unsigned char *p, *q, *size_p = NULL; unsigned char uflag, *cflag; LOCATION *loc, *next; static unsigned char static_buffer[COALESCE_BUFFER_MAX_SIZE]; unsigned char *buffer; unsigned int sz_buffer; unsigned char *coalesced_buffer; unsigned int *posdata; unsigned int local_posdata[MAX_STACK_POSITIONS]; /* Check for new locations in the current chunk */ if(!e->currentChunkLocationList) return; /* Sort all pending word locations by metaID, filenum */ sortChunkLocations(e); /* Init buffer to static buffer */ buffer = static_buffer; sz_buffer = COALESCE_BUFFER_MAX_SIZE; /* Init vars */ curmetaID = 0; curfilenum = 0; q = buffer; /* Destination buffer */ num_locs = 0; /* Number of coalesced LOCATIONS */ /* Run on all locations */ for(loc = e->currentChunkLocationList; loc; ) { p = (unsigned char *) loc; /* get next LOCATION in linked list*/ next = * (LOCATION **) loc; p += sizeof(LOCATION *); /* get metaID of LOCATION */ metaID = uncompress2(&p); /* Check for new metaID */ if(metaID != curmetaID) { /* If exits previous data add it to the linked list */ if(curmetaID) { /* add to the linked list and reset values */ /* Update the size of chunk's data in *size_p */ tmp = q - (size_p + sizeof(unsigned int)); /* Write the size */ memcpy(size_p, (char *)&tmp, sizeof(tmp) ); /* Add to the linked list keeping the data sorted by metaname, filenum */ /* Allocate memory space */ coalesced_buffer = (unsigned char 
*)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer); /* Copy content to it */ memcpy(coalesced_buffer,buffer,q-buffer); /* Add to the linked list */ add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID); } /* Reset values */ curfilenum = 0; curmetaID = metaID; q = buffer + sizeof(void *); /* Make room for linked list pointer */ q = compress3(metaID,q); /* Add metaID */ size_p = q; /* Preserve position for size */ q += sizeof(unsigned int); /* Make room for size */ num_locs = 0; } uncompress_location_values(&p,&uflag,&filenum,&frequency); worst_case_size = sizeof(unsigned char *) + (3 + frequency) * MAXINTCOMPSIZE; while ((q + worst_case_size) - buffer > (int)sz_buffer) { if(!num_locs) { //progerr("Buffer too short in coalesce_word_locations. Increase COALESCE_BUFFER_MAX_SIZE in config.h and rebuild."); /* Allocate new buffer */ unsigned char * new_buffer = emalloc(sz_buffer * 2 + worst_case_size); memcpy(new_buffer,buffer,sz_buffer); sz_buffer = sz_buffer * 2 +worst_case_size; if(buffer != static_buffer) efree(buffer); /* Adjust pointers */ q = new_buffer + (q - buffer); size_p = new_buffer + (size_p -buffer); /* Asign buffer and continue */ buffer = new_buffer; break; } /* add to the linked list and reset values */ /* Update the size of chunk's data in *size_p */ tmp = q - (size_p + sizeof(unsigned int)); /* tmp contains the size */ /* Write the size */ memcpy(size_p,(char *)&tmp,sizeof(tmp)); /* Add to the linked list keeping the data sorted by metaname, filenum */ /* Allocate memory space */ coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer); /* Copy content to it */ memcpy(coalesced_buffer,buffer,q-buffer); /* Add to the linked list */ add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID); /* Reset values */ curfilenum = 0; curmetaID = metaID; q = buffer + sizeof(void *); /* Make room for linked list pointer */ q = compress3(metaID,q); size_p = q; /* Preserve position for size */ q += sizeof(unsigned int); /* Make 
room for size */ num_locs = 0; } if(frequency > MAX_STACK_POSITIONS) posdata = (unsigned int *)emalloc(frequency * sizeof(int)); else posdata = local_posdata; uncompress_location_positions(&p,uflag,frequency,posdata); /* Store the filenum incrementally to save space */ compress_location_values(&q,&cflag,filenum - curfilenum,frequency, posdata); curfilenum = filenum; compress_location_positions(&q,cflag,frequency,posdata); if(frequency > MAX_STACK_POSITIONS) efree(posdata); num_locs++; loc = next; } if (num_locs) { /* add to the linked list and reset values */ /* Update the size of chunk's data in *size_p */ tmp = q - (size_p + sizeof(unsigned int)); /* tmp contains the size */ /* Write the size */ memcpy(size_p,(char *)&tmp,sizeof(tmp)); /* Add to the linked list keeping the data sorted by metaname, filenum */ /* Allocate memory space */ coalesced_buffer = (unsigned char *)Mem_ZoneAlloc(sw->Index->totalLocZone,q-buffer); /* Copy content to it */ memcpy(coalesced_buffer,buffer,q-buffer); /* Add to the linked list */ add_coalesced(sw, e, coalesced_buffer, q - buffer, curmetaID); } e->currentChunkLocationList = NULL; e->currentlocation = NULL; /* If we are swaping locs to file, reset also correspondant memory zone */ if(sw->Index->swap_locdata) Mem_ZoneReset(sw->Index->totalLocZone); /* Free buffer if not using static buffer */ if(buffer != static_buffer) efree(buffer); } /* 09/00 Jose Ruiz ** Function to swap location data to a temporal file or ramdisk to save ** memory. Unfortunately we cannot use it with IgnoreLimit option ** enabled. ** The data has been compressed previously in memory. ** Returns the pointer to the file. */ static void SwapLocData(SWISH * sw, ENTRY *e, unsigned char *buf, int lenbuf) { int idx_swap_file; struct MOD_Index *idx = sw->Index; /* 2002-07 jmruiz - Get de corrsponding swap file */ /* Get the corresponding hash value to this word */ /* IMPORTANT!!!! 
- The routine being used here to compute the hash */ /* must be the same used to store the words */ /* Then we must get the corresponding swap file index */ /* Since we cannot have so many swap files (VERYBIGHASHSIZE for verybighash */ /* routine) we must scale the hash into SWAP_FILES */ idx_swap_file = (verybighash(e->word) * (MAX_LOC_SWAP_FILES -1))/(VERYBIGHASHSIZE -1); if (!idx->fp_loc_write[idx_swap_file]) { idx->fp_loc_write[idx_swap_file] = create_tempfile(sw, F_WRITE_BINARY, "loc", &idx->swap_location_name[idx_swap_file], 0 ); } compress1(lenbuf, idx->fp_loc_write[idx_swap_file], idx->swap_putc); if (idx->swap_write(buf, 1, lenbuf, idx->fp_loc_write[idx_swap_file]) != (unsigned int) lenbuf) { progerr("Cannot write location to swap file"); } } /* 2002-07 jmruiz - New -e schema */ /* Get location data from swap file */ /* If e is null, all data will be restored */ /* If e si not null, only the location for this data will be readed */ static void unSwapLocData(SWISH * sw, int idx_swap_file, ENTRY *ep) { unsigned char *buf; int lenbuf; struct MOD_Index *idx = sw->Index; ENTRY *e; LOCATION *l; FILE *fp; /* Check if some swap file is being used */ if(!idx->fp_loc_write[idx_swap_file] && !idx->fp_loc_read[idx_swap_file]) return; /* Check if the file is opened for write and close it */ if(idx->fp_loc_write[idx_swap_file]) { /* Write a 0 to mark the end of locations */ if ( idx->swap_putc(0,idx->fp_loc_write[idx_swap_file]) == EOF ) progerrno("Failed to write locaton mark to index: "); idx->swap_close(idx->fp_loc_write[idx_swap_file]); idx->fp_loc_write[idx_swap_file] = NULL; } /* Reopen in read mode for (for faster reads, I suppose) */ if(!idx->fp_loc_read[idx_swap_file]) { if (!(idx->fp_loc_read[idx_swap_file] = sw_fopen(idx->swap_location_name[idx_swap_file], F_READ_BINARY))) progerrno("Could not open temp file %s: ", idx->swap_location_name[idx_swap_file]); } else { /* File already opened for read -> reset pointer */ 
sw_fseek(idx->fp_loc_read[idx_swap_file],(sw_off_t)0,SEEK_SET); } fp = idx->fp_loc_read[idx_swap_file]; while((lenbuf = uncompress1(fp, idx->swap_getc))) { if(ep == NULL) { buf = (unsigned char *) Mem_ZoneAlloc(idx->totalLocZone,lenbuf); idx->swap_read(buf, lenbuf, 1, fp); e = *(ENTRY **)buf; /* Store the locations in reverse order - Faster. They will be ** sorted later */ l = (LOCATION *) buf; l->next = e->allLocationList; e->allLocationList = l; } else { idx->swap_read(&e,sizeof(ENTRY *),1,fp); if(ep == e) { buf = (unsigned char *) Mem_ZoneAlloc(idx->totalLocZone,lenbuf); memcpy(buf,&e,sizeof(ENTRY *)); idx->swap_read(buf + sizeof(ENTRY *),lenbuf - sizeof(ENTRY *),1,fp); /* Store the locations in reverse order - Faster. They will be ** sorted later */ l = (LOCATION *) buf; l->next = e->allLocationList; e->allLocationList = l; } else { /* Just advance file pointer */ idx->swap_seek(fp,(sw_off_t)(lenbuf - sizeof(ENTRY *)),SEEK_CUR); } } } } /* 2002-07 jmruiz - Sorts unswaped location data by metaname, filenum */ static void sortSwapLocData(ENTRY *e) { int i, j, k, metaID; int *pi = NULL; unsigned char *ptmp, *ptmp2, *compressed_data; LOCATION **tmploc; LOCATION *l, *prev=NULL, **lp; /* Count the locations */ for(i = 0, l = e->allLocationList; l;i++, l = l->next); /* Very trivial case */ if(i < 2) return; /* */ /* Now, let's sort by metanum, offset in file */ /* Compute array wide for sort */ j = 2 * sizeof(int) + sizeof(void *); /* Compute array size */ ptmp = (void *) emalloc(j * i); /* Build an array with the elements to compare and pointers to data */ /* Very important to remind - data was read from the loc ** swap file in reverse order, so, to get data sorted ** by filenum we just need to use a reverse counter (i - k) ** as the other value for sorting (it has the same effect ** as filenum) */ for(k=0, ptmp2 = ptmp, l = e->allLocationList ; k < i; k++, l = l->next) { pi = (int *) ptmp2; compressed_data = (unsigned char *)l; /* Jump fileoffset */ compressed_data 
+= sizeof(LOCATION *); metaID = uncompress2(&compressed_data); pi[0] = metaID; pi[1] = i-k; ptmp2 += 2 * sizeof(int); lp = (LOCATION **)ptmp2; *lp = l; ptmp2 += sizeof(void *); } /* Sort them */ swish_qsort(ptmp, i, j, &icomp2); /* Store results */ for (k = 0, ptmp2 = ptmp; k < i; k++) { ptmp2 += 2 * sizeof(int); l = *(LOCATION **)ptmp2; if(!k) e->allLocationList = l; else { tmploc = (LOCATION **)prev; *tmploc = l; } ptmp2 += sizeof(void *); prev = l; } tmploc = (LOCATION **)l; *tmploc = NULL; /* Free the memory of the sorting array */ efree(ptmp); }