/* $Id: swish_words.c,v 1.46 2006/08/27 20:22:54 karman Exp $ This file is part of Swish-e. Swish-e is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. Swish-e is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Swish-e; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA See the COPYING file that accompanies the Swish-e distribution for details of the GNU GPL and the special exception available for linking against the Swish-e library. Mon May 9 10:57:22 CDT 2005 -- added GPL notice ** ** 2001-05-23 moseley created - replaced parser in search.c ** ** 2001-12-11 moseley, updated to deal with swish operators inside of phrases ** Still broken with regard to double-quotes inside of phrases ** Very unlikely someone would want to search for a single double quote ** within a phrase. It currently works if the double-quotes doesn't have ** white space around. Really should tag the words as being operators, or ** or "swish words", or let the backslash stay in the query until searching. ** */ #include "swish.h" #include "mem.h" #include "swstring.h" #include "search.h" #include "index.h" #include "file.h" #include "list.h" #include "hash.h" #include "stemmer.h" #include "double_metaphone.h" #include "error.h" #include "metanames.h" #include "config.h" // for _AND_WORD... //#include "search_alt.h" // for AND_WORD... 
humm maybe needs better organization #include "swish_words.h" static struct swline *tokenize_query_string( SEARCH_OBJECT *srch, char *words, INDEXDATAHEADER *header ); static struct swline *ignore_words_in_query(DB_RESULTS *db_results, struct swline *searchwordlist); static struct swline *fixmetanames(struct swline *); static struct swline *fixnot1(struct swline *); static struct swline *fixnot2(struct swline *); static struct swline *expandphrase(struct swline *, char); static char *isBooleanOperatorWord( char * word ); static void print_swline( char *msg, struct swline *word_list ) { #ifdef SWISH_WORDS_DEBUG struct swline *sl = word_list; printf("%s: ", msg ); while ( sl ) { printf("%s ", sl->line ); sl = sl->next; } printf("\n"); #endif } struct MOD_Swish_Words { char *word; int lenword; }; /* -- init structures for this module */ void initModule_Swish_Words (SWISH *sw) { struct MOD_Swish_Words *self; self = (struct MOD_Swish_Words *) emalloc(sizeof(struct MOD_Swish_Words)); sw->SwishWords = self; /* initialize buffers used by indexstring */ self->word = (char *) emalloc((self->lenword = MAXWORDLEN) + 1); return; } void freeModule_Swish_Words (SWISH *sw) { struct MOD_Swish_Words *self = sw->SwishWords; efree( self->word ); efree ( self ); sw->SwishWords = NULL; return; } /* Returns true if the character is a search operator */ /* this could be a macro, but gcc is probably smart enough */ static int isSearchOperatorChar( int c, int phrase_delimiter, int inphrase ) { return inphrase ? ( '*' == c || '?' == c || c == phrase_delimiter ) : ( '(' == c || ')' == c || '=' == c || '*' == c || '?' == c || c == phrase_delimiter ); } /* This simply tokenizes by whitespace and by the special characters "()=" */ /* If within a phrase, then just splits by whitespace */ /* Funny how argv was joined into a string just to be split again... 
*/ // $$$ BUG in next_token() is that "search *" or "searh*" becomes two tokens, and then // is patched back to at the end (so "search *" becomes "search*"). static int next_token( char **buf, char **word, int *lenword, int phrase_delimiter, int inphrase ) { int i; int backslash; int leading_space = 0; **word = '\0'; /* skip any leading whitespace */ while ( **buf && isspace( (unsigned char) **buf) ) { (*buf)++; leading_space = 1; /* for catching single wild cards */ /* current parsing will joing "foo" and "*" into "foo*" */ } /* extract out word */ i = 0; backslash = 0; while ( **buf && !isspace( (unsigned char) **buf) ) { // This should be looking at swish words, not raw input //if ( i > max_size + 4 ) /* leave a little room for operators */ // progerr( "Search word exceeded maxwordlimit setting." ); /* reallocate buffer, if needed -- only if maxwordlimit was set larger than MAXWORDLEN (1000) */ if ( i == *lenword ) { *lenword *= 2; *word = erealloc(*word, *lenword + 1); } /* backslash says take next char as-is */ /* note that you cannot backslash whitespace */ if ( '\\' == **buf && ! backslash++ ) { (*buf)++; continue; } if ( backslash || !isSearchOperatorChar( (unsigned char) **buf, phrase_delimiter, inphrase ) ) { backslash = 0; (*word)[i++] = **buf; /* can't this be done in one line? */ (*buf)++; } else if ( **buf == '?' ) { /* ? is a search operator so fails first test above but we want to pass it through like a normal word char if it is not the first character in the word */ if(!i) return WILDCARD_NOT_ALLOWED_AT_WORD_START; backslash = 0; (*word)[i++] = **buf; /* can't this be done in one line? 
*/ (*buf)++; } else /* this is a search operator char */ { if ( **word ) /* break if characters already found - end of this token */ break; /* special hacks for wild cards */ if ( **buf == '*' ) { if ( leading_space ) return UNIQUE_WILDCARD_NOT_ALLOWED_IN_WORD; if ( (*buf)[1] && !inphrase && !isspace( (unsigned char) (*buf)[1])) if ( !isSearchOperatorChar( (unsigned char) (*buf)[1], phrase_delimiter, inphrase ) ) return WILDCARD_NOT_ALLOWED_WITHIN_WORD; } (*word)[i++] = **buf; /* save the search operator char as it's own token, and end. */ (*buf)++; break; } } /* flag if we found a token */ if ( i ) { (*word)[i] = '\0'; return 1; } return 0; } static int next_swish_word(INDEXDATAHEADER *header, char **buf, char **word, int *lenword ) { int i; /* Also set flag for "?" (wildcard), in general set and at end of a term/word * At start there is never a wildcard allowed, because of sequential lookup in index * performance issue !! */ header->wordcharslookuptable[63] = 1; header->endcharslookuptable[63] = 1; /* skip non-wordchars */ while ( **buf && !header->wordcharslookuptable[tolower((unsigned char)(**buf))] ) (*buf)++; i = 0; while ( **buf && header->wordcharslookuptable[tolower((unsigned char)(**buf))] ) { /* reallocate buffer, if needed */ if ( i + 1 == *lenword ) { *lenword *= 2; *word = erealloc(*word, *lenword + 1); } (*word)[i++] = **buf; (*word)[i] = '\0'; (*buf)++; } if ( i ) { stripIgnoreLastChars( header, *word); stripIgnoreFirstChars(header, *word); return **word ? 
1 : 0; } return 0; } /* Convert a word into swish words */ static struct swline *parse_swish_words( SWISH *sw, INDEXDATAHEADER *header, char *word, int max_size ) { struct swline *swish_words = NULL; char *curpos; struct MOD_Swish_Words *self = sw->SwishWords; /* Some initial adjusting of the word */ TranslateChars(header->translatecharslookuptable, (unsigned char *)word); curpos = word; while( next_swish_word( header, &curpos, &self->word, &self->lenword ) ) { /* Check Begin & EndCharacters */ if (!header->begincharslookuptable[(int) ((unsigned char) self->word[0])]) continue; if (!header->endcharslookuptable[(int) ((unsigned char) self->word[strlen(self->word) - 1])]) continue; /* limit by stopwords, min/max length, max number of digits, ... */ /* ------- processed elsewhere for search --------- if (!isokword(sw, self->word, indexf)) continue; - stopwords are processed in search.c because removing them may have side effects - maxwordlen is checked when first tokenizing for security reasons - limit by vowels, consonants and digits is not needed since search will just fail ----------- */ if ( (int)strlen( self->word ) > max_size ) { sw->lasterror = SEARCH_WORD_TOO_BIG; return NULL; } if (!*self->word) continue; /* Now stem word, if set to setm */ { FUZZY_WORD *fw = fuzzy_convert( header->fuzzy_data, self->word ); if ( fw->list_size != 2 ) { swish_words = (struct swline *) addswline( swish_words, fw->string_list[0] ); } else { /* yuck! */ swish_words = (struct swline *) addswline( swish_words, "(" ); swish_words = (struct swline *) addswline( swish_words, fw->string_list[0] ); swish_words = (struct swline *) addswline( swish_words, OR_WORD ); swish_words = (struct swline *) addswline( swish_words, fw->string_list[1] ); swish_words = (struct swline *) addswline( swish_words, ")" ); } fuzzy_free_word( fw ); } } return swish_words; } /* This is really dumb. 
swline needs a ->prev entry, really search needs its own linked list */ /* Replaces a given node with another node (or nodes) */ static void replace_swline( struct swline **original, struct swline *entry, struct swline *new_words ) { struct swline *temp; temp = *original; /* check for case of first one */ if ( temp == entry ) { if ( new_words ) { new_words->other.nodep->next = temp->next; new_words->other.nodep = temp->other.nodep; *original = new_words; } else /* just delete first node */ { if ( entry->next ) entry->next->other.nodep = entry->other.nodep; /* point next one to last one */ *original = entry->next; } } else /* not first node */ { /* search for the preceeding node */ for ( temp = *original; temp && temp->next != entry; temp = temp->next ); if ( !temp ) progerr("Fatal Error: Failed to find insert point in replace_swline"); if ( new_words ) { if(!entry->next) /* Adding at the end. So, fix the last one */ (*original)->other.nodep = new_words->other.nodep; /* set the previous record to point to the start of the new entry (or entries) */ temp->next = new_words; /* set the end of the new string to point to the next entry */ new_words->other.nodep->next = entry->next; } else /* delete the entry */ { temp->next = temp->next->next; if(!temp->next) /* Adding at the end. So, fix the last one */ (*original)->other.nodep = temp; } } /* now free the removed item */ efree( entry ); } static int checkbuzzword(INDEXDATAHEADER *header, char *word ) { if ( !header->hashbuzzwordlist.count ) return 0; /* only strip when buzzwords are being used since stripped again as a "swish word" */ stripIgnoreLastChars( header, word ); stripIgnoreFirstChars( header, word ); if ( !*word ) /* stripped clean? 
*/ return 0; return (int)is_word_in_hash_table( header->hashbuzzwordlist, word ); } /* I hope this doesn't live too long */ static void fudge_wildcard( struct swline **original, struct swline *entry ) { struct swline *wild_card, *new; wild_card = entry->next; /* New entry */ new = newswline_n(entry->line, strlen( entry->line ) + strlen(wild_card->line)); strcat( new->line, wild_card->line); /* Change entry by new */ new->other.nodep = new; // Group of 1 node (last is itself) replace_swline(original,entry,new); /* remove wild_card */ replace_swline(original,wild_card,(struct swline *)NULL); } /* Converts an operator word into an operator */ static char *isBooleanOperatorWord( char * word ) { /* don't need strcasecmp here, since word should alrady be lowercase -- need to check alt-search first */ if (!strcasecmp( word, _AND_WORD)) return AND_WORD; if (!strncasecmp( word, _NEAR_WORD, strlen(_NEAR_WORD))) return NEAR_WORD; if (!strcasecmp( word, _OR_WORD)) return OR_WORD; if (!strcasecmp( word, _NOT_WORD)) return NOT_WORD; return (char *)NULL; } /* This "fixes" the problem of showing operators in Parsed Words as */ /* Really, it's probably a much better way to display Parsed Words, but that's not the way it was first created */ /* Converts an operator into a string */ static char *isBooleanOperator( char * word ) { if (!strcasecmp( word, AND_WORD)) return _AND_WORD; if (!strncasecmp( word, NEAR_WORD, strlen(NEAR_WORD))) return NEAR_WORD; if (!strcasecmp( word, OR_WORD)) return _OR_WORD; if (!strcasecmp( word, NOT_WORD)) return _NOT_WORD; return (char *)NULL; } /* Simply replace with "and" */ /* it's required that the replacement string is <= to inital string. 
*/

/* Rewrites, in place, every node of sl whose line is an internal operator */
/* string with the string isBooleanOperator() returns for it.              */

static void switch_back_operators( struct swline *sl )
{
    char *operator;

    while ( sl )
    {
        if ( (operator = isBooleanOperator( sl->line )) )
            strcpy( sl->line, operator );  /* in place: must not be longer than the original */

        sl = sl->next;
    }
}


/*
 * tokenize_query_string -- turn the raw query string into a token list.
 *
 * Pass in:
 *   srch   - search object (supplies the SWISH handle and the phrase delimiter)
 *   words  - the query string
 *   header - header of the index the query is being prepared for
 *
 * Returns:
 *   the token list, or NULL when there are no tokens or on error.  On error
 *   sw->lasterror is set (wildcard errors from next_token(), UNKNOWN_METANAME,
 *   or whatever parse_swish_words() set) and any partially built list is freed.
 *
 * Three passes: split into raw tokens; walk the tokens converting operator
 * words to operator strings and query words to swish words; finally glue
 * "*" tokens back onto the preceding word.
 */

static struct swline *tokenize_query_string( SEARCH_OBJECT *srch, char *words, INDEXDATAHEADER *header )
{
    char   *curpos;               /* current position in the words string */
    struct  swline *tokens = NULL;
    struct  swline *temp;
    struct  swline *new;
    struct  swline *swish_words;
    struct  swline *next_node;
    SWISH  *sw = srch->sw;
    struct  MOD_Swish_Words *self = sw->SwishWords;
    unsigned char PhraseDelimiter;
    int     max_size;
    int     inphrase = 0;
    int     rc;


    PhraseDelimiter = (unsigned char) srch->PhraseDelimiter;
    max_size        = header->maxwordlimit;

    curpos = words;

    /* split into words by whitespace and by the swish operator characters */

    while ( (rc = next_token( &curpos, &self->word, &self->lenword, PhraseDelimiter, inphrase )) )
    {
        /* catch single wild card early */
        if ( rc < 0 )
        {
            sw->lasterror = rc;

            /* don't leak the tokens collected before the bad one */
            if ( tokens )
                freeswline( tokens );

            return NULL;
        }

        tokens = (struct swline *) addswline( tokens, self->word );

        /* a lone delimiter token toggles phrase mode for the tokens that follow */
        if ( self->word[0] == PhraseDelimiter && !self->word[1] )
            inphrase = !inphrase;
    }


    /* no search words found */
    if ( !tokens )
        return NULL;

    inphrase = 0;

    temp = tokens;
    while ( temp )
    {

        /* do look-ahead processing first -- metanames */

        if ( !inphrase && isMetaNameOpNext(temp->next) )
        {
            if( !getMetaNameByName( header, temp->line ) )
            {
                set_progerr( UNKNOWN_METANAME, sw, "'%s'", temp->line );
                freeswline( tokens );
                return NULL;
            }

            /* this might be an option with XML */
            strtolower( temp->line );

            temp = temp->next;
            continue;
        }


        /* skip operators */
        if ( strlen( temp->line ) == 1 && isSearchOperatorChar( (unsigned char) temp->line[0], PhraseDelimiter, inphrase ) )
        {
            if ( temp->line[0] == PhraseDelimiter && !temp->line[1] )
                inphrase = !inphrase;

            temp = temp->next;
            continue;
        }


        /* this might be an option if case sensitive searches are used */
        strtolower( temp->line );


        /* check Boolean operators -- and replace with the operator string */

        if ( !inphrase )
        {
            char *op, *nextop;

            if ( (op = isBooleanOperatorWord( temp->line )) )
            {
                const char *suffix = "";   /* text typed after the "near" word, if any */
                char       *opstring;

                /* replace the common "and not" with simply not" */
                /* probably not the best place to do this level of processing */
                /* since should also check for things like "and this" and "and and and not this" */
                /* should probably be moved to end and recursively check for these (to catch "and and not") */

                if ( temp->next
                     && ( strcmp( op, AND_WORD ) == 0)
                     && ( (nextop = isBooleanOperatorWord( temp->next->line)))
                     && ( strcmp( nextop, NOT_WORD ) == 0) )
                {
                    struct swline *andword = temp;      /* save position of entry to remove */

                    temp = temp->next;                  /* now point to "not" word */
                    op = nextop;

                    /* Remove the "and" word */
                    replace_swline( &tokens, andword, (struct swline *)NULL ); /* cut it out */
                }

                /* The near operator keeps whatever the user typed after the  */
                /* operator word.  That text is of arbitrary length, so the   */
                /* combined string is built in a buffer sized to fit rather   */
                /* than in a fixed-size stack array.                          */
                if (!strncasecmp( op, NEAR_WORD, strlen(NEAR_WORD)))
                    suffix = temp->line + strlen(_NEAR_WORD);

                opstring = (char *) emalloc( strlen( op ) + strlen( suffix ) + 1 );
                strcpy( opstring, op );
                strcat( opstring, suffix );

                /* Replace the string with the operator string */
                new = newswline( opstring );
                efree( opstring );

                new->other.nodep = new;  // Group of 1 node (last is itself)
                replace_swline( &tokens, temp, new ); /* change it */
                temp = new->next;

                continue;
            }
        }


        /* buzzwords */
        if ( checkbuzzword( header, temp->line ) )
        {
            temp = temp->next;
            continue;
        }


        /* query words left.  Turn into "swish_words" */
        swish_words = parse_swish_words( sw, header, temp->line, max_size);

        if ( sw->lasterror )
        {
            /* release everything built so far; the caller only sees NULL */
            if ( swish_words )
                freeswline( swish_words );
            freeswline( tokens );
            return NULL;
        }

        next_node = temp->next;

        /* move into list.c at some point */
        /* a NULL swish_words simply deletes the token */
        replace_swline( &tokens, temp, swish_words );
        temp = next_node;

    }

    /* fudge wild cards back onto preceding word */
    /* $$$ This is broken because a query of "foo *" ends up "foo*" */
    /* Now almost fixed "foo *" is an error, but  */
    /* Also doesn't check for an operator followed by "*" */

    for ( temp = tokens ; temp; )
        if ( temp->next && strcmp( temp->next->line, "*") == 0 )
        {
            next_node = (temp->next)->next;   /* both temp and the "*" node are freed below */
            fudge_wildcard( &tokens, temp );
            temp = next_node;
        }
        else
            temp = temp->next;


    return tokens;
}


/**********************************************************************************
*  parse_swish_query -- convert a string in a SEARCH_OBJECT into a list of tokens
*
*  Pass in:
*       db_results - container for the search results for a single index
*
*  Returns:
*       false on error
*       sets sw->lasterror on fatal errors.
*
*       but some errors are cleared and set in the search object to allow
*       processing to continue.  For example, when searching multiple index
*       files one index may end up removing all the words in a query (if they
*       are all stopwords) where another index will not have the same stopword
*       list and produce a query.  Room for improvement, of course.
*
*  Notes:
*       calls tokenize_query_string() as first pass
*       ignore_words_in_query() to remove stop words (could be combined with tokenize, I think -- but removing stop words is a bit tricky)
*       expandphrase() prepares for phrase searching
*       fixmetanames() fixnot1() and fixnot2() make other adjustments
*
*       Clearly, a better query parser is in order.
* * Sep 29, 2002 - moseley * ***********************************************************************************/ struct swline *parse_swish_query( DB_RESULTS *db_results ) { struct swline *searchwordlist; IndexFILE *indexf = db_results->indexf; SEARCH_OBJECT *srch = db_results->srch; /* tokenize the query into swish words based on this current index */ /* returns false if no words or error */ /* may set sw->lasterror on unknown metanames or word too big */ if (!(searchwordlist = tokenize_query_string(srch, srch->query, &indexf->header))) return NULL; print_swline("after tokenize", searchwordlist ); /* Remove stopwords from the query -- also sets db_results->removed_stopwords */ /* This can set QUERY_SYNTAX_ERROR which should abort */ /* WORDS_TOO_COMMON & NO_WORDS_IN_SEARCH should be used if no index files are searched */ /* This is a bit ugly */ searchwordlist = ignore_words_in_query(db_results, searchwordlist); if ( !searchwordlist || srch->sw->lasterror ) { if ( searchwordlist ) freeswline( searchwordlist ); return NULL; } db_results->parsed_words = dupswline(searchwordlist); /* see notes in this function why this is done */ switch_back_operators( db_results->parsed_words ); /* Now hack up the query for searh processing */ /* $$$ please fix this! 
Let's get a real parser */ /* Expand phrase search: "kim harlow" becomes (kim PHRASE_WORD harlow) */ searchwordlist = expandphrase(searchwordlist, (char)srch->PhraseDelimiter); searchwordlist = fixmetanames(searchwordlist); searchwordlist = fixnot1(searchwordlist); searchwordlist = fixnot2(searchwordlist); print_swline("Final", searchwordlist ); return searchwordlist; } static int isrule(char *word) { if (!strcmp(word, AND_WORD) || !strncmp(word, NEAR_WORD, strlen(NEAR_WORD)) || !strcmp(word, OR_WORD) || !strcmp(word, NOT_WORD)) return 1; else return 0; } static int isnotrule(char *word) { if (!strcmp(word, NOT_WORD)) return 1; else return 0; } /****************************************************************************** * Remove the stop words from the tokenized query * rewritten Nov 24, 2001 - moseley * Still horrible! Need a real parse tree. *******************************************************************************/ static struct swline *ignore_words_in_query(DB_RESULTS *db_results, struct swline *searchwordlist) { IndexFILE *indexf = db_results->indexf; SEARCH_OBJECT *srch = db_results->srch; SWISH *sw = srch->sw; struct swline *cur_token = searchwordlist; struct swline *prev_token = NULL; struct swline *prev_prev_token = NULL; // for removing two things int in_phrase = 0; int word_count = 0; /* number of search words found */ int paren_count = 0; int stop_word_removed = 0; unsigned char phrase_delimiter = (unsigned char)srch->PhraseDelimiter; while ( cur_token ) { int remove = 0; char first_char = cur_token->line[0]; if ( cur_token == searchwordlist ) { prev_token = prev_prev_token = NULL; word_count = 0; paren_count = 0; in_phrase = 0; } while ( 1 ) // so we can use break. 
{ /* Can't backslash here -- (because this code should really be include in swish_words.c) */ if ( first_char == phrase_delimiter ) { in_phrase = !in_phrase; if ( !in_phrase && prev_token && prev_token->line[0] == phrase_delimiter ) remove = 2; break; } /* leave everything alone inside a pharse */ if ( in_phrase ) { if ( is_word_in_hash_table( indexf->header.hashstoplist, cur_token->line ) ) { db_results->removed_stopwords = addswline( db_results->removed_stopwords, cur_token->line ); stop_word_removed++; remove = 1; } else word_count++; break; } /* Allow operators */ if ( first_char == '=' ) break; if ( first_char == '(' ) { paren_count++; break; } if ( first_char == ')' ) { paren_count--; if ( prev_token && prev_token->line[0] == '(' ) remove = 2; break; } /* Allow all metanames */ if ( isMetaNameOpNext(cur_token->next) ) break; /* Look for AND OR NOT - remove AND OR at start, and remove second of doubles */ if ( isrule(cur_token->line) ) { if ( prev_token ) { /* remove double tokens */ if ( isrule(prev_token->line ) ) remove = 1; } /* allow NOT at the start */ else if ( !isnotrule(cur_token->line) ) remove = 1; break; } /* Finally, is it a stop word? 
*/ if ( is_word_in_hash_table( indexf->header.hashstoplist, cur_token->line ) ) { db_results->removed_stopwords = addswline( db_results->removed_stopwords, cur_token->line ); stop_word_removed++; remove = 1; } else word_count++; break; } /* Catch dangling metanames */ if ( !remove && !cur_token->next && isMetaNameOpNext( cur_token ) ) remove = 2; if ( remove ) { struct swline *tmp = cur_token; if ( cur_token == searchwordlist ) // we are removing first token searchwordlist = cur_token->next; else { prev_token->next = cur_token->next; // remove one in the middle cur_token = prev_token; // save if remove == 2 } efree( tmp ); if ( remove == 2 ) { tmp = cur_token; if ( cur_token == searchwordlist ) // we are removing first token searchwordlist = cur_token->next; else prev_prev_token->next = cur_token->next; // remove one in the middle efree( tmp ); } /* start at the beginning again */ cur_token = searchwordlist; continue; } if ( prev_token ) prev_prev_token = prev_token; prev_token = cur_token; cur_token = cur_token->next; } if ( in_phrase || paren_count ) sw->lasterror = QUERY_SYNTAX_ERROR; else if ( !word_count ) sw->lasterror = stop_word_removed ? WORDS_TOO_COMMON : NO_WORDS_IN_SEARCH; return searchwordlist; } /* 2001-09 jmruiz - Rewriting ** This puts parentheses in the right places around meta searches ** to avoid problems whith them. Basically "metaname = bla" ** becomes "(metanames = bla)" */ static struct swline *fixmetanames(struct swline *sp) { int metapar; struct swline *tmpp, *newp; tmpp = sp; newp = NULL; /* Fix metanames with parenthesys eg: metaname = bla => (metanames = bla) */ while (tmpp != NULL) { if (isMetaNameOpNext(tmpp->next)) { /* If it is a metaName add the name and = and skip to next */ newp = (struct swline *) addswline(newp, "("); newp = (struct swline *) addswline(newp, tmpp->line); newp = (struct swline *) addswline(newp, "="); tmpp = tmpp->next; tmpp = tmpp->next; if ( !tmpp ) return NULL; /* no more words! 
*/ /* 06/00 Jose Ruiz ** Fix to consider parenthesys in the ** content of a MetaName */ if (tmpp->line[0] == '(') { metapar = 1; newp = (struct swline *) addswline(newp, tmpp->line); tmpp = tmpp->next; while (metapar && tmpp) { if (tmpp->line[0] == '(') metapar++; else if (tmpp->line[0] == ')') metapar--; newp = (struct swline *) addswline(newp, tmpp->line); if (metapar) tmpp = tmpp->next; } if (!tmpp) return (newp); } else newp = (struct swline *) addswline(newp, tmpp->line); newp = (struct swline *) addswline(newp, ")"); } else newp = (struct swline *) addswline(newp, tmpp->line); /* next one */ tmpp = tmpp->next; } freeswline(sp); return newp; } /* 2001 -09 jmruiz Rewritten ** This optimizes some NOT operator to be faster. ** ** "word1 not word" is changed by "word1 and_not word2" ** ** In the old way the previous query was... ** get results if word1 ** get results of word2 ** not results of word2 (If we have 100000 docs and word2 is in ** just 3 docs, this means read 99997 ** results) ** intersect both list of results ** ** The "new way" ** get results if word1 ** get results of word2 ** intersect (and_not_rule) both lists of results ** */ static struct swline *fixnot1(struct swline *sp) { struct swline *tmpp, *prev, *new; if (!sp) return NULL; /* 06/00 Jose Ruiz - Check if first word is NOT_RULE */ /* Change remaining NOT by AND_NOT_RULE */ for (tmpp = sp, prev = NULL; tmpp; prev = tmpp, tmpp = tmpp->next) { if (tmpp->line[0] == '(') continue; else if (isnotrule(tmpp->line)) { if(prev && prev->line[0]!='=' && prev->line[0]!='(') { new = newswline(AND_NOT_WORD); new->other.nodep = new; // group of 1 node replace_swline(&sp, tmpp, new); tmpp = new; } } } return sp; } /* 2001 -09 jmruiz - Totally new - Fix the meta=(not ahsg) bug ** Add parentheses to avoid the way operator NOT confuse complex queries */ static struct swline *fixnot2(struct swline *sp) { int openparen, found; struct swline *tmpp, *newp; char *magic = MAGIC_NOT_WORD; /* magic avoids parsing the ** 
"not" operator twice ** and put the code in an ** endless loop */ found = 1; while(found) { openparen = 0; found = 0; for (tmpp = sp , newp = NULL; tmpp ; tmpp = tmpp->next) { if (isnotrule(tmpp->line)) { found = 1; /* Add parentheses */ newp = (struct swline *) addswline(newp, "("); /* Change "NOT" by magic to avoid find it in next iteration */ newp = (struct swline *) addswline(newp, magic); for(tmpp = tmpp->next; tmpp; tmpp = tmpp->next) { if ((tmpp->line)[0] == '(') openparen++; else if(!openparen) { newp = (struct swline *) addswline(newp, tmpp->line); /* Add parentheses */ newp = (struct swline *) addswline(newp, ")"); break; } else if ((tmpp->line)[0] == ')') openparen--; newp = (struct swline *) addswline(newp, tmpp->line); } if(!tmpp) break; } else newp = (struct swline *) addswline(newp, tmpp->line); } freeswline(sp); sp = newp; } /* remove magic and put the "real" NOT in place */ for(tmpp = newp; tmpp ; tmpp = tmpp->next) { if(!strcmp(tmpp->line,magic)) { strcpy(tmpp->line, NOT_WORD); } } return newp; } /* expandstar removed - Jose Ruiz 04/00 */ /* Expands phrase search. Berkeley University becomes Berkeley PHRASE_WORD University */ /* It also fixes the and, not or problem when they appeared inside a phrase */ static struct swline *expandphrase(struct swline *sp, char delimiter) { struct swline *tmp, *newp; int inphrase; if (!sp) return NULL; inphrase = 0; newp = NULL; tmp = sp; while (tmp != NULL) { if ((tmp->line)[0] == delimiter) { if (inphrase) { inphrase = 0; newp = (struct swline *) addswline(newp, ")"); } else { inphrase++; newp = (struct swline *) addswline(newp, "("); } } else { if (inphrase) { if (inphrase > 1) newp = (struct swline *) addswline(newp, PHRASE_WORD); inphrase++; newp = (struct swline *) addswline(newp, tmp->line); } else newp = (struct swline *) addswline(newp, tmp->line); } tmp = tmp->next; } freeswline(sp); return newp; } /* These 2 routines fix the problem when a word ends with mutiple ** IGNORELASTCHAR's (eg, qwerty'. ). 
The old code correctly deleted ** the ".", but didn't check if the new last character ("'") is also ** an ignore character. */ void stripIgnoreLastChars(INDEXDATAHEADER *header, char *word) { int k,j,i = strlen(word); /* Get rid of specified last char's */ /* for (i=0; word[i] != '\0'; i++); */ /* Iteratively strip off the last character if it's an ignore character */ while ((i > 0) && (isIgnoreLastChar(header, word[--i]))) { word[i] = '\0'; /* We must take care of the escaped characeters */ /* Things like hello\c hello\\c hello\\\c can appear */ for(j=0,k=i-1;k>=0 && word[k]=='\\';k--,j++); /* j contains the number of \ */ if(j%2) /* Remove the escape if even */ { word[--i]='\0'; } } } void stripIgnoreFirstChars(INDEXDATAHEADER *header, char *word) { int j, k; int i = 0; /* Keep going until a char not to ignore is found */ /* We must take care of the escaped characeters */ /* Things like \chello \\chello can appear */ while (word[i]) { if(word[i]=='\\') /* Jump escape */ k=i+1; else k=i; if(!word[k] || !isIgnoreFirstChar(header, word[k])) break; else i=k+1; } /* If all the char's are valid, just return */ if (0 == i) return; else { for (k = i, j = 0; word[k] != '\0'; j++, k++) { word[j] = word[k]; } /* Add the NULL */ word[j] = '\0'; } }