/* $Id: httpserver.c,v 1.19 2005/05/12 15:41:05 karman Exp $ ** ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94 ** This file is part of Swish-e. Swish-e is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. Swish-e is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Swish-e; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA See the COPYING file that accompanies the Swish-e distribution for details of the GNU GPL and the special exception available for linking against the Swish-e library. ** Mon May 9 15:51:39 CDT 2005 ** added GPL **-------------------------------------------------------------------- ** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98 ** ** change sprintf to snprintf to avoid corruption ** SRE 11/17/99 ** ** fixed cast to int problems pointed out by "gcc -Wall" ** SRE 2/22/00 ** */ /* ** httpserver.c */ #include "acconfig.h" #ifdef HAVE_UNISTD_H #include #endif #include #include #include "swish.h" #include "mem.h" #include "swstring.h" #include "index.h" #include "http.h" #include "httpserver.h" #include "file.h" /* The list of servers that we are acting on. **/ static httpserverinfo *servers = 0; static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server); static char *isolatevalue(char *line, char *keyword, int *plen); static int serverinlist(char *url, struct swline *list); /* Find the robot rules for this URL. If haven't retrieved them ** yet, do so now. **/ httpserverinfo *getserverinfo(SWISH *sw, char *url) { httpserverinfo *server; char *method; int methodlen; char *serverport; int serverportlen; static int lencontenttype=0; static char *contenttype=NULL; static int lenbuffer=0; static char *buffer=NULL; FILE *fp; struct MOD_Index *idx = sw->Index; time_t last_modified; // argh, this is ugly char *file_prefix; // prefix for use with files written by swishspider -- should just be on the stack! if(!lenbuffer)buffer=emalloc((lenbuffer=MAXSTRLEN)+1); if(!lencontenttype)contenttype=emalloc((lencontenttype=MAXSTRLEN)+1); if ((method = url_method(url, &methodlen)) == 0) { return 0; } if ((serverport = url_serverport(url, &serverportlen)) == 0) { return 0; } /* Search for the rules **/ for (server = servers; server; server = server->next) { if (equivalentserver(sw, url, server->baseurl)) { return server; } } /* Create a new entry for this server and add it to the list. **/ server = (httpserverinfo *)emalloc(sizeof(httpserverinfo)); /* +3 for the ://, +1 for the trailing /, +1 for the terminating null **/ server->baseurl = (char *)emalloc(methodlen + serverportlen + 5); /* These 4 lines to avoid a call to non ANSI snprintf . May not be the best way but it ensures no buffer overruns */ memcpy (server->baseurl,method,methodlen); memcpy (server->baseurl+methodlen,"://",3); memcpy (server->baseurl+methodlen+3,serverport,serverportlen); strcpy (server->baseurl+methodlen+3+serverportlen,"/"); server->lastretrieval = 0; server->robotrules = 0; server->next = servers; servers = server; /* Only http(s) servers can full rules, all the other ones just get dummies ** (this is useful for holding last retrieval) ** ** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what ** many people consider the official web exclusion rules. Unfortunately, ** the rules are not consistent about how records are formed. One line ** states "the file consists of one or more records separated by one or more ** blank lines" while another states "the record starts with one or more User-agent ** lines, followed by one or more Disallow lines." ** ** So, does a blank line after a User-agent line end a record? The spec is ** unclear on this matter. If the next legal line afer the blank line is ** a Disallow line, the blank line should most likely be ignored. But what ** if the next line is another User-agent line? For example: ** ** User-agent: MooBot ** ** User-agent: CreepySpider ** Disallow: /cgi-bin ** ** One interpretation (based on blank lines termination records) is that MooBot ** may visit any location (since there are no Disallows for it). Another ** interpretation (based on records needing both User-agent and Disallow lines) ** is that MooBot may not visit /cgi-bin ** ** While poking around, I found at least one site (www.sun.com) that uses blank ** lines within records. Because of that, I have decided to rely on records ** having both User-agent and Disallow lines (the second interpretation above). **/ if (strncmp(server->baseurl, "http", 4) == 0) { if((int)(strlen(server->baseurl)+20)>=lenbuffer) { lenbuffer=strlen(server->baseurl)+20+200; buffer=erealloc(buffer,lenbuffer+1); } sprintf(buffer, "%srobots.txt", server->baseurl); file_prefix = emalloc( strlen(idx->tmpdir) + MAXPIDLEN + strlen("/swishspider@.contents+fill") ); sprintf(file_prefix, "%s/swishspider@%ld", idx->tmpdir, (long) lgetpid()); if (get(sw,contenttype, &last_modified, &server->lastretrieval, file_prefix, buffer) == 200) { char *robots_buffer; int filelen; int bytes_read; if((int)(strlen(idx->tmpdir)+MAXPIDLEN+30)>=lenbuffer) { lenbuffer=strlen(idx->tmpdir)+MAXPIDLEN+30+200; buffer=erealloc(buffer,lenbuffer+1); } sprintf(buffer, "%s/swishspider@%ld.contents", idx->tmpdir, (long)lgetpid()); fp = fopen(buffer, F_READ_TEXT); filelen = getsize(buffer); robots_buffer = emalloc( filelen + 1 ); *robots_buffer = '\0'; bytes_read = fread(robots_buffer, 1, filelen, fp); robots_buffer[bytes_read] = '\0'; parserobotstxt( robots_buffer, bytes_read, server ); efree( robots_buffer ); //parserobotstxt(fp, server); fclose(fp); /* Have to close before unlink on Windows */ } efree( file_prefix ); cmdf(unlink, "%s/swishspider@%ld.response", idx->tmpdir, lgetpid()); cmdf(unlink, "%s/swishspider@%ld.contents", idx->tmpdir, lgetpid()); cmdf(unlink, "%s/swishspider@%ld.links", idx->tmpdir, lgetpid()); } return server; } int urldisallowed(SWISH *sw, char *url) { httpserverinfo *server; robotrules *rule; char *uri; int urilen; if ((server = getserverinfo(sw, url)) == 0) { return 1; } if ((uri = url_uri(url, &urilen)) == 0) { return 1; } for (rule = server->robotrules; rule; rule = rule->next) { if (strncmp(uri, rule->disallow, strlen(rule->disallow)) == 0) { return 1; } } return 0; } // quick fix to parse from Mac and Windows. // Pass in: // char **next_start == pointer to a *char that has where the next string starts. // char *last_char == pointer to last char in buffer. Buffer MUST have room for one more char // // returns NULL on no more strings static char *next_line( char **next_start, char *last_char ) { char *buffer = *next_start; char *start; // skip over any leading new lines or cr. while ( buffer <= last_char && ( *buffer == '\0' || *buffer == '\n' || *buffer == '\r' ) ) buffer++; if ( buffer > last_char ) return NULL; start = buffer; // start of this word // Now find the end of this string while ( buffer <= last_char && ( *buffer != '\0' && *buffer != '\n' && *buffer != '\r' ) ) buffer++; *buffer = '\0'; // mark the end of the string buffer++; *next_start = buffer; return start; } static char useragent[] = "user-agent:"; static char disallow[] = "disallow:"; static char swishspider[] = "swishspider"; static void parserobotstxt(char *robots_buffer, int buflen, httpserverinfo *server) { char *buffer; char *bufend = robots_buffer + buflen -1; // last char of string char *next_start = robots_buffer; enum {START, USERAGENT, DISALLOW} state = START; enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING; char *p; int len; robotrules *entry; robotrules *entry2; server->useragent = 0; buffer = NULL; while ( (buffer = next_line( &next_start, bufend ) ) ) { if ( strchr( buffer, '#' ) ) *(strchr( buffer, '#' )) = '\0'; if ((*buffer == '#') || (*buffer == '\0')) continue; if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) { switch (state) { case DISALLOW: /* Since we found our specific user-agent, we can ** skip the rest of the file. **/ if (useragentstate == SPECIFIC) { return; } useragentstate = SKIPPING; /* explict fallthrough */ case START: case USERAGENT: state = USERAGENT; if (useragentstate != SPECIFIC) { p = isolatevalue(buffer, useragent, &len); if ((len == (sizeof(swishspider) - 1)) && (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) { useragentstate = SPECIFIC; /* We might have already parsed generic rules, ** so clean them up if necessary. */ if (server->useragent) { efree(server->useragent); } for (entry = server->robotrules; entry; ) { entry2 = entry->next; efree(entry); entry = entry2; } server->robotrules = 0; server->useragent = (char *)emalloc(len + 1); strncpy(server->useragent, p, len); *(server->useragent + len) = '\0'; } else if ((len == 1) && (*p == '*')) { useragentstate = GENERIC; server->useragent = (char *)emalloc(2); strcpy(server->useragent, "*"); /* emalloc'd 2 bytes, no safestrcpy */ } } break; } } if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) { state = DISALLOW; if (useragentstate != SKIPPING) { p = isolatevalue(buffer, disallow, &len); if (len) { entry = (robotrules *)emalloc(sizeof(robotrules)); entry->next = server->robotrules; server->robotrules = entry; entry->disallow = (char *)emalloc(len + 1); strncpy(entry->disallow, p, len); *(entry->disallow + len) = '\0'; } } } } } static char *isolatevalue(char *line, char *keyword, int *plen) { /* Find the beginning of the value **/ for (line += strlen(keyword); *line && isspace((int)((unsigned char)*line)); line++ ) { /* cast to int 2/22/00 */ } if ( !strlen(line) ) { *plen = 0; return line; } /* Strip off trailing spaces **/ for (*plen = strlen(line); isspace((int)((unsigned char)*(line + *plen - 1))); (*plen)--) { /* cast to int 2/22/00 */ } return line; } int equivalentserver(SWISH *sw, char *url, char *baseurl) { char *method; int methodlen; char *serverport; int serverportlen; char *basemethod; int basemethodlen; char *baseserverport; int baseserverportlen; struct multiswline *walk=NULL; struct MOD_HTTP *http = sw->HTTP; method = url_method(url, &methodlen); serverport = url_serverport(url, &serverportlen); basemethod = url_method(baseurl, &basemethodlen); baseserverport = url_serverport(baseurl, &baseserverportlen); if (!method || !serverport || !basemethod || !baseserverport) { return 0; } /* If this is the same server, we just go for it **/ if ((methodlen == basemethodlen) && (serverportlen == baseserverportlen) && (strncasecmp(method, basemethod, methodlen) == 0) && (strncasecmp(serverport, baseserverport, serverportlen) == 0)) { return 1; } /* Do we find the method/server info for this and the base url ** in the same equivalence list? **/ for (walk = http->equivalentservers; walk; walk = walk->next ) { if (serverinlist(url, walk->list) && serverinlist(baseurl, walk->list)) { return 1; } } return 0; } static int serverinlist(char *url, struct swline *list) { char *method; int methodlen; char *serverport; int serverportlen; char *listmethod; int listmethodlen; char *listserverport; int listserverportlen; method = url_method(url, &methodlen); serverport = url_serverport(url, &serverportlen); if (!method || !serverport) { return 0; } for ( ; list; list = list->next) { listmethod = url_method(list->line, &listmethodlen); listserverport = url_serverport(list->line, &listserverportlen); if (listmethod && listserverport) { if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) && (strncasecmp(method, listmethod, methodlen) == 0) && (strncasecmp(serverport, listserverport, serverportlen) == 0)) { return 1; } } } return 0; }