/* web.h: prototypes of functions declared in parts of the program other than
 * the main module, together with the option structure, global variables and
 * cross-reference types declared alongside them.
 */
#ifndef WEBCRAWL_WEB_H
#define WEBCRAWL_WEB_H

#include <stdio.h>   /* FILE is used by several declarations below */

#define SERVERROOT       "/home/http"   /* directory to put files in */
#define URLPREFIX        ""             /* prefix for re-written URLs */
#define DEFAULT_FILENAME "index.html"
#define MAXFOLLOW        100            /* maximum number of '-f' options */
#define WEBCRAWL_VERSION "1.1"

/* functions in 'path.c': */
void ResolveRelative(const char *path, const char *ref, char *buf);
void GetDirectory(const char *path, char *buf);
FILE *CreateFile(const char *path, const char *host, int port,
                 const char *filename, int bOverwrite);
int file_exists(const char *path, const char *host, int port,
                const char *filename);
void get_filename(const char *path, const char *host, int port,
                  const char *filename, char *name, int create_dirs);

/* http.c: */
/* NOTE(review): parameters are unnamed in the original declaration; name them
 * here once checked against the definition in http.c.
 */
FILE *Download(char *, int, char *, char *, int, int);

/* global variables */

struct OptStruct {
    /* URL following options */
    int bAsk;                 /* ask for jump to new server? */
    int bFollowNone;          /* don't follow anything by default! */
    int bImageOverride;       /* get all inline images? */
    int nAlwaysFollow;        /* number of elements used in below array */
    const char *pszAlwaysFollow[MAXFOLLOW];  /* substrings to always follow */
    int nNeverFollow;         /* number of elements used in below array */
    const char *pszNeverFollow[MAXFOLLOW];   /* substrings to never follow */
    FILE *fURLLog;            /* file to log off-site URLs */

    /* rewriting options */
    int bRewrite;             /* rewrite each page with local urls? */
    char cRewriteMode;        /* 'a', 'l', or 'f' - see usage */
    int bRename;              /* change filenames to (a) remove metachars and
                                 (b) end in correct extensions */
    int bQueryAddPid;         /* when renaming queries, store pid in name? */

    /* recursion limiting */
    int nRemote;              /* remote hop count, or -1 for no limit */
    int nLocal;               /* local limit, as above */

    /* general options */
    const char *pszOutputDir; /* output dir, relative from pszServerRoot */
    int bVerbose;             /* set to 1 if verbose mode required */

    /* HTTP options */
    char *userAgent;          /* eg. 'Mozilla/4.05 [en] (WinNT; I ;Nav)' */
    int timeout;              /* timeout in seconds */
    int bNoDataTO;            /* no data timeout, or overall timeout? */
};

#define DEF_USER_AGENT "Mozilla/4.05 [en] (X11; I; Linux 2.0.27 i586; Nav)"
/* Alternative user agents:
 *   "Mozilla/4.05 [en] (WinNT; I ; Nav)"
 *   "Mozilla/4.0 (compatible; MSIE 4.01; Windows 95; DIL0001011)"
 */

/* NOTE(review): the next two lines are tentative definitions, not 'extern'
 * declarations, so every translation unit that includes this header defines
 * both objects.  Linking then depends on the toolchain merging common symbols
 * and fails under -fno-common.  They are left as they were because no single
 * defining .c file is visible from this header; once one exists, declare them
 * 'extern' here.
 */
const char *pszServerRoot;
const char *pszURLPrefix;

extern struct OptStruct options;
extern char lastcontenttype[];

/* definitions for getxref module */

/* Capacity of each per-reference array in xreflist. */
#define XREF_MAX 200

typedef struct xreflist {
    int nrefs;
    char *refs[XREF_MAX];
    int startloc[XREF_MAX], endloc[XREF_MAX];
    int alwaysget[XREF_MAX];
} xreflist;

/* True for space, tab, newline or carriage return.  The argument is evaluated
 * up to four times, so do not pass an expression with side effects.
 */
#define iswhitespace(a) ((a) == ' ' || (a) == '\t' || (a) == '\n' || (a)=='\r')

int getxref(FILE *fp, xreflist *xrefs);

/* functions in url.c */
int relative_url(char *newurl, const char *oldhost, int oldport,
                 const char *oldfile, char *newhost, int *newport,
                 char *newfile, int alwaysget);
/* NOTE(review): parameters are unnamed in the original declaration; name them
 * here once checked against the definition in url.c.
 */
int SplitURL(char *, char *, char *, int *);

/* rewrite.c: */
void rewrite(char *path, const char *host, int port, char *filename,
             xreflist *xr);

/* rename.c: */
/* '(void)' makes this a prototype; empty parentheses would leave the
 * parameter list unspecified and unchecked at call sites.
 */
void rename_init(void);
void rename_readconfig(FILE *fp, const char *filename, int *lineno,
                       char *buf, int buflen);
char *rename_object(const char *pszHost, int port, char *pszObjectname,
                    const char *contenttype);

#endif /* WEBCRAWL_WEB_H */