#include #include #include #include "web.h" const char * pszServerRoot = SERVERROOT; const char * pszURLPrefix = URLPREFIX; FILE * GetPage (char *, int, char *, char *, int); int process_page (char * host, int port, char * file, char * destdir, int overwrite, int max_recurs); /* looks for /etc/webcrawl.conf, /usr/local/etc/webcrawl.conf, * $HOME/.webcrawl and processes all that are found */ void processconfig(FILE * fp, const char * filename) { char szBuf[256]; int line; fgets(szBuf, 256, fp); line = 1; while (!feof(fp)) { if (!strcmp(szBuf, "[rename]")) { rename_readconfig(fp, filename, &line, szBuf, 256); continue; /* next line returned in buffer */ } else { fprintf (stderr, "%s, line %d: I don't understand that\n", filename, line); exit(1); } } } void readconfig() { char buf[256]; FILE * fp; if ((fp = fopen("/etc/webcrawl.conf", "r")) != NULL) { processconfig(fp, "/etc/webcrawl.conf"); fclose(fp); } if ((fp = fopen("/usr/local/etc/webcrawl.conf", "r")) != NULL) { processconfig(fp, "/usr/local/etc/webcrawl.conf"); fclose(fp); } if (getenv("HOME")) { strcpy(buf, getenv("HOME")); strcat(buf, "/.webcrawl"); if ((fp = fopen(buf, "r")) != NULL) { processconfig(fp, buf); fclose(fp); } } } int main (int argc, char * * argv) { char * pszURL, * pszDestDir, szHost [128], szFile [128]; int port; argc--,argv++; if (argc < 2) { printf ("usage: webcrawl [options] \n"); printf ("options:\n"); printf (" URL selection (default=don't follow off-site links):\n"); printf (" -a ask the user whether to jump to new servers\n"); printf (" -f str always follow links to URLS that contain string" " 'str'\n"); printf (" -d str don't ever follow links containing 'str'\n"); printf (" -u f log unfollowed URLs to file 'f'\n"); printf (" -x don't follow any page links by default!\n"); printf (" -X don't load inline images by default\n"); printf (" Page re-writing:\n"); printf (" -n don't rewrite the pages with local URLs\n"); printf (" -r rewrite non relative URLs when: a - always\n"); printf (" l - URL is local, f (default) - target file " "exists\n"); printf (" -k keep existing names; disable renaming files " "to sane filenames\n"); printf (" -q disable insertion of process id in query " "filenames\n"); printf (" Recursion limiting:\n"); printf (" -l[x] n limit depth of search to find files to n, " "initially with -ll,\n"); printf (" after jump to remote site with -lr, " "both with -l\n"); printf (" General options:\n"); printf (" -v increase verbosity (use up to 4 times)\n"); printf (" -[op] d change o: server root directory, p: url " "rewriting prefix to d\n"); printf (" HTTP options:\n"); printf (" -A set the agent name\n" " default = '" DEF_USER_AGENT "'\n"); printf (" -t n set timeout to n seconds\n"); printf (" -T use no data timeout, rather than overall " "connection timeout\n"); printf ("\nweb address should not have a leading http://, and " "destination dir is taken\n"); printf ("relative to the server root directory (" SERVERROOT ").\n"); printf ("webcrawl version " WEBCRAWL_VERSION "\n"); return 1; } options.bAsk = 0; options.nAlwaysFollow = 0; options.bImageOverride = 1; options.fURLLog = NULL; options.bRewrite = 1; options.cRewriteMode = 'f'; options.bRename = 1; options.bQueryAddPid = 1; options.nRemote = -1; options.nLocal = -1; options.bVerbose = 0; options.userAgent = DEF_USER_AGENT; options.timeout = 0; options.bNoDataTO = 0; while (argc && **argv=='-') { switch(argv[0][1]) { /* options to control URL selection */ case 'a': options.bAsk = 1; break; case 'f': if (options.nAlwaysFollow == MAXFOLLOW) { fprintf(stderr, "Too many -f options on command line\n"); return 1; } options.pszAlwaysFollow[options.nAlwaysFollow++] = *(++argv); argc--; break; case 'd': if (options.nNeverFollow == MAXFOLLOW) { fprintf(stderr, "Too many -d options on command line\n"); return 1; } options.pszNeverFollow[options.nNeverFollow++] = *(++argv); argc--; break; case 'u': options.fURLLog = fopen(*(++argv), "w"); if (!options.fURLLog) { fprintf(stderr, "Couldn't open '%s'\n", *argv); return 1; } argc--; break; case 'x': options.bFollowNone = 1; break; case 'X': options.bImageOverride = 0; break; /* options to control page rewriting */ case 'n': options.bRewrite = 0; break; case 'r': if (argv[0][2] != 'a' && argv[0][2] != 'l' && argv[0][2] != 'f') { fprintf(stderr, "unrecognised -rx mode: %c\n", argv[0][2]); return 1; } options.cRewriteMode = argv[0][2]; break; case 'k': options.bRename = 0; break; case 'q': options.bQueryAddPid = 0; break; /* recursion limiting options */ case 'l': if (argv[0][2] == 'r') { options.nRemote = atoi(argv[1]); } else if (argv[0][2] == 'l') { options.nLocal = atoi(argv[1]); } else if (argv[0][2] == 0) { options.nRemote = options.nLocal = atoi(argv[1]); } else { printf("Unrecognised option: %s\n", *argv); return 1; } argv++, argc--; break; /* general options */ case 'v': options.bVerbose++; break; case 'o': pszServerRoot = *(++argv); argc--; break; case 'p': pszURLPrefix = *(++argv); argc--; break; /* HTTP-related options */ case 'A': options.userAgent = *(++argv); argc--; break; case 't': options.timeout = atoi(*(++argv)); argc--; break; case 'T': options.bNoDataTO = 1; break; default: printf("Unrecognised option: %s\n", *argv); return 1; } argc--; argv++; } pszURL = argv [0], pszDestDir = argv [1]; options.pszOutputDir = argv[1]; if (SplitURL (pszURL, szHost, szFile, &port) ) { printf ("Invalid URL : %s\n", pszURL); return 1; } rename_init(); return process_page (szHost, port, szFile, pszDestDir, 1, options.nLocal); } int process_page (char * host, int port, char * file, char * destdir, int overwrite, int max_recurs) { FILE * fp; xreflist xr; char newhost[128], newfile[128]; int newport; int i; if (max_recurs == 0) { printf("Maximum recursion level reached.\n"); return 0; } if (max_recurs > 0) max_recurs --; /* figure to pass on ! */ if (! (fp = GetPage(host, port, file, destdir, overwrite))) return 1; /* printf("Content-type: %s\n", lastcontenttype); */ if (!strcmp(lastcontenttype, "text/html")) { fseek(fp, 0, SEEK_SET); if (getxref(fp, &xr)) { fclose(fp); return 1; } fclose(fp); for (i = 0; i < xr.nrefs; i++) { if (relative_url(xr.refs[i], host, port, file, newhost, &newport, newfile, xr.alwaysget[i])) { fprintf(stderr, "not following link to: %s\n", xr.refs[i]); } else { if (strcmp(newhost, host)) process_page(newhost, newport, newfile, destdir, 0, options.nRemote); else process_page(newhost, newport, newfile, destdir, 0, max_recurs); } free(xr.refs[i]); } if (options.bRewrite) rewrite(destdir, host, port, file, &xr); } else fclose(fp); return 0; } FILE * GetPage (char * pszHost, int nPort, char * pszFile, char * pszDest, int bOverwrite) { /* FILE * fpDest = CreateFile (pszDest, pszFile, bOverwrite); if (!fpDest) return 1; */ return Download (pszHost, nPort, pszFile, pszDest, 0, bOverwrite); }