#!/usr/bin/perl -w
#Checks the URLs of a website, printing out which local files are inaccessible
#as well as the inaccessible links within each accessible local file.
#Requires: geturls (in same directory), Lynx, comm, and wc
#Files must be readable by world to be accessible;
#directories must be readable and executable by world.
#
#Author: Zach Tomaszewski
#Date: 1 October 2002

#init
$status = 0;          #whether to use lynx's status code instead of comm + wc
$extensive = 0;       #whether to also list indeterminable files
%processed = ();      #hash of processed/completed files; values=accessibility
%containedURLs = ();  #hash keyed by filename; values=list of contained urls

#get file arguments
if (@ARGV == 0 || $ARGV[0] eq "-h"){  #no arguments or request for help
  &usage();
}else {
  foreach $filename (@ARGV){
    if ($filename eq "-s"){
      $status = 1;
    }elsif ($filename eq "-e"){
      $extensive = 1;
    }else {
      &process($filename);
    }
  }
  &printResults;
}

##
# Processes a given filename: expands it if it's a directory, handles it
# if it's a local file, or checks whether it's accessible if it's a URL.
##
sub process {  #Needs: filename/url
  my ($filename) = @_;
  #print "Processing: $filename ";  #debugging statement (1 of 3)

  if (exists $processed{$filename}) {          #already been handled
    return;
  }elsif ($filename =~ m|^http://|i) {         #http url
    $processed{$filename} = &httpProcess($filename);
  }elsif ($filename =~ m!(://)|(mailto:)!i) {  #some other web protocol
    $processed{$filename} = 0;                 #unclear status/ignore
  }else {                           #a local file (supposed to be, anyway)
    $processed{$filename} = &localProcess($filename);
  }
}

##
# Processes an http URL to see if it is accessible.
# If it's accessible, returns >0 (2).
# If it's inaccessible, returns <0 (-2).
# (If unclear--status 3xx or alternate--returns 0.)
##
sub httpProcess {  #Needs: filename/url
  my ($filename) = @_;
  #print "HTTP\n";  #debugging statement (2 of 3)

  if ($status == 1) {  #bypassing comm and wc to use status codes
    #grab only the first header line (the status line),
    #dumping STDERR into the void
    my ($statusCode) = `lynx -head -dump $filename 2> /dev/null`;
    if (!defined $statusCode) {       #no status because couldn't connect
      return -2;
    }elsif ($statusCode =~ /2\d\d/){  #successful
      return 2;
    }elsif ($statusCode =~ /3\d\d/){  #redirected
      return 0;
    }else {                           #error code
      return -2;
    }
  }else {  #do this with comm and wc instead
    #grab page for filename, dumping STDERR into same file
    `lynx -dump $filename > urlpage 2> urlpage`;

    #check to see if able to connect
    open (URL, "urlpage") || die "Unable to open filedump urlpage: ", $!;
    my @urlpage = <URL>;
    close (URL);
    if (defined $urlpage[3] && $urlpage[3] =~ /Unable to connect/) {  #fourth line of dump
      unlink ("urlpage");
      return -2;
    }

    $filename =~ s|/$||;            #remove any trailing slash
    $filename .= "/eRRorPaGEasdf";  #point to an error page on the same server
    `lynx -dump $filename > errorpage 2> errorpage`;

    my $linediff = `comm -3 urlpage errorpage | wc -l`;
    unlink ("urlpage", "errorpage");  #cleaning up my file mess

    if ($linediff > 16) {  #quite different from errorpage
      return 2;
    }else {
      return -2;
    }
  }
}#end httpProcess
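##
# Aside: a minimal sketch of the same reachability check done with
# LWP::Simple's head() instead of shelling out to lynx.  This sub is
# illustrative only and is never called; it assumes the LWP bundle is
# installed, which is not among this script's stated requirements.
##
sub lwpHttpProcess {  #Needs: url; returns 2 if reachable, -2 if not
  my ($url) = @_;
  require LWP::Simple;
  #head() is true on success (redirects are followed), false otherwise
  return LWP::Simple::head($url) ? 2 : -2;
}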
##
# Processes a local file by seeing if it is accessible.
# If it is a directory, it processes all the files in that directory.
# For each file, it also processes the URLs in that file.
# Returns 1 if the given file was successfully processed,
# 0 if it was a processed directory or questionable file,
# and -1 if it was inaccessible.
##
sub localProcess {  #Needs: filename
  my ($filename) = @_;
  #print "LOCAL\n";  #debugging statement (3 of 3)

  if (&openable($filename)){
    if (-d $filename){  #directory; open and process each file within
      opendir (DIR, $filename) || die "Unable to open directory $filename: ", $!;
      my @dirfiles = readdir(DIR);
      closedir DIR;
      @dirfiles = grep (!/^\.\.?$/, @dirfiles);  #remove "." and ".."
      $filename =~ s|/$||;  #remove any trailing slash from dir name
      foreach $file (@dirfiles){
        $file = $filename . "/$file";  #prepend dir name to keep file unique
        &process($file);
      }
      return 0;  #accessible directory and processed
    }elsif (-f $filename) {  #file; open and run geturls
      my (@urls) = `./geturls $filename`;
      chomp @urls;
      #setting value equal to a ref to an anonymous array
      $containedURLs{$filename} = [@urls];
      foreach $url (@{$containedURLs{$filename}}){
        if ($url !~ m!(://)|(mailto:)!) {
          #local file; need to prepend directory
          #not using $filename again, so going to chop off end filename
          $filename =~ s|[^/]+$||;
          $url = $filename . $url;
        }
        &process($url) if $url;  #skipping empty strings to prevent warnings
      }
      return 1;   #accessible file and processed
    }else {       #not a regular file or directory
      return 0;   #play it safe and don't deal with it
    }
  }else {  #not even accessible
    return -1;
  }
}#end localProcess

##
# Checks whether a file is readable by everyone (not just by Perl).
# If the file is a directory, also checks that it is executable.
# Returns 1 if openable, 0 if not.
##
sub openable {  #Needs: filename
  my ($filename) = @_;

  if (!-e $filename) { return 0; }  #file doesn't exist

  my $mode = (stat $filename)[2];  #permissions
  $mode = sprintf("%lo", $mode);   #to octal (e.g. "40755" or "100644")
  if (-d $filename){  #directory
    my ($o) = (split "", $mode)[4];       #permission digit for all users
    return ($o == 5 || $o == 7) ? 1 : 0;  #1 if r-x or rwx for all
  }else{  #normal file
    my ($o) = (split "", $mode)[5];  #permission digit for all users
    return ($o >= 4) ? 1 : 0;        #file is at least readable
  }
}

##
# Prints out inaccessible files from %processed
# and then gives all accessible files analyzed, with any inaccessible URLs
# found in those files.
##
sub printResults{
  print "\nInaccessible files:\n";
  #change condition to "< 0" to print out all inaccessible http urls too
  my @badfiles = grep($processed{$_} == -1, keys %processed);
  foreach $badfile (sort @badfiles){
    print "  $badfile\n";
  }
  if (@badfiles == 0){ print "  None.\n"; }
  print "\n";

  if ($extensive) {
    print "Questionable/Redirected files:\n";
    my @qfiles = grep($processed{$_} == 0, keys %processed);
    #directories are also included under 0, so remove those
    @qfiles = grep(!-d, @qfiles);
    foreach $qfile (sort @qfiles){
      print "  $qfile\n";
    }
    if (@qfiles == 0){ print "  None.\n"; }
    print "\n";
  }

  #print out good files with bad URLs underneath each
  #change condition to "> 0" to print out good http urls too
  my @goodfiles = grep($processed{$_} == 1, keys %processed);
  foreach $goodfile (sort @goodfiles){
    print "$goodfile:\n";
    foreach $url (sort @{$containedURLs{$goodfile}}){
      if ($url && $processed{$url} < 0){  #url is bad (skip empty entries)
        print "  $url\n";
      }
    }
  }
  print "\n";
}#end printResults

##
# Prints the usage message for this program.
##
sub usage {
  print "\nChecks the URLs of one or more files or a complete website.\n";
  print "\nUsage: checkurls [-s] [-e] [file]...\n";
  print "  -s \t Status. Use Lynx's status code rather than comm and wc.\n";
  print "  -e \t Extensive. List all unchecked links of other protocols,\n";
  print "\t such as mailto: and ftp:, redirected links,\n";
  print "\t and non-normal local files.\n";
  exit();
}
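##
# Example invocation (hypothetical paths and output):
#
#   ./checkurls -s ~/public_html
#
#   Inaccessible files:
#     /home/me/public_html/private.html
#
#   /home/me/public_html/index.html:
#     http://www.example.com/gone.html
#
# That is: world-unreadable local files are listed first, then each
# accessible local file is listed with any dead links found inside it.
##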