#!/usr/bin/perl -w
#Checks the URLs of a website, printing out which local files are inaccessible
#as well as the inaccessible links within each accessible local file.
#Requires: geturls (in same directory), Lynx, comm, and wc
#Files must be readable by world to be accessible;
#directories must be readable and executable by world.
#
#Author: Zach Tomaszewski
#Date: 1 October 2002
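#
#Example invocation (the directory name is only illustrative); run it from the
#directory that holds geturls, since that helper is invoked as ./geturls:
#  ./checkurls -s -e public_html/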
#init
$status = 0; #whether to use lynx's status code instead of comm + wc
$extensive = 0; #whether to also list indeterminable files
%processed = (); #hash of processed/completed files; values=accessibility
%containedURLs = (); #hash keyed by filename; values=list of contained urls
#get file arguments
if (@ARGV == 0 || $ARGV[0] eq "-h"){ #no arguments or request for help
    &usage();
}else {
    foreach $filename (@ARGV){
        if ($filename eq "-s"){
            $status = 1;
        }elsif ($filename eq "-e"){
            $extensive = 1;
        }else {
            &process($filename);
        }
    }
    &printResults;
}
##
# Processes a given filename: expands it if it's a directory,
# handles it if it's a local file, or checks whether it's accessible
# if it's a URL.
##
sub process { #Needs: filename/url
    my ($filename) = @_;
    #print "Processing: $filename "; #debugging statement (1 of 3)
    if (exists $processed{$filename}) { #already been handled
        return;
    }elsif ($filename =~ m|^http://|i) { #http url
        $processed{$filename} = &httpProcess($filename);
    }elsif ($filename =~ m!(://)|(mailto:)!i) { #some other web protocol
        $processed{$filename} = 0; #unclear status/ignore
    }else { #a local file (supposed to be anyway)
        $processed{$filename} = &localProcess($filename);
    }
}
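#Dispatch examples (names are only illustrative):
#  http://www2.hawaii.edu/        -> &httpProcess (checked over the network)
#  ftp://host/file or mailto:who  -> recorded as 0 (other protocol; not checked)
#  notes/index.html or notes/     -> &localProcess (checked on the filesystem)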
##
# Processes an http URL to see if it is accessible
# If it's accessible, returns >0 (2).
# If it's inaccessible, returns <0 (-2).
# (If unclear--status 3xx or alternate--returns 0.)
##
sub httpProcess { #Needs: filename/url
    my ($filename) = @_;
    #print "HTTP\n"; #debugging statement (2 of 3)
    if ($status == 1) { #bypassing comm and wc to use status codes
        #grab the header dump while sending STDERR into the void
        my $statusCode = `lynx -head -dump $filename 2> /dev/null`;
        if ($statusCode eq "") { #no status because couldn't connect
            return -2;
        }elsif ($statusCode =~ /2\d\d/){ #successful
            return 2;
        }elsif ($statusCode =~ /3\d\d/){ #redirected
            return 0;
        }else { #error code
            return -2;
        }
    }else { #do this with comm and wc instead
        #grab page for filename, dumping STDERR into same file
        `lynx -dump $filename > urlpage 2> urlpage`;
        #check to see if able to connect
        open (URL, "urlpage") || die "Unable to open filedump urlpage: ", $!;
        my @urlpage = <URL>;
        close (URL);
        if ($urlpage[3] =~ /Unable to connect/) { #fourth line of the dump (index 3)
            unlink ("urlpage");
            return -2;
        }
        $filename =~ s|/$||; #remove any trailing slash
        $filename .= "/eRRorPaGEasdf"; #point to an error page on the same server
        `lynx -dump $filename > errorpage 2> errorpage`;
        my $linediff = `comm -3 urlpage errorpage | wc -l`;
        unlink ("urlpage", "errorpage"); #cleaning up my file mess
        if ($linediff > 16) { #quite different from errorpage
            return 2;
        }else {
            return -2;
        }
    }
}#end httpProcess
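#For reference: the status-code branch above matches against the raw header dump
#from `lynx -head -dump`, whose first line typically looks like (exact wording
#varies by server):
#  HTTP/1.1 200 OK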
##
# Processes a local file by seeing if it is accessible.
# If it is a directory, it processes all the files in that directory.
# For each file, it also processes the URLs in that file.
# Returns 1 if given file was successfully processed,
# 0 if it was a processed directory or questionable file,
# and -1 if it was inaccessible.
##
sub localProcess { #Needs: filename
    my ($filename) = @_;
    #print "LOCAL\n"; #debugging statement (3 of 3)
    if (&openable($filename) ){
        if (-d $filename){ #directory; open and process each file within
            opendir (DIR, $filename) ||
                die "Unable to open directory $filename: ", $!;
            my @dirfiles = readdir(DIR);
            closedir (DIR);
            @dirfiles = grep (!/^\.\.?$/, @dirfiles); #remove "." or ".."
            $filename =~ s|/$||; #remove any trailing slash from dir name
            foreach $file (@dirfiles){
                $file = $filename . "/$file"; #prepend dir name to keep file unique
                &process($file);
            }
            return 0; #accessible directory and processed
        }elsif (-f $filename) { #file; open and run geturls
            my (@urls) = `./geturls $filename`;
            chomp @urls;
            #setting value equal to a ref to an anonymous array
            $containedURLs{$filename} = [(@urls)];
            foreach $url (@{$containedURLs{$filename}}){
                if ($url !~ m!(://)|(mailto:)!) { #local file; need to prepend directory
                    #not using $filename again, so going to chop off end filename
                    $filename =~ s|[^/]+$||;
                    $url = $filename . $url;
                }
                &process($url) if $url; #testing for empty lists to prevent warning
            }
            return 1; #accessible file and processed
        }else { #not a regular file or directory
            return 0; #play it safe and don't deal with it
        }
    }else { #not even accessible
        return -1;
    }
}#end localProcess
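#Note: the loop above assumes ./geturls prints one link per line, exactly as it
#appears in the source file (relative paths left relative), for example:
#  images/photo.gif
#  http://www2.hawaii.edu/
#  mailto:someone@example.edu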
##
# Checks whether a file is readable by everyone (not just Perl).
# If file is a directory, also checks that it is executable.
# Returns 1 if openable, 0 if not.
##
sub openable { #Needs: filename
    my ($filename) = @_;
    if (!-e $filename) {return 0; } #file doesn't exist
    my $mode = (stat $filename)[2]; #permissions
    $mode = sprintf("%lo", $mode); #to octal
    if (-d $filename){ #directory
        my ($o) = (split "", $mode)[4]; #gets the world (other users) digit
        return ($o == 5 || $o == 7) ? 1 : 0; # 1 if r-x or rwx for all
    }else{ #normal file
        my ($o) = (split "", $mode)[5]; #gets the world (other users) digit
        return ($o >= 4) ? 1 : 0; #file is at least readable
    }
}
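#How the digit indexing above works (a typical case): a regular file with
#permissions 0644 has a full stat mode of 0100644, so sprintf "%lo" yields
#"100644" and index 5 is the world digit ('4'); a directory with permissions
#0755 has mode 040755 -> "40755", where index 4 is the world digit ('5').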
##
# Prints out inaccessible files from %processed
# and then lists each accessible file that was analyzed, along with any
# inaccessible URLs found in that file.
##
sub printResults{
    print "\nInaccessible files:\n";
    #change to "< 0" to print out all inaccessible http urls too
    my @badfiles = grep($processed{$_} == -1, keys %processed);
    foreach $badfile (sort @badfiles){
        print " $badfile\n";
    }
    if (@badfiles == 0){ print " None.\n";}
    print "\n";
    if ($extensive) {
        print "Questionable/Redirected files:\n";
        my @qfiles = grep($processed{$_} == 0, keys %processed);
        #directories also included under 0, so remove those
        @qfiles = grep(!-d, @qfiles);
        foreach $qfile (sort @qfiles){
            print " $qfile\n";
        }
        if (@qfiles == 0){ print " None.\n";}
        print "\n";
    }
    #print out good files with bad URLs underneath each
    #change condition to "> 0" to print out good http urls too
    my @goodfiles = grep($processed{$_} == 1, keys %processed);
    foreach $goodfile (sort @goodfiles){
        print "$goodfile:\n";
        foreach $url (sort @{$containedURLs{$goodfile}}){
            if ($processed{$url} < 0){ #url is bad
                print " $url\n";
            }
        }
    }
    print "\n";
}#end printResults
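#Sample output (file and URL names are only illustrative):
#  Inaccessible files:
#   private/old.html
#
#  index.html:
#   http://www2.hawaii.edu/nosuchpage.html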
##
# Prints the usage message for this program
##
sub usage {
    print "\nChecks the URLs of one or more files or a complete website.\n";
    print "\nUsage: checkurls [-s] [-e] [file]...\n";
    print " -s \t Status. Use Lynx's status code rather than comm and wc.\n";
    print " -e \t Extensive. List all unchecked links of other protocols,\n";
    print "\t such as mailto: and ftp:, redirected links, \n";
    print "\t and non-normal local files.\n";
    exit();
}