#!/usr/bin/perl -w

#Checks the URLs of a website, printing out which local files are inaccessible
#as well as the inaccessible links within each accessible local file.
#Requires: geturls (in same directory), Lynx, comm, and wc
#Files must be readable by world to be accessible; 
#directories must be readable and executable by world.
#
#Author: Zach Tomaszewski
#Date: 1 October 2002


#init -- package globals shared by all subs below
$status = 0;           #whether to use lynx's status code instead of comm + wc
$extensive = 0;        #whether to also list indeterminable files
%processed = ();       #hash of processed/completed files; values=accessibility
%containedURLs = ();   #hash keyed by filename; values=list of contained urls

#parse the command line: flags may appear anywhere among the file arguments
if (!@ARGV || $ARGV[0] eq "-h") {  #nothing given, or help explicitly requested
  &usage();
}
else {
  foreach $arg (@ARGV) {
    if ($arg eq "-s") {
      $status = 1;
    }
    elsif ($arg eq "-e") {
      $extensive = 1;
    }
    else {
      &process($arg);   #anything that isn't a flag is a file/url to check
    }
  }
  &printResults;
}


##
# Dispatches a filename/url to the appropriate handler, skipping anything
# that has already been seen.  Local files and directories are handled by
# localProcess (directories get expanded), http links are checked by
# httpProcess, and links of any other protocol (ftp:, mailto:, etc.) are
# recorded with an unclear status and otherwise ignored.
##
sub process { #Needs: filename/url
  my ($name) = @_;
    #print "Processing: $name ";  #debugging statement (1 of 3)
  return if exists $processed{$name};        #already been handled
  if ($name =~ m|^http://|i) {               #http url
    $processed{$name} = &httpProcess($name);
  }
  elsif ($name =~ m!(://)|(mailto:)!i) {     #some other web protocol
    $processed{$name} = 0;                   #unclear status/ignore
  }
  else {                                     #a local file (supposed to be anyway)
    $processed{$name} = &localProcess($name);
  }
}


##
# Processes an http URL to see if it is accessible.
# With -s, asks lynx for the response header and judges by the status code;
# otherwise compares the fetched page against a deliberately bogus page on
# the same server (via comm + wc) to detect soft error pages.
# If it's accessible, returns >0 (2).
# If it's inaccessible, returns <0 (-2).
# (If unclear--status 3xx or alternate--returns 0.)
##
sub httpProcess { #Needs: filename/url
  my ($filename) = @_;
    #print "HTTP\n";  #debugging statement (2 of 3)
  if ($status == 1) {  #bypassing comm and wc to use status codes
    #grab header info while dumping STDERR into the void;
    #\Q...\E quotes shell metacharacters in the url so it can't break the command
    my $statusCode = `lynx -head -dump \Q$filename\E 2> /dev/null`;
    if ($statusCode eq "") { #no status because couldn't connect
      return -2;
    }elsif ($statusCode =~ /2\d\d/){ #successful
      return 2;
    }elsif ($statusCode =~ /3\d\d/){ #redirected
      return 0;
    }else {  #error code
      return -2;
    }

  }else { #do this with comm and wc instead
    #grab page for filename, dumping STDERR into same file
    `lynx -dump \Q$filename\E > urlpage 2> urlpage`;

    #check to see if able to connect
    open (my $url_fh, '<', "urlpage") || die "Unable to open filedump urlpage: ", $!;
      my @urlpage = <$url_fh>;
    close ($url_fh);
    #lynx reports connection failures on the fourth line of its output;
    #guard with defined in case the dump is shorter than that
    if (defined $urlpage[3] && $urlpage[3] =~ /Unable to connect/) {
      unlink ("urlpage");
      return -2;
    }

    $filename =~ s|/$||;     #remove any trailing slash
    $filename .= "/eRRorPaGEasdf";  #point to an error page on the same server
    `lynx -dump \Q$filename\E > errorpage 2> errorpage`;
    my $linediff = `comm -3 urlpage errorpage | wc -l`;
    unlink ("urlpage", "errorpage");  #cleaning up my file mess
    if ($linediff > 16) { #quite different from errorpage
      return 2;
    }else {
      return -2;
    }
  }
}#end httpProcess


##
# Processes a local file by seeing if it is accessible.
# If it is a directory, it processes all the files in that directory.
# For each file, it also processes the URLs in that file.
# Returns 1 if given file was successfully processed,
# 0 if it was a processed directory or questionable file,
# and -1 if it was inaccessible.
##
sub localProcess { #Needs: filename
  my ($filename) = @_;
    #print "LOCAL\n";  #debugging statement (3 of 3)
  if (&openable($filename) ){
    if (-d $filename){ #directory; open and process each file within
       opendir (DIR, $filename) ||
                die "Unable to open directory $filename: ", $!;
       my @dirfiles = readdir(DIR);
       closedir (DIR);   #was close(), which does not release a directory handle
       @dirfiles = grep (!/^\.\.?$/, @dirfiles);  #remove "." or ".."
       $filename =~ s|/$||;   #remove any trailing slash from dir name
       foreach my $file (@dirfiles){
         #prepend dir name to keep file unique
         &process($filename . "/$file");
       }
       return 0; #accessible directory and processed
    }elsif (-f $filename) {#file; open and run geturls
      #\Q...\E quotes shell metacharacters in the filename for the shell command
      my (@urls) = `./geturls \Q$filename\E`;
      chomp @urls;
      #setting value equal to a ref to an anonymous array
      $containedURLs{$filename} = [(@urls)];
      foreach my $url (@{$containedURLs{$filename}}){
        if ($url !~ m!(://)|(mailto:)!) { #local file; need to prepend directory
          #not using $filename again, so going to chop off end filename
          $filename =~ s|[^/]+$||i;
          $url = $filename . $url;  #$url aliases the stored element, so the
                                    #full path is also saved for printResults
        }
        &process($url) if $url;  #testing for empty lists to prevent warning
      }
      return 1; #accessible file and processed
    }else { #not a regular file or directory
      return 0;  #play it safe and don't deal with it
    }
  }else { #not even accessible
    return -1;
  }
}#end localProcess


##
# Checks whether a file is readable by everyone (not just Perl).
# If file is a directory, also checks that it is executable.
# Returns 1 if openable, 0 if not.
##
sub openable { #Needed: filename
  my ($filename) = @_;
  if (!-e $filename) {return 0; }  #file doesn't exist
  my $mode = (stat $filename)[2];  #permission bits (plus file-type bits)
  #Mask the "other"-user bits directly rather than splitting the octal
  #string by character position: the old index-based parse returned undef
  #for file types whose mode renders to fewer digits (e.g. FIFOs).
  if (-d $filename){ #directory: world needs both read and execute
    return (($mode & 0005) == 0005) ? 1 : 0;  # 1 if r-x or rwx for all
  }else{ #normal file: world needs at least read
    return (($mode & 0004) == 0004) ? 1 : 0;
  }
}


##
# Reports results: first every inaccessible local file recorded in
# %processed, then (under -e) the questionable/redirected entries, and
# finally each accessible file followed by the bad URLs found inside it.
##
sub printResults{
  #inaccessible local files
  #(change the condition to "< 0" to print all inaccessible http urls too)
  my @badfiles = sort grep { $processed{$_} == -1 } keys %processed;
  print "\nInaccessible files:\n";
  print "  $_\n" foreach @badfiles;
  print " None.\n" unless @badfiles;
  print "\n";

  if ($extensive) {
    #status 0 also covers directories, so those get filtered back out
    my @qfiles = sort grep { $processed{$_} == 0 && !-d $_ } keys %processed;
    print "Questionable/Redirected files:\n";
    print "  $_\n" foreach @qfiles;
    print " None.\n" unless @qfiles;
    print "\n";
  }

  #good files with their bad URLs underneath each
  #(change the condition to "> 0" to print good http urls too)
  foreach $goodfile (sort grep { $processed{$_} == 1 } keys %processed){
    print "$goodfile:\n";
    foreach $url (sort @{$containedURLs{$goodfile}}){
      print "  $url\n" if $processed{$url} < 0;  #url is bad
    }
  }
  print "\n";
}#end printResults


##
# Prints the usage message for this program and exits.
##
sub usage {
  print "\nChecks the URLs of one or more files or complete website.\n";
  #usage line previously omitted the -e flag described below
  print "\nUsage: checkurls [-s] [-e] [file]...\n";
  print "  -s \t Status.  Use Lynx's status code rather than comm and wc.\n";
  print "  -e \t Extensive.  List all unchecked links of other protocols,\n";
  print "\t  such as mailto: and ftp:, redirected links, \n";
  print "\t  and non-normal local files.\n";
  exit();
}