#!/usr/bin/perl

#geturls
#Pulls the URLs from an html document, displaying them in sorted order
#Author: Zach Tomaszewski, Sep 18, 2002 1:03:01 PM

use strict;
use warnings;

# Main script body: read an HTML document (from the file named on the command
# line, or from standard input when no argument is given), pull out the link
# targets of its anchor tags, and print each distinct URL on its own line in
# sorted order.

my $filename = "";  #the url-containing file (stays empty when reading STDIN)
my $filelines = ""; #the whole input document joined into a single string
my @urls = ();      #the collected URLs, de-duplicated and sorted

if (@ARGV > 1) {
   # More than one argument is a usage error: print the usage text and stop
   # with a non-zero status so calling scripts can detect the failure.
   usage_error();
   exit 1;
}elsif (@ARGV == 1) {
   $filename = $ARGV[0];
   # Three-argument open with a lexical handle: the filename is never
   # interpreted as a mode or a pipe, whatever characters it contains.
   open (my $in, '<', $filename) or die "Could not open $filename: $!";
   $filelines = join ("", <$in>);
   close $in;
}else {
   $filelines = join ("", <STDIN>);
}

@urls = extract_urls($filelines);

foreach my $url (@urls){
  print "$url\n";
}

# extract_urls($html)
# Returns the sorted list of distinct href values found in the anchor tags
# of $html. Matching is case-insensitive and may span line breaks. The value
# may be double-quoted or unquoted; whitespace around it is not part of the
# returned URL.
# Limitation: only anchors of the form <a href=...> are recognised, i.e.
# href must be the first attribute and the tag must close right after it.
sub extract_urls {
  my ($html) = @_;

  # The literal "href" must not be preceded by a backslash: \h is the
  # horizontal-whitespace class in Perl 5.10+, not a letter "h".
  # The capture is non-greedy so the following \s* can absorb trailing
  # whitespace instead of it ending up inside the URL.
  my @found = $html =~ /<\s*a\s+href\s*=\s*"?\s*([^>"]+?)\s*"?\s*>/ig;

  # Hash keys give us uniqueness; the values are irrelevant.
  my %seen;
  @seen{@found} = ();
  return sort keys %seen;
}

##END##

# usage_error()
# Prints a short description of the two supported ways to invoke geturls.
# Takes no arguments and returns nothing. The text is written to STDERR
# rather than STDOUT because STDOUT carries the script's URL list, which is
# typically piped or redirected; diagnostics must not be mixed into it.
# This sub only prints; it does not terminate the program itself.
sub usage_error {
  print STDERR "Usage:\n";
  print STDERR "Either pass in a file to pass as a parameter (geturls sample.html)\n";
  print STDERR " or pass in a file in through standard in (geturls < sample.html)\n";
  return;
}