Posted By

iblis on 10/14/07


Tagged

html munging


Versions (?)

Who likes this?

2 people have marked this snippet as a favorite

melling
icecreamboyy


Grab all links from a local or remote HTML file


 / Published in: Perl
 

  #!/usr/bin/perl -w
  use strict;
  use warnings;
  use Getopt::Std;
  use LWP::Simple;
  use HTML::Parser;
  #
  # Grab all links from a local or remote HTML file.
  # perl html munging
  #
  # Usage: script [-a | -r] filename-or-URL
  #   -a  print only absolute URLs (those starting with http:// or https://)
  #   -r  print only relative URLs (everything else)
  # With neither option, every href found on an <a> tag is printed, one per
  # line, in the order the parser reports them.
  #

  # One shared definition of "absolute/remote URL", used both to decide how to
  # load the page and to classify each href, so the two checks cannot drift.
  my $ABSOLUTE_URL = qr{\Ahttps?://}i;

  # get options and argument
  #
  my %opts;
  my $options_ok = getopts('ar', \%opts);
  my $arg = shift;
  die "Usage: $0 [-a | -r] filename [| URL]\n"
      if not $options_ok           # unknown switch
      or not defined $arg          # nothing to read
      or $opts{a} && $opts{r};     # allow either -a or -r, not both

  # get the page either from file or url
  #
  my $page;
  if ($arg =~ $ABSOLUTE_URL) {
      $page = get($arg);
      # LWP::Simple::get signals failure by returning undef; test definedness
      # so an empty (but successful) response is not reported as an error,
      # and do not print $!, which is not meaningful for this call.
      die "Couldn't get $arg\n" unless defined $page;
  }
  else {
      open my $fh, '<', $arg
          or die "Couldn't open $arg: $!\n";
      $page = do { local $/; <$fh> };    # slurp the whole file
      close $fh;
  }

  # set the parser and parse
  #
  my @links;

  # Start-tag handler registered with HTML::Parser.
  #   $tag  - tag name of the element being opened
  #   $attr - hash reference of the element's attributes
  # Appends the href of every <a> tag to @links, honouring -a / -r.
  sub start {
      my ($tag, $attr) = @_;
      return unless $tag eq 'a' and defined $attr->{href};

      my $href        = $attr->{href};
      my $is_absolute = $href =~ $ABSOLUTE_URL;
      return if $is_absolute  and $opts{r};    # exclude absolute url when -r
      return if !$is_absolute and $opts{a};    # exclude relative url when -a

      push @links, $href;
      return;
  }

  my $parser = HTML::Parser->new(
      api_version => 3,
      start_h     => [ \&start, "tagname, attr" ],
  );
  $parser->parse($page);
  $parser->eof;

  # output
  #
  print "$_\n" for @links;

Report this snippet  

You need to login to post a comment.