Revision: 3991
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at October 14, 2007 23:49 by iblis
Initial Code
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use LWP::Simple;
use HTML::Parser;
#
# Grab all links from local or remote html file
# perl html munging
#
# option -a grabs only absolute urls, option -r only relative urls
# get options and argument
#
# The -a / -r switches are collected into %opts; the first word left on the
# command line afterwards is the file name or URL to scan.
my %opts;
getopts('ar', \%opts);
my $arg = shift @ARGV;
# A target is mandatory, and -a and -r may not be combined.
if (!defined $arg || ($opts{a} && $opts{r})) {
    die "Usage: $0 [-a | -r] filename [| URL]\n";
}
# get the page either from file or url
#
# An argument that starts with an http:// or https:// scheme (in any letter
# case) is fetched over the network; anything else is treated as the path of
# a local file.  Either way the complete document ends up in $page.
my $page;
if ($arg =~ m!^https?://!i) {
    # LWP::Simple::get signals failure by returning undef, so test for
    # definedness (an empty document is still a successful fetch) and keep
    # $! out of the message, since it does not describe the HTTP failure.
    $page = get($arg);
    die "Couldn't get $arg\n"
        unless defined $page;
}
else {
    # Lexical handle instead of the global bareword FH.
    open my $fh, "<", $arg
        or die "Couldn't open $arg: $!\n";
    $page = do { local $/; <$fh> };    # slurp: $/ is undef inside the block
    # A read error (e.g. $arg names a directory) yields undef, not content.
    die "Couldn't read $arg: $!\n"
        unless defined $page;
    close $fh;
}
# set the parser and parse
#
# Every href accepted by start() is appended to this list.
my @links;
# Version 3 API parser: each start tag is handed to start() with the tag
# name and the tag's attributes as its two arguments.
my $parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&start, "tagname, attr" ],
);
# Start-tag handler registered with the parser.
#
# Arguments:
#   $tag  - name of the tag that was just opened
#   $attr - hash ref of that tag's attributes
#
# Appends the href of every <a> tag to @links, subject to the command-line
# filters in %opts: -r drops absolute URLs, -a drops relative ones.  A URL
# counts as absolute when it begins with http:// or https:// (any letter
# case); the same anchored test is used for both switches so that a relative
# link which merely contains "http://" somewhere inside it is not
# misclassified.
sub start {
    my ($tag, $attr) = @_;

    # Only anchors that actually carry an href are of interest.
    return unless $tag eq 'a' and defined $attr->{href};

    my $href        = $attr->{href};
    my $is_absolute = $href =~ m!^https?://!i;

    return if $is_absolute  && $opts{r};    # exclude absolute url when -r
    return if !$is_absolute && $opts{a};    # exclude relative url when -a

    push @links, $href;
    return;
}
# Hand the whole document to the parser, then tell it the input is complete.
$parser->parse($page);
$parser->eof;
# output
#
# One collected link per line, in the order start() stored them.
print "$_\n" for @links;
Initial URL
Initial Description
Initial Title
Grab all links from local or remote html file
Initial Tags
html
Initial Language
Perl