1 ### File: LinkWalker.pm
2 ### Author: Lennart Borgman
3 ### All rights reserved
5 ##########################################################
7 ##########################################################
9 require LWP::UserAgent;
10 @ISA = qw(LWP::UserAgent);
12 ### Mirror to another file (why???)
15 my($self, $url, $file, $mirr_tmp) = @_;
16 die "no mirr_tmp" unless defined $mirr_tmp;
18 LWP::Debug::trace('()');
19 my $request = new HTTP::Request('GET', $url);
22 my($mtime) = (stat($file))[9];
24 $request->header('If-Modified-Since' =>
25 HTTP::Date::time2str($mtime));
28 my $tmpfile = "$file-$$";
30 my $response = $self->request($request, $tmpfile);
31 if ($response->is_success) {
33 my $file_length = (stat($tmpfile))[7];
34 my($content_length) = $response->header('Content-length');
36 if (defined $content_length and $file_length < $content_length) {
38 die "Transfer truncated: " .
39 "only $file_length out of $content_length bytes received\n";
40 } elsif (defined $content_length and $file_length > $content_length) {
42 die "Content-length mismatch: " .
43 "expected $content_length bytes, got $file_length\n";
47 # Some dosish systems fail to rename if the target exists
48 chmod 0777, $mirr_tmp;
51 rename($tmpfile, $mirr_tmp) or
52 die "Cannot rename '$tmpfile' to '$mirr_tmp': $!\n";
54 if (my $lm = $response->last_modified) {
55 # make sure the file has the same last modification time
56 utime $lm, $lm, $mirr_tmp;
66 ##########################################################
68 ##########################################################
69 package HTML::WalkerParser;
70 require HTML::ParserTagEnd;
71 @ISA = qw(HTML::ParserTagEnd);
73 use vars qw(%LINK_ELEMENT);
75 # Elements that might contain links and the name of the link attribute
81 img => [qw(src lowsrc usemap)], # 'lowsrc' is a Netscape invention
84 'link' => 'href', # need quoting since link is a perl builtin
86 applet => [qw(codebase code)],
88 iframe => 'src', # Netscape 2.0 extention
89 embed => 'src', # used in Netscape 2.0 for Shockwave and things like that
113 return unless exists $MAYBECONT{$tag};
114 return ($MAYBECONT{$tag} eq $att);
118 my($class, $parsed_fh) = @_;
119 my $self = $class->SUPER::new;
120 $self->{parsed_fh} = $parsed_fh;
130 ##########################################################
132 ##########################################################
133 package HTML::LinkWalker;
144 ##########################################################
146 ##########################################################
148 my $m_ua_personality = "LinkWalker/0.9";
155 my $m_subMirrorAction;
158 #############################
160 #############################
164 sub tell_bad_link($$$$$) {
170 $file = "START" unless defined $file;
171 $lnum = "(start)" unless defined $lnum;
172 my $longMsg = "<<$what>>";
173 my $shortMsg = $what;
175 my @lines = split("\\s+", $line);
176 my $disp_line = join("\n\t\t ", @lines);
177 $longMsg .= ",\n\t\tlink=$link\n\t\t$disp_line";
179 my @msg = ($shortMsg, $longMsg);
180 $m_CheckedLinks{$file}->{ERR}->{$lnum} = \@msg;
181 &$m_subReport("\t* Error * " . $what . "\n");
185 #############################
187 #############################
189 sub get_contenttype($) {
190 my $response = shift;
191 my @rh = $response->header("Content-Type");
194 if ((my $iPos = index($r, ";")) > -1) {
195 $c = substr($r, 0, $iPos);
200 sub is_linked_contenttype($) {
201 my $response = shift;
202 return (get_contenttype($response) eq "text/html");
205 sub ending_is_container($) {
206 my $link_addr = shift;
207 $link_addr =~ s!#.*$!!;
208 $link_addr =~ s!\?.*$!!;
209 return (($link_addr =~ m!\.s?html?$!i) ? 1 : 0);
215 sub mk_mirror_name($) {
216 my $orig_name = shift;
217 $orig_name =~ tr!\\!/!;
218 my $mirr_name = $orig_name;
219 my ($orig_host) = ($orig_name =~ m!(^https?://[^/]*)!i);
220 if (defined $orig_host) {
221 my $host = $orig_host;
224 $mirr_name =~ s!^$orig_host!$host!;
225 if (substr($mirr_name, -1) eq "/") { $mirr_name .= "default.html"; }
227 die "Can't find host in $orig_name\n";
229 my $mirr_full = sMirrorRoot() . $mirr_name;
231 my $sExt = $mirr_name; $sExt =~ s!.*\.([^\.]*$)!$1!;
232 $mirr_full = sMirrorRoot() . "temp.$sExt";
234 my $mirr_fold = $mirr_full;
235 $mirr_fold =~ s![^/]*$!!;
236 File::Path::mkpath($mirr_fold, 0, 0777);
240 #############################
242 #############################
244 my $uq_link_addr = shift;
245 if (!exists $m_is_outside{$uq_link_addr}) {
246 $m_is_outside{$uq_link_addr} = test_is_outside($uq_link_addr, \@m_sLinkRoots);
248 return $m_is_outside{$uq_link_addr};
250 sub set_is_container($$) {
251 my $uq_link_addr = shift;
252 return if exists $m_is_container{$uq_link_addr};
253 $m_is_container{$uq_link_addr} = shift;
255 sub is_outside_container($) {
256 my $uq_link_addr = shift;
257 if (exists $m_is_container{$uq_link_addr}) {
258 if ($m_is_container{$uq_link_addr}) {
259 return is_outside($uq_link_addr);
263 sub test_is_outside($$) {
264 my $uq_link_addr = shift;
265 my $link_roots = shift;
266 if (defined $link_roots) {
268 for my $link_root (@$link_roots) {
269 if (substr($uq_link_addr, 0, length($link_root)) eq $link_root) {
279 ##########################################################
281 ##########################################################
285 sub HTML::WalkerParser::declaration {
286 my($self, $decl) = @_;
287 return unless defined $self->{parsed_fh};
288 my $fh = $self->{parsed_fh};
289 print $fh "<!" . $decl . ">";
292 sub HTML::WalkerParser::start {
293 my($self, $tag, $attr, $ended) = @_;
294 &$m_start_cb($tag, $attr);
295 return unless defined $self->{parsed_fh};
297 for my $k (keys %$attr) {
298 my $encoded = encode_entities($$attr{$k});
299 $t .= qq( $k="$encoded");
306 my $fh = $self->{parsed_fh};
309 sub HTML::WalkerParser::end {
310 my ($self, $tag) = @_;
311 return unless defined $self->{parsed_fh};
312 my $fh = $self->{parsed_fh};
313 print $fh "</" . $tag . ">";
315 sub HTML::WalkerParser::text {
316 my ($self, $txt) = @_;
317 return unless defined $self->{parsed_fh};
318 my $fh = $self->{parsed_fh};
321 sub HTML::WalkerParser::comment {
322 my($self, $comment) = @_;
323 return unless defined $self->{parsed_fh};
324 my $fh = $self->{parsed_fh};
325 print $fh "<!--" . $comment . "-->";
331 ### Main parsing routine
333 sub parse_file($$$$$$$$$) {
334 my ($file_name, $parsed_fh, $uq_link_addr, $link_roots,
335 $ref_links, $ref_anchs, $ref_lines, $ref_tagname, $ref_attname) = @_;
338 $file_name = PathSubs::uniq_dir($file_name) . "default.html";
339 $uq_link_addr .= "/" unless substr($uq_link_addr, -1) eq "/";
340 $uq_link_addr .= "default.html";
341 &$m_subReport("dir => $file_name\n");
343 $fh = new IO::File($file_name);
344 die "Can't read $file_name: $!\n" unless defined $fh;
348 my $uq_link_fold = $uq_link_addr; $uq_link_fold =~ s![^/]*$!!;
352 my ($tag, $attr_hash) = @_;
353 for my $k (keys %$attr_hash) {
354 if (($k eq "id") || ($k eq "name")) {
355 my $v = $$attr_hash{$k};
356 $$ref_anchs{$v} = $n;
357 $$ref_lines{$n} = $line;
358 } elsif (exists $LINKATTRIBS{$k}) {
359 my $v = $$attr_hash{$k};
360 next if $v =~ m!^javascript:!;
361 next if $v =~ m!^ftp://!;
362 next if $v =~ m!^mailto://!;
363 if ($tag eq "base") { $base_href = $v if $k eq "href"; next; }
364 my $v_abs; my $v_rel;
365 my $v_is_abs = PathSubs::is_abs_path($v);
368 $v_rel = PathSubs::mk_relative_link($uq_link_addr, $v_abs);
371 if (defined $base_href) {
372 $v_abs = PathSubs::mk_abs_link($base_href, $v);
374 if (substr($v_rel, 0, 1) ne "#") {
375 $v_abs = $uq_link_fold . $v_rel;
377 $v_abs = $uq_link_addr . $v_rel;
379 $v_abs = PathSubs::resolve_dotdot($v_abs);
382 next if exists $m_CheckedLinks{$v_abs};
383 if (is_outside($v_abs)) {
385 if (ending_is_container($v_abs)) {
386 $m_CheckedLinks{$v_abs} = {};
387 tell_bad_link("Outside relative link ($v_rel)",
388 $uq_link_addr, $n, $v, $line);
391 ### Skip outside absolute links
392 ### Could be things like banners etc...
395 $$ref_links{$v_rel} = $n;
396 $$ref_lines{$n} = $line;
397 if (substr($v_rel, 0, 1) ne "#") {
398 my $v_rel_name = $v_rel;
399 $v_rel_name =~ s!#.*$!!;
400 $v_rel_name =~ s!\?.*$!!;
401 $$ref_tagname{$v_rel_name} = $tag;
402 $$ref_attname{$v_rel_name} = $k;
404 if ($v_is_abs && ($v_rel ne $v)) { $$attr_hash{$k} = $v_rel; }
409 $m_start_cb = $start_cb;
410 my $p = HTML::WalkerParser->new($parsed_fh);
411 while ($line = <$fh>) {
420 ##########################################################
422 ##########################################################
423 sub walk_link($$;$$$$) {
424 die "$#_" unless ($#_ == 1 || $#_ == 5);
425 my $link_fold = shift;
426 my $link_file = shift;
427 my $parent_url = shift;
428 my $parent_lnum = shift;
429 my $parent_link = shift;
430 my $parent_line = shift;
432 my $link_addr = $link_fold . $link_file;
434 my $is_file = ($link_addr !~ m!^https?://!i);
436 $uq_link_addr = PathSubs::uniq_file($link_addr);
438 $uq_link_addr = PathSubs::resolve_dotdot($link_addr);
440 return if exists $m_CheckedLinks{$uq_link_addr};
441 return if exists $m_MissedLinks{$uq_link_addr};
442 $m_CheckedLinks{$uq_link_addr} = {};
443 my $link_is_container = ending_is_container($uq_link_addr);
444 if ($link_is_container) {
445 set_is_container($uq_link_addr, 1);
446 return if is_outside($uq_link_addr);
448 return if $m_bOnlyCont;
455 if (!-r $uq_link_addr) {
456 tell_bad_link("Can't read file ($uq_link_addr)",
457 $parent_url, $parent_lnum, $parent_link, $parent_line);
458 $m_MissedLinks{$uq_link_addr} = 1;
461 $file_name = $uq_link_addr;
463 $file_name = mk_mirror_name($uq_link_addr);
465 $ua = new LWP::UserAgent;
466 $ua->agent($m_ua_personality);
470 $response = $ua->mirror($uq_link_addr, $file_name);
471 &$m_subMirrorAction($uq_link_addr, $file_name, $response);
473 my $request = new HTTP::Request('GET', $uq_link_addr);
474 $response = $ua->request($request, $file_name);
476 #dump_response($response); exit;
477 if ($response->code != 304) {
478 if (!$response->is_success) {
479 tell_bad_link($response->status_line . " ($uq_link_addr)",
480 $parent_url, $parent_lnum, $parent_link, $parent_line);
481 $m_MissedLinks{$uq_link_addr} = 1;
484 $bDoRewrite = $m_bMirror;
485 $contenttype = get_contenttype($response);
486 $link_is_container = is_linked_contenttype($response);
488 if ($uq_link_addr ne $response->base) {
490 my $base_file = mk_mirror_name($response->base);
491 if (!File::Copy::copy($file_name, $base_file)) {
492 die "Can't copy($file_name, $base_file): $!\n";
494 if (my $lm = $response->last_modified) { utime $lm, $lm, $base_file; }
495 $file_name = $base_file;
497 $uq_link_addr = $response->base;
500 ### Test again, could be new info from net!
501 if ($link_is_container) {
502 set_is_container($uq_link_addr, 1);
503 return if is_outside($uq_link_addr);
505 return if $m_bOnlyCont;
508 &$m_subReport("$uq_link_addr ...");
517 my $file_to_parse = $file_name;
519 $parsed_file = $file_to_parse . "-p$$";
520 &$m_subReport(" <<GET");
521 $parsed_fh = new IO::File("> $parsed_file");
522 die "Can't create $parsed_file: $!\n" unless defined $parsed_fh;
523 print $parsed_fh "<!-- parsed version -->\n";
526 parse_file($file_to_parse, $parsed_fh, $uq_link_addr,
528 \%links, \%anchs, \%lines, \%tagname, \%attname);
529 if (defined $parsed_fh) {
531 if (-e $file_name) { unlink $file_name or die "Can't unlink $file_name: $!"; }
532 rename($parsed_file, $file_name) or die "Can't rename($parsed_file, $file_name): $!\n";
533 if (my $lm = $response->last_modified) { utime $lm, $lm, $file_name; }
536 if ($link_is_container) { return if is_outside($uq_link_addr); }
538 $m_CheckedLinks{$uq_link_addr}->{ANC} = \%anchs;
541 $file_dir = $uq_link_addr;
542 $file_dir =~ s![^/]*$!!;
545 my $container_folder = $uq_link_addr; $container_folder =~ s![^/]*$!!;
546 &$m_subAction($uq_link_addr, $file_name, $contenttype);
547 for my $link (sort keys %links) {
548 # Next line is for onclick lines in prepared docs
549 next if ($link eq "#");
550 my $lnum = $links{$link};
551 my $line = $lines{$lnum};
553 tell_bad_link("Empty link", $uq_link_addr, $lnum, $link, $line);
556 if ($link =~ m!(.*)\?!) { $link = $1; }
558 if ($link =~ m!(.*)#(.*)!) { $link = $1; $anchor = $2; }
560 if (!exists $anchs{$anchor}) {
561 tell_bad_link("Anchor not found ($anchor)", $uq_link_addr, $lnum, $link, $line);
568 if ($link =~ m!^https?://!i) {
575 $sub_fold = $file_dir;
576 $uq_sublink = PathSubs::uniq_file($sub_fold . $sub_file);
578 $sub_fold = $container_folder;
579 $uq_sublink = $sub_fold . $sub_file;
582 next if (exists $m_CheckedLinks{$uq_sublink});
583 if (defined $anchor) {
584 $m_CheckedLinks{$uq_link_addr}->{EXTANC}->{$uq_sublink} =
585 { ANC=> $anchor, LINE=>$line, LNUM=>$lnum};
588 die "link=$link\tattr=$tagname{$link}\n" unless exists $tagname{$link};
589 next unless maybecont($tagname{$link}, $attname{$link});
591 if (is_outside($uq_link_addr)) {
592 if (maybecont($tagname{$link}, $attname{$link}) ) {
596 walk_link($sub_fold, $sub_file, $uq_link_addr, $lnum, $link, $line);
603 ############################################
604 ### Some more checks!
605 ############################################
606 sub check_external_anchors() {
607 &$m_subReport("\nChecking external anchors...\n");
608 for my $f (sort keys %m_CheckedLinks) {
609 my $fnode = $m_CheckedLinks{$f};
610 if (exists ${$fnode}{"EXTANC"}) {
611 my $extanc_hash = ${$fnode}{"EXTANC"};
612 for my $fx (keys %$extanc_hash) {
613 next unless (exists $m_CheckedLinks{$fx});
614 my $ea_hash = ${$extanc_hash}{$fx};
615 my $ea = ${$ea_hash}{ANC};
616 my $fxnode = $m_CheckedLinks{$fx};
617 my $fx_anc_hash = ${$fxnode}{"ANC"};
618 if (!exists ${$fx_anc_hash}{$ea}) {
619 my $line = ${$ea_hash}{LINE};
620 my $lnum = ${$ea_hash}{LNUM};
621 &$m_subReport("From $f\n");
622 tell_bad_link("Ext anchor not found ($fx#$ea)",
623 $f, $lnum, "$fx#$ea", $line);
628 } # check_external_anchors
632 #############################
634 #############################
635 sub report_errors($$) {
640 for my $f (sort keys %m_CheckedLinks) {
641 my $fnode = $m_CheckedLinks{$f};
642 if (exists ${$fnode}{ERR}) {
645 if (!defined $errors_reported) {
646 $errors_reported = 1;
647 &$m_subReport("\n\n*********** Summary ERRORS and WARNINGS **********\n");
649 &$m_subReport("$f\n");
650 my $err_hash = ${$fnode}{ERR};
651 for my $e (sort keys %$err_hash) {
652 my $refE = ${$err_hash}{$e};
653 &$m_subReport("\t" . ${$refE}[0] . "\n");
657 undef $errors_reported;
659 for my $f (sort keys %m_CheckedLinks) {
660 my $fnode = $m_CheckedLinks{$f};
661 if (exists ${$fnode}{ERR}) {
662 if (!defined $errors_reported) {
663 $errors_reported = 1;
664 &$m_subReport("\n\n*********** Detailed ERRORS and WARNINGS **********\n");
666 &$m_subReport("$f\n");
667 my $err_hash = ${$fnode}{ERR};
668 for my $e (sort keys %$err_hash) {
669 my $refE = ${$err_hash}{$e};
670 &$m_subReport("\tat line $e: " . ${$refE}[1] . "\n");
676 die "\n*** There where errors ***\n";
678 &$m_subReport("No errors found\n");
682 sub dump_response($) {
683 my $response = shift;
684 &$m_subReport( $response->code . " " . $response->message . "\n");
685 &$m_subReport( "****************************************\n");
686 #&$m_subReport( $response->request . "\n");
687 #&$m_subReport( "****************************************\n");
688 #&$m_subReport( $response->previous . "\n");
689 #&$m_subReport( "****************************************\n");
690 &$m_subReport( " i=" . $response->is_info .
691 ", s=" . $response->is_success .
692 ", r=" . $response->is_redirect .
693 ", e=" . $response->is_error . "\n");
694 &$m_subReport( "****************************************\n");
695 &$m_subReport( "content: " . $response->content . "\n");
696 &$m_subReport( "****************************************\n");
697 &$m_subReport( "base: " . $response->base . "\n");
698 &$m_subReport( "****************************************\n");
699 &$m_subReport( $response->as_string);
700 &$m_subReport( "****************************************\n");
701 &$m_subReport( $response->current_age . "\n");
702 &$m_subReport( "****************************************\n");
703 my @rh = $response->header("Content-Type");
704 for my $r (@rh) { &$m_subReport( "ct: $r\n"); }
705 &$m_subReport( "****************************************\n");
709 #############################
711 #############################
714 $m_sMirrorRoot = PathSubs::get_temp_path() . "LinkWalker/" unless defined $m_sMirrorRoot;
715 my $old = $m_sMirrorRoot;
716 $m_sMirrorRoot = PathSubs::uniq_dir($val) if defined $val;
721 my $old = $m_bMirror;
722 $m_bMirror = $val if defined $val;
726 sub subReporter(;$) {
728 my $old = $m_subReport;
729 $m_subReport = $val if defined $val;
734 my $old = $m_subAction;
735 $m_subAction = $val if defined $val;
740 my $old = $m_bOnlyCont;
741 $m_bOnlyCont = $val if defined $val;
744 sub ua_personality(;$) {
746 my $old = $m_ua_personality;
747 $m_ua_personality = $val if defined $val;
751 sub clear_roots() { @m_sLinkRoots = (); }
752 sub get_roots() { return \@m_sLinkRoots; }
753 sub add_root($) { push @m_sLinkRoots, shift; }
754 sub add_files_root($) {
757 my ($host) = ($file =~ m!(^https?://[^/]*)!i);
759 $default_root = $file;
761 die "Can't find $file\n" unless -e $file;
762 $default_root = PathSubs::uniq_file($file);
764 $default_root =~ s![^/]*$!!;
765 add_root($default_root);
770 $m_subReport = \&default_sub;
771 $m_subAction = \&default_sub;
772 $m_subMirrorAction = \&default_sub;