#!/usr/bin/perl use strict; # This source code is used to extract the URL's from a SubText website so that # appropriate .htaccess rules can be written for a Wordpress blog # # Copyright (C) 2009 Mike Taber - http://www.miketaber.net # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . use LWP::Simple; use LWP::UserAgent; use HTTP::Request; use HTTP::Response; use HTML::LinkExtor; # allows you to extract the links off of an HTML page. # reference links # http://www.cs.utk.edu/cs594ipm/perl/crawltut.html # Configuration my $primaryURL = "http://www.miketaber.net"; my $TLD = "miketaber.net"; ##################################################################### my %localURLs = (); getURLs($primaryURL); # now that all of the local links have been grabbed, print out the statistics print "======================================================================\n"; for my $thisURL ( sort keys %localURLs ) { if( $localURLs{$thisURL}{'parsedForLinks'} == 1 ) { # print $thisURL . ","; # print $localURLs{$thisURL}{'pageTitle'} . ","; # print $localURLs{$thisURL}{'totalLinkCount'} . ","; # print $localURLs{$thisURL}{'uniqueLinkCount'} . ","; # print $localURLs{$thisURL}{'localLinkCount'}; print "RewriteRule ^"; my $rewriteURL = substr($thisURL,length($primaryURL)+1); $rewriteURL =~ s/\./\\\./; print $rewriteURL . "\$ "; print "\"" . $localURLs{$thisURL}{'pageTitle'} . "\" [R=302,L,NC]"; print "\n"; } } my $testmode = 0; my $iter = 0; # recursively get the URL's on this page, saving each page in a hash table, and # marking it as parsedForLinks when the parsing is completed. After each page is parsed, # retrieve the content of each of the local URL's on the page and continue until the entire # site has been parsed sub getURLs { my $thisURL = shift; if($testmode){$iter++;} # if this page has already been parsed, skip it if( $localURLs{$thisURL}{'parsedForLinks'} == 1 || $iter > 10) { return; } print "$thisURL" . "..."; $localURLs{$thisURL}{'url'} = $thisURL; my $contents = get($thisURL); my $lccontents = lc($contents); my $tstart = index($lccontents,""); my $tend = index($lccontents,""); if( $tstart < $tend && $tstart > 0 && $tend > 0 ) { my $title = substr($contents,$tstart+7,$tend-$tstart-7); $localURLs{$thisURL}{'pageTitle'} = trim($title); } my ($page_parser) = HTML::LinkExtor->new(undef, $thisURL); $page_parser->parse($contents)->eof; my @links = $page_parser->links; my @pagelinks; foreach my $link (@links) { # convert all links to lowercase and add them to a new array push(@pagelinks, lc($$link[2])); #print "$$link[1]\t$$link[2]\n"; } #print scalar(@links) . " total links found\n"; $localURLs{$thisURL}{'totalLinkCount'} = scalar(@links); # declare a hash, add all of the page links to it, and then replace the page urls into a new, unique array my %saw; @saw{@pagelinks} = (); my @out = sort keys %saw; #print scalar(@out) . " unique links found\n"; $localURLs{$thisURL}{'uniqueLinkCount'} = scalar(@out); my @localLinks; foreach my $link(@out) { my $fLink = formatToLocalURL($link); if( length($fLink) > 0 ) { push(@localLinks,$fLink); } } $localURLs{$thisURL}{'localLinkCount'} = scalar(@localLinks); $localURLs{$thisURL}{'parsedForLinks'} = 1; print "Done!\n"; # for each local link, call this function recursively foreach my $localLink(@localLinks) { getURLs($localLink); } } sub formatToLocalURL { my $url = shift; # grab the first parameter if( substr($url,0,4) eq "http" ) { # this URL starts with http so strip it from the URL $url = substr($url,7); } my $i = index($url,"/"); # locate the first /, which indicates a page if($i < 0 ){$i = 1024; } # if this was a direct page link, simply use the full URL # test for the existence of either the primaryURL or the TLD existing prior to the first slash # if it is found, then this is a valid local URL if( ( index($url,substr($primaryURL,5)) >=0 && index($url,substr($primaryURL,5)) < $i ) || ( index($url, $TLD) >=0 && index($url, $TLD) < $i ) ) { # prepend the subdomain to the URL, as needed my $subdomain = substr($primaryURL,7,index($primaryURL,".")-7); if( substr($url,0,length($subdomain)) ne $subdomain) { $url = $subdomain . "." .$url; } # if this isn't a direct link to a .aspx page, skip it if( rindex($url,".aspx") != length($url) - 5 ) { return ""; } # ignore relative links if( index($url,'..') >= 0 ) { return ""; } # otherwise, return a formatted local URL return "http://" . $url; } } # This code referenced from: http://www.somacon.com/p114.php # Perl trim function to remove whitespace from the start and end of the string sub trim($) { my $string = shift; $string =~ s/^\s+//; $string =~ s/\s+$//; return $string; } # Left trim function to remove leading whitespace sub ltrim($) { my $string = shift; $string =~ s/^\s+//; return $string; } # Right trim function to remove trailing whitespace sub rtrim($) { my $string = shift; $string =~ s/\s+$//; return $string; }