#!/usr/bin/perl
use strict;
# This source code is used to extract the URL's from a SubText website so that
# appropriate .htaccess rules can be written for a Wordpress blog
#
# Copyright (C) 2009 Mike Taber - http://www.miketaber.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor; # allows you to extract the links off of an HTML page.
# reference links
# http://www.cs.utk.edu/cs594ipm/perl/crawltut.html
# Configuration
my $primaryURL = "http://www.miketaber.net";
my $TLD = "miketaber.net";
#####################################################################
my %localURLs = ();
getURLs($primaryURL);
# now that all of the local links have been grabbed, print out the statistics
print "======================================================================\n";
for my $thisURL ( sort keys %localURLs )
{
if( $localURLs{$thisURL}{'parsedForLinks'} == 1 )
{
# print $thisURL . ",";
# print $localURLs{$thisURL}{'pageTitle'} . ",";
# print $localURLs{$thisURL}{'totalLinkCount'} . ",";
# print $localURLs{$thisURL}{'uniqueLinkCount'} . ",";
# print $localURLs{$thisURL}{'localLinkCount'};
print "RewriteRule ^";
my $rewriteURL = substr($thisURL,length($primaryURL)+1);
$rewriteURL =~ s/\./\\\./;
print $rewriteURL . "\$ ";
print "\"" . $localURLs{$thisURL}{'pageTitle'} . "\" [R=302,L,NC]";
print "\n";
}
}
my $testmode = 0;
my $iter = 0;
# recursively get the URL's on this page, saving each page in a hash table, and
# marking it as parsedForLinks when the parsing is completed. After each page is parsed,
# retrieve the content of each of the local URL's on the page and continue until the entire
# site has been parsed
sub getURLs
{
my $thisURL = shift;
if($testmode){$iter++;}
# if this page has already been parsed, skip it
if( $localURLs{$thisURL}{'parsedForLinks'} == 1 || $iter > 10)
{
return;
}
print "$thisURL" . "...";
$localURLs{$thisURL}{'url'} = $thisURL;
my $contents = get($thisURL);
my $lccontents = lc($contents);
my $tstart = index($lccontents,"
");
my $tend = index($lccontents,"");
if( $tstart < $tend && $tstart > 0 && $tend > 0 )
{
my $title = substr($contents,$tstart+7,$tend-$tstart-7);
$localURLs{$thisURL}{'pageTitle'} = trim($title);
}
my ($page_parser) = HTML::LinkExtor->new(undef, $thisURL);
$page_parser->parse($contents)->eof;
my @links = $page_parser->links;
my @pagelinks;
foreach my $link (@links)
{
# convert all links to lowercase and add them to a new array
push(@pagelinks, lc($$link[2]));
#print "$$link[1]\t$$link[2]\n";
}
#print scalar(@links) . " total links found\n";
$localURLs{$thisURL}{'totalLinkCount'} = scalar(@links);
# declare a hash, add all of the page links to it, and then replace the page urls into a new, unique array
my %saw;
@saw{@pagelinks} = ();
my @out = sort keys %saw;
#print scalar(@out) . " unique links found\n";
$localURLs{$thisURL}{'uniqueLinkCount'} = scalar(@out);
my @localLinks;
foreach my $link(@out)
{
my $fLink = formatToLocalURL($link);
if( length($fLink) > 0 )
{
push(@localLinks,$fLink);
}
}
$localURLs{$thisURL}{'localLinkCount'} = scalar(@localLinks);
$localURLs{$thisURL}{'parsedForLinks'} = 1;
print "Done!\n";
# for each local link, call this function recursively
foreach my $localLink(@localLinks)
{
getURLs($localLink);
}
}
sub formatToLocalURL
{
my $url = shift; # grab the first parameter
if( substr($url,0,4) eq "http" )
{
# this URL starts with http so strip it from the URL
$url = substr($url,7);
}
my $i = index($url,"/"); # locate the first /, which indicates a page
if($i < 0 ){$i = 1024; } # if this was a direct page link, simply use the full URL
# test for the existence of either the primaryURL or the TLD existing prior to the first slash
# if it is found, then this is a valid local URL
if(
( index($url,substr($primaryURL,5)) >=0 && index($url,substr($primaryURL,5)) < $i ) ||
( index($url, $TLD) >=0 && index($url, $TLD) < $i )
)
{
# prepend the subdomain to the URL, as needed
my $subdomain = substr($primaryURL,7,index($primaryURL,".")-7);
if( substr($url,0,length($subdomain)) ne $subdomain)
{
$url = $subdomain . "." .$url;
}
# if this isn't a direct link to a .aspx page, skip it
if( rindex($url,".aspx") != length($url) - 5 )
{
return "";
}
# ignore relative links
if( index($url,'..') >= 0 )
{
return "";
}
# otherwise, return a formatted local URL
return "http://" . $url;
}
}
# This code referenced from: http://www.somacon.com/p114.php
# Perl trim function to remove whitespace from the start and end of the string
sub trim($)
{
my $string = shift;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
# Left trim function to remove leading whitespace
sub ltrim($)
{
my $string = shift;
$string =~ s/^\s+//;
return $string;
}
# Right trim function to remove trailing whitespace
sub rtrim($)
{
my $string = shift;
$string =~ s/\s+$//;
return $string;
}