Sunday, October 24, 2004

Michael Moore scraper improved

Now with added /x-ness!

#!/usr/bin/perl -w

use strict;
use XML::RSS;
use LWP::Simple;
use HTML::Entities;

# Look up the permalink of a diary entry on the index page.
#
# Arguments:
#   $title  - entry title exactly as it appears as the anchor text
#   $pagein - HTML of the diary index page to search
#
# Returns the absolute URL of the leftmost anchor whose href starts with
# "index.php?id=" and whose text equals $title (compared case-insensitively),
# or the empty string when no such anchor exists.  Exactly one scalar is
# returned in every context, so the call is safe inside a key => value list.
#
# The ($$) prototype is retained only so existing call sites parse as before.
sub findurl ($$)
{
	my ($title, $pagein) = @_;

	# \Q...\E makes the title match literally; titles routinely contain
	# regex metacharacters such as "?", "(" or "+".
	if ($pagein =~ m{<a href="(index\.php\?id=[^"]*)">\Q$title\E</a>}i)
	{
		return "http://www.michaelmoore.com/words/diary/$1";
	}

	# Explicit scalar result: falling off the end would hand back the failed
	# match, which is an empty list when the caller is in list context.
	return '';
}
							

# Build an RSS 1.0 feed from the diary index page and print it on STDOUT.
my $rss = XML::RSS->new(version => '1.0');
my $url = "http://www.michaelmoore.com/words/diary/index.php";

# get() returns undef when the fetch fails; stop here instead of splitting
# undef and printing a feed with no items.
my $page = get($url);
defined $page
	or die "Could not fetch $url\n";

$rss->channel(title       => "Mike's Blog",
              link        => $url,
              description => "Michael Moore's blog");

# Cut the page at every "<table " and test each piece against the entry
# layout (presumably one entry per table -- verify against the live markup).
foreach my $chunk (split ('<table ', $page))
{
	# Captures: $1 = text of the italic smallText span, $2 = text of the
	# titleText span, $3 = the <p>...</p> run that follows them.
	next unless $chunk =~ /<p><span\sclass="smallText"><i>
		([^>]*)
		<\/i><\/span><br>[\r\n]*<span\sclass="titleText">
		([^>]*)
		<\/span><\/p>[\r\n]*<p>
		(<p>.*<\/p>)
		[\r\n]*<\/p>\n/six;

	# Copy the captures immediately so later code works on plain lexicals.
	my ($stamp, $title, $body) = ($1, $2, $3);

	# Call findurl in scalar context so that a miss can never drop the
	# value out of the key => value list passed to add_item below.
	my $link = findurl($title, $page);

	$rss->add_item(title       => $title,
	               link        => $link,
	               description => $stamp . encode_entities($body));
}

print $rss->as_string;

0 Comments:

Post a Comment

<< Home