Sunday, October 24, 2004

Michael Moore's blog

Michael Moore's blog is here. A Perl scraper that turns it into an RSS feed is here:

#!/usr/bin/perl -w

use strict;
use XML::RSS;
use LWP::Simple;
use HTML::Entities;

# findurl(TITLE, PAGE)
#
# Searches PAGE (the HTML of the diary index) for an anchor whose link text is
# exactly TITLE (compared case-insensitively) and whose href has the form
# index.php?id=... .
#
# Returns the absolute URL of that entry, or undef when no such anchor exists.
# Exactly one scalar is returned in every case, so the call is safe inside a
# key => value argument list.
#
# The ($$) prototype is kept so existing call sites parse exactly as before.
sub findurl ($$)
{
	my ($title, $pagein) = @_;

	# \Q...\E makes the title match as literal text; titles routinely contain
	# regex metacharacters such as '?', '(' or '.'.
	if ($pagein =~ /<a href="(index\.php\?id=[^"]*)">\Q$title\E<\/a>/i)
	{
		return "http://www.michaelmoore.com/words/diary/$1";
	}

	# Deliberately a scalar undef rather than a bare return: in list context an
	# empty list would shift the key/value pairs that follow the call.
	return undef;
}
							

# Main script: fetch the diary index page, extract each entry from it and
# print the entries to standard output as an RSS 1.0 document.

my $diary_url = "http://www.michaelmoore.com/words/diary/index.php";

my $rss = XML::RSS->new(version => '1.0');

# LWP::Simple::get returns undef on any failure; stop with a clear message
# instead of silently emitting an empty feed.
my $page = get($diary_url);
die "Could not fetch $diary_url\n" unless defined $page;

$rss->channel(title       => "Mike's Blog",
              link        => $diary_url,
              description => "Michael Moore's blog");

# The page is cut at every '<table ' and each piece is tested against the
# entry pattern below.
foreach my $chunk (split ('<table ', $page))
{
	# Captures: 1 = italic small-text line, 2 = entry title, 3 = body HTML.
	next unless $chunk =~ /<p><span class="smallText"><i>([^>]*)<\/i><\/span><br>[\r\n]*<span class="titleText">([^>]*)<\/span><\/p>[\r\n]*<p>(<p>.*<\/p>)[\r\n]*<\/p>\n/si;

	# Copy the captures right away so later regex matches cannot disturb them.
	my ($byline, $title, $body) = ($1, $2, $3);

	# Scalar assignment guarantees a single value even when no permalink is
	# found, so the named-argument list below cannot be shifted.
	my $link = findurl($title, $page);

	$rss->add_item(title       => $title,
	               # Fall back to the index page when the entry has no permalink.
	               link        => $link || $diary_url,
	               description => $byline . encode_entities($body));
}

print $rss->as_string;

0 Comments:

Post a Comment

<< Home