Wednesday, October 20, 2004

Scraping the Sun

I read The Sun. There, I admitted it. I work in a factory, and it's good to keep up with the news that everyone else reads, but mostly it's because I like looking at pictures of scantily clad women. [shrug] The Sun has a really annoying site though. Two of the pages I read most often are "Viral Emails" and "Bizarre Exposed", so I figured I'd script some of it. sun-viral.pl: This creates an RDF feed from the Viral page that I can use from Liferea.

#!/usr/bin/perl -w

use strict;
use XML::RSS;
use LWP::Simple;

#my $testpage = "http://www.thesun.co.uk/article/0,,13-2004480024,00.html";

# sun-viral.pl: print an RSS 1.0 (RDF) feed of the "Viral Emails" page.
#
# The viral page URL changes, so it is discovered by fetching the front
# page and taking the first link that matches /article/0,,13...
# Every fetch is checked: LWP::Simple::get returns undef on failure, and
# carrying on with undef would only produce warnings and an empty feed.

my $topurl  = 'http://www.thesun.co.uk/section/0,,1,00.html';
my $toppage = get($topurl);
defined $toppage or die "Could not fetch $topurl\n";

my $pageurl;
if ($toppage =~ m!(/article/0,,13[^"']*)!)
{
	$pageurl = "http://www.thesun.co.uk" . $1;
}
defined $pageurl or die "No /article/0,,13 link found on $topurl\n";

my $page = get($pageurl);
defined $page or die "Could not fetch $pageurl\n";

my $rss = XML::RSS->new(version => '1.0');
$rss->channel(title       => 'The Sun: Viral Emails',
              link        => $pageurl,
              description => 'The Sun: Viral Emails');

# Split on table-row openers so each chunk holds at most one row; the
# popup link and its anchor text are then matched within a single chunk.
foreach my $row (split /<[tT][Rr]/, $page)
{
	if ($row =~ m!'(/popupWindow/0,,13[^"']*)', \d+, \d+, 'email\d+'\);">([^<]*)</A>!i)
	{
		# Copy the captures straight away; $1/$2 are globals that any
		# later successful match would overwrite.
		my ($popup_path, $title) = ($1, $2);

		$rss->add_item(title       => $title,
		               link        => 'http://www.thesun.co.uk' . $popup_path,
		               description => $pageurl);
	}
}

print $rss->as_string;
sun-bizarre.pl: This scrapes Bizarre (I haven't gotten around to the rest of it yet)
#!/usr/bin/perl -w

use strict;
use XML::RSS;
use LWP::Simple;

# sun-bizarre.pl: print an RSS 1.0 (RDF) feed of the Bizarre section page.

my $section_url = 'http://www.thesun.co.uk/section/0,,4,00.html';

# LWP::Simple::get returns undef on failure; stop rather than match on undef.
my $page = get($section_url);
defined $page or die "Could not fetch $section_url\n";

my $rss = XML::RSS->new(version => '1.0');

$rss->channel(title       => 'The Sun: Bizarre',
              link        => $section_url,
              description => 'The Sun: Bizarre');

# Matches top story: an image link whose alt text is used as the title.
# [^"]* keeps each match inside its own quoted attribute value; a greedy
# .* could run on to a later .html" or .gif" on the same line.
if ($page =~ m!<tr><td colspan=\"\d\"><a href=\"(/article/0,,[^"]*\.html)\"><img src=\"http://images.thesun.co.uk/picture/[^"]*\.gif\" alt=\"([^"]*)\"!)
{
	my ($path, $title) = ($1, $2);
	$rss->add_item(title => $title,
	               link  => 'http://www.thesun.co.uk' . $path);
}

# Other stories: one chunk per table row; each row carries a text link
# followed by a <span> whose text becomes the item description.
foreach my $row (split /<[tT][Rr]/, $page)
{
	# Captures: $1 optional class attribute, $2 article path,
	# $3 link text, $4 span text.
	if ($row =~ m!<td style=[^>]*><a (class=\"[^"]+\" )?href=\"(/article/0,,[^"]*\.html)\"[^>]*>([^<]*)</a><br>[\r\n]*<span[^>]*>([^<]*)[\r\n]*<[/]?td!is)
	{
		my ($path, $title, $summary) = ($2, $3, $4);
		$rss->add_item(title       => $title,
		               link        => 'http://www.thesun.co.uk' . $path,
		               description => $summary);
	}
}

print $rss->as_string;
sun-pic.pl: This downloads the "Click here" images -- I don't like popups, and I don't like having popups open as new tabs much either.
#!/usr/bin/perl -w

use strict;
use LWP::Simple;

# sun-pic.pl: given an article URL as the first argument, download the
# article, its popup pages and directly linked images into
# /home/jimmy/.download, then fetch the images referenced by each popup
# page into /home/jimmy/Pictures.
#
# wget is run through list-form system() so the scraped URLs never pass
# through a shell: characters such as & or ; in a URL can neither break
# the command nor be executed. wget's -P option (directory prefix)
# replaces the former "cd DIR &&".

my $uri = shift;
defined $uri or die "usage: $0 <article-url>\n";

my $page = get($uri);
defined $page or die "Could not fetch $uri\n";

my @uris;

# Accept both CRLF and bare LF line endings.
foreach my $line (split /\r?\n/, $page)
{
	if ($line =~ m!(/popupWindow/[^.]*\.html)!i)
	{
		# Popup pages are site-relative links.
		push @uris, "http://www.thesun.co.uk$1";
	}
	elsif ($line =~ m!window\.open\('(http://images\.thesun\.co\.uk/picture/0,,[^,]*,00\.[gj][ip][fg])'!i)
	{
		# Images opened directly in a popup window.
		push @uris, $1;
	}
}

# -x recreates the remote directory layout beneath the download prefix.
system('wget', '-P', '/home/jimmy/.download', "--referer=$uri", '-x', $uri, @uris) == 0
	or warn "wget failed for $uri (status $?)\n";

# Visit each popup page and pull out the images it references.
foreach my $popup (grep { /htm[l]+$/i } @uris)
{
	my $popup_page = get($popup);
	unless (defined $popup_page)
	{
		warn "Could not fetch $popup\n";
		next;
	}

	foreach my $line (split /\r?\n/, $popup_page)
	{
		if ($line =~ m!(http://images\.thesun\.co\.uk/picture[s]?/0,,[^,]*,00\.[gj][ip][fg])!i)
		{
			my $image = $1;
			system('wget', '-P', '/home/jimmy/Pictures', $image) == 0
				or warn "wget failed for $image (status $?)\n";
		}
	}
}

0 Comments:

Post a Comment

<< Home