I'm not the first one to run into this problem, but in searching around I couldn't find anyone who had actually bothered to solve it: how to get an RSS feed from a publicly accessible Facebook wall. The wall can be either a fan-page wall or a publicly accessible group's wall.
In my case I wanted to pull posts from various fan-page and group walls into a Drupal aggregator page. The following Perl script does the job by scraping the HTML for certain key id and class names in Facebook's markup, extracting the text, and turning it into a valid RSS feed.
The script is called with the following syntax (assuming the file is named "facebookrss.pl"):
For pages with the standard URL syntax:
http://[yourserver]/[yourpath]/facebookrss.pl?title=[feed title]&url=/[path of the page on facebook.com]&description=[feed description]
#!/usr/bin/perl -w
use CGI qw(:standard);
use LWP::Simple qw(getstore);
use LWP::UserAgent;
use HTML::TokeParser::Simple;
use Time::Format qw(%time %strftime %manip);
use Date::Manip qw(ParseDate UnixDate DateCalc);
use HTML::Entities;
use URI::Escape;
use Encode;
# Boolean constants used as flags by the wall-post parser further down.
$true = 1;
$false = !1;
#set user agent
$ua = LWP::UserAgent->new;
$ua->agent("Mozilla/2.02E (Win95; U)");
print header ('text/xml');
#print header ('text/plain'); #debug
# CGI parameters: "title" and "description" label the feed, "url" is the path of
# the wall on facebook.com. Missing parameters become "" so that nothing below
# has to deal with undef.
$title = param("title");
$title = "" unless defined $title;
$base = "http://www.facebook.com";
$suffixToken = "?";
$suffix = "v=wall";
my $path = param("url");
$path = "" unless defined $path;
# The path is glued directly onto $base, so it has to start with "/". Without
# the slash a value such as "@example.com/" or ".example.com/" would change the
# host part of the URL and make this script fetch from an arbitrary server.
$path = "/".$path unless $path =~ m{^/};
$url = $base.$path;
# group walls already carry a query string, so the wall selector is appended with "&"
if ($url =~ /group\.php/) {
    $url = uri_unescape($url);
    $suffixToken = "&";
}
$url .= $suffixToken.$suffix;
$description = param("description");
$description = "" unless defined $description;
$pubDate = 0;
#retrieve the page
$file = "";
$request = HTTP::Request->new(GET => $url);
$response = $ua->request($request);
#fix encoding if HTTP response header doesn't match (shouldn't matter with FB though)
if ($response->is_success) {
    $content = $response->content;
    $encoding = "utf8"; # assume this is the default
    # an encoding="..." declaration inside the document overrides the default
    if ($content =~ /encoding="([^"]+)"/) {
        $encoding = $1;
    }
    $file = $response->decoded_content((charset => $encoding));
    # decoded_content yields undef when the body cannot be decoded; fall back to
    # an empty document so the parser still receives a defined string
    $file = "" unless defined $file;
}
#properly encode for RSS
# RSS is XML, and XML predefines only five named entities (&amp; &lt; &gt; &quot;
# &apos;). HTML names such as &eacute; would make the feed ill-formed, so every
# unsafe character is written as a numeric character reference instead.
# Undefined inputs are treated as empty strings.
$encTitle = HTML::Entities::encode_numeric(defined $title ? $title : "");
$encUrl = HTML::Entities::encode_numeric(defined $url ? $url : "");
$encDescription = HTML::Entities::encode_numeric(defined $description ? $description : "");
# begin RSS output
print "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>
<rss version=\"2.0\">
<channel>
<title>$encTitle</title>
<link>$encUrl</link>
<description>$encDescription
</description>
";
# Parse HTML and pull entries
# Walks the token stream of the fetched page ($file). Every start tag whose
# markup mentions "UIIntentionalStory" opens one wall post. The post's tokens are
# collected into an HTML fragment, its time is read from the tokens that follow,
# and the result is stored in %items under an epoch-seconds key so the posts can
# be sorted before they are printed. $pubDate ends up holding the newest post date.
$file = "" unless defined $file;
$p = HTML::TokeParser::Simple->new(\$file);
my $output = "";
my $count = 0;
%items = ();
$pubDate = "1/1/1970";
$realTimeText = "1/1/1970";
my $lastKey;    # key of the most recently stored post, used to keep page order
POST: while (my $token = $p->get_token) {
    # only a start tag can open a wall post
    next POST unless $token->is_start_tag;
    #when we hit the class used by FB for all wall posts it's time to get busy
    next POST unless $token->as_is =~ /UIIntentionalStory/;

    $content = "";
    $firstMediaItem = $true;
    $realTimeFlag = $false;

    # --- collect the body of the post up to the comment/action-link area ---
    $token = $p->get_token;
    while ( defined $token && $token->as_is !~ /id="commentable_item_|UIActionLinks/ ) {
        #put a space in whenever we hit a closing div tag to avoid mashing text together
        if ($token->is_end_tag("div")) {
            $content .= " ";
        }
        #add a newline before "attachments" (first attachment of the post only)
        if ($token->is_start_tag("div") && ($token->as_is =~ /UIStoryAttachment/) && $firstMediaItem) {
            $firstMediaItem = $false;
            $content .= "<br />";
        }
        #add a newline before the content of the wall post
        if ($token->is_start_tag("span") && ($token->as_is =~ /UIStory_Message/)) {
            $content .= "<br />";
        }
        #skip over tags we want to ignore and strip out useless attributes
        if ( not($token->is_end_tag("h3")) &&
             not($token->is_tag("span")) &&
             not($token->is_tag("div")) ) {
            $token->delete_attr("onclick");
            $token->delete_attr("class");
            $token->delete_attr("id");
            $token->delete_attr("data-ft");
            #if the URL in a link is a relative link to a photo, make it absolute
            if ($token->is_start_tag("a")) {
                $href = $token->get_attr("href");
                if ($href) {
                    # anchored so that a link that is already absolute is not prefixed again
                    $href =~ s{^/photo\.php}{http://www.facebook.com/photo.php};
                    $token->set_attr("href",$href);
                }
            }
            #hacking a little spacing around the profile image
            #even though presentation *should* be completely separate from content
            if ($token->is_start_tag("img")) {
                my $src = $token->get_attr("src");
                if (defined $src && $src =~ /http:\/\/profile/) {
                    $token->set_attr("align","left");
                    $token->set_attr("hspace","10");
                    $token->set_attr("class","picture");
                    $token->set_attr("style","margin-right: 10px !important;");
                }
            }
            #Skip over "read more" tokens otherwise keep the token as-is
            $content .= $token->as_is unless $token->as_is =~ /^read more$/i;
        }
        $token = $p->get_token;
    }
    # the page ended in the middle of a post: stop parsing instead of calling
    # methods on an undefined token, so the feed can still be closed properly
    last POST unless defined $token;

    # --- advance to the next text token, watching for a machine-readable time ---
    $token = $p->get_token;
    while ( defined $token && !$token->is_text ) {
        $token = $p->get_token;
        last unless defined $token;
        #check for a real timestamp
        if ($token->is_tag("abbr") && ($token->as_is =~ /timestamp/i)) {
            my $stamp = $token->get_attr("title");
            if (defined $stamp) {
                $realTimeFlag = $true;
                $realTimeText = $stamp;
            }
        }
    }
    last POST unless defined $token;

    #if we have a real timestamp, use it, otherwise parse the time text
    if ($realTimeFlag) {
        $timeText = $realTimeText;
    } else {
        $timeText = $token->as_is;
        if ($timeText =~ /about/) {
            $timeText =~ s/\sat/,/;
        }
    }

    # --- turn the time text into a date, an epoch value and an RSS date string ---
    my $date = ParseDate(defined $timeText ? $timeText : "");
    my $epochDate = $date ? UnixDate($date, "%s") : undef;
    my $dateText = "";
    if (defined $epochDate) {
        # parsing the text sometimes means it will be a week off,
        # i.e. *next* Friday instead of *last* Friday
        # so if we get a future date, subtract a week
        if ($epochDate > time()) {
            $epochDate = $epochDate - 604800;
            $date = DateCalc($date, "-1 week");
        }
        my $epochPubDate = UnixDate($pubDate, "%s");
        # keep track of the newest post, this will be our lastBuildDate
        if ($epochDate > $epochPubDate) {
            $pubDate = $date;
        }
        # %z prints a numeric offset (+0100); zone abbreviations such as CEST
        # are not valid in RSS (RFC 822) dates
        $dateText = UnixDate($date, "%a, %e %b %Y %H:%M:%S %z");
    } else {
        # the time could not be parsed: keep the post with an empty date and
        # slot it directly after the previous post so the page order survives
        $epochDate = defined $lastKey ? $lastKey - 1 : time();
    }

    # --- choose title and link depending on what the post contains ---
    my $link = $url;
    my $linkText = "View wall";
    my $title = "New wall post";
    if ($content =~ /(\/note\.php.+?)"/) {
        $link = "http://www.facebook.com".$1;
        $title = "New note";
        $linkText = "View original note";
    }
    if ($content =~ /(\/album\.php.+?)"/) {
        $link = "http://www.facebook.com".$1;
        $title = "New photos";
        $linkText = "View album";
    }
    $content .= "<small><br /><a href=\"$link\">$linkText</a><br /></small>";
    # a literal "]]>" would end the CDATA section early, so split it across two sections
    $content =~ s/\]\]>/]]]]><![CDATA[>/g;

    #use the epoch time as the key for a hash of hashes that we can sort later
    # posts sharing a timestamp would overwrite each other, so step back one
    # second at a time until the key is free (only the sort key changes, the
    # printed date stays exact)
    my $key = $epochDate;
    $key-- while exists $items{$key};
    $lastKey = $key;
    # numeric character references keep the values valid XML
    $items{$key}{Link} = HTML::Entities::encode_numeric($link);
    $items{$key}{Text} = HTML::Entities::encode_numeric($title);
    $items{$key}{Description} = "<![CDATA[".$content."]]>";
    $items{$key}{Date} = $dateText;
}
# output the rest of the RSS
# $pubDate holds the date of the newest post seen while parsing.
# %z prints a numeric offset (+0100); zone abbreviations such as CEST are not
# valid in RSS (RFC 822) dates.
$pubDateText = UnixDate($pubDate, "%a, %e %b %Y %H:%M:%S %z");
print "<lastBuildDate>$pubDateText</lastBuildDate>\n";
#sort the posts newest first using the epoch times
# the keys are epoch seconds, so they must be compared as numbers: a plain
# string sort misorders values that differ in length or sign
@itemKeys = sort { $b <=> $a } keys %items;
#write out well formed items, now in the right order
foreach $epoch (@itemKeys) {
    my $item = $items{$epoch};
    print "<item>\n";
    print " <title>", $item->{Text}, "</title>\n";
    print " <description>", $item->{Description}, "</description>\n";
    print " <pubDate>", $item->{Date}, "</pubDate>\n";
    print " <link>", $item->{Link}, "</link>\n";
    print "</item>\n";
}
# End of the RSS feed
print "
</channel>
</rss>
";
exit (0);