2006-08-20
■ [CustomFeed] CustomFeed::Configをバージョンアップ

#plagger-jaで、d:id:woremacxさんよりCustomFeed::Configを譲り受けたので好き勝手に弄ってみました。
具体的には、
- オプションをFilter::EntryFullText相当に(extract_xpath等も使えるようになった)
- g:vader:id:wata_dさんの「vaderグループ - wata_dの日記 - CustomFeed::Config + Filter::FindEnclosuresでネットラジオをpodcast」を取り込み
といった感じ。
Plagger::Plugin::CustomFeed::Config
package Plagger::Plugin::CustomFeed::Config;
use strict;
use base qw( Plagger::Plugin );
use DirHandle;
use YAML;
use Encode;
use HTML::TokeParser;
use Plagger::UserAgent;
use Plagger::Util qw( decode_content extract_title );
use URI;
use URI::QueryParam;
# Plugin initialisation: run the base class's init with the arguments we were
# given, then load the YAML site definitions through load_plugins().
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);

    return $self->load_plugins;
}
# Scan the directory returned by $self->assets_dir and hand every regular
# file whose name ends in ".yaml" to load_plugin(), in sorted filename order.
#
# A directory that cannot be opened is reported through
# Plagger->context->error (presumably fatal -- confirm against Plagger).
sub load_plugins {
    my $self = shift;

    # File::Spec is used below but is not imported at the top of this file;
    # load it here instead of relying on some other module having done so.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh  = DirHandle->new($dir) or Plagger->context->error("$dir: $!");

    for my $name (sort $dh->read) {
        # Cheap name test first, so only candidate files get stat()ed.
        next unless $name =~ /\.yaml$/;

        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;    # skip directories and other non-files

        $self->load_plugin($path, $name);
    }

    return;
}
# Parse one YAML definition file and append everything it yields to the list
# of site definitions kept in $self->{plugins}.
#
#   $file - path of the YAML file to load
#   $base - bare filename; accepted for the caller's convenience, unused here
sub load_plugin {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    # List context on purpose: keep every value LoadFile returns.
    my @definitions = YAML::LoadFile($file);
    push @{ $self->{plugins} }, @definitions;
}
# Subscribe this plugin's handle() to the 'customfeed.handle' hook of the
# given context.
sub register {
    my ($self, $context) = @_;

    my %hooks = ('customfeed.handle' => \&handle);

    return $context->register_hook($self, %hooks);
}
# 'customfeed.handle' hook: pick the first loaded site definition that applies
# to the feed URL and aggregate the page with it.
#
# A definition applies when all of the following hold:
#   * the URL matches the definition's "match" pattern case-insensitively
#     (a missing/false "match" means '.', i.e. any non-empty URL),
#   * the URL also matches that pattern case-sensitively, anchored at the
#     start of the URL,
#   * the URL does not contain "output=rss" or "output=atom".
#
# On a hit the chosen definition is stored in $args->{plugin}, aggregate() is
# called, and 1 is returned regardless of what aggregate() returned.
# Returns nothing when no definition applies.
sub handle {
    my ($self, $context, $args) = @_;

    # Read the URL once; it is tested against every definition.
    my $url = $args->{feed}->url;

    for my $plugin (@{ $self->{plugins} }) {
        my $match = $plugin->{match} || '.';    # anything

        next unless $url =~ m/$match/i;
        # NOTE(review): this second test is anchored and case-sensitive, unlike
        # the one above -- kept as is; confirm whether that is intentional.
        next unless $url =~ m!^$match!;
        next if $url =~ /output=(?:rss|atom)/;

        $args->{plugin} = $plugin;
        $self->aggregate($context, $args);
        return 1;
    }

    return;
}
# Fetch the page behind $args->{feed}->url and build a Plagger::Feed from it
# using the site definition in $args->{plugin}.
#
# Definition keys read here:
#   extract               regex; every successive match in the page yields one entry
#   extract_capture       whitespace-separated field names for the regex captures
#   extract_xpath         hash of field name => XPath; the first matching node's
#                         HTML becomes the field value (overrides a regex capture
#                         of the same name)
#   extract_encoding      if set, title/body are passed through Encode::decode
#   extract_date_format   strptime format, or array ref of formats, for the date field
#   extract_date_timezone time zone applied to a date parsed with a format
#   fetch_before_hook, extract_before_hook, extract_after_hook
#                         Perl source, string-eval'd at the respective stage
#
# With "extract" configured, one entry is created per regex match, each merged
# with the XPath fields. With only "extract_xpath", a single entry is created.
#
# Returns 1 after handing the feed to $context->update->add. Returns nothing
# if the HTTP request failed or HTML::TreeBuilder::XPath cannot be loaded.
#
# SECURITY / API NOTE: the *_hook values are string-eval'd inside this sub, so
# they run with full privileges and can see (and change) the lexicals declared
# here: $self, $context, $args, $url, $plugin, $agent, $res, $content, $title,
# $feed, $extract and, for extract_after_hook, $data. Only load definitions
# from trusted sources, and do not rename these variables.
sub aggregate {
    my($self, $context, $args) = @_;
    my $url    = URI->new($args->{feed}->url);
    my $plugin = $args->{plugin};

    if ($plugin->{fetch_before_hook}) {
        eval $plugin->{fetch_before_hook};
        Plagger->context->error($@) if $@;
    }

    $context->log(info => "GET $url");
    my $agent = Plagger::UserAgent->new;
    my $res   = $agent->fetch($url, $self);
    if ($res->http_response->is_error) {
        $context->log(error => "GET $url failed: " . $res->status_code);
        return;
    }

    my $content = decode_content($res);
    my $title   = $self->conf->{title} || extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $title) : $title);
    $feed->link($url);

    if ($plugin->{extract_before_hook}) {
        eval $plugin->{extract_before_hook};
        Plagger->context->error($@) if $@;
    }

    # The pattern is loop-invariant: decode it once. An undefined or empty
    # pattern disables the regex stage (an empty pattern would otherwise make
    # Perl reuse the last successful pattern).
    my $extract;
    if (defined $plugin->{extract} && length $plugin->{extract}) {
        $extract = decode_content($plugin->{extract});
    }
    my $use_regex = defined $extract && length $extract;

    # XPath fields do not depend on the regex position, so evaluate them once
    # (after extract_before_hook, which may have rewritten $content).
    my %xpath_data;
    if ($plugin->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }
        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;
        for my $capture (keys %{$plugin->{extract_xpath}}) {
            my @children = $tree->findnodes($plugin->{extract_xpath}->{$capture});
            next unless @children;    # XPath matched nothing: leave the field unset
            $xpath_data{$capture} = $children[0]->as_HTML;
        }
        # HTML::Element trees are cyclic and must be freed explicitly.
        $tree->delete;
    }

    while (1) {
        my $data;

        if ($use_regex) {
            # Scalar-context /g resumes after the previous match; running out
            # of matches ends the feed.
            last unless $content =~ /$extract/sg;

            # Re-match the matched text to obtain the capture groups as a list.
            # NOTE(review): this uses the raw pattern rather than the decoded
            # $extract, as the original code did -- confirm that is intended.
            if (my @match = $& =~ /$plugin->{extract}/s) {
                my @capture = split /\s+/, $plugin->{extract_capture};
                for my $m (@match) {
                    my $val = shift @capture;
                    $data->{$val} .= $m;
                }
            }
        }

        $data->{$_} = $xpath_data{$_} for keys %xpath_data;

        last unless $data;

        if ($plugin->{extract_after_hook}) {
            eval $plugin->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $plugin->{extract_date_format}) {
                $format = [ $format ] unless ref $format;
                # List-context map: take the first value produced across the formats.
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $plugin->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($plugin->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        $context->log(info => "Add entry");
        my $entry = Plagger::Entry->new;
        $entry->id($data->{link});
        $entry->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{title}) : $data->{title});
        $entry->link($data->{link});
        $entry->date($data->{date}) if $data->{date};
        $entry->body($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{body}) : $data->{body}) if $data->{body};
        $feed->add_entry($entry);

        # Without a regex there is nothing to advance: XPath-only definitions
        # describe exactly one entry.
        last unless $use_regex;
    }

    $context->update->add($feed);
    return 1;
}
1;
__END__
=head1 NAME

Plagger::Plugin::CustomFeed::Config - Configurable way to create title and link only custom feeds

=head1 SYNOPSIS

  - module: Subscription::Config
    config:
      feed:
        - http://www.softantenna.com/index.html
  - module: CustomFeed::Config

=head1 DESCRIPTION

This plugin creates a custom feed off of HTML pages.
Use with EntryFullText plugin to get full content and accurate
datetime of articles.

You can write a custom feed handler by putting C<.yaml> files
under the plugin's assets directory.

=head1 AUTHOR

Kazushi Tominaga

=head1 SEE ALSO

L<Plagger>

=cut
コメント