2006-09-21
■ [CustomFeed] CustomFeed::Config修正

ひさびさにCustomFeed::Configをバージョンアップ。
具体的な内容は下記の通り。
- 活動日誌(2006-09-13)さんの先読みパッチを取り込み
- extract_xpathオプションが正常に動作していない問題を修正
- EntryFullTextのバージョンアップにあわせオプションを追加
ゆくゆくはEntryFullTextと統合する予定との事なので、基本的にEntryFullTextに合わせる形で修正していきます。
Plagger::Plugin::CustomFeed::Config
package Plagger::Plugin::CustomFeed::Config;
use strict;
use base qw( Plagger::Plugin );
use DirHandle;
use YAML;
use Encode;
use HTML::TokeParser;
use HTML::ResolveLink;
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content extract_title );
use Plagger::UserAgent;
use URI;
use URI::QueryParam;
# Plugin initialisation: delegate to the parent class's init with the same
# arguments, then load every YAML definition via load_plugins.
# Returns whatever load_plugins returns.
sub init {
    my $plugin = shift @_;

    # The remaining arguments are forwarded untouched to the base class.
    $plugin->SUPER::init(@_);

    return $plugin->load_plugins;
}
# Scan the plugin's assets directory and hand every regular file whose name
# ends in ".yaml" to load_plugin, in sorted (string order) filename order.
#
# Each load_plugin call receives the full path followed by the bare filename.
# If the directory cannot be opened the failure is reported through
# Plagger->context->error (presumably fatal -- confirm against Plagger).
sub load_plugins {
    my $self = shift;

    # File::Spec is used below but is not loaded anywhere in this file;
    # load it here so the sub does not depend on some other module having
    # pulled it in first.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh  = DirHandle->new($dir)
        or Plagger->context->error("$dir: $!");

    for my $name (sort $dh->read) {
        next unless $name =~ /\.yaml$/;

        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;    # skip directories and other non-files

        $self->load_plugin($path, $name);
    }
}
# Parse one YAML definition file and append every document it contains to
# the plugin list kept in $self->{plugins}.
#
# Arguments: the path of the YAML file. Callers also pass the bare filename
# as a further argument; it is accepted but not used here.
sub load_plugin {
    my ($self, $path) = @_;

    Plagger->context->log(debug => "loading $path");

    # LoadFile is called in list context, so a multi-document file adds
    # one entry per document.
    push @{ $self->{plugins} }, YAML::LoadFile($path);
}
# Hook registration: subscribes this plugin's handle sub to the
# 'customfeed.handle' hook on the given context.
# Returns whatever the context's register_hook returns.
sub register {
    my ($self, $context) = @_;

    my %hooks = ('customfeed.handle' => \&handle);

    $context->register_hook($self, %hooks);
}
# 'customfeed.handle' hook: pick the first loaded YAML definition whose
# "match" pattern fits the feed URL and let aggregate build the feed.
#
# A definition is selected when all of the following hold:
#   * its pattern matches the URL case-insensitively, anywhere;
#   * its pattern matches case-sensitively with a leading "^";
#   * the URL does not contain "output=rss" or "output=atom".
# A definition without "match" uses '.', i.e. any non-empty URL.
#
# On selection, $args->{plugin} is set to the definition, aggregate is
# called and 1 is returned (aggregate's own result is ignored). Returns
# nothing when no definition applies.
#
# NOTE(review): the "^" is prepended without grouping, so a pattern with a
# top-level alternation ("a|b") is anchored on its first branch only --
# confirm whether that is intended.
sub handle {
    my ($self, $context, $args) = @_;

    my $url = $args->{feed}->url;

    for my $plugin (@{ $self->{plugins} }) {
        my $match = $plugin->{match} || '.';    # anything

        next if $url !~ m/$match/i;
        next if $url !~ m!^$match!;
        next if $url =~ /output=(?:rss|atom)/;

        $args->{plugin} = $plugin;
        $self->aggregate($context, $args);
        return 1;
    }

    return;
}
# Escape every argument in place with Plagger::Util::encode_xml.
#
# The elements of @_ alias the caller's variables, so assigning to them
# rewrites the caller's values; the sub is used for that side effect only.
sub xml_escape {
    for my $i (0 .. $#_) {
        $_[$i] = Plagger::Util::encode_xml($_[$i]);
    }
}
# Fetch the page behind $args->{feed} and build a Plagger feed from it using
# the YAML definition in $args->{plugin}.
#
# Arguments:
#   $self    - this plugin instance
#   $context - Plagger context; used for logging and to publish the feed
#   $args    - hash ref with {feed} (object with a url method) and {plugin}
#              (the matched YAML definition)
#
# Definition keys read here: fetch_before_hook, extract, extract_capture,
# extract_xpath, extract_encoding, extract_before_hook, extract_after_hook,
# extract_date_format, extract_date_timezone.
#
# Returns 1 after the feed has been added to $context->update. Returns
# nothing when the fetch fails or HTML::TreeBuilder::XPath cannot be loaded.
#
# The *_hook values are run with string eval inside this sub, so they see
# (and may modify) its lexical variables: $url, $content, $data, ... The
# variable names are therefore part of the hook contract -- do not rename.
# NOTE(review): string eval executes whatever code the YAML definition
# contains; only load definitions from a trusted assets directory.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url    = URI->new($args->{feed}->url);
    my $plugin = $args->{plugin};

    if ($plugin->{fetch_before_hook}) {
        eval $plugin->{fetch_before_hook};
        Plagger->context->error($@) if $@;
    }

    my $agent = Plagger::UserAgent->new;
    # NoNetwork is passed as three hours in seconds; presumably a cache
    # window -- confirm against Plagger::UserAgent.
    my $res = $agent->fetch($url, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch $url failed: " . $res->status_code);
        return;
    }

    my $content = decode_content($res);
    my $title   = $self->conf->{title} || extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $title) : $title);
    $feed->link($url);

    my $prev_pos = 0;
    my $cur_pos  = 0;
    my %nodes    = ();    # capture name => array ref of not-yet-used nodes
    my $tree;             # kept at sub scope so it can be released at the end

    if ($plugin->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        for my $capture (keys %{$plugin->{extract_xpath}}) {
            my $xpath = $plugin->{extract_xpath}->{$capture};
            my @found = $tree->findnodes($xpath);

            # Plain hash element access; the old "%nodes->{...}" form is a
            # compile error on Perl 5.22 and later.
            $nodes{$capture} = \@found;

            unless (@found) {
                Plagger->context->log(error => "Can't find node matching $xpath");
            }
        }
    }

    # Each pass builds at most one entry; the loop ends on the first pass
    # that produces no data.
    while (1) {
        my $data;

        if ($plugin->{extract_before_hook}) {
            eval $plugin->{extract_before_hook};
            Plagger->context->error($@) if $@;
        }

        if ($plugin->{extract}) {
            my $extract = decode_content($plugin->{extract});

            # The /g match in scalar context moves pos($content) past the
            # next occurrence; the captures are then taken from the text
            # starting where the previous occurrence ended.
            if ($content =~ /$extract/sg) {
                $cur_pos = pos $content;
                my $str = substr($content, $prev_pos);

                # Capture with the same (decoded) pattern that located the
                # match, so both steps agree on what matched.
                if (my @match = $str =~ /$extract/s) {
                    # extract_capture is a whitespace-separated list of
                    # field names, paired with the capture groups in order.
                    my @capture = split /\s+/, $plugin->{extract_capture};
                    for my $m (@match) {
                        my $val = shift @capture;
                        $data->{$val} .= $m;
                    }
                }
                $prev_pos = $cur_pos;
            }
        }

        if (%nodes) {
            for my $capture (keys %{$plugin->{extract_xpath}}) {
                # Swap in this module's escaper while the node is serialised.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;

                # Take the next unused node for this field, if any is left.
                my $children = shift @{ $nodes{$capture} };
                if ($children) {
                    $data->{$capture} = $children->as_XML;
                }
            }
        }

        unless ($data) {
            last;
        }

        if ($plugin->{extract_after_hook}) {
            eval $plugin->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $plugin->{extract_date_format}) {
                # One format string or a list of them is accepted.
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $plugin->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($plugin->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        $context->log(info => "Add entry");

        my $entry = Plagger::Entry->new;
        $entry->id($data->{link});
        $entry->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{title}) : $data->{title});
        $entry->link($data->{link});
        $entry->body($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{body}) : $data->{body}) if $data->{body};
        $entry->icon({ url => $data->{icon} }) if $data->{icon};
        $entry->date($data->{date}) if $data->{date};

        $feed->add_entry($entry);
    }

    # Entries hold serialised strings, not nodes, so the tree can go now.
    # Explicit delete releases it even where parent/child links are strong
    # references.
    $tree->delete if $tree;

    $context->update->add($feed);
    return 1;
}
1;
__END__
=head1 NAME

Plagger::Plugin::CustomFeed::Config - Configurable way to create title and link only custom feeds

=head1 SYNOPSIS

  - module: Subscription::Config
    config:
      feed:
        - http://www.softantenna.com/index.html

  - module: CustomFeed::Config

=head1 DESCRIPTION

This plugin creates a custom feed from HTML pages.
Use it with the EntryFullText plugin to get the full content and an
accurate datetime for each article.

You can write a custom feed handler by putting C<.yaml> files
under the plugin's assets directory.

=head1 AUTHOR

Kazushi Tominaga

=head1 SEE ALSO

L<Plagger>

=cut
コメント