Seacolor Labs. このページをアンテナに追加 RSSフィード

2006-09-21

[] CustomFeed::Config修正 15:33  CustomFeed::Config修正 - Seacolor Labs. を含むブックマーク はてなブックマーク -  CustomFeed::Config修正 - Seacolor Labs.  CustomFeed::Config修正 - Seacolor Labs. のブックマークコメント

 ひさびさにCustomFeed::Configをバージョンアップ

 具体的な内容は下記の通り。

 ゆくゆくはEntryFullTextと統合する予定との事なので、基本的にEntryFullTextに合わせる形で修正していきます。

Plagger::Plugin::CustomFeed::Config

package Plagger::Plugin::CustomFeed::Config;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use YAML;
use Encode;
use HTML::TokeParser;
use HTML::ResolveLink;
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content extract_title );
use Plagger::UserAgent;
use URI;
use URI::QueryParam;

# Plugin initialisation: run the parent class's init with the same
# arguments, then read the YAML definitions through load_plugins.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);

    return $self->load_plugins;
}

# Scan the plugin's assets directory and hand every regular file whose
# name ends in ".yaml" to load_plugin, in sorted file-name order.
#
# load_plugin is called with two arguments: the full path of the file and
# its bare file name.
sub load_plugins {
    my $self = shift;

    # File::Spec is not among this module's "use" lines, so load it here
    # instead of relying on some other module having pulled it in already.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh = DirHandle->new($dir) or Plagger->context->error("$dir: $!");

    # Pair each directory entry with its full path: [ path, file name ].
    my @candidates = map { [ File::Spec->catfile($dir, $_), $_ ] } sort $dh->read;

    # Keep plain files only; a directory that happens to be called
    # "something.yaml" is skipped by the -f test.
    for my $file (grep { -f $_->[0] && $_->[1] =~ /\.yaml$/ } @candidates) {
        $self->load_plugin(@$file);
    }
}

# Parse one YAML definition file and append everything YAML::LoadFile
# returns for it to $self->{plugins}.
#
#   $file - path of the YAML file to read
#   $base - bare file name; accepted from the caller but not used here
sub load_plugin {
    my ($self, $file, $base) = @_;

    my $logger = Plagger->context;
    $logger->log(debug => "loading $file");

    my @definitions = YAML::LoadFile($file);
    push @{ $self->{plugins} }, @definitions;
}

# Subscribe this plugin's handle() to the 'customfeed.handle' hook on the
# given context.
sub register {
    my ($self, $context) = @_;

    my @hooks = ('customfeed.handle' => \&handle);

    $context->register_hook($self, @hooks);
}

# Hook callback for 'customfeed.handle'.
#
# Walks the loaded YAML definitions in order and lets the first one whose
# "match" pattern fits the feed URL build the feed via aggregate().
#
#   $args->{feed} - object whose url method gives the address to test
#
# Returns 1 once a definition has handled the feed (the chosen definition
# is stored in $args->{plugin} before aggregate is called); returns
# nothing when no definition applies.
sub handle {
    my($self, $context, $args) = @_;

    # Read the URL once instead of calling the accessor for every test.
    my $url = $args->{feed}->url;

    for my $plugin (@{$self->{plugins}}) {
        my $match = $plugin->{match} || '.'; # anything

        # Cheap case-insensitive pre-check anywhere in the URL ...
        next unless $url =~ m/$match/i;
        # ... then the pattern has to match case-sensitively at the start.
        next unless $url =~ m!^$match!;
        # URLs that ask for RSS/Atom output are never handled here.
        next if $url =~ /output=(?:rss|atom)/;

        $args->{plugin} = $plugin;
        $self->aggregate($context, $args);
        return 1;
    }
    return;
}

# Replace each argument, in place, with the result of passing it through
# Plagger::Util::encode_xml. Works through the @_ aliases, so the
# caller's own variables are modified.
sub xml_escape {
    foreach my $index (0 .. $#_) {
        $_[$index] = Plagger::Util::encode_xml($_[$index]);
    }
}

# Build a feed from the HTML page behind $args->{feed}->url using the YAML
# definition in $args->{plugin}, and add it to $context->update.
#
# Definition keys read here:
#   fetch_before_hook / extract_before_hook / extract_after_hook
#                         - Perl source run with string eval
#   extract               - regular expression applied repeatedly to the page
#   extract_capture       - whitespace separated field names, one per
#                           capture group of "extract"
#   extract_xpath         - hash of field name => XPath expression
#   extract_encoding      - encoding used to decode title and body values
#   extract_date_format   - strptime pattern, or a list of patterns
#   extract_date_timezone - time zone set on a date parsed with a pattern
#
# Entry fields taken from the extracted data: link (also used as the
# entry id), title, body, icon and date.
#
# Returns 1 after the feed has been added; returns nothing when the fetch
# is reported as failed or HTML::TreeBuilder::XPath cannot be loaded.
#
# NOTE: the lexical variables of this sub ($url, $plugin, $content, $data,
# %nodes, ...) are visible to the hook code because it runs through string
# eval, so their names must stay as they are.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = URI->new($args->{feed}->url);
    my $plugin = $args->{plugin};

    # SECURITY: every *_hook value is executed as Perl code. Only install
    # YAML definitions from sources you trust.
    if ($plugin->{fetch_before_hook}) {
        eval $plugin->{fetch_before_hook};
        Plagger->context->error($@) if $@;
    }

    my $agent = Plagger::UserAgent->new;
    my $res = $agent->fetch($url, $self, { NoNetwork => 60 * 60 * 3 } );
    # NOTE(review): the failure test requires BOTH a false status and
    # is_error -- confirm against the response class that this is intended.
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch $url failed: " . $res->status_code);
        return;
    }

    my $content = decode_content($res);
    my $title   = $self->conf->{title} || extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $title) : $title);
    $feed->link($url);

    my $prev_pos = 0;
    my $cur_pos = 0;
    my %nodes = ();

    if ($plugin->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($content);
        $tree->eof;

        # Collect, per field, the list of matching nodes. The main loop
        # below consumes one node per field for every entry.
        for my $capture (keys %{$plugin->{extract_xpath}}) {
            # Plain hash element access: the former %nodes->{...} form
            # (hash used as a reference) is a fatal error on perl 5.22+.
            $nodes{$capture} = [ $tree->findnodes($plugin->{extract_xpath}->{$capture}) ];
            unless (@{ $nodes{$capture} }) {
                Plagger->context->log(error => "Can't find node matching $plugin->{extract_xpath}->{$capture}");
            }
        }
    }

    # One iteration per entry; the loop stops as soon as neither the
    # regular expression nor the XPath node lists yield any data.
    while (1) {
        my $data;

        if ($plugin->{extract_before_hook}) {
            eval $plugin->{extract_before_hook};
            Plagger->context->error($@) if $@;
        }

        if ($plugin->{extract}) {
            my $extract = decode_content($plugin->{extract});
            # The scalar //g match only advances pos($content) to the end
            # of the next occurrence ...
            if ($content =~ /$extract/sg) {
                $cur_pos = pos $content;
                # ... the captures are then taken by matching again on the
                # text that starts where the previous entry ended. The same
                # decoded pattern is used for both matches so that they
                # cannot disagree.
                my $str = substr($content, $prev_pos);
                if (my @match = $str =~ /$extract/s) {
                    my @capture = split /\s+/, $plugin->{extract_capture};
                    for my $m (@match) {
                        my $val = shift @capture;
                        $data->{$val} .= $m;
                    }
                }
                $prev_pos = $cur_pos;
            }
        }

        if (%nodes) {
            for my $capture (keys %{$plugin->{extract_xpath}}) {
                # Swap in this module's xml_escape for HTML::Element's
                # escaping routine while the node is serialised.
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                my $children = shift @{ $nodes{$capture} };
                if ($children) {
                    $data->{$capture} = $children->as_XML;
                }
            }
        }

        unless ($data) {
            last;
        }

        if ($plugin->{extract_after_hook}) {
            eval $plugin->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $plugin->{extract_date_format}) {
                # Accept a single pattern as well as a list of patterns;
                # the first value produced by the patterns is used.
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $plugin->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($plugin->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        $context->log(info => "Add entry");

        my $entry = Plagger::Entry->new;
        $entry->id($data->{link});
        $entry->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{title}) : $data->{title});
        $entry->link($data->{link});
        $entry->body($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{body}) : $data->{body}) if $data->{body};
        $entry->icon({ url => $data->{icon} }) if $data->{icon};
        $entry->date($data->{date}) if $data->{date};
        $feed->add_entry($entry);
    }

    $context->update->add($feed);

    return 1;
}

1;

__END__

=head1 NAME

Plagger::Plugin::CustomFeed::Config - Configurable way to create title and link only custom feeds

=head1 SYNOPSIS

  - module: Subscription::Config
    config:
      feed:
        - http://www.softantenna.com/index.html

  - module: CustomFeed::Config

=head1 DESCRIPTION

This plugin creates a custom feed off of HTML pages.
Use with EntryFullText plugin to get full content and accurate
datetime of articles.

You can write a custom feed handler by putting C<.yaml> files
under the plugin's assets directory.

=head1 AUTHOR

Kazushi Tominaga

=head1 SEE ALSO

L<Plagger>

=cut