Seacolor Labs. このページをアンテナに追加 RSSフィード

2006-08-20

[] CustomFeed::Configをバージョンアップ 21:44  CustomFeed::Configをバージョンアップ - Seacolor Labs. を含むブックマーク はてなブックマーク -  CustomFeed::Configをバージョンアップ - Seacolor Labs.  CustomFeed::Configをバージョンアップ - Seacolor Labs. のブックマークコメント

 #plagger-jaで、d:id:woremacxさんよりCustomFeed::Configを譲り受けたので好き勝手に弄ってみました。

 具体的には、

 といった感じ。

Plagger::Plugin::CustomFeed::Config

package Plagger::Plugin::CustomFeed::Config;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use YAML;
use Encode;
use HTML::TokeParser;
use Plagger::UserAgent;
use Plagger::Util qw( decode_content extract_title );
use URI;
use URI::QueryParam;

# Plugin initialisation: run the parent class's init with the arguments we
# were given, then load the YAML recipes through load_plugins.
sub init {
    my($self, @args) = @_;

    $self->SUPER::init(@args);
    return $self->load_plugins;
}

# Load every "*.yaml" recipe that sits directly inside this plugin's assets
# directory, in sorted file-name order, by passing each one to load_plugin
# as (full path, file name).
#
# Directory entries that are not plain files are skipped.  A directory that
# cannot be opened is reported through Plagger->context->error.
sub load_plugins {
    my $self = shift;

    # File::Spec is used below; load it here instead of relying on another
    # module having pulled it in already.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh = DirHandle->new($dir) or Plagger->context->error("$dir: $!");

    for my $name (sort $dh->read) {
        # Check the name first so only candidate files are stat'ed.
        next unless $name =~ /\.yaml$/;

        my $path = File::Spec->catfile($dir, $name);
        next unless -f $path;

        $self->load_plugin($path, $name);
    }
}

# Read one YAML recipe file and append everything YAML::LoadFile returns
# for it to $self->{plugins}.  $base (the bare file name) is accepted but
# not used.
sub load_plugin {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my @recipes = YAML::LoadFile($file);
    push @{ $self->{plugins} }, @recipes;
}

# Hook registration: ask the context to call handle() for the
# 'customfeed.handle' hook.
sub register {
    my($self, $context) = @_;

    my @hooks = ('customfeed.handle', \&handle);
    $context->register_hook($self, @hooks);
}

# 'customfeed.handle' hook: find the first loaded recipe that applies to
# the feed URL and let aggregate() build the feed with it.
#
# A recipe applies when its "match" regex (default '.', i.e. anything)
# matches the URL case-insensitively, also matches anchored at the start of
# the URL, and the URL does not contain output=rss or output=atom.
#
# The chosen recipe is stored in $args->{plugin}.  Returns 1 when a recipe
# was used, nothing otherwise.
sub handle {
    my($self, $context, $args) = @_;

    for my $plugin (@{$self->{plugins}}) {
        my $match = $plugin->{match} || '.'; # anything
        my $url   = $args->{feed}->url;

        next unless $url =~ m/$match/i;
        next unless $url =~ m!^$match!;
        next if $url =~ /output=(?:rss|atom)/;

        $args->{plugin} = $plugin;
        $self->aggregate($context, $args);
        return 1;
    }
    return;
}

# Build a feed from the page at $args->{feed}->url using the recipe in
# $args->{plugin}, and add it to $context->update.
#
# Recipe keys read here:
#   fetch_before_hook, extract_before_hook, extract_after_hook
#       Perl source that is string-eval'ed inside this sub.
#   extract, extract_capture
#       Regex applied repeatedly to the page, and the whitespace-separated
#       names given to its capture groups (link, title, body, date are the
#       names used below).
#   extract_xpath
#       Hash of capture name => XPath expression; the first node found is
#       stored as HTML under that name.
#   extract_encoding
#       If set, feed title, entry title and body go through decode() with it.
#   extract_date_format, extract_date_timezone
#       strptime pattern(s) and time zone applied to the "date" capture;
#       without a format the date goes through Plagger::Date->parse_dwim.
#
# Returns 1 once the feed has been added.  Returns nothing when the fetch
# reports an error or HTML::TreeBuilder::XPath cannot be loaded.
#
# NOTE: the hooks are eval'ed as strings and can refer to this sub's
# lexicals by name ($url, $content, $feed, $data, ...), so those variables
# must keep their names.
# SECURITY: the hooks run arbitrary Perl taken from the recipe files; only
# install recipes from sources you trust.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = URI->new($args->{feed}->url);
    my $plugin = $args->{plugin};

    if ($plugin->{fetch_before_hook}) {
        eval $plugin->{fetch_before_hook};
        Plagger->context->error($@) if $@;
    }

    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        # Read the status from the same http_response object whose is_error
        # was just checked.
        $context->log(error => "GET $url failed: " . $res->http_response->code);
        return;
    }

    my $content = decode_content($res);
    my $title   = $self->conf->{title} || extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $title) : $title);
    $feed->link($url);

    if ($plugin->{extract_before_hook}) {
        eval $plugin->{extract_before_hook};
        Plagger->context->error($@) if $@;
    }

    # The XPath captures describe the page as a whole, not one regex match.
    # They are attached to every entry the regex produces, or yield a single
    # entry when the regex produces none.  Without this bookkeeping a recipe
    # with extract_xpath refilled $data on every pass and the loop below
    # never terminated.
    my $first_pass = 1;

    while (1) {
        my $data;
        my $matched = 0;

        # An empty pattern is special-cased by Perl (it reuses the last
        # successful regex), so only match when the recipe supplies one.
        my $extract;
        if (defined $plugin->{extract} && length $plugin->{extract}) {
            $extract = decode_content($plugin->{extract});
        }

        if (defined $extract && length $extract && $content =~ /$extract/sg) {
            $matched = 1;
            # Match the matched text again without /g to get the capture
            # groups as a list.
            if (my @match = $& =~ /$plugin->{extract}/s) {
                my @capture = split /\s+/, $plugin->{extract_capture};
                for my $m (@match) {
                    my $val = shift @capture;
                    # A name listed more than once accumulates its groups.
                    $data->{$val} = $data->{$val} . $m;
                }
            }
        }

        if ($plugin->{extract_xpath} && ($matched || $first_pass)) {
            eval { require HTML::TreeBuilder::XPath };
            if ($@) {
                Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
                return;
            }

            my $tree = HTML::TreeBuilder::XPath->new;
            $tree->parse($content);
            $tree->eof;

            for my $capture (keys %{$plugin->{extract_xpath}}) {
                my @children = $tree->findnodes($plugin->{extract_xpath}->{$capture});
                # An expression that selects nothing is skipped instead of
                # dying on an undefined node.
                next unless @children;
                $data->{$capture} = $children[0]->as_HTML;
            }

            # The captures are plain strings by now; release the tree.
            $tree->delete;
        }
        $first_pass = 0;

        unless ($data) {
            last;
        }

        if ($plugin->{extract_after_hook}) {
            eval $plugin->{extract_after_hook};
            Plagger->context->error($@) if $@;
        }

        if ($data->{date}) {
            if (my $format = $plugin->{extract_date_format}) {
                # Accept a single pattern or a list of patterns.
                $format = [ $format ] unless ref $format;
                $data->{date} = (map { Plagger::Date->strptime($_, $data->{date}) } @$format)[0];
                if ($data->{date} && $plugin->{extract_date_timezone}) {
                    $data->{date}->set_time_zone($plugin->{extract_date_timezone});
                }
            } else {
                $data->{date} = Plagger::Date->parse_dwim($data->{date});
            }
        }

        $context->log(info => "Add entry");

        my $entry = Plagger::Entry->new;
        $entry->id($data->{link});
        $entry->title($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{title}) : $data->{title});
        $entry->link($data->{link});
        $entry->date($data->{date}) if $data->{date};
        $entry->body($plugin->{extract_encoding} ? decode($plugin->{extract_encoding}, $data->{body}) : $data->{body}) if $data->{body};
        $feed->add_entry($entry);
    }

    $context->update->add($feed);

    return 1;
}

1;

__END__

=head1 NAME

Plagger::Plugin::CustomFeed::Config - Configurable way to create title and link only custom feeds

=head1 SYNOPSIS

  - module: Subscription::Config
    config:
      feed:
        - http://www.softantenna.com/index.html

  - module: CustomFeed::Config

=head1 DESCRIPTION

This plugin creates a custom feed off of HTML pages.
Use with EntryFullText plugin to get full content and accurate
datetime of articles.

You can write a custom feed handler by putting C<.yaml> files
under the plugin's assets directory.

=head1 AUTHOR

Kazushi Tominaga

=head1 SEE ALSO

L<Plagger>

=cut

2006-08-08

[] CustomFeed::Config対応に 22:34  CustomFeed::Config対応に - Seacolor Labs. を含むブックマーク はてなブックマーク -  CustomFeed::Config対応に - Seacolor Labs.  CustomFeed::Config対応に - Seacolor Labs. のブックマークコメント

 Subscription::SoftAntennaとSubscription::VersionUpInfoを、woremacxの日記「yaml で簡単なレシピを書ける CustomFeed プラグイン」用のレシピにしてみました。

 after_hookとencodeの設定ができなかったのでついでにCustomFeed::ConfigをHack。

Plagger::Plugin::CustomFeed::Config

package Plagger::Plugin::CustomFeed::Config;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use YAML;
use Encode;
use HTML::TokeParser;
use Plagger::UserAgent;
use Plagger::Util qw( decode_content extract_title );
use URI;
use URI::QueryParam;

# Set the plugin up: delegate to the parent class's init with the same
# arguments, then read the YAML recipes with load_plugins.
sub init {
    my($self, @rest) = @_;

    $self->SUPER::init(@rest);
    return $self->load_plugins;
}

# Scan this plugin's assets directory and pass every plain file whose name
# ends in ".yaml" to load_plugin as (full path, file name), in sorted
# file-name order.
#
# If the directory cannot be opened the failure is reported through
# Plagger->context->error.
sub load_plugins {
    my $self = shift;

    # catfile() below needs File::Spec; load it explicitly rather than
    # depending on some other module having loaded it.
    require File::Spec;

    my $dir = $self->assets_dir;
    my $dh = DirHandle->new($dir) or Plagger->context->error("$dir: $!");

    for my $entry (sort $dh->read) {
        # Name check first, so that only candidates are stat'ed.
        next unless $entry =~ /\.yaml$/;

        my $full_path = File::Spec->catfile($dir, $entry);
        next unless -f $full_path;

        $self->load_plugin($full_path, $entry);
    }
}

# Parse a single YAML recipe file and push whatever YAML::LoadFile returns
# onto $self->{plugins}.  The $base argument (bare file name) is unused.
sub load_plugin {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "loading $file");

    my @loaded = YAML::LoadFile($file);
    push @{ $self->{plugins} }, @loaded;
}

# Register handle() with the context as the callback for the
# 'customfeed.handle' hook.
sub register {
    my($self, $context) = @_;

    my $callback = \&handle;
    $context->register_hook($self, 'customfeed.handle', $callback);
}

# 'customfeed.handle' hook.  Walks the loaded recipes in order and uses the
# first one that applies to the feed URL.
#
# A recipe applies when its "match" regex (or '.' when it has none) matches
# the URL ignoring case, matches again anchored at the start of the URL,
# and the URL carries neither output=rss nor output=atom.
#
# Stores the recipe in $args->{plugin}, calls aggregate(), and returns 1.
# Returns nothing when no recipe applies.
sub handle {
    my($self, $context, $args) = @_;

    for my $plugin (@{$self->{plugins}}) {
        my $match    = $plugin->{match} || '.'; # anything
        my $feed_url = $args->{feed}->url;

        next unless $feed_url =~ m/$match/i;
        next unless $feed_url =~ m!^$match!;
        next if $feed_url =~ /output=(?:rss|atom)/;

        $args->{plugin} = $plugin;
        $self->aggregate($context, $args);
        return 1;
    }
    return;
}

# Build a feed from the page at $args->{feed}->url using the recipe in
# $args->{plugin}, and add it to $context->update.
#
# Recipe keys read here:
#   fetch_before_hook, extract_before_hook, extract_after_hook
#       Perl source that is string-eval'ed inside this sub.
#   extract, extract_capture
#       Regex applied repeatedly to the page, and the whitespace-separated
#       names given to its capture groups (link, title and body are the
#       names used for the entry).
#   extract_encode
#       If set, feed title, entry title and body go through decode() with it.
#
# Returns 1 once the feed has been added; returns nothing when the fetch
# reports an error.
#
# NOTE: the hooks are eval'ed as strings and can refer to this sub's
# lexicals by name ($url, $content, $feed, $data, ...), so those variables
# must keep their names.
# SECURITY: the hooks run arbitrary Perl taken from the recipe files; only
# install recipes from sources you trust.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = URI->new($args->{feed}->url);
    my $plugin = $args->{plugin};

    if ($plugin->{fetch_before_hook}) {
        eval $plugin->{fetch_before_hook};
        Plagger->context->error($@) if $@;
    }

    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        # Read the status from the same http_response object whose is_error
        # was just checked.
        $context->log(error => "GET $url failed: " . $res->http_response->code);
        return;
    }

    my $content = decode_content($res);
    my $title   = extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title($plugin->{extract_encode} ? decode($plugin->{extract_encode}, $title) : $title);
    $feed->link($url);

    if ($plugin->{extract_before_hook}) {
        eval $plugin->{extract_before_hook};
        Plagger->context->error($@) if $@;
    }

    # One entry per match of the recipe's regex.
    while ($content =~ /$plugin->{extract}/sg) {
        # Match the matched text again without /g to get the capture groups
        # as a list.
        if (my @match = $& =~ /$plugin->{extract}/s) {
            my @capture = split /\s+/, $plugin->{extract_capture};
            my $data;
            # Name the capture groups positionally.
            @{$data}{@capture} = @match;

            if ($plugin->{extract_after_hook}) {
                eval $plugin->{extract_after_hook};
                Plagger->context->error($@) if $@;
            }

            my $entry = Plagger::Entry->new;
            $entry->title($plugin->{extract_encode} ? decode($plugin->{extract_encode}, $data->{title}) : $data->{title});
            $entry->link($data->{link});
            if ($data->{body}) {
                $entry->body($plugin->{extract_encode} ? decode($plugin->{extract_encode}, $data->{body}) : $data->{body});
            }
            $feed->add_entry($entry);
        }
    }

    $context->update->add($feed);

    return 1;
}

1;

assets/plugins/CustomFeed-Config/SoftAntenna.yaml

# Recipe for the SoftAntenna top page, read by CustomFeed::Config.
# match: regex the feed URL is tested against.
match: http://www\.softantenna\.com/index\.html
# extract: regex applied repeatedly to the page, one entry per match.
extract: <li class=saitem><strong><a href="(https?://[^"]*)">(.*?)</a>[^\n]+</a>(.*?)</span><br>\n<span class=small>(.*?)</span>\n
# extract_capture: names for the capture groups above, in order.
extract_capture: link title1 title2 body
# extract_encode: encoding name passed to decode() for titles and bodies.
extract_encode: iso 2022-jp
# extract_after_hook: Perl code eval'ed for each match; joins the two title
# fragments into the "title" the entry is built from.
extract_after_hook: $data->{title} =  $data->{title1} . $data->{title2}

assets/plugins/CustomFeed-Config/VersionUpInfo.yaml

# Recipe for the VersionUp.info index page, read by CustomFeed::Config.
# match: regex the feed URL is tested against.
match: http://www2s\.biglobe\.ne\.jp/~takao777/versionupinfo/index\.html
# extract: regex applied repeatedly to the page, one entry per match.
# Group 5 (tt|em) exists only for the \5 back-reference.
extract: <li><a href="(.*?)" target="_blank">(.*?)</a>(.*?)<!--[^\n]*-->(.*?)<(tt|em)>(.*?)</\5>
# extract_capture: names for the capture groups above, in order; "null"
# receives the tag name from group 5 and is not used afterwards.
extract_capture: link title1 title2 body1 null body2
# extract_after_hook: Perl code eval'ed for each match; assembles the
# "title" and "body" the entry is built from.
extract_after_hook: |
  $data->{title} =  $data->{title1} . $data->{title2};
  $data->{body} =  $data->{body1} . $data->{body2};

2006-08-06

[] CustomFeed::Mixiコミュニティ新書き込み対応に 19:27  CustomFeed::Mixiをコミュニティ最新書き込み対応に - Seacolor Labs. を含むブックマーク はてなブックマーク -  CustomFeed::Mixiをコミュニティ最新書き込み対応に - Seacolor Labs.  CustomFeed::Mixiをコミュニティ最新書き込み対応に - Seacolor Labs. のブックマークコメント

 CustomFeed::MixiをHackしてコミュニティ最新書き込みを拾ってこれるようにしてみた。

 ただ、どうもやり方がスマートじゃないな~……動作に問題はないと思いますが。

 もっと巧いやり方を検討&募集中。


追記:06/08/08

 アンケートページの存在を忘れていたorz

 「動作に問題あるじゃん」すいませんすいませんすいませ(以下略

 WWW::Mixi自体がアンケートページに非対応であるため、とりあえず取得したページがアンケートページの場合はタイトルのみ取得するよう暫定対応。

 アンケートページの詳細取得はどう対応するか現在検討中

package Plagger::Plugin::CustomFeed::Mixi;
use strict;
use base qw( Plagger::Plugin );

use DateTime::Format::Strptime;
use Encode;
use WWW::Mixi;
use Time::HiRes;
use URI;

# Table of supported feed types, keyed by the names accepted in the
# feed_type configuration.  Per-type fields, as used by aggregate_feed:
#   start_url  - listing page fetched first; also used as the feed link
#   title      - feed title
#   get_list   - name of the $self->{mixi} method that parses the listing
#   get_detail - name of the $self->{mixi} method that fetches one item;
#                bodies are only fetched for types that define it
#   icon_re    - regex whose first capture on an entry link is the owner id
#                used for the icon lookup; no icon is fetched without it
our $MAP = {
    FriendDiary => {
        start_url  => 'http://mixi.jp/new_friend_diary.pl',
        title      => 'マイミク最新日記',
        get_list   => 'parse_new_friend_diary',
        get_detail => 'get_view_diary',
        icon_re    => qr/owner_id=(\d+)/,
    },
    # can't get icon
    Message => {
        start_url  => 'http://mixi.jp/list_message.pl',
        title      => 'ミクシィメッセージ受信箱',
        get_list   => 'parse_list_message',
        get_detail => 'get_view_message',
    },
    # can't get icon & body
    RecentComment => {
        start_url  => 'http://mixi.jp/list_comment.pl',
        title      => 'ミクシィ最近のコメント一覧',
        get_list   => 'parse_list_comment',
    },
    Log => {
        start_url => 'http://mixi.jp/show_log.pl',
        title     => 'ミクシィ足跡',
        get_list => 'parse_show_log',
        icon_re => qr/[^_]id=(\d+)/,
    }, 
    # Community posts: get_detail only has to be defined so that body
    # fetching is enabled; get_meth picks the get_detail_* method that
    # matches the kind of page the link points to.
    Bbs => {
        start_url  => 'http://mixi.jp/new_bbs.pl',
        title      => 'コミュニティ最新書き込み',
        get_list   => 'parse_new_bbs',
        get_detail => 'dummy',
        get_detail_bbs => 'get_view_bbs',
        get_detail_event => 'get_view_event',
        get_detail_enquete => 'get_view_enquete',
    },
};

# Identifier for this plugin instance: the class id and the configured
# email address joined by a hyphen.
sub plugin_id {
    my($self) = @_;

    my $class_id = $self->class_id;
    my $email    = $self->conf->{email};

    return $class_id . '-' . $email;
}

# Hook registration: ask the context to call load() for the
# 'subscription.load' hook.
sub register {
    my($self, $context) = @_;

    my @hooks = ('subscription.load', \&load);
    $context->register_hook($self, @hooks);
}

# 'subscription.load' hook: create the WWW::Mixi client in $self->{mixi},
# give it this plugin's cookie jar, and add a feed to the subscription
# whose aggregator calls $self->aggregate.
sub load {
    my($self, $context) = @_;

    my $jar = $self->cookie_jar;
    unless (ref($jar) eq 'HTTP::Cookies') {
        # using foreign cookies = don't have to set email/password. Fake them
        my @placeholders = (
            [ email    => 'plagger@localhost' ],
            [ password => 'pl4gg5r' ],
        );
        for my $pair (@placeholders) {
            my($key, $value) = @$pair;
            $self->conf->{$key} ||= $value;
        }
    }

    $self->{mixi} = WWW::Mixi->new(map { $self->conf->{$_} } qw( email password ));
    $self->{mixi}->cookie_jar($jar);

    my $subscription_feed = Plagger::Feed->new;
    $subscription_feed->aggregator(sub { $self->aggregate(@_) });
    $context->subscription->add($subscription_feed);
}

# Aggregator callback: build one feed per configured feed_type (only
# 'FriendDiary' when none is configured).  A type that has no entry in
# $MAP is reported through $context->error.
sub aggregate {
    my($self, $context, $args) = @_;

    my $types = $self->conf->{feed_type} || ['FriendDiary'];
    foreach my $type (@$types) {
        if (!$MAP->{$type}) {
            $context->error("$type not found");
        }
        $self->aggregate_feed($context, $type, $args);
    }
}
# Build the feed for one $type (a key of $MAP) and add it to
# $context->update.
#
# Fetches the type's start_url through $self->{mixi}, logging in first when
# the response contains the login form, parses the listing with the type's
# get_list method, and turns at most fetch_items (default 20) messages into
# entries.  Depending on the show_icon and fetch_body settings it also
# fetches the author's icon and the item body, both through $self->cache
# with a '12 hours' argument.
#
# Returns nothing when the login fails.
sub aggregate_feed {
    my($self, $context, $type, $args) = @_;

    my $start_url = $MAP->{$type}->{start_url};
    my $response  = $self->{mixi}->get($start_url);

    # Path of the listing page, posted as next_url with the login form.
    my $next_url = URI->new($start_url)->path;

    # The login form in the response means there is no usable session yet.
    if ($response->content =~ /action=login\.pl/) {
        $context->log(debug => "Cookie not found. Logging in");

        # 'plagger@localhost' is the placeholder set by load(); the login is
        # still attempted after logging the error.
        if ($self->conf->{email} eq 'plagger@localhost') {
            $context->log(error => 'email/password should be set to login');
        }

        $response = $self->{mixi}->post("http://mixi.jp/login.pl", {
            next_url => $next_url,
            email    => $self->conf->{email},
            password => $self->conf->{password},
            sticky   => 'on',
        });
        # Getting the login form again counts as a failed login.
        if (!$response->is_success || $response->content =~ /action=login\.pl/) {
            $context->log(error => "Login failed.");
            return;
        }

        # meta refresh, ugh!
        if ($response->content =~ m!"0;url=(.*?)"!) {
            $response = $self->{mixi}->get($1);
        }
    }

    my $feed = Plagger::Feed->new;
    $feed->type('mixi');
    $feed->title($MAP->{$type}->{title});
    $feed->link($MAP->{$type}->{start_url});

    # Timestamp layout used for both the listing and the detail times.
    my $format = DateTime::Format::Strptime->new(pattern => '%Y/%m/%d %H:%M');

    # Parse the listing with the type's get_list method.
    my $meth = $MAP->{$type}->{get_list};
    my @msgs = $self->{mixi}->$meth($response);
    my $items = $self->conf->{fetch_items} || 20;
    $self->log(info => 'fetch ' . scalar(@msgs) . ' entries');

    my $i = 0;
    # Set once a body fetch fails; stops further icon and body fetches.
    my $blocked = 0;
    for my $msg (@msgs) {
        next if $type eq 'FriendDiary' and not $msg->{image}; # external blog
        last if $i++ >= $items;

        # Listing strings are decoded from EUC-JP.
        my $entry = Plagger::Entry->new;
        $entry->title( decode('euc-jp', $msg->{subject}) );
        $entry->link($msg->{link});
        $entry->author( decode('euc-jp', $msg->{name}) );
        $entry->date( Plagger::Date->parse($format, $msg->{time}) );

        # Icon lookup: only for types with an icon_re.
        if ($self->conf->{show_icon} && !$blocked && defined $MAP->{$type}->{icon_re}) {
            my $owner_id = ($msg->{link} =~ $MAP->{$type}->{icon_re})[0];
            my $link = "http://mixi.jp/show_friend.pl?id=$owner_id";
            $context->log(info => "Fetch icon from $link");

            # The profile outline is cached per owner id; the callback
            # sleeps before each fetch (fetch_body_interval, default 1.5).
            my $item = $self->cache->get_callback(
                "outline-$owner_id",
                sub {
                    Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
                    my($item) = $self->{mixi}->get_show_friend_outline($link);
                    $item;
                },
                '12 hours',
            );
            if ($item && $item->{image} !~ /no_photo/) {
                # prefer smaller image
                my $image = $item->{image};
                   $image =~ s/\.jpg$/s.jpg/;
                $entry->icon({
                    title => decode('euc-jp', $item->{name}),
                    url   => $image,
                    link  => $link,
                });
            }
        }

        # Body fetch: only for view_* links of types that define get_detail.
        if ($self->conf->{fetch_body} && !$blocked && $msg->{link} =~ /view_/ && defined $MAP->{$type}->{get_detail}) {
            # view_enquete pages are skipped, so those entries keep only the
            # listing data.
            if (not $msg->{link} =~ /view_enquete/) {
                $context->log(info => "Fetch body from $msg->{link}");
                # The detail is cached per link; the callback sleeps before
                # each fetch (fetch_body_interval, default 1.5).
                my $item = $self->cache->get_callback(
                    "item-$msg->{link}",
                    sub {
                        Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
                        # Pick the detail method that matches the link.
                        my $meth = get_meth($MAP->{$type}, $msg->{link});
                        my($item) = $self->{mixi}->$meth($msg->{link});
                        $item;
                    },
                    '12 hours',
                );
                if ($item) {
                    my $body = decode('euc-jp', $item->{description});
                       # Turn line breaks into <br /> for the HTML body.
                       $body =~ s!(\r\n?|\n)!<br />!g;
                    for my $image (@{ $item->{images} }) {
                        # xxx this should be $entry->enclosures
                        $body .= qq(<div><a href="$image->{link}"><img src="$image->{thumb_link}" style="border:0" /></a></div>);
                    }
                    $entry->body($body);
    
                    # The detail's time replaces the listing time.
                    $entry->date( Plagger::Date->parse($format, $item->{time}) );
                } else {
                    $context->log(warn => "Fetch body failed. You might be blocked?");
                    $blocked++;
                }
            }
        }

        $feed->add_entry($entry);
    }

    $context->update->add($feed);
}
# Pick the detail-fetching method name for a link.
#
# $type is one entry of $MAP and $link is the item's URL.  Links containing
# view_bbs, view_event or view_enquete select get_detail_bbs,
# get_detail_event or get_detail_enquete respectively (checked in that
# order); any other link selects get_detail.
sub get_meth {
    my($type, $link) = @_;

    my @rules = (
        [ qr/view_bbs/,     'get_detail_bbs'     ],
        [ qr/view_event/,   'get_detail_event'   ],
        [ qr/view_enquete/, 'get_detail_enquete' ],
    );

    for my $rule (@rules) {
        my($pattern, $key) = @$rule;
        return $type->{$key} if $link =~ $pattern;
    }

    return $type->{get_detail};
}

1;

2006-06-19

[][][] 製作予定のPlagger::Plugins 22:38  製作予定のPlagger::Plugins - Seacolor Labs. を含むブックマーク はてなブックマーク -  製作予定のPlagger::Plugins - Seacolor Labs.  製作予定のPlagger::Plugins - Seacolor Labs. のブックマークコメント

自分を鼓舞する意味も込めて製作予定のプラグインを書いておく。

・CustomFeed::VersionUpInfo
 OnlineSoft VersionUp.infoの新着情報を取得。

・Subscription::Spurl
 Spurl!APIを通してフィードを取得。

・Publish::Spurl
 Spurl!APIを通してフィード更新

Spurlプラグインは非常に有用そうなので是非欲しいところ。

頑張れ自分。

2006-06-15

[] CustomFeed::SoftAntenna作ってみました 13:14  CustomFeed::SoftAntenna作ってみました - Seacolor Labs. を含むブックマーク はてなブックマーク -  CustomFeed::SoftAntenna作ってみました - Seacolor Labs.  CustomFeed::SoftAntenna作ってみました - Seacolor Labs. のブックマークコメント

ソフトアンテナ更新情報を取得。

CustomFeedでやるのが正しいかとか以前に「それPlaggerでやらなくてもいいんじゃね?」という声が聞こえてきそうですがそれはまあ気にしない方向でひとつ。

package Plagger::Plugin::CustomFeed::SoftAntenna;
use strict;
use base qw( Plagger::Plugin );

use Plagger::UserAgent;
use Plagger::Util;
use Encode;
use URI;
use URI::QueryParam;

# Register handle() with the context as the callback for the
# 'customfeed.handle' hook.
sub register {
    my($self, $context) = @_;

    my %hook = ('customfeed.handle' => \&handle);
    $context->register_hook($self, %hook);
}

# 'customfeed.handle' hook: take over feeds whose URL starts with
# http://www.softantenna.com/ by running aggregate() on them.
# Returns 1 when the feed was handled, nothing otherwise.
sub handle {
    my($self, $context, $args) = @_;

    my $is_softantenna = $args->{feed}->url =~ m!^http://www\.softantenna\.com/!;
    return unless $is_softantenna;

    $self->aggregate($context, $args);
    return 1;
}

# Fetch the SoftAntenna page at $args->{feed}->url, turn every item that
# matches the listing regex into an entry (link, title, body), and add the
# resulting feed to $context->update.
#
# The page title and each item's title and body are passed through
# decode() with 'iso 2022-jp'.
#
# Returns nothing when the fetch reports an error; otherwise returns
# whatever $context->update->add returns.
sub aggregate {
    my($self, $context, $args) = @_;

    my $url = URI->new($args->{feed}->url);

    $context->log(info => "GET $url");

    my $agent = Plagger::UserAgent->new;
    my $res = $agent->fetch($url, $self);

    if ($res->http_response->is_error) {
        # Take the status line from the same http_response object whose
        # is_error was just checked.
        $context->log(error => "GET $url failed: " . $res->http_response->status_line);
        return;
    }

    my $content = Plagger::Util::decode_content($res);
    my $title   = Plagger::Util::extract_title($content);

    my $feed = Plagger::Feed->new;
    $feed->title(decode('iso 2022-jp', $title));
    $feed->link($args->{feed}->url);

    # One entry per listed item: $1 = link, $2 = title, $3 = description.
    while ($content =~ m!<li class=saitem><strong><a href="(https?://[^"]*)">(.*?)</a>[^\n]+\n<span class=small>(.*?)</span>\n!g) {
        # Named differently from the page's $title above to avoid shadowing.
        my($link, $entry_title, $body) = ($1, $2, $3);

        my $entry = Plagger::Entry->new;
        $entry->title(decode('iso 2022-jp', $entry_title));
        $entry->link($link);
        $entry->body(decode('iso 2022-jp', $body));

        $feed->add_entry($entry);
    }

    $context->update->add($feed);
}

1;

__END__

=head1 NAME

Plagger::Plugin::CustomFeed::SoftAntenna - Create SoftAntenna feed

=head1 SYNOPSIS

  - module: Subscription::Config
    config:
      feed:
        - http://www.softantenna.com/index.html

  - module: CustomFeed::SoftAntenna

=head1 DESCRIPTION

This plugin creates a feed off of SoftAntenna HTML pages.

=head1 AUTHOR

Kazushi Tominaga

=head1 SEE ALSO

L<Plagger>

=cut