2007-10-28
■ [CustomFeed-Script][EFT] Web::Scraper and EFT 無表情

Web::Scraperを書くのがだるくなったのでCustomFeed::Simple+EFTでシンプルにハックすることにした。
今話題のニュースサイト,漫画(マンガ)情報サイト《無表情》をPlaggerでenjoy!
assets/plugins/CustomFeed-Script/muhyojo.pl
#!/usr/bin/perl
# author: SweetPotato
#
# Feed script for Plagger's CustomFeed::Script plugin.
# Fetches the front page at $url, scrapes the news items out of it and
# prints a YAML document (title / link / entry list) on STDOUT.
use strict;
use warnings;
use utf8;
use DateTime;
use DateTime::Duration;
use Encode;
use URI;
use Web::Scraper 0.22;
use YAML;

my $url = 'http://muhyojo.web0.jp/';

# Outer scraper: one 'info' record per matching table cell.
# Inner scraper: the category, plus parallel lists of tags, titles, links
# and dates found inside that cell.
my $s = scraper {
    process '//table[@width="100%" and not(@bgcolor)]/tbody/tr/td[.//div[@align="right"]]',
        'info[]' => scraper {
            process '//b', 'category' => 'text';
            process '//font[@color="red" or @color="orange"]', 'tag[]' => 'text';
            process '//a[not(@class) and not(./img)]',
                'title[]' => 'text',
                # Resolve relative hrefs against the page URL.
                'link[]'  => [ '@href', sub { URI->new_abs($_, $url)->as_string } ];
            process '//font[@color="#696969"]',
                'date[]' => [ 'text', sub { mk_date($_) } ];
            result qw/category tag title link date/;
        };
    result qw/info/;
};

my $res = $s->user_agent->get($url);
unless ($res->is_success) {
    die "GET $url failed: " . $res->status_line;
}

# The response body is decoded as Shift_JIS before scraping.
my @entry;
for my $info (@{ $s->scrape(Encode::decode('shiftjis', $res->content)) || [] }) {
    my $titles = $info->{title} || [];

    # NOTE(review): title/link/date/tag are paired purely by index; this
    # assumes the page lists them in parallel -- confirm against the markup.
    for my $i (0 .. $#{$titles}) {
        push @entry, +{
            title => $titles->[$i],
            link  => $info->{link}[$i],
            date  => $info->{date}[$i],
            # Drop undefined values so a missing category, or fewer tags
            # than titles, does not put undef into the tag list.
            tags  => [ grep { defined } $info->{category}, $info->{tag}[$i] ],
        };
    }
}

binmode STDOUT, ':utf8';
my $feed_yaml = YAML::Dump({
    title => '漫画(マンガ)情報サイト《無表情》',
    link  => $url,
    entry => \@entry,
});
print $feed_yaml;

# guess year
#
# mk_date($text)
#   Takes text containing a two-digit "MM/DD" and returns 'YYYY-MM-DD',
#   choosing among last year, this year and next year the date that lies
#   closest to today.
#   Returns nothing (undef / empty list) when $text is undefined, has no
#   "MM/DD" in it, or the month/day is not a valid date in any of the three
#   candidate years.
sub mk_date {
    my $md = shift;
    return unless defined $md;

    my ($month, $day) = ($md =~ /(\d{2})\/(\d{2})/) or return;

    my $today     = DateTime->now->truncate(to => 'day');
    my $this_year = $today->year;

    my @candidates;
    for my $year ($this_year - 1, $this_year, $this_year + 1) {
        # DateTime->new dies on impossible dates (02/29 outside a leap year,
        # month 13, ...); skip that year instead of aborting the script.
        my $date = eval {
            DateTime->new(
                year      => $year,
                month     => $month,
                day       => $day,
                time_zone => $today->time_zone,
            );
        } or next;

        # delta_days always yields a non-negative whole-day distance.
        push @candidates, [ $date, $today->delta_days($date)->delta_days ];
    }
    return unless @candidates;

    my ($closest) = sort { $a->[1] <=> $b->[1] } @candidates;
    return $closest->[0]->strftime('%Y-%m-%d');
}
assets/plugins/Entry-FullText/muhyojo.pl
# author: SweetPotato

# handle($self, $args)
#   True when the entry's link is a muhyojo.web0.jp publisher page URL
#   ending in a numeric "#fragment"; the match result is returned as-is.
sub handle {
    my ($self, $args) = @_;

    my $link = $args->{entry}->link;
    return $link =~ qr!^http://muhyojo\.web0\.jp/publisher/[-\w]+/[-\w]+/(index\.html?)?#\d+$!;
}

# extract($self, $args)
#   Uses the numeric fragment of the entry's link as an anchor name and
#   returns the part of $args->{content} that starts at the <legend> holding
#   <a name="N"> and runs up to, but not including, the next </fieldset>.
#   Returns nothing when the link has no numeric fragment or the anchor is
#   not found in the content.
sub extract {
    my ($self, $args) = @_;

    my $link = $args->{entry}->link;
    return unless $link =~ /#(\d+)$/;
    my $name = $1;

    return unless $args->{content} =~ m!(<legend[^>]*><a name="?$name"?>.*?)</fieldset>!s;
    my $fragment = $1;

    return $fragment;
}
config.yaml
# Plagger configuration: subscribe to the output of the muhyojo.pl feed
# script, then run the listed filter plugins on its entries.
plugins:
  - module: Subscription::Config
    config:
      feed:
        # /path/to is a placeholder; point it at the installed muhyojo.pl.
        - url: script:/path/to/muhyojo.pl
  - module: CustomFeed::Script
  - module: Filter::EntryFullText
  - module: Filter::ResolveRelativeLink
  - module: Filter::ForceTimeZone
コメント
トラックバック - http://plagger.g.hatena.ne.jp/SweetPotato/20071028