2008-01-07
■ [assets][CustomFeed-Script] Web::Scraper for CustomFeed::Script 栗田出版販売コミック刊行予定情報

このコードの最新版はCodeReposに置いてあります。
まんがの森コミックリストよりも情報が早いと噂の栗田出版販売コミック刊行予定情報をWeb::Scraper+Plaggerでenjoy!
assets/plugins/CustomFeed-Script/bookkurita-comicdb.pl
シンタックスハイライトするとなぜか表示されないので普通のスーパーpreで。見にくくてサーセン。
#!/usr/local/bin/perl
# author: SweetPotato
use strict;
use warnings;
use utf8;
use DateTime;
use Encode qw( decode );
use URI;
use Web::Scraper 0.22;
use YAML;
# Page that lists the upcoming comic releases.
my $url = 'http://www3.kuradashi-shinkan.com/kanko/comicdb.asp';

# Scraper for the listing table.  Every <tr> without an align attribute
# becomes one entry hash; its seven cells are read by position.  The
# top-level result is the list of entries itself.
my $s = scraper {
    process '//body/div/center/table/tr[not(@align)]',
        'entry[]' => scraper {
            # The first two cells (publisher and series) double as tags.
            process '/tr/td[position()=1 or position()=2]',
                'tags[]' => ['text', \&trim];
            process '/tr/td[1]', publisher => ['text', \&trim];
            process '/tr/td[2]', series    => ['text', \&trim];
            process '/tr/td[3]', title     => ['text', \&trim];
            process '/tr/td[4]', author    => ['text', \&trim];
            process '/tr/td[5]', price     => ['text', \&trim];
            # Cell 6 is "month/day-or-part": it feeds both the guessed
            # full date and the raw text after the slash.
            process '/tr/td[6]',
                date        => ['text', \&trim, \&mk_date],
                # Return the capture only when the match succeeded, so a
                # cell without a slash never picks up a stale $1.
                part_or_day => ['text', \&trim, sub { m!/(.*?)$! ? $1 : undef }];
            process '/tr/td[7]', isbn => ['text', \&trim];
            result qw( tags publisher series title author price date part_or_day isbn );
        };
    result qw( entry );
};
# Fetch the listing page, picking up proxy settings from the environment.
$s->user_agent->env_proxy;
my $res = $s->user_agent->get($url);
die "GET $url failed: " . $res->status_line
    unless $res->is_success;

# The response body is decoded as cp932 before scraping; a scrape that
# yields nothing is treated as an empty list of entries.
my $scraped = $s->scrape(decode('cp932', $res->content)) || [];
my @entry   = @{$scraped};

# Fold the detail fields into the body text, then drop them from the entry.
for my $item (@entry) {
    $item->{body} = mk_body($item);
    delete @{$item}{qw( publisher series price part_or_day isbn )};
}

# Emit the feed as YAML on STDOUT.
binmode STDOUT, ":utf8";
my %feed = (
    title => '栗田出版販売 コミック刊行予定情報',
    link  => $url,
    entry => \@entry,
);
print YAML::Dump(\%feed);
# Build a YYYY-MM-DD string from a "month/day" cell, guessing the year.
#
# The cell carries no year, so the same month/day is tried in the current,
# the previous and the next year, and the candidate closest to today
# (Asia/Tokyo) wins; on a tie the earlier one in that order is kept.
# The day part may be a digit string or free text, which part_to_day()
# maps to a day number.
#
# Returns nothing when the text has no slash, when month or day is not a
# one- or two-digit number, or when no candidate year gives a valid date,
# so that one odd row cannot abort the whole run.
sub mk_date {
    my ($text) = @_;
    return unless defined $text;

    my ($month, $day) = $text =~ m!(.*)/(.*)! or return;
    s/\s+//g for $month, $day;
    $day = part_to_day($day);

    # DateTime->set dies on non-numeric input, so reject it up front.
    for my $part ($month, $day) {
        return unless $part =~ /\A[0-9]{1,2}\z/;
    }

    my $today = DateTime->now(time_zone => 'Asia/Tokyo')->truncate(to => 'day');

    my ($closest, $smallest_gap);
    for my $offset (0, -1, 1) {
        # set() also dies for dates that do not exist in the given year
        # (for example 2/29 outside a leap year); skip those candidates.
        my $candidate = eval {
            $today->clone->set(
                year  => $today->year + $offset,
                month => $month,
                day   => $day,
            );
        };
        next unless defined $candidate;

        my $gap = abs($candidate->epoch - $today->epoch);
        if (!defined $smallest_gap || $gap < $smallest_gap) {
            ($closest, $smallest_gap) = ($candidate, $gap);
        }
    }

    return unless defined $closest;
    return $closest->ymd;
}
# Build the body text of one entry: a comma-separated summary of the row.
#
# $entry is the hash ref of one scraped row.  The body lists author,
# publisher, series, price and isbn, in that order.  When part_or_day
# holds text rather than a day number, it is put in front of them.
# A missing or empty part_or_day is left out, and any other missing field
# is rendered as an empty string, so no "uninitialized" warnings occur.
sub mk_body {
    my ($entry) = @_;

    my @fields = qw( author publisher series price isbn );

    my $part = $entry->{part_or_day};
    unshift @fields, 'part_or_day'
        if defined $part && length $part && $part !~ /^\d+$/;

    return join ', ', map { defined $entry->{$_} ? $entry->{$_} : '' } @fields;
}
# Map the day part of a release date to a day of the month.
#
# A string of digits is returned unchanged.  Any other text is taken as
# a part of the month: text containing 下 (as in 下旬, the last third of
# the month) gives 21, text containing 中 (as in 中旬, the middle third)
# gives 11, and anything else, including undef, falls back to 1.
#
# Works on a lexical copy so the caller's $_ is left untouched.
sub part_to_day {
    my ($part) = @_;
    $part = '' unless defined $part;

    return $part if $part =~ /^\d+$/;
    return 21    if $part =~ /下/;
    return 11    if $part =~ /中/;
    return 1;
}
# Filter helper: strips leading and trailing whitespace from $_ in place
# and returns the trimmed value.
sub trim {
    s/^\s*//;
    s/\s*$//;
    return $_;
}
config.yaml
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: 'script:/path/to/bookkurita-comicdb.pl'
  - module: CustomFeed::Script
  - module: Publish::iCal
    config:
      dir: /path/to/dir
      filename: kurita.ics
コメント
トラックバック - http://plagger.g.hatena.ne.jp/SweetPotato/20080107