SweetPotato::Plagger このページをアンテナに追加 RSSフィード

2008-01-07

Web::Scraper for CustomFeed::Script 栗田出版販売コミック刊行予定情報

このコードの最新版はCodeReposに置いてあります。

まんがの森コミックリストよりも情報が早いと噂の栗田出版販売コミック刊行予定情報をWeb::Scraper+Plaggerでenjoy!

assets/plugins/CustomFeed-Script/bookkurita-comicdb.pl

シンタックスハイライトするとなぜか表示されないので普通のスーパーpreで。見にくくてサーセン。

#!/usr/local/bin/perl
# author: SweetPotato
use strict;
use warnings;
use utf8;

use DateTime;
use Encode qw( decode );
use URI;
use Web::Scraper 0.22;
use YAML;

# Listing page to scrape; the same URL is reused as the feed's link.
my $url = 'http://www3.kuradashi-shinkan.com/kanko/comicdb.asp';

# Scraper for the listing table.  Every table row that carries no align
# attribute becomes one entry; its cells are read by position:
#   td[1] publisher, td[2] series, td[3] title, td[4] author,
#   td[5] price, td[6] release date ("month/day"), td[7] isbn.
# The first two cells are additionally collected as the entry's tags.
my $s = scraper {
    process '//body/div/center/table/tr[not(@align)]',
      'entry[]' => scraper {
        process '/tr/td[position()=1 or position()=2]',
          'tags[]' => ['text', \&trim];
        process '/tr/td[1]', publisher => ['text', \&trim];
        process '/tr/td[2]', series => ['text', \&trim];
        process '/tr/td[3]', title  => ['text', \&trim];
        process '/tr/td[4]', author => ['text', \&trim];
        process '/tr/td[5]', price  => ['text', \&trim];
        # The date cell feeds two fields: a full date guessed by mk_date and
        # the raw text after the slash (a day number or a part-of-month label).
        # The capture is returned only when the match succeeded; a failed
        # match would otherwise leak a stale $1 from an unrelated earlier match.
        process '/tr/td[6]',
          date => ['text', \&trim, \&mk_date],
          part_or_day => ['text', \&trim, sub { m!/(.*?)$! ? $1 : undef } ];
        process '/tr/td[7]', isbn => ['text', \&trim];
        result qw( tags publisher series title author price date part_or_day isbn );
    };
    result qw( entry );
};
# Let the scraper's user agent pick up proxy settings from the environment.
$s->user_agent->env_proxy;

# Fetch the listing page; give up with the HTTP status line on failure.
my $res = $s->user_agent->get($url);
die "GET $url failed: " . $res->status_line
    unless $res->is_success;

# The page body is decoded as cp932 before scraping.  A scrape that yields
# nothing is treated as an empty entry list.
my $rows  = $s->scrape(decode('cp932', $res->content)) || [];
my @entry = @{$rows};
foreach my $e (@entry) {
    $e->{body} = mk_body($e);

    # These fields are folded into the body text and dropped from the entry.
    delete @{$e}{qw( publisher series price part_or_day isbn )};
}

# Emit the feed description as YAML on STDOUT.
binmode STDOUT, ":utf8";
my %feed = (
    title => '栗田出版販売 コミック刊行予定情報',
    link  => $url,
    entry => \@entry,
);
print YAML::Dump(\%feed);

# Turn a "month/day" cell into a YYYY-MM-DD string, guessing the year.
#
# The cell carries no year, so the month/day is placed in the current year
# (Asia/Tokyo), and in the previous and next year; the candidate closest to
# today is returned.  The day part may be a number or a part-of-month label
# and is resolved through part_to_day().
#
# Returns nothing (undef in scalar context) when the text does not look like
# "month/day", when the month is not a plain one- or two-digit number, or when
# the date does not exist in the current year (for example 2/29 outside a leap
# year).  Without these guards DateTime would die and abort the whole feed.
sub mk_date {
    my ($text) = @_;
    return unless defined $text;

    my ($month, $day) = $text =~ m!(.*)/(.*)! or return;
    return unless $month =~ /\A[0-9]{1,2}\z/;
    $day = part_to_day($day);

    my $today = DateTime->now(time_zone => 'Asia/Tokyo')->truncate(to => 'day');

    # DateTime->set throws on an impossible month/day combination.
    local $@;
    my $this = eval { $today->clone->set(month => $month, day => $day) }
        or return;
    my $last = $this->clone->subtract(years => 1);
    my $next = $this->clone->add(years => 1);

    # Pair each candidate with its absolute distance from today, then order
    # the pairs by that distance (durations are compared relative to today).
    my @date = sort { DateTime::Duration->compare($a->[1], $b->[1], $today) }
               map { [$_->[0], $_->[1]->is_positive ? $_->[1] : $_->[1]->inverse ] }
               map { [$_, $today - $_] } ($this, $last, $next);

    return $date[0]->[0]->ymd;
}

# Build the entry body: a ", "-joined summary of the entry's fields.
#
# Takes the entry hash reference and returns the string
# "author, publisher, series, price, isbn".  When part_or_day holds something
# other than a plain day number (i.e. a part-of-month label), that label is put
# in front of the list.
#
# A missing or empty part_or_day is simply left out instead of producing a
# leading ", ", and undefined fields are rendered as empty strings so that no
# "uninitialized" warnings are raised.  The entry itself is not modified.
sub mk_body {
    my ($entry) = @_;

    my @fields = qw( author publisher series price isbn );

    my $part = $entry->{part_or_day};
    unshift @fields, 'part_or_day'
        if defined $part && length $part && $part !~ /^\d+$/;

    return join ', ', map { defined $entry->{$_} ? $entry->{$_} : '' } @fields;
}

# Map the day part of a "month/day" cell to a day-of-month number.
#
# A purely numeric value is returned unchanged.  Anything else is treated as a
# part-of-month label: text containing 下 gives 21, text containing 中 gives 11,
# and every other value (including undef or an empty string) gives 1.
#
# The argument is held in a lexical rather than assigned to $_, so the
# caller's $_ -- which trim() and the scraper filters operate on -- is left
# untouched.
sub part_to_day {
    my ($part) = @_;
    $part = '' unless defined $part;

    return $part if $part =~ /^\d+$/;
    return 21    if $part =~ /下/;
    return 11    if $part =~ /中/;
    return 1;
}

# Strip leading and trailing whitespace from $_ in place and return the result.
# Any arguments are ignored: the sub reads and rewrites the current $_.
sub trim {
    s/^\s+//;
    s/\s+$//;
    return $_;
}

config.yaml

# Plagger plugin chain for the scraper script: subscribe to it, load it as a
# feed, and publish the result as kurita.ics.
plugins:
  # Registers the scraper script as a feed through a script: URL.
  # Replace /path/to with the real location of bookkurita-comicdb.pl.
  - module: Subscription::Config
    config:
      feed:
        - url: 'script:/path/to/bookkurita-comicdb.pl'

  # Handles script: feeds -- presumably runs the script and reads the YAML it
  # prints on STDOUT; confirm against the plugin's documentation.
  - module: CustomFeed::Script

  # Writes the output file named below into dir (an iCalendar file, judging by
  # the plugin name and the .ics extension).
  - module: Publish::iCal
    config:
      dir: /path/to/dir
      filename: kurita.ics
トラックバック - http://plagger.g.hatena.ne.jp/SweetPotato/20080107