|
|
||
需要があったので書いてみました。
※7/8更新 日付に全角数字が含まれる場合に対応
#!/usr/bin/perl -w
# Scrape the lunch menu table at the URL below and print it to STDOUT as a
# YAML document with the keys title, link and entries (one entry per dated
# menu row, each with a title and a date).
use strict;
use URI;    # URI->new is called below, so load it explicitly
use HTTP::Cookies;
use LWP::UserAgent;
use HTTP::Request::Common qw (POST);
use YAML::Syck;
use Web::Scraper;
use utf8;
use DateTime::Format::Japanese;
use Encode;

my $uri = URI->new("http://www.teraken.co.jp/menu/lunchmenu/index.html");

# Each table row yields a hash with the right-aligned cell as 'date' and the
# left-aligned cell as 'text'.
my $s = scraper {
    process "table tr", "list[]" => scraper {
        process 'div[align="right"]', "date" => 'TEXT';
        process 'div[align="left"]',  "text" => 'TEXT';
        result 'date', 'text';
    };
    result 'list';
};

my $scr = $s->scrape($uri);

my $feed = {
    title => "sakura suisan",
    link  => $uri->as_string,
};

# The page only carries month and day, so the year is taken from the clock
# instead of being hard-coded.
my $year = (localtime)[5] + 1900;

for my $menu (@{ $scr || [] }) {

    # Rows without a date cell (headers, spacers) are not menu entries.
    next unless defined $menu->{date};

    # No leading ".*" here: a greedy prefix would swallow the first digit of
    # a two-digit month ("12月" would be captured as "2").
    next unless $menu->{date} =~ /([0-9０-９]+)月([0-9０-９]+)日.曜日/;
    my ($month, $day) = ($1, $2);

    # Dates on the page may use full-width digits; fold them to ASCII.
    tr/０-９/0-9/ for $month, $day;

    push @{ $feed->{entries} }, {
        title => $menu->{text},
        date  => sprintf('%04d-%02d-%02d', $year, $month, $day),
    };
}

print Dump($feed);
使用例。
global:
  plugin_path:
    - lib/Plagger/Plugin
  assets_path: assets
  timezone: Asia/Tokyo
  log:
    level: info
plugins:
  - module: CustomFeed::Script
  - module: Subscription::Config
    config:
      feed:
        - script:assets/plugins/CustomFeed-Script/sakura.pl
  - module: Aggregator::Simple
  - module: Publish::iCal
    config:
      dir: .
      filename: sakuranch.ics
Champ2011/11/24 18:28Umm, are you really just gvinig this info out for nothing?
xpnosvwbc2011/11/28 22:11NXBeNP , [url=http://sbwckdiuibzc.com/]sbwckdiuibzc[/url], [link=http://noxsnoymfdfw.com/]noxsnoymfdfw[/link], http://mwphegoqmyyh.com/
llgynfedztp2011/12/01 17:21ZOwEAO <a href="http://aoiqjphicmak.com/">aoiqjphicmak</a>
mvywaxbznkk2011/12/06 00:09Tjmh7a , [url=http://igfizfiefnjb.com/]igfizfiefnjb[/url], [link=http://bgygbfwyktpf.com/]bgygbfwyktpf[/link], http://ncvauuxamkwg.com/
Plaggerのはてなグループ日記でよく見かけるFilter-EntryFullTextやCustomFeed-Configのassetsをエディタ開いてコピペするのがめんどくさいんでこんなモノを書いてみました。
package Plagger::Plugin::Filter::WriteSource;

# Plagger filter plugin that saves the contents of every <pre> element of an
# entry body to a file.  The file path is taken from the text of the closest
# preceding <h4> element and is created below
# <conf dir>/<feed id_safe>/<entry id_safe>/.

use strict;
use warnings;
use base qw( Plagger::Plugin );

use Cwd ();       # Cwd::cwd is called in init()
use Encode ();    # Encode::encode is called in filter()
use File::Spec;
use File::Path;
use HTML::TokeParser;

# Hook filter() into the 'update.entry.fixup' phase.
sub register {
    my ($self, $context) = @_;
    $context->register_hook(
        $self,
        'update.entry.fixup' => \&filter,
    );
}

# Check the 'dir' setting and create the directory when it does not exist.
sub init {
    my $self = shift;
    $self->SUPER::init(@_);

    # Borrowed from Plugin::Filter::FetchEnclosure.
    defined $self->conf->{dir}
        or Plagger->context->error("config 'dir' is not set.");

    # XXX make it Plagger::Util function
    # A value that starts with a letter and has no ':' is taken relative to
    # the current working directory.
    if ($self->conf->{dir} =~ /^[a-zA-Z]/ && $self->conf->{dir} !~ /:/) {
        $self->conf->{dir} = File::Spec->catfile( Cwd::cwd(), $self->conf->{dir} );
    }

    unless (-e $self->conf->{dir} && -d _) {
        Plagger->context->log(warn => $self->conf->{dir} . " does not exist. Creating");
        mkpath $self->conf->{dir};
    }
}

# Split heading text into (relative directory, file name).
# Empty, '.' and '..' directory components are dropped so that a heading
# coming from a remote feed cannot point outside the entry directory.
# Returns an empty list when no usable file name remains.
sub _safe_relative_path {
    my ($name) = @_;
    return unless defined $name;

    $name =~ s/\A\s+//;
    $name =~ s/\s+\z//;
    return unless length $name;

    my (undef, $dir, $file) = File::Spec->splitpath($name);
    return unless length $file;

    my @parts = grep {
        length $_ && $_ ne File::Spec->curdir && $_ ne File::Spec->updir
    } File::Spec->splitdir($dir);

    return (File::Spec->catdir(@parts), $file);
}

# Hook callback: write each <pre> block of $args->{entry} to its own file.
#   $context - Plagger context, used for logging and error reporting
#   $args    - hook arguments; the 'feed' and 'entry' members are read
sub filter {
    my ($self, $context, $args) = @_;

    my $feed_dir = File::Spec->catdir($self->conf->{dir}, $args->{feed}->id_safe);
    unless (-e $feed_dir && -d _) {
        $context->log(info => "mkdir $feed_dir");
        mkdir $feed_dir, 0777;
    }

    my $entry = $args->{entry};

    # An entry without a body has nothing to extract.
    my $body_text = $entry->body;
    return unless defined $body_text;
    my $body = $body_text->data;
    return unless defined $body;

    # Walk the HTML, remembering the latest <h4> text as the pending name.
    my $parser    = HTML::TokeParser->new(\$body);
    my $file_name = '';
    my $i         = 1;    # counter for <pre> blocks that have no heading

    while ( my $token = $parser->get_tag('h4', 'pre') ) {
        if ( $token->[0] eq 'h4' ) {
            $file_name = $parser->get_text('/h4');
            next;
        }

        # <pre>: the text up to the closing tag becomes the file content.
        my $file_body = $parser->get_text('/pre');

        my ($rel_dir, $file) = _safe_relative_path($file_name);
        unless (defined $file) {
            ($rel_dir, $file) = ('', 'undefined' . $i);
            $i++;
        }

        my $entry_id = $entry->id_safe;
        my $dir = File::Spec->catdir($feed_dir, $entry_id, $rel_dir);
        mkpath($dir) unless -e $dir;

        my $path  = File::Spec->catfile($dir, $file);
        my $title = defined $entry->{title} ? $entry->{title} : '';
        $context->log(info => "write $path at $title($entry_id)");

        # Report the OS error; presumably $context->error aborts - confirm.
        open my $fh, ">", $path or $context->error("$path: $!");
        print {$fh} Encode::encode('utf-8', $file_body);
        close $fh or $context->error("$path: $!");

        # A heading names only the first <pre> that follows it.
        $file_name = '';
    }
}

1;
__END__

- module: Filter::WriteSource
  config:
    dir: ./tmp
このフィルタは指定したディレクトリに<pre>タグ内のソースを直前の<h4>のパスで保存します。
以下使用例。
plugins:
  - module: Subscription::Config
    config:
      feed:
        - url: http://plagger.g.hatena.ne.jp/SweetPotato/rss2
        - url: http://plagger.g.hatena.ne.jp/acqua_alta/rss2
        - url: http://plagger.g.hatena.ne.jp/Seacolor/rss2
  - module: CustomFeed::Config
  - module: Filter::WriteSource
    config:
      dir: ./tmp
  - module: Publish::Feed
    config:
      format: RSS
      dir: .
      filename: %t.rss
todo:
SweetPotato使ってみたらtypoに気がつきました。
s/undifined/undefined/g