ゴミスクリプト
Googleで検索するスクリプトを書いてみたけど,Googleからアクセス禁止を食らった.
なんか方法あるんだろうな.
google以外を使うことにしたから,とりあえずゴミ箱へ..
package Lejay::GoogleSearch;

use strict;
use warnings;

use LWP::UserAgent;
use HTTP::Request;
use Carp;

=pod

=head1 NAME

Lejay::GoogleSearch - module that retrieves a list of URLs from a Google search

=head1 SYNOPSIS

    my $gs = Lejay::GoogleSearch->new;
    $gs->setQuery( 'perl', 'lwp' );

    until ( $gs->chkEnd ) {
        $gs->search->nextPage;
    }
    my @urls = $gs->getURLList;

=head1 DESCRIPTION

Requests result pages from C<http://www.google.co.jp/search> and scrapes the
result anchors (C<< <a href="..." class=l ...> >>) out of the returned HTML.
Every setter and C<search> return the object, so calls can be chained.

=cut

# Constructor. Takes no arguments.
# The object keeps:
#   ua        - LWP::UserAgent used for every request (10 second timeout)
#   start     - offset of the first result to request
#   num       - number of results requested per page
#   query     - query string as built by setQuery
#   end_flag  - true once a page came back with fewer than `num` results
#   data      - results of the most recent search()
#   cache     - results accumulated over all searches since clearCache()
#   lastHTML  - raw body of the most recent successful response
sub new {
    my $class = shift;

    my $ua = LWP::UserAgent->new;
    $ua->timeout(10);
    $ua->agent('Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; ja-JP-mac; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.');

    my $self = {
        ua       => $ua,
        start    => 0,
        num      => 100,
        query    => '',       # defined from the start so search() never joins undef
        end_flag => 0,
        cache    => [],
        data     => [],
        lastHTML => undef,
    };

    # ref($class) lets new() be called on an existing instance as well;
    # blessing into a reference would die.
    return bless $self, ref($class) || $class;
}

# Percent-encodes the bytes of a query string that would otherwise break the
# URL (space, '&', '#', '=', non-ASCII bytes, ...).
# '%' and '+' are deliberately left alone: setQuery joins terms with '+', and
# callers that already pass percent-encoded terms must not be double-encoded.
sub _escape_query {
    my ($query) = @_;
    return '' unless defined $query;

    my $bytes = $query;
    utf8::encode($bytes) if utf8::is_utf8($bytes);    # characters -> UTF-8 bytes
    $bytes =~ s/([^A-Za-z0-9\-_.~%+])/sprintf('%%%02X', ord $1)/ge;
    return $bytes;
}

# Builds the request URL for the current query, page size and offset.
sub _build_url {
    my $self = shift;

    my @params = (
        'num=' . $self->{num},
        'start=' . $self->{start},
        'q=' . _escape_query( $self->{query} ),
        'lr=lang_ja',    # no leading '&' here: join() already supplies it
        'ie=utf-8',
        'oe=utf-8',
        'aq=t',
        'rls=org.mozilla:ja-JP-mac:official',
        'client=firefox-a',
    );

    return 'http://www.google.co.jp/search?' . join( '&', @params );
}

# Scrapes result anchors out of an HTML page.
# Returns an array ref of { url => ..., title => ... } hashes; <em> tags are
# stripped from the title.
sub _parse_results {
    my ($html) = @_;
    $html = '' unless defined $html;

    my @results;
    while ( $html =~ /<a href=\"([^<]+?)\" class=l.*?>(.+?)<\/a>/g ) {
        my ( $link, $title ) = ( $1, $2 );
        $title =~ s!</?em>!!g;
        push @results, { url => $link, title => $title };
    }
    return \@results;
}

# Runs one search request for the current query/start/num.
# Stores the raw body in lastHTML, the parsed results in data, and appends
# them to cache. Sets end_flag when fewer than `num` results were found.
# Croaks with 'google search error' when the HTTP request is not successful.
# Returns $self.
sub search {
    my $self = shift;

    $self->{end_flag} = 0;

    my $req = HTTP::Request->new( GET => $self->_build_url );
    my $res = $self->{ua}->request($req);
    if ( !$res->is_success ) {
        croak q{google search error};
    }

    my $html = $res->content;
    $self->{lastHTML} = $html;

    my $results = _parse_results($html);

    # A short page means there is nothing further to fetch.
    if ( @$results < $self->{num} ) {
        $self->{end_flag} = 1;
    }

    $self->{data} = $results;
    push @{ $self->{cache} }, @$results;

    return $self;
}

# Empties the accumulated result cache. Returns $self.
sub clearCache {
    my $self = shift;
    $self->{cache} = [];
    return $self;
}

# Returns the list of result hashes from the most recent search().
sub getData {
    my $self = shift;
    return @{ $self->{data} };
}

# Returns the list of all result hashes accumulated in the cache.
sub getAll {
    my $self = shift;
    return @{ $self->{cache} };
}

# Returns the list of URLs of all cached results.
sub getURLList {
    my $self = shift;
    return map { $_->{url} } @{ $self->{cache} };
}

# Advances the start offset by one page (`num` results). Returns $self.
sub nextPage {
    my $self = shift;
    $self->{start} += $self->{num};
    return $self;
}

# Returns the end flag set by the most recent search().
sub chkEnd {
    my $self = shift;
    return $self->{end_flag};
}

# Sets the start offset. Values that are not a plain non-negative integer are
# ignored (the whole string must be digits, not merely contain one).
# Returns $self.
sub setStart {
    my $self = shift;
    my ($start) = @_;
    if ( defined $start && $start =~ m!\A\d+\z! ) {
        $self->{start} = $start;
    }
    return $self;
}

# Sets the query; the given terms are joined with '+'. Returns $self.
sub setQuery {
    my $self = shift;
    $self->{query} = join( '+', @_ );
    return $self;
}

# Routes http requests of the user agent through the given proxy URL.
# Returns $self.
sub setProxy {
    my $self = shift;
    my ($proxy) = @_;
    $self->{ua}->proxy( 'http', $proxy );
    return $self;
}

# Sets the user agent timeout; non-numeric values are ignored. Returns $self.
sub setTimeout {
    my $self = shift;
    my ($timeout) = @_;
    if ( defined $timeout && $timeout =~ m!^\d+$! ) {
        $self->{ua}->timeout($timeout);
    }
    return $self;
}

# Sets the User-Agent header string; false values are ignored. Returns $self.
sub setAgent {
    my $self = shift;
    my ($agent) = @_;
    if ($agent) {
        $self->{ua}->agent($agent);
    }
    return $self;
}

1;