ゴミスクリプト

Googleで検索するスクリプトを書いてみたけど、Googleからアクセス禁止(アク禁)を食らった。
何か回避する方法はあるんだろうな。
Google以外を使うことにしたので、とりあえずゴミ箱へ。

package Lejay::GoogleSearch;
=pod
URLのリストを取得するモジュール
=cut
use strict;
use LWP::UserAgent;
use HTTP::Request;
use Carp;

# Constructor.
#
# Builds an LWP::UserAgent with a 10 second timeout and a Firefox-like
# User-Agent string, then returns a new object holding the initial paging
# state (start offset 0, 100 results per request) and empty result stores.
sub new {
    my ($class) = @_;

    my $agent = LWP::UserAgent->new;
    $agent->timeout(10);
    $agent->agent('Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; ja-JP-mac; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.');

    my %state = (
        ua       => $agent,    # user agent used by search()
        start    => 0,         # offset of the first result to request
        num      => 100,       # results requested per page
        end_flag => 0,         # set by search() when a short page is seen

        cache    => [],        # hits accumulated over all search() calls
        data     => [],        # hits from the most recent search() call
        lastHTML => undef,     # raw body of the most recent response
    );

    return bless \%state, $class;
}

# Performs one search request using the current query, start offset and
# page size, and stores the hits parsed from the response.
#
# Reads:   ua, num, start, query
# Writes:  end_flag (1 when fewer than `num` hits were parsed, else 0),
#          lastHTML (raw response body), data (hits of this call),
#          cache (hits of this call appended to earlier ones)
# Returns: $self, so calls can be chained.
# Dies:    croaks with "google search error: <status line>" when the
#          response is not successful.
#
# NOTE: the query is inserted into the URL as-is; callers are expected to
# pass URL-safe terms to setQuery().
sub search {
    my ($self) = @_;
    $self->{end_flag} = 0;

    # An unset query is sent as an empty "q=" parameter.
    my $query = defined $self->{query} ? $self->{query} : '';

    my $url = 'http://www.google.co.jp/search?' . join(
        '&',
        'num=' . $self->{num},
        'start=' . $self->{start},
        'q=' . $query,
        'lr=lang_ja',    # no leading '&' here: join() already adds the separator
        'ie=utf-8',
        'oe=utf-8',
        'aq=t',
        'rls=org.mozilla:ja-JP-mac:official',
        'client=firefox-a',
    );

    my $res = $self->{ua}->request( HTTP::Request->new( GET => $url ) );
    if ( !$res->is_success ) {
        croak 'google search error: ' . $res->status_line;
    }

    my $html = $res->content;
    $self->{lastHTML} = $html;

    # Each hit is an anchor whose class attribute starts with "l"; the link
    # target and the link text are captured.  '.' does not match a newline
    # here, so anchors broken across lines are skipped.
    my @hits;
    while ( $html =~ /<a href=\"([^<]+?)\" class=l.*?>(.+?)<\/a>/g ) {
        my ( $link, $title ) = ( $1, $2 );
        $title =~ s!</?em>!!g;    # drop the <em> highlighting around matched terms
        push @hits, { url => $link, title => $title };
    }

    # A short page means there is nothing more to fetch.
    if ( @hits < $self->{num} ) {
        $self->{end_flag} = 1;
    }

    $self->{data} = \@hits;
    push @{ $self->{cache} }, @hits;
    return $self;
}

# Discards every hit accumulated so far by replacing the cache with a fresh
# empty array.  Returns $self for chaining.
sub clearCache {
    my ($self) = @_;

    $self->{cache} = [];

    return $self;
}

# Returns the hits stored by the most recent search() call as a list of
# hash refs (keys: url, title).
sub getData {
    my ($self) = @_;
    my @latest = @{ $self->{data} };
    return @latest;
}

# Returns every hit accumulated in the cache as a list of hash refs
# (keys: url, title).
sub getAll {
    my ($self) = @_;
    my @accumulated = @{ $self->{cache} };
    return @accumulated;
}

# Returns the `url` field of every cached hit, in cache order.
sub getURLList {
    my ($self) = @_;
    my @urls = map { $_->{url} } @{ $self->{cache} };
    return @urls;
}


# Advances the start offset by one page (the current `num`), so the next
# search() call requests the following page.  Returns $self for chaining.
sub nextPage {
    my ($self) = @_;
    my $page_size = $self->{num};
    $self->{start} = $self->{start} + $page_size;
    return $self;
}

# Returns the end flag: search() sets it to 1 when the last request yielded
# fewer hits than requested, and to 0 otherwise.
sub chkEnd {
    my ($self) = @_;
    my $finished = $self->{end_flag};
    return $finished;
}

# Sets the offset of the first result requested by the next search() call.
#
# $start must consist solely of ASCII digits; any other value (including
# undef) is ignored and the current offset is kept.  The pattern is anchored
# so that strings which merely contain a digit, such as "abc1" or "1&x=y",
# can no longer reach the request URL.
#
# Returns $self for chaining.
sub setStart {
    my ( $self, $start ) = @_;

    if ( defined $start && $start =~ /\A[0-9]+\z/ ) {
        $self->{start} = $start;
    }

    return $self;
}

# Stores the search query.  All arguments after the invocant are treated as
# search terms and joined with '+' into a single string.  Returns $self for
# chaining.
sub setQuery {
    my ( $self, @terms ) = @_;
    my $joined = join '+', @terms;
    $self->{query} = $joined;
    return $self;
}

# Routes the user agent's 'http' scheme requests through the given proxy
# URL.  Returns $self for chaining.
sub setProxy {
    my ( $self, $proxy ) = @_;

    my $agent = $self->{ua};
    $agent->proxy( 'http', $proxy );

    return $self;
}

# Sets the user agent's timeout.  The value is applied only when it is made
# up of digits; anything else is ignored.  Returns $self for chaining.
sub setTimeout {
    my ( $self, $timeout ) = @_;

    $self->{ua}->timeout($timeout) if $timeout =~ m!^\d+$!;

    return $self;
}


# Overrides the User-Agent string sent with requests.  A false value
# (undef, '' or '0') is ignored and the current string is kept.
# Returns $self for chaining.
sub setAgent {
    my ( $self, $agent ) = @_;

    $self->{ua}->agent($agent) if $agent;

    return $self;
}

1;