Google Scholar APIがないので,適当に書いてみた

前々から思ってたけど,なぜGoogle ScholarAPIがないんだ.
個人が作った物も全く公開されていない.
需要がないのかな.
まぁ,ないならないで,自分が作るしかないわけで,適当に作ってみた.

package Google::Scholar;
use utf8;
use strict;
use Carp;

use Web::Scraper;
use URI;

our $VERSION = '0.01';

sub new {
    my $self = shift;
    my %data = @_;
    
    $data{base_url} = q{http://scholar.google.com/scholar?hl=en};

#    return (bless \%data, $self)->get;
    return bless \%data, $self;
}


sub get {
    my $self = shift;
    my $url = $self->{url};
    
    $self->{start} = $self->{num} * $_[0] if( $_[0] =~ m!^\d+$!);

    my $q = {};
    # 全てのキーワードを含む
    $q->{q} = &_queryToString( $self->{query} )
	if(exists $self->{query});
    
    # フレーズを含む
    $q->{as_epq} = '%22'.&_urlencode( $self->{extract} ).'%22'
	if(exists $self->{extract});

    # いずれかのキーワードを含む
    $q->{as_oq} = &_queryToString( $self->{atLeast} )
	if(exists $self->{atLeast});

    # キーワードを含まない
    $q->{without} = &_queryToString( $self->{without} )
	if(exists $self->{without});

    # 著者検索
    $q->{sa_sauthors} = &_queryToString( $self->{author} )
	if(exists $self->{author});

    # 出版物
    $q->{as_publication} = &_queryToString( $self->{publication} )
	if(exists $self->{publication});

    # 日付
    $q->{as_ylo} = &_chkYear( $self->{from} )
	if(exists $self->{from});
    $q->{as_yhi} = &_chkYear( $self->{to} )
	if(exists $self->{to});

    # 取得数
    $self->{num} = 
	!$self->{num}      ? 30:
	$self->{num} > 100 ? 100:
	$self->{num} < 0   ? 30:
	$self->{num};
    $q->{num} = $self->{num};
    
    # 開始ページ
    $q->{start} =
	$self->{start} ? $self->{start} : 0;

    $self->{url} = $self->{base_url};
    foreach(keys %{$q}){
	$self->{url} .= qq[&$_=].$q->{$_};
    }

    my $paper_scraper = scraper {
	process 'span.w > a',
	title => 'TEXT',
	url   => '@href';

	process 'font > span.a',
#	meta => 'TEXT';
	meta => sub {
	    my @a = split / - /,$_->as_text;
	    my $year = $a[1] =~ m!((20\d{2})|(1[89]\d{2}))! ? $1 : '';
	    +{
		authors => $a[0],
		src => $a[1],
		year => $year,
		other => $a[2],
	    };
	};

	process '/p/b', 't_parts1[]' => 'TEXT';
	process '/p[@class="g"]/text()', 't_parts0[]' => 'TEXT';

	result 'title','url','meta','t_parts0','t_parts1';
    };

    my $page_scraper = scraper {
	process 'body > p.g', 'papers[]' => $paper_scraper;
	result 'papers';
    };

    $page_scraper->user_agent->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");

    $self->{data} = $page_scraper->scrape( URI->new($self->{url}) );

    #整形
    foreach my $e(@{$self->{data}}){
	if(!$e->{title} && $e->{t_parts1}){
	    my $a = 0+@{$e->{t_parts0}};
	    my $b = 0+@{$e->{t_parts1}};
	    foreach(0..($a>$b?$a:$b)){
		$e->{title} .= $e->{t_parts0}->[$_] if $_ < $a;
		$e->{title} .= $e->{t_parts1}->[$_] if $_ < $b;
	    }
	}
	delete $e->{t_parts0};
	delete $e->{t_parts1};
    }
    return $self;
}


sub next {
    my $self = shift;
    $self->{start} += $self->{num};
    return $self->get;
}


sub getRawData {
    my $self = shift;
    return $self->{data};
}


sub _urlencode {
    $_[0] =~ s/([^a-zA-Z0-9_\/\.\?:=&#])/sprintf("%%%02X", unpack("C", $1))/eg;
    return $_[0];
}

sub _queryToString {
    my $param = $_[0];

    my @tmp = ();
    my @words = ref $param eq 'ARRAY'?
	@{ $param }:
	split / /,$param;
    foreach( @words ){
	push @tmp, &_urlencode( $_ );
    }
    return join('+',@tmp);
}

sub _chkYear {
    my $year = $_[0];
    return
	$year =~ m!^\d{4}$!?
	$year:'';
}
1;


使い方:

my $papers = new Google::Scholar(
    query => '',
    extract => '',
    atLeast => '',
    without => '',
    author => '',
    publication => '',
    from => '',
    to   => '',
    num  => '',
    start => '',
);
print Dump $papers->get->getRawData;
print Dump $papers->next->getRawData;
print Dump $papers->get(3)->getRawData;

使用例:

#!/usr/bin/perl
use lib "./lib";
use strict;
use YAML::Syck;

use Google::Scholar;

my $scholar = new Google::Scholar(
    query => 'XML',
    from => 2000,
    );
print Dump $scholar->get->getRawData;

出力例:

--- 
- 
  meta: 
    authors: … 'Neale, Y Oohata, K Paech, J Perl, A Pfeiffer, MG …
    other: Elsevier
    src: Nuclear Inst. and Methods in Physics Research, A, 2003
    year: 2003
  title: Geant4―a simulation toolkit
  url: !!perl/scalar:URI::http http://linkinghub.elsevier.com/retrieve/pii/S0168900203013688
- 
  meta: 
    authors: … , TO'Toole, G Parker, TM Perl, PK Russell, K Tonat
    other: Am Med Assoc
    src: JAMA, 2001
    year: 2001
  title: Tularemia as a Biological Weapon Medical and Public Health Management
  url: !!perl/scalar:URI::http http://jama.ama-assn.org/cgi/content/abstract/285/21/2763
- 
  meta: 
    authors: … , TO'Toole, G Parker, TM Perl, PK Russell, DL …
    other: Am Med Assoc
    src: JAMA, 2001
    year: 2001


これは酷い・・・
汚いソースでごめんなさい><