Google Scholar APIがないので,適当に書いてみた
前々から思ってたけど,なぜGoogle ScholarにAPIがないんだ.
個人が作った物も全く公開されていない.
需要がないのかな.
まぁ,ないならないで,自分が作るしかないわけで,適当に作ってみた.
package Google::Scholar; use utf8; use strict; use Carp; use Web::Scraper; use URI; our $VERSION = '0.01'; sub new { my $self = shift; my %data = @_; $data{base_url} = q{http://scholar.google.com/scholar?hl=en}; # return (bless \%data, $self)->get; return bless \%data, $self; } sub get { my $self = shift; my $url = $self->{url}; $self->{start} = $self->{num} * $_[0] if( $_[0] =~ m!^\d+$!); my $q = {}; # 全てのキーワードを含む $q->{q} = &_queryToString( $self->{query} ) if(exists $self->{query}); # フレーズを含む $q->{as_epq} = '%22'.&_urlencode( $self->{extract} ).'%22' if(exists $self->{extract}); # いずれかのキーワードを含む $q->{as_oq} = &_queryToString( $self->{atLeast} ) if(exists $self->{atLeast}); # キーワードを含まない $q->{without} = &_queryToString( $self->{without} ) if(exists $self->{without}); # 著者検索 $q->{sa_sauthors} = &_queryToString( $self->{author} ) if(exists $self->{author}); # 出版物 $q->{as_publication} = &_queryToString( $self->{publication} ) if(exists $self->{publication}); # 日付 $q->{as_ylo} = &_chkYear( $self->{from} ) if(exists $self->{from}); $q->{as_yhi} = &_chkYear( $self->{to} ) if(exists $self->{to}); # 取得数 $self->{num} = !$self->{num} ? 30: $self->{num} > 100 ? 100: $self->{num} < 0 ? 30: $self->{num}; $q->{num} = $self->{num}; # 開始ページ $q->{start} = $self->{start} ? $self->{start} : 0; $self->{url} = $self->{base_url}; foreach(keys %{$q}){ $self->{url} .= qq[&$_=].$q->{$_}; } my $paper_scraper = scraper { process 'span.w > a', title => 'TEXT', url => '@href'; process 'font > span.a', # meta => 'TEXT'; meta => sub { my @a = split / - /,$_->as_text; my $year = $a[1] =~ m!((20\d{2})|(1[89]\d{2}))! ? $1 : ''; +{ authors => $a[0], src => $a[1], year => $year, other => $a[2], }; }; process '/p/b', 't_parts1[]' => 'TEXT'; process '/p[@class="g"]/text()', 't_parts0[]' => 'TEXT'; result 'title','url','meta','t_parts0','t_parts1'; }; my $page_scraper = scraper { process 'body > p.g', 'papers[]' => $paper_scraper; result 'papers'; }; $page_scraper->user_agent->agent("Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"); $self->{data} = $page_scraper->scrape( URI->new($self->{url}) ); #整形 foreach my $e(@{$self->{data}}){ if(!$e->{title} && $e->{t_parts1}){ my $a = 0+@{$e->{t_parts0}}; my $b = 0+@{$e->{t_parts1}}; foreach(0..($a>$b?$a:$b)){ $e->{title} .= $e->{t_parts0}->[$_] if $_ < $a; $e->{title} .= $e->{t_parts1}->[$_] if $_ < $b; } } delete $e->{t_parts0}; delete $e->{t_parts1}; } return $self; } sub next { my $self = shift; $self->{start} += $self->{num}; return $self->get; } sub getRawData { my $self = shift; return $self->{data}; } sub _urlencode { $_[0] =~ s/([^a-zA-Z0-9_\/\.\?:=&#])/sprintf("%%%02X", unpack("C", $1))/eg; return $_[0]; } sub _queryToString { my $param = $_[0]; my @tmp = (); my @words = ref $param eq 'ARRAY'? @{ $param }: split / /,$param; foreach( @words ){ push @tmp, &_urlencode( $_ ); } return join('+',@tmp); } sub _chkYear { my $year = $_[0]; return $year =~ m!^\d{4}$!? $year:''; } 1;
使い方:
my $papers = new Google::Scholar( query => '', extract => '', atLeast => '', without => '', author => '', publication => '', from => '', to => '', num => '', start => '', ); print Dump $papers->get->getRawData; print Dump $papers->next->getRawData; print Dump $papers->get(3)->getRawData;
使用例:
#!/usr/bin/perl use lib "./lib"; use strict; use YAML::Syck; use Google::Scholar; my $scholar = new Google::Scholar( query => 'XML', from => 2000, ); print Dump $scholar->get->getRawData;
出力例:
--- - meta: authors: … 'Neale, Y Oohata, K Paech, J Perl, A Pfeiffer, MG … other: Elsevier src: Nuclear Inst. and Methods in Physics Research, A, 2003 year: 2003 title: Geant4―a simulation toolkit url: !!perl/scalar:URI::http http://linkinghub.elsevier.com/retrieve/pii/S0168900203013688 - meta: authors: … , TO'Toole, G Parker, TM Perl, PK Russell, K Tonat other: Am Med Assoc src: JAMA, 2001 year: 2001 title: Tularemia as a Biological Weapon Medical and Public Health Management url: !!perl/scalar:URI::http http://jama.ama-assn.org/cgi/content/abstract/285/21/2763 - meta: authors: … , TO'Toole, G Parker, TM Perl, PK Russell, DL … other: Am Med Assoc src: JAMA, 2001 year: 2001
これは酷い・・・
汚いソースでごめんなさい><