본문 바로가기

Back-end/JS

PhantomJS CasperJS를 통한 크롤링 정리 - bing 크롤링

PhantomJS CasperJS를 통한 크롤링 정리 - 2

bing 검색결과 가져오기

bing의 경우 검색결과를 보면 아래와 같은 DOM이 나온다 (bing 검색결과 페이지를 개발자도구로 확인)

<li class="b_algo" data-bm="8"><div class="b_title"><h2><a target="_blank" href="http://casperjs.org/" h="ID=SERP,5100.1"><strong>CasperJS</strong>, a navigation scripting and testing utility …</a></h2><div class="b_suffix b_secondaryText nowrap"><a target="_blank" href="http://www.microsofttranslator.com/bv.aspx?ref=SERP&amp;br=ro&amp;mkt=ko-KR&amp;dl=ko&amp;lp=EN_KO&amp;a=http%3a%2f%2fcasperjs.org%2f" h="ID=SERP,5107.1"> </a></div></div><div class="b_caption"><div class="b_attribution" u="0N|5088|4633283525150274|o_PI4ciGKG46-UTuYr8QUhAeTMF0oCpZ"><cite><strong>casperjs</strong>.org</cite><a href="#" aria-label=" " aria-haspopup="true"><span class="c_tlbxTrg"><span class="c_tlbxTrgIcn sw_ddgn"></span><span class="c_tlbxH" h="BASE:CACHEDPAGEDEFAULT" k="SERP,5101.1"></span></span></a>2017-04-25</div><p><strong>CasperJS</strong> is a browser navigation scripting &amp; testing utility written in Javascript for PhantomJS or SlimerJS.</p></div></li>

url 태그의 경우 b_algo 클래스 아래의 a태그가 된다.

// CasperJS initialization: create the instance used by the steps below.
// Options passed to create(): verbose = true, logLevel = 'error',
// and an empty clientScripts list.
var casper = require('casper').create({
    clientScripts: [],
    logLevel: 'error',
    verbose: true
});

// Holds the links collected by the navigation steps.
var links = [];
/**
 * Collect the `href` attribute of every anchor matched by '.b_algo a'.
 *
 * Reads the global `document`; in this listing it is handed to
 * `this.evaluate(getLinks)` rather than called directly.
 *
 * @returns {Array} the `href` attribute value of each matched anchor,
 *                  in the order querySelectorAll returned them.
 */
function getLinks() {
    var anchors = document.querySelectorAll('.b_algo a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Open Bing and submit its search form with the query 'casperjs'.
casper.start('http://bing.com/', function () {
    // Select the search form by its action attribute. The attribute value
    // needs both quotes (form[action="/search"]); without the closing quote
    // the selector is invalid and the form is never matched.
    this.fill('form[action="/search"]', {
        // value for the input whose name is "q"
        q: 'casperjs'
    }, true); // third argument: submit the form after filling it (per CasperJS fill() docs)
});

// Run the queued steps.
casper.run();
// ---------------------------------------------------------------
// Step registered with casper.then(), following the casper.start() call
// ---------------------------------------------------------------
casper.then(function collectLinks() {
    // Pass getLinks to evaluate() and keep whatever it returns.
    var found = this.evaluate(getLinks);
    links = found;
});

// Run the queued steps.
casper.run();

추가동작

// Extra action: after collecting the links, search again for 'phantomjs'.
casper.then(function () {
    // Pass getLinks to evaluate() and store the links it returns.
    links = this.evaluate(getLinks);

    // Fill the same search form again, this time with 'phantomjs'.
    // The attribute value needs both quotes (form[action="/search"]);
    // without the closing quote the selector is invalid.
    this.fill('form[action="/search"]', {
        // value for the input whose name is "q"
        q: 'phantomjs'
    }, true); // third argument: submit the form after filling it (per CasperJS fill() docs)
});

전체소스

// Set up CasperJS for the complete example.
// create() receives: verbose = true, logLevel = 'error', clientScripts = [].
var casper = require('casper').create({
    logLevel: 'error',
    verbose: true,
    clientScripts: []
});

// Accumulates the links gathered by both searches.
var links = [];
/**
 * Return the `href` attribute of each anchor matched by '.b_algo a'.
 *
 * Uses the global `document`; the steps in this listing pass it to
 * `this.evaluate(getLinks)`.
 *
 * @returns {Array} one `href` attribute value per matched anchor, in match order.
 */
function getLinks() {
    var matched = document.querySelectorAll('.b_algo a');
    var result = [];
    Array.prototype.forEach.call(matched, function (anchor) {
        result.push(anchor.getAttribute('href'));
    });
    return result;
}
// Open Bing and search for 'casperjs' through its search form.
casper.start('http://bing.com/', function searchCasperjs() {
    var formValues = { q: 'casperjs' };
    this.fill('form[action="/search"]', formValues, true);
});
// Step 2: keep the 'casperjs' results, then start the 'phantomjs' search.
casper.then(function collectAndSearchAgain() {
    // links returned for the 'casperjs' search
    links = this.evaluate(getLinks);

    // fill the same form again with the next query
    var nextQuery = { q: 'phantomjs' };
    this.fill('form[action="/search"]', nextQuery, true);
});

// Step 3: append the 'phantomjs' results to what was already collected.
casper.then(function collectSecondSearch() {
    links = links.concat(this.evaluate(getLinks));
});
// Run the queued steps, then print the collected links and exit.
casper.run(function () {
    // Header line: count and label separated by a space ("12 links found:").
    this.echo(links.length + ' links found:');
    // One link per line. The first bullet uses the same ' - ' prefix as the
    // join separator so every line is formatted identically.
    this.echo(' - ' + links.join('\n - ')).exit();
});

커스텀 스크립트 삽입

// CasperJS instance with custom client scripts listed in clientScripts
// (jQuery and lodash from the vendor/ directory); logLevel is 'debug' here.
var casper = require('casper').create({
    clientScripts: [
        "vendor/jquery.min.js",
        "vendor/lodash.js"
    ],
    logLevel: 'debug',
    verbose: true
});

위와 같이 clientScripts 옵션으로 커스텀 스크립트를 삽입할 수 있다. jquery의 경우

기존 소스에서의

// Excerpt of the getLinks body: select every <a> matched by '.b_algo a'
var links = document.querySelectorAll('.b_algo a');
return Array.prototype.map.call(links, function(e){
// return the href attribute of each matched anchor
return e.getAttribute('href');
})

부분을

/**
 * jQuery/lodash variant of getLinks.
 *
 * Uses the globals `$` and `_`; `$('.b_algo a')` takes the place of
 * `document.querySelectorAll('.b_algo a')`.
 *
 * @returns {Array} whatever `_.map` yields for the matched anchors:
 *                  the `href` attribute value of each one.
 */
function getLinks() {
    function hrefOf(anchor) {
        return anchor.getAttribute('href');
    }

    var anchors = $('.b_algo a');
    return _.map(anchors, hrefOf);
}

위와 같이 jquery selector와 lodash의 map을 통해 사용할 수 있다.