你所需要的，不仅仅是一个好用的代理。
// Mirrors one Baidu Baike entry as a small multi-page site on localhost:3000.
// This copy had been collapsed onto a single line, where the first `//`
// comment swallowed every statement after it; it is restored to one
// statement per line here.
var request = require('request'),
    cheerio = require('cheerio'),
    http = require('http'),
    url = require('url');

// Baike entry to mirror; may be changed to any other Baike URL.
var host = 'http://baike.baidu.com/view/39744.htm';
// Rendered pages, indexed by section number; filled in by scraper().
var html = [];

// Fetch once at startup, then refresh every 15 minutes. setInterval needs a
// function: passing `scraper(host)` would hand it the call's return value
// (undefined) instead of a callback.
scraper(host);
setInterval(function () {
  scraper(host);
}, 1000 * 60 * 15);

/**
 * Escapes the HTML-significant characters so scraped text can be embedded in
 * the generated markup without being parsed as tags or attributes.
 *
 * @param {string} text - Raw text taken from the scraped page.
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
function escapeHtml (text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Fetches `host`, splits its content at every `.headline-1` element and
 * stores one rendered HTML page per section in the module-level `html` array.
 *
 * @param {string} host - URL of the page to scrape.
 */
function scraper (host) {
  request(host, function (error, response, data) {
    if (error || response.statusCode !== 200) {
      // Previously rendered pages are kept; the failure is only reported.
      console.error('scraper: could not fetch ' + host + ' (' +
        (error ? error.message : 'HTTP ' + response.statusCode) + ')');
      return;
    }

    var $ = cheerio.load(data);
    var title = escapeHtml($('.title').first().text()),
        header = [],
        nav = [],
        body = [];

    // Remove elements whose text should not end up in the output.
    $('.title').remove();
    $('.pic-info').remove();
    $('.count').remove();
    $('sup').remove();

    // Collect the siblings that follow each first-level headline.
    $('#lemmaContent-0 .headline-1').each(function (i) {
      var str = '',
          $next = $(this).next();
      // `$next.length > 0` ends the walk when the siblings run out; an empty
      // selection answers false to hasClass(), so the loop would not stop
      // without it.
      while ($next.length > 0 &&
             !$next.hasClass('headline-1') &&
             !$next.next().hasClass('clear')) {
        var text = escapeHtml($next.text());
        if ($next.hasClass('headline-2')) {
          str += "<p><strong>" + text + "</strong></p>";
        } else {
          str += "<p>" + text + "</p>";
        }
        $next = $next.next();
      }
      header.push(escapeHtml($(this).find('.headline-content').text()));
      nav.push("<span><a href='/" + i + "'>" + header[i] + "</a></span>");
      body.push(str);
    });

    // Number of catalog ("table of contents") entries, capped at the number
    // of sections actually collected so no page is built from undefined data.
    var len = $('#catalog-holder-0 .catalog-item').length;
    var pageCount = Math.min(len, header.length);
    for (var i = 0; i < pageCount; i++) {
      html[i] = "" +
        "<!DOCTYPE html>" +
        "<html>" +
        "<head>" +
        "<meta charset='UTF-8' />" +
        "<title>" + title + "</title>" +
        "<style type='text/css'>" +
        "body{width:600px;margin:2em auto;font-family:'Microsoft YaHei';}" +
        "p{line-height:24px;margin:1em 0;}" +
        "header{border-bottom:1px solid #cccccc;font-size:2em;font-weight:bold;padding-bottom:.2em;}" +
        "nav{float:left;font-family:'Microsoft YaHei';margin-left:-12em;width:9em;text-align:right;}" +
        "nav a{display:block;text-decoration:none;padding:.7em 1em;color:#000000;}" +
        "nav a:hover{background-color:#003f00;color:#f9f9f9;-webkit-transition:color .2s linear;}" +
        "</style>" +
        "</head>" +
        "<body>" +
        "<header>" + header[i] + "</header>" +
        "<nav>" + nav.join('') + "</nav>" +
        "<article>" + body[i] + "</article>" +
        "</body>" +
        "</html>";
    }
  });
}

// "/" serves page 0 and "/<n>" serves page n; anything that does not map to
// a rendered page gets a 404 instead of an empty 200 response.
http.createServer(function (req, res) {
  var pathname = url.parse(req.url).pathname;
  var index = pathname === '/' ? 0 : parseInt(pathname.slice(1), 10);
  var page = html[index];
  if (typeof page !== 'string') {
    res.writeHead(404, {"Content-Type": "text/plain; charset=utf-8"});
    res.end('Not Found');
    return;
  }
  res.writeHead(200, {"Content-Type": "text/html; charset=utf-8"});
  res.end(page);
}).listen(3000);
console.log('Server running at localhost:3000');
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
var request = require('request'),
cheerio = require('cheerio'),
http = require('http'),
url = require('url');
// Baike entry to mirror; may be changed to any other Baike URL.
var host = 'http://baike.baidu.com/view/39744.htm';
// Rendered pages, indexed by section number; filled in by scraper().
var html = [];
// Fetch once at startup, then refresh every 15 minutes. setInterval needs a
// function: passing `scraper(host)` would hand it the call's return value
// (undefined) instead of a callback, so the refresh would never be scheduled.
scraper(host);
setInterval(function () {
  scraper(host);
}, 1000 * 60 * 15);
/**
 * Escapes the HTML-significant characters so scraped text can be embedded in
 * the generated markup without being parsed as tags or attributes.
 *
 * @param {string} text - Raw text taken from the scraped page.
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
function escapeHtml (text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Fetches `host`, splits its content at every `.headline-1` element and
 * stores one rendered HTML page per section in the module-level `html` array
 * (page i is written to `html[i]`).
 *
 * On a request error or a non-200 response nothing is written to `html`; the
 * failure is reported on stderr.
 *
 * @param {string} host - URL of the page to scrape.
 */
function scraper (host) {
  request(host, function (error, response, data) {
    if (error || response.statusCode !== 200) {
      // Previously rendered pages are kept; the failure is only reported.
      console.error('scraper: could not fetch ' + host + ' (' +
        (error ? error.message : 'HTTP ' + response.statusCode) + ')');
      return;
    }

    var $ = cheerio.load(data);
    var title = escapeHtml($('.title').first().text()),
        header = [],
        nav = [],
        body = [];

    // Remove elements whose text should not end up in the output.
    $('.title').remove();
    $('.pic-info').remove();
    $('.count').remove();
    $('sup').remove();

    // Collect the siblings that follow each first-level headline.
    $('#lemmaContent-0 .headline-1').each(function (i) {
      var str = '',
          $next = $(this).next();
      // Walk forward until the next first-level headline, or until the
      // element after the current one carries the `clear` class.
      // `$next.length > 0` ends the walk when the siblings run out; an empty
      // selection answers false to hasClass(), so the loop would not stop
      // without it.
      while ($next.length > 0 &&
             !$next.hasClass('headline-1') &&
             !$next.next().hasClass('clear')) {
        var text = escapeHtml($next.text());
        if ($next.hasClass('headline-2')) {
          // Second-level headlines are rendered as bold paragraphs.
          str += "<p><strong>" + text + "</strong></p>";
        } else {
          str += "<p>" + text + "</p>";
        }
        $next = $next.next();
      }
      header.push(escapeHtml($(this).find('.headline-content').text()));
      nav.push("<span><a href='/" + i + "'>" + header[i] + "</a></span>");
      body.push(str);
    });

    // Number of catalog ("table of contents") entries, capped at the number
    // of sections actually collected so no page is built from undefined data.
    var len = $('#catalog-holder-0 .catalog-item').length;
    var pageCount = Math.min(len, header.length);
    for (var i = 0; i < pageCount; i++) {
      html[i] = "" +
        "<!DOCTYPE html>" +
        "<html>" +
        "<head>" +
        "<meta charset='UTF-8' />" +
        "<title>" + title + "</title>" +
        "<style type='text/css'>" +
        "body{width:600px;margin:2em auto;font-family:'Microsoft YaHei';}" +
        "p{line-height:24px;margin:1em 0;}" +
        "header{border-bottom:1px solid #cccccc;font-size:2em;font-weight:bold;padding-bottom:.2em;}" +
        "nav{float:left;font-family:'Microsoft YaHei';margin-left:-12em;width:9em;text-align:right;}" +
        "nav a{display:block;text-decoration:none;padding:.7em 1em;color:#000000;}" +
        "nav a:hover{background-color:#003f00;color:#f9f9f9;-webkit-transition:color .2s linear;}" +
        "</style>" +
        "</head>" +
        "<body>" +
        "<header>" + header[i] + "</header>" +
        "<nav>" + nav.join('') + "</nav>" +
        "<article>" + body[i] + "</article>" +
        "</body>" +
        "</html>";
    }
  });
}
// HTTP front end on port 3000: "/" serves page 0 and "/<n>" serves page n
// from the module-level `html` array.
http.createServer(function (req, res) {
  var pathname = url.parse(req.url).pathname;
  // Explicit radix so the segment is always read as a decimal page number.
  var index = pathname === '/' ? 0 : parseInt(pathname.slice(1), 10);
  var page = html[index];
  // Non-numeric paths, out-of-range numbers and pages that have not been
  // rendered yet all land here; answer 404 rather than an empty 200.
  if (typeof page !== 'string') {
    res.writeHead(404, {"Content-Type": "text/plain; charset=utf-8"});
    res.end('Not Found');
    return;
  }
  res.writeHead(200, {"Content-Type": "text/html; charset=utf-8"});
  res.end(page);
}).listen(3000);
console.log('Server running at localhost:3000');
抓取过于频繁时，服务器会返回 429。这个时候就需要切换代理 IP 了，推荐使用阿布云代理：阿布云代理 IP 提供高匿代理、爬虫代理。
文章来源：https://geekspider.org/