Over the past few days I have been learning how to write small crawlers with Node. Here is a record of the hands-on code.
Scraping web page information
```js
let axios = require('axios')
let { fsRead, fsWrite, fsDir } = require('./rw')

let httpUrl = 'https://www.1905.com/vod/list/n_1_t_1/o3p1.html'

// Wrap axios.get in a Promise that resolves with the response body
function req(link) {
  return new Promise(function (resolve, reject) {
    axios.get(link).then(function (res) {
      resolve(res.data)
    })
  })
}

// Extract the genre links from the listing page ("类型" is the genre label on the page)
async function getClassUrl(link) {
  let data = await req(link)
  let reg = /<span class="search-index-L">类型(.*?)<div class="grid-12x">/gis
  let result = reg.exec(data)[1]
  let reg1 = /<a href="javascript:void\(0\);" onclick="location\.href='(.*?)';return false;" .*?>(.*?)<\/a>/gis
  let arr = []
  let result2
  while ((result2 = reg1.exec(result))) {
    // Skip the "全部" (All) link
    if (result2[2] != '全部') {
      let obj = {
        className: result2[2],
        link: result2[1],
      }
      arr.push(obj)
      await fsDir('./movies/' + result2[2])
      getMovie(result2[1], result2[2])
    }
  }
}

// Collect every movie detail URL on a genre page
async function getMovie(link, movieType) {
  let data = await req(link)
  let reg = /<a class="pic-pack-outer" target="_blank" href="(.*?)" .*?>/gis
  let res9
  let arrList = []
  while ((res9 = reg.exec(data))) {
    arrList.push(res9[1])
    parsePage(res9[1], movieType)
  }
}

// Parse a movie detail page and save the result as a JSON file
async function parsePage(url, movieType) {
  let data = await req(url)
  let reg = /<h1 class="playerBox-info-name playerBox-info-cnName">(.*?)<\/h1>.*?id="playerBoxIntroCon">(.*?)<a .*?导演.*?data-hrefexp="fr=vodplay_ypzl_dy">(.*?)<\/a>/gis
  let res3 = reg.exec(data)
  console.log(res3[1])
  let movie = {
    name: res3[1],
    brief: res3[2],
    daoyan: res3[3], // daoyan (导演) = director
    movieUrl: url,
    movieType,
  }
  let str = JSON.stringify(movie)
  fsWrite('./movies/' + movieType + '/' + res3[1] + '.json', str)
}

getClassUrl(httpUrl)
```
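The `./rw` helper required above is not included in the post. Here is a minimal sketch of what it might look like, assuming it simply wraps `fs.readFile`, `fs.writeFile`, and `fs.mkdir` in Promises; the names `fsRead`, `fsWrite`, and `fsDir` match the destructured import, everything else is an assumption:

```js
// Hypothetical ./rw module: promise wrappers around Node's fs API.
const fs = require('fs')

function fsRead(path) {
  return new Promise((resolve, reject) => {
    fs.readFile(path, 'utf-8', (err, data) => (err ? reject(err) : resolve(data)))
  })
}

function fsWrite(path, content) {
  return new Promise((resolve, reject) => {
    fs.writeFile(path, content, (err) => (err ? reject(err) : resolve()))
  })
}

function fsDir(path) {
  // recursive: true avoids an error when the directory already exists
  return new Promise((resolve, reject) => {
    fs.mkdir(path, { recursive: true }, (err) => (err ? reject(err) : resolve()))
  })
}

module.exports = { fsRead, fsWrite, fsDir }
```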
Scraping meme stickers
```js
const cheerio = require('cheerio')
const axios = require('axios')
const fs = require('fs')
const url = require('url')
const path = require('path')

// Resolve after the given number of milliseconds (used to throttle requests)
function wait(millSeconds) {
  return new Promise(function (resolve, reject) {
    setTimeout(() => {
      resolve('Delay finished after ' + millSeconds + ' ms')
    }, millSeconds)
  })
}

// Read the total page count from the second-to-last pagination button
async function pageNum(link) {
  let res = await axios.get(link)
  let $ = cheerio.load(res.data)
  let btnLength = $('.pagination li').length
  let allNum = $('.pagination li')
    .eq(btnLength - 2)
    .find('a')
    .text()
  return allNum
}

// Walk one list page: create a folder per post, then parse the post
async function getListPage(pageNum) {
  let httpUrl = `https://www.doutula.com/article/list/?page=${pageNum}`
  let res = await axios.get(httpUrl)
  let $ = cheerio.load(res.data)
  $('#home .col-sm-9>a').each((i, element) => {
    let pageUrl = $(element).attr('href')
    let title = $(element).find('.random_title').text()
    let reg = /(.*?)\d/gis
    title = reg.exec(title)[1] // strip the trailing date digits from the title
    fs.mkdir('./img/' + title, function (err) {
      if (err) {
        console.log(err)
      } else {
        console.log('Created: ./img/' + title)
      }
    })
    parsePage(pageUrl, title)
  })
}

// Download every image on a post page into its folder
async function parsePage(link, title) {
  let res = await axios.get(link)
  let $ = cheerio.load(res.data)
  $('.pic-content img').each((i, element) => {
    let imgUrl = $(element).attr('src')
    let b = url.parse(imgUrl)
    let name = path.parse(b.pathname)
    let filePath = `./img/${title}/${name.base}` // no trailing slash: this is a file path
    let ws = fs.createWriteStream(filePath.trim())
    axios.get(imgUrl, { responseType: 'stream' }).then(function (res) {
      res.data.pipe(ws)
      console.log('Downloading sticker: ' + filePath)
      res.data.on('close', () => ws.close())
    })
  })
}

// Each page waits progressively longer before being fetched, to throttle requests
async function spider(link) {
  let allPageNum = await pageNum(link)
  for (let i = 1; i <= allPageNum; i++) {
    await wait(4000 * i)
    getListPage(i)
  }
}

spider('https://www.doutula.com/article/list/?page=1')
```
Scraping music
```js
const axios = require('axios')
const fs = require('fs')
const path = require('path')

// Fetch one page of the daily-recommendation API and download every track
async function getPage(num) {
  let httpUrl = 'http://www.app-echo.com/api/recommend/sound-day?page=' + num
  let res = await axios.get(httpUrl)
  let list = res.data.list
  list.forEach(function (item, i) {
    let title = item.sound.name
    let musicUrl = item.sound.source
    let fileName = path.parse(musicUrl).name
    // Append a comma-separated record of each track to music.txt
    let content = `${title},${musicUrl},${fileName}\n`
    fs.writeFile('music.txt', content, { flag: 'a' }, function () {})
    download(musicUrl, fileName)
  })
}

// Stream the audio file to disk (the ./music directory must already exist)
async function download(link, fileName) {
  let res = await axios.get(link, { responseType: 'stream' })
  let ws = fs.createWriteStream('./music/' + fileName + '.mp3')
  res.data.pipe(ws)
  res.data.on('close', function () {
    ws.close()
  })
}

getPage(1)
```
Takeaways
Writing a crawler mostly comes down to sending requests with `axios`. Along the way you have to analyze the page structure and the site's data in order to extract the information you need. Most of this work is asynchronous, so remember to add `await`.
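A minimal sketch of that pitfall (the URL and function name here are placeholders, not from the projects above): without `await`, you get back a pending Promise instead of the response data.

```js
const axios = require('axios')

async function fetchHtml(link) {
  // Without await, res would be a pending Promise, not the response object
  let res = await axios.get(link)
  return res.data
}

fetchHtml('https://example.com').then((html) => console.log(html.length))
```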
Before we had the `cheerio` module, we scraped by regex matching; with `cheerio` we can grab elements from the page as conveniently as with `jquery`.
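Here is a small side-by-side illustration of that difference (a made-up HTML snippet, not from the projects above), showing the same extraction done with a regex and with cheerio's jQuery-style selectors:

```js
const cheerio = require('cheerio')

const html = '<ul><li class="item">one</li><li class="item">two</li></ul>'

// Regex approach: no extra modules needed, but brittle against markup changes
const reg = /<li class="item">(.*?)<\/li>/gis
let m
while ((m = reg.exec(html))) console.log('regex:', m[1])

// cheerio approach: load the HTML once, then query it like jQuery
const $ = cheerio.load(html)
$('.item').each((i, el) => console.log('cheerio:', $(el).text()))
```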
The process also exercised several of Node's core modules, including `url` parsing, `path` parsing, file reading and writing, and `stream` operations. This small crawler exercise served as a comprehensive application of the past few days of study.
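As a compact recap, here is an illustrative sketch (placeholder URL; assumes an `./img` directory already exists) combining `url` parsing, `path` parsing, and a streamed download, the same pattern used in the sticker crawler:

```js
const url = require('url')
const path = require('path')
const fs = require('fs')
const axios = require('axios')

const imgUrl = 'https://example.com/static/pic/funny.jpg' // placeholder URL
const parsed = url.parse(imgUrl)         // pathname: '/static/pic/funny.jpg'
const name = path.parse(parsed.pathname) // { base: 'funny.jpg', name: 'funny', ext: '.jpg' }

axios.get(imgUrl, { responseType: 'stream' }).then((res) => {
  // Stream the response body straight into a file instead of buffering it
  const ws = fs.createWriteStream('./img/' + name.base)
  res.data.pipe(ws)
})
```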
Mom never has to worry about me losing a meme battle again!