学习笔记
对 Puppeteer 进行了一个练习,记录一下笔记代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
| let puppeteer = require('puppeteer')
/**
 * Practice run: opens the dytt8 home page in a visible browser, types a
 * keyword into the search box and clicks the submit button.
 *
 * @returns {Promise<void>}
 * @throws {Error} if the search input or the submit button cannot be found.
 */
async function test() {
  const options = {
    defaultViewport: {
      width: 1920,
      height: 1080,
    },
    headless: false, // show the browser window
    slowMo: 300, // slow every Puppeteer operation down so it can be watched
  }
  const browser = await puppeteer.launch(options)
  const page = await browser.newPage()
  await page.goto('https://www.dytt8.net/index.htm')

  // Declared locally: the original assigned `InputEle` without a declaration,
  // which creates an implicit global (and throws in strict mode).
  const inputEle = await page.$('.searchl .formhue')
  if (!inputEle) {
    throw new Error('search input ".searchl .formhue" not found')
  }
  await inputEle.focus()
  await page.keyboard.type('蝙蝠侠')

  // Stop click events from bubbling past the search container.
  // NOTE(review): presumably this keeps a page-level click handler (ad pop-up?)
  // from firing when the button is clicked — confirm against the site.
  await page.$eval('.bd3rl .search', (element) => {
    element.addEventListener('click', (event) => (event.cancelBubble = true))
  })

  const btn = await page.$('.searchr input[name = "Submit"]')
  if (!btn) {
    throw new Error('submit button \'.searchr input[name = "Submit"]\' not found')
  }
  await btn.click()
}

test().catch((err) => {
  console.error(err)
})
|
实战代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
| let puppeteer = require('puppeteer') let url = require('url') let fs = require('fs')
let httpUrl = 'https://sobooks.cc/'
;(async function () {
  // Launch options for watching the crawl in a visible, slowed-down browser.
  // Not passed to puppeteer.launch() below; swap it in for `options` to debug.
  let debugOptions = {
    defaultViewport: {
      width: 1400,
      height: 720,
    },
    headless: false,
    slowMo: 300,
  }
  // Launch options actually used: run the browser headless.
  let options = { headless: true }
  // Browser instance created once for the rest of this async IIFE.
  let browser = await puppeteer.launch(options)
/**
 * Promise-based delay.
 *
 * @param {number} millSeconds - how long to wait, in milliseconds.
 * @returns {Promise<string>} resolves after the delay with a message that
 *   ends with the requested duration; never rejects.
 */
function wait(millSeconds) {
  return new Promise((done) => {
    const onTimeout = () => done('成功执行延迟函数,延迟时间:' + millSeconds)
    setTimeout(onTimeout, millSeconds)
  })
}
/**
 * Opens `link` in a new tab and reads the total page count from the
 * pagination widget.
 *
 * @param {string} link - URL of a listing page that shows the pagination bar.
 * @returns {Promise<string>} the trimmed text taken from the last pagination
 *   item (a string, not a number).
 */
async function getAllNum(link) {
  let page = await browser.newPage()
  try {
    // Interception has to be enabled BEFORE navigating; the original turned it
    // on after goto(), so the ad filter never saw the page's own requests.
    await page.setRequestInterception(true)
    page.on('request', (interceptedRequest) => {
      let urlObj = url.parse(interceptedRequest.url())
      if (urlObj.hostname === 'googleads.g.doubleclick.net') {
        interceptedRequest.abort()
      } else {
        interceptedRequest.continue()
      }
    })
    await page.goto(link)

    let pageNum = await page.$eval(
      '.pagination li:last-child span',
      (element) => {
        // Drops the first character and the last two characters of the label.
        // NOTE(review): presumably the label looks like "共 N 页" — confirm.
        let text = element.innerHTML
          .substring(1, element.innerHTML.length - 2)
          .trim()
        return text
      }
    )
    return pageNum
  } finally {
    // The original never closed this tab.
    await page.close()
  }
}
/**
 * Scrapes listing page number `num`, collects every book card's link and
 * title, then starts `getPageInfo` for each book, staggered 500 ms apart.
 *
 * @param {number|string} num - listing page number appended to the base URL.
 * @returns {Promise<void>} resolves once every started `getPageInfo` call has
 *   settled; a failure for one book is logged and does not reject.
 */
async function pageList(num) {
  let pageListUrl = 'https://sobooks.cc/page/' + num
  let page = await browser.newPage()
  let arrPage
  try {
    // Enable interception before navigating so the ad filter applies to the
    // page load itself (the original enabled it after goto()).
    await page.setRequestInterception(true)
    page.on('request', (interceptedRequest) => {
      let urlObj = url.parse(interceptedRequest.url())
      if (urlObj.hostname === 'googleads.g.doubleclick.net') {
        interceptedRequest.abort()
      } else {
        interceptedRequest.continue()
      }
    })
    await page.goto(pageListUrl)

    // Runs inside the browser page; only plain serialisable data comes back.
    arrPage = await page.$$eval('.card .card-item .thumb-img>a', (elements) =>
      elements.map((element) => ({
        href: element.getAttribute('href'),
        title: element.getAttribute('title'),
      }))
    )
  } finally {
    await page.close()
  }

  // Same 500 ms * index stagger as before, but the promises are now collected
  // and their errors handled instead of floating inside forEach(async ...).
  await Promise.all(
    arrPage.map(async (pageObj, i) => {
      await wait(500 * i)
      try {
        await getPageInfo(pageObj)
      } catch (err) {
        console.error('抓取书籍详情失败:' + pageObj.title, err)
      }
    })
  )
}
/**
 * Opens one book's detail page, extracts the download-page URL from the
 * download table and appends a `{title, href}` record to `book.txt`.
 *
 * @param {{href: string, title: string}} pageObj - one book card, as
 *   collected from a listing page.
 * @returns {Promise<void>} resolves after the record is written, or after the
 *   book is skipped because no usable link was found.
 */
async function getPageInfo(pageObj) {
  let page = await browser.newPage()
  try {
    // Enable interception before navigating so the ad filter applies to the
    // page load itself (the original enabled it after goto()).
    await page.setRequestInterception(true)
    page.on('request', (interceptedRequest) => {
      let urlObj = url.parse(interceptedRequest.url())
      if (urlObj.hostname === 'googleads.g.doubleclick.net') {
        interceptedRequest.abort()
      } else {
        interceptedRequest.continue()
      }
    })
    await page.goto(pageObj.href)

    let eleA = await page.$('.dltable tr:nth-child(3) a:last-child')
    if (!eleA) {
      console.log('未找到下载链接,跳过:' + pageObj.title)
      return
    }

    // jsonValue() is the public way to read a handle's value; the original
    // reached into the private `_remoteObject` field.
    let hrefHandle = await eleA.getProperty('href')
    let fullHref = await hrefHandle.jsonValue()

    // The link is a redirect of the form "...?url=<target>"; keep the target.
    let aHref = String(fullHref).split('?url=')[1]
    if (aHref === undefined) {
      console.log('下载链接格式异常,跳过:' + pageObj.title)
      return
    }

    // Same record layout as before, but the values are JSON-escaped so a title
    // containing quotes or backslashes still yields parseable JSON.
    let content = `{\n"title":${JSON.stringify(
      pageObj.title
    )},\n"href":${JSON.stringify(aHref)}\n}\n`
    await fs.promises.appendFile('book.txt', content)
    console.log('正在写入数据:' + pageObj.title)
  } finally {
    // Close the tab even when scraping or writing fails.
    await page.close()
  }
}
async function spider(link) { let allPageNum = await getAllNum(link) console.log('成功获取页面总数:' + allPageNum)
for (let i = 1; i <= allPageNum; i++) { await wait(4000 * i) pageList(i) } } spider(httpUrl) })()
|
下载代码(读取上一步生成的 book.txt,逐本下载电子书):
| let puppeteer = require('puppeteer') let url = require('url') let fs = require('fs') let { fsRead, fsWrite, fsDir } = require('./rw') let axios = require('axios')
let httpUrl = 'https://sobooks.cc/'
;(async function () {
  // Visible, slowed-down browser; this is what the script launches with.
  let debugOptions = {
    defaultViewport: {
      width: 1400,
      height: 800,
    },
    headless: false,
    slowMo: 300,
  }
  // Headless alternative; not used below, swap it into launch() if wanted.
  let options = { headless: true }
  let browser = await puppeteer.launch(debugOptions)

  // Host whose requests are treated as the real file download.
  // NOTE(review): hard-coded to one CDN node; other books may be served from
  // a different host — confirm.
  const DOWNLOAD_HOST = 'u066.164-ctc-dd.tv002.com'

  /**
   * Reads ./book.txt and parses every `{...}` record in it.
   * A record that is not valid JSON is logged and skipped.
   *
   * @returns {Promise<Array<{title: string, href: string}>>}
   */
  async function parseTxt() {
    let textContent = await fsRead('./book.txt')
    // Non-greedy, dot-matches-newline: one match per multi-line record.
    let reg = /(\{.*?\})/gis
    let bookArr = []
    let tempRes
    while ((tempRes = reg.exec(textContent))) {
      try {
        bookArr.push(JSON.parse(tempRes[1]))
      } catch (err) {
        console.error('跳过无法解析的记录:' + tempRes[1], err)
      }
    }
    return bookArr
  }

  let bookArr = await parseTxt()
  let index = 0

  /**
   * Starts the next book and keeps the queue moving: if that book fails, the
   * error is logged and the following book is started.
   */
  function startNextDownload() {
    downloadBook().catch((err) => {
      console.error('下载流程出错,跳过当前书籍:', err)
      startNextDownload()
    })
  }

  /**
   * Streams `downloadUrl` into ./book/<title>.epub.
   *
   * @param {string} downloadUrl - direct file URL captured from the browser.
   * @param {string} title - book title, used for the file name.
   * @returns {Promise<void>} resolves when the file has been fully written.
   */
  async function saveBook(downloadUrl, title) {
    // The title comes from scraped HTML: replace characters that are path
    // separators or illegal in file names.
    let safeTitle = String(title).replace(/[\\/:*?"<>|]/g, '_')
    await fs.promises.mkdir('./book', { recursive: true })
    let res = await axios.get(downloadUrl, { responseType: 'stream' })
    await new Promise((resolve, reject) => {
      let ws = fs.createWriteStream('./book/' + safeTitle + '.epub')
      ws.on('close', resolve)
      ws.on('error', reject)
      res.data.on('error', (err) => {
        ws.destroy()
        reject(err)
      })
      res.data.pipe(ws)
    })
    console.log('下载完成:' + title)
  }

  /**
   * Processes the next unprocessed book: opens its download-list page, reads
   * the first file link and hands it to `bookLinkPage`.
   *
   * @returns {Promise<string|undefined>} '下载完成' when no books are left,
   *   otherwise undefined once the file page has been driven.
   */
  async function downloadBook() {
    if (index >= bookArr.length) {
      return '下载完成'
    }
    let bookObj = bookArr[index]
    index++

    let elementAHref
    let page = await browser.newPage()
    try {
      await page.goto(bookObj.href)
      await page.waitForSelector('#table_files tbody .even a', {
        visible: true,
      })
      elementAHref = await page.$eval(
        '#table_files tbody .even a',
        (element) => element.getAttribute('href')
      )
    } finally {
      await page.close()
    }
    await bookLinkPage(elementAHref, bookObj.title)
  }

  /**
   * Opens the file page, clicks its download button and watches outgoing
   * requests. The first request to DOWNLOAD_HOST is aborted in the browser
   * and fetched with axios instead; the next book is then started.
   *
   * @param {string} link - path of the file page, relative to https://590m.com.
   * @param {string} title - book title, used for logging and the file name.
   * @returns {Promise<void>}
   */
  async function bookLinkPage(link, title) {
    let page = await browser.newPage()
    // Ensures the download and the hand-over to the next book happen once,
    // even if the page issues several requests to the download host.
    let intercepted = false

    await page.setRequestInterception(true)
    page.on('request', (interceptedRequest) => {
      let urlObj = url.parse(interceptedRequest.url())
      if (urlObj.hostname !== DOWNLOAD_HOST) {
        interceptedRequest.continue()
        return
      }
      interceptedRequest.abort()
      if (intercepted) {
        return
      }
      intercepted = true
      console.log('截获下载地址:' + urlObj.href)
      saveBook(urlObj.href, title).catch((err) => {
        console.error('下载失败:' + title, err)
      })
      // As before, the next book starts without waiting for this download.
      startNextDownload()
      page.close().catch((err) => {
        console.error('关闭页面失败:', err)
      })
    })

    try {
      await page.goto('https://590m.com' + link)
      await page.waitForSelector('.btn.btn-outline-secondary.fs--1')
      let btn = await page.$('.btn.btn-outline-secondary.fs--1')
      await btn.click()
    } catch (err) {
      if (intercepted) {
        // The handler closed the tab while a step was still pending; that
        // rejection is expected and the queue has already moved on.
        return
      }
      page.close().catch((closeErr) => {
        console.error('关闭页面失败:', closeErr)
      })
      throw err
    }
    // NOTE(review): if the click leads to an ad page or captcha and no request
    // to DOWNLOAD_HOST is ever made, the queue stops here.
  }

  startNextDownload()
})()
|
这个下载代码好像还有点问题,主要是有广告页和验证码弹出来。