1. 爬取内容 #

1. http/axios 等爬取API接口 #

// Request an API endpoint with axios and print the `data` field of the response.
const axios = require('axios');
axios.get('https://follow-api-ms.juejin.im/v1/getUserFollowInfo?uid=551d6923e4b0cd5b623f54da&src=web')
    .then(res => console.log(res.data))
    // Without a rejection handler a failed request surfaces as an unhandled rejection.
    .catch(err => console.error('request failed:', err.message));

2. superagent/request/crawl爬取HTML页面 #

// Download an HTML page with `request` and pull the link titles out with a regular expression.
const request = require('request');
request('https://juejin.im/tag/%E5%89%8D%E7%AB%AF', (err, response, body) => {
    // When the request fails `body` is undefined, so stop before reading it.
    if (err) {
        console.error('request failed:', err.message);
        return;
    }
    const regexp = /class="title" data-v-\w+>(.+?)<\/a>/g;
    // matchAll yields each match with its capture groups; group 1 is the title text.
    const titles = Array.from(body.matchAll(regexp), match => match[1]);
    console.log(titles);
});

3. 使用puppeteer控制chromium #

// Launch Chromium, open https://www.baidu.com, save a screenshot to baidu.png, then close the browser.
(async () => {
    // Loaded inside the function: the module-level `puppeteer` binding is declared
    // after this call runs, so it would still be uninitialised here.
    const puppeteer = require('puppeteer');
    const browser = await puppeteer.launch(); // start the browser
    try {
        const page = await browser.newPage(); // open a blank page
        await page.goto('https://www.baidu.com'); // navigate and wait for the page to load
        await page.screenshot({ path: 'baidu.png' }); // take a screenshot
    } finally {
        // Always shut the browser down, even when a step above throws.
        await browser.close();
    }
})().catch(err => {
    console.error(err);
    process.exitCode = 1;
});
const puppeteer=require('puppeteer');
const fs=require('fs');
/**
 * Open the juejin.im tag page in a visible browser, collect the text of every
 * `a.title` element and write the texts to comments.txt, separated by CRLF.
 */
(async function () {
        const browser = await puppeteer.launch({headless: false});
        try {
                const page = await browser.newPage();
                await page.goto('https://juejin.im/tag/%E5%89%8D%E7%AB%AF', {
                        waitUntil: 'networkidle2'
                });
                // Fixed 500 ms pause; a plain timer is used because `page.waitFor`
                // is deprecated and absent from later Puppeteer releases.
                await new Promise(resolve => setTimeout(resolve, 500));
                // The callback runs in the page context and returns the link texts.
                const comments = await page.$$eval('a.title', els => {
                        return els.map(item => item.innerText);
                });
                fs.writeFileSync('comments.txt', comments.join('\r\n'), 'utf8');
        } finally {
                // Close the browser even if navigation or extraction failed.
                await browser.close();
        }
})().catch(err => {
        console.error(err);
        process.exitCode = 1;
});
const puppeteer=require('puppeteer');
/**
 * Search jd.com for "手机", collect the product links from the result list, then
 * open the first result in a new page and save a full-page screenshot as
 * items-<index>.png.
 */
(async function () {
    const browser = await puppeteer.launch({headless: false}); // start the browser
    try {
        const searchPage = await browser.newPage(); // create a Page instance
        await searchPage.setJavaScriptEnabled(true); // enable JavaScript
        await searchPage.goto("https://www.jd.com/");

        const searchInput = await searchPage.$("#key"); // look up the search box
        if (!searchInput) {
            throw new Error('search input "#key" not found');
        }
        await searchInput.focus(); // put the cursor in the search box
        await searchPage.keyboard.type("手机"); // type the search term

        const searchBtn = await searchPage.$(".button");
        if (!searchBtn) {
            throw new Error('search button ".button" not found');
        }
        await searchBtn.click();

        // Wait for the result items; they are loaded asynchronously and cannot be
        // queried before they appear.
        await searchPage.waitForSelector('.gl-item');
        const links = await searchPage.$$eval('.gl-item > .gl-i-wrap > .p-img > a', anchors => {
            return anchors.map(a => ({
                href: a.href.trim(),
                title: a.title
            }));
        });
        await searchPage.close();

        // Only the first result is processed; slice leaves `links` untouched.
        const aTags = links.slice(0, 1);
        for (const [i, a] of aTags.entries()) {
            const page = await browser.newPage();
            try {
                await page.setJavaScriptEnabled(true);
                await page.setViewport({ // resize the browser viewport
                    width: 1920,
                    height: 1080
                });
                await page.goto(a.href, {timeout: 0});
                await page.screenshot({
                    path: `items-${i}.png`,
                    fullPage: true
                });
            } finally {
                // Release the page before moving on, even if the screenshot failed.
                await page.close();
            }
        }
    } finally {
        // Always shut the browser down so no Chromium process is left behind.
        await browser.close();
    }
})().catch(err => {
    console.error(err);
    process.exitCode = 1;
});

2. 数据持久化 #

3. 数据订阅 #

4. 分发 #

有新的数据的时候,将其分发给订阅者。

参考 #