Scraping webpages

2 min read

A lightweight scraper for JavaScript-enabled webpages is below. It uses Readability.js to extract readable text content from a JSDOM-rendered webpage.

const got = require('got');
const { Readability } = require('@mozilla/readability');
const { JSDOM } = require('jsdom');
const { htmlToText } = require('html-to-text');


/**
 * Fetch `url`, render it with JSDOM, extract the main article via
 * Readability, and print the plain-text content to stdout.
 *
 * Errors go to stderr (never stdout, which the calling process reads as
 * the scraped text) and set a non-zero process exit code so callers that
 * check the return code can detect failure.
 *
 * @param {string} url - The page to scrape.
 * @returns {Promise<number>} 0 on success, 1 on failure.
 */
function parseArticle(url) {
    const options = {
        headers: {
            // Some sites block requests without a browser-like UA.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
        },
    };
    return got(url, options)
        .then((response) => {
            const dom = new JSDOM(response.body, { url, runScripts: "outside-only", resources: "usable" });
            const article = new Readability(dom.window.document).parse();
            // Readability.parse() returns null when it cannot extract an article.
            if (article === null || !article.content) {
                throw new Error(`Readability could not extract an article from ${url}`);
            }

            const text = htmlToText(article.content, {
                wordwrap: null,
                selectors: [
                    { selector: 'img', format: 'skip' },
                    { selector: 'a', options: { hideLinkHrefIfSameAsText: true } },
                    { selector: 'p', options: { leadingLineBreaks: 0, trailingLineBreaks: 0 } },
                    { selector: 'pre', options: { leadingLineBreaks: 0, trailingLineBreaks: 0 } },
                ],
            });

            console.log(text);
            return 0;
        })
        .catch((err) => {
            // stderr only: stdout must contain nothing but the extracted text.
            console.error(err);
            // Signal failure to the parent process (checked via returncode).
            process.exitCode = 1;
            return 1;
        });
}


parseArticle(process.argv[2]);


Install in a Python environment

pip install nodejs-bin

# install npm packages from python
from nodejs import node, npm

for package in ("[email protected]", "html-to-text", "@mozilla/readability", "jsdom"):
    npm.call(["install", package])


import os
from nodejs import node

def extract_text(urls):
    """
    Extract readable text from the given urls.

    Uses jsdom to parse JavaScript-generated DOM within a node subprocess.
    Note that we are executing external JavaScript in an unsafe manner,
    so run this inside a VM.

    Parameters
    ----------
    urls : sequence of str
        URLs to scrape.

    Yields
    ------
    tuple of (str, str)
        Pairs of (url, extracted text). URLs that fail are logged and
        skipped rather than raising.
    """
    # Function-scope import keeps this edit self-contained; `logger` was
    # previously referenced without being defined anywhere in this module.
    import logging
    logger = logging.getLogger(__name__)

    this_module_dir = os.path.dirname(os.path.abspath(__file__))
    for url in urls:
        # None (not "") so the log message distinguishes "node.run itself
        # raised" (e.g. a timeout) from "node ran but returned an error".
        output = None
        try:
            output = node.run(
                [f"{this_module_dir}/contentParser.js", url],
                capture_output=True,
                encoding="utf-8",
                timeout=30,
            )
            if output.returncode != 0:
                raise RuntimeError(
                    f"node subprocess exited with {output.returncode}: {output.stderr}"
                )
            yield url, output.stdout.strip()
        except Exception:
            logger.exception(f"Error processing {url}, output => {output}")

           

This may not work when the web app uses React or a similar framework. In that case we will need to use Selenium WebDriver to write a custom web scraper in Python that waits for DOM elements to be rendered before invoking actions. Or, if you prefer a managed scraping environment, go for https://apify.com/

web scraping JSDOM selenium apify