Below is a lightweight scraper for JavaScript-enabled webpages. It uses Readability.js to extract readable text content from a JSDOM-rendered page.
const got = require('got');
const { Readability } = require('@mozilla/readability');
const { JSDOM } = require('jsdom');
const { htmlToText } = require('html-to-text');
/**
 * Fetch a URL, extract the main article with Readability, and print it to
 * stdout as plain text. On any failure (network error, unextractable page)
 * the error is logged to stderr and process.exitCode is set to 1, so the
 * calling process can detect failure via the exit status.
 *
 * @param {string} url - The page to scrape.
 * @returns {Promise<void>} Resolves when printing (or error handling) is done.
 */
function parseArticle(url) {
  const options = {
    headers: {
      // Some sites block default/bot user agents; present as Safari on macOS.
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    },
  };
  // Return the promise so callers could await completion if needed.
  return got(url, options)
    .then((response) => {
      const dom = new JSDOM(response.body, { url: url, runScripts: "outside-only", resources: "usable" });
      const article = new Readability(dom.window.document).parse();
      // Readability.parse() returns null when it cannot find article content;
      // the original code would then crash on `article.content`.
      if (article === null) {
        throw new Error(`Readability could not extract content from ${url}`);
      }
      const text = htmlToText(article.content, {
        wordwrap: null,
        selectors: [
          { selector: 'img', format: 'skip' },
          { selector: 'a', options: { hideLinkHrefIfSameAsText: true } },
          { selector: 'p', options: { leadingLineBreaks: 0, trailingLineBreaks: 0 } },
          { selector: 'pre', options: { leadingLineBreaks: 0, trailingLineBreaks: 0 } },
        ],
      });
      console.log(text);
    })
    .catch((err) => {
      // The original had no .catch, so network/HTTP errors became unhandled
      // rejections, and its `return 0/1` values inside .then were discarded.
      // Report failure on stderr (stdout carries the scraped text) and via
      // the process exit code, which the Python caller checks.
      console.error(err);
      process.exitCode = 1;
    });
}
parseArticle(process.argv[2]);
# Install nodejs-bin first (provides bundled node/npm binaries usable from Python):
#   pip install nodejs-bin
# NOTE: the original snippet had the `pip install` line as bare text, which is a
# syntax error when this snippet is run as Python — it must be a shell command.
from nodejs import node, npm

# Install the npm packages required by contentParser.js.
# got is pinned to 11.8.5, presumably because newer got releases are ESM-only
# and cannot be require()d from the CommonJS script — TODO confirm.
for package in ("[email protected]", "html-to-text", "@mozilla/readability", "jsdom"):
    npm.call(["install", package])
import logging
import os

from nodejs import node
def extract_text(urls):
    """
    Extract text from the given urls.

    Uses jsdom to parse javascript generated DOM within a node process. Note that we are
    executing external javascripts in an unsafe manner. So run this inside a VM.

    Parameters
    ----------
    urls : sequence
        Iterable of URL strings to scrape.

    Yields
    ------
    tuple of (str, str)
        (url, text) for each url scraped successfully. Urls that fail
        (non-zero exit code, timeout, etc.) are logged and skipped.
    """
    # The original referenced an undefined name `logger`, so the exception
    # handler itself raised NameError; create a module-scoped logger here.
    logger = logging.getLogger(__name__)
    this_module_dir = os.path.dirname(os.path.abspath(__file__))
    script = os.path.join(this_module_dir, "contentParser.js")
    for url in urls:
        output = ""
        try:
            output = node.run(
                [script, url],
                capture_output=True,
                encoding="utf-8",
                timeout=30,  # guard against pages that hang the node process
            )
            # contentParser.js sets a non-zero exit code on scrape failure.
            if output.returncode != 0:
                raise Exception("Error, non zero return code from node subprocess")
            yield url, output.stdout.strip()
        except Exception:
            # Best-effort: log the failure and continue with the next url
            # rather than aborting the whole batch.
            logger.exception(f"Error processing {url}, output => {output}")
This may not work when the web app uses React or a similar framework. In that case we will need to use a Selenium web driver to write a custom web scraper in Python that waits for DOM elements to render before invoking actions. Alternatively, if you prefer a managed scraping environment, consider https://apify.com/.