phantomjs/examples/extract-text.js

122 lines
3.9 KiB
JavaScript

/**
* Extract text from a webpage using CSS selectors to include or exclude elements
* By @westonruter
*/
/**
 * Named sets of CSS selectors. Each preset may provide `includeSelectors`
 * (elements whose text is printed) and `excludeSelectors` (elements removed
 * from the page before extraction). The `_default` preset supplies the include
 * selectors used when none were given on the command line.
 */
var presetSelectors = {
    wikipedia: {
        // Page furniture to strip before extracting text
        excludeSelectors: [
            'table.metadata',
            '#bodyContent sup',
            '#toc',
            '.editsection',
            '.rellink.relarticle.mainarticle',
            '.rellink.boilerplate.seealso',
            'table.vertical-navbox',
            '.dablink',
            'table.navbox',
            'div.reflist',
            '#catlinks',
            '#mw-articlefeedback',
            'table.infobox'
        ].join(', '),
        // Headings, paragraphs and list items inside the content area
        includeSelectors: [
            '#content p',
            '#content h1',
            '#content h2',
            '#content h3',
            '#content h4',
            '#content h5',
            '#content h6',
            '#content li'
        ].join(', ')
    },
    _default: {
        includeSelectors: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li'].join(', ')
    }
};
/**
 * Print the command-line help text.
 */
function printUsage(){
    console.log('Usage: extract-text.js [options] URL');
    console.log('Try piping the output into the `say` command!');
    console.log('Options:');
    console.log('--include SELECTORS');
    console.log('--exclude SELECTORS');
    console.log('--preset NAME (only "wikipedia" right now; default values for include/exclude)');
}

/**
 * Parse the command-line arguments into the state object that is carried over
 * to the extraction phase.
 *
 * Options may be written as `--name=value` or `--name value`. Any argument that
 * does not look like an option is taken as the URL (the last one wins).
 * Unrecognized options are ignored, together with their value.
 *
 * @param {Object} argv Array-like list of command-line arguments
 * @returns {Object|null} State with `url`, `includeSelectors` and
 *   `excludeSelectors`, or null when the arguments are invalid (an error
 *   message has already been logged in that case)
 */
function parseArguments(argv){
    var state = {
        url: null,
        includeSelectors: null,
        excludeSelectors: null
    };
    var args = toArray(argv);
    while(args.length){
        var arg = args.shift();
        var matches = arg.match(/^--(\w+)(?:=(.+))?/);
        if(!matches){
            state.url = arg;
            continue;
        }
        var name = matches[1];
        var value;
        if(matches[2]){
            // Strip one optional surrounding quote on each side. The replacement
            // string must be given explicitly: omitting it would insert the
            // text "undefined" in place of the quote.
            value = matches[2].replace(/^["']/, '').replace(/["']$/, '');
        }
        else {
            // `--name value` form: the value is the next argument
            value = args.shift();
        }
        switch(name){
            case 'include':
                state.includeSelectors = value;
                break;
            case 'exclude':
                state.excludeSelectors = value;
                break;
            case 'preset':
                // Own-property check so that inherited names such as
                // "toString" are not mistaken for presets
                if(!Object.prototype.hasOwnProperty.call(presetSelectors, value)){
                    console.log("Error: Unrecognized preset selector set named '" + value + "'.");
                    return null;
                }
                state.includeSelectors = presetSelectors[value].includeSelectors;
                state.excludeSelectors = presetSelectors[value].excludeSelectors;
                break;
        }
    }
    if(!state.url){
        console.log('Error: No URL specified.');
        return null;
    }
    // Provide default selectors
    if(!state.includeSelectors){
        state.includeSelectors = presetSelectors._default.includeSelectors;
    }
    return state;
}

// The script has two phases, distinguished by whether phantom.state is set:
// the first parses the arguments, stores them in phantom.state and opens the
// URL; the second reads phantom.state back and extracts text from `document`.
// NOTE(review): presumably PhantomJS re-runs this script once the page has
// loaded, with phantom.state preserved -- confirm against the PhantomJS
// version in use.
if(!phantom.state){
    if(phantom.args.length < 1){
        printUsage();
        phantom.exit();
    }
    else {
        var state = parseArguments(phantom.args);
        if(state){
            phantom.state = JSON.stringify(state);
            phantom.open(state.url);
        }
        else {
            // Invalid arguments: do not attempt to open anything
            phantom.exit(1);
        }
    }
}
else {
    var state = JSON.parse(phantom.state);
    // Remove the elements that we want excluded
    if(state.excludeSelectors){
        var excludedElements = document.querySelectorAll(state.excludeSelectors);
        toArray(excludedElements).forEach(function(el){
            el.parentNode.removeChild(el);
        });
    }
    // Select only the remaining that we want included, and then print them out
    var includedElements = document.querySelectorAll(state.includeSelectors);
    toArray(includedElements).forEach(function(el){
        var text = trim(el.textContent);
        if(text){
            // Extra newline leaves a blank line between printed elements
            console.log(text + "\n");
        }
    });
    phantom.exit();
}
/**
 * Produce a real Array holding the indexed items of an array-like object
 * (anything with a `length` and numeric keys). Passing an actual array
 * yields a shallow copy of it.
 * @param {Object} list Array or array-like object
 * @returns {Array} New array with the same items
 */
function toArray(list){
    var slice = Array.prototype.slice;
    return slice.call(list);
}
/**
 * Collapse every run of whitespace in a string to a single space, then strip
 * whatever whitespace remains at the start and end.
 * @param {String} s Text to clean up
 * @returns {String} The normalized text
 */
function trim(s){
    var collapsed = s.replace(/\s+/g, ' ');
    var withoutLeading = collapsed.replace(/^\s+/, '');
    return withoutLeading.replace(/\s+$/, '');
}