// Mirror of https://github.com/vitalif/phantomjs (JavaScript, 122 lines, 3.9 KiB)
/**
 * Extract text from a webpage using CSS selectors to include or exclude elements
 * By @westonruter
 */
|
|
|
|
/**
 * Named selector presets, keyed by preset name.
 * Each preset holds comma-separated CSS selector lists: `includeSelectors`
 * and, optionally, `excludeSelectors`.
 */
var presetSelectors = {
    // Preset for Wikipedia pages
    wikipedia: {
        excludeSelectors: [
            'table.metadata',
            '#bodyContent sup',
            '#toc',
            '.editsection',
            '.rellink.relarticle.mainarticle',
            '.rellink.boilerplate.seealso',
            'table.vertical-navbox',
            '.dablink',
            'table.navbox',
            'div.reflist',
            '#catlinks',
            '#mw-articlefeedback',
            'table.infobox'
        ].join(', '),
        includeSelectors: [
            '#content p',
            '#content h1',
            '#content h2',
            '#content h3',
            '#content h4',
            '#content h5',
            '#content h6',
            '#content li'
        ].join(', ')
    },
    // Generic preset: headings, paragraphs and list items; defines no excludeSelectors
    _default: {
        includeSelectors: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li'].join(', ')
    }
};
|
|
|
|
/*
 * Script entry point. Behaviour depends on whether `phantom.state` is set:
 *
 *   - Not set: print usage if there are no arguments; otherwise parse the
 *     command line, store the parsed options as JSON in `phantom.state`
 *     and open the requested URL.
 *   - Set: read the options back from `phantom.state`, remove the excluded
 *     elements from `document`, print the text of every included element
 *     and exit.
 *
 * NOTE(review): presumably PhantomJS re-runs this script once the opened
 * page has loaded, which is how the second branch is reached -- confirm
 * against the PhantomJS version in use.
 */
if(!phantom.state) {
    if(phantom.args.length < 1){
        console.log('Usage: extract-text.js [options] URL');
        console.log('Try piping the output into the `say` command!');
        console.log('Options:');
        console.log('--include SELECTORS');
        console.log('--exclude SELECTORS');
        console.log('--preset NAME (only "wikipedia" right now; default values for include/exclude)');
        phantom.exit();
    }
    else {
        var state = {
            url: null,
            includeSelectors: null,
            excludeSelectors: null
        };

        // Set when the command line is invalid. Tracked explicitly so that no
        // page is opened after an error has been reported.
        // NOTE(review): phantom.exit() may not stop the script synchronously -- confirm.
        var hasError = false;

        // Parse the command-line arguments
        var args = toArray(phantom.args);
        while(args.length && !hasError){
            var arg = args.shift();

            // Options look like `--name=value` or `--name value`
            var matches = arg.match(/^--(\w+)(?:=(.+))?/);
            if(matches){
                var name = matches[1];
                var value;
                if(matches[2]){
                    // Strip one leading and one trailing quote. The empty
                    // replacement string is required: without it the matched
                    // quote is replaced by the text "undefined".
                    value = matches[2].replace(/^["']/, '').replace(/["']$/, '');
                }
                else {
                    // `--name value` form: the value is the next argument
                    value = args.shift();
                }

                switch(name){
                    case 'include':
                        state.includeSelectors = value;
                        break;
                    case 'exclude':
                        state.excludeSelectors = value;
                        break;
                    case 'preset':
                        // Own-property check so that inherited names such as
                        // "toString" are not mistaken for presets
                        if(!Object.prototype.hasOwnProperty.call(presetSelectors, value)){
                            console.log("Error: Unrecognized preset selector set named '" + value + "'.");
                            hasError = true;
                        }
                        else {
                            state.includeSelectors = presetSelectors[value].includeSelectors;
                            state.excludeSelectors = presetSelectors[value].excludeSelectors;
                        }
                        break;
                }
            }
            else {
                // Anything that is not an option is taken as the URL
                state.url = arg;
            }
        }

        // Options alone are not enough: there must be a page to open
        if(!hasError && !state.url){
            console.log('Error: No URL given.');
            hasError = true;
        }

        if(hasError){
            phantom.exit(1);
        }
        else {
            // Provide default selectors
            if(!state.includeSelectors){
                state.includeSelectors = presetSelectors._default.includeSelectors;
            }

            phantom.state = JSON.stringify(state);
            phantom.open(state.url);
        }
    }
}
else {
    var state = JSON.parse(phantom.state);

    // Remove the elements that we want excluded
    if(state.excludeSelectors){
        var excludedElements = document.querySelectorAll(state.excludeSelectors);
        toArray(excludedElements).forEach(function(el){
            el.parentNode.removeChild(el);
        });
    }

    // Select only the remaining that we want included, and then print them out
    var includedElements = document.querySelectorAll(state.includeSelectors);
    toArray(includedElements).forEach(function(el){
        var text = trim(el.textContent);
        // Skip elements whose text is empty after whitespace normalization
        if(text){
            console.log(text + "\n");
        }
    });

    phantom.exit();
}
|
|
|
|
|
|
/**
 * Build a new Array from an array or an array-like object
 * @param {Object} list  Array or array-like value to copy
 * @returns {Array}  A new array holding the elements of `list`
 */
function toArray(list){
    var slice = Array.prototype.slice;
    return slice.call(list, 0);
}
|
|
|
|
/**
 * Collapse every run of whitespace in a string to a single space and
 * strip the whitespace at both ends
 * @param {String} s  Text to normalize
 * @returns {String}  The normalized text
 */
function trim(s){
    // Collapse first, so at most one space is left at either end
    var collapsed = s.replace(/\s+/g, ' ');
    return collapsed.replace(/^\s+|\s+$/g, '');
}
|