phantomjs/examples/extract-text.coffee

82 lines
3.1 KiB
CoffeeScript

###
Extract text from a webpage using CSS selectors to include or exclude elements
By @westonruter
###
# Trim whitespace around an element and normalize the whitespace inside
# @param {String} s
# @returns {String}
trim = (s) ->
s.replace(/\s+/g, ' ').replace(/^\s+/, '').replace(/\s+$/, '')
presetSelectors =
wikipedia:
excludeSelectors: "table.metadata, #bodyContent sup, #toc, .editsection, .rellink.relarticle.mainarticle, .rellink.boilerplate.seealso, table.vertical-navbox, .dablink, table.navbox, div.reflist, #catlinks, #mw-articlefeedback, table.infobox"
includeSelectors: "#content p, #content h1, #content h2, #content h3, #content h4, #content h5, #content h6, #content li"
_default:
includeSelectors: 'h1, h2, h3, h4, h5, h6, p, li'
if not phantom.state
if phantom.args.length < 1
console.log 'Usage: extract-text.js [options] URL'
console.log 'Try piping the output into the `say` command!'
console.log 'Options:'
console.log '--include SELECTORS'
console.log '--exclude SELECTORS'
console.log '--preset NAME (only "wikipedia" right now; default values for include/exclude)'
phantom.exit()
else
state =
url: null
includeSelectors: null
excludeSelectors: null
# Parse the command-line arguments
for arg in phantom.args
matches = arg.match(/^--(\w+)(?:=(.+))?/)
if matches
name = matches[1]
if matches[2]
value = matches[2].replace(/^["']/).replace(/["']$/)
else
value = args.shift()
switch name
when 'include'
state.includeSelectors = value
when 'exclude'
state.excludeSelectors = value
when 'preset'
if value not in presetSelectors
console.log "Error: Unrecognized preset selector set named '" + value + "'."
phantom.exit(1)
else
state.includeSelectors = presetSelectors[value].includeSelectors
state.excludeSelectors = presetSelectors[value].excludeSelectors
else
state.url = arg
# Provide default selectors
if not state.includeSelectors
state.includeSelectors = presetSelectors._default.includeSelectors
phantom.state = JSON.stringify(state)
phantom.open(state.url)
else
state = JSON.parse(phantom.state)
# Remove the elements that we want excluded
if state.excludeSelectors
excludedElements = document.querySelectorAll(state.excludeSelectors)
for el in excludedElements
el.parentNode.removeChild(el)
# Select only the remaining that we want included, and then print them out
includedElements = document.querySelectorAll(state.includeSelectors)
for el in includedElements
text = trim(el.textContent)
if text
console.log text + "\n"
phantom.exit()