lime/parse_engine.php

<?php
/**
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

define('LIME_CALL_PROTOCOL', '$tokens, &$result');

abstract class lime_parser {
}

/**
 * The input doesn't match the grammar
 */
class parse_error extends Exception {
}

/**
 * Bug, I made a mistake
 */
class parse_bug extends Exception {
}

class parse_unexpected_token extends parse_error {
	public function __construct($type, $state) {
		parent::__construct("Unexpected token of type {$type}");

		$this->type = $type;
		$this->state = $state;
	}
}

class parse_premature_eof extends parse_error {
	public function __construct(array $expect) {
		parent::__construct('Premature EOF');
	}
}

class parse_stack {
	public $q;
	public $qs = array();
	/**
	 * Stack of semantic actions
	 */
	public $ss = array();

	public function __construct($qi) {
		$this->q = $qi;
	}

	public function shift($q, $semantic) {
		$this->ss[] = $semantic;
		$this->qs[] = $this->q;

		$this->q = $q;

		// echo "Shift $q -- $semantic\n";
	}

	public function top_n($n) {
		if (!$n) {
			return array();
		}

		return array_slice($this->ss, 0 - $n);
	}

	public function pop_n($n) {
		if (!$n) {
			return array();
		}

		$qq = array_splice($this->qs, 0 - $n);
		$this->q = $qq[0];

		return array_splice($this->ss, 0 - $n);
	}

	public function occupied() {
		return !empty($this->ss);
	}

	public function index($n) {
		if ($n) {
			$this->q = $this->qs[count($this->qs) - $n];
		}
	}

	public function text() {
		return $this->q . ' : ' . implode(' . ', array_reverse($this->qs));
	}
}

class parse_engine {
	public $debug = false;

	public $parser;
	public $qi;
	public $rule;
	public $step;
	public $descr;
	/**
	 * @var boolean
	 */
	public $accept;
	/**
	 * @var parse_stack
	 */
	public $stack;

	public function __construct($parser) {
		$this->parser = $parser;
		$this->qi = $parser->qi;
		$this->rule = $parser->a;
		$this->step = $parser->i;
		$this->descr = $parser->d;

		$this->reset();
	}

	public function reset() {
		$this->accept = false;
		$this->stack = new parse_stack($this->qi);
		$this->parser->errors = array();
	}

	private function enter_error_tolerant_state() {
		while ($this->stack->occupied()) {
			if ($this->has_step_for('error')) {
				return true;
			}

			if ($this->debug) echo "Dropped an item from the stack, {" . implode(', ', $this->get_steps()) . "} left\n";
			if ($this->debug) echo 'Currently in state ' . $this->state() . "\n";

			$this->drop();
		}

		return false;
	}

	private function drop() {
		$this->stack->pop_n(1);
	}

	/*
	 * So that I don't get any brilliant misguided ideas:
	 *
	 * The "accept" step happens when we try to eat a start symbol.
	 * That happens because the reductions up the stack at the end
	 * finally (and symetrically) tell the parser to eat a symbol
	 * representing what they've just shifted off the end of the stack
	 * and reduced. However, that doesn't put the parser into any
	 * special different state. Therefore, it's back at the start
	 * state.
	 *
	 * That being said, the parser is ready to reduce an EOF to the
	 * empty program, if given a grammar that allows them.
	 *
	 * So anyway, if you literally tell the parser to eat an EOF
	 * symbol, then after it's done reducing and accepting the prior
	 * program, it's going to think it has another symbol to deal with.
	 * That is the EOF symbol, which means to reduce the empty program,
	 * accept it, and then continue trying to eat the terminal EOF.
	 *
	 * This infinte loop quickly runs out of memory.
	 *
	 * That's why the real EOF algorithm doesn't try to pretend that
	 * EOF is a terminal. Like the invented start symbol, it's special.
	 *
	 * Instead, we pretend to want to eat EOF, but never actually
	 * try to get it into the parse stack. (It won't fit.) In short,
	 * we look up what reduction is indicated at each step in the
	 * process of rolling up the parse stack.
	 *
	 * The repetition is because one reduction is not guaranteed to
	 * cascade into another and clean up the entire parse stack.
	 * Rather, it will instead shift each partial production as it
	 * is forced to completion by the EOF lookahead.
	 */
	public function eat_eof() {
		// We must reduce as if having read the EOF symbol
		do {
			// and we have to try at least once, because if nothing
			// has ever been shifted, then the stack will be empty
			// at the start.
			list($opcode, $operand) = $this->step_for('#');

			switch ($opcode) {
			case 'r':
				$this->reduce($operand);
				break;
			case 'e':
				$this->premature_eof();
				break;
			default:
				throw new parse_bug();
			}
		} while ($this->stack->occupied());

		// If the sentence is well-formed according to the grammar, then
		// this will eventually result in eating a start symbol, which
		// causes the "accept" instruction to fire. Otherwise, the
		// step('#') method will indicate an error in the syntax, which
		// here means a premature EOF.
		//
		// Incidentally, some tremendous amount of voodoo with the parse
		// stack might help find the beginning of some unfinished
		// production that the sentence was cut off during, but as a
		// general rule that would require deeper knowledge.
		if (!$this->accept) {
			throw new parse_bug();
		}

		return $this->semantic;
	}

	private function premature_eof() {
		$seen = array();

		$expect = $this->get_steps();

		while ($this->enter_error_tolerant_state() || $this->has_step_for('error')) {
			if (isset($seen[$this->state()])) {
				// This means that it's pointless to try here.
				// We're guaranteed that the stack is occupied.
				$this->drop();
				continue;
			}

			$seen[$this->state()] = true;

			$this->eat('error', 'Premature EOF');

			if ($this->has_step_for('#')) {
				// Good. We can continue as normal.
				return;
			} else {
				// That attempt to resolve the error condition
				// did not work. There's no point trying to
				// figure out how much to slice off the stack.
				// The rest of the algorithm will make it happen.
			}
		}

		throw new parse_premature_eof($expect);
	}

	private function current_row() {
		return $this->step[$this->state()];
	}

	private function step_for($type) {
		$row = $this->current_row();
		if (isset($row[$type])) {
			return explode(' ', $row[$type]);
		}
		if (isset($row[''])) {
			return explode(' ', $row['']);
		}
		return array('e', $this->stack->q);
	}

	private function get_steps() {
		$out = array();
		foreach($this->current_row() as $type => $row) {
			foreach($this->rule as $rule) {
				if ($rule['symbol'] == $type) {
					continue 2;
				}
			}

			list($opcode) = explode(' ', $row, 2);
			if ($opcode != 'e') {
				$out[] = $type === '' ? '$default' : $type;
			}
		}

		return $out;
	}

	private function has_step_for($type) {
		$row = $this->current_row();
		return isset($row[$type]);
	}

	private function state() {
		return $this->stack->q;
	}

	function eat($type, $semantic) {
		// assert('$type == trim($type)');
		if ($this->debug) echo "Trying to eat a ($type)\n";
		list($opcode, $operand) = $this->step_for($type);

		switch ($opcode) {
		case 's':
			if ($this->debug) echo "shift $type to state $operand\n";
			$this->stack->shift($operand, $semantic);
			// echo $this->stack->text()." shift $type<br/>\n";
			break;
		case 'r':
			if ($this->debug) echo "Reducing $type via rule $operand\n";
			$this->reduce($operand);
			// Yes, this is tail-recursive. It's also the simplest way.
			return $this->eat($type, $semantic);
		case 'a':
			if ($this->stack->occupied()) {
				throw new parse_bug('Accept should happen with empty stack.');
			}

			$this->accept = true;
			if ($this->debug) echo ("Accept\n\n");
			$this->semantic = $semantic;
			break;
		case 'e':
			// This is thought to be the uncommon, exceptional path, so
			// it's OK that this algorithm will cause the stack to
			// flutter while the parse engine waits for an edible token.
			if ($this->debug) echo "($type) causes a problem.\n";

			// get these before doing anything
			$expected = $this->get_steps();

			$this->parser->errors[] = $this->descr($type, $semantic) . ' not expected, expected one of ' . implode(', ', $expected);

			if ($this->debug) echo "Possibilities before error fixing: {" . implode(', ', $expected) . "}\n";

			if ($this->enter_error_tolerant_state() || $this->has_step_for('error')) {
				$this->eat('error', end($this->parser->errors));
				if ($this->has_step_for($type)) {
					$this->eat($type, $semantic);
				}
				return false;
			} else {
				// If that didn't work, give up:
				throw new parse_error('Parse Error: ' . $this->descr($type, $semantic) . ' not expected, expected one of ' . implode(', ', $expected));
			}
			break;
		default:
			throw new parse_bug("Bad parse table instruction " . htmlspecialchars($opcode));
		}
		return true;
	}

	private function descr($type, $semantic) {
		if (isset($this->descr[$type])) {
			return $this->descr[$type];
		} elseif ("$semantic" !== "") {
			return $type . ' (' . $semantic . ')';
		} else {
			return $type;
		}
	}

	private function reduce($rule_id) {
		$rule = $this->rule[$rule_id];
		$len = $rule['len'];
		$semantic = $this->perform_action($rule_id, $this->stack->top_n($len));

		//echo $semantic.br();
		if ($rule['replace']) {
			$this->stack->pop_n($len);
		} else {
			$this->stack->index($len);
		}

		$this->eat($rule['symbol'], $semantic);
	}

	private function perform_action($rule_id, $slice) {
		// we have this weird calling convention....
		$result = null;
		$method = $this->parser->method[$rule_id];

		//if ($this->debug) echo "rule $id: $method\n";
		$this->parser->$method($slice, $result);

		return $result;
	}
}