diff --git a/flex_token_stream.php b/flex_token_stream.php index c21e411..f65681d 100644 --- a/flex_token_stream.php +++ b/flex_token_stream.php @@ -1,34 +1,41 @@ executable(); $tokens = explode("\0", `$scanner < "\$PHP_LIME_SCAN_STDIN"`); + array_pop($tokens); $this->tokens = $tokens; $this->lineno = 1; } - function next() { + + public function next() { if (list($key, $token) = each($this->tokens)) { list($this->lineno, $type, $text) = explode("\1", $token); + return array($type, $text); } } - function feed($parser) { + + public function feed($parser) { while (list($type, $text) = $this->next()) { $parser->eat($type, $text); } + return $parser->eat_eof(); } } diff --git a/lime.php b/lime.php index b7d4e64..7f9eb6f 100755 --- a/lime.php +++ b/lime.php @@ -17,6 +17,7 @@ */ define('LIME_DIR', __DIR__); +define('INDENT', ' '); function emit($str) { fputs(STDERR, $str . PHP_EOL); @@ -66,11 +67,11 @@ function lime_export($var) { $out[] = (!$i ? lime_export($k).' => ' : '') . lime_export($v); } - $result = 'array(' . PHP_EOL . preg_replace('~^~m', "\t", implode(',' . PHP_EOL, $out)) . PHP_EOL . ')'; + $result = 'array(' . PHP_EOL . preg_replace('~^~m', INDENT, implode(',' . PHP_EOL, $out)) . PHP_EOL . ')'; } elseif (is_int($var) || is_float($var)) { $result = (string)$var; } elseif (is_string($var)) { - $opt1 = "'" . str_replace(array('\\', "'"), array('\\\\', "\'"), $var) . "'"; + $opt1 = '\'' . str_replace(array('\\', '\''), array('\\\\', '\\\''), $var) . 
'\''; $opt2 = $opt1; if (strpos($var, '$') === false) { @@ -254,12 +255,16 @@ class RRC extends Exception { } class state { + public $id; + public $key; + public $close; + public $action = array(); + public function __construct($id, $key, $close) { $this->id = $id; $this->key = $key; $this->close = $close; // config key -> object ksort($this->close); - $this->action = array(); } public function dump() { @@ -1049,7 +1054,7 @@ class lime_language_php extends lime_language { $php = $this->to_php($a['code']); $code .= 'function ' . $mn . '(' . LIME_CALL_PROTOCOL . ') {' . PHP_EOL . - preg_replace('~^~m', "\t", $comment . $php) . PHP_EOL . + rtrim(preg_replace('~^~m', INDENT, $comment . $php)) . PHP_EOL . '}' . PHP_EOL . PHP_EOL; @@ -1063,7 +1068,7 @@ class lime_language_php extends lime_language { $code .= 'public $a = '.lime_export($rules, true) . ';' . PHP_EOL; return 'class ' . $parser_class . ' extends lime_parser {' . PHP_EOL . - preg_replace(array('~^~m', '~^\h+$~m'), array("\t", ''), $code) . + preg_replace(array('~^~m', '~^\h+$~m'), array(INDENT, ''), $code) . '}' . PHP_EOL; } } @@ -1153,12 +1158,15 @@ class lime_rewrite { } } +/** + * This keeps track of one position in an rhs. + * We specialize to handle actions and glyphs. + * + * If there is a name for the slot, we store it here. + * Later on, this structure will be consulted in the formation of + * actual production rules. + */ class lime_slot { - // This keeps track of one position in an rhs. - // We specialize to handle actions and glyphs. - // If there is a name for the slot, we store it here. - // Later on, this structure will be consulted in the formation of - // actual production rules. public function __construct($data, $name) { $this->data = $data; $this->name = $name; @@ -1175,34 +1183,32 @@ class lime_glyph extends lime_slot { } class lime_action extends lime_slot { } + + +/** + * This function isn't too terribly interesting to the casual observer. 
+ * You're probably better off looking at parse_lime_grammar() instead. + * + * Ok, if you insist, I'll explain. + * + * The input to Lime is a CFG parser definition. That definition is + * written in some language. (The Lime language, to be exact.) + * Anyway, I have to parse the Lime language and compile it into a + * very complex data structure from which a parser is eventually + * built. What better way than to use Lime itself to parse its own + * language? Well, it's almost that simple, but not quite. + + * The Lime language is fairly potent, but a restricted subset of + * its features was used to write a metagrammar. Then, I hand-translated + * that metagrammar into another form which is easy to snarf up. + * In the process of reading that simplified form, this function + * builds the same sort of data structure that later gets turned into + * a parser. The last step is to run the parser generation algorithm, + * eval() the resulting PHP code, and voila! With no hard work, I can + * suddenly read and comprehend the full range of the Lime language + * without ever having written an algorithm to do so. It feels like magic. + */ function lime_bootstrap() { - - /* - - This function isn't too terribly interesting to the casual observer. - You're probably better off looking at parse_lime_grammar() instead. - - Ok, if you insist, I'll explain. - - The input to Lime is a CFG parser definition. That definition is - written in some language. (The Lime language, to be exact.) - Anyway, I have to parse the Lime language and compile it into a - very complex data structure from which a parser is eventually - built. What better way than to use Lime itself to parse its own - language? Well, it's almost that simple, but not quite. - - The Lime language is fairly potent, but a restricted subset of - its features was used to write a metagrammar. Then, I hand-translated - that metagrammar into another form which is easy to snarf up. 
- In the process of reading that simplified form, this function - builds the same sort of data structure that later gets turned into - a parser. The last step is to run the parser generation algorithm, - eval() the resulting PHP code, and voila! With no hard work, I can - suddenly read and comprehend the full range of the Lime language - without ever having written an algorithm to do so. It feels like magic. - - */ - $bootstrap = LIME_DIR . '/lime.bootstrap'; $lime = new lime(); $lime->parser_class = 'lime_metaparser'; @@ -1245,31 +1251,29 @@ function lime_bootstrap() { eval($parser_code); } +/** + * The voodoo is in the way I do lexical processing on grammar definition + * files. They contain embedded bits of PHP, and it's important to keep + * track of things like strings, comments, and matched braces. It seemed + * like an ideal problem to solve with GNU flex, so I wrote a little + * scanner in flex and C to dig out the tokens for me. Of course, I need + * the tokens in PHP, so I designed a simple binary wrapper for them which + * also contains line-number information, guaranteed to help out if you + * write a grammar which surprises the parser in any manner. + */ class voodoo_scanner extends flex_scanner { - /* - - The voodoo is in the way I do lexical processing on grammar definition - files. They contain embedded bits of PHP, and it's important to keep - track of things like strings, comments, and matched braces. It seemed - like an ideal problem to solve with GNU flex, so I wrote a little - scanner in flex and C to dig out the tokens for me. Of course, I need - the tokens in PHP, so I designed a simple binary wrapper for them which - also contains line-number information, guaranteed to help out if you - write a grammar which surprises the parser in any manner. - - */ function executable() { return LIME_DIR.'/lime_scan_tokens'; } } +/** + * This is a good function to read because it teaches you how to interface + * with a Lime parser. 
I've tried to isolate out the bits that aren't + * instructive in that regard. + */ function parse_lime_grammar($path) { - /* - - This is a good function to read because it teaches you how to interface - with a Lime parser. I've tried to isolate out the bits that aren't - instructive in that regard. - - */ - if (!class_exists('lime_metaparser')) lime_bootstrap(); + if (!class_exists('lime_metaparser', false)) { + lime_bootstrap(); + } $parse_engine = new parse_engine(new lime_metaparser()); $scanner = new voodoo_scanner($path); @@ -1284,10 +1288,9 @@ function parse_lime_grammar($path) { } } - if ($_SERVER['argv']) { $code = ''; - array_shift($_SERVER['argv']); # Strip out the program name. + array_shift($_SERVER['argv']); // Strip out the program name. foreach ($_SERVER['argv'] as $path) { $code .= parse_lime_grammar($path); } diff --git a/lime_scan_tokens b/lime_scan_tokens index 37cade9..70c7e0c 100755 Binary files a/lime_scan_tokens and b/lime_scan_tokens differ diff --git a/lime_scan_tokens.l b/lime_scan_tokens.l index 3884279..d8d9a9d 100644 --- a/lime_scan_tokens.l +++ b/lime_scan_tokens.l @@ -28,76 +28,76 @@ void php(); %x dquote %x squote -CHAR \n|. +CHAR \n|. -ALPHA [a-zA-Z] -DIGIT [0-9] -ALNUM {ALPHA}|{DIGIT} -WORD {ALNUM}|_ -STOP "." +ALPHA [a-zA-Z] +DIGIT [0-9] +ALNUM {ALPHA}|{DIGIT} +WORD {ALNUM}|_ +STOP "." -SYM {ALPHA}{WORD}*'* -LIT '.' +SYM {ALPHA}{WORD}*'* +LIT '.' -ESC "\"{CHAR} -SCHAR [^\']|ESC -DCHAR [^\"]|ESC -COM "//"|"#" +ESC "\"{CHAR} +SCHAR [^\']|ESC +DCHAR [^\"]|ESC +COM "//"|"#" -CC [^*\n] -CX "*"+{CC}+ -CT "*"+"/" -BLOCKCMT "/*"({CC}|{CX})*{CT} +CC [^*\n] +CX "*"+{CC}+ +CT "*"+"/" +BLOCKCMT "/*"({CC}|{CX})*{CT} %x pragma %% -[[:space:]]+ {} -#.* {} +[[:space:]]+ {} +#.* {} {STOP} out("stop", "."); {SYM} tok("sym"); {LIT} tok("lit"); {BLOCKCMT} {} -"/"{WORD}+ | +"/"{WORD}+ | "/$" out("lambda", yytext+1); "%"{WORD}+ { out("pragma", yytext+1); yy_push_state(pragma); } -<*>"{" { +<*>"{" { lit(); yy_push_state(code); } -. lit(); +. 
lit(); { -\n { - out("stop", "."); - yy_pop_state(); -} -[[:space:]] {} -{SYM} tok("sym"); -{LIT} tok("lit"); -. lit(); + \n { + out("stop", "."); + yy_pop_state(); + } + [[:space:]] {} + {SYM} tok("sym"); + {LIT} tok("lit"); + . lit(); } { -"}" { - lit(); - yy_pop_state(); -} -'{SCHAR}*' php(); -\"{DCHAR}*\" php(); -{COM}.* php(); -{BLOCKCMT} php(); -[^{}'"#/]+ php(); -. php(); + "}" { + lit(); + yy_pop_state(); + } + '{SCHAR}*' php(); + \"{DCHAR}*\" php(); + {COM}.* php(); + {BLOCKCMT} php(); + [^{}'"#/]+ php(); + . php(); } %% diff --git a/parse_engine.php b/parse_engine.php index fd54cc4..587c56e 100644 --- a/parse_engine.php +++ b/parse_engine.php @@ -1,5 +1,5 @@ type = $type; $this->state = $state; } } + class parse_premature_eof extends parse_error { - function __construct() { - parent::__construct("Premature EOF"); + public function __construct() { + parent::__construct('Premature EOF'); } } - class parse_stack { - function __construct($qi) { + public $q; + public $qs = array(); + /** + * Stack of semantic actions + */ + public $ss = array(); + + public function __construct($qi) { $this->q = $qi; - $this->qs = array(); - $this->ss = array(); } - function shift($q, $semantic) { + + public function shift($q, $semantic) { $this->ss[] = $semantic; $this->qs[] = $this->q; + $this->q = $q; - # echo "Shift $q -- $semantic
\n"; + + // echo "Shift $q -- $semantic\n"; } - function top_n($n) { - if (!$n) return array(); - return array_slice($this->ss, 0-$n); + + public function top_n($n) { + if (!$n) { + return array(); + } + + return array_slice($this->ss, 0 - $n); } - function pop_n($n) { - if (!$n) return array(); - $qq = array_splice($this->qs, 0-$n); + + public function pop_n($n) { + if (!$n) { + return array(); + } + + $qq = array_splice($this->qs, 0 - $n); $this->q = $qq[0]; - return array_splice($this->ss, 0-$n); + + return array_splice($this->ss, 0 - $n); } - function occupied() { return !empty($this->ss); } - function index($n) { - if ($n) $this->q = $this->qs[count($this->qs)-$n]; + + public function occupied() { + return !empty($this->ss); } - function text() { - return $this->q." : ".implode(' . ', array_reverse($this->qs)); + + public function index($n) { + if ($n) { + $this->q = $this->qs[count($this->qs) - $n]; + } + } + + public function text() { + return $this->q . ' : ' . implode(' . ', array_reverse($this->qs)); } } + class parse_engine { - function __construct($parser) { + public $parser; + public $qi; + public $rule; + public $step; + /** + * @var boolean + */ + public $accept; + /** + * @var parse_stack + */ + public $stack; + + public function __construct($parser) { $this->parser = $parser; $this->qi = $parser->qi; $this->rule = $parser->a; $this->step = $parser->i; - #$this->prepare_callables(); + $this->reset(); - #$this->debug = false; } - function reset() { + + public function reset() { $this->accept = false; $this->stack = new parse_stack($this->qi); } + private function enter_error_tolerant_state() { while ($this->stack->occupied()) { - if ($this->has_step_for('error')) return true; + if ($this->has_step_for('error')) { + return true; + } + $this->drop(); - }; + } + return false; } - private function drop() { $this->stack->pop_n(1); } - function eat_eof() { - {/* - - So that I don't get any brilliant misguided ideas: - - The "accept" step happens when we 
try to eat a start symbol. - That happens because the reductions up the stack at the end - finally (and symetrically) tell the parser to eat a symbol - representing what they've just shifted off the end of the stack - and reduced. However, that doesn't put the parser into any - special different state. Therefore, it's back at the start - state. - - That being said, the parser is ready to reduce an EOF to the - empty program, if given a grammar that allows them. - - So anyway, if you literally tell the parser to eat an EOF - symbol, then after it's done reducing and accepting the prior - program, it's going to think it has another symbol to deal with. - That is the EOF symbol, which means to reduce the empty program, - accept it, and then continue trying to eat the terminal EOF. - - This infinte loop quickly runs out of memory. - - That's why the real EOF algorithm doesn't try to pretend that - EOF is a terminal. Like the invented start symbol, it's special. - - Instead, we pretend to want to eat EOF, but never actually - try to get it into the parse stack. (It won't fit.) In short, - we look up what reduction is indicated at each step in the - process of rolling up the parse stack. - - The repetition is because one reduction is not guaranteed to - cascade into another and clean up the entire parse stack. - Rather, it will instead shift each partial production as it - is forced to completion by the EOF lookahead. - */} - - # We must reduce as if having read the EOF symbol + + private function drop() { + $this->stack->pop_n(1); + } + + /* + * So that I don't get any brilliant misguided ideas: + * + * The "accept" step happens when we try to eat a start symbol. + * That happens because the reductions up the stack at the end + * finally (and symmetrically) tell the parser to eat a symbol + * representing what they've just shifted off the end of the stack + * and reduced. However, that doesn't put the parser into any + * special different state. 
Therefore, it's back at the start + * state. + * + * That being said, the parser is ready to reduce an EOF to the + * empty program, if given a grammar that allows them. + * + * So anyway, if you literally tell the parser to eat an EOF + * symbol, then after it's done reducing and accepting the prior + * program, it's going to think it has another symbol to deal with. + * That is the EOF symbol, which means to reduce the empty program, + * accept it, and then continue trying to eat the terminal EOF. + * + * This infinite loop quickly runs out of memory. + * + * That's why the real EOF algorithm doesn't try to pretend that + * EOF is a terminal. Like the invented start symbol, it's special. + * + * Instead, we pretend to want to eat EOF, but never actually + * try to get it into the parse stack. (It won't fit.) In short, + * we look up what reduction is indicated at each step in the + * process of rolling up the parse stack. + * + * The repetition is because one reduction is not guaranteed to + * cascade into another and clean up the entire parse stack. + * Rather, it will instead shift each partial production as it + * is forced to completion by the EOF lookahead. + */ + public function eat_eof() { + // We must reduce as if having read the EOF symbol do { - # and we have to try at least once, because if nothing - # has ever been shifted, then the stack will be empty - # at the start. + // and we have to try at least once, because if nothing + // has ever been shifted, then the stack will be empty + // at the start. 
list($opcode, $operand) = $this->step_for('#'); + switch ($opcode) { - case 'r': $this->reduce($operand); break; - case 'e': $this->premature_eof(); break; - default: throw new parse_bug(); break; + case 'r': + $this->reduce($operand); + break; + case 'e': + $this->premature_eof(); + break; + default: + throw new parse_bug(); + break; } } while ($this->stack->occupied()); - {/* - If the sentence is well-formed according to the grammar, then - this will eventually result in eating a start symbol, which - causes the "accept" instruction to fire. Otherwise, the - step('#') method will indicate an error in the syntax, which - here means a premature EOF. - - Incedentally, some tremendous amount of voodoo with the parse - stack might help find the beginning of some unfinished - production that the sentence was cut off during, but as a - general rule that would require deeper knowledge. - */} - if (!$this->accept) throw new parse_bug(); + + // If the sentence is well-formed according to the grammar, then + // this will eventually result in eating a start symbol, which + // causes the "accept" instruction to fire. Otherwise, the + // step('#') method will indicate an error in the syntax, which + // here means a premature EOF. + // + // Incidentally, some tremendous amount of voodoo with the parse + // stack might help find the beginning of some unfinished + // production that the sentence was cut off during, but as a + // general rule that would require deeper knowledge. + if (!$this->accept) { + throw new parse_bug(); + } + return $this->semantic; } + private function premature_eof() { $seen = array(); + while ($this->enter_error_tolerant_state()) { if (isset($seen[$this->state()])) { // This means that it's pointless to try here. @@ -164,9 +228,11 @@ class parse_engine { $this->drop(); continue; } + $seen[$this->state()] = true; - - $this->eat('error', NULL); + + $this->eat('error', null); + if ($this->has_step_for('#')) { // Good. We can continue as normal. 
return; @@ -177,76 +243,101 @@ class parse_engine { // The rest of the algorithm will make it happen. } } + throw new parse_premature_eof(); } - private function current_row() { return $this->step[$this->state()]; } + + private function current_row() { + return $this->step[$this->state()]; + } + private function step_for($type) { $row = $this->current_row(); - if (!isset($row[$type])) return array('e', $this->stack->q); + if (!isset($row[$type])) { + return array('e', $this->stack->q); + } + return explode(' ', $row[$type]); } + private function has_step_for($type) { $row = $this->current_row(); return isset($row[$type]); } - private function state() { return $this->stack->q; } + + private function state() { + return $this->stack->q; + } + function eat($type, $semantic) { - # assert('$type == trim($type)'); - # if ($this->debug) echo "Trying to eat a ($type)\n"; + // assert('$type == trim($type)'); + // if ($this->debug) echo "Trying to eat a ($type)\n"; list($opcode, $operand) = $this->step_for($type); + switch ($opcode) { - case 's': - # if ($this->debug) echo "shift $type to state $operand\n"; + case 's': + // if ($this->debug) echo "shift $type to state $operand\n"; $this->stack->shift($operand, $semantic); - # echo $this->stack->text()." shift $type
\n"; + // echo $this->stack->text()." shift $type
\n"; break; - - case 'r': + case 'r': $this->reduce($operand); $this->eat($type, $semantic); - # Yes, this is tail-recursive. It's also the simplest way. + // Yes, this is tail-recursive. It's also the simplest way. break; - - case 'a': - if ($this->stack->occupied()) throw new parse_bug('Accept should happen with empty stack.'); + case 'a': + if ($this->stack->occupied()) { + throw new parse_bug('Accept should happen with empty stack.'); + } + $this->accept = true; - #if ($this->debug) echo ("Accept\n\n"); + //if ($this->debug) echo ("Accept\n\n"); $this->semantic = $semantic; break; - - case 'e': - # This is thought to be the uncommon, exceptional path, so - # it's OK that this algorithm will cause the stack to - # flutter while the parse engine waits for an edible token. - # if ($this->debug) echo "($type) causes a problem.\n"; + case 'e': + // This is thought to be the uncommon, exceptional path, so + // it's OK that this algorithm will cause the stack to + // flutter while the parse engine waits for an edible token. + // if ($this->debug) echo "($type) causes a problem.\n"; + if ($this->enter_error_tolerant_state()) { - $this->eat('error', NULL); - if ($this->has_step_for($type)) $this->eat($type, $semantic); + $this->eat('error', null); + if ($this->has_step_for($type)) { + $this->eat($type, $semantic); + } } else { - # If that didn't work, give up: - throw new parse_error("Parse Error: ($type)($semantic) not expected"); + // If that didn't work, give up: + throw new parse_error("Parse Error: ({$type})({$semantic}) not expected"); } break; - - default: - throw new parse_bug("Bad parse table instruction ".htmlspecialchars($opcode)); + default: + throw new parse_bug("Bad parse table instruction " . 
htmlspecialchars($opcode)); } } + private function reduce($rule_id) { $rule = $this->rule[$rule_id]; $len = $rule['len']; $semantic = $this->perform_action($rule_id, $this->stack->top_n($len)); - #echo $semantic.br(); - if ($rule['replace']) $this->stack->pop_n($len); - else $this->stack->index($len); + + //echo $semantic.br(); + if ($rule['replace']) { + $this->stack->pop_n($len); + } else { + $this->stack->index($len); + } + $this->eat($rule['symbol'], $semantic); } + private function perform_action($rule_id, $slice) { - # we have this weird calling convention.... + // we have this weird calling convention.... $result = null; $method = $this->parser->method[$rule_id]; - #if ($this->debug) echo "rule $id: $method\n"; + + //if ($this->debug) echo "rule $id: $method\n"; $this->parser->$method($slice, $result); + return $result; } } diff --git a/set.so.php b/set.so.php index ef87c6c..26ab138 100644 --- a/set.so.php +++ b/set.so.php @@ -7,23 +7,52 @@ Purpose: We should really have a "set" data type. It's too useful. 
*/ class set { - function __construct($list=array()) { $this->data = array_count_values($list); } - function has($item) { return isset($this->data[$item]); } - function add($item) { $this->data[$item] = true; } - function del($item) { unset($this->data[$item]); return $item;} - function all() { return array_keys($this->data); } - function one() { return key($this->data); } - function count() { return count($this->data); } - function pop() { return $this->del($this->one()); } - function union($that) { + public function __construct(array $list = array()) { + $this->data = array_count_values($list); + } + + public function has($item) { + return isset($this->data[$item]); + } + + public function add($item) { + $this->data[$item] = true; + } + + public function del($item) { + unset($this->data[$item]); + return $item; + } + + public function all() { + return array_keys($this->data); + } + + public function one() { + return key($this->data); + } + + public function count() { + return count($this->data); + } + + public function pop() { + return $this->del($this->one()); + } + + public function union($that) { $progress = false; - foreach ($that->all() as $item) if (!$this->has($item)) { - $this->add($item); - $progress = true; + foreach ($that->all() as $item) { + if (!$this->has($item)) { + $this->add($item); + $progress = true; + } } + return $progress; } - function text() { - return ' { '.implode(' ', $this->all()).' } '; + + public function text() { + return ' { ' . implode(' ', $this->all()) . ' } '; } }