viewvc-4intranet/misc/elemx/python/scanner.c


I've been screwing around for *days* trying to tweak this darned Java grammar. Forget it for now, and checkpoint what I've done.

The elx-python and elx-java programs produce "element" text files which can then be consumed by elx_html.py to produce syntax-colored HTML output. elx_page.sh is a little wrapper around that to produce a full page. The Python stuff seems quite fine, and is blazing fast (the scanner runs over 100k lines/second). The Java grammar is horked, so it doesn't work (the Java scanner seems fine; it's just the grammar). Lots more to do on this, but I'm out for the weekend, so it's time to checkpoint this. Maybe someone will have better ideas on how to fix the Java parser.

Note that this stuff requires the 'msta' and 'shilka' programs from the 'cocom' toolkit. I still need to package this up nicely (some copyright/license notices, better makefiles, some minimal doc, etc.). I also want to mess around with profiling the syntax coloring. It appears to be a bit slower than enscript right now, but it shouldn't be, since we've already parsed the file by that point (gonna try binary element files, and some perf improvements to elx_html.py).

General theory: write elx-* for any language needing cross-referencing capabilities (things that only need hiliting can continue to use enscript). The elx-* programs can be implemented in any fashion, as long as they produce the element description file: Python, Perl, pure C, an automated scanner/parser, whatever. Future steps include using the element description file to get function names, to index them, and to feed that into the HTML generation. It would also be quite possible to feed the element descriptions right into a database and query from there.

git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@498 8cb11bc2-c004-0410-86c3-e597b4017df7
2002-03-15 04:54:28 +03:00
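
As a rough illustration of how an elx front end could drive the scanner defined below, here is a minimal sketch. It assumes that get_char_t (declared in scanner.h, which isn't shown here) takes the user context and returns the next character or SCANNER_EOF, matching the way next_char() invokes it, and that the TK_ codes come from python.h. The helper name get_from_file and the one-token-per-line output format are made up for illustration; this is not the actual element file format.

/* --- illustrative sketch, not part of scanner.c --- */
#include <stdio.h>
#include "python.h"   /* TK_ token codes */
#include "scanner.h"  /* scanner_* API, get_char_t, SCANNER_EOF */

/* hypothetical callback: hand the scanner characters from a stdio stream */
static int get_from_file(void *user_ctx)
{
    int c = getc((FILE *) user_ctx);
    return (c == EOF) ? SCANNER_EOF : c;
}

int main(int argc, char **argv)
{
    FILE *fp;
    void *scanner;
    int token;

    if (argc != 2 || (fp = fopen(argv[1], "r")) == NULL)
        return 1;

    scanner = scanner_begin(get_from_file, fp);
    while ((token = scanner_get_token(scanner)) != SCANNER_EOF)
    {
        int start, end;

        scanner_token_range(scanner, &start, &end);
        if (token == TK_IDENTIFIER)
        {
            const char *ident;
            int len;

            scanner_identifier(scanner, &ident, &len);
            printf("%d %d identifier %.*s\n", start, end, len, ident);
        }
        else
            printf("%d %d token %d\n", start, end, token);
    }
    scanner_end(scanner);
    fclose(fp);
    return 0;
}
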
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
#include <string.h>
#include "python.h" /* get the TK_ values */
#include "scanner.h"
#define SCANNER_EMPTY (SCANNER_EOF - 1) /* -2 */
#define SCANNER_TABSIZE 8
#define SCANNER_MAXINDENT 100
#define SCANNER_MAXIDLEN 200
typedef struct
{
get_char_t getfunc;
void *user_ctx;
int saved; /* pushed-back char, or SCANNER_EMPTY; int so the negative markers survive */
int was_newline; /* was previous character a newline? */
int start; /* start position of last token returned */
int start_col;
int start_line;
int fpos; /* file position */
int lineno; /* file line number */
int line_pos; /* file position of current line's first char */
int nesting_level;
int indent; /* which indent */
int indents[SCANNER_MAXINDENT]; /* the set of indents */
int dedent_count; /* how many DEDENTs to issue */
int skip_newline; /* skip the newline after a blank_line + comment */
int idlen;
char identifier[SCANNER_MAXIDLEN]; /* accumulated identifier */
} scanner_ctx;
static int next_char(scanner_ctx *ctx)
{
int c;
++ctx->fpos;
if (ctx->saved == SCANNER_EMPTY)
{
return (*ctx->getfunc)(ctx->user_ctx);
}
c = ctx->saved;
ctx->saved = SCANNER_EMPTY;
return c;
}
static void backup_char(scanner_ctx *ctx, int c)
{
assert(ctx->saved == SCANNER_EMPTY);
ctx->saved = c;
ctx->was_newline = 0; /* we may have put it back */
--ctx->fpos;
}
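/* note: next_char()/backup_char() provide exactly one character of pushback.
backup_char() may only be called when nothing is already saved (hence the
assert), and it rewinds fpos so the character is counted again when
next_char() re-delivers it. */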
/* called to note that we've moved on to another line */
static void on_next_line(scanner_ctx *ctx)
{
ctx->line_pos = ctx->fpos;
++ctx->lineno;
}
void *scanner_begin(get_char_t getfunc, void *user_ctx)
{
scanner_ctx *ctx = malloc(sizeof(*ctx));
if (ctx == NULL)
return NULL;
memset(ctx, 0, sizeof(*ctx));
ctx->getfunc = getfunc;
ctx->user_ctx = user_ctx;
ctx->saved = SCANNER_EMPTY;
ctx->lineno = 1;
return ctx;
}
int scanner_get_token(void *opaque_ctx)
{
scanner_ctx *ctx = opaque_ctx;
int c;
int c2;
int blank_line;
if (ctx->dedent_count)
{
--ctx->dedent_count;
return TK_DEDENT;
}
nextline:
blank_line = 0;
/* if we're at the start of the line, then get the indentation level */
if (ctx->fpos == ctx->line_pos)
{
int col = 0;
while (1)
{
c = next_char(ctx);
if (c == ' ')
++col;
else if (c == '\t')
col = (col / SCANNER_TABSIZE + 1) * SCANNER_TABSIZE;
else if (c == '\f') /* ^L / formfeed */
col = 0;
else
break;
}
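/* for example, with SCANNER_TABSIZE 8 the formula above advances a tab to
the next multiple of 8: col 0 -> 8, col 3 -> 8, col 8 -> 16. A formfeed
resets col to 0. */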
backup_char(ctx, c);
if (c == '#' || c == '\n')
{
/* this is a "blank" line and doesn't count towards indentation,
and it doesn't produce NEWLINE tokens */
blank_line = 1;
}
/* if it isn't blank, and we aren't inside nesting expressions, then
we need to handle INDENT/DEDENT */
if (!blank_line && ctx->nesting_level == 0)
{
int last_indent = ctx->indents[ctx->indent];
if (col == last_indent)
{
/* no change */
}
else if (col > last_indent)
{
if (ctx->indent == SCANNER_MAXINDENT - 1)
{
/* oops. too deep. */
return E_TOO_MANY_INDENTS;
}
ctx->indents[++ctx->indent] = col;
return TK_INDENT;
}
else /* col < last_indent */
{
/* find the previous indentation that matches this one */
while (ctx->indent > 0
&& col < ctx->indents[ctx->indent])
{
++ctx->dedent_count;
--ctx->indent;
}
if (col != ctx->indents[ctx->indent])
{
/* oops. dedent doesn't match any indent. */
return E_DEDENT_MISMATCH;
}
/* deliver one dedent now */
--ctx->dedent_count;
return TK_DEDENT;
}
} /* !blank_line ... */
} /* start of line */
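/* worked example (illustration): logical lines indented at columns 0, 4, 8,
4, 0 produce TK_INDENT (stack 0,4), TK_INDENT (stack 0,4,8), then one
TK_DEDENT for each pop back to 4 and to 0. A multi-level drop (e.g. 8
straight back to 0) returns one TK_DEDENT immediately and queues the rest in
dedent_count for the following calls; a column like 6 that matches no
stacked indent yields E_DEDENT_MISMATCH. */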
/* start here if we see a line continuation */
read_more:
do {
c = next_char(ctx);
} while (c == ' ' || c == '\t' || c == '\f');
/* here is where the token starts */
ctx->start = ctx->fpos;
ctx->start_line = ctx->lineno;
ctx->start_col = ctx->fpos - ctx->line_pos;
/* comment? */
if (c == '#')
{
do {
c = next_char(ctx);
} while (c != SCANNER_EOF && c != '\n');
/* if we are suppressing newlines because this is a blank line, then
leave a marker to skip the newline next time through. */
if (blank_line && c == '\n')
ctx->skip_newline = 1;
/* put back whatever we sucked up */
backup_char(ctx, c);
return TK_COMMENT;
}
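/* note: a line holding only whitespace and a comment is a "blank" line, so
it yields TK_COMMENT but never TK_NEWLINE; skip_newline tells the next call
to swallow the newline that was just pushed back. */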
/* Look for an identifier */
if (isalpha(c) || c == '_')
{
ctx->idlen = 0;
/* is this actually a string? */
if (c == 'r' || c == 'R')
{
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
if (c == '"' || c == '\'')
goto parse_string;
}
else if (c == 'u' || c == 'U')
{
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
if (c == 'r' || c == 'R')
{
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
}
if (c == '"' || c == '\'')
goto parse_string;
}
while (isalnum(c) || c == '_') {
/* store the character if there is room for it, and room left
for a null-terminator. */
if (ctx->idlen < SCANNER_MAXIDLEN-1)
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
}
backup_char(ctx, c);
/* ### check for a keyword */
return TK_IDENTIFIER;
}
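/* note: keywords are not split out yet (see the ### above); every name is
reported as TK_IDENTIFIER. The r/R and u/U (and uR etc.) prefixes are caught
here so that a prefixed string literal falls through to the string scanner
with the prefix included in the token's range. */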
if (c == '\n')
{
on_next_line(ctx);
/* don't report NEWLINE tokens for blank lines or nested exprs */
if (blank_line || ctx->nesting_level > 0 || ctx->skip_newline)
{
ctx->skip_newline = 0;
goto nextline;
}
return TK_NEWLINE;
}
if (c == '.')
{
c = next_char(ctx);
if (isdigit(c))
goto parse_fraction;
backup_char(ctx, c);
return '.';
}
if (isdigit(c))
{
if (c == '0')
{
c = next_char(ctx);
if (c == 'x' || c == 'X')
{
do {
c = next_char(ctx);
} while (isxdigit(c));
goto skip_fp;
}
else if (isdigit(c))
{
do {
c = next_char(ctx);
} while (isdigit(c));
}
if (c == '.')
goto parse_fraction;
if (c == 'e' || c == 'E')
goto parse_exponent;
if (c == 'j' || c == 'J')
goto parse_imaginary;
skip_fp:
/* at this point we've parsed an octal, decimal, or hexadecimal integer */
if (c == 'l' || c == 'L')
{
/* we consumed just enough. stop and return a NUMBER */
return TK_NUMBER;
}
/* consumed too much. backup and return a NUMBER */
backup_char(ctx, c);
return TK_NUMBER;
}
/* decimal number */
do {
c = next_char(ctx);
} while (isdigit(c));
if (c == 'l' || c == 'L')
{
/* we consumed just enough. stop and return a NUMBER */
return TK_NUMBER;
}
if (c == '.')
{
parse_fraction:
do {
c = next_char(ctx);
} while (isdigit(c));
}
if (c == 'e' || c == 'E')
{
parse_exponent:
c = next_char(ctx);
if (c == '+' || c == '-')
c = next_char(ctx);
if (!isdigit(c))
{
backup_char(ctx, c);
return E_BAD_NUMBER;
}
do {
c = next_char(ctx);
} while (isdigit(c));
}
if (c == 'j' || c == 'J')
{
parse_imaginary:
c = next_char(ctx);
}
/* one too far. backup and return a NUMBER */
backup_char(ctx, c);
return TK_NUMBER;
} /* isdigit */
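/* examples of what the code above returns as TK_NUMBER: 0, 0777, 0x1F,
0xffL, 123, 123L, 1.5, .5 (via the '.' case earlier), 1e-3, 3.25e+4, 4j,
1.5J. A broken exponent such as "1e+" comes back as E_BAD_NUMBER. */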
parse_string:
if (c == '\'' || c == '"')
{
int second_quote_pos = ctx->fpos + 1;
int which_quote = c;
int is_triple = 0;
int quote_count = 0;
while (1)
{
c = next_char(ctx);
if (c == '\n')
{
on_next_line(ctx);
if (!is_triple)
return E_UNTERM_STRING;
quote_count = 0;
}
else if (c == SCANNER_EOF)
{
return E_UNTERM_STRING;
}
else if (c == which_quote)
{
++quote_count;
if (ctx->fpos == second_quote_pos)
{
c = next_char(ctx);
if (c == which_quote)
{
is_triple = 1;
quote_count = 0;
continue;
}
/* we just read one past the empty string. back up. */
backup_char(ctx, c);
}
/* this quote may have terminated the string */
if (!is_triple || quote_count == 3)
return TK_STRING;
}
else if (c == '\\')
{
c = next_char(ctx);
if (c == SCANNER_EOF)
return E_UNTERM_STRING;
if (c == '\n')
on_next_line(ctx);
quote_count = 0;
}
else
{
quote_count = 0;
}
}
/* NOTREACHED */
}
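/* how the quote tracking above works: second_quote_pos spots a quote that
immediately follows the opener, so '' is an empty TK_STRING, and a third
quote in a row switches on is_triple. In triple mode the string only ends
once quote_count reaches 3; newlines are allowed there, while in a
single-quoted string a newline means E_UNTERM_STRING. A backslash escape
resets the quote run. */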
/* line continuation */
if (c == '\\')
{
c = next_char(ctx);
if (c != '\n')
return E_BAD_CONTINUATION;
on_next_line(ctx);
goto read_more;
}
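/* example: a trailing backslash-newline ("x = 1 + \" at the end of a line)
is swallowed and scanning resumes at read_more, so the continued line gets
no TK_NEWLINE and no indentation processing. A backslash followed by
anything else is E_BAD_CONTINUATION. */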
/* look for operators */
/* the nesting operators */
if (c == '(' || c == '[' || c == '{')
{
++ctx->nesting_level;
return c;
}
if (c == ')' || c == ']' || c == '}')
{
--ctx->nesting_level;
return c;
}
/* look for up-to-3-char ops */
if (c == '<' || c == '>' || c == '*' || c == '/')
{
c2 = next_char(ctx);
if (c == c2)
{
c2 = next_char(ctx);
if (c2 != '=')
{
/* oops. one too far. */
backup_char(ctx, c2);
}
return TK_OPERATOR;
}
if (c == '<' && c2 == '>')
return TK_OPERATOR;
if (c2 != '=')
{
/* one char too far. */
backup_char(ctx, c2);
}
return TK_OPERATOR;
}
/* look for 2-char ops */
if (c == '=' || c == '!' || c == '+' || c == '-'
|| c == '|' || c == '%' || c == '&' || c == '^')
{
c2 = next_char(ctx);
if (c2 == '=')
return TK_OPERATOR;
/* oops. too far. */
backup_char(ctx, c2);
return TK_OPERATOR;
}
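/* examples: the up-to-3-char branch above handles <, <=, <<, <<=, >, >=,
>>, >>=, *, *=, **, **=, /, /=, //, //= and the old <> form; this branch
handles ==, !=, +=, -=, |=, %=, &=, ^= as well as the bare one-character
forms. All of them are reported as TK_OPERATOR. */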
/* ### should all of these return 'c' ? */
if (c == ':' || c == ',' || c == ';' || c == '`')
return c;
/* as a unary operator, this must be a TK_OPERATOR */
if (c == '~')
return TK_OPERATOR;
/* if we have an EOF, then just return it */
if (c == SCANNER_EOF)
return SCANNER_EOF;
/* unknown input */
return E_UNKNOWN_TOKEN;
}
void scanner_identifier(void *opaque_ctx, const char **ident, int *len)
{
scanner_ctx *ctx = opaque_ctx;
ctx->identifier[ctx->idlen] = '\0';
*ident = ctx->identifier;
*len = ctx->idlen;
}
void scanner_token_range(void *opaque_ctx, int *start, int *end)
{
scanner_ctx *ctx = opaque_ctx;
*start = ctx->start;
*end = ctx->fpos;
}
void scanner_token_linecol(void *opaque_ctx,
int *sline, int *scol, int *eline, int *ecol)
{
scanner_ctx *ctx = opaque_ctx;
*sline = ctx->start_line;
*scol = ctx->start_col;
*eline = ctx->lineno;
*ecol = ctx->fpos - ctx->line_pos;
}
void scanner_end(void *ctx)
{
free(ctx);
}