viewvc-4intranet/misc/elemx/python/scanner.c

524 lines
13 KiB
C

#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
#include <string.h>
#include "python.h" /* get the TK_ values */
#include "scanner.h"
#define SCANNER_EMPTY (SCANNER_EOF - 1) /* -2 */
#define SCANNER_TABSIZE 8
#define SCANNER_MAXINDENT 100
#define SCANNER_MAXIDLEN 200
typedef struct
{
get_char_t getfunc;
void *user_ctx;
char saved;
int was_newline; /* was previous character a newline? */
int start; /* start position of last token returned */
int start_col;
int start_line;
int fpos; /* file position */
int lineno; /* file line number */
int line_pos; /* file position of current line's first char */
int nesting_level;
int indent; /* which indent */
int indents[SCANNER_MAXINDENT]; /* the set of indents */
int dedent_count; /* how many DEDENTs to issue */
int skip_newline; /* skip the newline after a blank_line + comment */
int idlen;
char identifier[SCANNER_MAXIDLEN]; /* accumulated identifier */
} scanner_ctx;
static int next_char(scanner_ctx *ctx)
{
int c;
++ctx->fpos;
if (ctx->saved == SCANNER_EMPTY)
{
return (*ctx->getfunc)(ctx->user_ctx);
}
c = ctx->saved;
ctx->saved = SCANNER_EMPTY;
return c;
}
static void backup_char(scanner_ctx *ctx, int c)
{
assert(ctx->saved == SCANNER_EMPTY);
ctx->saved = c;
ctx->was_newline = 0; /* we may have put it back */
--ctx->fpos;
}
/* called to note that we've moved on to another line */
static void on_next_line(scanner_ctx *ctx)
{
ctx->line_pos = ctx->fpos;
++ctx->lineno;
}
void *scanner_begin(get_char_t getfunc, void *user_ctx)
{
scanner_ctx *ctx = malloc(sizeof(*ctx));
memset(ctx, 0, sizeof(*ctx));
ctx->getfunc = getfunc;
ctx->user_ctx = user_ctx;
ctx->saved = SCANNER_EMPTY;
ctx->lineno = 1;
return ctx;
}
int scanner_get_token(void *opaque_ctx)
{
scanner_ctx *ctx = opaque_ctx;
int c;
int c2;
int blank_line;
if (ctx->dedent_count)
{
--ctx->dedent_count;
return TK_DEDENT;
}
nextline:
blank_line = 0;
/* if we're at the start of the line, then get the indentation level */
if (ctx->fpos == ctx->line_pos)
{
int col = 0;
while (1)
{
c = next_char(ctx);
if (c == ' ')
++col;
else if (c == '\t')
col = (col / SCANNER_TABSIZE + 1) * SCANNER_TABSIZE;
else if (c == '\f') /* ^L / formfeed */
col = 0;
else
break;
}
backup_char(ctx, c);
if (c == '#' || c == '\n')
{
/* this is a "blank" line and doesn't count towards indentation,
and it doesn't produce NEWLINE tokens */
blank_line = 1;
}
/* if it isn't blank, and we aren't inside nesting expressions, then
we need to handle INDENT/DEDENT */
if (!blank_line && ctx->nesting_level == 0)
{
int last_indent = ctx->indents[ctx->indent];
if (col == last_indent)
{
/* no change */
}
else if (col > last_indent)
{
if (ctx->indent == SCANNER_MAXINDENT - 1)
{
/* oops. too deep. */
return E_TOO_MANY_INDENTS;
}
ctx->indents[++ctx->indent] = col;
return TK_INDENT;
}
else /* col < last_indent */
{
/* find the previous indentation that matches this one */
while (ctx->indent > 0
&& col < ctx->indents[ctx->indent])
{
++ctx->dedent_count;
--ctx->indent;
}
if (col != ctx->indents[ctx->indent])
{
/* oops. dedent doesn't match any indent. */
return E_DEDENT_MISMATCH;
}
/* deliver one dedent now */
--ctx->dedent_count;
return TK_DEDENT;
}
} /* !blank_line ... */
} /* start of line */
/* start here if we see a line continuation */
read_more:
do {
c = next_char(ctx);
} while (c == ' ' || c == '\t' || c == '\f');
/* here is where the token starts */
ctx->start = ctx->fpos;
ctx->start_line = ctx->lineno;
ctx->start_col = ctx->fpos - ctx->line_pos;
/* comment? */
if (c == '#')
{
do {
c = next_char(ctx);
} while (c != SCANNER_EOF && c != '\n');
/* if we are suppressing newlines because this is a blank line, then
leave a marker to skip the newline, next time through. */
if (blank_line && c == '\n')
ctx->skip_newline = 1;
/* put back whatever we sucked up */
backup_char(ctx, c);
return TK_COMMENT;
}
/* Look for an identifier */
if (isalpha(c) || c == '_')
{
ctx->idlen = 0;
/* is this actually a string? */
if (c == 'r' || c == 'R')
{
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
if (c == '"' || c == '\'')
goto parse_string;
}
else if (c == 'u' || c == 'U')
{
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
if (c == 'r' || c == 'R')
{
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
}
if (c == '"' || c == '\'')
goto parse_string;
}
while (isalnum(c) || c == '_') {
/* store the character if there is room for it, and room left
for a null-terminator. */
if (ctx->idlen < SCANNER_MAXIDLEN-1)
ctx->identifier[ctx->idlen++] = c;
c = next_char(ctx);
}
backup_char(ctx, c);
/* ### check for a keyword */
return TK_IDENTIFIER;
}
if (c == '\n')
{
on_next_line(ctx);
/* don't report NEWLINE tokens for blank lines or nested exprs */
if (blank_line || ctx->nesting_level > 0 || ctx->skip_newline)
{
ctx->skip_newline = 0;
goto nextline;
}
return TK_NEWLINE;
}
if (c == '.')
{
c = next_char(ctx);
if (isdigit(c))
goto parse_fraction;
backup_char(ctx, c);
return '.';
}
if (isdigit(c))
{
if (c == '0')
{
c = next_char(ctx);
if (c == 'x' || c == 'X')
{
do {
c = next_char(ctx);
} while (isxdigit(c));
goto skip_fp;
}
else if (isdigit(c))
{
do {
c = next_char(ctx);
} while (isdigit(c));
}
if (c == '.')
goto parse_fraction;
if (c == 'e' || c == 'E')
goto parse_exponent;
if (c == 'j' || c == 'J')
goto parse_imaginary;
skip_fp:
/* this point: parsed an octal, decimal, or hexadecimal */
if (c == 'l' || c == 'L')
{
/* we consumed just enough. stop and return a NUMBER */
return TK_NUMBER;
}
/* consumed too much. backup and return a NUMBER */
backup_char(ctx, c);
return TK_NUMBER;
}
/* decimal number */
do {
c = next_char(ctx);
} while (isdigit(c));
if (c == 'l' || c == 'L')
{
/* we consumed just enogh. stop and return a NUMBER */
return TK_NUMBER;
}
if (c == '.')
{
parse_fraction:
do {
c = next_char(ctx);
} while (isdigit(c));
}
if (c == 'e' || c == 'E')
{
parse_exponent:
c = next_char(ctx);
if (c == '+' || c == '-')
c = next_char(ctx);
if (!isdigit(c))
{
backup_char(ctx, c);
return E_BAD_NUMBER;
}
do {
c = next_char(ctx);
} while (isdigit(c));
}
if (c == 'j' || c == 'J')
{
parse_imaginary:
c = next_char(ctx);
}
/* one too far. backup and return a NUMBER */
backup_char(ctx, c);
return TK_NUMBER;
} /* isdigit */
parse_string:
if (c == '\'' || c == '"')
{
int second_quote_pos = ctx->fpos + 1;
int which_quote = c;
int is_triple = 0;
int quote_count = 0;
while (1)
{
c = next_char(ctx);
if (c == '\n')
{
on_next_line(ctx);
if (!is_triple)
return E_UNTERM_STRING;
quote_count = 0;
}
else if (c == SCANNER_EOF)
{
return E_UNTERM_STRING;
}
else if (c == which_quote)
{
++quote_count;
if (ctx->fpos == second_quote_pos)
{
c = next_char(ctx);
if (c == which_quote)
{
is_triple = 1;
quote_count = 0;
continue;
}
/* we just read one past the empty string. back up. */
backup_char(ctx, c);
}
/* this quote may have terminated the string */
if (!is_triple || quote_count == 3)
return TK_STRING;
}
else if (c == '\\')
{
c = next_char(ctx);
if (c == SCANNER_EOF)
return E_UNTERM_STRING;
if (c == '\n')
on_next_line(ctx);
quote_count = 0;
}
else
{
quote_count = 0;
}
}
/* NOTREACHED */
}
/* line continuation */
if (c == '\\')
{
c = next_char(ctx);
if (c != '\n')
return E_BAD_CONTINUATION;
on_next_line(ctx);
goto read_more;
}
/* look for operators */
/* the nesting operators */
if (c == '(' || c == '[' || c == '{')
{
++ctx->nesting_level;
return c;
}
if (c == ')' || c == ']' || c == '}')
{
--ctx->nesting_level;
return c;
}
/* look for up-to-3-char ops */
if (c == '<' || c == '>' || c == '*' || c == '/')
{
c2 = next_char(ctx);
if (c == c2)
{
c2 = next_char(ctx);
if (c2 != '=')
{
/* oops. one too far. */
backup_char(ctx, c2);
}
return TK_OPERATOR;
}
if (c == '<' && c2 == '>')
return TK_OPERATOR;
if (c2 != '=')
{
/* one char too far. */
backup_char(ctx, c2);
}
return TK_OPERATOR;
}
/* look for 2-char ops */
if (c == '=' || c == '!' || c == '+' || c == '-'
|| c == '|' || c == '%' || c == '&' || c == '^')
{
c2 = next_char(ctx);
if (c2 == '=')
return TK_OPERATOR;
/* oops. too far. */
backup_char(ctx, c2);
return TK_OPERATOR;
}
/* ### should all of these return 'c' ? */
if (c == ':' || c == ',' || c == ';' || c == '`')
return c;
/* as a unary operator, this must be a TK_OPERATOR */
if (c == '~')
return TK_OPERATOR;
/* if we have an EOF, then just return it */
if (c == SCANNER_EOF)
return SCANNER_EOF;
/* unknown input */
return E_UNKNOWN_TOKEN;
}
void scanner_identifier(void *opaque_ctx, const char **ident, int *len)
{
scanner_ctx *ctx = opaque_ctx;
ctx->identifier[ctx->idlen] = '\0';
*ident = ctx->identifier;
*len = ctx->idlen;
}
void scanner_token_range(void *opaque_ctx, int *start, int *end)
{
scanner_ctx *ctx = opaque_ctx;
*start = ctx->start;
*end = ctx->fpos;
}
void scanner_token_linecol(void *opaque_ctx,
int *sline, int *scol, int *eline, int *ecol)
{
scanner_ctx *ctx = opaque_ctx;
*sline = ctx->start_line;
*scol = ctx->start_col;
*eline = ctx->lineno;
*ecol = ctx->fpos - ctx->line_pos;
}
void scanner_end(void *ctx)
{
free(ctx);
}