524 lines
13 KiB
C
524 lines
13 KiB
C
|
|
#include <stdlib.h>
|
|
#include <ctype.h>
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
|
|
#include "python.h" /* get the TK_ values */
|
|
#include "scanner.h"
|
|
|
|
/* Size limits used by the scanner. */
#define SCANNER_MAXIDLEN    200   /* bytes in the identifier buffer, NUL included */
#define SCANNER_MAXINDENT   100   /* number of slots in the indentation stack */

/* Tab stops are this many columns apart when measuring indentation. */
#define SCANNER_TABSIZE     8

/* Marker meaning "no pushed-back character"; one below SCANNER_EOF
   (the original note gives its value as -2). */
#define SCANNER_EMPTY       (SCANNER_EOF - 1)
|
|
|
|
typedef struct
|
|
{
|
|
get_char_t getfunc;
|
|
void *user_ctx;
|
|
|
|
char saved;
|
|
int was_newline; /* was previous character a newline? */
|
|
|
|
int start; /* start position of last token returned */
|
|
int start_col;
|
|
int start_line;
|
|
|
|
int fpos; /* file position */
|
|
int lineno; /* file line number */
|
|
int line_pos; /* file position of current line's first char */
|
|
|
|
int nesting_level;
|
|
|
|
int indent; /* which indent */
|
|
int indents[SCANNER_MAXINDENT]; /* the set of indents */
|
|
|
|
int dedent_count; /* how many DEDENTs to issue */
|
|
|
|
int skip_newline; /* skip the newline after a blank_line + comment */
|
|
|
|
int idlen;
|
|
char identifier[SCANNER_MAXIDLEN]; /* accumulated identifier */
|
|
|
|
} scanner_ctx;
|
|
|
|
|
|
/*
 * Fetch the next input character and advance the file position.
 *
 * A character previously handed to backup_char() is returned first (and the
 * pushback slot is cleared); otherwise the caller-supplied getfunc is asked
 * for a fresh character.
 */
static int next_char(scanner_ctx *ctx)
{
    int pending = ctx->saved;

    ctx->fpos += 1;

    if (pending != SCANNER_EMPTY)
    {
        /* hand back the pushed-back character exactly once */
        ctx->saved = SCANNER_EMPTY;
        return pending;
    }

    return ctx->getfunc(ctx->user_ctx);
}
|
|
|
|
/*
 * Push one character back so that the next next_char() call returns it.
 *
 * Only a single slot exists: the assert fires if a character is already
 * pending.  The file position is moved back by one to match.
 */
static void backup_char(scanner_ctx *ctx, int c)
{
    assert(ctx->saved == SCANNER_EMPTY);

    ctx->fpos -= 1;
    ctx->was_newline = 0;   /* we may have put it back */
    ctx->saved = c;
}
|
|
|
|
/* called to note that we've moved on to another line */
|
|
static void on_next_line(scanner_ctx *ctx)
|
|
{
|
|
ctx->line_pos = ctx->fpos;
|
|
++ctx->lineno;
|
|
}
|
|
|
|
/*
 * Create a scanner.
 *
 * getfunc   callback invoked with user_ctx each time a new input character
 *           is needed.
 * user_ctx  opaque pointer passed through to getfunc.
 *
 * Returns an opaque context for the other scanner_* functions, or NULL if
 * the allocation fails.  Release it with scanner_end().
 */
void *scanner_begin(get_char_t getfunc, void *user_ctx)
{
    /* calloc zero-fills, so every counter and indents[0] start at 0 */
    scanner_ctx *ctx = calloc(1, sizeof(*ctx));

    if (ctx == NULL)
        return NULL;

    ctx->getfunc = getfunc;
    ctx->user_ctx = user_ctx;
    ctx->saved = SCANNER_EMPTY;     /* nothing pushed back yet */
    ctx->lineno = 1;                /* line numbers are 1-based */

    return ctx;
}
|
|
|
|
int scanner_get_token(void *opaque_ctx)
|
|
{
|
|
scanner_ctx *ctx = opaque_ctx;
|
|
int c;
|
|
int c2;
|
|
int blank_line;
|
|
|
|
if (ctx->dedent_count)
|
|
{
|
|
--ctx->dedent_count;
|
|
return TK_DEDENT;
|
|
}
|
|
|
|
nextline:
|
|
blank_line = 0;
|
|
/* if we're at the start of the line, then get the indentation level */
|
|
if (ctx->fpos == ctx->line_pos)
|
|
{
|
|
int col = 0;
|
|
|
|
while (1)
|
|
{
|
|
c = next_char(ctx);
|
|
if (c == ' ')
|
|
++col;
|
|
else if (c == '\t')
|
|
col = (col / SCANNER_TABSIZE + 1) * SCANNER_TABSIZE;
|
|
else if (c == '\f') /* ^L / formfeed */
|
|
col = 0;
|
|
else
|
|
break;
|
|
}
|
|
backup_char(ctx, c);
|
|
|
|
if (c == '#' || c == '\n')
|
|
{
|
|
/* this is a "blank" line and doesn't count towards indentation,
|
|
and it doesn't produce NEWLINE tokens */
|
|
blank_line = 1;
|
|
}
|
|
|
|
/* if it isn't blank, and we aren't inside nesting expressions, then
|
|
we need to handle INDENT/DEDENT */
|
|
if (!blank_line && ctx->nesting_level == 0)
|
|
{
|
|
int last_indent = ctx->indents[ctx->indent];
|
|
|
|
if (col == last_indent)
|
|
{
|
|
/* no change */
|
|
}
|
|
else if (col > last_indent)
|
|
{
|
|
if (ctx->indent == SCANNER_MAXINDENT - 1)
|
|
{
|
|
/* oops. too deep. */
|
|
return E_TOO_MANY_INDENTS;
|
|
}
|
|
ctx->indents[++ctx->indent] = col;
|
|
return TK_INDENT;
|
|
}
|
|
else /* col < last_indent */
|
|
{
|
|
/* find the previous indentation that matches this one */
|
|
while (ctx->indent > 0
|
|
&& col < ctx->indents[ctx->indent])
|
|
{
|
|
++ctx->dedent_count;
|
|
--ctx->indent;
|
|
}
|
|
if (col != ctx->indents[ctx->indent])
|
|
{
|
|
/* oops. dedent doesn't match any indent. */
|
|
return E_DEDENT_MISMATCH;
|
|
}
|
|
|
|
/* deliver one dedent now */
|
|
--ctx->dedent_count;
|
|
return TK_DEDENT;
|
|
}
|
|
} /* !blank_line ... */
|
|
} /* start of line */
|
|
|
|
/* start here if we see a line continuation */
|
|
read_more:
|
|
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (c == ' ' || c == '\t' || c == '\f');
|
|
|
|
/* here is where the token starts */
|
|
ctx->start = ctx->fpos;
|
|
ctx->start_line = ctx->lineno;
|
|
ctx->start_col = ctx->fpos - ctx->line_pos;
|
|
|
|
/* comment? */
|
|
if (c == '#')
|
|
{
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (c != SCANNER_EOF && c != '\n');
|
|
|
|
/* if we are suppressing newlines because this is a blank line, then
|
|
leave a marker to skip the newline, next time through. */
|
|
if (blank_line && c == '\n')
|
|
ctx->skip_newline = 1;
|
|
|
|
/* put back whatever we sucked up */
|
|
backup_char(ctx, c);
|
|
|
|
return TK_COMMENT;
|
|
}
|
|
|
|
/* Look for an identifier */
|
|
if (isalpha(c) || c == '_')
|
|
{
|
|
ctx->idlen = 0;
|
|
|
|
/* is this actually a string? */
|
|
if (c == 'r' || c == 'R')
|
|
{
|
|
ctx->identifier[ctx->idlen++] = c;
|
|
c = next_char(ctx);
|
|
if (c == '"' || c == '\'')
|
|
goto parse_string;
|
|
}
|
|
else if (c == 'u' || c == 'U')
|
|
{
|
|
ctx->identifier[ctx->idlen++] = c;
|
|
c = next_char(ctx);
|
|
if (c == 'r' || c == 'R')
|
|
{
|
|
ctx->identifier[ctx->idlen++] = c;
|
|
c = next_char(ctx);
|
|
}
|
|
if (c == '"' || c == '\'')
|
|
goto parse_string;
|
|
}
|
|
|
|
while (isalnum(c) || c == '_') {
|
|
/* store the character if there is room for it, and room left
|
|
for a null-terminator. */
|
|
if (ctx->idlen < SCANNER_MAXIDLEN-1)
|
|
ctx->identifier[ctx->idlen++] = c;
|
|
c = next_char(ctx);
|
|
}
|
|
backup_char(ctx, c);
|
|
|
|
/* ### check for a keyword */
|
|
return TK_IDENTIFIER;
|
|
}
|
|
|
|
if (c == '\n')
|
|
{
|
|
on_next_line(ctx);
|
|
|
|
/* don't report NEWLINE tokens for blank lines or nested exprs */
|
|
if (blank_line || ctx->nesting_level > 0 || ctx->skip_newline)
|
|
{
|
|
ctx->skip_newline = 0;
|
|
goto nextline;
|
|
}
|
|
|
|
return TK_NEWLINE;
|
|
}
|
|
|
|
if (c == '.')
|
|
{
|
|
c = next_char(ctx);
|
|
if (isdigit(c))
|
|
goto parse_fraction;
|
|
backup_char(ctx, c);
|
|
return '.';
|
|
}
|
|
|
|
if (isdigit(c))
|
|
{
|
|
if (c == '0')
|
|
{
|
|
c = next_char(ctx);
|
|
if (c == 'x' || c == 'X')
|
|
{
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (isxdigit(c));
|
|
goto skip_fp;
|
|
}
|
|
else if (isdigit(c))
|
|
{
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (isdigit(c));
|
|
}
|
|
if (c == '.')
|
|
goto parse_fraction;
|
|
if (c == 'e' || c == 'E')
|
|
goto parse_exponent;
|
|
if (c == 'j' || c == 'J')
|
|
goto parse_imaginary;
|
|
skip_fp:
|
|
/* this point: parsed an octal, decimal, or hexadecimal */
|
|
|
|
if (c == 'l' || c == 'L')
|
|
{
|
|
/* we consumed just enough. stop and return a NUMBER */
|
|
return TK_NUMBER;
|
|
}
|
|
|
|
/* consumed too much. backup and return a NUMBER */
|
|
backup_char(ctx, c);
|
|
return TK_NUMBER;
|
|
}
|
|
|
|
/* decimal number */
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (isdigit(c));
|
|
|
|
if (c == 'l' || c == 'L')
|
|
{
|
|
/* we consumed just enogh. stop and return a NUMBER */
|
|
return TK_NUMBER;
|
|
}
|
|
|
|
if (c == '.')
|
|
{
|
|
parse_fraction:
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (isdigit(c));
|
|
}
|
|
|
|
if (c == 'e' || c == 'E')
|
|
{
|
|
parse_exponent:
|
|
c = next_char(ctx);
|
|
if (c == '+' || c == '-')
|
|
c = next_char(ctx);
|
|
if (!isdigit(c))
|
|
{
|
|
backup_char(ctx, c);
|
|
return E_BAD_NUMBER;
|
|
}
|
|
do {
|
|
c = next_char(ctx);
|
|
} while (isdigit(c));
|
|
}
|
|
|
|
if (c == 'j' || c == 'J')
|
|
{
|
|
parse_imaginary:
|
|
c = next_char(ctx);
|
|
}
|
|
|
|
/* one too far. backup and return a NUMBER */
|
|
backup_char(ctx, c);
|
|
return TK_NUMBER;
|
|
|
|
} /* isdigit */
|
|
|
|
parse_string:
|
|
if (c == '\'' || c == '"')
|
|
{
|
|
int second_quote_pos = ctx->fpos + 1;
|
|
int which_quote = c;
|
|
int is_triple = 0;
|
|
int quote_count = 0;
|
|
|
|
while (1)
|
|
{
|
|
c = next_char(ctx);
|
|
if (c == '\n')
|
|
{
|
|
on_next_line(ctx);
|
|
|
|
if (!is_triple)
|
|
return E_UNTERM_STRING;
|
|
quote_count = 0;
|
|
}
|
|
else if (c == SCANNER_EOF)
|
|
{
|
|
return E_UNTERM_STRING;
|
|
}
|
|
else if (c == which_quote)
|
|
{
|
|
++quote_count;
|
|
if (ctx->fpos == second_quote_pos)
|
|
{
|
|
c = next_char(ctx);
|
|
if (c == which_quote)
|
|
{
|
|
is_triple = 1;
|
|
quote_count = 0;
|
|
continue;
|
|
}
|
|
/* we just read one past the empty string. back up. */
|
|
backup_char(ctx, c);
|
|
}
|
|
|
|
/* this quote may have terminated the string */
|
|
if (!is_triple || quote_count == 3)
|
|
return TK_STRING;
|
|
}
|
|
else if (c == '\\')
|
|
{
|
|
c = next_char(ctx);
|
|
if (c == SCANNER_EOF)
|
|
return E_UNTERM_STRING;
|
|
if (c == '\n')
|
|
on_next_line(ctx);
|
|
quote_count = 0;
|
|
}
|
|
else
|
|
{
|
|
quote_count = 0;
|
|
}
|
|
}
|
|
|
|
/* NOTREACHED */
|
|
}
|
|
|
|
/* line continuation */
|
|
if (c == '\\')
|
|
{
|
|
c = next_char(ctx);
|
|
if (c != '\n')
|
|
return E_BAD_CONTINUATION;
|
|
|
|
on_next_line(ctx);
|
|
goto read_more;
|
|
}
|
|
|
|
/* look for operators */
|
|
|
|
/* the nesting operators */
|
|
if (c == '(' || c == '[' || c == '{')
|
|
{
|
|
++ctx->nesting_level;
|
|
return c;
|
|
}
|
|
if (c == ')' || c == ']' || c == '}')
|
|
{
|
|
--ctx->nesting_level;
|
|
return c;
|
|
}
|
|
|
|
/* look for up-to-3-char ops */
|
|
if (c == '<' || c == '>' || c == '*' || c == '/')
|
|
{
|
|
c2 = next_char(ctx);
|
|
if (c == c2)
|
|
{
|
|
c2 = next_char(ctx);
|
|
if (c2 != '=')
|
|
{
|
|
/* oops. one too far. */
|
|
backup_char(ctx, c2);
|
|
}
|
|
return TK_OPERATOR;
|
|
}
|
|
|
|
if (c == '<' && c2 == '>')
|
|
return TK_OPERATOR;
|
|
|
|
if (c2 != '=')
|
|
{
|
|
/* one char too far. */
|
|
backup_char(ctx, c2);
|
|
}
|
|
return TK_OPERATOR;
|
|
}
|
|
|
|
/* look for 2-char ops */
|
|
if (c == '=' || c == '!' || c == '+' || c == '-'
|
|
|| c == '|' || c == '%' || c == '&' || c == '^')
|
|
{
|
|
c2 = next_char(ctx);
|
|
if (c2 == '=')
|
|
return TK_OPERATOR;
|
|
|
|
/* oops. too far. */
|
|
backup_char(ctx, c2);
|
|
return TK_OPERATOR;
|
|
}
|
|
|
|
/* ### should all of these return 'c' ? */
|
|
if (c == ':' || c == ',' || c == ';' || c == '`')
|
|
return c;
|
|
|
|
/* as a unary operator, this must be a TK_OPERATOR */
|
|
if (c == '~')
|
|
return TK_OPERATOR;
|
|
|
|
/* if we have an EOF, then just return it */
|
|
if (c == SCANNER_EOF)
|
|
return SCANNER_EOF;
|
|
|
|
/* unknown input */
|
|
return E_UNKNOWN_TOKEN;
|
|
}
|
|
|
|
void scanner_identifier(void *opaque_ctx, const char **ident, int *len)
|
|
{
|
|
scanner_ctx *ctx = opaque_ctx;
|
|
|
|
ctx->identifier[ctx->idlen] = '\0';
|
|
*ident = ctx->identifier;
|
|
*len = ctx->idlen;
|
|
}
|
|
|
|
void scanner_token_range(void *opaque_ctx, int *start, int *end)
|
|
{
|
|
scanner_ctx *ctx = opaque_ctx;
|
|
|
|
*start = ctx->start;
|
|
*end = ctx->fpos;
|
|
}
|
|
|
|
void scanner_token_linecol(void *opaque_ctx,
|
|
int *sline, int *scol, int *eline, int *ecol)
|
|
{
|
|
scanner_ctx *ctx = opaque_ctx;
|
|
|
|
*sline = ctx->start_line;
|
|
*scol = ctx->start_col;
|
|
|
|
*eline = ctx->lineno;
|
|
*ecol = ctx->fpos - ctx->line_pos;
|
|
}
|
|
|
|
/*
 * Dispose of a scanner context by handing it to free().
 */
void scanner_end(void *ctx)
{
    free(ctx);
}
|