%start token
%scanner

%local {
#include "elx.h"

/* from elx-java.c */
void yyserror(const char *msg);
int yyslex(void);

/* for the TK_ symbols, generated from java.y */
#include "java.h"

/* for keyword recognition */
#include "j_keywords.h"

/* hooks called from the grammar actions below; declared extern here,
   not defined in this file */
extern void issue_token(char which);
extern void mark_token_start(void);

/* Buffer for the identifier currently being scanned.  It holds at most
   MAX_IDENT characters plus one extra slot for the terminating NUL that
   lookup() writes. */
#define MAX_IDENT 200
static int idlen;
static char identifier[MAX_IDENT+1];
/* INIT_IDENT starts a new identifier with its first character.
   ADD_IDENT appends one character; when MAX_IDENT characters are already
   stored it instead returns E_IDENT_TOO_LONG from the enclosing function.
   The if/else form lets the macro be used as one statement followed by
   a semicolon. */
#define INIT_IDENT(c) (identifier[0] = (c), idlen = 1)
#define ADD_IDENT(c) if (idlen == MAX_IDENT) return E_IDENT_TOO_LONG; \
                     else identifier[idlen++] = (c)

/* value returned by ADD_IDENT when an identifier is longer than
   MAX_IDENT characters.
   ### is there a better place for this definition? */
#define E_IDENT_TOO_LONG  (-100)

/* maps the collected identifier to a keyword token or TK_IDENTIFIER;
   defined after the grammar */
static int lookup(void);
}


%%

/* Start rule: skip any leading plain whitespace, call mark_token_start(),
   then let slash_op scan the token itself. */
token : pure_ws* { mark_token_start(); } slash_op

/* Everything that can begin with '/' is sorted out here: the "/="
   operator, a comment (after which scanning restarts at token), or a
   lone '/'.  Any other input is handed to one_token.  The final, empty
   alternative matches without consuming anything; presumably this is
   the end-of-input case -- confirm. */
slash_op : "/=" { return TK_OPERATOR; }
	 | comment token
	 | '/' { return TK_OPERATOR; }
	 | one_token
	 |
	 ;

/* One token of each class and the value returned for it.  Identifiers
   go through lookup() so keywords get their own token value.  t_chars
   tokens return yysprev_char (presumably the character just consumed --
   confirm against the generator).  t_bracket has no action here because
   each of its alternatives returns from its own action. */
one_token : t_identifier { return lookup(); }
	  | t_literal { return TK_LITERAL; }
	  | t_operator { return TK_OPERATOR; }
	  | t_chars { return yysprev_char; }
	  | t_inc_dec { return TK_INC_DEC; }
	  | t_bracket
          ;

/* An identifier is one alpha followed by any number of alphanums.  The
   characters are collected in the identifier buffer through INIT_IDENT
   and ADD_IDENT; ADD_IDENT returns E_IDENT_TOO_LONG once MAX_IDENT
   characters have been stored. */
t_identifier : alpha { INIT_IDENT(yysprev_char); }
	       ( alphanum { ADD_IDENT(yysprev_char); } )*

/* characters allowed in identifiers: letters, '_' and '$', plus digits
   after the first character */
alpha : 'a' - 'z' | 'A' - 'Z' | '_' | '$'
alphanum : alpha | digit

/* digit classes used by the numeric literal rules */
digit : '0' - '9'
hexdigit : digit | 'a' - 'f' | 'A' - 'F'
octal : '0' - '7'

/* A literal is a number, a string, or a character constant. */
t_literal : number | string | char_constant

/* Numeric literals, in three shapes:
     - a non-zero digit, more digits, then an optional decimal_suffix
     - a fraction that starts with '.'
     - a leading '0': hex ("0x" or "0X" plus hex digits), octal digits,
       or nothing more, followed by an optional decimal_suffix
   The octal part is octal* rather than octal+ so that a bare "0", as
   well as "0.5" and "0L", is accepted as a literal.
   NOTE: forms that carry an exponent or float suffix without a '.',
   such as 1e10 or 1f, are not matched as a single literal by these
   rules. */
number : ('1' - '9') digit* decimal_suffix
       | '.' digit+ [exponent] [float_suffix]
       | '0' (('x' | 'X') hexdigit+ | octal*) decimal_suffix
       ;
/* what may follow the integer part: a fractional part (with optional
   exponent and float suffix), a long suffix, or nothing */
decimal_suffix : ('.' digit* [exponent] [float_suffix])
	       | 'l' | 'L'
	       | /* nothing */
	       ;
exponent : ('e' | 'E') ['+' | '-'] digit+
float_suffix : 'f' | 'F' | 'd' | 'D'

/* String literal: a double quote, any number of string_chars, and a
   closing double quote; issue_token(ELX_STRING) is called once the
   whole string has been matched.

   string_char is either a backslash followed by any character, or one
   character that is neither a double quote nor a backslash.  The range
   operators ->, <-> and <- appear to exclude the upper bound, both
   bounds, and the lower bound respectively -- confirm against the
   generator's documentation. */
string : '"' string_char* '"' { issue_token(ELX_STRING); }
string_char : '\1' -> '"' | '"' <-> '\\' | '\\' <- '\377' | '\\' '\1' - '\377'

/* Character constant: exactly one one_char between single quotes.
   one_char mirrors string_char with the single quote taking the place
   of the double quote.  No issue_token call is made for character
   constants. */
char_constant : '\'' one_char '\''
one_char : '\1' -> '\'' | '\'' <-> '\\' | '\\' <- '\377' | '\\' '\1' - '\377'

/* Comments.  issue_token(ELX_COMMENT) is called once a whole comment
   has been matched.
     - line comment: "//" up to and including the next newline
     - block comment: the opening marker, a body, and a closing run of
       one or more stars followed by '/'
   Inside a block comment body a run of stars only counts as body text
   when the character after it is neither '*' nor '/'.  Handling whole
   runs of stars (instead of pairing each '*' with the character after
   it) keeps comments that end in several stars, such as a banner line
   closed by two stars and a slash, from swallowing their own
   terminator. */
comment : ( "//" line_comment_char* '\n'
	  | "/*" (block_comment_char | stars block_non_term_char)* stars '/'
	  ) { issue_token(ELX_COMMENT); }
	;
/* any character except newline */
line_comment_char : '\1' -> '\n' | '\n' <- '\377'
/* any character except '*' */
block_comment_char : '\1' -> '*' | '*' <- '\377'
/* any character except '*' and '/': what may follow a run of stars
   without ending the comment */
block_non_term_char : '\1' -> '*' | '*' <-> '/' | '/' <- '\377'
/* one or more consecutive stars */
stars : '*' | stars '*'

/* Operators that one_token reports as TK_OPERATOR: shifts, comparisons,
   logical and/or, compound assignments, and the remaining one-character
   operators.  '/' and "/=" are not listed because slash_op handles
   them. */
t_operator : "<<" | ">>" | ">>>"
           | ">=" | "<=" | "==" | "!=" | "&&" | "||"
	   | "*=" | "%=" | "+=" | "-=" | "<<=" | ">>="
	   | ">>>=" | "&=" | "^=" | "|="
	   | '<' | '>' | '%' | '^' | '&' | '|'
	   ;
/* increment and decrement, reported by one_token as TK_INC_DEC */
t_inc_dec : "++" | "--"

/* Brackets.  A lone '[' is returned as the character itself; a '[' and
   ']' with nothing, or only whitespace and comments, between them are
   returned as the single token TK_DIM.

   note: the cases are spelled out separately instead of writing ws*
   between the brackets.  With ws*, the '[' form would only reduce on
   $end rather than on "any" character, which meant a '[' within the
   program text could not be recognized.  Separating out the cases
   avoids that problem. */
t_bracket : '[' { return '['; }
	  | '[' ']' { return TK_DIM; }
	  | '[' ws+ ']' { return TK_DIM; }
	  ;

/* Single-character tokens; one_token returns yysprev_char for these
   rather than a TK_ symbol. */
t_chars : ',' | ';' | '.' | '{' | '}' | '=' | '(' | ')' | ':'
        | ']' | '!' | '~' | '+' | '-' | '*' | '?'
	;

/* whitespace in the wide sense: plain whitespace or a comment (used
   between the brackets of t_bracket) */
ws : pure_ws | comment

/* plain whitespace characters: space, tab, newline, form feed */
pure_ws : ' ' | '\t' | '\n' | '\f'

%%

/*
 * Classify the text collected in the identifier buffer.
 *
 * If KR_find_keyword() recognizes the text, issue_token(ELX_KEYWORD) is
 * called and the keyword's value is returned.  Otherwise the buffer is
 * NUL-terminated, so that get_identifier() yields a proper C string, and
 * TK_IDENTIFIER is returned.
 */
static int lookup(void)
{
    const int token = KR_find_keyword(identifier, idlen);

    if (token != KR__not_found)
    {
        issue_token(ELX_KEYWORD);
        return token;
    }

    /* plain identifier: terminate it so the caller can read it as a string */
    identifier[idlen] = '\0';
    return TK_IDENTIFIER;
}

/*
 * Return a pointer to the scanner's static identifier buffer.  lookup()
 * NUL-terminates the buffer when it returns TK_IDENTIFIER.  The storage
 * is reused for every identifier scanned.
 */
const char *get_identifier(void)
{
    return &identifier[0];
}