349 lines
10 KiB
Python
349 lines
10 KiB
Python
# -*-python-*-
|
|
#
|
|
# Copyright (C) 1999-2013 The ViewCVS Group. All Rights Reserved.
|
|
#
|
|
# By using this file, you agree to the terms and conditions set forth in
|
|
# the LICENSE.html file which can be found at the top level of the ViewVC
|
|
# distribution or at http://viewvc.org/license-1.html.
|
|
#
|
|
# For more information, visit http://viewvc.org/
|
|
#
|
|
# -----------------------------------------------------------------------
|
|
|
|
import string
|
|
|
|
# note: this will raise an ImportError if it isn't available. the rcsparse
|
|
# package will recognize this and switch over to the default parser.
|
|
from mx import TextTools
|
|
|
|
import common
|
|
|
|
|
|
# Short alias for the mx.TextTools module, used throughout this file.
_tt = TextTools

# Characters that may appear in an identifier token: character codes
# 33-126 and 160-255, minus the characters removed below.
# note: '.' is deliberately NOT removed; it is left as part of the 'num'
# symbol.
_idchar_list = map(chr, range(33, 127))
_idchar_list.extend(map(chr, range(160, 256)))
for _special in ('$', ',', ':', ';', '@'):
  _idchar_list.remove(_special)
del _special    # do not leave the loop variable behind in the module
_idchar = ''.join(_idchar_list)
_idchar_set = _tt.set(_idchar)

# Characters that form a token all by themselves.
_onechar_token_set = _tt.set(':;')

# Everything except the string delimiter '@'.
_not_at_set = _tt.invset('@')

# Markers identifying the kind of item recorded while scanning.
_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70

# Markers describing the state in which the scan of a chunk ended.
_E_COMPLETE = 100       # ended on a complete token
_E_TOKEN = 110          # ended mid-token
_E_STRING_SPAN = 130    # ended within a string
_E_STRING_END = 140     # ended with string-end ('@') (could be mid-@@)

# Jump offset used by the tag tables for their "finished" exits.
_SUCCESS = +100

# Results reported by _mxTokenStream._parse_more() (_EOF, _CONTINUE) and
# the placeholder tag object for table entries whose tag object is not
# used (_UNUSED).
_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'


# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )
|
|
|
|
class _mxTokenStream:
  """Tokenizer for RCS files, driven by mx.TextTools tag tables.

  The file is read in pieces of CHUNK_SIZE.  Each piece is scanned with a
  tag table and the tokens found are kept in self.tokens in REVERSE
  order, so that the next token to hand out is always the last list item
  (see get() and unget()).  A token or @-string that is cut off by the
  end of a piece is remembered in self.partial and finished when the next
  piece has been read.
  """

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not too large to overwhelm memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE = 192 * 512  # about 100k

  # CHUNK_SIZE = 5  # for debugging, make the function grind...

  def __init__(self, file):
    """Create a token stream reading from FILE.

    FILE only needs a read(size) method that returns an empty string at
    end of file.
    """
    self.rcsfile = file

    # pending tokens, reversed: the next token is self.tokens[-1]
    self.tokens = [ ]

    # None, or a tuple (kind, chunks) describing an item that was cut off
    # by the end of the previous chunk.  KIND is _T_TOKEN, _T_STRING_SPAN
    # or _T_STRING_END; CHUNKS is the list of text pieces gathered so far.
    self.partial = None

    # set by _set_end() while _handle_partial() scans a continued string
    self.string_end = None

  def _parse_chunk(self, buf, start=0):
    """Tokenize BUF, beginning at index START.

    The tokens found are placed in self.tokens so that they are handed
    out after the tokens already pending.  If BUF ends in the middle of a
    token or string, that item is stored in self.partial instead.

    Raises common.RCSIllegalCharacter if the tag table cannot match the
    buffer.
    """

    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    table = (
      #1: ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      #2
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #3: accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      #4
      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      #5: single character tokens exit immediately, or move to the next rule
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      #6
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      #8
      (None, _tt.Is, '@', +4, +1),
      #9
      (buf, _tt.Is, '@', +1, -1),
      #10
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
      #11
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      #12
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #13: suck up everything that isn't an AT. go to next rule to look for EOF
      (buf, _tt.AllInSet, _not_at_set, 0, +1),

      #14: go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )

    # Fast, texttools may be, but it's somewhat lacking in clarity.
    # Here's an attempt to document the logic encoded in the table above:
    #
    # Flowchart:
    #                            _____
    #                           /   /\
    # 1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11
    # |         \/   \/        \/  /\          \/
    #  \        4    6         12  14          /
    #   \_______/_____/         \  /          /
    #    \                       13          /
    #     \_________________________________/
    #
    # #1:  Skip over any whitespace.
    # #2:  If now EOF, exit with code _E_COMPLETE.
    # #3:  If we have a series of characters in _idchar_set, then:
    #      #4:  Output them as a token, and go back to #1.
    # #5:  If we have a character in _onechar_token_set, then:
    #      #6:  Output it as a token, and go back to #1.
    # #7:  If we do not have an '@', then error.
    #      If we do, then log a _T_STRING_START and continue.
    # #8:  If we have another '@', continue on to #9.  Otherwise:
    #      #12:  If now EOF, exit with code _E_STRING_SPAN.
    #      #13:  Record the slice up to the next '@' (or EOF).
    #      #14:  If now EOF, exit with code _E_STRING_SPAN.
    #            Otherwise, go back to #8.
    # #9:  If we have another '@', then we've just seen an escaped
    #      (by doubling) '@' within an @-string.  Record a slice including
    #      just one '@' character, and jump back to #8.
    #      Otherwise, we've *either* seen the terminating '@' of an @-string,
    #      *or* we've seen one half of an escaped @@ sequence that just
    #      happened to be split over a chunk boundary - in either case,
    #      we continue on to #10.
    # #10: Log a _T_STRING_END.
    # #11: If now EOF, exit with _E_STRING_END.  Otherwise, go back to #1.

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    assert idx == buflen

    # pop off the last item: it is the _E_* code saying how the chunk ended
    last_which = taglist.pop()

    # collapse each _T_STRING_START ... _T_STRING_END run into one string
    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            # drop the start marker and the slices; the end marker moves
            # down to index i and is then replaced by the joined string.
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          # no end marker: the string runs past the end of this chunk
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      assert self.partial
    else:
      # the chunk ended right on an '@': either the end of the string or
      # the first half of an escaped '@@' (decided by the next chunk).
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    # tokens are consumed from the end of the list, so store the new ones
    # reversed and in front of those still pending.
    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist

  def _set_end(self, taglist, text, l, r, subtags):
    """CallTag callback for the string table in _handle_partial().

    Records L, the position at which the table entry was applied, as the
    end of the string.  The other arguments are ignored.
    """
    self.string_end = l

  def _handle_partial(self, buf):
    """Finish the item in self.partial using the new chunk BUF.

    BUF must not be empty.  Returns the index in BUF at which regular
    parsing should resume; a return value of len(buf) means the whole
    chunk was consumed.  self.partial is cleared when the item was
    completed and updated when the item continues past the end of BUF.
    """
    which, chunks = self.partial
    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, ''.join(chunks))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):
          #
          # The whole buffer was one huge token, so we may have a
          # partial token again.
          #
          # Note: this modifies the list of chunks in self.partial
          #
          chunks.append(buf)

          # consumed the whole buffer
          return len(buf)

        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, ''.join(chunks))

      # no more partial token
      self.partial = None

      return idx

    if which == _T_STRING_END:
      if buf[0] != '@':
        # The '@' that ended the previous chunk really was the end of the
        # string.  Emit the string and forget the partial; leaving it in
        # place would emit the string a second time if this chunk happens
        # to end on a complete token.
        self.tokens.insert(0, ''.join(chunks))
        self.partial = None
        return 0
      # an escaped '@@' was split over the chunk boundary
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    string_table = (
      (None, _tt.Is, '@', +3, +1),
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),

      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. move to next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )

    # the matched text is appended directly to CHUNKS
    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)

    # must have matched at least one item
    assert success

    if self.string_end is None:
      # never saw the closing '@': the string continues in the next chunk
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      # the string ended inside this chunk
      self.partial = None
      self.tokens.insert(0, ''.join(chunks))
    else:
      # the chunk ended right on an '@'; the next chunk decides whether
      # that was the end of the string or half of an escaped '@@'.
      self.partial = (_T_STRING_END, chunks)

    return idx

  def _parse_more(self):
    """Read and tokenize the next chunk of the file.

    Returns _EOF when the file is exhausted and nothing more can be
    produced, _CONTINUE otherwise.
    """
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      if self.partial:
        which, chunks = self.partial
        if which in (_T_TOKEN, _T_STRING_END):
          # The file ended directly after a token or after the closing '@'
          # of a string.  Nothing can follow, so the pending item is
          # complete; hand it out instead of losing it.
          self.partial = None
          self.tokens.insert(0, ''.join(chunks))
          return _CONTINUE
        # An unterminated string (_T_STRING_SPAN) is still reported as a
        # plain EOF.
      return _EOF

    if self.partial:
      idx = self._handle_partial(buf)
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE

  def get(self):
    """Return the next token, or None at end of file."""
    try:
      return self.tokens.pop()
    except IndexError:
      pass

    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None

    return self.tokens.pop()

  # Debugging aid: enable the lines below to print every token returned.
  # _get = get
  # def get(self):
  #   token = self._get()
  #   print('T: %r' % (token,))
  #   return token

  def match(self, match):
    """Consume the next token and check that it equals MATCH.

    Raises common.RCSExpected(token, match) if it does not.
    """
    if self.tokens:
      token = self.tokens.pop()
    else:
      token = self.get()

    if token != match:
      raise common.RCSExpected(token, match)

  def unget(self, token):
    """Push TOKEN back so that it is the next token returned."""
    self.tokens.append(token)

  def mget(self, count):
    """Return multiple tokens. 'next' is at the end.

    The result is a list of COUNT tokens in the stream's internal
    (reversed) order.  Raises RuntimeError if the file ends before COUNT
    tokens are available.
    """
    if count <= 0:
      # self.tokens[-0:] would be the whole list, so asking for zero
      # tokens must not reach the slicing below.
      return [ ]

    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        ### fix this
        raise RuntimeError('EOF hit while expecting tokens')
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result
|
|
|
|
|
|
class Parser(common._Parser):
  """Parser variant whose stream_class is the mx.TextTools token stream."""

  stream_class = _mxTokenStream
|