# viewvc-4intranet/lib/vclib/ccvs/rcsparse/texttools.py
# -*-python-*-
#
# Copyright (C) 1999-2013 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewVC
# distribution or at http://viewvc.org/license-1.html.
#
# For more information, visit http://viewvc.org/
#
# -----------------------------------------------------------------------

import string

# note: this will raise an ImportError if it isn't available. the rcsparse
# package will recognize this and switch over to the default parser.
from mx import TextTools

import common

# for convenience
_tt = TextTools

_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   # leave as part of 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')

_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)

_onechar_token_set = _tt.set(':;')
_not_at_set = _tt.invset('@')
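
# Per the RCS grammar (an illustration, not from the original source):
# words like 'head', '1.1', or 'author' are runs of _idchar_set characters
# and tokenize as single symbols; ':' and ';' always stand alone; and '@'
# serves only to delimit strings (doubled as '@@' to escape a literal '@'
# inside one).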

_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70

_E_COMPLETE = 100     # ended on a complete token
_E_TOKEN = 110        # ended mid-token
_E_STRING_SPAN = 130  # ended within a string
_E_STRING_END = 140   # ended with string-end ('@') (could be mid-@@)

_SUCCESS = +100

_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'

# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )
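
# For example (an illustration, not from the original source): if a chunk
# boundary splits the token 'head' after 'h', the next chunk begins
# 'ead...'.  Tagging it with _c_token_table yields one entry,
# (_T_TOKEN, 0, 3, None), giving the slice bounds of the token's
# continuation -- exactly the shape _handle_partial asserts below.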


class _mxTokenStream:

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not one so large that it overwhelms memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE = 192 * 512  # about 100k

  # CHUNK_SIZE = 5  # for debugging, make the function grind...

  def __init__(self, file):
    self.rcsfile = file
    self.tokens = [ ]
    self.partial = None

    self.string_end = None

  def _parse_chunk(self, buf, start=0):
    "Get the next token from the RCS file."

    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    table = (
      #1: ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      #2
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #3: accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      #4
      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      #5: single-character tokens exit immediately, or move to the next rule.
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      #6
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
      #   negative index to indicate that condition). otherwise, suck it up
      #   and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      #8
      (None, _tt.Is, '@', +4, +1),
      #9
      (buf, _tt.Is, '@', +1, -1),
      #10
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
      #11
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      #12
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #13: suck up everything that isn't an AT. go to next rule to look for EOF.
      (buf, _tt.AllInSet, _not_at_set, 0, +1),

      #14: go back to look for a double AT if we aren't at the end of the string.
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )
    # Fast, texttools may be, but it's somewhat lacking in clarity.
    # Here's an attempt to document the logic encoded in the table above:
    #
    # Flowchart:
    #                              ____
    #                             /    \
    #    1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11
    #    ^         \/   \/        \/   /\         \/
    #    |         4    6         12   14         /
    #    \_________/____/           \  /         /
    #     \                          13         /
    #      \___________________________________/
    #
    # #1: Skip over any whitespace.
    # #2: If now EOF, exit with code _E_COMPLETE.
    # #3: If we have a series of characters in _idchar_set, then:
    # #4:    Output them as a token, and go back to #1.
    # #5: If we have a character in _onechar_token_set, then:
    # #6:    Output it as a token, and go back to #1.
    # #7: If we do not have an '@', then error.
    #     If we do, then log a _T_STRING_START and continue.
    # #8: If we have another '@', continue on to #9.  Otherwise:
    # #12:   If now EOF, exit with code _E_STRING_SPAN.
    # #13:   Record the slice up to the next '@' (or EOF).
    # #14:   If now EOF, exit with code _E_STRING_SPAN.
    #        Otherwise, go back to #8.
    # #9: If we have another '@', then we've just seen an escaped
    #     (by doubling) '@' within an @-string.  Record a slice including
    #     just one '@' character, and jump back to #8.
    #     Otherwise, we've *either* seen the terminating '@' of an @-string,
    #     *or* we've seen one half of an escaped @@ sequence that just
    #     happened to be split over a chunk boundary - in either case,
    #     we continue on to #10.
    # #10: Log a _T_STRING_END.
    # #11: If now EOF, exit with _E_STRING_END.  Otherwise, go back to #1.
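    #
    # Illustration (not from the original source): tagging the buffer
    # "head 1.1;\n" with this table appends 'head', '1.1', and ';' to the
    # taglist and ends with _E_COMPLETE, while "@a@@b@" appends
    # _T_STRING_START, slice tags for 'a', '@', and 'b', then
    # _T_STRING_END -- the loop below joins those slices into 'a@b'.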

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    assert idx == buflen

    # pop off the last item
    last_which = taglist.pop()

    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          # no _T_STRING_END in the rest of the taglist: the string is
          # still open at the end of this chunk.
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      assert self.partial
    else:
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist
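
  # Illustration (not from the original source): with a CHUNK_SIZE of 4,
  # the input 'head 1.1;' arrives as 'head', ' 1.1', ';'.  The first chunk
  # ends mid-token, so _parse_chunk leaves self.partial = (_T_TOKEN,
  # ['head']); only when the next chunk opens with a non-token character
  # does _handle_partial learn that 'head' was in fact complete.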

  def _set_end(self, taglist, text, l, r, subtags):
    # CallTag callback: record where the string ended within this chunk.
    self.string_end = l

  def _handle_partial(self, buf):
    which, chunks = self.partial

    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, string.join(chunks, ''))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):
          #
          # The whole buffer was one huge token, so we may have a
          # partial token again.
          #
          # Note: this modifies the list of chunks in self.partial
          #
          chunks.append(buf)

          # consumed the whole buffer
          return len(buf)

        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, string.join(chunks, ''))

      # no more partial token
      self.partial = None

      return idx

    if which == _T_STRING_END:
      if buf[0] != '@':
        self.tokens.insert(0, string.join(chunks, ''))
        return 0
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    string_table = (
      # if we see an '@', it either closes the string or opens an '@@' escape.
      (None, _tt.Is, '@', +3, +1),
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),

      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. move to the next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),

      # go back to look for a double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )

    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)

    # must have matched at least one item
    assert success

    if self.string_end is None:
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      self.partial = None
      self.tokens.insert(0, string.join(chunks, ''))
    else:
      self.partial = (_T_STRING_END, chunks)

    return idx
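
  # Illustration (not from the original source): if one chunk ends exactly
  # at '...text@', the trailing '@' may be the string terminator or the
  # first half of an escaped '@@'.  _parse_chunk records this as
  # self.partial = (_T_STRING_END, chunks); if the next chunk then opens
  # with another '@', _handle_partial appends a single literal '@' and
  # keeps scanning the string, otherwise the string is complete.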

  def _parse_more(self):
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      return _EOF

    if self.partial:
      idx = self._handle_partial(buf)
      if idx is None:
        return _CONTINUE
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE

  def get(self):
    try:
      return self.tokens.pop()
    except IndexError:
      pass

    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None

    return self.tokens.pop()

#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token

  def match(self, match):
    if self.tokens:
      token = self.tokens.pop()
    else:
      token = self.get()

    if token != match:
      raise common.RCSExpected(token, match)

  def unget(self, token):
    self.tokens.append(token)

  def mget(self, count):
    "Return multiple tokens. 'next' is at the end."
    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        ### fix this
        raise RuntimeError, 'EOF hit while expecting tokens'
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result
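
  # Illustration (not from the original source): self.tokens is kept in
  # reverse order, so if the next three tokens in the file are 'a', 'b',
  # 'c', then mget(2) returns ['b', 'a'] -- the very next token, 'a',
  # sits at the end of the returned list.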


class Parser(common._Parser):
  stream_class = _mxTokenStream
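
# A minimal usage sketch (not part of the original module; 'file,v' is a
# hypothetical RCS archive).  The stream yields RCS tokens in file order:
#
#   stream = _mxTokenStream(open('file,v', 'rb'))
#   assert stream.get() == 'head'   # every RCS file begins with 'head'
#   revnum = stream.get()           # e.g. '1.2'
#   stream.match(';')               # raises common.RCSExpected on mismatch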