Check in the mxTextTools-based parser that I implemented back in

November. The timelog.fetch_log2() timing for my test input dropped from
.53 seconds to .34 seconds. The overall token-stream parsing time dropped
from .25 seconds to .10 seconds.

This also introduces an mget() method on the token stream so that the
parser can fetch multiple tokens in one shot, rather than needing to
call get() many times.
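
For example, the pattern below (taken from the parser changes in this
commit) reads a revision and its terminating semicolon in a single call;
mget() returns the 'next' token last:

    semi, rev = self.ts.mget(2)
    self.sink.set_head_revision(rev)
    if semi != ';':
      raise RCSExpected(semi, ';')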

In timelog, I've added a new compare_many() for verifying a batch of
files, time_stream() to time an individual token stream, profile_stream()
for profiling a stream, and revised the time_fetch() function to run
multiple fetches and to trim the outlier timings off both ends before
reporting.
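
The trimming keeps the middle ~90% of the sorted timings before
averaging. A sketch of that computation, mirroring the new time_fetch()
code ('times' holds the n per-run timings):

    times.sort()
    keep = times[int(n*.05) : int(n*.95)+1]   # n=100 keeps indices 5..95
    avg = reduce(lambda x, y: x + y, keep, 0) / len(keep)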


git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@456 8cb11bc2-c004-0410-86c3-e597b4017df7
gstein 2002-01-29 10:08:46 +00:00
parent 2c245d2bdb
commit b37103c4d7
2 changed files with 400 additions and 33 deletions


@@ -160,8 +160,317 @@ class _TokenStream:
self.get = give_it_back
def mget(self, count):
"Return multiple tokens. 'next' is at the end."
result = [ ]
for i in range(count):
result.append(self.get())
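    # the most recently fetched token is the stream's 'next' token;
    # reversing puts it at the end of the returned list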
result.reverse()
return result
try:
from mx import TextTools
except ImportError:
_mxTokenStream = None
else:
_tt = TextTools
_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   # leave '.' as part of the 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)
_onechar_token_set = _tt.set(':;')
_not_at_set = _tt.invset('@')
_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70
_E_COMPLETE = 100 # ended on a complete token
_E_TOKEN = 110 # ended mid-token
_E_STRING_SPAN = 130 # ended within a string
_E_STRING_END = 140 # ended with string-end ('@') (could be mid-@@)
_SUCCESS = +100
_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'
# continuation of a token over a chunk boundary
_c_token_table = (
(_T_TOKEN, _tt.AllInSet, _idchar_set),
)
class _mxTokenStream:
# the algorithm is about the same speed for any CHUNK_SIZE chosen.
# grab a good-sized chunk, but not too large to overwhelm memory.
CHUNK_SIZE = 100000
# CHUNK_SIZE = 5 # for debugging, make the function grind...
def __init__(self, file):
self.rcsfile = file
self.tokens = [ ]
self.partial = None
self.string_end = None
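    # self.partial holds (TOKEN-TYPE, [text chunks]) whenever a token or
    # @-string straddles a chunk boundary; self.string_end is filled in
    # by the _set_end callback when a string terminator is found.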
def _parse_chunk(self, buf, start=0):
"Get the next token from the RCS file."
buflen = len(buf)
assert start < buflen
# construct a tag table which refers to the buffer we need to parse.
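    # each tag table entry is roughly (tagobj, command+flags, argument,
    # jump-if-no-match, jump-if-match); the jumps are offsets relative to
    # the current entry, and _SUCCESS jumps past the end of the table,
    # which terminates matching successfully.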
table = (
# ignore whitespace. with or without whitespace, move to the next rule.
(None, _tt.AllInSet, _tt.whitespace_set, +1),
(_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
# accumulate token text and exit, or move to the next rule.
(_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),
(_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),
# single character tokens exit immediately, or move to the next rule
(_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),
(_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),
# if this isn't an '@' symbol, then we have a syntax error (go to a
# negative index to indicate that condition). otherwise, suck it up
# and move to the next rule.
(_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),
(None, _tt.Is, '@', +4, +1),
(buf, _tt.Is, '@', +1, -1),
(_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
(_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),
(_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
# suck up everything that isn't an AT. go to next rule to look for EOF
(buf, _tt.AllInSet, _not_at_set, 0, +1),
# go back to look for double AT if we aren't at the end of the string
(_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
)
success, taglist, idx = _tt.tag(buf, table, start)
if not success:
### need a better way to report this error
raise RCSIllegalCharacter()
assert idx == buflen
# pop off the last item
last_which = taglist.pop()
i = 0
tlen = len(taglist)
while i < tlen:
if taglist[i] == _T_STRING_START:
j = i + 1
while j < tlen:
if taglist[j] == _T_STRING_END:
s = _tt.join(taglist, '', i+1, j)
del taglist[i:j]
tlen = len(taglist)
taglist[i] = s
break
j = j + 1
else:
assert last_which == _E_STRING_SPAN
s = _tt.join(taglist, '', i+1)
del taglist[i:]
self.partial = (_T_STRING_SPAN, [ s ])
break
i = i + 1
# figure out whether we have a partial last-token
if last_which == _E_TOKEN:
self.partial = (_T_TOKEN, [ taglist.pop() ])
elif last_which == _E_COMPLETE:
pass
elif last_which == _E_STRING_SPAN:
assert self.partial
else:
assert last_which == _E_STRING_END
self.partial = (_T_STRING_END, [ taglist.pop() ])
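    # self.tokens is kept reversed so that get() can pop() the next token
    # off the end; older, still-unconsumed tokens are extended on last,
    # so they pop first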
taglist.reverse()
taglist.extend(self.tokens)
self.tokens = taglist
def _set_end(self, taglist, text, l, r, subtags):
self.string_end = l
def _handle_partial(self, buf):
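    # Consume the continuation (at the start of 'buf') of a token or
    # string left dangling by the previous chunk. Returns the index into
    # 'buf' where normal chunk parsing should resume.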
which, chunks = self.partial
if which == _T_TOKEN:
success, taglist, idx = _tt.tag(buf, _c_token_table)
if not success:
# The start of this buffer was not a token. So the end of the
# prior buffer was a complete token.
self.tokens.insert(0, string.join(chunks, ''))
else:
assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
and taglist[0][1] == 0 and taglist[0][2] == idx
if idx == len(buf):
#
# The whole buffer was one huge token, so we may have a
# partial token again.
#
# Note: this modifies the list of chunks in self.partial
#
chunks.append(buf)
# consumed the whole buffer
return len(buf)
# got the rest of the token.
chunks.append(buf[:idx])
self.tokens.insert(0, string.join(chunks, ''))
# no more partial token
self.partial = None
return idx
if which == _T_STRING_END:
if buf[0] != '@':
self.tokens.insert(0, string.join(chunks, ''))
return 0
chunks.append('@')
start = 1
else:
start = 0
self.string_end = None
string_table = (
(None, _tt.Is, '@', +3, +1),
(_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
(self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),
(None, _tt.EOF, _tt.Here, +1, _SUCCESS),
# suck up everything that isn't an AT. move to next rule to look
# for EOF
(_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),
# go back to look for double AT if we aren't at the end of the string
(None, _tt.EOF, _tt.Here, -5, _SUCCESS),
)
success, unused, idx = _tt.tag(buf, string_table,
start, len(buf), chunks)
# must have matched at least one item
assert success
if self.string_end is None:
assert idx == len(buf)
self.partial = (_T_STRING_SPAN, chunks)
elif self.string_end < len(buf):
self.partial = None
self.tokens.insert(0, string.join(chunks, ''))
else:
self.partial = (_T_STRING_END, chunks)
return idx
def _parse_more(self):
buf = self.rcsfile.read(self.CHUNK_SIZE)
if not buf:
return _EOF
if self.partial:
idx = self._handle_partial(buf)
if idx is None:
return _CONTINUE
if idx < len(buf):
self._parse_chunk(buf, idx)
else:
self._parse_chunk(buf)
return _CONTINUE
def get(self):
try:
return self.tokens.pop()
except IndexError:
pass
while not self.tokens:
action = self._parse_more()
if action == _EOF:
return None
return self.tokens.pop()
#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token
def match(self, match):
if self.tokens:
token = self.tokens.pop()
if token != match:
raise RuntimeError, ('Unexpected parsing error in RCS file.\n'
'Expected token: %s, but saw: %s'
% (match, token))
else:
token = self.get()
if token != match:
raise RuntimeError, ('Unexpected parsing error in RCS file.\n'
'Expected token: %s, but saw: %s'
% (match, token))
def unget(self, token):
self.tokens.append(token)
def mget(self, count):
"Return multiple tokens. 'next' is at the end."
while len(self.tokens) < count:
action = self._parse_more()
if action == _EOF:
### fix this
raise RuntimeError, 'EOF hit while expecting tokens'
result = self.tokens[-count:]
del self.tokens[-count:]
return result
class RCSParseError(Exception):
pass
class RCSIllegalCharacter(RCSParseError):
pass
### need more work on this one
class RCSExpected(RCSParseError):
def __init__(self, got, wanted):
pass
class Parser:
stream_class = _mxTokenStream or _TokenStream
def parse_rcs_admin(self):
while 1:
# Read initial token at beginning of line
@@ -173,21 +482,32 @@ class Parser:
return
if token == "head":
self.sink.set_head_revision(self.ts.get())
self.ts.match(';')
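      # mget(2) returns the 'next' token last, so 'rev' appears before
      # the ';' in the file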
semi, rev = self.ts.mget(2)
self.sink.set_head_revision(rev)
if semi != ';':
raise RCSExpected(semi, ';')
elif token == "branch":
self.sink.set_principal_branch(self.ts.get())
self.ts.match(';')
semi, branch = self.ts.mget(2)
self.sink.set_principal_branch(branch)
if semi != ';':
raise RCSExpected(semi, ';')
elif token == "symbols":
while 1:
tag = self.ts.get()
if tag == ';':
break
(tag_name, tag_rev) = string.split(tag, ':')
if self.stream_class == _mxTokenStream:
self.ts.match(':')
tag_name = tag
tag_rev = self.ts.get()
else:
(tag_name, tag_rev) = string.split(tag, ':')
self.sink.define_tag(tag_name, tag_rev)
elif token == "comment":
self.sink.set_comment(self.ts.get())
self.ts.match(';')
semi, comment = self.ts.mget(2)
self.sink.set_comment(comment)
if semi != ';':
raise RCSExpected(semi, ';')
# Ignore all these other fields - We don't care about them. Also chews
# up "newphrase".
@@ -212,9 +532,11 @@ class Parser:
return
# Parse date
self.ts.match('date')
date = self.ts.get()
self.ts.match(';')
semi, date, sym = self.ts.mget(3)
if sym != 'date':
raise RCSExpected(sym, 'date')
if semi != ';':
raise RCSExpected(semi, ';')
# Convert date into timestamp
date_fields = string.split(date, '.') + ['0', '0', '0']
@@ -224,9 +546,11 @@ class Parser:
timestamp = time.mktime(tuple(date_fields))
# Parse author
self.ts.match('author')
author = self.ts.get()
self.ts.match(';')
semi, author, sym = self.ts.mget(3)
if sym != 'author':
raise RCSExpected(sym, 'author')
if semi != ';':
raise RCSExpected(semi, ';')
# Parse state
self.ts.match('state')
@@ -248,8 +572,9 @@ class Parser:
branches.append(token)
# Parse revision of next delta in chain
self.ts.match('next')
next = self.ts.get()
next, sym = self.ts.mget(2)
if sym != 'next':
raise RCSExpected(sym, 'next')
if next == ';':
next = None
else:
@@ -283,15 +608,17 @@ class Parser:
if revision is None:
# EOF
break
self.ts.match('log')
log = self.ts.get()
text, sym2, log, sym1 = self.ts.mget(4)
if sym1 != 'log':
print `text[:100], sym2[:100], log[:100], sym1[:100]`
raise RCSExpected(sym1, 'log')
if sym2 != 'text':
raise RCSExpected(sym2, 'text')
### need to add code to chew up "newphrase"
self.ts.match('text')
text = self.ts.get()
self.sink.set_revision_info(revision, log, text)
def parse(self, file, sink):
self.ts = _TokenStream(file)
self.ts = self.stream_class(file)
self.sink = sink
self.parse_rcs_admin()


@@ -84,6 +84,8 @@ def fetch_log2(full_name, which_rev=None):
return sink.head, sink.branch, sink.tags, sink.revs
def compare_fetch(full_name, which_rev=None):
# d1 and d2 are:
# ( HEAD revision, branch name, TAGS { name : revision }, [ LogEntry ] )
d1 = viewcvs.fetch_log(full_name, which_rev)
d2 = fetch_log2(full_name, which_rev)
if d1[:3] != d2[:3]:
@@ -102,21 +104,59 @@ def compare_fetch(full_name, which_rev=None):
if vars(d1[3][i]) != vars(d2[3][i]):
pprint.pprint((i, vars(d1[3][i]), vars(d2[3][i])))
def time_fetch(full_name, which_rev=None):
t = time.time()
viewcvs.fetch_log(full_name, which_rev)
t1 = time.time() - t
t = time.time()
fetch_log2(full_name, which_rev)
t2 = time.time() - t
print t1, t2
def compare_many(files):
for file in files:
print file, '...'
compare_fetch(file)
def profile_fetch(full_name, which_rev=None):
def time_stream(stream_class, filename, n=10):
d1 = d2 = d3 = d4 = 0
t = time.time()
for i in range(n):
ts = stream_class(open(filename))
while ts.get() is not None:
pass
t = time.time() - t
print t/n
def time_fetch(full_name, which_rev=None, n=1):
times1 = [ None ] * n
times2 = [ None ] * n
for i in range(n):
t = time.time()
viewcvs.fetch_log(full_name, which_rev)
times1[i] = time.time() - t
for i in range(n):
t = time.time()
fetch_log2(full_name, which_rev)
times2[i] = time.time() - t
times1.sort()
times2.sort()
i1 = int(n*.05)
i2 = int(n*.95)+1
times1 = times1[i1:i2]
times2 = times2[i1:i2]
t1 = reduce(lambda x,y: x+y, times1, 0) / len(times1)
t2 = reduce(lambda x,y: x+y, times2, 0) / len(times2)
print "t1=%.4f (%.4f .. %.4f) t2=%.4f (%.4f .. %.4f)" % \
(t1, times1[0], times1[-1], t2, times2[0], times2[-1])
def profile_stream(stream_class, filename, n=20):
p = profile.Profile()
def many_calls(*args):
for i in xrange(10):
apply(fetch_log2, args)
p.runcall(many_calls, full_name, which_rev)
def many_calls(filename, n):
for i in xrange(n):
ts = stream_class(open(filename))
while ts.get() is not None:
pass
p.runcall(many_calls, filename, n)
p.print_stats()
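# illustrative call (the ',v' path is made up; pass whichever stream
# class the parser module defines, e.g. its mxTextTools-based one):
#   profile_stream(_mxTokenStream, '/tmp/example,v')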
def profile_fetch(full_name, which_rev=None, n=10):
p = profile.Profile()
def many_calls(full_name, which_rev, n):
for i in xrange(n):
fetch_log2(full_name, which_rev)
p.runcall(many_calls, full_name, which_rev, n)
p.print_stats()
def varysize(full_name, which_rev=None):