Speed up the get() method by being smarter about regular expression use,
avoiding slicing of strings, and parsing in a more fine-tuned way.

(some debug stuff, too; this will disappear soon; just checkpointing now)


git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@186 8cb11bc2-c004-0410-86c3-e597b4017df7
remotes/tags/V0_7
gstein 2001-05-13 02:52:26 +00:00
parent 425ae2e8ec
commit fc6d80e2fb
1 changed file with 67 additions and 42 deletions

View File

@ -27,17 +27,21 @@ import time
class _TokenStream:
# Precompiled regular expressions
find_token = re.compile('^\\s*(;|@|.([^;\\s]*))(?P<ws>\\s*)')
rest_token = re.compile('^([^;\\s]*)(?P<ws>\\s*)')
odd_at = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
undo_escape = re.compile('@@')
nonws_token = re.compile('([^;\\s]*)(\\s*)')
# odd_at = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
# odd_at = re.compile('((@@)*)@([^@]|$)')
# undo_escape = re.compile('@@')
CHUNK_SIZE = 16384
# CHUNK_SIZE = 5 # for debugging, make the function grind...
def __init__(self, file):
self.rcsfile = file
self.save_token = None
self.buf = ''
self.idx = 0
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
if self.buf == '':
raise RuntimeError, 'EOF'
def get(self):
"Get the next token from the RCS file."
@ -48,66 +52,87 @@ class _TokenStream:
self.save_token = None
return token
idx = self.idx
while 1:
match = self.find_token.match(self.buf)
if match:
self.buf = string.lstrip(self.buf[idx:])
idx = 0
if self.buf:
# some non-whitespace exists, so go parse it
break
# if we didn't find something, then it is all white space (note that
# the pattern will match a non-white because of the "."). we can just
# toss the whole buffer and go for more.
# the whole buffer was whitespace. go get more.
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
if self.buf == '':
# signal EOF by returning None as the token
return None
# retrieve the match and trim it from the buffer
token = match.group(1)
self.buf = self.buf[match.end():]
if token == ';':
if self.buf[idx] == ';':
self.idx = idx + 1
return ';'
if token != '@':
if self.buf[idx] != '@':
match = self.nonws_token.match(self.buf, idx)
start, idx = match.span(1)
token = self.buf[start:idx]
# got a string of non-whitespace characters. if we recognized the rest
# of the buffer (and we didn't see trailing white space), then we may
# not have the whole token.
while self.buf == '' and match.group('ws') == 0:
while idx == len(self.buf) == match.end(2):
# hit the end (and trimmed it). get more data, and append the results
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
match = self.rest_token.match(self.buf)
if not match.group(1):
# first character (';' or '\\s') terminates the token
match = self.nonws_token.match(self.buf)
start, idx = match.span(1)
if idx == 0:
# the first character (';' or '\\s') terminated the token
break
token = token + match.group(1)
self.buf = self.buf[match.end():]
token = token + self.buf[start:idx]
self.idx = idx
# done piecing together tokens; return the bugger
return token
# a "string" which starts with the "@" character. the white space that
# we may have sucked up is the initial token.
token = match.group(3)
# a "string" which starts with the "@" character. we'll skip it when we
# search for content. initialize the token for gathering content.
idx = idx + 1
token = ''
chunks = [ ]
# start scanning blocks looking for the odd @ character which closes
# the RCS "string"
while 1:
match = self.odd_at.search(self.buf)
if match:
break
if idx == len(self.buf):
idx = 0
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
if self.buf == '':
raise RuntimeError, 'EOF'
double = string.find(self.buf, '@@', idx)
single = string.find(self.buf, '@', idx)
# print 'I:', idx, double, single
if double != -1 and double <= single:
chunks.append(self.buf[idx:double+1])
idx = double + 2
continue
if single == -1:
chunks.append(self.buf[idx:])
idx = len(self.buf)
continue
if single == len(self.buf) - 1:
chunks.append(self.buf[idx:single])
idx = 0
buf = self.rcsfile.read(self.CHUNK_SIZE)
if buf == '':
raise RuntimeError, 'EOF'
self.buf = '@' + buf
continue
chunks.append(self.buf[idx:single])
self.idx = single + 1
break
# nothing in the whole chunk. append it all and go for more.
token = token + self.buf
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
if self.buf == '':
raise RuntimeError, 'EOF'
# split up the chunk into "token" and "the rest"
token = token + self.buf[:match.end(1)]
self.buf = self.buf[match.end(1)+1:]
# undo the escape-encoding of @ characters
token = self.undo_escape.sub('@', token)
# print 'S:', `self.buf[self.idx:self.idx+10]`
return string.join(chunks, '')
# _get = get
# def get(self):
token = self._get()
print 'T:', `token`
return token
def match(self, match):