Speed up the get() method by being smarter about regular expression use,
avoiding slicing of strings, and more fine-tuned parsing. (Some debug stuff, too; this will disappear soon; just checkpointing now.)

git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@186 8cb11bc2-c004-0410-86c3-e597b4017df7

remotes/tags/V0_7
parent
425ae2e8ec
commit
fc6d80e2fb
109
lib/rcsparse.py
109
lib/rcsparse.py
|
@ -27,17 +27,21 @@ import time
|
||||||
|
|
||||||
class _TokenStream:
|
class _TokenStream:
|
||||||
# Precompiled regular expressions
|
# Precompiled regular expressions
|
||||||
find_token = re.compile('^\\s*(;|@|.([^;\\s]*))(?P<ws>\\s*)')
|
nonws_token = re.compile('([^;\\s]*)(\\s*)')
|
||||||
rest_token = re.compile('^([^;\\s]*)(?P<ws>\\s*)')
|
# odd_at = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
|
||||||
odd_at = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
|
# odd_at = re.compile('((@@)*)@([^@]|$)')
|
||||||
undo_escape = re.compile('@@')
|
# undo_escape = re.compile('@@')
|
||||||
|
|
||||||
CHUNK_SIZE = 16384
|
CHUNK_SIZE = 16384
|
||||||
|
# CHUNK_SIZE = 5 # for debugging, make the function grind...
|
||||||
|
|
||||||
def __init__(self, file):
|
def __init__(self, file):
|
||||||
self.rcsfile = file
|
self.rcsfile = file
|
||||||
self.save_token = None
|
self.save_token = None
|
||||||
self.buf = ''
|
self.idx = 0
|
||||||
|
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
||||||
|
if self.buf == '':
|
||||||
|
raise RuntimeError, 'EOF'
|
||||||
|
|
||||||
def get(self):
|
def get(self):
|
||||||
"Get the next token from the RCS file."
|
"Get the next token from the RCS file."
|
||||||
|
@ -48,66 +52,87 @@ class _TokenStream:
|
||||||
self.save_token = None
|
self.save_token = None
|
||||||
return token
|
return token
|
||||||
|
|
||||||
|
idx = self.idx
|
||||||
while 1:
|
while 1:
|
||||||
match = self.find_token.match(self.buf)
|
self.buf = string.lstrip(self.buf[idx:])
|
||||||
if match:
|
idx = 0
|
||||||
|
if self.buf:
|
||||||
|
# some non-whitespace exists, so go parse it
|
||||||
break
|
break
|
||||||
# if we didn't find something, then it is all white space (note that
|
# the whole buffer was whitespace. go get more.
|
||||||
# the pattern will match a non-white because of the "."). we can just
|
|
||||||
# toss the whole buffer and go for more.
|
|
||||||
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
||||||
if self.buf == '':
|
if self.buf == '':
|
||||||
# signal EOF by returning None as the token
|
# signal EOF by returning None as the token
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# retrieve the match and trim it from the buffer
|
if self.buf[idx] == ';':
|
||||||
token = match.group(1)
|
self.idx = idx + 1
|
||||||
self.buf = self.buf[match.end():]
|
|
||||||
|
|
||||||
if token == ';':
|
|
||||||
return ';'
|
return ';'
|
||||||
|
|
||||||
if token != '@':
|
if self.buf[idx] != '@':
|
||||||
|
match = self.nonws_token.match(self.buf, idx)
|
||||||
|
start, idx = match.span(1)
|
||||||
|
token = self.buf[start:idx]
|
||||||
# got a string of non-whitespace characters. if we recognized the rest
|
# got a string of non-whitespace characters. if we recognized the rest
|
||||||
# of the buffer (and we didn't see trailing white space), then we may
|
# of the buffer (and we didn't see trailing white space), then we may
|
||||||
# not have the whole token.
|
# not have the whole token.
|
||||||
while self.buf == '' and match.group('ws') == 0:
|
while idx == len(self.buf) == match.end(2):
|
||||||
# hit the end (and trimmed it). get more data, and append the results
|
# hit the end (and trimmed it). get more data, and append the results
|
||||||
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
||||||
match = self.rest_token.match(self.buf)
|
match = self.nonws_token.match(self.buf)
|
||||||
if not match.group(1):
|
start, idx = match.span(1)
|
||||||
# first character (';' or '\\s') terminates the token
|
if idx == 0:
|
||||||
|
# the first character (';' or '\\s') terminated the token
|
||||||
break
|
break
|
||||||
token = token + match.group(1)
|
token = token + self.buf[start:idx]
|
||||||
self.buf = self.buf[match.end():]
|
|
||||||
|
|
||||||
|
self.idx = idx
|
||||||
# done piecing together tokens; return the bugger
|
# done piecing together tokens; return the bugger
|
||||||
return token
|
return token
|
||||||
|
|
||||||
# a "string" which starts with the "@" character. the white space that
|
# a "string" which starts with the "@" character. we'll skip it when we
|
||||||
# we may have sucked up is the initial token.
|
# search for content. initialize the token for gathering content.
|
||||||
token = match.group(3)
|
idx = idx + 1
|
||||||
|
token = ''
|
||||||
|
|
||||||
|
chunks = [ ]
|
||||||
|
|
||||||
# start scanning blocks looking for the odd @ character which closes
|
|
||||||
# the RCS "string"
|
|
||||||
while 1:
|
while 1:
|
||||||
match = self.odd_at.search(self.buf)
|
if idx == len(self.buf):
|
||||||
if match:
|
idx = 0
|
||||||
break
|
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
||||||
|
if self.buf == '':
|
||||||
|
raise RuntimeError, 'EOF'
|
||||||
|
double = string.find(self.buf, '@@', idx)
|
||||||
|
single = string.find(self.buf, '@', idx)
|
||||||
|
# print 'I:', idx, double, single
|
||||||
|
if double != -1 and double <= single:
|
||||||
|
chunks.append(self.buf[idx:double+1])
|
||||||
|
idx = double + 2
|
||||||
|
continue
|
||||||
|
if single == -1:
|
||||||
|
chunks.append(self.buf[idx:])
|
||||||
|
idx = len(self.buf)
|
||||||
|
continue
|
||||||
|
if single == len(self.buf) - 1:
|
||||||
|
chunks.append(self.buf[idx:single])
|
||||||
|
idx = 0
|
||||||
|
buf = self.rcsfile.read(self.CHUNK_SIZE)
|
||||||
|
if buf == '':
|
||||||
|
raise RuntimeError, 'EOF'
|
||||||
|
self.buf = '@' + buf
|
||||||
|
continue
|
||||||
|
chunks.append(self.buf[idx:single])
|
||||||
|
self.idx = single + 1
|
||||||
|
break
|
||||||
|
|
||||||
# nothing in the whole chunk. append it all and go for more.
|
# print 'S:', `self.buf[self.idx:self.idx+10]`
|
||||||
token = token + self.buf
|
return string.join(chunks, '')
|
||||||
self.buf = self.rcsfile.read(self.CHUNK_SIZE)
|
|
||||||
if self.buf == '':
|
|
||||||
raise RuntimeError, 'EOF'
|
|
||||||
|
|
||||||
# split up the chunk into "token" and "the rest"
|
|
||||||
token = token + self.buf[:match.end(1)]
|
|
||||||
self.buf = self.buf[match.end(1)+1:]
|
|
||||||
|
|
||||||
# undo the escape-encoding of @ characters
|
|
||||||
token = self.undo_escape.sub('@', token)
|
|
||||||
|
|
||||||
|
# _get = get
|
||||||
|
# def get(self):
|
||||||
|
token = self._get()
|
||||||
|
print 'T:', `token`
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def match(self, match):
|
def match(self, match):
|
||||||
|
|
Loading…
Reference in New Issue