Speed up the get() method by being smarter about regular expression use,
avoiding slicing of strings, and more fine-tuned parsing. (Some debug stuff, too; this will disappear soon; just checkpointing now.)

git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@186 8cb11bc2-c004-0410-86c3-e597b4017df7
2001-05-13 02:52:26 +00:00 · 2001-05-13 02:52:26 +00:00 · fc6d80e2fb
parent 425ae2e8ec
commit fc6d80e2fb
1 changed files with 67 additions and 42 deletions
--- a/lib/rcsparse.py
+++ b/lib/rcsparse.py
@ -27,17 +27,21 @@ import time
 class _TokenStream:
  # Precompiled regular expressions
-  find_token  = re.compile('^\\s*(;|@|.([^;\\s]*))(?P<ws>\\s*)')
+  nonws_token = re.compile('([^;\\s]*)(\\s*)')
-  rest_token  = re.compile('^([^;\\s]*)(?P<ws>\\s*)')
+#  odd_at      = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
-  odd_at      = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
+#  odd_at      = re.compile('((@@)*)@([^@]|$)')
-  undo_escape = re.compile('@@')
+#  undo_escape = re.compile('@@')
  CHUNK_SIZE  = 16384
 #  CHUNK_SIZE  = 5	# for debugging, make the function grind...
  def __init__(self, file):
    self.rcsfile = file
    self.save_token = None
-    self.buf = ''
+    self.idx = 0
    self.buf = self.rcsfile.read(self.CHUNK_SIZE)
    if self.buf == '':
      raise RuntimeError, 'EOF'
  def get(self):
    "Get the next token from the RCS file."
@ -48,66 +52,87 @@ class _TokenStream:
      self.save_token = None
      return token
    idx = self.idx
    while 1:
-      match = self.find_token.match(self.buf)
+      self.buf = string.lstrip(self.buf[idx:])
-      if match:
+      idx = 0
      if self.buf:
        # some non-whitespace exists, so go parse it
        break
-      # if we didn't find something, then it is all white space (note that
+      # the whole buffer was whitespace. go get more.
      # the pattern will match a non-white because of the "."). we can just
      # toss the whole buffer and go for more.
      self.buf = self.rcsfile.read(self.CHUNK_SIZE)
      if self.buf == '':
        # signal EOF by returning None as the token
        return None
-    # retrieve the match and trim it from the buffer
+    if self.buf[idx] == ';':
-    token = match.group(1)
+      self.idx = idx + 1
    self.buf = self.buf[match.end():]
    if token == ';':
      return ';'
-    if token != '@':
+    if self.buf[idx] != '@':
      match = self.nonws_token.match(self.buf, idx)
      start, idx = match.span(1)
      token = self.buf[start:idx]
      # got a string of non-whitespace characters. if we recognized the rest
      # of the buffer (and we didn't see trailing white space), then we may
      # not have the whole token.
-      while self.buf == '' and match.group('ws') == 0:
+      while idx == len(self.buf) == match.end(2):
        # hit the end (and trimmed it). get more data, and append the results
        self.buf = self.rcsfile.read(self.CHUNK_SIZE)
-        match = self.rest_token.match(self.buf)
+        match = self.nonws_token.match(self.buf)
-        if not match.group(1):
+        start, idx = match.span(1)
-          # first character (';' or '\\s') terminates the token
+        if idx == 0:
          # the first character (';' or '\\s') terminated the token
          break
-        token = token + match.group(1)
+        token = token + self.buf[start:idx]
        self.buf = self.buf[match.end():]
      self.idx = idx
      # done piecing together tokens; return the bugger
      return token
-    # a "string" which starts with the "@" character. the white space that
+    # a "string" which starts with the "@" character. we'll skip it when we
-    # we may have sucked up is the initial token.
+    # search for content. initialize the token for gathering content.
-    token = match.group(3)
+    idx = idx + 1
    token = ''
    chunks = [ ]
    # start scanning blocks looking for the odd @ character which closes
    # the RCS "string"
    while 1:
-      match = self.odd_at.search(self.buf)
+      if idx == len(self.buf):
-      if match:
+        idx = 0
-        break
+        self.buf = self.rcsfile.read(self.CHUNK_SIZE)
        if self.buf == '':
          raise RuntimeError, 'EOF'
      double = string.find(self.buf, '@@', idx)
      single = string.find(self.buf, '@', idx)
 #      print 'I:', idx, double, single
      if double != -1 and double <= single:
        chunks.append(self.buf[idx:double+1])
        idx = double + 2
        continue
      if single == -1:
        chunks.append(self.buf[idx:])
        idx = len(self.buf)
        continue
      if single == len(self.buf) - 1:
        chunks.append(self.buf[idx:single])
        idx = 0
        buf = self.rcsfile.read(self.CHUNK_SIZE)
        if buf == '':
          raise RuntimeError, 'EOF'
        self.buf = '@' + buf
        continue
      chunks.append(self.buf[idx:single])
      self.idx = single + 1
      break
-      # nothing in the whole chunk. append it all and go for more.
+#    print 'S:', `self.buf[self.idx:self.idx+10]`
-      token = token + self.buf
+    return string.join(chunks, '')
      self.buf = self.rcsfile.read(self.CHUNK_SIZE)
      if self.buf == '':
        raise RuntimeError, 'EOF'
    # split up the chunk into "token" and "the rest"
    token = token + self.buf[:match.end(1)]
    self.buf = self.buf[match.end(1)+1:]
    # undo the escape-encoding of @ characters
    token = self.undo_escape.sub('@', token)
 #  _get = get
 #  def get(self):
    token = self._get()
    print 'T:', `token`
    return token
  def match(self, match):