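More performance tweaks.

rcsparse: _TokenStream.get() now works on a local copy of the buffer and
stores it back on the instance only when a token is returned; the
non-whitespace token regex no longer captures trailing whitespace; and
unget() pushes a token back by temporarily overriding get() on the instance
instead of having get() check a saved-token field on every call.
timelog: use string.find instead of string.index in lines_changed(), and
profile 10 iterations instead of 5.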

git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@187 8cb11bc2-c004-0410-86c3-e597b4017df7
2001-05-14 05:53:35 +00:00 · 2001-05-14 05:53:35 +00:00 · 3bff6b7378
parent fc6d80e2fb
commit 3bff6b7378
2 changed files with 72 additions and 59 deletions
--- a/lib/rcsparse.py
+++ b/lib/rcsparse.py
@ -27,17 +27,14 @@ import time

 class _TokenStream:
  # Precompiled regular expressions
-  nonws_token = re.compile('([^;\\s]*)(\\s*)')
-#  odd_at      = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
-#  odd_at      = re.compile('((@@)*)@([^@]|$)')
-#  undo_escape = re.compile('@@')
+  nonws_token = re.compile('[^;\\s]*')

  CHUNK_SIZE  = 16384
+#  CHUNK_SIZE  = 500000
 #  CHUNK_SIZE  = 5	# for debugging, make the function grind...

  def __init__(self, file):
    self.rcsfile = file
-    self.save_token = None
    self.idx = 0
    self.buf = self.rcsfile.read(self.CHUNK_SIZE)
    if self.buf == '':
@ -46,88 +43,94 @@ class _TokenStream:
  def get(self):
    "Get the next token from the RCS file."

-    # if one was pushed back, then return it
-    if self.save_token:
-      token = self.save_token
-      self.save_token = None
-      return token
-
+    buf = self.buf
    idx = self.idx
-    while 1:
-      self.buf = string.lstrip(self.buf[idx:])
+
+    if idx == len(buf):
+      buf = self.rcsfile.read(self.CHUNK_SIZE)
+      if buf == '':
+        # signal EOF by returning None as the token
+        del self.buf	# so we fail if get() is called again
+        return None
      idx = 0
-      if self.buf:
+
+    while buf[idx] in string.whitespace:
+      buf = string.lstrip(buf[idx:])
+      idx = 0
+      if buf:
        # some non-whitespace exists, so go parse it
        break
      # the whole buffer was whitespace. go get more.
-      self.buf = self.rcsfile.read(self.CHUNK_SIZE)
-      if self.buf == '':
+      buf = self.rcsfile.read(self.CHUNK_SIZE)
+      if buf == '':
        # signal EOF by returning None as the token
+        del self.buf	# so we fail if get() is called again
        return None

-    if self.buf[idx] == ';':
+    if buf[idx] == ';':
+      self.buf = buf
      self.idx = idx + 1
      return ';'

-    if self.buf[idx] != '@':
-      match = self.nonws_token.match(self.buf, idx)
-      start, idx = match.span(1)
-      token = self.buf[start:idx]
+    if buf[idx] != '@':
+      match = self.nonws_token.match(buf, idx)
+      start, idx = match.span()
+      token = buf[start:idx]
      # got a string of non-whitespace characters. if we recognized the rest
-      # of the buffer (and we didn't see trailing white space), then we may
-      # not have the whole token.
-      while idx == len(self.buf) == match.end(2):
-        # hit the end (and trimmed it). get more data, and append the results
-        self.buf = self.rcsfile.read(self.CHUNK_SIZE)
-        match = self.nonws_token.match(self.buf)
-        start, idx = match.span(1)
+      # of the buffer, then we may not have the whole token.
+      while idx == len(buf):
+        # hit the end. get more data, and append the results.
+        buf = self.rcsfile.read(self.CHUNK_SIZE)
+        match = self.nonws_token.match(buf)
+        start, idx = match.span()
        if idx == 0:
          # the first character (';' or '\\s') terminated the token
          break
-        token = token + self.buf[start:idx]
+        token = token + buf[start:idx]

+      self.buf = buf
      self.idx = idx
+
      # done piecing together tokens; return the bugger
      return token

    # a "string" which starts with the "@" character. we'll skip it when we
-    # search for content. initialize the token for gathering content.
+    # search for content.
    idx = idx + 1
-    token = ''

    chunks = [ ]

    while 1:
-      if idx == len(self.buf):
-        idx = 0
-        self.buf = self.rcsfile.read(self.CHUNK_SIZE)
-        if self.buf == '':
-          raise RuntimeError, 'EOF'
-      double = string.find(self.buf, '@@', idx)
-      single = string.find(self.buf, '@', idx)
-#      print 'I:', idx, double, single
-      if double != -1 and double <= single:
-        chunks.append(self.buf[idx:double+1])
-        idx = double + 2
-        continue
-      if single == -1:
-        chunks.append(self.buf[idx:])
-        idx = len(self.buf)
-        continue
-      if single == len(self.buf) - 1:
-        chunks.append(self.buf[idx:single])
+      if idx == len(buf):
        idx = 0
        buf = self.rcsfile.read(self.CHUNK_SIZE)
        if buf == '':
          raise RuntimeError, 'EOF'
-        self.buf = '@' + buf
+      double = string.find(buf, '@@', idx)
+      single = string.find(buf, '@', idx)
+#      print 'I:', idx, double, single
+      if double != -1 and double <= single:
+        chunks.append(buf[idx:double+1])
+        idx = double + 2
        continue
-      chunks.append(self.buf[idx:single])
-      self.idx = single + 1
-      break
+      if single == -1:
+        chunks.append(buf[idx:])
+        idx = len(buf)
+        continue
+      if single == len(buf) - 1:
+        chunks.append(buf[idx:single])
+        idx = 0
+        buf = '@' + self.rcsfile.read(self.CHUNK_SIZE)
+        if buf == '@':
+          raise RuntimeError, 'EOF'
+        continue
+      chunks.append(buf[idx:single])

-#    print 'S:', `self.buf[self.idx:self.idx+10]`
-    return string.join(chunks, '')
+      self.buf = buf
+      self.idx = single + 1
+
+#      print 'S:', `buf[self.idx:self.idx+10]`
+      return string.join(chunks, '')

 #  _get = get
 #  def get(self):
@ -146,10 +149,20 @@ class _TokenStream:
  def unget(self, token):
    "Put this token back, for the next get() to return."

+    # Shadow the class's .get method with an instance attribute: a plain
+    # function which first removes that override from the instance (so the
+    # class method becomes visible again) and then returns the pushed token.
+    # Since this function will not be looked up via the class mechanism, it
+    # is a "normal" function, meaning it won't have "self" automatically
+    # inserted. Therefore, we need to pass both self and the token through
+    # via defaults.
+
    # note: we don't put this into the input buffer because it may have been
    # @-unescaped already.
-    self.save_token = token

+    def give_it_back(self=self, token=token):
+      del self.get
+      return token
+
+    self.get = give_it_back

 class Parser:
  rcs_tree = re.compile('^\\d')
--- a/tests/timelog.py
+++ b/tests/timelog.py
@ -11,8 +11,8 @@ def lines_changed(delta):
  added = deleted = 0
  while idx < len(delta):
    op = delta[idx]
-    i = string.index(delta, ' ', idx + 1)
-    j = string.index(delta, '\n', i + 1)
+    i = string.find(delta, ' ', idx + 1)
+    j = string.find(delta, '\n', i + 1)
    line = int(delta[idx+1:i])
    count = int(delta[i+1:j])
    idx = j + 1
@ -114,7 +114,7 @@ def time_fetch(full_name, which_rev=None):
 def profile_fetch(full_name, which_rev=None):
  p = profile.Profile()
  def many_calls(*args):
-    for i in xrange(5):
+    for i in xrange(10):
      apply(fetch_log2, args)
  p.runcall(many_calls, full_name, which_rev)
  p.print_stats()