From 1b358fc88daaef63f8cd4ff97f59ae11f0fc287c Mon Sep 17 00:00:00 2001
From: gstein <gstein@8cb11bc2-c004-0410-86c3-e597b4017df7>
Date: Wed, 9 May 2001 11:32:47 +0000
Subject: [PATCH] switch from a line-oriented processing to buffer-oriented.
 provides approximately 3x performance increase.

git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@169 8cb11bc2-c004-0410-86c3-e597b4017df7
---
 lib/rcsparse.py | 109 ++++++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 45 deletions(-)
diff --git a/lib/rcsparse.py b/lib/rcsparse.py
index 6e209d2d..c753a60a 100644
--- a/lib/rcsparse.py
+++ b/lib/rcsparse.py
@@ -32,17 +32,17 @@ import time
 
 class _TokenStream:
   # Precompiled regular expressions
-  nonws_token = re.compile('^([^;@][^;\\s]*)\\s*')
-  semic_token = re.compile('^;\\s*')
-  rcsen_token = re.compile('^@([^@]*)')
-  undo_escape = re.compile('@@')
+  find_token  = re.compile('^\\s*(;|@|.([^;\\s]*))(?P<ws>\\s*)')
+  rest_token  = re.compile('^([^;\\s]*)(?P<ws>\\s*)')
   odd_at      = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
+  undo_escape = re.compile('@@')
+
+  CHUNK_SIZE  = 16384
 
   def __init__(self, file):
     self.rcsfile = file
-    self.line_buffer = ''
-    self.feof = 0
     self.save_token = None
+    self.buf = ''
 
   def get(self):
     "Get the next token from the RCS file."
@@ -53,51 +53,66 @@ class _TokenStream:
       self.save_token = None
       return token
 
-    # Erase all-whitespace lines
-    while len(self.line_buffer) == 0:
-      self.line_buffer = self.rcsfile.readline()
-      if self.line_buffer == '':
-        raise RuntimeError, 'EOF'
-      self.line_buffer = string.lstrip(self.line_buffer)
+    while 1:
+      match = self.find_token.match(self.buf)
+      if match:
+        break
+      # if we didn't find something, then it is all white space (note that
+      # the pattern will match a non-white because of the "."). we can just
+      # toss the whole buffer and go for more.
+      self.buf = self.rcsfile.read(self.CHUNK_SIZE)
+      if self.buf == '':
+        # signal EOF by returning None as the token
+        return None
 
-    # A string of non-whitespace characters is a token
-    match = self.nonws_token.match(self.line_buffer)
-    if match:
-      self.line_buffer = self.nonws_token.sub('', self.line_buffer)
-      return match.group(1)
+    # retrieve the match and trim it from the buffer
+    token = match.group(1)
+    self.buf = self.buf[match.end():]
 
-    # ...and so is a single semicolon
-    if self.semic_token.match(self.line_buffer):
-      self.line_buffer = self.semic_token.sub('', self.line_buffer)
+    if token == ';':
       return ';'
 
-    # ...or an RCS-encoded string that starts with an @ character
-    match = self.rcsen_token.match(self.line_buffer)
-    self.line_buffer = self.rcsen_token.sub('', self.line_buffer)
-    token = match.group(1)
+    if token != '@':
+      # got a string of non-whitespace characters. if we recognized the rest
+      # of the buffer (and we didn't see trailing white space), then we may
+      # not have the whole token.
+      while self.buf == '' and match.group('ws') == 0:
+        # hit the end (and trimmed it). get more data, and append the results
+        self.buf = self.rcsfile.read(self.CHUNK_SIZE)
+        match = self.rest_token.match(self.buf)
+        if not match.group(1):
+          # first character (';' or '\\s') terminates the token
+          break
+        token = token + match.group(1)
+        self.buf = self.buf[match.end():]
 
-    # Detect odd @ character used to close RCS-encoded string
-    while string.find(self.line_buffer, '@') < 0 or not self.odd_at.search(self.line_buffer):
-      token = token + self.line_buffer
-      self.line_buffer = self.rcsfile.readline()
-      if self.line_buffer == '':
+      # done piecing together tokens; return the bugger
+      return token
+
+    # a "string" which starts with the "@" character. the white space that
+    # we may have sucked up is the initial token.
+    token = match.group(3)
+
+    # start scanning blocks looking for the odd @ character which closes
+    # the RCS "string"
+    while 1:
+      match = self.odd_at.search(self.buf)
+      if match:
+        break
+
+      # nothing in the whole chunk. append it all and go for more.
+      token = token + self.buf
+      self.buf = self.rcsfile.read(self.CHUNK_SIZE)
+      if self.buf == '':
         raise RuntimeError, 'EOF'
 
-    # Retain the remainder of the line after the terminating @ character
-    i = self.odd_at.search(self.line_buffer).end(1)
-    token = token + self.line_buffer[:i]
-    self.line_buffer = self.line_buffer[i+1:]
+    # split up the chunk into "token" and "the rest"
+    token = token + self.buf[:match.end(1)]
+    self.buf = self.buf[match.end(1)+1:]
 
-    # Undo escape-coding of @ characters.
+    # undo the escape-encoding of @ characters
     token = self.undo_escape.sub('@', token)
 
-    # Digest any extra blank lines
-    while len(self.line_buffer) == 0 or self.line_buffer == '\n':
-      self.line_buffer = self.rcsfile.readline()
-      if self.line_buffer == '':
-        self.feof = 1
-        break
-
     return token
 
   def match(self, match):
@@ -148,7 +163,8 @@ class Parser:
         self.sink.set_comment(self.ts.get())
         self.ts.match(';')
 
-      # Ignore all these other fields - We don't care about them.         
+      # Ignore all these other fields - We don't care about them. Also chews
+      # up "newphrase".
       elif token in ("locks", "strict", "expand", "access"):
         while 1:
           tag = self.ts.get()
@@ -218,7 +234,7 @@ class Parser:
       #    group	15;
       #    permissions	644;
       #    hardlinks	@configure.in@;
-      # we just want to skip over these
+      # this is "newphrase" in RCSFILE(5). we just want to skip over these.
       while 1:
         token = self.ts.get()
         if token == 'desc' or self.rcs_tree.match(token):
@@ -236,11 +252,14 @@ class Parser:
     self.sink.set_description(self.ts.get())
 
   def parse_rcs_deltatext(self):
-    ### maybe have another way to single EOF?
-    while not self.ts.feof:
+    while 1:
       revision = self.ts.get()
+      if revision is None:
+        # EOF
+        break
       self.ts.match('log')
       log = self.ts.get()
+      ### need to add code to chew up "newphrase"
       self.ts.match('text')
       text = self.ts.get()
       self.sink.set_revision_info(revision, log, text)