arthur.barton.de Git - bup.git/commitdiff
Use CatPipe, not git show, in get_commit_dates()
author Rob Browning <rlb@defaultvalue.org>
Sun, 18 May 2014 05:51:34 +0000 (00:51 -0500)
committer Rob Browning <rlb@defaultvalue.org>
Wed, 21 May 2014 16:45:21 +0000 (11:45 -0500)
Import the commit parser from the experimental bup-get branch and use
it along with CatPipe to produce the git commit dates in
get_commit_dates().

This appears to resolve the performance problem with real archives
that was introduced by the use of "git show -s --pretty=format:%ct
..." (cf. 00ba9fb811e71bb6182b9379461bc6b493e3b7a4), and is still much
faster than bup 0.25 for at least a 1000 branch synthetic repository here.

Thanks to Gabriel Filion <gabster@lelutin.ca> for reporting the
problem, and to him and Patrick Rouleau <prouleau72@gmail.com> for
helping test the solution.

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
lib/bup/git.py
lib/bup/t/tgit.py

index 50a7bf1346d5ee91424b1c8c16aa9e1dddf78843..59bc169a2e68508f23fe995b5888aea86f447926 100644 (file)
@@ -3,6 +3,8 @@ bup repositories are in Git format. This library allows us to
 interact with the Git data structures.
 """
 import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
+from collections import namedtuple
+
 from bup.helpers import *
 from bup import _helpers, path, midx, bloom, xstat
 
@@ -24,6 +26,66 @@ class GitError(Exception):
     pass
 
 
+def parse_tz_offset(s):
+    """UTC offset in seconds."""
+    tz_off = (int(s[1:3]) * 60 * 60) + (int(s[3:5]) * 60)
+    if s[0] == '-':
+        return - tz_off
+    return tz_off
+
+
+# FIXME: derived from http://git.rsbx.net/Documents/Git_Data_Formats.txt
+# Make sure that's authoritative.
+_start_end_char = r'[^ .,:;<>"\'\0\n]'
+_content_char = r'[^\0\n<>]'
+_safe_str_rx = '(?:%s{1,2}|(?:%s%s*%s))' \
+    % (_start_end_char,
+       _start_end_char, _content_char, _start_end_char)
+_tz_rx = r'[-+]\d\d[0-5]\d'
+_parent_rx = r'(?:parent [abcdefABCDEF0123456789]{40}\n)'
+_commit_rx = re.compile(r'''tree (?P<tree>[abcdefABCDEF0123456789]{40})
+(?P<parents>%s*)author (?P<author_name>%s) <(?P<author_mail>%s)> (?P<asec>\d+) (?P<atz>%s)
+committer (?P<committer_name>%s) <(?P<committer_mail>%s)> (?P<csec>\d+) (?P<ctz>%s)
+
+(?P<message>(?:.|\n)*)''' % (_parent_rx,
+                             _safe_str_rx, _safe_str_rx, _tz_rx,
+                             _safe_str_rx, _safe_str_rx, _tz_rx))
+_parent_hash_rx = re.compile(r'\s*parent ([abcdefABCDEF0123456789]{40})\s*')
+
+
+# Note that the author_sec and committer_sec values are (UTC) epoch seconds.
+CommitInfo = namedtuple('CommitInfo', ['tree', 'parents',
+                                       'author_name', 'author_mail',
+                                       'author_sec', 'author_offset',
+                                       'committer_name', 'committer_mail',
+                                       'committer_sec', 'committer_offset',
+                                       'message'])
+
+def parse_commit(content):
+    commit_match = re.match(_commit_rx, content)
+    if not commit_match:
+        raise Exception('cannot parse commit %r' % content)
+    matches = commit_match.groupdict()
+    return CommitInfo(tree=matches['tree'],
+                      parents=re.findall(_parent_hash_rx, matches['parents']),
+                      author_name=matches['author_name'],
+                      author_mail=matches['author_mail'],
+                      author_sec=int(matches['asec']),
+                      author_offset=parse_tz_offset(matches['atz']),
+                      committer_name=matches['committer_name'],
+                      committer_mail=matches['committer_mail'],
+                      committer_sec=int(matches['csec']),
+                      committer_offset=parse_tz_offset(matches['ctz']),
+                      message=matches['message'])
+
+
+def get_commit_items(id, cp):
+    commit_it = cp.get(id)
+    assert(commit_it.next() == 'commit')
+    commit_content = ''.join(commit_it)
+    return parse_commit(commit_content)
+
+
 def repo(sub = ''):
     """Get the path to the git repository or one of its subdirectories."""
     global repodir
@@ -764,25 +826,10 @@ def get_commit_dates(refs):
        string in refs must resolve to a different commit or this
        function will fail."""
     result = []
-    cmd = ['git', 'show', '-s', '--pretty=format:%ct']
-    for chunk in batchpipe(cmd, refs, preexec_fn=_gitenv):
-        result += [int(x) for x in chunk.splitlines()]
-    if len(result) == len(refs):
-        return result
-    # git show suppressed duplicates -- fix it
-    ref_dates = {}
-    corrected_result = []
-    dates = iter(result)
     for ref in refs:
-        prev_date = ref_dates.get(ref)
-        if prev_date:
-            corrected_result.append(prev_date)
-        else:
-            date = next(dates)
-            ref_dates[ref] = date
-            corrected_result.append(date)
-    assert(next(dates, None) is None)
-    return corrected_result
+        commit = get_commit_items(ref, cp())
+        result.append(commit.author_sec)
+    return result
 
 
 def rev_parse(committish):
index 3f0c703acd6b9f04506ec1ce273d6f6856685026..3a8378b0275605744a361fb1cf34ca7bd21c617e 100644 (file)
@@ -191,3 +191,61 @@ def test_check_repo_or_die():
         os.chdir(orig_cwd)
     if wvfailure_count() == initial_failures:
         subprocess.call(['rm', '-rf', tmpdir])
+
+
+@wvtest
+def test_commit_parsing():
+    def showval(commit, val):
+        return readpipe(['git', 'show', '-s',
+                         '--pretty=format:%s' % val, commit]).strip()
+    initial_failures = wvfailure_count()
+    orig_cwd = os.getcwd()
+    tmpdir = tempfile.mkdtemp(dir=bup_tmp, prefix='bup-tgit-')
+    workdir = tmpdir + "/work"
+    repodir = workdir + '/.git'
+    try:
+        readpipe(['git', 'init', workdir])
+        os.environ['GIT_DIR'] = os.environ['BUP_DIR'] = repodir
+        git.check_repo_or_die(repodir)
+        os.chdir(workdir)
+        with open('foo', 'w') as f:
+            print >> f, 'bar'
+        readpipe(['git', 'add', '.'])
+        readpipe(['git', 'commit', '-am', 'Do something',
+                  '--author', 'Someone <someone@somewhere>',
+                  '--date', 'Sat Oct 3 19:48:49 2009 -0400'])
+        commit = readpipe(['git', 'show-ref', '-s', 'master']).strip()
+        parents = showval(commit, '%P')
+        tree = showval(commit, '%T')
+        cname = showval(commit, '%cn')
+        cmail = showval(commit, '%ce')
+        cdate = showval(commit, '%ct')
+        coffs = showval(commit, '%ci')
+        coffs = coffs[-5:]
+        coff = (int(coffs[-4:-2]) * 60 * 60) + (int(coffs[-2:]) * 60)
+        if coffs[-5] == '-':
+            coff = - coff
+        commit_items = git.get_commit_items(commit, git.cp())
+        WVPASSEQ(commit_items.parents, [])
+        WVPASSEQ(commit_items.tree, tree)
+        WVPASSEQ(commit_items.author_name, 'Someone')
+        WVPASSEQ(commit_items.author_mail, 'someone@somewhere')
+        WVPASSEQ(commit_items.author_sec, 1254613729)
+        WVPASSEQ(commit_items.author_offset, -(4 * 60 * 60))
+        WVPASSEQ(commit_items.committer_name, cname)
+        WVPASSEQ(commit_items.committer_mail, cmail)
+        WVPASSEQ(commit_items.committer_sec, int(cdate))
+        WVPASSEQ(commit_items.committer_offset, coff)
+        WVPASSEQ(commit_items.message, 'Do something\n')
+        with open('bar', 'w') as f:
+            print >> f, 'baz'
+        readpipe(['git', 'add', '.'])
+        readpipe(['git', 'commit', '-am', 'Do something else'])
+        child = readpipe(['git', 'show-ref', '-s', 'master']).strip()
+        parents = showval(child, '%P')
+        commit_items = git.get_commit_items(child, git.cp())
+        WVPASSEQ(commit_items.parents, [commit])
+    finally:
+        os.chdir(orig_cwd)
+    if wvfailure_count() == initial_failures:
+        subprocess.call(['rm', '-rf', tmpdir])