Change name of MissingObject id to oid

[bup.git] / lib / bup / git.py
diff --git a/lib/bup/git.py b/lib/bup/git.py

index 8dd92361f964c367d9ab74c0e2523368ff31f564..12755234bc91eccfa8ecbf30f131458e8830da3e 100644 (file)
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -6,20 +6,18 @@ interact with the Git data structures.
  import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
  from collections import namedtuple
  from itertools import islice
  import errno, os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
  from collections import namedtuple
  from itertools import islice
+from numbers import Integral
  
  from bup import _helpers, hashsplit, path, midx, bloom, xstat
  from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                           fdatasync,
                           hostname, localtime, log, merge_iter,
                           mmap_read, mmap_readwrite,
  
  from bup import _helpers, hashsplit, path, midx, bloom, xstat
  from bup.helpers import (Sha1, add_error, chunkyreader, debug1, debug2,
                           fdatasync,
                           hostname, localtime, log, merge_iter,
                           mmap_read, mmap_readwrite,
+                         parse_num,
                           progress, qprogress, stat_if_exists,
                           unlink, username, userfullname,
                           utc_offset_str)
  
                           progress, qprogress, stat_if_exists,
                           unlink, username, userfullname,
                           utc_offset_str)
  
-
-max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
-max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
-
  verbose = 0
  ignore_midx = 0
  repodir = None  # The default repository, once initialized
  verbose = 0
  ignore_midx = 0
  repodir = None  # The default repository, once initialized
@@ -46,6 +44,18 @@ def _git_capture(argv):
      _git_wait(repr(argv), p)
      return r
  
      _git_wait(repr(argv), p)
      return r
  
+def git_config_get(option, repo_dir=None):
+    cmd = ('git', 'config', '--get', option)
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                         preexec_fn=_gitenv(repo_dir=repo_dir))
+    r = p.stdout.read()
+    rc = p.wait()
+    if rc == 0:
+        return r
+    if rc != 1:
+        raise GitError('%s returned %d' % (cmd, rc))
+    return None
+
  
  def parse_tz_offset(s):
      """UTC offset in seconds."""
  
  def parse_tz_offset(s):
      """UTC offset in seconds."""
@@ -102,7 +112,8 @@ def parse_commit(content):
  
  def get_commit_items(id, cp):
      commit_it = cp.get(id)
  
  def get_commit_items(id, cp):
      commit_it = cp.get(id)
-    assert(commit_it.next() == 'commit')
+    _, typ, _ = next(commit_it)
+    assert(typ == 'commit')
      commit_content = ''.join(commit_it)
      return parse_commit(commit_content)
  
      commit_content = ''.join(commit_it)
      return parse_commit(commit_content)
  
@@ -596,7 +607,9 @@ def _make_objcache():
  class PackWriter:
      """Writes Git objects inside a pack file."""
      def __init__(self, objcache_maker=_make_objcache, compression_level=1,
  class PackWriter:
      """Writes Git objects inside a pack file."""
      def __init__(self, objcache_maker=_make_objcache, compression_level=1,
-                 run_midx=True, on_pack_finish=None):
+                 run_midx=True, on_pack_finish=None,
+                 max_pack_size=None, max_pack_objects=None):
+        self.repo_dir = repo()
          self.file = None
          self.parentfd = None
          self.count = 0
          self.file = None
          self.parentfd = None
          self.count = 0
@@ -608,13 +621,25 @@ class PackWriter:
          self.compression_level = compression_level
          self.run_midx=run_midx
          self.on_pack_finish = on_pack_finish
          self.compression_level = compression_level
          self.run_midx=run_midx
          self.on_pack_finish = on_pack_finish
+        if not max_pack_size:
+            max_pack_size = git_config_get('pack.packSizeLimit',
+                                           repo_dir=self.repo_dir)
+            if max_pack_size is not None:
+                max_pack_size = parse_num(max_pack_size)
+            if not max_pack_size:
+                # larger packs slow down pruning
+                max_pack_size = 1000 * 1000 * 1000
+        self.max_pack_size = max_pack_size
+        # cache memory usage is about 83 bytes per object
+        self.max_pack_objects = max_pack_objects if max_pack_objects \
+                                else max(1, self.max_pack_size // 5000)
  
      def __del__(self):
          self.close()
  
      def _open(self):
          if not self.file:
  
      def __del__(self):
          self.close()
  
      def _open(self):
          if not self.file:
-            objdir = dir=repo('objects')
+            objdir = dir = os.path.join(self.repo_dir, 'objects')
              fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
              try:
                  self.file = os.fdopen(fd, 'w+b')
              fd, name = tempfile.mkstemp(suffix='.pack', dir=objdir)
              try:
                  self.file = os.fdopen(fd, 'w+b')
@@ -666,7 +691,8 @@ class PackWriter:
          size, crc = self._raw_write(_encode_packobj(type, content,
                                                      self.compression_level),
                                      sha=sha)
          size, crc = self._raw_write(_encode_packobj(type, content,
                                                      self.compression_level),
                                      sha=sha)
-        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
+        if self.outbytes >= self.max_pack_size \
+           or self.count >= self.max_pack_objects:
              self.breakpoint()
          return sha
  
              self.breakpoint()
          return sha
  
@@ -778,8 +804,8 @@ class PackWriter:
              f.close()
  
          obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
              f.close()
  
          obj_list_sha = self._write_pack_idx_v2(self.filename + '.idx', idx, packbin)
-
-        nameprefix = repo('objects/pack/pack-%s' % obj_list_sha)
+        nameprefix = os.path.join(self.repo_dir,
+                                  'objects/pack/pack-' +  obj_list_sha)
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
          if os.path.exists(self.filename + '.map'):
              os.unlink(self.filename + '.map')
          os.rename(self.filename + '.pack', nameprefix + '.pack')
@@ -790,7 +816,7 @@ class PackWriter:
              os.close(self.parentfd)
  
          if run_midx:
              os.close(self.parentfd)
  
          if run_midx:
-            auto_midx(repo('objects/pack'))
+            auto_midx(os.path.join(self.repo_dir, 'objects/pack'))
  
          if self.on_pack_finish:
              self.on_pack_finish(nameprefix)
  
          if self.on_pack_finish:
              self.on_pack_finish(nameprefix)
@@ -856,13 +882,13 @@ def _gitenv(repo_dir = None):
      return env
  
  
      return env
  
  
-def list_refs(refnames=None, repo_dir=None,
+def list_refs(patterns=None, repo_dir=None,
                limit_to_heads=False, limit_to_tags=False):
      """Yield (refname, hash) tuples for all repository refs unless
                limit_to_heads=False, limit_to_tags=False):
      """Yield (refname, hash) tuples for all repository refs unless
-    refnames are specified.  In that case, only include tuples for
-    those refs.  The limits restrict the result items to refs/heads or
-    refs/tags.  If both limits are specified, items from both sources
-    will be included.
+    patterns are specified.  In that case, only include tuples for
+    refs matching those patterns (cf. git-show-ref(1)).  The limits
+    restrict the result items to refs/heads or refs/tags.  If both
+    limits are specified, items from both sources will be included.
  
      """
      argv = ['git', 'show-ref']
  
      """
      argv = ['git', 'show-ref']
@@ -871,8 +897,8 @@ def list_refs(refnames=None, repo_dir=None,
      if limit_to_tags:
          argv.append('--tags')
      argv.append('--')
      if limit_to_tags:
          argv.append('--tags')
      argv.append('--')
-    if refnames:
-        argv += refnames
+    if patterns:
+        argv.extend(patterns)
      p = subprocess.Popen(argv,
                           preexec_fn = _gitenv(repo_dir),
                           stdout = subprocess.PIPE)
      p = subprocess.Popen(argv,
                           preexec_fn = _gitenv(repo_dir),
                           stdout = subprocess.PIPE)
@@ -888,7 +914,7 @@ def list_refs(refnames=None, repo_dir=None,
  
  def read_ref(refname, repo_dir = None):
      """Get the commit id of the most recent commit made on a given ref."""
  
  def read_ref(refname, repo_dir = None):
      """Get the commit id of the most recent commit made on a given ref."""
-    refs = list_refs(refnames=[refname], repo_dir=repo_dir, limit_to_heads=True)
+    refs = list_refs(patterns=[refname], repo_dir=repo_dir, limit_to_heads=True)
      l = tuple(islice(refs, 2))
      if l:
          assert(len(l) == 1)
      l = tuple(islice(refs, 2))
      if l:
          assert(len(l) == 1)
@@ -909,8 +935,10 @@ def rev_list(ref, count=None, repo_dir=None):
      """
      assert(not ref.startswith('-'))
      opts = []
      """
      assert(not ref.startswith('-'))
      opts = []
-    if count:
-        opts += ['-n', str(atoi(count))]
+    if isinstance(count, Integral):
+        opts += ['-n', str(count)]
+    else:
+        assert not count
      argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
      p = subprocess.Popen(argv,
                           preexec_fn = _gitenv(repo_dir),
      argv = ['git', 'rev-list', '--pretty=format:%at'] + opts + [ref, '--']
      p = subprocess.Popen(argv,
                           preexec_fn = _gitenv(repo_dir),
@@ -1081,7 +1109,7 @@ class _AbortableIter:
  
      def next(self):
          try:
  
      def next(self):
          try:
-            return self.it.next()
+            return next(self.it)
          except StopIteration as e:
              self.done = True
              raise
          except StopIteration as e:
              self.done = True
              raise
@@ -1100,12 +1128,6 @@ class _AbortableIter:
          self.abort()
  
  
          self.abort()
  
  
-class MissingObject(KeyError):
-    def __init__(self, id):
-        self.id = id
-        KeyError.__init__(self, 'object %r is missing' % id.encode('hex'))
-
-
  _ver_warned = 0
  class CatPipe:
      """Link to 'git cat-file' that is used to retrieve blob data."""
  _ver_warned = 0
  class CatPipe:
      """Link to 'git cat-file' that is used to retrieve blob data."""
@@ -1114,14 +1136,9 @@ class CatPipe:
          self.repo_dir = repo_dir
          wanted = ('1','5','6')
          if ver() < wanted:
          self.repo_dir = repo_dir
          wanted = ('1','5','6')
          if ver() < wanted:
-            if not _ver_warned:
-                log('warning: git version < %s; bup will be slow.\n'
-                    % '.'.join(wanted))
-                _ver_warned = 1
-            self.get = self._slow_get
-        else:
-            self.p = self.inprogress = None
-            self.get = self._fast_get
+            log('error: git version must be at least 1.5.6\n')
+            sys.exit(1)
+        self.p = self.inprogress = None
  
      def _abort(self):
          if self.p:
  
      def _abort(self):
          if self.p:
@@ -1139,35 +1156,39 @@ class CatPipe:
                                    bufsize = 4096,
                                    preexec_fn = _gitenv(self.repo_dir))
  
                                    bufsize = 4096,
                                    preexec_fn = _gitenv(self.repo_dir))
  
-    def _fast_get(self, id):
+    def get(self, ref):
+        """Yield (oidx, type, size), followed by the data referred to by ref.
+        If ref does not exist, only yield (None, None, None).
+
+        """
          if not self.p or self.p.poll() != None:
              self.restart()
          assert(self.p)
          poll_result = self.p.poll()
          assert(poll_result == None)
          if self.inprogress:
          if not self.p or self.p.poll() != None:
              self.restart()
          assert(self.p)
          poll_result = self.p.poll()
          assert(poll_result == None)
          if self.inprogress:
-            log('_fast_get: opening %r while %r is open\n'
-                % (id, self.inprogress))
+            log('get: opening %r while %r is open\n' % (ref, self.inprogress))
          assert(not self.inprogress)
          assert(not self.inprogress)
-        assert(id.find('\n') < 0)
-        assert(id.find('\r') < 0)
-        assert(not id.startswith('-'))
-        self.inprogress = id
-        self.p.stdin.write('%s\n' % id)
+        assert(ref.find('\n') < 0)
+        assert(ref.find('\r') < 0)
+        assert(not ref.startswith('-'))
+        self.inprogress = ref
+        self.p.stdin.write('%s\n' % ref)
          self.p.stdin.flush()
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
              self.inprogress = None
          self.p.stdin.flush()
          hdr = self.p.stdout.readline()
          if hdr.endswith(' missing\n'):
              self.inprogress = None
-            raise MissingObject(id.decode('hex'))
-        spl = hdr.split(' ')
-        if len(spl) != 3 or len(spl[0]) != 40:
-            raise GitError('expected blob, got %r' % spl)
-        (hex, type, size) = spl
-
-        it = _AbortableIter(chunkyreader(self.p.stdout, int(spl[2])),
-                           onabort = self._abort)
+            yield None, None, None
+            return
+        info = hdr.split(' ')
+        if len(info) != 3 or len(info[0]) != 40:
+            raise GitError('expected object (id, type, size), got %r' % info)
+        oidx, typ, size = info
+        size = int(size)
+        it = _AbortableIter(chunkyreader(self.p.stdout, size),
+                            onabort=self._abort)
          try:
          try:
-            yield type
+            yield oidx, typ, size
              for blob in it:
                  yield blob
              readline_result = self.p.stdout.readline()
              for blob in it:
                  yield blob
              readline_result = self.p.stdout.readline()
@@ -1177,38 +1198,24 @@ class CatPipe:
              it.abort()
              raise
  
              it.abort()
              raise
  
-    def _slow_get(self, id):
-        assert(id.find('\n') < 0)
-        assert(id.find('\r') < 0)
-        assert(id[0] != '-')
-        type = _git_capture(['git', 'cat-file', '-t', id]).strip()
-        yield type
-
-        p = subprocess.Popen(['git', 'cat-file', type, id],
-                             stdout=subprocess.PIPE,
-                             preexec_fn = _gitenv(self.repo_dir))
-        for blob in chunkyreader(p.stdout):
-            yield blob
-        _git_wait('git cat-file', p)
-
      def _join(self, it):
      def _join(self, it):
-        type = it.next()
-        if type == 'blob':
+        _, typ, _ = next(it)
+        if typ == 'blob':
              for blob in it:
                  yield blob
              for blob in it:
                  yield blob
-        elif type == 'tree':
+        elif typ == 'tree':
              treefile = ''.join(it)
              for (mode, name, sha) in tree_decode(treefile):
                  for blob in self.join(sha.encode('hex')):
                      yield blob
              treefile = ''.join(it)
              for (mode, name, sha) in tree_decode(treefile):
                  for blob in self.join(sha.encode('hex')):
                      yield blob
-        elif type == 'commit':
+        elif typ == 'commit':
              treeline = ''.join(it).split('\n')[0]
              assert(treeline.startswith('tree '))
              for blob in self.join(treeline[5:]):
                  yield blob
          else:
              raise GitError('invalid object type %r: expected blob/tree/commit'
              treeline = ''.join(it).split('\n')[0]
              assert(treeline.startswith('tree '))
              for blob in self.join(treeline[5:]):
                  yield blob
          else:
              raise GitError('invalid object type %r: expected blob/tree/commit'
-                           % type)
+                           % typ)
  
      def join(self, id):
          """Generate a list of the content of all blobs that can be reached
  
      def join(self, id):
          """Generate a list of the content of all blobs that can be reached
@@ -1250,6 +1257,12 @@ def tags(repo_dir = None):
      return tags
  
  
      return tags
  
  
+class MissingObject(KeyError):
+    def __init__(self, oid):
+        self.oid = oid
+        KeyError.__init__(self, 'object %r is missing' % oid.encode('hex'))
+
+
  WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
                                     'path', 'chunk_path', 'data'])
  # The path is the mangled path, and if an item represents a fragment
  WalkItem = namedtuple('WalkItem', ['id', 'type', 'mode',
                                     'path', 'chunk_path', 'data'])
  # The path is the mangled path, and if an item represents a fragment
@@ -1291,12 +1304,14 @@ def walk_object(cat_pipe, id,
              continue
  
          item_it = cat_pipe.get(id)
              continue
  
          item_it = cat_pipe.get(id)
-        type = item_it.next()
-        if type not in ('blob', 'commit', 'tree'):
-            raise Exception('unexpected repository object type %r' % type)
+        get_oidx, typ, _ = next(item_it)
+        if not get_oidx:
+            raise MissingObject(id.decode('hex'))
+        if typ not in ('blob', 'commit', 'tree'):
+            raise Exception('unexpected repository object type %r' % typ)
  
          # FIXME: set the mode based on the type when the mode is None
  
          # FIXME: set the mode based on the type when the mode is None
-        if type == 'blob' and not include_data:
+        if typ == 'blob' and not include_data:
              # Dump data until we can ask cat_pipe not to fetch it
              for ignored in item_it:
                  pass
              # Dump data until we can ask cat_pipe not to fetch it
              for ignored in item_it:
                  pass
@@ -1304,18 +1319,18 @@ def walk_object(cat_pipe, id,
          else:
              data = ''.join(item_it)
  
          else:
              data = ''.join(item_it)
  
-        yield WalkItem(id=id, type=type,
+        yield WalkItem(id=id, type=typ,
                         chunk_path=chunk_path, path=parent_path,
                         mode=mode,
                         data=(data if include_data else None))
  
                         chunk_path=chunk_path, path=parent_path,
                         mode=mode,
                         data=(data if include_data else None))
  
-        if type == 'commit':
+        if typ == 'commit':
              commit_items = parse_commit(data)
              for pid in commit_items.parents:
                  pending.append((pid, parent_path, chunk_path, mode))
              pending.append((commit_items.tree, parent_path, chunk_path,
                              hashsplit.GIT_MODE_TREE))
              commit_items = parse_commit(data)
              for pid in commit_items.parents:
                  pending.append((pid, parent_path, chunk_path, mode))
              pending.append((commit_items.tree, parent_path, chunk_path,
                              hashsplit.GIT_MODE_TREE))
-        elif type == 'tree':
+        elif typ == 'tree':
              for mode, name, ent_id in tree_decode(data):
                  demangled, bup_type = demangle_name(name, mode)
                  if chunk_path:
              for mode, name, ent_id in tree_decode(data):
                  demangled, bup_type = demangle_name(name, mode)
                  if chunk_path: