hashsplit.py: remove PackWriter-specific knowledge.
author    Avery Pennarun <apenwarr@gmail.com>
          Thu, 17 Feb 2011 12:22:50 +0000 (04:22 -0800)
committer Avery Pennarun <apenwarr@gmail.com>
          Sun, 20 Feb 2011 05:38:28 +0000 (21:38 -0800)
Let's use callback functions explicitly instead of passing around special
objects; that makes the dependencies a bit clearer and hopefully opens
the way to some more refactoring for clarity.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
cmd/save-cmd.py
cmd/split-cmd.py
lib/bup/client.py
lib/bup/git.py
lib/bup/hashsplit.py
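
The shape of the change, sketched with hypothetical stand-ins (this is not
bup's code; FakeWriter just mimics the one method of git.PackWriter that
matters here): the split functions now receive bound methods as callbacks
instead of the writer object itself.

    import hashlib

    def split_to_blobs(makeblob, chunks):
        # Shaped like the new hashsplit.split_to_blobs: depend only on a
        # callable, never on the writer's other attributes.
        for chunk in chunks:
            yield (makeblob(chunk), len(chunk))

    class FakeWriter:
        def new_blob(self, content):
            # Stand-in for PackWriter.new_blob: hash, store nothing.
            return hashlib.sha1(content).hexdigest()

    w = FakeWriter()
    # A bound method is the callback, just as cmd/split-cmd.py now passes
    # pack_writer.new_blob.
    for (sha, size) in split_to_blobs(w.new_blob, [b'hello ', b'world']):
        print('%s %d' % (sha, size))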

cmd/save-cmd.py
index 8c2ba40abe61382ea5a46ebf99f9fee007432842..7219ab4b55441293136d74f58bfa99193aef1702 100755 (executable)
@@ -263,7 +263,8 @@ for (transname,ent) in r.filter(extra, wantrecurse=wantrecurse_during):
                 lastskip_name = ent.name
             else:
                 try:
-                    (mode, id) = hashsplit.split_to_blob_or_tree(w, [f],
+                    (mode, id) = hashsplit.split_to_blob_or_tree(
+                                            w.new_blob, w.new_tree, [f],
                                             keep_boundaries=False)
                 except IOError, e:
                     add_error('%s: %s' % (ent.name, e))
cmd/split-cmd.py
index 363896f792381f3b85d34cc3ab98339af0458d2d..b243c61400e918fdae1b19a907177c38a61ea2a1 100755 (executable)
@@ -47,9 +47,9 @@ if opt.verbose >= 2:
     git.verbose = opt.verbose - 1
     opt.bench = 1
 if opt.max_pack_size:
-    hashsplit.max_pack_size = parse_num(opt.max_pack_size)
+    git.max_pack_size = parse_num(opt.max_pack_size)
 if opt.max_pack_objects:
-    hashsplit.max_pack_objects = parse_num(opt.max_pack_objects)
+    git.max_pack_objects = parse_num(opt.max_pack_objects)
 if opt.fanout:
     hashsplit.fanout = parse_num(opt.fanout)
 if opt.blobs:
@@ -128,14 +128,16 @@ else:
     files = extra and (open(fn) for fn in extra) or [sys.stdin]
 
 if pack_writer and opt.blobs:
-    shalist = hashsplit.split_to_blobs(pack_writer, files,
+    shalist = hashsplit.split_to_blobs(pack_writer.new_blob, files,
                                        keep_boundaries=opt.keep_boundaries,
                                        progress=prog)
     for (sha, size, level) in shalist:
         print sha.encode('hex')
         reprogress()
 elif pack_writer:  # tree or commit or name
-    shalist = hashsplit.split_to_shalist(pack_writer, files,
+    shalist = hashsplit.split_to_shalist(pack_writer.new_blob,
+                                         pack_writer.new_tree,
+                                         files,
                                          keep_boundaries=opt.keep_boundaries,
                                          progress=prog)
     tree = pack_writer.new_tree(shalist)
lib/bup/client.py
index f941067237b1f574d93bb2b40668c5f5eb20dfca..5563905569e5d489b2ef46ae166bf03a5de0824e 100644 (file)
@@ -177,7 +177,6 @@ class Client:
             self.sync_index(idx)
         git.auto_midx(self.cachedir)
 
-
     def sync_index(self, name):
         #debug1('requesting %r\n' % name)
         self.check_busy()
lib/bup/git.py
index 6db392a2226fe5de78df7d4c15c44982a18bfdd2..fd364b542531e3499839b8ac156f43e7ddfb3fb1 100644 (file)
@@ -6,6 +6,8 @@ import os, sys, zlib, time, subprocess, struct, stat, re, tempfile, glob
 from bup.helpers import *
 from bup import _helpers, path, midx, bloom
 
+max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
+max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
 SEEK_END=2  # os.SEEK_END is not defined in python 2.4
 
 verbose = 0
@@ -509,6 +511,8 @@ class PackWriter:
         if not sha:
             sha = calc_hash(type, content)
         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
+        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
+            self.breakpoint()
         return sha
 
     def breakpoint(self):
@@ -531,10 +535,10 @@ class PackWriter:
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
-        self._require_objcache()
         sha = calc_hash(type, content)
         if not self.exists(sha):
             self._write(sha, type, content)
+            self._require_objcache()
             self.objcache.add(sha)
         return sha
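
With the limits now owned by git.py, PackWriter polices itself: any code
path that writes an object can trigger a pack rotation, not just
hashsplit's loop. A standalone sketch of that behavior (an illustrative
class, not bup's actual PackWriter):

    max_pack_size = 1000*1000*1000   # same defaults the diff moves here
    max_pack_objects = 200*1000

    class Writer:
        def __init__(self):
            self.outbytes = self.count = self.packs_finished = 0

        def breakpoint(self):
            # Finish the current pack and start a fresh one (stubbed out).
            self.packs_finished += 1
            self.outbytes = self.count = 0

        def _write(self, content):
            self.outbytes += len(content)
            self.count += 1
            # The check the diff adds: rotate as soon as either limit trips.
            if (self.outbytes >= max_pack_size
                or self.count >= max_pack_objects):
                self.breakpoint()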
 
lib/bup/hashsplit.py
index 439c63db6c7374da6c868b3c1e123c05170b648c..1819294d1aa7d2029203316d78f9c300bc75b7c8 100644 (file)
@@ -6,8 +6,6 @@ BLOB_MAX = 8192*4   # 8192 is the "typical" blob size for bupsplit
 BLOB_READ_SIZE = 1024*1024
 MAX_PER_TREE = 256
 progress_callback = None
-max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
-max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
 fanout = 16
 
 # The purpose of this type of buffer is to avoid copying on peek(), get(),
@@ -105,13 +103,11 @@ def hashsplit_iter(files, keep_boundaries, progress):
 
 
 total_split = 0
-def split_to_blobs(w, files, keep_boundaries, progress):
+def split_to_blobs(makeblob, files, keep_boundaries, progress):
     global total_split
     for (blob, level) in hashsplit_iter(files, keep_boundaries, progress):
-        sha = w.new_blob(blob)
+        sha = makeblob(blob)
         total_split += len(blob)
-        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
-            w.breakpoint()
         if progress_callback:
             progress_callback(len(blob))
         yield (sha, len(blob), level)
@@ -127,7 +123,7 @@ def _make_shalist(l):
     return (shalist, total)
 
 
-def _squish(w, stacks, n):
+def _squish(maketree, stacks, n):
     i = 0
     while i<n or len(stacks[i]) > MAX_PER_TREE:
         while len(stacks) <= i+1:
@@ -136,14 +132,15 @@ def _squish(w, stacks, n):
             stacks[i+1] += stacks[i]
         elif stacks[i]:
             (shalist, size) = _make_shalist(stacks[i])
-            tree = w.new_tree(shalist)
+            tree = maketree(shalist)
             stacks[i+1].append(('40000', tree, size))
         stacks[i] = []
         i += 1
 
 
-def split_to_shalist(w, files, keep_boundaries, progress=None):
-    sl = split_to_blobs(w, files, keep_boundaries, progress)
+def split_to_shalist(makeblob, maketree, files,
+                     keep_boundaries, progress=None):
+    sl = split_to_blobs(makeblob, files, keep_boundaries, progress)
     assert(fanout != 0)
     if not fanout:
         shal = []
@@ -155,21 +152,22 @@ def split_to_shalist(w, files, keep_boundaries, progress=None):
         for (sha,size,level) in sl:
             stacks[0].append(('100644', sha, size))
             if level:
-                _squish(w, stacks, level)
+                _squish(maketree, stacks, level)
         #log('stacks: %r\n' % [len(i) for i in stacks])
-        _squish(w, stacks, len(stacks)-1)
+        _squish(maketree, stacks, len(stacks)-1)
         #log('stacks: %r\n' % [len(i) for i in stacks])
         return _make_shalist(stacks[-1])[0]
 
 
-def split_to_blob_or_tree(w, files, keep_boundaries):
-    shalist = list(split_to_shalist(w, files, keep_boundaries))
+def split_to_blob_or_tree(makeblob, maketree, files, keep_boundaries):
+    shalist = list(split_to_shalist(makeblob, maketree,
+                                    files, keep_boundaries))
     if len(shalist) == 1:
         return (shalist[0][0], shalist[0][2])
     elif len(shalist) == 0:
-        return ('100644', w.new_blob(''))
+        return ('100644', makeblob(''))
     else:
-        return ('40000', w.new_tree(shalist))
+        return ('40000', maketree(shalist))
 
 
 def open_noatime(name):
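
One payoff of the decoupling: the split functions can be exercised with no
repository and no PackWriter at all. A hypothetical smoke test along these
lines (Python 2, matching the codebase, and assuming a built bup on the
module path; the two callbacks are stand-ins, not real object writers):

    import hashlib
    from cStringIO import StringIO
    from bup import hashsplit

    def makeblob(content):
        # content may arrive as a buffer; normalize with str() first.
        return hashlib.sha1('blob ' + str(content)).digest()

    def maketree(shalist):
        return hashlib.sha1(repr(shalist)).digest()

    f = StringIO('x' * 100000)
    (mode, sha) = hashsplit.split_to_blob_or_tree(makeblob, maketree, [f],
                                                  keep_boundaries=False)
    print mode, sha.encode('hex')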