# SYNOPSIS
-bup random [-S seed] [-f] <numbytes>
+bup random [-S seed] [-fv] <numbytes>
# DESCRIPTION
: generate output even if stdout is a tty. (Generating
random data to a tty is generally considered
ill-advised, but you can do it if you really want.)
+
+-v, --verbose
+: print a progress message showing the number of bytes that
+ have been output so far.
# EXAMPLES
bup split [-r *host*:*path*] <-b|-t|-c|-n *name*> [-v] [-q]
[--bench] [--max-pack-size=*bytes*]
- [--max-pack-objects=*n*] [--fanout=*count] [filenames...]
+ [--max-pack-objects=*n*] [--fanout=*count*]
+ [--git-ids] [--keep-boundaries] [filenames...]
# DESCRIPTION
-v, --verbose
: increase verbosity (can be used more than once).
+--git-ids
+: stdin is a list of git object ids instead of raw data.
+ `bup split` will read the contents of each named git
+ object (if it exists in the bup repository) and split
+ it. This might be useful for converting a git
+ repository with large binary files to use bup-style
+ hashsplitting instead. This option is probably most
+ useful when combined with `--keep-boundaries`.
+
+--keep-boundaries
+: if multiple filenames are given on the command line,
+ they are normally concatenated together as if the
+ content all came from a single file. That is, the
+ set of blobs/trees produced is identical to what it
+ would have been if there had been a single input file.
+ However, if you use `--keep-boundaries`, each file is
+ split separately. You still only get a single tree or
+ commit or series of blobs, but each blob comes from
+ only one of the files; the end of one of the input
+ files always ends a blob.
+
--noop
: read the data and split it into blocks based on the "bupsplit"
rolling checksum algorithm, but don't do anything with
--
S,seed= optional random number seed [1]
f,force print random data to stdout even if it's a tty
+v,verbose print byte counter to stderr
"""
o = options.Options('bup random', optspec)
(opt, flags, extra) = o.parse(sys.argv[1:])
if opt.force or (not os.isatty(1) and
not atoi(os.environ.get('BUP_FORCE_TTY')) & 1):
- _helpers.write_random(sys.stdout.fileno(), total, opt.seed)
+ _helpers.write_random(sys.stdout.fileno(), total, opt.seed,
+ opt.verbose and 1 or 0)
else:
log('error: not writing binary data to a terminal. Use -f to force.\n')
sys.exit(1)
add_error(e)
lastskip_name = ent.name
else:
- (mode, id) = hashsplit.split_to_blob_or_tree(w, [f])
+ (mode, id) = hashsplit.split_to_blob_or_tree(w, [f], False)
else:
if stat.S_ISDIR(ent.mode):
assert(0) # handled above
git.check_repo_or_die()
assert(name.find('/') < 0)
assert(name.endswith('.idx'))
- idx = git.PackIdx(git.repo('objects/pack/%s' % name))
+ idx = git.open_idx(git.repo('objects/pack/%s' % name))
conn.write(struct.pack('!I', len(idx.map)))
conn.write(idx.map)
conn.ok()
d,date= date for the commit (seconds since the epoch)
q,quiet don't print progress messages
v,verbose increase log output (can be used more than once)
+git-ids read a list of git object ids from stdin and split their contents
+keep-boundaries don't let one chunk span two input files
noop don't actually save the data anywhere
copy just copy input to output, hashsplitting along the way
bench print benchmark timings to stderr
if (opt.noop or opt.copy) and (opt.blobs or opt.tree or
opt.commit or opt.name):
o.fatal('-N and --copy are incompatible with -b, -t, -c, -n')
+if extra and opt.git_ids:
+ o.fatal("don't provide filenames when using --git-ids")
if opt.verbose >= 2:
git.verbose = opt.verbose - 1
date = time.time()
+last_prog = total_bytes = 0
+def prog(filenum, nbytes):
+ global last_prog, total_bytes
+ total_bytes += nbytes
+ now = time.time()
+ if now - last_prog < 0.2:
+ return
+ if filenum > 0:
+ progress('Splitting: file #%d, %d kbytes\r'
+ % (filenum+1, total_bytes/1024))
+ else:
+ progress('Splitting: %d kbytes\r' % (total_bytes/1024))
+ last_prog = now
+
+
is_reverse = os.environ.get('BUP_SERVER_REVERSE')
if is_reverse and opt.remote:
o.fatal("don't use -r in reverse mode; it's automatic")
oldref = refname and git.read_ref(refname) or None
pack_writer = git.PackWriter()
-files = extra and (open(fn) for fn in extra) or [sys.stdin]
+if opt.git_ids:
+ # the input is actually a series of git object ids that we should retrieve
+ # and split.
+ #
+ # This is a bit messy, but basically it converts from a series of
+ # CatPipe.get() iterators into a series of file-type objects.
+ # It would be less ugly if either CatPipe.get() returned a file-like object
+ # (not very efficient), or split_to_shalist() expected an iterator instead
+ # of a file.
+ cp = git.CatPipe()
+ # Minimal file-like adapter around an iterator: every read() call hands
+ # back the next item produced by the wrapped iterator.
+ class IterToFile:
+ def __init__(self, it):
+ self.it = iter(it)
+ # 'size' is accepted so the object can be used where a file is
+ # expected, but it is ignored: a whole item is returned each time.
+ def read(self, size):
+ # NOTE(review): returning '' once the iterator is exhausted relies on
+ # next() yielding a falsy value instead of raising StopIteration --
+ # presumably a project-level next() helper; confirm.
+ v = next(self.it)
+ return v or ''
+ def read_ids():
+ while 1:
+ line = sys.stdin.readline()
+ if not line:
+ break
+ if line:
+ line = line.strip()
+ try:
+ it = cp.get(line.strip())
+ next(it) # skip the file type
+ except KeyError, e:
+ add_error('error: %s' % e)
+ continue
+ yield IterToFile(it)
+ files = read_ids()
+else:
+ # the input either comes from a series of files or from stdin.
+ files = extra and (open(fn) for fn in extra) or [sys.stdin]
+
if pack_writer:
- shalist = hashsplit.split_to_shalist(pack_writer, files)
+ shalist = hashsplit.split_to_shalist(pack_writer, files,
+ keep_boundaries=opt.keep_boundaries,
+ progress=prog)
tree = pack_writer.new_tree(shalist)
else:
last = 0
- for (blob, bits) in hashsplit.hashsplit_iter(files):
+ for (blob, bits) in hashsplit.hashsplit_iter(files,
+ keep_boundaries=opt.keep_boundaries,
+ progress=prog):
hashsplit.total_split += len(blob)
if opt.copy:
sys.stdout.write(str(blob))
if opt.bench:
log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
% (size/1024., secs, size/1024./secs))
+
+if saved_errors:
+ log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
+ sys.exit(1)
static PyObject *write_random(PyObject *self, PyObject *args)
{
uint32_t buf[1024/4];
- int fd = -1, seed = 0;
+ int fd = -1, seed = 0, verbose = 0;
ssize_t ret;
long long len = 0, kbytes = 0, written = 0;
- if (!PyArg_ParseTuple(args, "iLi", &fd, &len, &seed))
+ if (!PyArg_ParseTuple(args, "iLii", &fd, &len, &seed, &verbose))
return NULL;
srandom(seed);
written += ret;
if (ret < (int)sizeof(buf))
break;
- if (kbytes/1024 > 0 && !(kbytes%1024))
+ if (verbose && kbytes/1024 > 0 && !(kbytes%1024))
fprintf(stderr, "Random: %lld Mbytes\r", kbytes/1024);
}
class PackIdx:
- """Object representation of a Git pack index file."""
- def __init__(self, filename):
- self.name = filename
- self.idxnames = [self.name]
- self.map = mmap_read(open(filename))
- assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
- self.fanout = list(struct.unpack('!256I',
- str(buffer(self.map, 8, 256*4))))
- self.fanout.append(0) # entry "-1"
- nsha = self.fanout[255]
- self.ofstable = buffer(self.map,
- 8 + 256*4 + nsha*20 + nsha*4,
- nsha*4)
- self.ofs64table = buffer(self.map,
- 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
+ def __init__(self):
+ assert(0)
+
+ def find_offset(self, hash):
+ """Get the offset of an object inside the index file."""
+ idx = self._idx_from_hash(hash)
+ if idx != None:
+ return self._ofs_from_idx(idx)
+ return None
- def _ofs_from_idx(self, idx):
- ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
- if ofs & 0x80000000:
- idx64 = ofs & 0x7fffffff
- ofs = struct.unpack('!I',
- str(buffer(self.ofs64table, idx64*8, 8)))[0]
- return ofs
+ def exists(self, hash):
+ """Return nonempty if the object exists in this index."""
+ return hash and (self._idx_from_hash(hash) != None) and True or None
+
+ def __len__(self):
+ return int(self.fanout[255])
def _idx_from_hash(self, hash):
global _total_searches, _total_steps
b1 = ord(hash[0])
start = self.fanout[b1-1] # range -1..254
end = self.fanout[b1] # range 0..255
- buf = buffer(self.map, 8 + 256*4, end*20)
want = str(hash)
_total_steps += 1 # lookup table is a step
while start < end:
_total_steps += 1
mid = start + (end-start)/2
- v = str(buf[mid*20:(mid+1)*20])
+ v = self._idx_to_hash(mid)
if v < want:
start = mid+1
elif v > want:
return mid
return None
- def find_offset(self, hash):
- """Get the offset of an object inside the index file."""
- idx = self._idx_from_hash(hash)
- if idx != None:
- return self._ofs_from_idx(idx)
- return None
- def exists(self, hash):
- """Return nonempty if the object exists in this index."""
- return hash and (self._idx_from_hash(hash) != None) and True or None
+class PackIdxV1(PackIdx):
+ """Object representation of a Git pack index (version 1) file."""
+ def __init__(self, filename, f):
+ self.name = filename
+ self.idxnames = [self.name]
+ self.map = mmap_read(f)
+ self.fanout = list(struct.unpack('!256I',
+ str(buffer(self.map, 0, 256*4))))
+ self.fanout.append(0) # entry "-1"
+ nsha = self.fanout[255]
+ self.shatable = buffer(self.map, 256*4, nsha*24)
+
+ def _ofs_from_idx(self, idx):
+ return struct.unpack('!I', str(self.shatable[idx*24 : idx*24+4]))[0]
+
+ def _idx_to_hash(self, idx):
+ return str(self.shatable[idx*24+4 : idx*24+24])
def __iter__(self):
for i in xrange(self.fanout[255]):
- yield buffer(self.map, 8 + 256*4 + 20*i, 20)
+ yield buffer(self.map, 256*4 + 24*i + 4, 20)
- def __len__(self):
- return int(self.fanout[255])
+
+class PackIdxV2(PackIdx):
+ """Object representation of a Git pack index (version 2) file."""
+ def __init__(self, filename, f):
+ self.name = filename
+ self.idxnames = [self.name]
+ self.map = mmap_read(f)
+ assert(str(self.map[0:8]) == '\377tOc\0\0\0\2')
+ self.fanout = list(struct.unpack('!256I',
+ str(buffer(self.map, 8, 256*4))))
+ self.fanout.append(0) # entry "-1"
+ nsha = self.fanout[255]
+ self.shatable = buffer(self.map, 8 + 256*4, nsha*20)
+ self.ofstable = buffer(self.map,
+ 8 + 256*4 + nsha*20 + nsha*4,
+ nsha*4)
+ self.ofs64table = buffer(self.map,
+ 8 + 256*4 + nsha*20 + nsha*4 + nsha*4)
+
+ def _ofs_from_idx(self, idx):
+ ofs = struct.unpack('!I', str(buffer(self.ofstable, idx*4, 4)))[0]
+ if ofs & 0x80000000:
+ idx64 = ofs & 0x7fffffff
+ ofs = struct.unpack('!I',
+ str(buffer(self.ofs64table, idx64*8, 8)))[0]
+ return ofs
+
+ def _idx_to_hash(self, idx):
+ return str(self.shatable[idx*20:(idx+1)*20])
+
+ def __iter__(self):
+ for i in xrange(self.fanout[255]):
+ yield buffer(self.map, 8 + 256*4 + 20*i, 20)
extract_bits = _helpers.extract_bits
for f in os.listdir(self.dir):
full = os.path.join(self.dir, f)
if f.endswith('.idx') and not d.get(full):
- ix = PackIdx(full)
+ ix = open_idx(full)
d[full] = ix
self.packs = list(set(d.values()))
debug1('PackIdxList: using %d index%s.\n'
def open_idx(filename):
if filename.endswith('.idx'):
- return PackIdx(filename)
+ f = open(filename, 'rb')
+ header = f.read(8)
+ if header[0:4] == '\377tOc':
+ version = struct.unpack('!I', header[4:8])[0]
+ if version == 2:
+ return PackIdxV2(filename, f)
+ else:
+ raise GitError('%s: expected idx file version 2, got %d'
+ % (filename, version))
+ else:
+ return PackIdxV1(filename, f)
elif filename.endswith('.midx'):
return PackMidx(filename)
else:
assert(not self.inprogress)
assert(id.find('\n') < 0)
assert(id.find('\r') < 0)
- assert(id[0] != '-')
+ assert(not id.startswith('-'))
self.inprogress = id
self.p.stdin.write('%s\n' % id)
hdr = self.p.stdout.readline()
if hdr.endswith(' missing\n'):
+ self.inprogress = None
raise KeyError('blob %r is missing' % id)
spl = hdr.split(' ')
if len(spl) != 3 or len(spl[0]) != 40:
return (None, 0)
-def blobiter(files):
- for f in files:
+def blobiter(files, progress=None):
+ for filenum,f in enumerate(files):
ofs = 0
+ b = ''
while 1:
+ if progress:
+ progress(filenum, len(b))
fadvise_done(f, max(0, ofs - 1024*1024))
b = f.read(BLOB_HWM)
ofs += len(b)
yield (buf.get(buf.used()), 0)
-def hashsplit_iter(files):
+def _hashsplit_iter(files, progress):
assert(BLOB_HWM > BLOB_MAX)
buf = Buf()
- fi = blobiter(files)
+ fi = blobiter(files, progress)
while 1:
for i in drainbuf(buf, finalize=False):
yield i
buf.put(bnew)
+def _hashsplit_iter_keep_boundaries(files, progress):
+ for real_filenum,f in enumerate(files):
+ if progress:
+ def prog(filenum, nbytes):
+ # the inner _hashsplit_iter doesn't know the real file count,
+ # so we'll replace it here.
+ return progress(real_filenum, nbytes)
+ else:
+ prog = None
+ for i in _hashsplit_iter([f], progress=prog):
+ yield i
+
+
+def hashsplit_iter(files, keep_boundaries, progress):
+ if keep_boundaries:
+ return _hashsplit_iter_keep_boundaries(files, progress)
+ else:
+ return _hashsplit_iter(files, progress)
+
+
total_split = 0
-def _split_to_blobs(w, files):
+def _split_to_blobs(w, files, keep_boundaries, progress):
global total_split
- for (blob, bits) in hashsplit_iter(files):
+ for (blob, bits) in hashsplit_iter(files, keep_boundaries, progress):
sha = w.new_blob(blob)
total_split += len(blob)
if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
i += 1
-def split_to_shalist(w, files):
- sl = _split_to_blobs(w, files)
+def split_to_shalist(w, files, keep_boundaries, progress=None):
+ sl = _split_to_blobs(w, files, keep_boundaries, progress)
if not fanout:
shal = []
for (sha,size,bits) in sl:
return _make_shalist(stacks[-1])[0]
-def split_to_blob_or_tree(w, files):
- shalist = list(split_to_shalist(w, files))
+def split_to_blob_or_tree(w, files, keep_boundaries):
+ shalist = list(split_to_shalist(w, files, keep_boundaries))
if len(shalist) == 1:
return (shalist[0][0], shalist[0][2])
elif len(shalist) == 0:
def fadvise_done(f, ofs):
assert(ofs >= 0)
- if ofs > 0:
+ if ofs > 0 and hasattr(f, 'fileno'):
_helpers.fadvise_done(f.fileno(), ofs)
WVPASS(os.path.exists(nameprefix + '.pack'))
WVPASS(os.path.exists(nameprefix + '.idx'))
- r = git.PackIdx(nameprefix + '.idx')
+ r = git.open_idx(nameprefix + '.idx')
print repr(r.fanout)
for i in range(nobj):
WVFAIL bup save -r :$BUP_DIR -n r-test $D/fake/path
WVSTART "split"
+echo a >a.tmp
+echo b >b.tmp
+WVPASS bup split -b a.tmp >taga.tmp
+WVPASS bup split -b b.tmp >tagb.tmp
+cat a.tmp b.tmp | WVPASS bup split -b >tagab.tmp
+WVPASSEQ "$(cat taga.tmp | wc -l)" 1
+WVPASSEQ "$(cat tagb.tmp | wc -l)" 1
+WVPASSEQ "$(cat tagab.tmp | wc -l)" 1
+WVPASSEQ "$(cat tag[ab].tmp | wc -l)" 2
+WVPASSEQ "$(bup split -b a.tmp b.tmp)" "$(cat tagab.tmp)"
+WVPASSEQ "$(bup split -b --keep-boundaries a.tmp b.tmp)" "$(cat tag[ab].tmp)"
+WVPASSEQ "$(cat tag[ab].tmp | bup split -b --keep-boundaries --git-ids)" \
+ "$(cat tag[ab].tmp)"
+WVPASSEQ "$(cat tag[ab].tmp | bup split -b --git-ids)" \
+ "$(cat tagab.tmp)"
WVPASS bup split --bench -b <t/testfile1 >tags1.tmp
WVPASS bup split -vvvv -b t/testfile2 >tags2.tmp
WVPASS bup margin