We're going to use the new name, bup, with some subcommands, git-style.
-hsplit
-hjoin
-hashsplit
-hashjoin
+bup
+bup-split
+bup-join
datagen
*.o
*.so
default: all
-all: hashsplit hashjoin datagen hashsplit.so
+all: bup-split bup-join bup datagen hashsplit.so
datagen: datagen.o
$(CC) -shared -Wl,-Bsymbolic-functions -o $@ $<
test: all
- ./hashsplit.py <testfile1 >tags1
- ./hashsplit.py <testfile2 >tags2
+ ./bup split <testfile1 >tags1
+ ./bup split <testfile2 >tags2
diff -u tags1 tags2 || true
wc -c testfile1 testfile2
wc -l tags1 tags2
- ./hashjoin <tags1 >out1
- ./hashjoin <tags2 >out2
+ ./bup join <tags1 >out1
+ ./bup join <tags2 >out2
diff -u testfile1 out1
diff -u testfile2 out2
%: %.o
gcc -o $@ $< $(LDFLAGS) $(LIBS)
-%: %.py
+bup: bup.py
rm -f $@
ln -s $^ $@
-%: %.sh
+# Publish each Python subcommand source cmd-NAME.py as a symlink named
+# bup-NAME; any previous link is removed first so ln does not fail.
+bup-%: cmd-%.py
+ rm -f $@
+ ln -s $^ $@
+
+bup-%: cmd-%.sh
rm -f $@
ln -s $^ $@
gcc -c -o $@ $^ $(CPPFLAGS) $(CFLAGS)
clean:
- rm -f *.o *.so *~ hashsplit hashjoin hsplit hjoin datagen *.pyc \
- out[12] tags[12] .*~
+ rm -f *.o *.so *~ .*~ *.pyc \
+ bup bup-split bup-join datagen \
+ out[12] tags[12]
--- /dev/null
+#!/usr/bin/env python
+import sys, os
+
+argv = sys.argv
+exe = argv[0]
+exepath = os.path.split(exe)[0] or '.'
+
+def log(s):
+ sys.stderr.write(s)
+
+def usage():
+ log('Usage: %s <subcmd> <options...>\n\n' % exe)
+ log('Available subcommands:\n')
+ for c in os.listdir(exepath):
+ if c.startswith('bup-') and c.find('.') < 0:
+ log('\t%s\n' % c[4:])
+ exit(99)
+
+
+if len(argv) < 2 or not argv[1] or argv[1][0] == '-':
+ usage()
+
+subcmd = argv[1]
+if subcmd == 'help':
+ usage()
+
+subpath = os.path.join(exepath, 'bup-%s' % subcmd)
+
+if not os.path.exists(subpath):
+ log('%s: unknown command "%s"\n' % (exe, subcmd))
+ usage()
+
+try:
+ os.execv(subpath, [subpath] + argv[2:])
+except OSError, e:
+ log('%s: %s\n' % (subpath, e))
+ exit(98)
--- /dev/null
+#!/bin/sh
+while read x junk; do
+ git cat-file -p "$x"
+done
--- /dev/null
+#!/usr/bin/env python
+import sys, os, subprocess, errno, zlib, time
+import hashsplit
+from sha import sha
+
+# FIXME: duplicated in C module. This shouldn't really be here at all...
+# BLOBSIZE works out to 1 << 13 == 8192 bytes.  do_main() uses multiples of
+# it: 2x as the refill threshold for reading more input, and 8x/4x as the
+# trigger and size for forcing a blob when no split point has been found.
+BLOBBITS = 14
+BLOBSIZE = 1 << (BLOBBITS-1)
+
+
+def log(s):
+ sys.stderr.write('%s\n' % s)
+
+
+class Buf:
+ def __init__(self):
+ self.data = ''
+ self.start = 0
+
+ def put(self, s):
+ #log('oldsize=%d+%d adding=%d' % (len(self.data), self.start, len(s)))
+ if s:
+ self.data = buffer(self.data, self.start) + s
+ self.start = 0
+
+ def peek(self, count):
+ return buffer(self.data, self.start, count)
+
+ def eat(self, count):
+ self.start += count
+
+ def get(self, count):
+ v = buffer(self.data, self.start, count)
+ self.start += count
+ return v
+
+ def used(self):
+ return len(self.data) - self.start
+
+
+def splitbuf(buf):
+ #return buf.get(BLOBSIZE)
+ b = buf.peek(buf.used())
+ ofs = hashsplit.splitbuf(b)
+ if ofs:
+ buf.eat(ofs)
+ return buffer(b, 0, ofs)
+ return None
+
+
+ocache = {}
+def save_blob(blob):
+ header = 'blob %d\0' % len(blob)
+ sum = sha(header)
+ sum.update(blob)
+ hex = sum.hexdigest()
+ dir = '.git/objects/%s' % hex[0:2]
+ fn = '%s/%s' % (dir, hex[2:])
+ if not ocache.get(hex) and not os.path.exists(fn):
+ #log('creating %s' % fn)
+ try:
+ os.mkdir(dir)
+ except OSError, e:
+ if e.errno != errno.EEXIST:
+ raise
+ tfn = '%s.%d' % (fn, os.getpid())
+ f = open(tfn, 'w')
+ z = zlib.compressobj(1)
+ f.write(z.compress(header))
+ f.write(z.compress(blob))
+ f.write(z.flush())
+ f.close()
+ os.rename(tfn, fn)
+ else:
+ #log('exists %s' % fn)
+ pass
+ ocache[hex] = 1
+ print hex
+ return hex
+
+
+def do_main():
+ start_time = time.time()
+ ofs = 0
+ buf = Buf()
+ blob = 1
+
+ eof = 0
+ lv = 0
+ while blob or not eof:
+ if not eof and (buf.used() < BLOBSIZE*2 or not blob):
+ bnew = sys.stdin.read(1024*1024)
+ if not len(bnew): eof = 1
+ #log('got %d, total %d' % (len(bnew), buf.used()))
+ buf.put(bnew)
+
+ blob = splitbuf(buf)
+ if eof and not blob:
+ blob = buf.get(buf.used())
+ if not blob and buf.used() >= BLOBSIZE*8:
+ blob = buf.get(BLOBSIZE*4) # limit max blob size
+ if not blob and not eof:
+ continue
+
+ if blob:
+ ofs += len(blob)
+ #log('SPLIT @ %-8d size=%-8d (blobsize=%d)'
+ # % (ofs, len(blob), BLOBSIZE))
+ save_blob(blob)
+
+ nv = (ofs + buf.used())/1000000
+ if nv != lv:
+ log(nv)
+ lv = nv
+ secs = time.time() - start_time
+ log('\n%.2fkbytes in %.2f secs = %.2f kbytes/sec'
+ % (ofs/1024., secs, ofs/1024./secs))
+
+
+assert(BLOBSIZE >= 32)
+do_main()
+++ /dev/null
-#!/bin/sh
-while read x junk; do
- git cat-file -p "$x"
-done
+++ /dev/null
-#!/usr/bin/env python
-import sys, os, subprocess, errno, zlib, time
-import hashsplit
-from sha import sha
-
-# FIXME: duplicated in C module. This shouldn't really be here at all...
-BLOBBITS = 14
-BLOBSIZE = 1 << (BLOBBITS-1)
-
-
-def log(s):
- sys.stderr.write('%s\n' % s)
-
-
-class Buf:
- def __init__(self):
- self.data = ''
- self.start = 0
-
- def put(self, s):
- #log('oldsize=%d+%d adding=%d' % (len(self.data), self.start, len(s)))
- if s:
- self.data = buffer(self.data, self.start) + s
- self.start = 0
-
- def peek(self, count):
- return buffer(self.data, self.start, count)
-
- def eat(self, count):
- self.start += count
-
- def get(self, count):
- v = buffer(self.data, self.start, count)
- self.start += count
- return v
-
- def used(self):
- return len(self.data) - self.start
-
-
-def splitbuf(buf):
- #return buf.get(BLOBSIZE)
- b = buf.peek(buf.used())
- ofs = hashsplit.splitbuf(b)
- if ofs:
- buf.eat(ofs)
- return buffer(b, 0, ofs)
- return None
-
-
-ocache = {}
-def save_blob(blob):
- header = 'blob %d\0' % len(blob)
- sum = sha(header)
- sum.update(blob)
- hex = sum.hexdigest()
- dir = '.git/objects/%s' % hex[0:2]
- fn = '%s/%s' % (dir, hex[2:])
- if not ocache.get(hex) and not os.path.exists(fn):
- #log('creating %s' % fn)
- try:
- os.mkdir(dir)
- except OSError, e:
- if e.errno != errno.EEXIST:
- raise
- tfn = '%s.%d' % (fn, os.getpid())
- f = open(tfn, 'w')
- z = zlib.compressobj(1)
- f.write(z.compress(header))
- f.write(z.compress(blob))
- f.write(z.flush())
- f.close()
- os.rename(tfn, fn)
- else:
- #log('exists %s' % fn)
- pass
- ocache[hex] = 1
- print hex
- return hex
-
-
-def do_main():
- start_time = time.time()
- ofs = 0
- buf = Buf()
- blob = 1
-
- eof = 0
- lv = 0
- while blob or not eof:
- if not eof and (buf.used() < BLOBSIZE*2 or not blob):
- bnew = sys.stdin.read(1024*1024)
- if not len(bnew): eof = 1
- #log('got %d, total %d' % (len(bnew), buf.used()))
- buf.put(bnew)
-
- blob = splitbuf(buf)
- if eof and not blob:
- blob = buf.get(buf.used())
- if not blob and buf.used() >= BLOBSIZE*8:
- blob = buf.get(BLOBSIZE*4) # limit max blob size
- if not blob and not eof:
- continue
-
- if blob:
- ofs += len(blob)
- #log('SPLIT @ %-8d size=%-8d (blobsize=%d)'
- # % (ofs, len(blob), BLOBSIZE))
- save_blob(blob)
-
- nv = (ofs + buf.used())/1000000
- if nv != lv:
- log(nv)
- lv = nv
- secs = time.time() - start_time
- log('\n%.2fkbytes in %.2f secs = %.2f kbytes/sec'
- % (ofs/1024., secs, ofs/1024./secs))
-
-
-assert(BLOBSIZE >= 32)
-do_main()