Stop interleaving stream and mmap IO operations when writing the index.
author    Rob Browning <rlb@defaultvalue.org>
          Fri, 23 Aug 2013 03:57:03 +0000 (22:57 -0500)
committer Rob Browning <rlb@defaultvalue.org>
          Sat, 7 Sep 2013 22:51:09 +0000 (17:51 -0500)
Previously bup would write the index via a simultaneous combination of
stream operations and mmap writes to non-overlapping but adjacent
regions of the same file.  This was causing index corruption in some
cases.

Instead, make an extra pass over the data in memory to precompute the
size of the final index section, which holds any offsets too large to
fit in 31 bits.
Then mmap and write the entire set of tables directly, avoiding the
need for simultaneous stream operations.
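
For reference, the size being precomputed follows the pack .idx v2
layout: an 8-byte header, a 256-entry fan-out of 4-byte counts, 28
bytes per object (20-byte SHA-1, 4-byte CRC, 4-byte offset), and one
8-byte slot per offset that spills into the large-offset table.  A
minimal Python sketch of that calculation, using a hypothetical flat
list of (sha, crc, offset) tuples rather than bup's per-bucket idx
structure:

    def mapped_idx_len(entries):
        # Sketch only, not bup's code.  Offsets of 2**31 or more can't
        # be stored in the 31 bits left in a 4-byte offset entry, so
        # each one also needs an 8-byte slot in the large-offset table.
        ofs64_count = sum(1 for _sha, _crc, ofs in entries if ofs >= 2**31)
        header = 8                      # "\377tOc" magic plus version 2
        fanout = 4 * 256                # cumulative object counts
        per_object = 28 * len(entries)  # SHA-1 + CRC + 4-byte offset
        return header + fanout + per_object + 8 * ofs64_count

The two trailing 20-byte SHA-1s (the pack checksum and the index
checksum) are not part of this mapped region; they are appended with
ordinary file writes after the mmap is flushed and closed, as the
git.py changes below show.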

Reported-by: Alan Jenkins <alan.christopher.jenkins@gmail.com>
Signed-off-by: Rob Browning <rlb@defaultvalue.org>
Tested-by: Greg Troxel <gdt@lexort.com>
Reviewed-by: Greg Troxel <gdt@lexort.com>
lib/bup/_helpers.c
lib/bup/git.py

lib/bup/_helpers.c
index a8e38ba935fa7073a9d71dc51fe6e1fb51604d2a..8381383553a5e0eb73b1874adad70d98fa25f0f0 100644
@@ -11,6 +11,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <sys/mman.h>
 
 #ifdef HAVE_SYS_TYPES_H
 #include <sys/types.h>
@@ -479,30 +480,37 @@ static uint64_t htonll(uint64_t value)
     return value; // already in network byte order MSB-LSB
 }
 
-#define PACK_IDX_V2_HEADERLEN 8
 #define FAN_ENTRIES 256
 
 static PyObject *write_idx(PyObject *self, PyObject *args)
 {
-    PyObject *pf = NULL, *idx = NULL;
+    const char *filename = NULL;
+    PyObject *idx = NULL;
     PyObject *part;
-    FILE *f;
     unsigned char *fmap = NULL;
     Py_ssize_t flen = 0;
     unsigned int total = 0;
     uint32_t count;
     int i, j, ofs64_count;
     uint32_t *fan_ptr, *crc_ptr, *ofs_ptr;
+    uint64_t *ofs64_ptr;
     struct sha *sha_ptr;
 
-    if (!PyArg_ParseTuple(args, "Ow#OI", &pf, &fmap, &flen, &idx, &total))
+    if (!PyArg_ParseTuple(args, "sw#OI", &filename, &fmap, &flen, &idx, &total))
        return NULL;
 
-    fan_ptr = (uint32_t *)&fmap[PACK_IDX_V2_HEADERLEN];
+    if (PyList_Size (idx) != FAN_ENTRIES) // Check for list of the right length.
+        return PyErr_Format (PyExc_TypeError, "idx must contain %d entries",
+                             FAN_ENTRIES);
+
+    const char idx_header[] = "\377tOc\0\0\0\002";
+    memcpy (fmap, idx_header, sizeof(idx_header) - 1);
+
+    fan_ptr = (uint32_t *)&fmap[sizeof(idx_header) - 1];
     sha_ptr = (struct sha *)&fan_ptr[FAN_ENTRIES];
     crc_ptr = (uint32_t *)&sha_ptr[total];
     ofs_ptr = (uint32_t *)&crc_ptr[total];
-    f = PyFile_AsFile(pf);
+    ofs64_ptr = (uint64_t *)&ofs_ptr[total];
 
     count = 0;
     ofs64_count = 0;
@@ -533,14 +541,17 @@ static PyObject *write_idx(PyObject *self, PyObject *args)
            *crc_ptr++ = htonl(crc);
            if (ofs > 0x7fffffff)
            {
-               const uint64_t nofs = htonll(ofs);
-               if (fwrite(&nofs, sizeof(uint64_t), 1, f) != 1)
-                   return PyErr_SetFromErrno(PyExc_OSError);
+                *ofs64_ptr++ = htonll(ofs);
                ofs = 0x80000000 | ofs64_count++;
            }
            *ofs_ptr++ = htonl((uint32_t)ofs);
        }
     }
+
+    int rc = msync(fmap, flen, MS_ASYNC);
+    if (rc != 0)
+       return PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
+
     return PyLong_FromUnsignedLong(count);
 }
 
lib/bup/git.py
index 5debc9a5c6968933d341f93a9fba59778fb53e35..a8202e71190d7e54d65a708d10f6fc551ab5142d 100644
@@ -654,40 +654,45 @@ class PackWriter:
         return self._end(run_midx=run_midx)
 
     def _write_pack_idx_v2(self, filename, idx, packbin):
+        ofs64_count = 0
+        for section in idx:
+            for entry in section:
+                if entry[2] >= 2**31:
+                    ofs64_count += 1
+
+        # Length: header + fan-out + shas-and-crcs + overflow-offsets
+        index_len = 8 + (4 * 256) + (28 * self.count) + (8 * ofs64_count)
+        idx_map = None
         idx_f = open(filename, 'w+b')
-        idx_f.write('\377tOc\0\0\0\2')
-
-        ofs64_ofs = 8 + 4*256 + 28*self.count
-        idx_f.truncate(ofs64_ofs)
-        idx_f.seek(0)
-        idx_map = mmap_readwrite(idx_f, close=False)
-        idx_f.seek(0, os.SEEK_END)
-        count = _helpers.write_idx(idx_f, idx_map, idx, self.count)
-        assert(count == self.count)
-        # Sync, since it doesn't look like POSIX guarantees that a
-        # matching FILE* (i.e. idx_f) will see the parallel changes if
-        # we don't.
-        idx_map.flush()
-        idx_map.close()
-        idx_f.write(packbin)
-
-        idx_f.seek(0)
-        idx_sum = Sha1()
-        b = idx_f.read(8 + 4*256)
-        idx_sum.update(b)
-
-        obj_list_sum = Sha1()
-        for b in chunkyreader(idx_f, 20*self.count):
-            idx_sum.update(b)
-            obj_list_sum.update(b)
-        namebase = obj_list_sum.hexdigest()
-
-        for b in chunkyreader(idx_f):
+        try:
+            idx_f.truncate(index_len)
+            idx_map = mmap_readwrite(idx_f, close=False)
+            count = _helpers.write_idx(filename, idx_map, idx, self.count)
+            assert(count == self.count)
+        finally:
+            if idx_map: idx_map.close()
+            idx_f.close()
+
+        idx_f = open(filename, 'a+b')
+        try:
+            idx_f.write(packbin)
+            idx_f.seek(0)
+            idx_sum = Sha1()
+            b = idx_f.read(8 + 4*256)
             idx_sum.update(b)
-        idx_f.write(idx_sum.digest())
-        idx_f.close()
 
-        return namebase
+            obj_list_sum = Sha1()
+            for b in chunkyreader(idx_f, 20*self.count):
+                idx_sum.update(b)
+                obj_list_sum.update(b)
+            namebase = obj_list_sum.hexdigest()
+
+            for b in chunkyreader(idx_f):
+                idx_sum.update(b)
+            idx_f.write(idx_sum.digest())
+            return namebase
+        finally:
+            idx_f.close()
 
 
 def _git_date(date):