Bigfile: teach "git add" to send a large file straight to a pack

author Junio C Hamano <gitster@pobox.com>

Sun, 8 May 2011 08:47:35 +0000 (01:47 -0700)

committer Junio C Hamano <gitster@pobox.com>

Fri, 13 May 2011 23:11:18 +0000 (16:11 -0700)
author Junio C Hamano <gitster@pobox.com>
Sun, 8 May 2011 08:47:35 +0000 (01:47 -0700)
committer Junio C Hamano <gitster@pobox.com>
Fri, 13 May 2011 23:11:18 +0000 (16:11 -0700)
diff --git a/sha1_file.c b/sha1_file.c

index 49416b02917ef600d927f8d9398c0a55a3eafc78..f0ca6a1749e2517bf3640f9d9f17e866c4eed47f 100644 (file)
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -11,6 +11,7 @@
  #include "pack.h"
  #include "blob.h"
  #include "commit.h"
+#include "run-command.h"
  #include "tag.h"
  #include "tree.h"
  #include "tree-walk.h"
@@ -2658,6 +2659,85 @@ static int index_core(unsigned char *sha1, int fd, size_t size,
         return ret;
  }
  
+/*
+ * Hash a large blob by streaming it to "git fast-import" instead of
+ * reading or mmaping the whole thing in core.
+ *
+ * This creates one packfile per large blob, because the caller
+ * immediately wants the result sha1, and fast-import can report the
+ * object name via marks mechanism only by closing the created
+ * packfile.
+ *
+ * This also bypasses the usual "convert-to-git" dance, and that is on
+ * purpose. We could write a streaming version of the converting
+ * functions and insert that before feeding the data to fast-import
+ * (or equivalent in-core API described above), but the primary
+ * motivation for trying to stream from the working tree file and to
+ * avoid mmaping it in core is to deal with large binary blobs, and
+ * by definition they do _not_ want to get any conversion.
+ *
+ * Exactly "size" bytes are read from "fd" and fed to fast-import as
+ * mark :1; the marks file it exports is then parsed to fill "sha1".
+ * "type", "path" and "flags" are not used here; they are accepted so
+ * that the signature matches index_core().
+ *
+ * Returns 0 on success.  Every failure is fatal (die/die_errno).
+ */
+static int index_stream(unsigned char *sha1, int fd, size_t size,
+                       enum object_type type, const char *path,
+                       unsigned flags)
+{
+       struct child_process fast_import;
+       char tmpfile[512];
+       /* Sized so that "--export-marks=" plus any tmpfile always fits. */
+       char export_marks[sizeof(tmpfile) + 32];
+       const char *argv[] = { "fast-import", "--quiet", export_marks, NULL };
+       char fast_import_cmd[512];
+       char mark[512];
+       int len, tmpfd;
+
+       /* Reserve a unique path for fast-import to export its marks to. */
+       len = snprintf(tmpfile, sizeof(tmpfile), "%s",
+                      git_path("hashstream_XXXXXX"));
+       if (len < 0 || sizeof(tmpfile) <= (size_t) len)
+               die("index-stream: tempfile path too long");
+       tmpfd = git_mkstemp_mode(tmpfile, 0600);
+       if (tmpfd < 0)
+               die_errno("cannot create tempfile: %s", tmpfile);
+       if (close(tmpfd))
+               die_errno("cannot close tempfile: %s", tmpfile);
+       snprintf(export_marks, sizeof(export_marks),
+                "--export-marks=%s", tmpfile);
+
+       memset(&fast_import, 0, sizeof(fast_import));
+       /*
+        * NOTE(review): -1 presumably asks start_command() to set up a
+        * pipe to the child's stdin; .in is written to and closed below.
+        */
+       fast_import.in = -1;
+       fast_import.argv = argv;
+       fast_import.git_cmd = 1;
+       if (start_command(&fast_import))
+               die_errno("index-stream: git fast-import failed");
+
+       /*
+        * NOTE(review): the cast truncates "size" where unsigned long
+        * is narrower than size_t.
+        */
+       len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n",
+                     (unsigned long) size);
+       if (write_in_full(fast_import.in, fast_import_cmd, len) != len)
+               die_errno("index-stream: feeding fast-import");
+       while (size) {
+               char chunk[10240];
+               size_t want = size < sizeof(chunk) ? size : sizeof(chunk);
+               /* Signed, so that a read error (< 0) is detectable. */
+               ssize_t got = read_in_full(fd, chunk, want);
+
+               if (got < 0)
+                       die_errno("index-stream: reading input");
+               /*
+                * EOF before "size" bytes were seen (e.g. the file
+                * shrank); without this check the loop never ends.
+                */
+               if (!got)
+                       die("index-stream: input ended %lu bytes short",
+                           (unsigned long) size);
+               if (write_in_full(fast_import.in, chunk, got) != got)
+                       die_errno("index-stream: feeding fast-import");
+               size -= got;
+       }
+       if (close(fast_import.in))
+               die_errno("index-stream: closing fast-import");
+       if (finish_command(&fast_import))
+               die_errno("index-stream: finishing fast-import");
+
+       tmpfd = open(tmpfile, O_RDONLY);
+       if (tmpfd < 0)
+               die_errno("index-stream: cannot open fast-import mark");
+       /* Leave room for a NUL so the mark can be printed on error. */
+       len = read_in_full(tmpfd, mark, sizeof(mark) - 1);
+       if (len < 0)
+               die_errno("index-stream: reading fast-import mark");
+       mark[len] = '\0';
+       if (close(tmpfd) < 0)
+               die_errno("index-stream: closing fast-import mark");
+       if (unlink(tmpfile))
+               die_errno("index-stream: unlinking fast-import mark");
+       /*
+        * Expect 44 bytes: ":1 ", the hex object name, and one
+        * trailing byte (presumably a newline).  errno is meaningless
+        * here, hence die() rather than die_errno().
+        */
+       if (len != 44 ||
+           memcmp(":1 ", mark, 3) ||
+           get_sha1_hex(mark + 3, sha1))
+               die("index-stream: unexpected fast-import mark: <%s>", mark);
+       return 0;
+}
+
  int index_fd(unsigned char *sha1, int fd, struct stat *st,
              enum object_type type, const char *path, unsigned flags)
  {
@@ -2666,8 +2746,10 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
  
         if (!S_ISREG(st->st_mode))
                 ret = index_pipe(sha1, fd, type, path, flags);
-       else
+       else if (size <= big_file_threshold || type != OBJ_BLOB)
                 ret = index_core(sha1, fd, size, type, path, flags);
+       else
+               ret = index_stream(sha1, fd, size, type, path, flags);
         close(fd);
         return ret;
  }
diff --git a/t/t1050-large.sh b/t/t1050-large.sh

new file mode 100755 (executable)

index 0000000..deba111
--- /dev/null
+++ b/t/t1050-large.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Copyright (c) 2011, Google Inc.
+
+test_description='adding and checking out large blobs'
+
+. ./test-lib.sh
+
+# Lower the big-file threshold to 200k, then create "large": dd seeks
+# 2000 one-kilobyte blocks into the output before writing "X" and a
+# newline, so the file is well above that threshold.
+test_expect_success setup '
+       git config core.bigfilethreshold 200k &&
+       echo X | dd of=large bs=1k seek=2000
+'
+
+# Adding the file must leave a packfile under .git/objects/pack and
+# nothing matching the loose-object path pattern.
+test_expect_success 'add a large file' '
+       git add large &&
+       # make sure we got a packfile and no loose objects
+       test -f .git/objects/pack/pack-*.pack &&
+       test ! -f .git/objects/??/??????????????????????????????????????
+'
+
+# Register the same blob in the index under a second path, check that
+# path out, and compare it byte-for-byte with the original.  Plain cmp
+# is used on purpose (presumably because test_cmp would try to show a
+# textual diff of the large files -- confirm).
+test_expect_success 'checkout a large file' '
+       large=$(git rev-parse :large) &&
+       git update-index --add --cacheinfo 100644 $large another &&
+       git checkout another &&
+       cmp large another ;# this must not be test_cmp
+'
+
+test_done
author	Junio C Hamano <gitster@pobox.com>
	Sun, 8 May 2011 08:47:35 +0000 (01:47 -0700)
committer	Junio C Hamano <gitster@pobox.com>
	Fri, 13 May 2011 23:11:18 +0000 (16:11 -0700)
sha1_file.c		patch \| blob \| history
t/t1050-large.sh	[new file with mode: 0755]	patch \| blob