X-Git-Url: https://git.tokkee.org/?a=blobdiff_plain;f=git-deltafy-script;h=476d8796ecbb60688a91888f311ac6074eead45b;hb=93c36dcd0a4f6373e3a02a8505046801106ddb85;hp=f63cf075ec3f350036c7654606424b5be5a2abbb;hpb=587e49405be6c4053a69ee8a938660125aa1b51f;p=git.git diff --git a/git-deltafy-script b/git-deltafy-script old mode 100644 new mode 100755 index f63cf075e..476d8796e --- a/git-deltafy-script +++ b/git-deltafy-script @@ -1,40 +1,81 @@ #!/bin/bash -# Script to deltafy an entire GIT repository based on the commit list. +# Example script to deltify an entire GIT repository based on the commit list. # The most recent version of a file is the reference and previous versions # are made delta against the best earlier version available. And so on for -# successive versions going back in time. This way the delta overhead is -# pushed towards older version of any given file. -# -# NOTE: the "best earlier version" is not implemented in mkdelta yet -# and therefore only the next eariler version is used at this time. -# -# TODO: deltafy tree objects as well. +# successive versions going back in time. This way the increasing delta +# overhead is pushed towards older versions of any given file. # # The -d argument allows to provide a limit on the delta chain depth. -# If 0 is passed then everything is undeltafied. +# If 0 is passed then everything is undeltafied. Limiting the delta +# depth is meaningful for subsequent access performance to old revisions. +# A value of 16 might be a good compromise between performance and good +# space saving. Current default is unbounded. +# +# The --max-behind=30 argument is passed to git-mkdelta so as to keep +# combinations and memory usage bounded a bit. If you have lots of memory +# and CPU power you may remove it (or set to 0) to let git-mkdelta find the +# best delta match regardless of the number of revisions for a given file. +# You can also make the value smaller to make it faster and less +# memory hungry. 
A value of 5 ought to still give pretty good results. +# When set to 0 or omitted then look behind is unbounded. Note that +# git-mkdelta might die with a segmentation fault in that case if it +# runs out of memory. Note that the GIT repository will still be consistent +# even if git-mkdelta dies unexpectedly. set -e -depth= -[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2 +max_depth= +[ "$1" == "-d" ] && max_depth="--max-depth=$2" && shift 2 +overlap=30 +max_behind="--max-behind=$overlap" + +function process_list() { + if [ "$list" ]; then + echo "Processing $curr_file" + echo "$list" | xargs git-mkdelta $max_depth $max_behind -v + fi +} + +rev_list="" curr_file="" git-rev-list HEAD | -git-diff-tree -r --stdin | -awk '/^:/ { if ($5 == "M" || $5 == "N") print $4, $6 }' | -LC_ALL=C sort -s -k 2 | uniq | -while read sha1 file; do - if [ "$file" == "$curr_file" ]; then - list="$list $sha1" - else - if [ "$list" ]; then - echo "Processing $curr_file" - echo "$head $list" | xargs git-mkdelta $depth -v +while true; do + # Let's batch revisions into groups of 1000 to give it a chance to + # scale with repositories containing long revision lists. We also + # overlap with the previous batch the size of mkdelta's look behind + # value in order to account for the processing discontinuity. 
+ rev_list="$(echo -e -n "$rev_list" | tail --lines=$overlap)" + for i in $(seq 1000); do + read rev || break + rev_list="$rev_list$rev\n" + done + echo -e -n "$rev_list" | + git-diff-tree -r -t --stdin | + awk '/^:/ { if ($5 == "M") printf "%s %s\n%s %s\n", $4, $6, $3, $6 }' | + LC_ALL=C sort -s -k 2 | uniq | + while read sha1 file; do + if [ "$file" == "$curr_file" ]; then + list="$list $sha1" + else + process_list + curr_file="$file" + list="$sha1" fi - curr_file="$file" - list="" - head="$sha1" - fi + done + [ "$rev" ] || break done +process_list + +curr_file="root directory" +list="$( + git-rev-list HEAD | + while read commit; do + git-cat-file commit $commit | + sed -n 's/tree //p;Q' + done + )" +process_list +