Changeset 134

Show
Ignore:
Timestamp:
05/23/07 01:24:54 (3 years ago)
Author:
bradfitz
Message:

support for merging little files together into big chunks
on the backup target. aka "tail packing". requires no changes
to target drivers. this should speed backups, as less network
round-trips. will also be cheaper, once Amazon starts charging
per number of HTTP requests in June.

Location:
trunk
Files:
4 added
11 modified

Legend:

Unmodified
Added
Removed
  • trunk/Changes

    r130 r134  
     1  - support for merging little files together into big chunks 
     2    on the backup target.  aka "tail packing".  requires no changes 
     3    to target drivers.  this should speed backups, as less network 
     4    round-trips.  will also be cheaper, once Amazon starts charging 
     5    per number of HTTP requests in June. 
     6 
     7  - improved docs 
     8 
    191.01 (may 21, 2007) 
    210 
  • trunk/MANIFEST

    r130 r134  
    1313lib/Brackup/BackupStats.pm 
    1414lib/Brackup/ChunkIterator.pm 
     15lib/Brackup/CompositeChunk.pm 
    1516lib/Brackup/Config.pm 
    1617lib/Brackup/ConfigSection.pm 
     
    3334MANIFEST                        This list of files 
    3435MANIFEST.SKIP 
     36META.yml                        Module meta-data (added by MakeMaker) 
    3537t/00-use.t 
    3638t/01-backup.t 
    3739t/02-gpg.t 
     40t/03-combine-little-files.t 
     41t/data/000-dup1.txt 
     42t/data/000-dup2.txt 
    3843t/data/huge-file.txt 
    3944t/data/my-link.txt 
     
    4449t/data/test-file.txt 
    4550TODO 
    46 META.yml                                 Module meta-data (added by MakeMaker) 
  • trunk/lib/Brackup/Backup.pm

    r124 r134  
    44use Carp qw(croak); 
    55use Brackup::ChunkIterator; 
     6use Brackup::CompositeChunk; 
    67use Brackup::GPGProcManager; 
    78use Brackup::GPGProcess; 
     
    125126    }; 
    126127 
     128    my $merge_under = $root->merge_files_under; 
     129    my $comp_chunk  = undef; 
     130     
    127131    # records are either Brackup::File (for symlinks, directories, etc), or 
    128132    # PositionedChunks, in which case the file can be asked of the chunk 
     
    137141        } 
    138142 
     143        # have we already stored this chunk before?  (iterative backup) 
    139144        my $schunk; 
    140145        if ($schunk = $target->stored_chunk_from_inventory($pchunk)) { 
     
    144149        } 
    145150 
     151        # weird case... have we stored this same pchunk digest in the 
     152        # current comp_chunk we're building?  these aren't caught by 
     153        # the above inventory check, because chunks in a composite 
     154        # chunk aren't added to the inventory until after the composite 
     155        # chunk has fully grown (because it's not until it's fully grown 
     156        # that we know the handle for it, its digest) 
     157        if ($comp_chunk && ($schunk = $comp_chunk->stored_chunk_from_dup_internal_raw($pchunk))) { 
     158            $pchunk->forget_chunkref; 
     159            push @stored_chunks, $schunk; 
     160            next; 
     161        } 
     162 
    146163        $show_status->() unless $file_has_shown_status++; 
    147164        $self->debug("  * storing chunk: ", $pchunk->as_string, "\n"); 
    148165 
    149         my $handle; 
    150166        unless ($self->{dryrun}) { 
    151167            $schunk = Brackup::StoredChunk->new($pchunk); 
     
    154170            if ($gpg_rcpt) { 
    155171                $schunk->set_encrypted_chunkref($gpg_pm->enc_chunkref_of($pchunk)); 
     172            } 
     173 
     174            # see if we should pack it into a bigger blob 
     175            my $chunk_size = $schunk->backup_length; 
     176            if ($merge_under && $chunk_size < $merge_under) { 
     177                if ($comp_chunk && ! $comp_chunk->can_fit($chunk_size)) { 
     178                    $self->debug("Finalizing composite chunk $comp_chunk..."); 
     179                    $comp_chunk->finalize; 
     180                    $comp_chunk = undef; 
     181                } 
     182                $comp_chunk ||= Brackup::CompositeChunk->new($root, $target); 
     183                $comp_chunk->append_little_chunk($schunk); 
     184            } else { 
     185                # store it regularly, as its own chunk on the target 
     186                $target->store_chunk($schunk) 
     187                    or die "Chunk storage failed.\n"; 
     188                $target->add_to_inventory($pchunk => $schunk); 
    156189            } 
    157190 
     
    163196            #}; 
    164197 
    165             $target->store_chunk($schunk) 
    166                 or die "Chunk storage failed.\n"; 
    167             $target->add_to_inventory($pchunk => $schunk); 
     198 
    168199            $n_kb_up += $pchunk->length / 1024; 
    169200            push @stored_chunks, $schunk; 
     
    173204 
    174205        # DEBUG: verify it got written correctly 
    175         if ($ENV{BRACKUP_PARANOID} && $handle) { 
     206        if ($ENV{BRACKUP_PARANOID}) { 
    176207            die "FIX UP TO NEW API"; 
    177208            #my $saved_ref = $target->load_chunk($handle); 
     
    187218    } 
    188219    $end_file->(); 
     220    $comp_chunk->finalize if $comp_chunk; 
    189221 
    190222    unless ($self->{dryrun}) { 
  • trunk/lib/Brackup/InventoryDatabase.pm

    r128 r134  
    119119B<Keys> 
    120120 
    121 The key is the digest of the "raw" (pre-compression/encryption) file/chunk, and the value is 
    122 the digest of the file/chunk post-compression/encryption, along with its length. 
     121The key is the digest of the "raw" (pre-compression/encryption) 
     122file/chunk (with GPG recipient, if using encryption), and the value is 
     123the digest of the chunk stored on the target, which contains the raw 
     124chunk.  The chunk stored on the target may contain other chunks, may 
     125be compressed, encrypted, etc. 
     126 
     127 <raw_digest>               --> <stored_digest> <stored_length> 
     128 <raw_digest>;to=<gpg_rcpt> --> <stored_digest> <stored_length> 
    123129 
    124130For example: 
    125131 
    126   sha1:e23c4b5f685e046e7cc50e30e378ab11391e528e => 
     132  sha1:e23c4b5f685e046e7cc50e30e378ab11391e528e;to=6BAFF35F => 
    127133     sha1:d7257184899c9e6c4e26506f1c46f8b6562d9ee7 71223 
    128134 
    129 Means that the chunk with sha1 contents "e23c4...", after being 
    130 compressed/encrypted, is stored on the target with digest 
    131 "d72571848...", with length 71,223 bytes. 
     135Means that the chunk with sha1 contents "e23c4...", intended to be 
     136en/de-crypted for 6BAFF35F, can be got by asking the target for the 
     137chunk with digest "d72571848...", with length 71,223 bytes. 
     138 
     139When using the Brackup feature which combines small files into larger 
     140blobs, the inventory database instead stores values like: 
     141 
     142  <raw_digest>[;to=<gpg_rcpt>] --> 
     143     <stored_digest> <stored_length> <from_offset>-<to_offset> 
     144 
     145Which is the same thing, but after fetching the composite chunk using 
     146the stored digest provided, only the range provided from C<from_offset> to  
     147C<to_offset> should be used. 
    132148 
    133149=head1 SEE ALSO 
  • trunk/lib/Brackup/Restore.pm

    r124 r134  
    188188 
    189189        my $len_chunk = length $$dataref; 
    190         unless ($len_chunk == $enc_len) { 
    191             die "Backup chunk $dig isn't of expected length: got $len_chunk, expecting $enc_len\n"; 
     190 
     191        # using just a range of the file 
     192        # TODO: inefficient!  we don't want to download the chunk from the 
     193        # target multiple times.  better to cache it locally, or at least 
     194        # only fetch a region from the target (but that's still kinda inefficient 
     195        # and pushes complexity into the Target interface) 
     196        if ($enc_len =~ /^(\d+)-(\d+)$/) { 
     197            my ($from, $to) = ($1, $2); 
     198            # file range.  gotta be at least as big as bigger number 
     199            unless ($len_chunk >= $to) { 
     200                die "Backup chunk $dig isn't at least as big as range: got $len_chunk, needing $to\n"; 
     201            } 
     202            my $region = substr($$dataref, $from, $to-$from); 
     203            $dataref = \$region; 
     204        } else { 
     205            # using the whole chunk, so make sure fetched size matches 
     206            # expected size 
     207            unless ($len_chunk == $enc_len) { 
     208                die "Backup chunk $dig isn't of expected length: got $len_chunk, expecting $enc_len\n"; 
     209            } 
    192210        } 
    193211 
  • trunk/lib/Brackup/Root.pm

    r129 r134  
    2020    $self->{gpg_path}   = $conf->value('gpg_path') || "/usr/bin/gpg"; 
    2121    $self->{gpg_rcpt}   = $conf->value('gpg_recipient'); 
    22     $self->{chunk_size} = $conf->byte_value('chunk_size'), 
     22    $self->{chunk_size} = $conf->byte_value('chunk_size'); 
    2323    $self->{ignore}     = []; 
     24 
     25    $self->{merge_files_under}  = $conf->byte_value('merge_files_under'); 
     26    $self->{max_composite_size} = $conf->byte_value('max_composite_chunk_size') || 2**20; 
     27 
     28    die "'max_composite_chunk_size' must be greater than 'merge_files_under'\n" unless 
     29        $self->{max_composite_size} > $self->{merge_files_under}; 
    2430 
    2531    $self->{gpg_args}   = [];  # TODO: let user set this.  for now, not possible 
     
    3137    return $self; 
    3238} 
     39 
     40sub merge_files_under  { $_[0]{merge_files_under}  } 
     41sub max_composite_size { $_[0]{max_composite_size} } 
    3342 
    3443sub gpg_path { 
     
    279288just make the *.brackup metafiles larger. 
    280289 
     290=item B<merge_files_under> 
     291 
     292In units of bytes, kB, MB, etc.  Files under this size are merged together into larger composite chunks on the target.  By 
     293default this feature is off (value 0), purely because it's new, but 1 
     294kB is a recommended size, and will probably be the default in the 
     295future.  Set it to 0 to explicitly disable. 
     296 
     297=item B<max_composite_chunk_size> 
     298 
     299In units of bytes, kB, MB, etc.  The maximum size of a composite 
     300chunk, holding lots of little files.  If this is too big, you'll waste 
     301more space with future iterative backups updating files locked into 
     302this chunk with unchanged chunks. 
     303 
     304Recommended, and default value, is 1 MB. 
     305 
    281306=back 
  • trunk/lib/Brackup/StoredChunk.pm

    r110 r134  
    1111#   backdigest - memoized 
    1212#   _chunkref  - memoized 
     13#   compchunk  - composite chunk, if we were added to a composite chunk 
     14#   compfrom   - offset in composite chunk where we start 
     15#   compto     - offset in composite chunk where we end 
    1316 
    1417sub new { 
     
    1922} 
    2023 
     24sub pchunk { $_[0]{pchunk} } 
     25 
    2126# create the 'lite' or 'handle' version of a storedchunk.  can't get to 
    2227# the chunkref from this, but callers won't need to.  and we'll DIE if they 
    2328# try to access the chunkref. 
    24 sub new_from_inventory { 
    25     my ($class, $pchunk, $dig, $len) = @_; 
    26     return bless { 
     29sub new_from_inventory_value { 
     30    my ($class, $pchunk, $invval) = @_; 
     31 
     32    my ($dig, $len, $range) = split /\s+/, $invval; 
     33     
     34    my $sc = bless { 
    2735        pchunk     => $pchunk, 
    2836        backdigest => $dig, 
    2937        backlength => $len, 
    3038    }, $class; 
     39 
     40    # normal 
     41    return $sc unless $range; 
     42 
     43    # in case of little file in a composite chunk, 
     44    # we gotta be a range. 
     45    my ($from, $to) = $range =~ /^(\d+)-(\d+)$/ 
     46        or die "bogus range: $range"; 
     47    $sc->{compfrom} = $from; 
     48    $sc->{compto}   = $to; 
     49    return $sc; 
     50} 
     51 
     52sub clone_but_for_pchunk { 
     53    my ($self, $pchunk) = @_; 
     54    my $copy = bless {}, ref $self; 
     55    foreach my $f (qw(backlength backdigest compchunk compfrom compto)) { 
     56        $copy->{$f} = $self->{$f}; 
     57    } 
     58    $copy->{pchunk} = $pchunk; 
     59    return $copy; 
     60} 
     61 
     62sub set_composite_chunk { 
     63    my ($self, $cchunk, $from, $to) = @_; 
     64    $self->{compchunk} = $cchunk; 
     65 
     66    # forget our backup length/digest.  this handle information 
     67    # to the stored chunk should be asked of our composite 
     68    # chunk in the future, when it's done populating. 
     69    $self->{backdigest} = undef; 
     70    $self->{backlength} = undef; 
     71    $self->forget_chunkref; 
     72 
     73    $self->{compfrom}  = $from; 
     74    $self->{compto}    = $to; 
     75} 
     76 
     77sub range_in_composite { 
     78    my $self = shift; 
     79    return undef unless $self->{compfrom} || $self->{compto}; 
     80    return "$self->{compfrom}-$self->{compto}"; 
    3181} 
    3282 
     
    78128sub _populate_lengthdigest { 
    79129    my $self = shift; 
     130    if (my $cchunk = $self->{compchunk}) { 
     131        $self->{backlength} = $cchunk->backup_length; 
     132        $self->{backdigest} = $cchunk->digest; 
     133        return 1; 
     134    } 
     135 
    80136    my $dataref = $self->chunkref; 
    81137    $self->{backlength} = CORE::length($$dataref); 
     
    121177    my $self = shift; 
    122178    my @parts = ($self->{pchunk}->offset, 
    123                  $self->{pchunk}->length, 
    124                  $self->backup_length, 
    125                  $self->backup_digest, 
    126                  ); 
     179                 $self->{pchunk}->length); 
     180 
     181    if (my $range = $self->range_in_composite) { 
     182        push @parts, ( 
     183                      $range, 
     184                      $self->backup_digest, 
     185                      ); 
     186    } else { 
     187        push @parts, ( 
     188                      $self->backup_length, 
     189                      $self->backup_digest, 
     190                      ); 
     191    } 
    127192 
    128193    # if the inventory database is lost, it should be possible to 
     
    142207} 
    143208 
     209# aka "instructions to attach to a pchunk, on how to recover the pchunk from a target" 
     210sub inventory_value { 
     211    my $self = shift; 
     212 
     213    # when this chunk was stored as part of a composite chunk, the instructions 
     214    # are of form: 
     215    #    sha1:deadbeef <length> 0-50 
     216    # which means download "sha1:deadbeef", then the contents will be found from 
     217    # byte offset 0 to byte offset 50 (length of 50). 
     218    if (my $range = $self->range_in_composite) { 
     219        return join(" ", 
     220                    $self->backup_digest, 
     221                    $self->backup_length, 
     222                    $range); 
     223    } 
     224 
     225    # else, the historical format: 
     226    #   sha1:deadbeef <length> 
     227    return join(" ", $self->backup_digest, $self->backup_length); 
     228} 
     229 
    1442301; 
  • trunk/lib/Brackup/Target.pm

    r129 r134  
    4646    my $key  = $pchunk->inventory_key; 
    4747    my $db = $self->inventory_db; 
    48     $db->set($key => join(" ", $schunk->backup_digest, $schunk->backup_length)); 
     48    $db->set($key => $schunk->inventory_value); 
    4949} 
    5050 
     
    5555    my $key    = $pchunk->inventory_key; 
    5656    my $db     = $self->inventory_db; 
    57     my $diglen = $db->get($key) 
     57    my $invval = $db->get($key) 
    5858        or return undef; 
    59     my ($digest, $length) = split /\s+/, $diglen; 
    60     return Brackup::StoredChunk->new_from_inventory($pchunk, $digest, $length); 
     59    return Brackup::StoredChunk->new_from_inventory_value($pchunk, $invval); 
    6160} 
    6261 
  • trunk/lib/Brackup/Target/Amazon.pm

    r129 r134  
    9999    my $dig = $chunk->backup_digest; 
    100100    my $blen = $chunk->backup_length; 
    101     my $len = $chunk->length; 
    102101    my $chunkref = $chunk->chunkref; 
    103102 
  • trunk/lib/Brackup/Target/Filesystem.pm

    r129 r134  
    7474    my $dig = $chunk->backup_digest; 
    7575    my $blen = $chunk->backup_length; 
    76     my $len = $chunk->length; 
    7776 
    7877    my $path = $self->chunkpath($dig); 
  • trunk/lib/Brackup/Test.pm

    r126 r134  
    7575    } 
    7676    ok(-s $meta_filename, "backup file has size"); 
    77     return $meta_filename; 
     77    return wantarray ? ($meta_filename, $backup) : $meta_filename; 
    7878} 
    7979