Changeset 350

Show
Ignore:
Timestamp:
08/15/06 00:23:08 (2 years ago)
Author:
marksmith
Message:

* add exponential backoff, which uses the failcount to determine how long
to wait before trying again. it starts at 15 seconds, ends at 1 day, and then
continues to retry daily forever. (perhaps we can change that?)

* some errors cause a "never retry" to be thrown: currently no_devices and no_source.
brad, is this right? there's nothing we can do, but do we want to keep the row
in file_to_replicate until another job comes along and analyzes/reports it?

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/server-newrepl/lib/MogileFS/ReplicationPolicy/MultipleHosts.pm

    r349 r350  
    88sub replicate_to { 
    99    my ($class, %args) = @_; 
    10 #    use Data::Dumper; 
    1110 
    1211    my $fid      = delete $args{fid};      # fid scalar to copy 
  • branches/server-newrepl/lib/MogileFS/Worker/Replicate.pm

    r349 r350  
    8484        LIMIT $LIMIT 
    8585    }, "fid"); 
     86    return if $dbh->err || ! $to_repl_map; 
    8687 
    8788    # get random list of hashref of things to do: 
    8889    my $to_repl = [ List::Util::shuffle(values %$to_repl_map) ]; 
     90    return unless @$to_repl; 
    8991 
    9092    # sort our priority list in terms of 0s (immediate, only 1 copy), 1s (immediate replicate, 
     
    121123 
    122124        } else { 
    123             error("ERROR CODE for fid $fid: $errcode"); 
     125            # bail if we failed getting the lock, that means someone else probably 
     126            # already did it, so we should just move on 
     127            if ($errcode eq 'failed_getting_lock') { 
     128                $unlock->() if $unlock; 
     129                next; 
     130            } 
     131 
     132            # logic for setting the next try time appropriately 
     133            my $update_nexttry = sub { 
     134                my ($type, $delay) = @_; 
     135                error("update_nexttry( $type, $delay ) for $fid"); 
     136                if ($type eq 'end_of_time') { 
     137                    # special; update to a time that won't happen again, as we've encountered a scenario 
     138                    # in which case we're really hosed 
     139                    $dbh->do("UPDATE file_to_replicate SET nexttry = 2147483647, failcount = failcount + 1 WHERE fid = ?", 
     140                             undef, $fid); 
     141                } else { 
     142                    my $extra = $type eq 'offset' ? 'UNIX_TIMESTAMP() +' : ''; 
     143                    $dbh->do("UPDATE file_to_replicate SET nexttry = $extra ?, failcount = failcount + 1 WHERE fid = ?", 
     144                             undef, $delay+0, $fid); 
     145                } 
     146                error("Failed setting nexttry of fid $fid to $type $delay: " . $dbh->errstr) 
     147                    if $dbh->err; 
     148            }; 
     149 
     150            # now let's handle any error we want to consider a total failure; do not 
     151            # retry at any point.  push this file off to the end so someone has to come 
     152            # along and figure out what went wrong. 
     153            if ($errcode eq 'no_source' || $errcode eq 'no_devices') { 
     154                $update_nexttry->( end_of_time => 1 ); 
     155                $unlock->() if $unlock; 
     156                next; 
     157            } 
     158 
     159            # at this point, the rest of the errors require exponential backoff.  define what this means 
     160            # as far as failcount -> delay to next try. 
     161            # 15s, 1m, 5m, 30m, 1h, 2h, 4h, 8h, 24h, 24h, 24h, 24h, ... 
     162            my @backoff = qw( 15 60 300 1800 3600 7200 14400 28800 ); 
     163            $update_nexttry->( offset => $backoff[$todo->{failcount}] || 86400 ); 
     164            $unlock->() if $unlock; 
     165 
     166            #error("ERROR CODE for fid $fid: $errcode"); 
    124167            # no_source            => it simply exists nowhere.  not that something's down, but file_on is empty. 
    125168            # no_devices           => no devices are configured.  at all.  why are we replicating something?  how did something come into being since you can't delete devices? 
     
    130173            # policy_no_suggestions      => expo backoff.  no copy was attempted.  policy is just not happy. 
    131174            # copy_error                 => policy said to do 1+ things, we failed, it ran out of suggestions.  expo backoff. 
    132             $unlock->() if $unlock; 
    133175        } 
    134176    }