Changeset 378

Show
Ignore:
Timestamp:
08/25/06 00:20:00 (2 years ago)
Author:
bradfitz
Message:

unindenting a huge else block, but also:

delete from file_to_replicate when $status == 2, i.e. when the file's
already been replicated by somebody else.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/server-newrepl/lib/MogileFS/Worker/Replicate.pm

    r362 r378  
    131131            # replicated this file), or 2 (success, but someone else replicated it). 
    132132 
    133             # so if it's 2, we just want to go to the next fid.  this file is done. 
    134             # somebody else presumably took care of the file_to_replicate maintenance 
    135             if ($status == 2) { 
    136                 $unlock->() if $unlock; 
    137                 next; 
     133            # when $status == 2, this delete is unnecessary normally 
     134            # (somebody else presumably already deleted it if they 
     135            # also replicated it), but in the case of running with old 
     136            # replicators from previous versions, -or- simply if the 
     137            # other guy's delete failed, this cleans it up.... 
     138            $dbh->do("DELETE FROM file_to_replicate WHERE fid=?", undef, $fid); 
     139            $unlock->() if $unlock; 
     140            next; 
     141        } 
     142 
     143        # ERROR CASES: 
     144 
     145        # README: please keep this up to date if you update the replicate() function so we ensure 
     146        # that this code always does the right thing 
     147        # 
     148        # -- HARMLESS -- 
     149        # failed_getting_lock        => harmless.  skip.  somebody else probably doing. 
     150        # 
     151        # -- TEMPORARY; DO EXPONENTIAL BACKOFF -- 
     152        # source_down                => only source available is observed down. 
     153        # policy_error_doing_failed  => policy plugin fucked up.  it's looping. 
     154        # policy_error_already_there => policy plugin fucked up.  it's dumb. 
     155        # policy_no_suggestions      => no copy was attempted.  policy is just not happy. 
     156        # copy_error                 => policy said to do 1+ things, we failed, it ran out of suggestions. 
     157        # 
     158        # -- FATAL; DON'T TRY AGAIN -- 
     159        # no_source                  => it simply exists nowhere.  not that something's down, but file_on is empty. 
     160        # no_devices                 => no devices are configured.  at all.  why are we replicating something? 
     161        #                               how did something come into being since you can't delete devices? 
     162 
     163        # bail if we failed getting the lock, that means someone else probably 
     164        # already did it, so we should just move on 
     165        if ($errcode eq 'failed_getting_lock') { 
     166            $unlock->() if $unlock; 
     167            next; 
     168        } 
     169 
     170        # logic for setting the next try time appropriately 
     171        my $update_nexttry = sub { 
     172            my ($type, $delay) = @_; 
     173            if ($type eq 'end_of_time') { 
     174                # special; update to a time that won't happen again, as we've encountered a scenario 
     175                # in which case we're really hosed 
     176                $dbh->do("UPDATE file_to_replicate SET nexttry = " . ENDOFTIME . ", failcount = failcount + 1 WHERE fid = ?", 
     177                         undef, $fid); 
     178            } else { 
     179                my $extra = $type eq 'offset' ? 'UNIX_TIMESTAMP() +' : ''; 
     180                $dbh->do("UPDATE file_to_replicate SET nexttry = $extra ?, failcount = failcount + 1 WHERE fid = ?", 
     181                         undef, $delay+0, $fid); 
    138182            } 
    139  
    140             # replicate was a success, so cleanup table, then unlock 
    141             $dbh->do("DELETE FROM file_to_replicate WHERE fid=?", undef, $fid); 
    142             $unlock->(); 
    143  
    144         } else { 
    145             # README: please keep this up to date if you update the replicate() function so we ensure 
    146             # that this code always does the right thing 
    147             # 
    148             # -- HARMLESS -- 
    149             # failed_getting_lock        => harmless.  skip.  somebody else probably doing. 
    150             # 
    151             # -- TEMPORARY; DO EXPONENTIAL BACKOFF -- 
    152             # source_down                => only source available is observed down. 
    153             # policy_error_doing_failed  => policy plugin fucked up.  it's looping. 
    154             # policy_error_already_there => policy plugin fucked up.  it's dumb. 
    155             # policy_no_suggestions      => no copy was attempted.  policy is just not happy. 
    156             # copy_error                 => policy said to do 1+ things, we failed, it ran out of suggestions. 
    157             # 
    158             # -- FATAL; DON'T TRY AGAIN -- 
    159             # no_source                  => it simply exists nowhere.  not that something's down, but file_on is empty. 
    160             # no_devices                 => no devices are configured.  at all.  why are we replicating something? 
    161             #                               how did something come into being since you can't delete devices? 
    162  
    163             # bail if we failed getting the lock, that means someone else probably 
    164             # already did it, so we should just move on 
    165             if ($errcode eq 'failed_getting_lock') { 
    166                 $unlock->() if $unlock; 
    167                 next; 
    168             } 
    169  
    170             # logic for setting the next try time appropriately 
    171             my $update_nexttry = sub { 
    172                 my ($type, $delay) = @_; 
    173                 if ($type eq 'end_of_time') { 
    174                     # special; update to a time that won't happen again, as we've encountered a scenario 
    175                     # in which case we're really hosed 
    176                     $dbh->do("UPDATE file_to_replicate SET nexttry = " . ENDOFTIME . ", failcount = failcount + 1 WHERE fid = ?", 
    177                              undef, $fid); 
    178                 } else { 
    179                     my $extra = $type eq 'offset' ? 'UNIX_TIMESTAMP() +' : ''; 
    180                     $dbh->do("UPDATE file_to_replicate SET nexttry = $extra ?, failcount = failcount + 1 WHERE fid = ?", 
    181                              undef, $delay+0, $fid); 
    182                 } 
    183                 error("Failed setting nexttry of fid $fid to $type $delay: " . $dbh->errstr) 
    184                     if $dbh->err; 
    185             }; 
    186  
    187             # now let's handle any error we want to consider a total failure; do not 
    188             # retry at any point.  push this file off to the end so someone has to come 
    189             # along and figure out what went wrong. 
    190             if ($errcode eq 'no_source' || $errcode eq 'no_devices') { 
    191                 $update_nexttry->( end_of_time => 1 ); 
    192                 $unlock->() if $unlock; 
    193                 next; 
    194             } 
    195  
    196             # at this point, the rest of the errors require exponential backoff.  define what this means 
    197             # as far as failcount -> delay to next try. 
    198             # 15s, 1m, 5m, 30m, 1h, 2h, 4h, 8h, 24h, 24h, 24h, 24h, ... 
    199             my @backoff = qw( 15 60 300 1800 3600 7200 14400 28800 ); 
    200             $update_nexttry->( offset => int(($backoff[$todo->{failcount}] || 86400) * (rand(0.4) + 0.8)) ); 
     183            error("Failed setting nexttry of fid $fid to $type $delay: " . $dbh->errstr) 
     184                if $dbh->err; 
     185        }; 
     186 
     187        # now let's handle any error we want to consider a total failure; do not 
     188        # retry at any point.  push this file off to the end so someone has to come 
     189        # along and figure out what went wrong. 
     190        if ($errcode eq 'no_source' || $errcode eq 'no_devices') { 
     191            $update_nexttry->( end_of_time => 1 ); 
    201192            $unlock->() if $unlock; 
    202         } 
     193            next; 
     194        } 
     195 
     196        # at this point, the rest of the errors require exponential backoff.  define what this means 
     197        # as far as failcount -> delay to next try. 
     198        # 15s, 1m, 5m, 30m, 1h, 2h, 4h, 8h, 24h, 24h, 24h, 24h, ... 
     199        my @backoff = qw( 15 60 300 1800 3600 7200 14400 28800 ); 
     200        $update_nexttry->( offset => int(($backoff[$todo->{failcount}] || 86400) * (rand(0.4) + 0.8)) ); 
     201        $unlock->() if $unlock; 
    203202    } 
    204203