Changeset 352

Show
Ignore:
Timestamp:
08/15/06 23:28:52 (2 years ago)
Author:
marksmith
Message:

* update newrepl, codify ENDOFTIME into constant, make the error comments more descriptive, and throw an error on the SQL case, and misc cleanup

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/server-newrepl/lib/MogileFS/Worker/Replicate.pm

    r350 r352  
    1313use POSIX ":sys_wait_h"; # argument for waitpid 
    1414use POSIX; 
     15 
     16# set up the value used in a 'nexttry' field to indicate that this item will never 
     17# actually be tried again and will require some sort of manual intervention. 
     18use constant ENDOFTIME => 2147483647; 
    1519 
    1620sub new { 
     
    7680    my $self = shift; 
    7781 
     82    # find some fids to replicate, prioritize based on when they should be tried 
    7883    my $LIMIT = 1000; 
    7984    my $to_repl_map = $dbh->selectall_hashref(qq{ 
     
    8489        LIMIT $LIMIT 
    8590    }, "fid"); 
    86     return if $dbh->err || ! $to_repl_map; 
     91    if ($dbh->err) { 
     92        error("Database error selecting fids to replicate: " . $dbh->errstr); 
     93        return; 
     94    } 
    8795 
    8896    # get random list of hashref of things to do: 
     
    99107    foreach my $todo (@$to_repl) { 
    100108        my $fid = $todo->{fid}; 
    101         warn "to repl (new): $fid\n"; 
    102109 
    103110        my $errcode; 
     
    123130 
    124131        } else { 
     132            # README: please keep this up to date if you update the replicate() function so we ensure 
     133            # that this code always does the right thing 
     134            # 
     135            # -- HARMLESS -- 
     136            # failed_getting_lock        => harmless.  skip.  somebody else probably doing. 
     137            # 
     138            # -- TEMPORARY; DO EXPONENTIAL BACKOFF -- 
     139            # source_down                => only source available is observed down. 
     140            # policy_error_doing_failed  => policy plugin fucked up.  it's looping. 
     141            # policy_error_already_there => policy plugin fucked up.  it's dumb. 
     142            # policy_no_suggestions      => no copy was attempted.  policy is just not happy. 
     143            # copy_error                 => policy said to do 1+ things, we failed, it ran out of suggestions. 
     144            # 
     145            # -- FATAL; DON'T TRY AGAIN -- 
     146            # no_source                  => it simply exists nowhere.  not that something's down, but file_on is empty. 
     147            # no_devices                 => no devices are configured.  at all.  why are we replicating something? 
     148            #                               how did something come into being since you can't delete devices? 
     149 
    125150            # bail if we failed getting the lock, that means someone else probably 
    126151            # already did it, so we should just move on 
     
    133158            my $update_nexttry = sub { 
    134159                my ($type, $delay) = @_; 
    135                 error("update_nexttry( $type, $delay ) for $fid"); 
    136160                if ($type eq 'end_of_time') { 
    137161                    # special; update to a time that won't happen again, as we've encountered a scenario 
    138162                    # in which case we're really hosed 
    139                     $dbh->do("UPDATE file_to_replicate SET nexttry = 2147483647, failcount = failcount + 1 WHERE fid = ?", 
     163                    $dbh->do("UPDATE file_to_replicate SET nexttry = " . ENDOFTIME . ", failcount = failcount + 1 WHERE fid = ?", 
    140164                             undef, $fid); 
    141165                } else { 
     
    161185            # 15s, 1m, 5m, 30m, 1h, 2h, 4h, 8h, 24h, 24h, 24h, 24h, ... 
    162186            my @backoff = qw( 15 60 300 1800 3600 7200 14400 28800 ); 
    163             $update_nexttry->( offset => $backoff[$todo->{failcount}] || 86400 ); 
     187            $update_nexttry->( offset => int(($backoff[$todo->{failcount}] || 86400) * (rand(0.4) + 0.8)) ); 
    164188            $unlock->() if $unlock; 
    165  
    166             #error("ERROR CODE for fid $fid: $errcode"); 
    167             # no_source            => it simply exists nowhere.  not that something's down, but file_on is empty. 
    168             # no_devices           => no devices are configured.  at all.  why are we replicating something?  how did something come into being since you can't delete devices? 
    169             # failed_getting_lock  => harmless.  skip.  somebody else probably doing. 
    170             # source_down          => expo backoff (only source available is observed down) 
    171             # policy_error_doing_failed  => policy plugin fucked up.  it's looping.  expo backoff? 
    172             # policy_error_already_there => policy plugin fucked up.  it's dumb.     expo backoff? 
    173             # policy_no_suggestions      => expo backoff.  no copy was attempted.  policy is just not happy. 
    174             # copy_error                 => policy said to do 1+ things, we failed, it ran out of suggestions.  expo backoff. 
    175189        } 
    176190    } 
     
    267281# 
    268282# $policy_class is optional (perl classname representing replication policy).  if present, used.  if not, looked up based on $fid. 
     283# 
     284# README: if you update this sub to return a new error code, please update the 
     285# appropriate callers to know how to deal with the errors returned. 
    269286sub replicate { 
    270287    my ($dbh, $fid, %opts) = @_; 
     
    298315        if (@_ == 2) { 
    299316            ($errcode, $errmsg) = @_; 
     317            $errmsg = "$errcode: $errmsg"; # include code with message 
    300318        } else { 
    301319            ($errmsg) = @_;