Changeset 352
- Timestamp: 08/15/06 23:28:52 (2 years ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/server-newrepl/lib/MogileFS/Worker/Replicate.pm
r350 r352 13 13 use POSIX ":sys_wait_h"; # argument for waitpid 14 14 use POSIX; 15 16 # setup the value used in a 'nexttry' field to indicate that this item will never 17 # actually be tried again and require some sort of manual intervention. 18 use constant ENDOFTIME => 2147483647; 15 19 16 20 sub new { … … 76 80 my $self = shift; 77 81 82 # find some fids to replicate, prioritize based on when they should be tried 78 83 my $LIMIT = 1000; 79 84 my $to_repl_map = $dbh->selectall_hashref(qq{ … … 84 89 LIMIT $LIMIT 85 90 }, "fid"); 86 return if $dbh->err || ! $to_repl_map; 91 if ($dbh->err) { 92 error("Database error selecting fids to replicate: " . $dbh->errstr); 93 return; 94 } 87 95 88 96 # get random list of hashref of things to do: … … 99 107 foreach my $todo (@$to_repl) { 100 108 my $fid = $todo->{fid}; 101 warn "to repl (new): $fid\n";102 109 103 110 my $errcode; … … 123 130 124 131 } else { 132 # README: please keep this up to date if you update the replicate() function so we ensure 133 # that this code always does the right thing 134 # 135 # -- HARMLESS -- 136 # failed_getting_lock => harmless. skip. somebody else probably doing. 137 # 138 # -- TEMPORARY; DO EXPONENTIAL BACKOFF -- 139 # source_down => only source available is observed down. 140 # policy_error_doing_failed => policy plugin fucked up. it's looping. 141 # policy_error_already_there => policy plugin fucked up. it's dumb. 142 # policy_no_suggestions => no copy was attempted. policy is just not happy. 143 # copy_error => policy said to do 1+ things, we failed, it ran out of suggestions. 144 # 145 # -- FATAL; DON'T TRY AGAIN -- 146 # no_source => it simply exists nowhere. not that something's down, but file_on is empty. 147 # no_devices => no devices are configured. at all. why are we replicating something? 148 # how did something come into being since you can't delete devices? 
149 125 150 # bail if we failed getting the lock, that means someone else probably 126 151 # already did it, so we should just move on … … 133 158 my $update_nexttry = sub { 134 159 my ($type, $delay) = @_; 135 error("update_nexttry( $type, $delay ) for $fid");136 160 if ($type eq 'end_of_time') { 137 161 # special; update to a time that won't happen again, as we've encountered a scenario 138 162 # in which case we're really hosed 139 $dbh->do("UPDATE file_to_replicate SET nexttry = 2147483647, failcount = failcount + 1 WHERE fid = ?",163 $dbh->do("UPDATE file_to_replicate SET nexttry = " . ENDOFTIME . ", failcount = failcount + 1 WHERE fid = ?", 140 164 undef, $fid); 141 165 } else { … … 161 185 # 15s, 1m, 5m, 30m, 1h, 2h, 4h, 8h, 24h, 24h, 24h, 24h, ... 162 186 my @backoff = qw( 15 60 300 1800 3600 7200 14400 28800 ); 163 $update_nexttry->( offset => $backoff[$todo->{failcount}] || 86400);187 $update_nexttry->( offset => int(($backoff[$todo->{failcount}] || 86400) * (rand(0.4) + 0.8)) ); 164 188 $unlock->() if $unlock; 165 166 #error("ERROR CODE for fid $fid: $errcode");167 # no_source => it simply exists nowhere. not that something's down, but file_on is empty.168 # no_devices => no devices are configured. at all. why are we replicating something? how did something come into being since you can't delete devices?169 # failed_getting_lock => harmless. skip. somebody else probably doing.170 # source_down => expo backoff (only source available is observed down)171 # policy_error_doing_failed => policy plugin fucked up. it's looping. expo backoff?172 # policy_error_already_there => policy plugin fucked up. it's dumb. expo backoff?173 # policy_no_suggestions => expo backoff. no copy was attempted. policy is just not happy.174 # copy_error => policy said to do 1+ things, we failed, it ran out of suggestions. expo backoff.175 189 } 176 190 } … … 267 281 # 268 282 # $policy_class is optional (perl classname representing replication policy). if present, used. 
if not, looked up based on $fid. 283 # 284 # README: if you update this sub to return a new error code, please update the 285 # appropriate callers to know how to deal with the errors returned. 269 286 sub replicate { 270 287 my ($dbh, $fid, %opts) = @_; … … 298 315 if (@_ == 2) { 299 316 ($errcode, $errmsg) = @_; 317 $errmsg = "$errcode: $errmsg"; # include code with message 300 318 } else { 301 319 ($errmsg) = @_;
