| 133 | | # so if it's 2, we just want to go to the next fid. this file is done. |
|---|
| 134 | | # somebody else presumably took care of the file_to_replicate maintenance |
|---|
| 135 | | if ($status == 2) { |
|---|
| 136 | | $unlock->() if $unlock; |
|---|
| 137 | | next; |
|---|
| | 133 | # when $status == 2, this delete is unnecessary normally |
|---|
| | 134 | # (somebody else presumably already deleted it if they |
|---|
| | 135 | # also replicated it), but in the case of running with old |
|---|
| | 136 | # replicators from previous versions, -or- simply if the |
|---|
| | 137 | # other guy's delete failed, this cleans it up.... |
|---|
| | 138 | $dbh->do("DELETE FROM file_to_replicate WHERE fid=?", undef, $fid); |
|---|
| | 139 | $unlock->() if $unlock; |
|---|
| | 140 | next; |
|---|
| | 141 | } |
|---|
| | 142 | |
|---|
| | 143 | # ERROR CASES: |
|---|
| | 144 | |
|---|
| | 145 | # README: please keep this up to date if you update the replicate() function so we ensure |
|---|
| | 146 | # that this code always does the right thing |
|---|
| | 147 | # |
|---|
| | 148 | # -- HARMLESS -- |
|---|
| | 149 | # failed_getting_lock => harmless. skip. somebody else probably doing. |
|---|
| | 150 | # |
|---|
| | 151 | # -- TEMPORARY; DO EXPONENTIAL BACKOFF -- |
|---|
| | 152 | # source_down => only source available is observed down. |
|---|
| | 153 | # policy_error_doing_failed => policy plugin fucked up. it's looping. |
|---|
| | 154 | # policy_error_already_there => policy plugin fucked up. it's dumb. |
|---|
| | 155 | # policy_no_suggestions => no copy was attempted. policy is just not happy. |
|---|
| | 156 | # copy_error => policy said to do 1+ things, we failed, it ran out of suggestions. |
|---|
| | 157 | # |
|---|
| | 158 | # -- FATAL; DON'T TRY AGAIN -- |
|---|
| | 159 | # no_source => it simply exists nowhere. not that something's down, but file_on is empty. |
|---|
| | 160 | # no_devices => no devices are configured. at all. why are we replicating something? |
|---|
| | 161 | # how did something come into being since you can't delete devices? |
|---|
| | 162 | |
|---|
| | 163 | # bail if we failed getting the lock, that means someone else probably |
|---|
| | 164 | # already did it, so we should just move on |
|---|
| | 165 | if ($errcode eq 'failed_getting_lock') { |
|---|
| | 166 | $unlock->() if $unlock; |
|---|
| | 167 | next; |
|---|
| | 168 | } |
|---|
| | 169 | |
|---|
| | 170 | # logic for setting the next try time appropriately |
|---|
| | 171 | my $update_nexttry = sub { |
|---|
| | 172 | my ($type, $delay) = @_; |
|---|
| | 173 | if ($type eq 'end_of_time') { |
|---|
| | 174 | # special; update to a time that won't happen again, as we've encountered a scenario |
|---|
| | 175 | # in which case we're really hosed |
|---|
| | 176 | $dbh->do("UPDATE file_to_replicate SET nexttry = " . ENDOFTIME . ", failcount = failcount + 1 WHERE fid = ?", |
|---|
| | 177 | undef, $fid); |
|---|
| | 178 | } else { |
|---|
| | 179 | my $extra = $type eq 'offset' ? 'UNIX_TIMESTAMP() +' : ''; |
|---|
| | 180 | $dbh->do("UPDATE file_to_replicate SET nexttry = $extra ?, failcount = failcount + 1 WHERE fid = ?", |
|---|
| | 181 | undef, $delay+0, $fid); |
|---|
| 139 | | |
|---|
| 140 | | # replicate was a success, so cleanup table, then unlock |
|---|
| 141 | | $dbh->do("DELETE FROM file_to_replicate WHERE fid=?", undef, $fid); |
|---|
| 142 | | $unlock->(); |
|---|
| 143 | | |
|---|
| 144 | | } else { |
|---|
| 145 | | # README: please keep this up to date if you update the replicate() function so we ensure |
|---|
| 146 | | # that this code always does the right thing |
|---|
| 147 | | # |
|---|
| 148 | | # -- HARMLESS -- |
|---|
| 149 | | # failed_getting_lock => harmless. skip. somebody else probably doing. |
|---|
| 150 | | # |
|---|
| 151 | | # -- TEMPORARY; DO EXPONENTIAL BACKOFF -- |
|---|
| 152 | | # source_down => only source available is observed down. |
|---|
| 153 | | # policy_error_doing_failed => policy plugin fucked up. it's looping. |
|---|
| 154 | | # policy_error_already_there => policy plugin fucked up. it's dumb. |
|---|
| 155 | | # policy_no_suggestions => no copy was attempted. policy is just not happy. |
|---|
| 156 | | # copy_error => policy said to do 1+ things, we failed, it ran out of suggestions. |
|---|
| 157 | | # |
|---|
| 158 | | # -- FATAL; DON'T TRY AGAIN -- |
|---|
| 159 | | # no_source => it simply exists nowhere. not that something's down, but file_on is empty. |
|---|
| 160 | | # no_devices => no devices are configured. at all. why are we replicating something? |
|---|
| 161 | | # how did something come into being since you can't delete devices? |
|---|
| 162 | | |
|---|
| 163 | | # bail if we failed getting the lock, that means someone else probably |
|---|
| 164 | | # already did it, so we should just move on |
|---|
| 165 | | if ($errcode eq 'failed_getting_lock') { |
|---|
| 166 | | $unlock->() if $unlock; |
|---|
| 167 | | next; |
|---|
| 168 | | } |
|---|
| 169 | | |
|---|
| 170 | | # logic for setting the next try time appropriately |
|---|
| 171 | | my $update_nexttry = sub { |
|---|
| 172 | | my ($type, $delay) = @_; |
|---|
| 173 | | if ($type eq 'end_of_time') { |
|---|
| 174 | | # special; update to a time that won't happen again, as we've encountered a scenario |
|---|
| 175 | | # in which case we're really hosed |
|---|
| 176 | | $dbh->do("UPDATE file_to_replicate SET nexttry = " . ENDOFTIME . ", failcount = failcount + 1 WHERE fid = ?", |
|---|
| 177 | | undef, $fid); |
|---|
| 178 | | } else { |
|---|
| 179 | | my $extra = $type eq 'offset' ? 'UNIX_TIMESTAMP() +' : ''; |
|---|
| 180 | | $dbh->do("UPDATE file_to_replicate SET nexttry = $extra ?, failcount = failcount + 1 WHERE fid = ?", |
|---|
| 181 | | undef, $delay+0, $fid); |
|---|
| 182 | | } |
|---|
| 183 | | error("Failed setting nexttry of fid $fid to $type $delay: " . $dbh->errstr) |
|---|
| 184 | | if $dbh->err; |
|---|
| 185 | | }; |
|---|
| 186 | | |
|---|
| 187 | | # now let's handle any error we want to consider a total failure; do not |
|---|
| 188 | | # retry at any point. push this file off to the end so someone has to come |
|---|
| 189 | | # along and figure out what went wrong. |
|---|
| 190 | | if ($errcode eq 'no_source' || $errcode eq 'no_devices') { |
|---|
| 191 | | $update_nexttry->( end_of_time => 1 ); |
|---|
| 192 | | $unlock->() if $unlock; |
|---|
| 193 | | next; |
|---|
| 194 | | } |
|---|
| 195 | | |
|---|
| 196 | | # at this point, the rest of the errors require exponential backoff. define what this means |
|---|
| 197 | | # as far as failcount -> delay to next try. |
|---|
| 198 | | # 15s, 1m, 5m, 30m, 1h, 2h, 4h, 8h, 24h, 24h, 24h, 24h, ... |
|---|
| 199 | | my @backoff = qw( 15 60 300 1800 3600 7200 14400 28800 ); |
|---|
| 200 | | $update_nexttry->( offset => int(($backoff[$todo->{failcount}] || 86400) * (rand(0.4) + 0.8)) ); |
|---|
| | 183 | error("Failed setting nexttry of fid $fid to $type $delay: " . $dbh->errstr) |
|---|
| | 184 | if $dbh->err; |
|---|
| | 185 | }; |
|---|
| | 186 | |
|---|
| | 187 | # now let's handle any error we want to consider a total failure; do not |
|---|
| | 188 | # retry at any point. push this file off to the end so someone has to come |
|---|
| | 189 | # along and figure out what went wrong. |
|---|
| | 190 | if ($errcode eq 'no_source' || $errcode eq 'no_devices') { |
|---|
| | 191 | $update_nexttry->( end_of_time => 1 ); |
|---|