| 55 | | # randomize so we can work on them |
|---|
| 56 | | my @fids = [ List::Util::shuffle(@{$fids || []}) ]; |
|---|
| 57 | | |
|---|
| 58 | | last; |
|---|
| 59 | | # while (find files to check from fsck table), |
|---|
| 60 | | # randomize |
|---|
| 61 | | # check them, inserting into needs_replicate or updating nextcheck time, or deleting if okay. |
|---|
| | 76 | # if nothing to do, we're done |
|---|
| | 77 | last unless $fids && @$fids; |
|---|
| | 78 | |
|---|
# iterate randomly over this batch of fids.  for each one: run check_fid, act on the
# fsck / file_to_replicate tables according to the result, and keep running statistics.
# $dbh, $fids, $setting, $total_time, $total_done and %retvals come from the enclosing
# scope.
foreach my $fid (List::Util::shuffle(@$fids)) {
    # try to check this fid, timing how long the check takes
    my $now = [ gettimeofday() ];
    my $rv = check_fid($dbh, $fid, $setting) || 0;
    my $elapsed = tv_interval($now);

    # process the return value to do something
    if ($rv == SUCCESS) {
        # file is fine, nothing left to check
        $dbh->do("DELETE FROM fsck WHERE fid = ?", undef, $fid);

    } elsif ($rv == TEMPORARY) {
        # temporary means - try again in 5-10 minutes (300-599 seconds from now)
        $dbh->do("UPDATE fsck SET nextcheck = UNIX_TIMESTAMP() + ? WHERE fid = ?",
                 undef, int((rand()*300)+300), $fid);

    } elsif ($rv == PERMANENT) {
        # FIXME: should probably do something more than this here?
        $dbh->do("DELETE FROM fsck WHERE fid = ?", undef, $fid);

    } elsif ($rv == REPLICATE) {
        # FIXME: use nexttry = 1?  fromdevid should be specified as a known good, too.  flags
        # should probably be set for something?  not sure yet.
        $dbh->do("INSERT INTO file_to_replicate (fid, nexttry, fromdevid, failcount, flags) " .
                 "VALUES (?, 1, NULL, 0, 0)", undef, $fid);
        $dbh->do("DELETE FROM fsck WHERE fid = ?", undef, $fid);
    }
    # any other return value leaves the fsck row untouched; it is still counted below

    # store the stats now
    $total_time += $elapsed;
    $total_done++;
    $retvals{$rv}++;

    # dump some stats every 20 fids
    if ($total_done % 20 == 0) {
        my $avg_time = $total_time / $total_done;

        # guard the reciprocal: if no measurable time has accumulated, 1 / 0 would die
        # and take the whole worker down just for a status line
        my $fids_sec = $avg_time > 0 ? 1 / $avg_time : 0;

        error(sprintf('status: done=%d, seconds/fid=%0.2f, fids/second=%0.2f, retvals: %s',
                      $total_done, $avg_time, $fids_sec,
                      join(', ', map { "$_=$retvals{$_}" } sort keys %retvals)));
    }
}
|---|
# this sub actually does the checking of a fid.  we put it in its own sub so that every
# exit path can go through the $retunlock coderef, which logs any failure and releases
# the per-fid lock before handing back the result.
#
# arguments:
#
#   $dbh   - database handle, used for the lock calls and the file_on / file lookups
#   $fid   - the fid to check
#   $level - how thorough to be: 'locations' stops once at least one file_on row is
#            found, 'quick' additionally does a HEAD request per device and compares
#            sizes, anything else is reported as not implemented yet
#
# always returns a number:
#
#   0 - file is just fine, drop it from the list
#   1 - temporary failure, check again later
#   2 - permanent failure, this file shouldn't get tried again
#   3 - needs replication, we found something not quite right
#
sub check_fid {
    my ($dbh, $fid, $level) = @_;

    # unlocker sub to be used.  $have_lock is flipped once GET_LOCK succeeds so that we
    # never issue RELEASE_LOCK for a lock this connection does not hold.
    my $lockname  = "mgfs:fid:$fid:check";
    my $have_lock = 0;
    my $retunlock = sub {
        my $rv = shift()+0;

        # 0 means success, else some sort of failure
        if ($rv) {
            my $msg = shift() || "no error text";
            my $rvtype = {
                1 => 'temporary failure',
                2 => 'permanent failure',
                3 => 'needs replication',
            }->{$rv} || 'unknown error';
            error("check_fid($fid, $level) = $rvtype: $msg");
        }

        $dbh->do("SELECT RELEASE_LOCK(?)", undef, $lockname) if $have_lock;
        return $rv;
    };

    # try to get the lock (wait at most 1 second for it)
    my $lock = $dbh->selectrow_array("SELECT GET_LOCK(?, 1)", undef, $lockname);
    return $retunlock->(TEMPORARY, "failed getting lock $lockname") unless $lock;
    $have_lock = 1;

    # all checks require us to get the file paths
    my $devids = $dbh->selectcol_arrayref('SELECT devid FROM file_on WHERE fid = ?', undef, $fid);
    return $retunlock->(PERMANENT, 'no sources found') unless $devids && @$devids;

    # if it's a simple location check, we're done
    return $retunlock->(SUCCESS) if $level eq 'locations';

    # get the file size from the database, we're going to need it.  note that this could be
    # a 0 size, so we have to watch for defined.
    my $db_size = $dbh->selectrow_array('SELECT length FROM file WHERE fid = ?', undef, $fid);
    return $retunlock->(TEMPORARY, "database does not contain file size") unless defined $db_size;

    # one user agent is enough for all devices of this fid; @$devids is known to be
    # non-empty here, so we are not creating one that would otherwise go unused
    my $ua = LWP::UserAgent->new(timeout => 3)
        or return $retunlock->(TEMPORARY, 'failed to create LWP::UserAgent object');

    # iterate and do HEAD requests to determine some basic information about the file
    my %devs;
    foreach my $devid (@$devids) {
        # setup and do the request.  a failure to build the path is a total failure in that
        # we expect it to work again later, as it's probably transient and will persist no
        # matter how many paths we try.
        my $path = Mgd::make_http_path($devid, $fid)
            or return $retunlock->(TEMPORARY, 'failure to create HTTP path to file');
        my $resp = $ua->head($path);

        # at this point we're going to assume that any error is based on the device alone
        # so we want to store the status and not return
        if ($resp->is_success) {
            # great, check the size against what's in the database.  the header may be
            # missing or not a plain number; comparing that numerically would treat it as 0
            # and wrongly "match" a zero-length file, so handle that case explicitly.
            my $length = $resp->header('Content-Length');
            if (!defined $length || $length !~ /\A\d+\z/) {
                # we could not verify anything, so don't call this copy good or bad
                error("check_fid($fid, $level): no usable Content-Length on device $devid");
                $devs{$devid} = TEMPORARY;
            } elsif ($length == $db_size) {
                $devs{$devid} = SUCCESS;
            } else {
                $devs{$devid} = PERMANENT;
            }

        } else {
            # easy one, the request failed for some reason, 500 would tend to imply that the
            # mogstored is having issues so we should try again later, whereas a 404 is a
            # total and permanent failure
            error("check_fid($fid, $level): " . $resp->code . " on device $devid");
            $devs{$devid} = $resp->code == 404 ? PERMANENT : TEMPORARY;
        }
    }

    # at this point, we need to take actions.  if we discovered some PERMANENT failures in
    # a device scan, then we need to take care of those now by removing them.  but DO NOT
    # remove them if that would leave us with no mappings!  ONLY if there is at least one
    # SUCCESS mapping.
    # FIXME: implement

    # if they wanted a quick scan, let's stop here and throw a result based on the contents
    # of the %devs hash.  basically, if any of the devices had issues, then at this point we
    # want to throw a flag saying "please replicate this".  if not, then we tell them that
    # we're successful on this fid.
    if ($level eq 'quick') {
        return $retunlock->(REPLICATE, "permanent failure on one or more devices")
            if grep { $_ != SUCCESS } values %devs;
        return $retunlock->(SUCCESS);
    }

    # full mode not here yet
    return $retunlock->(TEMPORARY, "sorry, $level mode is not implemented yet");
}
|---|
| | 252 | |
|---|