Changeset 856

Show
Ignore:
Timestamp:
04/19/07 19:26:17 (3 years ago)
Author:
bradfitz
Message:

More fsck work; coming along well. Throughput went from 90 fids/second on my
local dev box to 280 fids/second when using HTTPFile->at (which uses the
mogstored side-channel). Much is still to do, but the basics work, so it's
polishing time. More speed improvements are also possible later.

Location:
trunk/server/lib/MogileFS
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • trunk/server/lib/MogileFS/Store.pm

    r842 r856  
    589589utime  INT UNSIGNED NOT NULL, 
    590590fid    INT UNSIGNED NULL, 
    591 err    CHAR(4), 
     591evcode CHAR(4), 
    592592devid  MEDIUMINT UNSIGNED, 
    593593INDEX(utime) 
     
    12221222} 
    12231223 
     1224sub fsck_log { 
     1225    my ($self, %opts) = @_; 
     1226    $self->dbh->do("INSERT INTO fsck_log (utime, fid, evcode, devid) ". 
     1227                   "VALUES (" . $self->unix_timestamp . ",?,?,?)", 
     1228                   undef, 
     1229                   delete $opts{fid}, 
     1230                   delete $opts{code}, 
     1231                   delete $opts{devid}); 
     1232    croak("Unknown opts") if %opts; 
     1233    return 1; 
     1234} 
     1235 
     1236 
    12241237# run before daemonizing.  you can die from here if you see something's amiss.  or emit 
    12251238# warnings. 
  • trunk/server/lib/MogileFS/Worker/Fsck.pm

    r851 r856  
    2222    my $run_count = 0; 
    2323 
     24    # <debug crap> 
     25    my $running = 0; # start time 
     26    my $n_check = 0; # items checked 
     27    my $start = sub { 
     28        return if $running; 
     29        $running = time(); 
     30        print "START @" . time() . "\n"; 
     31    }; 
     32    my $stats = sub { 
     33        return unless $running; 
     34        my $now = time(); 
     35        my $elap = $now - $running; 
     36        printf("In %d secs, %d fids, %0.02f fids/sec\n", $elap, $n_check, ($n_check / ($elap || 1))); 
     37    }; 
     38    my $last_beat = 0; 
     39    my $beat = sub { 
     40        my $now = time(); 
     41        return unless $now >= $last_beat + 5; 
     42        $stats->(); 
     43        $last_beat = $now; 
     44    }; 
     45    my $stop = sub { 
     46        return unless $running; 
     47        $stats->(); 
     48        print "DONE.\n"; 
     49        $running = 0; 
     50    }; 
     51    # </debug crap> 
     52 
    2453    every(5.0, sub { 
    2554        my $sleep_set = shift; 
     
    4574 
    4675        unless (@fids) { 
    47             warn "[fsck] no fids to check...\n"; 
     76            $stop->(); 
    4877            return; 
    4978        } 
     79        $start->(); 
    5080 
    5181        MogileFS::FID->mass_load_devids(@fids); 
     
    5383        my $new_max; 
    5484        foreach my $fid (@fids) { 
     85            $n_check++; 
    5586            $self->still_alive; 
    5687            last if $self->should_stop_running; 
    5788            last unless $self->check_fid($fid, no_stat => $opt_nostat); 
    5889            $new_max = $fid->id; 
     90            $beat->(); 
    5991        } 
    6092 
     
    87119#   if no problems, no action. 
    88120#   if problems, log & enqueue fixes 
     121use constant CANT_CHECK => 0; 
    89122sub check_fid { 
    90123    my ($self, $fid, %opts) = @_; 
     
    93126 
    94127    # first obvious fucked-up case:  no devids even presumed to exist. 
    95     unless (@{ $fid->devids }) { 
     128    unless ($fid->devids) { 
    96129        # first, log this weird condition. (NOPA = NO PAths) 
    97         Mgd::get_store->fsck_log(code => "NOPA", fid => $fid->id); 
     130        Mgd::get_store()->fsck_log(code => "NOPA", fid => $fid->id); 
    98131        # weird, schedule a fix (which will do a search over all 
    99132        # devices as a last-ditch effort to locate it) 
     
    106139    unless ($fid->devids_meet_policy) { 
    107140        # log a policy violation 
    108         Mgd::get_store->fsck_log(code => "POVI", fid => $fid->id); 
     141        Mgd::get_store()->fsck_log(code => "POVI", fid => $fid->id); 
    109142        $self->fix_fid($fid); 
    110143        return 1; 
     
    115148    # check the replication policy, which is already done, so finish. 
    116149    return 1 if $opt_no_stat; 
    117  
    118     # ********************************************************************** 
    119     # FIXME: temporary, until statting is done... 
    120     # ********************************************************************** 
    121  
    122     # printf("fid %d is good.\n", $fid->id); 
    123     return 1; 
    124150 
    125151    # iterate and do HEAD requests to determine some basic information about the file 
     
    132158        my $path = $dfid->get_url 
    133159            or die "FIXME"; 
    134         # TODO: use side-channel?  eh, why?  LWP + ConnCache good enough for now. 
    135         my $ua = LWP::UserAgent->new(timeout => 3) 
    136             or die "FIXME"; 
    137         my $resp = $ua->head($path); 
    138  
    139         # at this point we're going to assume that any error is based on the device alone 
    140         # so we want to store the status and not return 
    141         if ($resp->is_success) { 
    142             # great, check the size against what's in the database 
    143             if ($resp->header('Content-Length') == $fid->length) { 
    144                 #yay 
    145             } else { 
    146                 #shit. 
    147             } 
    148  
     160 
     161        my $disk_size = $dfid->size_on_disk; 
     162 
     163        if (! defined $disk_size) { 
     164            warn("Connectivity problem reaching XXXFIXME\n"); 
     165            return CANT_CHECK; 
     166        } 
     167 
     168        # great, check the size against what's in the database 
     169        if ($disk_size == $fid->length) { 
     170            #yay 
    149171        } else { 
    150             # easy one, the request failed for some reason, 500 would tend to imply that the 
    151             # mogstored is having issues so we should try again later, whereas a 404 is a 
    152             # total and permanent failure 
    153             #error("check_fid($fid, $level): " . $resp->code . " on device $devid"); 
    154  
    155             if ($resp->code == 404) { 
    156                 #fucked! 
    157             } else { 
    158                 # just unreachable 
    159                 warn "TODO: foo is unreachable\n"; 
    160                 return 0; 
    161             } 
     172            printf("FID %d corruption on devid $devid: e=%d, a=%d\n", 
     173                   $fid->id, 
     174                   $fid->length, 
     175                   $disk_size, 
     176                   ); 
    162177        } 
    163178    } 
     
    182197 
    183198    # make devfid objects from the devids that this fid is on, 
    184     my @dfids = map { MogileFS::DevID->new($_, $fid) } @{ $fid->devids }; 
     199    my @dfids = map { MogileFS::DevID->new($_, $fid) } $fid->devids; 
    185200 
    186201    # keep track if we found the file (with the right size) at least somewhere.