drivers/scsi/scsi_error.c

   1 /*
   2  *  scsi_error.c Copyright (C) 1997 Eric Youngdale
   3  *
   4  *  SCSI error/timeout handling
   5  *      Initial versions: Eric Youngdale.  Based upon conversations with
   6  *                        Leonard Zubkoff and David Miller at Linux Expo,
   7  *                        ideas originating from all over the place.
   8  *
   9  *      Restructured scsi_unjam_host and associated functions.
  10  *      September 04, 2002 Mike Anderson (andmike@us.ibm.com)
  11  *
  12  *      Forward port of Russell King's (rmk@arm.linux.org.uk) changes and
  13  *      minor  cleanups.
  14  *      September 30, 2002 Mike Anderson (andmike@us.ibm.com)
  15  */
  16
  17 #include <linux/module.h>
  18 #include <linux/sched.h>
  19 #include <linux/timer.h>
  20 #include <linux/string.h>
  21 #include <linux/slab.h>
  22 #include <linux/kernel.h>
  23 #include <linux/interrupt.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/smp_lock.h>
  26 #include <scsi/scsi_ioctl.h>
  27
  28 #include "scsi.h"
  29 #include "hosts.h"
  30
  31 #include "scsi_priv.h"
  32 #include "scsi_logging.h"
  33
  34 #ifdef DEBUG
  35 #define SENSE_TIMEOUT SCSI_TIMEOUT
  36 #else
  37 #define SENSE_TIMEOUT (10*HZ)
  38 #endif
  39
  40 #define START_UNIT_TIMEOUT (30*HZ)
  41
  42 /*
  43  * These should *probably* be handled by the host itself.
  44  * Since it is allowed to sleep, it probably should.
  45  */
  46 #define BUS_RESET_SETTLE_TIME   10*HZ
  47 #define HOST_RESET_SETTLE_TIME  10*HZ
  48
  49 /* called with shost->host_lock held */
  50 void scsi_eh_wakeup(struct Scsi_Host *shost)
  51 {
  52         if (shost->host_busy == shost->host_failed) {
  53                 up(shost->eh_wait);
  54                 SCSI_LOG_ERROR_RECOVERY(5,
  55                                 printk("Waking error handler thread\n"));
  56         }
  57 }
  58
  59 /**
  60  * scsi_eh_scmd_add - add scsi cmd to error handling.
  61  * @scmd:       scmd to run eh on.
  62  * @eh_flag:    optional SCSI_EH flag.
  63  *
  64  * Return value:
  65  *      0 on failure.
  66  **/
  67 int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
  68 {
  69         struct Scsi_Host *shost = scmd->device->host;
  70         unsigned long flags;
  71
  72         if (shost->eh_wait == NULL)
  73                 return 0;
  74
  75         spin_lock_irqsave(shost->host_lock, flags);
  76
  77         scsi_eh_eflags_set(scmd, eh_flag);
  78         /*
  79          * FIXME: Can we stop setting owner and state.
  80          */
  81         scmd->owner = SCSI_OWNER_ERROR_HANDLER;
  82         scmd->state = SCSI_STATE_FAILED;
  83         /*
  84          * Set the serial_number_at_timeout to the current
  85          * serial_number
  86          */
  87         scmd->serial_number_at_timeout = scmd->serial_number;
  88         list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
  89         set_bit(SHOST_RECOVERY, &shost->shost_state);
  90         shost->host_failed++;
  91         scsi_eh_wakeup(shost);
  92         spin_unlock_irqrestore(shost->host_lock, flags);
  93         return 1;
  94 }
  95
  96 /**
  97  * scsi_add_timer - Start timeout timer for a single scsi command.
  98  * @scmd:       scsi command that is about to start running.
  99  * @timeout:    amount of time to allow this command to run.
 100  * @complete:   timeout function to call if timer isn't canceled.
 101  *
 102  * Notes:
 103  *    This should be turned into an inline function.  Each scsi command
 104  *    has its own timer, and as it is added to the queue, we set up the
 105  *    timer.  When the command completes, we cancel the timer.
 106  **/
 107 void scsi_add_timer(struct scsi_cmnd *scmd, int timeout,
 108                     void (*complete)(struct scsi_cmnd *))
 109 {
 110
 111         /*
 112          * If the clock was already running for this command, then
 113          * first delete the timer.  The timer handling code gets rather
 114          * confused if we don't do this.
 115          */
 116         if (scmd->eh_timeout.function)
 117                 del_timer(&scmd->eh_timeout);
 118
 119         scmd->eh_timeout.data = (unsigned long)scmd;
 120         scmd->eh_timeout.expires = jiffies + timeout;
 121         scmd->eh_timeout.function = (void (*)(unsigned long)) complete;
 122
 123         SCSI_LOG_ERROR_RECOVERY(5, printk("%s: scmd: %p, time:"
 124                                           " %d, (%p)\n", __FUNCTION__,
 125                                           scmd, timeout, complete));
 126
 127         add_timer(&scmd->eh_timeout);
 128 }
 129
 130 /**
 131  * scsi_delete_timer - Delete/cancel timer for a given function.
 132  * @scmd:       Cmd that we are canceling timer for
 133  *
 134  * Notes:
 135  *     This should be turned into an inline function.
 136  *
 137  * Return value:
 138  *     1 if we were able to detach the timer.  0 if we blew it, and the
 139  *     timer function has already started to run.
 140  **/
 141 int scsi_delete_timer(struct scsi_cmnd *scmd)
 142 {
 143         int rtn;
 144
 145         rtn = del_timer(&scmd->eh_timeout);
 146
 147         SCSI_LOG_ERROR_RECOVERY(5, printk("%s: scmd: %p,"
 148                                          " rtn: %d\n", __FUNCTION__,
 149                                          scmd, rtn));
 150
 151         scmd->eh_timeout.data = (unsigned long)NULL;
 152         scmd->eh_timeout.function = NULL;
 153
 154         return rtn;
 155 }
 156
 157 /**
 158  * scsi_times_out - Timeout function for normal scsi commands.
 159  * @scmd:       Cmd that is timing out.
 160  *
 161  * Notes:
 162  *     We do not need to lock this.  There is the potential for a race
 163  *     only in that the normal completion handling might run, but if the
 164  *     normal completion function determines that the timer has already
 165  *     fired, then it mustn't do anything.
 166  **/
 167 void scsi_times_out(struct scsi_cmnd *scmd)
 168 {
 169         scsi_log_completion(scmd, TIMEOUT_ERROR);
 170         if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) {
 171                 panic("Error handler thread not present at %p %p %s %d",
 172                       scmd, scmd->device->host, __FILE__, __LINE__);
 173         }
 174 }
 175
 176 /**
 177  * scsi_block_when_processing_errors - Prevent cmds from being queued.
 178  * @sdev:       Device on which we are performing recovery.
 179  *
 180  * Description:
 181  *     We block until the host is out of error recovery, and then check to
 182  *     see whether the host or the device is offline.
 183  *
 184  * Return value:
 185  *     0 when dev was taken offline by error recovery. 1 OK to proceed.
 186  **/
 187 int scsi_block_when_processing_errors(struct scsi_device *sdev)
 188 {
 189         int online;
 190
 191         wait_event(sdev->host->host_wait, (!test_bit(SHOST_RECOVERY, &sdev->host->shost_state)));
 192
 193         online = scsi_device_online(sdev);
 194
 195         SCSI_LOG_ERROR_RECOVERY(5, printk("%s: rtn: %d\n", __FUNCTION__,
 196                                           online));
 197
 198         return online;
 199 }
 200
 201 #ifdef CONFIG_SCSI_LOGGING
 202 /**
 203  * scsi_eh_prt_fail_stats - Log info on failures.
 204  * @shost:      scsi host being recovered.
 205  * @work_q:     Queue of scsi cmds to process.
 206  **/
 207 static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost,
 208                                           struct list_head *work_q)
 209 {
 210         struct scsi_cmnd *scmd;
 211         struct scsi_device *sdev;
 212         int total_failures = 0;
 213         int cmd_failed = 0;
 214         int cmd_cancel = 0;
 215         int devices_failed = 0;
 216
 217         shost_for_each_device(sdev, shost) {
 218                 list_for_each_entry(scmd, work_q, eh_entry) {
 219                         if (scmd->device == sdev) {
 220                                 ++total_failures;
 221                                 if (scsi_eh_eflags_chk(scmd,
 222                                                        SCSI_EH_CANCEL_CMD))
 223                                         ++cmd_cancel;
 224                                 else
 225                                         ++cmd_failed;
 226                         }
 227                 }
 228
 229                 if (cmd_cancel || cmd_failed) {
 230                         SCSI_LOG_ERROR_RECOVERY(3,
 231                                 printk("%s: %d:%d:%d:%d cmds failed: %d,"
 232                                        " cancel: %d\n",
 233                                        __FUNCTION__, shost->host_no,
 234                                        sdev->channel, sdev->id, sdev->lun,
 235                                        cmd_failed, cmd_cancel));
 236                         cmd_cancel = 0;
 237                         cmd_failed = 0;
 238                         ++devices_failed;
 239                 }
 240         }
 241
 242         SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d commands on %d"
 243                                           " devices require eh work\n",
 244                                   total_failures, devices_failed));
 245 }
 246 #endif
 247
 248 /**
 249  * scsi_check_sense - Examine scsi cmd sense
 250  * @scmd:       Cmd to have sense checked.
 251  *
 252  * Return value:
 253  *      SUCCESS or FAILED or NEEDS_RETRY
 254  **/
 255 static int scsi_check_sense(struct scsi_cmnd *scmd)
 256 {
 257         if (!SCSI_SENSE_VALID(scmd))
 258                 return FAILED;
 259
 260         if (scmd->sense_buffer[2] & 0xe0)
 261                 return SUCCESS;
 262
 263         switch (scmd->sense_buffer[2] & 0xf) {
 264         case NO_SENSE:
 265                 return SUCCESS;
 266         case RECOVERED_ERROR:
 267                 return /* soft_error */ SUCCESS;
 268
 269         case ABORTED_COMMAND:
 270                 return NEEDS_RETRY;
 271         case NOT_READY:
 272         case UNIT_ATTENTION:
 273                 /*
 274                  * if we are expecting a cc/ua because of a bus reset that we
 275                  * performed, treat this just as a retry.  otherwise this is
 276                  * information that we should pass up to the upper-level driver
 277                  * so that we can deal with it there.
 278                  */
 279                 if (scmd->device->expecting_cc_ua) {
 280                         scmd->device->expecting_cc_ua = 0;
 281                         return NEEDS_RETRY;
 282                 }
 283                 /*
 284                  * if the device is in the process of becoming ready, we
 285                  * should retry.
 286                  */
 287                 if ((scmd->sense_buffer[12] == 0x04) &&
 288                         (scmd->sense_buffer[13] == 0x01)) {
 289                         return NEEDS_RETRY;
 290                 }
 291                 /*
 292                  * if the device is not started, we need to wake
 293                  * the error handler to start the motor
 294                  */
 295                 if (scmd->device->allow_restart &&
 296                     (scmd->sense_buffer[12] == 0x04) &&
 297                     (scmd->sense_buffer[13] == 0x02)) {
 298                         return FAILED;
 299                 }
 300                 return SUCCESS;
 301
 302                 /* these three are not supported */
 303         case COPY_ABORTED:
 304         case VOLUME_OVERFLOW:
 305         case MISCOMPARE:
 306                 return SUCCESS;
 307
 308         case MEDIUM_ERROR:
 309                 return NEEDS_RETRY;
 310
 311         case ILLEGAL_REQUEST:
 312         case BLANK_CHECK:
 313         case DATA_PROTECT:
 314         case HARDWARE_ERROR:
 315         default:
 316                 return SUCCESS;
 317         }
 318 }
 319
 320 /**
 321  * scsi_eh_completed_normally - Disposition a eh cmd on return from LLD.
 322  * @scmd:       SCSI cmd to examine.
 323  *
 324  * Notes:
 325  *    This is *only* called when we are examining the status of commands
 326  *    queued during error recovery.  the main difference here is that we
 327  *    don't allow for the possibility of retries here, and we are a lot
 328  *    more restrictive about what we consider acceptable.
 329  **/
 330 static int scsi_eh_completed_normally(struct scsi_cmnd *scmd)
 331 {
 332         /*
 333          * first check the host byte, to see if there is anything in there
 334          * that would indicate what we need to do.
 335          */
 336         if (host_byte(scmd->result) == DID_RESET) {
 337                 /*
 338                  * rats.  we are already in the error handler, so we now
 339                  * get to try and figure out what to do next.  if the sense
 340                  * is valid, we have a pretty good idea of what to do.
 341                  * if not, we mark it as FAILED.
 342                  */
 343                 return scsi_check_sense(scmd);
 344         }
 345         if (host_byte(scmd->result) != DID_OK)
 346                 return FAILED;
 347
 348         /*
 349          * next, check the message byte.
 350          */
 351         if (msg_byte(scmd->result) != COMMAND_COMPLETE)
 352                 return FAILED;
 353
 354         /*
 355          * now, check the status byte to see if this indicates
 356          * anything special.
 357          */
 358         switch (status_byte(scmd->result)) {
 359         case GOOD:
 360         case COMMAND_TERMINATED:
 361                 return SUCCESS;
 362         case CHECK_CONDITION:
 363                 return scsi_check_sense(scmd);
 364         case CONDITION_GOOD:
 365         case INTERMEDIATE_GOOD:
 366         case INTERMEDIATE_C_GOOD:
 367                 /*
 368                  * who knows?  FIXME(eric)
 369                  */
 370                 return SUCCESS;
 371         case BUSY:
 372         case QUEUE_FULL:
 373         case RESERVATION_CONFLICT:
 374         default:
 375                 return FAILED;
 376         }
 377         return FAILED;
 378 }
 379
 380 /**
 381  * scsi_eh_times_out - timeout function for error handling.
 382  * @scmd:       Cmd that is timing out.
 383  *
 384  * Notes:
 385  *    During error handling, the kernel thread will be sleeping waiting
 386  *    for some action to complete on the device.  our only job is to
 387  *    record that it timed out, and to wake up the thread.
 388  **/
 389 static void scsi_eh_times_out(struct scsi_cmnd *scmd)
 390 {
 391         scsi_eh_eflags_set(scmd, SCSI_EH_REC_TIMEOUT);
 392         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd:%p\n", __FUNCTION__,
 393                                           scmd));
 394
 395         if (scmd->device->host->eh_action)
 396                 up(scmd->device->host->eh_action);
 397 }
 398
 399 /**
 400  * scsi_eh_done - Completion function for error handling.
 401  * @scmd:       Cmd that is done.
 402  **/
 403 static void scsi_eh_done(struct scsi_cmnd *scmd)
 404 {
 405         /*
 406          * if the timeout handler is already running, then just set the
 407          * flag which says we finished late, and return.  we have no
 408          * way of stopping the timeout handler from running, so we must
 409          * always defer to it.
 410          */
 411         if (del_timer(&scmd->eh_timeout)) {
 412                 scmd->request->rq_status = RQ_SCSI_DONE;
 413                 scmd->owner = SCSI_OWNER_ERROR_HANDLER;
 414
 415                 SCSI_LOG_ERROR_RECOVERY(3, printk("%s scmd: %p result: %x\n",
 416                                            __FUNCTION__, scmd, scmd->result));
 417
 418                 if (scmd->device->host->eh_action)
 419                         up(scmd->device->host->eh_action);
 420         }
 421 }
 422
 423 /**
 424  * scsi_send_eh_cmnd  - send a cmd to a device as part of error recovery.
 425  * @scmd:       SCSI Cmd to send.
 426  * @timeout:    Timeout for cmd.
 427  *
 428  * Notes:
 429  *    The initialization of the structures is quite a bit different in
 430  *    this case, and furthermore, there is a different completion handler
 431  *    vs scsi_dispatch_cmd.
 432  * Return value:
 433  *    SUCCESS or FAILED or NEEDS_RETRY
 434  **/
 435 static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, int timeout)
 436 {
 437         struct Scsi_Host *host = scmd->device->host;
 438         DECLARE_MUTEX_LOCKED(sem);
 439         unsigned long flags;
 440         int rtn = SUCCESS;
 441
 442         /*
 443          * we will use a queued command if possible, otherwise we will
 444          * emulate the queuing and calling of completion function ourselves.
 445          */
 446         scmd->owner = SCSI_OWNER_LOWLEVEL;
 447
 448         if (scmd->device->scsi_level <= SCSI_2)
 449                 scmd->cmnd[1] = (scmd->cmnd[1] & 0x1f) |
 450                         (scmd->device->lun << 5 & 0xe0);
 451
 452         scsi_add_timer(scmd, timeout, scsi_eh_times_out);
 453
 454         /*
 455          * set up the semaphore so we wait for the command to complete.
 456          */
 457         scmd->device->host->eh_action = &sem;
 458         scmd->request->rq_status = RQ_SCSI_BUSY;
 459
 460         spin_lock_irqsave(scmd->device->host->host_lock, flags);
 461         scsi_log_send(scmd);
 462         host->hostt->queuecommand(scmd, scsi_eh_done);
 463         spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
 464
 465         down(&sem);
 466         scsi_log_completion(scmd, SUCCESS);
 467
 468         scmd->device->host->eh_action = NULL;
 469
 470         /*
 471          * see if timeout.  if so, tell the host to forget about it.
 472          * in other words, we don't want a callback any more.
 473          */
 474         if (scsi_eh_eflags_chk(scmd, SCSI_EH_REC_TIMEOUT)) {
 475                 scsi_eh_eflags_clr(scmd,  SCSI_EH_REC_TIMEOUT);
 476                 scmd->owner = SCSI_OWNER_LOWLEVEL;
 477
 478                 /*
 479                  * as far as the low level driver is
 480                  * concerned, this command is still active, so
 481                  * we must give the low level driver a chance
 482                  * to abort it. (db)
 483                  *
 484                  * FIXME(eric) - we are not tracking whether we could
 485                  * abort a timed out command or not.  not sure how
 486                  * we should treat them differently anyways.
 487                  */
 488                 spin_lock_irqsave(scmd->device->host->host_lock, flags);
 489                 if (scmd->device->host->hostt->eh_abort_handler)
 490                         scmd->device->host->hostt->eh_abort_handler(scmd);
 491                 spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
 492
 493                 scmd->request->rq_status = RQ_SCSI_DONE;
 494                 scmd->owner = SCSI_OWNER_ERROR_HANDLER;
 495
 496                 rtn = FAILED;
 497         }
 498
 499         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd: %p, rtn:%x\n",
 500                                           __FUNCTION__, scmd, rtn));
 501
 502         /*
 503          * now examine the actual status codes to see whether the command
 504          * actually did complete normally.
 505          */
 506         if (rtn == SUCCESS) {
 507                 rtn = scsi_eh_completed_normally(scmd);
 508                 SCSI_LOG_ERROR_RECOVERY(3,
 509                         printk("%s: scsi_eh_completed_normally %x\n",
 510                                __FUNCTION__, rtn));
 511                 switch (rtn) {
 512                 case SUCCESS:
 513                 case NEEDS_RETRY:
 514                 case FAILED:
 515                         break;
 516                 default:
 517                         rtn = FAILED;
 518                         break;
 519                 }
 520         }
 521
 522         return rtn;
 523 }
 524
 525 /**
 526  * scsi_request_sense - Request sense data from a particular target.
 527  * @scmd:       SCSI cmd for request sense.
 528  *
 529  * Notes:
 530  *    Some hosts automatically obtain this information, others require
 531  *    that we obtain it on our own. This function will *not* return until
 532  *    the command either times out, or it completes.
 533  **/
 534 static int scsi_request_sense(struct scsi_cmnd *scmd)
 535 {
 536         static unsigned char generic_sense[6] =
 537         {REQUEST_SENSE, 0, 0, 0, 252, 0};
 538         unsigned char *scsi_result;
 539         int saved_result;
 540         int rtn;
 541
 542         memcpy(scmd->cmnd, generic_sense, sizeof(generic_sense));
 543
 544         scsi_result = kmalloc(252, GFP_ATOMIC | (scmd->device->host->hostt->unchecked_isa_dma) ? __GFP_DMA : 0);
 545
 546
 547         if (unlikely(!scsi_result)) {
 548                 printk(KERN_ERR "%s: cannot allocate scsi_result.\n",
 549                        __FUNCTION__);
 550                 return FAILED;
 551         }
 552
 553         /*
 554          * zero the sense buffer.  some host adapters automatically always
 555          * request sense, so it is not a good idea that
 556          * scmd->request_buffer and scmd->sense_buffer point to the same
 557          * address (db).  0 is not a valid sense code.
 558          */
 559         memset(scmd->sense_buffer, 0, sizeof(scmd->sense_buffer));
 560         memset(scsi_result, 0, 252);
 561
 562         saved_result = scmd->result;
 563         scmd->request_buffer = scsi_result;
 564         scmd->request_bufflen = 252;
 565         scmd->use_sg = 0;
 566         scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
 567         scmd->sc_data_direction = DMA_FROM_DEVICE;
 568         scmd->underflow = 0;
 569
 570         rtn = scsi_send_eh_cmnd(scmd, SENSE_TIMEOUT);
 571
 572         /* last chance to have valid sense data */
 573         if(!SCSI_SENSE_VALID(scmd)) {
 574                 memcpy(scmd->sense_buffer, scmd->request_buffer,
 575                        sizeof(scmd->sense_buffer));
 576         }
 577
 578         kfree(scsi_result);
 579
 580         /*
 581          * when we eventually call scsi_finish, we really wish to complete
 582          * the original request, so let's restore the original data. (db)
 583          */
 584         scsi_setup_cmd_retry(scmd);
 585         scmd->result = saved_result;
 586         return rtn;
 587 }
 588
 589 /**
 590  * scsi_eh_finish_cmd - Handle a cmd that eh is finished with.
 591  * @scmd:       Original SCSI cmd that eh has finished.
 592  * @done_q:     Queue for processed commands.
 593  *
 594  * Notes:
 595  *    We don't want to use the normal command completion while we are are
 596  *    still handling errors - it may cause other commands to be queued,
 597  *    and that would disturb what we are doing.  thus we really want to
 598  *    keep a list of pending commands for final completion, and once we
 599  *    are ready to leave error handling we handle completion for real.
 600  **/
 601 static void scsi_eh_finish_cmd(struct scsi_cmnd *scmd,
 602                                struct list_head *done_q)
 603 {
 604         scmd->device->host->host_failed--;
 605         scmd->state = SCSI_STATE_BHQUEUE;
 606
 607         scsi_eh_eflags_clr_all(scmd);
 608
 609         /*
 610          * set this back so that the upper level can correctly free up
 611          * things.
 612          */
 613         scsi_setup_cmd_retry(scmd);
 614         list_move_tail(&scmd->eh_entry, done_q);
 615 }
 616
 617 /**
 618  * scsi_eh_get_sense - Get device sense data.
 619  * @work_q:     Queue of commands to process.
 620  * @done_q:     Queue of proccessed commands..
 621  *
 622  * Description:
 623  *    See if we need to request sense information.  if so, then get it
 624  *    now, so we have a better idea of what to do.
 625  *
 626  * Notes:
 627  *    This has the unfortunate side effect that if a shost adapter does
 628  *    not automatically request sense information, that we end up shutting
 629  *    it down before we request it.  All shosts should be doing this
 630  *    anyways, so for now all I have to say is tough noogies if you end up
 631  *    in here.  On second thought, this is probably a good idea.  We
 632  *    *really* want to give authors an incentive to automatically request
 633  *    this.
 634  *
 635  *    In 2.5 this capability will be going away.
 636  *
 637  *    Really?  --hch
 638  **/
 639 static int scsi_eh_get_sense(struct list_head *work_q,
 640                              struct list_head *done_q)
 641 {
 642         struct list_head *lh, *lh_sf;
 643         struct scsi_cmnd *scmd;
 644         int rtn;
 645
 646         list_for_each_safe(lh, lh_sf, work_q) {
 647                 scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
 648                 if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD) ||
 649                     SCSI_SENSE_VALID(scmd))
 650                         continue;
 651
 652                 SCSI_LOG_ERROR_RECOVERY(2, printk("%s: requesting sense"
 653                                                   " for id: %d\n",
 654                                                   current->comm,
 655                                                   scmd->device->id));
 656                 rtn = scsi_request_sense(scmd);
 657                 if (rtn != SUCCESS)
 658                         continue;
 659
 660                 SCSI_LOG_ERROR_RECOVERY(3, printk("sense requested for %p"
 661                                                   " result %x\n", scmd,
 662                                                   scmd->result));
 663                 SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", scmd));
 664
 665                 rtn = scsi_decide_disposition(scmd);
 666
 667                 /*
 668                  * if the result was normal, then just pass it along to the
 669                  * upper level.
 670                  */
 671                 if (rtn == SUCCESS)
 672                         /* we don't want this command reissued, just
 673                          * finished with the sense data, so set
 674                          * retries to the max allowed to ensure it
 675                          * won't get reissued */
 676                         scmd->retries = scmd->allowed;
 677                 else if (rtn != NEEDS_RETRY)
 678                         continue;
 679
 680                 scsi_eh_finish_cmd(scmd, done_q);
 681         }
 682
 683         return list_empty(work_q);
 684 }
 685
 686 /**
 687  * scsi_try_to_abort_cmd - Ask host to abort a running command.
 688  * @scmd:       SCSI cmd to abort from Lower Level.
 689  *
 690  * Notes:
 691  *    This function will not return until the user's completion function
 692  *    has been called.  there is no timeout on this operation.  if the
 693  *    author of the low-level driver wishes this operation to be timed,
 694  *    they can provide this facility themselves.  helper functions in
 695  *    scsi_error.c can be supplied to make this easier to do.
 696  **/
 697 static int scsi_try_to_abort_cmd(struct scsi_cmnd *scmd)
 698 {
 699         unsigned long flags;
 700         int rtn = FAILED;
 701
 702         if (!scmd->device->host->hostt->eh_abort_handler)
 703                 return rtn;
 704
 705         /*
 706          * scsi_done was called just after the command timed out and before
 707          * we had a chance to process it. (db)
 708          */
 709         if (scmd->serial_number == 0)
 710                 return SUCCESS;
 711
 712         scmd->owner = SCSI_OWNER_LOWLEVEL;
 713
 714         spin_lock_irqsave(scmd->device->host->host_lock, flags);
 715         rtn = scmd->device->host->hostt->eh_abort_handler(scmd);
 716         spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
 717
 718         return rtn;
 719 }
 720
 721 /**
 722  * scsi_eh_tur - Send TUR to device.
 723  * @scmd:       Scsi cmd to send TUR
 724  *
 725  * Return value:
 726  *    0 - Device is ready. 1 - Device NOT ready.
 727  **/
 728 static int scsi_eh_tur(struct scsi_cmnd *scmd)
 729 {
 730         static unsigned char tur_command[6] = {TEST_UNIT_READY, 0, 0, 0, 0, 0};
 731         int retry_cnt = 1, rtn;
 732
 733 retry_tur:
 734         memcpy(scmd->cmnd, tur_command, sizeof(tur_command));
 735
 736         /*
 737          * zero the sense buffer.  the scsi spec mandates that any
 738          * untransferred sense data should be interpreted as being zero.
 739          */
 740         memset(scmd->sense_buffer, 0, sizeof(scmd->sense_buffer));
 741
 742         scmd->request_buffer = NULL;
 743         scmd->request_bufflen = 0;
 744         scmd->use_sg = 0;
 745         scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
 746         scmd->underflow = 0;
 747         scmd->sc_data_direction = DMA_NONE;
 748
 749         rtn = scsi_send_eh_cmnd(scmd, SENSE_TIMEOUT);
 750
 751         /*
 752          * when we eventually call scsi_finish, we really wish to complete
 753          * the original request, so let's restore the original data. (db)
 754          */
 755         scsi_setup_cmd_retry(scmd);
 756
 757         /*
 758          * hey, we are done.  let's look to see what happened.
 759          */
 760         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd %p rtn %x\n",
 761                 __FUNCTION__, scmd, rtn));
 762         if (rtn == SUCCESS)
 763                 return 0;
 764         else if (rtn == NEEDS_RETRY)
 765                 if (retry_cnt--)
 766                         goto retry_tur;
 767         return 1;
 768 }
 769
 770 /**
 771  * scsi_eh_abort_cmds - abort canceled commands.
 772  * @shost:      scsi host being recovered.
 773  * @eh_done_q:  list_head for processed commands.
 774  *
 775  * Decription:
 776  *    Try and see whether or not it makes sense to try and abort the
 777  *    running command.  this only works out to be the case if we have one
 778  *    command that has timed out.  if the command simply failed, it makes
 779  *    no sense to try and abort the command, since as far as the shost
 780  *    adapter is concerned, it isn't running.
 781  **/
 782 static int scsi_eh_abort_cmds(struct list_head *work_q,
 783                               struct list_head *done_q)
 784 {
 785         struct list_head *lh, *lh_sf;
 786         struct scsi_cmnd *scmd;
 787         int rtn;
 788
 789         list_for_each_safe(lh, lh_sf, work_q) {
 790                 scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
 791                 if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD))
 792                         continue;
 793                 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:"
 794                                                   "0x%p\n", current->comm,
 795                                                   scmd));
 796                 rtn = scsi_try_to_abort_cmd(scmd);
 797                 if (rtn == SUCCESS) {
 798                         scsi_eh_eflags_clr(scmd,  SCSI_EH_CANCEL_CMD);
 799                         if (!scsi_device_online(scmd->device) ||
 800                             !scsi_eh_tur(scmd)) {
 801                                 scsi_eh_finish_cmd(scmd, done_q);
 802                         }
 803
 804                 } else
 805                         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
 806                                                           " cmd failed:"
 807                                                           "0x%p\n",
 808                                                           current->comm,
 809                                                           scmd));
 810         }
 811
 812         return list_empty(work_q);
 813 }
 814
 815 /**
 816  * scsi_try_bus_device_reset - Ask host to perform a BDR on a dev
 817  * @scmd:       SCSI cmd used to send BDR
 818  *
 819  * Notes:
 820  *    There is no timeout for this operation.  if this operation is
 821  *    unreliable for a given host, then the host itself needs to put a
 822  *    timer on it, and set the host back to a consistent state prior to
 823  *    returning.
 824  **/
 825 static int scsi_try_bus_device_reset(struct scsi_cmnd *scmd)
 826 {
 827         unsigned long flags;
 828         int rtn = FAILED;
 829
 830         if (!scmd->device->host->hostt->eh_device_reset_handler)
 831                 return rtn;
 832
 833         scmd->owner = SCSI_OWNER_LOWLEVEL;
 834
 835         spin_lock_irqsave(scmd->device->host->host_lock, flags);
 836         rtn = scmd->device->host->hostt->eh_device_reset_handler(scmd);
 837         spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
 838
 839         if (rtn == SUCCESS) {
 840                 scmd->device->was_reset = 1;
 841                 scmd->device->expecting_cc_ua = 1;
 842         }
 843
 844         return rtn;
 845 }
 846
 847 /**
 848  * scsi_eh_try_stu - Send START_UNIT to device.
 849  * @scmd:       Scsi cmd to send START_UNIT
 850  *
 851  * Return value:
 852  *    0 - Device is ready. 1 - Device NOT ready.
 853  **/
 854 static int scsi_eh_try_stu(struct scsi_cmnd *scmd)
 855 {
 856         static unsigned char stu_command[6] = {START_STOP, 0, 0, 0, 1, 0};
 857         int rtn;
 858
 859         if (!scmd->device->allow_restart)
 860                 return 1;
 861
 862         memcpy(scmd->cmnd, stu_command, sizeof(stu_command));
 863
 864         /*
 865          * zero the sense buffer.  the scsi spec mandates that any
 866          * untransferred sense data should be interpreted as being zero.
 867          */
 868         memset(scmd->sense_buffer, 0, sizeof(scmd->sense_buffer));
 869
 870         scmd->request_buffer = NULL;
 871         scmd->request_bufflen = 0;
 872         scmd->use_sg = 0;
 873         scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
 874         scmd->underflow = 0;
 875         scmd->sc_data_direction = DMA_NONE;
 876
 877         rtn = scsi_send_eh_cmnd(scmd, START_UNIT_TIMEOUT);
 878
 879         /*
 880          * when we eventually call scsi_finish, we really wish to complete
 881          * the original request, so let's restore the original data. (db)
 882          */
 883         scsi_setup_cmd_retry(scmd);
 884
 885         /*
 886          * hey, we are done.  let's look to see what happened.
 887          */
 888         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd %p rtn %x\n",
 889                 __FUNCTION__, scmd, rtn));
 890         if (rtn == SUCCESS)
 891                 return 0;
 892         return 1;
 893 }
 894
 895  /**
 896  * scsi_eh_stu - send START_UNIT if needed
 897  * @shost:      scsi host being recovered.
 898  * @eh_done_q:  list_head for processed commands.
 899  *
 900  * Notes:
 901  *    If commands are failing due to not ready, initializing command required,
 902  *      try revalidating the device, which will end up sending a start unit.
 903  **/
 904 static int scsi_eh_stu(struct Scsi_Host *shost,
 905                               struct list_head *work_q,
 906                               struct list_head *done_q)
 907 {
 908         struct list_head *lh, *lh_sf;
 909         struct scsi_cmnd *scmd, *stu_scmd;
 910         struct scsi_device *sdev;
 911
 912         shost_for_each_device(sdev, shost) {
 913                 stu_scmd = NULL;
 914                 list_for_each_entry(scmd, work_q, eh_entry)
 915                         if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) &&
 916                             scsi_check_sense(scmd) == FAILED ) {
 917                                 stu_scmd = scmd;
 918                                 break;
 919                         }
 920
 921                 if (!stu_scmd)
 922                         continue;
 923
 924                 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending START_UNIT to sdev:"
 925                                                   " 0x%p\n", current->comm, sdev));
 926
 927                 if (!scsi_eh_try_stu(stu_scmd)) {
 928                         if (!scsi_device_online(sdev) ||
 929                             !scsi_eh_tur(stu_scmd)) {
 930                                 list_for_each_safe(lh, lh_sf, work_q) {
 931                                         scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
 932                                         if (scmd->device == sdev)
 933                                                 scsi_eh_finish_cmd(scmd, done_q);
 934                                 }
 935                         }
 936                 } else {
 937                         SCSI_LOG_ERROR_RECOVERY(3,
 938                                                 printk("%s: START_UNIT failed to sdev:"
 939                                                        " 0x%p\n", current->comm, sdev));
 940                 }
 941         }
 942
 943         return list_empty(work_q);
 944 }
 945
 946
 947 /**
 948  * scsi_eh_bus_device_reset - send bdr if needed
 949  * @shost:      scsi host being recovered.
 950  * @eh_done_q:  list_head for processed commands.
 951  *
 952  * Notes:
 953  *    Try a bus device reset.  still, look to see whether we have multiple
 954  *    devices that are jammed or not - if we have multiple devices, it
 955  *    makes no sense to try bus_device_reset - we really would need to try
 956  *    a bus_reset instead.
 957  **/
 958 static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
 959                                     struct list_head *work_q,
 960                                     struct list_head *done_q)
 961 {
 962         struct list_head *lh, *lh_sf;
 963         struct scsi_cmnd *scmd, *bdr_scmd;
 964         struct scsi_device *sdev;
 965         int rtn;
 966
 967         shost_for_each_device(sdev, shost) {
 968                 bdr_scmd = NULL;
 969                 list_for_each_entry(scmd, work_q, eh_entry)
 970                         if (scmd->device == sdev) {
 971                                 bdr_scmd = scmd;
 972                                 break;
 973                         }
 974
 975                 if (!bdr_scmd)
 976                         continue;
 977
 978                 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BDR sdev:"
 979                                                   " 0x%p\n", current->comm,
 980                                                   sdev));
 981                 rtn = scsi_try_bus_device_reset(bdr_scmd);
 982                 if (rtn == SUCCESS) {
 983                         if (!scsi_device_online(sdev) ||
 984                             !scsi_eh_tur(bdr_scmd)) {
 985                                 list_for_each_safe(lh, lh_sf,
 986                                                    work_q) {
 987                                         scmd = list_entry(lh, struct
 988                                                           scsi_cmnd,
 989                                                           eh_entry);
 990                                         if (scmd->device == sdev)
 991                                                 scsi_eh_finish_cmd(scmd,
 992                                                                    done_q);
 993                                 }
 994                         }
 995                 } else {
 996                         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BDR"
 997                                                           " failed sdev:"
 998                                                           "0x%p\n",
 999                                                           current->comm,
1000                                                            sdev));
1001                 }
1002         }
1003
1004         return list_empty(work_q);
1005 }
1006
1007 /**
1008  * scsi_try_bus_reset - ask host to perform a bus reset
1009  * @scmd:       SCSI cmd to send bus reset.
1010  **/
1011 static int scsi_try_bus_reset(struct scsi_cmnd *scmd)
1012 {
1013         unsigned long flags;
1014         int rtn;
1015
1016         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Snd Bus RST\n",
1017                                           __FUNCTION__));
1018         scmd->owner = SCSI_OWNER_LOWLEVEL;
1019         scmd->serial_number_at_timeout = scmd->serial_number;
1020
1021         if (!scmd->device->host->hostt->eh_bus_reset_handler)
1022                 return FAILED;
1023
1024         spin_lock_irqsave(scmd->device->host->host_lock, flags);
1025         rtn = scmd->device->host->hostt->eh_bus_reset_handler(scmd);
1026         spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
1027
1028         if (rtn == SUCCESS) {
1029                 scsi_sleep(BUS_RESET_SETTLE_TIME);
1030                 spin_lock_irqsave(scmd->device->host->host_lock, flags);
1031                 scsi_report_bus_reset(scmd->device->host, scmd->device->channel);
1032                 spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
1033         }
1034
1035         return rtn;
1036 }
1037
1038 /**
1039  * scsi_try_host_reset - ask host adapter to reset itself
1040  * @scmd:       SCSI cmd to send hsot reset.
1041  **/
1042 static int scsi_try_host_reset(struct scsi_cmnd *scmd)
1043 {
1044         unsigned long flags;
1045         int rtn;
1046
1047         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Snd Host RST\n",
1048                                           __FUNCTION__));
1049         scmd->owner = SCSI_OWNER_LOWLEVEL;
1050         scmd->serial_number_at_timeout = scmd->serial_number;
1051
1052         if (!scmd->device->host->hostt->eh_host_reset_handler)
1053                 return FAILED;
1054
1055         spin_lock_irqsave(scmd->device->host->host_lock, flags);
1056         rtn = scmd->device->host->hostt->eh_host_reset_handler(scmd);
1057         spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
1058
1059         if (rtn == SUCCESS) {
1060                 scsi_sleep(HOST_RESET_SETTLE_TIME);
1061                 spin_lock_irqsave(scmd->device->host->host_lock, flags);
1062                 scsi_report_bus_reset(scmd->device->host, scmd->device->channel);
1063                 spin_unlock_irqrestore(scmd->device->host->host_lock, flags);
1064         }
1065
1066         return rtn;
1067 }
1068
1069 /**
1070  * scsi_eh_bus_reset - send a bus reset
1071  * @shost:      scsi host being recovered.
1072  * @eh_done_q:  list_head for processed commands.
1073  **/
1074 static int scsi_eh_bus_reset(struct Scsi_Host *shost,
1075                              struct list_head *work_q,
1076                              struct list_head *done_q)
1077 {
1078         struct list_head *lh, *lh_sf;
1079         struct scsi_cmnd *scmd;
1080         struct scsi_cmnd *chan_scmd;
1081         unsigned int channel;
1082         int rtn;
1083
1084         /*
1085          * we really want to loop over the various channels, and do this on
1086          * a channel by channel basis.  we should also check to see if any
1087          * of the failed commands are on soft_reset devices, and if so, skip
1088          * the reset.
1089          */
1090
1091         for (channel = 0; channel <= shost->max_channel; channel++) {
1092                 chan_scmd = NULL;
1093                 list_for_each_entry(scmd, work_q, eh_entry) {
1094                         if (channel == scmd->device->channel) {
1095                                 chan_scmd = scmd;
1096                                 break;
1097                                 /*
1098                                  * FIXME add back in some support for
1099                                  * soft_reset devices.
1100                                  */
1101                         }
1102                 }
1103
1104                 if (!chan_scmd)
1105                         continue;
1106                 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BRST chan:"
1107                                                   " %d\n", current->comm,
1108                                                   channel));
1109                 rtn = scsi_try_bus_reset(chan_scmd);
1110                 if (rtn == SUCCESS) {
1111                         list_for_each_safe(lh, lh_sf, work_q) {
1112                                 scmd = list_entry(lh, struct scsi_cmnd,
1113                                                   eh_entry);
1114                                 if (channel == scmd->device->channel)
1115                                         if (!scsi_device_online(scmd->device) ||
1116                                             !scsi_eh_tur(scmd))
1117                                                 scsi_eh_finish_cmd(scmd,
1118                                                                    done_q);
1119                         }
1120                 } else {
1121                         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
1122                                                           " failed chan: %d\n",
1123                                                           current->comm,
1124                                                           channel));
1125                 }
1126         }
1127         return list_empty(work_q);
1128 }
1129
1130 /**
1131  * scsi_eh_host_reset - send a host reset
1132  * @work_q:     list_head for processed commands.
1133  * @done_q:     list_head for processed commands.
1134  **/
1135 static int scsi_eh_host_reset(struct list_head *work_q,
1136                               struct list_head *done_q)
1137 {
1138         int rtn;
1139         struct list_head *lh, *lh_sf;
1140         struct scsi_cmnd *scmd;
1141
1142         if (!list_empty(work_q)) {
1143                 scmd = list_entry(work_q->next,
1144                                   struct scsi_cmnd, eh_entry);
1145
1146                 SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending HRST\n"
1147                                                   , current->comm));
1148
1149                 rtn = scsi_try_host_reset(scmd);
1150                 if (rtn == SUCCESS) {
1151                         list_for_each_safe(lh, lh_sf, work_q) {
1152                                 scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
1153                                 if (!scsi_device_online(scmd->device) ||
1154                                     (!scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) ||
1155                                     !scsi_eh_tur(scmd))
1156                                         scsi_eh_finish_cmd(scmd, done_q);
1157                         }
1158                 } else {
1159                         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: HRST"
1160                                                           " failed\n",
1161                                                           current->comm));
1162                 }
1163         }
1164         return list_empty(work_q);
1165 }
1166
1167 /**
1168  * scsi_eh_offline_sdevs - offline scsi devices that fail to recover
1169  * @work_q:     list_head for processed commands.
1170  * @done_q:     list_head for processed commands.
1171  *
1172  **/
1173 static void scsi_eh_offline_sdevs(struct list_head *work_q,
1174                                   struct list_head *done_q)
1175 {
1176         struct list_head *lh, *lh_sf;
1177         struct scsi_cmnd *scmd;
1178
1179         list_for_each_safe(lh, lh_sf, work_q) {
1180                 scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
1181                 printk(KERN_INFO "scsi: Device offlined - not"
1182                                 " ready after error recovery: host"
1183                                 " %d channel %d id %d lun %d\n",
1184                                 scmd->device->host->host_no,
1185                                 scmd->device->channel,
1186                                 scmd->device->id,
1187                                 scmd->device->lun);
1188                 scsi_device_set_state(scmd->device, SDEV_OFFLINE);
1189                 if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD)) {
1190                         /*
1191                          * FIXME: Handle lost cmds.
1192                          */
1193                 }
1194                 scsi_eh_finish_cmd(scmd, done_q);
1195         }
1196         return;
1197 }
1198
/**
 * scsi_sleep_done - timer function for scsi_sleep
 * @data:	address of the semaphore to signal, cast to unsigned long
 *
 * Notes:
 *    A zero @data is tolerated and results in no action.
 **/
static void scsi_sleep_done(unsigned long data)
{
	struct semaphore *sem = (struct semaphore *)data;

	if (!sem)
		return;

	up(sem);
}
1211
1212 /**
1213  * scsi_sleep - sleep for specified timeout
1214  * @timeout:    timeout value
1215  *
1216  **/
1217 void scsi_sleep(int timeout)
1218 {
1219         DECLARE_MUTEX_LOCKED(sem);
1220         struct timer_list timer;
1221
1222         init_timer(&timer);
1223         timer.data = (unsigned long)&sem;
1224         timer.expires = jiffies + timeout;
1225         timer.function = (void (*)(unsigned long))scsi_sleep_done;
1226
1227         SCSI_LOG_ERROR_RECOVERY(5, printk("sleeping for timer tics %d\n",
1228                                           timeout));
1229
1230         add_timer(&timer);
1231
1232         down(&sem);
1233         del_timer(&timer);
1234 }
1235
1236 /**
1237  * scsi_decide_disposition - Disposition a cmd on return from LLD.
1238  * @scmd:       SCSI cmd to examine.
1239  *
1240  * Notes:
1241  *    This is *only* called when we are examining the status after sending
1242  *    out the actual data command.  any commands that are queued for error
1243  *    recovery (e.g. test_unit_ready) do *not* come through here.
1244  *
1245  *    When this routine returns failed, it means the error handler thread
1246  *    is woken.  In cases where the error code indicates an error that
1247  *    doesn't require the error handler read (i.e. we don't need to
1248  *    abort/reset), this function should return SUCCESS.
1249  **/
1250 int scsi_decide_disposition(struct scsi_cmnd *scmd)
1251 {
1252         int rtn;
1253
1254         /*
1255          * if the device is offline, then we clearly just pass the result back
1256          * up to the top level.
1257          */
1258         if (!scsi_device_online(scmd->device)) {
1259                 SCSI_LOG_ERROR_RECOVERY(5, printk("%s: device offline - report"
1260                                                   " as SUCCESS\n",
1261                                                   __FUNCTION__));
1262                 return SUCCESS;
1263         }
1264
1265         /*
1266          * first check the host byte, to see if there is anything in there
1267          * that would indicate what we need to do.
1268          */
1269         switch (host_byte(scmd->result)) {
1270         case DID_PASSTHROUGH:
1271                 /*
1272                  * no matter what, pass this through to the upper layer.
1273                  * nuke this special code so that it looks like we are saying
1274                  * did_ok.
1275                  */
1276                 scmd->result &= 0xff00ffff;
1277                 return SUCCESS;
1278         case DID_OK:
1279                 /*
1280                  * looks good.  drop through, and check the next byte.
1281                  */
1282                 break;
1283         case DID_NO_CONNECT:
1284         case DID_BAD_TARGET:
1285         case DID_ABORT:
1286                 /*
1287                  * note - this means that we just report the status back
1288                  * to the top level driver, not that we actually think
1289                  * that it indicates SUCCESS.
1290                  */
1291                 return SUCCESS;
1292                 /*
1293                  * when the low level driver returns did_soft_error,
1294                  * it is responsible for keeping an internal retry counter
1295                  * in order to avoid endless loops (db)
1296                  *
1297                  * actually this is a bug in this function here.  we should
1298                  * be mindful of the maximum number of retries specified
1299                  * and not get stuck in a loop.
1300                  */
1301         case DID_SOFT_ERROR:
1302                 goto maybe_retry;
1303         case DID_IMM_RETRY:
1304                 return NEEDS_RETRY;
1305
1306         case DID_ERROR:
1307                 if (msg_byte(scmd->result) == COMMAND_COMPLETE &&
1308                     status_byte(scmd->result) == RESERVATION_CONFLICT)
1309                         /*
1310                          * execute reservation conflict processing code
1311                          * lower down
1312                          */
1313                         break;
1314                 /* fallthrough */
1315
1316         case DID_BUS_BUSY:
1317         case DID_PARITY:
1318                 goto maybe_retry;
1319         case DID_TIME_OUT:
1320                 /*
1321                  * when we scan the bus, we get timeout messages for
1322                  * these commands if there is no device available.
1323                  * other hosts report did_no_connect for the same thing.
1324                  */
1325                 if ((scmd->cmnd[0] == TEST_UNIT_READY ||
1326                      scmd->cmnd[0] == INQUIRY)) {
1327                         return SUCCESS;
1328                 } else {
1329                         return FAILED;
1330                 }
1331         case DID_RESET:
1332                 return SUCCESS;
1333         default:
1334                 return FAILED;
1335         }
1336
1337         /*
1338          * next, check the message byte.
1339          */
1340         if (msg_byte(scmd->result) != COMMAND_COMPLETE)
1341                 return FAILED;
1342
1343         /*
1344          * check the status byte to see if this indicates anything special.
1345          */
1346         switch (status_byte(scmd->result)) {
1347         case QUEUE_FULL:
1348                 /*
1349                  * the case of trying to send too many commands to a
1350                  * tagged queueing device.
1351                  */
1352         case BUSY:
1353                 /*
1354                  * device can't talk to us at the moment.  Should only
1355                  * occur (SAM-3) when the task queue is empty, so will cause
1356                  * the empty queue handling to trigger a stall in the
1357                  * device.
1358                  */
1359                 return ADD_TO_MLQUEUE;
1360         case GOOD:
1361         case COMMAND_TERMINATED:
1362                 return SUCCESS;
1363         case CHECK_CONDITION:
1364                 rtn = scsi_check_sense(scmd);
1365                 if (rtn == NEEDS_RETRY)
1366                         goto maybe_retry;
1367                 /* if rtn == FAILED, we have no sense information;
1368                  * returning FAILED will wake the error handler thread
1369                  * to collect the sense and redo the decide
1370                  * disposition */
1371                 return rtn;
1372         case CONDITION_GOOD:
1373         case INTERMEDIATE_GOOD:
1374         case INTERMEDIATE_C_GOOD:
1375                 /*
1376                  * who knows?  FIXME(eric)
1377                  */
1378                 return SUCCESS;
1379
1380         case RESERVATION_CONFLICT:
1381                 printk("scsi%d (%d,%d,%d) : reservation conflict\n",
1382                        scmd->device->host->host_no, scmd->device->channel,
1383                        scmd->device->id, scmd->device->lun);
1384                 return SUCCESS; /* causes immediate i/o error */
1385         default:
1386                 return FAILED;
1387         }
1388         return FAILED;
1389
1390       maybe_retry:
1391
1392         /* we requeue for retry because the error was retryable, and
1393          * the request was not marked fast fail.  Note that above,
1394          * even if the request is marked fast fail, we still requeue
1395          * for queue congestion conditions (QUEUE_FULL or BUSY) */
1396         if ((++scmd->retries) < scmd->allowed
1397             && !blk_noretry_request(scmd->request)) {
1398                 return NEEDS_RETRY;
1399         } else {
1400                 /*
1401                  * no more retries - report this one back to upper level.
1402                  */
1403                 return SUCCESS;
1404         }
1405 }
1406
1407 /**
1408  * scsi_eh_lock_done - done function for eh door lock request
1409  * @scmd:       SCSI command block for the door lock request
1410  *
1411  * Notes:
1412  *      We completed the asynchronous door lock request, and it has either
1413  *      locked the door or failed.  We must free the command structures
1414  *      associated with this request.
1415  **/
1416 static void scsi_eh_lock_done(struct scsi_cmnd *scmd)
1417 {
1418         struct scsi_request *sreq = scmd->sc_request;
1419
1420         scsi_release_request(sreq);
1421 }
1422
1423
1424 /**
1425  * scsi_eh_lock_door - Prevent medium removal for the specified device
1426  * @sdev:       SCSI device to prevent medium removal
1427  *
1428  * Locking:
1429  *      We must be called from process context; scsi_allocate_request()
1430  *      may sleep.
1431  *
1432  * Notes:
1433  *      We queue up an asynchronous "ALLOW MEDIUM REMOVAL" request on the
1434  *      head of the devices request queue, and continue.
1435  *
1436  * Bugs:
1437  *      scsi_allocate_request() may sleep waiting for existing requests to
1438  *      be processed.  However, since we haven't kicked off any request
1439  *      processing for this host, this may deadlock.
1440  *
1441  *      If scsi_allocate_request() fails for what ever reason, we
1442  *      completely forget to lock the door.
1443  **/
1444 static void scsi_eh_lock_door(struct scsi_device *sdev)
1445 {
1446         struct scsi_request *sreq = scsi_allocate_request(sdev, GFP_KERNEL);
1447
1448         if (unlikely(!sreq)) {
1449                 printk(KERN_ERR "%s: request allocate failed,"
1450                        "prevent media removal cmd not sent\n", __FUNCTION__);
1451                 return;
1452         }
1453
1454         sreq->sr_cmnd[0] = ALLOW_MEDIUM_REMOVAL;
1455         sreq->sr_cmnd[1] = 0;
1456         sreq->sr_cmnd[2] = 0;
1457         sreq->sr_cmnd[3] = 0;
1458         sreq->sr_cmnd[4] = SCSI_REMOVAL_PREVENT;
1459         sreq->sr_cmnd[5] = 0;
1460         sreq->sr_data_direction = DMA_NONE;
1461         sreq->sr_bufflen = 0;
1462         sreq->sr_buffer = NULL;
1463         sreq->sr_allowed = 5;
1464         sreq->sr_done = scsi_eh_lock_done;
1465         sreq->sr_timeout_per_command = 10 * HZ;
1466         sreq->sr_cmd_len = COMMAND_SIZE(sreq->sr_cmnd[0]);
1467
1468         scsi_insert_special_req(sreq, 1);
1469 }
1470
1471
1472 /**
1473  * scsi_restart_operations - restart io operations to the specified host.
1474  * @shost:      Host we are restarting.
1475  *
1476  * Notes:
1477  *    When we entered the error handler, we blocked all further i/o to
1478  *    this device.  we need to 'reverse' this process.
1479  **/
1480 static void scsi_restart_operations(struct Scsi_Host *shost)
1481 {
1482         struct scsi_device *sdev;
1483
1484         /*
1485          * If the door was locked, we need to insert a door lock request
1486          * onto the head of the SCSI request queue for the device.  There
1487          * is no point trying to lock the door of an off-line device.
1488          */
1489         shost_for_each_device(sdev, shost) {
1490                 if (scsi_device_online(sdev) && sdev->locked)
1491                         scsi_eh_lock_door(sdev);
1492         }
1493
1494         /*
1495          * next free up anything directly waiting upon the host.  this
1496          * will be requests for character device operations, and also for
1497          * ioctls to queued block devices.
1498          */
1499         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: waking up host to restart\n",
1500                                           __FUNCTION__));
1501
1502         clear_bit(SHOST_RECOVERY, &shost->shost_state);
1503
1504         wake_up(&shost->host_wait);
1505
1506         /*
1507          * finally we need to re-initiate requests that may be pending.  we will
1508          * have had everything blocked while error handling is taking place, and
1509          * now that error recovery is done, we will need to ensure that these
1510          * requests are started.
1511          */
1512         scsi_run_host_queues(shost);
1513 }
1514
/**
 * scsi_eh_ready_devs - check device ready state and recover if not.
 * @shost:	host to be recovered.
 * @work_q:	list_head of commands still awaiting recovery.
 * @done_q:	list_head for processed commands.
 *
 * Notes:
 *    Escalates step by step: START_UNIT, bus device reset, bus reset,
 *    host reset.  Each step returns non-zero once @work_q is empty, which
 *    ends the escalation.  Devices with commands still left after the
 *    host reset are taken offline.
 **/
static void scsi_eh_ready_devs(struct Scsi_Host *shost,
			       struct list_head *work_q,
			       struct list_head *done_q)
{
	if (scsi_eh_stu(shost, work_q, done_q))
		return;

	if (scsi_eh_bus_device_reset(shost, work_q, done_q))
		return;

	if (scsi_eh_bus_reset(shost, work_q, done_q))
		return;

	if (scsi_eh_host_reset(work_q, done_q))
		return;

	scsi_eh_offline_sdevs(work_q, done_q);
}
1531
1532 /**
1533  * scsi_eh_flush_done_q - finish processed commands or retry them.
1534  * @done_q:     list_head of processed commands.
1535  *
1536  **/
1537 static void scsi_eh_flush_done_q(struct list_head *done_q)
1538 {
1539         struct list_head *lh, *lh_sf;
1540         struct scsi_cmnd *scmd;
1541
1542         list_for_each_safe(lh, lh_sf, done_q) {
1543                 scmd = list_entry(lh, struct scsi_cmnd, eh_entry);
1544                 list_del_init(lh);
1545                 if (scsi_device_online(scmd->device) &&
1546                     !blk_noretry_request(scmd->request) &&
1547                     (++scmd->retries < scmd->allowed)) {
1548                         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush"
1549                                                           " retry cmd: %p\n",
1550                                                           current->comm,
1551                                                           scmd));
1552                                 scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
1553                 } else {
1554                         if (!scmd->result)
1555                                 scmd->result |= (DRIVER_TIMEOUT << 24);
1556                         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush finish"
1557                                                         " cmd: %p\n",
1558                                                         current->comm, scmd));
1559                         scsi_finish_command(scmd);
1560                 }
1561         }
1562 }
1563
1564 /**
1565  * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
1566  * @shost:      Host to unjam.
1567  *
1568  * Notes:
1569  *    When we come in here, we *know* that all commands on the bus have
1570  *    either completed, failed or timed out.  we also know that no further
1571  *    commands are being sent to the host, so things are relatively quiet
1572  *    and we have freedom to fiddle with things as we wish.
1573  *
1574  *    This is only the *default* implementation.  it is possible for
1575  *    individual drivers to supply their own version of this function, and
1576  *    if the maintainer wishes to do this, it is strongly suggested that
1577  *    this function be taken as a template and modified.  this function
1578  *    was designed to correctly handle problems for about 95% of the
1579  *    different cases out there, and it should always provide at least a
1580  *    reasonable amount of error recovery.
1581  *
1582  *    Any command marked 'failed' or 'timeout' must eventually have
1583  *    scsi_finish_cmd() called for it.  we do all of the retry stuff
1584  *    here, so when we restart the host after we return it should have an
1585  *    empty queue.
1586  **/
1587 static void scsi_unjam_host(struct Scsi_Host *shost)
1588 {
1589         unsigned long flags;
1590         LIST_HEAD(eh_work_q);
1591         LIST_HEAD(eh_done_q);
1592
1593         spin_lock_irqsave(shost->host_lock, flags);
1594         list_splice_init(&shost->eh_cmd_q, &eh_work_q);
1595         spin_unlock_irqrestore(shost->host_lock, flags);
1596
1597         SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
1598
1599         if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
1600                 if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
1601                         scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
1602
1603         scsi_eh_flush_done_q(&eh_done_q);
1604 }
1605
1606 /**
1607  * scsi_error_handler - Handle errors/timeouts of SCSI cmds.
1608  * @data:       Host for which we are running.
1609  *
1610  * Notes:
1611  *    This is always run in the context of a kernel thread.  The idea is
1612  *    that we start this thing up when the kernel starts up (one per host
1613  *    that we detect), and it immediately goes to sleep and waits for some
1614  *    event (i.e. failure).  When this takes place, we have the job of
1615  *    trying to unjam the bus and restarting things.
1616  **/
1617 int scsi_error_handler(void *data)
1618 {
1619         struct Scsi_Host *shost = (struct Scsi_Host *) data;
1620         int rtn;
1621         DECLARE_MUTEX_LOCKED(sem);
1622
1623         lock_kernel();
1624
1625         /*
1626          *    Flush resources
1627          */
1628
1629         daemonize("scsi_eh_%d", shost->host_no);
1630
1631         current->flags |= PF_NOFREEZE;
1632
1633         shost->eh_wait = &sem;
1634         shost->ehandler = current;
1635
1636         unlock_kernel();
1637
1638         /*
1639          * Wake up the thread that created us.
1640          */
1641         SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
1642                                           " scsi_eh_%d\n",shost->host_no));
1643
1644         complete(shost->eh_notify);
1645
1646         while (1) {
1647                 /*
1648                  * If we get a signal, it means we are supposed to go
1649                  * away and die.  This typically happens if the user is
1650                  * trying to unload a module.
1651                  */
1652                 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1653                                                   " scsi_eh_%d"
1654                                                   " sleeping\n",shost->host_no));
1655
1656                 /*
1657                  * Note - we always use down_interruptible with the semaphore
1658                  * even if the module was loaded as part of the kernel.  The
1659                  * reason is that down() will cause this thread to be counted
1660                  * in the load average as a running process, and down
1661                  * interruptible doesn't.  Given that we need to allow this
1662                  * thread to die if the driver was loaded as a module, using
1663                  * semaphores isn't unreasonable.
1664                  */
1665                 down_interruptible(&sem);
1666                 if (shost->eh_kill)
1667                         break;
1668
1669                 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1670                                                   " scsi_eh_%d waking"
1671                                                   " up\n",shost->host_no));
1672
1673                 shost->eh_active = 1;
1674
1675                 /*
1676                  * We have a host that is failing for some reason.  Figure out
1677                  * what we need to do to get it up and online again (if we can).
1678                  * If we fail, we end up taking the thing offline.
1679                  */
1680                 if (shost->hostt->eh_strategy_handler)
1681                         rtn = shost->hostt->eh_strategy_handler(shost);
1682                 else
1683                         scsi_unjam_host(shost);
1684
1685                 shost->eh_active = 0;
1686
1687                 /*
1688                  * Note - if the above fails completely, the action is to take
1689                  * individual devices offline and flush the queue of any
1690                  * outstanding requests that may have been pending.  When we
1691                  * restart, we restart any I/O to any other devices on the bus
1692                  * which are still online.
1693                  */
1694                 scsi_restart_operations(shost);
1695
1696         }
1697
1698         SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
1699                                           " exiting\n",shost->host_no));
1700
1701         /*
1702          * Make sure that nobody tries to wake us up again.
1703          */
1704         shost->eh_wait = NULL;
1705
1706         /*
1707          * Knock this down too.  From this point on, the host is flying
1708          * without a pilot.  If this is because the module is being unloaded,
1709          * that's fine.  If the user sent a signal to this thing, we are
1710          * potentially in real danger.
1711          */
1712         shost->eh_active = 0;
1713         shost->ehandler = NULL;
1714
1715         /*
1716          * If anyone is waiting for us to exit (i.e. someone trying to unload
1717          * a driver), then wake up that process to let them know we are on
1718          * the way out the door.
1719          */
1720         complete_and_exit(shost->eh_notify, 0);
1721         return 0;
1722 }
1723
1724 /*
1725  * Function:    scsi_report_bus_reset()
1726  *
1727  * Purpose:     Utility function used by low-level drivers to report that
1728  *              they have observed a bus reset on the bus being handled.
1729  *
1730  * Arguments:   shost       - Host in question
1731  *              channel     - channel on which reset was observed.
1732  *
1733  * Returns:     Nothing
1734  *
1735  * Lock status: Host lock must be held.
1736  *
1737  * Notes:       This only needs to be called if the reset is one which
1738  *              originates from an unknown location.  Resets originated
1739  *              by the mid-level itself don't need to call this, but there
1740  *              should be no harm.
1741  *
1742  *              The main purpose of this is to make sure that a CHECK_CONDITION
1743  *              is properly treated.
1744  */
1745 void scsi_report_bus_reset(struct Scsi_Host *shost, int channel)
1746 {
1747         struct scsi_device *sdev;
1748
1749         __shost_for_each_device(sdev, shost) {
1750                 if (channel == sdev->channel) {
1751                         sdev->was_reset = 1;
1752                         sdev->expecting_cc_ua = 1;
1753                 }
1754         }
1755 }
1756
1757 /*
1758  * Function:    scsi_report_device_reset()
1759  *
1760  * Purpose:     Utility function used by low-level drivers to report that
1761  *              they have observed a device reset on the device being handled.
1762  *
1763  * Arguments:   shost       - Host in question
1764  *              channel     - channel on which reset was observed
1765  *              target      - target on which reset was observed
1766  *
1767  * Returns:     Nothing
1768  *
1769  * Lock status: Host lock must be held
1770  *
1771  * Notes:       This only needs to be called if the reset is one which
1772  *              originates from an unknown location.  Resets originated
1773  *              by the mid-level itself don't need to call this, but there
1774  *              should be no harm.
1775  *
1776  *              The main purpose of this is to make sure that a CHECK_CONDITION
1777  *              is properly treated.
1778  */
1779 void scsi_report_device_reset(struct Scsi_Host *shost, int channel, int target)
1780 {
1781         struct scsi_device *sdev;
1782
1783         __shost_for_each_device(sdev, shost) {
1784                 if (channel == sdev->channel &&
1785                     target == sdev->id) {
1786                         sdev->was_reset = 1;
1787                         sdev->expecting_cc_ua = 1;
1788                 }
1789         }
1790 }
1791
/*
 * Completion callback that scsi_reset_provider() installs as scsi_done on
 * the command it builds.  It has no work to do.
 */
static void scsi_reset_provider_done_command(struct scsi_cmnd *scmd)
{
	/* nothing to do */
}
1796
1797 /*
1798  * Function:    scsi_reset_provider
1799  *
1800  * Purpose:     Send requested reset to a bus or device at any phase.
1801  *
1802  * Arguments:   device  - device to send reset to
1803  *              flag - reset type (see scsi.h)
1804  *
1805  * Returns:     SUCCESS/FAILURE.
1806  *
1807  * Notes:       This is used by the SCSI Generic driver to provide
1808  *              Bus/Device reset capability.
1809  */
1810 int
1811 scsi_reset_provider(struct scsi_device *dev, int flag)
1812 {
1813         struct scsi_cmnd *scmd = scsi_get_command(dev, GFP_KERNEL);
1814         struct request req;
1815         int rtn;
1816
1817         scmd->request = &req;
1818         memset(&scmd->eh_timeout, 0, sizeof(scmd->eh_timeout));
1819         scmd->request->rq_status        = RQ_SCSI_BUSY;
1820         scmd->state                     = SCSI_STATE_INITIALIZING;
1821         scmd->owner                     = SCSI_OWNER_MIDLEVEL;
1822
1823         memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
1824
1825         scmd->scsi_done         = scsi_reset_provider_done_command;
1826         scmd->done                      = NULL;
1827         scmd->buffer                    = NULL;
1828         scmd->bufflen                   = 0;
1829         scmd->request_buffer            = NULL;
1830         scmd->request_bufflen           = 0;
1831         scmd->internal_timeout          = NORMAL_TIMEOUT;
1832         scmd->abort_reason              = DID_ABORT;
1833
1834         scmd->cmd_len                   = 0;
1835
1836         scmd->sc_data_direction         = DMA_BIDIRECTIONAL;
1837         scmd->sc_request                = NULL;
1838         scmd->sc_magic                  = SCSI_CMND_MAGIC;
1839
1840         init_timer(&scmd->eh_timeout);
1841
1842         /*
1843          * Sometimes the command can get back into the timer chain,
1844          * so use the pid as an identifier.
1845          */
1846         scmd->pid                       = 0;
1847
1848         switch (flag) {
1849         case SCSI_TRY_RESET_DEVICE:
1850                 rtn = scsi_try_bus_device_reset(scmd);
1851                 if (rtn == SUCCESS)
1852                         break;
1853                 /* FALLTHROUGH */
1854         case SCSI_TRY_RESET_BUS:
1855                 rtn = scsi_try_bus_reset(scmd);
1856                 if (rtn == SUCCESS)
1857                         break;
1858                 /* FALLTHROUGH */
1859         case SCSI_TRY_RESET_HOST:
1860                 rtn = scsi_try_host_reset(scmd);
1861                 break;
1862         default:
1863                 rtn = FAILED;
1864         }
1865
1866         scsi_delete_timer(scmd);
1867         scsi_next_command(scmd);
1868         return rtn;
1869 }