2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
6 * Copyright (C) 1992 - 1997, 2000,2002-2003 Silicon Graphics, Inc. All rights reserved.
10 #include <linux/types.h>
11 #include <linux/slab.h>
12 #include <linux/irq.h>
16 #include <asm/delay.h>
17 #include <asm/sn/sgi.h>
18 #include <asm/sn/io.h>
19 #include <asm/sn/hcl.h>
20 #include <asm/sn/labelcl.h>
21 #include <asm/sn/sn_private.h>
22 #include <asm/sn/klconfig.h>
23 #include <asm/sn/sn_cpuid.h>
24 #include <asm/sn/pci/pciio.h>
25 #include <asm/sn/pci/pcibr.h>
26 #include <asm/sn/xtalk/xtalk.h>
27 #include <asm/sn/pci/pcibr_private.h>
28 #include <asm/sn/intr.h>
29 #include <asm/sn/ioerror_handling.h>
30 #include <asm/sn/ioerror.h>
31 #include <asm/sn/sn2/shubio.h>
32 #include <asm/sn/sn2/shub_mmr.h>
33 #include <asm/sn/bte.h>
35 extern void hubni_eint_init(cnodeid_t cnode);
36 extern void hubii_eint_init(cnodeid_t cnode);
37 extern irqreturn_t hubii_eint_handler (int irq, void *arg, struct pt_regs *ep);
38 int hubiio_crb_error_handler(vertex_hdl_t hub_v, hubinfo_t hinfo);
39 int hubiio_prb_error_handler(vertex_hdl_t hub_v, hubinfo_t hinfo);
40 extern void bte_crb_error_handler(vertex_hdl_t hub_v, int btenum, int crbnum, ioerror_t *ioe, int bteop);
41 void print_crb_fields(int crb_num, ii_icrb0_a_u_t icrba,
42 ii_icrb0_b_u_t icrbb, ii_icrb0_c_u_t icrbc,
43 ii_icrb0_d_u_t icrbd, ii_icrb0_e_u_t icrbe);
46 extern error_return_code_t error_state_set(vertex_hdl_t v,error_state_t new_state);
48 #define HUB_ERROR_PERIOD (120 * HZ) /* 2 minutes */
51 hub_error_clear(nasid_t nasid)
56 * Make sure spurious write response errors are cleared
57 * (values are from hub_set_prb())
59 for (i = 0; i <= HUB_WIDGET_ID_MAX - HUB_WIDGET_ID_MIN + 1; i++) {
62 prb.iprb_regval = REMOTE_HUB_L(nasid, IIO_IOPRB_0 + (i * sizeof(hubreg_t)));
64 /* Clear out some fields */
69 prb.iprb_xtalkctr = 3; /* approx. PIO credits for the widget */
71 REMOTE_HUB_S(nasid, IIO_IOPRB_0 + (i * sizeof(hubreg_t)), prb.iprb_regval);
74 REMOTE_HUB_S(nasid, IIO_IECLR, -1);
80 * Function : hub_error_init
81 * Purpose : initialize the error handling requirements for a given hub.
82 * Parameters : cnode, the compact nodeid.
83 * Assumptions : Called only once per hub, either by a local cpu or by a
84 * remote cpu when this hub is headless (cpuless).
89 hub_error_init(cnodeid_t cnode)
93 nasid = cnodeid_to_nasid(cnode);
94 hub_error_clear(nasid);
98 * Now setup the hub ii error interrupt handler.
101 hubii_eint_init(cnode);
107 * Function : hubii_eint_init
109 * Purpose : to initialize the hub iio error interrupt.
110 * Assumptions : Called once per hub, by the cpu which will ultimately
111 * handle this interrupt.
116 hubii_eint_init(cnodeid_t cnode)
119 ii_iidsr_u_t hubio_eint;
123 int bit_pos_to_irq(int bit);
127 hub_v = (vertex_hdl_t)cnodeid_to_vertex(cnode);
128 ASSERT_ALWAYS(hub_v);
129 hubinfo_get(hub_v, &hinfo);
132 ASSERT(hinfo->h_cnodeid == cnode);
134 ilcsr.ii_ilcsr_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_ILCSR);
135 if ((ilcsr.ii_ilcsr_fld_s.i_llp_stat & 0x2) == 0) {
137 * HUB II link is not up. Disable LLP. Clear old errors.
138 * Enable interrupts to handle BTE errors.
140 ilcsr.ii_ilcsr_fld_s.i_llp_en = 0;
141 REMOTE_HUB_S(hinfo->h_nasid, IIO_ILCSR, ilcsr.ii_ilcsr_regval);
144 /* Select a possible interrupt target where there is a free interrupt
145 * bit and also reserve the interrupt bit for this IO error interrupt
147 intr_cpu = intr_heuristic(hub_v, SGI_II_ERROR, &bit);
148 if (intr_cpu == CPU_NONE) {
149 printk("hubii_eint_init: intr_heuristic failed, cnode %d", cnode);
153 rv = intr_connect_level(intr_cpu, SGI_II_ERROR);
154 request_irq(SGI_II_ERROR, hubii_eint_handler, SA_SHIRQ, "SN_hub_error", (void *)hub_v);
155 irq_descp(bit)->status |= SN2_IRQ_PER_HUB;
156 ASSERT_ALWAYS(rv >= 0);
157 hubio_eint.ii_iidsr_regval = 0;
158 hubio_eint.ii_iidsr_fld_s.i_enable = 1;
159 hubio_eint.ii_iidsr_fld_s.i_level = bit;/* Take the least significant bits*/
160 hubio_eint.ii_iidsr_fld_s.i_node = cnodeid_to_nasid(cnode);
161 hubio_eint.ii_iidsr_fld_s.i_pi_id = cpuid_to_subnode(intr_cpu);
162 REMOTE_HUB_S(hinfo->h_nasid, IIO_IIDSR, hubio_eint.ii_iidsr_regval);
169 hubii_eint_handler (int irq, void *arg, struct pt_regs *ep)
177 /* Two levels of casting avoid a compiler warning. */
178 hub_v = (vertex_hdl_t)(long)(arg);
181 hubinfo_get(hub_v, &hinfo);
183 idsr = REMOTE_HUB_L(hinfo->h_nasid, IIO_ICMR);
186 /* ICMR bit is set .. we are getting into the "Spurious Interrupts" condition. */
187 printk("Cnode %d II has seen the ICMR condition\n", hinfo->h_cnodeid);
188 printk("***** Please file PV with the above messages *****\n");
189 /* panic("We have to panic to prevent further unknown states ..\n"); */
194 * Identify the reason for error.
196 wstat.ii_wstat_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_WSTAT);
198 if (wstat.ii_wstat_fld_s.w_crazy) {
201 * We can do a couple of things here.
202 * Look at the fields TX_MX_RTY/XT_TAIL_TO/XT_CRD_TO to check
203 * which of these caused the CRAZY bit to be set.
204 * You may be able to check if the Link is up really.
206 if (wstat.ii_wstat_fld_s.w_tx_mx_rty)
207 reason = "Micro Packet Retry Timeout";
208 else if (wstat.ii_wstat_fld_s.w_xt_tail_to)
209 reason = "Crosstalk Tail Timeout";
210 else if (wstat.ii_wstat_fld_s.w_xt_crd_to)
211 reason = "Crosstalk Credit Timeout";
215 * Check if widget 0 has been marked as shutdown, or
216 * if BTE 0/1 has been marked.
218 hubii_imem = REMOTE_HUB_L(hinfo->h_nasid, IIO_IMEM);
219 if (hubii_imem & IIO_IMEM_W0ESD)
220 reason = "Hub Widget 0 has been Shutdown";
221 else if (hubii_imem & IIO_IMEM_B0ESD)
222 reason = "BTE 0 has been shutdown";
223 else if (hubii_imem & IIO_IMEM_B1ESD)
224 reason = "BTE 1 has been shutdown";
225 else reason = "Unknown";
229 * Only print the II_ECRAZY message if there is an attached xbow.
231 if (NODEPDA(hinfo->h_cnodeid)->xbow_vhdl != 0) {
232 printk("Hub %d, cnode %d to Xtalk Link failed (II_ECRAZY) Reason: %s",
233 hinfo->h_nasid, hinfo->h_cnodeid, reason);
239 * Before processing any interrupt related information, clear all
240 * error indication and reenable interrupts. This will prevent
241 * lost interrupts due to the interrupt handler scanning past a PRB/CRB
242 * which has not errored yet and then the PRB/CRB goes into error.
243 * Note, PRB errors are cleared individually.
245 REMOTE_HUB_S(hinfo->h_nasid, IIO_IECLR, 0xff0000);
246 idsr = REMOTE_HUB_L(hinfo->h_nasid, IIO_IIDSR) & ~IIO_IIDSR_SENT_MASK;
247 REMOTE_HUB_S(hinfo->h_nasid, IIO_IIDSR, idsr);
251 * It's a toss-up as to which one among PRB/CRB to check first.
252 * Current decision is based on the severity of the errors.
253 * IO CRB errors tend to be more severe than PRB errors.
255 * It is possible for BTE errors to have been handled already, so we
256 * may not see any errors handled here.
258 (void)hubiio_crb_error_handler(hub_v, hinfo);
259 (void)hubiio_prb_error_handler(hub_v, hinfo);
265 * Free the hub CRB "crbnum" which encountered an error.
266 * Assumption is, error handling was successfully done,
267 * and we now want to return the CRB back to Hub for normal usage.
269 * In order to free the CRB, all that's needed is to de-allocate it
272 * No other processor is mucking around with the hub control register.
273 * So, upper layer has to single thread this.
276 hubiio_crb_free(hubinfo_t hinfo, int crbnum)
278 ii_icrb0_b_u_t icrbb;
281 * The hardware does NOT clear the mark bit, so it must get cleared
282 * here to be sure the error is not processed twice.
284 icrbb.ii_icrb0_b_regval = REMOTE_HUB_L(hinfo->h_nasid, IIO_ICRB_B(crbnum));
286 REMOTE_HUB_S(hinfo->h_nasid, IIO_ICRB_B(crbnum), icrbb.ii_icrb0_b_regval);
289 * Deallocate the register.
292 REMOTE_HUB_S(hinfo->h_nasid, IIO_ICDR, (IIO_ICDR_PND | crbnum));
295 * Wait till hub indicates it's done.
297 while (REMOTE_HUB_L(hinfo->h_nasid, IIO_ICDR) & IIO_ICDR_PND)
304 * Array of error names that get logged in CRBs
306 char *hubiio_crb_errors[] = {
311 "I/O Partial Write Error",
312 "I/O Partial Read Error",
318 print_crb_fields(int crb_num, ii_icrb0_a_u_t icrba,
319 ii_icrb0_b_u_t icrbb, ii_icrb0_c_u_t icrbc,
320 ii_icrb0_d_u_t icrbd, ii_icrb0_e_u_t icrbe)
322 printk("CRB %d regA\n\t"
334 printk("CRB %d regB\n\t"
335 "b_imsgtype 0x%x\n\t"
337 "\tb_use_old 0x%x\n\t"
338 "b_initiator 0x%x\n\t"
340 "\tb_ackcnt 0x%x\n\t"
346 "\tb_stall_ib 0x%x\n\t"
348 "\tb_stall_bte_0 0x%x\n\t"
349 "b_stall_bte_1 0x%x\n"
374 printk("CRB %d regC\n\t"
377 "c_cohtrans 0x%x\n\t"
392 printk("CRB %d regD\n\t"
393 "d_bteaddr 0x%lx\n\t"
404 printk("CRB %d regE\n\t"
405 "icrbe_timeout 0x%x\n\t"
406 "icrbe_context 0x%x\n\t"
407 "icrbe_toutvld 0x%x\n\t"
408 "icrbe_ctxtvld 0x%x\n\t",
413 icrbe.icrbe_ctxtvld);
417 * hubiio_crb_error_handler
419 * This routine gets invoked when a hub gets an error
420 * interrupt. So, the routine is running in interrupt context
421 * at error interrupt level.
423 * It's responsible for identifying ALL the CRBs that are marked
424 * with error, and process them.
426 * If you find the CRB that's marked with error, map this to the
427 * reason it caused error, and invoke appropriate error handler.
429 * XXX Be aware of the information in the context register.
432 * Use REMOTE_HUB_* macro instead of LOCAL_HUB_* so that the interrupt
433 * handler can be run on any node. (not necessarily the node
434 * corresponding to the hub that encountered error).
438 hubiio_crb_error_handler(vertex_hdl_t hub_v, hubinfo_t hinfo)
442 ii_icrb0_a_u_t icrba; /* II CRB Register A */
443 ii_icrb0_b_u_t icrbb; /* II CRB Register B */
444 ii_icrb0_c_u_t icrbc; /* II CRB Register C */
445 ii_icrb0_d_u_t icrbd; /* II CRB Register D */
446 ii_icrb0_e_u_t icrbe; /* II CRB Register E */
448 int num_errors = 0; /* Num of errors handled */
452 nasid = hinfo->h_nasid;
453 cnode = nasid_to_cnodeid(nasid);
456 * XXX - Add locking for any recovery actions
459 * Scan through all CRBs in the Hub, and handle the errors
460 * in any of the CRBs marked.
462 for (i = 0; i < IIO_NUM_CRBS; i++) {
463 /* Check this crb entry to see if it is in error. */
464 icrbb.ii_icrb0_b_regval = REMOTE_HUB_L(nasid, IIO_ICRB_B(i));
466 if (icrbb.b_mark == 0) {
470 icrba.ii_icrb0_a_regval = REMOTE_HUB_L(nasid, IIO_ICRB_A(i));
472 IOERROR_INIT(&ioerror);
474 /* read other CRB error registers. */
475 icrbc.ii_icrb0_c_regval = REMOTE_HUB_L(nasid, IIO_ICRB_C(i));
476 icrbd.ii_icrb0_d_regval = REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
477 icrbe.ii_icrb0_e_regval = REMOTE_HUB_L(nasid, IIO_ICRB_E(i));
479 IOERROR_SETVALUE(&ioerror,errortype,icrbb.b_ecode);
481 /* Check if this error is due to BTE operation,
482 * and handle it separately.
485 ((icrbb.b_initiator == IIO_ICRB_INIT_BTE0 ||
486 icrbb.b_initiator == IIO_ICRB_INIT_BTE1) &&
487 (icrbb.b_imsgtype == IIO_ICRB_IMSGT_BTE ||
488 icrbb.b_imsgtype == IIO_ICRB_IMSGT_SN1NET))){
493 bte_num = icrbc.c_btenum;
494 else /* b_initiator bit 2 gives BTE number */
495 bte_num = (icrbb.b_initiator & 0x4) >> 2;
497 hubiio_crb_free(hinfo, i);
499 bte_crb_error_handler(hub_v, bte_num,
508 * Assuming the only other error that would reach here is
510 * If CRB times out on a message from Xtalk, it changes
511 * the message type to CRB.
513 * If we get here due to other errors (SN0net/CRB)
514 * what's the action ?
518 * Pick out the useful fields in CRB, and
519 * tuck them away into ioerror structure.
521 IOERROR_SETVALUE(&ioerror,xtalkaddr,icrba.a_addr << IIO_ICRB_ADDR_SHFT);
522 IOERROR_SETVALUE(&ioerror,widgetnum,icrba.a_sidn);
527 * XXX We shouldn't really have BRIDGE-specific code
530 * The BRIDGE (or XBRIDGE) sets the upper bit of TNUM
531 * to indicate a WRITE operation. It sets the next
532 * bit to indicate an INTERRUPT operation. The bottom
533 * 3 bits of TNUM indicate which device was responsible.
535 IOERROR_SETVALUE(&ioerror,widgetdev,
536 TNUM_TO_WIDGET_DEV(icrba.a_tnum));
538 * The encoding of TNUM (see comments above) is
539 * different for PIC. So we'll save TNUM here and
540 * deal with the differences later when we can
541 * determine if we're using a Bridge or the PIC.
543 * XXX: We may be able to remove saving the widgetdev
544 * above and just sort it out of TNUM later.
546 IOERROR_SETVALUE(&ioerror, tnum, icrba.a_tnum);
551 * CRB 'i' has some error. Identify the type of error,
552 * and try to handle it.
555 switch(icrbb.b_ecode) {
556 case IIO_ICRB_ECODE_PERR:
557 case IIO_ICRB_ECODE_WERR:
558 case IIO_ICRB_ECODE_AERR:
559 case IIO_ICRB_ECODE_PWERR:
560 case IIO_ICRB_ECODE_TOUT:
561 case IIO_ICRB_ECODE_XTERR:
562 printk("Shub II CRB %d: error %s on hub cnodeid: %d",
563 i, hubiio_crb_errors[icrbb.b_ecode], cnode);
565 * Any sort of write error is mostly due to
566 * bad programming (Note it's not a timeout.)
567 * So, invoke hub_iio_error_handler with
568 * appropriate information.
570 IOERROR_SETVALUE(&ioerror,errortype,icrbb.b_ecode);
572 /* Go through the error bit lookup phase */
573 if (error_state_set(hub_v, ERROR_STATE_LOOKUP) ==
574 ERROR_RETURN_CODE_CANNOT_SET_STATE)
575 return(IOERROR_UNHANDLED);
576 rc = hub_ioerror_handler(
581 if (rc == IOERROR_HANDLED) {
582 rc = hub_ioerror_handler(
588 printk("Unable to handle %s on hub %d",
589 hubiio_crb_errors[icrbb.b_ecode],
593 /* Go to Next error */
594 print_crb_fields(i, icrba, icrbb, icrbc,
596 hubiio_crb_free(hinfo, i);
598 case IIO_ICRB_ECODE_PRERR:
599 case IIO_ICRB_ECODE_DERR:
600 printk("Shub II CRB %d: error %s on hub : %d",
601 i, hubiio_crb_errors[icrbb.b_ecode], cnode);
604 printk("Shub II CRB error (code : %d) on hub : %d",
605 icrbb.b_ecode, cnode);
610 * Error is not indicated via the errcode field
611 * Check other error indications in this register.
614 printk("Shub II CRB %d: Xtalk Packet with error bit set to hub %d",
618 if (icrbb.b_lnetuce) {
619 printk("Shub II CRB %d: Uncorrectable data error detected on data "
620 " from NUMAlink to node %d",
624 print_crb_fields(i, icrba, icrbb, icrbc, icrbd, icrbe);
632 * CRB 'i' has some error. Identify the type of error,
633 * and try to handle it.
635 switch(icrbb.b_ecode) {
636 case IIO_ICRB_ECODE_PERR:
637 case IIO_ICRB_ECODE_WERR:
638 case IIO_ICRB_ECODE_AERR:
639 case IIO_ICRB_ECODE_PWERR:
641 printk("%s on hub cnodeid: %d",
642 hubiio_crb_errors[icrbb.b_ecode], cnode);
644 * Any sort of write error is mostly due to
645 * bad programming (Note it's not a timeout.)
646 * So, invoke hub_iio_error_handler with
647 * appropriate information.
649 IOERROR_SETVALUE(&ioerror,errortype,icrbb.b_ecode);
651 rc = hub_ioerror_handler(
657 if (rc == IOERROR_HANDLED) {
658 rc = hub_ioerror_handler(
663 ASSERT(rc == IOERROR_HANDLED);
666 panic("Unable to handle %s on hub %d",
667 hubiio_crb_errors[icrbb.b_ecode],
671 /* Go to Next error */
672 hubiio_crb_free(hinfo, i);
675 case IIO_ICRB_ECODE_PRERR:
677 case IIO_ICRB_ECODE_TOUT:
678 case IIO_ICRB_ECODE_XTERR:
680 case IIO_ICRB_ECODE_DERR:
681 panic("Fatal %s on hub : %d",
682 hubiio_crb_errors[icrbb.b_ecode], cnode);
686 panic("Fatal error (code : %d) on hub : %d",
687 icrbb.b_ecode, cnode);
691 } /* if (icrbb.b_error) */
694 * Error is not indicated via the errcode field
695 * Check other error indications in this register.
699 panic("Xtalk Packet with error bit set to hub %d",
704 if (icrbb.b_lnetuce) {
705 panic("Uncorrectable data error detected on data "
706 " from Craylink to node %d",
716 * hubii_check_widget_disabled
718 * Check if PIO access to the specified widget is disabled due
719 * to any II errors that are currently set.
721 * The specific error bits checked are:
722 * IPRBx register: SPUR_RD (51)
727 * WSTAT register: CRAZY (32)
731 hubii_check_widget_disabled(nasid_t nasid, int wnum)
736 iprb.iprb_regval = REMOTE_HUB_L(nasid, IIO_IOPRB(wnum));
737 if (iprb.iprb_regval & (IIO_PRB_SPUR_RD | IIO_PRB_SPUR_WR |
738 IIO_PRB_RD_TO | IIO_PRB_ERROR)) {
740 printk(KERN_WARNING "II error, IPRB%x=0x%lx\n", wnum, iprb.iprb_regval);
745 wstat.ii_wstat_regval = REMOTE_HUB_L(nasid, IIO_WSTAT);
746 if (wstat.ii_wstat_regval & IIO_WSTAT_ECRAZY) {
748 printk(KERN_WARNING "II error, WSTAT=0x%lx\n", wstat.ii_wstat_regval);
758 * Handle the error reported in the PRB for widget number wnum.
759 * This typically happens on a PIO write error.
760 * There is nothing much we can do in this interrupt context for
761 * PIO write errors. For example, the QL SCSI controller has the
762 * habit of flaking out on PIO writes.
763 * Print a message and try to continue for now
764 * Cleanup involves freeing the PRB register
767 hubii_prb_handler(vertex_hdl_t hub_v, hubinfo_t hinfo, int wnum)
771 nasid = hinfo->h_nasid;
773 * Clear error bit by writing to IECLR register.
775 REMOTE_HUB_S(nasid, IIO_IECLR, (1 << wnum));
777 * PIO Write to Widget 'i' got into an error.
778 * Invoke hubiio_error_handler with this information.
780 printk( "Hub nasid %d got a PIO Write error from widget %d, "
781 "cleaning up and continuing", nasid, wnum);
784 * It may be necessary to adjust IO PRB counter
785 * to account for any lost credits.
790 hubiio_prb_error_handler(vertex_hdl_t hub_v, hubinfo_t hinfo)
797 nasid = hinfo->h_nasid;
799 * Check if IPRB0 has any error first.
801 iprb.iprb_regval = REMOTE_HUB_L(nasid, IIO_IOPRB(0));
802 if (iprb.iprb_error) {
804 hubii_prb_handler(hub_v, hinfo, 0);
807 * Look through PRBs 8 - F to see if any of them has the error bit set.
808 * If true, invoke hub iio error handler for this widget.
810 for (wnum = HUB_WIDGET_ID_MIN; wnum <= HUB_WIDGET_ID_MAX; wnum++) {
811 iprb.iprb_regval = REMOTE_HUB_L(nasid, IIO_IOPRB(wnum));
813 if (!iprb.iprb_error)
817 hubii_prb_handler(hub_v, hinfo, wnum);