Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff

[linux-2.6.git] / arch / ia64 / kernel / fsys.S
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S

index 458daa5..7a05b1c 100644 (file)
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -8,11 +8,13 @@
   * 18-Feb-03 louisk    Implement fsys_gettimeofday().
   * 28-Feb-03 davidm    Fixed several bugs in fsys_gettimeofday().  Tuned it some more,
   *                     probably broke it along the way... ;-)
+ * 13-Jul-04 clameter   Implement fsys_clock_gettime and revise fsys_gettimeofday to make
+ *                      it capable of using memory based clocks without falling back to C code.
   */
  
  #include <asm/asmmacro.h>
  #include <asm/errno.h>
-#include <asm/offsets.h>
+#include <asm/asm-offsets.h>
  #include <asm/percpu.h>
  #include <asm/thread_info.h>
  #include <asm/sal.h>
@@ -144,196 +146,208 @@ ENTRY(fsys_set_tid_address)
  END(fsys_set_tid_address)
  
  /*
- * Note 1: This routine uses floating-point registers, but only with registers that
- *        operate on integers.  Because of that, we don't need to set ar.fpsr to the
- *        kernel default value.
- *
- * Note 2: For now, we will assume that all CPUs run at the same clock-frequency.
- *        If that wasn't the case, we would have to disable preemption (e.g.,
- *        by disabling interrupts) between reading the ITC and reading
- *        local_cpu_data->nsec_per_cyc.
- *
- * Note 3: On platforms where the ITC-drift bit is set in the SAL feature vector,
- *        we ought to either skip the ITC-based interpolation or run an ntp-like
- *        daemon to keep the ITCs from drifting too far apart.
+ * Ensure that the time interpolator structure is compatible with the asm code
   */
+#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
+       || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
+#error fsys_gettimeofday incompatible with changes to struct time_interpolator
+#endif
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_DIVIDE_BY_1000 0x4000
+#define CLOCK_ADD_MONOTONIC 0x8000
  
  ENTRY(fsys_gettimeofday)
         .prologue
         .altrp b6
         .body
-       add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-       addl r3=THIS_CPU(cpu_info),r0
-
-       mov.m r31=ar.itc                // put time stamp into r31 (ITC) == now         (35 cyc)
-#ifdef CONFIG_SMP
-       movl r10=__per_cpu_offset
-       movl r2=sal_platform_features
-       ;;
-
-       ld8 r2=[r2]
-       movl r19=xtime                  // xtime is a timespec struct
-
-       ld8 r10=[r10]                   // r10 <- __per_cpu_offset[0]
-       addl r21=THIS_CPU(cpu_info),r0
-       ;;
-       add r10=r21, r10                // r10 <- &cpu_data(time_keeper_id)
-       tbit.nz p8,p0 = r2, IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT
-(p8)   br.spnt.many fsys_fallback_syscall
-#else
-       ;;
-       mov r10=r3
-       movl r19=xtime                  // xtime is a timespec struct
-#endif
-       ld4 r9=[r9]
-       movl r17=xtime_lock
-       ;;
-
-       // r32, r33 should contain the 2 args of gettimeofday
-       adds r21=IA64_CPUINFO_ITM_NEXT_OFFSET, r10
-       mov r2=-1
-       tnat.nz p6,p7=r32               // guard against NaT args
-       ;;
-
-       adds r10=IA64_CPUINFO_ITM_DELTA_OFFSET, r10
-(p7)   tnat.nz p6,p0=r33
-(p6)   br.cond.spnt.few .fail_einval
-
-       adds r8=IA64_CPUINFO_NSEC_PER_CYC_OFFSET, r3
-       movl r24=2361183241434822607    // for division hack (only for / 1000)
-       ;;
-
-       ldf8 f7=[r10]                   // f7 now contains itm_delta
-       setf.sig f11=r2
-       adds r10=8, r32
-
-       adds r20=IA64_TIMESPEC_TV_NSEC_OFFSET, r19      // r20 = &xtime->tv_nsec
-       movl r26=jiffies
-
-       setf.sig f9=r24                 // f9 is used for division hack
-       movl r27=wall_jiffies
-
-       and r9=TIF_ALLWORK_MASK,r9
-       movl r25=last_nsec_offset
-       ;;
-
-       /*
-        * Verify that we have permission to write to struct timeval.  Note:
-        * Another thread might unmap the mapping before we actually get
-        * to store the result.  That's OK as long as the stores are also
-        * protect by EX().
-        */
-EX(.fail_efault, probe.w.fault r32, 3)         // this must come _after_ NaT-check
-EX(.fail_efault, probe.w.fault r10, 3)         // this must come _after_ NaT-check
-       nop 0
-
-       ldf8 f10=[r8]                   // f10 <- local_cpu_data->nsec_per_cyc value
-       cmp.ne p8, p0=0, r9
-(p8)   br.spnt.many fsys_fallback_syscall
-       ;;
-.retry:        // *** seq = read_seqbegin(&xtime_lock); ***
-       ld4.acq r23=[r17]               // since &xtime_lock == &xtime_lock->sequence
-       ld8 r14=[r25]                   // r14 (old) = last_nsec_offset
-
-       ld8 r28=[r26]                   // r28 = jiffies
-       ld8 r29=[r27]                   // r29 = wall_jiffies
-       ;;
-
-       ldf8 f8=[r21]                   // f8 now contains itm_next
-       sub r28=r29, r28, 1             // r28 now contains "-(lost + 1)"
-       tbit.nz p9, p10=r23, 0          // p9 <- is_odd(r23), p10 <- is_even(r23)
-       ;;
-
-       ld8 r2=[r19]                    // r2 = sec = xtime.tv_sec
-       ld8 r29=[r20]                   // r29 = nsec = xtime.tv_nsec
-
-       setf.sig f6=r28                 // f6 <- -(lost + 1)                            (6 cyc)
-       ;;
-
+       mov r31 = r32
+       tnat.nz p6,p0 = r33             // guard against NaT argument
+(p6)    br.cond.spnt.few .fail_einval
+       mov r30 = CLOCK_DIVIDE_BY_1000
+       ;;
+.gettime:
+       // Register map
+       // Incoming r31 = pointer to address where to place result
+       //          r30 = flags determining how time is processed
+       // r2,r3 = temp r4-r7 preserved
+       // r8 = result nanoseconds
+       // r9 = result seconds
+       // r10 = temporary storage for clock difference
+       // r11 = preserved: saved ar.pfs
+       // r12 = preserved: memory stack
+       // r13 = preserved: thread pointer
+       // r14 = address of mask / mask
+       // r15 = preserved: system call number
+       // r16 = preserved: current task pointer
+       // r17 = wall_to_monotonic value (tv_sec first, then tv_nsec)
+       // r18 = time_interpolator->offset
+       // r19 = address of wall_to_monotonic
+       // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
+       // r21 = shift factor
+       // r22 = address of time interpolator->last_counter
+       // r23 = address of time_interpolator->last_cycle
+       // r24 = address of time_interpolator->offset
+       // r25 = last_cycle value
+       // r26 = last_counter value
+       // r27 = pointer to xtime
+       // r28 = sequence number at the beginning of critical section
+       // r29 = address of seqlock
+       // r30 = time processing flags / memory address
+       // r31 = pointer to result
+       // Predicates
+       // p6,p7 short term use
+       // p8 = timesource ar.itc
+       // p9 = timesource mmio64
+       // p10 = timesource mmio32
+       // p11 = timesource not to be handled by asm code
+       // p12 = memory time source ( = p9 | p10)
+       // p13 = do cmpxchg with time_interpolator_last_cycle
+       // p14 = Divide by 1000
+       // p15 = Add monotonic
+       //
+       // Note that instructions are optimized for McKinley. McKinley can process two
+       // bundles simultaneously and therefore we continuously try to feed the CPU
+       // two bundles and then a stop.
+       tnat.nz p6,p0 = r31     // branch deferred since it does not fit into bundle structure
+       mov pr = r30,0xc000     // Set predicates according to function
+       add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
+       movl r20 = time_interpolator
+       ;;
+       ld8 r20 = [r20]         // get pointer to time_interpolator structure
+       movl r29 = xtime_lock
+       ld4 r2 = [r2]           // process work pending flags
+       movl r27 = xtime
+       ;;      // only one bundle here
+       ld8 r21 = [r20]         // first quad with control information
+       and r2 = TIF_ALLWORK_MASK,r2
+(p6)    br.cond.spnt.few .fail_einval  // deferred branch
+       ;;
+       add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
+       extr r3 = r21,32,32     // time_interpolator->nsec_per_cyc
+       extr r8 = r21,0,16      // time_interpolator->source
+       cmp.ne p6, p0 = 0, r2   // Fallback if work is scheduled
+(p6)    br.cond.spnt.many fsys_fallback_syscall
+       ;;
+       cmp.eq p8,p12 = 0,r8    // Check for cpu timer
+       cmp.eq p9,p0 = 1,r8     // MMIO64 ?
+       extr r2 = r21,24,8      // time_interpolator->jitter
+       cmp.eq p10,p0 = 2,r8    // MMIO32 ?
+       cmp.ltu p11,p0 = 2,r8   // function or other clock
+(p11)  br.cond.spnt.many fsys_fallback_syscall
+       ;;
+       setf.sig f7 = r3        // Setup for scaling of counter
+(p15)  movl r19 = wall_to_monotonic
+(p12)  ld8 r30 = [r10]
+       cmp.ne p13,p0 = r2,r0   // need jitter compensation?
+       extr r21 = r21,16,8     // shift factor
+       ;;
+.time_redo:
+       .pred.rel.mutex p8,p9,p10
+       ld4.acq r28 = [r29]     // xtime_lock.sequence. Must come first for locking purposes
+(p8)   mov r2 = ar.itc         // CPU_TIMER. 36 clocks latency!!!
+       add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
+(p9)   ld8 r2 = [r30]          // readq(ti->address). Could also have latency issues..
+(p10)  ld4 r2 = [r30]          // readw(ti->address)
+(p13)  add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
+       ;;                      // could be removed by moving the last add upward
+       ld8 r26 = [r22]         // time_interpolator->last_counter
+(p13)  ld8 r25 = [r23]         // time interpolator->last_cycle
+       add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
+(p15)  ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
+       ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
+       add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
+       ;;
+       ld8 r18 = [r24]         // time_interpolator->offset
+       ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET    // xtime.tv_nsec
+(p13)  sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
+       ;;
+       ld8 r14 = [r14]         // time_interpolator->mask
+(p13)  cmp.gt.unc p6,p7 = r3,r0        // check if it is less than last. p6,p7 cleared
+       sub r10 = r2,r26        // current_counter - last_counter
+       ;;
+(p6)   sub r10 = r25,r26       // time we got was less than last_cycle
+(p7)   mov ar.ccv = r25        // more than last_cycle. Prep for cmpxchg
+       ;;
+       and r10 = r10,r14       // Apply mask
+       ;;
+       setf.sig f8 = r10
+       nop.i 123
+       ;;
+(p7)   cmpxchg8.rel r3 = [r23],r2,ar.ccv
+EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time
+       xmpy.l f8 = f8,f7       // nsec_per_cyc*(counter-last_counter)
+(p15)  add r9 = r9,r17         // Add wall to monotonic.secs to result secs
+       ;;
+(p15)  ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
+(p7)   cmp.ne p7,p0 = r25,r3   // if cmpxchg not successful redo
+       // simulate tbit.nz.or p7,p0 = r28,0
+       and r28 = ~1,r28        // Make sequence even to force retry if odd
+       getf.sig r2 = f8
         mf
-       xma.l f8=f6, f7, f8     // f8 (last_tick) <- -(lost + 1)*itm_delta + itm_next   (5 cyc)
-       nop 0
-
-       setf.sig f12=r31                // f12 <- ITC                                   (6 cyc)
-       // *** if (unlikely(read_seqretry(&xtime_lock, seq))) continue; ***
-       ld4 r24=[r17]                   // r24 = xtime_lock->sequence (re-read)
-       nop 0
-       ;;
-
-       mov r31=ar.itc                  // re-read ITC in case we .retry                (35 cyc)
-       xma.l f8=f11, f8, f12   // f8 (elapsed_cycles) <- (-1*last_tick + now) = (now - last_tick)
-       nop 0
-       ;;
-
-       getf.sig r18=f8                 // r18 <- (now - last_tick)
-       xmpy.l f8=f8, f10               // f8 <- elapsed_cycles*nsec_per_cyc (5 cyc)
-       add r3=r29, r14                 // r3 = (nsec + old)
-       ;;
-
-       cmp.lt p7, p8=r18, r0           // if now < last_tick, set p7 = 1, p8 = 0
-       getf.sig r18=f8                 // r18 = elapsed_cycles*nsec_per_cyc            (6 cyc)
-       nop 0
-       ;;
-
-(p10)  cmp.ne p9, p0=r23, r24          // if xtime_lock->sequence != seq, set p9
-       shr.u r18=r18, IA64_NSEC_PER_CYC_SHIFT  // r18 <- offset
-(p9)   br.spnt.many .retry
-       ;;
-
-       mov ar.ccv=r14                  // ar.ccv = old                                 (1 cyc)
-       cmp.leu p7, p8=r18, r14         // if (offset <= old), set p7 = 1, p8 = 0
-       ;;
-
-(p8)   cmpxchg8.rel r24=[r25], r18, ar.ccv     // compare-and-exchange (atomic!)
-(p8)   add r3=r29, r18                 // r3 = (nsec + offset)
-       ;;
-       shr.u r3=r3, 3                  // initiate dividing r3 by 1000
-       ;;
-       setf.sig f8=r3                  //                                              (6 cyc)
-       mov r10=1000000                 // r10 = 1000000
-       ;;
-(p8)   cmp.ne.unc p9, p0=r24, r14
-       xmpy.hu f6=f8, f9               //                                              (5 cyc)
-(p9)   br.spnt.many .retry
-       ;;
-
-       getf.sig r3=f6                  //                                              (6 cyc)
-       ;;
-       shr.u r3=r3, 4                  // end of division, r3 is divided by 1000 (=usec)
-       ;;
-
-1:     cmp.geu p7, p0=r3, r10          // while (usec >= 1000000)
-       ;;
-(p7)   sub r3=r3, r10                  // usec -= 1000000
-(p7)   adds r2=1, r2                   // ++sec
-(p7)   br.spnt.many 1b
-
-       // finally: r2 = sec, r3 = usec
-EX(.fail_efault, st8 [r32]=r2)
-       adds r9=8, r32
-       mov r8=r0                       // success
-       ;;
-EX(.fail_efault, st8 [r9]=r3)          // store them in the timeval struct
-       mov r10=0
+       add r8 = r8,r18         // Add time interpolator offset
+       ;;
+       ld4 r10 = [r29]         // xtime_lock.sequence
+(p15)  add r8 = r8, r17        // Add monotonic.nsecs to nsecs
+       shr.u r2 = r2,r21
+       ;;              // overloaded 3 bundles!
+       // End critical section.
+       add r8 = r8,r2          // Add xtime.nsecs
+       cmp4.ne.or p7,p0 = r28,r10
+(p7)   br.cond.dpnt.few .time_redo     // sequence number changed ?
+       // Now r8=tv->tv_nsec and r9=tv->tv_sec
+       mov r10 = r0
+       movl r2 = 1000000000
+       add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31
+(p14)  movl r3 = 2361183241434822607   // Prep for / 1000 hack
+       ;;
+.time_normalize:
+       mov r21 = r8
+       cmp.ge p6,p0 = r8,r2
+(p14)  shr.u r20 = r8, 3               // We can repeat this if necessary just wasting some time
+       ;;
+(p14)  setf.sig f8 = r20
+(p6)   sub r8 = r8,r2
+(p6)   add r9 = 1,r9                   // two nops before the branch.
+(p14)  setf.sig f7 = r3                // Chances for repeats are 1 in 10000 for gettod
+(p6)   br.cond.dpnt.few .time_normalize
+       ;;
+       // Divided by 8 through a shift. Now divide by 125
+       // The compiler was able to do that with a multiply
+       // and a shift and we do the same
+EX(.fail_efault, probe.w.fault r23, 3)         // This also costs 5 cycles
+(p14)  xmpy.hu f8 = f8, f7                     // xmpy has 5 cycles latency so use it...
+       ;;
+       mov r8 = r0
+(p14)  getf.sig r2 = f8
+       ;;
+(p14)  shr.u r21 = r2, 4
+       ;;
+EX(.fail_efault, st8 [r31] = r9)
+EX(.fail_efault, st8 [r23] = r21)
         FSYS_RETURN
-       /*
-        * Note: We are NOT clearing the scratch registers here.  Since the only things
-        *       in those registers are time-related variables and some addresses (which
-        *       can be obtained from System.map), none of this should be security-sensitive
-        *       and we should be fine.
-        */
-
  .fail_einval:
-       mov r8=EINVAL                   // r8 = EINVAL
-       mov r10=-1                      // r10 = -1
+       mov r8 = EINVAL
+       mov r10 = -1
         FSYS_RETURN
-
  .fail_efault:
-       mov r8=EFAULT                   // r8 = EFAULT
-       mov r10=-1                      // r10 = -1
+       mov r8 = EFAULT
+       mov r10 = -1
         FSYS_RETURN
  END(fsys_gettimeofday)
  
+ENTRY(fsys_clock_gettime)      // light-weight clock_gettime: r32 = clock id, r33 = pointer to result
+       .prologue
+       .altrp b6
+       .body
+       cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32  // p6 <- (CLOCK_MONOTONIC < r32), unsigned 32-bit compare
+       // Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC
+(p6)   br.spnt.few fsys_fallback_syscall
+       mov r31 = r33           // .gettime expects the result pointer in r31
+       shl r30 = r32,15        // r30 = flags: CLOCK_MONOTONIC (1) << 15 == CLOCK_ADD_MONOTONIC, CLOCK_REALTIME (0) -> 0
+       br.many .gettime        // continue on the common path inside fsys_gettimeofday
+END(fsys_clock_gettime)
+
  /*
   * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize).
   */
@@ -345,40 +359,33 @@ ENTRY(fsys_rt_sigprocmask)
         .altrp b6
         .body
  
-       mf                                      // ensure reading of current->blocked is ordered
         add r2=IA64_TASK_BLOCKED_OFFSET,r16
         add r9=TI_FLAGS+IA64_TASK_SIZE,r16
+       cmp4.ltu p6,p0=SIG_SETMASK,r32
+
+       cmp.ne p15,p0=r0,r34                    // oset != NULL?
+       tnat.nz p8,p0=r34
+       add r31=IA64_TASK_SIGHAND_OFFSET,r16
         ;;
-       /*
-        * Since we're only reading a single word, we can do it
-        * atomically without acquiring current->sighand->siglock.  To
-        * be on the safe side, we need a fully-ordered load, though:
-        */
-       ld8.acq r3=[r2]                         // read/prefetch current->blocked
+       ld8 r3=[r2]                             // read/prefetch current->blocked
         ld4 r9=[r9]
-       add r31=IA64_TASK_SIGHAND_OFFSET,r16
+       tnat.nz.or p6,p0=r35
+
+       cmp.ne.or p6,p0=_NSIG_WORDS*8,r35
+       tnat.nz.or p6,p0=r32
+(p6)   br.spnt.few .fail_einval                // fail with EINVAL
         ;;
  #ifdef CONFIG_SMP
         ld8 r31=[r31]                           // r31 <- current->sighand
  #endif
         and r9=TIF_ALLWORK_MASK,r9
-       tnat.nz p6,p0=r32
-       ;;
-       cmp.ne p7,p0=0,r9
-       tnat.nz.or p6,p0=r35
-       tnat.nz p8,p0=r34
-       ;;
-       cmp.ne p15,p0=r0,r34                    // oset != NULL?
-       cmp.ne.or p6,p0=_NSIG_WORDS*8,r35
         tnat.nz.or p8,p0=r33
-
-(p6)   br.spnt.few .fail_einval                // fail with EINVAL
-(p7)   br.spnt.many fsys_fallback_syscall      // got pending kernel work...
-(p8)   br.spnt.few .fail_efault                // fail with EFAULT
         ;;
-
-       cmp.eq p6,p7=r0,r33                     // set == NULL?
+       cmp.ne p7,p0=0,r9
+       cmp.eq p6,p0=r0,r33                     // set == NULL?
         add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock
+(p8)   br.spnt.few .fail_efault                // fail with EFAULT
+(p7)   br.spnt.many fsys_fallback_syscall      // got pending kernel work...
  (p6)   br.dpnt.many .store_mask                // -> short-circuit to just reading the signal mask
  
         /* Argh, we actually have to do some work and _update_ the signal mask: */
@@ -453,21 +460,19 @@ EX(.fail_efault, ld8 r14=[r33])                   // r14 <- *set
         ;;
  
         st8 [r2]=r14                            // update current->blocked with new mask
-       cmpxchg4.acq r14=[r9],r18,ar.ccv        // current->thread_info->flags <- r18
+       cmpxchg4.acq r8=[r9],r18,ar.ccv         // current->thread_info->flags <- r18
         ;;
-       cmp.ne p6,p0=r17,r14                    // update failed?
+       cmp.ne p6,p0=r17,r8                     // update failed?
  (p6)   br.cond.spnt.few 1b                     // yes -> retry
  
  #ifdef CONFIG_SMP
         st4.rel [r31]=r0                        // release the lock
  #endif
         ssm psr.i
-       cmp.ne p9,p0=r8,r0                      // check for bad HOW value
         ;;
  
         srlz.d                                  // ensure psr.i is set again
         mov r18=0                                       // i must not leak kernel bits...
-(p9)   br.spnt.few .fail_einval                // bail out for bad HOW value
  
  .store_mask:
  EX(.fail_efault, (p15) probe.w.fault r34, 3)   // verify user has write-access to *oset
@@ -511,6 +516,7 @@ ENTRY(fsys_fallback_syscall)
         adds r17=-1024,r15
         movl r14=sys_call_table
         ;;
+       rsm psr.i
         shladd r18=r17,3,r14
         ;;
         ld8 r18=[r18]                           // load normal (heavy-weight) syscall entry-point
@@ -525,86 +531,114 @@ GLOBAL_ENTRY(fsys_bubble_down)
         .altrp b6
         .body
         /*
-        * We get here for syscalls that don't have a lightweight handler.  For those, we
-        * need to bubble down into the kernel and that requires setting up a minimal
-        * pt_regs structure, and initializing the CPU state more or less as if an
-        * interruption had occurred.  To make syscall-restarts work, we setup pt_regs
-        * such that cr_iip points to the second instruction in syscall_via_break.
-        * Decrementing the IP hence will restart the syscall via break and not
-        * decrementing IP will return us to the caller, as usual.  Note that we preserve
-        * the value of psr.pp rather than initializing it from dcr.pp.  This makes it
-        * possible to distinguish fsyscall execution from other privileged execution.
+        * We get here for syscalls that don't have a lightweight
+        * handler.  For those, we need to bubble down into the kernel
+        * and that requires setting up a minimal pt_regs structure,
+        * and initializing the CPU state more or less as if an
+        * interruption had occurred.  To make syscall-restarts work,
+        * we setup pt_regs such that cr_iip points to the second
+        * instruction in syscall_via_break.  Decrementing the IP
+        * hence will restart the syscall via break and not
+        * decrementing IP will return us to the caller, as usual.
+        * Note that we preserve the value of psr.pp rather than
+        * initializing it from dcr.pp.  This makes it possible to
+        * distinguish fsyscall execution from other privileged
+        * execution.
          *
          * On entry:
-        *      - normal fsyscall handler register usage, except that we also have:
+        *      - normal fsyscall handler register usage, except
+        *        that we also have:
          *      - r18: address of syscall entry point
          *      - r21: ar.fpsr
          *      - r26: ar.pfs
          *      - r27: ar.rsc
          *      - r29: psr
+        *
+        * We used to clear some PSR bits here but that requires slow
+        * serialization.  Fortunately, that isn't really necessary.
+        * The rationale is as follows: we used to clear bits
+        * ~PSR_PRESERVED_BITS in PSR.L.  Since
+        * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we
+        * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}.
+        * However,
+        *
+        * PSR.BE : already is turned off in __kernel_syscall_via_epc()
+        * PSR.AC : don't care (kernel normally turns PSR.AC on)
+        * PSR.I  : already turned off by the time fsys_bubble_down gets
+        *          invoked
+        * PSR.DFL: always 0 (kernel never turns it on)
+        * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
+        *          initiative
+        * PSR.DI : always 0 (kernel never turns it on)
+        * PSR.SI : always 0 (kernel never turns it on)
+        * PSR.DB : don't care --- kernel never enables kernel-level
+        *          breakpoints
+        * PSR.TB : must be 0 already; if it wasn't zero on entry to
+        *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
+        *          will trigger a taken branch; the taken-trap-handler then
+        *          converts the syscall into a break-based system-call.
          */
-#      define PSR_PRESERVED_BITS       (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \
-                                        | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \
-                                        | IA64_PSR_IC)
         /*
-        * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc.  The rest we have
-        * to synthesize.
+        * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc.
+        * The rest we have to synthesize.
          */
-#      define PSR_ONE_BITS             ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \
-                                        | IA64_PSR_BN)
+#      define PSR_ONE_BITS             ((3 << IA64_PSR_CPL0_BIT)       \
+                                        | (0x1 << IA64_PSR_RI_BIT)     \
+                                        | IA64_PSR_BN | IA64_PSR_I)
  
-       invala
-       movl r8=PSR_ONE_BITS
+       invala                                  // M0|1
+       movl r14=ia64_ret_from_syscall          // X
  
-       mov r25=ar.unat                 // save ar.unat (5 cyc)
-       movl r9=PSR_PRESERVED_BITS
+       nop.m 0
+       movl r28=__kernel_syscall_via_break     // X    create cr.iip
+       ;;
  
-       mov ar.rsc=0                    // set enforced lazy mode, pl 0, little-endian, loadrs=0
-       movl r28=__kernel_syscall_via_break
+       mov r2=r16                              // A    get task addr to addl-addressable register
+       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A
+       mov r31=pr                              // I0   save pr (2 cyc)
         ;;
-       mov r23=ar.bspstore             // save ar.bspstore (12 cyc)
-       mov r31=pr                      // save pr (2 cyc)
-       mov r20=r1                      // save caller's gp in r20
+       st1 [r16]=r0                            // M2|3 clear current->thread.on_ustack flag
+       addl r22=IA64_RBS_OFFSET,r2             // A    compute base of RBS
+       add r3=TI_FLAGS+IA64_TASK_SIZE,r2       // A
         ;;
-       mov r2=r16                      // copy current task addr to addl-addressable register
-       and r9=r9,r29
-       mov r19=b6                      // save b6 (2 cyc)
+       ld4 r3=[r3]                             // M0|1 r3 = current_thread_info()->flags
+       lfetch.fault.excl.nt1 [r22]             // M0|1 prefetch register backing-store
+       nop.i 0
         ;;
-       mov psr.l=r9                    // slam the door (17 cyc to srlz.i)
-       or r29=r8,r29                   // construct cr.ipsr value to save
-       addl r22=IA64_RBS_OFFSET,r2     // compute base of RBS
+       mov ar.rsc=0                            // M2   set enforced lazy mode, pl 0, LE, loadrs=0
+       nop.m 0
+       nop.i 0
         ;;
-       mov.m r24=ar.rnat               // read ar.rnat (5 cyc lat)
-       lfetch.fault.excl.nt1 [r22]
-       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2
-
-       // ensure previous insn group is issued before we stall for srlz.i:
+       mov r23=ar.bspstore                     // M2 (12 cyc) save ar.bspstore
+       mov.m r24=ar.rnat                       // M2 (5 cyc) read ar.rnat (dual-issues!)
+       nop.i 0
         ;;
-       srlz.i                          // ensure new psr.l has been established
-       /////////////////////////////////////////////////////////////////////////////
-       ////////// from this point on, execution is not interruptible anymore
-       /////////////////////////////////////////////////////////////////////////////
-       addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2    // compute base of memory stack
-       cmp.ne pKStk,pUStk=r0,r0        // set pKStk <- 0, pUStk <- 1
+       mov ar.bspstore=r22                     // M2 (6 cyc) switch to kernel RBS
+       movl r8=PSR_ONE_BITS                    // X
         ;;
-       st1 [r16]=r0                    // clear current->thread.on_ustack flag
-       mov ar.bspstore=r22             // switch to kernel RBS
-       mov b6=r18                      // copy syscall entry-point to b6 (7 cyc)
-       add r3=TI_FLAGS+IA64_TASK_SIZE,r2
+       mov r25=ar.unat                         // M2 (5 cyc) save ar.unat
+       mov r19=b6                              // I0   save b6 (2 cyc)
+       mov r20=r1                              // A    save caller's gp in r20
         ;;
-       ld4 r3=[r3]                             // r2 = current_thread_info()->flags
-       mov r18=ar.bsp                  // save (kernel) ar.bsp (12 cyc)
-       mov ar.rsc=0x3                  // set eager mode, pl 0, little-endian, loadrs=0
-       br.call.sptk.many b7=ia64_syscall_setup
+       or r29=r8,r29                           // A    construct cr.ipsr value to save
+       mov b6=r18                              // I0   copy syscall entry-point to b6 (7 cyc)
+       addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack
+
+       mov r18=ar.bsp                          // M2   save (kernel) ar.bsp (12 cyc)
+       cmp.ne pKStk,pUStk=r0,r0                // A    set pKStk <- 0, pUStk <- 1
+       br.call.sptk.many b7=ia64_syscall_setup // B
         ;;
-       ssm psr.i
-       movl r2=ia64_ret_from_syscall
+       mov ar.rsc=0x3                          // M2   set eager mode, pl 0, LE, loadrs=0
+       mov rp=r14                              // I0   set the real return addr
+       and r3=_TIF_SYSCALL_TRACEAUDIT,r3       // A
         ;;
-       mov rp=r2                               // set the real return addr
-       tbit.z p8,p0=r3,TIF_SYSCALL_TRACE
+       ssm psr.i                               // M2   we're on kernel stacks now, reenable irqs
+       cmp.eq p8,p0=r3,r0                      // A
+(p10)  br.cond.spnt.many ia64_ret_from_syscall // B    return if bad call-frame or r15 is a NaT
  
-(p8)   br.call.sptk.many b6=b6                 // ignore this return addr
-       br.cond.sptk ia64_trace_syscall
+       nop.m 0
+(p8)   br.call.sptk.many b6=b6                 // B    (ignore return address)
+       br.cond.spnt ia64_trace_syscall         // B
  END(fsys_bubble_down)
  
         .rodata
@@ -843,31 +877,8 @@ fsyscall_table:
         data8 0                         // timer_getoverrun
         data8 0                         // timer_delete
         data8 0                         // clock_settime
-       data8 0                         // clock_gettime
-       data8 0                         // clock_getres         // 1255
-       data8 0                         // clock_nanosleep
-       data8 0                         // fstatfs64
-       data8 0                         // statfs64
-       data8 0
-       data8 0                                                 // 1260
-       data8 0
-       data8 0                         // mq_open
-       data8 0                         // mq_unlink
-       data8 0                         // mq_timedsend
-       data8 0                         // mq_timedreceive      // 1265
-       data8 0                         // mq_notify
-       data8 0                         // mq_getsetattr
-       data8 0
-       data8 0
-       data8 0                                                 // 1270
-       data8 0
-       data8 0
-       data8 0
-       data8 0
-       data8 0                                                 // 1275
-       data8 0
-       data8 0
-       data8 0
-       data8 0
+       data8 fsys_clock_gettime        // clock_gettime
  
-       .org fsyscall_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
+       // fill in zeros for the remaining entries
+       .zero:
+       .space fsyscall_table + 8*NR_syscalls - .zero, 0