X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fia64%2Fkernel%2Ffsys.S;h=06b05742d9d9c0fc71865b958026b345db4f3f4a;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=0f8e5b5fd8fef5b5fbc6bab41f90c319df578489;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git

diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 0f8e5b5fd..06b05742d 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -8,11 +8,13 @@
  * 18-Feb-03 louisk	Implement fsys_gettimeofday().
  * 28-Feb-03 davidm	Fixed several bugs in fsys_gettimeofday().  Tuned it some more,
  *			probably broke it along the way... ;-)
+ * 13-Jul-04 clameter   Implement fsys_clock_gettime and revise fsys_gettimeofday to make
+ *                      it capable of using memory based clocks without falling back to C code.
  */
 
 #include <asm/asmmacro.h>
 #include <asm/errno.h>
-#include <asm/offsets.h>
+#include <asm/asm-offsets.h>
 #include <asm/percpu.h>
 #include <asm/thread_info.h>
 #include <asm/sal.h>
@@ -81,29 +83,29 @@ ENTRY(fsys_getppid)
 	;;
 
 	ld4 r9=[r9]
-	add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = &current->group_leader->real_parent
+	add r17=IA64_TASK_PARENT_OFFSET,r17 // r17 = &current->group_leader->parent
 	;;
 	and r9=TIF_ALLWORK_MASK,r9
 
-1:	ld8 r18=[r17]				// r18 = current->group_leader->real_parent
+1:	ld8 r18=[r17]				// r18 = current->group_leader->parent
 	;;
 	cmp.ne p8,p0=0,r9
-	add r8=IA64_TASK_TGID_OFFSET,r18	// r8 = &current->group_leader->real_parent->tgid
+	add r8=IA64_TASK_TGID_OFFSET,r18	// r8 = &current->group_leader->parent->tgid
 	;;
 
 	/*
 	 * The .acq is needed to ensure that the read of tgid has returned its data before
-	 * we re-check "real_parent".
+	 * we re-check "parent".
 	 */
-	ld4.acq r8=[r8]				// r8 = current->group_leader->real_parent->tgid
+	ld4.acq r8=[r8]				// r8 = current->group_leader->parent->tgid
 #ifdef CONFIG_SMP
 	/*
-	 * Re-read current->group_leader->real_parent.
+	 * Re-read current->group_leader->parent.
 	 */
-	ld8 r19=[r17]				// r19 = current->group_leader->real_parent
+	ld8 r19=[r17]				// r19 = current->group_leader->parent
 (p8)	br.spnt.many fsys_fallback_syscall
 	;;
-	cmp.ne p6,p0=r18,r19			// did real_parent change?
+	cmp.ne p6,p0=r18,r19			// did parent change?
 	mov r19=0			// i must not leak kernel bits...
 (p6)	br.cond.spnt.few 1b			// yes -> redo the read of tgid and the check
 	;;
@@ -144,195 +146,208 @@ ENTRY(fsys_set_tid_address)
 END(fsys_set_tid_address)
 
 /*
- * Note 1: This routine uses floating-point registers, but only with registers that
- *	   operate on integers.  Because of that, we don't need to set ar.fpsr to the
- *	   kernel default value.
- *
- * Note 2: For now, we will assume that all CPUs run at the same clock-frequency.
- *	   If that wasn't the case, we would have to disable preemption (e.g.,
- *	   by disabling interrupts) between reading the ITC and reading
- *	   local_cpu_data->nsec_per_cyc.
- *
- * Note 3: On platforms where the ITC-drift bit is set in the SAL feature vector,
- *	   we ought to either skip the ITC-based interpolation or run an ntp-like
- *	   daemon to keep the ITCs from drifting too far apart.
+ * Ensure that the time interpolator structure is compatible with the asm code
  */
+#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
+	|| IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
+#error fsys_gettimeofday incompatible with changes to struct time_interpolator
+#endif
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_DIVIDE_BY_1000 0x4000
+#define CLOCK_ADD_MONOTONIC 0x8000
 
 ENTRY(fsys_gettimeofday)
 	.prologue
 	.altrp b6
 	.body
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	addl r3=THIS_CPU(cpu_info),r0
-
-#ifdef CONFIG_SMP
-	movl r10=__per_cpu_offset
-	movl r2=sal_platform_features
-	;;
-
-	ld8 r2=[r2]
-	movl r19=xtime			// xtime is a timespec struct
-
-	ld8 r10=[r10]			// r10 <- __per_cpu_offset[0]
-	addl r21=THIS_CPU(cpu_info),r0
-	;;
-	add r10=r21, r10		// r10 <- &cpu_data(time_keeper_id)
-	tbit.nz p8,p0 = r2, IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT
-(p8)	br.spnt.many fsys_fallback_syscall
-#else
-	;;
-	mov r10=r3
-	movl r19=xtime			// xtime is a timespec struct
-#endif
-	ld4 r9=[r9]
-	movl r17=xtime_lock
-	;;
-
-	// r32, r33 should contain the 2 args of gettimeofday
-	adds r21=IA64_CPUINFO_ITM_NEXT_OFFSET, r10
-	mov r2=-1
-	tnat.nz p6,p7=r32		// guard against NaT args
-	;;
-
-	adds r10=IA64_CPUINFO_ITM_DELTA_OFFSET, r10
-(p7)	tnat.nz p6,p0=r33
-(p6)	br.cond.spnt.few .fail_einval
-
-	adds r8=IA64_CPUINFO_NSEC_PER_CYC_OFFSET, r3
-	movl r24=2361183241434822607	// for division hack (only for / 1000)
-	;;
-
-	ldf8 f7=[r10]			// f7 now contains itm_delta
-	setf.sig f11=r2
-	adds r10=8, r32
-
-	adds r20=IA64_TIMESPEC_TV_NSEC_OFFSET, r19	// r20 = &xtime->tv_nsec
-	movl r26=jiffies
-
-	setf.sig f9=r24			// f9 is used for division hack
-	movl r27=wall_jiffies
-
-	and r9=TIF_ALLWORK_MASK,r9
-	movl r25=last_nsec_offset
-	;;
-
-	/*
-	 * Verify that we have permission to write to struct timeval.  Note:
-	 * Another thread might unmap the mapping before we actually get
-	 * to store the result.  That's OK as long as the stores are also
-	 * protect by EX().
-	 */
-EX(.fail_efault, probe.w.fault r32, 3)		// this must come _after_ NaT-check
-EX(.fail_efault, probe.w.fault r10, 3)		// this must come _after_ NaT-check
-	nop 0
-
-	ldf8 f10=[r8]			// f10 <- local_cpu_data->nsec_per_cyc value
-	cmp.ne p8, p0=0, r9
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-.retry:	// *** seq = read_seqbegin(&xtime_lock); ***
-	ld4.acq r23=[r17]		// since &xtime_lock == &xtime_lock->sequence
-	ld8 r14=[r25]			// r14 (old) = last_nsec_offset
-
-	ld8 r28=[r26]			// r28 = jiffies
-	ld8 r29=[r27]			// r29 = wall_jiffies
-	;;
-
-	ldf8 f8=[r21]			// f8 now contains itm_next
-	mov.m r31=ar.itc		// put time stamp into r31 (ITC) == now
-	sub r28=r29, r28, 1		// r28 now contains "-(lost + 1)"
-	;;
-
-	ld8 r2=[r19]			// r2 = sec = xtime.tv_sec
-	ld8 r29=[r20]			// r29 = nsec = xtime.tv_nsec
-	tbit.nz p9, p10=r23, 0		// p9 <- is_odd(r23), p10 <- is_even(r23)
-
-	setf.sig f6=r28			// f6 <- -(lost + 1)				(6 cyc)
-	;;
-
+	mov r31 = r32
+	tnat.nz p6,p0 = r33		// guard against NaT argument
+(p6)    br.cond.spnt.few .fail_einval
+	mov r30 = CLOCK_DIVIDE_BY_1000
+	;;
+.gettime:
+	// Register map
+	// Incoming r31 = pointer to address where to place result
+	//          r30 = flags determining how time is processed
+	// r2,r3 = temp r4-r7 preserved
+	// r8 = result nanoseconds
+	// r9 = result seconds
+	// r10 = temporary storage for clock difference
+	// r11 = preserved: saved ar.pfs
+	// r12 = preserved: memory stack
+	// r13 = preserved: thread pointer
+	// r14 = address of mask / mask
+	// r15 = preserved: system call number
+	// r16 = preserved: current task pointer
+	// r17 = wall to monotonic use
+	// r18 = time_interpolator->offset
+	// r19 = address of wall_to_monotonic
+	// r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
+	// r21 = shift factor
+	// r22 = address of time interpolator->last_counter
+	// r23 = address of time_interpolator->last_cycle
+	// r24 = adress of time_interpolator->offset
+	// r25 = last_cycle value
+	// r26 = last_counter value
+	// r27 = pointer to xtime
+	// r28 = sequence number at the beginning of critcal section
+	// r29 = address of seqlock
+	// r30 = time processing flags / memory address
+	// r31 = pointer to result
+	// Predicates
+	// p6,p7 short term use
+	// p8 = timesource ar.itc
+	// p9 = timesource mmio64
+	// p10 = timesource mmio32
+	// p11 = timesource not to be handled by asm code
+	// p12 = memory time source ( = p9 | p10)
+	// p13 = do cmpxchg with time_interpolator_last_cycle
+	// p14 = Divide by 1000
+	// p15 = Add monotonic
+	//
+	// Note that instructions are optimized for McKinley. McKinley can process two
+	// bundles simultaneously and therefore we continuously try to feed the CPU
+	// two bundles and then a stop.
+	tnat.nz p6,p0 = r31	// branch deferred since it does not fit into bundle structure
+	mov pr = r30,0xc000	// Set predicates according to function
+	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
+	movl r20 = time_interpolator
+	;;
+	ld8 r20 = [r20]		// get pointer to time_interpolator structure
+	movl r29 = xtime_lock
+	ld4 r2 = [r2]		// process work pending flags
+	movl r27 = xtime
+	;;	// only one bundle here
+	ld8 r21 = [r20]		// first quad with control information
+	and r2 = TIF_ALLWORK_MASK,r2
+(p6)    br.cond.spnt.few .fail_einval	// deferred branch
+	;;
+	add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
+	extr r3 = r21,32,32	// time_interpolator->nsec_per_cyc
+	extr r8 = r21,0,16	// time_interpolator->source
+	cmp.ne p6, p0 = 0, r2	// Fallback if work is scheduled
+(p6)    br.cond.spnt.many fsys_fallback_syscall
+	;;
+	cmp.eq p8,p12 = 0,r8	// Check for cpu timer
+	cmp.eq p9,p0 = 1,r8	// MMIO64 ?
+	extr r2 = r21,24,8	// time_interpolator->jitter
+	cmp.eq p10,p0 = 2,r8	// MMIO32 ?
+	cmp.ltu p11,p0 = 2,r8	// function or other clock
+(p11)	br.cond.spnt.many fsys_fallback_syscall
+	;;
+	setf.sig f7 = r3	// Setup for scaling of counter
+(p15)	movl r19 = wall_to_monotonic
+(p12)	ld8 r30 = [r10]
+	cmp.ne p13,p0 = r2,r0	// need jitter compensation?
+	extr r21 = r21,16,8	// shift factor
+	;;
+.time_redo:
+	.pred.rel.mutex p8,p9,p10
+	ld4.acq r28 = [r29]	// xtime_lock.sequence. Must come first for locking purposes
+(p8)	mov r2 = ar.itc		// CPU_TIMER. 36 clocks latency!!!
+	add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
+(p9)	ld8 r2 = [r30]		// readq(ti->address). Could also have latency issues..
+(p10)	ld4 r2 = [r30]		// readw(ti->address)
+(p13)	add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
+	;;			// could be removed by moving the last add upward
+	ld8 r26 = [r22]		// time_interpolator->last_counter
+(p13)	ld8 r25 = [r23]		// time interpolator->last_cycle
+	add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
+(p15)	ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
+ 	ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
+	add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
+	;;
+	ld8 r18 = [r24]		// time_interpolator->offset
+	ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET	// xtime.tv_nsec
+(p13)	sub r3 = r25,r2	// Diff needed before comparison (thanks davidm)
+	;;
+	ld8 r14 = [r14]		// time_interpolator->mask
+(p13)	cmp.gt.unc p6,p7 = r3,r0	// check if it is less than last. p6,p7 cleared
+	sub r10 = r2,r26	// current_counter - last_counter
+	;;
+(p6)	sub r10 = r25,r26	// time we got was less than last_cycle
+(p7)	mov ar.ccv = r25	// more than last_cycle. Prep for cmpxchg
+	;;
+	and r10 = r10,r14	// Apply mask
+	;;
+	setf.sig f8 = r10
+	nop.i 123
+	;;
+(p7)	cmpxchg8.rel r3 = [r23],r2,ar.ccv
+EX(.fail_efault, probe.w.fault r31, 3)	// This takes 5 cycles and we have spare time
+	xmpy.l f8 = f8,f7	// nsec_per_cyc*(counter-last_counter)
+(p15)	add r9 = r9,r17		// Add wall to monotonic.secs to result secs
+	;;
+(p15)	ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
+(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful redo
+	// simulate tbit.nz.or p7,p0 = r28,0
+	and r28 = ~1,r28	// Make sequence even to force retry if odd
+	getf.sig r2 = f8
 	mf
-	xma.l f8=f6, f7, f8	// f8 (last_tick) <- -(lost + 1)*itm_delta + itm_next	(5 cyc)
-	nop 0
-
-	setf.sig f12=r31		// f12 <- ITC					(6 cyc)
-	// *** if (unlikely(read_seqretry(&xtime_lock, seq))) continue; ***
-	ld4 r24=[r17]			// r24 = xtime_lock->sequence (re-read)
-	nop 0
-	;;
-
-	xma.l f8=f11, f8, f12	// f8 (elapsed_cycles) <- (-1*last_tick + now) = (now - last_tick)
-	nop 0
-	;;
-
-	getf.sig r18=f8			// r18 <- (now - last_tick)
-	xmpy.l f8=f8, f10		// f8 <- elapsed_cycles*nsec_per_cyc (5 cyc)
-	add r3=r29, r14			// r3 = (nsec + old)
-	;;
-
-	cmp.lt p7, p8=r18, r0		// if now < last_tick, set p7 = 1, p8 = 0
-	getf.sig r18=f8			// r18 = elapsed_cycles*nsec_per_cyc		(6 cyc)
-	nop 0
-	;;
-
-(p10)	cmp.ne p9, p0=r23, r24		// if xtime_lock->sequence != seq, set p9
-	shr.u r18=r18, IA64_NSEC_PER_CYC_SHIFT	// r18 <- offset
-(p9)	br.spnt.many .retry
-	;;
-
-	mov ar.ccv=r14			// ar.ccv = old					(1 cyc)
-	cmp.leu p7, p8=r18, r14		// if (offset <= old), set p7 = 1, p8 = 0
-	;;
-
-(p8)	cmpxchg8.rel r24=[r25], r18, ar.ccv	// compare-and-exchange (atomic!)
-(p8)	add r3=r29, r18			// r3 = (nsec + offset)
-	;;
-	shr.u r3=r3, 3			// initiate dividing r3 by 1000
-	;;
-	setf.sig f8=r3			//						(6 cyc)
-	mov r10=1000000			// r10 = 1000000
-	;;
-(p8)	cmp.ne.unc p9, p0=r24, r14
-	xmpy.hu f6=f8, f9		//						(5 cyc)
-(p9)	br.spnt.many .retry
-	;;
-
-	getf.sig r3=f6			//						(6 cyc)
-	;;
-	shr.u r3=r3, 4			// end of division, r3 is divided by 1000 (=usec)
-	;;
-
-1:	cmp.geu p7, p0=r3, r10		// while (usec >= 1000000)
-	;;
-(p7)	sub r3=r3, r10			// usec -= 1000000
-(p7)	adds r2=1, r2			// ++sec
-(p7)	br.spnt.many 1b
-
-	// finally: r2 = sec, r3 = usec
-EX(.fail_efault, st8 [r32]=r2)
-	adds r9=8, r32
-	mov r8=r0			// success
-	;;
-EX(.fail_efault, st8 [r9]=r3)		// store them in the timeval struct
-	mov r10=0
+	add r8 = r8,r18		// Add time interpolator offset
+	;;
+	ld4 r10 = [r29]		// xtime_lock.sequence
+(p15)	add r8 = r8, r17	// Add monotonic.nsecs to nsecs
+	shr.u r2 = r2,r21
+	;;		// overloaded 3 bundles!
+	// End critical section.
+	add r8 = r8,r2		// Add xtime.nsecs
+	cmp4.ne.or p7,p0 = r28,r10
+(p7)	br.cond.dpnt.few .time_redo	// sequence number changed ?
+	// Now r8=tv->tv_nsec and r9=tv->tv_sec
+	mov r10 = r0
+	movl r2 = 1000000000
+	add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31
+(p14)	movl r3 = 2361183241434822607	// Prep for / 1000 hack
+	;;
+.time_normalize:
+	mov r21 = r8
+	cmp.ge p6,p0 = r8,r2
+(p14)	shr.u r20 = r8, 3		// We can repeat this if necessary just wasting some time
+	;;
+(p14)	setf.sig f8 = r20
+(p6)	sub r8 = r8,r2
+(p6)	add r9 = 1,r9			// two nops before the branch.
+(p14)	setf.sig f7 = r3		// Chances for repeats are 1 in 10000 for gettod
+(p6)	br.cond.dpnt.few .time_normalize
+	;;
+	// Divided by 8 though shift. Now divide by 125
+	// The compiler was able to do that with a multiply
+	// and a shift and we do the same
+EX(.fail_efault, probe.w.fault r23, 3)		// This also costs 5 cycles
+(p14)	xmpy.hu f8 = f8, f7			// xmpy has 5 cycles latency so use it...
+	;;
+	mov r8 = r0
+(p14)	getf.sig r2 = f8
+	;;
+(p14)	shr.u r21 = r2, 4
+	;;
+EX(.fail_efault, st8 [r31] = r9)
+EX(.fail_efault, st8 [r23] = r21)
 	FSYS_RETURN
-	/*
-	 * Note: We are NOT clearing the scratch registers here.  Since the only things
-	 *	 in those registers are time-related variables and some addresses (which
-	 *	 can be obtained from System.map), none of this should be security-sensitive
-	 *	 and we should be fine.
-	 */
-
 .fail_einval:
-	mov r8=EINVAL			// r8 = EINVAL
-	mov r10=-1			// r10 = -1
+	mov r8 = EINVAL
+	mov r10 = -1
 	FSYS_RETURN
-
 .fail_efault:
-	mov r8=EFAULT			// r8 = EFAULT
-	mov r10=-1			// r10 = -1
+	mov r8 = EFAULT
+	mov r10 = -1
 	FSYS_RETURN
 END(fsys_gettimeofday)
 
+ENTRY(fsys_clock_gettime)
+	.prologue
+	.altrp b6
+	.body
+	cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32
+	// Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC
+(p6)	br.spnt.few fsys_fallback_syscall
+	mov r31 = r33
+	shl r30 = r32,15
+	br.many .gettime
+END(fsys_clock_gettime)
+
 /*
  * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize).
  */
@@ -445,9 +460,9 @@ EX(.fail_efault, ld8 r14=[r33])			// r14 <- *set
 	;;
 
 	st8 [r2]=r14				// update current->blocked with new mask
-	cmpxchg4.acq r14=[r9],r18,ar.ccv	// current->thread_info->flags <- r18
+	cmpxchg4.acq r8=[r9],r18,ar.ccv		// current->thread_info->flags <- r18
 	;;
-	cmp.ne p6,p0=r17,r14			// update failed?
+	cmp.ne p6,p0=r17,r8			// update failed?
 (p6)	br.cond.spnt.few 1b			// yes -> retry
 
 #ifdef CONFIG_SMP
@@ -516,90 +531,114 @@ GLOBAL_ENTRY(fsys_bubble_down)
 	.altrp b6
 	.body
 	/*
-	 * We get here for syscalls that don't have a lightweight handler.  For those, we
-	 * need to bubble down into the kernel and that requires setting up a minimal
-	 * pt_regs structure, and initializing the CPU state more or less as if an
-	 * interruption had occurred.  To make syscall-restarts work, we setup pt_regs
-	 * such that cr_iip points to the second instruction in syscall_via_break.
-	 * Decrementing the IP hence will restart the syscall via break and not
-	 * decrementing IP will return us to the caller, as usual.  Note that we preserve
-	 * the value of psr.pp rather than initializing it from dcr.pp.  This makes it
-	 * possible to distinguish fsyscall execution from other privileged execution.
+	 * We get here for syscalls that don't have a lightweight
+	 * handler.  For those, we need to bubble down into the kernel
+	 * and that requires setting up a minimal pt_regs structure,
+	 * and initializing the CPU state more or less as if an
+	 * interruption had occurred.  To make syscall-restarts work,
+	 * we setup pt_regs such that cr_iip points to the second
+	 * instruction in syscall_via_break.  Decrementing the IP
+	 * hence will restart the syscall via break and not
+	 * decrementing IP will return us to the caller, as usual.
+	 * Note that we preserve the value of psr.pp rather than
+	 * initializing it from dcr.pp.  This makes it possible to
+	 * distinguish fsyscall execution from other privileged
+	 * execution.
 	 *
 	 * On entry:
-	 *	- normal fsyscall handler register usage, except that we also have:
+	 *	- normal fsyscall handler register usage, except
+	 *	  that we also have:
 	 *	- r18: address of syscall entry point
 	 *	- r21: ar.fpsr
 	 *	- r26: ar.pfs
 	 *	- r27: ar.rsc
 	 *	- r29: psr
+	 *
+	 * We used to clear some PSR bits here but that requires slow
+	 * serialization.  Fortuntely, that isn't really necessary.
+	 * The rationale is as follows: we used to clear bits
+	 * ~PSR_PRESERVED_BITS in PSR.L.  Since
+	 * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we
+	 * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}.
+	 * However,
+	 *
+	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
+	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
+	 * PSR.I  : already turned off by the time fsys_bubble_down gets
+	 *	    invoked
+	 * PSR.DFL: always 0 (kernel never turns it on)
+	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
+	 *	    initiative
+	 * PSR.DI : always 0 (kernel never turns it on)
+	 * PSR.SI : always 0 (kernel never turns it on)
+	 * PSR.DB : don't care --- kernel never enables kernel-level
+	 *	    breakpoints
+	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
+	 *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
+	 *          will trigger a taken branch; the taken-trap-handler then
+	 *          converts the syscall into a break-based system-call.
 	 */
-#	define PSR_PRESERVED_BITS	(IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \
-					 | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \
-					 | IA64_PSR_IC)
 	/*
-	 * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc.  The rest we have
-	 * to synthesize.
+	 * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc.
+	 * The rest we have to synthesize.
 	 */
-#	define PSR_ONE_BITS		((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \
+#	define PSR_ONE_BITS		((3 << IA64_PSR_CPL0_BIT)	\
+					 | (0x1 << IA64_PSR_RI_BIT)	\
 					 | IA64_PSR_BN | IA64_PSR_I)
 
-	invala
-	movl r8=PSR_ONE_BITS
+	invala					// M0|1
+	movl r14=ia64_ret_from_syscall		// X
 
-	mov r25=ar.unat			// save ar.unat (5 cyc)
-	movl r9=PSR_PRESERVED_BITS
+	nop.m 0
+	movl r28=__kernel_syscall_via_break	// X	create cr.iip
+	;;
 
-	mov ar.rsc=0			// set enforced lazy mode, pl 0, little-endian, loadrs=0
-	movl r28=__kernel_syscall_via_break
+	mov r2=r16				// A    get task addr to addl-addressable register
+	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A
+	mov r31=pr				// I0   save pr (2 cyc)
 	;;
-	mov r23=ar.bspstore		// save ar.bspstore (12 cyc)
-	mov r31=pr			// save pr (2 cyc)
-	mov r20=r1			// save caller's gp in r20
+	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
+	addl r22=IA64_RBS_OFFSET,r2		// A    compute base of RBS
+	add r3=TI_FLAGS+IA64_TASK_SIZE,r2	// A
 	;;
-	mov r2=r16			// copy current task addr to addl-addressable register
-	and r9=r9,r29
-	mov r19=b6			// save b6 (2 cyc)
+	ld4 r3=[r3]				// M0|1 r3 = current_thread_info()->flags
+	lfetch.fault.excl.nt1 [r22]		// M0|1 prefetch register backing-store
+	nop.i 0
 	;;
-	mov psr.l=r9			// slam the door (17 cyc to srlz.i)
-	or r29=r8,r29			// construct cr.ipsr value to save
-	addl r22=IA64_RBS_OFFSET,r2	// compute base of RBS
+	mov ar.rsc=0				// M2   set enforced lazy mode, pl 0, LE, loadrs=0
+	nop.m 0
+	nop.i 0
 	;;
-	// GAS reports a spurious RAW hazard on the read of ar.rnat because it thinks
-	// we may be reading ar.itc after writing to psr.l.  Avoid that message with
-	// this directive:
-	dv_serialize_data
-	mov.m r24=ar.rnat		// read ar.rnat (5 cyc lat)
-	lfetch.fault.excl.nt1 [r22]
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2
-
-	// ensure previous insn group is issued before we stall for srlz.i:
+	mov r23=ar.bspstore			// M2 (12 cyc) save ar.bspstore
+	mov.m r24=ar.rnat			// M2 (5 cyc) read ar.rnat (dual-issues!)
+	nop.i 0
 	;;
-	srlz.i				// ensure new psr.l has been established
-	/////////////////////////////////////////////////////////////////////////////
-	////////// from this point on, execution is not interruptible anymore
-	/////////////////////////////////////////////////////////////////////////////
-	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2	// compute base of memory stack
-	cmp.ne pKStk,pUStk=r0,r0	// set pKStk <- 0, pUStk <- 1
+	mov ar.bspstore=r22			// M2 (6 cyc) switch to kernel RBS
+	movl r8=PSR_ONE_BITS			// X
 	;;
-	st1 [r16]=r0			// clear current->thread.on_ustack flag
-	mov ar.bspstore=r22		// switch to kernel RBS
-	mov b6=r18			// copy syscall entry-point to b6 (7 cyc)
-	add r3=TI_FLAGS+IA64_TASK_SIZE,r2
+	mov r25=ar.unat				// M2 (5 cyc) save ar.unat
+	mov r19=b6				// I0   save b6 (2 cyc)
+	mov r20=r1				// A    save caller's gp in r20
 	;;
-	ld4 r3=[r3]				// r2 = current_thread_info()->flags
-	mov r18=ar.bsp			// save (kernel) ar.bsp (12 cyc)
-	mov ar.rsc=0x3			// set eager mode, pl 0, little-endian, loadrs=0
-	br.call.sptk.many b7=ia64_syscall_setup
+	or r29=r8,r29				// A    construct cr.ipsr value to save
+	mov b6=r18				// I0   copy syscall entry-point to b6 (7 cyc)
+	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack
+
+	mov r18=ar.bsp				// M2   save (kernel) ar.bsp (12 cyc)
+	cmp.ne pKStk,pUStk=r0,r0		// A    set pKStk <- 0, pUStk <- 1
+	br.call.sptk.many b7=ia64_syscall_setup	// B
 	;;
-	ssm psr.i
-	movl r2=ia64_ret_from_syscall
+	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
+	mov rp=r14				// I0   set the real return addr
+	and r3=_TIF_SYSCALL_TRACEAUDIT,r3	// A
 	;;
-	mov rp=r2				// set the real return addr
-	tbit.z p8,p0=r3,TIF_SYSCALL_TRACE
+	ssm psr.i				// M2   we're on kernel stacks now, reenable irqs
+	cmp.eq p8,p0=r3,r0			// A
+(p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
 
-(p8)	br.call.sptk.many b6=b6			// ignore this return addr
-	br.cond.sptk ia64_trace_syscall
+	nop.m 0
+(p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
+	br.cond.spnt ia64_trace_syscall		// B
 END(fsys_bubble_down)
 
 	.rodata
@@ -838,31 +877,8 @@ fsyscall_table:
 	data8 0				// timer_getoverrun
 	data8 0				// timer_delete
 	data8 0				// clock_settime
-	data8 0				// clock_gettime
-	data8 0				// clock_getres		// 1255
-	data8 0				// clock_nanosleep
-	data8 0				// fstatfs64
-	data8 0				// statfs64
-	data8 0
-	data8 0							// 1260
-	data8 0
-	data8 0				// mq_open
-	data8 0				// mq_unlink
-	data8 0				// mq_timedsend
-	data8 0				// mq_timedreceive	// 1265
-	data8 0				// mq_notify
-	data8 0				// mq_getsetattr
-	data8 0				// kexec_load
-	data8 0
-	data8 0							// 1270
-	data8 0
-	data8 0
-	data8 0
-	data8 0
-	data8 0							// 1275
-	data8 0
-	data8 0
-	data8 0
-	data8 0
+	data8 fsys_clock_gettime	// clock_gettime
 
-	.org fsyscall_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
+	// fill in zeros for the remaining entries
+	.zero:
+	.space fsyscall_table + 8*NR_syscalls - .zero, 0