GLOBAL_ENTRY(_start)
start_ap:
.prologue
- .save rp, r4 // terminate unwind chain with a NULL rp
- mov r4=r0
+ .save rp, r0 // terminate unwind chain with a NULL rp
.body
rsm psr.i | psr.ic
* Initialize kernel region registers:
* rr[5]: VHPT enabled, page size = PAGE_SHIFT
* rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT
- * rr[5]: VHPT disabled, page size = IA64_GRANULE_SHIFT
+ * rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT
*/
mov r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1)
movl r17=(5<<61)
#endif
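
For reference, the region-register value built by the "mov r16=..." line above packs three fields: VHPT enable in bit 0, the page size in bits 2..7, and the region ID from bit 8 upward. A minimal C sketch of that packing (make_rr() is a made-up helper, not a kernel API):

static unsigned long make_rr(unsigned long rid, unsigned long page_shift,
			     int vhpt_enable)
{
	/* Matches (ia64_rid(...) << 8) | (PAGE_SHIFT << 2) | 1 above. */
	return (rid << 8) | (page_shift << 2) | (vhpt_enable ? 1 : 0);
}
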
;;
tpa r3=r2 // r3 == phys addr of task struct
+ mov r16=-1
+(isBP) br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it
+
// load mapping for stack (virtaddr in r2, physaddr in r3)
rsm psr.ic
movl r17=PAGE_KERNEL
srlz.d
;;
+.load_current:
// load the "current" pointer (r13) and ar.k6 with the current task
mov IA64_KR(CURRENT)=r2 // virtual address
mov IA64_KR(CURRENT_STACK)=r16
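
In rough C terms, the BP shortcut added above amounts to the following; every identifier here is invented for illustration and is not a kernel API:

struct kernel_regs {
	void *current;		/* IA64_KR(CURRENT), i.e. ar.k6 */
	long  current_stack;	/* IA64_KR(CURRENT_STACK) */
};

static void load_current(struct kernel_regs *kr, void *task_vaddr,
			 int is_bp, long mapped_granule)
{
	/* The BP stack already lives in the region-5 kernel mapping, so no
	 * translation register is pinned for it; -1 means "none mapped". */
	kr->current_stack = is_bp ? -1 : mapped_granule;	/* r16 */
	kr->current = task_vaddr;				/* r2  */
}
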
*
* Inputs:
* r16 = new psr to establish
+ * Output:
+ * r19 = old virtual address of ar.bsp
+ * r20 = old virtual address of sp
*
* Note: RSE must already be in enforced lazy mode
*/
mov cr.ipsr=r16 // set new PSR
add r3=1f-ia64_switch_mode_phys,r15
- mov r17=ar.bsp
+ mov r19=ar.bsp
+ mov r20=sp
mov r14=rp // get return address into a general register
;;
// going to physical mode, use tpa to translate virt->phys
- tpa r17=r17
+ tpa r17=r19
tpa r3=r3
tpa sp=sp
tpa r14=r14
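
The effect of the tpa group above, as a hedged C sketch: the value destined for ar.bspstore is translated to its physical alias, while the original virtual ar.bsp and sp are left in r19/r20 as outputs for the later switch back to virtual mode. The names below are illustrative only:

struct phys_switch {
	unsigned long new_bspstore;	/* r17 = tpa(r19) */
	unsigned long old_virt_bsp;	/* r19, returned to the caller */
	unsigned long old_virt_sp;	/* r20, returned to the caller */
};

static struct phys_switch prepare_phys_switch(unsigned long virt_bsp,
					      unsigned long virt_sp,
					      unsigned long (*tpa)(unsigned long))
{
	struct phys_switch s = {
		.new_bspstore = tpa(virt_bsp),
		.old_virt_bsp = virt_bsp,
		.old_virt_sp  = virt_sp,
	};
	return s;
}
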
*
* Inputs:
* r16 = new psr to establish
+ * r19 = new bspstore to establish
+ * r20 = new sp to establish
*
* Note: RSE must already be in enforced lazy mode
*/
mov cr.ipsr=r16 // set new PSR
add r3=1f-ia64_switch_mode_virt,r15
- mov r17=ar.bsp
mov r14=rp // get return address into a general register
;;
// going to virtual
// - for code addresses, set upper bits of addr to KERNEL_START
- // - for stack addresses, set upper 3 bits to 0xe.... Dont change any of the
- // lower bits since we want it to stay identity mapped
+ // - for stack addresses, copy from input argument
movl r18=KERNEL_START
dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
- dep r17=-1,r17,61,3
- dep sp=-1,sp,61,3
+ mov sp=r20
;;
or r3=r3,r18
or r14=r14,r18
;;
mov r18=ar.rnat // save ar.rnat
- mov ar.bspstore=r17 // this steps on ar.rnat
+ mov ar.bspstore=r19 // this steps on ar.rnat
mov cr.iip=r3
mov cr.ifs=r0
;;
br.ret.sptk.many rp
END(ia64_delay_loop)
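
The dep/or pair in ia64_switch_mode_virt above rebases a physical code address into the kernel's virtual window: clear everything above KERNEL_TR_PAGE_SHIFT, then OR in KERNEL_START. A worked C sketch, with both constants assumed for the example rather than copied from a header:

#define KERNEL_START		0xa000000100000000UL	/* assumed example value */
#define KERNEL_TR_PAGE_SHIFT	26			/* assumed: 64MB kernel TR page */

static unsigned long phys_to_kernel_virt(unsigned long phys)
{
	unsigned long off = phys & ((1UL << KERNEL_TR_PAGE_SHIFT) - 1);	/* dep r3=0,r3,... */

	return off | KERNEL_START;					/* or r3=r3,r18 */
}
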
+/*
+ * Return a CPU-local timestamp in nano-seconds. This timestamp is
+ * NOT synchronized across CPUs; its return value must never be
+ * compared against the values returned on another CPU. The usage in
+ * kernel/sched.c ensures that.
+ *
+ * The return value of sched_clock() is NOT supposed to wrap around.
+ * If it did, it would cause some scheduling hiccups (at the worst).
+ * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even
+ * that would happen only once every 5+ years.
+ *
+ * The code below basically calculates:
+ *
+ * (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT
+ *
+ * except that the multiplication and the shift are done with 128-bit
+ * intermediate precision so that we can produce a full 64-bit result.
+ */
+GLOBAL_ENTRY(sched_clock)
+ addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
+ mov.m r9=ar.itc // fetch cycle-counter (35 cyc)
+ ;;
+ ldf8 f8=[r8]
+ ;;
+ setf.sig f9=r9 // certain to stall, so issue it _after_ ldf8...
+ ;;
+ xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc)
+ xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product
+ ;;
+ getf.sig r8=f10 // (5 cyc)
+ getf.sig r9=f11
+ ;;
+ shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
+ br.ret.sptk.many rp
+END(sched_clock)
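
The 128-bit multiply-and-shift described in the comment can be written in C as below. This is only an illustration of the arithmetic (using GCC's unsigned __int128); the shift constant is assumed for the example, the real one comes from the asm-offsets machinery:

#define IA64_NSEC_PER_CYC_SHIFT	30	/* assumed example value */

static unsigned long long cycles_to_ns(unsigned long long itc,
				       unsigned long long nsec_per_cyc)
{
	/* Full 128-bit product, as xmpy.lu/xmpy.hu + shrp do above. */
	unsigned __int128 prod = (unsigned __int128)itc * nsec_per_cyc;

	return (unsigned long long)(prod >> IA64_NSEC_PER_CYC_SHIFT);
}
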
+
GLOBAL_ENTRY(start_kernel_thread)
.prologue
.save rp, r0 // this is the end of the call-chain
* Inputs:
* ar.pfs - saved CFM of caller
* ar.ccv - 0 (and available for use)
+ * r27 - flags from spin_lock_irqsave or 0. Must be preserved.
* r28 - available for use.
* r29 - available for use.
* r30 - available for use.
* r31 - address of lock, available for use.
* b6 - return address
* p14 - available for use.
+ * p15 - used to track flag status.
*
* If you patch this code to use more registers, do not forget to update
* the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
.save rp, r28
.body
nop 0
- nop 0
+ tbit.nz p15,p0=r27,IA64_PSR_I_BIT
.restore sp // pop existing prologue after next insn
mov b6 = r28
.prologue
.save ar.pfs, r0
.altrp b6
.body
+ ;;
+(p15) ssm psr.i // reenable interrupts if they were on
+ // DavidM says that srlz.d is slow and is not required in this case
.wait:
// exponential backoff, kdb, lockmeter etc. go in here
hint @pause
ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word
nop 0
;;
- cmp4.eq p14,p0=r30,r0
-(p14) br.cond.sptk.few b6 // lock is now free, try to acquire
- br.cond.sptk.few .wait
+ cmp4.ne p14,p0=r30,r0
+(p14) br.cond.sptk.few .wait
+(p15) rsm psr.i // disable interrupts if we reenabled them
+ br.cond.sptk.few b6 // lock is now free, try to acquire
END(ia64_spinlock_contention_pre3_4)
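
In C-like terms, the interrupt handling added to the wait loop above behaves roughly as sketched below; the helpers are stand-ins for ssm/rsm psr.i and hint @pause, and r27's PSR.i bit becomes the irqs_were_on flag. Returning corresponds to the final branch through b6, after which the caller retries the acquire:

static void spin_until_free(volatile unsigned int *lock, int irqs_were_on,
			    void (*irq_enable)(void), void (*irq_disable)(void),
			    void (*cpu_relax)(void))
{
	if (irqs_were_on)
		irq_enable();		/* (p15) ssm psr.i */

	while (*lock != 0)		/* ld4 + cmp4.ne, loop back to .wait */
		cpu_relax();		/* hint @pause */

	if (irqs_were_on)
		irq_disable();		/* (p15) rsm psr.i before retrying */
}
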
#else
.prologue
.altrp b6
.body
+ tbit.nz p15,p0=r27,IA64_PSR_I_BIT
+ ;;
.wait:
+(p15) ssm psr.i // reenable interrupts if they were on
+ // DavidM says that srlz.d is slow and is not required in this case
+.wait2:
// exponential backoff, kdb, lockmeter etc. go in here
hint @pause
ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word
;;
cmp4.ne p14,p0=r30,r0
mov r30 = 1
-(p14) br.cond.sptk.few .wait
+(p14) br.cond.sptk.few .wait2
+(p15) rsm psr.i // disable interrupts if we reenabled them
;;
cmpxchg4.acq r30=[r31], r30, ar.ccv
;;
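
The non-pre-3.4 variant performs the acquire itself: ar.ccv is 0, so cmpxchg4.acq only succeeds if the lock word is still 0, and a failed attempt falls back to the wait loop. A hedged sketch of that attempt, with a GCC builtin standing in for cmpxchg4.acq:

static int try_acquire(volatile unsigned int *lock)
{
	/* mov r30=1; cmpxchg4.acq r30=[r31],r30,ar.ccv with ar.ccv == 0 */
	return __sync_val_compare_and_swap(lock, 0, 1) == 0;
}
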