1 /* $Id: U3copy_to_user.S,v 1.3 2000/11/01 09:29:19 davem Exp $
2 * U3copy_to_user.S: UltraSparc-III optimized copy to userspace.
4 * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
8 #include <asm/visasm.h>
11 #include <asm/spitfire.h>
12 #undef SMALL_COPY_USES_FPU
13 #define EXNV(x,y,a,b) \
19 .section __ex_table; \
24 #define EXNV2(x,y,a,b) \
31 .section __ex_table; \
36 #define EXNV3(x,y,a,b) \
43 .section __ex_table; \
55 .section __ex_table; \
65 add %o4, 0x1c0, %o1; \
66 and %o2, (0x40 - 1), %o2; \
69 .section __ex_table; \
80 and %o2, (0x40 - 1), %o2; \
84 .section __ex_table; \
94 and %o2, (0x40 - 1), %o2; \
97 .section __ex_table; \
102 #define EXBLK4(x,y) \
107 and %o2, (0x40 - 1), %o2; \
109 add %o2, 0x40, %o0; \
110 .section __ex_table; \
/* Userland-build variants: plain constant definitions plus EX* /EXBLK*
 * macros that expand to the bare instruction with NO __ex_table fixup
 * entries.  Presumably this is the #else half of a __KERNEL__
 * conditional whose #if side (the truncated multi-line macros above)
 * emits real exception-table records -- TODO confirm against full file.
 */
116 #define ASI_AIUS 0x80
117 #define ASI_BLK_AIUS 0xf0
118 #define FPRS_FEF 0x04
/* Minimal VISEntry/VISExit: save %fprs in %o5, enable FPU lower half;
 * restore on exit.  %o5 must stay live in between (see note below). */
119 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
120 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
121 #define SMALL_COPY_USES_FPU
/* Fault-fixup args (a,b / y) are ignored in this variant. */
122 #define EXNV(x,y,a,b) x,y;
123 #define EXNV2(x,y,a,b) x,y;
124 #define EXNV3(x,y,a,b) x,y;
125 #define EX(x,y,a,b) x,y;
126 #define EXBLK1(x,y) x,y;
127 #define EXBLK2(x,y) x,y;
128 #define EXBLK3(x,y) x,y;
129 #define EXBLK4(x,y) x,y;
132 /* Special/non-trivial issues of this code:
134 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
135 * 2) Only low 32 FPU registers are used so that only the
136 * lower half of the FPU register set is dirtied by this
137 * code. This is especially important in the kernel.
138 * 3) This code never prefetches cachelines past the end
139 * of the source buffer.
145 /* The cheetah's flexible spine, oversized liver, enlarged heart,
146 * slender muscular body, and claws make it the swiftest hunter
147 * in Africa and the fastest animal on land. Can reach speeds
148 * of up to 2.4GB per second.
/* U3copy_to_user entry: %o0 = user dst, %o1 = kernel src, %o2 = len.
 * Dispatch on length:
 *   len <= 0    -> short_ret  (nothing to copy)
 *   len <= 31   -> short      (byte-at-a-time loop)
 *   len >= 6*64 -> enter      (unrolled block-prefetch loops)
 *   otherwise   -> toosmall
 * NOTE(review): the original line numbering is discontinuous here, so
 * some compare/delay-slot instructions are not visible in this excerpt.
 */
151 .globl U3copy_to_user
152 U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */
153 /* Writing to %asi is _expensive_ so we hardcode it.
154 * Reading %asi to check for KERNEL_DS is comparatively
157 rd %asi, %g1 ! MS Group (4 cycles)
158 cmp %g1, ASI_AIUS ! A0 Group
162 /* Save away original 'dst' for memcpy return value. */
163 mov %o0, %g3 ! A0 Group
165 /* Anything to copy at all? */
167 ble,pn %icc, U3copy_to_user_short_ret ! BR
169 /* Extremely small copy? */
170 cmp %o2, 31 ! A0 Group
171 ble,pn %icc, U3copy_to_user_short ! BR
173 /* Large enough to use unrolled prefetch loops? */
175 bge,a,pt %icc, U3copy_to_user_enter ! BR Group
/* Annulled delay slot: precompute dst & 0x3f for the 64-byte
 * alignment fixup performed at U3copy_to_user_enter. */
176 andcc %o0, 0x3f, %g2 ! A0
178 ba,pt %xcc, U3copy_to_user_toosmall ! BR Group
/* Delay slot: precompute dst & 7 for the 8-byte alignment check. */
179 andcc %o0, 0x7, %g2 ! A0
/* Byte-at-a-time copy loop, used for len <= 31 and for trailing
 * "cruft" bytes.  Loads from kernel src, stores to user dst via %asi.
 * The EXNV fixup arg ("add %o2, 1") presumably recomputes the
 * bytes-not-yet-copied count if the user store faults -- TODO confirm
 * against the (truncated) EXNV macro body above.
 */
182 U3copy_to_user_short:
183 /* Copy %o2 bytes from src to dst, one byte at a time. */
184 ldub [%o1 + 0x00], %o3 ! MS Group
185 add %o1, 0x1, %o1 ! A0
186 add %o0, 0x1, %o0 ! A1
187 subcc %o2, 1, %o2 ! A0 Group
189 bg,pt %icc, U3copy_to_user_short ! BR
/* Delay slot: store the byte just loaded (dst already incremented). */
190 EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
192 U3copy_to_user_short_ret:
/* Two return points; the delay-slot instructions that set up the
 * return value (and any VISExitHalf on the FPU path) are not visible
 * in this excerpt -- original line numbering skips 193, 195-196, 198. */
194 retl ! BR Group (0-4 cycle stall)
197 retl ! BR Group (0-4 cycle stall)
201 /* Here len >= (6 * 64) and condition codes reflect execution
202 * of "andcc %o0, 0x3f, %g2", done by caller.
205 U3copy_to_user_enter:
206 /* Is 'dst' already aligned on an 64-byte boundary? */
209 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
210 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
211 * subtract this from 'len'.
213 sub %g2, 0x40, %g2 ! A0 Group
214 sub %g0, %g2, %g2 ! A0 Group
215 sub %o2, %g2, %o2 ! A0 Group
217 /* Copy %g2 bytes from src to dst, one byte at a time. */
218 1: ldub [%o1 + 0x00], %o3 ! MS (Group)
219 add %o1, 0x1, %o1 ! A1
220 add %o0, 0x1, %o0 ! A0 Group
221 subcc %g2, 0x1, %g2 ! A1
223 bg,pt %icc, 1b ! BR Group
224 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
/* dst is now 64-byte aligned.  Enable the lower FPU register half,
 * save src & 7 in %g1 (consumed later by endcruft), and 8-byte-align
 * src for the faligndata pipeline via alignaddr. */
226 2: VISEntryHalf ! MS+MS
227 and %o1, 0x7, %g1 ! A1
228 ba,pt %xcc, U3copy_to_user_begin ! BR
229 alignaddr %o1, %g0, %o1 ! MS (Break-after)
/* Prefetch pipeline warm-up: enable the P-cache, issue the initial
 * #one_read prefetches, load the first 64-byte source chunk into
 * %f0-%f14, and run faligndata to queue %f16-%f26.  Then choose
 * loop1 (len > 7*64, keeps prefetching) or loop2 (no prefetch).
 * The bge guards around the prefetches keep us from prefetching past
 * the end of src; their compare instructions are elided in this excerpt.
 */
232 U3copy_to_user_begin:
234 .globl U3copy_to_user_nop_1_6
235 U3copy_to_user_nop_1_6:
/* NOTE(review): %g3 held the original dst saved at entry "for memcpy
 * return value", but is overwritten here with the DCU control register
 * contents (restored to the DCU at nop_2_3 below) -- potential
 * conflict; confirm against the full file. */
236 ldxa [%g0] ASI_DCU_CONTROL_REG, %g3
237 sethi %uhi(DCU_PE), %o3
240 stxa %o3, [%g0] ASI_DCU_CONTROL_REG ! Enable P-cache
243 prefetch [%o1 + 0x000], #one_read ! MS Group1
244 prefetch [%o1 + 0x040], #one_read ! MS Group2
/* %o4 = len rounded down to a multiple of 64 (block-copy byte count). */
245 andn %o2, (0x40 - 1), %o4 ! A0
246 prefetch [%o1 + 0x080], #one_read ! MS Group3
248 prefetch [%o1 + 0x0c0], #one_read ! MS Group4
249 ldd [%o1 + 0x000], %f0 ! MS Group5 (%f0 results at G8)
250 bge,a,pt %icc, 1f ! BR
252 prefetch [%o1 + 0x100], #one_read ! MS Group6
253 1: ldd [%o1 + 0x008], %f2 ! AX (%f2 results at G9)
255 bge,a,pt %icc, 1f ! BR
256 prefetch [%o1 + 0x140], #one_read ! MS Group7
257 1: ldd [%o1 + 0x010], %f4 ! AX (%f4 results at G10)
259 bge,a,pt %icc, 1f ! BR
261 prefetch [%o1 + 0x180], #one_read ! MS Group8
/* Interleave the remaining chunk loads with faligndata so %f16-%f26
 * hold dst-aligned data ready for the first block store. */
262 1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
263 ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G12)
264 faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
265 ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G13)
266 faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
267 ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G15)
268 faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
270 ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G16)
271 faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
272 ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
273 faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
274 ldd [%o1 + 0x040], %f0 ! MS (%f0 results at G19)
276 /* We only use the first loop if len > (7 * 64). */
277 subcc %o4, 0x1c0, %o4 ! A0 Group17
278 bg,pt %icc, U3copy_to_user_loop1 ! BR
279 add %o1, 0x40, %o1 ! A1
/* Otherwise rebias %o4 and enter loop2 with %o3 = 64-byte block count. */
281 add %o4, 0x140, %o4 ! A0 Group18
282 ba,pt %xcc, U3copy_to_user_loop2 ! BR
283 srl %o4, 6, %o3 ! A0 Group19
293 /* This loop performs the copy and queues new prefetches.
294 * We drop into the second loop when len <= (5 * 64). Note
295 * that this (5 * 64) factor has been subtracted from len
/* Per iteration: load the next 64-byte src chunk into %f2-%f14/%f0,
 * block-store the previously aligned 64 bytes (%f16-%f30) to userspace
 * via stda ASI_BLK_AIUS (EXBLK1 supplies the fault fixup), faligndata
 * the fresh data into %f16-%f30, and prefetch 0x180 bytes ahead. */
298 U3copy_to_user_loop1:
299 ldd [%o1 + 0x008], %f2 ! MS Group2 (%f2 results at G5)
300 faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
301 ldd [%o1 + 0x010], %f4 ! MS Group3 (%f4 results at G6)
302 faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
303 EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
304 ldd [%o1 + 0x018], %f6 ! AX (%f6 results at G7)
306 faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
307 ldd [%o1 + 0x020], %f8 ! MS (%f8 results at G15)
308 faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
309 ldd [%o1 + 0x028], %f10 ! MS (%f10 results at G16)
310 faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
311 ldd [%o1 + 0x030], %f12 ! MS (%f12 results at G17)
312 faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
313 ldd [%o1 + 0x038], %f14 ! MS (%f14 results at G18)
315 faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
316 ldd [%o1 + 0x040], %f0 ! AX (%f0 results at G19)
317 prefetch [%o1 + 0x180], #one_read ! MS
318 faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
/* Advance src/dst by 64 and loop while the (pre-biased) count %o4 > 0. */
319 subcc %o4, 0x40, %o4 ! A0
320 add %o1, 0x40, %o1 ! A1
321 bg,pt %xcc, U3copy_to_user_loop1 ! BR
322 add %o0, 0x40, %o0 ! A0 Group18
324 U3copy_to_user_loop2_enter:
327 /* This loop performs on the copy, no new prefetches are
328 * queued. We do things this way so that we do not perform
329 * any spurious prefetches past the end of the src buffer.
/* Same 64-bytes-per-iteration structure as loop1, minus the prefetch.
 * Counted by %o3, the 64-byte block count computed at loop entry
 * ("srl %o4, 6, %o3" above). */
331 U3copy_to_user_loop2:
332 ldd [%o1 + 0x008], %f2 ! MS
333 faligndata %f12, %f14, %f28 ! FGA Group2
334 ldd [%o1 + 0x010], %f4 ! MS
335 faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
336 EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
337 ldd [%o1 + 0x018], %f6 ! AX
338 faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
340 ldd [%o1 + 0x020], %f8 ! MS
341 faligndata %f2, %f4, %f18 ! FGA Group13
342 ldd [%o1 + 0x028], %f10 ! MS
343 faligndata %f4, %f6, %f20 ! FGA Group14
344 ldd [%o1 + 0x030], %f12 ! MS
345 faligndata %f6, %f8, %f22 ! FGA Group15
346 ldd [%o1 + 0x038], %f14 ! MS
347 faligndata %f8, %f10, %f24 ! FGA Group16
349 ldd [%o1 + 0x040], %f0 ! AX
350 faligndata %f10, %f12, %f26 ! FGA Group17
/* One fewer block remaining; advance src/dst by 64 and loop. */
351 subcc %o3, 0x01, %o3 ! A0
352 add %o1, 0x40, %o1 ! A1
353 bg,pt %xcc, U3copy_to_user_loop2 ! BR
354 add %o0, 0x40, %o0 ! A0 Group18
356 /* Finally we copy the last full 64-byte block. */
/* Drain the software pipeline: store the two block-copy chunks still
 * queued in the FP registers (EXBLK3, then EXBLK4), then restore the
 * DCU control register and synchronize. */
357 U3copy_to_user_loopfini:
358 ldd [%o1 + 0x008], %f2 ! MS
359 faligndata %f12, %f14, %f28 ! FGA
360 ldd [%o1 + 0x010], %f4 ! MS Group19
361 faligndata %f14, %f0, %f30 ! FGA
362 EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
363 ldd [%o1 + 0x018], %f6 ! AX
364 faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
365 ldd [%o1 + 0x020], %f8 ! MS
366 faligndata %f2, %f4, %f18 ! FGA Group12
367 ldd [%o1 + 0x028], %f10 ! MS
368 faligndata %f4, %f6, %f20 ! FGA Group13
369 ldd [%o1 + 0x030], %f12 ! MS
370 faligndata %f6, %f8, %f22 ! FGA Group14
371 ldd [%o1 + 0x038], %f14 ! MS
372 faligndata %f8, %f10, %f24 ! FGA Group15
375 add %o0, 0x40, %o0 ! A1
376 ldd [%o1 + 0x040], %f0 ! MS
377 1: faligndata %f10, %f12, %f26 ! FGA Group16
378 faligndata %f12, %f14, %f28 ! FGA Group17
379 faligndata %f14, %f0, %f30 ! FGA Group18
380 EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
381 add %o0, 0x40, %o0 ! A0
382 add %o1, 0x40, %o1 ! A1
384 .globl U3copy_to_user_nop_2_3
385 U3copy_to_user_nop_2_3:
/* Undo the P-cache enable done at nop_1_6: %g3 holds the DCU control
 * register value saved there. */
386 mov PRIMARY_CONTEXT, %o3
387 stxa %g0, [%o3] ASI_DMMU ! Flush P-cache
388 stxa %g3, [%g0] ASI_DCU_CONTROL_REG ! Disable P-cache
390 membar #Sync ! MS Group26 (7-cycle stall)
392 /* Now we copy the (len modulo 64) bytes at the end.
393 * Note how we borrow the %f0 loaded above.
395 * Also notice how this code is careful not to perform a
396 * load past the end of the src buffer just like similar
397 * code found in U3copy_to_user_toosmall processing.
/* Copy the remaining (len % 64) bytes, 8 at a time, alternating the
 * %f0/%f2 pair through faligndata.  %g2 counts the full 8-byte words
 * (len & 0x38); any final 1-7 bytes fall through to endcruft.  The EX
 * fixup args presumably recompute bytes-not-copied on a store fault. */
399 U3copy_to_user_loopend:
400 and %o2, 0x3f, %o2 ! A0 Group
401 andcc %o2, 0x38, %g2 ! A0 Group
402 be,pn %icc, U3copy_to_user_endcruft ! BR
403 subcc %g2, 0x8, %g2 ! A1
404 be,pn %icc, U3copy_to_user_endcruft ! BR Group
407 be,a,pt %icc, 1f ! BR Group
408 ldd [%o1 + 0x00], %f0 ! MS
410 1: ldd [%o1 + 0x08], %f2 ! MS Group
411 add %o1, 0x8, %o1 ! A0
412 sub %o2, 0x8, %o2 ! A1
413 subcc %g2, 0x8, %g2 ! A0 Group
414 faligndata %f0, %f2, %f8 ! FGA Group
415 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
416 be,pn %icc, U3copy_to_user_endcruft ! BR
417 add %o0, 0x8, %o0 ! A0
/* Second half of the unrolled pair: roles of %f0 and %f2 swapped. */
418 ldd [%o1 + 0x08], %f0 ! MS Group
419 add %o1, 0x8, %o1 ! A0
420 sub %o2, 0x8, %o2 ! A1
421 subcc %g2, 0x8, %g2 ! A0 Group
422 faligndata %f2, %f0, %f8 ! FGA
423 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
425 add %o0, 0x8, %o0 ! A0 Group
427 /* If anything is left, we copy it one byte at a time.
428 * Note that %g1 is (src & 0x7) saved above before the
429 * alignaddr was performed.
/* Exit the FPU path: if nothing remains go straight to short_ret,
 * otherwise tail-jump into the byte-copy loop.  The compare and the
 * VISExitHalf that the comment block implies are elided here. */
431 U3copy_to_user_endcruft:
435 be,pn %icc, U3copy_to_user_short_ret
437 ba,a,pt %xcc, U3copy_to_user_short
439 /* If we get here, then 32 <= len < (6 * 64) */
/* Medium-copy path, FPU flavor: 8-byte-align dst by byte copy, then
 * copy 8 bytes per iteration with an alternating %f0/%f2 faligndata
 * pair (unrolled x2), then hand any 1-7 leftover bytes to short. */
440 U3copy_to_user_toosmall:
442 #ifdef SMALL_COPY_USES_FPU
444 /* Is 'dst' already aligned on an 8-byte boundary? */
445 be,pt %xcc, 2f ! BR Group
447 /* Compute abs((dst & 7) - 8) into %g2. This is the number
448 * of bytes to copy to make 'dst' 8-byte aligned. We pre-
449 * subtract this from 'len'.
451 sub %g2, 0x8, %g2 ! A0
452 sub %g0, %g2, %g2 ! A0 Group (reg-dep)
453 sub %o2, %g2, %o2 ! A0 Group (reg-dep)
455 /* Copy %g2 bytes from src to dst, one byte at a time. */
456 1: ldub [%o1 + 0x00], %o3 ! MS (Group) (%o3 in 3 cycles)
457 add %o1, 0x1, %o1 ! A1
458 add %o0, 0x1, %o0 ! A0 Group
459 subcc %g2, 0x1, %g2 ! A1
461 bg,pt %icc, 1b ! BR Group
462 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
464 2: VISEntryHalf ! MS+MS
466 /* Compute (len - (len % 8)) into %g2. This is guaranteed
469 andn %o2, 0x7, %g2 ! A0 Group
471 /* You may read this and believe that it allows reading
472 * one 8-byte longword past the end of src. It actually
473 * does not, as %g2 is subtracted as loads are done from
474 * src, so we always stop before running off the end.
475 * Also, we are guaranteed to have at least 0x10 bytes
/* Prime the pipeline: %g1 = 8-byte-aligned src pointer, %f0 = first
 * longword.  %g2 pre-decremented by 8 so the loop stops one load early
 * (see the comment above about not overrunning src). */
478 sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
479 alignaddr %o1, %g0, %g1 ! MS (Break-after)
480 ldd [%g1 + 0x00], %f0 ! MS Group (1-cycle stall)
481 add %g1, 0x8, %g1 ! A0
483 1: ldd [%g1 + 0x00], %f2 ! MS Group
484 add %g1, 0x8, %g1 ! A0
485 sub %o2, 0x8, %o2 ! A1
486 subcc %g2, 0x8, %g2 ! A0 Group
488 faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
489 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
490 add %o1, 0x8, %o1 ! A0
493 add %o0, 0x8, %o0 ! A1
/* Second half of the unrolled pair: %f0 and %f2 roles swapped. */
494 ldd [%g1 + 0x00], %f0 ! MS Group
495 add %g1, 0x8, %g1 ! A0
496 sub %o2, 0x8, %o2 ! A1
498 subcc %g2, 0x8, %g2 ! A0 Group
499 faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
500 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
501 add %o1, 0x8, %o1 ! A0
504 add %o0, 0x8, %o0 ! A1
506 /* Nothing left to copy? */
507 2: cmp %o2, 0 ! A0 Group
509 be,pn %icc, U3copy_to_user_short_ret ! BR Group
511 ba,a,pt %xcc, U3copy_to_user_short ! BR Group
513 #else /* !(SMALL_COPY_USES_FPU) */
/* Integer-only medium-copy variant: byte-align dst (loop 1), copy
 * 8 bytes at a time with ldx/stxa (loop 3), finish leftovers via
 * short.  NOTE(review): heavily elided in this excerpt -- the
 * alignment computation and loop bookkeeping instructions between
 * these lines are not visible. */
517 bne,pn %icc, U3copy_to_user_short
525 1: ldub [%o1 + 0x00], %o3
530 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
532 2: andn %o2, 0x7, %g2
535 3: ldx [%o1 + 0x00], %o3
540 EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
543 bne,pn %icc, U3copy_to_user_short
545 ba,a,pt %xcc, U3copy_to_user_short_ret
547 #endif /* !(SMALL_COPY_USES_FPU) */