1 /* $Id: U3copy_in_user.S,v 1.4 2001/03/21 05:58:47 davem Exp $
2 * U3copy_in_user.S: UltraSparc-III optimized copy within userspace.
4 * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
8 #include <asm/visasm.h>
10 #undef SMALL_COPY_USES_FPU
11 #define EXNV(x,y,a,b) \
17 .section __ex_table; \
22 #define EXNV2(x,y,a,b) \
29 .section __ex_table; \
34 #define EXNV3(x,y,a,b) \
41 .section __ex_table; \
53 .section __ex_table; \
63 add %o4, 0x1c0, %o1; \
64 and %o2, (0x40 - 1), %o2; \
67 .section __ex_table; \
78 and %o2, (0x40 - 1), %o2; \
82 .section __ex_table; \
92 and %o2, (0x40 - 1), %o2; \
95 .section __ex_table; \
100 #define EXBLK4(x,y) \
105 and %o2, (0x40 - 1), %o2; \
107 add %o2, 0x40, %o0; \
108 .section __ex_table; \
/* ASI_AIUS is the value compared against %asi on entry to
 * U3copy_in_user; ASI_BLK_AIUS is the ASI named by the block stores
 * (stda %f16, [%o0] ASI_BLK_AIUS) in the copy loops.
 */
114 #define ASI_AIUS 0x80
115 #define ASI_BLK_AIUS 0xf0
116 #define FPRS_FEF 0x04
/* VISEntryHalf: copy the current %fprs into %o5, then write FPRS_FEF to %fprs. */
117 #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
/* VISExitHalf: keep only the FPRS_FEF bit of the saved %o5 and write it back to %fprs. */
118 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
119 #define SMALL_COPY_USES_FPU
/* With these definitions every access wrapper expands to the bare
 * instruction "x,y;".  The (a,b) arguments of EXNV/EXNV2/EXNV3/EX are
 * accepted but unused, so these expansions emit no __ex_table entry
 * for the wrapped load or store.
 */
120 #define EXNV(x,y,a,b) x,y;
121 #define EXNV2(x,y,a,b) x,y;
122 #define EXNV3(x,y,a,b) x,y;
123 #define EX(x,y,a,b) x,y;
124 #define EXBLK1(x,y) x,y;
125 #define EXBLK2(x,y) x,y;
126 #define EXBLK3(x,y) x,y;
127 #define EXBLK4(x,y) x,y;
130 /* Special/non-trivial issues of this code:
132 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
133 * 2) Only low 32 FPU registers are used so that only the
134 * lower half of the FPU register set is dirtied by this
135 * code. This is especially important in the kernel.
136 * 3) This code never prefetches cachelines past the end
137 * of the source buffer.
139 * XXX Actually, Cheetah can buffer up to 8 concurrent
140 * XXX prefetches, revisit this...
146 /* The cheetah's flexible spine, oversized liver, enlarged heart,
147 * slender muscular body, and claws make it the swiftest hunter
148 * in Africa and the fastest animal on land. Can reach speeds
149 * of up to 2.4GB per second.
152 .globl U3copy_in_user
153 U3copy_in_user: /* %o0=dst, %o1=src, %o2=len */
/* Entry sequence: read %asi into %g1 and compare it with ASI_AIUS,
 * keep the original dst in %g3, then dispatch on len.  Copies of at
 * most 31 bytes go to the byte loop (U3copy_in_user_short); the
 * prefetching path is entered at U3copy_in_user_enter with
 * %g2 = dst & 0x3f; everything else goes to U3copy_in_user_toosmall
 * with %g2 = dst & 0x7.
 */
154 /* Writing to %asi is _expensive_ so we hardcode it.
155 * Reading %asi to check for KERNEL_DS is comparatively
158 rd %asi, %g1 ! MS Group (4 cycles)
159 cmp %g1, ASI_AIUS ! A0 Group
163 /* Save away original 'dst' for memcpy return value. */
164 mov %o0, %g3 ! A0 Group
166 /* Anything to copy at all? */
168 ble,pn %icc, U3copy_in_user_short_ret ! BR
170 /* Extremely small copy? */
171 cmp %o2, 31 ! A0 Group
172 ble,pn %icc, U3copy_in_user_short ! BR
174 /* Large enough to use unrolled prefetch loops? */
/* The branch is annulled (",a"): the andcc in its delay slot executes
 * only when the branch is taken, so %g2 = dst & 0x3f and the matching
 * condition codes exist only on the U3copy_in_user_enter path.
 */
176 bge,a,pt %icc, U3copy_in_user_enter ! BR Group
177 andcc %o0, 0x3f, %g2 ! A0
/* Unconditional branch without annul: the delay-slot andcc always
 * executes, leaving %g2 = dst & 0x7 for U3copy_in_user_toosmall.
 */
179 ba,pt %xcc, U3copy_in_user_toosmall ! BR Group
180 andcc %o0, 0x7, %g2 ! A0
/* Byte loop shared by the small-copy case and by every tail path.
 * In: %o1 = src, %o0 = dst, %o2 = bytes still to copy.
 * Each pass loads one byte into %o3, advances both pointers and
 * decrements %o2.  The store sits in the delay slot of the loop
 * branch (not annulled, so it also runs on the final pass) and
 * addresses [%o0 - 1] because %o0 has already been advanced.
 * The trailing "add ..." operands are the (a,b) macro arguments;
 * presumably the residual-length expression for the fault fixup --
 * confirm against the EXNV definition in use.
 */
183 U3copy_in_user_short:
184 /* Copy %o2 bytes from src to dst, one byte at a time. */
185 EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS Group
186 add %o1, 0x1, %o1 ! A0
187 add %o0, 0x1, %o0 ! A1
188 subcc %o2, 1, %o2 ! A0 Group
190 bg,pt %icc, U3copy_in_user_short ! BR
191 EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1) ! MS Group (1-cycle stall)
/* Common exit label, also targeted by the "nothing left" branches.
 * NOTE(review): two retl instructions appear back to back here with
 * no delay-slot instruction or conditional between them -- confirm
 * which return-value setup belongs with each.
 */
193 U3copy_in_user_short_ret:
195 retl ! BR Group (0-4 cycle stall)
198 retl ! BR Group (0-4 cycle stall)
202 /* Here len >= (6 * 64) and condition codes reflect execution
203 * of "andcc %o0, 0x3f, %g2", done by caller.
206 U3copy_in_user_enter:
207 /* Is 'dst' already aligned on a 64-byte boundary? */
210 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
211 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
212 * subtract this from 'len'.
214 sub %g2, 0x40, %g2 ! A0 Group
215 sub %g0, %g2, %g2 ! A0 Group
216 sub %o2, %g2, %o2 ! A0 Group
218 /* Copy %g2 bytes from src to dst, one byte at a time. */
/* %g2 counts down to zero.  The store sits in the delay slot of the
 * loop branch and addresses [%o0 - 1] because %o0 has already been
 * advanced past the byte being written.
 */
219 1: EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group)
220 add %o1, 0x1, %o1 ! A1
221 add %o0, 0x1, %o0 ! A0 Group
222 subcc %g2, 0x1, %g2 ! A1
224 bg,pt %icc, 1b ! BR Group
225 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
/* Local label 2: write FPRS_FEF to %fprs (VISEntryHalf, old value kept
 * in %o5), record src & 0x7 in %g1 while %o1 is still unmodified, and
 * let alignaddr -- in the delay slot of the branch -- round %o1 down
 * to an 8-byte boundary and set the GSR alignment offset that the
 * faligndata instructions rely on.
 */
227 2: VISEntryHalf ! MS+MS
228 and %o1, 0x7, %g1 ! A1
229 ba,pt %xcc, U3copy_in_user_begin ! BR
230 alignaddr %o1, %g0, %o1 ! MS (Break-after)
/* Pipeline priming for the block-copy loops.
 *  - Issues prefetches for src at +0x000 .. +0x0c0 unconditionally and
 *    at +0x100, +0x140, +0x180 conditionally.
 *  - %o4 = len with the low six bits cleared (a multiple of 64).
 *  - Loads the eight doublewords at src+0x00..0x38 into %f0-%f14 and
 *    the one at src+0x40 into %f0 again, while faligndata starts
 *    producing the realigned output in %f16-%f26.
 *  - Chooses loop1 or loop2 from %o4 (see the end of this block).
 */
233 U3copy_in_user_begin:
234 prefetcha [%o1 + 0x000] %asi, #one_read ! MS Group1
235 prefetcha [%o1 + 0x040] %asi, #one_read ! MS Group2
236 andn %o2, (0x40 - 1), %o4 ! A0
237 prefetcha [%o1 + 0x080] %asi, #one_read ! MS Group3
239 prefetcha [%o1 + 0x0c0] %asi, #one_read ! MS Group4
240 EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0) ! MS Group5 (%f0 results at G8)
/* Each "bge,a,pt ... 1f" below targets the instruction right after its
 * own delay slot.  Because the branch is annulled, its only effect is
 * to execute the delay-slot prefetch when the condition holds and to
 * skip that prefetch otherwise.
 */
241 bge,a,pt %icc, 1f ! BR
243 prefetcha [%o1 + 0x100] %asi, #one_read ! MS Group6
244 1: EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0) ! AX (%f2 results at G9)
246 bge,a,pt %icc, 1f ! BR
247 prefetcha [%o1 + 0x140] %asi, #one_read ! MS Group7
248 1: EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0) ! AX (%f4 results at G10)
250 bge,a,pt %icc, 1f ! BR
252 prefetcha [%o1 + 0x180] %asi, #one_read ! MS Group8
253 1: faligndata %f0, %f2, %f16 ! FGA Group9 (%f16 at G12)
254 EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0) ! AX (%f6 results at G12)
255 faligndata %f2, %f4, %f18 ! FGA Group10 (%f18 results at G13)
256 EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0) ! MS (%f8 results at G13)
257 faligndata %f4, %f6, %f20 ! FGA Group12 (1-cycle stall,%f20 at G15)
258 EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS (%f10 results at G15)
259 faligndata %f6, %f8, %f22 ! FGA Group13 (%f22 results at G16)
261 EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS (%f12 results at G16)
262 faligndata %f8, %f10, %f24 ! FGA Group15 (1-cycle stall,%f24 at G18)
263 EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS (%f14 results at G18)
264 faligndata %f10, %f12, %f26 ! FGA Group16 (%f26 results at G19)
265 EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0) ! MS (%f0 results at G19)
267 /* We only use the first loop if len > (7 * 64). */
/* 0x1c0 = 7 * 64.  The delay-slot add advances src by one block on
 * both outcomes of the branch (the branch is not annulled).
 */
268 subcc %o4, 0x1c0, %o4 ! A0 Group17
269 bg,pt %icc, U3copy_in_user_loop1 ! BR
270 add %o1, 0x40, %o1 ! A1
/* Not enough for loop1: add 0x140 (5 * 64) back and turn the result
 * into the loop2 pass count, %o3 = %o4 >> 6, in the delay slot.
 */
272 add %o4, 0x140, %o4 ! A0 Group18
273 ba,pt %xcc, U3copy_in_user_loop2 ! BR
274 srl %o4, 6, %o3 ! A0 Group19
/* loop1 moves one 64-byte block per pass.
 * Per pass: load the doublewords at src+0x08..0x38 into %f2-%f14 and
 * the one at src+0x40 into %f0 (input for the next pass), realign
 * with faligndata into %f16-%f30, block-store %f16.. to [%o0] with
 * ASI_BLK_AIUS, and issue one prefetch at src+0x180.
 * %o4 is decremented by 0x40 each pass and the loop repeats while it
 * stays positive; %o1 and %o0 each advance by 0x40 (the %o0 update is
 * in the branch delay slot).  Falls through to
 * U3copy_in_user_loop2_enter when done.
 */
284 /* This loop performs the copy and queues new prefetches.
285 * We drop into the second loop when len <= (5 * 64). Note
286 * that this (5 * 64) factor has been subtracted from len
289 U3copy_in_user_loop1:
290 EXBLK1(ldda [%o1 + 0x008] %asi, %f2) ! MS Group2 (%f2 results at G5)
291 faligndata %f12, %f14, %f28 ! FGA (%f28 results at G5)
292 EXBLK1(ldda [%o1 + 0x010] %asi, %f4) ! MS Group3 (%f4 results at G6)
293 faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall, %f30 at G7)
294 EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
295 EXBLK1(ldda [%o1 + 0x018] %asi, %f6) ! AX (%f6 results at G7)
297 faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
298 EXBLK1(ldda [%o1 + 0x020] %asi, %f8) ! MS (%f8 results at G15)
299 faligndata %f2, %f4, %f18 ! FGA Group13 (%f18 results at G16)
300 EXBLK1(ldda [%o1 + 0x028] %asi, %f10) ! MS (%f10 results at G16)
301 faligndata %f4, %f6, %f20 ! FGA Group14 (%f20 results at G17)
302 EXBLK1(ldda [%o1 + 0x030] %asi, %f12) ! MS (%f12 results at G17)
303 faligndata %f6, %f8, %f22 ! FGA Group15 (%f22 results at G18)
304 EXBLK1(ldda [%o1 + 0x038] %asi, %f14) ! MS (%f14 results at G18)
306 faligndata %f8, %f10, %f24 ! FGA Group16 (%f24 results at G19)
307 EXBLK1(ldda [%o1 + 0x040] %asi, %f0) ! AX (%f0 results at G19)
308 prefetcha [%o1 + 0x180] %asi, #one_read ! MS
309 faligndata %f10, %f12, %f26 ! FGA Group17 (%f26 results at G20)
310 subcc %o4, 0x40, %o4 ! A0
311 add %o1, 0x40, %o1 ! A1
312 bg,pt %xcc, U3copy_in_user_loop1 ! BR
313 add %o0, 0x40, %o0 ! A0 Group18
315 U3copy_in_user_loop2_enter:
318 /* This loop performs only the copy, no new prefetches are
319 * queued. We do things this way so that we do not perform
320 * any spurious prefetches past the end of the src buffer.
/* loop2 has the same load / faligndata / block-store sequence as
 * loop1 but contains no prefetcha.  The pass counter is %o3: it is
 * decremented by one each pass and the loop repeats while it stays
 * positive.  %o1 and %o0 each advance by 0x40 per pass (the %o0
 * update is in the branch delay slot).
 */
322 U3copy_in_user_loop2:
323 EXBLK2(ldda [%o1 + 0x008] %asi, %f2) ! MS
324 faligndata %f12, %f14, %f28 ! FGA Group2
325 EXBLK2(ldda [%o1 + 0x010] %asi, %f4) ! MS
326 faligndata %f14, %f0, %f30 ! FGA Group4 (1-cycle stall)
327 EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
328 EXBLK2(ldda [%o1 + 0x018] %asi, %f6) ! AX
329 faligndata %f0, %f2, %f16 ! FGA Group12 (7-cycle stall)
331 EXBLK2(ldda [%o1 + 0x020] %asi, %f8) ! MS
332 faligndata %f2, %f4, %f18 ! FGA Group13
333 EXBLK2(ldda [%o1 + 0x028] %asi, %f10) ! MS
334 faligndata %f4, %f6, %f20 ! FGA Group14
335 EXBLK2(ldda [%o1 + 0x030] %asi, %f12) ! MS
336 faligndata %f6, %f8, %f22 ! FGA Group15
337 EXBLK2(ldda [%o1 + 0x038] %asi, %f14) ! MS
338 faligndata %f8, %f10, %f24 ! FGA Group16
340 EXBLK2(ldda [%o1 + 0x040] %asi, %f0) ! AX
341 faligndata %f10, %f12, %f26 ! FGA Group17
342 subcc %o3, 0x01, %o3 ! A0
343 add %o1, 0x40, %o1 ! A1
344 bg,pt %xcc, U3copy_in_user_loop2 ! BR
345 add %o0, 0x40, %o0 ! A0 Group18
347 /* Finally we copy the last full 64-byte block. */
/* Drain sequence: complete and block-store the 64 bytes already being
 * assembled in %f16-%f30, then realign the remaining loaded
 * doublewords into %f16-%f30 and block-store those as well.  Two
 * block stores are issued, each followed by a 0x40 advance of %o0;
 * %o1 advances by 0x40 once.  The membar #Sync comes after the last
 * block store.
 * NOTE(review): local label 1 below has no visible branch targeting
 * it -- confirm the condition under which the load just before it is
 * skipped.
 */
348 U3copy_in_user_loopfini:
349 EXBLK3(ldda [%o1 + 0x008] %asi, %f2) ! MS
350 faligndata %f12, %f14, %f28 ! FGA
351 EXBLK3(ldda [%o1 + 0x010] %asi, %f4) ! MS Group19
352 faligndata %f14, %f0, %f30 ! FGA
353 EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS) ! MS Group20
354 EXBLK4(ldda [%o1 + 0x018] %asi, %f6) ! AX
355 faligndata %f0, %f2, %f16 ! FGA Group11 (7-cycle stall)
356 EXBLK4(ldda [%o1 + 0x020] %asi, %f8) ! MS
357 faligndata %f2, %f4, %f18 ! FGA Group12
358 EXBLK4(ldda [%o1 + 0x028] %asi, %f10) ! MS
359 faligndata %f4, %f6, %f20 ! FGA Group13
360 EXBLK4(ldda [%o1 + 0x030] %asi, %f12) ! MS
361 faligndata %f6, %f8, %f22 ! FGA Group14
362 EXBLK4(ldda [%o1 + 0x038] %asi, %f14) ! MS
363 faligndata %f8, %f10, %f24 ! FGA Group15
366 add %o0, 0x40, %o0 ! A1
367 EXBLK4(ldda [%o1 + 0x040] %asi, %f0) ! MS
368 1: faligndata %f10, %f12, %f26 ! FGA Group16
369 faligndata %f12, %f14, %f28 ! FGA Group17
370 faligndata %f14, %f0, %f30 ! FGA Group18
371 EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS) ! MS
372 add %o0, 0x40, %o0 ! A0
373 add %o1, 0x40, %o1 ! A1
374 membar #Sync ! MS Group26 (7-cycle stall)
/* Tail after the block loops.  %o2 is reduced to len & 0x3f, and
 * %g2 = %o2 & 0x38 is the part of that remainder made of whole
 * doublewords (in bytes).  Doublewords are then moved one at a time:
 * each step loads the next source doubleword, combines it with the
 * previous one through faligndata into %f8 and stores %f8 to dst.
 * %f0 and %f2 alternate as "previous" and "next", so every loaded
 * doubleword is used twice.  %o1 and %o0 advance by 8 and %o2 drops
 * by 8 per step; control leaves for U3copy_in_user_endcruft when the
 * %g2 countdown reaches zero.
 */
376 /* Now we copy the (len modulo 64) bytes at the end.
377 * Note how we borrow the %f0 loaded above.
379 * Also notice how this code is careful not to perform a
380 * load past the end of the src buffer just like similar
381 * code found in U3copy_in_user_toosmall processing.
383 U3copy_in_user_loopend:
384 and %o2, 0x3f, %o2 ! A0 Group
385 andcc %o2, 0x38, %g2 ! A0 Group
386 be,pn %icc, U3copy_in_user_endcruft ! BR
387 subcc %g2, 0x8, %g2 ! A1
388 be,pn %icc, U3copy_in_user_endcruft ! BR Group
391 be,a,pt %icc, 1f ! BR Group
392 EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0) ! MS
394 1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0) ! MS Group
395 add %o1, 0x8, %o1 ! A0
396 sub %o2, 0x8, %o2 ! A1
397 subcc %g2, 0x8, %g2 ! A0 Group
398 faligndata %f0, %f2, %f8 ! FGA Group
399 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
400 be,pn %icc, U3copy_in_user_endcruft ! BR
401 add %o0, 0x8, %o0 ! A0
402 EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0) ! MS Group
403 add %o1, 0x8, %o1 ! A0
404 sub %o2, 0x8, %o2 ! A1
405 subcc %g2, 0x8, %g2 ! A0 Group
406 faligndata %f2, %f0, %f8 ! FGA
407 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS (XXX does it stall here? XXX)
409 add %o0, 0x8, %o0 ! A0 Group
411 /* If anything is left, we copy it one byte at a time.
412 * Note that %g1 is (src & 0x7) saved above before the
413 * alignaddr was performed.
/* Final dispatch: leave through U3copy_in_user_short_ret when the
 * zero condition is set, otherwise finish the remaining bytes in the
 * U3copy_in_user_short byte loop.  "ba,a" annuls its delay slot, so no
 * instruction executes between the branch and its target.
 */
415 U3copy_in_user_endcruft:
419 be,pn %icc, U3copy_in_user_short_ret
421 ba,a,pt %xcc, U3copy_in_user_short
423 /* If we get here, then 32 <= len < (6 * 64) */
/* Entered with %g2 = dst & 0x7 and the condition codes set by that
 * andcc, which runs in the delay slot of the branch that leads here.
 * Two implementations follow, selected by SMALL_COPY_USES_FPU: one
 * that moves doublewords through the FP registers with faligndata,
 * and one that uses only integer loads and stores.  Both hand any
 * remaining bytes to U3copy_in_user_short.
 */
424 U3copy_in_user_toosmall:
426 #ifdef SMALL_COPY_USES_FPU
428 /* Is 'dst' already aligned on an 8-byte boundary? */
429 be,pt %xcc, 2f ! BR Group
431 /* Compute abs((dst & 7) - 8) into %g2. This is the number
432 * of bytes to copy to make 'dst' 8-byte aligned. We pre-
433 * subtract this from 'len'.
435 sub %g2, 0x8, %g2 ! A0
436 sub %g0, %g2, %g2 ! A0 Group (reg-dep)
437 sub %o2, %g2, %o2 ! A0 Group (reg-dep)
439 /* Copy %g2 bytes from src to dst, one byte at a time. */
/* %g2 counts down to zero; the delay-slot store addresses [%o0 - 1]
 * because %o0 has already been advanced.
 */
440 1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS (Group) (%o3 in 3 cycles)
441 add %o1, 0x1, %o1 ! A1
442 add %o0, 0x1, %o0 ! A0 Group
443 subcc %g2, 0x1, %g2 ! A1
445 bg,pt %icc, 1b ! BR Group
446 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS Group
/* Local label 2: write FPRS_FEF to %fprs (VISEntryHalf, old value kept
 * in %o5) before the FP registers are used.  In the code that follows,
 * %g1 is the 8-byte-aligned load pointer produced by alignaddr, while
 * %o1 keeps tracking the true source position in steps of 8.
 */
448 2: VISEntryHalf ! MS+MS
450 /* Compute (len - (len % 8)) into %g2. This is guaranteed
453 andn %o2, 0x7, %g2 ! A0 Group
455 /* You may read this and believe that it allows reading
456 * one 8-byte longword past the end of src. It actually
457 * does not, as %g2 is subtracted as loads are done from
458 * src, so we always stop before running off the end.
459 * Also, we are guaranteed to have at least 0x10 bytes
462 sub %g2, 0x8, %g2 ! A0 Group (reg-dep)
463 alignaddr %o1, %g0, %g1 ! MS (Break-after)
464 EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group (1-cycle stall)
465 add %g1, 0x8, %g1 ! A0
467 1: EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0) ! MS Group
468 add %g1, 0x8, %g1 ! A0
469 sub %o2, 0x8, %o2 ! A1
470 subcc %g2, 0x8, %g2 ! A0 Group
472 faligndata %f0, %f2, %f8 ! FGA Group (1-cycle stall)
473 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
474 add %o1, 0x8, %o1 ! A0
477 add %o0, 0x8, %o0 ! A1
478 EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0) ! MS Group
479 add %g1, 0x8, %g1 ! A0
480 sub %o2, 0x8, %o2 ! A1
482 subcc %g2, 0x8, %g2 ! A0 Group
483 faligndata %f2, %f0, %f8 ! FGA Group (1-cycle stall)
484 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8) ! MS Group (2-cycle stall)
485 add %o1, 0x8, %o1 ! A0
488 add %o0, 0x8, %o0 ! A1
490 /* Nothing left to copy? */
491 2: cmp %o2, 0 ! A0 Group
493 be,pn %icc, U3copy_in_user_short_ret ! BR Group
/* "ba,a" annuls its delay slot; the leftover %o2 bytes are finished by
 * the byte loop.
 */
495 ba,a,pt %xcc, U3copy_in_user_short ! BR Group
497 #else /* !(SMALL_COPY_USES_FPU) */
/* Variant assembled when SMALL_COPY_USES_FPU is not defined: it uses
 * only integer loads and stores (lduba/stba and ldxa/stxa through
 * %o3) and falls back to U3copy_in_user_short on the bne branches.
 */
501 bne,pn %icc, U3copy_in_user_short
509 1: EXNV2(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
514 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
516 2: andn %o2, 0x7, %g2
/* Local label 3: 8 bytes per pass through %o3; the store addresses
 * [%o0 - 8].
 */
519 3: EXNV3(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
524 EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
527 bne,pn %icc, U3copy_in_user_short
529 ba,a,pt %xcc, U3copy_in_user_short_ret
531 #endif /* !(SMALL_COPY_USES_FPU) */