1 /* U3copy_to_user.S: UltraSparc-III optimized memcpy.
3 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
6 #include <asm/visasm.h>
9 #include <asm/spitfire.h>
/* Exception-table helpers (bodies partially elided in this excerpt).
 * Each EX*() macro wraps a userland access "x" and emits a __ex_table
 * entry so that a fault while touching user memory branches to fixup
 * code instead of oopsing; the trailing operands appear to compute the
 * count of bytes not yet copied, per the copy_to_user return contract.
 * NOTE(review): fixup labels/branches are elided here — confirm the
 * exact fixup contract against the complete file.
 */
13 #define EXNV(x,y,a,b) \
19 .section __ex_table; /* record faulting PC -> fixup entry */ \
24 #define EXNV2(x,y,a,b) \
31 .section __ex_table; \
36 #define EXNV3(x,y,a,b) \
43 .section __ex_table; \
48 #define EXNV4(x,y,a,b) \
55 .section __ex_table; \
67 .section __ex_table; \
77 add %o4, 0x1c0, %o1; /* presumably bytes-remaining incl. 0x1c0 of prefetch lead — verify */ \
78 and %o2, (0x40 - 1), %o2; /* keep len % 64 (sub-block remainder) */ \
81 .section __ex_table; \
92 and %o2, (0x40 - 1), %o2; /* keep len % 64 */ \
96 .section __ex_table; \
101 #define EXBLK3(x,y) \
106 and %o2, (0x40 - 1), %o2; /* keep len % 64 */ \
108 add %o2, 0x80, %o0; /* plus two 64-byte blocks still in flight — verify */ \
109 .section __ex_table; \
114 #define EXBLK4(x,y) \
119 and %o2, (0x40 - 1), %o2; /* keep len % 64 */ \
121 add %o2, 0x40, %o0; /* plus one 64-byte block still in flight — verify */ \
122 .section __ex_table; \
128 .register %g2,#scratch
129 .register %g3,#scratch
131 /* Special/non-trivial issues of this code:
133 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
134 * 2) Only low 32 FPU registers are used so that only the
135 * lower half of the FPU register set is dirtied by this
136 * code. This is especially important in the kernel.
137 * 3) This code never prefetches cachelines past the end
138 * of the source buffer.
144 /* The cheetah's flexible spine, oversized liver, enlarged heart,
145 * slender muscular body, and claws make it the swiftest hunter
146 * in Africa and the fastest animal on land. Can reach speeds
147 * of up to 2.4GB per second.
/* U3copy_to_user: UltraSPARC-III copy of 'len' bytes from kernel 'src'
 * to userland 'dst'.  Bulk data moves through the lower half of the FPU
 * register file using VIS alignaddr/faligndata and 64-byte block stores
 * (ASI_BLK_AIUS); every store that touches userland goes through an
 * EX*() wrapper so a fault is fixed up via __ex_table.
 * NOTE(review): many lines are elided in this excerpt; register-role
 * comments below are inferred from visible code and should be confirmed
 * against the complete file.
 */
150 .globl U3copy_to_user
151 U3copy_to_user: /* %o0=dst, %o1=src, %o2=len */
152 /* Writing to %asi is _expensive_ so we hardcode it.
153 * Reading %asi to check for KERNEL_DS is comparatively
158 bne,pn %icc, U3memcpy_user_stub	! non-default ASI: fall back to stub copy
175 /* Here len >= 256 and condition codes reflect execution
176 * of "andcc %o0, 0x7, %g2", done by caller.
180 /* Is 'dst' already aligned on an 64-byte boundary? */
183 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
184 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
185 * subtract this from 'len'.
191 /* Copy %g2 bytes from src to dst, one byte at a time. */
192 1: ldub [%o1 + 0x00], %o3	! byte loop until dst hits a 64-byte boundary
198 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)	! user store, fault -> fixup
203 alignaddr %o1, %g0, %o1	! set GSR.align from src offset; %o1 rounded down
/* Drain prior stores before the block-store pipeline, then prime it:
 * prefetch several cachelines ahead and load the first 64 bytes of src
 * into %f0-%f14. */
207 membar #StoreLoad | #StoreStore | #LoadStore
208 prefetch [%o1 + 0x000], #one_read
209 prefetch [%o1 + 0x040], #one_read
210 andn %o2, (0x40 - 1), %o4	! %o4 = len rounded down to whole 64-byte blocks
211 prefetch [%o1 + 0x080], #one_read
212 prefetch [%o1 + 0x0c0], #one_read
213 ldd [%o1 + 0x000], %f0
214 prefetch [%o1 + 0x100], #one_read
215 ldd [%o1 + 0x008], %f2
216 prefetch [%o1 + 0x140], #one_read
217 ldd [%o1 + 0x010], %f4
218 prefetch [%o1 + 0x180], #one_read
219 faligndata %f0, %f2, %f16	! merge adjacent dwords per GSR.align -> aligned stream
220 ldd [%o1 + 0x018], %f6
221 faligndata %f2, %f4, %f18
222 ldd [%o1 + 0x020], %f8
223 faligndata %f4, %f6, %f20
224 ldd [%o1 + 0x028], %f10
225 faligndata %f6, %f8, %f22
/* Main loop body (head visible here): load the NEXT cacheline into
 * %f0-%f14 while %f16-%f30 collect the realigned previous 64 bytes. */
227 ldd [%o1 + 0x030], %f12
228 faligndata %f8, %f10, %f24
229 ldd [%o1 + 0x038], %f14
230 faligndata %f10, %f12, %f26
231 ldd [%o1 + 0x040], %f0	! first dword of the next block
240 ldd [%o1 + 0x008], %f2
241 faligndata %f12, %f14, %f28
242 ldd [%o1 + 0x010], %f4
243 faligndata %f14, %f0, %f30
244 EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS)	! 64-byte block store to userland
245 ldd [%o1 + 0x018], %f6
246 faligndata %f0, %f2, %f16
248 ldd [%o1 + 0x020], %f8
249 faligndata %f2, %f4, %f18
250 ldd [%o1 + 0x028], %f10
251 faligndata %f4, %f6, %f20
252 ldd [%o1 + 0x030], %f12
253 faligndata %f6, %f8, %f22
254 ldd [%o1 + 0x038], %f14
255 faligndata %f8, %f10, %f24
257 ldd [%o1 + 0x040], %f0
258 prefetch [%o1 + 0x180], #one_read	! never prefetches past src end (see file header)
259 faligndata %f10, %f12, %f26
265 /* Finally we copy the last full 64-byte block. */
266 ldd [%o1 + 0x008], %f2
267 faligndata %f12, %f14, %f28
268 ldd [%o1 + 0x010], %f4
269 faligndata %f14, %f0, %f30
270 EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS)
271 ldd [%o1 + 0x018], %f6
272 faligndata %f0, %f2, %f16
273 ldd [%o1 + 0x020], %f8
274 faligndata %f2, %f4, %f18
275 ldd [%o1 + 0x028], %f10
276 faligndata %f4, %f6, %f20
277 ldd [%o1 + 0x030], %f12
278 faligndata %f6, %f8, %f22
279 ldd [%o1 + 0x038], %f14
280 faligndata %f8, %f10, %f24
284 ldd [%o1 + 0x040], %f0
285 1: faligndata %f10, %f12, %f26
286 faligndata %f12, %f14, %f28
287 faligndata %f14, %f0, %f30
288 EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS)	! store the final full block
294 /* Now we copy the (len modulo 64) bytes at the end.
295 * Note how we borrow the %f0 loaded above.
297 * Also notice how this code is careful not to perform a
298 * load past the end of the src buffer.
308 ldd [%o1 + 0x00], %f0
310 1: ldd [%o1 + 0x08], %f2	! 8 bytes per iteration over the < 64-byte tail
314 faligndata %f0, %f2, %f8
315 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)
318 ldd [%o1 + 0x08], %f0	! ping-pong %f0/%f2 so only one new load per step
322 faligndata %f2, %f0, %f8
323 EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)
327 /* If anything is left, we copy it one byte at a time.
328 * Note that %g1 is (src & 0x3) saved above before the
329 * alignaddr was performed.
/* NOTE(review): the tails below store via [%o1 + %o3] ASI_AIUS, i.e.
 * %o1 indexes the *destination* here — registers were reshuffled by
 * code elided from this excerpt; confirm roles against the full file. */
344 EXNV(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
347 1: andcc %o2, 0x4, %g0	! a 4-byte word still pending?
351 EXNV(stwa %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x7)
354 1: andcc %o2, 0x2, %g0	! a halfword still pending?
358 EXNV(stha %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x3)
361 1: andcc %o2, 0x1, %g0	! a final byte?
366 EXNV(stba %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x1)
368 70: /* 16 < len <= 64 */
374 1: subcc %o4, 0x8, %o4	! 8 bytes per loop iteration
376 EXNV4(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %o4)
384 EXNV3(stwa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
392 80: /* 0 < len <= 16 */
400 EXNV3(stwa %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
411 EXNV2(stba %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)