1 /* U3copy_from_user.S: UltraSparc-III optimized copy from userspace.
3 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
6 #include <asm/visasm.h>
9 #include <asm/spitfire.h>
/* EXNV_RAW(insn, dest, a, b): emit a user-space access instruction plus
 * an __ex_table entry so that a fault on it branches to a local fixup.
 * NOTE(review): the interior lines of this macro (the fixup body and the
 * .word address pair) are missing from this excerpt -- do not modify
 * without the complete file.
 */
13 #define EXNV_RAW(x,y,a,b) \
19 .section __ex_table; \
/* EXNV(insn, dest, a, b): like EXNV_RAW, but the visible 99: fixup
 * computes %o0 = %o1 + %o3 -- presumably the not-copied byte count
 * returned to the caller on a fault; confirm against the full file
 * (interior macro lines are missing from this excerpt).
 */
24 #define EXNV(x,y,a,b) \
28 99: add %o1, %o3, %o0; \
31 .section __ex_table; \
/* EXNV4(insn, dest, a, b): EXNV variant used for 4-byte (word) user
 * loads (see the lduwa use at the 72-byte-loop tail below).  The fixup
 * presumably rounds the residual count for the 4-byte access -- the
 * adjusting lines are missing from this excerpt.
 */
36 #define EXNV4(x,y,a,b) \
40 99: add %o1, %o3, %o0; \
44 .section __ex_table; \
/* EXNV8(insn, dest, a, b): EXNV variant used for 8-byte (xword) user
 * loads (see the ldxa loop under label 70 below).  The fixup presumably
 * rounds the residual count for the 8-byte access -- the adjusting
 * lines are missing from this excerpt.
 */
49 #define EXNV8(x,y,a,b) \
53 99: add %o1, %o3, %o0; \
57 .section __ex_table; \
69 .section __ex_table; \
79 and %o2, (0x40 - 1), %o1; \
82 add %o1, 0x1c0, %o1; \
83 .section __ex_table; \
93 and %o2, (0x40 - 1), %o1; \
98 .section __ex_table; \
108 and %o2, (0x40 - 1), %o1; \
109 add %o1, 0x40, %o1; \
112 .section __ex_table; \
/* Tell the assembler %g2/%g3 are used as scratch, suppressing the
 * SPARC V9 warning about touching the application-reserved globals.
 */
118 .register %g2,#scratch
119 .register %g3,#scratch
121 /* Special/non-trivial issues of this code:
123 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
124 * 2) Only low 32 FPU registers are used so that only the
125 * lower half of the FPU register set is dirtied by this
126 * code. This is especially important in the kernel.
127 * 3) This code never prefetches cachelines past the end
128 * of the source buffer.
134 /* The cheetah's flexible spine, oversized liver, enlarged heart,
135 * slender muscular body, and claws make it the swiftest hunter
136 * in Africa and the fastest animal on land. Can reach speeds
137 * of up to 2.4GB per second.
/* U3copy_from_user: UltraSPARC-III optimized userspace -> kernel copy.
 *   %o0 = dst (kernel), %o1 = src (user, accessed through %asi), %o2 = len
 * Faulting user accesses are wrapped by the EXNV*()/EX*() macros above,
 * whose fixups compute a residual byte count -- presumably the
 * copy_from_user() "bytes not copied" return convention; confirm
 * against the full file.  NOTE(review): this excerpt has interior
 * lines missing (embedded original line numbers jump), and the
 * function continues past the last visible line; only the visible
 * instructions are annotated here.
 */
140 .globl U3copy_from_user
141 U3copy_from_user: /* %o0=dst, %o1=src, %o2=len */
156 /* Here len >= 256 and condition codes reflect execution
157 * of "andcc %o0, 0x7, %g2", done by caller.
161 /* Is 'dst' already aligned on a 64-byte boundary? */
164 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
165 * of bytes to copy to make 'dst' 64-byte aligned. We pre-
166 * subtract this from 'len'.
172 /* Copy %g2 bytes from src to dst, one byte at a time. */
173 1: EXNV_RAW(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
/* Align the source pointer for the VIS block loop; alignaddr also
 * records the byte misalignment used by the faligndata merges below.
 */
184 alignaddr %o1, %g0, %o1
188 membar #StoreLoad | #StoreStore | #LoadStore
/* Warm the cache with the first few source lines and pre-load the
 * first 64-byte block into %f0-%f14.  %o4 = len rounded down to a
 * multiple of 64 (andn with 0x3f).
 */
189 prefetcha [%o1 + 0x000] %asi, #one_read
190 prefetcha [%o1 + 0x040] %asi, #one_read
191 andn %o2, (0x40 - 1), %o4
192 prefetcha [%o1 + 0x080] %asi, #one_read
193 prefetcha [%o1 + 0x0c0] %asi, #one_read
194 EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0)
195 prefetcha [%o1 + 0x100] %asi, #one_read
196 EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0)
197 prefetcha [%o1 + 0x140] %asi, #one_read
198 EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0)
199 prefetcha [%o1 + 0x180] %asi, #one_read
200 faligndata %f0, %f2, %f16
201 EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0)
202 faligndata %f2, %f4, %f18
203 EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0)
204 faligndata %f4, %f6, %f20
205 EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0)
206 faligndata %f6, %f8, %f22
208 EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0)
209 faligndata %f8, %f10, %f24
210 EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0)
211 faligndata %f10, %f12, %f26
212 EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0)
/* Main loop body: stream one 64-byte block per iteration through
 * %f0-%f14, realign it into %f16-%f30 with faligndata, and block-store
 * with stda ASI_BLK_P.  (The loop-control instructions fall in lines
 * missing from this excerpt.)
 */
221 EX3(ldda [%o1 + 0x008] %asi, %f2)
222 faligndata %f12, %f14, %f28
223 EX3(ldda [%o1 + 0x010] %asi, %f4)
224 faligndata %f14, %f0, %f30
225 stda %f16, [%o0] ASI_BLK_P
226 EX3(ldda [%o1 + 0x018] %asi, %f6)
227 faligndata %f0, %f2, %f16
229 EX3(ldda [%o1 + 0x020] %asi, %f8)
230 faligndata %f2, %f4, %f18
231 EX3(ldda [%o1 + 0x028] %asi, %f10)
232 faligndata %f4, %f6, %f20
233 EX3(ldda [%o1 + 0x030] %asi, %f12)
234 faligndata %f6, %f8, %f22
235 EX3(ldda [%o1 + 0x038] %asi, %f14)
236 faligndata %f8, %f10, %f24
238 EX3(ldda [%o1 + 0x040] %asi, %f0)
239 prefetcha [%o1 + 0x180] %asi, #one_read
240 faligndata %f10, %f12, %f26
246 /* Finally we copy the last full 64-byte block. */
247 EX3(ldda [%o1 + 0x008] %asi, %f2)
248 faligndata %f12, %f14, %f28
249 EX3(ldda [%o1 + 0x010] %asi, %f4)
250 faligndata %f14, %f0, %f30
251 stda %f16, [%o0] ASI_BLK_P
252 EX3(ldda [%o1 + 0x018] %asi, %f6)
253 faligndata %f0, %f2, %f16
254 EX3(ldda [%o1 + 0x020] %asi, %f8)
255 faligndata %f2, %f4, %f18
256 EX3(ldda [%o1 + 0x028] %asi, %f10)
257 faligndata %f4, %f6, %f20
258 EX3(ldda [%o1 + 0x030] %asi, %f12)
259 faligndata %f6, %f8, %f22
260 EX3(ldda [%o1 + 0x038] %asi, %f14)
261 faligndata %f8, %f10, %f24
/* NOTE(review): the EX4 path reads one more doubleword past the last
 * full block, while the 1: path below does not -- the branch choosing
 * between them (avoiding a read past the end of src) is in missing
 * lines; verify against the full file.
 */
265 EX4(ldda [%o1 + 0x040] %asi, %f0)
266 1: faligndata %f10, %f12, %f26
267 faligndata %f12, %f14, %f28
268 faligndata %f14, %f0, %f30
269 stda %f16, [%o0] ASI_BLK_P
275 /* Now we copy the (len modulo 64) bytes at the end.
276 * Note how we borrow the %f0 loaded above.
278 * Also notice how this code is careful not to perform a
279 * load past the end of the src buffer.
/* 8-bytes-at-a-time tail: merge successive doublewords with
 * faligndata, alternating which of %f0/%f2 holds the newest load.
 */
289 EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0)
291 1: EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0)
295 faligndata %f0, %f2, %f8
296 std %f8, [%o0 + 0x00]
299 EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0)
303 faligndata %f2, %f0, %f8
304 std %f8, [%o0 + 0x00]
308 /* If anything is left, we copy it one byte at a time.
309 * Note that %g1 is (src & 0x3) saved above before the
310 * alignaddr was performed.
/* Sub-8-byte remainder: peel off 8/4/2/1 bytes by testing the
 * corresponding bits of the remaining length in %o2.
 */
324 EXNV(ldxa [%o1] %asi, %o5, add %o2, %g0)
328 1: andcc %o2, 0x4, %g0
331 EXNV(lduwa [%o1] %asi, %o5, and %o2, 0x7)
335 1: andcc %o2, 0x2, %g0
338 EXNV(lduha [%o1] %asi, %o5, and %o2, 0x3)
342 1: andcc %o2, 0x1, %g0
345 EXNV(lduba [%o1] %asi, %o5, and %o2, 0x1)
349 70: /* 16 < len <= 64 */
/* Medium copy: 8 bytes per iteration (%o4 counts down), then an
 * optional trailing word.
 */
355 1: subcc %o4, 0x8, %o4
356 EXNV8(ldxa [%o1] %asi, %o5, add %o2, %o4)
364 EXNV4(lduwa [%o1] %asi, %o5, add %o2, %g0)
373 80: /* 0 < len <= 16 */
380 EXNV(lduwa [%o1] %asi, %g1, add %o2, %g0)
391 EXNV(lduba [%o1] %asi, %g1, add %o2, %g0)
399 /* Since this is copy_from_user(), zero out the rest of the