/* memcpy.S: Sparc optimized memcpy, bcopy and memmove code
 * Hand optimized from GNU libc's memcpy, bcopy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */
#undef FASTER_NONALIGNED
#define FASTER_ALIGNED
/* In the kernel these functions don't return a value.
 * One should use the macros in asm/string.h for that purpose.
 * We return 0 here, so that bugs are more apparent.
 */
#define RETL_INSN	clr	%o0
#define FASTER_REVERSE
#define FASTER_NONALIGNED
#define FASTER_ALIGNED

#define SETUP_RETL	mov	%o0, %g6
#define RETL_INSN	mov	%g6, %o0
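
/* Return-value convention: in the kernel build, memcpy/bcopy/memmove
 * return 0 (RETL_INSN is clr %o0) so that callers which wrongly rely on
 * the libc return value show up as bugs.  In the user-space build,
 * SETUP_RETL saves the original dst in %g6 on entry and RETL_INSN
 * returns it, giving the usual "return dst" behaviour of memcpy().
 */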
/* Both these macros have to start with exactly the same insn */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + offset + 0x00], %t0; \
	ldd	[%src + offset + 0x08], %t2; \
	ldd	[%src + offset + 0x10], %t4; \
	ldd	[%src + offset + 0x18], %t6; \
	st	%t0, [%dst + offset + 0x00]; \
	st	%t1, [%dst + offset + 0x04]; \
	st	%t2, [%dst + offset + 0x08]; \
	st	%t3, [%dst + offset + 0x0c]; \
	st	%t4, [%dst + offset + 0x10]; \
	st	%t5, [%dst + offset + 0x14]; \
	st	%t6, [%dst + offset + 0x18]; \
	st	%t7, [%dst + offset + 0x1c];
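
/* MOVE_BIGCHUNK copies one 32-byte block from src+offset to dst+offset:
 * four ldd loads fill the register pairs t0/t1 .. t6/t7, then eight
 * single-word st instructions write them out, so only the source needs
 * doubleword alignment.  Roughly equivalent C, for illustration only
 * (the helper name is made up, not part of this file):
 *
 *	static void move_bigchunk(const unsigned int *src, unsigned int *dst)
 *	{
 *		unsigned int t[8];
 *		int i;
 *
 *		for (i = 0; i < 8; i++)
 *			t[i] = src[i];
 *		for (i = 0; i < 8; i++)
 *			dst[i] = t[i];
 *	}
 */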
#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + offset + 0x00], %t0; \
	ldd	[%src + offset + 0x08], %t2; \
	ldd	[%src + offset + 0x10], %t4; \
	ldd	[%src + offset + 0x18], %t6; \
	std	%t0, [%dst + offset + 0x00]; \
	std	%t2, [%dst + offset + 0x08]; \
	std	%t4, [%dst + offset + 0x10]; \
	std	%t6, [%dst + offset + 0x18];
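
/* MOVE_BIGALIGNCHUNK moves the same 32 bytes but uses std doubleword
 * stores, so it additionally needs an 8-byte-aligned destination; it is
 * selected on the FASTER_ALIGNED path.  As noted above, both BIGCHUNK
 * macros must begin with the identical ldd instruction.
 */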
#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - offset - 0x10], %t0; \
	ldd	[%src - offset - 0x08], %t2; \
	st	%t0, [%dst - offset - 0x10]; \
	st	%t1, [%dst - offset - 0x0c]; \
	st	%t2, [%dst - offset - 0x08]; \
	st	%t3, [%dst - offset - 0x04];
#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - offset - 0x10], %t0; \
	ldd	[%src - offset - 0x08], %t2; \
	std	%t0, [%dst - offset - 0x10]; \
	std	%t2, [%dst - offset - 0x08];
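
/* The *_LASTCHUNK macros copy 16 bytes at a negative offset from the
 * running src/dst pointers.  They are expanded back to back below and
 * entered through a computed jmpl, so only as many expansions as the
 * remaining length requires are actually executed.
 */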
#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - offset - 0x02], %t0; \
	ldub	[%src - offset - 0x01], %t1; \
	stb	%t0, [%dst - offset - 0x02]; \
	stb	%t1, [%dst - offset - 0x01];
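
/* MOVE_SHORTCHUNK moves two bytes with ldub/stb; its expansions form
 * the byte-granularity tail table for short or oddly-sized copies.
 */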
/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - offset - 0x20], %t0; \
	ldd	[%src - offset - 0x18], %t2; \
	ldd	[%src - offset - 0x10], %t4; \
	ldd	[%src - offset - 0x08], %t6; \
	st	%t0, [%dst - offset - 0x20]; \
	st	%t1, [%dst - offset - 0x1c]; \
	st	%t2, [%dst - offset - 0x18]; \
	st	%t3, [%dst - offset - 0x14]; \
	st	%t4, [%dst - offset - 0x10]; \
	st	%t5, [%dst - offset - 0x0c]; \
	st	%t6, [%dst - offset - 0x08]; \
	st	%t7, [%dst - offset - 0x04];
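
/* The RMOVE_* macros are the descending-address counterparts used on
 * the reverse (FASTER_REVERSE) path, i.e. when memmove/bcopy must copy
 * backwards because the destination overlaps the source from above.
 * Each block is addressed from its end, so no source byte is
 * overwritten before it has been read.
 */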
#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - offset - 0x20], %t0; \
	ldd	[%src - offset - 0x18], %t2; \
	ldd	[%src - offset - 0x10], %t4; \
	ldd	[%src - offset - 0x08], %t6; \
	std	%t0, [%dst - offset - 0x20]; \
	std	%t2, [%dst - offset - 0x18]; \
	std	%t4, [%dst - offset - 0x10]; \
	std	%t6, [%dst - offset - 0x08];
#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src + offset + 0x00], %t0; \
	ldd	[%src + offset + 0x08], %t2; \
	st	%t0, [%dst + offset + 0x00]; \
	st	%t1, [%dst + offset + 0x04]; \
	st	%t2, [%dst + offset + 0x08]; \
	st	%t3, [%dst + offset + 0x0c];
#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src + offset + 0x00], %t0; \
	ldub	[%src + offset + 0x01], %t1; \
	stb	%t0, [%dst + offset + 0x00]; \
	stb	%t1, [%dst + offset + 0x01];
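
/* RMOVE_LASTCHUNK and RMOVE_SHORTCHUNK are the 16-byte and 2-byte tail
 * steps of the reverse copy, again laid out as computed-jump tables.
 */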
#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + offset + 0x00], %t0; \
	ldd	[%src + offset + 0x08], %t2; \
	srl	%t0, shir, %t5; \
	srl	%t1, shir, %t6; \
	sll	%t0, shil, %t0; \
	or	%t5, %prev, %t5; \
	sll	%t1, shil, %prev; \
	srl	%t2, shir, %t1; \
	srl	%t3, shir, %t6; \
	sll	%t2, shil, %t2; \
	or	%t1, %prev, %t1; \
	std	%t4, [%dst + offset + offset2 - 0x04]; \
	std	%t0, [%dst + offset + offset2 + 0x04]; \
	sll	%t3, shil, %prev; \
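
/* SMOVE_CHUNK handles a source that is not mutually word aligned with
 * the destination: it loads aligned doublewords and rebuilds each
 * output word by shifting and OR-ing two adjacent source words
 * (shil + shir is always 32), carrying the leftover bits across
 * iterations in %prev.  A rough C model of one merged word, for
 * illustration only (not a function from this file):
 *
 *	static unsigned int smove_word(unsigned int prev_word,
 *				       unsigned int cur_word,
 *				       int shil, int shir)
 *	{
 *		return (prev_word << shil) | (cur_word >> shir);
 *	}
 */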
#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + offset + 0x00], %t0; \
	ldd	[%src + offset + 0x08], %t2; \
	srl	%t0, shir, %t4; \
	srl	%t1, shir, %t5; \
	sll	%t0, shil, %t6; \
	or	%t4, %prev, %t0; \
	sll	%t1, shil, %prev; \
	srl	%t2, shir, %t4; \
	srl	%t3, shir, %t5; \
	sll	%t2, shil, %t6; \
	or	%t4, %prev, %t2; \
	sll	%t3, shil, %prev; \
	std	%t0, [%dst + offset + offset2 + 0x00]; \
	std	%t2, [%dst + offset + offset2 + 0x08];
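
/* SMOVE_ALIGNCHUNK performs the same shift-and-merge step with a
 * different register and store arrangement; which variant is used, and
 * with which (shil, shir, offset2) values, depends on the low bits of
 * the source address (see the expansions further down).
 */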
#ifdef FASTER_REVERSE

70:	/* rdword_align */

#endif /* FASTER_REVERSE */

	nop		! Only bcopy returns here and it returns void...

	/* Do the cmp in the delay slot */
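
	/* SPARC control transfers have a delay slot: the instruction
	 * after a branch or call executes before the transfer takes
	 * effect, so useful work such as a cmp is scheduled there
	 * instead of a nop whenever possible.
	 */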
#ifndef FASTER_REVERSE

1:	/* reverse_bytes */

#else /* FASTER_REVERSE */

	andcc	%g1, 0xffffff80, %g7
	RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	jmpl	%o5 + %lo(72f), %g0

71:	/* rmemcpy_table */
	RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

72:	/* rmemcpy_table_end */

	ldd	[%o1 - 0x08], %g2

73:	/* rmemcpy_last7 */
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	jmpl	%o5 + %lo(72b), %g0

	jmpl	%o5 + %lo(76f), %g0

	RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

76:	/* rshort_table_end */

91:	/* rshort_aligned_end */

77:	/* rnon_aligned */

#endif /* FASTER_REVERSE */
/* NOTE: This code is executed just for the cases
 *	 where %src (= %o1) & 3 is != 0.
 *	 We need to align it to 4.  So, for (%src & 3)
 *	 1 we need to do ldub,lduh
 *	 2 lduh
 *	 3 just ldub
 *	 so even if it looks weird, the branches
 *	 are correct here. -jj
 */
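
/* Roughly, in C (illustration only; dst/src as byte pointers, len the
 * remaining count): advance one byte at a time until the source is word
 * aligned, then fall through to the word-copy code.  The assembly does
 * it with at most one ldub plus one lduh instead of a loop:
 *
 *	while (((unsigned long) src & 3) && len) {
 *		*dst++ = *src++;
 *		len--;
 *	}
 */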
78:	/* dword_align */

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */
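
/* Entry point: %o0 = dst, %o1 = src, %o2 = len.  The general plan is to
 * word-align the source, move 128 bytes per iteration with four
 * BIGCHUNK expansions (offsets 0x00, 0x20, 0x40, 0x60; the block count
 * comes from the andcc with 0xffffff80), and finish the remainder
 * through the computed-jump tables of LASTCHUNK and SHORTCHUNK
 * expansions.
 */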
	andcc	%g1, 0xffffff80, %g7

	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)

	jmpl	%o5 + %lo(80f), %g0
79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */

81:	/* memcpy_last7 */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)

#ifndef FASTER_ALIGNED

	jmpl	%o5 + %lo(80b), %g0

#else /* FASTER_ALIGNED */
	jmpl	%o5 + %lo(84f), %g0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */

	std	%g2, [%o0 - 0x08]

85:	/* amemcpy_last7 */

#endif /* FASTER_ALIGNED */
86:	/* non_aligned */

#ifdef FASTER_NONALIGNED

#endif /* FASTER_NONALIGNED */

#ifdef FASTER_NONALIGNED

87:	/* faster_nonaligned */
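
/* The faster non-aligned path: depending on the relative misalignment
 * of source and destination (1, 2 or 3 bytes), the SMOVE expansions
 * below are instantiated with the shift pairs (8, 24), (16, 16) or
 * (24, 8), so that shil + shir == 32.  Each iteration moves 64 bytes
 * (four 16-byte chunks, count masked with 0xffffffc0) before the 0x30
 * tail test.
 */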
	and	%o2, 0xffffffc0, %o3

	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

	and	%o2, 0xffffffc0, %o3

	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

	and	%o2, 0xffffffc0, %o3

	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)

	andcc	%o2, 0x30, %o3

	SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

	andcc	%o2, 0x30, %o3

	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)

	andcc	%o2, 0x30, %o3

	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)

	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
	SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

	andcc	%o2, 0x30, %o3

	SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

#endif /* FASTER_NONALIGNED */
	jmpl	%o5 + %lo(89f), %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

90:	/* short_aligned_end */

	ld	[%o1 + 0x00], %g2
	ld	[%o1 + 0x04], %g3

	st	%g2, [%o0 + 0x00]
	st	%g3, [%o0 + 0x04]