1 /*
2   (C) Copyright 2001,2006,
3   International Business Machines Corporation,
4   Sony Computer Entertainment, Incorporated,
5   Toshiba Corporation,
6 
7   All rights reserved.
8 
9   Redistribution and use in source and binary forms, with or without
10   modification, are permitted provided that the following conditions are met:
11 
12     * Redistributions of source code must retain the above copyright notice,
13   this list of conditions and the following disclaimer.
14     * Redistributions in binary form must reproduce the above copyright
15   notice, this list of conditions and the following disclaimer in the
16   documentation and/or other materials provided with the distribution.
17     * Neither the names of the copyright holders nor the names of their
18   contributors may be used to endorse or promote products derived from this
19   software without specific prior written permission.
20 
21   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31   POSSIBILITY OF SUCH DAMAGE.
32 */
33 #include <spu_intrinsics.h>
34 #include <stddef.h>
35 #include "vec_literal.h"
36 
37 /* Copy n bytes from memory area src to memory area dest.
38  * Copying is performed as if the n characters pointed to
39  * by src are first copied into a temporary array that does
40  * not overlap the src and dest arrays. Then the n characters
41  * of the temporary array are copied into the destination
42  * array. The memmove subroutine returns a pointer to dest.
43  */
44 
memmove(void * __restrict__ dest,const void * __restrict__ src,size_t n)45 void * memmove(void * __restrict__ dest, const void * __restrict__ src, size_t n)
46 {
47   int adjust, delta;
48   unsigned int soffset1, soffset2, doffset1, doffset2;
49   vec_uchar16 *vSrc, *vDst;
50   vec_uchar16 sdata1, sdata2, sdata, ddata, shuffle;
51   vec_uchar16 mask, mask1, mask2, mask3, one = spu_splats((unsigned char)-1);
52 
53   soffset1  = (unsigned int)(src) & 15;
54   doffset1 = (unsigned int)(dest) & 15;
55   doffset2 = ((unsigned int)(dest) + n) & 15;
56 
57   /* Construct a series of masks used to data insert. The masks
58    * contains 0 bit when the destination word is unchanged, 1 when it
59    * must be replaced by source bits.
60    *
61    * mask1 = mask for leading unchanged bytes
62    * mask2 = mask for trailing unchange bytes
63    * mask3 = mask indicating the more than one qword is being changed.
64    */
65   mask  = one;
66   mask1 = spu_rlmaskqwbyte(mask, -doffset1);
67   mask2 = spu_slqwbyte(mask, 16-doffset2);
68   mask3 = (vec_uchar16)spu_cmpgt(spu_splats((unsigned int)(doffset1 + n)), 15);
69 
70   vDst = (vec_uchar16 *)(dest);
71 
72   delta  = (int)soffset1 - (int)doffset1;
73 
74   /* The follow check only works if the SPU addresses are not
75    * wrapped. No provisions have been made to correct for this
76    * limitation.
77    */
78   if (((unsigned int)dest - (unsigned int)src) >= (unsigned int)n) {
79     /* Forward copy. Perform a memcpy.
80      *
81      * Handle any leading destination partial quadwords as
82      * well a very short copy (ie, such that the n characters
83      * all reside in a single (destination) quadword.
84      */
85     vSrc = (vec_uchar16 *)(src);
86     vDst = (vec_uchar16 *)(dest);
87 
88     /* Handle any leading destination partial quadwords as
89      * well a very short copy (ie, such that the n characters
90      * all reside in a single (destination) quadword.
91      */
92     soffset1 = (unsigned int)(src) & 15;
93     doffset1 = (unsigned int)(dest) & 15;
94     doffset2 = ((unsigned int)(dest) + n) & 15;
95 
96     /* Compute a shuffle pattern used to align the source string
97      * with the alignment of the destination string.
98      */
99 
100     adjust = (int)spu_extract(spu_cmpgt(spu_promote(doffset1, 0), spu_promote(soffset1, 0)), 0);
101     delta  = (int)soffset1 - (int)doffset1;
102     delta += adjust & 16;
103 
104     shuffle = (vec_uchar16)spu_add((vec_uint4)spu_splats((unsigned char)delta),
105 				   VEC_LITERAL(vec_uint4, 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F));
106 
107     vSrc += adjust;
108 
109     sdata1 = *vSrc++;
110     sdata2 = *vSrc++;
111 
112     ddata = *vDst;
113     sdata = spu_shuffle(sdata1, sdata2, shuffle);
114 
115     /* Construct a series of masks used to data insert. The masks
116      * contain 0 when the destination word is unchanged, 1 when it
117      * must be replaced by source bytes.
118      *
119      * mask1 = mask for leading unchanged bytes
120      * mask2 = mask for trailing unchange bytes
121      * mask3 = mask indicating the more than one qword is being changed.
122      */
123     mask  = one;
124     mask1 = spu_rlmaskqwbyte(mask, -doffset1);
125     mask2 = spu_slqwbyte(mask, 16-doffset2);
126     mask3 = (vec_uchar16)spu_cmpgt(spu_splats((unsigned int)(doffset1 + n)), 15);
127 
128     *vDst++ = spu_sel(ddata, sdata, spu_and(mask1, spu_or(mask2, mask3)));
129 
130     n += doffset1;
131 
132     /* Handle complete destination quadwords
133      */
134     while (n > 31) {
135       sdata1 = sdata2;
136       sdata2 = *vSrc++;
137       *vDst++ = spu_shuffle(sdata1, sdata2, shuffle);
138       n -= 16;
139     }
140 
141     /* Handle any trailing partial (destination) quadwords
142      */
143     mask = spu_and((vec_uchar16)spu_cmpgt(spu_splats((unsigned int)n), 16), mask2);
144     *vDst = spu_sel(*vDst, spu_shuffle(sdata2, *vSrc, shuffle), mask);
145 
146   } else {
147     /* Backward copy.
148      *
149      * Handle any leading destination partial quadwords as
150      * well a very short copy (ie, such that the n characters
151      * all reside in a single (destination) quadword.
152      */
153     vSrc = (vec_uchar16 *)((unsigned int)src  + n-1);
154     vDst = (vec_uchar16 *)((unsigned int)dest + n-1);
155 
156     /* Handle any leading destination partial quadwords as
157      * well a very short copy (ie, such that the n characters
158      * all reside in a single (destination) quadword.
159      */
160     soffset1 = (unsigned int)(src)  & 15;
161     soffset2 = (unsigned int)(vSrc) & 15;
162     doffset1 = (unsigned int)(dest) & 15;
163     doffset2 = (unsigned int)(vDst) & 15;
164 
165     /* Compute a shuffle pattern used to align the source string
166      * with the alignment of the destination string.
167      */
168     adjust = (int)spu_extract(spu_cmpgt(spu_promote(soffset2, 0), spu_promote(doffset2, 0)), 0);
169     delta  = (int)doffset2 - (int)soffset2;
170     delta += adjust & 16;
171 
172     shuffle = (vec_uchar16)spu_sub(VEC_LITERAL(vec_uint4, 0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F),
173 				   (vec_uint4)spu_splats((unsigned char)delta));
174 
175     vSrc -= adjust;
176 
177     sdata2 = *vSrc--;
178     sdata1 = *vSrc--;
179 
180     ddata = *vDst;
181     sdata = spu_shuffle(sdata1, sdata2, shuffle);
182 
183     /* Construct a series of masks used to data insert. The masks
184      * contain 0 when the destination word is unchanged, 1 when it
185      * must be replaced by source bytes.
186      *
187      * mask1 = mask for leading unchanged bytes
188      * mask2 = mask for trailing unchange bytes
189      * mask3 = mask indicating the more than one qword is being changed.
190      */
191     mask  = one;
192     mask1 = spu_rlmaskqwbyte(mask, -doffset1);
193     mask2 = spu_slqwbyte(mask, 15-doffset2);
194     mask3 = (vec_uchar16)spu_cmpgt(spu_splats((int)(doffset2 - n)), -2);
195 
196     *vDst-- = spu_sel(ddata, sdata, spu_and(mask2, spu_orc(mask1, mask3)));
197 
198     n -= doffset2 + 1;
199 
200     /* Handle complete destination quadwords
201      */
202     while ((int)n > 15) {
203       sdata2 = sdata1;
204       sdata1 = *vSrc--;
205       *vDst-- = spu_shuffle(sdata1, sdata2, shuffle);
206       n -= 16;
207     }
208 
209     /* Handle any trailing partial (destination) quadwords
210      */
211     mask = spu_and((vec_uchar16)spu_cmpgt(spu_splats((int)n), 0), mask1);
212     *vDst = spu_sel(*vDst, spu_shuffle(*vSrc, sdata1, shuffle), mask);
213   }
214   return (dest);
215 }
216 
217