1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_barycenter_f32.c
4  * Description:  Barycenter
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/support_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 
34 /**
35   @ingroup barycenter
36   @{
37  */
38 
39 
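/**
 * @brief Barycenter
 *
 * Computes the weighted mean of nbVectors vectors of dimension vecDim:
 * out = sum(weights[i] * vector[i]) / sum(weights[i]).
 * The sum of the weights is used as the divisor without being checked,
 * so it must be non-zero.
 *
 * @param[in]    *in         List of vectors, stored one after another (nbVectors * vecDim values)
 * @param[in]    *weights    Weights of the vectors (nbVectors values)
 * @param[out]   *out        Barycenter (vecDim values)
 * @param[in]    nbVectors   Number of vectors
 * @param[in]    vecDim      Dimension of space (vector dimension)
 * @return       None
 *
 */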
52 
53 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * MVE implementation (built when ARM_MATH_MVEF is defined and
 * ARM_MATH_AUTOVECTORIZE is not).
 *
 * out is first cleared and then used as the accumulator: every input vector,
 * scaled by its weight, is added to it, and the result is finally multiplied
 * by the reciprocal of the sum of the weights.
 *
 * in        nbVectors vectors of vecDim values, stored one after another
 * weights   one weight per vector (nbVectors values)
 * out       receives the vecDim values of the barycenter
 *
 * The sum of the weights is not checked: if it is zero the reciprocal below
 * is a division by zero.
 */
void arm_barycenter_f32(const float32_t *in,
  const float32_t *weights,
  float32_t *out,
  uint32_t nbVectors,
  uint32_t vecDim)
{
    const float32_t *pIn, *pW;
    const float32_t *pIn1, *pIn2, *pIn3, *pIn4;
    float32_t      *pOut;
    uint32_t        blkCntVector, blkCntSample;
    float32_t       accum, w;

    /* Running sum of the weights */
    accum = 0.0f;

    pW = weights;
    pIn = in;

    /* out doubles as the accumulator, so it has to start at zero */
    arm_fill_f32(0.0f, out, vecDim);

    /* Sum: four input vectors per pass */
    blkCntVector = nbVectors >> 2;
    while (blkCntVector > 0)
    {
        f32x4_t         outV, inV1, inV2, inV3, inV4;
        float32_t       w1, w2, w3, w4;

        /*
         * The four row pointers are derived inside the loop, from the start
         * of the current group, so that no pointer is ever formed further
         * than one element past the end of the input (this matters when
         * nbVectors < 4 and after the last group).
         */
        pIn1 = pIn;
        pIn2 = pIn1 + vecDim;
        pIn3 = pIn2 + vecDim;
        pIn4 = pIn3 + vecDim;

        pOut = out;
        w1 = *pW++;
        w2 = *pW++;
        w3 = *pW++;
        w4 = *pW++;
        accum += w1 + w2 + w3 + w4;

        /* Four components at a time: outV += inVk * wk for the four rows */
        blkCntSample = vecDim >> 2;
        while (blkCntSample > 0) {
            outV = vld1q((const float32_t *) pOut);
            inV1 = vld1q(pIn1);
            inV2 = vld1q(pIn2);
            inV3 = vld1q(pIn3);
            inV4 = vld1q(pIn4);
            outV = vfmaq(outV, inV1, w1);
            outV = vfmaq(outV, inV2, w2);
            outV = vfmaq(outV, inV3, w3);
            outV = vfmaq(outV, inV4, w4);
            vst1q(pOut, outV);

            pOut += 4;
            pIn1 += 4;
            pIn2 += 4;
            pIn3 += 4;
            pIn4 += 4;

            blkCntSample--;
        }

        /* Remaining 0..3 components, one at a time */
        blkCntSample = vecDim & 3;
        while (blkCntSample > 0) {
            *pOut = *pOut + *pIn1++ * w1;
            *pOut = *pOut + *pIn2++ * w2;
            *pOut = *pOut + *pIn3++ * w3;
            *pOut = *pOut + *pIn4++ * w4;
            pOut++;
            blkCntSample--;
        }

        /*
         * pIn4 has walked the whole fourth row, so it now sits on the first
         * row of the next group (or one past the end of the processed data).
         */
        pIn = pIn4;

        blkCntVector--;
    }

    /* Sum: the 0..3 vectors left over, one vector per pass */
    blkCntVector = nbVectors & 3;
    while (blkCntVector > 0)
    {
        f32x4_t         inV, outV;

        pOut = out;
        w = *pW++;
        accum += w;

        blkCntSample = vecDim >> 2;
        while (blkCntSample > 0)
        {
            outV = vld1q_f32(pOut);
            inV = vld1q_f32(pIn);
            outV = vfmaq(outV, inV, w);
            vst1q_f32(pOut, outV);
            pOut += 4;
            pIn += 4;

            blkCntSample--;
        }

        blkCntSample = vecDim & 3;
        while (blkCntSample > 0)
        {
            *pOut = *pOut + *pIn++ * w;
            pOut++;
            blkCntSample--;
        }

        blkCntVector--;
    }

    /* Normalize: multiply by 1 / (sum of the weights) */
    pOut = out;
    accum = 1.0f / accum;

    blkCntSample = vecDim >> 2;
    while (blkCntSample > 0)
    {
        f32x4_t         tmp;

        tmp = vld1q((const float32_t *) pOut);
        tmp = vmulq(tmp, accum);
        vst1q(pOut, tmp);
        pOut += 4;
        blkCntSample--;
    }

    blkCntSample = vecDim & 3;
    while (blkCntSample > 0)
    {
        *pOut = *pOut * accum;
        pOut++;
        blkCntSample--;
    }
}
196 #else
197 #if defined(ARM_MATH_NEON)
198 
199 #include "NEMath.h"
/*
 * Neon implementation (built when ARM_MATH_NEON is defined).
 *
 * out is first cleared and then used as the accumulator: every input vector,
 * scaled by its weight, is added to it, and the result is finally multiplied
 * by the reciprocal of the sum of the weights.
 *
 * in        nbVectors vectors of vecDim values, stored one after another
 * weights   one weight per vector (nbVectors values)
 * out       receives the vecDim values of the barycenter
 *
 * The sum of the weights is not checked: if it is zero the reciprocal below
 * is a division by zero.
 */
void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors, uint32_t vecDim)
{

   const float32_t *pIn, *pW, *pIn1, *pIn2, *pIn3, *pIn4;
   float32_t *pOut;
   uint32_t blkCntVector, blkCntSample;
   float32_t accum, w, w1, w2, w3, w4;

   float32x4_t tmp, inV, outV, inV1, inV2, inV3, inV4;

   /* Running sum of the weights */
   accum = 0.0f;

   pW = weights;
   pIn = in;

   /* Set the output to 0: it doubles as the accumulator */
   tmp = vdupq_n_f32(0.0f);
   pOut = out;

   blkCntSample = vecDim >> 2;
   while(blkCntSample > 0)
   {
         vst1q_f32(pOut, tmp);
         pOut += 4;
         blkCntSample--;
   }

   blkCntSample = vecDim & 3;
   while(blkCntSample > 0)
   {
         *pOut = 0.0f;
         pOut++;
         blkCntSample--;
   }

   /* Sum: four input vectors per pass */
   blkCntVector = nbVectors >> 2;
   while(blkCntVector > 0)
   {
      /*
       * The four row pointers are derived inside the loop, from the start
       * of the current group, so that no pointer is ever formed further
       * than one element past the end of the input (this matters when
       * nbVectors < 4 and after the last group).
       */
      pIn1 = pIn;
      pIn2 = pIn1 + vecDim;
      pIn3 = pIn2 + vecDim;
      pIn4 = pIn3 + vecDim;

      pOut = out;
      w1 = *pW++;
      w2 = *pW++;
      w3 = *pW++;
      w4 = *pW++;
      accum += w1 + w2 + w3 + w4;

      /* Four components at a time: outV += inVk * wk for the four rows */
      blkCntSample = vecDim >> 2;
      while(blkCntSample > 0)
      {
          outV = vld1q_f32(pOut);
          inV1 = vld1q_f32(pIn1);
          inV2 = vld1q_f32(pIn2);
          inV3 = vld1q_f32(pIn3);
          inV4 = vld1q_f32(pIn4);
          outV = vmlaq_n_f32(outV,inV1,w1);
          outV = vmlaq_n_f32(outV,inV2,w2);
          outV = vmlaq_n_f32(outV,inV3,w3);
          outV = vmlaq_n_f32(outV,inV4,w4);
          vst1q_f32(pOut, outV);
          pOut += 4;
          pIn1 += 4;
          pIn2 += 4;
          pIn3 += 4;
          pIn4 += 4;

          blkCntSample--;
      }

      /* Remaining 0..3 components, one at a time */
      blkCntSample = vecDim & 3;
      while(blkCntSample > 0)
      {
          *pOut = *pOut + *pIn1++ * w1;
          *pOut = *pOut + *pIn2++ * w2;
          *pOut = *pOut + *pIn3++ * w3;
          *pOut = *pOut + *pIn4++ * w4;
          pOut++;
          blkCntSample--;
      }

      /*
       * pIn4 has walked the whole fourth row, so it now sits on the first
       * row of the next group (or one past the end of the processed data).
       */
      pIn = pIn4;

      blkCntVector--;
   }

   /* Sum: the 0..3 vectors left over, one vector per pass */
   blkCntVector = nbVectors & 3;
   while(blkCntVector > 0)
   {
      pOut = out;
      w = *pW++;
      accum += w;

      blkCntSample = vecDim >> 2;
      while(blkCntSample > 0)
      {
          outV = vld1q_f32(pOut);
          inV = vld1q_f32(pIn);
          outV = vmlaq_n_f32(outV,inV,w);
          vst1q_f32(pOut, outV);
          pOut += 4;
          pIn += 4;

          blkCntSample--;
      }

      blkCntSample = vecDim & 3;
      while(blkCntSample > 0)
      {
          *pOut = *pOut + *pIn++ * w;
          pOut++;
          blkCntSample--;
      }

      blkCntVector--;
   }

   /* Normalize: multiply by 1 / (sum of the weights) */
   pOut = out;
   accum = 1.0f / accum;

   blkCntSample = vecDim >> 2;
   while(blkCntSample > 0)
   {
         tmp = vld1q_f32(pOut);
         tmp = vmulq_n_f32(tmp,accum);
         vst1q_f32(pOut, tmp);
         pOut += 4;
         blkCntSample--;
   }

   blkCntSample = vecDim & 3;
   while(blkCntSample > 0)
   {
         *pOut = *pOut * accum;
         pOut++;
         blkCntSample--;
   }

}
352 #else
/*
 * Portable scalar implementation (built when neither the MVE nor the Neon
 * variant is selected).
 *
 * out is first cleared and then used as the accumulator: every input vector,
 * scaled by its weight, is added to it, and each component is finally divided
 * by the sum of the weights.
 *
 * in        nbVectors vectors of vecDim values, stored one after another
 * weights   one weight per vector (nbVectors values)
 * out       receives the vecDim values of the barycenter
 *
 * The sum of the weights is not checked: if it is zero (which includes
 * nbVectors == 0) the normalization step divides by zero.
 */
void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors, uint32_t vecDim)
{

   const float32_t *pIn = in;       /* walks all input values in storage order */
   const float32_t *pW = weights;   /* walks the weights, one per vector */
   float32_t *pOut;
   uint32_t blkCntVector, blkCntSample;
   float32_t accum = 0.0f;          /* running sum of the weights */
   float32_t w;

   /* Set the output to 0: it doubles as the accumulator */
   pOut = out;
   blkCntSample = vecDim;
   while(blkCntSample > 0)
   {
         *pOut = 0.0f;
         pOut++;
         blkCntSample--;
   }

   /* Sum: out += w * vector, one input vector per pass */
   blkCntVector = nbVectors;
   while(blkCntVector > 0)
   {
      pOut = out;
      w = *pW++;
      accum += w;

      blkCntSample = vecDim;
      while(blkCntSample > 0)
      {
          *pOut = *pOut + *pIn++ * w;
          pOut++;
          blkCntSample--;
      }

      blkCntVector--;
   }

   /* Normalize: divide every component by the sum of the weights */
   pOut = out;
   blkCntSample = vecDim;
   while(blkCntSample > 0)
   {
         *pOut = *pOut / accum;
         pOut++;
         blkCntSample--;
   }

}
410 #endif
411 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
412 
413 /**
414  * @} end of barycenter group
415  */
416