1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_weighted_sum_f32.c
4  * Description:  Weighted Sum
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include <limits.h>
30 #include <math.h>
31 
32 #include "dsp/support_functions.h"
33 
34 /**
35  * @addtogroup weightedsum
36  * @{
37  */
38 
39 
40 /**
41  * @brief Weighted sum
42  *
43  *
44  * @param[in]    *in           Array of input values.
 * @param[in]    *weigths      Array of weights, one per input value.
 * @param[in]    blockSize     Number of samples in the input array.
 * @return       Sum of in[i] * weigths[i] divided by the sum of weigths[i].
48  *
49  */
50 
51 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 
arm_weighted_sum_f32(const float32_t * in,const float32_t * weigths,uint32_t blockSize)55 float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
56 {
57     float32_t       accum1, accum2;
58     f32x4_t         accum1V, accum2V;
59     f32x4_t         inV, wV;
60     const float32_t *pIn, *pW;
61     uint32_t        blkCnt;
62 
63 
64     pIn = in;
65     pW = weigths;
66 
67 
68     accum1V = vdupq_n_f32(0.0);
69     accum2V = vdupq_n_f32(0.0);
70 
71     blkCnt = blockSize >> 2;
72     while (blkCnt > 0)
73     {
74         inV = vld1q(pIn);
75         wV = vld1q(pW);
76 
77         pIn += 4;
78         pW += 4;
79 
80         accum1V = vfmaq(accum1V, inV, wV);
81         accum2V = vaddq(accum2V, wV);
82         blkCnt--;
83     }
84 
85     accum1 = vecAddAcrossF32Mve(accum1V);
86     accum2 = vecAddAcrossF32Mve(accum2V);
87 
88     blkCnt = blockSize & 3;
89     while(blkCnt > 0)
90     {
91         accum1 += *pIn++ * *pW;
92         accum2 += *pW++;
93         blkCnt--;
94     }
95 
96 
97     return (accum1 / accum2);
98 }
99 
100 #else
101 #if defined(ARM_MATH_NEON)
102 
103 #include "NEMath.h"
arm_weighted_sum_f32(const float32_t * in,const float32_t * weigths,uint32_t blockSize)104 float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
105 {
106 
107     float32_t accum1, accum2;
108     float32x4_t accum1V, accum2V;
109     float32x2_t tempV;
110 
111     float32x4_t inV,wV;
112 
113     const float32_t *pIn, *pW;
114     uint32_t blkCnt;
115 
116 
117     pIn = in;
118     pW = weigths;
119 
120     accum1=0.0f;
121     accum2=0.0f;
122 
123     accum1V = vdupq_n_f32(0.0f);
124     accum2V = vdupq_n_f32(0.0f);
125 
126     blkCnt = blockSize >> 2;
127     while(blkCnt > 0)
128     {
129         inV = vld1q_f32(pIn);
130         wV = vld1q_f32(pW);
131 
132         pIn += 4;
133         pW += 4;
134 
135         accum1V = vmlaq_f32(accum1V,inV,wV);
136         accum2V = vaddq_f32(accum2V,wV);
137         blkCnt--;
138     }
139 
140     tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V));
141     accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
142 
143     tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V));
144     accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
145 
146     blkCnt = blockSize & 3;
147     while(blkCnt > 0)
148     {
149         accum1 += *pIn++ * *pW;
150         accum2 += *pW++;
151         blkCnt--;
152     }
153 
154 
155     return(accum1 / accum2);
156 }
157 #else
/*
 * Scalar fallback, built when neither the MVE nor the Neon variant is
 * selected.
 *
 * Accumulates in[i] * weigths[i] and weigths[i] over blockSize samples and
 * returns the first total divided by the second. The division is performed
 * unconditionally: there is no check for a zero weight total or for
 * blockSize == 0.
 */
float32_t arm_weighted_sum_f32(const float32_t *in, const float32_t *weigths, uint32_t blockSize)
{
    float32_t weightedTotal = 0.0f;
    float32_t weightTotal = 0.0f;
    uint32_t  idx;

    for (idx = 0; idx < blockSize; idx++)
    {
        const float32_t w = weigths[idx];

        weightedTotal += in[idx] * w;
        weightTotal += w;
    }

    return weightedTotal / weightTotal;
}
182 #endif
183 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
184 
185 /**
186  * @} end of weightedsum group
187  */
188