1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_scale_f32.c
4 * Description: Multiplies a floating-point vector by a scalar
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/basic_math_functions.h"
30
31 /**
32 @ingroup groupMath
33 */
34
35 /**
36 @defgroup BasicScale Vector Scale
37
38 Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
39
40 <pre>
41 pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
42 </pre>
43
44 In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
45 a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
46 The shift allows the gain of the scaling operation to exceed 1.0.
47 The algorithm used with fixed-point data is:
48
49 <pre>
50 pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
51 </pre>
52
53 The overall scale factor applied to the fixed-point data is
54 <pre>
55 scale = scaleFract * 2^shift.
56 </pre>
57
58 The functions support in-place computation allowing the source and destination
59 pointers to reference the same memory buffer.
60 */
61
62 /**
63 @addtogroup BasicScale
64 @{
65 */
66
67 /**
68 @brief Multiplies a floating-point vector by a scalar.
69 @param[in] pSrc points to the input vector
70 @param[in] scale scale factor to be applied
71 @param[out] pDst points to the output vector
72 @param[in] blockSize number of samples in each vector
73 @return none
74 */
75
76 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
77
78 #include "arm_helium_utils.h"
79
/*
 * Helium (MVE) implementation of the floating-point vector scale:
 *
 *     pDst[n] = pSrc[n] * scale,   0 <= n < blockSize
 *
 * pSrc       points to the input vector
 * scale      scale factor to be applied
 * pDst       points to the output vector
 * blockSize  number of samples in each vector
 *
 * Complete groups of four samples are processed in the main loop. The
 * remaining 1..3 samples are processed with a single predicated load and
 * a single predicated store, so no element at or beyond index blockSize is
 * read from pSrc or written to pDst.
 */
void arm_scale_f32(
  const float32_t * pSrc,
        float32_t scale,
        float32_t * pDst,
        uint32_t blockSize)
{
    uint32_t blkCnt;                               /* Loop counter */

    f32x4_t vec1;
    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A * scale */

        /* Scale the input and then store the results in the destination buffer. */
        vec1 = vld1q(pSrc);
        res = vmulq(vec1, scale);
        vst1q(pDst, res);

        /* Increment pointers */
        pSrc += 4;
        pDst += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 0..3 samples left */
    blkCnt = blockSize & 0x3U;

    if (blkCnt > 0U)
    {
        /* Predicate selecting only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);

        /*
         * Load under the same predicate as the store. An unpredicated
         * 4-lane load here would touch up to three floats past the end of
         * the source buffer; the zeroing predicated load only accesses the
         * active lanes.
         */
        vec1 = vldrwq_z_f32(pSrc, p0);
        vstrwq_p(pDst, vmulq(vec1, scale), p0);
    }
}
123
124 #else
/*
 * Portable implementation of the floating-point vector scale:
 *
 *     pDst[n] = pSrc[n] * scale,   0 <= n < blockSize
 *
 * pSrc       points to the input vector
 * scale      scale factor to be applied
 * pDst       points to the output vector
 * blockSize  number of samples in each vector
 *
 * Depending on the build configuration, groups of four samples are first
 * processed either with NEON intrinsics (ARM_MATH_NEON_EXPERIMENTAL) or
 * with a manually unrolled scalar loop (ARM_MATH_LOOPUNROLL). Whatever is
 * left over (or the whole vector when neither option is enabled) is
 * handled one sample at a time by the final loop.
 */
void arm_scale_f32(
  const float32_t *pSrc,
        float32_t scale,
        float32_t *pDst,
        uint32_t blockSize)
{
    uint32_t remaining;                            /* Iterations still to run */

#if defined(ARM_MATH_NEON_EXPERIMENTAL)

    f32x4_t inVec;
    f32x4_t outVec;

    /* One iteration per complete group of four samples */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* Load four inputs, multiply each by scale, store four outputs */
        inVec = vld1q_f32(pSrc);
        outVec = vmulq_f32(inVec, vdupq_n_f32(scale));
        vst1q_f32(pDst, outVec);

        pSrc += 4;
        pDst += 4;
    }

    /* Samples that did not fill a complete group of four */
    remaining = blockSize & 0x3U;

#elif defined (ARM_MATH_LOOPUNROLL)

    /* One iteration per complete group of four samples */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* All four inputs are read before any output is written */
        float32_t out0 = pSrc[0] * scale;
        float32_t out1 = pSrc[1] * scale;
        float32_t out2 = pSrc[2] * scale;
        float32_t out3 = pSrc[3] * scale;

        pDst[0] = out0;
        pDst[1] = out1;
        pDst[2] = out2;
        pDst[3] = out3;

        pSrc += 4;
        pDst += 4;
    }

    /* Samples that did not fill a complete group of four */
    remaining = blockSize & 0x3U;

#else

    /* No vector or unrolled pass: every sample goes through the loop below */
    remaining = blockSize;

#endif /* ARM_MATH_NEON_EXPERIMENTAL / ARM_MATH_LOOPUNROLL */

    /* Scalar loop: one output per iteration */
    for (; remaining > 0U; remaining--)
    {
        *pDst = (*pSrc) * scale;

        pSrc++;
        pDst++;
    }
}
212 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
213
214 /**
215 @} end of BasicScale group
216 */
217