1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_logsumexp_f32.c
4 * Description: LogSumExp
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30 #include <limits.h>
31 #include <math.h>
32
33
34 /**
35 * @addtogroup Kullback-Leibler
36 * @{
37 */
38
39
40 /**
41 * @brief Kullback-Leibler
42 *
43 * Distribution A may contain 0 with Neon version.
44 * Result will be right but some exception flags will be set.
45 *
46 * Distribution B must not contain 0 probability.
47 *
48 * @param[in] *pSrcA points to an array of input values for probaility distribution A.
49 * @param[in] *pSrcB points to an array of input values for probaility distribution B.
50 * @param[in] blockSize number of samples in the input array.
51 * @return Kullback-Leibler divergence D(A || B)
52 *
53 */
54
55 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
56
57 #include "arm_helium_utils.h"
58 #include "arm_vec_math.h"
59
arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)60 float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
61 {
62 uint32_t blkCnt;
63 float32_t accum, pA,pB;
64
65
66 blkCnt = blockSize;
67
68 accum = 0.0f;
69
70 f32x4_t vSum = vdupq_n_f32(0.0f);
71 blkCnt = blockSize >> 2;
72 while(blkCnt > 0)
73 {
74 f32x4_t vecA = vld1q(pSrcA);
75 f32x4_t vecB = vld1q(pSrcB);
76 f32x4_t vRatio;
77
78 vRatio = vdiv_f32(vecB, vecA);
79 vSum = vaddq_f32(vSum, vmulq(vecA, vlogq_f32(vRatio)));
80
81 /*
82 * Decrement the blockSize loop counter
83 * Advance vector source and destination pointers
84 */
85 pSrcA += 4;
86 pSrcB += 4;
87 blkCnt --;
88 }
89
90 accum = vecAddAcrossF32Mve(vSum);
91
92 blkCnt = blockSize & 3;
93 while(blkCnt > 0)
94 {
95 pA = *pSrcA++;
96 pB = *pSrcB++;
97 accum += pA * logf(pB / pA);
98
99 blkCnt--;
100
101 }
102
103 return(-accum);
104 }
105
106 #else
107 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
108
109 #include "NEMath.h"
110
arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)111 float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
112 {
113 const float32_t *pInA, *pInB;
114 uint32_t blkCnt;
115 float32_t accum, pA,pB;
116
117 float32x4_t accumV;
118 float32x2_t accumV2;
119 float32x4_t tmpVA, tmpVB,tmpV;
120
121 pInA = pSrcA;
122 pInB = pSrcB;
123
124 accum = 0.0f;
125 accumV = vdupq_n_f32(0.0f);
126
127 blkCnt = blockSize >> 2;
128 while(blkCnt > 0)
129 {
130 tmpVA = vld1q_f32(pInA);
131 pInA += 4;
132
133 tmpVB = vld1q_f32(pInB);
134 pInB += 4;
135
136 tmpV = vinvq_f32(tmpVA);
137 tmpVB = vmulq_f32(tmpVB, tmpV);
138 tmpVB = vlogq_f32(tmpVB);
139
140 accumV = vmlaq_f32(accumV, tmpVA, tmpVB);
141
142 blkCnt--;
143
144 }
145
146 accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
147 accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
148
149 blkCnt = blockSize & 3;
150 while(blkCnt > 0)
151 {
152 pA = *pInA++;
153 pB = *pInB++;
154 accum += pA * logf(pB/pA);
155
156 blkCnt--;
157
158 }
159
160 return(-accum);
161 }
162
163 #else
arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)164 float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
165 {
166 const float32_t *pInA, *pInB;
167 uint32_t blkCnt;
168 float32_t accum, pA,pB;
169
170 pInA = pSrcA;
171 pInB = pSrcB;
172 blkCnt = blockSize;
173
174 accum = 0.0f;
175
176 while(blkCnt > 0)
177 {
178 pA = *pInA++;
179 pB = *pInB++;
180 accum += pA * logf(pB / pA);
181
182 blkCnt--;
183
184 }
185
186 return(-accum);
187 }
188 #endif
189 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
190
191 /**
192 * @} end of Kullback-Leibler group
193 */
194