1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_logsumexp_f32.c
4  * Description:  LogSumExp
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/statistics_functions.h"
30 #include <limits.h>
31 #include <math.h>
32 
33 
34 /**
35  * @addtogroup Kullback-Leibler
36  * @{
37  */
38 
39 
40 /**
41  * @brief Kullback-Leibler
42  *
43  * Distribution A may contain 0 with Neon version.
44  * Result will be right but some exception flags will be set.
45  *
46  * Distribution B must not contain 0 probability.
47  *
48  * @param[in]  *pSrcA         points to an array of input values for probaility distribution A.
49  * @param[in]  *pSrcB         points to an array of input values for probaility distribution B.
50  * @param[in]  blockSize      number of samples in the input array.
51  * @return Kullback-Leibler divergence D(A || B)
52  *
53  */
54 
55 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
56 
57 #include "arm_helium_utils.h"
58 #include "arm_vec_math.h"
59 
arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)60 float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
61 {
62     uint32_t blkCnt;
63     float32_t accum, pA,pB;
64 
65 
66     blkCnt = blockSize;
67 
68     accum = 0.0f;
69 
70     f32x4_t         vSum = vdupq_n_f32(0.0f);
71     blkCnt = blockSize >> 2;
72     while(blkCnt > 0)
73     {
74         f32x4_t         vecA = vld1q(pSrcA);
75         f32x4_t         vecB = vld1q(pSrcB);
76         f32x4_t         vRatio;
77 
78         vRatio = vdiv_f32(vecB, vecA);
79         vSum = vaddq_f32(vSum, vmulq(vecA, vlogq_f32(vRatio)));
80 
81         /*
82          * Decrement the blockSize loop counter
83          * Advance vector source and destination pointers
84          */
85         pSrcA += 4;
86         pSrcB += 4;
87         blkCnt --;
88     }
89 
90     accum = vecAddAcrossF32Mve(vSum);
91 
92     blkCnt = blockSize & 3;
93     while(blkCnt > 0)
94     {
95        pA = *pSrcA++;
96        pB = *pSrcB++;
97        accum += pA * logf(pB / pA);
98 
99        blkCnt--;
100 
101     }
102 
103     return(-accum);
104 }
105 
106 #else
107 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
108 
109 #include "NEMath.h"
110 
arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)111 float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
112 {
113     const float32_t *pInA, *pInB;
114     uint32_t blkCnt;
115     float32_t accum, pA,pB;
116 
117     float32x4_t accumV;
118     float32x2_t accumV2;
119     float32x4_t tmpVA, tmpVB,tmpV;
120 
121     pInA = pSrcA;
122     pInB = pSrcB;
123 
124     accum = 0.0f;
125     accumV = vdupq_n_f32(0.0f);
126 
127     blkCnt = blockSize >> 2;
128     while(blkCnt > 0)
129     {
130       tmpVA = vld1q_f32(pInA);
131       pInA += 4;
132 
133       tmpVB = vld1q_f32(pInB);
134       pInB += 4;
135 
136       tmpV = vinvq_f32(tmpVA);
137       tmpVB = vmulq_f32(tmpVB, tmpV);
138       tmpVB = vlogq_f32(tmpVB);
139 
140       accumV = vmlaq_f32(accumV, tmpVA, tmpVB);
141 
142       blkCnt--;
143 
144     }
145 
146     accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
147     accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
148 
149     blkCnt = blockSize & 3;
150     while(blkCnt > 0)
151     {
152        pA = *pInA++;
153        pB = *pInB++;
154        accum += pA * logf(pB/pA);
155 
156        blkCnt--;
157 
158     }
159 
160     return(-accum);
161 }
162 
163 #else
arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)164 float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
165 {
166     const float32_t *pInA, *pInB;
167     uint32_t blkCnt;
168     float32_t accum, pA,pB;
169 
170     pInA = pSrcA;
171     pInB = pSrcB;
172     blkCnt = blockSize;
173 
174     accum = 0.0f;
175 
176     while(blkCnt > 0)
177     {
178        pA = *pInA++;
179        pB = *pInB++;
180        accum += pA * logf(pB / pA);
181 
182        blkCnt--;
183 
184     }
185 
186     return(-accum);
187 }
188 #endif
189 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
190 
191 /**
192  * @} end of Kullback-Leibler group
193  */
194