1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_clip_f16.c
 * Description:  Elementwise floating-point (half-precision) vector clipping
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/basic_math_functions_f16.h"
30 
31 /**
32   @ingroup groupMath
33  */
34 
35 
36 /**
37   @addtogroup BasicClip
38   @{
39  */
40 
41 /**
42   @brief         Elementwise floating-point clipping
43   @param[in]     pSrc          points to input values
44   @param[out]    pDst          points to output clipped values
45   @param[in]     low           lower bound
46   @param[in]     high          higher bound
47   @param[in]     numSamples    number of samples to clip
48   @return        none
49  */
50 
51 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
53 #include "arm_helium_utils.h"
54 
/*
 * Helium (MVE) implementation of the elementwise clip.
 *
 * Every vector of eight half-precision samples is clamped by taking
 * vmaxnmq() against `low` and then vminnmq() against `high`.
 *
 * Loads are issued only for data that will actually be stored: full
 * vectors are loaded inside the guarded loop / tail branch, and the last
 * partial vector (1..7 samples) is loaded and stored under the same
 * vctp16q() predicate.  As a result numSamples == 0 performs no memory
 * access, and no element beyond pSrc[numSamples - 1] is requested.
 */
void arm_clip_f16(const float16_t * pSrc,
  float16_t * pDst,
  float16_t low,
  float16_t high,
  uint32_t numSamples)
{
    uint32_t  blkCnt;
    f16x8_t curVec0, curVec1;
    f16x8_t vecLow, vecHigh;

    /* Broadcast the scalar bounds to all eight lanes once, up front. */
    vecLow = vdupq_n_f16(low);
    vecHigh = vdupq_n_f16(high);

    /*
     * Main loop, unrolled x 2 (16 samples per iteration): the two vectors
     * are independent, so their loads, min/max operations and stores can
     * be interleaved.
     */
    blkCnt = numSamples >> 4;
    while (blkCnt > 0U)
    {
        curVec0 = vld1q(pSrc);
        curVec1 = vld1q(pSrc + 8);
        curVec0 = vmaxnmq(curVec0, vecLow);
        curVec1 = vmaxnmq(curVec1, vecLow);
        curVec0 = vminnmq(curVec0, vecHigh);
        curVec1 = vminnmq(curVec1, vecHigh);
        vst1q(pDst, curVec0);
        vst1q(pDst + 8, curVec1);
        pSrc += 16;
        pDst += 16;
        blkCnt--;
    }

    /*
     * Tail handling: 0..15 samples remain.
     */
    blkCnt = numSamples & 0xFU;
    if (blkCnt >= 8U)
    {
        /* One more complete vector. */
        curVec0 = vld1q(pSrc);
        curVec0 = vmaxnmq(curVec0, vecLow);
        curVec0 = vminnmq(curVec0, vecHigh);
        vst1q(pDst, curVec0);
        pSrc += 8;
        pDst += 8;
        blkCnt -= 8U;
    }

    if (blkCnt > 0U)
    {
        /*
         * Final 1..7 samples: the predicate enables only the first blkCnt
         * lanes for both the load and the store; the zero-filled inactive
         * lanes of the load are computed on but never written back.
         */
        mve_pred16_t p0 = vctp16q(blkCnt);
        curVec0 = vld1q_z(pSrc, p0);
        curVec0 = vmaxnmq(curVec0, vecLow);
        curVec0 = vminnmq(curVec0, vecHigh);
        vstrhq_p(pDst, curVec0, p0);
    }
}
113 
114 #else
115 
116 #if defined(ARM_FLOAT16_SUPPORTED)
117 
/*
 * Portable scalar implementation of the elementwise clip.
 *
 * For each of the numSamples inputs the upper bound is tested first:
 * a sample greater than `high` becomes `high`, otherwise a sample less
 * than `low` becomes `low`, otherwise the sample is copied unchanged
 * (this includes any value for which both comparisons are false).
 */
void arm_clip_f16(const float16_t * pSrc,
  float16_t * pDst,
  float16_t low,
  float16_t high,
  uint32_t numSamples)
{
    const float16_t *in = pSrc;
    float16_t *out = pDst;
    uint32_t remaining = numSamples;

    while (remaining > 0U)
    {
        /* Read the sample once; it is the default result. */
        const float16_t sample = *in++;
        float16_t clipped = sample;

        if (sample > high)
        {
            clipped = high;
        }
        else if (sample < low)
        {
            clipped = low;
        }

        *out++ = clipped;
        remaining--;
    }
}
#endif /* defined(ARM_FLOAT16_SUPPORTED) */
135 
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
137 
138 
139 /**
140   @} end of BasicClip group
141  */
142