/*
 * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_compiler.h
 * Description:  Generic compiler header
 *
 * $Date:        16 January 2024
 * $Revision:    V.1.2.2
 *
 * Target :  Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */

#ifndef ARM_NN_COMPILER_H
#define ARM_NN_COMPILER_H

/**
 *
 * @brief Arm C-Language Extension(ACLE) Includes
 *
 */

#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)

    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE __inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static __inline
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#elif defined(__ICCARM__)

    #warning IAR support is not tested
    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static inline
    #endif
    #ifndef __FORCEINLINE
        #define __FORCEINLINE _Pragma("inline=forced")
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#elif defined(_MSC_VER)

    // Builds for non-Arm Cortex-M processors are not tested or supported.
    // Use this section to stub any macros or intrinsics
    #warning Unsupported compiler
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE static __forceinline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static __inline
    #endif
    #ifndef __ALIGNED
        #define __ALIGNED(x) __declspec(align(x))
    #endif

#elif defined(__GNUC__)

    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static inline
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#else

    #error Unsupported compiler. Add support as needed

#endif

/**
 *
 * @brief Compiler specific diagnostic adjustment / fixes if applicable
 *
 */

// Note: __ARM_ARCH is used with M-profile architecture as the target here.
#if defined(__GNUC__)
    #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH)
        // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x
        // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html
        #define ARM_GCC_12_2_ICE
    #endif
#endif
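
// Usage sketch (illustrative, not part of the original header): code affected
// by the 12.2.x ICE can branch on this flag, for example:
//
//     #if defined(ARM_GCC_12_2_ICE)
//         /* alternative formulation that avoids the compiler crash */
//     #else
//         /* preferred implementation */
//     #endif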

#if defined(__ARM_FEATURE_MVE) && ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1)
    #include <arm_mve.h>
#endif

#if defined(__ARM_ARCH) || defined(__ARM_ACLE)
    #include <arm_acle.h>
#endif

#if defined(__GNUC__)
    #include <stdint.h>
#endif

/**
 *
 * @brief ACLE and Intrinsics
 *
 */

// Note: Keep the __GNUC__ checks (used to detect GCC) last, as __GNUC__ is
// also defined by some non-GCC compilers.

/* Common intrinsics for all architectures */
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
    #define CLZ __clz
#elif defined(__GNUC__)
/**
  \brief   Count leading zeros
  \details Counts the number of leading zeros of a data value.
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value
 */
__STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
{
    /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
       __builtin_clz(0) is undefined behaviour, so handle this case specially.
       This guarantees Arm-compatible results if compiling on a non-Arm
       target, and ensures the compiler doesn't decide to activate any
       optimisations using the logic "value was passed to __builtin_clz, so it
       is non-zero".
       ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
       single CLZ instruction.
     */
    if (value == 0U)
    {
        return 32U;
    }
    return __builtin_clz(value);
}
#endif
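
// Usage sketch (illustrative, not part of the original header): for a non-zero
// x, 31 - CLZ(x) is the bit position of its most significant set bit, which is
// how leading-zero counts are typically turned into normalisation shifts:
//
//     uint32_t x = 0x00012345U;
//     uint32_t msb = 31U - CLZ(x); /* 16 */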

// ACLE intrinsics under the groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP, __ARM_FEATURE_SAT and __ARM_FEATURE_SIMD32

// Note: Just __ARM_FEATURE_DSP is checked to collect all intrinsics from the above-mentioned groups

#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))

    // Common intrinsics
    #define SMLABB __smlabb
    #define SMLATT __smlatt
    #define QADD __qadd
    #define QSUB8 __qsub8
    #define QSUB16 __qsub16
    #define SADD16 __sadd16

    // Compiler specific variants of intrinsics. Create a new section or file for IAR if needed
    #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)

        #define SMULBB __smulbb
        #define SMULTT __smultt
        #define ROR __ror
        #define SXTB16 __sxtb16
        #define SXTAB16 __sxtab16
        #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2))
        #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3))
        #define SMLAD __smlad
        // PKH<XY> translates into pkh<xy> on AC6
        #define PKHBT(ARG1, ARG2, ARG3)                                                                                \
            (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL))
        #define PKHTB(ARG1, ARG2, ARG3)                                                                                \
            (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL))
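
        // Worked example (illustrative, not part of the original header):
        // PKHBT keeps the bottom halfword of ARG1 and packs the shifted ARG2 on top,
        // PKHTB keeps the top halfword of ARG1 and packs the shifted ARG2 at the bottom:
        //     PKHBT(0x0000AAAAUL, 0x0000BBBBUL, 16) == 0xBBBBAAAAUL
        //     PKHTB(0xAAAA0000UL, 0xBBBB0000UL, 16) == 0xAAAABBBBUL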

    #elif defined(__GNUC__)

        #define PKHBT(ARG1, ARG2, ARG3)                                                                                \
            __extension__({                                                                                            \
                uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2);                                                      \
                __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3));                 \
                __RES;                                                                                                 \
            })
        #define PKHTB(ARG1, ARG2, ARG3)                                                                                \
            __extension__({                                                                                            \
                uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2);                                                      \
                if (ARG3 == 0)                                                                                         \
                    __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2));                                \
                else                                                                                                   \
                    __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3));             \
                __RES;                                                                                                 \
            })

__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1)
{
    uint32_t result;

    __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1));
    return (result);
}
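
// Behaviour sketch (illustrative, not part of the original header): SXTB16
// sign-extends bytes 0 and 2 of its operand into two 16-bit lanes, and SXTAB16
// additionally adds the corresponding lanes of its first operand:
//     SXTB16(0x007F0080U)               == 0x007FFF80U  /* 0x7F -> 0x007F, 0x80 -> 0xFF80 */
//     SXTAB16(0x00010001U, 0x007F0080U) == 0x0080FF81U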

// __smlad is defined by GCC, but using it results in a performance drop (tested on Arm GNU Toolchain versions 11.x and 12.x)
__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3)
{
    uint32_t result;

    __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3));
    return (result);
}
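
// Behaviour sketch (illustrative, not part of the original header): SMLAD is a
// dual signed 16x16 multiply-accumulate, i.e.
// result = op3 + op1[15:0] * op2[15:0] + op1[31:16] * op2[31:16]:
//     SMLAD(0x00030002U, 0x00050004U, 10U) == 33U  /* 10 + 2*4 + 3*5 */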

__STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2)
{
    op2 %= 32U;
    if (op2 == 0U)
    {
        return op1;
    }
    return (op1 >> op2) | (op1 << (32U - op2));
}

__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate));
    }
    else
    {
        result = SXTB16(ROR(op1, rotate));
    }
    return result;
}

__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate));
    }
    else
    {
        result = SXTAB16(op1, ROR(op2, rotate));
    }
    return result;
}
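
// Behaviour sketch (illustrative, not part of the original header): the _RORn
// variants rotate before extracting, so SXTB16_RORn(x, 8) sign-extends bytes 1
// and 3 of x, which is useful when unpacking packed int8 data:
//     SXTB16_RORn(0xAABBCCDDU, 8U) == 0xFFAAFFCCU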

// Inline assembly routines for ACLE intrinsics that are not defined by GCC toolchain
__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
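
// Behaviour sketch (illustrative, not part of the original header): SMULBB
// multiplies the signed bottom halfwords, SMULTT the signed top halfwords:
//     SMULBB(0x00030002U, 0x00050004U) == 8U   /* 2 * 4 */
//     SMULTT(0x00030002U, 0x00050004U) == 15U  /* 3 * 5 */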
    #endif

#endif

#endif /* #ifndef ARM_NN_COMPILER_H */