/*
 * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:     CMSIS NN Library
 * Title:       arm_nn_compiler.h
 * Description: Generic compiler header
 *
 * $Date:       20 June 2024
 * $Revision:   V.1.3.0
 *
 * Target:      Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */

#ifndef ARM_NN_COMPILER_H
#define ARM_NN_COMPILER_H

/**
 *
 * @brief Arm C Language Extensions (ACLE) includes
 *
 */

#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)

#ifndef __ASM
#define __ASM __asm
#endif
#ifndef __INLINE
#define __INLINE __inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static __inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
#endif
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif

#elif defined(__ICCARM__)

#warning IAR support is not tested
#ifndef __ASM
#define __ASM __asm
#endif
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __FORCEINLINE
#define __FORCEINLINE _Pragma("inline=forced")
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
#endif
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif

#elif defined(_MSC_VER)

// Builds for processors other than Arm Cortex-M are not tested or supported.
// Use this section to stub out any macros or intrinsics.
#warning Unsupported compiler
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static __forceinline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static __inline
#endif
#ifndef __ALIGNED
#define __ALIGNED(x) __declspec(align(x))
#endif

#elif defined(__GNUC__)

#ifndef __ASM
#define __ASM __asm
#endif
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
#endif
#ifndef __RESTRICT
#define __RESTRICT __restrict
#endif

#else

#error Unsupported compiler. Add support as needed

#endif

/**
 *
 * @brief Compiler-specific diagnostic adjustments / fixes, if applicable
 *
 */

// Note: __ARM_ARCH is used here since the target is the M-Profile Architecture.
#if defined(__GNUC__)
#if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH)
// Workaround for an 'Internal Compiler Error' in Arm GNU Toolchain release 12.2.x
// https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html
#define ARM_GCC_12_2_ICE
#endif
#endif
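// A hypothetical consumption sketch (illustrative, not part of the original
// header): code that triggers the 12.2.x ICE can branch on this macro, e.g.
//
//   #if defined(ARM_GCC_12_2_ICE)
//       /* fallback path avoiding the construct miscompiled by 12.2.x */
//   #else
//       /* regular optimised path */
//   #endif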

#if defined(__ARM_FEATURE_MVE) && ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1)
#include <arm_mve.h>
#endif

#if defined(__ARM_ARCH) || defined(__ARM_ACLE)
#include <arm_acle.h>
#endif

#if defined(__GNUC__)
#include <stdint.h>
#endif

/**
 *
 * @brief ACLE and Intrinsics
 *
 */

// Note: Keep the __GNUC__ checks, used to detect GCC, at the end,
// as __GNUC__ is also defined by several non-GCC compilers.

/* Common intrinsics for all architectures */
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
#define CLZ __clz
#elif defined(__GNUC__)
/**
  \brief   Count leading zeros
  \details Counts the number of leading zeros of a data value.
  \param [in]  value  Value to count the leading zeros
  \return             Number of leading zeros in value
 */
__STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
{
    /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
       __builtin_clz(0) is undefined behaviour, so handle this case specially.
       This guarantees Arm-compatible results if compiling on a non-Arm
       target, and ensures the compiler doesn't decide to activate any
       optimisations using the logic "value was passed to __builtin_clz, so it
       is non-zero".
       ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
       single CLZ instruction.
     */
    if (value == 0U)
    {
        return 32U;
    }
    return __builtin_clz(value);
}
#endif
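
// Usage sketch (illustrative addition, not part of the original header): for a
// non-zero x, 31 - CLZ(x) is the bit index of the most significant set bit,
// e.g. CLZ(1) == 31 and CLZ(0x80000000) == 0, while CLZ(0) == 32 by the
// explicit guard above (matching the behaviour of the Arm CLZ instruction).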

// ACLE intrinsics under the groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP, __ARM_FEATURE_SAT and __ARM_FEATURE_SIMD32

// Note: Only __ARM_FEATURE_DSP is checked, which is sufficient to collect all intrinsics from the above-mentioned groups

#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))

// Common intrinsics
#define SMLABB __smlabb
#define SMLATT __smlatt
#define SMLALD __smlald
#define QADD __qadd
#define QSUB8 __qsub8
#define QSUB16 __qsub16
#define SADD16 __sadd16
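
// Semantics sketch (illustrative, not part of the original header): these map
// to the ACLE DSP intrinsics, which operate on packed lanes with signed
// saturation where applicable. For example, SMLABB multiplies the bottom
// halfwords and accumulates, SMLATT does the same with the top halfwords, and
// QSUB8 subtracts four pairs of int8 lanes:
//
//   SMLABB(0x00010002, 0x00030004, 10) == 2 * 4 + 10 == 18
//   SMLATT(0x00010002, 0x00030004, 10) == 1 * 3 + 10 == 13
//   QSUB8(0x7F010203, 0x01020304)      == 0x7EFFFFFF  // per-lane: 0x7E, -1, -1, -1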

// Compiler-specific variants of intrinsics. Create a new section or file for IAR if needed
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)

#define SMULBB __smulbb
#define SMULTT __smultt
#define ROR __ror
#define SXTB16 __sxtb16
#define SXTAB16 __sxtab16
#define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2))
#define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3))
#define SMLAD __smlad
// The pure C patterns below are recognised by AC6 and compiled to the PKH<XY> instructions, i.e. pkhbt / pkhtb
#define PKHBT(ARG1, ARG2, ARG3) \
    (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL))
#define PKHTB(ARG1, ARG2, ARG3) \
    (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL))
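
// Worked example (illustrative, not part of the original header): PKHBT keeps
// the bottom halfword of ARG1 and packs the left-shifted ARG2 into the top
// halfword; PKHTB keeps the top halfword of ARG1 and packs the right-shifted
// ARG2 into the bottom halfword:
//
//   PKHBT(0x00001111, 0x00002222, 16) == 0x22221111
//   PKHTB(0x11110000, 0x22220000, 16) == 0x11112222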

#elif defined(__GNUC__)

#define PKHBT(ARG1, ARG2, ARG3) \
    __extension__({ \
        uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
        __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \
        __RES; \
    })
#define PKHTB(ARG1, ARG2, ARG3) \
    __extension__({ \
        uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
        if (ARG3 == 0) \
            __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2)); \
        else \
            __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \
        __RES; \
    })

__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1)
{
    uint32_t result;

    __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1));
    return (result);
}

// __smlad is defined by GCC, but using it results in a performance drop (tested on Arm GNU Toolchain versions 11.x and 12.x)
__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3)
{
    uint32_t result;

    __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3));
    return (result);
}
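
// Usage sketch (illustrative, not part of the original header): SMLAD is a
// dual 16-bit multiply-accumulate, the workhorse of 16-bit dot-product loops:
//
//   SMLAD(0x00020003, 0x00040005, 10) == 2 * 4 + 3 * 5 + 10 == 33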

__STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2)
{
    op2 %= 32U;
    if (op2 == 0U)
    {
        return op1;
    }
    return (op1 >> op2) | (op1 << (32U - op2));
}

__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate));
    }
    else
    {
        result = SXTB16(ROR(op1, rotate));
    }
    return result;
}

__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate));
    }
    else
    {
        result = SXTAB16(op1, ROR(op2, rotate));
    }
    return result;
}
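
// Usage sketch (illustrative, not part of the original header): SXTB16
// sign-extends bytes 0 and 2 of a word into two int16 halfwords, so SXTB16(x)
// together with SXTB16_RORn(x, 8) (bytes 1 and 3) unpacks a word holding four
// int8 values into two words of int16 pairs:
//
//   SXTB16(0x83820180)         == 0xFF82FF80   // bytes 0 and 2: -128, -126
//   SXTB16_RORn(0x83820180, 8) == 0xFF830001   // bytes 1 and 3:    1, -125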

// Inline assembly routines for ACLE intrinsics that are not provided by the GCC toolchain
__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
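
// Usage sketch (illustrative, not part of the original header): SMULBB and
// SMULTT multiply the signed bottom and top halfwords, respectively:
//
//   SMULBB(0x00010002, 0x00030004) == 2 * 4 == 8
//   SMULTT(0x00010002, 0x00030004) == 1 * 3 == 3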
#endif

#endif

#endif /* #ifndef ARM_NN_COMPILER_H */