1 /*
2 * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 /* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_nn_compiler.h
22 * Description: Generic compiler header
23 *
24 * $Date: 16 January 2024
25 * $Revision: V.1.2.2
26 *
27 * Target : Arm(R) M-Profile Architecture
28 * -------------------------------------------------------------------- */
29
30 #ifndef ARM_NN_COMPILER_H
31 #define ARM_NN_COMPILER_H
32
33 /**
34 *
35 * @brief Arm C-Language Extension(ACLE) Includes
36 *
37 */
38
39 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
40
41 #ifndef __ASM
42 #define __ASM __asm
43 #endif
44 #ifndef __INLINE
45 #define __INLINE __inline
46 #endif
47 #ifndef __STATIC_INLINE
48 #define __STATIC_INLINE static __inline
49 #endif
50 #ifndef __STATIC_FORCEINLINE
51 #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
52 #endif
53 #ifndef __RESTRICT
54 #define __RESTRICT __restrict
55 #endif
56
57 #elif defined(__ICCARM__)
58
59 #warning IAR support is not tested
60 #ifndef __ASM
61 #define __ASM __asm
62 #endif
63 #ifndef __INLINE
64 #define __INLINE inline
65 #endif
66 #ifndef __STATIC_INLINE
67 #define __STATIC_INLINE static inline
68 #endif
69 #ifndef __FORCEINLINE
70 #define __FORCEINLINE _Pragma("inline=forced")
71 #endif
72 #ifndef __STATIC_FORCEINLINE
73 #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
74 #endif
75 #ifndef __RESTRICT
76 #define __RESTRICT __restrict
77 #endif
78
79 #elif defined(_MSC_VER)
80
81 // Build for non Arm Cortex-M processors is not tested or supported.
82 // Use this section to stub any macros or intrinsics
83 #warning Unsupported compiler
84 #ifndef __STATIC_FORCEINLINE
85 #define __STATIC_FORCEINLINE static __forceinline
86 #endif
87 #ifndef __STATIC_INLINE
88 #define __STATIC_INLINE static __inline
89 #endif
90 #ifndef __ALIGNED
91 #define __ALIGNED(x) __declspec(align(x))
92 #endif
93
94 #elif defined(__GNUC__)
95
96 #ifndef __ASM
97 #define __ASM __asm
98 #endif
99 #ifndef __INLINE
100 #define __INLINE inline
101 #endif
102 #ifndef __STATIC_INLINE
103 #define __STATIC_INLINE static inline
104 #endif
105 #ifndef __STATIC_FORCEINLINE
106 #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
107 #endif
108 #ifndef __RESTRICT
109 #define __RESTRICT __restrict
110 #endif
111
112 #else
113
114 #error Unsupported compiler. Add support as needed
115
116 #endif
117
118 /**
119 *
120 * @brief Compiler specific diagnostic adjustment / fixes if applicable
121 *
122 */
123
124 // Note: __ARM_ARCH is used with M-profile architecture as the target here.
125 #if defined(__GNUC__)
126 #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH)
127 // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x
128 // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html
129 #define ARM_GCC_12_2_ICE
130 #endif
131 #endif
132
133 #if defined(__ARM_FEATURE_MVE) && ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1)
134 #include <arm_mve.h>
135 #endif
136
137 #if defined(__ARM_ARCH) || defined(__ARM_ACLE)
138 #include <arm_acle.h>
139 #endif
140
141 #if defined(__GNUC__)
142 #include <stdint.h>
143 #endif
144
145 /**
146 *
147 * @brief ACLE and Intrinsics
148 *
149 */
150
// Note: Keep the __GNUC__ checks (used to detect GCC) at the end,
// as __GNUC__ is also defined by some non-GCC compilers
153
154 /* Common intrinsics for all architectures */
155 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
156 #define CLZ __clz
157 #elif defined(__GNUC__)
158 /**
159 \brief Count leading zeros
160 \details Counts the number of leading zeros of a data value.
161 \param [in] value Value to count the leading zeros
162 \return number of leading zeros in value
163 */
CLZ(uint32_t value)164 __STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
165 {
166 /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
167 __builtin_clz(0) is undefined behaviour, so handle this case specially.
168 This guarantees Arm-compatible results if compiling on a non-Arm
169 target, and ensures the compiler doesn't decide to activate any
170 optimisations using the logic "value was passed to __builtin_clz, so it
171 is non-zero".
172 ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
173 single CLZ instruction.
174 */
175 if (value == 0U)
176 {
177 return 32U;
178 }
179 return __builtin_clz(value);
180 }
181 #endif
182
183 // ACLE intrinsics under groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP , __ARM_FEATURE_SAT, __ARM_FEATURE_SIMD32
184
185 // Note: Just __ARM_FEATURE_DSP is checked to collect all intrinsics from the above mentioned groups
186
187 #if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
188
189 // Common intrinsics
190 #define SMLABB __smlabb
191 #define SMLATT __smlatt
192 #define QADD __qadd
193 #define QSUB8 __qsub8
194 #define QSUB16 __qsub16
195 #define SADD16 __sadd16
196
// Compiler specific variants of intrinsics. Create a new section or file for IAR if needed
198 #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
199
200 #define SMULBB __smulbb
201 #define SMULTT __smultt
202 #define ROR __ror
203 #define SXTB16 __sxtb16
204 #define SXTAB16 __sxtab16
205 #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2))
206 #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3))
207 #define SMLAD __smlad
208 // PKH<XY> translates into pkh<xy> on AC6
209 #define PKHBT(ARG1, ARG2, ARG3) \
210 (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL))
211 #define PKHTB(ARG1, ARG2, ARG3) \
212 (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL))
213
214 #elif defined(__GNUC__)
215
216 #define PKHBT(ARG1, ARG2, ARG3) \
217 __extension__({ \
218 uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
219 __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \
220 __RES; \
221 })
222 #define PKHTB(ARG1, ARG2, ARG3) \
223 __extension__({ \
224 uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
225 if (ARG3 == 0) \
226 __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2)); \
227 else \
228 __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \
229 __RES; \
230 })
231
/* SXTAB16: sign-extend the two 8-bit fields in op2[7:0] and op2[23:16] to
   16 bits and add them to the corresponding 16-bit halves of op1, as a
   single instruction. */
__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
239
/* SXTB16: sign-extend the two 8-bit fields in op1[7:0] and op1[23:16] to
   two 16-bit halfwords, as a single instruction. */
__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1)
{
    uint32_t result;

    __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1));
    return (result);
}
247
// __smlad is defined by GCC, but results in a performance drop(Tested on Arm GNU Toolchain version 11.x and 12.x)
/* SMLAD: dual 16x16 signed multiply of the halfword pairs of op1/op2 with
   accumulation of both products into op3, emitted as a single SMLAD
   instruction. 'volatile' keeps the hand-written asm from being merged or
   reordered, which is what avoids the GCC __smlad performance issue noted
   above. */
__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3)
{
    uint32_t result;

    __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3));
    return (result);
}
256
ROR(uint32_t op1,uint32_t op2)257 __STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2)
258 {
259 op2 %= 32U;
260 if (op2 == 0U)
261 {
262 return op1;
263 }
264 return (op1 >> op2) | (op1 << (32U - op2));
265 }
266
/* SXTB16 applied to op1 rotated right by 'rotate'. When 'rotate' is known
   at compile time to be 8, 16 or 24 (the encodable ROR amounts), the
   rotation is folded into the sxtb16 instruction's ROR operand via the "i"
   (immediate) constraint; otherwise fall back to a separate ROR followed by
   SXTB16. __builtin_constant_p is false for non-constant arguments, so the
   asm path is only ever taken with a valid immediate. */
__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate));
    }
    else
    {
        result = SXTB16(ROR(op1, rotate));
    }
    return result;
}
280
/* SXTAB16 with op2 rotated right by 'rotate' before the sign-extend-and-add.
   Mirrors SXTB16_RORn: a compile-time rotate of 8, 16 or 24 is folded into
   the instruction's ROR operand ("i" immediate constraint); any other value
   falls back to an explicit ROR followed by SXTAB16. */
__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate));
    }
    else
    {
        result = SXTAB16(op1, ROR(op2, rotate));
    }
    return result;
}
294
// Inline assembly routines for ACLE intrinsics that are not defined by GCC toolchain
/* SMULBB: signed multiply of the bottom 16-bit halves of op1 and op2,
   emitted as a single instruction. */
__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
303
/* SMULTT: signed multiply of the top 16-bit halves of op1 and op2,
   emitted as a single instruction. */
__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
311 #endif
312
313 #endif
314
315 #endif /* #ifndef ARM_NN_COMPILER_H */
316