/*
 * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_compiler.h
 * Description:  Generic compiler header
 *
 * $Date:        20 June 2024
 * $Revision:    V.1.3.0
 *
 * Target:       Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */

#ifndef ARM_NN_COMPILER_H
#define ARM_NN_COMPILER_H

/**
 *
 * @brief Arm C-Language Extension (ACLE) Includes
 *
 */

#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)

    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE __inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static __inline
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#elif defined(__ICCARM__)

    #warning IAR support is not tested
    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static inline
    #endif
    #ifndef __FORCEINLINE
        #define __FORCEINLINE _Pragma("inline=forced")
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#elif defined(_MSC_VER)

    // Builds for non-Arm Cortex-M processors are not tested or supported.
    // Use this section to stub any macros or intrinsics
    #warning Unsupported compiler
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE static __forceinline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static __inline
    #endif
    #ifndef __ALIGNED
        #define __ALIGNED(x) __declspec(align(x))
    #endif

#elif defined(__GNUC__)

    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static inline
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#else

    #error Unsupported compiler. Add support as needed

#endif

/**
 *
 * @brief Compiler specific diagnostic adjustment / fixes if applicable
 *
 */

// Note: __ARM_ARCH is used with M-profile architecture as the target here.
#if defined(__GNUC__)
    #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH)
        // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x
        // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html
        #define ARM_GCC_12_2_ICE
    #endif
#endif

#if defined(__ARM_FEATURE_MVE) && ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1)
    #include <arm_mve.h>
#endif

#if defined(__ARM_ARCH) || defined(__ARM_ACLE)
    #include <arm_acle.h>
#endif

#if defined(__GNUC__)
    #include <stdint.h>
#endif

/**
 *
 * @brief ACLE and Intrinsics
 *
 */

// Note: Keep the __GNUC__ checks, which are used to detect GCC, at the end
// as __GNUC__ is defined by non-GCC compilers as well

/* Common intrinsics for all architectures */
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
    #define CLZ __clz
#elif defined(__GNUC__)
/**
  \brief   Count leading zeros
  \details Counts the number of leading zeros of a data value.
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value
 */
__STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
{
    /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
       __builtin_clz(0) is undefined behaviour, so handle this case specially.
       This guarantees Arm-compatible results if compiling on a non-Arm
       target, and ensures the compiler doesn't decide to activate any
       optimisations using the logic "value was passed to __builtin_clz, so it
       is non-zero".
       ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
       single CLZ instruction.
     */
    if (value == 0U)
    {
        return 32U;
    }
    return __builtin_clz(value);
}
#endif

// ACLE intrinsics under groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP, __ARM_FEATURE_SAT, __ARM_FEATURE_SIMD32

// Note: Just __ARM_FEATURE_DSP is checked to collect all intrinsics from the above mentioned groups

#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))

    // Common intrinsics
    #define SMLABB __smlabb
    #define SMLATT __smlatt
    #define SMLALD __smlald
    #define QADD __qadd
    #define QSUB8 __qsub8
    #define QSUB16 __qsub16
    #define SADD16 __sadd16

    // Compiler specific variants of intrinsics. Create a new section or file for IAR if needed
    #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)

        #define SMULBB __smulbb
        #define SMULTT __smultt
        #define ROR __ror
        #define SXTB16 __sxtb16
        #define SXTAB16 __sxtab16
        #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2))
        #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3))
        #define SMLAD __smlad
        // PKH<XY> translates into pkh<xy> on AC6
        #define PKHBT(ARG1, ARG2, ARG3)                                                                                \
            (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL))
        #define PKHTB(ARG1, ARG2, ARG3)                                                                                \
            (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL))
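        // PKHBT keeps the bottom halfword of ARG1 and the top halfword of (ARG2 << ARG3);
        // PKHTB keeps the top halfword of ARG1 and the bottom halfword of (ARG2 >> ARG3).
        // For example, PKHBT(0x00001111, 0x00002222, 16) evaluates to 0x22221111.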

    #elif defined(__GNUC__)

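        // GCC does not provide pkhbt/pkhtb intrinsics, so the pack-halfword instructions are
        // emitted through inline assembly; the "I" constraint means the shift amount ARG3 must
        // be a compile-time constant.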
        #define PKHBT(ARG1, ARG2, ARG3)                                                                                \
            __extension__({                                                                                            \
                uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2);                                                      \
                __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3));                 \
                __RES;                                                                                                 \
            })
        #define PKHTB(ARG1, ARG2, ARG3)                                                                                \
            __extension__({                                                                                            \
                uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2);                                                      \
                if (ARG3 == 0)                                                                                         \
                    __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2));                                \
                else                                                                                                   \
                    __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3));             \
                __RES;                                                                                                 \
            })

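/**
  \brief   Signed extend and add halfwords (SXTAB16)
  \details Extracts the 8-bit values in bytes 0 and 2 of op2, sign-extends them to 16 bits and
           adds the results to the corresponding halfwords of op1.
  \param [in]  op1  First operand holding the two 16-bit addends
  \param [in]  op2  Second operand holding the bytes to sign-extend
  \return          Packed result of the two halfword additions
 */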
__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

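/**
  \brief   Dual sign extend bytes to halfwords (SXTB16)
  \details Extracts the 8-bit values in bytes 0 and 2 of op1 and sign-extends them to two
           16-bit halfwords packed in the result.
  \param [in]  op1  Value holding the two bytes to sign-extend
  \return          Two sign-extended halfwords packed into a 32-bit value
 */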
__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1)
{
    uint32_t result;

    __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1));
    return (result);
}

// __smlad is defined by GCC, but results in a performance drop (tested on Arm GNU Toolchain versions 11.x and 12.x)
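/**
  \brief   Dual 16-bit signed multiply with accumulate (SMLAD)
  \details Multiplies the bottom halfwords and the top halfwords of op1 and op2 as signed 16-bit
           values and adds both products to the accumulator op3.
  \param [in]  op1  First operand holding two packed 16-bit values
  \param [in]  op2  Second operand holding two packed 16-bit values
  \param [in]  op3  Accumulator
  \return          Sum of the two halfword products and the accumulator
 */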
__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3)
{
    uint32_t result;

    __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3));
    return (result);
}

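/**
  \brief   Rotate right (ROR)
  \details Rotates op1 right by op2 bit positions; the rotation amount is taken modulo 32.
  \param [in]  op1  Value to rotate
  \param [in]  op2  Number of bit positions to rotate by
  \return          Rotated value
 */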
__STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2)
{
    op2 %= 32U;
    if (op2 == 0U)
    {
        return op1;
    }
    return (op1 >> op2) | (op1 << (32U - op2));
}

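/**
  \brief   Dual sign extend bytes with rotation (SXTB16 with ROR #n)
  \details Rotates op1 right by 'rotate' bits and sign-extends bytes 0 and 2 of the rotated value
           to two 16-bit halfwords. The rotation is folded into the sxtb16 instruction when
           'rotate' is a compile-time constant of 8, 16 or 24; otherwise ROR and SXTB16 are used.
  \param [in]  op1     Value holding the bytes to sign-extend
  \param [in]  rotate  Number of bit positions to rotate op1 right by
  \return             Two sign-extended halfwords packed into a 32-bit value
 */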
__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate));
    }
    else
    {
        result = SXTB16(ROR(op1, rotate));
    }
    return result;
}

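/**
  \brief   Signed extend and add halfwords with rotation (SXTAB16 with ROR #n)
  \details Rotates op2 right by 'rotate' bits, sign-extends bytes 0 and 2 of the rotated value to
           16 bits and adds the results to the corresponding halfwords of op1. The rotation is
           folded into the sxtab16 instruction when 'rotate' is a compile-time constant of 8, 16
           or 24; otherwise ROR and SXTAB16 are used.
  \param [in]  op1     First operand holding the two 16-bit addends
  \param [in]  op2     Second operand holding the bytes to sign-extend after rotation
  \param [in]  rotate  Number of bit positions to rotate op2 right by
  \return             Packed result of the two halfword additions
 */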
__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate));
    }
    else
    {
        result = SXTAB16(op1, ROR(op2, rotate));
    }
    return result;
}

// Inline assembly routines for ACLE intrinsics that are not defined by GCC toolchain
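/**
  \brief   Signed multiply of bottom halfwords (SMULBB)
  \details Multiplies the signed bottom 16-bit halfwords of op1 and op2.
  \param [in]  op1  First operand; the bottom halfword is used
  \param [in]  op2  Second operand; the bottom halfword is used
  \return          32-bit product of the two bottom halfwords
 */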
__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

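/**
  \brief   Signed multiply of top halfwords (SMULTT)
  \details Multiplies the signed top 16-bit halfwords of op1 and op2.
  \param [in]  op1  First operand; the top halfword is used
  \param [in]  op2  Second operand; the top halfword is used
  \return          32-bit product of the two top halfwords
 */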
__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
    #endif

#endif

#endif /* #ifndef ARM_NN_COMPILER_H */