/*
 * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_compiler.h
 * Description:  Generic compiler header
 *
 * $Date:        31 January 2023
 * $Revision:    V.1.1.0
 *
 * Target :  Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */

#ifndef ARM_NN_COMPILER_H
#define ARM_NN_COMPILER_H

/**
 *
 * @brief Arm C Language Extensions (ACLE) includes
 *
 */

#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)

    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE __inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static __inline
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#elif defined(__ICCARM__)

    #warning IAR support is not tested
    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static inline
    #endif
    #ifndef __FORCEINLINE
        #define __FORCEINLINE _Pragma("inline=forced")
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
    #endif

#elif defined(_MSC_VER)

    // Building for processors other than Arm Cortex-M is not tested or supported.
    // Use this section to stub any macros or intrinsics
    #warning Unsupported compiler
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE static __forceinline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static __inline
    #endif
    #ifndef __ALIGNED
        #define __ALIGNED(x) __declspec(align(x))
    #endif

#elif defined(__GNUC__)

    #ifndef __ASM
        #define __ASM __asm
    #endif
    #ifndef __INLINE
        #define __INLINE inline
    #endif
    #ifndef __STATIC_INLINE
        #define __STATIC_INLINE static inline
    #endif
    #ifndef __STATIC_FORCEINLINE
        #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
    #endif
    #ifndef __RESTRICT
        #define __RESTRICT __restrict
    #endif

#else

    #error Unsupported compiler. Add support as needed

#endif
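/*
 * Illustrative usage sketch (not part of this header): the portability macros above
 * let CMSIS-NN sources declare helpers identically across toolchains. The helper
 * name below is hypothetical.
 *
 * \code
 * __STATIC_FORCEINLINE void example_copy_s8(int8_t *__RESTRICT dst, const int8_t *__RESTRICT src, int32_t size)
 * {
 *     for (int32_t i = 0; i < size; i++)
 *     {
 *         dst[i] = src[i];
 *     }
 * }
 * \endcode
 */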
/**
 *
 * @brief Compiler-specific diagnostic adjustments / fixes if applicable
 *
 */

// Note: __ARM_ARCH is used with M-profile architecture as the target here.
#if defined(__GNUC__)
    #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH)
        // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x
        // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html
        #define ARM_GCC_12_2_ICE
    #endif
#endif

#if ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1)
    #include <arm_mve.h>
#endif

#if defined(__ARM_ARCH) || defined(__ARM_ACLE)
    #include <arm_acle.h>
#endif

/**
 *
 * @brief ACLE and Intrinsics
 *
 */

// Note: Keep the __GNUC__ checks, used to detect GCC, at the end, as __GNUC__ is
// defined by non-GCC compilers as well.

/* Common intrinsics for all architectures */
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
    #define CLZ __clz
#elif defined(__GNUC__)
/**
  \brief   Count leading zeros
  \details Counts the number of leading zeros of a data value.
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value
 */
__STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
{
    /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
       __builtin_clz(0) is undefined behaviour, so handle this case specially.
       This guarantees Arm-compatible results if compiling on a non-Arm
       target, and ensures the compiler doesn't decide to activate any
       optimisations using the logic "value was passed to __builtin_clz, so it
       is non-zero".
       ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
       single CLZ instruction.
     */
    if (value == 0U)
    {
        return 32U;
    }
    return __builtin_clz(value);
}
#endif
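/*
 * Minimal usage sketch (illustrative, not part of this header): CLZ is commonly used
 * to derive the bit width or normalisation shift of a value. The helper name below
 * is hypothetical.
 *
 * \code
 * // Number of significant bits in v, e.g. 0x13 -> 5; CLZ(0) returns 32, so bit_width(0) == 0.
 * __STATIC_FORCEINLINE uint8_t bit_width(uint32_t v) { return (uint8_t)(32U - CLZ(v)); }
 * \endcode
 */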
// ACLE intrinsics under the feature groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP, __ARM_FEATURE_SAT and __ARM_FEATURE_SIMD32

// Note: Only __ARM_FEATURE_DSP is checked to collect all intrinsics from the above-mentioned groups

#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))

    // Common intrinsics
    #define SMLABB __smlabb
    #define SMLATT __smlatt
    #define QADD __qadd
    #define QSUB8 __qsub8
    #define QSUB16 __qsub16
    #define SADD16 __sadd16
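    /*
     * Minimal usage sketch (illustrative, not part of this header): SMLABB/SMLATT
     * multiply-accumulate the bottom/top signed 16-bit halves of two packed words,
     * folding a pair of 16-bit products into one accumulator. The variable names
     * below are hypothetical.
     *
     * \code
     * int32_t acc = 0;
     * uint32_t a = 0x00030002; // halfwords {2, 3}
     * uint32_t b = 0x00050004; // halfwords {4, 5}
     * acc = SMLABB(a, b, acc); // acc += 2 * 4 -> 8
     * acc = SMLATT(a, b, acc); // acc += 3 * 5 -> 23
     * \endcode
     */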
    // Compiler-specific variants of intrinsics. Create a new section or file for IAR if needed
    #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)

        #define SMULBB __smulbb
        #define SMULTT __smultt
        #define ROR __ror
        #define SXTB16 __sxtb16
        #define SXTAB16 __sxtab16
        #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2))
        #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3))
        #define SMLAD __smlad
        // PKH<XY> translates into pkh<xy> on AC6
        #define PKHBT(ARG1, ARG2, ARG3)                                                                                \
            (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL))
        #define PKHTB(ARG1, ARG2, ARG3)                                                                                \
            (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL))
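        /*
         * Worked example (illustrative): PKHBT keeps the bottom halfword of ARG1 and packs in
         * the shifted top halfword of ARG2; PKHTB keeps the top halfword of ARG1 and packs in
         * the shifted bottom halfword of ARG2.
         *
         * \code
         * uint32_t lo = PKHBT(0x00001111, 0x00002222, 16); // 0x22221111
         * uint32_t hi = PKHTB(0x33330000, 0x00004444, 0);  // 0x33334444
         * \endcode
         */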
    #elif defined(__GNUC__)

        #define PKHBT(ARG1, ARG2, ARG3)                                                                                \
            __extension__({                                                                                            \
                uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2);                                                      \
                __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3));                 \
                __RES;                                                                                                 \
            })
        #define PKHTB(ARG1, ARG2, ARG3)                                                                                \
            __extension__({                                                                                            \
                uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2);                                                      \
                if (ARG3 == 0)                                                                                         \
                    __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2));                                \
                else                                                                                                   \
                    __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3));             \
                __RES;                                                                                                 \
            })

__STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

__STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1)
{
    uint32_t result;

    __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1));
    return (result);
}

// __smlad is defined by GCC, but it results in a performance drop (tested on Arm GNU Toolchain versions 11.x and 12.x)
__STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3)
{
    uint32_t result;

    __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3));
    return (result);
}
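/*
 * Minimal usage sketch (illustrative, not part of this header): SMLAD performs a dual
 * 16x16 multiply of the packed halfword pairs in op1 and op2 and adds both products
 * to op3, i.e. two 16-bit MAC operations per instruction. The variable names below
 * are hypothetical.
 *
 * \code
 * uint32_t a = 0x00030002;       // halfwords {2, 3}
 * uint32_t b = 0x00050004;       // halfwords {4, 5}
 * uint32_t acc = SMLAD(a, b, 1); // 1 + 2*4 + 3*5 = 24
 * \endcode
 */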
__STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2)
{
    op2 %= 32U;
    if (op2 == 0U)
    {
        return op1;
    }
    return (op1 >> op2) | (op1 << (32U - op2));
}

__STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate));
    }
    else
    {
        result = SXTB16(ROR(op1, rotate));
    }
    return result;
}

__STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate)
{
    uint32_t result;
    if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
    {
        __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate));
    }
    else
    {
        result = SXTAB16(op1, ROR(op2, rotate));
    }
    return result;
}
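/*
 * Minimal usage sketch (illustrative, not part of this header): SXTB16 sign-extends
 * bytes 0 and 2 of a word into two 16-bit halfwords, and SXTB16_RORn does the same
 * after rotating the word right by 'rotate' bits, which is a common way to unpack
 * four packed int8 values into two pairs of int16. The variable names below are
 * hypothetical.
 *
 * \code
 * uint32_t packed = 0x04FF02FE;          // bytes {-2, 2, -1, 4}
 * uint32_t even = SXTB16(packed);        // sign-extends bytes 0 and 2: halfwords {-2, -1}
 * uint32_t odd = SXTB16_RORn(packed, 8); // sign-extends bytes 1 and 3: halfwords {2, 4}
 * \endcode
 */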
// Inline assembly routines for ACLE intrinsics that are not defined by the GCC toolchain
__STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}

__STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2)
{
    uint32_t result;

    __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
    return (result);
}
    #endif

#endif

#endif /* #ifndef ARM_NN_COMPILER_H */