aarch64-neon-scalar-x-indexed-elem.c source code [clang_source_code/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c]

1	// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
2	// RUN: -disable-O0-optnone -emit-llvm -o - %s \| opt -S -mem2reg \| FileCheck %s
3
4	// Test new aarch64 intrinsics and types
5
6	#include <arm_neon.h>
7
8
9	// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
10	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
11	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
12	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
13	// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
14	// CHECK: ret float [[MUL]]
// Exercises vmuls_lane_f32 with lane index 1 of a float32x2_t. The FileCheck
// expectations above require an extractelement of lane 1 feeding a scalar fmul.
15	float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
16	return vmuls_lane_f32(a, b, 1);
17	}
18
19	// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
20	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
21	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
22	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
23	// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
24	// CHECK: ret double [[MUL]]
// Exercises vmuld_lane_f64 with lane index 0 of a float64x1_t. The FileCheck
// expectations above require an extractelement of lane 0 feeding a scalar fmul.
25	float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
26	return vmuld_lane_f64(a, b, 0);
27	}
28
29	// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #1 {
30	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
31	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
32	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
33	// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
34	// CHECK: ret float [[MUL]]
// Exercises vmuls_laneq_f32 with lane index 3 of a float32x4_t. The FileCheck
// expectations above require an extractelement of lane 3 feeding a scalar fmul.
35	float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
36	return vmuls_laneq_f32(a, b, 3);
37	}
38
39	// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #1 {
40	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
41	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
42	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
43	// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
44	// CHECK: ret double [[MUL]]
// Exercises vmuld_laneq_f64 with lane index 1 of a float64x2_t. The FileCheck
// expectations above require an extractelement of lane 1 feeding a scalar fmul.
45	float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
46	return vmuld_laneq_f64(a, b, 1);
47	}
48
49	// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
50	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %a to double
51	// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b
52	// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
53	// CHECK: ret <1 x double> [[TMP4]]
// Exercises vmul_n_f64 (one-element vector times scalar). The FileCheck
// expectations above require the vector to be bitcast to double, multiplied
// with fmul, and the product bitcast back to <1 x double>.
54	float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
55	return vmul_n_f64(a, b);
56	}
57
58	// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
59	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
60	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
61	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
62	// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
63	// CHECK: ret float [[VMULXS_F32_I]]
// Exercises vmulxs_lane_f32 with lane index 1 of a float32x2_t. The FileCheck
// expectations above require the extracted lane to be passed to
// @llvm.aarch64.neon.fmulx.f32 together with a.
64	float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
65	return vmulxs_lane_f32(a, b, 1);
66	}
67
68	// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #1 {
69	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
70	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
71	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
72	// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
73	// CHECK: ret float [[VMULXS_F32_I]]
// Exercises vmulxs_laneq_f32 with lane index 3 of a float32x4_t. The FileCheck
// expectations above require the extracted lane to be passed to
// @llvm.aarch64.neon.fmulx.f32 together with a.
74	float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
75	return vmulxs_laneq_f32(a, b, 3);
76	}
77
78	// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
79	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
80	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
81	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
82	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
83	// CHECK: ret double [[VMULXD_F64_I]]
// Exercises vmulxd_lane_f64 with lane index 0 of a float64x1_t. The FileCheck
// expectations above require the extracted lane to be passed to
// @llvm.aarch64.neon.fmulx.f64 together with a.
84	float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
85	return vmulxd_lane_f64(a, b, 0);
86	}
87
88	// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #1 {
89	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
90	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
91	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
92	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
93	// CHECK: ret double [[VMULXD_F64_I]]
// Exercises vmulxd_laneq_f64 with lane index 1 of a float64x2_t. The FileCheck
// expectations above require the extracted lane to be passed to
// @llvm.aarch64.neon.fmulx.f64 together with a.
94	float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
95	return vmulxd_laneq_f64(a, b, 1);
96	}
97
98	// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
99	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
100	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
101	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
102	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
103	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
104	// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
105	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
106	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
107	// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
108	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
109	// CHECK: ret <1 x double> [[VSET_LANE]]
// Exercises vmulx_lane_f64 with lane index 0. The FileCheck expectations above
// require lane 0 of both a and b to be extracted, multiplied through
// @llvm.aarch64.neon.fmulx.f64, and the result inserted back into lane 0 of a.
110	float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
111	return vmulx_lane_f64(a, b, 0);
112	}
113
114
115	// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #1 {
116	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
117	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
118	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
119	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
120	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
121	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
122	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
123	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
124	// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
125	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
126	// CHECK: ret <1 x double> [[VSET_LANE]]
// Exercises vmulx_laneq_f64 selecting lane 0 of the float64x2_t operand. The
// FileCheck expectations above require lane 0 of a and lane 0 of b to go
// through @llvm.aarch64.neon.fmulx.f64, with the result inserted into lane 0 of a.
127	float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
128	return vmulx_laneq_f64(a, b, 0);
129	}
130
131	// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #1 {
132	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
133	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
134	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
135	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
136	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
137	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
138	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
139	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
140	// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
141	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
142	// CHECK: ret <1 x double> [[VSET_LANE]]
// Exercises vmulx_laneq_f64 selecting lane 1 of the float64x2_t operand. The
// FileCheck expectations above require lane 0 of a and lane 1 of b to go
// through @llvm.aarch64.neon.fmulx.f64, with the result inserted into lane 0 of a.
143	float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
144	return vmulx_laneq_f64(a, b, 1);
145	}
146
147
148	// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
149	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
150	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
151	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
152	// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
153	// CHECK: ret float [[TMP2]]
// Exercises vfmas_lane_f32 with lane index 1 of a float32x2_t. The FileCheck
// expectations above require a call to @llvm.fma.f32 whose operands are, in
// order, b, the extracted lane of c, and a.
154	float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
155	return vfmas_lane_f32(a, b, c, 1);
156	}
157
158	// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
159	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
160	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
161	// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
162	// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
163	// CHECK: ret double [[TMP2]]
// Exercises vfmad_lane_f64 with lane index 0 of a float64x1_t. The FileCheck
// expectations above require a call to @llvm.fma.f64 whose operands are, in
// order, b, the extracted lane of c, and a.
164	float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
165	return vfmad_lane_f64(a, b, c, 0);
166	}
167
168	// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #1 {
169	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
170	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
171	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
172	// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
173	// CHECK: ret double [[TMP2]]
// Exercises vfmad_laneq_f64 with lane index 1 of a float64x2_t. The FileCheck
// expectations above require a call to @llvm.fma.f64 whose operands are, in
// order, b, the extracted lane of c, and a.
174	float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
175	return vfmad_laneq_f64(a, b, c, 1);
176	}
177
178	// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
179	// CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b
180	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
181	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
182	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
183	// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
184	// CHECK: ret float [[TMP2]]
// Exercises vfmss_lane_f32 with lane index 1 of a float32x2_t. The FileCheck
// expectations above require b to be negated via fsub from -0.0 and then passed
// to @llvm.fma.f32 with the extracted lane of c and a.
185	float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
186	return vfmss_lane_f32(a, b, c, 1);
187	}
188
189	// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
190	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
191	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
192	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
193	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
194	// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
195	// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
196	// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
197	// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
198	// CHECK: ret <1 x double> [[FMLA2]]
// Exercises vfma_lane_f64 with lane index 0. The FileCheck expectations above
// require the lane of v to be selected with a shufflevector and the operation
// to be emitted as the vector intrinsic @llvm.fma.v1f64 on (b, lane, a).
199	float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
200	return vfma_lane_f64(a, b, v, 0);
201	}
202
203	// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
204	// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
205	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
206	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
207	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
208	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
209	// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
210	// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
211	// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
212	// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
213	// CHECK: ret <1 x double> [[FMLA2]]
// Exercises vfms_lane_f64 with lane index 0. The FileCheck expectations above
// require b to be negated via a vector fsub from -0.0 before the shufflevector
// lane selection and the @llvm.fma.v1f64 call.
214	float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
215	return vfms_lane_f64(a, b, v, 0);
216	}
217
218	// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
219	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
220	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
221	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
222	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
223	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
224	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
225	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
226	// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
227	// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
228	// CHECK: ret <1 x double> [[TMP7]]
// Exercises vfma_laneq_f64 with lane index 0 of a float64x2_t. The FileCheck
// expectations above require a and b to be bitcast to scalar double, lane 0 of
// v to be extracted, and the scalar @llvm.fma.f64 result to be bitcast back to
// <1 x double>.
229	float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
230	return vfma_laneq_f64(a, b, v, 0);
231	}
232
233	// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
234	// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
235	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
236	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
237	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
238	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
239	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
240	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
241	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
242	// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
243	// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
244	// CHECK: ret <1 x double> [[TMP7]]
// Exercises vfms_laneq_f64 with lane index 0 of a float64x2_t. The FileCheck
// expectations above require b to be negated via a vector fsub from -0.0, then
// the same scalar @llvm.fma.f64 sequence as the vfma_laneq_f64 case.
245	float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
246	return vfms_laneq_f64(a, b, v, 0);
247	}
248
249	// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
250	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
251	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
252	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
253	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
254	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
255	// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
256	// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
257	// CHECK: ret i32 [[TMP4]]
// Exercises vqdmullh_lane_s16 with lane index 3 of an int16x4_t. The FileCheck
// expectations above require both i16 operands to be placed in element 0 of
// <4 x i16> vectors, multiplied with @llvm.aarch64.neon.sqdmull.v4i32, and
// element 0 of the result to be returned.
258	int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
259	return vqdmullh_lane_s16(a, b, 3);
260	}
261
262	// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
263	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
264	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
265	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
266	// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
267	// CHECK: ret i64 [[VQDMULLS_S32_I]]
// Exercises vqdmulls_lane_s32 with lane index 1 of an int32x2_t. The FileCheck
// expectations above require the extracted lane to be passed to the scalar
// intrinsic @llvm.aarch64.neon.sqdmulls.scalar together with a.
268	int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
269	return vqdmulls_lane_s32(a, b, 1);
270	}
271
272	// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
273	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
274	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
275	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
276	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
277	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
278	// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
279	// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
280	// CHECK: ret i32 [[TMP4]]
// Exercises vqdmullh_laneq_s16 with lane index 7 of an int16x8_t. The FileCheck
// expectations above require both i16 operands to be placed in element 0 of
// <4 x i16> vectors, multiplied with @llvm.aarch64.neon.sqdmull.v4i32, and
// element 0 of the result to be returned.
281	int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
282	return vqdmullh_laneq_s16(a, b, 7);
283	}
284
285	// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #1 {
286	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
287	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
288	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
289	// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
290	// CHECK: ret i64 [[VQDMULLS_S32_I]]
// Exercises vqdmulls_laneq_s32 with lane index 3 of an int32x4_t. The FileCheck
// expectations above require the extracted lane to be passed to the scalar
// intrinsic @llvm.aarch64.neon.sqdmulls.scalar together with a.
291	int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
292	return vqdmulls_laneq_s32(a, b, 3);
293	}
294
295	// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
296	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
297	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
298	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
299	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
300	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
301	// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
302	// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
303	// CHECK: ret i16 [[TMP4]]
// Exercises vqdmulhh_lane_s16 with lane index 3 of an int16x4_t. The FileCheck
// expectations above require both i16 operands to be placed in element 0 of
// <4 x i16> vectors, combined with @llvm.aarch64.neon.sqdmulh.v4i16, and
// element 0 of the result to be returned.
304	int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
305	return vqdmulhh_lane_s16(a, b, 3);
306	}
307
308	// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
309	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
310	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
311	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
312	// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
313	// CHECK: ret i32 [[VQDMULHS_S32_I]]
// Exercises vqdmulhs_lane_s32 with lane index 1 of an int32x2_t. The FileCheck
// expectations above require the extracted lane to be passed to the scalar
// intrinsic @llvm.aarch64.neon.sqdmulh.i32 together with a.
314	int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
315	return vqdmulhs_lane_s32(a, b, 1);
316	}
317
318
319	// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
320	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
321	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
322	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
323	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
324	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
325	// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
326	// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
327	// CHECK: ret i16 [[TMP4]]
// Exercises vqdmulhh_laneq_s16 with lane index 7 of an int16x8_t. The FileCheck
// expectations above require both i16 operands to be placed in element 0 of
// <4 x i16> vectors, combined with @llvm.aarch64.neon.sqdmulh.v4i16, and
// element 0 of the result to be returned.
328	int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
329	return vqdmulhh_laneq_s16(a, b, 7);
330	}
331
332
333	// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
334	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
335	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
336	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
337	// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
338	// CHECK: ret i32 [[VQDMULHS_S32_I]]
// Exercises vqdmulhs_laneq_s32 with lane index 3 of an int32x4_t. The FileCheck
// expectations above require the extracted lane to be passed to the scalar
// intrinsic @llvm.aarch64.neon.sqdmulh.i32 together with a.
339	int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
340	return vqdmulhs_laneq_s32(a, b, 3);
341	}
342
343	// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
344	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
345	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
346	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
347	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
348	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
349	// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
350	// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
351	// CHECK: ret i16 [[TMP4]]
// Exercises vqrdmulhh_lane_s16 with lane index 3 of an int16x4_t. The FileCheck
// expectations above require both i16 operands to be placed in element 0 of
// <4 x i16> vectors, combined with @llvm.aarch64.neon.sqrdmulh.v4i16, and
// element 0 of the result to be returned.
352	int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
353	return vqrdmulhh_lane_s16(a, b, 3);
354	}
355
356	// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
357	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
358	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
359	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
360	// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
361	// CHECK: ret i32 [[VQRDMULHS_S32_I]]
// Exercises vqrdmulhs_lane_s32 with lane index 1 of an int32x2_t. The FileCheck
// expectations above require the extracted lane to be passed to the scalar
// intrinsic @llvm.aarch64.neon.sqrdmulh.i32 together with a.
362	int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
363	return vqrdmulhs_lane_s32(a, b, 1);
364	}
365
366
367	// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
368	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
369	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
370	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
371	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
372	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
373	// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
374	// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
375	// CHECK: ret i16 [[TMP4]]
// Exercises vqrdmulhh_laneq_s16 with lane index 7 of an int16x8_t. The
// FileCheck expectations above require both i16 operands to be placed in
// element 0 of <4 x i16> vectors, combined with
// @llvm.aarch64.neon.sqrdmulh.v4i16, and element 0 of the result to be returned.
376	int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
377	return vqrdmulhh_laneq_s16(a, b, 7);
378	}
379
380
381	// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
382	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
383	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
384	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
385	// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
386	// CHECK: ret i32 [[VQRDMULHS_S32_I]]
// Exercises vqrdmulhs_laneq_s32 with lane index 3 of an int32x4_t. The
// FileCheck expectations above require the extracted lane to be passed to the
// scalar intrinsic @llvm.aarch64.neon.sqrdmulh.i32 together with a.
387	int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
388	return vqrdmulhs_laneq_s32(a, b, 3);
389	}
390
391	// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
392	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
393	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
394	// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
395	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
396	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
397	// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
398	// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
399	// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
400	// CHECK: ret i32 [[VQDMLXL1]]
// Exercises vqdmlalh_lane_s16 with lane index 3 of an int16x4_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmull.v4i32 (element 0 only), then accumulated into a
// with @llvm.aarch64.neon.sqadd.i32.
401	int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
402	return vqdmlalh_lane_s16(a, b, c, 3);
403	}
404
405	// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
406	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
407	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
408	// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
409	// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
410	// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
411	// CHECK: ret i64 [[VQDMLXL1]]
// Exercises vqdmlals_lane_s32 with lane index 1 of an int32x2_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmulls.scalar, then accumulated into a with
// @llvm.aarch64.neon.sqadd.i64.
412	int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
413	return vqdmlals_lane_s32(a, b, c, 1);
414	}
415
416	// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
417	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
418	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
419	// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
420	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
421	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
422	// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
423	// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
424	// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
425	// CHECK: ret i32 [[VQDMLXL1]]
// Exercises vqdmlalh_laneq_s16 with lane index 7 of an int16x8_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmull.v4i32 (element 0 only), then accumulated into a
// with @llvm.aarch64.neon.sqadd.i32.
426	int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
427	return vqdmlalh_laneq_s16(a, b, c, 7);
428	}
429
430	// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
431	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
432	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
433	// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
434	// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
435	// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
436	// CHECK: ret i64 [[VQDMLXL1]]
// Exercises vqdmlals_laneq_s32 with lane index 3 of an int32x4_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmulls.scalar, then accumulated into a with
// @llvm.aarch64.neon.sqadd.i64.
437	int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
438	return vqdmlals_laneq_s32(a, b, c, 3);
439	}
440
441	// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
442	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
443	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
444	// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
445	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
446	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
447	// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
448	// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
449	// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
450	// CHECK: ret i32 [[VQDMLXL1]]
// Exercises vqdmlslh_lane_s16 with lane index 3 of an int16x4_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmull.v4i32 (element 0 only), then subtracted from a
// with @llvm.aarch64.neon.sqsub.i32.
451	int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
452	return vqdmlslh_lane_s16(a, b, c, 3);
453	}
454
455	// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
456	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
457	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
458	// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
459	// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
460	// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
461	// CHECK: ret i64 [[VQDMLXL1]]
// Exercises vqdmlsls_lane_s32 with lane index 1 of an int32x2_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmulls.scalar, then subtracted from a with
// @llvm.aarch64.neon.sqsub.i64.
462	int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
463	return vqdmlsls_lane_s32(a, b, c, 1);
464	}
465
466	// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
467	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
468	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
469	// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
470	// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
471	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
472	// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
473	// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
474	// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
475	// CHECK: ret i32 [[VQDMLXL1]]
// Exercises vqdmlslh_laneq_s16 with lane index 7 of an int16x8_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmull.v4i32 (element 0 only), then subtracted from a
// with @llvm.aarch64.neon.sqsub.i32.
476	int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
477	return vqdmlslh_laneq_s16(a, b, c, 7);
478	}
479
480	// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
481	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
482	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
483	// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
484	// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
485	// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
486	// CHECK: ret i64 [[VQDMLXL1]]
// Exercises vqdmlsls_laneq_s32 with lane index 3 of an int32x4_t. The FileCheck
// expectations above require b and the extracted lane of c to be multiplied via
// @llvm.aarch64.neon.sqdmulls.scalar, then subtracted from a with
// @llvm.aarch64.neon.sqsub.i64.
487	int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
488	return vqdmlsls_laneq_s32(a, b, c, 3);
489	}
490
491	// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
492	// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
493	// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
494	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
495	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
496	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
497	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
498	// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
499	// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
500	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
501	// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
502	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
503	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
504	// CHECK: ret <1 x double> [[VSET_LANE]]
// Constant-operand variant of the vmulx_lane_f64 test. Both vectors are built
// with vcreate_f64 from fixed 64-bit patterns, so the FileCheck expectations
// above begin with bitcasts of i64 constants instead of function arguments,
// followed by the same extract / fmulx.f64 / insert sequence at lane 0.
// Only locals that take part in the computation are declared.
505	float64x1_t test_vmulx_lane_f64_0() {
506	float64x1_t arg1;
507	float64x1_t arg2;
508	float64x1_t result;
510	arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
511	arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
512	result = vmulx_lane_f64(arg1, arg2, 0);
513	return result;
514	}
515
516	// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #1 {
517	// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
518	// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
519	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
520	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
521	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
522	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
523	// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
524	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
525	// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
526	// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
527	// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
528	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
529	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
530	// CHECK: ret <1 x double> [[VSET_LANE]]
// Constant-operand variant of the vmulx_laneq_f64 test. Two vcreate_f64
// constants are joined with vcombine_f64 into a float64x2_t, and lane 1 of that
// combined vector is the second multiplicand. The FileCheck expectations above
// require the combine to appear as a shufflevector, followed by the
// extract / fmulx.f64 / insert sequence.
// Only locals that take part in the computation are declared.
531	float64x1_t test_vmulx_laneq_f64_2() {
532	float64x1_t arg1;
533	float64x1_t arg2;
534	float64x2_t arg3;
535	float64x1_t result;
537	arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
538	arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
539	arg3 = vcombine_f64(arg1, arg2);
540	result = vmulx_laneq_f64(arg1, arg3, 1);
541	return result;
542	}
543
544	// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
545	// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"
546

Clang Project