aarch64-neon-2velem.c source code [clang_source_code/test/CodeGen/aarch64-neon-2velem.c]

1	// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
2
3	// Test new aarch64 intrinsics and types
4
5	#include <arm_neon.h>
6
7	// CHECK-LABEL: @test_vmla_lane_s16(
8	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
10	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
11	// CHECK: ret <4 x i16> [[ADD]]
12	int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
13	return vmla_lane_s16(a, b, v, 3); // Expected IR (directives above): a + b * (v[3] broadcast to 4 lanes).
14	}
15
16	// CHECK-LABEL: @test_vmlaq_lane_s16(
17	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
18	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
19	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
20	// CHECK: ret <8 x i16> [[ADD]]
21	int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
22	return vmlaq_lane_s16(a, b, v, 3); // Expected IR (directives above): a + b * (v[3] broadcast to 8 lanes).
23	}
24
25	// CHECK-LABEL: @test_vmla_lane_s32(
26	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
27	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
28	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
29	// CHECK: ret <2 x i32> [[ADD]]
30	int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
31	return vmla_lane_s32(a, b, v, 1); // Expected IR (directives above): a + b * (v[1] broadcast to 2 lanes).
32	}
33
34	// CHECK-LABEL: @test_vmlaq_lane_s32(
35	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
36	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
37	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
38	// CHECK: ret <4 x i32> [[ADD]]
39	int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
40	return vmlaq_lane_s32(a, b, v, 1); // Expected IR (directives above): a + b * (v[1] broadcast to 4 lanes).
41	}
42
43	// CHECK-LABEL: @test_vmla_laneq_s16(
44	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
45	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
46	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
47	// CHECK: ret <4 x i16> [[ADD]]
48	int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
49	return vmla_laneq_s16(a, b, v, 7); // Expected IR (directives above): a + b * (v[7] of the 8-lane v broadcast to 4 lanes).
50	}
51
52	// CHECK-LABEL: @test_vmlaq_laneq_s16(
53	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
54	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
55	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
56	// CHECK: ret <8 x i16> [[ADD]]
57	int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
58	return vmlaq_laneq_s16(a, b, v, 7); // Expected IR (directives above): a + b * (v[7] broadcast to 8 lanes).
59	}
60
61	// CHECK-LABEL: @test_vmla_laneq_s32(
62	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
63	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
64	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
65	// CHECK: ret <2 x i32> [[ADD]]
66	int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
67	return vmla_laneq_s32(a, b, v, 3); // Expected IR (directives above): a + b * (v[3] of the 4-lane v broadcast to 2 lanes).
68	}
69
70	// CHECK-LABEL: @test_vmlaq_laneq_s32(
71	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
72	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
73	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
74	// CHECK: ret <4 x i32> [[ADD]]
75	int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
76	return vmlaq_laneq_s32(a, b, v, 3); // Expected IR (directives above): a + b * (v[3] broadcast to 4 lanes).
77	}
78
79	// CHECK-LABEL: @test_vmls_lane_s16(
80	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
81	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
82	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
83	// CHECK: ret <4 x i16> [[SUB]]
84	int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
85	return vmls_lane_s16(a, b, v, 3); // Expected IR (directives above): a - b * (v[3] broadcast to 4 lanes).
86	}
87
88	// CHECK-LABEL: @test_vmlsq_lane_s16(
89	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
90	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
91	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
92	// CHECK: ret <8 x i16> [[SUB]]
93	int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
94	return vmlsq_lane_s16(a, b, v, 3); // Expected IR (directives above): a - b * (v[3] broadcast to 8 lanes).
95	}
96
97	// CHECK-LABEL: @test_vmls_lane_s32(
98	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
99	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
100	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
101	// CHECK: ret <2 x i32> [[SUB]]
102	int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
103	return vmls_lane_s32(a, b, v, 1); // Expected IR (directives above): a - b * (v[1] broadcast to 2 lanes).
104	}
105
106	// CHECK-LABEL: @test_vmlsq_lane_s32(
107	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
108	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
109	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
110	// CHECK: ret <4 x i32> [[SUB]]
111	int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
112	return vmlsq_lane_s32(a, b, v, 1); // Expected IR (directives above): a - b * (v[1] broadcast to 4 lanes).
113	}
114
115	// CHECK-LABEL: @test_vmls_laneq_s16(
116	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
117	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
118	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
119	// CHECK: ret <4 x i16> [[SUB]]
120	int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
121	return vmls_laneq_s16(a, b, v, 7); // Expected IR (directives above): a - b * (v[7] of the 8-lane v broadcast to 4 lanes).
122	}
123
124	// CHECK-LABEL: @test_vmlsq_laneq_s16(
125	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
126	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
127	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
128	// CHECK: ret <8 x i16> [[SUB]]
129	int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
130	return vmlsq_laneq_s16(a, b, v, 7); // Expected IR (directives above): a - b * (v[7] broadcast to 8 lanes).
131	}
132
133	// CHECK-LABEL: @test_vmls_laneq_s32(
134	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
135	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
136	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
137	// CHECK: ret <2 x i32> [[SUB]]
138	int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
139	return vmls_laneq_s32(a, b, v, 3); // Expected IR (directives above): a - b * (v[3] of the 4-lane v broadcast to 2 lanes).
140	}
141
142	// CHECK-LABEL: @test_vmlsq_laneq_s32(
143	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
144	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
145	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
146	// CHECK: ret <4 x i32> [[SUB]]
147	int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
148	return vmlsq_laneq_s32(a, b, v, 3); // Expected IR (directives above): a - b * (v[3] broadcast to 4 lanes).
149	}
150
151	// CHECK-LABEL: @test_vmul_lane_s16(
152	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
153	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
154	// CHECK: ret <4 x i16> [[MUL]]
155	int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
156	return vmul_lane_s16(a, v, 3); // Expected IR (directives above): a * (v[3] broadcast to 4 lanes).
157	}
158
159	// CHECK-LABEL: @test_vmulq_lane_s16(
160	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
161	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
162	// CHECK: ret <8 x i16> [[MUL]]
163	int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
164	return vmulq_lane_s16(a, v, 3); // Expected IR (directives above): a * (v[3] broadcast to 8 lanes).
165	}
166
167	// CHECK-LABEL: @test_vmul_lane_s32(
168	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
169	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
170	// CHECK: ret <2 x i32> [[MUL]]
171	int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
172	return vmul_lane_s32(a, v, 1); // Expected IR (directives above): a * (v[1] broadcast to 2 lanes).
173	}
174
175	// CHECK-LABEL: @test_vmulq_lane_s32(
176	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
177	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
178	// CHECK: ret <4 x i32> [[MUL]]
179	int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
180	return vmulq_lane_s32(a, v, 1); // Expected IR (directives above): a * (v[1] broadcast to 4 lanes).
181	}
182
183	// CHECK-LABEL: @test_vmul_lane_u16(
184	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
185	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
186	// CHECK: ret <4 x i16> [[MUL]]
187	uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
188	return vmul_lane_u16(a, v, 3); // Expected IR (directives above): a * (v[3] broadcast to 4 lanes); same plain mul as the s16 test.
189	}
190
191	// CHECK-LABEL: @test_vmulq_lane_u16(
192	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
193	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
194	// CHECK: ret <8 x i16> [[MUL]]
195	uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
196	return vmulq_lane_u16(a, v, 3); // Expected IR (directives above): a * (v[3] broadcast to 8 lanes); same plain mul as the s16 test.
197	}
198
199	// CHECK-LABEL: @test_vmul_lane_u32(
200	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
201	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
202	// CHECK: ret <2 x i32> [[MUL]]
203	uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
204	return vmul_lane_u32(a, v, 1); // Expected IR (directives above): a * (v[1] broadcast to 2 lanes); same plain mul as the s32 test.
205	}
206
207	// CHECK-LABEL: @test_vmulq_lane_u32(
208	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
209	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
210	// CHECK: ret <4 x i32> [[MUL]]
211	uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
212	return vmulq_lane_u32(a, v, 1); // Expected IR (directives above): a * (v[1] broadcast to 4 lanes); same plain mul as the s32 test.
213	}
214
215	// CHECK-LABEL: @test_vmul_laneq_s16(
216	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
217	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
218	// CHECK: ret <4 x i16> [[MUL]]
219	int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
220	return vmul_laneq_s16(a, v, 7); // Expected IR (directives above): a * (v[7] of the 8-lane v broadcast to 4 lanes).
221	}
222
223	// CHECK-LABEL: @test_vmulq_laneq_s16(
224	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
225	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
226	// CHECK: ret <8 x i16> [[MUL]]
227	int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
228	return vmulq_laneq_s16(a, v, 7); // Expected IR (directives above): a * (v[7] broadcast to 8 lanes).
229	}
230
231	// CHECK-LABEL: @test_vmul_laneq_s32(
232	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
233	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
234	// CHECK: ret <2 x i32> [[MUL]]
235	int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
236	return vmul_laneq_s32(a, v, 3); // Expected IR (directives above): a * (v[3] of the 4-lane v broadcast to 2 lanes).
237	}
238
239	// CHECK-LABEL: @test_vmulq_laneq_s32(
240	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
241	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
242	// CHECK: ret <4 x i32> [[MUL]]
243	int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
244	return vmulq_laneq_s32(a, v, 3); // Expected IR (directives above): a * (v[3] broadcast to 4 lanes).
245	}
246
247	// CHECK-LABEL: @test_vmul_laneq_u16(
248	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
249	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
250	// CHECK: ret <4 x i16> [[MUL]]
251	uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
252	return vmul_laneq_u16(a, v, 7); // Expected IR (directives above): a * (v[7] of the 8-lane v broadcast to 4 lanes); same plain mul as the s16 test.
253	}
254
255	// CHECK-LABEL: @test_vmulq_laneq_u16(
256	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
257	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
258	// CHECK: ret <8 x i16> [[MUL]]
259	uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
260	return vmulq_laneq_u16(a, v, 7); // Expected IR (directives above): a * (v[7] broadcast to 8 lanes); same plain mul as the s16 test.
261	}
262
263	// CHECK-LABEL: @test_vmul_laneq_u32(
264	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
265	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
266	// CHECK: ret <2 x i32> [[MUL]]
267	uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
268	return vmul_laneq_u32(a, v, 3); // Expected IR (directives above): a * (v[3] of the 4-lane v broadcast to 2 lanes); same plain mul as the s32 test.
269	}
270
271	// CHECK-LABEL: @test_vmulq_laneq_u32(
272	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
273	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
274	// CHECK: ret <4 x i32> [[MUL]]
275	uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
276	return vmulq_laneq_u32(a, v, 3); // Expected IR (directives above): a * (v[3] broadcast to 4 lanes); same plain mul as the s32 test.
277	}
278
279	// CHECK-LABEL: @test_vfma_lane_f32(
280	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
281	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
282	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
283	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
284	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
285	// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
286	// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
287	// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
288	// CHECK: ret <2 x float> [[FMLA2]]
289	float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
290	return vfma_lane_f32(a, b, v, 1); // Expected IR (directives above): llvm.fma.v2f32(b, v[1] broadcast to 2 lanes, a), operands round-tripped through <8 x i8> bitcasts.
291	}
292
293	// CHECK-LABEL: @test_vfmaq_lane_f32(
294	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
295	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
296	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
297	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
298	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
299	// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
300	// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
301	// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
302	// CHECK: ret <4 x float> [[FMLA2]]
303	float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
304	return vfmaq_lane_f32(a, b, v, 1); // Expected IR (directives above): llvm.fma.v4f32(b, v[1] broadcast to 4 lanes, a).
305	}
306
307	// CHECK-LABEL: @test_vfma_laneq_f32(
308	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
309	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
310	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
311	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
312	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
313	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
314	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
315	// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
316	// CHECK: ret <2 x float> [[TMP6]]
317	float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
318	return vfma_laneq_f32(a, b, v, 3); // Expected IR (directives above): llvm.fma.v2f32(v[3] broadcast to 2 lanes, b, a); the lane operand comes first here, unlike the _lane test.
319	}
320
321	// CHECK-LABEL: @test_vfmaq_laneq_f32(
322	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
323	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
324	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
325	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
326	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
327	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
328	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
329	// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
330	// CHECK: ret <4 x float> [[TMP6]]
331	float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
332	return vfmaq_laneq_f32(a, b, v, 3); // Expected IR (directives above): llvm.fma.v4f32(v[3] broadcast to 4 lanes, b, a); the lane operand comes first here, unlike the _lane test.
333	}
334
335	// CHECK-LABEL: @test_vfms_lane_f32(
336	// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
337	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
338	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
339	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
340	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
341	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
342	// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
343	// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
344	// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
345	// CHECK: ret <2 x float> [[FMLA2]]
346	float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
347	return vfms_lane_f32(a, b, v, 1); // Expected IR (directives above): llvm.fma.v2f32(-b, v[1] broadcast to 2 lanes, a), with -b emitted as fsub from -0.0.
348	}
349
350	// CHECK-LABEL: @test_vfmsq_lane_f32(
351	// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
352	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
353	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
354	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
355	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
356	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
357	// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
358	// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
359	// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
360	// CHECK: ret <4 x float> [[FMLA2]]
361	float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
362	return vfmsq_lane_f32(a, b, v, 1); // Expected IR (directives above): llvm.fma.v4f32(-b, v[1] broadcast to 4 lanes, a), with -b emitted as fsub from -0.0.
363	}
364
365	// CHECK-LABEL: @test_vfms_laneq_f32(
366	// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
367	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
368	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
369	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
370	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
371	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
372	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
373	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
374	// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
375	// CHECK: ret <2 x float> [[TMP6]]
376	float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
377	return vfms_laneq_f32(a, b, v, 3); // Expected IR (directives above): llvm.fma.v2f32(v[3] broadcast to 2 lanes, -b, a), with -b emitted as fsub from -0.0.
378	}
379
380	// CHECK-LABEL: @test_vfmsq_laneq_f32(
381	// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
382	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
383	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
384	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
385	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
386	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
387	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
388	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
389	// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
390	// CHECK: ret <4 x float> [[TMP6]]
391	float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
392	return vfmsq_laneq_f32(a, b, v, 3); // Expected IR (directives above): llvm.fma.v4f32(v[3] broadcast to 4 lanes, -b, a), with -b emitted as fsub from -0.0.
393	}
394
395	// CHECK-LABEL: @test_vfmaq_lane_f64(
396	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
397	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
398	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
399	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
400	// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
401	// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
402	// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
403	// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
404	// CHECK: ret <2 x double> [[FMLA2]]
405	float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
406	return vfmaq_lane_f64(a, b, v, 0); // Expected IR (directives above): llvm.fma.v2f64(b, v[0] of the 1-lane v broadcast to 2 lanes, a).
407	}
408
409	// CHECK-LABEL: @test_vfmaq_laneq_f64(
410	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
411	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
412	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
413	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
414	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
415	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
416	// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
417	// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
418	// CHECK: ret <2 x double> [[TMP6]]
419	float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
420	return vfmaq_laneq_f64(a, b, v, 1); // Expected IR (directives above): llvm.fma.v2f64(v[1] broadcast to 2 lanes, b, a); the lane operand comes first here, unlike the _lane test.
421	}
422
423	// CHECK-LABEL: @test_vfmsq_lane_f64(
424	// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
425	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
426	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
427	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
428	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
429	// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
430	// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
431	// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
432	// CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
433	// CHECK: ret <2 x double> [[FMLA2]]
434	float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
435	return vfmsq_lane_f64(a, b, v, 0); // Expected IR (directives above): llvm.fma.v2f64(-b, v[0] of the 1-lane v broadcast to 2 lanes, a), with -b emitted as fsub from -0.0.
436	}
437
438	// CHECK-LABEL: @test_vfmsq_laneq_f64(
439	// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
440	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
441	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
442	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
443	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
444	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
445	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
446	// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
447	// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
448	// CHECK: ret <2 x double> [[TMP6]]
449	float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
450	return vfmsq_laneq_f64(a, b, v, 1); // Expected IR (directives above): llvm.fma.v2f64(v[1] broadcast to 2 lanes, -b, a), with -b emitted as fsub from -0.0.
451	}
452
453	// CHECK-LABEL: @test_vfmas_laneq_f32(
454	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
455	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
456	// CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
457	// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
458	// CHECK: ret float [[TMP2]]
459	float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
460	return vfmas_laneq_f32(a, b, v, 3); // Expected IR (directives above): scalar llvm.fma.f32(b, v[3], a), with v[3] obtained via extractelement.
461	}
462
463	// CHECK-LABEL: @test_vfmsd_lane_f64(
464	// CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b
465	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
466	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
467	// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
468	// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
469	// CHECK: ret double [[TMP2]]
470	float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
471	return vfmsd_lane_f64(a, b, v, 0); // Expected IR (directives above): scalar llvm.fma.f64(-b, v[0], a), with -b emitted as fsub from -0.0.
472	}
473
474	// CHECK-LABEL: @test_vfmss_laneq_f32(
475	// CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b
476	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
477	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
478	// CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
479	// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
480	// CHECK: ret float [[TMP2]]
481	float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
482	return vfmss_laneq_f32(a, b, v, 3); // Expected IR (directives above): scalar llvm.fma.f32(-b, v[3], a), with -b emitted as fsub from -0.0.
483	}
484
485	// CHECK-LABEL: @test_vfmsd_laneq_f64(
486	// CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b
487	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
488	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
489	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
490	// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
491	// CHECK: ret double [[TMP2]]
492	float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
493	return vfmsd_laneq_f64(a, b, v, 1); // Expected IR (directives above): scalar llvm.fma.f64(-b, v[1], a), with -b emitted as fsub from -0.0.
494	}
495
496	// CHECK-LABEL: @test_vmlal_lane_s16(
497	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
498	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
499	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
500	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
501	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
502	// CHECK: ret <4 x i32> [[ADD]]
503	int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
504	return vmlal_lane_s16(a, b, v, 3); // Expected IR (directives above): a + smull.v4i32(b, v[3] broadcast to 4 lanes).
505	}
506
507	// CHECK-LABEL: @test_vmlal_lane_s32(
508	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
509	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
510	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
511	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
512	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
513	// CHECK: ret <2 x i64> [[ADD]]
514	int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
515	return vmlal_lane_s32(a, b, v, 1); // Expected IR (directives above): a + smull.v2i64(b, v[1] broadcast to 2 lanes).
516	}
517
518	// CHECK-LABEL: @test_vmlal_laneq_s16(
519	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
520	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
521	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
522	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
523	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
524	// CHECK: ret <4 x i32> [[ADD]]
525	int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
526	return vmlal_laneq_s16(a, b, v, 7); // Expected IR (directives above): a + smull.v4i32(b, v[7] of the 8-lane v broadcast to 4 lanes).
527	}
528
529	// CHECK-LABEL: @test_vmlal_laneq_s32(
530	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
531	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
532	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
533	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
534	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
535	// CHECK: ret <2 x i64> [[ADD]]
536	int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
537	return vmlal_laneq_s32(a, b, v, 3); // Expected IR (directives above): a + smull.v2i64(b, v[3] of the 4-lane v broadcast to 2 lanes).
538	}
539
540	// CHECK-LABEL: @test_vmlal_high_lane_s16(
541	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
542	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
543	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
544	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
545	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
546	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
547	// CHECK: ret <4 x i32> [[ADD]]
548	int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
549	return vmlal_high_lane_s16(a, b, v, 3); // Expected IR (directives above): a + smull.v4i32(lanes 4-7 of b, v[3] broadcast to 4 lanes).
550	}
551
552	// CHECK-LABEL: @test_vmlal_high_lane_s32(
553	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
554	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
555	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
556	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
557	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
558	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
559	// CHECK: ret <2 x i64> [[ADD]]
560	int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
561	return vmlal_high_lane_s32(a, b, v, 1); // Expected IR (directives above): a + smull.v2i64(lanes 2-3 of b, v[1] broadcast to 2 lanes).
562	}
563
564	// CHECK-LABEL: @test_vmlal_high_laneq_s16(
565	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
566	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
567	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
568	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
569	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
570	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
571	// CHECK: ret <4 x i32> [[ADD]]
572	int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
573	return vmlal_high_laneq_s16(a, b, v, 7); // Expected IR (directives above): a + smull.v4i32(lanes 4-7 of b, v[7] broadcast to 4 lanes).
574	}
575
576	// CHECK-LABEL: @test_vmlal_high_laneq_s32(
577	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
578	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
579	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
580	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
581	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
582	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
583	// CHECK: ret <2 x i64> [[ADD]]
584	int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
585	return vmlal_high_laneq_s32(a, b, v, 3); // Expected IR (directives above): a + smull.v2i64(lanes 2-3 of b, v[3] broadcast to 2 lanes).
586	}
587
588	// CHECK-LABEL: @test_vmlsl_lane_s16(
589	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
590	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
591	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
592	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
593	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
594	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_lane_s16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, smull.v4i32(b, broadcast), product subtracted from a.
595	int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
596	return vmlsl_lane_s16(a, b, v, 3);
597	}
598
599	// CHECK-LABEL: @test_vmlsl_lane_s32(
600	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
601	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
602	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
603	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
604	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
605	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_lane_s32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, smull.v2i64(b, broadcast), product subtracted from a.
606	int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
607	return vmlsl_lane_s32(a, b, v, 1);
608	}
609
610	// CHECK-LABEL: @test_vmlsl_laneq_s16(
611	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
612	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
613	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
614	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
615	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
616	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_laneq_s16 with literal lane 7. Expected IR above: lane 7 of the
// 8-element v broadcast to <4 x i16>, smull.v4i32(b, broadcast), product
// subtracted from a.
617	int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
618	return vmlsl_laneq_s16(a, b, v, 7);
619	}
620
621	// CHECK-LABEL: @test_vmlsl_laneq_s32(
622	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
623	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
624	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
625	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
626	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
627	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_laneq_s32 with literal lane 3. Expected IR above: lane 3 of the
// 4-element v broadcast to <2 x i32>, smull.v2i64(b, broadcast), product
// subtracted from a.
628	int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
629	return vmlsl_laneq_s32(a, b, v, 3);
630	}
631
632	// CHECK-LABEL: @test_vmlsl_high_lane_s16(
633	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
634	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
635	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
636	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
637	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
638	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
639	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_high_lane_s16 with literal lane 3. Expected IR above: elements
// 4-7 of b, lane 3 of v broadcast to <4 x i16>, smull.v4i32 of the two, then
// the product subtracted from a.
640	int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
641	return vmlsl_high_lane_s16(a, b, v, 3);
642	}
643
644	// CHECK-LABEL: @test_vmlsl_high_lane_s32(
645	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
646	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
647	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
648	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
649	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
650	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
651	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_high_lane_s32 with literal lane 1. Expected IR above: elements
// 2-3 of b, lane 1 of v broadcast to <2 x i32>, smull.v2i64 of the two, then
// the product subtracted from a.
652	int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
653	return vmlsl_high_lane_s32(a, b, v, 1);
654	}
655
656	// CHECK-LABEL: @test_vmlsl_high_laneq_s16(
657	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
658	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
659	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
660	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
661	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
662	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
663	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_high_laneq_s16 with literal lane 7. Expected IR above: elements
// 4-7 of b, lane 7 of the 8-element v broadcast to <4 x i16>, smull.v4i32 of
// the two, then the product subtracted from a.
664	int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
665	return vmlsl_high_laneq_s16(a, b, v, 7);
666	}
667
668	// CHECK-LABEL: @test_vmlsl_high_laneq_s32(
669	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
670	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
671	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
672	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
673	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
674	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
675	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_high_laneq_s32 with literal lane 3. Expected IR above: elements
// 2-3 of b, lane 3 of the 4-element v broadcast to <2 x i32>, smull.v2i64 of
// the two, then the product subtracted from a.
676	int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
677	return vmlsl_high_laneq_s32(a, b, v, 3);
678	}
679
680	// CHECK-LABEL: @test_vmlal_lane_u16(
681	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
682	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
683	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
684	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
685	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
686	// CHECK: ret <4 x i32> [[ADD]]
// Calls vmlal_lane_u16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, umull.v4i32(b, broadcast), product added to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
687	int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
688	return vmlal_lane_u16(a, b, v, 3);
689	}
690
691	// CHECK-LABEL: @test_vmlal_lane_u32(
692	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
693	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
694	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
695	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
696	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
697	// CHECK: ret <2 x i64> [[ADD]]
// Calls vmlal_lane_u32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, umull.v2i64(b, broadcast), product added to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
698	int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
699	return vmlal_lane_u32(a, b, v, 1);
700	}
701
702	// CHECK-LABEL: @test_vmlal_laneq_u16(
703	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
704	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
705	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
706	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
707	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
708	// CHECK: ret <4 x i32> [[ADD]]
// Calls vmlal_laneq_u16 with literal lane 7. Expected IR above: lane 7 of the
// 8-element v broadcast to <4 x i16>, umull.v4i32(b, broadcast), product added
// to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
709	int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
710	return vmlal_laneq_u16(a, b, v, 7);
711	}
712
713	// CHECK-LABEL: @test_vmlal_laneq_u32(
714	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
715	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
716	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
717	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
718	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
719	// CHECK: ret <2 x i64> [[ADD]]
// Calls vmlal_laneq_u32 with literal lane 3. Expected IR above: lane 3 of the
// 4-element v broadcast to <2 x i32>, umull.v2i64(b, broadcast), product added
// to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
720	int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
721	return vmlal_laneq_u32(a, b, v, 3);
722	}
723
724	// CHECK-LABEL: @test_vmlal_high_lane_u16(
725	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
726	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
727	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
728	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
729	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
730	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
731	// CHECK: ret <4 x i32> [[ADD]]
// Calls vmlal_high_lane_u16 with literal lane 3. Expected IR above: elements
// 4-7 of b, lane 3 of v broadcast to <4 x i16>, umull.v4i32 of the two, then
// the product added to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
732	int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
733	return vmlal_high_lane_u16(a, b, v, 3);
734	}
735
736	// CHECK-LABEL: @test_vmlal_high_lane_u32(
737	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
738	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
739	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
740	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
741	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
742	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
743	// CHECK: ret <2 x i64> [[ADD]]
// Calls vmlal_high_lane_u32 with literal lane 1. Expected IR above: elements
// 2-3 of b, lane 1 of v broadcast to <2 x i32>, umull.v2i64 of the two, then
// the product added to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
744	int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
745	return vmlal_high_lane_u32(a, b, v, 1);
746	}
747
748	// CHECK-LABEL: @test_vmlal_high_laneq_u16(
749	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
750	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
751	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
752	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
753	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
754	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
755	// CHECK: ret <4 x i32> [[ADD]]
// Calls vmlal_high_laneq_u16 with literal lane 7. Expected IR above: elements
// 4-7 of b, lane 7 of the 8-element v broadcast to <4 x i16>, umull.v4i32 of
// the two, then the product added to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
756	int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
757	return vmlal_high_laneq_u16(a, b, v, 7);
758	}
759
760	// CHECK-LABEL: @test_vmlal_high_laneq_u32(
761	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
762	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
763	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
764	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
765	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
766	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
767	// CHECK: ret <2 x i64> [[ADD]]
// Calls vmlal_high_laneq_u32 with literal lane 3. Expected IR above: elements
// 2-3 of b, lane 3 of the 4-element v broadcast to <2 x i32>, umull.v2i64 of
// the two, then the product added to a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
768	int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
769	return vmlal_high_laneq_u32(a, b, v, 3);
770	}
771
772	// CHECK-LABEL: @test_vmlsl_lane_u16(
773	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
774	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
775	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
776	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
777	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
778	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_lane_u16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, umull.v4i32(b, broadcast), product subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
779	int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
780	return vmlsl_lane_u16(a, b, v, 3);
781	}
782
783	// CHECK-LABEL: @test_vmlsl_lane_u32(
784	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
785	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
786	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
787	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
788	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
789	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_lane_u32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, umull.v2i64(b, broadcast), product subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
790	int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
791	return vmlsl_lane_u32(a, b, v, 1);
792	}
793
794	// CHECK-LABEL: @test_vmlsl_laneq_u16(
795	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
796	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
797	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
798	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
799	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
800	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_laneq_u16 with literal lane 7. Expected IR above: lane 7 of the
// 8-element v broadcast to <4 x i16>, umull.v4i32(b, broadcast), product
// subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
801	int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
802	return vmlsl_laneq_u16(a, b, v, 7);
803	}
804
805	// CHECK-LABEL: @test_vmlsl_laneq_u32(
806	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
807	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
808	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
809	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
810	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
811	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_laneq_u32 with literal lane 3. Expected IR above: lane 3 of the
// 4-element v broadcast to <2 x i32>, umull.v2i64(b, broadcast), product
// subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
812	int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
813	return vmlsl_laneq_u32(a, b, v, 3);
814	}
815
816	// CHECK-LABEL: @test_vmlsl_high_lane_u16(
817	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
818	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
819	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
820	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
821	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
822	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
823	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_high_lane_u16 with literal lane 3. Expected IR above: elements
// 4-7 of b, lane 3 of v broadcast to <4 x i16>, umull.v4i32 of the two, then
// the product subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
824	int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
825	return vmlsl_high_lane_u16(a, b, v, 3);
826	}
827
828	// CHECK-LABEL: @test_vmlsl_high_lane_u32(
829	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
830	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
831	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
832	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
833	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
834	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
835	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_high_lane_u32 with literal lane 1. Expected IR above: elements
// 2-3 of b, lane 1 of v broadcast to <2 x i32>, umull.v2i64 of the two, then
// the product subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
836	int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
837	return vmlsl_high_lane_u32(a, b, v, 1);
838	}
839
840	// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
841	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
842	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
843	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
844	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
845	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
846	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
847	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_high_laneq_u16 with literal lane 7. Expected IR above: elements
// 4-7 of b, lane 7 of the 8-element v broadcast to <4 x i16>, umull.v4i32 of
// the two, then the product subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
848	int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
849	return vmlsl_high_laneq_u16(a, b, v, 7);
850	}
851
852	// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
853	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
854	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
855	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
856	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
857	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
858	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
859	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_high_laneq_u32 with literal lane 3. Expected IR above: elements
// 2-3 of b, lane 3 of the 4-element v broadcast to <2 x i32>, umull.v2i64 of
// the two, then the product subtracted from a.
// NOTE(review): signed vector types are used with an unsigned intrinsic here;
// presumably this relies on lax vector conversions — confirm.
860	int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
861	return vmlsl_high_laneq_u32(a, b, v, 3);
862	}
863
864	// CHECK-LABEL: @test_vmull_lane_s16(
865	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
866	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
867	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
868	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
869	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_lane_s16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, smull.v4i32(a, broadcast), call result returned.
870	int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
871	return vmull_lane_s16(a, v, 3);
872	}
873
874	// CHECK-LABEL: @test_vmull_lane_s32(
875	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
876	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
877	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
878	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
879	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_lane_s32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, smull.v2i64(a, broadcast), call result returned.
880	int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
881	return vmull_lane_s32(a, v, 1);
882	}
883
884	// CHECK-LABEL: @test_vmull_lane_u16(
885	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
886	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
887	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
888	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
889	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_lane_u16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, umull.v4i32(a, broadcast), call result returned.
890	uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
891	return vmull_lane_u16(a, v, 3);
892	}
893
894	// CHECK-LABEL: @test_vmull_lane_u32(
895	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
896	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
897	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
898	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
899	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_lane_u32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, umull.v2i64(a, broadcast), call result returned.
900	uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
901	return vmull_lane_u32(a, v, 1);
902	}
903
904	// CHECK-LABEL: @test_vmull_high_lane_s16(
905	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
906	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
907	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
908	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
909	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
910	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_lane_s16 with literal lane 3. Expected IR above: elements
// 4-7 of a, lane 3 of v broadcast to <4 x i16>, smull.v4i32 of the two, call
// result returned.
911	int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
912	return vmull_high_lane_s16(a, v, 3);
913	}
914
915	// CHECK-LABEL: @test_vmull_high_lane_s32(
916	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
917	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
918	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
919	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
920	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
921	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_lane_s32 with literal lane 1. Expected IR above: elements
// 2-3 of a, lane 1 of v broadcast to <2 x i32>, smull.v2i64 of the two, call
// result returned.
922	int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
923	return vmull_high_lane_s32(a, v, 1);
924	}
925
926	// CHECK-LABEL: @test_vmull_high_lane_u16(
927	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
928	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
929	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
930	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
931	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
932	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_lane_u16 with literal lane 3. Expected IR above: elements
// 4-7 of a, lane 3 of v broadcast to <4 x i16>, umull.v4i32 of the two, call
// result returned.
933	uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
934	return vmull_high_lane_u16(a, v, 3);
935	}
936
937	// CHECK-LABEL: @test_vmull_high_lane_u32(
938	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
939	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
940	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
941	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
942	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
943	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_lane_u32 with literal lane 1. Expected IR above: elements
// 2-3 of a, lane 1 of v broadcast to <2 x i32>, umull.v2i64 of the two, call
// result returned.
944	uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
945	return vmull_high_lane_u32(a, v, 1);
946	}
947
948	// CHECK-LABEL: @test_vmull_laneq_s16(
949	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
950	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
951	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
952	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
953	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_laneq_s16 with literal lane 7. Expected IR above: lane 7 of the
// 8-element v broadcast to <4 x i16>, smull.v4i32(a, broadcast), call result
// returned.
954	int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
955	return vmull_laneq_s16(a, v, 7);
956	}
957
958	// CHECK-LABEL: @test_vmull_laneq_s32(
959	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
960	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
961	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
962	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
963	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_laneq_s32 with literal lane 3. Expected IR above: lane 3 of the
// 4-element v broadcast to <2 x i32>, smull.v2i64(a, broadcast), call result
// returned.
964	int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
965	return vmull_laneq_s32(a, v, 3);
966	}
967
968	// CHECK-LABEL: @test_vmull_laneq_u16(
969	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
970	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
971	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
972	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
973	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_laneq_u16 with literal lane 7. Expected IR above: lane 7 of the
// 8-element v broadcast to <4 x i16>, umull.v4i32(a, broadcast), call result
// returned.
974	uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
975	return vmull_laneq_u16(a, v, 7);
976	}
977
978	// CHECK-LABEL: @test_vmull_laneq_u32(
979	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
980	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
981	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
982	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
983	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_laneq_u32 with literal lane 3. Expected IR above: lane 3 of the
// 4-element v broadcast to <2 x i32>, umull.v2i64(a, broadcast), call result
// returned.
984	uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
985	return vmull_laneq_u32(a, v, 3);
986	}
987
988	// CHECK-LABEL: @test_vmull_high_laneq_s16(
989	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
990	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
991	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
992	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
993	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
994	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_laneq_s16 with literal lane 7. Expected IR above: elements
// 4-7 of a, lane 7 of the 8-element v broadcast to <4 x i16>, smull.v4i32 of
// the two, call result returned.
995	int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
996	return vmull_high_laneq_s16(a, v, 7);
997	}
998
999	// CHECK-LABEL: @test_vmull_high_laneq_s32(
1000	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1001	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1002	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1003	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1004	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1005	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_laneq_s32 with literal lane 3. Expected IR above: elements
// 2-3 of a, lane 3 of the 4-element v broadcast to <2 x i32>, smull.v2i64 of
// the two, call result returned.
1006	int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1007	return vmull_high_laneq_s32(a, v, 3);
1008	}
1009
1010	// CHECK-LABEL: @test_vmull_high_laneq_u16(
1011	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1012	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1013	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1014	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1015	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1016	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_laneq_u16 with literal lane 7. Expected IR above: elements
// 4-7 of a, lane 7 of the 8-element v broadcast to <4 x i16>, umull.v4i32 of
// the two, call result returned.
1017	uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
1018	return vmull_high_laneq_u16(a, v, 7);
1019	}
1020
1021	// CHECK-LABEL: @test_vmull_high_laneq_u32(
1022	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1023	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1024	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1025	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1026	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1027	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_laneq_u32 with literal lane 3. Expected IR above: elements
// 2-3 of a, lane 3 of the 4-element v broadcast to <2 x i32>, umull.v2i64 of
// the two, call result returned.
1028	uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
1029	return vmull_high_laneq_u32(a, v, 3);
1030	}
1031
1032	// CHECK-LABEL: @test_vqdmlal_lane_s16(
1033	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1034	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1035	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1036	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1037	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1038	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1039	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_lane_s16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, sqdmull.v4i32(b, broadcast), then sqadd.v4i32 of a
// and that product.
1040	int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1041	return vqdmlal_lane_s16(a, b, v, 3);
1042	}
1043
1044	// CHECK-LABEL: @test_vqdmlal_lane_s32(
1045	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1046	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1047	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1048	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1049	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1050	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1051	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_lane_s32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, sqdmull.v2i64(b, broadcast), then sqadd.v2i64 of a
// and that product.
1052	int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1053	return vqdmlal_lane_s32(a, b, v, 1);
1054	}
1055
1056	// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1057	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1058	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1059	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1060	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1061	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1062	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1063	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1064	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_lane_s16 with literal lane 3. Expected IR above: elements
// 4-7 of b, lane 3 of v broadcast to <4 x i16>, sqdmull.v4i32 of the two, then
// sqadd.v4i32 of a and that product.
1065	int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1066	return vqdmlal_high_lane_s16(a, b, v, 3);
1067	}
1068
1069	// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1070	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1071	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1072	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1073	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1074	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1075	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1076	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1077	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_lane_s32 with literal lane 1. Expected IR above: elements
// 2-3 of b, lane 1 of v broadcast to <2 x i32>, sqdmull.v2i64 of the two, then
// sqadd.v2i64 of a and that product.
1078	int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1079	return vqdmlal_high_lane_s32(a, b, v, 1);
1080	}
1081
1082	// CHECK-LABEL: @test_vqdmlsl_lane_s16(
1083	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1084	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1085	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1086	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1087	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1088	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1089	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_lane_s16 with literal lane 3. Expected IR above: lane 3 of v
// broadcast to <4 x i16>, sqdmull.v4i32(b, broadcast), then sqsub.v4i32 of a
// and that product.
1090	int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1091	return vqdmlsl_lane_s16(a, b, v, 3);
1092	}
1093
1094	// CHECK-LABEL: @test_vqdmlsl_lane_s32(
1095	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1096	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1097	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1098	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1099	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1100	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1101	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_lane_s32 with literal lane 1. Expected IR above: lane 1 of v
// broadcast to <2 x i32>, sqdmull.v2i64(b, broadcast), then sqsub.v2i64 of a
// and that product.
1102	int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1103	return vqdmlsl_lane_s32(a, b, v, 1);
1104	}
1105
1106	// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1107	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1108	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1109	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1110	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1111	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1112	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1113	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1114	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_lane_s16 with literal lane 3. Expected IR above: elements
// 4-7 of b, lane 3 of v broadcast to <4 x i16>, sqdmull.v4i32 of the two, then
// sqsub.v4i32 of a and that product.
1115	int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1116	return vqdmlsl_high_lane_s16(a, b, v, 3);
1117	}
1118
1119	// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1120	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1121	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1122	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1123	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1124	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1125	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1126	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1127	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Codegen test: vqdmlsl_high_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lanes 2-3 of b are extracted, lane 1 of v is broadcast, then
// sqdmull.v2i64 feeds sqsub.v2i64 with a.
1128	int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1129	return vqdmlsl_high_lane_s32(a, b, v, 1);
1130	}
1131
1132	// CHECK-LABEL: @test_vqdmull_lane_s16(
1133	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1134	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1135	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1136	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1137	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1138	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_lane_s16 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast, then sqdmull.v4i32(a, broadcast) is returned.
1139	int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1140	return vqdmull_lane_s16(a, v, 3);
1141	}
1142
1143	// CHECK-LABEL: @test_vqdmull_lane_s32(
1144	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1145	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1146	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1147	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1148	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1149	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then sqdmull.v2i64(a, broadcast) is returned.
1150	int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1151	return vqdmull_lane_s32(a, v, 1);
1152	}
1153
1154	// CHECK-LABEL: @test_vqdmull_laneq_s16(
1155	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1156	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1157	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1158	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1159	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1160	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_laneq_s16 with constant lane 3 of the 8-element v.
// Expected IR (see above): lane 3 broadcast into <4 x i16>, then sqdmull.v4i32.
1161	int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1162	return vqdmull_laneq_s16(a, v, 3);
1163	}
1164
1165	// CHECK-LABEL: @test_vqdmull_laneq_s32(
1166	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1167	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1168	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1169	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1170	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1171	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_laneq_s32 with constant lane 3 of the 4-element v.
// Expected IR (see above): lane 3 broadcast into <2 x i32>, then sqdmull.v2i64.
1172	int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1173	return vqdmull_laneq_s32(a, v, 3);
1174	}
1175
1176	// CHECK-LABEL: @test_vqdmull_high_lane_s16(
1177	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1178	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1179	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1180	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1181	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1182	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1183	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_high_lane_s16 with constant lane 3 of v. Expected IR (see
// above): lanes 4-7 of a extracted, lane 3 of v broadcast, then sqdmull.v4i32.
1184	int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1185	return vqdmull_high_lane_s16(a, v, 3);
1186	}
1187
1188	// CHECK-LABEL: @test_vqdmull_high_lane_s32(
1189	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1190	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1191	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1192	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1193	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1194	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1195	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_high_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lanes 2-3 of a extracted, lane 1 of v broadcast, then sqdmull.v2i64.
1196	int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1197	return vqdmull_high_lane_s32(a, v, 1);
1198	}
1199
1200	// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1201	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1202	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1203	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1204	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1205	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1206	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1207	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_high_laneq_s16 with constant lane 7 of the 8-element v.
// Expected IR (see above): lanes 4-7 of a extracted, lane 7 of v broadcast,
// then sqdmull.v4i32.
1208	int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1209	return vqdmull_high_laneq_s16(a, v, 7);
1210	}
1211
1212	// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1213	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1214	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1215	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1216	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1217	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1218	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1219	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Codegen test: vqdmull_high_laneq_s32 with constant lane 3 of the 4-element v.
// Expected IR (see above): lanes 2-3 of a extracted, lane 3 of v broadcast,
// then sqdmull.v2i64.
1220	int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1221	return vqdmull_high_laneq_s32(a, v, 3);
1222	}
1223
1224	// CHECK-LABEL: @test_vqdmulh_lane_s16(
1225	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1226	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1227	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1228	// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1229	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
1230	// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
// Codegen test: vqdmulh_lane_s16 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast, then sqdmulh.v4i16(a, broadcast) is returned.
1231	int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1232	return vqdmulh_lane_s16(a, v, 3);
1233	}
1234
1235	// CHECK-LABEL: @test_vqdmulhq_lane_s16(
1236	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1237	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1238	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1239	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
1240	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
1241	// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
// Codegen test: vqdmulhq_lane_s16 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast to 8 elements, then sqdmulh.v8i16(a, broadcast).
1242	int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1243	return vqdmulhq_lane_s16(a, v, 3);
1244	}
1245
1246	// CHECK-LABEL: @test_vqdmulh_lane_s32(
1247	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1248	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1249	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1250	// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1251	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
1252	// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
// Codegen test: vqdmulh_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then sqdmulh.v2i32(a, broadcast) is returned.
1253	int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1254	return vqdmulh_lane_s32(a, v, 1);
1255	}
1256
1257	// CHECK-LABEL: @test_vqdmulhq_lane_s32(
1258	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1259	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1260	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1261	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
1262	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
1263	// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
// Codegen test: vqdmulhq_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast to 4 elements, then sqdmulh.v4i32(a, broadcast).
1264	int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1265	return vqdmulhq_lane_s32(a, v, 1);
1266	}
1267
1268	// CHECK-LABEL: @test_vqrdmulh_lane_s16(
1269	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1270	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1271	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1272	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1273	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
1274	// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Codegen test: vqrdmulh_lane_s16 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast, then sqrdmulh.v4i16(a, broadcast) is returned.
1275	int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1276	return vqrdmulh_lane_s16(a, v, 3);
1277	}
1278
1279	// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1280	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1281	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1282	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1283	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
1284	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
1285	// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Codegen test: vqrdmulhq_lane_s16 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast to 8 elements, then sqrdmulh.v8i16(a, broadcast).
1286	int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1287	return vqrdmulhq_lane_s16(a, v, 3);
1288	}
1289
1290	// CHECK-LABEL: @test_vqrdmulh_lane_s32(
1291	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1292	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1293	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1294	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1295	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
1296	// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Codegen test: vqrdmulh_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then sqrdmulh.v2i32(a, broadcast) is returned.
1297	int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1298	return vqrdmulh_lane_s32(a, v, 1);
1299	}
1300
1301	// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1302	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1303	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1304	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1305	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
1306	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
1307	// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Codegen test: vqrdmulhq_lane_s32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast to 4 elements, then sqrdmulh.v4i32(a, broadcast).
1308	int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1309	return vqrdmulhq_lane_s32(a, v, 1);
1310	}
1311
1312	// CHECK-LABEL: @test_vmul_lane_f32(
1313	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1314	// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1315	// CHECK: ret <2 x float> [[MUL]]
// Codegen test: vmul_lane_f32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then a plain fmul <2 x float> with a.
1316	float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
1317	return vmul_lane_f32(a, v, 1);
1318	}
1319
1320	// CHECK-LABEL: @test_vmul_lane_f64(
1321	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1322	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
1323	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1324	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1325	// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1326	// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1327	// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1328	// CHECK: ret <1 x double> [[TMP5]]
1329
// Codegen test: vmul_lane_f64 with constant lane 0 of the single-element v.
// Expected IR (see above): element 0 is extracted from v and multiplied with a
// as a scalar double (fmul double), and the product is bitcast to <1 x double>.
1330	float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
1331	return vmul_lane_f64(a, v, 0);
1332	}
1333
1334	// CHECK-LABEL: @test_vmulq_lane_f32(
1335	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1336	// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1337	// CHECK: ret <4 x float> [[MUL]]
1338
// Codegen test: vmulq_lane_f32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast to 4 elements, then fmul <4 x float> with a.
1339	float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
1340	return vmulq_lane_f32(a, v, 1);
1341	}
1342
1343	// CHECK-LABEL: @test_vmulq_lane_f64(
1344	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1345	// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1346	// CHECK: ret <2 x double> [[MUL]]
// Codegen test: vmulq_lane_f64 with constant lane 0 of the single-element v.
// Expected IR (see above): v is widened to <2 x double> by a shufflevector with
// a zeroinitializer mask, then fmul <2 x double> with a.
1347	float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
1348	return vmulq_lane_f64(a, v, 0);
1349	}
1350
1351	// CHECK-LABEL: @test_vmul_laneq_f32(
1352	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1353	// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1354	// CHECK: ret <2 x float> [[MUL]]
// Codegen test: vmul_laneq_f32 with constant lane 3 of the 4-element v.
// Expected IR (see above): lane 3 broadcast into <2 x float>, then fmul with a.
1355	float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
1356	return vmul_laneq_f32(a, v, 3);
1357	}
1358
1359	// CHECK-LABEL: @test_vmul_laneq_f64(
1360	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1361	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
1362	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1363	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1364	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1365	// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1366	// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1367	// CHECK: ret <1 x double> [[TMP5]]
// Codegen test: vmul_laneq_f64 with constant lane 1 of the 2-element v.
// Expected IR (see above): element 1 is extracted from v and multiplied with a
// as a scalar double (fmul double), and the product is bitcast to <1 x double>.
1368	float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
1369	return vmul_laneq_f64(a, v, 1);
1370	}
1371
1372	// CHECK-LABEL: @test_vmulq_laneq_f32(
1373	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1374	// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1375	// CHECK: ret <4 x float> [[MUL]]
1376
// Codegen test: vmulq_laneq_f32 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast, then fmul <4 x float> with a.
1377	float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
1378	return vmulq_laneq_f32(a, v, 3);
1379	}
1380
1381	// CHECK-LABEL: @test_vmulq_laneq_f64(
1382	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1383	// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1384	// CHECK: ret <2 x double> [[MUL]]
// Codegen test: vmulq_laneq_f64 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then fmul <2 x double> with a.
1385	float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
1386	return vmulq_laneq_f64(a, v, 1);
1387	}
1388
1389	// CHECK-LABEL: @test_vmulx_lane_f32(
1390	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1391	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1392	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1393	// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
1394	// CHECK: ret <2 x float> [[VMULX2_I]]
// Codegen test: vmulx_lane_f32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then fmulx.v2f32(a, broadcast) is returned.
1395	float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
1396	return vmulx_lane_f32(a, v, 1);
1397	}
1398
1399	// CHECK-LABEL: @test_vmulxq_lane_f32(
1400	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1401	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1402	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1403	// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
1404	// CHECK: ret <4 x float> [[VMULX2_I]]
// Codegen test: vmulxq_lane_f32 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast to 4 elements, then fmulx.v4f32(a, broadcast).
1405	float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
1406	return vmulxq_lane_f32(a, v, 1);
1407	}
1408
1409	// CHECK-LABEL: @test_vmulxq_lane_f64(
1410	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1411	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1412	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1413	// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
1414	// CHECK: ret <2 x double> [[VMULX2_I]]
// Codegen test: vmulxq_lane_f64 with constant lane 0 of the single-element v.
// Expected IR (see above): v widened to <2 x double> via a zeroinitializer
// shuffle mask, then fmulx.v2f64(a, broadcast).
1415	float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
1416	return vmulxq_lane_f64(a, v, 0);
1417	}
1418
1419	// CHECK-LABEL: @test_vmulx_laneq_f32(
1420	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1421	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1422	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1423	// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
1424	// CHECK: ret <2 x float> [[VMULX2_I]]
// Codegen test: vmulx_laneq_f32 with constant lane 3 of the 4-element v.
// Expected IR (see above): lane 3 broadcast into <2 x float>, then fmulx.v2f32.
1425	float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
1426	return vmulx_laneq_f32(a, v, 3);
1427	}
1428
1429	// CHECK-LABEL: @test_vmulxq_laneq_f32(
1430	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1431	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1432	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1433	// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
1434	// CHECK: ret <4 x float> [[VMULX2_I]]
// Codegen test: vmulxq_laneq_f32 with constant lane 3 of v. Expected IR (see
// above): lane 3 of v broadcast, then fmulx.v4f32(a, broadcast) is returned.
1435	float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
1436	return vmulxq_laneq_f32(a, v, 3);
1437	}
1438
1439	// CHECK-LABEL: @test_vmulxq_laneq_f64(
1440	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1441	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1442	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1443	// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
1444	// CHECK: ret <2 x double> [[VMULX2_I]]
// Codegen test: vmulxq_laneq_f64 with constant lane 1 of v. Expected IR (see
// above): lane 1 of v broadcast, then fmulx.v2f64(a, broadcast) is returned.
1445	float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
1446	return vmulxq_laneq_f64(a, v, 1);
1447	}
1448
1449	// CHECK-LABEL: @test_vmla_lane_s16_0(
1450	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1451	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1452	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1453	// CHECK: ret <4 x i16> [[ADD]]
// Lane-0 variant of the vmla_lane_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <4 x i16> with b, then add to a.
1454	int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1455	return vmla_lane_s16(a, b, v, 0);
1456	}
1457
1458	// CHECK-LABEL: @test_vmlaq_lane_s16_0(
1459	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1460	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1461	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1462	// CHECK: ret <8 x i16> [[ADD]]
// Lane-0 variant of the vmlaq_lane_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle widening v to 8 elements, mul <8 x i16> with b, add to a.
1463	int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1464	return vmlaq_lane_s16(a, b, v, 0);
1465	}
1466
1467	// CHECK-LABEL: @test_vmla_lane_s32_0(
1468	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1469	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1470	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1471	// CHECK: ret <2 x i32> [[ADD]]
// Lane-0 variant of the vmla_lane_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <2 x i32> with b, then add to a.
1472	int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1473	return vmla_lane_s32(a, b, v, 0);
1474	}
1475
1476	// CHECK-LABEL: @test_vmlaq_lane_s32_0(
1477	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1478	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1479	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1480	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 variant of the vmlaq_lane_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle widening v to 4 elements, mul <4 x i32> with b, add to a.
1481	int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1482	return vmlaq_lane_s32(a, b, v, 0);
1483	}
1484
1485	// CHECK-LABEL: @test_vmla_laneq_s16_0(
1486	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1487	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1488	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1489	// CHECK: ret <4 x i16> [[ADD]]
// Lane-0 variant of the vmla_laneq_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle narrowing v to 4 elements, mul <4 x i16> with b, add to a.
1490	int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1491	return vmla_laneq_s16(a, b, v, 0);
1492	}
1493
1494	// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1495	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1496	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1497	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1498	// CHECK: ret <8 x i16> [[ADD]]
// Lane-0 variant of the vmlaq_laneq_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <8 x i16> with b, then add to a.
1499	int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1500	return vmlaq_laneq_s16(a, b, v, 0);
1501	}
1502
1503	// CHECK-LABEL: @test_vmla_laneq_s32_0(
1504	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1505	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1506	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1507	// CHECK: ret <2 x i32> [[ADD]]
// Lane-0 variant of the vmla_laneq_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle narrowing v to 2 elements, mul <2 x i32> with b, add to a.
1508	int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1509	return vmla_laneq_s32(a, b, v, 0);
1510	}
1511
1512	// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
1513	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1514	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1515	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1516	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 variant of the vmlaq_laneq_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <4 x i32> with b, then add to a.
1517	int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1518	return vmlaq_laneq_s32(a, b, v, 0);
1519	}
1520
1521	// CHECK-LABEL: @test_vmls_lane_s16_0(
1522	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1523	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1524	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1525	// CHECK: ret <4 x i16> [[SUB]]
// Lane-0 variant of the vmls_lane_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <4 x i16> with b, then sub from a.
1526	int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1527	return vmls_lane_s16(a, b, v, 0);
1528	}
1529
1530	// CHECK-LABEL: @test_vmlsq_lane_s16_0(
1531	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1532	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1533	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1534	// CHECK: ret <8 x i16> [[SUB]]
// Lane-0 variant of the vmlsq_lane_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle widening v to 8 elements, mul <8 x i16> with b, sub from a.
1535	int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1536	return vmlsq_lane_s16(a, b, v, 0);
1537	}
1538
1539	// CHECK-LABEL: @test_vmls_lane_s32_0(
1540	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1541	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1542	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1543	// CHECK: ret <2 x i32> [[SUB]]
// Lane-0 variant of the vmls_lane_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <2 x i32> with b, then sub from a.
1544	int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1545	return vmls_lane_s32(a, b, v, 0);
1546	}
1547
1548	// CHECK-LABEL: @test_vmlsq_lane_s32_0(
1549	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1550	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1551	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1552	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 variant of the vmlsq_lane_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle widening v to 4 elements, mul <4 x i32> with b, sub from a.
1553	int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1554	return vmlsq_lane_s32(a, b, v, 0);
1555	}
1556
1557	// CHECK-LABEL: @test_vmls_laneq_s16_0(
1558	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1559	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1560	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1561	// CHECK: ret <4 x i16> [[SUB]]
// Lane-0 variant of the vmls_laneq_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle narrowing v to 4 elements, mul <4 x i16> with b, sub from a.
1562	int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1563	return vmls_laneq_s16(a, b, v, 0);
1564	}
1565
1566	// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
1567	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1568	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1569	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1570	// CHECK: ret <8 x i16> [[SUB]]
// Lane-0 variant of the vmlsq_laneq_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <8 x i16> with b, then sub from a.
1571	int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1572	return vmlsq_laneq_s16(a, b, v, 0);
1573	}
1574
1575	// CHECK-LABEL: @test_vmls_laneq_s32_0(
1576	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1577	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1578	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1579	// CHECK: ret <2 x i32> [[SUB]]
// Lane-0 variant of the vmls_laneq_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle narrowing v to 2 elements, mul <2 x i32> with b, sub from a.
1580	int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1581	return vmls_laneq_s32(a, b, v, 0);
1582	}
1583
1584	// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
1585	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1586	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1587	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1588	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 variant of the vmlsq_laneq_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, mul <4 x i32> with b, then sub from a.
1589	int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1590	return vmlsq_laneq_s32(a, b, v, 0);
1591	}
1592
1593	// CHECK-LABEL: @test_vmul_lane_s16_0(
1594	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1595	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1596	// CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant of the vmul_lane_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, then mul <4 x i16> with a.
1597	int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
1598	return vmul_lane_s16(a, v, 0);
1599	}
1600
1601	// CHECK-LABEL: @test_vmulq_lane_s16_0(
1602	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1603	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1604	// CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant of the vmulq_lane_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle widening v to 8 elements, then mul <8 x i16> with a.
1605	int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
1606	return vmulq_lane_s16(a, v, 0);
1607	}
1608
1609	// CHECK-LABEL: @test_vmul_lane_s32_0(
1610	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1611	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1612	// CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant of the vmul_lane_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, then mul <2 x i32> with a.
1613	int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
1614	return vmul_lane_s32(a, v, 0);
1615	}
1616
1617	// CHECK-LABEL: @test_vmulq_lane_s32_0(
1618	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1619	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1620	// CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant of the vmulq_lane_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle widening v to 4 elements, then mul <4 x i32> with a.
1621	int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
1622	return vmulq_lane_s32(a, v, 0);
1623	}
1624
1625	// CHECK-LABEL: @test_vmul_lane_u16_0(
1626	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1627	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1628	// CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant of the vmul_lane_u16 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle mask on v, then mul <4 x i16> with a.
1629	uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
1630	return vmul_lane_u16(a, v, 0);
1631	}
1632
1633	// CHECK-LABEL: @test_vmulq_lane_u16_0(
1634	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1635	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1636	// CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant of the vmulq_lane_u16 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle widening v to 8 elements, then mul <8 x i16>.
1637	uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
1638	return vmulq_lane_u16(a, v, 0);
1639	}
1640
1641	// CHECK-LABEL: @test_vmul_lane_u32_0(
1642	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1643	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1644	// CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant of the vmul_lane_u32 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle mask on v, then mul <2 x i32> with a.
1645	uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
1646	return vmul_lane_u32(a, v, 0);
1647	}
1648
1649	// CHECK-LABEL: @test_vmulq_lane_u32_0(
1650	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1651	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1652	// CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant of the vmulq_lane_u32 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle widening v to 4 elements, then mul <4 x i32>.
1653	uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
1654	return vmulq_lane_u32(a, v, 0);
1655	}
1656
1657	// CHECK-LABEL: @test_vmul_laneq_s16_0(
1658	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1659	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1660	// CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant of the vmul_laneq_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle narrowing v to 4 elements, then mul <4 x i16> with a.
1661	int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
1662	return vmul_laneq_s16(a, v, 0);
1663	}
1664
1665	// CHECK-LABEL: @test_vmulq_laneq_s16_0(
1666	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1667	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1668	// CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant of the vmulq_laneq_s16 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, then mul <8 x i16> with a.
1669	int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
1670	return vmulq_laneq_s16(a, v, 0);
1671	}
1672
1673	// CHECK-LABEL: @test_vmul_laneq_s32_0(
1674	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1675	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1676	// CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant of the vmul_laneq_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle narrowing v to 2 elements, then mul <2 x i32> with a.
1677	int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
1678	return vmul_laneq_s32(a, v, 0);
1679	}
1680
1681	// CHECK-LABEL: @test_vmulq_laneq_s32_0(
1682	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1683	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1684	// CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant of the vmulq_laneq_s32 codegen test. Expected IR (see above):
// zeroinitializer shuffle mask on v, then mul <4 x i32> with a.
1685	int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
1686	return vmulq_laneq_s32(a, v, 0);
1687	}
1688
1689	// CHECK-LABEL: @test_vmul_laneq_u16_0(
1690	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1691	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1692	// CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant of the vmul_laneq_u16 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle narrowing v to 4 elements, then mul <4 x i16>.
1693	uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
1694	return vmul_laneq_u16(a, v, 0);
1695	}
1696
1697	// CHECK-LABEL: @test_vmulq_laneq_u16_0(
1698	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1699	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1700	// CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant of the vmulq_laneq_u16 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle mask on v, then mul <8 x i16> with a.
1701	uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
1702	return vmulq_laneq_u16(a, v, 0);
1703	}
1704
1705	// CHECK-LABEL: @test_vmul_laneq_u32_0(
1706	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1707	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1708	// CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant of the vmul_laneq_u32 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle narrowing v to 2 elements, then mul <2 x i32>.
1709	uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
1710	return vmul_laneq_u32(a, v, 0);
1711	}
1712
1713	// CHECK-LABEL: @test_vmulq_laneq_u32_0(
1714	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1715	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1716	// CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant of the vmulq_laneq_u32 codegen test (unsigned operands). Expected
// IR (see above): zeroinitializer shuffle mask on v, then mul <4 x i32> with a.
1717	uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
1718	return vmulq_laneq_u32(a, v, 0);
1719	}
1720
1721	// CHECK-LABEL: @test_vfma_lane_f32_0(
1722	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1723	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1724	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1725	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1726	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1727	// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1728	// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1729	// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1730	// CHECK: ret <2 x float> [[FMLA2]]
// Lane-0 case of vfma_lane_f32. The FileCheck directives above expect llvm.fma.v2f32 to
// receive, in order: %b, the zero-mask splat of %v, and %a (each after a bitcast round-trip).
1731	float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1732	return vfma_lane_f32(a, b, v, 0);
1733	}
1734
1735	// CHECK-LABEL: @test_vfmaq_lane_f32_0(
1736	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1737	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1738	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1739	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1740	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1741	// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1742	// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1743	// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1744	// CHECK: ret <4 x float> [[FMLA2]]
// Lane-0 case of vfmaq_lane_f32. The FileCheck directives above expect the zero-mask
// shufflevector to widen %v from <2 x float> to <4 x float>, and llvm.fma.v4f32 to receive,
// in order: %b, that splat, and %a (each after a bitcast round-trip).
1745	float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1746	return vfmaq_lane_f32(a, b, v, 0);
1747	}
1748
1749	// CHECK-LABEL: @test_vfma_laneq_f32_0(
1750	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1751	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1752	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1753	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1754	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1755	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1756	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1757	// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1758	// CHECK: ret <2 x float> [[TMP6]]
// Lane-0 case of vfma_laneq_f32. The FileCheck directives above expect the zero-mask
// shufflevector to narrow %v from <4 x float> to <2 x float>, and llvm.fma.v2f32 to receive,
// in order: that splat, %b, and %a (each after a bitcast round-trip).
1759	float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1760	return vfma_laneq_f32(a, b, v, 0);
1761	}
1762
1763	// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
1764	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1765	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1766	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1767	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1768	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1769	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1770	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1771	// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1772	// CHECK: ret <4 x float> [[TMP6]]
// Lane-0 case of vfmaq_laneq_f32. The FileCheck directives above expect llvm.fma.v4f32 to
// receive, in order: the zero-mask splat of %v, %b, and %a (each after a bitcast round-trip).
1773	float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1774	return vfmaq_laneq_f32(a, b, v, 0);
1775	}
1776
1777	// CHECK-LABEL: @test_vfms_lane_f32_0(
1778	// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1779	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1780	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1781	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1782	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1783	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1784	// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1785	// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1786	// CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1787	// CHECK: ret <2 x float> [[FMLA2]]
// Lane-0 case of vfms_lane_f32. The FileCheck directives above expect %b to be negated first
// (fsub from -0.0), and llvm.fma.v2f32 to receive, in order: the negated %b, the zero-mask
// splat of %v, and %a.
1788	float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1789	return vfms_lane_f32(a, b, v, 0);
1790	}
1791
1792	// CHECK-LABEL: @test_vfmsq_lane_f32_0(
1793	// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1794	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1795	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1796	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1797	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1798	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1799	// CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1800	// CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1801	// CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1802	// CHECK: ret <4 x float> [[FMLA2]]
// Lane-0 case of vfmsq_lane_f32. The FileCheck directives above expect %b to be negated first
// (fsub from -0.0), the zero-mask shufflevector to widen %v from <2 x float> to <4 x float>,
// and llvm.fma.v4f32 to receive, in order: the negated %b, that splat, and %a.
1803	float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1804	return vfmsq_lane_f32(a, b, v, 0);
1805	}
1806
1807	// CHECK-LABEL: @test_vfms_laneq_f32_0(
1808	// CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1809	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1810	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1811	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1812	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1813	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1814	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1815	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1816	// CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1817	// CHECK: ret <2 x float> [[TMP6]]
// Lane-0 case of vfms_laneq_f32. The FileCheck directives above expect %b to be negated first
// (fsub from -0.0), the zero-mask shufflevector to narrow %v from <4 x float> to <2 x float>,
// and llvm.fma.v2f32 to receive, in order: that splat, the negated %b, and %a.
1818	float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1819	return vfms_laneq_f32(a, b, v, 0);
1820	}
1821
1822	// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
1823	// CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1824	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1825	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1826	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1827	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1828	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1829	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1830	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1831	// CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1832	// CHECK: ret <4 x float> [[TMP6]]
// Lane-0 case of vfmsq_laneq_f32. The FileCheck directives above expect %b to be negated first
// (fsub from -0.0), and llvm.fma.v4f32 to receive, in order: the zero-mask splat of %v,
// the negated %b, and %a.
1833	float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1834	return vfmsq_laneq_f32(a, b, v, 0);
1835	}
1836
1837	// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
1838	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1839	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
1840	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1841	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1842	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1843	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1844	// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1845	// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1846	// CHECK: ret <2 x double> [[TMP6]]
// Lane-0 case of vfmaq_laneq_f64. The FileCheck directives above expect llvm.fma.v2f64 to
// receive, in order: the zero-mask splat of %v, %b, and %a (each after a bitcast round-trip).
1847	float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1848	return vfmaq_laneq_f64(a, b, v, 0);
1849	}
1850
1851	// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
1852	// CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
1853	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1854	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
1855	// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1856	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1857	// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1858	// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1859	// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1860	// CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1861	// CHECK: ret <2 x double> [[TMP6]]
// Lane-0 case of vfmsq_laneq_f64. The FileCheck directives above expect %b to be negated first
// (fsub from -0.0), and llvm.fma.v2f64 to receive, in order: the zero-mask splat of %v,
// the negated %b, and %a.
1862	float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1863	return vfmsq_laneq_f64(a, b, v, 0);
1864	}
1865
1866	// CHECK-LABEL: @test_vmlal_lane_s16_0(
1867	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1868	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1869	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1870	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1871	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1872	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_lane_s16. The FileCheck directives above expect smull.v4i32 of %b with
// the zero-mask splat of %v, and the product added to %a.
1873	int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1874	return vmlal_lane_s16(a, b, v, 0);
1875	}
1876
1877	// CHECK-LABEL: @test_vmlal_lane_s32_0(
1878	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1879	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1880	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1881	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1882	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1883	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_lane_s32. The FileCheck directives above expect smull.v2i64 of %b with
// the zero-mask splat of %v, and the product added to %a.
1884	int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1885	return vmlal_lane_s32(a, b, v, 0);
1886	}
1887
1888	// CHECK-LABEL: @test_vmlal_laneq_s16_0(
1889	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1890	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1891	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1892	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1893	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1894	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_laneq_s16. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <8 x i16> to <4 x i16>, smull.v4i32 of %b with that splat,
// and the product added to %a.
1895	int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1896	return vmlal_laneq_s16(a, b, v, 0);
1897	}
1898
1899	// CHECK-LABEL: @test_vmlal_laneq_s32_0(
1900	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1901	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1902	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1903	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1904	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1905	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_laneq_s32. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <4 x i32> to <2 x i32>, smull.v2i64 of %b with that splat,
// and the product added to %a.
1906	int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1907	return vmlal_laneq_s32(a, b, v, 0);
1908	}
1909
1910	// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
1911	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1912	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1913	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1914	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1915	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1916	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1917	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_high_lane_s16. The FileCheck directives above expect elements 4-7 of %b
// to be extracted, multiplied by the zero-mask splat of %v with smull.v4i32, and added to %a.
1918	int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
1919	return vmlal_high_lane_s16(a, b, v, 0);
1920	}
1921
1922	// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
1923	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1924	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1925	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1926	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1927	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1928	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1929	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_high_lane_s32. The FileCheck directives above expect elements 2-3 of %b
// to be extracted, multiplied by the zero-mask splat of %v with smull.v2i64, and added to %a.
1930	int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
1931	return vmlal_high_lane_s32(a, b, v, 0);
1932	}
1933
1934	// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
1935	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1936	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1937	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1938	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1939	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1940	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1941	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_high_laneq_s16. The FileCheck directives above expect elements 4-7 of
// %b to be extracted, multiplied with smull.v4i32 by a zero-mask <4 x i16> splat taken from
// the <8 x i16> %v, and added to %a.
1942	int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
1943	return vmlal_high_laneq_s16(a, b, v, 0);
1944	}
1945
1946	// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
1947	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1948	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1949	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1950	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1951	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1952	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1953	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_high_laneq_s32. The FileCheck directives above expect elements 2-3 of
// %b to be extracted, multiplied with smull.v2i64 by a zero-mask <2 x i32> splat taken from
// the <4 x i32> %v, and added to %a.
1954	int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
1955	return vmlal_high_laneq_s32(a, b, v, 0);
1956	}
1957
1958	// CHECK-LABEL: @test_vmlsl_lane_s16_0(
1959	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1960	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1961	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1962	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1963	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1964	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_lane_s16. The FileCheck directives above expect smull.v4i32 of %b with
// the zero-mask splat of %v, and the product subtracted from %a.
1965	int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1966	return vmlsl_lane_s16(a, b, v, 0);
1967	}
1968
1969	// CHECK-LABEL: @test_vmlsl_lane_s32_0(
1970	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1971	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1972	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1973	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1974	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1975	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_lane_s32. The FileCheck directives above expect smull.v2i64 of %b with
// the zero-mask splat of %v, and the product subtracted from %a.
1976	int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1977	return vmlsl_lane_s32(a, b, v, 0);
1978	}
1979
1980	// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
1981	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1982	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1983	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1984	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1985	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1986	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_laneq_s16. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <8 x i16> to <4 x i16>, smull.v4i32 of %b with that splat,
// and the product subtracted from %a.
1987	int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1988	return vmlsl_laneq_s16(a, b, v, 0);
1989	}
1990
1991	// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
1992	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1993	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1994	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1995	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1996	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1997	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_laneq_s32. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <4 x i32> to <2 x i32>, smull.v2i64 of %b with that splat,
// and the product subtracted from %a.
1998	int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1999	return vmlsl_laneq_s32(a, b, v, 0);
2000	}
2001
2002	// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2003	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2004	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2005	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2006	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2007	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2008	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2009	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_high_lane_s16. The FileCheck directives above expect elements 4-7 of %b
// to be extracted, multiplied by the zero-mask splat of %v with smull.v4i32, and the product
// subtracted from %a.
2010	int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2011	return vmlsl_high_lane_s16(a, b, v, 0);
2012	}
2013
2014	// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2015	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2016	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2017	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2018	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2019	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2020	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2021	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_high_lane_s32. The FileCheck directives above expect elements 2-3 of %b
// to be extracted, multiplied by the zero-mask splat of %v with smull.v2i64, and the product
// subtracted from %a.
2022	int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2023	return vmlsl_high_lane_s32(a, b, v, 0);
2024	}
2025
2026	// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2027	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2028	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2029	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2030	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2031	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2032	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2033	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_high_laneq_s16. The FileCheck directives above expect elements 4-7 of
// %b to be extracted, multiplied with smull.v4i32 by a zero-mask <4 x i16> splat taken from
// the <8 x i16> %v, and the product subtracted from %a.
2034	int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2035	return vmlsl_high_laneq_s16(a, b, v, 0);
2036	}
2037
2038	// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2039	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2040	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2041	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2042	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2043	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2044	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2045	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_high_laneq_s32. The FileCheck directives above expect elements 2-3 of
// %b to be extracted, multiplied with smull.v2i64 by a zero-mask <2 x i32> splat taken from
// the <4 x i32> %v, and the product subtracted from %a.
2046	int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2047	return vmlsl_high_laneq_s32(a, b, v, 0);
2048	}
2049
2050	// CHECK-LABEL: @test_vmlal_lane_u16_0(
2051	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2052	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2053	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2054	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2055	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2056	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_lane_u16. The FileCheck directives above expect umull.v4i32 of %b with
// the zero-mask splat of %v, and the product added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2057	int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2058	return vmlal_lane_u16(a, b, v, 0);
2059	}
2060
2061	// CHECK-LABEL: @test_vmlal_lane_u32_0(
2062	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2063	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2064	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2065	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2066	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2067	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_lane_u32. The FileCheck directives above expect umull.v2i64 of %b with
// the zero-mask splat of %v, and the product added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2068	int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2069	return vmlal_lane_u32(a, b, v, 0);
2070	}
2071
2072	// CHECK-LABEL: @test_vmlal_laneq_u16_0(
2073	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2074	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2075	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2076	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2077	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2078	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_laneq_u16. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <8 x i16> to <4 x i16>, umull.v4i32 of %b with that splat,
// and the product added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2079	int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2080	return vmlal_laneq_u16(a, b, v, 0);
2081	}
2082
2083	// CHECK-LABEL: @test_vmlal_laneq_u32_0(
2084	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2085	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2086	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2087	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2088	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2089	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_laneq_u32. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <4 x i32> to <2 x i32>, umull.v2i64 of %b with that splat,
// and the product added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2090	int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2091	return vmlal_laneq_u32(a, b, v, 0);
2092	}
2093
2094	// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2095	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2096	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2097	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2098	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2099	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2100	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2101	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_high_lane_u16. The FileCheck directives above expect elements 4-7 of %b
// to be extracted, multiplied by the zero-mask splat of %v with umull.v4i32, and added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2102	int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2103	return vmlal_high_lane_u16(a, b, v, 0);
2104	}
2105
2106	// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2107	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2108	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2109	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2110	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2111	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2112	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2113	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_high_lane_u32. The FileCheck directives above expect elements 2-3 of %b
// to be extracted, multiplied by the zero-mask splat of %v with umull.v2i64, and added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2114	int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2115	return vmlal_high_lane_u32(a, b, v, 0);
2116	}
2117
2118	// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2119	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2120	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2121	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2122	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2123	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2124	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2125	// CHECK: ret <4 x i32> [[ADD]]
// Lane-0 case of vmlal_high_laneq_u16. The FileCheck directives above expect elements 4-7 of
// %b to be extracted, multiplied with umull.v4i32 by a zero-mask <4 x i16> splat taken from
// the <8 x i16> %v, and added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2126	int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2127	return vmlal_high_laneq_u16(a, b, v, 0);
2128	}
2129
2130	// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2131	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2132	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2133	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2134	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2135	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2136	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2137	// CHECK: ret <2 x i64> [[ADD]]
// Lane-0 case of vmlal_high_laneq_u32. The FileCheck directives above expect elements 2-3 of
// %b to be extracted, multiplied with umull.v2i64 by a zero-mask <2 x i32> splat taken from
// the <4 x i32> %v, and added to %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2138	int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2139	return vmlal_high_laneq_u32(a, b, v, 0);
2140	}
2141
2142	// CHECK-LABEL: @test_vmlsl_lane_u16_0(
2143	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2144	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2145	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2146	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2147	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2148	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_lane_u16. The FileCheck directives above expect umull.v4i32 of %b with
// the zero-mask splat of %v, and the product subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2149	int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2150	return vmlsl_lane_u16(a, b, v, 0);
2151	}
2152
2153	// CHECK-LABEL: @test_vmlsl_lane_u32_0(
2154	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2155	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2156	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2157	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2158	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2159	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_lane_u32. The FileCheck directives above expect umull.v2i64 of %b with
// the zero-mask splat of %v, and the product subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2160	int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2161	return vmlsl_lane_u32(a, b, v, 0);
2162	}
2163
2164	// CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2165	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2166	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2167	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2168	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2169	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2170	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_laneq_u16. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <8 x i16> to <4 x i16>, umull.v4i32 of %b with that splat,
// and the product subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2171	int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2172	return vmlsl_laneq_u16(a, b, v, 0);
2173	}
2174
2175	// CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2176	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2177	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2178	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2179	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2180	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2181	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_laneq_u32. The FileCheck directives above expect a zero-mask
// shufflevector narrowing %v from <4 x i32> to <2 x i32>, umull.v2i64 of %b with that splat,
// and the product subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2182	int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2183	return vmlsl_laneq_u32(a, b, v, 0);
2184	}
2185
2186	// CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2187	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2188	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2189	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2190	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2191	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2192	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2193	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_high_lane_u16. The FileCheck directives above expect elements 4-7 of %b
// to be extracted, multiplied by the zero-mask splat of %v with umull.v4i32, and the product
// subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2194	int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2195	return vmlsl_high_lane_u16(a, b, v, 0);
2196	}
2197
2198	// CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2199	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2200	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2201	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2202	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2203	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2204	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2205	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_high_lane_u32. The FileCheck directives above expect elements 2-3 of %b
// to be extracted, multiplied by the zero-mask splat of %v with umull.v2i64, and the product
// subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2206	int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2207	return vmlsl_high_lane_u32(a, b, v, 0);
2208	}
2209
2210	// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2211	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2212	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2213	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2214	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2215	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2216	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2217	// CHECK: ret <4 x i32> [[SUB]]
// Lane-0 case of vmlsl_high_laneq_u16. The FileCheck directives above expect elements 4-7 of
// %b to be extracted, multiplied with umull.v4i32 by a zero-mask <4 x i16> splat taken from
// the <8 x i16> %v, and the product subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u16 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2218	int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2219	return vmlsl_high_laneq_u16(a, b, v, 0);
2220	}
2221
2222	// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2223	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2224	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2225	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2226	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2227	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2228	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2229	// CHECK: ret <2 x i64> [[SUB]]
// Lane-0 case of vmlsl_high_laneq_u32. The FileCheck directives above expect elements 2-3 of
// %b to be extracted, multiplied with umull.v2i64 by a zero-mask <2 x i32> splat taken from
// the <4 x i32> %v, and the product subtracted from %a.
// NOTE(review): a, b, v and the return value are declared with signed vector types although
// the intrinsic is the unsigned _u32 form; presumably this relies on implicit vector
// conversions being enabled - confirm this is intended.
2230	int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2231	return vmlsl_high_laneq_u32(a, b, v, 0);
2232	}
2233
2234	// CHECK-LABEL: @test_vmull_lane_s16_0(
2235	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2236	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2237	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2238	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2239	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector of v feeding
// llvm.aarch64.neon.smull.v4i32 together with a.
2240	int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2241	return vmull_lane_s16(a, v, 0);
2242	}
2243
2244	// CHECK-LABEL: @test_vmull_lane_s32_0(
2245	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2246	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2247	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2248	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2249	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector of v feeding
// llvm.aarch64.neon.smull.v2i64 together with a.
2250	int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2251	return vmull_lane_s32(a, v, 0);
2252	}
2253
2254	// CHECK-LABEL: @test_vmull_lane_u16_0(
2255	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2256	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2257	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2258	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2259	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_lane_u16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector of v feeding
// llvm.aarch64.neon.umull.v4i32 together with a.
2260	uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2261	return vmull_lane_u16(a, v, 0);
2262	}
2263
2264	// CHECK-LABEL: @test_vmull_lane_u32_0(
2265	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2266	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2267	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2268	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2269	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_lane_u32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector of v feeding
// llvm.aarch64.neon.umull.v2i64 together with a.
2270	uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2271	return vmull_lane_u32(a, v, 0);
2272	}
2273
2274	// CHECK-LABEL: @test_vmull_high_lane_s16_0(
2275	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2276	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2277	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2278	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2279	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2280	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_high_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.smull.v4i32.
2281	int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2282	return vmull_high_lane_s16(a, v, 0);
2283	}
2284
2285	// CHECK-LABEL: @test_vmull_high_lane_s32_0(
2286	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2287	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2288	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2289	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2290	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2291	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_high_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.smull.v2i64.
2292	int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2293	return vmull_high_lane_s32(a, v, 0);
2294	}
2295
2296	// CHECK-LABEL: @test_vmull_high_lane_u16_0(
2297	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2298	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2299	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2300	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2301	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2302	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_high_lane_u16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.umull.v4i32.
2303	uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2304	return vmull_high_lane_u16(a, v, 0);
2305	}
2306
2307	// CHECK-LABEL: @test_vmull_high_lane_u32_0(
2308	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2309	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2310	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2311	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2312	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2313	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_high_lane_u32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.umull.v2i64.
2314	uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2315	return vmull_high_lane_u32(a, v, 0);
2316	}
2317
2318	// CHECK-LABEL: @test_vmull_laneq_s16_0(
2319	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2320	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2321	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2322	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2323	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_laneq_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector narrowing the
// <8 x i16> v to <4 x i16>, feeding llvm.aarch64.neon.smull.v4i32 with a.
2324	int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2325	return vmull_laneq_s16(a, v, 0);
2326	}
2327
2328	// CHECK-LABEL: @test_vmull_laneq_s32_0(
2329	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2330	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2331	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2332	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2333	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_laneq_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector narrowing the
// <4 x i32> v to <2 x i32>, feeding llvm.aarch64.neon.smull.v2i64 with a.
2334	int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2335	return vmull_laneq_s32(a, v, 0);
2336	}
2337
2338	// CHECK-LABEL: @test_vmull_laneq_u16_0(
2339	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2340	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2341	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2342	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2343	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_laneq_u16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector narrowing the
// <8 x i16> v to <4 x i16>, feeding llvm.aarch64.neon.umull.v4i32 with a.
2344	uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2345	return vmull_laneq_u16(a, v, 0);
2346	}
2347
2348	// CHECK-LABEL: @test_vmull_laneq_u32_0(
2349	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2350	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2351	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2352	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2353	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_laneq_u32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shufflevector narrowing the
// <4 x i32> v to <2 x i32>, feeding llvm.aarch64.neon.umull.v2i64 with a.
2354	uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2355	return vmull_laneq_u32(a, v, 0);
2356	}
2357
2358	// CHECK-LABEL: @test_vmull_high_laneq_s16_0(
2359	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2360	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2361	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2362	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2363	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2364	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_high_laneq_s16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.smull.v4i32.
2365	int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2366	return vmull_high_laneq_s16(a, v, 0);
2367	}
2368
2369	// CHECK-LABEL: @test_vmull_high_laneq_s32_0(
2370	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2371	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2372	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2373	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2374	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2375	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_high_laneq_s32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.smull.v2i64.
2376	int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2377	return vmull_high_laneq_s32(a, v, 0);
2378	}
2379
2380	// CHECK-LABEL: @test_vmull_high_laneq_u16_0(
2381	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2382	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2383	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2384	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2385	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2386	// CHECK: ret <4 x i32> [[VMULL2_I]]
// Lane-0 case for vmull_high_laneq_u16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.umull.v4i32.
2387	uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2388	return vmull_high_laneq_u16(a, v, 0);
2389	}
2390
2391	// CHECK-LABEL: @test_vmull_high_laneq_u32_0(
2392	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2393	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2394	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2395	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2396	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2397	// CHECK: ret <2 x i64> [[VMULL2_I]]
// Lane-0 case for vmull_high_laneq_u32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.umull.v2i64.
2398	uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2399	return vmull_high_laneq_u32(a, v, 0);
2400	}
2401
2402	// CHECK-LABEL: @test_vqdmlal_lane_s16_0(
2403	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2404	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2405	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2406	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2407	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2408	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2409	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Lane-0 case for vqdmlal_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v, then
// llvm.aarch64.neon.sqdmull.v4i32 on b and the shuffle, then
// llvm.aarch64.neon.sqadd.v4i32 of a with that product.
2410	int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2411	return vqdmlal_lane_s16(a, b, v, 0);
2412	}
2413
2414	// CHECK-LABEL: @test_vqdmlal_lane_s32_0(
2415	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2416	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2417	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2418	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2419	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2420	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2421	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Lane-0 case for vqdmlal_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v, then
// llvm.aarch64.neon.sqdmull.v2i64 on b and the shuffle, then
// llvm.aarch64.neon.sqadd.v2i64 of a with that product.
2422	int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2423	return vqdmlal_lane_s32(a, b, v, 0);
2424	}
2425
2426	// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
2427	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2428	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2429	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2430	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2431	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2432	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2433	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2434	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Lane-0 case for vqdmlal_high_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of b and a
// zero-mask shuffle of v feed llvm.aarch64.neon.sqdmull.v4i32, whose result is
// combined with a by llvm.aarch64.neon.sqadd.v4i32.
2435	int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2436	return vqdmlal_high_lane_s16(a, b, v, 0);
2437	}
2438
2439	// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
2440	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2441	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2442	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2443	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2444	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2445	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2446	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2447	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Lane-0 case for vqdmlal_high_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of b and a
// zero-mask shuffle of v feed llvm.aarch64.neon.sqdmull.v2i64, whose result is
// combined with a by llvm.aarch64.neon.sqadd.v2i64.
2448	int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2449	return vqdmlal_high_lane_s32(a, b, v, 0);
2450	}
2451
2452	// CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
2453	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2454	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2455	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2456	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2457	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2458	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2459	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Lane-0 case for vqdmlsl_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v, then
// llvm.aarch64.neon.sqdmull.v4i32 on b and the shuffle, then
// llvm.aarch64.neon.sqsub.v4i32 of a with that product.
2460	int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2461	return vqdmlsl_lane_s16(a, b, v, 0);
2462	}
2463
2464	// CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
2465	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2466	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2467	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2468	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2469	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2470	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2471	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Lane-0 case for vqdmlsl_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v, then
// llvm.aarch64.neon.sqdmull.v2i64 on b and the shuffle, then
// llvm.aarch64.neon.sqsub.v2i64 of a with that product.
2472	int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2473	return vqdmlsl_lane_s32(a, b, v, 0);
2474	}
2475
2476	// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
2477	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2478	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2479	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2480	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2481	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2482	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2483	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2484	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Lane-0 case for vqdmlsl_high_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of b and a
// zero-mask shuffle of v feed llvm.aarch64.neon.sqdmull.v4i32, whose result is
// combined with a by llvm.aarch64.neon.sqsub.v4i32.
2485	int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2486	return vqdmlsl_high_lane_s16(a, b, v, 0);
2487	}
2488
2489	// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
2490	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2491	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2492	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2493	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2494	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2495	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2496	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2497	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Lane-0 case for vqdmlsl_high_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of b and a
// zero-mask shuffle of v feed llvm.aarch64.neon.sqdmull.v2i64, whose result is
// combined with a by llvm.aarch64.neon.sqsub.v2i64.
2498	int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2499	return vqdmlsl_high_lane_s32(a, b, v, 0);
2500	}
2501
2502	// CHECK-LABEL: @test_vqdmull_lane_s16_0(
2503	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2504	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2505	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2506	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2507	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2508	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.sqdmull.v4i32 with a; that call's result is returned.
2509	int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2510	return vqdmull_lane_s16(a, v, 0);
2511	}
2512
2513	// CHECK-LABEL: @test_vqdmull_lane_s32_0(
2514	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2515	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2516	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2517	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2518	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2519	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.sqdmull.v2i64 with a; that call's result is returned.
2520	int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2521	return vqdmull_lane_s32(a, v, 0);
2522	}
2523
2524	// CHECK-LABEL: @test_vqdmull_laneq_s16_0(
2525	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2526	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2527	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2528	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2529	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2530	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_laneq_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle narrowing the <8 x i16> v
// to <4 x i16>, feeding llvm.aarch64.neon.sqdmull.v4i32 with a.
2531	int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2532	return vqdmull_laneq_s16(a, v, 0);
2533	}
2534
2535	// CHECK-LABEL: @test_vqdmull_laneq_s32_0(
2536	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2537	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2538	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2539	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2540	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2541	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_laneq_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle narrowing the <4 x i32> v
// to <2 x i32>, feeding llvm.aarch64.neon.sqdmull.v2i64 with a.
2542	int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2543	return vqdmull_laneq_s32(a, v, 0);
2544	}
2545
2546	// CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
2547	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2548	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2549	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2550	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2551	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2552	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2553	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_high_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.sqdmull.v4i32.
2554	int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2555	return vqdmull_high_lane_s16(a, v, 0);
2556	}
2557
2558	// CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
2559	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2560	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2561	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2562	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2563	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2564	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2565	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_high_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.sqdmull.v2i64.
2566	int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2567	return vqdmull_high_lane_s32(a, v, 0);
2568	}
2569
2570	// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
2571	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2572	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2573	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2574	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2575	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2576	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2577	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_high_laneq_s16: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 4-7 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.sqdmull.v4i32.
2578	int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2579	return vqdmull_high_laneq_s16(a, v, 0);
2580	}
2581
2582	// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
2583	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2584	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2585	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2586	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2587	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2588	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2589	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Lane-0 case for vqdmull_high_laneq_s32: passes the constant lane index 0.
// Expected IR (directives above): a shuffle selecting elements 2-3 of a and a
// zero-mask shuffle of v, both feeding llvm.aarch64.neon.sqdmull.v2i64.
2590	int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2591	return vqdmull_high_laneq_s32(a, v, 0);
2592	}
2593
2594	// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
2595	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2596	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2597	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2598	// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2599	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
2600	// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
// Lane-0 case for vqdmulh_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.sqdmulh.v4i16 with a.
2601	int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2602	return vqdmulh_lane_s16(a, v, 0);
2603	}
2604
2605	// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
2606	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2607	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2608	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2609	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
2610	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
2611	// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
// Lane-0 case for vqdmulhq_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <4 x i16> v
// to <8 x i16>, feeding llvm.aarch64.neon.sqdmulh.v8i16 with a.
2612	int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2613	return vqdmulhq_lane_s16(a, v, 0);
2614	}
2615
2616	// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
2617	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2618	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2619	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2620	// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2621	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
2622	// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
// Lane-0 case for vqdmulh_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.sqdmulh.v2i32 with a.
2623	int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2624	return vqdmulh_lane_s32(a, v, 0);
2625	}
2626
2627	// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
2628	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2629	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2630	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2631	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
2632	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
2633	// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
// Lane-0 case for vqdmulhq_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <2 x i32> v
// to <4 x i32>, feeding llvm.aarch64.neon.sqdmulh.v4i32 with a.
2634	int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2635	return vqdmulhq_lane_s32(a, v, 0);
2636	}
2637
2638	// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
2639	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2640	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2641	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2642	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2643	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
2644	// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Lane-0 case for vqrdmulh_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.sqrdmulh.v4i16 with a.
2645	int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2646	return vqrdmulh_lane_s16(a, v, 0);
2647	}
2648
2649	// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
2650	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2651	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2652	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2653	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
2654	// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Lane-0 case for vqrdmulhq_lane_s16: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <4 x i16> v
// to <8 x i16>, feeding llvm.aarch64.neon.sqrdmulh.v8i16 with a.
2655	int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2656	return vqrdmulhq_lane_s16(a, v, 0);
2657	}
2658
2659	// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
2660	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2661	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2662	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2663	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2664	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
2665	// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Lane-0 case for vqrdmulh_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.sqrdmulh.v2i32 with a.
2666	int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2667	return vqrdmulh_lane_s32(a, v, 0);
2668	}
2669
2670	// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
2671	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2672	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2673	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2674	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
2675	// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Lane-0 case for vqrdmulhq_lane_s32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <2 x i32> v
// to <4 x i32>, feeding llvm.aarch64.neon.sqrdmulh.v4i32 with a.
2676	int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2677	return vqrdmulhq_lane_s32(a, v, 0);
2678	}
2679
2680	// CHECK-LABEL: @test_vmul_lane_f32_0(
2681	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2682	// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2683	// CHECK: ret <2 x float> [[MUL]]
// Lane-0 case for vmul_lane_f32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v followed by a plain
// fmul of a with that shuffle on <2 x float>.
2684	float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
2685	return vmul_lane_f32(a, v, 0);
2686	}
2687
2688	// CHECK-LABEL: @test_vmulq_lane_f32_0(
2689	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2690	// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2691	// CHECK: ret <4 x float> [[MUL]]
// Lane-0 case for vmulq_lane_f32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <2 x float> v
// to <4 x float>, followed by a plain fmul with a.
2692	float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
2693	return vmulq_lane_f32(a, v, 0);
2694	}
2695
2696	// CHECK-LABEL: @test_vmul_laneq_f32_0(
2697	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2698	// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2699	// CHECK: ret <2 x float> [[MUL]]
// Lane-0 case for vmul_laneq_f32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle narrowing the <4 x float> v
// to <2 x float>, followed by a plain fmul with a.
2700	float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
2701	return vmul_laneq_f32(a, v, 0);
2702	}
2703
2704	// CHECK-LABEL: @test_vmul_laneq_f64_0(
2705	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
2706	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
2707	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
2708	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2709	// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
2710	// CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
2711	// CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
2712	// CHECK: ret <1 x double> [[TMP5]]
// Lane-0 case for vmul_laneq_f64: passes the constant lane index 0.
// Expected IR (directives above) has no shufflevector: a is bitcast to a scalar
// double, element 0 is taken from v with extractelement, the two are multiplied
// with a scalar fmul, and the product is bitcast back to <1 x double>.
2713	float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
2714	return vmul_laneq_f64(a, v, 0);
2715	}
2716
2717	// CHECK-LABEL: @test_vmulq_laneq_f32_0(
2718	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2719	// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2720	// CHECK: ret <4 x float> [[MUL]]
// Lane-0 case for vmulq_laneq_f32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v followed by a plain
// fmul of a with that shuffle on <4 x float>.
2721	float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
2722	return vmulq_laneq_f32(a, v, 0);
2723	}
2724
2725	// CHECK-LABEL: @test_vmulq_laneq_f64_0(
2726	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2727	// CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
2728	// CHECK: ret <2 x double> [[MUL]]
// Lane-0 case for vmulq_laneq_f64: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v followed by a plain
// fmul of a with that shuffle on <2 x double>.
2729	float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
2730	return vmulq_laneq_f64(a, v, 0);
2731	}
2732
2733	// CHECK-LABEL: @test_vmulx_lane_f32_0(
2734	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2735	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2736	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2737	// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
2738	// CHECK: ret <2 x float> [[VMULX2_I]]
// Lane-0 case for vmulx_lane_f32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle of v feeding
// llvm.aarch64.neon.fmulx.v2f32 with a.
2739	float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
2740	return vmulx_lane_f32(a, v, 0);
2741	}
2742
2743	// CHECK-LABEL: @test_vmulxq_lane_f32_0(
2744	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2745	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2746	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2747	// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
2748	// CHECK: ret <4 x float> [[VMULX2_I]]
// Lane-0 case for vmulxq_lane_f32: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <2 x float> v
// to <4 x float>, feeding llvm.aarch64.neon.fmulx.v4f32 with a.
2749	float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
2750	return vmulxq_lane_f32(a, v, 0);
2751	}
2752
2753	// CHECK-LABEL: @test_vmulxq_lane_f64_0(
2754	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
2755	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2756	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2757	// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
2758	// CHECK: ret <2 x double> [[VMULX2_I]]
// Lane-0 case for vmulxq_lane_f64: passes the constant lane index 0.
// Expected IR (directives above): zero-mask shuffle widening the <1 x double> v
// to <2 x double>, feeding llvm.aarch64.neon.fmulx.v2f64 with a.
2759	float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
2760	return vmulxq_lane_f64(a, v, 0);
2761	}
2762
2763	// CHECK-LABEL: @test_vmulx_laneq_f32_0(
2764	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2765	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2766	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2767	// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
2768	// CHECK: ret <2 x float> [[VMULX2_I]]
2769	float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {  // Codegen test: vmulx_laneq_f32 with lane index 0.
2770	return vmulx_laneq_f32(a, v, 0);  // Expected IR above: zero-mask shuffle of <4 x float> %v into 2 lanes, then fmulx.v2f32(%a, shuffle).
2771	}
2772
2773	// CHECK-LABEL: @test_vmulxq_laneq_f32_0(
2774	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2775	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2776	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2777	// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
2778	// CHECK: ret <4 x float> [[VMULX2_I]]
2779	float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {  // Codegen test: vmulxq_laneq_f32 with lane index 0.
2780	return vmulxq_laneq_f32(a, v, 0);  // Expected IR above: zero-mask shuffle of <4 x float> %v, then fmulx.v4f32(%a, shuffle).
2781	}
2782
2783	// CHECK-LABEL: @test_vmulxq_laneq_f64_0(
2784	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2785	// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2786	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2787	// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
2788	// CHECK: ret <2 x double> [[VMULX2_I]]
2789	float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {  // Codegen test: vmulxq_laneq_f64 with lane index 0.
2790	return vmulxq_laneq_f64(a, v, 0);  // Expected IR above: zero-mask shuffle of <2 x double> %v, then fmulx.v2f64(%a, shuffle).
2791	}
2792
2793	// CHECK-LABEL: @test_vmull_high_n_s16(
2794	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2795	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2796	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2797	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2798	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2799	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2800	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2801	// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2802	// CHECK: ret <4 x i32> [[VMULL5_I_I]]
2803	int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {  // Codegen test: vmull_high_n_s16.
2804	return vmull_high_n_s16(a, b);  // Expected IR above: smull.v4i32 on lanes 4-7 of %a and a 4-lane vector filled with %b.
2805	}
2806
2807	// CHECK-LABEL: @test_vmull_high_n_s32(
2808	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2809	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2810	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2811	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2812	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2813	// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2814	// CHECK: ret <2 x i64> [[VMULL3_I_I]]
2815	int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {  // Codegen test: vmull_high_n_s32.
2816	return vmull_high_n_s32(a, b);  // Expected IR above: smull.v2i64 on lanes 2-3 of %a and a 2-lane vector filled with %b.
2817	}
2818
2819	// CHECK-LABEL: @test_vmull_high_n_u16(
2820	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2821	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2822	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2823	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2824	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2825	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2826	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2827	// CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2828	// CHECK: ret <4 x i32> [[VMULL5_I_I]]
2829	uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {  // Codegen test: vmull_high_n_u16.
2830	return vmull_high_n_u16(a, b);  // Expected IR above: umull.v4i32 on lanes 4-7 of %a and a 4-lane vector filled with %b.
2831	}
2832
2833	// CHECK-LABEL: @test_vmull_high_n_u32(
2834	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2835	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2836	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2837	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2838	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2839	// CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2840	// CHECK: ret <2 x i64> [[VMULL3_I_I]]
2841	uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {  // Codegen test: vmull_high_n_u32.
2842	return vmull_high_n_u32(a, b);  // Expected IR above: umull.v2i64 on lanes 2-3 of %a and a 2-lane vector filled with %b.
2843	}
2844
2845	// CHECK-LABEL: @test_vqdmull_high_n_s16(
2846	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2847	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2848	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2849	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2850	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2851	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2852	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2853	// CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2854	// CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
2855	// CHECK: ret <4 x i32> [[VQDMULL_V5_I_I]]
2856	int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {  // Codegen test: vqdmull_high_n_s16.
2857	return vqdmull_high_n_s16(a, b);  // Expected IR above: sqdmull.v4i32 on lanes 4-7 of %a and a 4-lane vector filled with %b.
2858	}
2859
2860	// CHECK-LABEL: @test_vqdmull_high_n_s32(
2861	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2862	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2863	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2864	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2865	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2866	// CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2867	// CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
2868	// CHECK: ret <2 x i64> [[VQDMULL_V3_I_I]]
2869	int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {  // Codegen test: vqdmull_high_n_s32.
2870	return vqdmull_high_n_s32(a, b);  // Expected IR above: sqdmull.v2i64 on lanes 2-3 of %a and a 2-lane vector filled with %b.
2871	}
2872
2873	// CHECK-LABEL: @test_vmlal_high_n_s16(
2874	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2875	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2876	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2877	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2878	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2879	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2880	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2881	// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2882	// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2883	// CHECK: ret <4 x i32> [[ADD_I_I]]
2884	int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {  // Codegen test: vmlal_high_n_s16.
2885	return vmlal_high_n_s16(a, b, c);  // Expected IR above: smull.v4i32 on lanes 4-7 of %b and a vector filled with %c, result added to %a.
2886	}
2887
2888	// CHECK-LABEL: @test_vmlal_high_n_s32(
2889	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2890	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2891	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2892	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2893	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2894	// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2895	// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2896	// CHECK: ret <2 x i64> [[ADD_I_I]]
2897	int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {  // Codegen test: vmlal_high_n_s32.
2898	return vmlal_high_n_s32(a, b, c);  // Expected IR above: smull.v2i64 on lanes 2-3 of %b and a vector filled with %c, result added to %a.
2899	}
2900
2901	// CHECK-LABEL: @test_vmlal_high_n_u16(
2902	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2903	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2904	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2905	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2906	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2907	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2908	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2909	// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2910	// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2911	// CHECK: ret <4 x i32> [[ADD_I_I]]
2912	uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {  // Codegen test: vmlal_high_n_u16.
2913	return vmlal_high_n_u16(a, b, c);  // Expected IR above: umull.v4i32 on lanes 4-7 of %b and a vector filled with %c, result added to %a.
2914	}
2915
2916	// CHECK-LABEL: @test_vmlal_high_n_u32(
2917	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2918	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2919	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2920	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2921	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2922	// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2923	// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2924	// CHECK: ret <2 x i64> [[ADD_I_I]]
2925	uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {  // Codegen test: vmlal_high_n_u32.
2926	return vmlal_high_n_u32(a, b, c);  // Expected IR above: umull.v2i64 on lanes 2-3 of %b and a vector filled with %c, result added to %a.
2927	}
2928
2929	// CHECK-LABEL: @test_vqdmlal_high_n_s16(
2930	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2931	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2932	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2933	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2934	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2935	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2936	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2937	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2938	// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2939	// CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
2940	// CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]]
2941	int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {  // Codegen test: vqdmlal_high_n_s16.
2942	return vqdmlal_high_n_s16(a, b, c);  // Expected IR above: sqdmull.v4i32 on lanes 4-7 of %b and a vector filled with %c, then sqadd.v4i32 with %a.
2943	}
2944
2945	// CHECK-LABEL: @test_vqdmlal_high_n_s32(
2946	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2947	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2948	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2949	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2950	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2951	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2952	// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2953	// CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
2954	// CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]]
2955	int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {  // Codegen test: vqdmlal_high_n_s32.
2956	return vqdmlal_high_n_s32(a, b, c);  // Expected IR above: sqdmull.v2i64 on lanes 2-3 of %b and a vector filled with %c, then sqadd.v2i64 with %a.
2957	}
2958
2959	// CHECK-LABEL: @test_vmlsl_high_n_s16(
2960	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2961	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2962	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2963	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2964	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2965	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2966	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2967	// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2968	// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2969	// CHECK: ret <4 x i32> [[SUB_I_I]]
2970	int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {  // Codegen test: vmlsl_high_n_s16.
2971	return vmlsl_high_n_s16(a, b, c);  // Expected IR above: smull.v4i32 on lanes 4-7 of %b and a vector filled with %c, result subtracted from %a.
2972	}
2973
2974	// CHECK-LABEL: @test_vmlsl_high_n_s32(
2975	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2976	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2977	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2978	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2979	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2980	// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2981	// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
2982	// CHECK: ret <2 x i64> [[SUB_I_I]]
2983	int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {  // Codegen test: vmlsl_high_n_s32.
2984	return vmlsl_high_n_s32(a, b, c);  // Expected IR above: smull.v2i64 on lanes 2-3 of %b and a vector filled with %c, result subtracted from %a.
2985	}
2986
2987	// CHECK-LABEL: @test_vmlsl_high_n_u16(
2988	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2989	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2990	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2991	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2992	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2993	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2994	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2995	// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2996	// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2997	// CHECK: ret <4 x i32> [[SUB_I_I]]
2998	uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {  // Codegen test: vmlsl_high_n_u16.
2999	return vmlsl_high_n_u16(a, b, c);  // Expected IR above: umull.v4i32 on lanes 4-7 of %b and a vector filled with %c, result subtracted from %a.
3000	}
3001
3002	// CHECK-LABEL: @test_vmlsl_high_n_u32(
3003	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3004	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3005	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3006	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3007	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3008	// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
3009	// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
3010	// CHECK: ret <2 x i64> [[SUB_I_I]]
3011	uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {  // Codegen test: vmlsl_high_n_u32.
3012	return vmlsl_high_n_u32(a, b, c);  // Expected IR above: umull.v2i64 on lanes 2-3 of %b and a vector filled with %c, result subtracted from %a.
3013	}
3014
3015	// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
3016	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3017	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3018	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3019	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3020	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3021	// CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3022	// CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3023	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3024	// CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
3025	// CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
3026	// CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]]
3027	int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {  // Codegen test: vqdmlsl_high_n_s16.
3028	return vqdmlsl_high_n_s16(a, b, c);  // Expected IR above: sqdmull.v4i32 on lanes 4-7 of %b and a vector filled with %c, then sqsub.v4i32 from %a.
3029	}
3030
3031	// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
3032	// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3033	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3034	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3035	// CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3036	// CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3037	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3038	// CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
3039	// CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
3040	// CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]]
3041	int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {  // Codegen test: vqdmlsl_high_n_s32.
3042	return vqdmlsl_high_n_s32(a, b, c);  // Expected IR above: sqdmull.v2i64 on lanes 2-3 of %b and a vector filled with %c, then sqsub.v2i64 from %a.
3043	}
3044
3045	// CHECK-LABEL: @test_vmul_n_f32(
3046	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
3047	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
3048	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
3049	// CHECK: ret <2 x float> [[MUL_I]]
3050	float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {  // Codegen test: vmul_n_f32.
3051	return vmul_n_f32(a, b);  // Expected IR above: %b inserted into both lanes of a <2 x float>, then fmul with %a.
3052	}
3053
3054	// CHECK-LABEL: @test_vmulq_n_f32(
3055	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
3056	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
3057	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
3058	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
3059	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
3060	// CHECK: ret <4 x float> [[MUL_I]]
3061	float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {  // Codegen test: vmulq_n_f32.
3062	return vmulq_n_f32(a, b);  // Expected IR above: %b inserted into all 4 lanes of a <4 x float>, then fmul with %a.
3063	}
3064
3065	// CHECK-LABEL: @test_vmulq_n_f64(
3066	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
3067	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
3068	// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
3069	// CHECK: ret <2 x double> [[MUL_I]]
3070	float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {  // Codegen test: vmulq_n_f64.
3071	return vmulq_n_f64(a, b);  // Expected IR above: %b inserted into both lanes of a <2 x double>, then fmul with %a.
3072	}
3073
3074	// CHECK-LABEL: @test_vfma_n_f32(
3075	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3076	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3077	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3078	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3079	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3080	// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a)
3081	// CHECK: ret <2 x float> [[TMP3]]
3082	float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {  // Codegen test: vfma_n_f32.
3083	return vfma_n_f32(a, b, n);  // Expected IR above: llvm.fma.v2f32(%b, vector filled with %n, %a); note the accumulator %a is the last operand.
3084	}
3085
3086	// CHECK-LABEL: @test_vfma_n_f64(
3087	// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
3088	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3089	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
3090	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
3091	// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> [[VECINIT_I]], <1 x double> %a)
3092	// CHECK: ret <1 x double> [[TMP3]]
3093	float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {  // Codegen test: vfma_n_f64.
3094	return vfma_n_f64(a, b, n);  // Expected IR above: llvm.fma.v1f64(%b, <1 x double> holding %n, %a); note the accumulator %a is the last operand.
3095	}
3096
3097	// CHECK-LABEL: @test_vfmaq_n_f32(
3098	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3099	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3100	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3101	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3102	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3103	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3104	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3105	// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a)
3106	// CHECK: ret <4 x float> [[TMP3]]
3107	float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {  // Codegen test: vfmaq_n_f32.
3108	return vfmaq_n_f32(a, b, n);  // Expected IR above: llvm.fma.v4f32(%b, vector filled with %n, %a); note the accumulator %a is the last operand.
3109	}
3110
3111	// CHECK-LABEL: @test_vfms_n_f32(
3112	// CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
3113	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3114	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3115	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3116	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3117	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3118	// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> [[VECINIT1_I]], <2 x float> %a)
3119	// CHECK: ret <2 x float> [[TMP3]]
3120	float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {  // Codegen test: vfms_n_f32.
3121	return vfms_n_f32(a, b, n);  // Expected IR above: %b negated via fsub from -0.0, then llvm.fma.v2f32(negated b, vector filled with %n, %a).
3122	}
3123
3124	// CHECK-LABEL: @test_vfms_n_f64(
3125	// CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
3126	// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
3127	// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3128	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
3129	// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
3130	// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> [[VECINIT_I]], <1 x double> %a)
3131	// CHECK: ret <1 x double> [[TMP3]]
3132	float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {  // Codegen test: vfms_n_f64.
3133	return vfms_n_f64(a, b, n);  // Expected IR above: %b negated via fsub from -0.0, then llvm.fma.v1f64(negated b, <1 x double> holding %n, %a).
3134	}
3135
3136	// CHECK-LABEL: @test_vfmsq_n_f32(
3137	// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
3138	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3139	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3140	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3141	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3142	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3143	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3144	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3145	// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> [[VECINIT3_I]], <4 x float> %a)
3146	// CHECK: ret <4 x float> [[TMP3]]
3147	float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {  // Codegen test: vfmsq_n_f32.
3148	return vfmsq_n_f32(a, b, n);  // Expected IR above: %b negated via fsub from -0.0, then llvm.fma.v4f32(negated b, vector filled with %n, %a).
3149	}
3150
3151	// CHECK-LABEL: @test_vmul_n_s16(
3152	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3153	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3154	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3155	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3156	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3157	// CHECK: ret <4 x i16> [[MUL_I]]
3158	int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {  // Codegen test: vmul_n_s16.
3159	return vmul_n_s16(a, b);  // Expected IR above: %b inserted into all 4 lanes of a <4 x i16>, then mul with %a.
3160	}
3161
3162	// CHECK-LABEL: @test_vmulq_n_s16(
3163	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3164	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3165	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3166	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3167	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3168	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3169	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3170	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3171	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3172	// CHECK: ret <8 x i16> [[MUL_I]]
3173	int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {  // Codegen test: vmulq_n_s16.
3174	return vmulq_n_s16(a, b);  // Expected IR above: %b inserted into all 8 lanes of a <8 x i16>, then mul with %a.
3175	}
3176
3177	// CHECK-LABEL: @test_vmul_n_s32(
3178	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3179	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3180	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3181	// CHECK: ret <2 x i32> [[MUL_I]]
3182	int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {  // Codegen test: vmul_n_s32.
3183	return vmul_n_s32(a, b);  // Expected IR above: %b inserted into both lanes of a <2 x i32>, then mul with %a.
3184	}
3185
3186	// CHECK-LABEL: @test_vmulq_n_s32(
3187	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3188	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3189	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3190	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3191	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3192	// CHECK: ret <4 x i32> [[MUL_I]]
3193	int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {  // Codegen test: vmulq_n_s32.
3194	return vmulq_n_s32(a, b);  // Expected IR above: %b inserted into all 4 lanes of a <4 x i32>, then mul with %a.
3195	}
3196
3197	// CHECK-LABEL: @test_vmul_n_u16(
3198	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3199	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3200	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3201	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3202	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3203	// CHECK: ret <4 x i16> [[MUL_I]]
3204	uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {  // Codegen test: vmul_n_u16.
3205	return vmul_n_u16(a, b);  // Expected IR above: %b inserted into all 4 lanes of a <4 x i16>, then mul with %a (same IR shape as the s16 test).
3206	}
3207
3208	// CHECK-LABEL: @test_vmulq_n_u16(
3209	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3210	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3211	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3212	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3213	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3214	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3215	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3216	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3217	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3218	// CHECK: ret <8 x i16> [[MUL_I]]
3219	uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {  // Codegen test: vmulq_n_u16.
3220	return vmulq_n_u16(a, b);  // Expected IR above: %b inserted into all 8 lanes of a <8 x i16>, then mul with %a.
3221	}
3222
3223	// CHECK-LABEL: @test_vmul_n_u32(
3224	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3225	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3226	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3227	// CHECK: ret <2 x i32> [[MUL_I]]
3228	uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {  // Codegen test: vmul_n_u32.
3229	return vmul_n_u32(a, b);  // Expected IR above: %b inserted into both lanes of a <2 x i32>, then mul with %a.
3230	}
3231
3232	// CHECK-LABEL: @test_vmulq_n_u32(
3233	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3234	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3235	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3236	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3237	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3238	// CHECK: ret <4 x i32> [[MUL_I]]
3239	uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {  // Codegen test: vmulq_n_u32.
3240	return vmulq_n_u32(a, b);  // Expected IR above: %b inserted into all 4 lanes of a <4 x i32>, then mul with %a.
3241	}
3242
3243	// CHECK-LABEL: @test_vmull_n_s16(
3244	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3245	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3246	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3247	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3248	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3249	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3250	// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3251	// CHECK: ret <4 x i32> [[VMULL5_I]]
3252	int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {  // Codegen test: vmull_n_s16.
3253	return vmull_n_s16(a, b);  // Expected IR above: smull.v4i32 on %a and a 4-lane vector filled with %b.
3254	}
3255
3256	// CHECK-LABEL: @test_vmull_n_s32(
3257	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3258	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3259	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3260	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3261	// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3262	// CHECK: ret <2 x i64> [[VMULL3_I]]
3263	int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {  // Codegen test: vmull_n_s32.
3264	return vmull_n_s32(a, b);  // Expected IR above: smull.v2i64 on %a and a 2-lane vector filled with %b.
3265	}
3266
3267	// CHECK-LABEL: @test_vmull_n_u16(
3268	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3269	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3270	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3271	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3272	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3273	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3274	// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3275	// CHECK: ret <4 x i32> [[VMULL5_I]]
// Exercises vmull_n_u16(a, b). Expected IR per the FileCheck lines above: b is
// inserted into lanes 0-3 of a <4 x i16> vector, which is passed together with
// a to llvm.aarch64.neon.umull.v4i32; the <4 x i32> call result is returned.
3276	uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
3277	return vmull_n_u16(a, b);
3278	}
3279
3280	// CHECK-LABEL: @test_vmull_n_u32(
3281	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3282	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3283	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3284	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3285	// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3286	// CHECK: ret <2 x i64> [[VMULL3_I]]
// Exercises vmull_n_u32(a, b). Expected IR per the FileCheck lines above: b is
// inserted into both lanes of a <2 x i32> vector, which is passed together with
// a to llvm.aarch64.neon.umull.v2i64; the <2 x i64> call result is returned.
3287	uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
3288	return vmull_n_u32(a, b);
3289	}
3290
3291	// CHECK-LABEL: @test_vqdmull_n_s16(
3292	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3293	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3294	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3295	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3296	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3297	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3298	// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3299	// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
3300	// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
// Exercises vqdmull_n_s16(a, b). Expected IR per the FileCheck lines above: b is
// inserted into lanes 0-3 of a <4 x i16> vector, then
// llvm.aarch64.neon.sqdmull.v4i32 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <16 x i8> bitcast.
3301	int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
3302	return vqdmull_n_s16(a, b);
3303	}
3304
3305	// CHECK-LABEL: @test_vqdmull_n_s32(
3306	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3307	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3308	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3309	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3310	// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3311	// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
3312	// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
// Exercises vqdmull_n_s32(a, b). Expected IR per the FileCheck lines above: b is
// inserted into both lanes of a <2 x i32> vector, then
// llvm.aarch64.neon.sqdmull.v2i64 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <16 x i8> bitcast.
3313	int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
3314	return vqdmull_n_s32(a, b);
3315	}
3316
3317	// CHECK-LABEL: @test_vqdmulh_n_s16(
3318	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3319	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3320	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3321	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3322	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3323	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3324	// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3325	// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
3326	// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
// Exercises vqdmulh_n_s16(a, b). Expected IR per the FileCheck lines above: b is
// inserted into lanes 0-3 of a <4 x i16> vector, then
// llvm.aarch64.neon.sqdmulh.v4i16 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <8 x i8> bitcast.
3327	int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
3328	return vqdmulh_n_s16(a, b);
3329	}
3330
3331	// CHECK-LABEL: @test_vqdmulhq_n_s16(
3332	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3333	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3334	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3335	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3336	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3337	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3338	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3339	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3340	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3341	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3342	// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
3343	// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
3344	// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
// Exercises vqdmulhq_n_s16(a, b). Expected IR per the FileCheck lines above: b
// is inserted into lanes 0-7 of an <8 x i16> vector, then
// llvm.aarch64.neon.sqdmulh.v8i16 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <16 x i8> bitcast.
3345	int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
3346	return vqdmulhq_n_s16(a, b);
3347	}
3348
3349	// CHECK-LABEL: @test_vqdmulh_n_s32(
3350	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3351	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3352	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3353	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3354	// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3355	// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
3356	// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
// Exercises vqdmulh_n_s32(a, b). Expected IR per the FileCheck lines above: b is
// inserted into both lanes of a <2 x i32> vector, then
// llvm.aarch64.neon.sqdmulh.v2i32 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <8 x i8> bitcast.
3357	int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
3358	return vqdmulh_n_s32(a, b);
3359	}
3360
3361	// CHECK-LABEL: @test_vqdmulhq_n_s32(
3362	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3363	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3364	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3365	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3366	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3367	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3368	// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
3369	// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
3370	// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
// Exercises vqdmulhq_n_s32(a, b). Expected IR per the FileCheck lines above: b
// is inserted into lanes 0-3 of a <4 x i32> vector, then
// llvm.aarch64.neon.sqdmulh.v4i32 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <16 x i8> bitcast.
3371	int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
3372	return vqdmulhq_n_s32(a, b);
3373	}
3374
3375	// CHECK-LABEL: @test_vqrdmulh_n_s16(
3376	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3377	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3378	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3379	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3380	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3381	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3382	// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3383	// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
3384	// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
// Exercises vqrdmulh_n_s16(a, b). Expected IR per the FileCheck lines above: b
// is inserted into lanes 0-3 of a <4 x i16> vector, then
// llvm.aarch64.neon.sqrdmulh.v4i16 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <8 x i8> bitcast.
3385	int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
3386	return vqrdmulh_n_s16(a, b);
3387	}
3388
3389	// CHECK-LABEL: @test_vqrdmulhq_n_s16(
3390	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3391	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3392	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3393	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3394	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3395	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3396	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3397	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3398	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3399	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3400	// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
3401	// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
3402	// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
// Exercises vqrdmulhq_n_s16(a, b). Expected IR per the FileCheck lines above: b
// is inserted into lanes 0-7 of an <8 x i16> vector, then
// llvm.aarch64.neon.sqrdmulh.v8i16 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <16 x i8> bitcast.
3403	int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
3404	return vqrdmulhq_n_s16(a, b);
3405	}
3406
3407	// CHECK-LABEL: @test_vqrdmulh_n_s32(
3408	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3409	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3410	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3411	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3412	// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3413	// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
3414	// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
// Exercises vqrdmulh_n_s32(a, b). Expected IR per the FileCheck lines above: b
// is inserted into both lanes of a <2 x i32> vector, then
// llvm.aarch64.neon.sqrdmulh.v2i32 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <8 x i8> bitcast.
3415	int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
3416	return vqrdmulh_n_s32(a, b);
3417	}
3418
3419	// CHECK-LABEL: @test_vqrdmulhq_n_s32(
3420	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3421	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3422	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3423	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3424	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3425	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3426	// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
3427	// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
3428	// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
// Exercises vqrdmulhq_n_s32(a, b). Expected IR per the FileCheck lines above: b
// is inserted into lanes 0-3 of a <4 x i32> vector, then
// llvm.aarch64.neon.sqrdmulh.v4i32 is called on a and that vector. The `ret`
// uses the call result itself, not the trailing <16 x i8> bitcast.
3429	int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
3430	return vqrdmulhq_n_s32(a, b);
3431	}
3432
3433	// CHECK-LABEL: @test_vmla_n_s16(
3434	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3435	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3436	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3437	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3438	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3439	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3440	// CHECK: ret <4 x i16> [[ADD_I]]
// Exercises vmla_n_s16(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into lanes 0-3 of a <4 x i16> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is added to a with an ordinary `add`.
3441	int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3442	return vmla_n_s16(a, b, c);
3443	}
3444
3445	// CHECK-LABEL: @test_vmlaq_n_s16(
3446	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3447	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3448	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3449	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3450	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3451	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3452	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3453	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3454	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3455	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3456	// CHECK: ret <8 x i16> [[ADD_I]]
// Exercises vmlaq_n_s16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-7 of an <8 x i16> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is added to a with `add`.
3457	int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3458	return vmlaq_n_s16(a, b, c);
3459	}
3460
3461	// CHECK-LABEL: @test_vmla_n_s32(
3462	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3463	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3464	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3465	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3466	// CHECK: ret <2 x i32> [[ADD_I]]
// Exercises vmla_n_s32(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into both lanes of a <2 x i32> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is added to a with an ordinary `add`.
3467	int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3468	return vmla_n_s32(a, b, c);
3469	}
3470
3471	// CHECK-LABEL: @test_vmlaq_n_s32(
3472	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3473	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3474	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3475	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3476	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3477	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3478	// CHECK: ret <4 x i32> [[ADD_I]]
// Exercises vmlaq_n_s32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i32> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is added to a with `add`.
3479	int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3480	return vmlaq_n_s32(a, b, c);
3481	}
3482
3483	// CHECK-LABEL: @test_vmla_n_u16(
3484	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3485	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3486	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3487	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3488	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3489	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3490	// CHECK: ret <4 x i16> [[ADD_I]]
// Exercises vmla_n_u16(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into lanes 0-3 of a <4 x i16> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is added to a with an ordinary `add`.
3491	uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3492	return vmla_n_u16(a, b, c);
3493	}
3494
3495	// CHECK-LABEL: @test_vmlaq_n_u16(
3496	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3497	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3498	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3499	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3500	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3501	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3502	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3503	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3504	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3505	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3506	// CHECK: ret <8 x i16> [[ADD_I]]
// Exercises vmlaq_n_u16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-7 of an <8 x i16> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is added to a with `add`.
3507	uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3508	return vmlaq_n_u16(a, b, c);
3509	}
3510
3511	// CHECK-LABEL: @test_vmla_n_u32(
3512	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3513	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3514	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3515	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3516	// CHECK: ret <2 x i32> [[ADD_I]]
// Exercises vmla_n_u32(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into both lanes of a <2 x i32> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is added to a with an ordinary `add`.
3517	uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3518	return vmla_n_u32(a, b, c);
3519	}
3520
3521	// CHECK-LABEL: @test_vmlaq_n_u32(
3522	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3523	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3524	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3525	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3526	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3527	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3528	// CHECK: ret <4 x i32> [[ADD_I]]
// Exercises vmlaq_n_u32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i32> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is added to a with `add`.
3529	uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3530	return vmlaq_n_u32(a, b, c);
3531	}
3532
3533	// CHECK-LABEL: @test_vmlal_n_s16(
3534	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3535	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3536	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3537	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3538	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3539	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3540	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3541	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3542	// CHECK: ret <4 x i32> [[ADD_I]]
// Exercises vmlal_n_s16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i16> vector,
// llvm.aarch64.neon.smull.v4i32 is called on b and that vector, and the
// <4 x i32> product is added to a with an ordinary `add`.
3543	int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3544	return vmlal_n_s16(a, b, c);
3545	}
3546
3547	// CHECK-LABEL: @test_vmlal_n_s32(
3548	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3549	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3550	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3551	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3552	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3553	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3554	// CHECK: ret <2 x i64> [[ADD_I]]
// Exercises vmlal_n_s32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into both lanes of a <2 x i32> vector,
// llvm.aarch64.neon.smull.v2i64 is called on b and that vector, and the
// <2 x i64> product is added to a with an ordinary `add`.
3555	int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3556	return vmlal_n_s32(a, b, c);
3557	}
3558
3559	// CHECK-LABEL: @test_vmlal_n_u16(
3560	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3561	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3562	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3563	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3564	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3565	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3566	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3567	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3568	// CHECK: ret <4 x i32> [[ADD_I]]
// Exercises vmlal_n_u16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i16> vector,
// llvm.aarch64.neon.umull.v4i32 is called on b and that vector, and the
// <4 x i32> product is added to a with an ordinary `add`.
3569	uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3570	return vmlal_n_u16(a, b, c);
3571	}
3572
3573	// CHECK-LABEL: @test_vmlal_n_u32(
3574	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3575	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3576	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3577	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3578	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3579	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3580	// CHECK: ret <2 x i64> [[ADD_I]]
// Exercises vmlal_n_u32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into both lanes of a <2 x i32> vector,
// llvm.aarch64.neon.umull.v2i64 is called on b and that vector, and the
// <2 x i64> product is added to a with an ordinary `add`.
3581	uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3582	return vmlal_n_u32(a, b, c);
3583	}
3584
3585	// CHECK-LABEL: @test_vqdmlal_n_s16(
3586	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3587	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3588	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3589	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3590	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3591	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3592	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3593	// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3594	// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
3595	// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
// Exercises vqdmlal_n_s16(a, b, c). Expected IR per the FileCheck lines above:
// c is inserted into lanes 0-3 of a <4 x i16> vector,
// llvm.aarch64.neon.sqdmull.v4i32 is called on b and that vector, and the
// product is combined with a through llvm.aarch64.neon.sqadd.v4i32.
3596	int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3597	return vqdmlal_n_s16(a, b, c);
3598	}
3599
3600	// CHECK-LABEL: @test_vqdmlal_n_s32(
3601	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3602	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3603	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3604	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3605	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3606	// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3607	// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
3608	// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
// Exercises vqdmlal_n_s32(a, b, c). Expected IR per the FileCheck lines above:
// c is inserted into both lanes of a <2 x i32> vector,
// llvm.aarch64.neon.sqdmull.v2i64 is called on b and that vector, and the
// product is combined with a through llvm.aarch64.neon.sqadd.v2i64.
3609	int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3610	return vqdmlal_n_s32(a, b, c);
3611	}
3612
3613	// CHECK-LABEL: @test_vmls_n_s16(
3614	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3615	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3616	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3617	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3618	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3619	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3620	// CHECK: ret <4 x i16> [[SUB_I]]
// Exercises vmls_n_s16(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into lanes 0-3 of a <4 x i16> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is subtracted from a with `sub`.
3621	int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3622	return vmls_n_s16(a, b, c);
3623	}
3624
3625	// CHECK-LABEL: @test_vmlsq_n_s16(
3626	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3627	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3628	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3629	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3630	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3631	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3632	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3633	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3634	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3635	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3636	// CHECK: ret <8 x i16> [[SUB_I]]
// Exercises vmlsq_n_s16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-7 of an <8 x i16> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is subtracted from a with `sub`.
3637	int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3638	return vmlsq_n_s16(a, b, c);
3639	}
3640
3641	// CHECK-LABEL: @test_vmls_n_s32(
3642	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3643	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3644	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3645	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3646	// CHECK: ret <2 x i32> [[SUB_I]]
// Exercises vmls_n_s32(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into both lanes of a <2 x i32> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is subtracted from a with `sub`.
3647	int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3648	return vmls_n_s32(a, b, c);
3649	}
3650
3651	// CHECK-LABEL: @test_vmlsq_n_s32(
3652	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3653	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3654	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3655	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3656	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3657	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3658	// CHECK: ret <4 x i32> [[SUB_I]]
// Exercises vmlsq_n_s32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i32> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is subtracted from a with `sub`.
3659	int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3660	return vmlsq_n_s32(a, b, c);
3661	}
3662
3663	// CHECK-LABEL: @test_vmls_n_u16(
3664	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3665	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3666	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3667	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3668	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3669	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3670	// CHECK: ret <4 x i16> [[SUB_I]]
// Exercises vmls_n_u16(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into lanes 0-3 of a <4 x i16> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is subtracted from a with `sub`.
3671	uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3672	return vmls_n_u16(a, b, c);
3673	}
3674
3675	// CHECK-LABEL: @test_vmlsq_n_u16(
3676	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3677	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3678	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3679	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3680	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3681	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3682	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3683	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3684	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3685	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3686	// CHECK: ret <8 x i16> [[SUB_I]]
// Exercises vmlsq_n_u16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-7 of an <8 x i16> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is subtracted from a with `sub`.
3687	uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3688	return vmlsq_n_u16(a, b, c);
3689	}
3690
3691	// CHECK-LABEL: @test_vmls_n_u32(
3692	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3693	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3694	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3695	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3696	// CHECK: ret <2 x i32> [[SUB_I]]
// Exercises vmls_n_u32(a, b, c). Expected IR per the FileCheck lines above: c is
// inserted into both lanes of a <2 x i32> vector, b is multiplied by that vector
// with an ordinary `mul`, and the product is subtracted from a with `sub`.
3697	uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3698	return vmls_n_u32(a, b, c);
3699	}
3700
3701	// CHECK-LABEL: @test_vmlsq_n_u32(
3702	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3703	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3704	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3705	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3706	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3707	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3708	// CHECK: ret <4 x i32> [[SUB_I]]
// Exercises vmlsq_n_u32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i32> vector, b is multiplied by that
// vector with an ordinary `mul`, and the product is subtracted from a with `sub`.
3709	uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3710	return vmlsq_n_u32(a, b, c);
3711	}
3712
3713	// CHECK-LABEL: @test_vmlsl_n_s16(
3714	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3715	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3716	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3717	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3718	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3719	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3720	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3721	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3722	// CHECK: ret <4 x i32> [[SUB_I]]
// Exercises vmlsl_n_s16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i16> vector,
// llvm.aarch64.neon.smull.v4i32 is called on b and that vector, and the
// <4 x i32> product is subtracted from a with an ordinary `sub`.
3723	int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3724	return vmlsl_n_s16(a, b, c);
3725	}
3726
3727	// CHECK-LABEL: @test_vmlsl_n_s32(
3728	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3729	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3730	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3731	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3732	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3733	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3734	// CHECK: ret <2 x i64> [[SUB_I]]
// Exercises vmlsl_n_s32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into both lanes of a <2 x i32> vector,
// llvm.aarch64.neon.smull.v2i64 is called on b and that vector, and the
// <2 x i64> product is subtracted from a with an ordinary `sub`.
3735	int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3736	return vmlsl_n_s32(a, b, c);
3737	}
3738
3739	// CHECK-LABEL: @test_vmlsl_n_u16(
3740	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3741	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3742	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3743	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3744	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3745	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3746	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3747	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3748	// CHECK: ret <4 x i32> [[SUB_I]]
// Exercises vmlsl_n_u16(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into lanes 0-3 of a <4 x i16> vector,
// llvm.aarch64.neon.umull.v4i32 is called on b and that vector, and the
// <4 x i32> product is subtracted from a with an ordinary `sub`.
3749	uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3750	return vmlsl_n_u16(a, b, c);
3751	}
3752
3753	// CHECK-LABEL: @test_vmlsl_n_u32(
3754	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3755	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3756	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3757	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3758	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3759	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3760	// CHECK: ret <2 x i64> [[SUB_I]]
// Exercises vmlsl_n_u32(a, b, c). Expected IR per the FileCheck lines above: c
// is inserted into both lanes of a <2 x i32> vector,
// llvm.aarch64.neon.umull.v2i64 is called on b and that vector, and the
// <2 x i64> product is subtracted from a with an ordinary `sub`.
3761	uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3762	return vmlsl_n_u32(a, b, c);
3763	}
3764
3765	// CHECK-LABEL: @test_vqdmlsl_n_s16(
3766	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3767	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3768	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3769	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3770	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3771	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3772	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3773	// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3774	// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
3775	// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
// Exercises vqdmlsl_n_s16(a, b, c). Expected IR per the FileCheck lines above:
// c is inserted into lanes 0-3 of a <4 x i16> vector,
// llvm.aarch64.neon.sqdmull.v4i32 is called on b and that vector, and the
// product is combined with a through llvm.aarch64.neon.sqsub.v4i32.
3776	int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3777	return vqdmlsl_n_s16(a, b, c);
3778	}
3779
3780	// CHECK-LABEL: @test_vqdmlsl_n_s32(
3781	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3782	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3783	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3784	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3785	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3786	// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3787	// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
3788	// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
// Exercises vqdmlsl_n_s32(a, b, c). Expected IR per the FileCheck lines above:
// c is inserted into both lanes of a <2 x i32> vector,
// llvm.aarch64.neon.sqdmull.v2i64 is called on b and that vector, and the
// product is combined with a through llvm.aarch64.neon.sqsub.v2i64.
3789	int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3790	return vqdmlsl_n_s32(a, b, c);
3791	}
3792
3793	// CHECK-LABEL: @test_vmla_lane_u16_0(
3794	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3795	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3796	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3797	// CHECK: ret <4 x i16> [[ADD]]
// Exercises vmla_lane_u16 with lane index 0. Expected IR per the FileCheck
// lines above: a shufflevector of v with a <4 x i32> zeroinitializer mask (every
// result lane reads element 0), a `mul` of b by the shuffle, then `add` to a.
3798	uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3799	return vmla_lane_u16(a, b, v, 0);
3800	}
3801
3802	// CHECK-LABEL: @test_vmlaq_lane_u16_0(
3803	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3804	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3805	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3806	// CHECK: ret <8 x i16> [[ADD]]
3807	uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3808	return vmlaq_lane_u16(a, b, v, 0);
3809	}
3810
3811	// CHECK-LABEL: @test_vmla_lane_u32_0(
3812	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3813	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3814	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3815	// CHECK: ret <2 x i32> [[ADD]]
3816	uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3817	return vmla_lane_u32(a, b, v, 0);
3818	}
3819
3820	// CHECK-LABEL: @test_vmlaq_lane_u32_0(
3821	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3822	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3823	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3824	// CHECK: ret <4 x i32> [[ADD]]
3825	uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3826	return vmlaq_lane_u32(a, b, v, 0);
3827	}
3828
3829	// CHECK-LABEL: @test_vmla_laneq_u16_0(
3830	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3831	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3832	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3833	// CHECK: ret <4 x i16> [[ADD]]
3834	uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3835	return vmla_laneq_u16(a, b, v, 0);
3836	}
3837
3838	// CHECK-LABEL: @test_vmlaq_laneq_u16_0(
3839	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3840	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3841	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3842	// CHECK: ret <8 x i16> [[ADD]]
3843	uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3844	return vmlaq_laneq_u16(a, b, v, 0);
3845	}
3846
3847	// CHECK-LABEL: @test_vmla_laneq_u32_0(
3848	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3849	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3850	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3851	// CHECK: ret <2 x i32> [[ADD]]
3852	uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3853	return vmla_laneq_u32(a, b, v, 0);
3854	}
3855
3856	// CHECK-LABEL: @test_vmlaq_laneq_u32_0(
3857	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3858	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3859	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3860	// CHECK: ret <4 x i32> [[ADD]]
3861	uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3862	return vmlaq_laneq_u32(a, b, v, 0);
3863	}
3864
3865	// CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
3866	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3867	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3868	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3869	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3870	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
3871	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3872	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
3873	int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3874	return vqdmlal_laneq_s16(a, b, v, 0);
3875	}
3876
3877	// CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
3878	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3879	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3880	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3881	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3882	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
3883	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
3884	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
3885	int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
3886	return vqdmlal_laneq_s32(a, b, v, 0);
3887	}
3888
3889	// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
3890	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3891	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3892	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3893	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3894	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3895	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
3896	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3897	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
3898	int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
3899	return vqdmlal_high_laneq_s16(a, b, v, 0);
3900	}
3901
3902	// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
3903	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3904	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3905	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3906	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3907	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3908	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
3909	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
3910	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
3911	int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
3912	return vqdmlal_high_laneq_s32(a, b, v, 0);
3913	}
3914
3915	// CHECK-LABEL: @test_vmls_lane_u16_0(
3916	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3917	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3918	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3919	// CHECK: ret <4 x i16> [[SUB]]
3920	uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3921	return vmls_lane_u16(a, b, v, 0);
3922	}
3923
3924	// CHECK-LABEL: @test_vmlsq_lane_u16_0(
3925	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3926	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3927	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3928	// CHECK: ret <8 x i16> [[SUB]]
3929	uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3930	return vmlsq_lane_u16(a, b, v, 0);
3931	}
3932
3933	// CHECK-LABEL: @test_vmls_lane_u32_0(
3934	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3935	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3936	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3937	// CHECK: ret <2 x i32> [[SUB]]
3938	uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3939	return vmls_lane_u32(a, b, v, 0);
3940	}
3941
3942	// CHECK-LABEL: @test_vmlsq_lane_u32_0(
3943	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3944	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3945	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3946	// CHECK: ret <4 x i32> [[SUB]]
3947	uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3948	return vmlsq_lane_u32(a, b, v, 0);
3949	}
3950
3951	// CHECK-LABEL: @test_vmls_laneq_u16_0(
3952	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3953	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3954	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3955	// CHECK: ret <4 x i16> [[SUB]]
3956	uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3957	return vmls_laneq_u16(a, b, v, 0);
3958	}
3959
3960	// CHECK-LABEL: @test_vmlsq_laneq_u16_0(
3961	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3962	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3963	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3964	// CHECK: ret <8 x i16> [[SUB]]
3965	uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3966	return vmlsq_laneq_u16(a, b, v, 0);
3967	}
3968
3969	// CHECK-LABEL: @test_vmls_laneq_u32_0(
3970	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3971	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3972	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3973	// CHECK: ret <2 x i32> [[SUB]]
3974	uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3975	return vmls_laneq_u32(a, b, v, 0);
3976	}
3977
3978	// CHECK-LABEL: @test_vmlsq_laneq_u32_0(
3979	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3980	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3981	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3982	// CHECK: ret <4 x i32> [[SUB]]
3983	uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3984	return vmlsq_laneq_u32(a, b, v, 0);
3985	}
3986
3987	// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
3988	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3989	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3990	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3991	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3992	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
3993	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3994	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
3995	int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3996	return vqdmlsl_laneq_s16(a, b, v, 0);
3997	}
3998
3999	// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
4000	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4001	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4002	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4003	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4004	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4005	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4006	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
4007	int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
4008	return vqdmlsl_laneq_s32(a, b, v, 0);
4009	}
4010
4011	// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
4012	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4013	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4014	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4015	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4016	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4017	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4018	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4019	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
4020	int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
4021	return vqdmlsl_high_laneq_s16(a, b, v, 0);
4022	}
4023
4024	// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
4025	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4026	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4027	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4028	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4029	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4030	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4031	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4032	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
4033	int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
4034	return vqdmlsl_high_laneq_s32(a, b, v, 0);
4035	}
4036
4037	// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
4038	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4039	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4040	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4041	// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4042	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4043	// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
4044	int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4045	return vqdmulh_laneq_s16(a, v, 0);
4046	}
4047
4048	// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
4049	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4050	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4051	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4052	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4053	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4054	// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
4055	int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4056	return vqdmulhq_laneq_s16(a, v, 0);
4057	}
4058
4059	// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
4060	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4061	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4062	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4063	// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4064	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4065	// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
4066	int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4067	return vqdmulh_laneq_s32(a, v, 0);
4068	}
4069
4070	// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
4071	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4072	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4073	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4074	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4075	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4076	// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
4077	int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4078	return vqdmulhq_laneq_s32(a, v, 0);
4079	}
4080
4081	// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
4082	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4083	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4084	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4085	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4086	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4087	// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
4088	int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4089	return vqrdmulh_laneq_s16(a, v, 0);
4090	}
4091
4092	// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
4093	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4094	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4095	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4096	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4097	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4098	// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
4099	int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4100	return vqrdmulhq_laneq_s16(a, v, 0);
4101	}
4102
4103	// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
4104	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4105	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4106	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4107	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4108	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4109	// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
4110	int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4111	return vqrdmulh_laneq_s32(a, v, 0);
4112	}
4113
4114	// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
4115	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4116	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4117	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4118	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4119	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4120	// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
4121	int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4122	return vqrdmulhq_laneq_s32(a, v, 0);
4123	}
4124
4125	// CHECK-LABEL: @test_vmla_lane_u16(
4126	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4127	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4128	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4129	// CHECK: ret <4 x i16> [[ADD]]
4130	uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4131	return vmla_lane_u16(a, b, v, 3);
4132	}
4133
4134	// CHECK-LABEL: @test_vmlaq_lane_u16(
4135	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4136	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4137	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4138	// CHECK: ret <8 x i16> [[ADD]]
4139	uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4140	return vmlaq_lane_u16(a, b, v, 3);
4141	}
4142
4143	// CHECK-LABEL: @test_vmla_lane_u32(
4144	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4145	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4146	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4147	// CHECK: ret <2 x i32> [[ADD]]
4148	uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4149	return vmla_lane_u32(a, b, v, 1);
4150	}
4151
4152	// CHECK-LABEL: @test_vmlaq_lane_u32(
4153	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4154	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4155	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4156	// CHECK: ret <4 x i32> [[ADD]]
4157	uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4158	return vmlaq_lane_u32(a, b, v, 1);
4159	}
4160
4161	// CHECK-LABEL: @test_vmla_laneq_u16(
4162	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4163	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4164	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4165	// CHECK: ret <4 x i16> [[ADD]]
4166	uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4167	return vmla_laneq_u16(a, b, v, 7);
4168	}
4169
4170	// CHECK-LABEL: @test_vmlaq_laneq_u16(
4171	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4172	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4173	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4174	// CHECK: ret <8 x i16> [[ADD]]
4175	uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4176	return vmlaq_laneq_u16(a, b, v, 7);
4177	}
4178
4179	// CHECK-LABEL: @test_vmla_laneq_u32(
4180	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4181	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4182	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4183	// CHECK: ret <2 x i32> [[ADD]]
4184	uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4185	return vmla_laneq_u32(a, b, v, 3);
4186	}
4187
4188	// CHECK-LABEL: @test_vmlaq_laneq_u32(
4189	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4190	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4191	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4192	// CHECK: ret <4 x i32> [[ADD]]
4193	uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4194	return vmlaq_laneq_u32(a, b, v, 3);
4195	}
4196
4197	// CHECK-LABEL: @test_vqdmlal_laneq_s16(
4198	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4199	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4200	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4201	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4202	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
4203	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4204	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
4205	int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4206	return vqdmlal_laneq_s16(a, b, v, 7);
4207	}
4208
4209	// CHECK-LABEL: @test_vqdmlal_laneq_s32(
4210	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4211	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4212	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4213	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4214	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4215	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4216	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
4217	int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4218	return vqdmlal_laneq_s32(a, b, v, 3);
4219	}
4220
4221	// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
4222	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4223	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4224	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4225	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4226	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4227	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4228	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4229	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
4230	int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4231	return vqdmlal_high_laneq_s16(a, b, v, 7);
4232	}
4233
4234	// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
4235	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4236	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4237	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4238	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4239	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4240	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4241	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4242	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
4243	int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4244	return vqdmlal_high_laneq_s32(a, b, v, 3);
4245	}
4246
4247	// CHECK-LABEL: @test_vmls_lane_u16(
4248	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4249	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4250	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4251	// CHECK: ret <4 x i16> [[SUB]]
4252	uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4253	return vmls_lane_u16(a, b, v, 3);
4254	}
4255
4256	// CHECK-LABEL: @test_vmlsq_lane_u16(
4257	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4258	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4259	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4260	// CHECK: ret <8 x i16> [[SUB]]
4261	uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4262	return vmlsq_lane_u16(a, b, v, 3);
4263	}
4264
4265	// CHECK-LABEL: @test_vmls_lane_u32(
4266	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4267	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4268	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4269	// CHECK: ret <2 x i32> [[SUB]]
4270	uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4271	return vmls_lane_u32(a, b, v, 1);
4272	}
4273
4274	// CHECK-LABEL: @test_vmlsq_lane_u32(
4275	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4276	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4277	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4278	// CHECK: ret <4 x i32> [[SUB]]
4279	uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4280	return vmlsq_lane_u32(a, b, v, 1);
4281	}
4282
4283	// CHECK-LABEL: @test_vmls_laneq_u16(
4284	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4285	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4286	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4287	// CHECK: ret <4 x i16> [[SUB]]
4288	uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4289	return vmls_laneq_u16(a, b, v, 7);
4290	}
4291
4292	// CHECK-LABEL: @test_vmlsq_laneq_u16(
4293	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4294	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4295	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4296	// CHECK: ret <8 x i16> [[SUB]]
4297	uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4298	return vmlsq_laneq_u16(a, b, v, 7);
4299	}
4300
4301	// CHECK-LABEL: @test_vmls_laneq_u32(
4302	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4303	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4304	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4305	// CHECK: ret <2 x i32> [[SUB]]
4306	uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4307	return vmls_laneq_u32(a, b, v, 3);
4308	}
4309
4310	// CHECK-LABEL: @test_vmlsq_laneq_u32(
4311	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4312	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4313	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4314	// CHECK: ret <4 x i32> [[SUB]]
4315	uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4316	return vmlsq_laneq_u32(a, b, v, 3);
4317	}
4318
4319	// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
4320	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4321	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4322	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4323	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4324	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
4325	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4326	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
4327	int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4328	return vqdmlsl_laneq_s16(a, b, v, 7);
4329	}
4330
4331	// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
4332	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4333	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4334	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4335	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4336	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4337	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4338	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
4339	int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4340	return vqdmlsl_laneq_s32(a, b, v, 3);
4341	}
4342
4343	// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
4344	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4345	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4346	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4347	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4348	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4349	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4350	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4351	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
4352	int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4353	return vqdmlsl_high_laneq_s16(a, b, v, 7);
4354	}
4355
4356	// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
4357	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4358	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4359	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4360	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4361	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4362	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4363	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4364	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
4365	int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // Codegen test: expected IR extracts elements 2-3 of b, splats one element of v, calls sqdmull.v2i64 on the two, then sqsub.v2i64 with a as first operand.
4366	return vqdmlsl_high_laneq_s32(a, b, v, 3); // 3 is the element of v that the expected shufflevector mask repeats twice
4367	}
4368
4369	// CHECK-LABEL: @test_vqdmulh_laneq_s16(
4370	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4371	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4372	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4373	// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4374	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4375	// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
4376	int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) { // Codegen test: expected IR splats one element of the 8 x i16 vector v into 4 x i16 and passes it with a to sqdmulh.v4i16, returning the call result.
4377	return vqdmulh_laneq_s16(a, v, 7); // 7 is the element of v that the expected shufflevector mask repeats four times
4378	}
4379
4380	// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
4381	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4382	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4383	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4384	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4385	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4386	// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
4387	int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) { // Codegen test: expected IR splats one element of v across all 8 x i16 lanes and passes it with a to sqdmulh.v8i16, returning the call result.
4388	return vqdmulhq_laneq_s16(a, v, 7); // 7 is the element of v that the expected shufflevector mask repeats eight times
4389	}
4390
4391	// CHECK-LABEL: @test_vqdmulh_laneq_s32(
4392	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4393	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4394	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4395	// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4396	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4397	// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
4398	int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) { // Codegen test: expected IR splats one element of the 4 x i32 vector v into 2 x i32 and passes it with a to sqdmulh.v2i32, returning the call result.
4399	return vqdmulh_laneq_s32(a, v, 3); // 3 is the element of v that the expected shufflevector mask repeats twice
4400	}
4401
4402	// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
4403	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4404	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4405	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4406	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4407	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4408	// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
4409	int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) { // Codegen test: expected IR splats one element of v across all 4 x i32 lanes and passes it with a to sqdmulh.v4i32, returning the call result.
4410	return vqdmulhq_laneq_s32(a, v, 3); // 3 is the element of v that the expected shufflevector mask repeats four times
4411	}
4412
4413	// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
4414	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4415	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4416	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4417	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4418	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4419	// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
4420	int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) { // Codegen test: expected IR splats one element of the 8 x i16 vector v into 4 x i16 and passes it with a to sqrdmulh.v4i16, returning the call result.
4421	return vqrdmulh_laneq_s16(a, v, 7); // 7 is the element of v that the expected shufflevector mask repeats four times
4422	}
4423
4424	// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
4425	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4426	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4427	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4428	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4429	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4430	// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
4431	int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) { // Codegen test: expected IR splats one element of v across all 8 x i16 lanes and passes it with a to sqrdmulh.v8i16, returning the call result.
4432	return vqrdmulhq_laneq_s16(a, v, 7); // 7 is the element of v that the expected shufflevector mask repeats eight times
4433	}
4434
4435	// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
4436	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4437	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4438	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4439	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4440	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4441	// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
4442	int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) { // Codegen test: expected IR splats one element of the 4 x i32 vector v into 2 x i32 and passes it with a to sqrdmulh.v2i32, returning the call result.
4443	return vqrdmulh_laneq_s32(a, v, 3); // 3 is the element of v that the expected shufflevector mask repeats twice
4444	}
4445
4446	// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
4447	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4448	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4449	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4450	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4451	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4452	// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
4453	int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) { // Codegen test: expected IR splats one element of v across all 4 x i32 lanes and passes it with a to sqrdmulh.v4i32, returning the call result.
4454	return vqrdmulhq_laneq_s32(a, v, 3); // 3 is the element of v that the expected shufflevector mask repeats four times
4455	}
4456

Clang Project