arm_neon_intrinsics.c source code [clang_source_code/test/CodeGen/arm_neon

1	// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
2	// RUN: -target-cpu swift -fallow-half-arguments-and-returns \
3	// RUN: -target-feature +fullfp16 -ffreestanding \
4	// RUN: -disable-O0-optnone -emit-llvm -o - %s \
5	// RUN: | opt -S -mem2reg | FileCheck %s
6
7	#include <arm_neon.h>
8
9	// CHECK-LABEL: @test_vaba_s8(
10	// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
11	// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
12	// CHECK: ret <8 x i8> [[ADD_I]]
// Codegen test for vaba_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v8i8 on b and c, whose result is added to a.
13	int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
14	return vaba_s8(a, b, c);
15	}
16
17	// CHECK-LABEL: @test_vaba_s16(
18	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
20	// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
21	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
22	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
23	// CHECK: ret <4 x i16> [[ADD_I]]
// Codegen test for vaba_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4i16 on b and c, whose result is added to a.
24	int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
25	return vaba_s16(a, b, c);
26	}
27
28	// CHECK-LABEL: @test_vaba_s32(
29	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
30	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
31	// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
32	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
33	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
34	// CHECK: ret <2 x i32> [[ADD_I]]
// Codegen test for vaba_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v2i32 on b and c, whose result is added to a.
35	int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
36	return vaba_s32(a, b, c);
37	}
38
39	// CHECK-LABEL: @test_vaba_u8(
40	// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
41	// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
42	// CHECK: ret <8 x i8> [[ADD_I]]
// Codegen test for vaba_u8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v8i8 on b and c, whose result is added to a.
43	uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
44	return vaba_u8(a, b, c);
45	}
46
47	// CHECK-LABEL: @test_vaba_u16(
48	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
49	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
50	// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
51	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
52	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]]
53	// CHECK: ret <4 x i16> [[ADD_I]]
// Codegen test for vaba_u16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v4i16 on b and c, whose result is added to a.
54	uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
55	return vaba_u16(a, b, c);
56	}
57
58	// CHECK-LABEL: @test_vaba_u32(
59	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
60	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
61	// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
62	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
63	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]]
64	// CHECK: ret <2 x i32> [[ADD_I]]
// Codegen test for vaba_u32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v2i32 on b and c, whose result is added to a.
65	uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
66	return vaba_u32(a, b, c);
67	}
68
69	// CHECK-LABEL: @test_vabaq_s8(
70	// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c)
71	// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
72	// CHECK: ret <16 x i8> [[ADD_I]]
// Codegen test for vabaq_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v16i8 on b and c, whose result is added to a.
73	int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
74	return vabaq_s8(a, b, c);
75	}
76
77	// CHECK-LABEL: @test_vabaq_s16(
78	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
79	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
80	// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c)
81	// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
82	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
83	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vabaq_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v8i16 on b and c, whose result is added to a.
84	int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
85	return vabaq_s16(a, b, c);
86	}
87
88	// CHECK-LABEL: @test_vabaq_s32(
89	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
90	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
91	// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c)
92	// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
93	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
94	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vabaq_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4i32 on b and c, whose result is added to a.
95	int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
96	return vabaq_s32(a, b, c);
97	}
98
99	// CHECK-LABEL: @test_vabaq_u8(
100	// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c)
101	// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
102	// CHECK: ret <16 x i8> [[ADD_I]]
// Codegen test for vabaq_u8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v16i8 on b and c, whose result is added to a.
103	uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
104	return vabaq_u8(a, b, c);
105	}
106
107	// CHECK-LABEL: @test_vabaq_u16(
108	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
109	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
110	// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c)
111	// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
112	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]]
113	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vabaq_u16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v8i16 on b and c, whose result is added to a.
114	uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
115	return vabaq_u16(a, b, c);
116	}
117
118	// CHECK-LABEL: @test_vabaq_u32(
119	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
120	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
121	// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c)
122	// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
123	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]]
124	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vabaq_u32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v4i32 on b and c, whose result is added to a.
125	uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
126	return vabaq_u32(a, b, c);
127	}
128
129	// CHECK-LABEL: @test_vabal_s8(
130	// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
131	// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
132	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
133	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vabal_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v8i8 on b and c, a zext of that result to <8 x i16>,
// and an add of the widened value to a.
134	int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
135	return vabal_s8(a, b, c);
136	}
137
138	// CHECK-LABEL: @test_vabal_s16(
139	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
140	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
141	// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
142	// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
143	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
144	// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
145	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
146	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vabal_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4i16 on b and c, a zext of that result to <4 x i32>,
// and an add of the widened value to a.
147	int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
148	return vabal_s16(a, b, c);
149	}
150
151	// CHECK-LABEL: @test_vabal_s32(
152	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
153	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
154	// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
155	// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
156	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
157	// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
158	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
159	// CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vabal_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v2i32 on b and c, a zext of that result to <2 x i64>,
// and an add of the widened value to a.
160	int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
161	return vabal_s32(a, b, c);
162	}
163
164	// CHECK-LABEL: @test_vabal_u8(
165	// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
166	// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
167	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
168	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vabal_u8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v8i8 on b and c, a zext of that result to <8 x i16>,
// and an add of the widened value to a.
169	uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
170	return vabal_u8(a, b, c);
171	}
172
173	// CHECK-LABEL: @test_vabal_u16(
174	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
175	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
176	// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
177	// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
178	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
179	// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32>
180	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
181	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vabal_u16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v4i16 on b and c, a zext of that result to <4 x i32>,
// and an add of the widened value to a.
182	uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
183	return vabal_u16(a, b, c);
184	}
185
186	// CHECK-LABEL: @test_vabal_u32(
187	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
188	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
189	// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
190	// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
191	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
192	// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64>
193	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
194	// CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vabal_u32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v2i32 on b and c, a zext of that result to <2 x i64>,
// and an add of the widened value to a.
195	uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
196	return vabal_u32(a, b, c);
197	}
198
199	// CHECK-LABEL: @test_vabd_s8(
200	// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
201	// CHECK: ret <8 x i8> [[VABD_V_I]]
// Codegen test for vabd_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v8i8 on a and b, with that result returned.
202	int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
203	return vabd_s8(a, b);
204	}
205
206	// CHECK-LABEL: @test_vabd_s16(
207	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
208	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
209	// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
210	// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
211	// CHECK: ret <4 x i16> [[VABD_V2_I]]
// Codegen test for vabd_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4i16 on a and b, with that result returned.
212	int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
213	return vabd_s16(a, b);
214	}
215
216	// CHECK-LABEL: @test_vabd_s32(
217	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
218	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
219	// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
220	// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
221	// CHECK: ret <2 x i32> [[VABD_V2_I]]
// Codegen test for vabd_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v2i32 on a and b, with that result returned.
222	int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
223	return vabd_s32(a, b);
224	}
225
226	// CHECK-LABEL: @test_vabd_u8(
227	// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
228	// CHECK: ret <8 x i8> [[VABD_V_I]]
// Codegen test for vabd_u8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v8i8 on a and b, with that result returned.
229	uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
230	return vabd_u8(a, b);
231	}
232
233	// CHECK-LABEL: @test_vabd_u16(
234	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
235	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
236	// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
237	// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
238	// CHECK: ret <4 x i16> [[VABD_V2_I]]
// Codegen test for vabd_u16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v4i16 on a and b, with that result returned.
239	uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
240	return vabd_u16(a, b);
241	}
242
243	// CHECK-LABEL: @test_vabd_u32(
244	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
245	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
246	// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
247	// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
248	// CHECK: ret <2 x i32> [[VABD_V2_I]]
// Codegen test for vabd_u32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v2i32 on a and b, with that result returned.
249	uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
250	return vabd_u32(a, b);
251	}
252
253	// CHECK-LABEL: @test_vabd_f32(
254	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
255	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
256	// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b)
257	// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
258	// CHECK: ret <2 x float> [[VABD_V2_I]]
// Codegen test for vabd_f32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v2f32 on a and b, with that result returned.
259	float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
260	return vabd_f32(a, b);
261	}
262
263	// CHECK-LABEL: @test_vabdq_s8(
264	// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b)
265	// CHECK: ret <16 x i8> [[VABDQ_V_I]]
// Codegen test for vabdq_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v16i8 on a and b, with that result returned.
266	int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
267	return vabdq_s8(a, b);
268	}
269
270	// CHECK-LABEL: @test_vabdq_s16(
271	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
272	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
273	// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b)
274	// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
275	// CHECK: ret <8 x i16> [[VABDQ_V2_I]]
// Codegen test for vabdq_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v8i16 on a and b, with that result returned.
276	int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
277	return vabdq_s16(a, b);
278	}
279
280	// CHECK-LABEL: @test_vabdq_s32(
281	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
282	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
283	// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b)
284	// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
285	// CHECK: ret <4 x i32> [[VABDQ_V2_I]]
// Codegen test for vabdq_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4i32 on a and b, with that result returned.
286	int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
287	return vabdq_s32(a, b);
288	}
289
290	// CHECK-LABEL: @test_vabdq_u8(
291	// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b)
292	// CHECK: ret <16 x i8> [[VABDQ_V_I]]
// Codegen test for vabdq_u8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v16i8 on a and b, with that result returned.
293	uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
294	return vabdq_u8(a, b);
295	}
296
297	// CHECK-LABEL: @test_vabdq_u16(
298	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
299	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
300	// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b)
301	// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
302	// CHECK: ret <8 x i16> [[VABDQ_V2_I]]
// Codegen test for vabdq_u16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v8i16 on a and b, with that result returned.
303	uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
304	return vabdq_u16(a, b);
305	}
306
307	// CHECK-LABEL: @test_vabdq_u32(
308	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
309	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
310	// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b)
311	// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
312	// CHECK: ret <4 x i32> [[VABDQ_V2_I]]
// Codegen test for vabdq_u32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v4i32 on a and b, with that result returned.
313	uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
314	return vabdq_u32(a, b);
315	}
316
317	// CHECK-LABEL: @test_vabdq_f32(
318	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
319	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
320	// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b)
321	// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
322	// CHECK: ret <4 x float> [[VABDQ_V2_I]]
// Codegen test for vabdq_f32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4f32 on a and b, with that result returned.
323	float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
324	return vabdq_f32(a, b);
325	}
326
327	// CHECK-LABEL: @test_vabdl_s8(
328	// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
329	// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
330	// CHECK: ret <8 x i16> [[VMOVL_I_I]]
// Codegen test for vabdl_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v8i8 on a and b, then a zext of the result to
// <8 x i16>, which is returned.
331	int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
332	return vabdl_s8(a, b);
333	}
334
335	// CHECK-LABEL: @test_vabdl_s16(
336	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
337	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
338	// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
339	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
340	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
341	// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
342	// CHECK: ret <4 x i32> [[VMOVL_I_I]]
// Codegen test for vabdl_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v4i16 on a and b, then a zext of the result to
// <4 x i32>, which is returned.
343	int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
344	return vabdl_s16(a, b);
345	}
346
347	// CHECK-LABEL: @test_vabdl_s32(
348	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
349	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
350	// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
351	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
352	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
353	// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
354	// CHECK: ret <2 x i64> [[VMOVL_I_I]]
// Codegen test for vabdl_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabds.v2i32 on a and b, then a zext of the result to
// <2 x i64>, which is returned.
355	int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
356	return vabdl_s32(a, b);
357	}
358
359	// CHECK-LABEL: @test_vabdl_u8(
360	// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
361	// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
362	// CHECK: ret <8 x i16> [[VMOVL_I_I]]
// Codegen test for vabdl_u8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v8i8 on a and b, then a zext of the result to
// <8 x i16>, which is returned.
363	uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
364	return vabdl_u8(a, b);
365	}
366
367	// CHECK-LABEL: @test_vabdl_u16(
368	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
369	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
370	// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
371	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
372	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
373	// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
374	// CHECK: ret <4 x i32> [[VMOVL_I_I]]
// Codegen test for vabdl_u16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v4i16 on a and b, then a zext of the result to
// <4 x i32>, which is returned.
375	uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
376	return vabdl_u16(a, b);
377	}
378
379	// CHECK-LABEL: @test_vabdl_u32(
380	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
381	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
382	// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
383	// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
384	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
385	// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
386	// CHECK: ret <2 x i64> [[VMOVL_I_I]]
// Codegen test for vabdl_u32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabdu.v2i32 on a and b, then a zext of the result to
// <2 x i64>, which is returned.
387	uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
388	return vabdl_u32(a, b);
389	}
390
391	// CHECK-LABEL: @test_vabs_s8(
392	// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a)
393	// CHECK: ret <8 x i8> [[VABS_I]]
// Codegen test for vabs_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabs.v8i8 on a, with that result returned.
394	int8x8_t test_vabs_s8(int8x8_t a) {
395	return vabs_s8(a);
396	}
397
398	// CHECK-LABEL: @test_vabs_s16(
399	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
400	// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a)
401	// CHECK: ret <4 x i16> [[VABS1_I]]
// Codegen test for vabs_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabs.v4i16 on a, with that result returned.
402	int16x4_t test_vabs_s16(int16x4_t a) {
403	return vabs_s16(a);
404	}
405
406	// CHECK-LABEL: @test_vabs_s32(
407	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
408	// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a)
409	// CHECK: ret <2 x i32> [[VABS1_I]]
// Codegen test for vabs_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabs.v2i32 on a, with that result returned.
410	int32x2_t test_vabs_s32(int32x2_t a) {
411	return vabs_s32(a);
412	}
413
414	// CHECK-LABEL: @test_vabs_f32(
415	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
416	// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
417	// CHECK: ret <2 x float> [[VABS1_I]]
// Codegen test for vabs_f32. The FileCheck directives above expect a call to
// the generic @llvm.fabs.v2f32 intrinsic on a, with that result returned.
418	float32x2_t test_vabs_f32(float32x2_t a) {
419	return vabs_f32(a);
420	}
421
422	// CHECK-LABEL: @test_vabsq_s8(
423	// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a)
424	// CHECK: ret <16 x i8> [[VABS_I]]
// Codegen test for vabsq_s8. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabs.v16i8 on a, with that result returned.
425	int8x16_t test_vabsq_s8(int8x16_t a) {
426	return vabsq_s8(a);
427	}
428
429	// CHECK-LABEL: @test_vabsq_s16(
430	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
431	// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a)
432	// CHECK: ret <8 x i16> [[VABS1_I]]
// Codegen test for vabsq_s16. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabs.v8i16 on a, with that result returned.
433	int16x8_t test_vabsq_s16(int16x8_t a) {
434	return vabsq_s16(a);
435	}
436
437	// CHECK-LABEL: @test_vabsq_s32(
438	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
439	// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a)
440	// CHECK: ret <4 x i32> [[VABS1_I]]
// Codegen test for vabsq_s32. The FileCheck directives above expect a call to
// @llvm.arm.neon.vabs.v4i32 on a, with that result returned.
441	int32x4_t test_vabsq_s32(int32x4_t a) {
442	return vabsq_s32(a);
443	}
444
445	// CHECK-LABEL: @test_vabsq_f32(
446	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
447	// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
448	// CHECK: ret <4 x float> [[VABS1_I]]
// Codegen test for vabsq_f32. The FileCheck directives above expect a call to
// the generic @llvm.fabs.v4f32 intrinsic on a, with that result returned.
449	float32x4_t test_vabsq_f32(float32x4_t a) {
450	return vabsq_f32(a);
451	}
452
453	// CHECK-LABEL: @test_vadd_s8(
454	// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
455	// CHECK: ret <8 x i8> [[ADD_I]]
// Codegen test for vadd_s8. The FileCheck directives above expect a plain
// `add <8 x i8> %a, %b` whose result is returned.
456	int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
457	return vadd_s8(a, b);
458	}
459
460	// CHECK-LABEL: @test_vadd_s16(
461	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
462	// CHECK: ret <4 x i16> [[ADD_I]]
// Codegen test for vadd_s16. The FileCheck directives above expect a plain
// `add <4 x i16> %a, %b` whose result is returned.
463	int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
464	return vadd_s16(a, b);
465	}
466
467	// CHECK-LABEL: @test_vadd_s32(
468	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
469	// CHECK: ret <2 x i32> [[ADD_I]]
// Codegen test for vadd_s32. The FileCheck directives above expect a plain
// `add <2 x i32> %a, %b` whose result is returned.
470	int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
471	return vadd_s32(a, b);
472	}
473
474	// CHECK-LABEL: @test_vadd_s64(
475	// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
476	// CHECK: ret <1 x i64> [[ADD_I]]
// Codegen test for vadd_s64. The FileCheck directives above expect a plain
// `add <1 x i64> %a, %b` whose result is returned.
477	int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
478	return vadd_s64(a, b);
479	}
480
481	// CHECK-LABEL: @test_vadd_f32(
482	// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b
483	// CHECK: ret <2 x float> [[ADD_I]]
// Codegen test for vadd_f32. The FileCheck directives above expect a
// floating-point `fadd <2 x float> %a, %b` whose result is returned.
484	float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
485	return vadd_f32(a, b);
486	}
487
488	// CHECK-LABEL: @test_vadd_u8(
489	// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
490	// CHECK: ret <8 x i8> [[ADD_I]]
// Codegen test for vadd_u8. The FileCheck directives above expect a plain
// `add <8 x i8> %a, %b` whose result is returned.
491	uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
492	return vadd_u8(a, b);
493	}
494
495	// CHECK-LABEL: @test_vadd_u16(
496	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
497	// CHECK: ret <4 x i16> [[ADD_I]]
// Codegen test for vadd_u16. The FileCheck directives above expect a plain
// `add <4 x i16> %a, %b` whose result is returned.
498	uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
499	return vadd_u16(a, b);
500	}
501
502	// CHECK-LABEL: @test_vadd_u32(
503	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
504	// CHECK: ret <2 x i32> [[ADD_I]]
// Codegen test for vadd_u32. The FileCheck directives above expect a plain
// `add <2 x i32> %a, %b` whose result is returned.
505	uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
506	return vadd_u32(a, b);
507	}
508
509	// CHECK-LABEL: @test_vadd_u64(
510	// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
511	// CHECK: ret <1 x i64> [[ADD_I]]
// Codegen test for vadd_u64. The FileCheck directives above expect a plain
// `add <1 x i64> %a, %b` whose result is returned.
512	uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
513	return vadd_u64(a, b);
514	}
515
516	// CHECK-LABEL: @test_vaddq_s8(
517	// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
518	// CHECK: ret <16 x i8> [[ADD_I]]
// Codegen test for vaddq_s8. The FileCheck directives above expect a plain
// `add <16 x i8> %a, %b` whose result is returned.
519	int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
520	return vaddq_s8(a, b);
521	}
522
523	// CHECK-LABEL: @test_vaddq_s16(
524	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
525	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vaddq_s16. The FileCheck directives above expect a plain
// `add <8 x i16> %a, %b` whose result is returned.
526	int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
527	return vaddq_s16(a, b);
528	}
529
530	// CHECK-LABEL: @test_vaddq_s32(
531	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
532	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vaddq_s32. The FileCheck directives above expect a plain
// `add <4 x i32> %a, %b` whose result is returned.
533	int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
534	return vaddq_s32(a, b);
535	}
536
537	// CHECK-LABEL: @test_vaddq_s64(
538	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
539	// CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vaddq_s64. The FileCheck directives above expect a plain
// `add <2 x i64> %a, %b` whose result is returned.
540	int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
541	return vaddq_s64(a, b);
542	}
543
544	// CHECK-LABEL: @test_vaddq_f32(
545	// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b
546	// CHECK: ret <4 x float> [[ADD_I]]
// Codegen test for vaddq_f32. The FileCheck directives above expect a
// floating-point `fadd <4 x float> %a, %b` whose result is returned.
547	float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
548	return vaddq_f32(a, b);
549	}
550
551	// CHECK-LABEL: @test_vaddq_u8(
552	// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
553	// CHECK: ret <16 x i8> [[ADD_I]]
// Codegen test for vaddq_u8. The FileCheck directives above expect a plain
// `add <16 x i8> %a, %b` whose result is returned.
554	uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
555	return vaddq_u8(a, b);
556	}
557
558	// CHECK-LABEL: @test_vaddq_u16(
559	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
560	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vaddq_u16. The FileCheck directives above expect a plain
// `add <8 x i16> %a, %b` whose result is returned.
561	uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
562	return vaddq_u16(a, b);
563	}
564
565	// CHECK-LABEL: @test_vaddq_u32(
566	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
567	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vaddq_u32. The FileCheck directives above expect a plain
// `add <4 x i32> %a, %b` whose result is returned.
568	uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
569	return vaddq_u32(a, b);
570	}
571
572	// CHECK-LABEL: @test_vaddq_u64(
573	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
574	// CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vaddq_u64. The FileCheck directives above expect a plain
// `add <2 x i64> %a, %b` whose result is returned.
575	uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
576	return vaddq_u64(a, b);
577	}
578
579	// CHECK-LABEL: @test_vaddhn_s16(
580	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
581	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
582	// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
583	// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
584	// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
585	// CHECK: ret <8 x i8> [[VADDHN2_I]]
// Codegen test for vaddhn_s16. The FileCheck directives above expect an
// `add <8 x i16>` of a and b, an lshr of every lane by 8, and a trunc to
// <8 x i8>, which is returned.
586	int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
587	return vaddhn_s16(a, b);
588	}
589
590	// CHECK-LABEL: @test_vaddhn_s32(
591	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
592	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
593	// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
594	// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
595	// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
596	// CHECK: ret <4 x i16> [[VADDHN2_I]]
// Codegen test for vaddhn_s32. The FileCheck directives above expect an
// `add <4 x i32>` of a and b, an lshr of every lane by 16, and a trunc to
// <4 x i16>, which is returned.
597	int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
598	return vaddhn_s32(a, b);
599	}
600
601	// CHECK-LABEL: @test_vaddhn_s64(
602	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
603	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
604	// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
605	// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
606	// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
607	// CHECK: ret <2 x i32> [[VADDHN2_I]]
// Codegen test for vaddhn_s64. The FileCheck directives above expect an
// `add <2 x i64>` of a and b, an lshr of every lane by 32, and a trunc to
// <2 x i32>, which is returned.
608	int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
609	return vaddhn_s64(a, b);
610	}
611
612	// CHECK-LABEL: @test_vaddhn_u16(
613	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
614	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
615	// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
616	// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
617	// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
618	// CHECK: ret <8 x i8> [[VADDHN2_I]]
// Codegen test for vaddhn_u16. The FileCheck directives above expect an
// `add <8 x i16>` of a and b, an lshr of every lane by 8, and a trunc to
// <8 x i8>, which is returned.
619	uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
620	return vaddhn_u16(a, b);
621	}
622
623	// CHECK-LABEL: @test_vaddhn_u32(
624	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
625	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
626	// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
627	// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
628	// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
629	// CHECK: ret <4 x i16> [[VADDHN2_I]]
// Codegen test for vaddhn_u32. The FileCheck directives above expect an
// `add <4 x i32>` of a and b, an lshr of every lane by 16, and a trunc to
// <4 x i16>, which is returned.
630	uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
631	return vaddhn_u32(a, b);
632	}
633
634	// CHECK-LABEL: @test_vaddhn_u64(
635	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
636	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
637	// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
638	// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
639	// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
640	// CHECK: ret <2 x i32> [[VADDHN2_I]]
// Codegen test for vaddhn_u64. The FileCheck directives above expect an
// `add <2 x i64>` of a and b, an lshr of every lane by 32, and a trunc to
// <2 x i32>, which is returned.
641	uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
642	return vaddhn_u64(a, b);
643	}
644
645	// CHECK-LABEL: @test_vaddl_s8(
646	// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
647	// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
648	// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
649	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vaddl_s8. The FileCheck directives above expect both a and
// b to be sign-extended (sext) to <8 x i16> and the widened values added.
650	int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
651	return vaddl_s8(a, b);
652	}
653
654	// CHECK-LABEL: @test_vaddl_s16(
655	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
656	// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
657	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
658	// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
659	// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
660	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vaddl_s16. The FileCheck directives above expect both a and
// b to be sign-extended (sext) to <4 x i32> and the widened values added.
661	int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
662	return vaddl_s16(a, b);
663	}
664
665	// CHECK-LABEL: @test_vaddl_s32(
666	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
667	// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
668	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
669	// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
670	// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
671	// CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vaddl_s32. The FileCheck directives above expect both a and
// b to be sign-extended (sext) to <2 x i64> and the widened values added.
672	int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
673	return vaddl_s32(a, b);
674	}
675
676	// CHECK-LABEL: @test_vaddl_u8(
677	// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
678	// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
679	// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
680	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vaddl_u8. The FileCheck directives above expect both a and
// b to be zero-extended (zext) to <8 x i16> and the widened values added.
681	uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
682	return vaddl_u8(a, b);
683	}
684
685	// CHECK-LABEL: @test_vaddl_u16(
686	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
687	// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
688	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
689	// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
690	// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
691	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vaddl_u16. The FileCheck directives above expect both a and
// b to be zero-extended (zext) to <4 x i32> and the widened values added.
692	uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
693	return vaddl_u16(a, b);
694	}
695
696	// CHECK-LABEL: @test_vaddl_u32(
697	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
698	// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
699	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
700	// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
701	// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
702	// CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vaddl_u32. The FileCheck directives above expect both a and
// b to be zero-extended (zext) to <2 x i64> and the widened values added.
703	uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
704	return vaddl_u32(a, b);
705	}
706
707	// CHECK-LABEL: @test_vaddw_s8(
708	// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
709	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
710	// CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vaddw_s8. The FileCheck directives above expect b to be
// sign-extended (sext) to <8 x i16> and then added to the already-wide a.
711	int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
712	return vaddw_s8(a, b);
713	}
714
715	// CHECK-LABEL: @test_vaddw_s16(
716	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
717	// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
718	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
719	// CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vaddw_s16. The FileCheck directives above expect b to be
// sign-extended (sext) to <4 x i32> and then added to the already-wide a.
720	int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
721	return vaddw_s16(a, b);
722	}
723
724	// CHECK-LABEL: @test_vaddw_s32(
725	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
726	// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
727	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
728	// CHECK: ret <2 x i64> [[ADD_I]]
// Returns vaddw_s32(a, b). The directives above expect b sign-extended to
// <2 x i64> and added to a; a bitcast of b to <8 x i8> is also matched.
729	int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
730	return vaddw_s32(a, b);
731	}
732
733	// CHECK-LABEL: @test_vaddw_u8(
734	// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
735	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
736	// CHECK: ret <8 x i16> [[ADD_I]]
// Returns vaddw_u8(a, b). The directives above expect b zero-extended to
// <8 x i16> and added to a.
737	uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
738	return vaddw_u8(a, b);
739	}
740
741	// CHECK-LABEL: @test_vaddw_u16(
742	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
743	// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
744	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
745	// CHECK: ret <4 x i32> [[ADD_I]]
// Returns vaddw_u16(a, b). The directives above expect b zero-extended to
// <4 x i32> and added to a; a bitcast of b to <8 x i8> is also matched.
746	uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
747	return vaddw_u16(a, b);
748	}
749
750	// CHECK-LABEL: @test_vaddw_u32(
751	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
752	// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
753	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
754	// CHECK: ret <2 x i64> [[ADD_I]]
// Returns vaddw_u32(a, b). The directives above expect b zero-extended to
// <2 x i64> and added to a; a bitcast of b to <8 x i8> is also matched.
755	uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
756	return vaddw_u32(a, b);
757	}
758
759	// CHECK-LABEL: @test_vand_s8(
760	// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
761	// CHECK: ret <8 x i8> [[AND_I]]
// Returns vand_s8(a, b). The directives above expect a plain `and` of a and b
// on <8 x i8>.
762	int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
763	return vand_s8(a, b);
764	}
765
766	// CHECK-LABEL: @test_vand_s16(
767	// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
768	// CHECK: ret <4 x i16> [[AND_I]]
// Returns vand_s16(a, b). The directives above expect a plain `and` of a and b
// on <4 x i16>.
769	int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
770	return vand_s16(a, b);
771	}
772
773	// CHECK-LABEL: @test_vand_s32(
774	// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
775	// CHECK: ret <2 x i32> [[AND_I]]
// Returns vand_s32(a, b). The directives above expect a plain `and` of a and b
// on <2 x i32>.
776	int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
777	return vand_s32(a, b);
778	}
779
780	// CHECK-LABEL: @test_vand_s64(
781	// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
782	// CHECK: ret <1 x i64> [[AND_I]]
// Returns vand_s64(a, b). The directives above expect a plain `and` of a and b
// on <1 x i64>.
783	int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
784	return vand_s64(a, b);
785	}
786
787	// CHECK-LABEL: @test_vand_u8(
788	// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
789	// CHECK: ret <8 x i8> [[AND_I]]
// Returns vand_u8(a, b). The directives above expect a plain `and` of a and b
// on <8 x i8>.
790	uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
791	return vand_u8(a, b);
792	}
793
794	// CHECK-LABEL: @test_vand_u16(
795	// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
796	// CHECK: ret <4 x i16> [[AND_I]]
// Returns vand_u16(a, b). The directives above expect a plain `and` of a and b
// on <4 x i16>.
797	uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
798	return vand_u16(a, b);
799	}
800
801	// CHECK-LABEL: @test_vand_u32(
802	// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
803	// CHECK: ret <2 x i32> [[AND_I]]
// Returns vand_u32(a, b). The directives above expect a plain `and` of a and b
// on <2 x i32>.
804	uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
805	return vand_u32(a, b);
806	}
807
808	// CHECK-LABEL: @test_vand_u64(
809	// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
810	// CHECK: ret <1 x i64> [[AND_I]]
// Returns vand_u64(a, b). The directives above expect a plain `and` of a and b
// on <1 x i64>.
811	uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
812	return vand_u64(a, b);
813	}
814
815	// CHECK-LABEL: @test_vandq_s8(
816	// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
817	// CHECK: ret <16 x i8> [[AND_I]]
// Returns vandq_s8(a, b). The directives above expect a plain `and` of a and b
// on <16 x i8>.
818	int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
819	return vandq_s8(a, b);
820	}
821
822	// CHECK-LABEL: @test_vandq_s16(
823	// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
824	// CHECK: ret <8 x i16> [[AND_I]]
// Returns vandq_s16(a, b). The directives above expect a plain `and` of a and
// b on <8 x i16>.
825	int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
826	return vandq_s16(a, b);
827	}
828
829	// CHECK-LABEL: @test_vandq_s32(
830	// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
831	// CHECK: ret <4 x i32> [[AND_I]]
// Returns vandq_s32(a, b). The directives above expect a plain `and` of a and
// b on <4 x i32>.
832	int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
833	return vandq_s32(a, b);
834	}
835
836	// CHECK-LABEL: @test_vandq_s64(
837	// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
838	// CHECK: ret <2 x i64> [[AND_I]]
// Returns vandq_s64(a, b). The directives above expect a plain `and` of a and
// b on <2 x i64>.
839	int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
840	return vandq_s64(a, b);
841	}
842
843	// CHECK-LABEL: @test_vandq_u8(
844	// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
845	// CHECK: ret <16 x i8> [[AND_I]]
// Returns vandq_u8(a, b). The directives above expect a plain `and` of a and b
// on <16 x i8>.
846	uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
847	return vandq_u8(a, b);
848	}
849
850	// CHECK-LABEL: @test_vandq_u16(
851	// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
852	// CHECK: ret <8 x i16> [[AND_I]]
// Returns vandq_u16(a, b). The directives above expect a plain `and` of a and
// b on <8 x i16>.
853	uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
854	return vandq_u16(a, b);
855	}
856
857	// CHECK-LABEL: @test_vandq_u32(
858	// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
859	// CHECK: ret <4 x i32> [[AND_I]]
// Returns vandq_u32(a, b). The directives above expect a plain `and` of a and
// b on <4 x i32>.
860	uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
861	return vandq_u32(a, b);
862	}
863
864	// CHECK-LABEL: @test_vandq_u64(
865	// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
866	// CHECK: ret <2 x i64> [[AND_I]]
// Returns vandq_u64(a, b). The directives above expect a plain `and` of a and
// b on <2 x i64>.
867	uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
868	return vandq_u64(a, b);
869	}
870
871	// CHECK-LABEL: @test_vbic_s8(
872	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
873	// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
874	// CHECK: ret <8 x i8> [[AND_I]]
// Returns vbic_s8(a, b). The directives above expect b xor'ed with an all -1
// <8 x i8> vector, and that result and'ed with a.
875	int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
876	return vbic_s8(a, b);
877	}
878
879	// CHECK-LABEL: @test_vbic_s16(
880	// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
881	// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
882	// CHECK: ret <4 x i16> [[AND_I]]
// Returns vbic_s16(a, b). The directives above expect b xor'ed with an all -1
// <4 x i16> vector, and that result and'ed with a.
883	int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
884	return vbic_s16(a, b);
885	}
886
887	// CHECK-LABEL: @test_vbic_s32(
888	// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
889	// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
890	// CHECK: ret <2 x i32> [[AND_I]]
// Returns vbic_s32(a, b). The directives above expect b xor'ed with an all -1
// <2 x i32> vector, and that result and'ed with a.
891	int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
892	return vbic_s32(a, b);
893	}
894
895	// CHECK-LABEL: @test_vbic_s64(
896	// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
897	// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
898	// CHECK: ret <1 x i64> [[AND_I]]
// Returns vbic_s64(a, b). The directives above expect b xor'ed with an all -1
// <1 x i64> vector, and that result and'ed with a.
899	int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
900	return vbic_s64(a, b);
901	}
902
903	// CHECK-LABEL: @test_vbic_u8(
904	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
905	// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
906	// CHECK: ret <8 x i8> [[AND_I]]
// Returns vbic_u8(a, b). The directives above expect b xor'ed with an all -1
// <8 x i8> vector, and that result and'ed with a.
907	uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
908	return vbic_u8(a, b);
909	}
910
911	// CHECK-LABEL: @test_vbic_u16(
912	// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
913	// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
914	// CHECK: ret <4 x i16> [[AND_I]]
// Returns vbic_u16(a, b). The directives above expect b xor'ed with an all -1
// <4 x i16> vector, and that result and'ed with a.
915	uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
916	return vbic_u16(a, b);
917	}
918
919	// CHECK-LABEL: @test_vbic_u32(
920	// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
921	// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
922	// CHECK: ret <2 x i32> [[AND_I]]
// Returns vbic_u32(a, b). The directives above expect b xor'ed with an all -1
// <2 x i32> vector, and that result and'ed with a.
923	uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
924	return vbic_u32(a, b);
925	}
926
927	// CHECK-LABEL: @test_vbic_u64(
928	// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
929	// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
930	// CHECK: ret <1 x i64> [[AND_I]]
// Returns vbic_u64(a, b). The directives above expect b xor'ed with an all -1
// <1 x i64> vector, and that result and'ed with a.
931	uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
932	return vbic_u64(a, b);
933	}
934
935	// CHECK-LABEL: @test_vbicq_s8(
936	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
937	// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
938	// CHECK: ret <16 x i8> [[AND_I]]
// Returns vbicq_s8(a, b). The directives above expect b xor'ed with an all -1
// <16 x i8> vector, and that result and'ed with a.
939	int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
940	return vbicq_s8(a, b);
941	}
942
943	// CHECK-LABEL: @test_vbicq_s16(
944	// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
945	// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
946	// CHECK: ret <8 x i16> [[AND_I]]
// Returns vbicq_s16(a, b). The directives above expect b xor'ed with an all -1
// <8 x i16> vector, and that result and'ed with a.
947	int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
948	return vbicq_s16(a, b);
949	}
950
951	// CHECK-LABEL: @test_vbicq_s32(
952	// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
953	// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
954	// CHECK: ret <4 x i32> [[AND_I]]
// Returns vbicq_s32(a, b). The directives above expect b xor'ed with an all -1
// <4 x i32> vector, and that result and'ed with a.
955	int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
956	return vbicq_s32(a, b);
957	}
958
959	// CHECK-LABEL: @test_vbicq_s64(
960	// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
961	// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
962	// CHECK: ret <2 x i64> [[AND_I]]
// Returns vbicq_s64(a, b). The directives above expect b xor'ed with an all -1
// <2 x i64> vector, and that result and'ed with a.
963	int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
964	return vbicq_s64(a, b);
965	}
966
967	// CHECK-LABEL: @test_vbicq_u8(
968	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
969	// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
970	// CHECK: ret <16 x i8> [[AND_I]]
// Returns vbicq_u8(a, b). The directives above expect b xor'ed with an all -1
// <16 x i8> vector, and that result and'ed with a.
971	uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
972	return vbicq_u8(a, b);
973	}
974
975	// CHECK-LABEL: @test_vbicq_u16(
976	// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
977	// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
978	// CHECK: ret <8 x i16> [[AND_I]]
// Returns vbicq_u16(a, b). The directives above expect b xor'ed with an all -1
// <8 x i16> vector, and that result and'ed with a.
979	uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
980	return vbicq_u16(a, b);
981	}
982
983	// CHECK-LABEL: @test_vbicq_u32(
984	// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
985	// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
986	// CHECK: ret <4 x i32> [[AND_I]]
// Returns vbicq_u32(a, b). The directives above expect b xor'ed with an all -1
// <4 x i32> vector, and that result and'ed with a.
987	uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
988	return vbicq_u32(a, b);
989	}
990
991	// CHECK-LABEL: @test_vbicq_u64(
992	// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
993	// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
994	// CHECK: ret <2 x i64> [[AND_I]]
// Returns vbicq_u64(a, b). The directives above expect b xor'ed with an all -1
// <2 x i64> vector, and that result and'ed with a.
995	uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
996	return vbicq_u64(a, b);
997	}
998
999	// CHECK-LABEL: @test_vbsl_s8(
1000	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
1001	// CHECK: ret <8 x i8> [[VBSL_V_I]]
// Returns vbsl_s8(a, b, c). The directives above expect a direct call to
// @llvm.arm.neon.vbsl.v8i8 on a, b and c, with no bitcasts.
1002	int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
1003	return vbsl_s8(a, b, c);
1004	}
1005
1006	// CHECK-LABEL: @test_vbsl_s16(
1007	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1008	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1009	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
1010	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1011	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
1012	// CHECK: ret <4 x i16> [[TMP3]]
// Returns vbsl_s16(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <4 x i16>.
1013	int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
1014	return vbsl_s16(a, b, c);
1015	}
1016
1017	// CHECK-LABEL: @test_vbsl_s32(
1018	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1019	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1020	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
1021	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1022	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
1023	// CHECK: ret <2 x i32> [[TMP3]]
// Returns vbsl_s32(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <2 x i32>.
1024	int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
1025	return vbsl_s32(a, b, c);
1026	}
1027
1028	// CHECK-LABEL: @test_vbsl_s64(
1029	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
1030	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
1031	// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
1032	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1033	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
1034	// CHECK: ret <1 x i64> [[TMP3]]
// Returns vbsl_s64(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <1 x i64>.
1035	int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
1036	return vbsl_s64(a, b, c);
1037	}
1038
1039	// CHECK-LABEL: @test_vbsl_u8(
1040	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
1041	// CHECK: ret <8 x i8> [[VBSL_V_I]]
// Returns vbsl_u8(a, b, c). The directives above expect a direct call to
// @llvm.arm.neon.vbsl.v8i8 on a, b and c, with no bitcasts.
1042	uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
1043	return vbsl_u8(a, b, c);
1044	}
1045
1046	// CHECK-LABEL: @test_vbsl_u16(
1047	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1048	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1049	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
1050	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1051	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
1052	// CHECK: ret <4 x i16> [[TMP3]]
// Returns vbsl_u16(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <4 x i16>.
1053	uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
1054	return vbsl_u16(a, b, c);
1055	}
1056
1057	// CHECK-LABEL: @test_vbsl_u32(
1058	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1059	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1060	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
1061	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1062	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
1063	// CHECK: ret <2 x i32> [[TMP3]]
// Returns vbsl_u32(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <2 x i32>.
1064	uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
1065	return vbsl_u32(a, b, c);
1066	}
1067
1068	// CHECK-LABEL: @test_vbsl_u64(
1069	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
1070	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
1071	// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
1072	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1073	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
1074	// CHECK: ret <1 x i64> [[TMP3]]
// Returns vbsl_u64(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <1 x i64>.
1075	uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
1076	return vbsl_u64(a, b, c);
1077	}
1078
1079	// CHECK-LABEL: @test_vbsl_f32(
1080	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1081	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1082	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
1083	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1084	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
1085	// CHECK: ret <2 x float> [[TMP3]]
// Returns vbsl_f32(a, b, c). The directives above expect the <2 x i32> mask a
// and the <2 x float> operands b and c bitcast to <8 x i8>, a call to
// @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to <2 x float>.
1086	float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
1087	return vbsl_f32(a, b, c);
1088	}
1089
1090	// CHECK-LABEL: @test_vbsl_p8(
1091	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
1092	// CHECK: ret <8 x i8> [[VBSL_V_I]]
// Returns vbsl_p8(a, b, c). The directives above expect a direct call to
// @llvm.arm.neon.vbsl.v8i8 on a, b and c, with no bitcasts.
1093	poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
1094	return vbsl_p8(a, b, c);
1095	}
1096
1097	// CHECK-LABEL: @test_vbsl_p16(
1098	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1099	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1100	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
1101	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
1102	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
1103	// CHECK: ret <4 x i16> [[TMP3]]
// Returns vbsl_p16(a, b, c). The directives above expect a, b and c bitcast to
// <8 x i8>, a call to @llvm.arm.neon.vbsl.v8i8, and the result bitcast back to
// <4 x i16>.
1104	poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
1105	return vbsl_p16(a, b, c);
1106	}
1107
1108	// CHECK-LABEL: @test_vbslq_s8(
1109	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
1110	// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
// Returns vbslq_s8(a, b, c). The directives above expect a direct call to
// @llvm.arm.neon.vbsl.v16i8 on a, b and c, with no bitcasts.
1111	int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
1112	return vbslq_s8(a, b, c);
1113	}
1114
1115	// CHECK-LABEL: @test_vbslq_s16(
1116	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1117	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
1118	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
1119	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1120	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
1121	// CHECK: ret <8 x i16> [[TMP3]]
// Returns vbslq_s16(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <8 x i16>.
1122	int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
1123	return vbslq_s16(a, b, c);
1124	}
1125
1126	// CHECK-LABEL: @test_vbslq_s32(
1127	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1128	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
1129	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
1130	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1131	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
1132	// CHECK: ret <4 x i32> [[TMP3]]
// Returns vbslq_s32(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <4 x i32>.
1133	int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
1134	return vbslq_s32(a, b, c);
1135	}
1136
1137	// CHECK-LABEL: @test_vbslq_s64(
1138	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1139	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
1140	// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
1141	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1142	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
1143	// CHECK: ret <2 x i64> [[TMP3]]
// Returns vbslq_s64(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <2 x i64>.
1144	int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
1145	return vbslq_s64(a, b, c);
1146	}
1147
1148	// CHECK-LABEL: @test_vbslq_u8(
1149	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
1150	// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
// Returns vbslq_u8(a, b, c). The directives above expect a direct call to
// @llvm.arm.neon.vbsl.v16i8 on a, b and c, with no bitcasts.
1151	uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
1152	return vbslq_u8(a, b, c);
1153	}
1154
1155	// CHECK-LABEL: @test_vbslq_u16(
1156	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1157	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
1158	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
1159	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1160	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
1161	// CHECK: ret <8 x i16> [[TMP3]]
// Returns vbslq_u16(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <8 x i16>.
1162	uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
1163	return vbslq_u16(a, b, c);
1164	}
1165
1166	// CHECK-LABEL: @test_vbslq_u32(
1167	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1168	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
1169	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
1170	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1171	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
1172	// CHECK: ret <4 x i32> [[TMP3]]
// Returns vbslq_u32(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <4 x i32>.
1173	uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
1174	return vbslq_u32(a, b, c);
1175	}
1176
1177	// CHECK-LABEL: @test_vbslq_u64(
1178	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1179	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
1180	// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
1181	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1182	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
1183	// CHECK: ret <2 x i64> [[TMP3]]
// Returns vbslq_u64(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <2 x i64>.
1184	uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
1185	return vbslq_u64(a, b, c);
1186	}
1187
1188	// CHECK-LABEL: @test_vbslq_f32(
1189	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1190	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1191	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
1192	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1193	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
1194	// CHECK: ret <4 x float> [[TMP3]]
// Returns vbslq_f32(a, b, c). The directives above expect the <4 x i32> mask a
// and the <4 x float> operands b and c bitcast to <16 x i8>, a call to
// @llvm.arm.neon.vbsl.v16i8, and the result bitcast back to <4 x float>.
1195	float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
1196	return vbslq_f32(a, b, c);
1197	}
1198
1199	// CHECK-LABEL: @test_vbslq_p8(
1200	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
1201	// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
// Returns vbslq_p8(a, b, c). The directives above expect a direct call to
// @llvm.arm.neon.vbsl.v16i8 on a, b and c, with no bitcasts.
1202	poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
1203	return vbslq_p8(a, b, c);
1204	}
1205
1206	// CHECK-LABEL: @test_vbslq_p16(
1207	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1208	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
1209	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
1210	// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
1211	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
1212	// CHECK: ret <8 x i16> [[TMP3]]
// Returns vbslq_p16(a, b, c). The directives above expect a, b and c bitcast
// to <16 x i8>, a call to @llvm.arm.neon.vbsl.v16i8, and the result bitcast
// back to <8 x i16>.
1213	poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
1214	return vbslq_p16(a, b, c);
1215	}
1216
1217	// CHECK-LABEL: @test_vcage_f32(
1218	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1219	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1220	// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
1221	// CHECK: ret <2 x i32> [[VCAGE_V2_I]]
// Returns vcage_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacge.v2i32.v2f32 with operands in source order (a, b).
1222	uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
1223	return vcage_f32(a, b);
1224	}
1225
1226	// CHECK-LABEL: @test_vcageq_f32(
1227	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1228	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1229	// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
1230	// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]]
// Returns vcageq_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacge.v4i32.v4f32 with operands in source order (a, b).
1231	uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
1232	return vcageq_f32(a, b);
1233	}
1234
1235	// CHECK-LABEL: @test_vcagt_f32(
1236	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1237	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1238	// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
1239	// CHECK: ret <2 x i32> [[VCAGT_V2_I]]
// Returns vcagt_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacgt.v2i32.v2f32 with operands in source order (a, b).
1240	uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
1241	return vcagt_f32(a, b);
1242	}
1243
1244	// CHECK-LABEL: @test_vcagtq_f32(
1245	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1246	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1247	// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
1248	// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]]
// Returns vcagtq_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacgt.v4i32.v4f32 with operands in source order (a, b).
1249	uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
1250	return vcagtq_f32(a, b);
1251	}
1252
1253	// CHECK-LABEL: @test_vcale_f32(
1254	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1255	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1256	// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
1257	// CHECK: ret <2 x i32> [[VCALE_V2_I]]
// Returns vcale_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacge.v2i32.v2f32 with the operands swapped: (b, a).
1258	uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
1259	return vcale_f32(a, b);
1260	}
1261
1262	// CHECK-LABEL: @test_vcaleq_f32(
1263	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1264	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1265	// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
1266	// CHECK: ret <4 x i32> [[VCALEQ_V2_I]]
// Returns vcaleq_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacge.v4i32.v4f32 with the operands swapped: (b, a).
1267	uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
1268	return vcaleq_f32(a, b);
1269	}
1270
1271	// CHECK-LABEL: @test_vcalt_f32(
1272	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1273	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1274	// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
1275	// CHECK: ret <2 x i32> [[VCALT_V2_I]]
// Returns vcalt_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacgt.v2i32.v2f32 with the operands swapped: (b, a).
1276	uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
1277	return vcalt_f32(a, b);
1278	}
1279
1280	// CHECK-LABEL: @test_vcaltq_f32(
1281	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1282	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1283	// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
1284	// CHECK: ret <4 x i32> [[VCALTQ_V2_I]]
// Returns vcaltq_f32(a, b). The directives above expect a call to
// @llvm.arm.neon.vacgt.v4i32.v4f32 with the operands swapped: (b, a).
1285	uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
1286	return vcaltq_f32(a, b);
1287	}
1288
1289	// CHECK-LABEL: @test_vceq_s8(
1290	// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
1291	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1292	// CHECK: ret <8 x i8> [[SEXT_I]]
// Returns vceq_s8(a, b). The directives above expect `icmp eq` on <8 x i8>,
// with the <8 x i1> result sign-extended to <8 x i8>.
1293	uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
1294	return vceq_s8(a, b);
1295	}
1296
1297	// CHECK-LABEL: @test_vceq_s16(
1298	// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
1299	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1300	// CHECK: ret <4 x i16> [[SEXT_I]]
// Returns vceq_s16(a, b). The directives above expect `icmp eq` on <4 x i16>,
// with the <4 x i1> result sign-extended to <4 x i16>.
1301	uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
1302	return vceq_s16(a, b);
1303	}
1304
1305	// CHECK-LABEL: @test_vceq_s32(
1306	// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
1307	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1308	// CHECK: ret <2 x i32> [[SEXT_I]]
// Returns vceq_s32(a, b). The directives above expect `icmp eq` on <2 x i32>,
// with the <2 x i1> result sign-extended to <2 x i32>.
1309	uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
1310	return vceq_s32(a, b);
1311	}
1312
1313	// CHECK-LABEL: @test_vceq_f32(
1314	// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
1315	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1316	// CHECK: ret <2 x i32> [[SEXT_I]]
// Returns vceq_f32(a, b). The directives above expect `fcmp oeq` on
// <2 x float>, with the <2 x i1> result sign-extended to <2 x i32>.
1317	uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
1318	return vceq_f32(a, b);
1319	}
1320
1321	// CHECK-LABEL: @test_vceq_u8(
1322	// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
1323	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1324	// CHECK: ret <8 x i8> [[SEXT_I]]
// Returns vceq_u8(a, b). The directives above expect `icmp eq` on <8 x i8>,
// with the <8 x i1> result sign-extended to <8 x i8>.
1325	uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
1326	return vceq_u8(a, b);
1327	}
1328
1329	// CHECK-LABEL: @test_vceq_u16(
1330	// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
1331	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1332	// CHECK: ret <4 x i16> [[SEXT_I]]
// Returns vceq_u16(a, b). The directives above expect `icmp eq` on <4 x i16>,
// with the <4 x i1> result sign-extended to <4 x i16>.
1333	uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
1334	return vceq_u16(a, b);
1335	}
1336
1337	// CHECK-LABEL: @test_vceq_u32(
1338	// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
1339	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1340	// CHECK: ret <2 x i32> [[SEXT_I]]
// Returns vceq_u32(a, b). The directives above expect `icmp eq` on <2 x i32>,
// with the <2 x i1> result sign-extended to <2 x i32>.
1341	uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
1342	return vceq_u32(a, b);
1343	}
1344
1345	// CHECK-LABEL: @test_vceq_p8(
1346	// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
1347	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1348	// CHECK: ret <8 x i8> [[SEXT_I]]
// Returns vceq_p8(a, b). The directives above expect `icmp eq` on <8 x i8>,
// with the <8 x i1> result sign-extended to <8 x i8>.
1349	uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
1350	return vceq_p8(a, b);
1351	}
1352
1353	// CHECK-LABEL: @test_vceqq_s8(
1354	// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
1355	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1356	// CHECK: ret <16 x i8> [[SEXT_I]]
// Returns vceqq_s8(a, b). The directives above expect `icmp eq` on <16 x i8>,
// with the <16 x i1> result sign-extended to <16 x i8>.
1357	uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
1358	return vceqq_s8(a, b);
1359	}
1360
1361	// CHECK-LABEL: @test_vceqq_s16(
1362	// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
1363	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1364	// CHECK: ret <8 x i16> [[SEXT_I]]
// Returns vceqq_s16(a, b). The directives above expect `icmp eq` on <8 x i16>,
// with the <8 x i1> result sign-extended to <8 x i16>.
1365	uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
1366	return vceqq_s16(a, b);
1367	}
1368
1369	// CHECK-LABEL: @test_vceqq_s32(
1370	// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
1371	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1372	// CHECK: ret <4 x i32> [[SEXT_I]]
// Returns vceqq_s32(a, b). The directives above expect `icmp eq` on <4 x i32>,
// with the <4 x i1> result sign-extended to <4 x i32>.
1373	uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
1374	return vceqq_s32(a, b);
1375	}
1376
1377	// CHECK-LABEL: @test_vceqq_f32(
1378	// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
1379	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1380	// CHECK: ret <4 x i32> [[SEXT_I]]
// Returns vceqq_f32(a, b). The directives above expect `fcmp oeq` on
// <4 x float>, with the <4 x i1> result sign-extended to <4 x i32>.
1381	uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
1382	return vceqq_f32(a, b);
1383	}
1384
1385	// CHECK-LABEL: @test_vceqq_u8(
1386	// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
1387	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1388	// CHECK: ret <16 x i8> [[SEXT_I]]
// Returns vceqq_u8(a, b). The directives above expect `icmp eq` on <16 x i8>,
// with the <16 x i1> result sign-extended to <16 x i8>.
1389	uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
1390	return vceqq_u8(a, b);
1391	}
1392
1393	// CHECK-LABEL: @test_vceqq_u16(
1394	// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
1395	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1396	// CHECK: ret <8 x i16> [[SEXT_I]]
// Returns vceqq_u16(a, b). The directives above expect `icmp eq` on <8 x i16>,
// with the <8 x i1> result sign-extended to <8 x i16>.
1397	uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
1398	return vceqq_u16(a, b);
1399	}
1400
1401	// CHECK-LABEL: @test_vceqq_u32(
1402	// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
1403	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1404	// CHECK: ret <4 x i32> [[SEXT_I]]
// Returns vceqq_u32(a, b). The directives above expect `icmp eq` on <4 x i32>,
// with the <4 x i1> result sign-extended to <4 x i32>.
1405	uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
1406	return vceqq_u32(a, b);
1407	}
1408
1409	// CHECK-LABEL: @test_vceqq_p8(
1410	// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
1411	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1412	// CHECK: ret <16 x i8> [[SEXT_I]]
// Returns vceqq_p8(a, b). The directives above expect `icmp eq` on <16 x i8>,
// with the <16 x i1> result sign-extended to <16 x i8>.
1413	uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
1414	return vceqq_p8(a, b);
1415	}
1416
1417	// CHECK-LABEL: @test_vcge_s8(
1418	// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
1419	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1420	// CHECK: ret <8 x i8> [[SEXT_I]]
// Returns vcge_s8(a, b). The directives above expect a signed `icmp sge` on
// <8 x i8>, with the <8 x i1> result sign-extended to <8 x i8>.
1421	uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
1422	return vcge_s8(a, b);
1423	}
1424
1425	// CHECK-LABEL: @test_vcge_s16(
1426	// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
1427	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1428	// CHECK: ret <4 x i16> [[SEXT_I]]
// Returns vcge_s16(a, b). The directives above expect a signed `icmp sge` on
// <4 x i16>, with the <4 x i1> result sign-extended to <4 x i16>.
1429	uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
1430	return vcge_s16(a, b);
1431	}
1432
1433	// CHECK-LABEL: @test_vcge_s32(
1434	// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
1435	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1436	// CHECK: ret <2 x i32> [[SEXT_I]]
// Returns vcge_s32(a, b). The directives above expect a signed `icmp sge` on
// <2 x i32>, with the <2 x i1> result sign-extended to <2 x i32>.
1437	uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
1438	return vcge_s32(a, b);
1439	}
1440
1441	// CHECK-LABEL: @test_vcge_f32(
1442	// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
1443	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1444	// CHECK: ret <2 x i32> [[SEXT_I]]
// Returns vcge_f32(a, b). The directives above expect `fcmp oge` on
// <2 x float>, with the <2 x i1> result sign-extended to <2 x i32>.
1445	uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
1446	return vcge_f32(a, b);
1447	}
1448
1449	// CHECK-LABEL: @test_vcge_u8(
1450	// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
1451	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1452	// CHECK: ret <8 x i8> [[SEXT_I]]
// Returns vcge_u8(a, b). The directives above expect an unsigned `icmp uge` on
// <8 x i8>, with the <8 x i1> result sign-extended to <8 x i8>.
1453	uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
1454	return vcge_u8(a, b);
1455	}
1456
1457	// CHECK-LABEL: @test_vcge_u16(
1458	// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
1459	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1460	// CHECK: ret <4 x i16> [[SEXT_I]]
// Returns vcge_u16(a, b). The directives above expect an unsigned `icmp uge`
// on <4 x i16>, with the <4 x i1> result sign-extended to <4 x i16>.
1461	uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
1462	return vcge_u16(a, b);
1463	}
1464
1465	// CHECK-LABEL: @test_vcge_u32(
1466	// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
1467	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1468	// CHECK: ret <2 x i32> [[SEXT_I]]
// Returns vcge_u32(a, b). The directives above expect an unsigned `icmp uge`
// on <2 x i32>, with the <2 x i1> result sign-extended to <2 x i32>.
1469	uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
1470	return vcge_u32(a, b);
1471	}
1472
1473	// CHECK-LABEL: @test_vcgeq_s8(
1474	// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
1475	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1476	// CHECK: ret <16 x i8> [[SEXT_I]]
// Returns vcgeq_s8(a, b). The directives above expect a signed `icmp sge` on
// <16 x i8>, with the <16 x i1> result sign-extended to <16 x i8>.
1477	uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
1478	return vcgeq_s8(a, b);
1479	}
1480
1481	// CHECK-LABEL: @test_vcgeq_s16(
1482	// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
1483	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1484	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcgeq_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sge` on <8 x i16>, then sext to <8 x i16>.
1485	uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
1486	return vcgeq_s16(a, b);
1487	}
1488
1489	// CHECK-LABEL: @test_vcgeq_s32(
1490	// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
1491	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1492	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcgeq_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sge` on <4 x i32>, then sext to <4 x i32>.
1493	uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
1494	return vcgeq_s32(a, b);
1495	}
1496
1497	// CHECK-LABEL: @test_vcgeq_f32(
1498	// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
1499	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1500	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcgeq_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp oge` on <4 x float>, then sext to <4 x i32>.
1501	uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
1502	return vcgeq_f32(a, b);
1503	}
1504
1505	// CHECK-LABEL: @test_vcgeq_u8(
1506	// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
1507	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1508	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcgeq_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp uge` on <16 x i8>, then sext to <16 x i8>.
1509	uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
1510	return vcgeq_u8(a, b);
1511	}
1512
1513	// CHECK-LABEL: @test_vcgeq_u16(
1514	// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
1515	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1516	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcgeq_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp uge` on <8 x i16>, then sext to <8 x i16>.
1517	uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
1518	return vcgeq_u16(a, b);
1519	}
1520
1521	// CHECK-LABEL: @test_vcgeq_u32(
1522	// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
1523	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1524	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcgeq_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp uge` on <4 x i32>, then sext to <4 x i32>.
1525	uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
1526	return vcgeq_u32(a, b);
1527	}
1528
1529	// CHECK-LABEL: @test_vcgt_s8(
1530	// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
1531	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1532	// CHECK: ret <8 x i8> [[SEXT_I]]
// Codegen test for vcgt_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sgt` on <8 x i8>, then sext to <8 x i8>.
1533	uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
1534	return vcgt_s8(a, b);
1535	}
1536
1537	// CHECK-LABEL: @test_vcgt_s16(
1538	// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
1539	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1540	// CHECK: ret <4 x i16> [[SEXT_I]]
// Codegen test for vcgt_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sgt` on <4 x i16>, then sext to <4 x i16>.
1541	uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
1542	return vcgt_s16(a, b);
1543	}
1544
1545	// CHECK-LABEL: @test_vcgt_s32(
1546	// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
1547	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1548	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vcgt_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sgt` on <2 x i32>, then sext to <2 x i32>.
1549	uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
1550	return vcgt_s32(a, b);
1551	}
1552
1553	// CHECK-LABEL: @test_vcgt_f32(
1554	// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
1555	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1556	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vcgt_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp ogt` on <2 x float>, then sext to <2 x i32>.
1557	uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
1558	return vcgt_f32(a, b);
1559	}
1560
1561	// CHECK-LABEL: @test_vcgt_u8(
1562	// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
1563	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1564	// CHECK: ret <8 x i8> [[SEXT_I]]
// Codegen test for vcgt_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ugt` on <8 x i8>, then sext to <8 x i8>.
1565	uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
1566	return vcgt_u8(a, b);
1567	}
1568
1569	// CHECK-LABEL: @test_vcgt_u16(
1570	// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
1571	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1572	// CHECK: ret <4 x i16> [[SEXT_I]]
// Codegen test for vcgt_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ugt` on <4 x i16>, then sext to <4 x i16>.
1573	uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
1574	return vcgt_u16(a, b);
1575	}
1576
1577	// CHECK-LABEL: @test_vcgt_u32(
1578	// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
1579	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1580	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vcgt_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ugt` on <2 x i32>, then sext to <2 x i32>.
1581	uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
1582	return vcgt_u32(a, b);
1583	}
1584
1585	// CHECK-LABEL: @test_vcgtq_s8(
1586	// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
1587	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1588	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcgtq_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sgt` on <16 x i8>, then sext to <16 x i8>.
1589	uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
1590	return vcgtq_s8(a, b);
1591	}
1592
1593	// CHECK-LABEL: @test_vcgtq_s16(
1594	// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
1595	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1596	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcgtq_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sgt` on <8 x i16>, then sext to <8 x i16>.
1597	uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
1598	return vcgtq_s16(a, b);
1599	}
1600
1601	// CHECK-LABEL: @test_vcgtq_s32(
1602	// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
1603	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1604	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcgtq_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sgt` on <4 x i32>, then sext to <4 x i32>.
1605	uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
1606	return vcgtq_s32(a, b);
1607	}
1608
1609	// CHECK-LABEL: @test_vcgtq_f32(
1610	// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
1611	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1612	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcgtq_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp ogt` on <4 x float>, then sext to <4 x i32>.
1613	uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
1614	return vcgtq_f32(a, b);
1615	}
1616
1617	// CHECK-LABEL: @test_vcgtq_u8(
1618	// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
1619	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1620	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcgtq_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ugt` on <16 x i8>, then sext to <16 x i8>.
1621	uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
1622	return vcgtq_u8(a, b);
1623	}
1624
1625	// CHECK-LABEL: @test_vcgtq_u16(
1626	// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
1627	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1628	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcgtq_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ugt` on <8 x i16>, then sext to <8 x i16>.
1629	uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
1630	return vcgtq_u16(a, b);
1631	}
1632
1633	// CHECK-LABEL: @test_vcgtq_u32(
1634	// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
1635	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1636	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcgtq_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ugt` on <4 x i32>, then sext to <4 x i32>.
1637	uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
1638	return vcgtq_u32(a, b);
1639	}
1640
1641	// CHECK-LABEL: @test_vcle_s8(
1642	// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
1643	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1644	// CHECK: ret <8 x i8> [[SEXT_I]]
// Codegen test for vcle_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sle` on <8 x i8>, then sext to <8 x i8>.
1645	uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
1646	return vcle_s8(a, b);
1647	}
1648
1649	// CHECK-LABEL: @test_vcle_s16(
1650	// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
1651	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1652	// CHECK: ret <4 x i16> [[SEXT_I]]
// Codegen test for vcle_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sle` on <4 x i16>, then sext to <4 x i16>.
1653	uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
1654	return vcle_s16(a, b);
1655	}
1656
1657	// CHECK-LABEL: @test_vcle_s32(
1658	// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
1659	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1660	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vcle_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sle` on <2 x i32>, then sext to <2 x i32>.
1661	uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
1662	return vcle_s32(a, b);
1663	}
1664
1665	// CHECK-LABEL: @test_vcle_f32(
1666	// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
1667	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1668	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vcle_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp ole` on <2 x float>, then sext to <2 x i32>.
1669	uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
1670	return vcle_f32(a, b);
1671	}
1672
1673	// CHECK-LABEL: @test_vcle_u8(
1674	// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
1675	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1676	// CHECK: ret <8 x i8> [[SEXT_I]]
// Codegen test for vcle_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ule` on <8 x i8>, then sext to <8 x i8>.
1677	uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
1678	return vcle_u8(a, b);
1679	}
1680
1681	// CHECK-LABEL: @test_vcle_u16(
1682	// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
1683	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1684	// CHECK: ret <4 x i16> [[SEXT_I]]
// Codegen test for vcle_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ule` on <4 x i16>, then sext to <4 x i16>.
1685	uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
1686	return vcle_u16(a, b);
1687	}
1688
1689	// CHECK-LABEL: @test_vcle_u32(
1690	// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
1691	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1692	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vcle_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ule` on <2 x i32>, then sext to <2 x i32>.
1693	uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
1694	return vcle_u32(a, b);
1695	}
1696
1697	// CHECK-LABEL: @test_vcleq_s8(
1698	// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
1699	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1700	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcleq_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sle` on <16 x i8>, then sext to <16 x i8>.
1701	uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
1702	return vcleq_s8(a, b);
1703	}
1704
1705	// CHECK-LABEL: @test_vcleq_s16(
1706	// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
1707	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1708	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcleq_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sle` on <8 x i16>, then sext to <8 x i16>.
1709	uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
1710	return vcleq_s16(a, b);
1711	}
1712
1713	// CHECK-LABEL: @test_vcleq_s32(
1714	// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
1715	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1716	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcleq_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp sle` on <4 x i32>, then sext to <4 x i32>.
1717	uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
1718	return vcleq_s32(a, b);
1719	}
1720
1721	// CHECK-LABEL: @test_vcleq_f32(
1722	// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
1723	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1724	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcleq_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp ole` on <4 x float>, then sext to <4 x i32>.
1725	uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
1726	return vcleq_f32(a, b);
1727	}
1728
1729	// CHECK-LABEL: @test_vcleq_u8(
1730	// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
1731	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1732	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcleq_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ule` on <16 x i8>, then sext to <16 x i8>.
1733	uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
1734	return vcleq_u8(a, b);
1735	}
1736
1737	// CHECK-LABEL: @test_vcleq_u16(
1738	// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
1739	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1740	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcleq_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ule` on <8 x i16>, then sext to <8 x i16>.
1741	uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
1742	return vcleq_u16(a, b);
1743	}
1744
1745	// CHECK-LABEL: @test_vcleq_u32(
1746	// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
1747	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1748	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcleq_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ule` on <4 x i32>, then sext to <4 x i32>.
1749	uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
1750	return vcleq_u32(a, b);
1751	}
1752
1753	// CHECK-LABEL: @test_vcls_s8(
1754	// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
1755	// CHECK: ret <8 x i8> [[VCLS_V_I]]
// Codegen test for vcls_s8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcls.v8i8 on %a.
1756	int8x8_t test_vcls_s8(int8x8_t a) {
1757	return vcls_s8(a);
1758	}
1759
1760	// CHECK-LABEL: @test_vcls_s16(
1761	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1762	// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
1763	// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
1764	// CHECK: ret <4 x i16> [[VCLS_V1_I]]
// Codegen test for vcls_s16: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcls.v4i16 on %a, with
// bitcasts of the argument and of the call result to <8 x i8> around it.
1765	int16x4_t test_vcls_s16(int16x4_t a) {
1766	return vcls_s16(a);
1767	}
1768
1769	// CHECK-LABEL: @test_vcls_s32(
1770	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1771	// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
1772	// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
1773	// CHECK: ret <2 x i32> [[VCLS_V1_I]]
// Codegen test for vcls_s32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcls.v2i32 on %a, with
// bitcasts of the argument and of the call result to <8 x i8> around it.
1774	int32x2_t test_vcls_s32(int32x2_t a) {
1775	return vcls_s32(a);
1776	}
1777
1778	// CHECK-LABEL: @test_vclsq_s8(
1779	// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
1780	// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
// Codegen test for vclsq_s8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcls.v16i8 on %a.
1781	int8x16_t test_vclsq_s8(int8x16_t a) {
1782	return vclsq_s8(a);
1783	}
1784
1785	// CHECK-LABEL: @test_vclsq_s16(
1786	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1787	// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
1788	// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
1789	// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
// Codegen test for vclsq_s16: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcls.v8i16 on %a, with
// bitcasts of the argument and of the call result to <16 x i8> around it.
1790	int16x8_t test_vclsq_s16(int16x8_t a) {
1791	return vclsq_s16(a);
1792	}
1793
1794	// CHECK-LABEL: @test_vclsq_s32(
1795	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1796	// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
1797	// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
1798	// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
// Codegen test for vclsq_s32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcls.v4i32 on %a, with
// bitcasts of the argument and of the call result to <16 x i8> around it.
1799	int32x4_t test_vclsq_s32(int32x4_t a) {
1800	return vclsq_s32(a);
1801	}
1802
1803	// CHECK-LABEL: @test_vclt_s8(
1804	// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
1805	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1806	// CHECK: ret <8 x i8> [[SEXT_I]]
// Codegen test for vclt_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp slt` on <8 x i8>, then sext to <8 x i8>.
1807	uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
1808	return vclt_s8(a, b);
1809	}
1810
1811	// CHECK-LABEL: @test_vclt_s16(
1812	// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
1813	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1814	// CHECK: ret <4 x i16> [[SEXT_I]]
// Codegen test for vclt_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp slt` on <4 x i16>, then sext to <4 x i16>.
1815	uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
1816	return vclt_s16(a, b);
1817	}
1818
1819	// CHECK-LABEL: @test_vclt_s32(
1820	// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
1821	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1822	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vclt_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp slt` on <2 x i32>, then sext to <2 x i32>.
1823	uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
1824	return vclt_s32(a, b);
1825	}
1826
1827	// CHECK-LABEL: @test_vclt_f32(
1828	// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
1829	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1830	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vclt_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp olt` on <2 x float>, then sext to <2 x i32>.
1831	uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
1832	return vclt_f32(a, b);
1833	}
1834
1835	// CHECK-LABEL: @test_vclt_u8(
1836	// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
1837	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
1838	// CHECK: ret <8 x i8> [[SEXT_I]]
// Codegen test for vclt_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ult` on <8 x i8>, then sext to <8 x i8>.
1839	uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
1840	return vclt_u8(a, b);
1841	}
1842
1843	// CHECK-LABEL: @test_vclt_u16(
1844	// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
1845	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
1846	// CHECK: ret <4 x i16> [[SEXT_I]]
// Codegen test for vclt_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ult` on <4 x i16>, then sext to <4 x i16>.
1847	uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
1848	return vclt_u16(a, b);
1849	}
1850
1851	// CHECK-LABEL: @test_vclt_u32(
1852	// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
1853	// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
1854	// CHECK: ret <2 x i32> [[SEXT_I]]
// Codegen test for vclt_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ult` on <2 x i32>, then sext to <2 x i32>.
1855	uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
1856	return vclt_u32(a, b);
1857	}
1858
1859	// CHECK-LABEL: @test_vcltq_s8(
1860	// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
1861	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1862	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcltq_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp slt` on <16 x i8>, then sext to <16 x i8>.
1863	uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
1864	return vcltq_s8(a, b);
1865	}
1866
1867	// CHECK-LABEL: @test_vcltq_s16(
1868	// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
1869	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1870	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcltq_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp slt` on <8 x i16>, then sext to <8 x i16>.
1871	uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
1872	return vcltq_s16(a, b);
1873	}
1874
1875	// CHECK-LABEL: @test_vcltq_s32(
1876	// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
1877	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1878	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcltq_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp slt` on <4 x i32>, then sext to <4 x i32>.
1879	uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
1880	return vcltq_s32(a, b);
1881	}
1882
1883	// CHECK-LABEL: @test_vcltq_f32(
1884	// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
1885	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1886	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcltq_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `fcmp olt` on <4 x float>, then sext to <4 x i32>.
1887	uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
1888	return vcltq_f32(a, b);
1889	}
1890
1891	// CHECK-LABEL: @test_vcltq_u8(
1892	// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
1893	// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
1894	// CHECK: ret <16 x i8> [[SEXT_I]]
// Codegen test for vcltq_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ult` on <16 x i8>, then sext to <16 x i8>.
1895	uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
1896	return vcltq_u8(a, b);
1897	}
1898
1899	// CHECK-LABEL: @test_vcltq_u16(
1900	// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
1901	// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
1902	// CHECK: ret <8 x i16> [[SEXT_I]]
// Codegen test for vcltq_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ult` on <8 x i16>, then sext to <8 x i16>.
1903	uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
1904	return vcltq_u16(a, b);
1905	}
1906
1907	// CHECK-LABEL: @test_vcltq_u32(
1908	// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
1909	// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
1910	// CHECK: ret <4 x i32> [[SEXT_I]]
// Codegen test for vcltq_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): `icmp ult` on <4 x i32>, then sext to <4 x i32>.
1911	uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
1912	return vcltq_u32(a, b);
1913	}
1914
1915	// CHECK-LABEL: @test_vclz_s8(
1916	// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
1917	// CHECK: ret <8 x i8> [[VCLZ_V_I]]
// Codegen test for vclz_s8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v8i8 on %a with `i1 false`.
1918	int8x8_t test_vclz_s8(int8x8_t a) {
1919	return vclz_s8(a);
1920	}
1921
1922	// CHECK-LABEL: @test_vclz_s16(
1923	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1924	// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
1925	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
1926	// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
// Codegen test for vclz_s16: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v4i16 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <8 x i8> around it.
1927	int16x4_t test_vclz_s16(int16x4_t a) {
1928	return vclz_s16(a);
1929	}
1930
1931	// CHECK-LABEL: @test_vclz_s32(
1932	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1933	// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
1934	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
1935	// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
// Codegen test for vclz_s32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v2i32 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <8 x i8> around it.
1936	int32x2_t test_vclz_s32(int32x2_t a) {
1937	return vclz_s32(a);
1938	}
1939
1940	// CHECK-LABEL: @test_vclz_u8(
1941	// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
1942	// CHECK: ret <8 x i8> [[VCLZ_V_I]]
// Codegen test for vclz_u8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v8i8 on %a with `i1 false`.
1943	uint8x8_t test_vclz_u8(uint8x8_t a) {
1944	return vclz_u8(a);
1945	}
1946
1947	// CHECK-LABEL: @test_vclz_u16(
1948	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1949	// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
1950	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
1951	// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
// Codegen test for vclz_u16: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v4i16 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <8 x i8> around it.
1952	uint16x4_t test_vclz_u16(uint16x4_t a) {
1953	return vclz_u16(a);
1954	}
1955
1956	// CHECK-LABEL: @test_vclz_u32(
1957	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1958	// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
1959	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
1960	// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
// Codegen test for vclz_u32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v2i32 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <8 x i8> around it.
1961	uint32x2_t test_vclz_u32(uint32x2_t a) {
1962	return vclz_u32(a);
1963	}
1964
1965	// CHECK-LABEL: @test_vclzq_s8(
1966	// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
1967	// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
// Codegen test for vclzq_s8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v16i8 on %a with `i1 false`.
1968	int8x16_t test_vclzq_s8(int8x16_t a) {
1969	return vclzq_s8(a);
1970	}
1971
1972	// CHECK-LABEL: @test_vclzq_s16(
1973	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1974	// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
1975	// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
1976	// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
// Codegen test for vclzq_s16: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v8i16 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <16 x i8> around it.
1977	int16x8_t test_vclzq_s16(int16x8_t a) {
1978	return vclzq_s16(a);
1979	}
1980
1981	// CHECK-LABEL: @test_vclzq_s32(
1982	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1983	// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
1984	// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
1985	// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
// Codegen test for vclzq_s32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v4i32 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <16 x i8> around it.
1986	int32x4_t test_vclzq_s32(int32x4_t a) {
1987	return vclzq_s32(a);
1988	}
1989
1990	// CHECK-LABEL: @test_vclzq_u8(
1991	// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
1992	// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
// Codegen test for vclzq_u8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v16i8 on %a with `i1 false`.
1993	uint8x16_t test_vclzq_u8(uint8x16_t a) {
1994	return vclzq_u8(a);
1995	}
1996
1997	// CHECK-LABEL: @test_vclzq_u16(
1998	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1999	// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
2000	// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
2001	// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
// Codegen test for vclzq_u16: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v8i16 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <16 x i8> around it.
2002	uint16x8_t test_vclzq_u16(uint16x8_t a) {
2003	return vclzq_u16(a);
2004	}
2005
2006	// CHECK-LABEL: @test_vclzq_u32(
2007	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2008	// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
2009	// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
2010	// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
// Codegen test for vclzq_u32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctlz.v4i32 on %a with `i1 false`,
// with bitcasts of the argument and of the call result to <16 x i8> around it.
2011	uint32x4_t test_vclzq_u32(uint32x4_t a) {
2012	return vclzq_u32(a);
2013	}
2014
2015	// CHECK-LABEL: @test_vcnt_u8(
2016	// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
2017	// CHECK: ret <8 x i8> [[VCNT_V_I]]
// Codegen test for vcnt_u8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctpop.v8i8 on %a.
2018	uint8x8_t test_vcnt_u8(uint8x8_t a) {
2019	return vcnt_u8(a);
2020	}
2021
2022	// CHECK-LABEL: @test_vcnt_s8(
2023	// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
2024	// CHECK: ret <8 x i8> [[VCNT_V_I]]
// Codegen test for vcnt_s8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctpop.v8i8 on %a.
2025	int8x8_t test_vcnt_s8(int8x8_t a) {
2026	return vcnt_s8(a);
2027	}
2028
2029	// CHECK-LABEL: @test_vcnt_p8(
2030	// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
2031	// CHECK: ret <8 x i8> [[VCNT_V_I]]
// Codegen test for vcnt_p8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctpop.v8i8 on %a.
2032	poly8x8_t test_vcnt_p8(poly8x8_t a) {
2033	return vcnt_p8(a);
2034	}
2035
2036	// CHECK-LABEL: @test_vcntq_u8(
2037	// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
2038	// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
// Codegen test for vcntq_u8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctpop.v16i8 on %a.
2039	uint8x16_t test_vcntq_u8(uint8x16_t a) {
2040	return vcntq_u8(a);
2041	}
2042
2043	// CHECK-LABEL: @test_vcntq_s8(
2044	// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
2045	// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
// Codegen test for vcntq_s8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctpop.v16i8 on %a.
2046	int8x16_t test_vcntq_s8(int8x16_t a) {
2047	return vcntq_s8(a);
2048	}
2049
2050	// CHECK-LABEL: @test_vcntq_p8(
2051	// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
2052	// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
// Codegen test for vcntq_p8: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.ctpop.v16i8 on %a.
2053	poly8x16_t test_vcntq_p8(poly8x16_t a) {
2054	return vcntq_p8(a);
2055	}
2056
2057	// CHECK-LABEL: @test_vcombine_s8(
2058	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2059	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Codegen test for vcombine_s8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 15
// in order, producing <16 x i8>.
2060	int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
2061	return vcombine_s8(a, b);
2062	}
2063
2064	// CHECK-LABEL: @test_vcombine_s16(
2065	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2066	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Codegen test for vcombine_s16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 7
// in order, producing <8 x i16>.
2067	int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
2068	return vcombine_s16(a, b);
2069	}
2070
2071	// CHECK-LABEL: @test_vcombine_s32(
2072	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2073	// CHECK: ret <4 x i32> [[SHUFFLE_I]]
// Codegen test for vcombine_s32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 3
// in order, producing <4 x i32>.
2074	int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
2075	return vcombine_s32(a, b);
2076	}
2077
2078	// CHECK-LABEL: @test_vcombine_s64(
2079	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
2080	// CHECK: ret <2 x i64> [[SHUFFLE_I]]
// Codegen test for vcombine_s64: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 and 1,
// producing <2 x i64>.
2081	int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
2082	return vcombine_s64(a, b);
2083	}
2084
2085	// CHECK-LABEL: @test_vcombine_f16(
2086	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2087	// CHECK: ret <8 x half> [[SHUFFLE_I]]
// Codegen test for vcombine_f16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 7
// in order, producing <8 x half>.
2088	float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
2089	return vcombine_f16(a, b);
2090	}
2091
2092	// CHECK-LABEL: @test_vcombine_f32(
2093	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2094	// CHECK: ret <4 x float> [[SHUFFLE_I]]
// Codegen test for vcombine_f32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 3
// in order, producing <4 x float>.
2095	float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
2096	return vcombine_f32(a, b);
2097	}
2098
2099	// CHECK-LABEL: @test_vcombine_u8(
2100	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2101	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Codegen test for vcombine_u8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 15
// in order, producing <16 x i8>.
2102	uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
2103	return vcombine_u8(a, b);
2104	}
2105
2106	// CHECK-LABEL: @test_vcombine_u16(
2107	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2108	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Codegen test for vcombine_u16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 7
// in order, producing <8 x i16>.
2109	uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
2110	return vcombine_u16(a, b);
2111	}
2112
2113	// CHECK-LABEL: @test_vcombine_u32(
2114	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2115	// CHECK: ret <4 x i32> [[SHUFFLE_I]]
// Codegen test for vcombine_u32: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 3
// in order, producing <4 x i32>.
2116	uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
2117	return vcombine_u32(a, b);
2118	}
2119
2120	// CHECK-LABEL: @test_vcombine_u64(
2121	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
2122	// CHECK: ret <2 x i64> [[SHUFFLE_I]]
// Codegen test for vcombine_u64: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 and 1,
// producing <2 x i64>.
2123	uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
2124	return vcombine_u64(a, b);
2125	}
2126
2127	// CHECK-LABEL: @test_vcombine_p8(
2128	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2129	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Codegen test for vcombine_p8: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 15
// in order, producing <16 x i8>.
2130	poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
2131	return vcombine_p8(a, b);
2132	}
2133
2134	// CHECK-LABEL: @test_vcombine_p16(
2135	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2136	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Codegen test for vcombine_p16: forwards a and b to the intrinsic and returns its result.
// Expected IR (directives above): a shufflevector of %a and %b with indices 0 through 7
// in order, producing <8 x i16>.
2137	poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
2138	return vcombine_p16(a, b);
2139	}
2140
2141	// CHECK-LABEL: @test_vcreate_s8(
2142	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
2143	// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
2144	// CHECK: ret <8 x i8> [[VCLZ_V_I]]
// Codegen test for vcreate_s8: the created vector is passed to vclz_s8 and that result is
// returned. Expected IR (directives above): bitcast of the i64 %a to <8 x i8>, then a call
// to @llvm.ctlz.v8i8 on the bitcast value.
// NOTE(review): the vclz_s8 call presumably exists to give the created vector a consumer
// in the emitted IR - confirm.
2145	int8x8_t test_vcreate_s8(uint64_t a) {
2146	return vclz_s8(vcreate_s8(a));
2147	}
2148
2149	// CHECK-LABEL: @test_vcreate_s16(
2150	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
2151	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
2152	// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
2153	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
2154	// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
// Codegen test for vcreate_s16: the created vector is passed to vclz_s16 and that result
// is returned. Expected IR (directives above): bitcast of the i64 %a to <4 x i16>, then a
// call to @llvm.ctlz.v4i16 on it, with <8 x i8> bitcasts around the call.
2155	int16x4_t test_vcreate_s16(uint64_t a) {
2156	return vclz_s16(vcreate_s16(a));
2157	}
2158
2159	// CHECK-LABEL: @test_vcreate_s32(
2160	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
2161	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
2162	// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
2163	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
2164	// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
// Codegen test for vcreate_s32: the created vector is passed to vclz_s32 and that result
// is returned. Expected IR (directives above): bitcast of the i64 %a to <2 x i32>, then a
// call to @llvm.ctlz.v2i32 on it, with <8 x i8> bitcasts around the call.
2165	int32x2_t test_vcreate_s32(uint64_t a) {
2166	return vclz_s32(vcreate_s32(a));
2167	}
2168
2169	// CHECK-LABEL: @test_vcreate_f16(
2170	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
2171	// CHECK: ret <4 x half> [[TMP0]]
// Codegen test for vcreate_f16: returns the created vector directly.
// Expected IR (directives above): a bitcast of the i64 %a to <4 x half>, which is returned.
2172	float16x4_t test_vcreate_f16(uint64_t a) {
2173	return vcreate_f16(a);
2174	}
2175
2176	// CHECK-LABEL: @test_vcreate_f32(
2177	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
2178	// CHECK: ret <2 x float> [[TMP0]]
// Codegen test for vcreate_f32: returns the created vector directly.
// Expected IR (directives above): a bitcast of the i64 %a to <2 x float>, which is returned.
2179	float32x2_t test_vcreate_f32(uint64_t a) {
2180	return vcreate_f32(a);
2181	}
2182
2183	// CHECK-LABEL: @test_vcreate_u8(
2184	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
2185	// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
2186	// CHECK: ret <8 x i8> [[VCLZ_V_I]]
// Codegen test for vcreate_u8: the created vector is passed through a vclz intrinsic and
// that result is returned. The unsigned vclz_u8 is used so the argument and return types
// are uint8x8_t throughout, rather than relying on implicit signed/unsigned vector
// conversions around vclz_s8. The expected IR (directives above: bitcast of the i64 %a to
// <8 x i8>, then @llvm.ctlz.v8i8) is the same call test_vclz_u8 expects for vclz_u8.
2187	uint8x8_t test_vcreate_u8(uint64_t a) {
2188	return vclz_u8(vcreate_u8(a));
2189	}
2190
2191	// CHECK-LABEL: @test_vcreate_u16(
2192	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
2193	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
2194	// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
2195	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
2196	// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
// Codegen test for vcreate_u16: the created vector is passed through a vclz intrinsic and
// that result is returned. The unsigned vclz_u16 is used so the argument and return types
// are uint16x4_t throughout, rather than relying on implicit signed/unsigned vector
// conversions around vclz_s16. The expected IR (directives above: bitcast of the i64 %a to
// <4 x i16>, then @llvm.ctlz.v4i16 with <8 x i8> bitcasts around it) has the same shape
// test_vclz_u16 expects for vclz_u16.
2197	uint16x4_t test_vcreate_u16(uint64_t a) {
2198	return vclz_u16(vcreate_u16(a));
2199	}
2200
2201	// CHECK-LABEL: @test_vcreate_u32(
2202	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
2203	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
2204	// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
2205	// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
2206	// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
// Codegen test for vcreate_u32: the created vector is passed through a vclz intrinsic and
// that result is returned. The unsigned vclz_u32 is used so the argument and return types
// are uint32x2_t throughout, rather than relying on implicit signed/unsigned vector
// conversions around vclz_s32. The expected IR (directives above: bitcast of the i64 %a to
// <2 x i32>, then @llvm.ctlz.v2i32 with <8 x i8> bitcasts around it) has the same shape
// test_vclz_u32 expects for vclz_u32.
2207	uint32x2_t test_vcreate_u32(uint64_t a) {
2208	return vclz_u32(vcreate_u32(a));
2209	}
2210
2211	// CHECK-LABEL: @test_vcreate_u64(
2212	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
2213	// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
2214	// CHECK: ret <1 x i64> [[ADD_I]]
// Codegen test for vcreate_u64: the created vector is added to itself with vadd_u64 and
// the sum is returned. Expected IR (directives above): bitcast of the i64 %a to <1 x i64>,
// then an `add` whose two operands are that same bitcast value.
2215	uint64x1_t test_vcreate_u64(uint64_t a) {
2216	uint64x1_t tmp = vcreate_u64(a);
2217	return vadd_u64(tmp, tmp);
2218	}
2219
2220	// CHECK-LABEL: @test_vcreate_p8(
2221	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
2222	// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
2223	// CHECK: ret <8 x i8> [[VCNT_V_I]]
// Codegen test for vcreate_p8: the created vector is passed to vcnt_p8 and that result is
// returned. Expected IR (directives above): bitcast of the i64 %a to <8 x i8>, then a call
// to @llvm.ctpop.v8i8 on the bitcast value.
2224	poly8x8_t test_vcreate_p8(uint64_t a) {
2225	return vcnt_p8(vcreate_p8(a));
2226	}
2227
2228	// CHECK-LABEL: @test_vcreate_p16(
2229	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
2230	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
2231	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
2232	// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
2233	// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
2234	// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
2235	// CHECK: ret <4 x i16> [[TMP4]]
// Codegen test for vcreate_p16: the created vector is used as all three operands of
// vbsl_p16 and that result is returned. Expected IR (directives above): bitcast of the
// i64 %a to <4 x i16>, three bitcasts of it to <8 x i8>, a call to
// @llvm.arm.neon.vbsl.v8i8 on those three values, and a bitcast back to <4 x i16>.
2236	poly16x4_t test_vcreate_p16(uint64_t a) {
2237	poly16x4_t tmp = vcreate_p16(a);
2238	return vbsl_p16(tmp, tmp, tmp);
2239	}
2240
2241	// CHECK-LABEL: @test_vcreate_s64(
2242	// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
2243	// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
2244	// CHECK: ret <1 x i64> [[ADD_I]]
// Codegen test for vcreate_s64: the created vector is added to itself with vadd_s64 and
// the sum is returned. Expected IR (directives above): bitcast of the i64 %a to <1 x i64>,
// then an `add` whose two operands are that same bitcast value.
2245	int64x1_t test_vcreate_s64(uint64_t a) {
2246	int64x1_t tmp = vcreate_s64(a);
2247	return vadd_s64(tmp, tmp);
2248	}
2249
2250	// CHECK-LABEL: @test_vcvt_f16_f32(
2251	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2252	// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a)
2253	// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
2254	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
2255	// CHECK: ret <4 x half> [[TMP1]]
// Codegen test for vcvt_f16_f32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): a call to @llvm.arm.neon.vcvtfp2hf on %a yielding
// <4 x i16>, which is bitcast via <8 x i8> to the returned <4 x half>.
2256	float16x4_t test_vcvt_f16_f32(float32x4_t a) {
2257	return vcvt_f16_f32(a);
2258	}
2259
2260	// CHECK-LABEL: @test_vcvt_f32_s32(
2261	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2262	// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
2263	// CHECK: ret <2 x float> [[VCVT_I]]
// Codegen test for vcvt_f32_s32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): `sitofp` from <2 x i32> to <2 x float>.
2264	float32x2_t test_vcvt_f32_s32(int32x2_t a) {
2265	return vcvt_f32_s32(a);
2266	}
2267
2268	// CHECK-LABEL: @test_vcvt_f32_u32(
2269	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2270	// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
2271	// CHECK: ret <2 x float> [[VCVT_I]]
// Codegen test for vcvt_f32_u32: forwards a to the intrinsic and returns its result.
// Expected IR (directives above): `uitofp` from <2 x i32> to <2 x float>.
2272	float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
2273	return vcvt_f32_u32(a);
2274	}
2275
2276	// CHECK-LABEL: @test_vcvtq_f32_s32(
2277	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2278	// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
2279	// CHECK: ret <4 x float> [[VCVT_I]]
// Calls vcvtq_f32_s32 on an int32x4_t. The FileCheck patterns above expect a
// sitofp from <4 x i32> to <4 x float>.
2280	float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
2281	return vcvtq_f32_s32(a);
2282	}
2283
2284	// CHECK-LABEL: @test_vcvtq_f32_u32(
2285	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2286	// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
2287	// CHECK: ret <4 x float> [[VCVT_I]]
// Calls vcvtq_f32_u32 on a uint32x4_t. The FileCheck patterns above expect a
// uitofp from <4 x i32> to <4 x float>.
2288	float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
2289	return vcvtq_f32_u32(a);
2290	}
2291
2292	// CHECK-LABEL: @test_vcvt_f32_f16(
2293	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
2294	// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2295	// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
2296	// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
2297	// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
// Calls vcvt_f32_f16 on a float16x4_t and returns the float32x4_t result. The
// FileCheck patterns above expect the <4 x half> argument to be bitcast to
// <4 x i16> and passed to @llvm.arm.neon.vcvthf2fp, which yields <4 x float>.
2298	float32x4_t test_vcvt_f32_f16(float16x4_t a) {
2299	return vcvt_f32_f16(a);
2300	}
2301
2302	// CHECK-LABEL: @test_vcvt_n_f32_s32(
2303	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2304	// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2305	// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
2306	// CHECK: ret <2 x float> [[VCVT_N1]]
// Calls vcvt_n_f32_s32 with the constant 1 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32 carrying i32 1.
2307	float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
2308	return vcvt_n_f32_s32(a, 1);
2309	}
2310
2311	// CHECK-LABEL: @test_vcvt_n_f32_u32(
2312	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2313	// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2314	// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
2315	// CHECK: ret <2 x float> [[VCVT_N1]]
// Calls vcvt_n_f32_u32 with the constant 1 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32 carrying i32 1.
2316	float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
2317	return vcvt_n_f32_u32(a, 1);
2318	}
2319
2320	// CHECK-LABEL: @test_vcvtq_n_f32_s32(
2321	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2322	// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2323	// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
2324	// CHECK: ret <4 x float> [[VCVT_N1]]
// Calls vcvtq_n_f32_s32 with the constant 3 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32 carrying i32 3.
2325	float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
2326	return vcvtq_n_f32_s32(a, 3);
2327	}
2328
2329	// CHECK-LABEL: @test_vcvtq_n_f32_u32(
2330	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2331	// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2332	// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
2333	// CHECK: ret <4 x float> [[VCVT_N1]]
// Calls vcvtq_n_f32_u32 with the constant 3 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32 carrying i32 3.
2334	float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
2335	return vcvtq_n_f32_u32(a, 3);
2336	}
2337
2338	// CHECK-LABEL: @test_vcvt_n_s32_f32(
2339	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2340	// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2341	// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
2342	// CHECK: ret <2 x i32> [[VCVT_N1]]
// Calls vcvt_n_s32_f32 with the constant 1 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32 carrying i32 1.
2343	int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
2344	return vcvt_n_s32_f32(a, 1);
2345	}
2346
2347	// CHECK-LABEL: @test_vcvtq_n_s32_f32(
2348	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2349	// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2350	// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
2351	// CHECK: ret <4 x i32> [[VCVT_N1]]
// Calls vcvtq_n_s32_f32 with the constant 3 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32 carrying i32 3.
2352	int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
2353	return vcvtq_n_s32_f32(a, 3);
2354	}
2355
2356	// CHECK-LABEL: @test_vcvt_n_u32_f32(
2357	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2358	// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2359	// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
2360	// CHECK: ret <2 x i32> [[VCVT_N1]]
// Calls vcvt_n_u32_f32 with the constant 1 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32 carrying i32 1.
2361	uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
2362	return vcvt_n_u32_f32(a, 1);
2363	}
2364
2365	// CHECK-LABEL: @test_vcvtq_n_u32_f32(
2366	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2367	// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2368	// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
2369	// CHECK: ret <4 x i32> [[VCVT_N1]]
// Calls vcvtq_n_u32_f32 with the constant 3 as its second argument. The
// FileCheck patterns above expect a call to
// @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32 carrying i32 3.
2370	uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
2371	return vcvtq_n_u32_f32(a, 3);
2372	}
2373
2374	// CHECK-LABEL: @test_vcvt_s32_f32(
2375	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2376	// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32>
2377	// CHECK: ret <2 x i32> [[VCVT_I]]
// Calls vcvt_s32_f32 on a float32x2_t. The FileCheck patterns above expect an
// fptosi from <2 x float> to <2 x i32>.
2378	int32x2_t test_vcvt_s32_f32(float32x2_t a) {
2379	return vcvt_s32_f32(a);
2380	}
2381
2382	// CHECK-LABEL: @test_vcvtq_s32_f32(
2383	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2384	// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32>
2385	// CHECK: ret <4 x i32> [[VCVT_I]]
// Calls vcvtq_s32_f32 on a float32x4_t. The FileCheck patterns above expect an
// fptosi from <4 x float> to <4 x i32>.
2386	int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
2387	return vcvtq_s32_f32(a);
2388	}
2389
2390	// CHECK-LABEL: @test_vcvt_u32_f32(
2391	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2392	// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32>
2393	// CHECK: ret <2 x i32> [[VCVT_I]]
// Calls vcvt_u32_f32 on a float32x2_t. The FileCheck patterns above expect an
// fptoui from <2 x float> to <2 x i32>.
2394	uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
2395	return vcvt_u32_f32(a);
2396	}
2397
2398	// CHECK-LABEL: @test_vcvtq_u32_f32(
2399	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2400	// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32>
2401	// CHECK: ret <4 x i32> [[VCVT_I]]
// Calls vcvtq_u32_f32 on a float32x4_t. The FileCheck patterns above expect an
// fptoui from <4 x float> to <4 x i32>.
2402	uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
2403	return vcvtq_u32_f32(a);
2404	}
2405
2406	// CHECK-LABEL: @test_vdup_lane_u8(
2407	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
2408	// CHECK: ret <8 x i8> [[SHUFFLE]]
// Calls vdup_lane_u8 with lane index 7. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <8 x i32> mask is index 7 repeated.
2409	uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
2410	return vdup_lane_u8(a, 7);
2411	}
2412
2413	// CHECK-LABEL: @test_vdup_lane_u16(
2414	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2415	// CHECK: ret <4 x i16> [[SHUFFLE]]
// Calls vdup_lane_u16 with lane index 3. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <4 x i32> mask is index 3 repeated.
2416	uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
2417	return vdup_lane_u16(a, 3);
2418	}
2419
2420	// CHECK-LABEL: @test_vdup_lane_u32(
2421	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
2422	// CHECK: ret <2 x i32> [[SHUFFLE]]
// Calls vdup_lane_u32 with lane index 1. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <2 x i32> mask is index 1 repeated.
2423	uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
2424	return vdup_lane_u32(a, 1);
2425	}
2426
2427	// CHECK-LABEL: @test_vdup_lane_s8(
2428	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
2429	// CHECK: ret <8 x i8> [[SHUFFLE]]
// Calls vdup_lane_s8 with lane index 7. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <8 x i32> mask is index 7 repeated.
2430	int8x8_t test_vdup_lane_s8(int8x8_t a) {
2431	return vdup_lane_s8(a, 7);
2432	}
2433
2434	// CHECK-LABEL: @test_vdup_lane_s16(
2435	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2436	// CHECK: ret <4 x i16> [[SHUFFLE]]
// Calls vdup_lane_s16 with lane index 3. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <4 x i32> mask is index 3 repeated.
2437	int16x4_t test_vdup_lane_s16(int16x4_t a) {
2438	return vdup_lane_s16(a, 3);
2439	}
2440
2441	// CHECK-LABEL: @test_vdup_lane_s32(
2442	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
2443	// CHECK: ret <2 x i32> [[SHUFFLE]]
// Calls vdup_lane_s32 with lane index 1. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <2 x i32> mask is index 1 repeated.
2444	int32x2_t test_vdup_lane_s32(int32x2_t a) {
2445	return vdup_lane_s32(a, 1);
2446	}
2447
2448	// CHECK-LABEL: @test_vdup_lane_p8(
2449	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
2450	// CHECK: ret <8 x i8> [[SHUFFLE]]
// Calls vdup_lane_p8 with lane index 7. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <8 x i32> mask is index 7 repeated.
2451	poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
2452	return vdup_lane_p8(a, 7);
2453	}
2454
2455	// CHECK-LABEL: @test_vdup_lane_p16(
2456	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2457	// CHECK: ret <4 x i16> [[SHUFFLE]]
// Calls vdup_lane_p16 with lane index 3. The FileCheck patterns above expect a
// shufflevector of %a with itself whose <4 x i32> mask is index 3 repeated.
2458	poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
2459	return vdup_lane_p16(a, 3);
2460	}
2461
2462	// CHECK-LABEL: @test_vdup_lane_f32(
2463	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
2464	// CHECK: ret <2 x float> [[SHUFFLE]]
// Calls vdup_lane_f32 with lane index 1. The FileCheck patterns above expect a
// shufflevector of <2 x float> %a with itself whose mask is index 1 repeated.
2465	float32x2_t test_vdup_lane_f32(float32x2_t a) {
2466	return vdup_lane_f32(a, 1);
2467	}
2468
2469	// CHECK-LABEL: @test_vdupq_lane_u8(
2470	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
2471	// CHECK: ret <16 x i8> [[SHUFFLE]]
// Calls vdupq_lane_u8 with lane index 7 on a uint8x8_t. The FileCheck patterns
// above expect a shufflevector of %a with itself whose <16 x i32> mask is
// index 7 repeated, producing <16 x i8>.
2472	uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
2473	return vdupq_lane_u8(a, 7);
2474	}
2475
2476	// CHECK-LABEL: @test_vdupq_lane_u16(
2477	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
2478	// CHECK: ret <8 x i16> [[SHUFFLE]]
// Calls vdupq_lane_u16 with lane index 3 on a uint16x4_t. The FileCheck
// patterns above expect a shufflevector of %a with itself whose <8 x i32> mask
// is index 3 repeated, producing <8 x i16>.
2479	uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
2480	return vdupq_lane_u16(a, 3);
2481	}
2482
2483	// CHECK-LABEL: @test_vdupq_lane_u32(
2484	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2485	// CHECK: ret <4 x i32> [[SHUFFLE]]
// Calls vdupq_lane_u32 with lane index 1 on a uint32x2_t. The FileCheck
// patterns above expect a shufflevector of %a with itself whose <4 x i32> mask
// is index 1 repeated, producing <4 x i32>.
2486	uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
2487	return vdupq_lane_u32(a, 1);
2488	}
2489
2490	// CHECK-LABEL: @test_vdupq_lane_s8(
2491	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
2492	// CHECK: ret <16 x i8> [[SHUFFLE]]
// Calls vdupq_lane_s8 with lane index 7 on an int8x8_t. The FileCheck patterns
// above expect a shufflevector of %a with itself whose <16 x i32> mask is
// index 7 repeated, producing <16 x i8>.
2493	int8x16_t test_vdupq_lane_s8(int8x8_t a) {
2494	return vdupq_lane_s8(a, 7);
2495	}
2496
2497	// CHECK-LABEL: @test_vdupq_lane_s16(
2498	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
2499	// CHECK: ret <8 x i16> [[SHUFFLE]]
// Calls vdupq_lane_s16 with lane index 3 on an int16x4_t. The FileCheck
// patterns above expect a shufflevector of %a with itself whose <8 x i32> mask
// is index 3 repeated, producing <8 x i16>.
2500	int16x8_t test_vdupq_lane_s16(int16x4_t a) {
2501	return vdupq_lane_s16(a, 3);
2502	}
2503
2504	// CHECK-LABEL: @test_vdupq_lane_s32(
2505	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2506	// CHECK: ret <4 x i32> [[SHUFFLE]]
// Calls vdupq_lane_s32 with lane index 1 on an int32x2_t. The FileCheck
// patterns above expect a shufflevector of %a with itself whose <4 x i32> mask
// is index 1 repeated, producing <4 x i32>.
2507	int32x4_t test_vdupq_lane_s32(int32x2_t a) {
2508	return vdupq_lane_s32(a, 1);
2509	}
2510
2511	// CHECK-LABEL: @test_vdupq_lane_p8(
2512	// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
2513	// CHECK: ret <16 x i8> [[SHUFFLE]]
// Calls vdupq_lane_p8 with lane index 7 on a poly8x8_t. The FileCheck patterns
// above expect a shufflevector of %a with itself whose <16 x i32> mask is
// index 7 repeated, producing <16 x i8>.
2514	poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
2515	return vdupq_lane_p8(a, 7);
2516	}
2517
2518	// CHECK-LABEL: @test_vdupq_lane_p16(
2519	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
2520	// CHECK: ret <8 x i16> [[SHUFFLE]]
// Calls vdupq_lane_p16 with lane index 3 on a poly16x4_t. The FileCheck
// patterns above expect a shufflevector of %a with itself whose <8 x i32> mask
// is index 3 repeated, producing <8 x i16>.
2521	poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
2522	return vdupq_lane_p16(a, 3);
2523	}
2524
2525	// CHECK-LABEL: @test_vdupq_lane_f32(
2526	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2527	// CHECK: ret <4 x float> [[SHUFFLE]]
// Calls vdupq_lane_f32 with lane index 1 on a float32x2_t. The FileCheck
// patterns above expect a shufflevector of %a with itself whose <4 x i32> mask
// is index 1 repeated, producing <4 x float>.
2528	float32x4_t test_vdupq_lane_f32(float32x2_t a) {
2529	return vdupq_lane_f32(a, 1);
2530	}
2531
2532	// CHECK-LABEL: @test_vdup_lane_s64(
2533	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
2534	// CHECK: ret <1 x i64> [[SHUFFLE]]
// Calls vdup_lane_s64 with lane index 0. The FileCheck patterns above expect a
// shufflevector of <1 x i64> %a with itself using a zeroinitializer
// <1 x i32> mask.
2535	int64x1_t test_vdup_lane_s64(int64x1_t a) {
2536	return vdup_lane_s64(a, 0);
2537	}
2538
2539	// CHECK-LABEL: @test_vdup_lane_u64(
2540	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
2541	// CHECK: ret <1 x i64> [[SHUFFLE]]
// Calls vdup_lane_u64 with lane index 0. The FileCheck patterns above expect a
// shufflevector of <1 x i64> %a with itself using a zeroinitializer
// <1 x i32> mask.
2542	uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
2543	return vdup_lane_u64(a, 0);
2544	}
2545
2546	// CHECK-LABEL: @test_vdupq_lane_s64(
2547	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
2548	// CHECK: ret <2 x i64> [[SHUFFLE]]
// Calls vdupq_lane_s64 with lane index 0 on an int64x1_t. The FileCheck
// patterns above expect a shufflevector of %a with itself using a
// zeroinitializer <2 x i32> mask, producing <2 x i64>.
2549	int64x2_t test_vdupq_lane_s64(int64x1_t a) {
2550	return vdupq_lane_s64(a, 0);
2551	}
2552
2553	// CHECK-LABEL: @test_vdupq_lane_u64(
2554	// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
2555	// CHECK: ret <2 x i64> [[SHUFFLE]]
// Calls vdupq_lane_u64 with lane index 0 on a uint64x1_t. The FileCheck
// patterns above expect a shufflevector of %a with itself using a
// zeroinitializer <2 x i32> mask, producing <2 x i64>.
2556	uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
2557	return vdupq_lane_u64(a, 0);
2558	}
2559
2560	// CHECK-LABEL: @test_vdup_n_u8(
2561	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
2562	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
2563	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
2564	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
2565	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
2566	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
2567	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
2568	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
2569	// CHECK: ret <8 x i8> [[VECINIT7_I]]
// Calls vdup_n_u8 on a scalar. The FileCheck patterns above expect a chain of
// eight insertelement instructions that place i8 %a into lanes 0 through 7 of
// an <8 x i8>, starting from undef.
2570	uint8x8_t test_vdup_n_u8(uint8_t a) {
2571	return vdup_n_u8(a);
2572	}
2573
2574	// CHECK-LABEL: @test_vdup_n_u16(
2575	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
2576	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
2577	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
2578	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
2579	// CHECK: ret <4 x i16> [[VECINIT3_I]]
// Calls vdup_n_u16 on a scalar. The FileCheck patterns above expect a chain of
// four insertelement instructions that place i16 %a into lanes 0 through 3 of
// a <4 x i16>, starting from undef.
2580	uint16x4_t test_vdup_n_u16(uint16_t a) {
2581	return vdup_n_u16(a);
2582	}
2583
2584	// CHECK-LABEL: @test_vdup_n_u32(
2585	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
2586	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
2587	// CHECK: ret <2 x i32> [[VECINIT1_I]]
// Calls vdup_n_u32 on a scalar. The FileCheck patterns above expect two
// insertelement instructions that place i32 %a into lanes 0 and 1 of a
// <2 x i32>, starting from undef.
2588	uint32x2_t test_vdup_n_u32(uint32_t a) {
2589	return vdup_n_u32(a);
2590	}
2591
2592	// CHECK-LABEL: @test_vdup_n_s8(
2593	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
2594	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
2595	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
2596	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
2597	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
2598	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
2599	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
2600	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
2601	// CHECK: ret <8 x i8> [[VECINIT7_I]]
// Calls vdup_n_s8 on a scalar. The FileCheck patterns above expect a chain of
// eight insertelement instructions that place i8 %a into lanes 0 through 7 of
// an <8 x i8>, starting from undef.
2602	int8x8_t test_vdup_n_s8(int8_t a) {
2603	return vdup_n_s8(a);
2604	}
2605
2606	// CHECK-LABEL: @test_vdup_n_s16(
2607	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
2608	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
2609	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
2610	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
2611	// CHECK: ret <4 x i16> [[VECINIT3_I]]
// Calls vdup_n_s16 on a scalar. The FileCheck patterns above expect a chain of
// four insertelement instructions that place i16 %a into lanes 0 through 3 of
// a <4 x i16>, starting from undef.
2612	int16x4_t test_vdup_n_s16(int16_t a) {
2613	return vdup_n_s16(a);
2614	}
2615
2616	// CHECK-LABEL: @test_vdup_n_s32(
2617	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
2618	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
2619	// CHECK: ret <2 x i32> [[VECINIT1_I]]
// Calls vdup_n_s32 on a scalar. The FileCheck patterns above expect two
// insertelement instructions that place i32 %a into lanes 0 and 1 of a
// <2 x i32>, starting from undef.
2620	int32x2_t test_vdup_n_s32(int32_t a) {
2621	return vdup_n_s32(a);
2622	}
2623
2624	// CHECK-LABEL: @test_vdup_n_p8(
2625	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
2626	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
2627	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
2628	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
2629	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
2630	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
2631	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
2632	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
2633	// CHECK: ret <8 x i8> [[VECINIT7_I]]
// Calls vdup_n_p8 on a scalar. The FileCheck patterns above expect a chain of
// eight insertelement instructions that place i8 %a into lanes 0 through 7 of
// an <8 x i8>, starting from undef.
2634	poly8x8_t test_vdup_n_p8(poly8_t a) {
2635	return vdup_n_p8(a);
2636	}
2637
2638	// CHECK-LABEL: @test_vdup_n_p16(
2639	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
2640	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
2641	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
2642	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
2643	// CHECK: ret <4 x i16> [[VECINIT3_I]]
// Calls vdup_n_p16 on a scalar. The FileCheck patterns above expect a chain of
// four insertelement instructions that place i16 %a into lanes 0 through 3 of
// a <4 x i16>, starting from undef.
2644	poly16x4_t test_vdup_n_p16(poly16_t a) {
2645	return vdup_n_p16(a);
2646	}
2647
2648	// CHECK-LABEL: @test_vdup_n_f16(
2649	// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
2650	// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
2651	// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
2652	// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
2653	// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
2654	// CHECK: ret <4 x half> [[VECINIT3]]
// Dereferences the float16_t pointer argument and passes the value to
// vdup_n_f16. The FileCheck patterns above expect a load of a half through %a
// followed by four insertelement instructions that place it into lanes 0
// through 3 of a <4 x half>.
2655	float16x4_t test_vdup_n_f16(float16_t *a) {
2656	return vdup_n_f16(*a);
2657	}
2658
2659	// CHECK-LABEL: @test_vdup_n_f32(
2660	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
2661	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
2662	// CHECK: ret <2 x float> [[VECINIT1_I]]
// Calls vdup_n_f32 on a scalar. The FileCheck patterns above expect two
// insertelement instructions that place float %a into lanes 0 and 1 of a
// <2 x float>, starting from undef.
2663	float32x2_t test_vdup_n_f32(float32_t a) {
2664	return vdup_n_f32(a);
2665	}
2666
2667	// CHECK-LABEL: @test_vdupq_n_u8(
2668	// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
2669	// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
2670	// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
2671	// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
2672	// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
2673	// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
2674	// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
2675	// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
2676	// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
2677	// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
2678	// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
2679	// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
2680	// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
2681	// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
2682	// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
2683	// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
2684	// CHECK: ret <16 x i8> [[VECINIT15_I]]
// Calls vdupq_n_u8 on a scalar. The FileCheck patterns above expect a chain of
// sixteen insertelement instructions that place i8 %a into lanes 0 through 15
// of a <16 x i8>, starting from undef.
2685	uint8x16_t test_vdupq_n_u8(uint8_t a) {
2686	return vdupq_n_u8(a);
2687	}
2688
2689	// CHECK-LABEL: @test_vdupq_n_u16(
2690	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
2691	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
2692	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
2693	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
2694	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
2695	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
2696	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
2697	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
2698	// CHECK: ret <8 x i16> [[VECINIT7_I]]
// Calls vdupq_n_u16 on a scalar. The FileCheck patterns above expect a chain
// of eight insertelement instructions that place i16 %a into lanes 0 through 7
// of an <8 x i16>, starting from undef.
2699	uint16x8_t test_vdupq_n_u16(uint16_t a) {
2700	return vdupq_n_u16(a);
2701	}
2702
2703	// CHECK-LABEL: @test_vdupq_n_u32(
2704	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
2705	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
2706	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
2707	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
2708	// CHECK: ret <4 x i32> [[VECINIT3_I]]
// Calls vdupq_n_u32 on a scalar. The FileCheck patterns above expect a chain
// of four insertelement instructions that place i32 %a into lanes 0 through 3
// of a <4 x i32>, starting from undef.
2709	uint32x4_t test_vdupq_n_u32(uint32_t a) {
2710	return vdupq_n_u32(a);
2711	}
2712
2713	// CHECK-LABEL: @test_vdupq_n_s8(
2714	// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
2715	// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
2716	// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
2717	// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
2718	// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
2719	// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
2720	// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
2721	// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
2722	// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
2723	// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
2724	// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
2725	// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
2726	// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
2727	// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
2728	// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
2729	// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
2730	// CHECK: ret <16 x i8> [[VECINIT15_I]]
// Calls vdupq_n_s8 on a scalar. The FileCheck patterns above expect a chain of
// sixteen insertelement instructions that place i8 %a into lanes 0 through 15
// of a <16 x i8>, starting from undef.
2731	int8x16_t test_vdupq_n_s8(int8_t a) {
2732	return vdupq_n_s8(a);
2733	}
2734
2735	// CHECK-LABEL: @test_vdupq_n_s16(
2736	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
2737	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
2738	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
2739	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
2740	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
2741	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
2742	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
2743	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
2744	// CHECK: ret <8 x i16> [[VECINIT7_I]]
// Calls vdupq_n_s16 on a scalar. The FileCheck patterns above expect a chain
// of eight insertelement instructions that place i16 %a into lanes 0 through 7
// of an <8 x i16>, starting from undef.
2745	int16x8_t test_vdupq_n_s16(int16_t a) {
2746	return vdupq_n_s16(a);
2747	}
2748
2749	// CHECK-LABEL: @test_vdupq_n_s32(
2750	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
2751	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
2752	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
2753	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
2754	// CHECK: ret <4 x i32> [[VECINIT3_I]]
// Calls vdupq_n_s32 on a scalar. The FileCheck patterns above expect a chain
// of four insertelement instructions that place i32 %a into lanes 0 through 3
// of a <4 x i32>, starting from undef.
2755	int32x4_t test_vdupq_n_s32(int32_t a) {
2756	return vdupq_n_s32(a);
2757	}
2758
2759	// CHECK-LABEL: @test_vdupq_n_p8(
2760	// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
2761	// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
2762	// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
2763	// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
2764	// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
2765	// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
2766	// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
2767	// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
2768	// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
2769	// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
2770	// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
2771	// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
2772	// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
2773	// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
2774	// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
2775	// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
2776	// CHECK: ret <16 x i8> [[VECINIT15_I]]
// Calls vdupq_n_p8 on a scalar. The FileCheck patterns above expect a chain of
// sixteen insertelement instructions that place i8 %a into lanes 0 through 15
// of a <16 x i8>, starting from undef.
2777	poly8x16_t test_vdupq_n_p8(poly8_t a) {
2778	return vdupq_n_p8(a);
2779	}
2780
2781	// CHECK-LABEL: @test_vdupq_n_p16(
2782	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
2783	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
2784	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
2785	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
2786	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
2787	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
2788	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
2789	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
2790	// CHECK: ret <8 x i16> [[VECINIT7_I]]
// Calls vdupq_n_p16 on a scalar. The FileCheck patterns above expect a chain
// of eight insertelement instructions that place i16 %a into lanes 0 through 7
// of an <8 x i16>, starting from undef.
2791	poly16x8_t test_vdupq_n_p16(poly16_t a) {
2792	return vdupq_n_p16(a);
2793	}
2794
2795	// CHECK-LABEL: @test_vdupq_n_f16(
2796	// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
2797	// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
2798	// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
2799	// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
2800	// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
2801	// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
2802	// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
2803	// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
2804	// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
2805	// CHECK: ret <8 x half> [[VECINIT7]]
// Dereferences the float16_t pointer argument and passes the value to
// vdupq_n_f16. The FileCheck patterns above expect a load of a half through %a
// followed by eight insertelement instructions that place it into lanes 0
// through 7 of an <8 x half>.
2806	float16x8_t test_vdupq_n_f16(float16_t *a) {
2807	return vdupq_n_f16(*a);
2808	}
2809
2810	// CHECK-LABEL: @test_vdupq_n_f32(
2811	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
2812	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
2813	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
2814	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
2815	// CHECK: ret <4 x float> [[VECINIT3_I]]
// Calls vdupq_n_f32 on a scalar. The FileCheck patterns above expect a chain
// of four insertelement instructions that place float %a into lanes 0 through
// 3 of a <4 x float>, starting from undef.
2816	float32x4_t test_vdupq_n_f32(float32_t a) {
2817	return vdupq_n_f32(a);
2818	}
2819
2820	// CHECK-LABEL: @test_vdup_n_s64(
2821	// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
2822	// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
2823	// CHECK: ret <1 x i64> [[ADD_I]]
// Duplicates the scalar into an int64x1_t with vdup_n_s64 and adds the vector
// to itself via vadd_s64. The FileCheck patterns above expect one
// insertelement of i64 %a into lane 0 of a <1 x i64>, then an add of that
// value with itself.
2824	int64x1_t test_vdup_n_s64(int64_t a) {
2825	int64x1_t tmp = vdup_n_s64(a);
2826	return vadd_s64(tmp, tmp);
2827	}
2828
2829	// CHECK-LABEL: @test_vdup_n_u64(
2830	// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
2831	// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
2832	// CHECK: ret <1 x i64> [[ADD_I]]
// Duplicates the unsigned scalar into a uint64x1_t with vdup_n_u64 and adds
// the vector to itself. The temporary and the add use the unsigned vector type
// and vadd_u64, so the function does not depend on implicit conversions
// between int64x1_t and uint64x1_t. The FileCheck patterns above (one
// insertelement into <1 x i64>, then an add of that value with itself) name
// only signless integer IR, so they are expected to match unchanged.
2833	uint64x1_t test_vdup_n_u64(uint64_t a) {
2834	uint64x1_t tmp = vdup_n_u64(a);
2835	return vadd_u64(tmp, tmp);
2836	}
2837
2838	// CHECK-LABEL: @test_vdupq_n_s64(
2839	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
2840	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
2841	// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
2842	// CHECK: ret <2 x i64> [[ADD_I]]
// Duplicates the scalar into an int64x2_t with vdupq_n_s64 and adds the vector
// to itself via vaddq_s64. The FileCheck patterns above expect two
// insertelement instructions placing i64 %a into lanes 0 and 1 of a
// <2 x i64>, then an add of that value with itself.
2843	int64x2_t test_vdupq_n_s64(int64_t a) {
2844	int64x2_t tmp = vdupq_n_s64(a);
2845	return vaddq_s64(tmp, tmp);
2846	}
2847
2848	// CHECK-LABEL: @test_vdupq_n_u64(
2849	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
2850	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
2851	// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
2852	// CHECK: ret <2 x i64> [[ADD_I]]
// Duplicates the unsigned scalar into a uint64x2_t with vdupq_n_u64 and adds
// the vector to itself via vaddq_u64. The temporary is declared with the
// unsigned vector type that both intrinsics use, so no implicit conversion
// between int64x2_t and uint64x2_t is needed. The FileCheck patterns above
// (two insertelement instructions into <2 x i64>, then an add of that value
// with itself) name only signless integer IR, so they are expected to match
// unchanged.
2853	uint64x2_t test_vdupq_n_u64(uint64_t a) {
2854	uint64x2_t tmp = vdupq_n_u64(a);
2855	return vaddq_u64(tmp, tmp);
2856	}
2857
2858	// CHECK-LABEL: @test_veor_s8(
2859	// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
2860	// CHECK: ret <8 x i8> [[XOR_I]]
// Calls veor_s8 on two int8x8_t operands. The FileCheck patterns above expect
// a single xor of <8 x i8> %a and %b.
2861	int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
2862	return veor_s8(a, b);
2863	}
2864
2865	// CHECK-LABEL: @test_veor_s16(
2866	// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
2867	// CHECK: ret <4 x i16> [[XOR_I]]
// Calls veor_s16 on two int16x4_t operands. The FileCheck patterns above
// expect a single xor of <4 x i16> %a and %b.
2868	int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
2869	return veor_s16(a, b);
2870	}
2871
2872	// CHECK-LABEL: @test_veor_s32(
2873	// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
2874	// CHECK: ret <2 x i32> [[XOR_I]]
// Calls veor_s32 on two int32x2_t operands. The FileCheck patterns above
// expect a single xor of <2 x i32> %a and %b.
2875	int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
2876	return veor_s32(a, b);
2877	}
2878
2879	// CHECK-LABEL: @test_veor_s64(
2880	// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
2881	// CHECK: ret <1 x i64> [[XOR_I]]
// Calls veor_s64 on two int64x1_t operands. The FileCheck patterns above
// expect a single xor of <1 x i64> %a and %b.
2882	int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
2883	return veor_s64(a, b);
2884	}
2885
2886	// CHECK-LABEL: @test_veor_u8(
2887	// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
2888	// CHECK: ret <8 x i8> [[XOR_I]]
// Calls veor_u8 on two uint8x8_t operands. The FileCheck patterns above expect
// a single xor of <8 x i8> %a and %b.
2889	uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
2890	return veor_u8(a, b);
2891	}
2892
2893	// CHECK-LABEL: @test_veor_u16(
2894	// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
2895	// CHECK: ret <4 x i16> [[XOR_I]]
// Calls veor_u16 on two uint16x4_t operands. The FileCheck patterns above
// expect a single xor of <4 x i16> %a and %b.
2896	uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
2897	return veor_u16(a, b);
2898	}
2899
2900	// CHECK-LABEL: @test_veor_u32(
2901	// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
2902	// CHECK: ret <2 x i32> [[XOR_I]]
// Calls veor_u32 on two uint32x2_t operands. The FileCheck patterns above
// expect a single xor of <2 x i32> %a and %b.
2903	uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
2904	return veor_u32(a, b);
2905	}
2906
2907	// CHECK-LABEL: @test_veor_u64(
2908	// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
2909	// CHECK: ret <1 x i64> [[XOR_I]]
// Calls veor_u64 on two uint64x1_t operands. The FileCheck patterns above
// expect a single xor of <1 x i64> %a and %b.
2910	uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
2911	return veor_u64(a, b);
2912	}
2913
2914	// CHECK-LABEL: @test_veorq_s8(
2915	// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
2916	// CHECK: ret <16 x i8> [[XOR_I]]
// Calls veorq_s8 on two int8x16_t operands. The FileCheck patterns above
// expect a single xor of <16 x i8> %a and %b.
2917	int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
2918	return veorq_s8(a, b);
2919	}
2920
2921	// CHECK-LABEL: @test_veorq_s16(
2922	// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
2923	// CHECK: ret <8 x i16> [[XOR_I]]
// Returns veorq_s16(a, b); the FileCheck directives above expect a single <8 x i16> xor of %a and %b.
2924	int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
2925	return veorq_s16(a, b);
2926	}
2927
2928	// CHECK-LABEL: @test_veorq_s32(
2929	// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
2930	// CHECK: ret <4 x i32> [[XOR_I]]
// Returns veorq_s32(a, b); the FileCheck directives above expect a single <4 x i32> xor of %a and %b.
2931	int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
2932	return veorq_s32(a, b);
2933	}
2934
2935	// CHECK-LABEL: @test_veorq_s64(
2936	// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
2937	// CHECK: ret <2 x i64> [[XOR_I]]
// Returns veorq_s64(a, b); the FileCheck directives above expect a single <2 x i64> xor of %a and %b.
2938	int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
2939	return veorq_s64(a, b);
2940	}
2941
2942	// CHECK-LABEL: @test_veorq_u8(
2943	// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
2944	// CHECK: ret <16 x i8> [[XOR_I]]
// Returns veorq_u8(a, b); the FileCheck directives above expect a single <16 x i8> xor of %a and %b.
2945	uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
2946	return veorq_u8(a, b);
2947	}
2948
2949	// CHECK-LABEL: @test_veorq_u16(
2950	// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
2951	// CHECK: ret <8 x i16> [[XOR_I]]
// Returns veorq_u16(a, b); the FileCheck directives above expect a single <8 x i16> xor of %a and %b.
2952	uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
2953	return veorq_u16(a, b);
2954	}
2955
2956	// CHECK-LABEL: @test_veorq_u32(
2957	// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
2958	// CHECK: ret <4 x i32> [[XOR_I]]
// Returns veorq_u32(a, b); the FileCheck directives above expect a single <4 x i32> xor of %a and %b.
2959	uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
2960	return veorq_u32(a, b);
2961	}
2962
2963	// CHECK-LABEL: @test_veorq_u64(
2964	// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
2965	// CHECK: ret <2 x i64> [[XOR_I]]
// Returns veorq_u64(a, b); the FileCheck directives above expect a single <2 x i64> xor of %a and %b.
2966	uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
2967	return veorq_u64(a, b);
2968	}
2969
2970	// CHECK-LABEL: @test_vext_s8(
2971	// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
2972	// CHECK: ret <8 x i8> [[VEXT]]
// Returns vext_s8(a, b, 7); the directives above expect a shufflevector of %a and %b with mask indices 7..14.
2973	int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
2974	return vext_s8(a, b, 7);
2975	}
2976
2977	// CHECK-LABEL: @test_vext_u8(
2978	// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
2979	// CHECK: ret <8 x i8> [[VEXT]]
// Returns vext_u8(a, b, 7); the directives above expect a shufflevector of %a and %b with mask indices 7..14.
2980	uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
2981	return vext_u8(a, b, 7);
2982	}
2983
2984	// CHECK-LABEL: @test_vext_p8(
2985	// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
2986	// CHECK: ret <8 x i8> [[VEXT]]
// Returns vext_p8(a, b, 7); the directives above expect a shufflevector of %a and %b with mask indices 7..14.
2987	poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
2988	return vext_p8(a, b, 7);
2989	}
2990
2991	// CHECK-LABEL: @test_vext_s16(
2992	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2993	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2994	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2995	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2996	// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
2997	// CHECK: ret <4 x i16> [[VEXT]]
// Returns vext_s16(a, b, 3); the directives above expect <8 x i8> bitcast round-trips, then a <4 x i16> shufflevector with mask 3..6.
2998	int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
2999	return vext_s16(a, b, 3);
3000	}
3001
3002	// CHECK-LABEL: @test_vext_u16(
3003	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3004	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3005	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3006	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3007	// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3008	// CHECK: ret <4 x i16> [[VEXT]]
// Returns vext_u16(a, b, 3); the directives above expect <8 x i8> bitcast round-trips, then a <4 x i16> shufflevector with mask 3..6.
3009	uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
3010	return vext_u16(a, b, 3);
3011	}
3012
3013	// CHECK-LABEL: @test_vext_p16(
3014	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3015	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3016	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3017	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3018	// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3019	// CHECK: ret <4 x i16> [[VEXT]]
// Returns vext_p16(a, b, 3); the directives above expect <8 x i8> bitcast round-trips, then a <4 x i16> shufflevector with mask 3..6.
3020	poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
3021	return vext_p16(a, b, 3);
3022	}
3023
3024	// CHECK-LABEL: @test_vext_s32(
3025	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3026	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3027	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3028	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3029	// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
3030	// CHECK: ret <2 x i32> [[VEXT]]
// Returns vext_s32(a, b, 1); the directives above expect <8 x i8> bitcast round-trips, then a <2 x i32> shufflevector with mask <1, 2>.
3031	int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
3032	return vext_s32(a, b, 1);
3033	}
3034
3035	// CHECK-LABEL: @test_vext_u32(
3036	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3037	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3038	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3039	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3040	// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
3041	// CHECK: ret <2 x i32> [[VEXT]]
// Returns vext_u32(a, b, 1); the directives above expect <8 x i8> bitcast round-trips, then a <2 x i32> shufflevector with mask <1, 2>.
3042	uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
3043	return vext_u32(a, b, 1);
3044	}
3045
3046	// CHECK-LABEL: @test_vext_s64(
3047	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
3048	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3049	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3050	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3051	// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
3052	// CHECK: ret <1 x i64> [[VEXT]]
// Returns vext_s64(a, b, 0); the directives above expect a <1 x i64> shufflevector whose mask is zeroinitializer.
3053	int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
3054	return vext_s64(a, b, 0);
3055	}
3056
3057	// CHECK-LABEL: @test_vext_u64(
3058	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
3059	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
3060	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3061	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
3062	// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
3063	// CHECK: ret <1 x i64> [[VEXT]]
// Returns vext_u64(a, b, 0); the directives above expect a <1 x i64> shufflevector whose mask is zeroinitializer.
3064	uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
3065	return vext_u64(a, b, 0);
3066	}
3067
3068	// CHECK-LABEL: @test_vext_f32(
3069	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3070	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3071	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3072	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3073	// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
3074	// CHECK: ret <2 x float> [[VEXT]]
// Returns vext_f32(a, b, 1); the directives above expect <8 x i8> bitcast round-trips, then a <2 x float> shufflevector with mask <1, 2>.
3075	float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
3076	return vext_f32(a, b, 1);
3077	}
3078
3079	// CHECK-LABEL: @test_vextq_s8(
3080	// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
3081	// CHECK: ret <16 x i8> [[VEXT]]
// Returns vextq_s8(a, b, 15); the directives above expect a <16 x i8> shufflevector of %a and %b with mask indices 15..30.
3082	int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
3083	return vextq_s8(a, b, 15);
3084	}
3085
3086	// CHECK-LABEL: @test_vextq_u8(
3087	// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
3088	// CHECK: ret <16 x i8> [[VEXT]]
// Returns vextq_u8(a, b, 15); the directives above expect a <16 x i8> shufflevector of %a and %b with mask indices 15..30.
3089	uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
3090	return vextq_u8(a, b, 15);
3091	}
3092
3093	// CHECK-LABEL: @test_vextq_p8(
3094	// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
3095	// CHECK: ret <16 x i8> [[VEXT]]
// Returns vextq_p8(a, b, 15); the directives above expect a <16 x i8> shufflevector of %a and %b with mask indices 15..30.
3096	poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
3097	return vextq_p8(a, b, 15);
3098	}
3099
3100	// CHECK-LABEL: @test_vextq_s16(
3101	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3102	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3103	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3104	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3105	// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3106	// CHECK: ret <8 x i16> [[VEXT]]
// Returns vextq_s16(a, b, 7); the directives above expect <16 x i8> bitcast round-trips, then an <8 x i16> shufflevector with mask 7..14.
3107	int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
3108	return vextq_s16(a, b, 7);
3109	}
3110
3111	// CHECK-LABEL: @test_vextq_u16(
3112	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3113	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3114	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3115	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3116	// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3117	// CHECK: ret <8 x i16> [[VEXT]]
// Returns vextq_u16(a, b, 7); the directives above expect <16 x i8> bitcast round-trips, then an <8 x i16> shufflevector with mask 7..14.
3118	uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
3119	return vextq_u16(a, b, 7);
3120	}
3121
3122	// CHECK-LABEL: @test_vextq_p16(
3123	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3124	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3125	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3126	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3127	// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
3128	// CHECK: ret <8 x i16> [[VEXT]]
// Returns vextq_p16(a, b, 7); the directives above expect <16 x i8> bitcast round-trips, then an <8 x i16> shufflevector with mask 7..14.
3129	poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
3130	return vextq_p16(a, b, 7);
3131	}
3132
3133	// CHECK-LABEL: @test_vextq_s32(
3134	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3135	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3136	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3137	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3138	// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3139	// CHECK: ret <4 x i32> [[VEXT]]
// Returns vextq_s32(a, b, 3); the directives above expect <16 x i8> bitcast round-trips, then a <4 x i32> shufflevector with mask 3..6.
3140	int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
3141	return vextq_s32(a, b, 3);
3142	}
3143
3144	// CHECK-LABEL: @test_vextq_u32(
3145	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3146	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3147	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3148	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3149	// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3150	// CHECK: ret <4 x i32> [[VEXT]]
// Returns vextq_u32(a, b, 3); the directives above expect <16 x i8> bitcast round-trips, then a <4 x i32> shufflevector with mask 3..6.
3151	uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
3152	return vextq_u32(a, b, 3);
3153	}
3154
3155	// CHECK-LABEL: @test_vextq_s64(
3156	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3157	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3158	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3159	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3160	// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
3161	// CHECK: ret <2 x i64> [[VEXT]]
// Returns vextq_s64(a, b, 1); the directives above expect <16 x i8> bitcast round-trips, then a <2 x i64> shufflevector with mask <1, 2>.
3162	int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
3163	return vextq_s64(a, b, 1);
3164	}
3165
3166	// CHECK-LABEL: @test_vextq_u64(
3167	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3168	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
3169	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3170	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
3171	// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
3172	// CHECK: ret <2 x i64> [[VEXT]]
// Returns vextq_u64(a, b, 1); the directives above expect <16 x i8> bitcast round-trips, then a <2 x i64> shufflevector with mask <1, 2>.
3173	uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
3174	return vextq_u64(a, b, 1);
3175	}
3176
3177	// CHECK-LABEL: @test_vextq_f32(
3178	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3179	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3180	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3181	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3182	// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
3183	// CHECK: ret <4 x float> [[VEXT]]
// Returns vextq_f32(a, b, 3); the directives above expect <16 x i8> bitcast round-trips, then a <4 x float> shufflevector with mask 3..6.
3184	float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
3185	return vextq_f32(a, b, 3);
3186	}
3187
3188	// CHECK-LABEL: @test_vfma_f32(
3189	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3190	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3191	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
3192	// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a)
3193	// CHECK: ret <2 x float> [[TMP3]]
// Returns vfma_f32(a, b, c); the directives above expect a call to @llvm.fma.v2f32 with operands ordered (%b, %c, %a).
3194	float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
3195	return vfma_f32(a, b, c);
3196	}
3197
3198	// CHECK-LABEL: @test_vfmaq_f32(
3199	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3200	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3201	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
3202	// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
3203	// CHECK: ret <4 x float> [[TMP3]]
// Returns vfmaq_f32(a, b, c); the directives above expect a call to @llvm.fma.v4f32 with operands ordered (%b, %c, %a).
3204	float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
3205	return vfmaq_f32(a, b, c);
3206	}
3207
3208	// CHECK-LABEL: @test_vfms_f32(
3209	// CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
3210	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3211	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3212	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
3213	// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a)
3214	// CHECK: ret <2 x float> [[TMP3]]
// Returns vfms_f32(a, b, c); the directives above expect %b subtracted from a -0.0 vector (fsub), then @llvm.fma.v2f32 on (that result, %c, %a).
3215	float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
3216	return vfms_f32(a, b, c);
3217	}
3218
3219	// CHECK-LABEL: @test_vfmsq_f32(
3220	// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
3221	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3222	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3223	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
3224	// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a)
3225	// CHECK: ret <4 x float> [[TMP3]]
// Returns vfmsq_f32(a, b, c); the directives above expect %b subtracted from a -0.0 vector (fsub), then @llvm.fma.v4f32 on (that result, %c, %a).
3226	float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
3227	return vfmsq_f32(a, b, c);
3228	}
3229
3230	// CHECK-LABEL: @test_vget_high_s8(
3231	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3232	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Returns vget_high_s8(a); the directives above expect a shufflevector selecting elements 8..15 of %a.
3233	int8x8_t test_vget_high_s8(int8x16_t a) {
3234	return vget_high_s8(a);
3235	}
3236
3237	// CHECK-LABEL: @test_vget_high_s16(
3238	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3239	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Returns vget_high_s16(a); the directives above expect a shufflevector selecting elements 4..7 of %a.
3240	int16x4_t test_vget_high_s16(int16x8_t a) {
3241	return vget_high_s16(a);
3242	}
3243
3244	// CHECK-LABEL: @test_vget_high_s32(
3245	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3246	// CHECK: ret <2 x i32> [[SHUFFLE_I]]
// Returns vget_high_s32(a); the directives above expect a shufflevector selecting elements 2 and 3 of %a.
3247	int32x2_t test_vget_high_s32(int32x4_t a) {
3248	return vget_high_s32(a);
3249	}
3250
3251	// CHECK-LABEL: @test_vget_high_s64(
3252	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
3253	// CHECK: ret <1 x i64> [[SHUFFLE_I]]
// Returns vget_high_s64(a); the directives above expect a shufflevector selecting element 1 of %a.
3254	int64x1_t test_vget_high_s64(int64x2_t a) {
3255	return vget_high_s64(a);
3256	}
3257
3258	// CHECK-LABEL: @test_vget_high_f16(
3259	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3260	// CHECK: ret <4 x half> [[SHUFFLE_I]]
// Returns vget_high_f16(a); the directives above expect a shufflevector selecting elements 4..7 of the <8 x half> %a.
3261	float16x4_t test_vget_high_f16(float16x8_t a) {
3262	return vget_high_f16(a);
3263	}
3264
3265	// CHECK-LABEL: @test_vget_high_f32(
3266	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
3267	// CHECK: ret <2 x float> [[SHUFFLE_I]]
// Returns vget_high_f32(a); the directives above expect a shufflevector selecting elements 2 and 3 of the <4 x float> %a.
3268	float32x2_t test_vget_high_f32(float32x4_t a) {
3269	return vget_high_f32(a);
3270	}
3271
3272	// CHECK-LABEL: @test_vget_high_u8(
3273	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3274	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Returns vget_high_u8(a); the directives above expect a shufflevector selecting elements 8..15 of %a.
3275	uint8x8_t test_vget_high_u8(uint8x16_t a) {
3276	return vget_high_u8(a);
3277	}
3278
3279	// CHECK-LABEL: @test_vget_high_u16(
3280	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3281	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Returns vget_high_u16(a); the directives above expect a shufflevector selecting elements 4..7 of %a.
3282	uint16x4_t test_vget_high_u16(uint16x8_t a) {
3283	return vget_high_u16(a);
3284	}
3285
3286	// CHECK-LABEL: @test_vget_high_u32(
3287	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3288	// CHECK: ret <2 x i32> [[SHUFFLE_I]]
// Returns vget_high_u32(a); the directives above expect a shufflevector selecting elements 2 and 3 of %a.
3289	uint32x2_t test_vget_high_u32(uint32x4_t a) {
3290	return vget_high_u32(a);
3291	}
3292
3293	// CHECK-LABEL: @test_vget_high_u64(
3294	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
3295	// CHECK: ret <1 x i64> [[SHUFFLE_I]]
// Returns vget_high_u64(a); the directives above expect a shufflevector selecting element 1 of %a.
3296	uint64x1_t test_vget_high_u64(uint64x2_t a) {
3297	return vget_high_u64(a);
3298	}
3299
3300	// CHECK-LABEL: @test_vget_high_p8(
3301	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3302	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Returns vget_high_p8(a); the directives above expect a shufflevector selecting elements 8..15 of %a.
3303	poly8x8_t test_vget_high_p8(poly8x16_t a) {
3304	return vget_high_p8(a);
3305	}
3306
3307	// CHECK-LABEL: @test_vget_high_p16(
3308	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3309	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Returns vget_high_p16(a); the directives above expect a shufflevector selecting elements 4..7 of %a.
3310	poly16x4_t test_vget_high_p16(poly16x8_t a) {
3311	return vget_high_p16(a);
3312	}
3313
3314	// CHECK-LABEL: @test_vget_lane_u8(
3315	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
3316	// CHECK: ret i8 [[VGET_LANE]]
// Returns vget_lane_u8(a, 7); the directives above expect an extractelement from %a at index 7.
3317	uint8_t test_vget_lane_u8(uint8x8_t a) {
3318	return vget_lane_u8(a, 7);
3319	}
3320
3321	// CHECK-LABEL: @test_vget_lane_u16(
3322	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3323	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3324	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
3325	// CHECK: ret i16 [[VGET_LANE]]
// Returns vget_lane_u16(a, 3); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 3.
3326	uint16_t test_vget_lane_u16(uint16x4_t a) {
3327	return vget_lane_u16(a, 3);
3328	}
3329
3330	// CHECK-LABEL: @test_vget_lane_u32(
3331	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3332	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3333	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
3334	// CHECK: ret i32 [[VGET_LANE]]
// Returns vget_lane_u32(a, 1); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 1.
3335	uint32_t test_vget_lane_u32(uint32x2_t a) {
3336	return vget_lane_u32(a, 1);
3337	}
3338
3339	// CHECK-LABEL: @test_vget_lane_s8(
3340	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
3341	// CHECK: ret i8 [[VGET_LANE]]
// Returns vget_lane_s8(a, 7); the directives above expect an extractelement from %a at index 7.
3342	int8_t test_vget_lane_s8(int8x8_t a) {
3343	return vget_lane_s8(a, 7);
3344	}
3345
3346	// CHECK-LABEL: @test_vget_lane_s16(
3347	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3348	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3349	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
3350	// CHECK: ret i16 [[VGET_LANE]]
// Returns vget_lane_s16(a, 3); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 3.
3351	int16_t test_vget_lane_s16(int16x4_t a) {
3352	return vget_lane_s16(a, 3);
3353	}
3354
3355	// CHECK-LABEL: @test_vget_lane_s32(
3356	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3357	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3358	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
3359	// CHECK: ret i32 [[VGET_LANE]]
// Returns vget_lane_s32(a, 1); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 1.
3360	int32_t test_vget_lane_s32(int32x2_t a) {
3361	return vget_lane_s32(a, 1);
3362	}
3363
3364	// CHECK-LABEL: @test_vget_lane_p8(
3365	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
3366	// CHECK: ret i8 [[VGET_LANE]]
// Returns vget_lane_p8(a, 7); the directives above expect an extractelement from %a at index 7.
3367	poly8_t test_vget_lane_p8(poly8x8_t a) {
3368	return vget_lane_p8(a, 7);
3369	}
3370
3371	// CHECK-LABEL: @test_vget_lane_p16(
3372	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3373	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3374	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
3375	// CHECK: ret i16 [[VGET_LANE]]
// Returns vget_lane_p16(a, 3); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 3.
3376	poly16_t test_vget_lane_p16(poly16x4_t a) {
3377	return vget_lane_p16(a, 3);
3378	}
3379
3380	// CHECK-LABEL: @test_vget_lane_f32(
3381	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3382	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3383	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
3384	// CHECK: ret float [[VGET_LANE]]
// Returns vget_lane_f32(a, 1); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 1.
3385	float32_t test_vget_lane_f32(float32x2_t a) {
3386	return vget_lane_f32(a, 1);
3387	}
3388
3389	// CHECK-LABEL: @test_vget_lane_f16(
3390	// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8
3391	// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2
3392	// CHECK: store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
3393	// CHECK: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
3394	// CHECK: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
3395	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
3396	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
3397	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
3398	// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
3399	// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
3400	// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
3401	// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
3402	// CHECK: ret float [[CONV]]
// Returns vget_lane_f16(a, 1) as float32_t; the directives above expect lane 1 read as i16 through stack slots, reloaded as half, then fpext to float.
3403	float32_t test_vget_lane_f16(float16x4_t a) {
3404	return vget_lane_f16(a, 1);
3405	}
3406
3407	// CHECK-LABEL: @test_vgetq_lane_u8(
3408	// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
3409	// CHECK: ret i8 [[VGET_LANE]]
// Returns vgetq_lane_u8(a, 15); the directives above expect an extractelement from %a at index 15.
3410	uint8_t test_vgetq_lane_u8(uint8x16_t a) {
3411	return vgetq_lane_u8(a, 15);
3412	}
3413
3414	// CHECK-LABEL: @test_vgetq_lane_u16(
3415	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3416	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3417	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
3418	// CHECK: ret i16 [[VGET_LANE]]
// Returns vgetq_lane_u16(a, 7); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 7.
3419	uint16_t test_vgetq_lane_u16(uint16x8_t a) {
3420	return vgetq_lane_u16(a, 7);
3421	}
3422
3423	// CHECK-LABEL: @test_vgetq_lane_u32(
3424	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3425	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3426	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
3427	// CHECK: ret i32 [[VGET_LANE]]
// Returns vgetq_lane_u32(a, 3); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 3.
3428	uint32_t test_vgetq_lane_u32(uint32x4_t a) {
3429	return vgetq_lane_u32(a, 3);
3430	}
3431
3432	// CHECK-LABEL: @test_vgetq_lane_s8(
3433	// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
3434	// CHECK: ret i8 [[VGET_LANE]]
// Returns vgetq_lane_s8(a, 15); the directives above expect an extractelement from %a at index 15.
3435	int8_t test_vgetq_lane_s8(int8x16_t a) {
3436	return vgetq_lane_s8(a, 15);
3437	}
3438
3439	// CHECK-LABEL: @test_vgetq_lane_s16(
3440	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3441	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3442	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
3443	// CHECK: ret i16 [[VGET_LANE]]
// Returns vgetq_lane_s16(a, 7); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 7.
3444	int16_t test_vgetq_lane_s16(int16x8_t a) {
3445	return vgetq_lane_s16(a, 7);
3446	}
3447
3448	// CHECK-LABEL: @test_vgetq_lane_s32(
3449	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3450	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3451	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
3452	// CHECK: ret i32 [[VGET_LANE]]
// Returns vgetq_lane_s32(a, 3); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 3.
3453	int32_t test_vgetq_lane_s32(int32x4_t a) {
3454	return vgetq_lane_s32(a, 3);
3455	}
3456
3457	// CHECK-LABEL: @test_vgetq_lane_p8(
3458	// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
3459	// CHECK: ret i8 [[VGET_LANE]]
// Returns vgetq_lane_p8(a, 15); the directives above expect an extractelement from %a at index 15.
3460	poly8_t test_vgetq_lane_p8(poly8x16_t a) {
3461	return vgetq_lane_p8(a, 15);
3462	}
3463
3464	// CHECK-LABEL: @test_vgetq_lane_p16(
3465	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3466	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3467	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
3468	// CHECK: ret i16 [[VGET_LANE]]
// Returns vgetq_lane_p16(a, 7); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 7.
3469	poly16_t test_vgetq_lane_p16(poly16x8_t a) {
3470	return vgetq_lane_p16(a, 7);
3471	}
3472
3473	// CHECK-LABEL: @test_vgetq_lane_f32(
3474	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3475	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3476	// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
3477	// CHECK: ret float [[VGET_LANE]]
// Returns vgetq_lane_f32(a, 3); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 3.
3478	float32_t test_vgetq_lane_f32(float32x4_t a) {
3479	return vgetq_lane_f32(a, 3);
3480	}
3481
3482	// CHECK-LABEL: @test_vgetq_lane_f16(
3483	// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16
3484	// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2
3485	// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
3486	// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
3487	// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
3488	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
3489	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
3490	// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
3491	// CHECK: store i16 [[VGET_LANE]], i16* [[__REINT1_244]], align 2
3492	// CHECK: [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
3493	// CHECK: [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
3494	// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
3495	// CHECK: ret float [[CONV]]
// Returns vgetq_lane_f16(a, 3) as float32_t; the directives above expect lane 3 read as i16 through stack slots, reloaded as half, then fpext to float.
3496	float32_t test_vgetq_lane_f16(float16x8_t a) {
3497	return vgetq_lane_f16(a, 3);
3498	}
3499
3500	// CHECK-LABEL: @test_vget_lane_s64(
3501	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
3502	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3503	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
3504	// CHECK: ret i64 [[VGET_LANE]]
// Returns vget_lane_s64(a, 0); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 0.
3505	int64_t test_vget_lane_s64(int64x1_t a) {
3506	return vget_lane_s64(a, 0);
3507	}
3508
3509	// CHECK-LABEL: @test_vget_lane_u64(
3510	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
3511	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
3512	// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
3513	// CHECK: ret i64 [[VGET_LANE]]
// Returns vget_lane_u64(a, 0); the directives above expect an <8 x i8> bitcast round-trip, then an extractelement at index 0.
3514	uint64_t test_vget_lane_u64(uint64x1_t a) {
3515	return vget_lane_u64(a, 0);
3516	}
3517
3518	// CHECK-LABEL: @test_vgetq_lane_s64(
3519	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3520	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3521	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
3522	// CHECK: ret i64 [[VGET_LANE]]
// Returns vgetq_lane_s64(a, 1); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 1.
3523	int64_t test_vgetq_lane_s64(int64x2_t a) {
3524	return vgetq_lane_s64(a, 1);
3525	}
3526
3527	// CHECK-LABEL: @test_vgetq_lane_u64(
3528	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3529	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3530	// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
3531	// CHECK: ret i64 [[VGET_LANE]]
// Returns vgetq_lane_u64(a, 1); the directives above expect a <16 x i8> bitcast round-trip, then an extractelement at index 1.
3532	uint64_t test_vgetq_lane_u64(uint64x2_t a) {
3533	return vgetq_lane_u64(a, 1);
3534	}
3535
3536	// CHECK-LABEL: @test_vget_low_s8(
3537	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3538	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Returns vget_low_s8(a); the directives above expect a shufflevector selecting elements 0..7 of %a.
3539	int8x8_t test_vget_low_s8(int8x16_t a) {
3540	return vget_low_s8(a);
3541	}
3542
3543	// CHECK-LABEL: @test_vget_low_s16(
3544	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3545	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Returns vget_low_s16(a); the directives above expect a shufflevector selecting elements 0..3 of %a.
3546	int16x4_t test_vget_low_s16(int16x8_t a) {
3547	return vget_low_s16(a);
3548	}
3549
3550	// CHECK-LABEL: @test_vget_low_s32(
3551	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
3552	// CHECK: ret <2 x i32> [[SHUFFLE_I]]
// Returns vget_low_s32(a); the directives above expect a shufflevector selecting elements 0 and 1 of %a.
3553	int32x2_t test_vget_low_s32(int32x4_t a) {
3554	return vget_low_s32(a);
3555	}
3556
3557	// CHECK-LABEL: @test_vget_low_s64(
3558	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
3559	// CHECK: ret <1 x i64> [[SHUFFLE_I]]
// Returns vget_low_s64(a); the directives above expect a shufflevector of %a whose mask is zeroinitializer.
3560	int64x1_t test_vget_low_s64(int64x2_t a) {
3561	return vget_low_s64(a);
3562	}
3563
3564	// CHECK-LABEL: @test_vget_low_f16(
3565	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3566	// CHECK: ret <4 x half> [[SHUFFLE_I]]
// Returns vget_low_f16(a); the directives above expect a shufflevector selecting elements 0..3 of the <8 x half> %a.
3567	float16x4_t test_vget_low_f16(float16x8_t a) {
3568	return vget_low_f16(a);
3569	}
3570
3571	// CHECK-LABEL: @test_vget_low_f32(
3572	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
3573	// CHECK: ret <2 x float> [[SHUFFLE_I]]
// Returns vget_low_f32(a); the directives above expect a shufflevector selecting elements 0 and 1 of the <4 x float> %a.
3574	float32x2_t test_vget_low_f32(float32x4_t a) {
3575	return vget_low_f32(a);
3576	}
3577
3578	// CHECK-LABEL: @test_vget_low_u8(
3579	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3580	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Returns vget_low_u8(a); the directives above expect a shufflevector selecting elements 0..7 of %a.
3581	uint8x8_t test_vget_low_u8(uint8x16_t a) {
3582	return vget_low_u8(a);
3583	}
3584
3585	// CHECK-LABEL: @test_vget_low_u16(
3586	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3587	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Returns vget_low_u16(a); the directives above expect a shufflevector selecting elements 0..3 of %a.
3588	uint16x4_t test_vget_low_u16(uint16x8_t a) {
3589	return vget_low_u16(a);
3590	}
3591
3592	// CHECK-LABEL: @test_vget_low_u32(
3593	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
3594	// CHECK: ret <2 x i32> [[SHUFFLE_I]]
// Returns vget_low_u32(a); the directives above expect a shufflevector selecting elements 0 and 1 of %a.
3595	uint32x2_t test_vget_low_u32(uint32x4_t a) {
3596	return vget_low_u32(a);
3597	}
3598
3599	// CHECK-LABEL: @test_vget_low_u64(
3600	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
3601	// CHECK: ret <1 x i64> [[SHUFFLE_I]]
// Returns vget_low_u64(a); the directives above expect a shufflevector of %a whose mask is zeroinitializer.
3602	uint64x1_t test_vget_low_u64(uint64x2_t a) {
3603	return vget_low_u64(a);
3604	}
3605
3606	// CHECK-LABEL: @test_vget_low_p8(
3607	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3608	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Returns vget_low_p8(a); the directives above expect a shufflevector selecting elements 0..7 of %a.
3609	poly8x8_t test_vget_low_p8(poly8x16_t a) {
3610	return vget_low_p8(a);
3611	}
3612
3613	// CHECK-LABEL: @test_vget_low_p16(
3614	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3615	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Returns vget_low_p16(a); the directives above expect a shufflevector selecting elements 0..3 of %a.
3616	poly16x4_t test_vget_low_p16(poly16x8_t a) {
3617	return vget_low_p16(a);
3618	}
3619
3620	// CHECK-LABEL: @test_vhadd_s8(
3621	// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
3622	// CHECK: ret <8 x i8> [[VHADD_V_I]]
// Returns vhadd_s8(a, b); the directives above expect a call to @llvm.arm.neon.vhadds.v8i8 on %a and %b.
3623	int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
3624	return vhadd_s8(a, b);
3625	}
3626
3627	// CHECK-LABEL: @test_vhadd_s16(
3628	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3629	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3630	// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
3631	// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
3632	// CHECK: ret <4 x i16> [[VHADD_V2_I]]
// Returns vhadd_s16(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhadds.v4i16.
3633	int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
3634	return vhadd_s16(a, b);
3635	}
3636
3637	// CHECK-LABEL: @test_vhadd_s32(
3638	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3639	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3640	// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
3641	// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
3642	// CHECK: ret <2 x i32> [[VHADD_V2_I]]
// Returns vhadd_s32(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhadds.v2i32.
3643	int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
3644	return vhadd_s32(a, b);
3645	}
3646
3647	// CHECK-LABEL: @test_vhadd_u8(
3648	// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
3649	// CHECK: ret <8 x i8> [[VHADD_V_I]]
// Returns vhadd_u8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhaddu.v8i8 on %a and %b.
3650	uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
3651	return vhadd_u8(a, b);
3652	}
3653
3654	// CHECK-LABEL: @test_vhadd_u16(
3655	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3656	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3657	// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
3658	// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
3659	// CHECK: ret <4 x i16> [[VHADD_V2_I]]
// Returns vhadd_u16(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhaddu.v4i16.
3660	uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
3661	return vhadd_u16(a, b);
3662	}
3663
3664	// CHECK-LABEL: @test_vhadd_u32(
3665	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3666	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3667	// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
3668	// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
3669	// CHECK: ret <2 x i32> [[VHADD_V2_I]]
// Returns vhadd_u32(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhaddu.v2i32.
3670	uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
3671	return vhadd_u32(a, b);
3672	}
3673
3674	// CHECK-LABEL: @test_vhaddq_s8(
3675	// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
3676	// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
// Returns vhaddq_s8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhadds.v16i8 on %a and %b.
3677	int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
3678	return vhaddq_s8(a, b);
3679	}
3680
3681	// CHECK-LABEL: @test_vhaddq_s16(
3682	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3683	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3684	// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
3685	// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
3686	// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
// Returns vhaddq_s16(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhadds.v8i16.
3687	int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
3688	return vhaddq_s16(a, b);
3689	}
3690
3691	// CHECK-LABEL: @test_vhaddq_s32(
3692	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3693	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3694	// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
3695	// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
3696	// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
// Returns vhaddq_s32(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhadds.v4i32.
3697	int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
3698	return vhaddq_s32(a, b);
3699	}
3700
3701	// CHECK-LABEL: @test_vhaddq_u8(
3702	// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
3703	// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
// Returns vhaddq_u8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhaddu.v16i8 on %a and %b.
3704	uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
3705	return vhaddq_u8(a, b);
3706	}
3707
3708	// CHECK-LABEL: @test_vhaddq_u16(
3709	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3710	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3711	// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
3712	// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
3713	// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
// Returns vhaddq_u16(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhaddu.v8i16.
3714	uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
3715	return vhaddq_u16(a, b);
3716	}
3717
3718	// CHECK-LABEL: @test_vhaddq_u32(
3719	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3720	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3721	// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
3722	// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
3723	// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
// Returns vhaddq_u32(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhaddu.v4i32.
3724	uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
3725	return vhaddq_u32(a, b);
3726	}
3727
3728	// CHECK-LABEL: @test_vhsub_s8(
3729	// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b)
3730	// CHECK: ret <8 x i8> [[VHSUB_V_I]]
// Returns vhsub_s8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhsubs.v8i8 on %a and %b.
3731	int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
3732	return vhsub_s8(a, b);
3733	}
3734
3735	// CHECK-LABEL: @test_vhsub_s16(
3736	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3737	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3738	// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b)
3739	// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
3740	// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
// Returns vhsub_s16(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhsubs.v4i16.
3741	int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
3742	return vhsub_s16(a, b);
3743	}
3744
3745	// CHECK-LABEL: @test_vhsub_s32(
3746	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3747	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3748	// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b)
3749	// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
3750	// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
// Returns vhsub_s32(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhsubs.v2i32.
3751	int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
3752	return vhsub_s32(a, b);
3753	}
3754
3755	// CHECK-LABEL: @test_vhsub_u8(
3756	// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b)
3757	// CHECK: ret <8 x i8> [[VHSUB_V_I]]
// Returns vhsub_u8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhsubu.v8i8 on %a and %b.
3758	uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
3759	return vhsub_u8(a, b);
3760	}
3761
3762	// CHECK-LABEL: @test_vhsub_u16(
3763	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3764	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3765	// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %a, <4 x i16> %b)
3766	// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
3767	// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
// Returns vhsub_u16(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhsubu.v4i16.
3768	uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
3769	return vhsub_u16(a, b);
3770	}
3771
3772	// CHECK-LABEL: @test_vhsub_u32(
3773	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3774	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3775	// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b)
3776	// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
3777	// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
// Returns vhsub_u32(a, b); the FileCheck lines above expect <8 x i8> bitcasts around a call to @llvm.arm.neon.vhsubu.v2i32.
3778	uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
3779	return vhsub_u32(a, b);
3780	}
3781
3782	// CHECK-LABEL: @test_vhsubq_s8(
3783	// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b)
3784	// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
// Returns vhsubq_s8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhsubs.v16i8 on %a and %b.
3785	int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
3786	return vhsubq_s8(a, b);
3787	}
3788
3789	// CHECK-LABEL: @test_vhsubq_s16(
3790	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3791	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3792	// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b)
3793	// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
3794	// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
// Returns vhsubq_s16(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhsubs.v8i16.
3795	int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
3796	return vhsubq_s16(a, b);
3797	}
3798
3799	// CHECK-LABEL: @test_vhsubq_s32(
3800	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3801	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3802	// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b)
3803	// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
3804	// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
// Returns vhsubq_s32(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhsubs.v4i32.
3805	int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
3806	return vhsubq_s32(a, b);
3807	}
3808
3809	// CHECK-LABEL: @test_vhsubq_u8(
3810	// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b)
3811	// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
// Returns vhsubq_u8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vhsubu.v16i8 on %a and %b.
3812	uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
3813	return vhsubq_u8(a, b);
3814	}
3815
3816	// CHECK-LABEL: @test_vhsubq_u16(
3817	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3818	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
3819	// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b)
3820	// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
3821	// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
// Returns vhsubq_u16(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhsubu.v8i16.
3822	uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
3823	return vhsubq_u16(a, b);
3824	}
3825
3826	// CHECK-LABEL: @test_vhsubq_u32(
3827	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3828	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
3829	// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b)
3830	// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
3831	// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
// Returns vhsubq_u32(a, b); the FileCheck lines above expect <16 x i8> bitcasts around a call to @llvm.arm.neon.vhsubu.v4i32.
3832	uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
3833	return vhsubq_u32(a, b);
3834	}
3835
3836	// CHECK-LABEL: @test_vld1q_u8(
3837	// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
3838	// CHECK: ret <16 x i8> [[VLD1]]
// Returns vld1q_u8(a); the FileCheck lines above expect %a passed straight to @llvm.arm.neon.vld1.v16i8.p0i8.
3839	uint8x16_t test_vld1q_u8(uint8_t const * a) {
3840	return vld1q_u8(a);
3841	}
3842
3843	// CHECK-LABEL: @test_vld1q_u16(
3844	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
3845	// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
3846	// CHECK: ret <8 x i16> [[VLD1]]
// Returns vld1q_u16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v8i16.p0i8.
3847	uint16x8_t test_vld1q_u16(uint16_t const * a) {
3848	return vld1q_u16(a);
3849	}
3850
3851	// CHECK-LABEL: @test_vld1q_u32(
3852	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
3853	// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
3854	// CHECK: ret <4 x i32> [[VLD1]]
// Returns vld1q_u32(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4i32.p0i8.
3855	uint32x4_t test_vld1q_u32(uint32_t const * a) {
3856	return vld1q_u32(a);
3857	}
3858
3859	// CHECK-LABEL: @test_vld1q_u64(
3860	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
3861	// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
3862	// CHECK: ret <2 x i64> [[VLD1]]
// Returns vld1q_u64(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v2i64.p0i8.
3863	uint64x2_t test_vld1q_u64(uint64_t const * a) {
3864	return vld1q_u64(a);
3865	}
3866
3867	// CHECK-LABEL: @test_vld1q_s8(
3868	// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
3869	// CHECK: ret <16 x i8> [[VLD1]]
// Returns vld1q_s8(a); the FileCheck lines above expect %a passed straight to @llvm.arm.neon.vld1.v16i8.p0i8.
3870	int8x16_t test_vld1q_s8(int8_t const * a) {
3871	return vld1q_s8(a);
3872	}
3873
3874	// CHECK-LABEL: @test_vld1q_s16(
3875	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
3876	// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
3877	// CHECK: ret <8 x i16> [[VLD1]]
// Returns vld1q_s16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v8i16.p0i8.
3878	int16x8_t test_vld1q_s16(int16_t const * a) {
3879	return vld1q_s16(a);
3880	}
3881
3882	// CHECK-LABEL: @test_vld1q_s32(
3883	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
3884	// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
3885	// CHECK: ret <4 x i32> [[VLD1]]
// Returns vld1q_s32(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4i32.p0i8.
3886	int32x4_t test_vld1q_s32(int32_t const * a) {
3887	return vld1q_s32(a);
3888	}
3889
3890	// CHECK-LABEL: @test_vld1q_s64(
3891	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
3892	// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
3893	// CHECK: ret <2 x i64> [[VLD1]]
// Returns vld1q_s64(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v2i64.p0i8.
3894	int64x2_t test_vld1q_s64(int64_t const * a) {
3895	return vld1q_s64(a);
3896	}
3897
3898	// CHECK-LABEL: @test_vld1q_f16(
3899	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
3900	// CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0i8(i8* [[TMP0]], i32 2)
3901	// CHECK: ret <8 x half> [[VLD1]]
// Returns vld1q_f16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v8f16.p0i8.
3902	float16x8_t test_vld1q_f16(float16_t const * a) {
3903	return vld1q_f16(a);
3904	}
3905
3906	// CHECK-LABEL: @test_vld1q_f32(
3907	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
3908	// CHECK: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4)
3909	// CHECK: ret <4 x float> [[VLD1]]
// Returns vld1q_f32(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4f32.p0i8.
3910	float32x4_t test_vld1q_f32(float32_t const * a) {
3911	return vld1q_f32(a);
3912	}
3913
3914	// CHECK-LABEL: @test_vld1q_p8(
3915	// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
3916	// CHECK: ret <16 x i8> [[VLD1]]
// Returns vld1q_p8(a); the FileCheck lines above expect %a passed straight to @llvm.arm.neon.vld1.v16i8.p0i8.
3917	poly8x16_t test_vld1q_p8(poly8_t const * a) {
3918	return vld1q_p8(a);
3919	}
3920
3921	// CHECK-LABEL: @test_vld1q_p16(
3922	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
3923	// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
3924	// CHECK: ret <8 x i16> [[VLD1]]
// Returns vld1q_p16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v8i16.p0i8.
3925	poly16x8_t test_vld1q_p16(poly16_t const * a) {
3926	return vld1q_p16(a);
3927	}
3928
3929	// CHECK-LABEL: @test_vld1_u8(
3930	// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
3931	// CHECK: ret <8 x i8> [[VLD1]]
// Returns vld1_u8(a); the FileCheck lines above expect %a passed straight to @llvm.arm.neon.vld1.v8i8.p0i8.
3932	uint8x8_t test_vld1_u8(uint8_t const * a) {
3933	return vld1_u8(a);
3934	}
3935
3936	// CHECK-LABEL: @test_vld1_u16(
3937	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
3938	// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
3939	// CHECK: ret <4 x i16> [[VLD1]]
// Returns vld1_u16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4i16.p0i8.
3940	uint16x4_t test_vld1_u16(uint16_t const * a) {
3941	return vld1_u16(a);
3942	}
3943
3944	// CHECK-LABEL: @test_vld1_u32(
3945	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
3946	// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
3947	// CHECK: ret <2 x i32> [[VLD1]]
// Returns vld1_u32(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v2i32.p0i8.
3948	uint32x2_t test_vld1_u32(uint32_t const * a) {
3949	return vld1_u32(a);
3950	}
3951
3952	// CHECK-LABEL: @test_vld1_u64(
3953	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
3954	// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
3955	// CHECK: ret <1 x i64> [[VLD1]]
// Returns vld1_u64(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v1i64.p0i8.
3956	uint64x1_t test_vld1_u64(uint64_t const * a) {
3957	return vld1_u64(a);
3958	}
3959
3960	// CHECK-LABEL: @test_vld1_s8(
3961	// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
3962	// CHECK: ret <8 x i8> [[VLD1]]
// Returns vld1_s8(a); the FileCheck lines above expect %a passed straight to @llvm.arm.neon.vld1.v8i8.p0i8.
3963	int8x8_t test_vld1_s8(int8_t const * a) {
3964	return vld1_s8(a);
3965	}
3966
3967	// CHECK-LABEL: @test_vld1_s16(
3968	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
3969	// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
3970	// CHECK: ret <4 x i16> [[VLD1]]
// Returns vld1_s16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4i16.p0i8.
3971	int16x4_t test_vld1_s16(int16_t const * a) {
3972	return vld1_s16(a);
3973	}
3974
3975	// CHECK-LABEL: @test_vld1_s32(
3976	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
3977	// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
3978	// CHECK: ret <2 x i32> [[VLD1]]
// Returns vld1_s32(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v2i32.p0i8.
3979	int32x2_t test_vld1_s32(int32_t const * a) {
3980	return vld1_s32(a);
3981	}
3982
3983	// CHECK-LABEL: @test_vld1_s64(
3984	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
3985	// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
3986	// CHECK: ret <1 x i64> [[VLD1]]
// Returns vld1_s64(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v1i64.p0i8.
3987	int64x1_t test_vld1_s64(int64_t const * a) {
3988	return vld1_s64(a);
3989	}
3990
3991	// CHECK-LABEL: @test_vld1_f16(
3992	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
3993	// CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0i8(i8* [[TMP0]], i32 2)
3994	// CHECK: ret <4 x half> [[VLD1]]
// Returns vld1_f16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4f16.p0i8.
3995	float16x4_t test_vld1_f16(float16_t const * a) {
3996	return vld1_f16(a);
3997	}
3998
3999	// CHECK-LABEL: @test_vld1_f32(
4000	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4001	// CHECK: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4)
4002	// CHECK: ret <2 x float> [[VLD1]]
// Returns vld1_f32(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v2f32.p0i8.
4003	float32x2_t test_vld1_f32(float32_t const * a) {
4004	return vld1_f32(a);
4005	}
4006
4007	// CHECK-LABEL: @test_vld1_p8(
4008	// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
4009	// CHECK: ret <8 x i8> [[VLD1]]
// Returns vld1_p8(a); the FileCheck lines above expect %a passed straight to @llvm.arm.neon.vld1.v8i8.p0i8.
4010	poly8x8_t test_vld1_p8(poly8_t const * a) {
4011	return vld1_p8(a);
4012	}
4013
4014	// CHECK-LABEL: @test_vld1_p16(
4015	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4016	// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
4017	// CHECK: ret <4 x i16> [[VLD1]]
// Returns vld1_p16(a); the FileCheck lines above expect %a bitcast and passed to @llvm.arm.neon.vld1.v4i16.p0i8.
4018	poly16x4_t test_vld1_p16(poly16_t const * a) {
4019	return vld1_p16(a);
4020	}
4021
4022	// CHECK-LABEL: @test_vld1q_dup_u8(
4023	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4024	// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
4025	// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
4026	// CHECK: ret <16 x i8> [[LANE]]
// Returns vld1q_dup_u8(a); the FileCheck lines above expect an i8 load, an insertelement at index 0, and a zero-mask shufflevector.
4027	uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
4028	return vld1q_dup_u8(a);
4029	}
4030
4031	// CHECK-LABEL: @test_vld1q_dup_u16(
4032	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4033	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4034	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4035	// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
4036	// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
4037	// CHECK: ret <8 x i16> [[LANE]]
// Returns vld1q_dup_u16(a); the FileCheck lines above expect an i16 load, an insertelement at index 0, and a zero-mask shufflevector.
4038	uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
4039	return vld1q_dup_u16(a);
4040	}
4041
4042	// CHECK-LABEL: @test_vld1q_dup_u32(
4043	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4044	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
4045	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
4046	// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
4047	// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
4048	// CHECK: ret <4 x i32> [[LANE]]
// Returns vld1q_dup_u32(a); the FileCheck lines above expect an i32 load, an insertelement at index 0, and a zero-mask shufflevector.
4049	uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
4050	return vld1q_dup_u32(a);
4051	}
4052
4053	// CHECK-LABEL: @test_vld1q_dup_u64(
4054	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4055	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
4056	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
4057	// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
4058	// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
4059	// CHECK: ret <2 x i64> [[LANE]]
// Returns vld1q_dup_u64(a); the FileCheck lines above expect an i64 load, an insertelement at index 0, and a zero-mask shufflevector.
4060	uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
4061	return vld1q_dup_u64(a);
4062	}
4063
4064	// CHECK-LABEL: @test_vld1q_dup_s8(
4065	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4066	// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
4067	// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
4068	// CHECK: ret <16 x i8> [[LANE]]
// Returns vld1q_dup_s8(a); the FileCheck lines above expect an i8 load, an insertelement at index 0, and a zero-mask shufflevector.
4069	int8x16_t test_vld1q_dup_s8(int8_t const * a) {
4070	return vld1q_dup_s8(a);
4071	}
4072
4073	// CHECK-LABEL: @test_vld1q_dup_s16(
4074	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4075	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4076	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4077	// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
4078	// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
4079	// CHECK: ret <8 x i16> [[LANE]]
// Returns vld1q_dup_s16(a); the FileCheck lines above expect an i16 load, an insertelement at index 0, and a zero-mask shufflevector.
4080	int16x8_t test_vld1q_dup_s16(int16_t const * a) {
4081	return vld1q_dup_s16(a);
4082	}
4083
4084	// CHECK-LABEL: @test_vld1q_dup_s32(
4085	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4086	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
4087	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
4088	// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
4089	// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
4090	// CHECK: ret <4 x i32> [[LANE]]
// Returns vld1q_dup_s32(a); the FileCheck lines above expect an i32 load, an insertelement at index 0, and a zero-mask shufflevector.
4091	int32x4_t test_vld1q_dup_s32(int32_t const * a) {
4092	return vld1q_dup_s32(a);
4093	}
4094
4095	// CHECK-LABEL: @test_vld1q_dup_s64(
4096	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4097	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
4098	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
4099	// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
4100	// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
4101	// CHECK: ret <2 x i64> [[LANE]]
// Returns vld1q_dup_s64(a); the FileCheck lines above expect an i64 load, an insertelement at index 0, and a zero-mask shufflevector.
4102	int64x2_t test_vld1q_dup_s64(int64_t const * a) {
4103	return vld1q_dup_s64(a);
4104	}
4105
4106	// CHECK-LABEL: @test_vld1q_dup_f16(
4107	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
4108	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
4109	// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
4110	// CHECK: [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
4111	// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
4112	// CHECK: ret <8 x half> [[LANE]]
// Returns vld1q_dup_f16(a); the FileCheck lines above expect a half load, an insertelement at index 0, and a zero-mask shufflevector.
4113	float16x8_t test_vld1q_dup_f16(float16_t const * a) {
4114	return vld1q_dup_f16(a);
4115	}
4116
4117	// CHECK-LABEL: @test_vld1q_dup_f32(
4118	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4119	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
4120	// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
4121	// CHECK: [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
4122	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
4123	// CHECK: ret <4 x float> [[LANE]]
// Returns vld1q_dup_f32(a); the FileCheck lines above expect a float load, an insertelement at index 0, and a zero-mask shufflevector.
4124	float32x4_t test_vld1q_dup_f32(float32_t const * a) {
4125	return vld1q_dup_f32(a);
4126	}
4127
4128	// CHECK-LABEL: @test_vld1q_dup_p8(
4129	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4130	// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
4131	// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
4132	// CHECK: ret <16 x i8> [[LANE]]
// Returns vld1q_dup_p8(a); the FileCheck lines above expect an i8 load, an insertelement at index 0, and a zero-mask shufflevector.
4133	poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
4134	return vld1q_dup_p8(a);
4135	}
4136
4137	// CHECK-LABEL: @test_vld1q_dup_p16(
4138	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4139	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4140	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4141	// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
4142	// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
4143	// CHECK: ret <8 x i16> [[LANE]]
// Returns vld1q_dup_p16(a); the FileCheck lines above expect an i16 load, an insertelement at index 0, and a zero-mask shufflevector.
4144	poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
4145	return vld1q_dup_p16(a);
4146	}
4147
4148	// CHECK-LABEL: @test_vld1_dup_u8(
4149	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4150	// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
4151	// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
4152	// CHECK: ret <8 x i8> [[LANE]]
// Returns vld1_dup_u8(a); the FileCheck lines above expect an i8 load, an insertelement at index 0, and a zero-mask shufflevector.
4153	uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
4154	return vld1_dup_u8(a);
4155	}
4156
4157	// CHECK-LABEL: @test_vld1_dup_u16(
4158	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4159	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4160	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4161	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4162	// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4163	// CHECK: ret <4 x i16> [[LANE]]
// Returns vld1_dup_u16(a); the FileCheck lines above expect an i16 load, an insertelement at index 0, and a zero-mask shufflevector.
4164	uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
4165	return vld1_dup_u16(a);
4166	}
4167
4168	// CHECK-LABEL: @test_vld1_dup_u32(
4169	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4170	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
4171	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
4172	// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
4173	// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
4174	// CHECK: ret <2 x i32> [[LANE]]
// Returns vld1_dup_u32(a); the FileCheck lines above expect an i32 load, an insertelement at index 0, and a zero-mask shufflevector.
4175	uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
4176	return vld1_dup_u32(a);
4177	}
4178
4179	// CHECK-LABEL: @test_vld1_dup_u64(
4180	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4181	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
4182	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
4183	// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
4184	// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
4185	// CHECK: ret <1 x i64> [[LANE]]
// Returns vld1_dup_u64(a); the FileCheck lines above expect an i64 load, an insertelement at index 0, and a zero-mask shufflevector.
4186	uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
4187	return vld1_dup_u64(a);
4188	}
4189
4190	// CHECK-LABEL: @test_vld1_dup_s8(
4191	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4192	// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
4193	// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
4194	// CHECK: ret <8 x i8> [[LANE]]
// Returns vld1_dup_s8(a); the FileCheck lines above expect an i8 load, an insertelement at index 0, and a zero-mask shufflevector.
4195	int8x8_t test_vld1_dup_s8(int8_t const * a) {
4196	return vld1_dup_s8(a);
4197	}
4198
4199	// CHECK-LABEL: @test_vld1_dup_s16(
4200	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4201	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4202	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4203	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4204	// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4205	// CHECK: ret <4 x i16> [[LANE]]
// Returns vld1_dup_s16(a); the FileCheck lines above expect an i16 load, an insertelement at index 0, and a zero-mask shufflevector.
4206	int16x4_t test_vld1_dup_s16(int16_t const * a) {
4207	return vld1_dup_s16(a);
4208	}
4209
4210	// CHECK-LABEL: @test_vld1_dup_s32(
4211	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4212	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
4213	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
4214	// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
4215	// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
4216	// CHECK: ret <2 x i32> [[LANE]]
// Returns vld1_dup_s32(a); the FileCheck lines above expect an i32 load, an insertelement at index 0, and a zero-mask shufflevector.
4217	int32x2_t test_vld1_dup_s32(int32_t const * a) {
4218	return vld1_dup_s32(a);
4219	}
4220
4221	// CHECK-LABEL: @test_vld1_dup_s64(
4222	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4223	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
4224	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
4225	// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
4226	// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
4227	// CHECK: ret <1 x i64> [[LANE]]
// Returns vld1_dup_s64(a); the FileCheck lines above expect an i64 load, an insertelement at index 0, and a zero-mask shufflevector.
4228	int64x1_t test_vld1_dup_s64(int64_t const * a) {
4229	return vld1_dup_s64(a);
4230	}
4231
4232	// CHECK-LABEL: @test_vld1_dup_f16(
4233	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
4234	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
4235	// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
4236	// CHECK: [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
4237	// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
4238	// CHECK: ret <4 x half> [[LANE]]
// Returns vld1_dup_f16(a); the FileCheck lines above expect a half load, an insertelement at index 0, and a zero-mask shufflevector.
4239	float16x4_t test_vld1_dup_f16(float16_t const * a) {
4240	return vld1_dup_f16(a);
4241	}
4242
4243	// CHECK-LABEL: @test_vld1_dup_f32(
4244	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4245	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
4246	// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
4247	// CHECK: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
4248	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
4249	// CHECK: ret <2 x float> [[LANE]]
// Returns vld1_dup_f32(a); the FileCheck lines above expect a float load, an insertelement at index 0, and a zero-mask shufflevector.
4250	float32x2_t test_vld1_dup_f32(float32_t const * a) {
4251	return vld1_dup_f32(a);
4252	}
4253
4254	// CHECK-LABEL: @test_vld1_dup_p8(
4255	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4256	// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
4257	// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
4258	// CHECK: ret <8 x i8> [[LANE]]
// Returns vld1_dup_p8(a); the FileCheck lines above expect an i8 load, an insertelement at index 0, and a zero-mask shufflevector.
4259	poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
4260	return vld1_dup_p8(a);
4261	}
4262
4263	// CHECK-LABEL: @test_vld1_dup_p16(
4264	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4265	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4266	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4267	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4268	// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4269	// CHECK: ret <4 x i16> [[LANE]]
// Returns vld1_dup_p16(a); the FileCheck lines above expect an i16 load, an insertelement at index 0, and a zero-mask shufflevector.
4270	poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
4271	return vld1_dup_p16(a);
4272	}
4273
4274	// CHECK-LABEL: @test_vld1q_lane_u8(
4275	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4276	// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
4277	// CHECK: ret <16 x i8> [[VLD1_LANE]]
// Returns vld1q_lane_u8(a, b, 15); the FileCheck lines above expect an i8 load inserted into %b at index 15.
4278	uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
4279	return vld1q_lane_u8(a, b, 15);
4280	}
4281
4282	// CHECK-LABEL: @test_vld1q_lane_u16(
4283	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4284	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4285	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4286	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4287	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4288	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4289	// CHECK: ret <8 x i16> [[VLD1_LANE]]
// Returns vld1q_lane_u16(a, b, 7); the FileCheck lines above expect an i16 load inserted at index 7.
4290	uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
4291	return vld1q_lane_u16(a, b, 7);
4292	}
4293
4294	// CHECK-LABEL: @test_vld1q_lane_u32(
4295	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4296	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4297	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4298	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4299	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4300	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
4301	// CHECK: ret <4 x i32> [[VLD1_LANE]]
// Returns vld1q_lane_u32(a, b, 3); the FileCheck lines above expect an i32 load inserted at index 3.
4302	uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
4303	return vld1q_lane_u32(a, b, 3);
4304	}
4305
4306	// CHECK-LABEL: @test_vld1q_lane_u64(
4307	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4308	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4309	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4310	// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
4311	// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
4312	// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
4313	// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
// Returns vld1q_lane_u64(a, b, 1); the FileCheck lines above expect lane 0 of %b to be extracted, a <1 x i64> loaded via @llvm.arm.neon.vld1.v1i64.p0i8, and the two joined by a shufflevector.
4314	uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
4315	return vld1q_lane_u64(a, b, 1);
4316	}
4317
4318	// CHECK-LABEL: @test_vld1q_lane_s8(
4319	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4320	// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
4321	// CHECK: ret <16 x i8> [[VLD1_LANE]]
// Returns vld1q_lane_s8(a, b, 15); the FileCheck lines above expect an i8 load inserted into %b at index 15.
4322	int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
4323	return vld1q_lane_s8(a, b, 15);
4324	}
4325
4326	// CHECK-LABEL: @test_vld1q_lane_s16(
4327	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4328	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4329	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4330	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4331	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4332	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4333	// CHECK: ret <8 x i16> [[VLD1_LANE]]
4334	int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
4335	return vld1q_lane_s16(a, b, 7);
4336	}
4337
// vld1q_lane_s32, lane 3: expects a scalar i32 load from a inserted into element 3 of b.
4338	// CHECK-LABEL: @test_vld1q_lane_s32(
4339	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4340	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4341	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4342	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4343	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4344	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
4345	// CHECK: ret <4 x i32> [[VLD1_LANE]]
4346	int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
4347	return vld1q_lane_s32(a, b, 3);
4348	}
4349
// vld1q_lane_s64, lane 1: expects element 0 of b kept via shufflevector, the new
// value loaded with llvm.arm.neon.vld1.v1i64, and both recombined into a <2 x i64>.
4350	// CHECK-LABEL: @test_vld1q_lane_s64(
4351	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4352	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4353	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4354	// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
4355	// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
4356	// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
4357	// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
4358	int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
4359	return vld1q_lane_s64(a, b, 1);
4360	}
4361
// vld1q_lane_f16, lane 7: expects a scalar half load from a inserted into element 7 of b.
4362	// CHECK-LABEL: @test_vld1q_lane_f16(
4363	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
4364	// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
4365	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
4366	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
4367	// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
4368	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
4369	// CHECK: ret <8 x half> [[VLD1_LANE]]
4370	float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
4371	return vld1q_lane_f16(a, b, 7);
4372	}
4373
// vld1q_lane_f32, lane 3: expects a scalar float load from a inserted into element 3 of b.
4374	// CHECK-LABEL: @test_vld1q_lane_f32(
4375	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4376	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
4377	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
4378	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
4379	// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
4380	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
4381	// CHECK: ret <4 x float> [[VLD1_LANE]]
4382	float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
4383	return vld1q_lane_f32(a, b, 3);
4384	}
4385
// vld1q_lane_p8, lane 15: expects an i8 load directly from a inserted into element 15 of b.
4386	// CHECK-LABEL: @test_vld1q_lane_p8(
4387	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4388	// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
4389	// CHECK: ret <16 x i8> [[VLD1_LANE]]
4390	poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
4391	return vld1q_lane_p8(a, b, 15);
4392	}
4393
// vld1q_lane_p16, lane 7: expects a scalar i16 load from a inserted into element 7 of b.
4394	// CHECK-LABEL: @test_vld1q_lane_p16(
4395	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4396	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4397	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4398	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4399	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4400	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4401	// CHECK: ret <8 x i16> [[VLD1_LANE]]
4402	poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
4403	return vld1q_lane_p16(a, b, 7);
4404	}
4405
// vld1_lane_u8, lane 7: expects an i8 load directly from a inserted into element 7 of b.
4406	// CHECK-LABEL: @test_vld1_lane_u8(
4407	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4408	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
4409	// CHECK: ret <8 x i8> [[VLD1_LANE]]
4410	uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
4411	return vld1_lane_u8(a, b, 7);
4412	}
4413
// vld1_lane_u16, lane 3: expects a scalar i16 load from a inserted into element 3 of b.
4414	// CHECK-LABEL: @test_vld1_lane_u16(
4415	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4416	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4417	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4418	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4419	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4420	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4421	// CHECK: ret <4 x i16> [[VLD1_LANE]]
4422	uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
4423	return vld1_lane_u16(a, b, 3);
4424	}
4425
// vld1_lane_u32, lane 1: expects a scalar i32 load from a inserted into element 1 of b.
4426	// CHECK-LABEL: @test_vld1_lane_u32(
4427	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4428	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4429	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4430	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4431	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4432	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
4433	// CHECK: ret <2 x i32> [[VLD1_LANE]]
4434	uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
4435	return vld1_lane_u32(a, b, 1);
4436	}
4437
// vld1_lane_u64, lane 0: expects an i64 load (align 4) inserted into the single element of b.
4438	// CHECK-LABEL: @test_vld1_lane_u64(
4439	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4440	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4441	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4442	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
4443	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
4444	// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
4445	// CHECK: ret <1 x i64> [[VLD1_LANE]]
4446	uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
4447	return vld1_lane_u64(a, b, 0);
4448	}
4449
// vld1_lane_s8, lane 7: expects an i8 load directly from a inserted into element 7 of b.
4450	// CHECK-LABEL: @test_vld1_lane_s8(
4451	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4452	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
4453	// CHECK: ret <8 x i8> [[VLD1_LANE]]
4454	int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
4455	return vld1_lane_s8(a, b, 7);
4456	}
4457
// vld1_lane_s16, lane 3: expects a scalar i16 load from a inserted into element 3 of b.
4458	// CHECK-LABEL: @test_vld1_lane_s16(
4459	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4460	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4461	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4462	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4463	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4464	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4465	// CHECK: ret <4 x i16> [[VLD1_LANE]]
4466	int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
4467	return vld1_lane_s16(a, b, 3);
4468	}
4469
// vld1_lane_s32, lane 1: expects a scalar i32 load from a inserted into element 1 of b.
4470	// CHECK-LABEL: @test_vld1_lane_s32(
4471	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4472	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4473	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4474	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4475	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4476	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
4477	// CHECK: ret <2 x i32> [[VLD1_LANE]]
4478	int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
4479	return vld1_lane_s32(a, b, 1);
4480	}
4481
// vld1_lane_s64, lane 0: expects an i64 load (align 4) inserted into the single element of b.
4482	// CHECK-LABEL: @test_vld1_lane_s64(
4483	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4484	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4485	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4486	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
4487	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
4488	// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
4489	// CHECK: ret <1 x i64> [[VLD1_LANE]]
4490	int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
4491	return vld1_lane_s64(a, b, 0);
4492	}
4493
// vld1_lane_f16, lane 3: expects a scalar half load from a inserted into element 3 of b.
4494	// CHECK-LABEL: @test_vld1_lane_f16(
4495	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
4496	// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
4497	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
4498	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
4499	// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
4500	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
4501	// CHECK: ret <4 x half> [[VLD1_LANE]]
4502	float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
4503	return vld1_lane_f16(a, b, 3);
4504	}
4505
// vld1_lane_f32, lane 1: expects a scalar float load from a inserted into element 1 of b.
4506	// CHECK-LABEL: @test_vld1_lane_f32(
4507	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4508	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
4509	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
4510	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
4511	// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
4512	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
4513	// CHECK: ret <2 x float> [[VLD1_LANE]]
4514	float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
4515	return vld1_lane_f32(a, b, 1);
4516	}
4517
// vld1_lane_p8, lane 7: expects an i8 load directly from a inserted into element 7 of b.
4518	// CHECK-LABEL: @test_vld1_lane_p8(
4519	// CHECK: [[TMP0:%.*]] = load i8, i8* %a, align 1
4520	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
4521	// CHECK: ret <8 x i8> [[VLD1_LANE]]
4522	poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
4523	return vld1_lane_p8(a, b, 7);
4524	}
4525
// vld1_lane_p16, lane 3: expects a scalar i16 load from a inserted into element 3 of b.
4526	// CHECK-LABEL: @test_vld1_lane_p16(
4527	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4528	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4529	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4530	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4531	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4532	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4533	// CHECK: ret <4 x i16> [[VLD1_LANE]]
4534	poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
4535	return vld1_lane_p16(a, b, 3);
4536	}
4537
// vld2q_u8: expects a uint8x16x2_t result alloca cast to i8* and a call whose
// return type begins { <16 x i8>, <16 x i8>; the rest of the call is not matched.
4538	// CHECK-LABEL: @test_vld2q_u8(
4539	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
4540	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
4541	// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
4542	uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
4543	return vld2q_u8(a);
4544	}
4545
// vld2q_u16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <8 x i16>, <8 x i16>; the rest of the call is not matched.
4546	// CHECK-LABEL: @test_vld2q_u16(
4547	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
4548	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
4549	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
4550	// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
4551	uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
4552	return vld2q_u16(a);
4553	}
4554
// vld2q_u32: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x i32>, <4 x i32>; the rest of the call is not matched.
4555	// CHECK-LABEL: @test_vld2q_u32(
4556	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
4557	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
4558	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
4559	// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
4560	uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
4561	return vld2q_u32(a);
4562	}
4563
// vld2q_s8: expects an int8x16x2_t result alloca cast to i8* and a call whose
// return type begins { <16 x i8>, <16 x i8>; the rest of the call is not matched.
4564	// CHECK-LABEL: @test_vld2q_s8(
4565	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
4566	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
4567	// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
4568	int8x16x2_t test_vld2q_s8(int8_t const * a) {
4569	return vld2q_s8(a);
4570	}
4571
// vld2q_s16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <8 x i16>, <8 x i16>; the rest of the call is not matched.
4572	// CHECK-LABEL: @test_vld2q_s16(
4573	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
4574	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
4575	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
4576	// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
4577	int16x8x2_t test_vld2q_s16(int16_t const * a) {
4578	return vld2q_s16(a);
4579	}
4580
// vld2q_s32: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x i32>, <4 x i32>; the rest of the call is not matched.
4581	// CHECK-LABEL: @test_vld2q_s32(
4582	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
4583	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
4584	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
4585	// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
4586	int32x4x2_t test_vld2q_s32(int32_t const * a) {
4587	return vld2q_s32(a);
4588	}
4589
// vld2q_f16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <8 x half>, <8 x half>; the rest of the call is not matched.
4590	// CHECK-LABEL: @test_vld2q_f16(
4591	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
4592	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
4593	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
4594	// CHECK: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half>
4595	float16x8x2_t test_vld2q_f16(float16_t const * a) {
4596	return vld2q_f16(a);
4597	}
4598
// vld2q_f32: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x float>, <4 x float>; the rest of the call is not matched.
4599	// CHECK-LABEL: @test_vld2q_f32(
4600	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
4601	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
4602	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
4603	// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
4604	float32x4x2_t test_vld2q_f32(float32_t const * a) {
4605	return vld2q_f32(a);
4606	}
4607
// vld2q_p8: expects a poly8x16x2_t result alloca cast to i8* and a call whose
// return type begins { <16 x i8>, <16 x i8>; the rest of the call is not matched.
4608	// CHECK-LABEL: @test_vld2q_p8(
4609	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
4610	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
4611	// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
4612	poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
4613	return vld2q_p8(a);
4614	}
4615
// vld2q_p16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <8 x i16>, <8 x i16>; the rest of the call is not matched.
4616	// CHECK-LABEL: @test_vld2q_p16(
4617	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
4618	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
4619	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
4620	// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
4621	poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
4622	return vld2q_p16(a);
4623	}
4624
// vld2_u8: expects a uint8x8x2_t result alloca (align 8) cast to i8* and a call
// whose return type begins { <8 x i8>, <8 x i8>; the rest of the call is not matched.
4625	// CHECK-LABEL: @test_vld2_u8(
4626	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
4627	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
4628	// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
4629	uint8x8x2_t test_vld2_u8(uint8_t const * a) {
4630	return vld2_u8(a);
4631	}
4632
// vld2_u16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x i16>, <4 x i16>; the rest of the call is not matched.
4633	// CHECK-LABEL: @test_vld2_u16(
4634	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
4635	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
4636	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
4637	// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
4638	uint16x4x2_t test_vld2_u16(uint16_t const * a) {
4639	return vld2_u16(a);
4640	}
4641
// vld2_u32: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <2 x i32>, <2 x i32>; the rest of the call is not matched.
4642	// CHECK-LABEL: @test_vld2_u32(
4643	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
4644	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
4645	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
4646	// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
4647	uint32x2x2_t test_vld2_u32(uint32_t const * a) {
4648	return vld2_u32(a);
4649	}
4650
// vld2_u64: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <1 x i64>, <1 x i64>; the rest of the call is not matched.
4651	// CHECK-LABEL: @test_vld2_u64(
4652	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
4653	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
4654	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
4655	// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
4656	uint64x1x2_t test_vld2_u64(uint64_t const * a) {
4657	return vld2_u64(a);
4658	}
4659
// vld2_s8: expects an int8x8x2_t result alloca (align 8) cast to i8* and a call
// whose return type begins { <8 x i8>, <8 x i8>; the rest of the call is not matched.
4660	// CHECK-LABEL: @test_vld2_s8(
4661	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
4662	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
4663	// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
4664	int8x8x2_t test_vld2_s8(int8_t const * a) {
4665	return vld2_s8(a);
4666	}
4667
// vld2_s16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x i16>, <4 x i16>; the rest of the call is not matched.
4668	// CHECK-LABEL: @test_vld2_s16(
4669	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
4670	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
4671	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
4672	// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
4673	int16x4x2_t test_vld2_s16(int16_t const * a) {
4674	return vld2_s16(a);
4675	}
4676
// vld2_s32: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <2 x i32>, <2 x i32>; the rest of the call is not matched.
4677	// CHECK-LABEL: @test_vld2_s32(
4678	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
4679	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
4680	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
4681	// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
4682	int32x2x2_t test_vld2_s32(int32_t const * a) {
4683	return vld2_s32(a);
4684	}
4685
// vld2_s64: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <1 x i64>, <1 x i64>; the rest of the call is not matched.
4686	// CHECK-LABEL: @test_vld2_s64(
4687	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
4688	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
4689	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
4690	// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
4691	int64x1x2_t test_vld2_s64(int64_t const * a) {
4692	return vld2_s64(a);
4693	}
4694
// vld2_f16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x half>, <4 x half>; the rest of the call is not matched.
4695	// CHECK-LABEL: @test_vld2_f16(
4696	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
4697	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
4698	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
4699	// CHECK: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half>
4700	float16x4x2_t test_vld2_f16(float16_t const * a) {
4701	return vld2_f16(a);
4702	}
4703
// vld2_f32: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <2 x float>, <2 x float>; the rest of the call is not matched.
4704	// CHECK-LABEL: @test_vld2_f32(
4705	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
4706	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
4707	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
4708	// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float>
4709	float32x2x2_t test_vld2_f32(float32_t const * a) {
4710	return vld2_f32(a);
4711	}
4712
// vld2_p8: expects a poly8x8x2_t result alloca (align 8) cast to i8* and a call
// whose return type begins { <8 x i8>, <8 x i8>; the rest of the call is not matched.
4713	// CHECK-LABEL: @test_vld2_p8(
4714	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
4715	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
4716	// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
4717	poly8x8x2_t test_vld2_p8(poly8_t const * a) {
4718	return vld2_p8(a);
4719	}
4720
// vld2_p16: expects the result alloca and a both cast to i8*, then a call whose
// return type begins { <4 x i16>, <4 x i16>; the rest of the call is not matched.
4721	// CHECK-LABEL: @test_vld2_p16(
4722	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
4723	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
4724	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
4725	// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
4726	poly16x4x2_t test_vld2_p16(poly16_t const * a) {
4727	return vld2_p16(a);
4728	}
4729
// vld2q_lane_u16, lane 7: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <8 x i16> members loaded from the copy, and a
// call whose return type begins { <8 x i16>, <8 x i16>; the call's tail is not matched.
4730	// CHECK-LABEL: @test_vld2q_lane_u16(
4731	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
4732	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
4733	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
4734	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
4735	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
4736	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4737	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
4738	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
4739	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4740	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
4741	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
4742	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
4743	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
4744	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
4745	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4746	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
4747	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
4748	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
4749	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4750	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4751	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4752	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
4753	uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
4754	return vld2q_lane_u16(a, b, 7);
4755	}
4756
// vld2q_lane_u32, lane 3: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <4 x i32> members loaded from the copy, and a
// call whose return type begins { <4 x i32>, <4 x i32>; the call's tail is not matched.
4757	// CHECK-LABEL: @test_vld2q_lane_u32(
4758	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
4759	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
4760	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
4761	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
4762	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
4763	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4764	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
4765	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
4766	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4767	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
4768	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
4769	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
4770	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
4771	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
4772	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4773	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
4774	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
4775	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
4776	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4777	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4778	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4779	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
4780	uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
4781	return vld2q_lane_u32(a, b, 3);
4782	}
4783
// vld2q_lane_s16, lane 7: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <8 x i16> members loaded from the copy, and a
// call whose return type begins { <8 x i16>, <8 x i16>; the call's tail is not matched.
4784	// CHECK-LABEL: @test_vld2q_lane_s16(
4785	// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
4786	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
4787	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
4788	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
4789	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
4790	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4791	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
4792	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
4793	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4794	// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
4795	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
4796	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
4797	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
4798	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
4799	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4800	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
4801	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
4802	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
4803	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4804	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4805	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4806	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
4807	int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
4808	return vld2q_lane_s16(a, b, 7);
4809	}
4810
// vld2q_lane_s32, lane 3: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <4 x i32> members loaded from the copy, and a
// call whose return type begins { <4 x i32>, <4 x i32>; the call's tail is not matched.
4811	// CHECK-LABEL: @test_vld2q_lane_s32(
4812	// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
4813	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
4814	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
4815	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
4816	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
4817	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4818	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
4819	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
4820	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4821	// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
4822	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
4823	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
4824	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
4825	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
4826	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4827	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
4828	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
4829	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
4830	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
4831	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4832	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
4833	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
4834	int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
4835	return vld2q_lane_s32(a, b, 3);
4836	}
4837
// vld2q_lane_f16, lane 7: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <8 x half> members loaded from the copy, and a
// call whose return type begins { <8 x half>, <8 x half>; the call's tail is not matched.
4838	// CHECK-LABEL: @test_vld2q_lane_f16(
4839	// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
4840	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
4841	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
4842	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
4843	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
4844	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4845	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
4846	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
4847	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4848	// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
4849	// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
4850	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
4851	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
4852	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
4853	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
4854	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
4855	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
4856	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
4857	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
4858	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
4859	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
4860	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>
4861	float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
4862	return vld2q_lane_f16(a, b, 7);
4863	}
4864
// vld2q_lane_f32, lane 3: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <4 x float> members loaded from the copy, and a
// call whose return type begins { <4 x float>, <4 x float>; the call's tail is not matched.
4865	// CHECK-LABEL: @test_vld2q_lane_f32(
4866	// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
4867	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
4868	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
4869	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
4870	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
4871	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4872	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
4873	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
4874	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4875	// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
4876	// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
4877	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
4878	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
4879	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
4880	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
4881	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
4882	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
4883	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
4884	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
4885	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
4886	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
4887	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>
4888	float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
4889	return vld2q_lane_f32(a, b, 3);
4890	}
4891
// vld2q_lane_p16, lane 7: expects b (stored from a [4 x i64] coerced argument) to be
// copied into a second local, both <8 x i16> members loaded from the copy, and a
// call whose return type begins { <8 x i16>, <8 x i16>; the call's tail is not matched.
4892	// CHECK-LABEL: @test_vld2q_lane_p16(
4893	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
4894	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
4895	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
4896	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
4897	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
4898	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
4899	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
4900	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
4901	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
4902	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
4903	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
4904	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
4905	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
4906	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
4907	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4908	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
4909	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
4910	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
4911	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
4912	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4913	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
4914	// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
4915	poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
4916	return vld2q_lane_p16(a, b, 7);
4917	}
4918
// vld2_lane_u8(a, b, 7): expect b stored and copied to a second alloca, both <8 x i8> members reloaded, then a call returning the vector pair.
4919	// CHECK-LABEL: @test_vld2_lane_u8(
4920	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
4921	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
4922	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
4923	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
4924	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
4925	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
4926	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
4927	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
4928	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
4929	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
4930	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
4931	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
4932	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
4933	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
4934	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
4935	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
4936	// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
4937	uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
4938	return vld2_lane_u8(a, b, 7);
4939	}
4940
// vld2_lane_u16(a, b, 3): expect b stored and copied to a second alloca, both <4 x i16> members reloaded, then a call returning the vector pair.
4941	// CHECK-LABEL: @test_vld2_lane_u16(
4942	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
4943	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
4944	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
4945	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
4946	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
4947	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
4948	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
4949	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
4950	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
4951	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
4952	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
4953	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
4954	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
4955	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
4956	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4957	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
4958	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
4959	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
4960	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
4961	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4962	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
4963	// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
4964	uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
4965	return vld2_lane_u16(a, b, 3);
4966	}
4967
// vld2_lane_u32(a, b, 1): expect b stored and copied to a second alloca, both <2 x i32> members reloaded, then a call returning the vector pair.
4968	// CHECK-LABEL: @test_vld2_lane_u32(
4969	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
4970	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
4971	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
4972	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
4973	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
4974	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
4975	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
4976	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
4977	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
4978	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
4979	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
4980	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
4981	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
4982	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
4983	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4984	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
4985	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
4986	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
4987	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
4988	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4989	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
4990	// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
4991	uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
4992	return vld2_lane_u32(a, b, 1);
4993	}
4994
// vld2_lane_s8(a, b, 7): expect b stored and copied to a second alloca, both <8 x i8> members reloaded, then a call returning the vector pair.
4995	// CHECK-LABEL: @test_vld2_lane_s8(
4996	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
4997	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
4998	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
4999	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
5000	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
5001	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5002	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
5003	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
5004	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5005	// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5006	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
5007	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
5008	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5009	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
5010	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5011	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5012	// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
5013	int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
5014	return vld2_lane_s8(a, b, 7);
5015	}
5016
// vld2_lane_s16(a, b, 3): expect b stored and copied to a second alloca, both <4 x i16> members reloaded, then a call returning the vector pair.
5017	// CHECK-LABEL: @test_vld2_lane_s16(
5018	// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
5019	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
5020	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
5021	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
5022	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
5023	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5024	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
5025	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
5026	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5027	// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5028	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5029	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
5030	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
5031	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5032	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5033	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
5034	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5035	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5036	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5037	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5038	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5039	// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
5040	int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
5041	return vld2_lane_s16(a, b, 3);
5042	}
5043
// vld2_lane_s32(a, b, 1): expect b stored and copied to a second alloca, both <2 x i32> members reloaded, then a call returning the vector pair.
5044	// CHECK-LABEL: @test_vld2_lane_s32(
5045	// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
5046	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
5047	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
5048	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
5049	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
5050	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5051	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
5052	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
5053	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5054	// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5055	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
5056	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
5057	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
5058	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5059	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5060	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
5061	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
5062	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5063	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5064	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5065	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5066	// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>
5067	int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
5068	return vld2_lane_s32(a, b, 1);
5069	}
5070
// vld2_lane_f16(a, b, 3): expect b stored and copied to a second alloca, both <4 x half> members reloaded, then a call returning the vector pair.
5071	// CHECK-LABEL: @test_vld2_lane_f16(
5072	// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
5073	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
5074	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
5075	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
5076	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
5077	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5078	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
5079	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
5080	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5081	// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5082	// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
5083	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
5084	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
5085	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
5086	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
5087	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
5088	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
5089	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
5090	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
5091	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
5092	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
5093	// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half>
5094	float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
5095	return vld2_lane_f16(a, b, 3);
5096	}
5097
// vld2_lane_f32(a, b, 1): expect b stored and copied to a second alloca, both <2 x float> members reloaded, then a call returning the vector pair.
5098	// CHECK-LABEL: @test_vld2_lane_f32(
5099	// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
5100	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
5101	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
5102	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
5103	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
5104	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5105	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
5106	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
5107	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5108	// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5109	// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
5110	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
5111	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
5112	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
5113	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
5114	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
5115	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
5116	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
5117	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
5118	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
5119	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
5120	// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float>
5121	float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
5122	return vld2_lane_f32(a, b, 1);
5123	}
5124
// vld2_lane_p8(a, b, 7): expect b stored and copied to a second alloca, both <8 x i8> members reloaded, then a call returning the vector pair.
5125	// CHECK-LABEL: @test_vld2_lane_p8(
5126	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
5127	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
5128	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
5129	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
5130	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
5131	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5132	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
5133	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
5134	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5135	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5136	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
5137	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
5138	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5139	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
5140	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5141	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5142	// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
5143	poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
5144	return vld2_lane_p8(a, b, 7);
5145	}
5146
// vld2_lane_p16(a, b, 3): expect b stored and copied to a second alloca, both <4 x i16> members reloaded, then a call returning the vector pair.
5147	// CHECK-LABEL: @test_vld2_lane_p16(
5148	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
5149	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
5150	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
5151	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
5152	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
5153	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5154	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
5155	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
5156	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
5157	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5158	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5159	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
5160	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
5161	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5162	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5163	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
5164	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5165	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5166	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5167	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5168	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5169	// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
5170	poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
5171	return vld2_lane_p16(a, b, 3);
5172	}
5173
// vld3q_u8(a): expect the return alloca bitcast to i8*, then a call returning three <16 x i8> vectors.
5174	// CHECK-LABEL: @test_vld3q_u8(
5175	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
5176	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
5177	// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
5178	uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
5179	return vld3q_u8(a);
5180	}
5181
// vld3q_u16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <8 x i16> vectors.
5182	// CHECK-LABEL: @test_vld3q_u16(
5183	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
5184	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
5185	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5186	// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5187	uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
5188	return vld3q_u16(a);
5189	}
5190
// vld3q_u32(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x i32> vectors.
5191	// CHECK-LABEL: @test_vld3q_u32(
5192	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
5193	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
5194	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5195	// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
5196	uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
5197	return vld3q_u32(a);
5198	}
5199
// vld3q_s8(a): expect the return alloca bitcast to i8*, then a call returning three <16 x i8> vectors.
5200	// CHECK-LABEL: @test_vld3q_s8(
5201	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
5202	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
5203	// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
5204	int8x16x3_t test_vld3q_s8(int8_t const * a) {
5205	return vld3q_s8(a);
5206	}
5207
// vld3q_s16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <8 x i16> vectors.
5208	// CHECK-LABEL: @test_vld3q_s16(
5209	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
5210	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
5211	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5212	// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5213	int16x8x3_t test_vld3q_s16(int16_t const * a) {
5214	return vld3q_s16(a);
5215	}
5216
// vld3q_s32(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x i32> vectors.
5217	// CHECK-LABEL: @test_vld3q_s32(
5218	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
5219	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
5220	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5221	// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
5222	int32x4x3_t test_vld3q_s32(int32_t const * a) {
5223	return vld3q_s32(a);
5224	}
5225
// vld3q_f16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <8 x half> vectors.
5226	// CHECK-LABEL: @test_vld3q_f16(
5227	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
5228	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
5229	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
5230	// CHECK: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
5231	float16x8x3_t test_vld3q_f16(float16_t const * a) {
5232	return vld3q_f16(a);
5233	}
5234
// vld3q_f32(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x float> vectors.
5235	// CHECK-LABEL: @test_vld3q_f32(
5236	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
5237	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
5238	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
5239	// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
5240	float32x4x3_t test_vld3q_f32(float32_t const * a) {
5241	return vld3q_f32(a);
5242	}
5243
// vld3q_p8(a): expect the return alloca bitcast to i8*, then a call returning three <16 x i8> vectors.
5244	// CHECK-LABEL: @test_vld3q_p8(
5245	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
5246	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
5247	// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
5248	poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
5249	return vld3q_p8(a);
5250	}
5251
// vld3q_p16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <8 x i16> vectors.
5252	// CHECK-LABEL: @test_vld3q_p16(
5253	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
5254	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
5255	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5256	// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5257	poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
5258	return vld3q_p16(a);
5259	}
5260
// vld3_u8(a): expect the return alloca bitcast to i8*, then a call returning three <8 x i8> vectors.
5261	// CHECK-LABEL: @test_vld3_u8(
5262	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
5263	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
5264	// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
5265	uint8x8x3_t test_vld3_u8(uint8_t const * a) {
5266	return vld3_u8(a);
5267	}
5268
// vld3_u16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x i16> vectors.
5269	// CHECK-LABEL: @test_vld3_u16(
5270	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
5271	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
5272	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5273	// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
5274	uint16x4x3_t test_vld3_u16(uint16_t const * a) {
5275	return vld3_u16(a);
5276	}
5277
// vld3_u32(a): expect the return alloca and %a bitcast to i8*, then a call returning three <2 x i32> vectors.
5278	// CHECK-LABEL: @test_vld3_u32(
5279	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
5280	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
5281	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5282	// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
5283	uint32x2x3_t test_vld3_u32(uint32_t const * a) {
5284	return vld3_u32(a);
5285	}
5286
// vld3_u64(a): expect the return alloca and %a bitcast to i8*, then a call returning three <1 x i64> vectors.
5287	// CHECK-LABEL: @test_vld3_u64(
5288	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
5289	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
5290	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
5291	// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
5292	uint64x1x3_t test_vld3_u64(uint64_t const * a) {
5293	return vld3_u64(a);
5294	}
5295
// vld3_s8(a): expect the return alloca bitcast to i8*, then a call returning three <8 x i8> vectors.
5296	// CHECK-LABEL: @test_vld3_s8(
5297	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
5298	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
5299	// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
5300	int8x8x3_t test_vld3_s8(int8_t const * a) {
5301	return vld3_s8(a);
5302	}
5303
// vld3_s16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x i16> vectors.
5304	// CHECK-LABEL: @test_vld3_s16(
5305	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
5306	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
5307	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5308	// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
5309	int16x4x3_t test_vld3_s16(int16_t const * a) {
5310	return vld3_s16(a);
5311	}
5312
// vld3_s32(a): expect the return alloca and %a bitcast to i8*, then a call returning three <2 x i32> vectors.
5313	// CHECK-LABEL: @test_vld3_s32(
5314	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
5315	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
5316	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5317	// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
5318	int32x2x3_t test_vld3_s32(int32_t const * a) {
5319	return vld3_s32(a);
5320	}
5321
// vld3_s64(a): expect the return alloca and %a bitcast to i8*, then a call returning three <1 x i64> vectors.
5322	// CHECK-LABEL: @test_vld3_s64(
5323	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
5324	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
5325	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
5326	// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>
5327	int64x1x3_t test_vld3_s64(int64_t const * a) {
5328	return vld3_s64(a);
5329	}
5330
// vld3_f16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x half> vectors.
5331	// CHECK-LABEL: @test_vld3_f16(
5332	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
5333	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
5334	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
5335	// CHECK: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
5336	float16x4x3_t test_vld3_f16(float16_t const * a) {
5337	return vld3_f16(a);
5338	}
5339
// vld3_f32(a): expect the return alloca and %a bitcast to i8*, then a call returning three <2 x float> vectors.
5340	// CHECK-LABEL: @test_vld3_f32(
5341	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
5342	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
5343	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
5344	// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
5345	float32x2x3_t test_vld3_f32(float32_t const * a) {
5346	return vld3_f32(a);
5347	}
5348
// vld3_p8(a): expect the return alloca bitcast to i8*, then a call returning three <8 x i8> vectors.
5349	// CHECK-LABEL: @test_vld3_p8(
5350	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
5351	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
5352	// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
5353	poly8x8x3_t test_vld3_p8(poly8_t const * a) {
5354	return vld3_p8(a);
5355	}
5356
// vld3_p16(a): expect the return alloca and %a bitcast to i8*, then a call returning three <4 x i16> vectors.
5357	// CHECK-LABEL: @test_vld3_p16(
5358	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
5359	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
5360	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5361	// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
5362	poly16x4x3_t test_vld3_p16(poly16_t const * a) {
5363	return vld3_p16(a);
5364	}
5365
// vld3q_lane_u16(a, b, 7): expect b stored and copied to a second alloca, all three <8 x i16> members reloaded, then a call returning the vector triple.
5366	// CHECK-LABEL: @test_vld3q_lane_u16(
5367	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
5368	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
5369	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
5370	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
5371	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
5372	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5373	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
5374	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
5375	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5376	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
5377	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5378	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
5379	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
5380	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5381	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5382	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
5383	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
5384	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5385	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5386	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
5387	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
5388	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5389	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5390	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5391	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5392	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5393	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
5394	uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
5395	return vld3q_lane_u16(a, b, 7);
5396	}
5397
5398	// CHECK-LABEL: @test_vld3q_lane_u32(
5399	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
5400	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
5401	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
5402	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
5403	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
5404	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5405	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
5406	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
5407	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5408	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
5409	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
5410	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
5411	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
5412	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5413	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5414	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
5415	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
5416	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5417	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5418	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
5419	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
5420	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
5421	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5422	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5423	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5424	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5425	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
// Test wrapper: passes `a` and `b` to vld3q_lane_u32 with the constant 3 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3q_lane_u32, match its IR.
5426	uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
5427	return vld3q_lane_u32(a, b, 3);
5428	}
5429
5430	// CHECK-LABEL: @test_vld3q_lane_s16(
5431	// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
5432	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
5433	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
5434	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
5435	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
5436	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5437	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
5438	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
5439	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5440	// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
5441	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5442	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
5443	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
5444	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5445	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5446	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
5447	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
5448	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5449	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5450	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
5451	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
5452	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5453	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5454	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5455	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5456	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5457	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
// Test wrapper: passes `a` and `b` to vld3q_lane_s16 with the constant 7 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3q_lane_s16, match its IR.
5458	int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
5459	return vld3q_lane_s16(a, b, 7);
5460	}
5461
5462	// CHECK-LABEL: @test_vld3q_lane_s32(
5463	// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
5464	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
5465	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
5466	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
5467	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
5468	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5469	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
5470	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
5471	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5472	// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
5473	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
5474	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
5475	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
5476	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5477	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5478	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
5479	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
5480	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5481	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5482	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
5483	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
5484	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
5485	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5486	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5487	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5488	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5489	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
// Test wrapper: passes `a` and `b` to vld3q_lane_s32 with the constant 3 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3q_lane_s32, match its IR.
5490	int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
5491	return vld3q_lane_s32(a, b, 3);
5492	}
5493
5494	// CHECK-LABEL: @test_vld3q_lane_f16(
5495	// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
5496	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
5497	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
5498	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
5499	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
5500	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5501	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
5502	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
5503	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5504	// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
5505	// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
5506	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
5507	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
5508	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
5509	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5510	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
5511	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
5512	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
5513	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5514	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
5515	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
5516	// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
5517	// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
5518	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
5519	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
5520	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
5521	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
// Test wrapper: passes `a` and `b` to vld3q_lane_f16 with the constant 7 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3q_lane_f16, match its IR.
5522	float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
5523	return vld3q_lane_f16(a, b, 7);
5524	}
5525
5526	// CHECK-LABEL: @test_vld3q_lane_f32(
5527	// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
5528	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
5529	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
5530	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
5531	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
5532	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5533	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
5534	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
5535	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5536	// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
5537	// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
5538	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
5539	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
5540	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
5541	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5542	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
5543	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
5544	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
5545	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5546	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
5547	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
5548	// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
5549	// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
5550	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5551	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5552	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
5553	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
// Test wrapper: passes `a` and `b` to vld3q_lane_f32 with the constant 3 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3q_lane_f32, match its IR.
5554	float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
5555	return vld3q_lane_f32(a, b, 3);
5556	}
5557
5558	// CHECK-LABEL: @test_vld3q_lane_p16(
5559	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
5560	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
5561	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
5562	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
5563	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
5564	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
5565	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
5566	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
5567	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
5568	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
5569	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5570	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
5571	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
5572	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5573	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5574	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
5575	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
5576	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5577	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5578	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
5579	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
5580	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5581	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5582	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5583	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5584	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5585	// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
// Test wrapper: passes `a` and `b` to vld3q_lane_p16 with the constant 7 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3q_lane_p16, match its IR.
5586	poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
5587	return vld3q_lane_p16(a, b, 7);
5588	}
5589
5590	// CHECK-LABEL: @test_vld3_lane_u8(
5591	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
5592	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
5593	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
5594	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
5595	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
5596	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5597	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
5598	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
5599	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5600	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
5601	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
5602	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
5603	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5604	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
5605	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5606	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5607	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
5608	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
5609	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
5610	// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
// Test wrapper: passes `a` and `b` to vld3_lane_u8 with the constant 7 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_u8, match its IR.
5611	uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
5612	return vld3_lane_u8(a, b, 7);
5613	}
5614
5615	// CHECK-LABEL: @test_vld3_lane_u16(
5616	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
5617	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
5618	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
5619	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
5620	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
5621	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5622	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
5623	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
5624	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5625	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
5626	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5627	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
5628	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
5629	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5630	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5631	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
5632	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5633	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5634	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5635	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
5636	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
5637	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
5638	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
5639	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5640	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5641	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
5642	// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
// Test wrapper: passes `a` and `b` to vld3_lane_u16 with the constant 3 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_u16, match its IR.
5643	uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
5644	return vld3_lane_u16(a, b, 3);
5645	}
5646
5647	// CHECK-LABEL: @test_vld3_lane_u32(
5648	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
5649	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
5650	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
5651	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
5652	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
5653	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5654	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
5655	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
5656	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5657	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
5658	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
5659	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
5660	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
5661	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5662	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5663	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
5664	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
5665	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5666	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5667	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
5668	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
5669	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
5670	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
5671	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5672	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5673	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
5674	// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
// Test wrapper: passes `a` and `b` to vld3_lane_u32 with the constant 1 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_u32, match its IR.
5675	uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
5676	return vld3_lane_u32(a, b, 1);
5677	}
5678
5679	// CHECK-LABEL: @test_vld3_lane_s8(
5680	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
5681	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
5682	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
5683	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
5684	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
5685	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5686	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
5687	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
5688	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5689	// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
5690	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
5691	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
5692	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5693	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
5694	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5695	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5696	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
5697	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
5698	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
5699	// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
// Test wrapper: passes `a` and `b` to vld3_lane_s8 with the constant 7 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_s8, match its IR.
5700	int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
5701	return vld3_lane_s8(a, b, 7);
5702	}
5703
5704	// CHECK-LABEL: @test_vld3_lane_s16(
5705	// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
5706	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
5707	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
5708	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
5709	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
5710	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5711	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
5712	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
5713	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5714	// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
5715	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5716	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
5717	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
5718	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5719	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5720	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
5721	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5722	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5723	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5724	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
5725	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
5726	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
5727	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
5728	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5729	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5730	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
5731	// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
// Test wrapper: passes `a` and `b` to vld3_lane_s16 with the constant 3 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_s16, match its IR.
5732	int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
5733	return vld3_lane_s16(a, b, 3);
5734	}
5735
5736	// CHECK-LABEL: @test_vld3_lane_s32(
5737	// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
5738	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
5739	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
5740	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
5741	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
5742	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5743	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
5744	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
5745	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5746	// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
5747	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
5748	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
5749	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
5750	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5751	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5752	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
5753	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
5754	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5755	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5756	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
5757	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
5758	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
5759	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
5760	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5761	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5762	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
5763	// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
// Test wrapper: passes `a` and `b` to vld3_lane_s32 with the constant 1 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_s32, match its IR.
5764	int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
5765	return vld3_lane_s32(a, b, 1);
5766	}
5767
5768	// CHECK-LABEL: @test_vld3_lane_f16(
5769	// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
5770	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
5771	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
5772	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
5773	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
5774	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5775	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
5776	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
5777	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5778	// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
5779	// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
5780	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
5781	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
5782	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
5783	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
5784	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
5785	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
5786	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
5787	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
5788	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
5789	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
5790	// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
5791	// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
5792	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
5793	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
5794	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
5795	// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
// Test wrapper: passes `a` and `b` to vld3_lane_f16 with the constant 3 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_f16, match its IR.
5796	float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
5797	return vld3_lane_f16(a, b, 3);
5798	}
5799
5800	// CHECK-LABEL: @test_vld3_lane_f32(
5801	// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
5802	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
5803	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
5804	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
5805	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
5806	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5807	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
5808	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
5809	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5810	// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
5811	// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
5812	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
5813	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
5814	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
5815	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
5816	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
5817	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
5818	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
5819	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
5820	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
5821	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
5822	// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
5823	// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
5824	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
5825	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
5826	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
5827	// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
// Test wrapper: passes `a` and `b` to vld3_lane_f32 with the constant 1 as
// the third argument (presumably the lane index) and returns the result.
// The FileCheck directives above, labelled @test_vld3_lane_f32, match its IR.
5828	float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
5829	return vld3_lane_f32(a, b, 1);
5830	}
5831
5832	// CHECK-LABEL: @test_vld3_lane_p8(
5833	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
5834	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
5835	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
5836	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
5837	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
5838	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5839	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
5840	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
5841	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5842	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
5843	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
5844	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
5845	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5846	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
5847	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5848	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5849	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
5850	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
5851	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
5852	// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
// Codegen test wrapper: forwards a, b and the constant 7 to vld3_lane_p8 and
// returns its poly8x8x3_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
5853	poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
5854	return vld3_lane_p8(a, b, 7);
5855	}
5856
5857	// CHECK-LABEL: @test_vld3_lane_p16(
5858	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
5859	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
5860	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
5861	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
5862	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
5863	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
5864	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
5865	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
5866	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
5867	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
5868	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
5869	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
5870	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
5871	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5872	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5873	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
5874	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5875	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5876	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5877	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
5878	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
5879	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
5880	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
5881	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5882	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5883	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
5884	// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
// Codegen test wrapper: forwards a, b and the constant 3 to vld3_lane_p16 and
// returns its poly16x4x3_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
5885	poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
5886	return vld3_lane_p16(a, b, 3);
5887	}
5888
5889	// CHECK-LABEL: @test_vld4q_u8(
5890	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
5891	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
5892	// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
// Codegen test wrapper: passes the pointer a to vld4q_u8 and returns its
// uint8x16x4_t result unchanged. The FileCheck directives above pin the IR.
5893	uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
5894	return vld4q_u8(a);
5895	}
5896
5897	// CHECK-LABEL: @test_vld4q_u16(
5898	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
5899	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
5900	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5901	// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
// Codegen test wrapper: passes the pointer a to vld4q_u16 and returns its
// uint16x8x4_t result unchanged. The FileCheck directives above pin the IR.
5902	uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
5903	return vld4q_u16(a);
5904	}
5905
5906	// CHECK-LABEL: @test_vld4q_u32(
5907	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
5908	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
5909	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5910	// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
// Codegen test wrapper: passes the pointer a to vld4q_u32 and returns its
// uint32x4x4_t result unchanged. The FileCheck directives above pin the IR.
5911	uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
5912	return vld4q_u32(a);
5913	}
5914
5915	// CHECK-LABEL: @test_vld4q_s8(
5916	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
5917	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
5918	// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
// Codegen test wrapper: passes the pointer a to vld4q_s8 and returns its
// int8x16x4_t result unchanged. The FileCheck directives above pin the IR.
5919	int8x16x4_t test_vld4q_s8(int8_t const * a) {
5920	return vld4q_s8(a);
5921	}
5922
5923	// CHECK-LABEL: @test_vld4q_s16(
5924	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
5925	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
5926	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5927	// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
// Codegen test wrapper: passes the pointer a to vld4q_s16 and returns its
// int16x8x4_t result unchanged. The FileCheck directives above pin the IR.
5928	int16x8x4_t test_vld4q_s16(int16_t const * a) {
5929	return vld4q_s16(a);
5930	}
5931
5932	// CHECK-LABEL: @test_vld4q_s32(
5933	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
5934	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
5935	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5936	// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
// Codegen test wrapper: passes the pointer a to vld4q_s32 and returns its
// int32x4x4_t result unchanged. The FileCheck directives above pin the IR.
5937	int32x4x4_t test_vld4q_s32(int32_t const * a) {
5938	return vld4q_s32(a);
5939	}
5940
5941	// CHECK-LABEL: @test_vld4q_f16(
5942	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
5943	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
5944	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
5945	// CHECK: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
// Codegen test wrapper: passes the pointer a to vld4q_f16 and returns its
// float16x8x4_t result unchanged. The FileCheck directives above pin the IR.
5946	float16x8x4_t test_vld4q_f16(float16_t const * a) {
5947	return vld4q_f16(a);
5948	}
5949
5950	// CHECK-LABEL: @test_vld4q_f32(
5951	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
5952	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
5953	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
5954	// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
// Codegen test wrapper: passes the pointer a to vld4q_f32 and returns its
// float32x4x4_t result unchanged. The FileCheck directives above pin the IR.
5955	float32x4x4_t test_vld4q_f32(float32_t const * a) {
5956	return vld4q_f32(a);
5957	}
5958
5959	// CHECK-LABEL: @test_vld4q_p8(
5960	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
5961	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
5962	// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
// Codegen test wrapper: passes the pointer a to vld4q_p8 and returns its
// poly8x16x4_t result unchanged. The FileCheck directives above pin the IR.
5963	poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
5964	return vld4q_p8(a);
5965	}
5966
5967	// CHECK-LABEL: @test_vld4q_p16(
5968	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
5969	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
5970	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5971	// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
// Codegen test wrapper: passes the pointer a to vld4q_p16 and returns its
// poly16x8x4_t result unchanged. The FileCheck directives above pin the IR.
5972	poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
5973	return vld4q_p16(a);
5974	}
5975
5976	// CHECK-LABEL: @test_vld4_u8(
5977	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
5978	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
5979	// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
// Codegen test wrapper: passes the pointer a to vld4_u8 and returns its
// uint8x8x4_t result unchanged. The FileCheck directives above pin the IR.
5980	uint8x8x4_t test_vld4_u8(uint8_t const * a) {
5981	return vld4_u8(a);
5982	}
5983
5984	// CHECK-LABEL: @test_vld4_u16(
5985	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
5986	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
5987	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
5988	// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
// Codegen test wrapper: passes the pointer a to vld4_u16 and returns its
// uint16x4x4_t result unchanged. The FileCheck directives above pin the IR.
5989	uint16x4x4_t test_vld4_u16(uint16_t const * a) {
5990	return vld4_u16(a);
5991	}
5992
5993	// CHECK-LABEL: @test_vld4_u32(
5994	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
5995	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
5996	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
5997	// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
// Codegen test wrapper: passes the pointer a to vld4_u32 and returns its
// uint32x2x4_t result unchanged. The FileCheck directives above pin the IR.
5998	uint32x2x4_t test_vld4_u32(uint32_t const * a) {
5999	return vld4_u32(a);
6000	}
6001
6002	// CHECK-LABEL: @test_vld4_u64(
6003	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
6004	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
6005	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
6006	// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
// Codegen test wrapper: passes the pointer a to vld4_u64 and returns its
// uint64x1x4_t result unchanged. The FileCheck directives above pin the IR.
6007	uint64x1x4_t test_vld4_u64(uint64_t const * a) {
6008	return vld4_u64(a);
6009	}
6010
6011	// CHECK-LABEL: @test_vld4_s8(
6012	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
6013	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
6014	// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
// Codegen test wrapper: passes the pointer a to vld4_s8 and returns its
// int8x8x4_t result unchanged. The FileCheck directives above pin the IR.
6015	int8x8x4_t test_vld4_s8(int8_t const * a) {
6016	return vld4_s8(a);
6017	}
6018
6019	// CHECK-LABEL: @test_vld4_s16(
6020	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
6021	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
6022	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
6023	// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
// Codegen test wrapper: passes the pointer a to vld4_s16 and returns its
// int16x4x4_t result unchanged. The FileCheck directives above pin the IR.
6024	int16x4x4_t test_vld4_s16(int16_t const * a) {
6025	return vld4_s16(a);
6026	}
6027
6028	// CHECK-LABEL: @test_vld4_s32(
6029	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
6030	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
6031	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
6032	// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
// Codegen test wrapper: passes the pointer a to vld4_s32 and returns its
// int32x2x4_t result unchanged. The FileCheck directives above pin the IR.
6033	int32x2x4_t test_vld4_s32(int32_t const * a) {
6034	return vld4_s32(a);
6035	}
6036
6037	// CHECK-LABEL: @test_vld4_s64(
6038	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
6039	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
6040	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
6041	// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>
// Codegen test wrapper: passes the pointer a to vld4_s64 and returns its
// int64x1x4_t result unchanged. The FileCheck directives above pin the IR.
6042	int64x1x4_t test_vld4_s64(int64_t const * a) {
6043	return vld4_s64(a);
6044	}
6045
6046	// CHECK-LABEL: @test_vld4_f16(
6047	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
6048	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
6049	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
6050	// CHECK: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
// Codegen test wrapper: passes the pointer a to vld4_f16 and returns its
// float16x4x4_t result unchanged. The FileCheck directives above pin the IR.
6051	float16x4x4_t test_vld4_f16(float16_t const * a) {
6052	return vld4_f16(a);
6053	}
6054
6055	// CHECK-LABEL: @test_vld4_f32(
6056	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
6057	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
6058	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
6059	// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
// Codegen test wrapper: passes the pointer a to vld4_f32 and returns its
// float32x2x4_t result unchanged. The FileCheck directives above pin the IR.
6060	float32x2x4_t test_vld4_f32(float32_t const * a) {
6061	return vld4_f32(a);
6062	}
6063
6064	// CHECK-LABEL: @test_vld4_p8(
6065	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
6066	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
6067	// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
// Codegen test wrapper: passes the pointer a to vld4_p8 and returns its
// poly8x8x4_t result unchanged. The FileCheck directives above pin the IR.
6068	poly8x8x4_t test_vld4_p8(poly8_t const * a) {
6069	return vld4_p8(a);
6070	}
6071
6072	// CHECK-LABEL: @test_vld4_p16(
6073	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
6074	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
6075	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
6076	// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
// Codegen test wrapper: passes the pointer a to vld4_p16 and returns its
// poly16x4x4_t result unchanged. The FileCheck directives above pin the IR.
6077	poly16x4x4_t test_vld4_p16(poly16_t const * a) {
6078	return vld4_p16(a);
6079	}
6080
6081	// CHECK-LABEL: @test_vld4q_lane_u16(
6082	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
6083	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
6084	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
6085	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
6086	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
6087	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6088	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
6089	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
6090	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6091	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
6092	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
6093	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
6094	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
6095	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6096	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6097	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
6098	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
6099	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6100	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6101	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
6102	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
6103	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6104	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6105	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
6106	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
6107	// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
6108	// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
6109	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6110	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6111	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6112	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
6113	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
// Codegen test wrapper: forwards a, b and the constant 7 to vld4q_lane_u16 and
// returns its uint16x8x4_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
6114	uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
6115	return vld4q_lane_u16(a, b, 7);
6116	}
6117
6118	// CHECK-LABEL: @test_vld4q_lane_u32(
6119	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
6120	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
6121	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
6122	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
6123	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
6124	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6125	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
6126	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
6127	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6128	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
6129	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
6130	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
6131	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
6132	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
6133	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
6134	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
6135	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
6136	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
6137	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
6138	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
6139	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
6140	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
6141	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
6142	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
6143	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
6144	// CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
6145	// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
6146	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
6147	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
6148	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
6149	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
6150	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
// Codegen test wrapper: forwards a, b and the constant 3 to vld4q_lane_u32 and
// returns its uint32x4x4_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
6151	uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
6152	return vld4q_lane_u32(a, b, 3);
6153	}
6154
6155	// CHECK-LABEL: @test_vld4q_lane_s16(
6156	// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
6157	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
6158	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
6159	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
6160	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
6161	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6162	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
6163	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
6164	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6165	// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
6166	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
6167	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6168	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
6169	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6170	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6171	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6172	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
6173	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6174	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6175	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6176	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
6177	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6178	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6179	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6180	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
6181	// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
6182	// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
6183	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6184	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6185	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6186	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
6187	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
// Codegen test wrapper: forwards a, b and the constant 7 to vld4q_lane_s16 and
// returns its int16x8x4_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
6188	int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
6189	return vld4q_lane_s16(a, b, 7);
6190	}
6191
6192	// CHECK-LABEL: @test_vld4q_lane_s32(
6193	// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
6194	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
6195	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
6196	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
6197	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
6198	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6199	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
6200	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
6201	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6202	// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
6203	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
6204	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6205	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
6206	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
6207	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
6208	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6209	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
6210	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
6211	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
6212	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6213	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
6214	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
6215	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
6216	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6217	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
6218	// CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
6219	// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
6220	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
6221	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
6222	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
6223	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
6224	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
// Codegen test wrapper: forwards a, b and the constant 3 to vld4q_lane_s32 and
// returns its int32x4x4_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
6225	int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
6226	return vld4q_lane_s32(a, b, 3);
6227	}
6228
6229	// CHECK-LABEL: @test_vld4q_lane_f16(
6230	// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
6231	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
6232	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
6233	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
6234	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
6235	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6236	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
6237	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
6238	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6239	// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
6240	// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
6241	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6242	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
6243	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
6244	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
6245	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6246	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
6247	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
6248	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
6249	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6250	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
6251	// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
6252	// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
6253	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6254	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
6255	// CHECK: [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
6256	// CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
6257	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
6258	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
6259	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
6260	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
6261	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
// Codegen test wrapper: forwards a, b and the constant 7 to vld4q_lane_f16 and
// returns its float16x8x4_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
6262	float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
6263	return vld4q_lane_f16(a, b, 7);
6264	}
6265
6266	// CHECK-LABEL: @test_vld4q_lane_f32(
6267	// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
6268	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
6269	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
6270	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
6271	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
6272	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6273	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
6274	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
6275	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6276	// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
6277	// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
6278	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6279	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
6280	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
6281	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
6282	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6283	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
6284	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
6285	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
6286	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6287	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
6288	// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
6289	// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
6290	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6291	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
6292	// CHECK: [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
6293	// CHECK: [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
6294	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
6295	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
6296	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
6297	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
6298	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
// Codegen test wrapper: forwards a, b and the constant 3 to vld4q_lane_f32 and
// returns its float32x4x4_t result. The FileCheck directives above pin the IR.
// NOTE(review): the trailing constant is presumably the lane selector -- confirm.
6299	float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
6300	return vld4q_lane_f32(a, b, 3);
6301	}
6302
6303	// CHECK-LABEL: @test_vld4q_lane_p16(
6304	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
6305	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
6306	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
6307	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
6308	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
6309	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
6310	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
6311	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
6312	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
6313	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
6314	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
6315	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6316	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
6317	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6318	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6319	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6320	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
6321	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6322	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6323	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6324	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
6325	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6326	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6327	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6328	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
6329	// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
6330	// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
6331	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6332	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6333	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6334	// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
6335	// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
// Forwards a, b and the constant lane argument 7 to vld4q_lane_p16 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <8 x i16> vectors.
6336	poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
6337	return vld4q_lane_p16(a, b, 7);
6338	}
6339
6340	// CHECK-LABEL: @test_vld4_lane_u8(
6341	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
6342	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
6343	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
6344	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
6345	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
6346	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6347	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
6348	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
6349	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6350	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
6351	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6352	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
6353	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6354	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6355	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
6356	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6357	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6358	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
6359	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6360	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6361	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
6362	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
6363	// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
// Forwards a, b and the constant lane argument 7 to vld4_lane_u8 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <8 x i8> vectors.
6364	uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
6365	return vld4_lane_u8(a, b, 7);
6366	}
6367
6368	// CHECK-LABEL: @test_vld4_lane_u16(
6369	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
6370	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
6371	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
6372	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
6373	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
6374	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6375	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
6376	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
6377	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6378	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
6379	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
6380	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6381	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
6382	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6383	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6384	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6385	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
6386	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6387	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6388	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6389	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
6390	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6391	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6392	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6393	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
6394	// CHECK: [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
6395	// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
6396	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6397	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6398	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6399	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
6400	// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
// Forwards a, b and the constant lane argument 3 to vld4_lane_u16 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <4 x i16> vectors.
6401	uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
6402	return vld4_lane_u16(a, b, 3);
6403	}
6404
6405	// CHECK-LABEL: @test_vld4_lane_u32(
6406	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
6407	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
6408	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
6409	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
6410	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
6411	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6412	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
6413	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
6414	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6415	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
6416	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
6417	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6418	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
6419	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
6420	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
6421	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6422	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
6423	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
6424	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
6425	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6426	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
6427	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
6428	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
6429	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6430	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
6431	// CHECK: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
6432	// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
6433	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
6434	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
6435	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
6436	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
6437	// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
// Forwards a, b and the constant lane argument 1 to vld4_lane_u32 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <2 x i32> vectors.
6438	uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
6439	return vld4_lane_u32(a, b, 1);
6440	}
6441
6442	// CHECK-LABEL: @test_vld4_lane_s8(
6443	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
6444	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
6445	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
6446	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
6447	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
6448	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6449	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
6450	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
6451	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6452	// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
6453	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6454	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
6455	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6456	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6457	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
6458	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6459	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6460	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
6461	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6462	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6463	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
6464	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
6465	// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
// Forwards a, b and the constant lane argument 7 to vld4_lane_s8 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <8 x i8> vectors.
6466	int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
6467	return vld4_lane_s8(a, b, 7);
6468	}
6469
6470	// CHECK-LABEL: @test_vld4_lane_s16(
6471	// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
6472	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
6473	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
6474	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
6475	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
6476	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6477	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
6478	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
6479	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6480	// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
6481	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
6482	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6483	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
6484	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6485	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6486	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6487	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
6488	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6489	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6490	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6491	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
6492	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6493	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6494	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6495	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
6496	// CHECK: [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
6497	// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
6498	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6499	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6500	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6501	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
6502	// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
// Forwards a, b and the constant lane argument 3 to vld4_lane_s16 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <4 x i16> vectors.
6503	int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
6504	return vld4_lane_s16(a, b, 3);
6505	}
6506
6507	// CHECK-LABEL: @test_vld4_lane_s32(
6508	// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
6509	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
6510	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
6511	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
6512	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
6513	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6514	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
6515	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
6516	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6517	// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
6518	// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
6519	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6520	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
6521	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
6522	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
6523	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6524	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
6525	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
6526	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
6527	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6528	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
6529	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
6530	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
6531	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6532	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
6533	// CHECK: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
6534	// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
6535	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
6536	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
6537	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
6538	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
6539	// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
// Forwards a, b and the constant lane argument 1 to vld4_lane_s32 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <2 x i32> vectors.
6540	int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
6541	return vld4_lane_s32(a, b, 1);
6542	}
6543
6544	// CHECK-LABEL: @test_vld4_lane_f16(
6545	// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
6546	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
6547	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
6548	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
6549	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
6550	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6551	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
6552	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
6553	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6554	// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
6555	// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
6556	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6557	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
6558	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
6559	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
6560	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6561	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
6562	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
6563	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
6564	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6565	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
6566	// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
6567	// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
6568	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6569	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
6570	// CHECK: [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
6571	// CHECK: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
6572	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
6573	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
6574	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
6575	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half>
6576	// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
// Forwards a, b and the constant lane argument 3 to vld4_lane_f16 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <4 x half> vectors.
6577	float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
6578	return vld4_lane_f16(a, b, 3);
6579	}
6580
6581	// CHECK-LABEL: @test_vld4_lane_f32(
6582	// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
6583	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
6584	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
6585	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
6586	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
6587	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6588	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
6589	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
6590	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6591	// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
6592	// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
6593	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6594	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
6595	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
6596	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
6597	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6598	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
6599	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
6600	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
6601	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6602	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
6603	// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
6604	// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
6605	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6606	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
6607	// CHECK: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
6608	// CHECK: [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
6609	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
6610	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
6611	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
6612	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
6613	// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float>
// Forwards a, b and the constant lane argument 1 to vld4_lane_f32 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <2 x float> vectors.
6614	float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
6615	return vld4_lane_f32(a, b, 1);
6616	}
6617
6618	// CHECK-LABEL: @test_vld4_lane_p8(
6619	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
6620	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
6621	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
6622	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
6623	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
6624	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6625	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
6626	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
6627	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6628	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
6629	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6630	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
6631	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6632	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6633	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
6634	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6635	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6636	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
6637	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6638	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6639	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
6640	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
6641	// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
// Forwards a, b and the constant lane argument 7 to vld4_lane_p8 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <8 x i8> vectors.
6642	poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
6643	return vld4_lane_p8(a, b, 7);
6644	}
6645
6646	// CHECK-LABEL: @test_vld4_lane_p16(
6647	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
6648	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
6649	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
6650	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
6651	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
6652	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
6653	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
6654	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
6655	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
6656	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
6657	// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
6658	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6659	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
6660	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6661	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6662	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6663	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
6664	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6665	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6666	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6667	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
6668	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6669	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6670	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6671	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
6672	// CHECK: [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
6673	// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
6674	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6675	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6676	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6677	// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
6678	// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
// Forwards a, b and the constant lane argument 3 to vld4_lane_p16 and returns
// its result unchanged; the FileCheck directives above expect a call yielding
// four <4 x i16> vectors.
6679	poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
6680	return vld4_lane_p16(a, b, 3);
6681	}
6682
6683	// CHECK-LABEL: @test_vmax_s8(
6684	// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b)
6685	// CHECK: ret <8 x i8> [[VMAX_V_I]]
// Passes a and b to vmax_s8 and returns its result; the FileCheck directives
// above expect this to become a single call to @llvm.arm.neon.vmaxs.v8i8.
6686	int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
6687	return vmax_s8(a, b);
6688	}
6689
6690	// CHECK-LABEL: @test_vmax_s16(
6691	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
6692	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
6693	// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b)
6694	// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
6695	// CHECK: ret <4 x i16> [[VMAX_V2_I]]
// Passes a and b to vmax_s16 and returns its result; the FileCheck directives
// above expect a call to @llvm.arm.neon.vmaxs.v4i16.
6696	int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
6697	return vmax_s16(a, b);
6698	}
6699
6700	// CHECK-LABEL: @test_vmax_s32(
6701	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
6702	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
6703	// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b)
6704	// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
6705	// CHECK: ret <2 x i32> [[VMAX_V2_I]]
// Passes a and b to vmax_s32 and returns its result; the FileCheck directives
// above expect a call to @llvm.arm.neon.vmaxs.v2i32.
6706	int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
6707	return vmax_s32(a, b);
6708	}
6709
6710	// CHECK-LABEL: @test_vmax_u8(
6711	// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b)
6712	// CHECK: ret <8 x i8> [[VMAX_V_I]]
// Passes a and b to vmax_u8 and returns its result; the FileCheck directives
// above expect the unsigned variant, @llvm.arm.neon.vmaxu.v8i8.
6713	uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
6714	return vmax_u8(a, b);
6715	}
6716
6717	// CHECK-LABEL: @test_vmax_u16(
6718	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
6719	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
6720	// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b)
6721	// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
6722	// CHECK: ret <4 x i16> [[VMAX_V2_I]]
// Passes a and b to vmax_u16 and returns its result; the FileCheck directives
// above expect the unsigned variant, @llvm.arm.neon.vmaxu.v4i16.
6723	uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
6724	return vmax_u16(a, b);
6725	}
6726
6727	// CHECK-LABEL: @test_vmax_u32(
6728	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
6729	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
6730	// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b)
6731	// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
6732	// CHECK: ret <2 x i32> [[VMAX_V2_I]]
// Passes a and b to vmax_u32 and returns its result; the FileCheck directives
// above expect the unsigned variant, @llvm.arm.neon.vmaxu.v2i32.
6733	uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
6734	return vmax_u32(a, b);
6735	}
6736
6737	// CHECK-LABEL: @test_vmax_f32(
6738	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
6739	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
6740	// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b)
6741	// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
6742	// CHECK: ret <2 x float> [[VMAX_V2_I]]
// Passes a and b to vmax_f32 and returns its result; the FileCheck directives
// above expect the float case to use @llvm.arm.neon.vmaxs.v2f32.
6743	float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
6744	return vmax_f32(a, b);
6745	}
6746
6747	// CHECK-LABEL: @test_vmaxq_s8(
6748	// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b)
6749	// CHECK: ret <16 x i8> [[VMAXQ_V_I]]
// Passes a and b to vmaxq_s8 and returns its result; the FileCheck directives
// above expect a single call to @llvm.arm.neon.vmaxs.v16i8.
6750	int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
6751	return vmaxq_s8(a, b);
6752	}
6753
6754	// CHECK-LABEL: @test_vmaxq_s16(
6755	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
6756	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
6757	// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b)
6758	// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
6759	// CHECK: ret <8 x i16> [[VMAXQ_V2_I]]
// Wraps vmaxq_s16(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmaxs.v8i16 on %a and %b, with that call's result returned.
6760	int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
6761	return vmaxq_s16(a, b);
6762	}
6763
6764	// CHECK-LABEL: @test_vmaxq_s32(
6765	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
6766	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
6767	// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b)
6768	// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
6769	// CHECK: ret <4 x i32> [[VMAXQ_V2_I]]
// Wraps vmaxq_s32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmaxs.v4i32 on %a and %b, with that call's result returned.
6770	int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
6771	return vmaxq_s32(a, b);
6772	}
6773
6774	// CHECK-LABEL: @test_vmaxq_u8(
6775	// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b)
6776	// CHECK: ret <16 x i8> [[VMAXQ_V_I]]
// Wraps vmaxq_u8(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmaxu.v16i8 on %a and %b, with that call's result returned.
6777	uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
6778	return vmaxq_u8(a, b);
6779	}
6780
6781	// CHECK-LABEL: @test_vmaxq_u16(
6782	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
6783	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
6784	// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b)
6785	// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
6786	// CHECK: ret <8 x i16> [[VMAXQ_V2_I]]
// Wraps vmaxq_u16(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmaxu.v8i16 on %a and %b, with that call's result returned.
6787	uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
6788	return vmaxq_u16(a, b);
6789	}
6790
6791	// CHECK-LABEL: @test_vmaxq_u32(
6792	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
6793	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
6794	// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b)
6795	// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
6796	// CHECK: ret <4 x i32> [[VMAXQ_V2_I]]
// Wraps vmaxq_u32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmaxu.v4i32 on %a and %b, with that call's result returned.
6797	uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
6798	return vmaxq_u32(a, b);
6799	}
6800
6801	// CHECK-LABEL: @test_vmaxq_f32(
6802	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
6803	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
6804	// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b)
6805	// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
6806	// CHECK: ret <4 x float> [[VMAXQ_V2_I]]
// Wraps vmaxq_f32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmaxs.v4f32 on %a and %b, with that call's result returned.
6807	float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
6808	return vmaxq_f32(a, b);
6809	}
6810
6811	// CHECK-LABEL: @test_vmin_s8(
6812	// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b)
6813	// CHECK: ret <8 x i8> [[VMIN_V_I]]
// Wraps vmin_s8(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v8i8 on %a and %b, with that call's result returned.
6814	int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
6815	return vmin_s8(a, b);
6816	}
6817
6818	// CHECK-LABEL: @test_vmin_s16(
6819	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
6820	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
6821	// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b)
6822	// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
6823	// CHECK: ret <4 x i16> [[VMIN_V2_I]]
// Wraps vmin_s16(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v4i16 on %a and %b, with that call's result returned.
6824	int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
6825	return vmin_s16(a, b);
6826	}
6827
6828	// CHECK-LABEL: @test_vmin_s32(
6829	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
6830	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
6831	// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b)
6832	// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
6833	// CHECK: ret <2 x i32> [[VMIN_V2_I]]
// Wraps vmin_s32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v2i32 on %a and %b, with that call's result returned.
6834	int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
6835	return vmin_s32(a, b);
6836	}
6837
6838	// CHECK-LABEL: @test_vmin_u8(
6839	// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b)
6840	// CHECK: ret <8 x i8> [[VMIN_V_I]]
// Wraps vmin_u8(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vminu.v8i8 on %a and %b, with that call's result returned.
6841	uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
6842	return vmin_u8(a, b);
6843	}
6844
6845	// CHECK-LABEL: @test_vmin_u16(
6846	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
6847	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
6848	// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b)
6849	// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
6850	// CHECK: ret <4 x i16> [[VMIN_V2_I]]
// Wraps vmin_u16(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vminu.v4i16 on %a and %b, with that call's result returned.
6851	uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
6852	return vmin_u16(a, b);
6853	}
6854
6855	// CHECK-LABEL: @test_vmin_u32(
6856	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
6857	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
6858	// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b)
6859	// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
6860	// CHECK: ret <2 x i32> [[VMIN_V2_I]]
// Wraps vmin_u32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vminu.v2i32 on %a and %b, with that call's result returned.
6861	uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
6862	return vmin_u32(a, b);
6863	}
6864
6865	// CHECK-LABEL: @test_vmin_f32(
6866	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
6867	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
6868	// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b)
6869	// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
6870	// CHECK: ret <2 x float> [[VMIN_V2_I]]
// Wraps vmin_f32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v2f32 on %a and %b, with that call's result returned.
6871	float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
6872	return vmin_f32(a, b);
6873	}
6874
6875	// CHECK-LABEL: @test_vminq_s8(
6876	// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b)
6877	// CHECK: ret <16 x i8> [[VMINQ_V_I]]
// Wraps vminq_s8(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v16i8 on %a and %b, with that call's result returned.
6878	int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
6879	return vminq_s8(a, b);
6880	}
6881
6882	// CHECK-LABEL: @test_vminq_s16(
6883	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
6884	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
6885	// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b)
6886	// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
6887	// CHECK: ret <8 x i16> [[VMINQ_V2_I]]
// Wraps vminq_s16(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v8i16 on %a and %b, with that call's result returned.
6888	int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
6889	return vminq_s16(a, b);
6890	}
6891
6892	// CHECK-LABEL: @test_vminq_s32(
6893	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
6894	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
6895	// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b)
6896	// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
6897	// CHECK: ret <4 x i32> [[VMINQ_V2_I]]
// Wraps vminq_s32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v4i32 on %a and %b, with that call's result returned.
6898	int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
6899	return vminq_s32(a, b);
6900	}
6901
6902	// CHECK-LABEL: @test_vminq_u8(
6903	// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b)
6904	// CHECK: ret <16 x i8> [[VMINQ_V_I]]
// Wraps vminq_u8(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vminu.v16i8 on %a and %b, with that call's result returned.
6905	uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
6906	return vminq_u8(a, b);
6907	}
6908
6909	// CHECK-LABEL: @test_vminq_u16(
6910	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
6911	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
6912	// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b)
6913	// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
6914	// CHECK: ret <8 x i16> [[VMINQ_V2_I]]
// Wraps vminq_u16(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vminu.v8i16 on %a and %b, with that call's result returned.
6915	uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
6916	return vminq_u16(a, b);
6917	}
6918
6919	// CHECK-LABEL: @test_vminq_u32(
6920	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
6921	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
6922	// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b)
6923	// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
6924	// CHECK: ret <4 x i32> [[VMINQ_V2_I]]
// Wraps vminq_u32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vminu.v4i32 on %a and %b, with that call's result returned.
6925	uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
6926	return vminq_u32(a, b);
6927	}
6928
6929	// CHECK-LABEL: @test_vminq_f32(
6930	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
6931	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
6932	// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b)
6933	// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
6934	// CHECK: ret <4 x float> [[VMINQ_V2_I]]
// Wraps vminq_f32(a, b). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmins.v4f32 on %a and %b, with that call's result returned.
6935	float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
6936	return vminq_f32(a, b);
6937	}
6938
6939	// CHECK-LABEL: @test_vmla_s8(
6940	// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
6941	// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
6942	// CHECK: ret <8 x i8> [[ADD_I]]
// Wraps vmla_s8(a, b, c). The FileCheck directives above expect
// mul <8 x i8> %b, %c followed by an add with %a, whose sum is returned.
6943	int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
6944	return vmla_s8(a, b, c);
6945	}
6946
6947	// CHECK-LABEL: @test_vmla_s16(
6948	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
6949	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
6950	// CHECK: ret <4 x i16> [[ADD_I]]
// Wraps vmla_s16(a, b, c). The FileCheck directives above expect
// mul <4 x i16> %b, %c followed by an add with %a, whose sum is returned.
6951	int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
6952	return vmla_s16(a, b, c);
6953	}
6954
6955	// CHECK-LABEL: @test_vmla_s32(
6956	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
6957	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
6958	// CHECK: ret <2 x i32> [[ADD_I]]
// Wraps vmla_s32(a, b, c). The FileCheck directives above expect
// mul <2 x i32> %b, %c followed by an add with %a, whose sum is returned.
6959	int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
6960	return vmla_s32(a, b, c);
6961	}
6962
6963	// CHECK-LABEL: @test_vmla_f32(
6964	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
6965	// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
6966	// CHECK: ret <2 x float> [[ADD_I]]
// Wraps vmla_f32(a, b, c). The FileCheck directives above expect
// fmul <2 x float> %b, %c followed by an fadd with %a, whose sum is returned.
6967	float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
6968	return vmla_f32(a, b, c);
6969	}
6970
6971	// CHECK-LABEL: @test_vmla_u8(
6972	// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
6973	// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
6974	// CHECK: ret <8 x i8> [[ADD_I]]
// Wraps vmla_u8(a, b, c). The FileCheck directives above expect
// mul <8 x i8> %b, %c followed by an add with %a, whose sum is returned.
6975	uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
6976	return vmla_u8(a, b, c);
6977	}
6978
6979	// CHECK-LABEL: @test_vmla_u16(
6980	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
6981	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
6982	// CHECK: ret <4 x i16> [[ADD_I]]
// Wraps vmla_u16(a, b, c). The FileCheck directives above expect
// mul <4 x i16> %b, %c followed by an add with %a, whose sum is returned.
6983	uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
6984	return vmla_u16(a, b, c);
6985	}
6986
6987	// CHECK-LABEL: @test_vmla_u32(
6988	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
6989	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
6990	// CHECK: ret <2 x i32> [[ADD_I]]
// Wraps vmla_u32(a, b, c). The FileCheck directives above expect
// mul <2 x i32> %b, %c followed by an add with %a, whose sum is returned.
6991	uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
6992	return vmla_u32(a, b, c);
6993	}
6994
6995	// CHECK-LABEL: @test_vmlaq_s8(
6996	// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
6997	// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
6998	// CHECK: ret <16 x i8> [[ADD_I]]
// Wraps vmlaq_s8(a, b, c). The FileCheck directives above expect
// mul <16 x i8> %b, %c followed by an add with %a, whose sum is returned.
6999	int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
7000	return vmlaq_s8(a, b, c);
7001	}
7002
7003	// CHECK-LABEL: @test_vmlaq_s16(
7004	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
7005	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
7006	// CHECK: ret <8 x i16> [[ADD_I]]
// Wraps vmlaq_s16(a, b, c). The FileCheck directives above expect
// mul <8 x i16> %b, %c followed by an add with %a, whose sum is returned.
7007	int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
7008	return vmlaq_s16(a, b, c);
7009	}
7010
7011	// CHECK-LABEL: @test_vmlaq_s32(
7012	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
7013	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
7014	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlaq_s32(a, b, c). The FileCheck directives above expect
// mul <4 x i32> %b, %c followed by an add with %a, whose sum is returned.
7015	int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
7016	return vmlaq_s32(a, b, c);
7017	}
7018
7019	// CHECK-LABEL: @test_vmlaq_f32(
7020	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
7021	// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
7022	// CHECK: ret <4 x float> [[ADD_I]]
// Wraps vmlaq_f32(a, b, c). The FileCheck directives above expect
// fmul <4 x float> %b, %c followed by an fadd with %a, whose sum is returned.
7023	float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
7024	return vmlaq_f32(a, b, c);
7025	}
7026
7027	// CHECK-LABEL: @test_vmlaq_u8(
7028	// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
7029	// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
7030	// CHECK: ret <16 x i8> [[ADD_I]]
// Wraps vmlaq_u8(a, b, c). The FileCheck directives above expect
// mul <16 x i8> %b, %c followed by an add with %a, whose sum is returned.
7031	uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
7032	return vmlaq_u8(a, b, c);
7033	}
7034
7035	// CHECK-LABEL: @test_vmlaq_u16(
7036	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
7037	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
7038	// CHECK: ret <8 x i16> [[ADD_I]]
// Wraps vmlaq_u16(a, b, c). The FileCheck directives above expect
// mul <8 x i16> %b, %c followed by an add with %a, whose sum is returned.
7039	uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
7040	return vmlaq_u16(a, b, c);
7041	}
7042
7043	// CHECK-LABEL: @test_vmlaq_u32(
7044	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
7045	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
7046	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlaq_u32(a, b, c). The FileCheck directives above expect
// mul <4 x i32> %b, %c followed by an add with %a, whose sum is returned.
7047	uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
7048	return vmlaq_u32(a, b, c);
7049	}
7050
7051	// CHECK-LABEL: @test_vmlal_s8(
7052	// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
7053	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
7054	// CHECK: ret <8 x i16> [[ADD_I]]
// Wraps vmlal_s8(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmulls.v8i16 on %b and %c, then an add <8 x i16> with %a.
7055	int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
7056	return vmlal_s8(a, b, c);
7057	}
7058
7059	// CHECK-LABEL: @test_vmlal_s16(
7060	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7061	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
7062	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
7063	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
7064	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlal_s16(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmulls.v4i32 on %b and %c, then an add <4 x i32> with %a.
7065	int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
7066	return vmlal_s16(a, b, c);
7067	}
7068
7069	// CHECK-LABEL: @test_vmlal_s32(
7070	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7071	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
7072	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
7073	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
7074	// CHECK: ret <2 x i64> [[ADD_I]]
// Wraps vmlal_s32(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmulls.v2i64 on %b and %c, then an add <2 x i64> with %a.
7075	int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
7076	return vmlal_s32(a, b, c);
7077	}
7078
7079	// CHECK-LABEL: @test_vmlal_u8(
7080	// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
7081	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
7082	// CHECK: ret <8 x i16> [[ADD_I]]
// Wraps vmlal_u8(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmullu.v8i16 on %b and %c, then an add <8 x i16> with %a.
7083	uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
7084	return vmlal_u8(a, b, c);
7085	}
7086
7087	// CHECK-LABEL: @test_vmlal_u16(
7088	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7089	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
7090	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
7091	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
7092	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlal_u16(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmullu.v4i32 on %b and %c, then an add <4 x i32> with %a.
7093	uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
7094	return vmlal_u16(a, b, c);
7095	}
7096
7097	// CHECK-LABEL: @test_vmlal_u32(
7098	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7099	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
7100	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
7101	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
7102	// CHECK: ret <2 x i64> [[ADD_I]]
// Wraps vmlal_u32(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmullu.v2i64 on %b and %c, then an add <2 x i64> with %a.
7103	uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
7104	return vmlal_u32(a, b, c);
7105	}
7106
7107	// CHECK-LABEL: @test_vmlal_lane_s16(
7108	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7109	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7110	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
7111	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
7112	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
7113	// CHECK: ret <4 x i32> [[ADD]]
// Wraps vmlal_lane_s16(a, b, c, 3). The FileCheck directives above expect a
// shufflevector of %c with an all-3 mask, a call to @llvm.arm.neon.vmulls.v4i32
// on %b and that shuffle, then an add <4 x i32> with %a.
7114	int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
7115	return vmlal_lane_s16(a, b, c, 3);
7116	}
7117
7118	// CHECK-LABEL: @test_vmlal_lane_s32(
7119	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7120	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7121	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
7122	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
7123	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
7124	// CHECK: ret <2 x i64> [[ADD]]
// Wraps vmlal_lane_s32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of %c with an all-1 mask, a call to @llvm.arm.neon.vmulls.v2i64
// on %b and that shuffle, then an add <2 x i64> with %a.
7125	int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
7126	return vmlal_lane_s32(a, b, c, 1);
7127	}
7128
7129	// CHECK-LABEL: @test_vmlal_lane_u16(
7130	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7131	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7132	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
7133	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
7134	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
7135	// CHECK: ret <4 x i32> [[ADD]]
// Wraps vmlal_lane_u16(a, b, c, 3). The FileCheck directives above expect a
// shufflevector of %c with an all-3 mask, a call to @llvm.arm.neon.vmullu.v4i32
// on %b and that shuffle, then an add <4 x i32> with %a.
7136	uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
7137	return vmlal_lane_u16(a, b, c, 3);
7138	}
7139
7140	// CHECK-LABEL: @test_vmlal_lane_u32(
7141	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7142	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7143	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
7144	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
7145	// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
7146	// CHECK: ret <2 x i64> [[ADD]]
// Wraps vmlal_lane_u32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of %c with an all-1 mask, a call to @llvm.arm.neon.vmullu.v2i64
// on %b and that shuffle, then an add <2 x i64> with %a.
7147	uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
7148	return vmlal_lane_u32(a, b, c, 1);
7149	}
7150
7151	// CHECK-LABEL: @test_vmlal_n_s16(
7152	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7153	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7154	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7155	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7156	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7157	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
7158	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
7159	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
7160	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlal_n_s16(a, b, c) with scalar c. The FileCheck directives above expect
// four insertelement ops placing %c in every lane of a <4 x i16>, a call to
// @llvm.arm.neon.vmulls.v4i32 on %b and that vector, then an add <4 x i32> with %a.
7161	int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
7162	return vmlal_n_s16(a, b, c);
7163	}
7164
7165	// CHECK-LABEL: @test_vmlal_n_s32(
7166	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7167	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7168	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7169	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
7170	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
7171	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
7172	// CHECK: ret <2 x i64> [[ADD_I]]
// Wraps vmlal_n_s32(a, b, c) with scalar c. The FileCheck directives above expect
// two insertelement ops placing %c in both lanes of a <2 x i32>, a call to
// @llvm.arm.neon.vmulls.v2i64 on %b and that vector, then an add <2 x i64> with %a.
7173	int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
7174	return vmlal_n_s32(a, b, c);
7175	}
7176
7177	// CHECK-LABEL: @test_vmlal_n_u16(
7178	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7179	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7180	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7181	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7182	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7183	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
7184	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
7185	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
7186	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlal_n_u16(a, b, c) with scalar c. The FileCheck directives above expect
// four insertelement ops placing %c in every lane of a <4 x i16>, a call to
// @llvm.arm.neon.vmullu.v4i32 on %b and that vector, then an add <4 x i32> with %a.
7187	uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
7188	return vmlal_n_u16(a, b, c);
7189	}
7190
7191	// CHECK-LABEL: @test_vmlal_n_u32(
7192	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7193	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7194	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7195	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
7196	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
7197	// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
7198	// CHECK: ret <2 x i64> [[ADD_I]]
// Wraps vmlal_n_u32(a, b, c) with scalar c. The FileCheck directives above expect
// two insertelement ops placing %c in both lanes of a <2 x i32>, a call to
// @llvm.arm.neon.vmullu.v2i64 on %b and that vector, then an add <2 x i64> with %a.
7199	uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
7200	return vmlal_n_u32(a, b, c);
7201	}
7202
7203	// CHECK-LABEL: @test_vmla_lane_s16(
7204	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7205	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
7206	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
7207	// CHECK: ret <4 x i16> [[ADD]]
// Wraps vmla_lane_s16(a, b, c, 3). The FileCheck directives above expect a
// shufflevector of %c with an all-3 mask, then mul <4 x i16> with %b and an
// add with %a, whose sum is returned.
7208	int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
7209	return vmla_lane_s16(a, b, c, 3);
7210	}
7211
7212	// CHECK-LABEL: @test_vmla_lane_s32(
7213	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7214	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
7215	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
7216	// CHECK: ret <2 x i32> [[ADD]]
// Wraps vmla_lane_s32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of %c with an all-1 mask, then mul <2 x i32> with %b and an
// add with %a, whose sum is returned.
7217	int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
7218	return vmla_lane_s32(a, b, c, 1);
7219	}
7220
7221	// CHECK-LABEL: @test_vmla_lane_u16(
7222	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7223	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
7224	// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
7225	// CHECK: ret <4 x i16> [[ADD]]
// Wraps vmla_lane_u16(a, b, c, 3). The FileCheck directives above expect a
// shufflevector of %c with an all-3 mask, then mul <4 x i16> with %b and an
// add with %a, whose sum is returned.
7226	uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
7227	return vmla_lane_u16(a, b, c, 3);
7228	}
7229
7230	// CHECK-LABEL: @test_vmla_lane_u32(
7231	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7232	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
7233	// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
7234	// CHECK: ret <2 x i32> [[ADD]]
// Wraps vmla_lane_u32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of %c with an all-1 mask, then mul <2 x i32> with %b and an
// add with %a, whose sum is returned.
7235	uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
7236	return vmla_lane_u32(a, b, c, 1);
7237	}
7238
7239	// CHECK-LABEL: @test_vmla_lane_f32(
7240	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
7241	// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
7242	// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
7243	// CHECK: ret <2 x float> [[ADD]]
// Wraps vmla_lane_f32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of %c with an all-1 mask, then fmul <2 x float> with %b and an
// fadd with %a, whose sum is returned.
7244	float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
7245	return vmla_lane_f32(a, b, c, 1);
7246	}
7247
7248	// CHECK-LABEL: @test_vmlaq_lane_s16(
7249	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
7250	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
7251	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
7252	// CHECK: ret <8 x i16> [[ADD]]
// Wraps vmlaq_lane_s16(a, b, c, 3). The FileCheck directives above expect a
// shufflevector of the <4 x i16> %c with an eight-entry all-3 mask, then
// mul <8 x i16> with %b and an add with %a, whose sum is returned.
7253	int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
7254	return vmlaq_lane_s16(a, b, c, 3);
7255	}
7256
7257	// CHECK-LABEL: @test_vmlaq_lane_s32(
7258	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
7259	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
7260	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
7261	// CHECK: ret <4 x i32> [[ADD]]
// Wraps vmlaq_lane_s32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of the <2 x i32> %c with a four-entry all-1 mask, then
// mul <4 x i32> with %b and an add with %a, whose sum is returned.
7262	int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
7263	return vmlaq_lane_s32(a, b, c, 1);
7264	}
7265
7266	// CHECK-LABEL: @test_vmlaq_lane_u16(
7267	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
7268	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
7269	// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
7270	// CHECK: ret <8 x i16> [[ADD]]
// Wraps vmlaq_lane_u16(a, b, c, 3). The FileCheck directives above expect a
// shufflevector of the <4 x i16> %c with an eight-entry all-3 mask, then
// mul <8 x i16> with %b and an add with %a, whose sum is returned.
7271	uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
7272	return vmlaq_lane_u16(a, b, c, 3);
7273	}
7274
7275	// CHECK-LABEL: @test_vmlaq_lane_u32(
7276	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
7277	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
7278	// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
7279	// CHECK: ret <4 x i32> [[ADD]]
// Wraps vmlaq_lane_u32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of the <2 x i32> %c with a four-entry all-1 mask, then
// mul <4 x i32> with %b and an add with %a, whose sum is returned.
7280	uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
7281	return vmlaq_lane_u32(a, b, c, 1);
7282	}
7283
7284	// CHECK-LABEL: @test_vmlaq_lane_f32(
7285	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
7286	// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
7287	// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
7288	// CHECK: ret <4 x float> [[ADD]]
// Wraps vmlaq_lane_f32(a, b, c, 1). The FileCheck directives above expect a
// shufflevector of the <2 x float> %c with a four-entry all-1 mask, then
// fmul <4 x float> with %b and an fadd with %a, whose sum is returned.
7289	float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
7290	return vmlaq_lane_f32(a, b, c, 1);
7291	}
7292
7293	// CHECK-LABEL: @test_vmla_n_s16(
7294	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7295	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7296	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7297	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7298	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
7299	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
7300	// CHECK: ret <4 x i16> [[ADD_I]]
// Wraps vmla_n_s16(a, b, c) with scalar c. The FileCheck directives above expect
// four insertelement ops placing %c in every lane of a <4 x i16>, then
// mul <4 x i16> with %b and an add with %a, whose sum is returned.
7301	int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
7302	return vmla_n_s16(a, b, c);
7303	}
7304
7305	// CHECK-LABEL: @test_vmla_n_s32(
7306	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7307	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7308	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
7309	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
7310	// CHECK: ret <2 x i32> [[ADD_I]]
// Wraps vmla_n_s32(a, b, c) with scalar c. The FileCheck directives above expect
// two insertelement ops placing %c in both lanes of a <2 x i32>, then
// mul <2 x i32> with %b and an add with %a, whose sum is returned.
7311	int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
7312	return vmla_n_s32(a, b, c);
7313	}
7314
7315	// CHECK-LABEL: @test_vmla_n_u16(
7316	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7317	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7318	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7319	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7320	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
7321	// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
7322	// CHECK: ret <4 x i16> [[ADD_I]]
// Wraps vmla_n_u16(a, b, c) with scalar c. The FileCheck directives above expect
// four insertelement ops placing %c in every lane of a <4 x i16>, then
// mul <4 x i16> with %b and an add with %a, whose sum is returned.
7323	uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
7324	return vmla_n_u16(a, b, c);
7325	}
7326
7327	// CHECK-LABEL: @test_vmla_n_u32(
7328	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7329	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7330	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
7331	// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
7332	// CHECK: ret <2 x i32> [[ADD_I]]
// Wraps vmla_n_u32(a, b, c) with scalar c. The FileCheck directives above expect
// two insertelement ops placing %c in both lanes of a <2 x i32>, then
// mul <2 x i32> with %b and an add with %a, whose sum is returned.
7333	uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
7334	return vmla_n_u32(a, b, c);
7335	}
7336
7337	// CHECK-LABEL: @test_vmla_n_f32(
7338	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
7339	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
7340	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
7341	// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
7342	// CHECK: ret <2 x float> [[ADD_I]]
// Wraps vmla_n_f32(a, b, c) with scalar c. The FileCheck directives above expect
// two insertelement ops placing %c in both lanes of a <2 x float>, then
// fmul <2 x float> with %b and an fadd with %a, whose sum is returned.
7343	float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
7344	return vmla_n_f32(a, b, c);
7345	}
7346
7347	// CHECK-LABEL: @test_vmlaq_n_s16(
7348	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
7349	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
7350	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
7351	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
7352	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
7353	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
7354	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
7355	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
7356	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
7357	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
7358	// CHECK: ret <8 x i16> [[ADD_I]]
// Wraps vmlaq_n_s16(a, b, c) with scalar c. The FileCheck directives above expect
// eight insertelement ops placing %c in every lane of an <8 x i16>, then
// mul <8 x i16> with %b and an add with %a, whose sum is returned.
7359	int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
7360	return vmlaq_n_s16(a, b, c);
7361	}
7362
7363	// CHECK-LABEL: @test_vmlaq_n_s32(
7364	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
7365	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
7366	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
7367	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
7368	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
7369	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
7370	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlaq_n_s32(a, b, c) with scalar c. The FileCheck directives above expect
// four insertelement ops placing %c in every lane of a <4 x i32>, then
// mul <4 x i32> with %b and an add with %a, whose sum is returned.
7371	int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
7372	return vmlaq_n_s32(a, b, c);
7373	}
7374
7375	// CHECK-LABEL: @test_vmlaq_n_u16(
7376	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
7377	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
7378	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
7379	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
7380	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
7381	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
7382	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
7383	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
7384	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
7385	// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
7386	// CHECK: ret <8 x i16> [[ADD_I]]
// Wraps vmlaq_n_u16(a, b, c) with scalar c. The FileCheck directives above expect
// eight insertelement ops placing %c in every lane of an <8 x i16>, then
// mul <8 x i16> with %b and an add with %a, whose sum is returned.
7387	uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
7388	return vmlaq_n_u16(a, b, c);
7389	}
7390
7391	// CHECK-LABEL: @test_vmlaq_n_u32(
7392	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
7393	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
7394	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
7395	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
7396	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
7397	// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
7398	// CHECK: ret <4 x i32> [[ADD_I]]
// Wraps vmlaq_n_u32(a, b, c) with scalar c. The FileCheck directives above expect
// four insertelement ops placing %c in every lane of a <4 x i32>, then
// mul <4 x i32> with %b and an add with %a, whose sum is returned.
7399	uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
7400	return vmlaq_n_u32(a, b, c);
7401	}
7402
7403	// CHECK-LABEL: @test_vmlaq_n_f32(
7404	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
7405	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
7406	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
7407	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
7408	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
7409	// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
7410	// CHECK: ret <4 x float> [[ADD_I]]
// Forwards a, b and the scalar c to vmlaq_n_f32 and returns the result.
// The FileCheck directives above expect %c inserted into lanes 0-3 of a
// <4 x float>, an `fmul` of %b with that vector, and an `fadd` of it to %a.
7411	float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
7412	return vmlaq_n_f32(a, b, c);
7413	}
7414
7415	// CHECK-LABEL: @test_vmls_s8(
7416	// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
7417	// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
7418	// CHECK: ret <8 x i8> [[SUB_I]]
// Returns vmls_s8(a, b, c). The FileCheck directives above expect
// `mul <8 x i8> %b, %c` and then a `sub <8 x i8>` of that product from %a.
7419	int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
7420	return vmls_s8(a, b, c);
7421	}
7422
7423	// CHECK-LABEL: @test_vmls_s16(
7424	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
7425	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
7426	// CHECK: ret <4 x i16> [[SUB_I]]
// Returns vmls_s16(a, b, c). The FileCheck directives above expect
// `mul <4 x i16> %b, %c` and then a `sub <4 x i16>` of that product from %a.
7427	int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
7428	return vmls_s16(a, b, c);
7429	}
7430
7431	// CHECK-LABEL: @test_vmls_s32(
7432	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
7433	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
7434	// CHECK: ret <2 x i32> [[SUB_I]]
// Returns vmls_s32(a, b, c). The FileCheck directives above expect
// `mul <2 x i32> %b, %c` and then a `sub <2 x i32>` of that product from %a.
7435	int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
7436	return vmls_s32(a, b, c);
7437	}
7438
7439	// CHECK-LABEL: @test_vmls_f32(
7440	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
7441	// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
7442	// CHECK: ret <2 x float> [[SUB_I]]
// Returns vmls_f32(a, b, c). The FileCheck directives above expect
// `fmul <2 x float> %b, %c` and then an `fsub <2 x float>` of it from %a.
7443	float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
7444	return vmls_f32(a, b, c);
7445	}
7446
7447	// CHECK-LABEL: @test_vmls_u8(
7448	// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
7449	// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
7450	// CHECK: ret <8 x i8> [[SUB_I]]
// Returns vmls_u8(a, b, c). The FileCheck directives above expect
// `mul <8 x i8> %b, %c` and then a `sub <8 x i8>` of that product from %a.
7451	uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
7452	return vmls_u8(a, b, c);
7453	}
7454
7455	// CHECK-LABEL: @test_vmls_u16(
7456	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
7457	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
7458	// CHECK: ret <4 x i16> [[SUB_I]]
// Returns vmls_u16(a, b, c). The FileCheck directives above expect
// `mul <4 x i16> %b, %c` and then a `sub <4 x i16>` of that product from %a.
7459	uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
7460	return vmls_u16(a, b, c);
7461	}
7462
7463	// CHECK-LABEL: @test_vmls_u32(
7464	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
7465	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
7466	// CHECK: ret <2 x i32> [[SUB_I]]
// Returns vmls_u32(a, b, c). The FileCheck directives above expect
// `mul <2 x i32> %b, %c` and then a `sub <2 x i32>` of that product from %a.
7467	uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
7468	return vmls_u32(a, b, c);
7469	}
7470
7471	// CHECK-LABEL: @test_vmlsq_s8(
7472	// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
7473	// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
7474	// CHECK: ret <16 x i8> [[SUB_I]]
// Returns vmlsq_s8(a, b, c). The FileCheck directives above expect
// `mul <16 x i8> %b, %c` and then a `sub <16 x i8>` of that product from %a.
7475	int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
7476	return vmlsq_s8(a, b, c);
7477	}
7478
7479	// CHECK-LABEL: @test_vmlsq_s16(
7480	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
7481	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
7482	// CHECK: ret <8 x i16> [[SUB_I]]
// Returns vmlsq_s16(a, b, c). The FileCheck directives above expect
// `mul <8 x i16> %b, %c` and then a `sub <8 x i16>` of that product from %a.
7483	int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
7484	return vmlsq_s16(a, b, c);
7485	}
7486
7487	// CHECK-LABEL: @test_vmlsq_s32(
7488	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
7489	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
7490	// CHECK: ret <4 x i32> [[SUB_I]]
// Returns vmlsq_s32(a, b, c). The FileCheck directives above expect
// `mul <4 x i32> %b, %c` and then a `sub <4 x i32>` of that product from %a.
7491	int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
7492	return vmlsq_s32(a, b, c);
7493	}
7494
7495	// CHECK-LABEL: @test_vmlsq_f32(
7496	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
7497	// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
7498	// CHECK: ret <4 x float> [[SUB_I]]
// Returns vmlsq_f32(a, b, c). The FileCheck directives above expect
// `fmul <4 x float> %b, %c` and then an `fsub <4 x float>` of it from %a.
7499	float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
7500	return vmlsq_f32(a, b, c);
7501	}
7502
7503	// CHECK-LABEL: @test_vmlsq_u8(
7504	// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
7505	// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
7506	// CHECK: ret <16 x i8> [[SUB_I]]
// Returns vmlsq_u8(a, b, c). The FileCheck directives above expect
// `mul <16 x i8> %b, %c` and then a `sub <16 x i8>` of that product from %a.
7507	uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
7508	return vmlsq_u8(a, b, c);
7509	}
7510
7511	// CHECK-LABEL: @test_vmlsq_u16(
7512	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
7513	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
7514	// CHECK: ret <8 x i16> [[SUB_I]]
// Returns vmlsq_u16(a, b, c). The FileCheck directives above expect
// `mul <8 x i16> %b, %c` and then a `sub <8 x i16>` of that product from %a.
7515	uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
7516	return vmlsq_u16(a, b, c);
7517	}
7518
7519	// CHECK-LABEL: @test_vmlsq_u32(
7520	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
7521	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
7522	// CHECK: ret <4 x i32> [[SUB_I]]
// Returns vmlsq_u32(a, b, c). The FileCheck directives above expect
// `mul <4 x i32> %b, %c` and then a `sub <4 x i32>` of that product from %a.
7523	uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
7524	return vmlsq_u32(a, b, c);
7525	}
7526
7527	// CHECK-LABEL: @test_vmlsl_s8(
7528	// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
7529	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
7530	// CHECK: ret <8 x i16> [[SUB_I]]
// Returns vmlsl_s8(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmulls.v8i16 on %b and %c, then a `sub <8 x i16>` of the
// call result from %a.
7531	int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
7532	return vmlsl_s8(a, b, c);
7533	}
7534
7535	// CHECK-LABEL: @test_vmlsl_s16(
7536	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7537	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
7538	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
7539	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
7540	// CHECK: ret <4 x i32> [[SUB_I]]
// Returns vmlsl_s16(a, b, c). The FileCheck directives above expect bitcasts
// of %b and %c to <8 x i8>, a call to @llvm.arm.neon.vmulls.v4i32 on %b and
// %c, then a `sub <4 x i32>` of the call result from %a.
7541	int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
7542	return vmlsl_s16(a, b, c);
7543	}
7544
7545	// CHECK-LABEL: @test_vmlsl_s32(
7546	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7547	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
7548	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
7549	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
7550	// CHECK: ret <2 x i64> [[SUB_I]]
// Returns vmlsl_s32(a, b, c). The FileCheck directives above expect bitcasts
// of %b and %c to <8 x i8>, a call to @llvm.arm.neon.vmulls.v2i64 on %b and
// %c, then a `sub <2 x i64>` of the call result from %a.
7551	int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
7552	return vmlsl_s32(a, b, c);
7553	}
7554
7555	// CHECK-LABEL: @test_vmlsl_u8(
7556	// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
7557	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
7558	// CHECK: ret <8 x i16> [[SUB_I]]
// Returns vmlsl_u8(a, b, c). The FileCheck directives above expect a call to
// @llvm.arm.neon.vmullu.v8i16 on %b and %c, then a `sub <8 x i16>` of the
// call result from %a.
7559	uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
7560	return vmlsl_u8(a, b, c);
7561	}
7562
7563	// CHECK-LABEL: @test_vmlsl_u16(
7564	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7565	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
7566	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
7567	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
7568	// CHECK: ret <4 x i32> [[SUB_I]]
// Returns vmlsl_u16(a, b, c). The FileCheck directives above expect bitcasts
// of %b and %c to <8 x i8>, a call to @llvm.arm.neon.vmullu.v4i32 on %b and
// %c, then a `sub <4 x i32>` of the call result from %a.
7569	uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
7570	return vmlsl_u16(a, b, c);
7571	}
7572
7573	// CHECK-LABEL: @test_vmlsl_u32(
7574	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7575	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
7576	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
7577	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
7578	// CHECK: ret <2 x i64> [[SUB_I]]
// Returns vmlsl_u32(a, b, c). The FileCheck directives above expect bitcasts
// of %b and %c to <8 x i8>, a call to @llvm.arm.neon.vmullu.v2i64 on %b and
// %c, then a `sub <2 x i64>` of the call result from %a.
7579	uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
7580	return vmlsl_u32(a, b, c);
7581	}
7582
7583	// CHECK-LABEL: @test_vmlsl_lane_s16(
7584	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7585	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7586	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
7587	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
7588	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
7589	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_lane_s16 with lane index 3. The FileCheck directives above
// expect %c shuffled with an all-3 mask, a call to
// @llvm.arm.neon.vmulls.v4i32 on %b and that shuffle, then a `sub <4 x i32>`
// of the call result from %a.
7590	int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
7591	return vmlsl_lane_s16(a, b, c, 3);
7592	}
7593
7594	// CHECK-LABEL: @test_vmlsl_lane_s32(
7595	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7596	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7597	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
7598	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
7599	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
7600	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_lane_s32 with lane index 1. The FileCheck directives above
// expect %c shuffled with an all-1 mask, a call to
// @llvm.arm.neon.vmulls.v2i64 on %b and that shuffle, then a `sub <2 x i64>`
// of the call result from %a.
7601	int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
7602	return vmlsl_lane_s32(a, b, c, 1);
7603	}
7604
7605	// CHECK-LABEL: @test_vmlsl_lane_u16(
7606	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7607	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7608	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
7609	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
7610	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
7611	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsl_lane_u16 with lane index 3. The FileCheck directives above
// expect %c shuffled with an all-3 mask, a call to
// @llvm.arm.neon.vmullu.v4i32 on %b and that shuffle, then a `sub <4 x i32>`
// of the call result from %a.
7612	uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
7613	return vmlsl_lane_u16(a, b, c, 3);
7614	}
7615
7616	// CHECK-LABEL: @test_vmlsl_lane_u32(
7617	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7618	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7619	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
7620	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
7621	// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
7622	// CHECK: ret <2 x i64> [[SUB]]
// Calls vmlsl_lane_u32 with lane index 1. The FileCheck directives above
// expect %c shuffled with an all-1 mask, a call to
// @llvm.arm.neon.vmullu.v2i64 on %b and that shuffle, then a `sub <2 x i64>`
// of the call result from %a.
7623	uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
7624	return vmlsl_lane_u32(a, b, c, 1);
7625	}
7626
7627	// CHECK-LABEL: @test_vmlsl_n_s16(
7628	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7629	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7630	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7631	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7632	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7633	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
7634	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
7635	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
7636	// CHECK: ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsl_n_s16. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x i16>, a call to
// @llvm.arm.neon.vmulls.v4i32 on %b and that vector, then a `sub <4 x i32>`
// of the call result from %a.
7637	int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
7638	return vmlsl_n_s16(a, b, c);
7639	}
7640
7641	// CHECK-LABEL: @test_vmlsl_n_s32(
7642	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7643	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7644	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7645	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
7646	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
7647	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
7648	// CHECK: ret <2 x i64> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsl_n_s32. The FileCheck directives
// above expect %c inserted into lanes 0-1 of a <2 x i32>, a call to
// @llvm.arm.neon.vmulls.v2i64 on %b and that vector, then a `sub <2 x i64>`
// of the call result from %a.
7649	int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
7650	return vmlsl_n_s32(a, b, c);
7651	}
7652
7653	// CHECK-LABEL: @test_vmlsl_n_u16(
7654	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7655	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7656	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7657	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7658	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
7659	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
7660	// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
7661	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
7662	// CHECK: ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsl_n_u16. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x i16>, a call to
// @llvm.arm.neon.vmullu.v4i32 on %b and that vector, then a `sub <4 x i32>`
// of the call result from %a.
7663	uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
7664	return vmlsl_n_u16(a, b, c);
7665	}
7666
7667	// CHECK-LABEL: @test_vmlsl_n_u32(
7668	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7669	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7670	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
7671	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
7672	// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
7673	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
7674	// CHECK: ret <2 x i64> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsl_n_u32. The FileCheck directives
// above expect %c inserted into lanes 0-1 of a <2 x i32>, a call to
// @llvm.arm.neon.vmullu.v2i64 on %b and that vector, then a `sub <2 x i64>`
// of the call result from %a.
7675	uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
7676	return vmlsl_n_u32(a, b, c);
7677	}
7678
7679	// CHECK-LABEL: @test_vmls_lane_s16(
7680	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7681	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
7682	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
7683	// CHECK: ret <4 x i16> [[SUB]]
// Calls vmls_lane_s16 with lane index 3. The FileCheck directives above
// expect %c shuffled with an all-3 mask, a `mul <4 x i16>` of %b with that
// shuffle, then a `sub <4 x i16>` of the product from %a.
7684	int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
7685	return vmls_lane_s16(a, b, c, 3);
7686	}
7687
7688	// CHECK-LABEL: @test_vmls_lane_s32(
7689	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7690	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
7691	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
7692	// CHECK: ret <2 x i32> [[SUB]]
// Calls vmls_lane_s32 with lane index 1. The FileCheck directives above
// expect %c shuffled with an all-1 mask, a `mul <2 x i32>` of %b with that
// shuffle, then a `sub <2 x i32>` of the product from %a.
7693	int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
7694	return vmls_lane_s32(a, b, c, 1);
7695	}
7696
7697	// CHECK-LABEL: @test_vmls_lane_u16(
7698	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
7699	// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
7700	// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
7701	// CHECK: ret <4 x i16> [[SUB]]
// Calls vmls_lane_u16 with lane index 3. The FileCheck directives above
// expect %c shuffled with an all-3 mask, a `mul <4 x i16>` of %b with that
// shuffle, then a `sub <4 x i16>` of the product from %a.
7702	uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
7703	return vmls_lane_u16(a, b, c, 3);
7704	}
7705
7706	// CHECK-LABEL: @test_vmls_lane_u32(
7707	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
7708	// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
7709	// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
7710	// CHECK: ret <2 x i32> [[SUB]]
// Calls vmls_lane_u32 with lane index 1. The FileCheck directives above
// expect %c shuffled with an all-1 mask, a `mul <2 x i32>` of %b with that
// shuffle, then a `sub <2 x i32>` of the product from %a.
7711	uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
7712	return vmls_lane_u32(a, b, c, 1);
7713	}
7714
7715	// CHECK-LABEL: @test_vmls_lane_f32(
7716	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
7717	// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
7718	// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
7719	// CHECK: ret <2 x float> [[SUB]]
// Calls vmls_lane_f32 with lane index 1. The FileCheck directives above
// expect %c shuffled with an all-1 mask, an `fmul <2 x float>` of %b with
// that shuffle, then an `fsub <2 x float>` of the product from %a.
7720	float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
7721	return vmls_lane_f32(a, b, c, 1);
7722	}
7723
7724	// CHECK-LABEL: @test_vmlsq_lane_s16(
7725	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
7726	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
7727	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
7728	// CHECK: ret <8 x i16> [[SUB]]
// Calls vmlsq_lane_s16 with lane index 3. The FileCheck directives above
// expect the <4 x i16> %c shuffled with an 8-entry all-3 mask, a
// `mul <8 x i16>` of %b with that shuffle, then a `sub <8 x i16>` from %a.
7729	int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
7730	return vmlsq_lane_s16(a, b, c, 3);
7731	}
7732
7733	// CHECK-LABEL: @test_vmlsq_lane_s32(
7734	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
7735	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
7736	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
7737	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_s32 with lane index 1. The FileCheck directives above
// expect the <2 x i32> %c shuffled with a 4-entry all-1 mask, a
// `mul <4 x i32>` of %b with that shuffle, then a `sub <4 x i32>` from %a.
7738	int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
7739	return vmlsq_lane_s32(a, b, c, 1);
7740	}
7741
7742	// CHECK-LABEL: @test_vmlsq_lane_u16(
7743	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
7744	// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
7745	// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
7746	// CHECK: ret <8 x i16> [[SUB]]
// Calls vmlsq_lane_u16 with lane index 3. The FileCheck directives above
// expect the <4 x i16> %c shuffled with an 8-entry all-3 mask, a
// `mul <8 x i16>` of %b with that shuffle, then a `sub <8 x i16>` from %a.
7747	uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
7748	return vmlsq_lane_u16(a, b, c, 3);
7749	}
7750
7751	// CHECK-LABEL: @test_vmlsq_lane_u32(
7752	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
7753	// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
7754	// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
7755	// CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_u32 with lane index 1. The FileCheck directives above
// expect the <2 x i32> %c shuffled with a 4-entry all-1 mask, a
// `mul <4 x i32>` of %b with that shuffle, then a `sub <4 x i32>` from %a.
7756	uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
7757	return vmlsq_lane_u32(a, b, c, 1);
7758	}
7759
7760	// CHECK-LABEL: @test_vmlsq_lane_f32(
7761	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
7762	// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
7763	// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
7764	// CHECK: ret <4 x float> [[SUB]]
// Calls vmlsq_lane_f32 with lane index 1. The FileCheck directives above
// expect the <2 x float> %c shuffled with a 4-entry all-1 mask, an
// `fmul <4 x float>` of %b with that shuffle, then an `fsub` from %a.
7765	float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
7766	return vmlsq_lane_f32(a, b, c, 1);
7767	}
7768
7769	// CHECK-LABEL: @test_vmls_n_s16(
7770	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7771	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7772	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7773	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7774	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
7775	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
7776	// CHECK: ret <4 x i16> [[SUB_I]]
// Forwards a, b and the scalar c to vmls_n_s16. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x i16>, a `mul` of %b with
// that vector, then a `sub <4 x i16>` of the product from %a.
7777	int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
7778	return vmls_n_s16(a, b, c);
7779	}
7780
7781	// CHECK-LABEL: @test_vmls_n_s32(
7782	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7783	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7784	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
7785	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
7786	// CHECK: ret <2 x i32> [[SUB_I]]
// Forwards a, b and the scalar c to vmls_n_s32. The FileCheck directives
// above expect %c inserted into lanes 0-1 of a <2 x i32>, a `mul` of %b with
// that vector, then a `sub <2 x i32>` of the product from %a.
7787	int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
7788	return vmls_n_s32(a, b, c);
7789	}
7790
7791	// CHECK-LABEL: @test_vmls_n_u16(
7792	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
7793	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
7794	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
7795	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
7796	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
7797	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
7798	// CHECK: ret <4 x i16> [[SUB_I]]
// Forwards a, b and the scalar c to vmls_n_u16. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x i16>, a `mul` of %b with
// that vector, then a `sub <4 x i16>` of the product from %a.
7799	uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
7800	return vmls_n_u16(a, b, c);
7801	}
7802
7803	// CHECK-LABEL: @test_vmls_n_u32(
7804	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
7805	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
7806	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
7807	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
7808	// CHECK: ret <2 x i32> [[SUB_I]]
// Forwards a, b and the scalar c to vmls_n_u32. The FileCheck directives
// above expect %c inserted into lanes 0-1 of a <2 x i32>, a `mul` of %b with
// that vector, then a `sub <2 x i32>` of the product from %a.
7809	uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
7810	return vmls_n_u32(a, b, c);
7811	}
7812
7813	// CHECK-LABEL: @test_vmls_n_f32(
7814	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
7815	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
7816	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
7817	// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
7818	// CHECK: ret <2 x float> [[SUB_I]]
// Forwards a, b and the scalar c to vmls_n_f32. The FileCheck directives
// above expect %c inserted into lanes 0-1 of a <2 x float>, an `fmul` of %b
// with that vector, then an `fsub <2 x float>` of the product from %a.
7819	float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
7820	return vmls_n_f32(a, b, c);
7821	}
7822
7823	// CHECK-LABEL: @test_vmlsq_n_s16(
7824	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
7825	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
7826	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
7827	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
7828	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
7829	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
7830	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
7831	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
7832	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
7833	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
7834	// CHECK: ret <8 x i16> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsq_n_s16. The FileCheck directives
// above expect %c inserted into lanes 0-7 of an <8 x i16>, a `mul` of %b with
// that vector, then a `sub <8 x i16>` of the product from %a.
7835	int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
7836	return vmlsq_n_s16(a, b, c);
7837	}
7838
7839	// CHECK-LABEL: @test_vmlsq_n_s32(
7840	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
7841	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
7842	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
7843	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
7844	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
7845	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
7846	// CHECK: ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsq_n_s32. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x i32>, a `mul` of %b with
// that vector, then a `sub <4 x i32>` of the product from %a.
7847	int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
7848	return vmlsq_n_s32(a, b, c);
7849	}
7850
7851	// CHECK-LABEL: @test_vmlsq_n_u16(
7852	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
7853	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
7854	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
7855	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
7856	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
7857	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
7858	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
7859	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
7860	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
7861	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
7862	// CHECK: ret <8 x i16> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsq_n_u16. The FileCheck directives
// above expect %c inserted into lanes 0-7 of an <8 x i16>, a `mul` of %b with
// that vector, then a `sub <8 x i16>` of the product from %a.
7863	uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
7864	return vmlsq_n_u16(a, b, c);
7865	}
7866
7867	// CHECK-LABEL: @test_vmlsq_n_u32(
7868	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
7869	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
7870	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
7871	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
7872	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
7873	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
7874	// CHECK: ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsq_n_u32. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x i32>, a `mul` of %b with
// that vector, then a `sub <4 x i32>` of the product from %a.
7875	uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
7876	return vmlsq_n_u32(a, b, c);
7877	}
7878
7879	// CHECK-LABEL: @test_vmlsq_n_f32(
7880	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
7881	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
7882	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
7883	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
7884	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
7885	// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
7886	// CHECK: ret <4 x float> [[SUB_I]]
// Forwards a, b and the scalar c to vmlsq_n_f32. The FileCheck directives
// above expect %c inserted into lanes 0-3 of a <4 x float>, an `fmul` of %b
// with that vector, then an `fsub <4 x float>` of the product from %a.
7887	float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
7888	return vmlsq_n_f32(a, b, c);
7889	}
7890
7891	// CHECK-LABEL: @test_vmovl_s8(
7892	// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
7893	// CHECK: ret <8 x i16> [[VMOVL_I]]
// Returns vmovl_s8(a). The FileCheck directives above expect
// `sext <8 x i8> %a to <8 x i16>` as the returned value.
7894	int16x8_t test_vmovl_s8(int8x8_t a) {
7895	return vmovl_s8(a);
7896	}
7897
7898	// CHECK-LABEL: @test_vmovl_s16(
7899	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
7900	// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
7901	// CHECK: ret <4 x i32> [[VMOVL_I]]
// Returns vmovl_s16(a). The FileCheck directives above expect a bitcast of
// %a to <8 x i8>, then `sext <4 x i16> %a to <4 x i32>` as the return value.
7902	int32x4_t test_vmovl_s16(int16x4_t a) {
7903	return vmovl_s16(a);
7904	}
7905
7906	// CHECK-LABEL: @test_vmovl_s32(
7907	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
7908	// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
7909	// CHECK: ret <2 x i64> [[VMOVL_I]]
// Returns vmovl_s32(a). The FileCheck directives above expect a bitcast of
// %a to <8 x i8>, then `sext <2 x i32> %a to <2 x i64>` as the return value.
7910	int64x2_t test_vmovl_s32(int32x2_t a) {
7911	return vmovl_s32(a);
7912	}
7913
7914	// CHECK-LABEL: @test_vmovl_u8(
7915	// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
7916	// CHECK: ret <8 x i16> [[VMOVL_I]]
// Returns vmovl_u8(a). The FileCheck directives above expect
// `zext <8 x i8> %a to <8 x i16>` as the returned value.
7917	uint16x8_t test_vmovl_u8(uint8x8_t a) {
7918	return vmovl_u8(a);
7919	}
7920
7921	// CHECK-LABEL: @test_vmovl_u16(
7922	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
7923	// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
7924	// CHECK: ret <4 x i32> [[VMOVL_I]]
// Returns vmovl_u16(a). The FileCheck directives above expect a bitcast of
// %a to <8 x i8>, then `zext <4 x i16> %a to <4 x i32>` as the return value.
7925	uint32x4_t test_vmovl_u16(uint16x4_t a) {
7926	return vmovl_u16(a);
7927	}
7928
7929	// CHECK-LABEL: @test_vmovl_u32(
7930	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
7931	// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
7932	// CHECK: ret <2 x i64> [[VMOVL_I]]
// Returns vmovl_u32(a). The FileCheck directives above expect a bitcast of
// %a to <8 x i8>, then `zext <2 x i32> %a to <2 x i64>` as the return value.
7933	uint64x2_t test_vmovl_u32(uint32x2_t a) {
7934	return vmovl_u32(a);
7935	}
7936
7937	// CHECK-LABEL: @test_vmovn_s16(
7938	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
7939	// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
7940	// CHECK: ret <8 x i8> [[VMOVN_I]]
// Returns vmovn_s16(a). The FileCheck directives above expect a bitcast of
// %a to <16 x i8>, then `trunc <8 x i16> %a to <8 x i8>` as the return value.
7941	int8x8_t test_vmovn_s16(int16x8_t a) {
7942	return vmovn_s16(a);
7943	}
7944
7945	// CHECK-LABEL: @test_vmovn_s32(
7946	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
7947	// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
7948	// CHECK: ret <4 x i16> [[VMOVN_I]]
// Returns vmovn_s32(a). The FileCheck directives above expect a bitcast of
// %a to <16 x i8>, then `trunc <4 x i32> %a to <4 x i16>` as the return value.
7949	int16x4_t test_vmovn_s32(int32x4_t a) {
7950	return vmovn_s32(a);
7951	}
7952
7953	// CHECK-LABEL: @test_vmovn_s64(
7954	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
7955	// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
7956	// CHECK: ret <2 x i32> [[VMOVN_I]]
// Returns vmovn_s64(a). The FileCheck directives above expect a bitcast of
// %a to <16 x i8>, then `trunc <2 x i64> %a to <2 x i32>` as the return value.
7957	int32x2_t test_vmovn_s64(int64x2_t a) {
7958	return vmovn_s64(a);
7959	}
7960
7961	// CHECK-LABEL: @test_vmovn_u16(
7962	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
7963	// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
7964	// CHECK: ret <8 x i8> [[VMOVN_I]]
// Returns vmovn_u16(a). The FileCheck directives above expect a bitcast of
// %a to <16 x i8>, then `trunc <8 x i16> %a to <8 x i8>` as the return value.
7965	uint8x8_t test_vmovn_u16(uint16x8_t a) {
7966	return vmovn_u16(a);
7967	}
7968
7969	// CHECK-LABEL: @test_vmovn_u32(
7970	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
7971	// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
7972	// CHECK: ret <4 x i16> [[VMOVN_I]]
// Returns vmovn_u32(a). The FileCheck directives above expect a bitcast of
// %a to <16 x i8>, then `trunc <4 x i32> %a to <4 x i16>` as the return value.
7973	uint16x4_t test_vmovn_u32(uint32x4_t a) {
7974	return vmovn_u32(a);
7975	}
7976
7977	// CHECK-LABEL: @test_vmovn_u64(
7978	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
7979	// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
7980	// CHECK: ret <2 x i32> [[VMOVN_I]]
// Returns vmovn_u64(a). The FileCheck directives above expect a bitcast of
// %a to <16 x i8>, then `trunc <2 x i64> %a to <2 x i32>` as the return value.
7981	uint32x2_t test_vmovn_u64(uint64x2_t a) {
7982	return vmovn_u64(a);
7983	}
7984
7985	// CHECK-LABEL: @test_vmov_n_u8(
7986	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
7987	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
7988	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
7989	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
7990	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
7991	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
7992	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
7993	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
7994	// CHECK: ret <8 x i8> [[VECINIT7_I]]
// Returns vmov_n_u8(a). The FileCheck directives above expect %a inserted
// into lanes 0-7 of an <8 x i8>, with the last insert being returned.
7995	uint8x8_t test_vmov_n_u8(uint8_t a) {
7996	return vmov_n_u8(a);
7997	}
7998
7999	// CHECK-LABEL: @test_vmov_n_u16(
8000	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
8001	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
8002	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
8003	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
8004	// CHECK: ret <4 x i16> [[VECINIT3_I]]
// Returns vmov_n_u16(a). The FileCheck directives above expect %a inserted
// into lanes 0-3 of a <4 x i16>, with the last insert being returned.
8005	uint16x4_t test_vmov_n_u16(uint16_t a) {
8006	return vmov_n_u16(a);
8007	}
8008
8009	// CHECK-LABEL: @test_vmov_n_u32(
8010	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
8011	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
8012	// CHECK: ret <2 x i32> [[VECINIT1_I]]
// Returns vmov_n_u32(a). The FileCheck directives above expect %a inserted
// into lanes 0-1 of a <2 x i32>, with the last insert being returned.
8013	uint32x2_t test_vmov_n_u32(uint32_t a) {
8014	return vmov_n_u32(a);
8015	}
8016
8017	// CHECK-LABEL: @test_vmov_n_s8(
8018	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
8019	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
8020	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
8021	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
8022	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
8023	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
8024	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
8025	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
8026	// CHECK: ret <8 x i8> [[VECINIT7_I]]
// Returns vmov_n_s8(a). The FileCheck directives above expect %a inserted
// into lanes 0-7 of an <8 x i8>, with the last insert being returned.
8027	int8x8_t test_vmov_n_s8(int8_t a) {
8028	return vmov_n_s8(a);
8029	}
8030
8031	// CHECK-LABEL: @test_vmov_n_s16(
8032	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
8033	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
8034	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
8035	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
8036	// CHECK: ret <4 x i16> [[VECINIT3_I]]
// Returns vmov_n_s16(a). The FileCheck directives above expect %a inserted
// into lanes 0-3 of a <4 x i16>, with the last insert being returned.
8037	int16x4_t test_vmov_n_s16(int16_t a) {
8038	return vmov_n_s16(a);
8039	}
8040
8041	// CHECK-LABEL: @test_vmov_n_s32(
8042	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
8043	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
8044	// CHECK: ret <2 x i32> [[VECINIT1_I]]
// Returns vmov_n_s32(a). The FileCheck directives above expect %a inserted
// into lanes 0-1 of a <2 x i32>, with the last insert being returned.
8045	int32x2_t test_vmov_n_s32(int32_t a) {
8046	return vmov_n_s32(a);
8047	}
8048
8049	// CHECK-LABEL: @test_vmov_n_p8(
8050	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
8051	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
8052	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
8053	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
8054	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
8055	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
8056	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
8057	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
8058	// CHECK: ret <8 x i8> [[VECINIT7_I]]
// Returns vmov_n_p8(a). The FileCheck directives above expect %a inserted
// into lanes 0-7 of an <8 x i8>, with the last insert being returned.
8059	poly8x8_t test_vmov_n_p8(poly8_t a) {
8060	return vmov_n_p8(a);
8061	}
8062
8063	// CHECK-LABEL: @test_vmov_n_p16(
8064	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
8065	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
8066	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
8067	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
8068	// CHECK: ret <4 x i16> [[VECINIT3_I]]
8069	poly16x4_t test_vmov_n_p16(poly16_t a) {  // Codegen test for vmov_n_p16: expected IR inserts a into each of the 4 lanes of a <4 x i16>.
8070	return vmov_n_p16(a);
8071	}
8072
8073	// CHECK-LABEL: @test_vmov_n_f16(
8074	// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
8075	// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
8076	// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
8077	// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
8078	// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
8079	// CHECK: ret <4 x half> [[VECINIT3]]
8080	float16x4_t test_vmov_n_f16(float16_t *a) {  // Codegen test for vmov_n_f16: the scalar is read through pointer a; expected IR is one half load followed by inserts into all 4 lanes of a <4 x half>.
8081	return vmov_n_f16(*a);
8082	}
8083
8084	// CHECK-LABEL: @test_vmov_n_f32(
8085	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
8086	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
8087	// CHECK: ret <2 x float> [[VECINIT1_I]]
8088	float32x2_t test_vmov_n_f32(float32_t a) {  // Codegen test for vmov_n_f32: expected IR inserts a into both lanes of a <2 x float>.
8089	return vmov_n_f32(a);
8090	}
8091
8092	// CHECK-LABEL: @test_vmovq_n_u8(
8093	// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
8094	// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
8095	// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
8096	// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
8097	// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
8098	// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
8099	// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
8100	// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
8101	// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
8102	// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
8103	// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
8104	// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
8105	// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
8106	// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
8107	// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
8108	// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
8109	// CHECK: ret <16 x i8> [[VECINIT15_I]]
8110	uint8x16_t test_vmovq_n_u8(uint8_t a) {  // Codegen test for vmovq_n_u8: expected IR inserts a into each of the 16 lanes of a <16 x i8>.
8111	return vmovq_n_u8(a);
8112	}
8113
8114	// CHECK-LABEL: @test_vmovq_n_u16(
8115	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
8116	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
8117	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
8118	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
8119	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
8120	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
8121	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
8122	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
8123	// CHECK: ret <8 x i16> [[VECINIT7_I]]
8124	uint16x8_t test_vmovq_n_u16(uint16_t a) {  // Codegen test for vmovq_n_u16: expected IR inserts a into each of the 8 lanes of a <8 x i16>.
8125	return vmovq_n_u16(a);
8126	}
8127
8128	// CHECK-LABEL: @test_vmovq_n_u32(
8129	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
8130	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
8131	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
8132	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
8133	// CHECK: ret <4 x i32> [[VECINIT3_I]]
8134	uint32x4_t test_vmovq_n_u32(uint32_t a) {  // Codegen test for vmovq_n_u32: expected IR inserts a into each of the 4 lanes of a <4 x i32>.
8135	return vmovq_n_u32(a);
8136	}
8137
8138	// CHECK-LABEL: @test_vmovq_n_s8(
8139	// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
8140	// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
8141	// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
8142	// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
8143	// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
8144	// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
8145	// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
8146	// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
8147	// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
8148	// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
8149	// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
8150	// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
8151	// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
8152	// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
8153	// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
8154	// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
8155	// CHECK: ret <16 x i8> [[VECINIT15_I]]
8156	int8x16_t test_vmovq_n_s8(int8_t a) {  // Codegen test for vmovq_n_s8: expected IR inserts a into each of the 16 lanes of a <16 x i8>.
8157	return vmovq_n_s8(a);
8158	}
8159
8160	// CHECK-LABEL: @test_vmovq_n_s16(
8161	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
8162	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
8163	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
8164	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
8165	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
8166	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
8167	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
8168	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
8169	// CHECK: ret <8 x i16> [[VECINIT7_I]]
8170	int16x8_t test_vmovq_n_s16(int16_t a) {  // Codegen test for vmovq_n_s16: expected IR inserts a into each of the 8 lanes of a <8 x i16>.
8171	return vmovq_n_s16(a);
8172	}
8173
8174	// CHECK-LABEL: @test_vmovq_n_s32(
8175	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
8176	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
8177	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
8178	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
8179	// CHECK: ret <4 x i32> [[VECINIT3_I]]
8180	int32x4_t test_vmovq_n_s32(int32_t a) {  // Codegen test for vmovq_n_s32: expected IR inserts a into each of the 4 lanes of a <4 x i32>.
8181	return vmovq_n_s32(a);
8182	}
8183
8184	// CHECK-LABEL: @test_vmovq_n_p8(
8185	// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
8186	// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
8187	// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
8188	// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
8189	// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
8190	// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
8191	// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
8192	// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
8193	// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
8194	// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
8195	// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
8196	// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
8197	// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
8198	// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
8199	// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
8200	// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
8201	// CHECK: ret <16 x i8> [[VECINIT15_I]]
8202	poly8x16_t test_vmovq_n_p8(poly8_t a) {  // Codegen test for vmovq_n_p8: expected IR inserts a into each of the 16 lanes of a <16 x i8>.
8203	return vmovq_n_p8(a);
8204	}
8205
8206	// CHECK-LABEL: @test_vmovq_n_p16(
8207	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
8208	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
8209	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
8210	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
8211	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
8212	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
8213	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
8214	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
8215	// CHECK: ret <8 x i16> [[VECINIT7_I]]
8216	poly16x8_t test_vmovq_n_p16(poly16_t a) {  // Codegen test for vmovq_n_p16: expected IR inserts a into each of the 8 lanes of a <8 x i16>.
8217	return vmovq_n_p16(a);
8218	}
8219
8220	// CHECK-LABEL: @test_vmovq_n_f16(
8221	// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
8222	// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
8223	// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
8224	// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
8225	// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
8226	// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
8227	// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
8228	// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
8229	// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
8230	// CHECK: ret <8 x half> [[VECINIT7]]
8231	float16x8_t test_vmovq_n_f16(float16_t *a) {  // Codegen test for vmovq_n_f16: the scalar is read through pointer a; expected IR is one half load followed by inserts into all 8 lanes of a <8 x half>.
8232	return vmovq_n_f16(*a);
8233	}
8234
8235	// CHECK-LABEL: @test_vmovq_n_f32(
8236	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
8237	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
8238	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
8239	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
8240	// CHECK: ret <4 x float> [[VECINIT3_I]]
8241	float32x4_t test_vmovq_n_f32(float32_t a) {  // Codegen test for vmovq_n_f32: expected IR inserts a into each of the 4 lanes of a <4 x float>.
8242	return vmovq_n_f32(a);
8243	}
8244
8245	// CHECK-LABEL: @test_vmov_n_s64(
8246	// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
8247	// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
8248	// CHECK: ret <1 x i64> [[ADD_I]]
8249	int64x1_t test_vmov_n_s64(int64_t a) {  // Codegen test for vmov_n_s64: the one-lane vector built from a is then added to itself.
8250	int64x1_t tmp = vmov_n_s64(a);  // expected IR: a single insertelement into a <1 x i64>
8251	return vadd_s64(tmp, tmp);  // expected IR: add of that <1 x i64> value with itself
8252	}
8253
8254	// CHECK-LABEL: @test_vmov_n_u64(
8255	// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
8256	// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
8257	// CHECK: ret <1 x i64> [[ADD_I]]
8258	uint64x1_t test_vmov_n_u64(uint64_t a) {  // Codegen test for vmov_n_u64: the one-lane vector built from a is then added to itself.
8259	uint64x1_t tmp = vmov_n_u64(a);  // expected IR: a single insertelement into a <1 x i64>
8260	return vadd_u64(tmp, tmp);  // expected IR: add of that <1 x i64> value with itself
8261	}
8262
8263	// CHECK-LABEL: @test_vmovq_n_s64(
8264	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
8265	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
8266	// CHECK: ret <2 x i64> [[VECINIT1_I]]
8267	int64x2_t test_vmovq_n_s64(int64_t a) {  // Codegen test for vmovq_n_s64: expected IR inserts a into both lanes of a <2 x i64>.
8268	return vmovq_n_s64(a);
8269	}
8270
8271	// CHECK-LABEL: @test_vmovq_n_u64(
8272	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
8273	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
8274	// CHECK: ret <2 x i64> [[VECINIT1_I]]
8275	uint64x2_t test_vmovq_n_u64(uint64_t a) {  // Codegen test for vmovq_n_u64: expected IR inserts a into both lanes of a <2 x i64>.
8276	return vmovq_n_u64(a);
8277	}
8278
8279	// CHECK-LABEL: @test_vmul_s8(
8280	// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
8281	// CHECK: ret <8 x i8> [[MUL_I]]
8282	int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {  // Codegen test for vmul_s8: expected IR is a plain mul of a and b on <8 x i8>.
8283	return vmul_s8(a, b);
8284	}
8285
8286	// CHECK-LABEL: @test_vmul_s16(
8287	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
8288	// CHECK: ret <4 x i16> [[MUL_I]]
8289	int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {  // Codegen test for vmul_s16: expected IR is a plain mul of a and b on <4 x i16>.
8290	return vmul_s16(a, b);
8291	}
8292
8293	// CHECK-LABEL: @test_vmul_s32(
8294	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
8295	// CHECK: ret <2 x i32> [[MUL_I]]
8296	int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {  // Codegen test for vmul_s32: expected IR is a plain mul of a and b on <2 x i32>.
8297	return vmul_s32(a, b);
8298	}
8299
8300	// CHECK-LABEL: @test_vmul_f32(
8301	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b
8302	// CHECK: ret <2 x float> [[MUL_I]]
8303	float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {  // Codegen test for vmul_f32: expected IR is an fmul of a and b on <2 x float>.
8304	return vmul_f32(a, b);
8305	}
8306
8307	// CHECK-LABEL: @test_vmul_u8(
8308	// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
8309	// CHECK: ret <8 x i8> [[MUL_I]]
8310	uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {  // Codegen test for vmul_u8: expected IR is a plain mul of a and b on <8 x i8>.
8311	return vmul_u8(a, b);
8312	}
8313
8314	// CHECK-LABEL: @test_vmul_u16(
8315	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
8316	// CHECK: ret <4 x i16> [[MUL_I]]
8317	uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {  // Codegen test for vmul_u16: expected IR is a plain mul of a and b on <4 x i16>.
8318	return vmul_u16(a, b);
8319	}
8320
8321	// CHECK-LABEL: @test_vmul_u32(
8322	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
8323	// CHECK: ret <2 x i32> [[MUL_I]]
8324	uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {  // Codegen test for vmul_u32: expected IR is a plain mul of a and b on <2 x i32>.
8325	return vmul_u32(a, b);
8326	}
8327
8328	// CHECK-LABEL: @test_vmulq_s8(
8329	// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
8330	// CHECK: ret <16 x i8> [[MUL_I]]
8331	int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {  // Codegen test for vmulq_s8: expected IR is a plain mul of a and b on <16 x i8>.
8332	return vmulq_s8(a, b);
8333	}
8334
8335	// CHECK-LABEL: @test_vmulq_s16(
8336	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
8337	// CHECK: ret <8 x i16> [[MUL_I]]
8338	int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {  // Codegen test for vmulq_s16: expected IR is a plain mul of a and b on <8 x i16>.
8339	return vmulq_s16(a, b);
8340	}
8341
8342	// CHECK-LABEL: @test_vmulq_s32(
8343	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
8344	// CHECK: ret <4 x i32> [[MUL_I]]
8345	int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {  // Codegen test for vmulq_s32: expected IR is a plain mul of a and b on <4 x i32>.
8346	return vmulq_s32(a, b);
8347	}
8348
8349	// CHECK-LABEL: @test_vmulq_f32(
8350	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b
8351	// CHECK: ret <4 x float> [[MUL_I]]
8352	float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {  // Codegen test for vmulq_f32: expected IR is an fmul of a and b on <4 x float>.
8353	return vmulq_f32(a, b);
8354	}
8355
8356	// CHECK-LABEL: @test_vmulq_u8(
8357	// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
8358	// CHECK: ret <16 x i8> [[MUL_I]]
8359	uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {  // Codegen test for vmulq_u8: expected IR is a plain mul of a and b on <16 x i8>.
8360	return vmulq_u8(a, b);
8361	}
8362
8363	// CHECK-LABEL: @test_vmulq_u16(
8364	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
8365	// CHECK: ret <8 x i16> [[MUL_I]]
8366	uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {  // Codegen test for vmulq_u16: expected IR is a plain mul of a and b on <8 x i16>.
8367	return vmulq_u16(a, b);
8368	}
8369
8370	// CHECK-LABEL: @test_vmulq_u32(
8371	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
8372	// CHECK: ret <4 x i32> [[MUL_I]]
8373	uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {  // Codegen test for vmulq_u32: expected IR is a plain mul of a and b on <4 x i32>.
8374	return vmulq_u32(a, b);
8375	}
8376
8377	// CHECK-LABEL: @test_vmull_s8(
8378	// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
8379	// CHECK: ret <8 x i16> [[VMULL_I]]
8380	int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {  // Codegen test for vmull_s8: expected IR calls @llvm.arm.neon.vmulls.v8i16 on the two <8 x i8> operands.
8381	return vmull_s8(a, b);
8382	}
8383
8384	// CHECK-LABEL: @test_vmull_s16(
8385	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8386	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8387	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
8388	// CHECK: ret <4 x i32> [[VMULL2_I]]
8389	int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {  // Codegen test for vmull_s16: expected IR calls @llvm.arm.neon.vmulls.v4i32 on the two <4 x i16> operands.
8390	return vmull_s16(a, b);
8391	}
8392
8393	// CHECK-LABEL: @test_vmull_s32(
8394	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8395	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8396	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
8397	// CHECK: ret <2 x i64> [[VMULL2_I]]
8398	int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {  // Codegen test for vmull_s32: expected IR calls @llvm.arm.neon.vmulls.v2i64 on the two <2 x i32> operands.
8399	return vmull_s32(a, b);
8400	}
8401
8402	// CHECK-LABEL: @test_vmull_u8(
8403	// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
8404	// CHECK: ret <8 x i16> [[VMULL_I]]
8405	uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {  // Codegen test for vmull_u8: expected IR calls @llvm.arm.neon.vmullu.v8i16 on the two <8 x i8> operands.
8406	return vmull_u8(a, b);
8407	}
8408
8409	// CHECK-LABEL: @test_vmull_u16(
8410	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8411	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8412	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
8413	// CHECK: ret <4 x i32> [[VMULL2_I]]
8414	uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {  // Codegen test for vmull_u16: expected IR calls @llvm.arm.neon.vmullu.v4i32 on the two <4 x i16> operands.
8415	return vmull_u16(a, b);
8416	}
8417
8418	// CHECK-LABEL: @test_vmull_u32(
8419	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8420	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8421	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
8422	// CHECK: ret <2 x i64> [[VMULL2_I]]
8423	uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {  // Codegen test for vmull_u32: expected IR calls @llvm.arm.neon.vmullu.v2i64 on the two <2 x i32> operands.
8424	return vmull_u32(a, b);
8425	}
8426
8427	// CHECK-LABEL: @test_vmull_p8(
8428	// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
8429	// CHECK: ret <8 x i16> [[VMULL_I]]
8430	poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {  // Codegen test for vmull_p8: expected IR calls @llvm.arm.neon.vmullp.v8i16 on the two <8 x i8> operands.
8431	return vmull_p8(a, b);
8432	}
8433
8434	// CHECK-LABEL: @test_vmull_lane_s16(
8435	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
8436	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8437	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
8438	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
8439	// CHECK: ret <4 x i32> [[VMULL2_I]]
8440	int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {  // Codegen test for vmull_lane_s16: expected IR shuffles lane 3 of b into every lane, then calls @llvm.arm.neon.vmulls.v4i32 with a.
8441	return vmull_lane_s16(a, b, 3);
8442	}
8443
8444	// CHECK-LABEL: @test_vmull_lane_s32(
8445	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
8446	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8447	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
8448	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
8449	// CHECK: ret <2 x i64> [[VMULL2_I]]
8450	int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {  // Codegen test for vmull_lane_s32: expected IR shuffles lane 1 of b into every lane, then calls @llvm.arm.neon.vmulls.v2i64 with a.
8451	return vmull_lane_s32(a, b, 1);
8452	}
8453
8454	// CHECK-LABEL: @test_vmull_lane_u16(
8455	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
8456	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8457	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
8458	// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
8459	// CHECK: ret <4 x i32> [[VMULL2_I]]
8460	uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {  // Codegen test for vmull_lane_u16: expected IR shuffles lane 3 of b into every lane, then calls @llvm.arm.neon.vmullu.v4i32 with a.
8461	return vmull_lane_u16(a, b, 3);
8462	}
8463
8464	// CHECK-LABEL: @test_vmull_lane_u32(
8465	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
8466	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8467	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
8468	// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
8469	// CHECK: ret <2 x i64> [[VMULL2_I]]
8470	uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {  // Codegen test for vmull_lane_u32: expected IR shuffles lane 1 of b into every lane, then calls @llvm.arm.neon.vmullu.v2i64 with a.
8471	return vmull_lane_u32(a, b, 1);
8472	}
8473
8474	// CHECK-LABEL: @test_vmull_n_s16(
8475	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8476	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
8477	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
8478	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
8479	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
8480	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
8481	// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
8482	// CHECK: ret <4 x i32> [[VMULL5_I]]
8483	int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {  // Codegen test for vmull_n_s16: expected IR inserts b into all 4 lanes of a <4 x i16>, then calls @llvm.arm.neon.vmulls.v4i32 with a.
8484	return vmull_n_s16(a, b);
8485	}
8486
8487	// CHECK-LABEL: @test_vmull_n_s32(
8488	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8489	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
8490	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
8491	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
8492	// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
8493	// CHECK: ret <2 x i64> [[VMULL3_I]]
8494	int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {  // Codegen test for vmull_n_s32: expected IR inserts b into both lanes of a <2 x i32>, then calls @llvm.arm.neon.vmulls.v2i64 with a.
8495	return vmull_n_s32(a, b);
8496	}
8497
8498	// CHECK-LABEL: @test_vmull_n_u16(
8499	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8500	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
8501	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
8502	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
8503	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
8504	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
8505	// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
8506	// CHECK: ret <4 x i32> [[VMULL5_I]]
8507	uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {  // Codegen test for vmull_n_u16: expected IR inserts b into all 4 lanes of a <4 x i16>, then calls @llvm.arm.neon.vmullu.v4i32 with a.
8508	return vmull_n_u16(a, b);
8509	}
8510
8511	// CHECK-LABEL: @test_vmull_n_u32(
8512	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8513	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
8514	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
8515	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
8516	// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
8517	// CHECK: ret <2 x i64> [[VMULL3_I]]
8518	uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {  // Codegen test for vmull_n_u32: expected IR inserts b into both lanes of a <2 x i32>, then calls @llvm.arm.neon.vmullu.v2i64 with a.
8519	return vmull_n_u32(a, b);
8520	}
8521
8522	// CHECK-LABEL: @test_vmul_p8(
8523	// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b)
8524	// CHECK: ret <8 x i8> [[VMUL_V_I]]
8525	poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {  // Codegen test for vmul_p8: expected IR calls @llvm.arm.neon.vmulp.v8i8 rather than emitting a plain mul.
8526	return vmul_p8(a, b);
8527	}
8528
8529	// CHECK-LABEL: @test_vmulq_p8(
8530	// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b)
8531	// CHECK: ret <16 x i8> [[VMULQ_V_I]]
8532	poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {  // Codegen test for vmulq_p8: expected IR calls @llvm.arm.neon.vmulp.v16i8 rather than emitting a plain mul.
8533	return vmulq_p8(a, b);
8534	}
8535
8536	// CHECK-LABEL: @test_vmul_lane_s16(
8537	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
8538	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
8539	// CHECK: ret <4 x i16> [[MUL]]
8540	int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {  // Codegen test for vmul_lane_s16: expected IR shuffles lane 3 of b into every lane, then a mul with a on <4 x i16>.
8541	return vmul_lane_s16(a, b, 3);
8542	}
8543
8544	// CHECK-LABEL: @test_vmul_lane_s32(
8545	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
8546	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
8547	// CHECK: ret <2 x i32> [[MUL]]
8548	int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {  // Codegen test for vmul_lane_s32: expected IR shuffles lane 1 of b into every lane, then a mul with a on <2 x i32>.
8549	return vmul_lane_s32(a, b, 1);
8550	}
8551
8552	// CHECK-LABEL: @test_vmul_lane_f32(
8553	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
8554	// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
8555	// CHECK: ret <2 x float> [[MUL]]
8556	float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {  // Codegen test for vmul_lane_f32: expected IR shuffles lane 1 of b into every lane, then an fmul with a on <2 x float>.
8557	return vmul_lane_f32(a, b, 1);
8558	}
8559
8560	// CHECK-LABEL: @test_vmul_lane_u16(
8561	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
8562	// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
8563	// CHECK: ret <4 x i16> [[MUL]]
8564	uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {  // Codegen test for vmul_lane_u16: expected IR shuffles lane 3 of b into every lane, then a mul with a on <4 x i16>.
8565	return vmul_lane_u16(a, b, 3);
8566	}
8567
8568	// CHECK-LABEL: @test_vmul_lane_u32(
8569	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
8570	// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
8571	// CHECK: ret <2 x i32> [[MUL]]
8572	uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {  // Codegen test for vmul_lane_u32: expected IR shuffles lane 1 of b into every lane, then a mul with a on <2 x i32>.
8573	return vmul_lane_u32(a, b, 1);
8574	}
8575
8576	// CHECK-LABEL: @test_vmulq_lane_s16(
8577	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
8578	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
8579	// CHECK: ret <8 x i16> [[MUL]]
8580	int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {  // Codegen test for vmulq_lane_s16: expected IR widens lane 3 of the <4 x i16> b to 8 lanes via shufflevector, then a mul with a on <8 x i16>.
8581	return vmulq_lane_s16(a, b, 3);
8582	}
8583
8584	// CHECK-LABEL: @test_vmulq_lane_s32(
8585	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
8586	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
8587	// CHECK: ret <4 x i32> [[MUL]]
8588	int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {  // Codegen test for vmulq_lane_s32: expected IR widens lane 1 of the <2 x i32> b to 4 lanes via shufflevector, then a mul with a on <4 x i32>.
8589	return vmulq_lane_s32(a, b, 1);
8590	}
8591
8592	// CHECK-LABEL: @test_vmulq_lane_f32(
8593	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
8594	// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
8595	// CHECK: ret <4 x float> [[MUL]]
8596	float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {  // Codegen test for vmulq_lane_f32: expected IR widens lane 1 of the <2 x float> b to 4 lanes via shufflevector, then an fmul with a on <4 x float>.
8597	return vmulq_lane_f32(a, b, 1);
8598	}
8599
8600	// CHECK-LABEL: @test_vmulq_lane_u16(
8601	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
8602	// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
8603	// CHECK: ret <8 x i16> [[MUL]]
8604	uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {  // Codegen test for vmulq_lane_u16: expected IR widens lane 3 of the <4 x i16> b to 8 lanes via shufflevector, then a mul with a on <8 x i16>.
8605	return vmulq_lane_u16(a, b, 3);
8606	}
8607
8608	// CHECK-LABEL: @test_vmulq_lane_u32(
8609	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
8610	// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
8611	// CHECK: ret <4 x i32> [[MUL]]
8612	uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {  // Codegen test for vmulq_lane_u32: expected IR widens lane 1 of the <2 x i32> b to 4 lanes via shufflevector, then a mul with a on <4 x i32>.
8613	return vmulq_lane_u32(a, b, 1);
8614	}
8615
8616	// CHECK-LABEL: @test_vmul_n_s16(
8617	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
8618	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
8619	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
8620	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
8621	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
8622	// CHECK: ret <4 x i16> [[MUL_I]]
8623	int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {  // Codegen test for vmul_n_s16: expected IR inserts b into all 4 lanes of a <4 x i16>, then a mul with a.
8624	return vmul_n_s16(a, b);
8625	}
8626
8627	// CHECK-LABEL: @test_vmul_n_s32(
8628	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
8629	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
8630	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
8631	// CHECK: ret <2 x i32> [[MUL_I]]
8632	int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {  // Codegen test for vmul_n_s32: expected IR inserts b into both lanes of a <2 x i32>, then a mul with a.
8633	return vmul_n_s32(a, b);
8634	}
8635
8636	// CHECK-LABEL: @test_vmul_n_f32(
8637	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
8638	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
8639	// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
8640	// CHECK: ret <2 x float> [[MUL_I]]
8641	float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {  // Codegen test for vmul_n_f32: expected IR inserts b into both lanes of a <2 x float>, then an fmul with a.
8642	return vmul_n_f32(a, b);
8643	}
8644
8645	// CHECK-LABEL: @test_vmul_n_u16(
8646	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
8647	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
8648	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
8649	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
8650	// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
8651	// CHECK: ret <4 x i16> [[MUL_I]]
8652	uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {  // Codegen test for vmul_n_u16: expected IR inserts b into all 4 lanes of a <4 x i16>, then a mul with a.
8653	return vmul_n_u16(a, b);
8654	}
8655
8656	// CHECK-LABEL: @test_vmul_n_u32(
8657	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
8658	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
8659	// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
8660	// CHECK: ret <2 x i32> [[MUL_I]]
8661	uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {  // Codegen test for vmul_n_u32: expected IR inserts b into both lanes of a <2 x i32>, then a mul with a.
8662	return vmul_n_u32(a, b);
8663	}
8664
8665	// CHECK-LABEL: @test_vmulq_n_s16(
8666	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
8667	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
8668	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
8669	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
8670	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
8671	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
8672	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
8673	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
8674	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
8675	// CHECK: ret <8 x i16> [[MUL_I]]
8676	int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {  // Codegen test for vmulq_n_s16: expected IR inserts b into all 8 lanes of a <8 x i16>, then a mul with a.
8677	return vmulq_n_s16(a, b);
8678	}
8679
8680	// CHECK-LABEL: @test_vmulq_n_s32(
8681	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
8682	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
8683	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
8684	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
8685	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
8686	// CHECK: ret <4 x i32> [[MUL_I]]
8687	int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {  // Codegen test for vmulq_n_s32: expected IR inserts b into all 4 lanes of a <4 x i32>, then a mul with a.
8688	return vmulq_n_s32(a, b);
8689	}
8690
8691	// CHECK-LABEL: @test_vmulq_n_f32(
8692	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
8693	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
8694	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
8695	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
8696	// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
8697	// CHECK: ret <4 x float> [[MUL_I]]
8698	float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {  // Codegen test for vmulq_n_f32: expected IR inserts b into all 4 lanes of a <4 x float>, then an fmul with a.
8699	return vmulq_n_f32(a, b);
8700	}
8701
8702	// CHECK-LABEL: @test_vmulq_n_u16(
8703	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
8704	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
8705	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
8706	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
8707	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
8708	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
8709	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
8710	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
8711	// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
8712	// CHECK: ret <8 x i16> [[MUL_I]]
// Test wrapper: forwards a and b to vmulq_n_u16 and returns its result.
// The directives above expect %b inserted into lanes 0-7 of an <8 x i16>
// (starting from undef) and a `mul <8 x i16>` of %a with that vector.
8713	uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
8714	return vmulq_n_u16(a, b);
8715	}
8716
8717	// CHECK-LABEL: @test_vmulq_n_u32(
8718	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
8719	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
8720	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
8721	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
8722	// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
8723	// CHECK: ret <4 x i32> [[MUL_I]]
// Test wrapper: forwards a and b to vmulq_n_u32 and returns its result.
// The directives above expect %b inserted into lanes 0-3 of a <4 x i32>
// (starting from undef) and a `mul <4 x i32>` of %a with that vector.
8724	uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
8725	return vmulq_n_u32(a, b);
8726	}
8727
8728	// CHECK-LABEL: @test_vmvn_s8(
8729	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8730	// CHECK: ret <8 x i8> [[NEG_I]]
// Test wrapper: returns vmvn_s8(a).
// The directives above expect an `xor` of %a with an all -1 <8 x i8> constant.
8731	int8x8_t test_vmvn_s8(int8x8_t a) {
8732	return vmvn_s8(a);
8733	}
8734
8735	// CHECK-LABEL: @test_vmvn_s16(
8736	// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
8737	// CHECK: ret <4 x i16> [[NEG_I]]
// Test wrapper: returns vmvn_s16(a).
// The directives above expect an `xor` of %a with an all -1 <4 x i16> constant.
8738	int16x4_t test_vmvn_s16(int16x4_t a) {
8739	return vmvn_s16(a);
8740	}
8741
8742	// CHECK-LABEL: @test_vmvn_s32(
8743	// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
8744	// CHECK: ret <2 x i32> [[NEG_I]]
// Test wrapper: returns vmvn_s32(a).
// The directives above expect an `xor` of %a with an all -1 <2 x i32> constant.
8745	int32x2_t test_vmvn_s32(int32x2_t a) {
8746	return vmvn_s32(a);
8747	}
8748
8749	// CHECK-LABEL: @test_vmvn_u8(
8750	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8751	// CHECK: ret <8 x i8> [[NEG_I]]
// Test wrapper: returns vmvn_u8(a).
// The directives above expect an `xor` of %a with an all -1 <8 x i8> constant.
8752	uint8x8_t test_vmvn_u8(uint8x8_t a) {
8753	return vmvn_u8(a);
8754	}
8755
8756	// CHECK-LABEL: @test_vmvn_u16(
8757	// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
8758	// CHECK: ret <4 x i16> [[NEG_I]]
// Test wrapper: returns vmvn_u16(a).
// The directives above expect an `xor` of %a with an all -1 <4 x i16> constant.
8759	uint16x4_t test_vmvn_u16(uint16x4_t a) {
8760	return vmvn_u16(a);
8761	}
8762
8763	// CHECK-LABEL: @test_vmvn_u32(
8764	// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
8765	// CHECK: ret <2 x i32> [[NEG_I]]
// Test wrapper: returns vmvn_u32(a).
// The directives above expect an `xor` of %a with an all -1 <2 x i32> constant.
8766	uint32x2_t test_vmvn_u32(uint32x2_t a) {
8767	return vmvn_u32(a);
8768	}
8769
8770	// CHECK-LABEL: @test_vmvn_p8(
8771	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8772	// CHECK: ret <8 x i8> [[NEG_I]]
// Test wrapper: returns vmvn_p8(a).
// The directives above expect an `xor` of %a with an all -1 <8 x i8> constant.
8773	poly8x8_t test_vmvn_p8(poly8x8_t a) {
8774	return vmvn_p8(a);
8775	}
8776
8777	// CHECK-LABEL: @test_vmvnq_s8(
8778	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8779	// CHECK: ret <16 x i8> [[NEG_I]]
// Test wrapper: returns vmvnq_s8(a).
// The directives above expect an `xor` of %a with an all -1 <16 x i8> constant.
8780	int8x16_t test_vmvnq_s8(int8x16_t a) {
8781	return vmvnq_s8(a);
8782	}
8783
8784	// CHECK-LABEL: @test_vmvnq_s16(
8785	// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
8786	// CHECK: ret <8 x i16> [[NEG_I]]
// Test wrapper: returns vmvnq_s16(a).
// The directives above expect an `xor` of %a with an all -1 <8 x i16> constant.
8787	int16x8_t test_vmvnq_s16(int16x8_t a) {
8788	return vmvnq_s16(a);
8789	}
8790
8791	// CHECK-LABEL: @test_vmvnq_s32(
8792	// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
8793	// CHECK: ret <4 x i32> [[NEG_I]]
// Test wrapper: returns vmvnq_s32(a).
// The directives above expect an `xor` of %a with an all -1 <4 x i32> constant.
8794	int32x4_t test_vmvnq_s32(int32x4_t a) {
8795	return vmvnq_s32(a);
8796	}
8797
8798	// CHECK-LABEL: @test_vmvnq_u8(
8799	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8800	// CHECK: ret <16 x i8> [[NEG_I]]
// Test wrapper: returns vmvnq_u8(a).
// The directives above expect an `xor` of %a with an all -1 <16 x i8> constant.
8801	uint8x16_t test_vmvnq_u8(uint8x16_t a) {
8802	return vmvnq_u8(a);
8803	}
8804
8805	// CHECK-LABEL: @test_vmvnq_u16(
8806	// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
8807	// CHECK: ret <8 x i16> [[NEG_I]]
// Test wrapper: returns vmvnq_u16(a).
// The directives above expect an `xor` of %a with an all -1 <8 x i16> constant.
8808	uint16x8_t test_vmvnq_u16(uint16x8_t a) {
8809	return vmvnq_u16(a);
8810	}
8811
8812	// CHECK-LABEL: @test_vmvnq_u32(
8813	// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
8814	// CHECK: ret <4 x i32> [[NEG_I]]
// Test wrapper: returns vmvnq_u32(a).
// The directives above expect an `xor` of %a with an all -1 <4 x i32> constant.
8815	uint32x4_t test_vmvnq_u32(uint32x4_t a) {
8816	return vmvnq_u32(a);
8817	}
8818
8819	// CHECK-LABEL: @test_vmvnq_p8(
8820	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8821	// CHECK: ret <16 x i8> [[NEG_I]]
// Test wrapper: returns vmvnq_p8(a).
// The directives above expect an `xor` of %a with an all -1 <16 x i8> constant.
8822	poly8x16_t test_vmvnq_p8(poly8x16_t a) {
8823	return vmvnq_p8(a);
8824	}
8825
8826	// CHECK-LABEL: @test_vneg_s8(
8827	// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
8828	// CHECK: ret <8 x i8> [[SUB_I]]
// Test wrapper: returns vneg_s8(a).
// The directives above expect `sub <8 x i8> zeroinitializer, %a`.
8829	int8x8_t test_vneg_s8(int8x8_t a) {
8830	return vneg_s8(a);
8831	}
8832
8833	// CHECK-LABEL: @test_vneg_s16(
8834	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
8835	// CHECK: ret <4 x i16> [[SUB_I]]
// Test wrapper: returns vneg_s16(a).
// The directives above expect `sub <4 x i16> zeroinitializer, %a`.
8836	int16x4_t test_vneg_s16(int16x4_t a) {
8837	return vneg_s16(a);
8838	}
8839
8840	// CHECK-LABEL: @test_vneg_s32(
8841	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
8842	// CHECK: ret <2 x i32> [[SUB_I]]
// Test wrapper: returns vneg_s32(a).
// The directives above expect `sub <2 x i32> zeroinitializer, %a`.
8843	int32x2_t test_vneg_s32(int32x2_t a) {
8844	return vneg_s32(a);
8845	}
8846
8847	// CHECK-LABEL: @test_vneg_f32(
8848	// CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
8849	// CHECK: ret <2 x float> [[SUB_I]]
// Test wrapper: returns vneg_f32(a).
// The directives above expect an `fsub` of %a from a <2 x float> vector of
// -0.0 lanes (not from zeroinitializer as in the integer variants).
8850	float32x2_t test_vneg_f32(float32x2_t a) {
8851	return vneg_f32(a);
8852	}
8853
8854	// CHECK-LABEL: @test_vnegq_s8(
8855	// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
8856	// CHECK: ret <16 x i8> [[SUB_I]]
// Test wrapper: returns vnegq_s8(a).
// The directives above expect `sub <16 x i8> zeroinitializer, %a`.
8857	int8x16_t test_vnegq_s8(int8x16_t a) {
8858	return vnegq_s8(a);
8859	}
8860
8861	// CHECK-LABEL: @test_vnegq_s16(
8862	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
8863	// CHECK: ret <8 x i16> [[SUB_I]]
// Test wrapper: returns vnegq_s16(a).
// The directives above expect `sub <8 x i16> zeroinitializer, %a`.
8864	int16x8_t test_vnegq_s16(int16x8_t a) {
8865	return vnegq_s16(a);
8866	}
8867
8868	// CHECK-LABEL: @test_vnegq_s32(
8869	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
8870	// CHECK: ret <4 x i32> [[SUB_I]]
// Test wrapper: returns vnegq_s32(a).
// The directives above expect `sub <4 x i32> zeroinitializer, %a`.
8871	int32x4_t test_vnegq_s32(int32x4_t a) {
8872	return vnegq_s32(a);
8873	}
8874
8875	// CHECK-LABEL: @test_vnegq_f32(
8876	// CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
8877	// CHECK: ret <4 x float> [[SUB_I]]
// Test wrapper: returns vnegq_f32(a).
// The directives above expect an `fsub` of %a from a <4 x float> vector of
// -0.0 lanes (not from zeroinitializer as in the integer variants).
8878	float32x4_t test_vnegq_f32(float32x4_t a) {
8879	return vnegq_f32(a);
8880	}
8881
8882	// CHECK-LABEL: @test_vorn_s8(
8883	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8884	// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
8885	// CHECK: ret <8 x i8> [[OR_I]]
// Test wrapper: returns vorn_s8(a, b).
// The directives above expect %b xor'ed with an all -1 <8 x i8> constant,
// then that value `or`-ed with %a.
8886	int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
8887	return vorn_s8(a, b);
8888	}
8889
8890	// CHECK-LABEL: @test_vorn_s16(
8891	// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
8892	// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
8893	// CHECK: ret <4 x i16> [[OR_I]]
// Test wrapper: returns vorn_s16(a, b).
// The directives above expect %b xor'ed with an all -1 <4 x i16> constant,
// then that value `or`-ed with %a.
8894	int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
8895	return vorn_s16(a, b);
8896	}
8897
8898	// CHECK-LABEL: @test_vorn_s32(
8899	// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
8900	// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
8901	// CHECK: ret <2 x i32> [[OR_I]]
// Test wrapper: returns vorn_s32(a, b).
// The directives above expect %b xor'ed with an all -1 <2 x i32> constant,
// then that value `or`-ed with %a.
8902	int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
8903	return vorn_s32(a, b);
8904	}
8905
8906	// CHECK-LABEL: @test_vorn_s64(
8907	// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
8908	// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
8909	// CHECK: ret <1 x i64> [[OR_I]]
// Test wrapper: returns vorn_s64(a, b).
// The directives above expect %b xor'ed with a <1 x i64> constant of -1,
// then that value `or`-ed with %a.
8910	int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
8911	return vorn_s64(a, b);
8912	}
8913
8914	// CHECK-LABEL: @test_vorn_u8(
8915	// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8916	// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
8917	// CHECK: ret <8 x i8> [[OR_I]]
// Test wrapper: returns vorn_u8(a, b).
// The directives above expect %b xor'ed with an all -1 <8 x i8> constant,
// then that value `or`-ed with %a.
8918	uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
8919	return vorn_u8(a, b);
8920	}
8921
8922	// CHECK-LABEL: @test_vorn_u16(
8923	// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
8924	// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
8925	// CHECK: ret <4 x i16> [[OR_I]]
// Test wrapper: returns vorn_u16(a, b).
// The directives above expect %b xor'ed with an all -1 <4 x i16> constant,
// then that value `or`-ed with %a.
8926	uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
8927	return vorn_u16(a, b);
8928	}
8929
8930	// CHECK-LABEL: @test_vorn_u32(
8931	// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
8932	// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
8933	// CHECK: ret <2 x i32> [[OR_I]]
// Test wrapper: returns vorn_u32(a, b).
// The directives above expect %b xor'ed with an all -1 <2 x i32> constant,
// then that value `or`-ed with %a.
8934	uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
8935	return vorn_u32(a, b);
8936	}
8937
8938	// CHECK-LABEL: @test_vorn_u64(
8939	// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
8940	// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
8941	// CHECK: ret <1 x i64> [[OR_I]]
// Test wrapper: returns vorn_u64(a, b).
// The directives above expect %b xor'ed with a <1 x i64> constant of -1,
// then that value `or`-ed with %a.
8942	uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
8943	return vorn_u64(a, b);
8944	}
8945
8946	// CHECK-LABEL: @test_vornq_s8(
8947	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8948	// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
8949	// CHECK: ret <16 x i8> [[OR_I]]
// Test wrapper: returns vornq_s8(a, b).
// The directives above expect %b xor'ed with an all -1 <16 x i8> constant,
// then that value `or`-ed with %a.
8950	int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
8951	return vornq_s8(a, b);
8952	}
8953
8954	// CHECK-LABEL: @test_vornq_s16(
8955	// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
8956	// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
8957	// CHECK: ret <8 x i16> [[OR_I]]
// Test wrapper: returns vornq_s16(a, b).
// The directives above expect %b xor'ed with an all -1 <8 x i16> constant,
// then that value `or`-ed with %a.
8958	int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
8959	return vornq_s16(a, b);
8960	}
8961
8962	// CHECK-LABEL: @test_vornq_s32(
8963	// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
8964	// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
8965	// CHECK: ret <4 x i32> [[OR_I]]
// Test wrapper: returns vornq_s32(a, b).
// The directives above expect %b xor'ed with an all -1 <4 x i32> constant,
// then that value `or`-ed with %a.
8966	int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
8967	return vornq_s32(a, b);
8968	}
8969
8970	// CHECK-LABEL: @test_vornq_s64(
8971	// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
8972	// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
8973	// CHECK: ret <2 x i64> [[OR_I]]
// Test wrapper: returns vornq_s64(a, b).
// The directives above expect %b xor'ed with an all -1 <2 x i64> constant,
// then that value `or`-ed with %a.
8974	int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
8975	return vornq_s64(a, b);
8976	}
8977
8978	// CHECK-LABEL: @test_vornq_u8(
8979	// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
8980	// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
8981	// CHECK: ret <16 x i8> [[OR_I]]
// Test wrapper: returns vornq_u8(a, b).
// The directives above expect %b xor'ed with an all -1 <16 x i8> constant,
// then that value `or`-ed with %a.
8982	uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
8983	return vornq_u8(a, b);
8984	}
8985
8986	// CHECK-LABEL: @test_vornq_u16(
8987	// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
8988	// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
8989	// CHECK: ret <8 x i16> [[OR_I]]
// Test wrapper: returns vornq_u16(a, b).
// The directives above expect %b xor'ed with an all -1 <8 x i16> constant,
// then that value `or`-ed with %a.
8990	uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
8991	return vornq_u16(a, b);
8992	}
8993
8994	// CHECK-LABEL: @test_vornq_u32(
8995	// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
8996	// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
8997	// CHECK: ret <4 x i32> [[OR_I]]
// Test wrapper: returns vornq_u32(a, b).
// The directives above expect %b xor'ed with an all -1 <4 x i32> constant,
// then that value `or`-ed with %a.
8998	uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
8999	return vornq_u32(a, b);
9000	}
9001
9002	// CHECK-LABEL: @test_vornq_u64(
9003	// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
9004	// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
9005	// CHECK: ret <2 x i64> [[OR_I]]
// Test wrapper: returns vornq_u64(a, b).
// The directives above expect %b xor'ed with an all -1 <2 x i64> constant,
// then that value `or`-ed with %a.
9006	uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
9007	return vornq_u64(a, b);
9008	}
9009
9010	// CHECK-LABEL: @test_vorr_s8(
9011	// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
9012	// CHECK: ret <8 x i8> [[OR_I]]
// Test wrapper: returns vorr_s8(a, b).
// The directives above expect a single `or <8 x i8> %a, %b`.
9013	int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
9014	return vorr_s8(a, b);
9015	}
9016
9017	// CHECK-LABEL: @test_vorr_s16(
9018	// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
9019	// CHECK: ret <4 x i16> [[OR_I]]
// Test wrapper: returns vorr_s16(a, b).
// The directives above expect a single `or <4 x i16> %a, %b`.
9020	int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
9021	return vorr_s16(a, b);
9022	}
9023
9024	// CHECK-LABEL: @test_vorr_s32(
9025	// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
9026	// CHECK: ret <2 x i32> [[OR_I]]
// Test wrapper: returns vorr_s32(a, b).
// The directives above expect a single `or <2 x i32> %a, %b`.
9027	int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
9028	return vorr_s32(a, b);
9029	}
9030
9031	// CHECK-LABEL: @test_vorr_s64(
9032	// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
9033	// CHECK: ret <1 x i64> [[OR_I]]
// Test wrapper: returns vorr_s64(a, b).
// The directives above expect a single `or <1 x i64> %a, %b`.
9034	int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
9035	return vorr_s64(a, b);
9036	}
9037
9038	// CHECK-LABEL: @test_vorr_u8(
9039	// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
9040	// CHECK: ret <8 x i8> [[OR_I]]
// Test wrapper: returns vorr_u8(a, b).
// The directives above expect a single `or <8 x i8> %a, %b`.
9041	uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
9042	return vorr_u8(a, b);
9043	}
9044
9045	// CHECK-LABEL: @test_vorr_u16(
9046	// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
9047	// CHECK: ret <4 x i16> [[OR_I]]
// Test wrapper: returns vorr_u16(a, b).
// The directives above expect a single `or <4 x i16> %a, %b`.
9048	uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
9049	return vorr_u16(a, b);
9050	}
9051
9052	// CHECK-LABEL: @test_vorr_u32(
9053	// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
9054	// CHECK: ret <2 x i32> [[OR_I]]
// Test wrapper: returns vorr_u32(a, b).
// The directives above expect a single `or <2 x i32> %a, %b`.
9055	uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
9056	return vorr_u32(a, b);
9057	}
9058
9059	// CHECK-LABEL: @test_vorr_u64(
9060	// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
9061	// CHECK: ret <1 x i64> [[OR_I]]
// Test wrapper: returns vorr_u64(a, b).
// The directives above expect a single `or <1 x i64> %a, %b`.
9062	uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
9063	return vorr_u64(a, b);
9064	}
9065
9066	// CHECK-LABEL: @test_vorrq_s8(
9067	// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
9068	// CHECK: ret <16 x i8> [[OR_I]]
// Test wrapper: returns vorrq_s8(a, b).
// The directives above expect a single `or <16 x i8> %a, %b`.
9069	int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
9070	return vorrq_s8(a, b);
9071	}
9072
9073	// CHECK-LABEL: @test_vorrq_s16(
9074	// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
9075	// CHECK: ret <8 x i16> [[OR_I]]
// Test wrapper: returns vorrq_s16(a, b).
// The directives above expect a single `or <8 x i16> %a, %b`.
9076	int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
9077	return vorrq_s16(a, b);
9078	}
9079
9080	// CHECK-LABEL: @test_vorrq_s32(
9081	// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
9082	// CHECK: ret <4 x i32> [[OR_I]]
// Test wrapper: returns vorrq_s32(a, b).
// The directives above expect a single `or <4 x i32> %a, %b`.
9083	int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
9084	return vorrq_s32(a, b);
9085	}
9086
9087	// CHECK-LABEL: @test_vorrq_s64(
9088	// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
9089	// CHECK: ret <2 x i64> [[OR_I]]
// Test wrapper: returns vorrq_s64(a, b).
// The directives above expect a single `or <2 x i64> %a, %b`.
9090	int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
9091	return vorrq_s64(a, b);
9092	}
9093
9094	// CHECK-LABEL: @test_vorrq_u8(
9095	// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
9096	// CHECK: ret <16 x i8> [[OR_I]]
// Test wrapper: returns vorrq_u8(a, b).
// The directives above expect a single `or <16 x i8> %a, %b`.
9097	uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
9098	return vorrq_u8(a, b);
9099	}
9100
9101	// CHECK-LABEL: @test_vorrq_u16(
9102	// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
9103	// CHECK: ret <8 x i16> [[OR_I]]
// Test wrapper: returns vorrq_u16(a, b).
// The directives above expect a single `or <8 x i16> %a, %b`.
9104	uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
9105	return vorrq_u16(a, b);
9106	}
9107
9108	// CHECK-LABEL: @test_vorrq_u32(
9109	// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
9110	// CHECK: ret <4 x i32> [[OR_I]]
// Test wrapper: returns vorrq_u32(a, b).
// The directives above expect a single `or <4 x i32> %a, %b`.
9111	uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
9112	return vorrq_u32(a, b);
9113	}
9114
9115	// CHECK-LABEL: @test_vorrq_u64(
9116	// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
9117	// CHECK: ret <2 x i64> [[OR_I]]
// Test wrapper: returns vorrq_u64(a, b).
// The directives above expect a single `or <2 x i64> %a, %b`.
9118	uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
9119	return vorrq_u64(a, b);
9120	}
9121
9122	// CHECK-LABEL: @test_vpadal_s8(
9123	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9124	// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
9125	// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
// Test wrapper: returns vpadal_s8(a, b); the result type matches a, not b.
// The directives above expect a bitcast of %a to <8 x i8> and a call to
// @llvm.arm.neon.vpadals.v4i16.v8i8 on (%a, %b).
9126	int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
9127	return vpadal_s8(a, b);
9128	}
9129
9130	// CHECK-LABEL: @test_vpadal_s16(
9131	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9132	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9133	// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
9134	// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
// Test wrapper: returns vpadal_s16(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <8 x i8> and a call
// to @llvm.arm.neon.vpadals.v2i32.v4i16 on (%a, %b).
9135	int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
9136	return vpadal_s16(a, b);
9137	}
9138
9139	// CHECK-LABEL: @test_vpadal_s32(
9140	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
9141	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9142	// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
9143	// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
// Test wrapper: returns vpadal_s32(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <8 x i8> and a call
// to @llvm.arm.neon.vpadals.v1i64.v2i32 on (%a, %b).
9144	int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
9145	return vpadal_s32(a, b);
9146	}
9147
9148	// CHECK-LABEL: @test_vpadal_u8(
9149	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9150	// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
9151	// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
// Test wrapper: returns vpadal_u8(a, b); the result type matches a, not b.
// The directives above expect a bitcast of %a to <8 x i8> and a call to
// @llvm.arm.neon.vpadalu.v4i16.v8i8 on (%a, %b).
9152	uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
9153	return vpadal_u8(a, b);
9154	}
9155
9156	// CHECK-LABEL: @test_vpadal_u16(
9157	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9158	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9159	// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
9160	// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
// Test wrapper: returns vpadal_u16(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <8 x i8> and a call
// to @llvm.arm.neon.vpadalu.v2i32.v4i16 on (%a, %b).
9161	uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
9162	return vpadal_u16(a, b);
9163	}
9164
9165	// CHECK-LABEL: @test_vpadal_u32(
9166	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
9167	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9168	// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
9169	// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
// Test wrapper: returns vpadal_u32(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <8 x i8> and a call
// to @llvm.arm.neon.vpadalu.v1i64.v2i32 on (%a, %b).
9170	uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
9171	return vpadal_u32(a, b);
9172	}
9173
9174	// CHECK-LABEL: @test_vpadalq_s8(
9175	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9176	// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
9177	// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
// Test wrapper: returns vpadalq_s8(a, b); the result type matches a, not b.
// The directives above expect a bitcast of %a to <16 x i8> and a call to
// @llvm.arm.neon.vpadals.v8i16.v16i8 on (%a, %b).
9178	int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
9179	return vpadalq_s8(a, b);
9180	}
9181
9182	// CHECK-LABEL: @test_vpadalq_s16(
9183	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9184	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
9185	// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
9186	// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
// Test wrapper: returns vpadalq_s16(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <16 x i8> and a call
// to @llvm.arm.neon.vpadals.v4i32.v8i16 on (%a, %b).
9187	int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
9188	return vpadalq_s16(a, b);
9189	}
9190
9191	// CHECK-LABEL: @test_vpadalq_s32(
9192	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9193	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
9194	// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
9195	// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
// Test wrapper: returns vpadalq_s32(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <16 x i8> and a call
// to @llvm.arm.neon.vpadals.v2i64.v4i32 on (%a, %b).
9196	int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
9197	return vpadalq_s32(a, b);
9198	}
9199
9200	// CHECK-LABEL: @test_vpadalq_u8(
9201	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9202	// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
9203	// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
// Test wrapper: returns vpadalq_u8(a, b); the result type matches a, not b.
// The directives above expect a bitcast of %a to <16 x i8> and a call to
// @llvm.arm.neon.vpadalu.v8i16.v16i8 on (%a, %b).
9204	uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
9205	return vpadalq_u8(a, b);
9206	}
9207
9208	// CHECK-LABEL: @test_vpadalq_u16(
9209	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9210	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
9211	// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
9212	// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
// Test wrapper: returns vpadalq_u16(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <16 x i8> and a call
// to @llvm.arm.neon.vpadalu.v4i32.v8i16 on (%a, %b).
9213	uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
9214	return vpadalq_u16(a, b);
9215	}
9216
9217	// CHECK-LABEL: @test_vpadalq_u32(
9218	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9219	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
9220	// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
9221	// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
// Test wrapper: returns vpadalq_u32(a, b); the result type matches a, not b.
// The directives above expect bitcasts of %a and %b to <16 x i8> and a call
// to @llvm.arm.neon.vpadalu.v2i64.v4i32 on (%a, %b).
9222	uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
9223	return vpadalq_u32(a, b);
9224	}
9225
9226	// CHECK-LABEL: @test_vpadd_s8(
9227	// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
9228	// CHECK: ret <8 x i8> [[VPADD_V_I]]
// Test wrapper: returns vpadd_s8(a, b).
// The directives above expect a call to @llvm.arm.neon.vpadd.v8i8 on
// (%a, %b) whose result is returned directly.
9229	int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
9230	return vpadd_s8(a, b);
9231	}
9232
9233	// CHECK-LABEL: @test_vpadd_s16(
9234	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9235	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9236	// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
9237	// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
9238	// CHECK: ret <4 x i16> [[VPADD_V2_I]]
// Test wrapper: returns vpadd_s16(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpadd.v4i16 on (%a, %b), and that call's value returned.
9239	int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
9240	return vpadd_s16(a, b);
9241	}
9242
9243	// CHECK-LABEL: @test_vpadd_s32(
9244	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9245	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9246	// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
9247	// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
9248	// CHECK: ret <2 x i32> [[VPADD_V2_I]]
// Test wrapper: returns vpadd_s32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpadd.v2i32 on (%a, %b), and that call's value returned.
9249	int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
9250	return vpadd_s32(a, b);
9251	}
9252
9253	// CHECK-LABEL: @test_vpadd_u8(
9254	// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
9255	// CHECK: ret <8 x i8> [[VPADD_V_I]]
// Test wrapper: returns vpadd_u8(a, b).
// The directives above expect a call to @llvm.arm.neon.vpadd.v8i8 on
// (%a, %b) whose result is returned directly.
9256	uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
9257	return vpadd_u8(a, b);
9258	}
9259
9260	// CHECK-LABEL: @test_vpadd_u16(
9261	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9262	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9263	// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
9264	// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
9265	// CHECK: ret <4 x i16> [[VPADD_V2_I]]
// Test wrapper: returns vpadd_u16(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpadd.v4i16 on (%a, %b), and that call's value returned.
9266	uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
9267	return vpadd_u16(a, b);
9268	}
9269
9270	// CHECK-LABEL: @test_vpadd_u32(
9271	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9272	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9273	// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
9274	// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
9275	// CHECK: ret <2 x i32> [[VPADD_V2_I]]
// Test wrapper: returns vpadd_u32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpadd.v2i32 on (%a, %b), and that call's value returned.
9276	uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
9277	return vpadd_u32(a, b);
9278	}
9279
9280	// CHECK-LABEL: @test_vpadd_f32(
9281	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
9282	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
9283	// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b)
9284	// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
9285	// CHECK: ret <2 x float> [[VPADD_V2_I]]
// Test wrapper: returns vpadd_f32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpadd.v2f32 on (%a, %b), and that call's value returned.
9286	float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
9287	return vpadd_f32(a, b);
9288	}
9289
9290	// CHECK-LABEL: @test_vpaddl_s8(
9291	// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a)
9292	// CHECK: ret <4 x i16> [[VPADDL_I]]
// Test wrapper: returns vpaddl_s8(a); the declared result is int16x4_t.
// The directives above expect a call to @llvm.arm.neon.vpaddls.v4i16.v8i8
// on %a.
9293	int16x4_t test_vpaddl_s8(int8x8_t a) {
9294	return vpaddl_s8(a);
9295	}
9296
9297	// CHECK-LABEL: @test_vpaddl_s16(
9298	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9299	// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a)
9300	// CHECK: ret <2 x i32> [[VPADDL1_I]]
// Test wrapper: returns vpaddl_s16(a); the declared result is int32x2_t.
// The directives above expect a bitcast of %a to <8 x i8> and a call to
// @llvm.arm.neon.vpaddls.v2i32.v4i16 on %a.
9301	int32x2_t test_vpaddl_s16(int16x4_t a) {
9302	return vpaddl_s16(a);
9303	}
9304
9305	// CHECK-LABEL: @test_vpaddl_s32(
9306	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9307	// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a)
9308	// CHECK: ret <1 x i64> [[VPADDL1_I]]
// Test wrapper: returns vpaddl_s32(a); the declared result is int64x1_t.
// The directives above expect a bitcast of %a to <8 x i8> and a call to
// @llvm.arm.neon.vpaddls.v1i64.v2i32 on %a.
9309	int64x1_t test_vpaddl_s32(int32x2_t a) {
9310	return vpaddl_s32(a);
9311	}
9312
9313	// CHECK-LABEL: @test_vpaddl_u8(
9314	// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a)
9315	// CHECK: ret <4 x i16> [[VPADDL_I]]
// Test wrapper: returns vpaddl_u8(a); the declared result is uint16x4_t.
// The directives above expect a call to @llvm.arm.neon.vpaddlu.v4i16.v8i8
// on %a.
9316	uint16x4_t test_vpaddl_u8(uint8x8_t a) {
9317	return vpaddl_u8(a);
9318	}
9319
9320	// CHECK-LABEL: @test_vpaddl_u16(
9321	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9322	// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a)
9323	// CHECK: ret <2 x i32> [[VPADDL1_I]]
// Test wrapper: returns vpaddl_u16(a); the declared result is uint32x2_t.
// The directives above expect a bitcast of %a to <8 x i8> and a call to
// @llvm.arm.neon.vpaddlu.v2i32.v4i16 on %a.
9324	uint32x2_t test_vpaddl_u16(uint16x4_t a) {
9325	return vpaddl_u16(a);
9326	}
9327
9328	// CHECK-LABEL: @test_vpaddl_u32(
9329	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9330	// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a)
9331	// CHECK: ret <1 x i64> [[VPADDL1_I]]
// Test wrapper: returns vpaddl_u32(a); the declared result is uint64x1_t.
// The directives above expect a bitcast of %a to <8 x i8> and a call to
// @llvm.arm.neon.vpaddlu.v1i64.v2i32 on %a.
9332	uint64x1_t test_vpaddl_u32(uint32x2_t a) {
9333	return vpaddl_u32(a);
9334	}
9335
9336	// CHECK-LABEL: @test_vpaddlq_s8(
9337	// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a)
9338	// CHECK: ret <8 x i16> [[VPADDL_I]]
// Test wrapper: returns vpaddlq_s8(a); the declared result is int16x8_t.
// The directives above expect a call to @llvm.arm.neon.vpaddls.v8i16.v16i8
// on %a.
9339	int16x8_t test_vpaddlq_s8(int8x16_t a) {
9340	return vpaddlq_s8(a);
9341	}
9342
9343	// CHECK-LABEL: @test_vpaddlq_s16(
9344	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9345	// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a)
9346	// CHECK: ret <4 x i32> [[VPADDL1_I]]
// Test wrapper: returns vpaddlq_s16(a); the declared result is int32x4_t.
// The directives above expect a bitcast of %a to <16 x i8> and a call to
// @llvm.arm.neon.vpaddls.v4i32.v8i16 on %a.
9347	int32x4_t test_vpaddlq_s16(int16x8_t a) {
9348	return vpaddlq_s16(a);
9349	}
9350
9351	// CHECK-LABEL: @test_vpaddlq_s32(
9352	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9353	// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a)
9354	// CHECK: ret <2 x i64> [[VPADDL1_I]]
// Test wrapper: returns vpaddlq_s32(a); the declared result is int64x2_t.
// The directives above expect a bitcast of %a to <16 x i8> and a call to
// @llvm.arm.neon.vpaddls.v2i64.v4i32 on %a.
9355	int64x2_t test_vpaddlq_s32(int32x4_t a) {
9356	return vpaddlq_s32(a);
9357	}
9358
9359	// CHECK-LABEL: @test_vpaddlq_u8(
9360	// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a)
9361	// CHECK: ret <8 x i16> [[VPADDL_I]]
// Test wrapper: returns vpaddlq_u8(a); the declared result is uint16x8_t.
// The directives above expect a call to @llvm.arm.neon.vpaddlu.v8i16.v16i8
// on %a.
9362	uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
9363	return vpaddlq_u8(a);
9364	}
9365
9366	// CHECK-LABEL: @test_vpaddlq_u16(
9367	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9368	// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a)
9369	// CHECK: ret <4 x i32> [[VPADDL1_I]]
// Test wrapper: returns vpaddlq_u16(a); the declared result is uint32x4_t.
// The directives above expect a bitcast of %a to <16 x i8> and a call to
// @llvm.arm.neon.vpaddlu.v4i32.v8i16 on %a.
9370	uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
9371	return vpaddlq_u16(a);
9372	}
9373
9374	// CHECK-LABEL: @test_vpaddlq_u32(
9375	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9376	// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a)
9377	// CHECK: ret <2 x i64> [[VPADDL1_I]]
// Test wrapper: returns vpaddlq_u32(a); the declared result is uint64x2_t.
// The directives above expect a bitcast of %a to <16 x i8> and a call to
// @llvm.arm.neon.vpaddlu.v2i64.v4i32 on %a.
9378	uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
9379	return vpaddlq_u32(a);
9380	}
9381
9382	// CHECK-LABEL: @test_vpmax_s8(
9383	// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b)
9384	// CHECK: ret <8 x i8> [[VPMAX_V_I]]
// Test wrapper: returns vpmax_s8(a, b).
// The directives above expect a call to @llvm.arm.neon.vpmaxs.v8i8 on
// (%a, %b) whose result is returned directly.
9385	int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
9386	return vpmax_s8(a, b);
9387	}
9388
9389	// CHECK-LABEL: @test_vpmax_s16(
9390	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9391	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9392	// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b)
9393	// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
9394	// CHECK: ret <4 x i16> [[VPMAX_V2_I]]
// Test wrapper: returns vpmax_s16(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmaxs.v4i16 on (%a, %b), and that call's value returned.
9395	int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
9396	return vpmax_s16(a, b);
9397	}
9398
9399	// CHECK-LABEL: @test_vpmax_s32(
9400	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9401	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9402	// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b)
9403	// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
9404	// CHECK: ret <2 x i32> [[VPMAX_V2_I]]
// Test wrapper: returns vpmax_s32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmaxs.v2i32 on (%a, %b), and that call's value returned.
9405	int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
9406	return vpmax_s32(a, b);
9407	}
9408
9409	// CHECK-LABEL: @test_vpmax_u8(
9410	// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b)
9411	// CHECK: ret <8 x i8> [[VPMAX_V_I]]
// Test wrapper: returns vpmax_u8(a, b).
// The directives above expect a call to @llvm.arm.neon.vpmaxu.v8i8 on
// (%a, %b) whose result is returned directly.
9412	uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
9413	return vpmax_u8(a, b);
9414	}
9415
9416	// CHECK-LABEL: @test_vpmax_u16(
9417	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9418	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9419	// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b)
9420	// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
9421	// CHECK: ret <4 x i16> [[VPMAX_V2_I]]
// Test wrapper: returns vpmax_u16(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmaxu.v4i16 on (%a, %b), and that call's value returned.
9422	uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
9423	return vpmax_u16(a, b);
9424	}
9425
9426	// CHECK-LABEL: @test_vpmax_u32(
9427	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9428	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9429	// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b)
9430	// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
9431	// CHECK: ret <2 x i32> [[VPMAX_V2_I]]
// Test wrapper: returns vpmax_u32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmaxu.v2i32 on (%a, %b), and that call's value returned.
9432	uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
9433	return vpmax_u32(a, b);
9434	}
9435
9436	// CHECK-LABEL: @test_vpmax_f32(
9437	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
9438	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
9439	// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b)
9440	// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
9441	// CHECK: ret <2 x float> [[VPMAX_V2_I]]
// Test wrapper: returns vpmax_f32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmaxs.v2f32 on (%a, %b), and that call's value returned.
9442	float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
9443	return vpmax_f32(a, b);
9444	}
9445
9446	// CHECK-LABEL: @test_vpmin_s8(
9447	// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b)
9448	// CHECK: ret <8 x i8> [[VPMIN_V_I]]
// Test wrapper: returns vpmin_s8(a, b).
// The directives above expect a call to @llvm.arm.neon.vpmins.v8i8 on
// (%a, %b) whose result is returned directly.
9449	int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
9450	return vpmin_s8(a, b);
9451	}
9452
9453	// CHECK-LABEL: @test_vpmin_s16(
9454	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9455	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9456	// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b)
9457	// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
9458	// CHECK: ret <4 x i16> [[VPMIN_V2_I]]
// Test wrapper: returns vpmin_s16(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmins.v4i16 on (%a, %b), and that call's value returned.
9459	int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
9460	return vpmin_s16(a, b);
9461	}
9462
9463	// CHECK-LABEL: @test_vpmin_s32(
9464	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9465	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9466	// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b)
9467	// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
9468	// CHECK: ret <2 x i32> [[VPMIN_V2_I]]
// Test wrapper: returns vpmin_s32(a, b).
// The directives above expect bitcasts of %a and %b to <8 x i8>, a call to
// @llvm.arm.neon.vpmins.v2i32 on (%a, %b), and that call's value returned.
9469	int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
9470	return vpmin_s32(a, b);
9471	}
9472
9473	// CHECK-LABEL: @test_vpmin_u8(
9474	// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b)
9475	// CHECK: ret <8 x i8> [[VPMIN_V_I]]
9476	uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
	// Codegen test wrapper: hands a and b unchanged to vpmin_u8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vpminu.v8i8 on %a and %b.
9477	return vpmin_u8(a, b);
9478	}
9479
9480	// CHECK-LABEL: @test_vpmin_u16(
9481	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9482	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9483	// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b)
9484	// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
9485	// CHECK: ret <4 x i16> [[VPMIN_V2_I]]
9486	uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vpmin_u16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vpminu.v4i16 on %a and %b,
	// with that call's result returned directly.
9487	return vpmin_u16(a, b);
9488	}
9489
9490	// CHECK-LABEL: @test_vpmin_u32(
9491	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9492	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9493	// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b)
9494	// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
9495	// CHECK: ret <2 x i32> [[VPMIN_V2_I]]
9496	uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vpmin_u32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vpminu.v2i32 on %a and %b,
	// with that call's result returned directly.
9497	return vpmin_u32(a, b);
9498	}
9499
9500	// CHECK-LABEL: @test_vpmin_f32(
9501	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
9502	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
9503	// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b)
9504	// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
9505	// CHECK: ret <2 x float> [[VPMIN_V2_I]]
9506	float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vpmin_f32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vpmins.v2f32 on %a and %b,
	// with that call's result returned directly.
9507	return vpmin_f32(a, b);
9508	}
9509
9510	// CHECK-LABEL: @test_vqabs_s8(
9511	// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a)
9512	// CHECK: ret <8 x i8> [[VQABS_V_I]]
9513	int8x8_t test_vqabs_s8(int8x8_t a) {
	// Codegen test wrapper: hands a unchanged to vqabs_s8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqabs.v8i8 on %a.
9514	return vqabs_s8(a);
9515	}
9516
9517	// CHECK-LABEL: @test_vqabs_s16(
9518	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9519	// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a)
9520	// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
9521	// CHECK: ret <4 x i16> [[VQABS_V1_I]]
9522	int16x4_t test_vqabs_s16(int16x4_t a) {
	// Codegen test wrapper: hands a unchanged to vqabs_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqabs.v4i16 on %a,
	// with that call's result returned directly.
9523	return vqabs_s16(a);
9524	}
9525
9526	// CHECK-LABEL: @test_vqabs_s32(
9527	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9528	// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a)
9529	// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
9530	// CHECK: ret <2 x i32> [[VQABS_V1_I]]
9531	int32x2_t test_vqabs_s32(int32x2_t a) {
	// Codegen test wrapper: hands a unchanged to vqabs_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqabs.v2i32 on %a,
	// with that call's result returned directly.
9532	return vqabs_s32(a);
9533	}
9534
9535	// CHECK-LABEL: @test_vqabsq_s8(
9536	// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a)
9537	// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
9538	int8x16_t test_vqabsq_s8(int8x16_t a) {
	// Codegen test wrapper: hands a unchanged to vqabsq_s8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqabs.v16i8 on %a.
9539	return vqabsq_s8(a);
9540	}
9541
9542	// CHECK-LABEL: @test_vqabsq_s16(
9543	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9544	// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a)
9545	// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
9546	// CHECK: ret <8 x i16> [[VQABSQ_V1_I]]
9547	int16x8_t test_vqabsq_s16(int16x8_t a) {
	// Codegen test wrapper: hands a unchanged to vqabsq_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqabs.v8i16 on %a,
	// with that call's result returned directly.
9548	return vqabsq_s16(a);
9549	}
9550
9551	// CHECK-LABEL: @test_vqabsq_s32(
9552	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9553	// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a)
9554	// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
9555	// CHECK: ret <4 x i32> [[VQABSQ_V1_I]]
9556	int32x4_t test_vqabsq_s32(int32x4_t a) {
	// Codegen test wrapper: hands a unchanged to vqabsq_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqabs.v4i32 on %a,
	// with that call's result returned directly.
9557	return vqabsq_s32(a);
9558	}
9559
9560	// CHECK-LABEL: @test_vqadd_s8(
9561	// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b)
9562	// CHECK: ret <8 x i8> [[VQADD_V_I]]
9563	int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_s8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v8i8 on %a and %b.
9564	return vqadd_s8(a, b);
9565	}
9566
9567	// CHECK-LABEL: @test_vqadd_s16(
9568	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9569	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9570	// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> %b)
9571	// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
9572	// CHECK: ret <4 x i16> [[VQADD_V2_I]]
9573	int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v4i16 on %a and %b,
	// with that call's result returned directly.
9574	return vqadd_s16(a, b);
9575	}
9576
9577	// CHECK-LABEL: @test_vqadd_s32(
9578	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9579	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9580	// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> %b)
9581	// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
9582	// CHECK: ret <2 x i32> [[VQADD_V2_I]]
9583	int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v2i32 on %a and %b,
	// with that call's result returned directly.
9584	return vqadd_s32(a, b);
9585	}
9586
9587	// CHECK-LABEL: @test_vqadd_s64(
9588	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
9589	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
9590	// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %a, <1 x i64> %b)
9591	// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
9592	// CHECK: ret <1 x i64> [[VQADD_V2_I]]
9593	int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_s64. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v1i64 on %a and %b,
	// with that call's result returned directly.
9594	return vqadd_s64(a, b);
9595	}
9596
9597	// CHECK-LABEL: @test_vqadd_u8(
9598	// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
9599	// CHECK: ret <8 x i8> [[VQADD_V_I]]
9600	uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_u8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v8i8 on %a and %b.
9601	return vqadd_u8(a, b);
9602	}
9603
9604	// CHECK-LABEL: @test_vqadd_u16(
9605	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9606	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9607	// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
9608	// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
9609	// CHECK: ret <4 x i16> [[VQADD_V2_I]]
9610	uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_u16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v4i16 on %a and %b,
	// with that call's result returned directly.
9611	return vqadd_u16(a, b);
9612	}
9613
9614	// CHECK-LABEL: @test_vqadd_u32(
9615	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9616	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9617	// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
9618	// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
9619	// CHECK: ret <2 x i32> [[VQADD_V2_I]]
9620	uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_u32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v2i32 on %a and %b,
	// with that call's result returned directly.
9621	return vqadd_u32(a, b);
9622	}
9623
9624	// CHECK-LABEL: @test_vqadd_u64(
9625	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
9626	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
9627	// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %a, <1 x i64> %b)
9628	// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
9629	// CHECK: ret <1 x i64> [[VQADD_V2_I]]
9630	uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqadd_u64. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v1i64 on %a and %b,
	// with that call's result returned directly.
9631	return vqadd_u64(a, b);
9632	}
9633
9634	// CHECK-LABEL: @test_vqaddq_s8(
9635	// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b)
9636	// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
9637	int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_s8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v16i8 on %a and %b.
9638	return vqaddq_s8(a, b);
9639	}
9640
9641	// CHECK-LABEL: @test_vqaddq_s16(
9642	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9643	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
9644	// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> %b)
9645	// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
9646	// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
9647	int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v8i16 on %a and %b,
	// with that call's result returned directly.
9648	return vqaddq_s16(a, b);
9649	}
9650
9651	// CHECK-LABEL: @test_vqaddq_s32(
9652	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9653	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
9654	// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %b)
9655	// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
9656	// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
9657	int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v4i32 on %a and %b,
	// with that call's result returned directly.
9658	return vqaddq_s32(a, b);
9659	}
9660
9661	// CHECK-LABEL: @test_vqaddq_s64(
9662	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9663	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
9664	// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %b)
9665	// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
9666	// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
9667	int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_s64. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqadds.v2i64 on %a and %b,
	// with that call's result returned directly.
9668	return vqaddq_s64(a, b);
9669	}
9670
9671	// CHECK-LABEL: @test_vqaddq_u8(
9672	// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
9673	// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
9674	uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_u8. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v16i8 on %a and %b.
9675	return vqaddq_u8(a, b);
9676	}
9677
9678	// CHECK-LABEL: @test_vqaddq_u16(
9679	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9680	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
9681	// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
9682	// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
9683	// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
9684	uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_u16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v8i16 on %a and %b,
	// with that call's result returned directly.
9685	return vqaddq_u16(a, b);
9686	}
9687
9688	// CHECK-LABEL: @test_vqaddq_u32(
9689	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9690	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
9691	// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
9692	// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
9693	// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
9694	uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_u32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v4i32 on %a and %b,
	// with that call's result returned directly.
9695	return vqaddq_u32(a, b);
9696	}
9697
9698	// CHECK-LABEL: @test_vqaddq_u64(
9699	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9700	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
9701	// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %a, <2 x i64> %b)
9702	// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
9703	// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
9704	uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqaddq_u64. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqaddu.v2i64 on %a and %b,
	// with that call's result returned directly.
9705	return vqaddq_u64(a, b);
9706	}
9707
9708	// CHECK-LABEL: @test_vqdmlal_s16(
9709	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9710	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9711	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
9712	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
9713	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
9714	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
9715	int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
	// Codegen test wrapper: hands a, b and c unchanged to vqdmlal_s16. The FileCheck
	// directives above expect two calls: @llvm.arm.neon.vqdmull.v4i32 on %b and %c,
	// then @llvm.arm.neon.vqadds.v4i32 on %a and that product, which is returned.
9716	return vqdmlal_s16(a, b, c);
9717	}
9718
9719	// CHECK-LABEL: @test_vqdmlal_s32(
9720	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9721	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9722	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
9723	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
9724	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
9725	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
9726	int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
	// Codegen test wrapper: hands a, b and c unchanged to vqdmlal_s32. The FileCheck
	// directives above expect two calls: @llvm.arm.neon.vqdmull.v2i64 on %b and %c,
	// then @llvm.arm.neon.vqadds.v2i64 on %a and that product, which is returned.
9727	return vqdmlal_s32(a, b, c);
9728	}
9729
9730	// CHECK-LABEL: @test_vqdmlal_lane_s16(
9731	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9732	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9733	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9734	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
9735	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
9736	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
9737	// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
9738	int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
	// Codegen test wrapper: calls vqdmlal_lane_s16 with the constant lane index 3.
	// The FileCheck directives above expect a shufflevector of %c with an all-3 mask,
	// then vqdmull.v4i32 on %b and the shuffle, then vqadds.v4i32 with %a.
9739	return vqdmlal_lane_s16(a, b, c, 3);
9740	}
9741
9742	// CHECK-LABEL: @test_vqdmlal_lane_s32(
9743	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9744	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9745	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9746	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9747	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
9748	// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
9749	// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
9750	int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
	// Codegen test wrapper: calls vqdmlal_lane_s32 with the constant lane index 1.
	// The FileCheck directives above expect a shufflevector of %c with an all-1 mask,
	// then vqdmull.v2i64 on %b and the shuffle, then vqadds.v2i64 with %a.
9751	return vqdmlal_lane_s32(a, b, c, 1);
9752	}
9753
9754	// CHECK-LABEL: @test_vqdmlal_n_s16(
9755	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9756	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9757	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9758	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9759	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9760	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9761	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9762	// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
9763	// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
9764	// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
9765	int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
	// Codegen test wrapper: hands a, b and the scalar c to vqdmlal_n_s16. The FileCheck
	// directives above expect %c inserted into elements 0..3 of a <4 x i16>, then
	// vqdmull.v4i32 on %b and that vector, then vqadds.v4i32 with %a.
9766	return vqdmlal_n_s16(a, b, c);
9767	}
9768
9769	// CHECK-LABEL: @test_vqdmlal_n_s32(
9770	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9771	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9772	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9773	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9774	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9775	// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
9776	// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
9777	// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
9778	int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
	// Codegen test wrapper: hands a, b and the scalar c to vqdmlal_n_s32. The FileCheck
	// directives above expect %c inserted into elements 0 and 1 of a <2 x i32>, then
	// vqdmull.v2i64 on %b and that vector, then vqadds.v2i64 with %a.
9779	return vqdmlal_n_s32(a, b, c);
9780	}
9781
9782	// CHECK-LABEL: @test_vqdmlsl_s16(
9783	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9784	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9785	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
9786	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
9787	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
9788	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
9789	int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
	// Codegen test wrapper: hands a, b and c unchanged to vqdmlsl_s16. The FileCheck
	// directives above expect two calls: @llvm.arm.neon.vqdmull.v4i32 on %b and %c,
	// then @llvm.arm.neon.vqsubs.v4i32 on %a and that product, which is returned.
9790	return vqdmlsl_s16(a, b, c);
9791	}
9792
9793	// CHECK-LABEL: @test_vqdmlsl_s32(
9794	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9795	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9796	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
9797	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
9798	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
9799	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
9800	int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
	// Codegen test wrapper: hands a, b and c unchanged to vqdmlsl_s32. The FileCheck
	// directives above expect two calls: @llvm.arm.neon.vqdmull.v2i64 on %b and %c,
	// then @llvm.arm.neon.vqsubs.v2i64 on %a and that product, which is returned.
9801	return vqdmlsl_s32(a, b, c);
9802	}
9803
9804	// CHECK-LABEL: @test_vqdmlsl_lane_s16(
9805	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9806	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9807	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9808	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
9809	// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
9810	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
9811	// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
9812	int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
	// Codegen test wrapper: calls vqdmlsl_lane_s16 with the constant lane index 3.
	// The FileCheck directives above expect a shufflevector of %c with an all-3 mask,
	// then vqdmull.v4i32 on %b and the shuffle, then vqsubs.v4i32 with %a.
9813	return vqdmlsl_lane_s16(a, b, c, 3);
9814	}
9815
9816	// CHECK-LABEL: @test_vqdmlsl_lane_s32(
9817	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9818	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9819	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9820	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9821	// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
9822	// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
9823	// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
9824	int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
	// Codegen test wrapper: calls vqdmlsl_lane_s32 with the constant lane index 1.
	// The FileCheck directives above expect a shufflevector of %c with an all-1 mask,
	// then vqdmull.v2i64 on %b and the shuffle, then vqsubs.v2i64 with %a.
9825	return vqdmlsl_lane_s32(a, b, c, 1);
9826	}
9827
9828	// CHECK-LABEL: @test_vqdmlsl_n_s16(
9829	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9830	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9831	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9832	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9833	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9834	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9835	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9836	// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
9837	// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
9838	// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
9839	int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
	// Codegen test wrapper: hands a, b and the scalar c to vqdmlsl_n_s16. The FileCheck
	// directives above expect %c inserted into elements 0..3 of a <4 x i16>, then
	// vqdmull.v4i32 on %b and that vector, then vqsubs.v4i32 with %a.
9840	return vqdmlsl_n_s16(a, b, c);
9841	}
9842
9843	// CHECK-LABEL: @test_vqdmlsl_n_s32(
9844	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9845	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9846	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9847	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9848	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9849	// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
9850	// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
9851	// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
9852	int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
	// Codegen test wrapper: hands a, b and the scalar c to vqdmlsl_n_s32. The FileCheck
	// directives above expect %c inserted into elements 0 and 1 of a <2 x i32>, then
	// vqdmull.v2i64 on %b and that vector, then vqsubs.v2i64 with %a.
9853	return vqdmlsl_n_s32(a, b, c);
9854	}
9855
9856	// CHECK-LABEL: @test_vqdmulh_s16(
9857	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9858	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9859	// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
9860	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
9861	// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
9862	int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqdmulh_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqdmulh.v4i16 on %a and %b,
	// with that call's result returned directly.
9863	return vqdmulh_s16(a, b);
9864	}
9865
9866	// CHECK-LABEL: @test_vqdmulh_s32(
9867	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9868	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9869	// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
9870	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
9871	// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
9872	int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqdmulh_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqdmulh.v2i32 on %a and %b,
	// with that call's result returned directly.
9873	return vqdmulh_s32(a, b);
9874	}
9875
9876	// CHECK-LABEL: @test_vqdmulhq_s16(
9877	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9878	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
9879	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
9880	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
9881	// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
9882	int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqdmulhq_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqdmulh.v8i16 on %a and %b,
	// with that call's result returned directly.
9883	return vqdmulhq_s16(a, b);
9884	}
9885
9886	// CHECK-LABEL: @test_vqdmulhq_s32(
9887	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9888	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
9889	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
9890	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
9891	// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
9892	int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqdmulhq_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqdmulh.v4i32 on %a and %b,
	// with that call's result returned directly.
9893	return vqdmulhq_s32(a, b);
9894	}
9895
9896	// CHECK-LABEL: @test_vqdmulh_lane_s16(
9897	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9898	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9899	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
9900	// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
9901	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
9902	// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
9903	int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
	// Codegen test wrapper: calls vqdmulh_lane_s16 with the constant lane index 3.
	// The FileCheck directives above expect a shufflevector of %b with an all-3 mask,
	// then a call to @llvm.arm.neon.vqdmulh.v4i16 on %a and the shuffle.
9904	return vqdmulh_lane_s16(a, b, 3);
9905	}
9906
9907	// CHECK-LABEL: @test_vqdmulh_lane_s32(
9908	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
9909	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9910	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9911	// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
9912	// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
9913	// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
9914	int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
	// Codegen test wrapper: calls vqdmulh_lane_s32 with the constant lane index 1.
	// The FileCheck directives above expect a shufflevector of %b with an all-1 mask,
	// then a call to @llvm.arm.neon.vqdmulh.v2i32 on %a and the shuffle.
9915	return vqdmulh_lane_s32(a, b, 1);
9916	}
9917
9918	// CHECK-LABEL: @test_vqdmulhq_lane_s16(
9919	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
9920	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9921	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
9922	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
9923	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
9924	// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
9925	int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
	// Codegen test wrapper: calls vqdmulhq_lane_s16 with the constant lane index 3;
	// b is a 4-element vector while a and the result have 8 elements. The FileCheck
	// directives above expect a shufflevector widening %b to 8 elements with an
	// all-3 mask, then a call to @llvm.arm.neon.vqdmulh.v8i16 on %a and the shuffle.
9926	return vqdmulhq_lane_s16(a, b, 3);
9927	}
9928
9929	// CHECK-LABEL: @test_vqdmulhq_lane_s32(
9930	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9931	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9932	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
9933	// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
9934	// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
9935	// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
9936	int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
	// Codegen test wrapper: calls vqdmulhq_lane_s32 with the constant lane index 1;
	// b is a 2-element vector while a and the result have 4 elements. The FileCheck
	// directives above expect a shufflevector widening %b to 4 elements with an
	// all-1 mask, then a call to @llvm.arm.neon.vqdmulh.v4i32 on %a and the shuffle.
9937	return vqdmulhq_lane_s32(a, b, 1);
9938	}
9939
9940	// CHECK-LABEL: @test_vqdmulh_n_s16(
9941	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9942	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
9943	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
9944	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
9945	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
9946	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9947	// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
9948	// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
9949	// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
9950	int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
	// Codegen test wrapper: hands a and the scalar b to vqdmulh_n_s16. The FileCheck
	// directives above expect %b inserted into elements 0..3 of a <4 x i16>, then a
	// call to @llvm.arm.neon.vqdmulh.v4i16 on %a and that vector.
9951	return vqdmulh_n_s16(a, b);
9952	}
9953
9954	// CHECK-LABEL: @test_vqdmulh_n_s32(
9955	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9956	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
9957	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
9958	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9959	// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
9960	// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
9961	// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
9962	int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
	// Codegen test wrapper: hands a and the scalar b to vqdmulh_n_s32. The FileCheck
	// directives above expect %b inserted into elements 0 and 1 of a <2 x i32>, then
	// a call to @llvm.arm.neon.vqdmulh.v2i32 on %a and that vector.
9963	return vqdmulh_n_s32(a, b);
9964	}
9965
9966	// CHECK-LABEL: @test_vqdmulhq_n_s16(
9967	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9968	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
9969	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
9970	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
9971	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
9972	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
9973	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
9974	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
9975	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
9976	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
9977	// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
9978	// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
9979	// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
9980	int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
	// Codegen test wrapper: hands a and the scalar b to vqdmulhq_n_s16. The FileCheck
	// directives above expect %b inserted into elements 0..7 of an <8 x i16>, then a
	// call to @llvm.arm.neon.vqdmulh.v8i16 on %a and that vector.
9981	return vqdmulhq_n_s16(a, b);
9982	}
9983
9984	// CHECK-LABEL: @test_vqdmulhq_n_s32(
9985	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9986	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
9987	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
9988	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
9989	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
9990	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
9991	// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
9992	// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
9993	// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
9994	int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
	// Codegen test wrapper: hands a and the scalar b to vqdmulhq_n_s32. The FileCheck
	// directives above expect %b inserted into elements 0..3 of a <4 x i32>, then a
	// call to @llvm.arm.neon.vqdmulh.v4i32 on %a and that vector.
9995	return vqdmulhq_n_s32(a, b);
9996	}
9997
9998	// CHECK-LABEL: @test_vqdmull_s16(
9999	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10000	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
10001	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
10002	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
10003	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
10004	int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqdmull_s16. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqdmull.v4i32 taking the two
	// <4 x i16> arguments and returning <4 x i32>.
10005	return vqdmull_s16(a, b);
10006	}
10007
10008	// CHECK-LABEL: @test_vqdmull_s32(
10009	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10010	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
10011	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
10012	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
10013	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
10014	int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
	// Codegen test wrapper: hands a and b unchanged to vqdmull_s32. The FileCheck
	// directives above expect a call to @llvm.arm.neon.vqdmull.v2i64 taking the two
	// <2 x i32> arguments and returning <2 x i64>.
10015	return vqdmull_s32(a, b);
10016	}
10017
10018	// CHECK-LABEL: @test_vqdmull_lane_s16(
10019	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
10020	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10021	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
10022	// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
10023	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
10024	// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
10025	int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
	// Codegen test wrapper: calls vqdmull_lane_s16 with the constant lane index 3.
	// The FileCheck directives above expect a shufflevector of %b with an all-3 mask,
	// then a call to @llvm.arm.neon.vqdmull.v4i32 on %a and the shuffle.
10026	return vqdmull_lane_s16(a, b, 3);
10027	}
10028
10029	// CHECK-LABEL: @test_vqdmull_lane_s32(
10030	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
10031	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10032	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
10033	// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
10034	// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
10035	// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
10036	int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
	// Codegen test wrapper: calls vqdmull_lane_s32 with the constant lane index 1.
	// The FileCheck directives above expect a shufflevector of %b with an all-1 mask,
	// then a call to @llvm.arm.neon.vqdmull.v2i64 on %a and the shuffle.
10037	return vqdmull_lane_s32(a, b, 1);
10038	}
10039
10040	// CHECK-LABEL: @test_vqdmull_n_s16(
10041	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10042	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
10043	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
10044	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
10045	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
10046	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
10047	// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
10048	// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
10049	// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
10050	int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
	// Codegen test wrapper: hands a and the scalar b to vqdmull_n_s16. The FileCheck
	// directives above expect %b inserted into elements 0..3 of a <4 x i16>, then a
	// call to @llvm.arm.neon.vqdmull.v4i32 on %a and that vector.
10051	return vqdmull_n_s16(a, b);
10052	}
10053
10054	// CHECK-LABEL: @test_vqdmull_n_s32(
10055	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10056	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
10057	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
10058	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
10059	// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
10060	// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
10061	// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
// Test driver: returns vqdmull_n_s32(a, b) unchanged. The expectations above
// require scalar %b to be inserted into both lanes of a <2 x i32> before the
// call to @llvm.arm.neon.vqdmull.v2i64.
10062	int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
10063	return vqdmull_n_s32(a, b);
10064	}
10065
10066	// CHECK-LABEL: @test_vqmovn_s16(
10067	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10068	// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a)
10069	// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
// Test driver: returns vqmovn_s16(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqmovns.v8i8 on %a (<8 x i16> in, <8 x i8> out).
10070	int8x8_t test_vqmovn_s16(int16x8_t a) {
10071	return vqmovn_s16(a);
10072	}
10073
10074	// CHECK-LABEL: @test_vqmovn_s32(
10075	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10076	// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a)
10077	// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
10078	// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
// Test driver: returns vqmovn_s32(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqmovns.v4i16 on %a (<4 x i32> in, <4 x i16> out).
10079	int16x4_t test_vqmovn_s32(int32x4_t a) {
10080	return vqmovn_s32(a);
10081	}
10082
10083	// CHECK-LABEL: @test_vqmovn_s64(
10084	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10085	// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a)
10086	// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
10087	// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
// Test driver: returns vqmovn_s64(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqmovns.v2i32 on %a (<2 x i64> in, <2 x i32> out).
10088	int32x2_t test_vqmovn_s64(int64x2_t a) {
10089	return vqmovn_s64(a);
10090	}
10091
10092	// CHECK-LABEL: @test_vqmovn_u16(
10093	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10094	// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a)
10095	// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
// Test driver: returns vqmovn_u16(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqmovnu.v8i8 on %a (<8 x i16> in, <8 x i8> out).
10096	uint8x8_t test_vqmovn_u16(uint16x8_t a) {
10097	return vqmovn_u16(a);
10098	}
10099
10100	// CHECK-LABEL: @test_vqmovn_u32(
10101	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10102	// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a)
10103	// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
10104	// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
// Test driver: returns vqmovn_u32(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqmovnu.v4i16 on %a (<4 x i32> in, <4 x i16> out).
10105	uint16x4_t test_vqmovn_u32(uint32x4_t a) {
10106	return vqmovn_u32(a);
10107	}
10108
10109	// CHECK-LABEL: @test_vqmovn_u64(
10110	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10111	// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a)
10112	// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
10113	// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
// Test driver: returns vqmovn_u64(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqmovnu.v2i32 on %a (<2 x i64> in, <2 x i32> out).
10114	uint32x2_t test_vqmovn_u64(uint64x2_t a) {
10115	return vqmovn_u64(a);
10116	}
10117
10118	// CHECK-LABEL: @test_vqmovun_s16(
10119	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10120	// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a)
10121	// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
// Test driver: returns vqmovun_s16(a) unchanged; the input is a signed vector
// and the result type is unsigned. The expectations above require a call to
// @llvm.arm.neon.vqmovnsu.v8i8 on %a.
10122	uint8x8_t test_vqmovun_s16(int16x8_t a) {
10123	return vqmovun_s16(a);
10124	}
10125
10126	// CHECK-LABEL: @test_vqmovun_s32(
10127	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10128	// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a)
10129	// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
10130	// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
// Test driver: returns vqmovun_s32(a) unchanged; the input is a signed vector
// and the result type is unsigned. The expectations above require a call to
// @llvm.arm.neon.vqmovnsu.v4i16 on %a.
10131	uint16x4_t test_vqmovun_s32(int32x4_t a) {
10132	return vqmovun_s32(a);
10133	}
10134
10135	// CHECK-LABEL: @test_vqmovun_s64(
10136	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10137	// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a)
10138	// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
10139	// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
// Test driver: returns vqmovun_s64(a) unchanged; the input is a signed vector
// and the result type is unsigned. The expectations above require a call to
// @llvm.arm.neon.vqmovnsu.v2i32 on %a.
10140	uint32x2_t test_vqmovun_s64(int64x2_t a) {
10141	return vqmovun_s64(a);
10142	}
10143
10144	// CHECK-LABEL: @test_vqneg_s8(
10145	// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a)
10146	// CHECK: ret <8 x i8> [[VQNEG_V_I]]
// Test driver: returns vqneg_s8(a) unchanged. The expectations above require a
// call to @llvm.arm.neon.vqneg.v8i8 on %a, with its result returned.
10147	int8x8_t test_vqneg_s8(int8x8_t a) {
10148	return vqneg_s8(a);
10149	}
10150
10151	// CHECK-LABEL: @test_vqneg_s16(
10152	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10153	// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a)
10154	// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
10155	// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
// Test driver: returns vqneg_s16(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqneg.v4i16 on %a, with its result returned.
10156	int16x4_t test_vqneg_s16(int16x4_t a) {
10157	return vqneg_s16(a);
10158	}
10159
10160	// CHECK-LABEL: @test_vqneg_s32(
10161	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10162	// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a)
10163	// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
10164	// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
// Test driver: returns vqneg_s32(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqneg.v2i32 on %a, with its result returned.
10165	int32x2_t test_vqneg_s32(int32x2_t a) {
10166	return vqneg_s32(a);
10167	}
10168
10169	// CHECK-LABEL: @test_vqnegq_s8(
10170	// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a)
10171	// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
// Test driver: returns vqnegq_s8(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqneg.v16i8 on %a, with its result returned.
10172	int8x16_t test_vqnegq_s8(int8x16_t a) {
10173	return vqnegq_s8(a);
10174	}
10175
10176	// CHECK-LABEL: @test_vqnegq_s16(
10177	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10178	// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a)
10179	// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
10180	// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
// Test driver: returns vqnegq_s16(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqneg.v8i16 on %a, with its result returned.
10181	int16x8_t test_vqnegq_s16(int16x8_t a) {
10182	return vqnegq_s16(a);
10183	}
10184
10185	// CHECK-LABEL: @test_vqnegq_s32(
10186	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10187	// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a)
10188	// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
10189	// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
// Test driver: returns vqnegq_s32(a) unchanged. The expectations above require
// a call to @llvm.arm.neon.vqneg.v4i32 on %a, with its result returned.
10190	int32x4_t test_vqnegq_s32(int32x4_t a) {
10191	return vqnegq_s32(a);
10192	}
10193
10194	// CHECK-LABEL: @test_vqrdmulh_s16(
10195	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10196	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
10197	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
10198	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
10199	// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Test driver: returns vqrdmulh_s16(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrdmulh.v4i16 on %a and %b.
10200	int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
10201	return vqrdmulh_s16(a, b);
10202	}
10203
10204	// CHECK-LABEL: @test_vqrdmulh_s32(
10205	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10206	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
10207	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
10208	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
10209	// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Test driver: returns vqrdmulh_s32(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrdmulh.v2i32 on %a and %b.
10210	int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
10211	return vqrdmulh_s32(a, b);
10212	}
10213
10214	// CHECK-LABEL: @test_vqrdmulhq_s16(
10215	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10216	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
10217	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
10218	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
10219	// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Test driver: returns vqrdmulhq_s16(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrdmulh.v8i16 on %a and %b.
10220	int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
10221	return vqrdmulhq_s16(a, b);
10222	}
10223
10224	// CHECK-LABEL: @test_vqrdmulhq_s32(
10225	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10226	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
10227	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
10228	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
10229	// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Test driver: returns vqrdmulhq_s32(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrdmulh.v4i32 on %a and %b.
10230	int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
10231	return vqrdmulhq_s32(a, b);
10232	}
10233
10234	// CHECK-LABEL: @test_vqrdmulh_lane_s16(
10235	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
10236	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10237	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
10238	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
10239	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
10240	// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Test driver: returns vqrdmulh_lane_s16(a, b, 3) unchanged. The constant 3
// matches the all-3 shufflevector mask expected above; that shuffle of %b is
// the second operand of the expected @llvm.arm.neon.vqrdmulh.v4i16 call.
10241	int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
10242	return vqrdmulh_lane_s16(a, b, 3);
10243	}
10244
10245	// CHECK-LABEL: @test_vqrdmulh_lane_s32(
10246	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
10247	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10248	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
10249	// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
10250	// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
10251	// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Test driver: returns vqrdmulh_lane_s32(a, b, 1) unchanged. The constant 1
// matches the all-1 shufflevector mask expected above; that shuffle of %b is
// the second operand of the expected @llvm.arm.neon.vqrdmulh.v2i32 call.
10252	int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
10253	return vqrdmulh_lane_s32(a, b, 1);
10254	}
10255
10256	// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
10257	// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
10258	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10259	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
10260	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
10261	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
10262	// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Test driver: returns vqrdmulhq_lane_s16(a, b, 3) unchanged. Note b is the
// 4-element int16x4_t while a is 8-element; the expectations above use an
// 8-wide all-3 shuffle mask over %b to produce the <8 x i16> operand passed to
// @llvm.arm.neon.vqrdmulh.v8i16.
10263	int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
10264	return vqrdmulhq_lane_s16(a, b, 3);
10265	}
10266
10267	// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
10268	// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
10269	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10270	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
10271	// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
10272	// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
10273	// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Test driver: returns vqrdmulhq_lane_s32(a, b, 1) unchanged. Note b is the
// 2-element int32x2_t while a is 4-element; the expectations above use a
// 4-wide all-1 shuffle mask over %b to produce the <4 x i32> operand passed to
// @llvm.arm.neon.vqrdmulh.v4i32.
10274	int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
10275	return vqrdmulhq_lane_s32(a, b, 1);
10276	}
10277
10278	// CHECK-LABEL: @test_vqrdmulh_n_s16(
10279	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10280	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
10281	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
10282	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
10283	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
10284	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
10285	// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
10286	// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
10287	// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
// Test driver: returns vqrdmulh_n_s16(a, b) unchanged. The expectations above
// require scalar %b to be inserted into all 4 lanes of a <4 x i16> before the
// call to @llvm.arm.neon.vqrdmulh.v4i16.
10288	int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
10289	return vqrdmulh_n_s16(a, b);
10290	}
10291
10292	// CHECK-LABEL: @test_vqrdmulh_n_s32(
10293	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10294	// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
10295	// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
10296	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
10297	// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
10298	// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
10299	// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
// Test driver: returns vqrdmulh_n_s32(a, b) unchanged. The expectations above
// require scalar %b to be inserted into both lanes of a <2 x i32> before the
// call to @llvm.arm.neon.vqrdmulh.v2i32.
10300	int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
10301	return vqrdmulh_n_s32(a, b);
10302	}
10303
10304	// CHECK-LABEL: @test_vqrdmulhq_n_s16(
10305	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10306	// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
10307	// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
10308	// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
10309	// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
10310	// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
10311	// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
10312	// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
10313	// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
10314	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
10315	// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
10316	// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
10317	// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
// Test driver: returns vqrdmulhq_n_s16(a, b) unchanged. The expectations above
// require scalar %b to be inserted into all 8 lanes of an <8 x i16> before the
// call to @llvm.arm.neon.vqrdmulh.v8i16.
10318	int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
10319	return vqrdmulhq_n_s16(a, b);
10320	}
10321
10322	// CHECK-LABEL: @test_vqrdmulhq_n_s32(
10323	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10324	// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
10325	// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
10326	// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
10327	// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
10328	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
10329	// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
10330	// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
10331	// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
// Test driver: returns vqrdmulhq_n_s32(a, b) unchanged. The expectations above
// require scalar %b to be inserted into all 4 lanes of a <4 x i32> before the
// call to @llvm.arm.neon.vqrdmulh.v4i32.
10332	int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
10333	return vqrdmulhq_n_s32(a, b);
10334	}
10335
10336	// CHECK-LABEL: @test_vqrshl_s8(
10337	// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
10338	// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
// Test driver: returns vqrshl_s8(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v8i8 on %a and %b.
10339	int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
10340	return vqrshl_s8(a, b);
10341	}
10342
10343	// CHECK-LABEL: @test_vqrshl_s16(
10344	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10345	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
10346	// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
10347	// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
10348	// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
// Test driver: returns vqrshl_s16(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v4i16 on %a and %b.
10349	int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
10350	return vqrshl_s16(a, b);
10351	}
10352
10353	// CHECK-LABEL: @test_vqrshl_s32(
10354	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10355	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
10356	// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
10357	// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
10358	// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
// Test driver: returns vqrshl_s32(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v2i32 on %a and %b.
10359	int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
10360	return vqrshl_s32(a, b);
10361	}
10362
10363	// CHECK-LABEL: @test_vqrshl_s64(
10364	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10365	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
10366	// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
10367	// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
10368	// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
// Test driver: returns vqrshl_s64(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v1i64 on %a and %b.
10369	int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
10370	return vqrshl_s64(a, b);
10371	}
10372
10373	// CHECK-LABEL: @test_vqrshl_u8(
10374	// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
10375	// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
// Test driver: returns vqrshl_u8(a, b) unchanged; b is declared as the signed
// int8x8_t even though a is unsigned. The expectations above require a call to
// @llvm.arm.neon.vqrshiftu.v8i8 on %a and %b.
10376	uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
10377	return vqrshl_u8(a, b);
10378	}
10379
10380	// CHECK-LABEL: @test_vqrshl_u16(
10381	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10382	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
10383	// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
10384	// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
10385	// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
// Test driver: returns vqrshl_u16(a, b) unchanged; b is declared as the signed
// int16x4_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqrshiftu.v4i16 on %a and %b.
10386	uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
10387	return vqrshl_u16(a, b);
10388	}
10389
10390	// CHECK-LABEL: @test_vqrshl_u32(
10391	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10392	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
10393	// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
10394	// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
10395	// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
// Test driver: returns vqrshl_u32(a, b) unchanged; b is declared as the signed
// int32x2_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqrshiftu.v2i32 on %a and %b.
10396	uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
10397	return vqrshl_u32(a, b);
10398	}
10399
10400	// CHECK-LABEL: @test_vqrshl_u64(
10401	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10402	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
10403	// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
10404	// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
10405	// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
// Test driver: returns vqrshl_u64(a, b) unchanged; b is declared as the signed
// int64x1_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqrshiftu.v1i64 on %a and %b.
10406	uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
10407	return vqrshl_u64(a, b);
10408	}
10409
10410	// CHECK-LABEL: @test_vqrshlq_s8(
10411	// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
10412	// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
// Test driver: returns vqrshlq_s8(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v16i8 on %a and %b.
10413	int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
10414	return vqrshlq_s8(a, b);
10415	}
10416
10417	// CHECK-LABEL: @test_vqrshlq_s16(
10418	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10419	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
10420	// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
10421	// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
10422	// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
// Test driver: returns vqrshlq_s16(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v8i16 on %a and %b.
10423	int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
10424	return vqrshlq_s16(a, b);
10425	}
10426
10427	// CHECK-LABEL: @test_vqrshlq_s32(
10428	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10429	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
10430	// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
10431	// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
10432	// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
// Test driver: returns vqrshlq_s32(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v4i32 on %a and %b.
10433	int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
10434	return vqrshlq_s32(a, b);
10435	}
10436
10437	// CHECK-LABEL: @test_vqrshlq_s64(
10438	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10439	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
10440	// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
10441	// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
10442	// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
// Test driver: returns vqrshlq_s64(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqrshifts.v2i64 on %a and %b.
10443	int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
10444	return vqrshlq_s64(a, b);
10445	}
10446
10447	// CHECK-LABEL: @test_vqrshlq_u8(
10448	// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
10449	// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
// Test driver: returns vqrshlq_u8(a, b) unchanged; b is declared as the signed
// int8x16_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqrshiftu.v16i8 on %a and %b.
10450	uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
10451	return vqrshlq_u8(a, b);
10452	}
10453
10454	// CHECK-LABEL: @test_vqrshlq_u16(
10455	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10456	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
10457	// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
10458	// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
10459	// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
// Test driver: returns vqrshlq_u16(a, b) unchanged; b is declared as the
// signed int16x8_t even though a is unsigned. The expectations above require a
// call to @llvm.arm.neon.vqrshiftu.v8i16 on %a and %b.
10460	uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
10461	return vqrshlq_u16(a, b);
10462	}
10463
10464	// CHECK-LABEL: @test_vqrshlq_u32(
10465	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10466	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
10467	// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
10468	// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
10469	// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
// Test driver: returns vqrshlq_u32(a, b) unchanged; b is declared as the
// signed int32x4_t even though a is unsigned. The expectations above require a
// call to @llvm.arm.neon.vqrshiftu.v4i32 on %a and %b.
10470	uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
10471	return vqrshlq_u32(a, b);
10472	}
10473
10474	// CHECK-LABEL: @test_vqrshlq_u64(
10475	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10476	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
10477	// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
10478	// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
10479	// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
// Test driver: returns vqrshlq_u64(a, b) unchanged; b is declared as the
// signed int64x2_t even though a is unsigned. The expectations above require a
// call to @llvm.arm.neon.vqrshiftu.v2i64 on %a and %b.
10480	uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
10481	return vqrshlq_u64(a, b);
10482	}
10483
10484	// CHECK-LABEL: @test_vqrshrn_n_s16(
10485	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10486	// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10487	// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
10488	// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
// Test driver: returns vqrshrn_n_s16(a, 1) unchanged. In the expectations
// above the source-level immediate 1 appears as an <8 x i16> splat of -1, the
// second operand of the call to @llvm.arm.neon.vqrshiftns.v8i8.
10489	int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
10490	return vqrshrn_n_s16(a, 1);
10491	}
10492
10493	// CHECK-LABEL: @test_vqrshrn_n_s32(
10494	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10495	// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10496	// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
10497	// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
// Test driver: returns vqrshrn_n_s32(a, 1) unchanged. In the expectations
// above the source-level immediate 1 appears as a <4 x i32> splat of -1, the
// second operand of the call to @llvm.arm.neon.vqrshiftns.v4i16.
10498	int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
10499	return vqrshrn_n_s32(a, 1);
10500	}
10501
10502	// CHECK-LABEL: @test_vqrshrn_n_s64(
10503	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10504	// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10505	// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
10506	// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
// Test driver: returns vqrshrn_n_s64(a, 1) unchanged. In the expectations
// above the source-level immediate 1 appears as a <2 x i64> splat of -1, the
// second operand of the call to @llvm.arm.neon.vqrshiftns.v2i32.
10507	int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
10508	return vqrshrn_n_s64(a, 1);
10509	}
10510
10511	// CHECK-LABEL: @test_vqrshrn_n_u16(
10512	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10513	// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10514	// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
10515	// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
// Test driver: returns vqrshrn_n_u16(a, 1) unchanged. In the expectations
// above the source-level immediate 1 appears as an <8 x i16> splat of -1, the
// second operand of the call to @llvm.arm.neon.vqrshiftnu.v8i8.
10516	uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
10517	return vqrshrn_n_u16(a, 1);
10518	}
10519
10520	// CHECK-LABEL: @test_vqrshrn_n_u32(
10521	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10522	// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10523	// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
10524	// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
// Test driver: returns vqrshrn_n_u32(a, 1) unchanged. In the expectations
// above the source-level immediate 1 appears as a <4 x i32> splat of -1, the
// second operand of the call to @llvm.arm.neon.vqrshiftnu.v4i16.
10525	uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
10526	return vqrshrn_n_u32(a, 1);
10527	}
10528
10529	// CHECK-LABEL: @test_vqrshrn_n_u64(
10530	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10531	// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10532	// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
10533	// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
// Test driver: returns vqrshrn_n_u64(a, 1) unchanged. In the expectations
// above the source-level immediate 1 appears as a <2 x i64> splat of -1, the
// second operand of the call to @llvm.arm.neon.vqrshiftnu.v2i32.
10534	uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
10535	return vqrshrn_n_u64(a, 1);
10536	}
10537
10538	// CHECK-LABEL: @test_vqrshrun_n_s16(
10539	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10540	// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10541	// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
10542	// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
// Test driver: returns vqrshrun_n_s16(a, 1) unchanged; signed input, unsigned
// result type. In the expectations above the immediate 1 appears as an
// <8 x i16> splat of -1 passed to @llvm.arm.neon.vqrshiftnsu.v8i8.
10543	uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
10544	return vqrshrun_n_s16(a, 1);
10545	}
10546
10547	// CHECK-LABEL: @test_vqrshrun_n_s32(
10548	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10549	// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10550	// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
10551	// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
// Test driver: returns vqrshrun_n_s32(a, 1) unchanged; signed input, unsigned
// result type. In the expectations above the immediate 1 appears as a
// <4 x i32> splat of -1 passed to @llvm.arm.neon.vqrshiftnsu.v4i16.
10552	uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
10553	return vqrshrun_n_s32(a, 1);
10554	}
10555
10556	// CHECK-LABEL: @test_vqrshrun_n_s64(
10557	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10558	// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10559	// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
10560	// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
// Test driver: returns vqrshrun_n_s64(a, 1) unchanged; signed input, unsigned
// result type. In the expectations above the immediate 1 appears as a
// <2 x i64> splat of -1 passed to @llvm.arm.neon.vqrshiftnsu.v2i32.
10561	uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
10562	return vqrshrun_n_s64(a, 1);
10563	}
10564
10565	// CHECK-LABEL: @test_vqshl_s8(
10566	// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
10567	// CHECK: ret <8 x i8> [[VQSHL_V_I]]
// Test driver: returns vqshl_s8(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v8i8 on %a and %b.
10568	int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
10569	return vqshl_s8(a, b);
10570	}
10571
10572	// CHECK-LABEL: @test_vqshl_s16(
10573	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10574	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
10575	// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
10576	// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
10577	// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
// Test driver: returns vqshl_s16(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v4i16 on %a and %b.
10578	int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
10579	return vqshl_s16(a, b);
10580	}
10581
10582	// CHECK-LABEL: @test_vqshl_s32(
10583	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10584	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
10585	// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
10586	// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
10587	// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
// Test driver: returns vqshl_s32(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v2i32 on %a and %b.
10588	int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
10589	return vqshl_s32(a, b);
10590	}
10591
10592	// CHECK-LABEL: @test_vqshl_s64(
10593	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10594	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
10595	// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
10596	// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
10597	// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
// Test driver: returns vqshl_s64(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v1i64 on %a and %b.
10598	int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
10599	return vqshl_s64(a, b);
10600	}
10601
10602	// CHECK-LABEL: @test_vqshl_u8(
10603	// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
10604	// CHECK: ret <8 x i8> [[VQSHL_V_I]]
// Test driver: returns vqshl_u8(a, b) unchanged; b is declared as the signed
// int8x8_t even though a is unsigned. The expectations above require a call to
// @llvm.arm.neon.vqshiftu.v8i8 on %a and %b.
10605	uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
10606	return vqshl_u8(a, b);
10607	}
10608
10609	// CHECK-LABEL: @test_vqshl_u16(
10610	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10611	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
10612	// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
10613	// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
10614	// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
// Test driver: returns vqshl_u16(a, b) unchanged; b is declared as the signed
// int16x4_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqshiftu.v4i16 on %a and %b.
10615	uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
10616	return vqshl_u16(a, b);
10617	}
10618
10619	// CHECK-LABEL: @test_vqshl_u32(
10620	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10621	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
10622	// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
10623	// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
10624	// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
// Test driver: returns vqshl_u32(a, b) unchanged; b is declared as the signed
// int32x2_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqshiftu.v2i32 on %a and %b.
10625	uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
10626	return vqshl_u32(a, b);
10627	}
10628
10629	// CHECK-LABEL: @test_vqshl_u64(
10630	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10631	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
10632	// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
10633	// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
10634	// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
// Test driver: returns vqshl_u64(a, b) unchanged; b is declared as the signed
// int64x1_t even though a is unsigned. The expectations above require a call
// to @llvm.arm.neon.vqshiftu.v1i64 on %a and %b.
10635	uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
10636	return vqshl_u64(a, b);
10637	}
10638
10639	// CHECK-LABEL: @test_vqshlq_s8(
10640	// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
10641	// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
// Test driver: returns vqshlq_s8(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v16i8 on %a and %b.
10642	int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
10643	return vqshlq_s8(a, b);
10644	}
10645
10646	// CHECK-LABEL: @test_vqshlq_s16(
10647	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10648	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
10649	// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
10650	// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
10651	// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
// Test driver: returns vqshlq_s16(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v8i16 on %a and %b.
10652	int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
10653	return vqshlq_s16(a, b);
10654	}
10655
10656	// CHECK-LABEL: @test_vqshlq_s32(
10657	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10658	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
10659	// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
10660	// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
10661	// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
// Test driver: returns vqshlq_s32(a, b) unchanged. The expectations above
// require a call to @llvm.arm.neon.vqshifts.v4i32 on %a and %b.
10662	int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
10663	return vqshlq_s32(a, b);
10664	}
10665
10666	// CHECK-LABEL: @test_vqshlq_s64(
10667	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10668	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
10669	// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
10670	// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
10671	// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqshlq_s64 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqshifts.v2i64.
10672	int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
10673	return vqshlq_s64(a, b);
10674	}
10675
10676	// CHECK-LABEL: @test_vqshlq_u8(
10677	// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
10678	// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
// Codegen test wrapper: forwards unsigned a and the signed shift vector b to vqshlq_u8.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqshiftu.v16i8.
10679	uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
10680	return vqshlq_u8(a, b);
10681	}
10682
10683	// CHECK-LABEL: @test_vqshlq_u16(
10684	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10685	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
10686	// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
10687	// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
10688	// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
// Codegen test wrapper: forwards unsigned a and the signed shift vector b to vqshlq_u16.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqshiftu.v8i16.
10689	uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
10690	return vqshlq_u16(a, b);
10691	}
10692
10693	// CHECK-LABEL: @test_vqshlq_u32(
10694	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10695	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
10696	// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
10697	// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
10698	// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
// Codegen test wrapper: forwards unsigned a and the signed shift vector b to vqshlq_u32.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqshiftu.v4i32.
10699	uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
10700	return vqshlq_u32(a, b);
10701	}
10702
10703	// CHECK-LABEL: @test_vqshlq_u64(
10704	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10705	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
10706	// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
10707	// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
10708	// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
// Codegen test wrapper: forwards unsigned a and the signed shift vector b to vqshlq_u64.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqshiftu.v2i64.
10709	uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
10710	return vqshlq_u64(a, b);
10711	}
10712
10713	// CHECK-LABEL: @test_vqshlu_n_s8(
10714	// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
10715	// CHECK: ret <8 x i8> [[VQSHLU_N]]
// Codegen test wrapper: calls vqshlu_n_s8 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v8i8 with an all-ones (i8 1) shift vector.
10716	uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
10717	return vqshlu_n_s8(a, 1);
10718	}
10719
10720	// CHECK-LABEL: @test_vqshlu_n_s16(
10721	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10722	// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
10723	// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
10724	// CHECK: ret <4 x i16> [[VQSHLU_N1]]
// Codegen test wrapper: calls vqshlu_n_s16 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v4i16 with a shift vector of i16 1 lanes.
10725	uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
10726	return vqshlu_n_s16(a, 1);
10727	}
10728
10729	// CHECK-LABEL: @test_vqshlu_n_s32(
10730	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10731	// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
10732	// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
10733	// CHECK: ret <2 x i32> [[VQSHLU_N1]]
// Codegen test wrapper: calls vqshlu_n_s32 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v2i32 with a shift vector of i32 1 lanes.
10734	uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
10735	return vqshlu_n_s32(a, 1);
10736	}
10737
10738	// CHECK-LABEL: @test_vqshlu_n_s64(
10739	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10740	// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
10741	// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
10742	// CHECK: ret <1 x i64> [[VQSHLU_N1]]
// Codegen test wrapper: calls vqshlu_n_s64 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v1i64 with a shift vector of <i64 1>.
10743	uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
10744	return vqshlu_n_s64(a, 1);
10745	}
10746
10747	// CHECK-LABEL: @test_vqshluq_n_s8(
10748	// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
10749	// CHECK: ret <16 x i8> [[VQSHLU_N]]
// Codegen test wrapper: calls vqshluq_n_s8 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v16i8 with a shift vector of i8 1 lanes.
10750	uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
10751	return vqshluq_n_s8(a, 1);
10752	}
10753
10754	// CHECK-LABEL: @test_vqshluq_n_s16(
10755	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10756	// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10757	// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
10758	// CHECK: ret <8 x i16> [[VQSHLU_N1]]
// Codegen test wrapper: calls vqshluq_n_s16 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v8i16 with a shift vector of i16 1 lanes.
10759	uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
10760	return vqshluq_n_s16(a, 1);
10761	}
10762
10763	// CHECK-LABEL: @test_vqshluq_n_s32(
10764	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10765	// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10766	// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
10767	// CHECK: ret <4 x i32> [[VQSHLU_N1]]
// Codegen test wrapper: calls vqshluq_n_s32 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v4i32 with a shift vector of i32 1 lanes.
10768	uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
10769	return vqshluq_n_s32(a, 1);
10770	}
10771
10772	// CHECK-LABEL: @test_vqshluq_n_s64(
10773	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10774	// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10775	// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
10776	// CHECK: ret <2 x i64> [[VQSHLU_N1]]
// Codegen test wrapper: calls vqshluq_n_s64 on signed a with the constant shift 1; the result type is unsigned.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftsu.v2i64 with a shift vector of i64 1 lanes.
10777	uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
10778	return vqshluq_n_s64(a, 1);
10779	}
10780
10781	// CHECK-LABEL: @test_vqshl_n_s8(
10782	// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
10783	// CHECK: ret <8 x i8> [[VQSHL_N]]
// Codegen test wrapper: calls vqshl_n_s8 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v8i8 with a shift vector of i8 1 lanes.
10784	int8x8_t test_vqshl_n_s8(int8x8_t a) {
10785	return vqshl_n_s8(a, 1);
10786	}
10787
10788	// CHECK-LABEL: @test_vqshl_n_s16(
10789	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10790	// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
10791	// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
10792	// CHECK: ret <4 x i16> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshl_n_s16 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v4i16 with a shift vector of i16 1 lanes.
10793	int16x4_t test_vqshl_n_s16(int16x4_t a) {
10794	return vqshl_n_s16(a, 1);
10795	}
10796
10797	// CHECK-LABEL: @test_vqshl_n_s32(
10798	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10799	// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
10800	// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
10801	// CHECK: ret <2 x i32> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshl_n_s32 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v2i32 with a shift vector of i32 1 lanes.
10802	int32x2_t test_vqshl_n_s32(int32x2_t a) {
10803	return vqshl_n_s32(a, 1);
10804	}
10805
10806	// CHECK-LABEL: @test_vqshl_n_s64(
10807	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10808	// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
10809	// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
10810	// CHECK: ret <1 x i64> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshl_n_s64 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v1i64 with a shift vector of <i64 1>.
10811	int64x1_t test_vqshl_n_s64(int64x1_t a) {
10812	return vqshl_n_s64(a, 1);
10813	}
10814
10815	// CHECK-LABEL: @test_vqshl_n_u8(
10816	// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
10817	// CHECK: ret <8 x i8> [[VQSHL_N]]
// Codegen test wrapper: calls vqshl_n_u8 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v8i8 with a shift vector of i8 1 lanes.
10818	uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
10819	return vqshl_n_u8(a, 1);
10820	}
10821
10822	// CHECK-LABEL: @test_vqshl_n_u16(
10823	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
10824	// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
10825	// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
10826	// CHECK: ret <4 x i16> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshl_n_u16 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v4i16 with a shift vector of i16 1 lanes.
10827	uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
10828	return vqshl_n_u16(a, 1);
10829	}
10830
10831	// CHECK-LABEL: @test_vqshl_n_u32(
10832	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
10833	// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
10834	// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
10835	// CHECK: ret <2 x i32> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshl_n_u32 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v2i32 with a shift vector of i32 1 lanes.
10836	uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
10837	return vqshl_n_u32(a, 1);
10838	}
10839
10840	// CHECK-LABEL: @test_vqshl_n_u64(
10841	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
10842	// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
10843	// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
10844	// CHECK: ret <1 x i64> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshl_n_u64 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v1i64 with a shift vector of <i64 1>.
10845	uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
10846	return vqshl_n_u64(a, 1);
10847	}
10848
10849	// CHECK-LABEL: @test_vqshlq_n_s8(
10850	// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
10851	// CHECK: ret <16 x i8> [[VQSHL_N]]
// Codegen test wrapper: calls vqshlq_n_s8 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v16i8 with a shift vector of i8 1 lanes.
10852	int8x16_t test_vqshlq_n_s8(int8x16_t a) {
10853	return vqshlq_n_s8(a, 1);
10854	}
10855
10856	// CHECK-LABEL: @test_vqshlq_n_s16(
10857	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10858	// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10859	// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
10860	// CHECK: ret <8 x i16> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshlq_n_s16 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v8i16 with a shift vector of i16 1 lanes.
10861	int16x8_t test_vqshlq_n_s16(int16x8_t a) {
10862	return vqshlq_n_s16(a, 1);
10863	}
10864
10865	// CHECK-LABEL: @test_vqshlq_n_s32(
10866	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10867	// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10868	// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
10869	// CHECK: ret <4 x i32> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshlq_n_s32 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v4i32 with a shift vector of i32 1 lanes.
10870	int32x4_t test_vqshlq_n_s32(int32x4_t a) {
10871	return vqshlq_n_s32(a, 1);
10872	}
10873
10874	// CHECK-LABEL: @test_vqshlq_n_s64(
10875	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10876	// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10877	// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
10878	// CHECK: ret <2 x i64> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshlq_n_s64 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshifts.v2i64 with a shift vector of i64 1 lanes.
10879	int64x2_t test_vqshlq_n_s64(int64x2_t a) {
10880	return vqshlq_n_s64(a, 1);
10881	}
10882
10883	// CHECK-LABEL: @test_vqshlq_n_u8(
10884	// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
10885	// CHECK: ret <16 x i8> [[VQSHL_N]]
// Codegen test wrapper: calls vqshlq_n_u8 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v16i8 with a shift vector of i8 1 lanes.
10886	uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
10887	return vqshlq_n_u8(a, 1);
10888	}
10889
10890	// CHECK-LABEL: @test_vqshlq_n_u16(
10891	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10892	// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10893	// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
10894	// CHECK: ret <8 x i16> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshlq_n_u16 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v8i16 with a shift vector of i16 1 lanes.
10895	uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
10896	return vqshlq_n_u16(a, 1);
10897	}
10898
10899	// CHECK-LABEL: @test_vqshlq_n_u32(
10900	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10901	// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10902	// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
10903	// CHECK: ret <4 x i32> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshlq_n_u32 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v4i32 with a shift vector of i32 1 lanes.
10904	uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
10905	return vqshlq_n_u32(a, 1);
10906	}
10907
10908	// CHECK-LABEL: @test_vqshlq_n_u64(
10909	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10910	// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10911	// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
10912	// CHECK: ret <2 x i64> [[VQSHL_N1]]
// Codegen test wrapper: calls vqshlq_n_u64 on a with the constant shift 1 and returns the result.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftu.v2i64 with a shift vector of i64 1 lanes.
10913	uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
10914	return vqshlq_n_u64(a, 1);
10915	}
10916
10917	// CHECK-LABEL: @test_vqshrn_n_s16(
10918	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10919	// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10920	// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
10921	// CHECK: ret <8 x i8> [[VQSHRN_N1]]
// Codegen test wrapper: calls vqshrn_n_s16 on a with the constant shift 1; int16x8_t in, int8x8_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftns.v8i8, where the shift of 1
// appears as a vector of i16 -1 lanes.
10922	int8x8_t test_vqshrn_n_s16(int16x8_t a) {
10923	return vqshrn_n_s16(a, 1);
10924	}
10925
10926	// CHECK-LABEL: @test_vqshrn_n_s32(
10927	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10928	// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10929	// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
10930	// CHECK: ret <4 x i16> [[VQSHRN_N1]]
// Codegen test wrapper: calls vqshrn_n_s32 on a with the constant shift 1; int32x4_t in, int16x4_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftns.v4i16, where the shift of 1
// appears as a vector of i32 -1 lanes.
10931	int16x4_t test_vqshrn_n_s32(int32x4_t a) {
10932	return vqshrn_n_s32(a, 1);
10933	}
10934
10935	// CHECK-LABEL: @test_vqshrn_n_s64(
10936	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10937	// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10938	// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
10939	// CHECK: ret <2 x i32> [[VQSHRN_N1]]
// Codegen test wrapper: calls vqshrn_n_s64 on a with the constant shift 1; int64x2_t in, int32x2_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftns.v2i32, where the shift of 1
// appears as a vector of i64 -1 lanes.
10940	int32x2_t test_vqshrn_n_s64(int64x2_t a) {
10941	return vqshrn_n_s64(a, 1);
10942	}
10943
10944	// CHECK-LABEL: @test_vqshrn_n_u16(
10945	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10946	// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10947	// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
10948	// CHECK: ret <8 x i8> [[VQSHRN_N1]]
// Codegen test wrapper: calls vqshrn_n_u16 on a with the constant shift 1; uint16x8_t in, uint8x8_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftnu.v8i8, where the shift of 1
// appears as a vector of i16 -1 lanes.
10949	uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
10950	return vqshrn_n_u16(a, 1);
10951	}
10952
10953	// CHECK-LABEL: @test_vqshrn_n_u32(
10954	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10955	// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10956	// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
10957	// CHECK: ret <4 x i16> [[VQSHRN_N1]]
// Codegen test wrapper: calls vqshrn_n_u32 on a with the constant shift 1; uint32x4_t in, uint16x4_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftnu.v4i16, where the shift of 1
// appears as a vector of i32 -1 lanes.
10958	uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
10959	return vqshrn_n_u32(a, 1);
10960	}
10961
10962	// CHECK-LABEL: @test_vqshrn_n_u64(
10963	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10964	// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10965	// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
10966	// CHECK: ret <2 x i32> [[VQSHRN_N1]]
// Codegen test wrapper: calls vqshrn_n_u64 on a with the constant shift 1; uint64x2_t in, uint32x2_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftnu.v2i32, where the shift of 1
// appears as a vector of i64 -1 lanes.
10967	uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
10968	return vqshrn_n_u64(a, 1);
10969	}
10970
10971	// CHECK-LABEL: @test_vqshrun_n_s16(
10972	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
10973	// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
10974	// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
10975	// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
// Codegen test wrapper: calls vqshrun_n_s16 on signed a with the constant shift 1; int16x8_t in, uint8x8_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftnsu.v8i8, where the shift of 1
// appears as a vector of i16 -1 lanes.
10976	uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
10977	return vqshrun_n_s16(a, 1);
10978	}
10979
10980	// CHECK-LABEL: @test_vqshrun_n_s32(
10981	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
10982	// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
10983	// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
10984	// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
// Codegen test wrapper: calls vqshrun_n_s32 on signed a with the constant shift 1; int32x4_t in, uint16x4_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftnsu.v4i16, where the shift of 1
// appears as a vector of i32 -1 lanes.
10985	uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
10986	return vqshrun_n_s32(a, 1);
10987	}
10988
10989	// CHECK-LABEL: @test_vqshrun_n_s64(
10990	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
10991	// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
10992	// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
10993	// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
// Codegen test wrapper: calls vqshrun_n_s64 on signed a with the constant shift 1; int64x2_t in, uint32x2_t out.
// The FileCheck expectations above look for @llvm.arm.neon.vqshiftnsu.v2i32, where the shift of 1
// appears as a vector of i64 -1 lanes.
10994	uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
10995	return vqshrun_n_s64(a, 1);
10996	}
10997
10998	// CHECK-LABEL: @test_vqsub_s8(
10999	// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b)
11000	// CHECK: ret <8 x i8> [[VQSUB_V_I]]
// Codegen test wrapper: forwards a and b to vqsub_s8 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v8i8.
11001	int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
11002	return vqsub_s8(a, b);
11003	}
11004
11005	// CHECK-LABEL: @test_vqsub_s16(
11006	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11007	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11008	// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x i16> %b)
11009	// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
11010	// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
// Codegen test wrapper: forwards a and b to vqsub_s16 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v4i16.
11011	int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
11012	return vqsub_s16(a, b);
11013	}
11014
11015	// CHECK-LABEL: @test_vqsub_s32(
11016	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11017	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11018	// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> %b)
11019	// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
11020	// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
// Codegen test wrapper: forwards a and b to vqsub_s32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v2i32.
11021	int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
11022	return vqsub_s32(a, b);
11023	}
11024
11025	// CHECK-LABEL: @test_vqsub_s64(
11026	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11027	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
11028	// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %a, <1 x i64> %b)
11029	// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
11030	// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
// Codegen test wrapper: forwards a and b to vqsub_s64 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v1i64.
11031	int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
11032	return vqsub_s64(a, b);
11033	}
11034
11035	// CHECK-LABEL: @test_vqsub_u8(
11036	// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b)
11037	// CHECK: ret <8 x i8> [[VQSUB_V_I]]
// Codegen test wrapper: forwards a and b to vqsub_u8 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v8i8.
11038	uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
11039	return vqsub_u8(a, b);
11040	}
11041
11042	// CHECK-LABEL: @test_vqsub_u16(
11043	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11044	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11045	// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %a, <4 x i16> %b)
11046	// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
11047	// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
// Codegen test wrapper: forwards a and b to vqsub_u16 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v4i16.
11048	uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
11049	return vqsub_u16(a, b);
11050	}
11051
11052	// CHECK-LABEL: @test_vqsub_u32(
11053	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11054	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11055	// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %a, <2 x i32> %b)
11056	// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
11057	// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
// Codegen test wrapper: forwards a and b to vqsub_u32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v2i32.
11058	uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
11059	return vqsub_u32(a, b);
11060	}
11061
11062	// CHECK-LABEL: @test_vqsub_u64(
11063	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11064	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
11065	// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %a, <1 x i64> %b)
11066	// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
11067	// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
// Codegen test wrapper: forwards a and b to vqsub_u64 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v1i64.
11068	uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
11069	return vqsub_u64(a, b);
11070	}
11071
11072	// CHECK-LABEL: @test_vqsubq_s8(
11073	// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b)
11074	// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
// Codegen test wrapper: forwards a and b to vqsubq_s8 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v16i8.
11075	int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
11076	return vqsubq_s8(a, b);
11077	}
11078
11079	// CHECK-LABEL: @test_vqsubq_s16(
11080	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11081	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
11082	// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> %b)
11083	// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
11084	// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqsubq_s16 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v8i16.
11085	int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
11086	return vqsubq_s16(a, b);
11087	}
11088
11089	// CHECK-LABEL: @test_vqsubq_s32(
11090	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11091	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
11092	// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %b)
11093	// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
11094	// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqsubq_s32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v4i32.
11095	int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
11096	return vqsubq_s32(a, b);
11097	}
11098
11099	// CHECK-LABEL: @test_vqsubq_s64(
11100	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11101	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
11102	// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %b)
11103	// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
11104	// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqsubq_s64 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubs.v2i64.
11105	int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
11106	return vqsubq_s64(a, b);
11107	}
11108
11109	// CHECK-LABEL: @test_vqsubq_u8(
11110	// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b)
11111	// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
// Codegen test wrapper: forwards a and b to vqsubq_u8 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v16i8.
11112	uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
11113	return vqsubq_u8(a, b);
11114	}
11115
11116	// CHECK-LABEL: @test_vqsubq_u16(
11117	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11118	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
11119	// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %a, <8 x i16> %b)
11120	// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
11121	// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqsubq_u16 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v8i16.
11122	uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
11123	return vqsubq_u16(a, b);
11124	}
11125
11126	// CHECK-LABEL: @test_vqsubq_u32(
11127	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11128	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
11129	// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %a, <4 x i32> %b)
11130	// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
11131	// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqsubq_u32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v4i32.
11132	uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
11133	return vqsubq_u32(a, b);
11134	}
11135
11136	// CHECK-LABEL: @test_vqsubq_u64(
11137	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11138	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
11139	// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %a, <2 x i64> %b)
11140	// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
11141	// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
// Codegen test wrapper: forwards a and b to vqsubq_u64 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vqsubu.v2i64.
11142	uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
11143	return vqsubq_u64(a, b);
11144	}
11145
11146	// CHECK-LABEL: @test_vraddhn_s16(
11147	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11148	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
11149	// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
11150	// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
// Codegen test wrapper: forwards a and b to vraddhn_s16; int16x8_t inputs, int8x8_t result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vraddhn.v8i8.
11151	int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
11152	return vraddhn_s16(a, b);
11153	}
11154
11155	// CHECK-LABEL: @test_vraddhn_s32(
11156	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11157	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
11158	// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
11159	// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
11160	// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
// Codegen test wrapper: forwards a and b to vraddhn_s32; int32x4_t inputs, int16x4_t result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vraddhn.v4i16.
11161	int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
11162	return vraddhn_s32(a, b);
11163	}
11164
11165	// CHECK-LABEL: @test_vraddhn_s64(
11166	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11167	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
11168	// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
11169	// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
11170	// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
// Codegen test wrapper: forwards a and b to vraddhn_s64; int64x2_t inputs, int32x2_t result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vraddhn.v2i32.
11171	int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
11172	return vraddhn_s64(a, b);
11173	}
11174
11175	// CHECK-LABEL: @test_vraddhn_u16(
11176	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11177	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
11178	// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
11179	// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
// Codegen test wrapper: forwards a and b to vraddhn_u16; uint16x8_t inputs, uint8x8_t result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vraddhn.v8i8.
11180	uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
11181	return vraddhn_u16(a, b);
11182	}
11183
11184	// CHECK-LABEL: @test_vraddhn_u32(
11185	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11186	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
11187	// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
11188	// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
11189	// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
// Codegen test wrapper: forwards a and b to vraddhn_u32; uint32x4_t inputs, uint16x4_t result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vraddhn.v4i16.
11190	uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
11191	return vraddhn_u32(a, b);
11192	}
11193
11194	// CHECK-LABEL: @test_vraddhn_u64(
11195	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11196	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
11197	// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
11198	// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
11199	// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
// Codegen test wrapper: forwards a and b to vraddhn_u64; uint64x2_t inputs, uint32x2_t result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vraddhn.v2i32.
11200	uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
11201	return vraddhn_u64(a, b);
11202	}
11203
11204	// CHECK-LABEL: @test_vrecpe_f32(
11205	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
11206	// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a)
11207	// CHECK: ret <2 x float> [[VRECPE_V1_I]]
// Codegen test wrapper: forwards a to vrecpe_f32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vrecpe.v2f32.
11208	float32x2_t test_vrecpe_f32(float32x2_t a) {
11209	return vrecpe_f32(a);
11210	}
11211
11212	// CHECK-LABEL: @test_vrecpe_u32(
11213	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11214	// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a)
11215	// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
// Codegen test wrapper: forwards a to vrecpe_u32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vrecpe.v2i32.
11216	uint32x2_t test_vrecpe_u32(uint32x2_t a) {
11217	return vrecpe_u32(a);
11218	}
11219
11220	// CHECK-LABEL: @test_vrecpeq_f32(
11221	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
11222	// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a)
11223	// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
// Codegen test wrapper: forwards a to vrecpeq_f32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vrecpe.v4f32.
11224	float32x4_t test_vrecpeq_f32(float32x4_t a) {
11225	return vrecpeq_f32(a);
11226	}
11227
11228	// CHECK-LABEL: @test_vrecpeq_u32(
11229	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11230	// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a)
11231	// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
// Codegen test wrapper: forwards a to vrecpeq_u32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vrecpe.v4i32.
11232	uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
11233	return vrecpeq_u32(a);
11234	}
11235
11236	// CHECK-LABEL: @test_vrecps_f32(
11237	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
11238	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
11239	// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b)
11240	// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
11241	// CHECK: ret <2 x float> [[VRECPS_V2_I]]
// Codegen test wrapper: forwards a and b to vrecps_f32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vrecps.v2f32.
11242	float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
11243	return vrecps_f32(a, b);
11244	}
11245
11246	// CHECK-LABEL: @test_vrecpsq_f32(
11247	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
11248	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
11249	// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b)
11250	// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
11251	// CHECK: ret <4 x float> [[VRECPSQ_V2_I]]
// Codegen test wrapper: forwards a and b to vrecpsq_f32 and returns the result.
// The FileCheck expectations above look for a call to @llvm.arm.neon.vrecps.v4f32.
11252	float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
11253	return vrecpsq_f32(a, b);
11254	}
11255
11256	// CHECK-LABEL: @test_vreinterpret_s8_s16(
11257	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11258	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_s16(a), viewing an int16x4_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <4 x i16> to <8 x i8>.
11259	int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
11260	return vreinterpret_s8_s16(a);
11261	}
11262
11263	// CHECK-LABEL: @test_vreinterpret_s8_s32(
11264	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11265	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_s32(a), viewing an int32x2_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <2 x i32> to <8 x i8>.
11266	int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
11267	return vreinterpret_s8_s32(a);
11268	}
11269
11270	// CHECK-LABEL: @test_vreinterpret_s8_s64(
11271	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11272	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_s64(a), viewing an int64x1_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <1 x i64> to <8 x i8>.
11273	int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
11274	return vreinterpret_s8_s64(a);
11275	}
11276
11277	// CHECK-LABEL: @test_vreinterpret_s8_u8(
11278	// CHECK: ret <8 x i8> %a
// Codegen test wrapper: returns vreinterpret_s8_u8(a), viewing a uint8x8_t as int8x8_t.
// The FileCheck expectations above expect %a to be returned directly, with no bitcast.
11279	int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
11280	return vreinterpret_s8_u8(a);
11281	}
11282
11283	// CHECK-LABEL: @test_vreinterpret_s8_u16(
11284	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11285	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_u16(a), viewing a uint16x4_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <4 x i16> to <8 x i8>.
11286	int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
11287	return vreinterpret_s8_u16(a);
11288	}
11289
11290	// CHECK-LABEL: @test_vreinterpret_s8_u32(
11291	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11292	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_u32(a), viewing a uint32x2_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <2 x i32> to <8 x i8>.
11293	int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
11294	return vreinterpret_s8_u32(a);
11295	}
11296
11297	// CHECK-LABEL: @test_vreinterpret_s8_u64(
11298	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11299	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_u64(a), viewing a uint64x1_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <1 x i64> to <8 x i8>.
11300	int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
11301	return vreinterpret_s8_u64(a);
11302	}
11303
11304	// CHECK-LABEL: @test_vreinterpret_s8_f16(
11305	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
11306	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_f16(a), viewing a float16x4_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <4 x half> to <8 x i8>.
11307	int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
11308	return vreinterpret_s8_f16(a);
11309	}
11310
11311	// CHECK-LABEL: @test_vreinterpret_s8_f32(
11312	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
11313	// CHECK: ret <8 x i8> [[TMP0]]
// Codegen test wrapper: returns vreinterpret_s8_f32(a), viewing a float32x2_t as int8x8_t.
// The FileCheck expectations above look for a single bitcast from <2 x float> to <8 x i8>.
11314	int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
11315	return vreinterpret_s8_f32(a);
11316	}
11317
11318	// CHECK-LABEL: @test_vreinterpret_s8_p8(
11319	// CHECK: ret <8 x i8> %a
// Codegen test wrapper: returns vreinterpret_s8_p8(a), viewing a poly8x8_t as int8x8_t.
// The FileCheck expectations above expect %a to be returned directly, with no bitcast.
11320	int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
11321	return vreinterpret_s8_p8(a);
11322	}
11323
11324	// CHECK-LABEL: @test_vreinterpret_s8_p16(
11325	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11326	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_s8_p16 (poly16x4_t -> int8x8_t); the CHECK lines above expect a bitcast of %a.
11327	int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
11328	return vreinterpret_s8_p16(a);
11329	}
11330
11331	// CHECK-LABEL: @test_vreinterpret_s16_s8(
11332	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
11333	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_s8 (int8x8_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11334	int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
11335	return vreinterpret_s16_s8(a);
11336	}
11337
11338	// CHECK-LABEL: @test_vreinterpret_s16_s32(
11339	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
11340	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_s32 (int32x2_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11341	int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
11342	return vreinterpret_s16_s32(a);
11343	}
11344
11345	// CHECK-LABEL: @test_vreinterpret_s16_s64(
11346	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
11347	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_s64 (int64x1_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11348	int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
11349	return vreinterpret_s16_s64(a);
11350	}
11351
11352	// CHECK-LABEL: @test_vreinterpret_s16_u8(
11353	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
11354	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_u8 (uint8x8_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11355	int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
11356	return vreinterpret_s16_u8(a);
11357	}
11358
11359	// CHECK-LABEL: @test_vreinterpret_s16_u16(
11360	// CHECK: ret <4 x i16> %a
// Wraps vreinterpret_s16_u16 (uint16x4_t -> int16x4_t); the CHECK line above expects %a returned directly.
11361	int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
11362	return vreinterpret_s16_u16(a);
11363	}
11364
11365	// CHECK-LABEL: @test_vreinterpret_s16_u32(
11366	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
11367	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_u32 (uint32x2_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11368	int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
11369	return vreinterpret_s16_u32(a);
11370	}
11371
11372	// CHECK-LABEL: @test_vreinterpret_s16_u64(
11373	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
11374	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_u64 (uint64x1_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11375	int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
11376	return vreinterpret_s16_u64(a);
11377	}
11378
11379	// CHECK-LABEL: @test_vreinterpret_s16_f16(
11380	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
11381	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_f16 (float16x4_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11382	int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
11383	return vreinterpret_s16_f16(a);
11384	}
11385
11386	// CHECK-LABEL: @test_vreinterpret_s16_f32(
11387	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
11388	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_f32 (float32x2_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11389	int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
11390	return vreinterpret_s16_f32(a);
11391	}
11392
11393	// CHECK-LABEL: @test_vreinterpret_s16_p8(
11394	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
11395	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_s16_p8 (poly8x8_t -> int16x4_t); the CHECK lines above expect a bitcast of %a.
11396	int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
11397	return vreinterpret_s16_p8(a);
11398	}
11399
11400	// CHECK-LABEL: @test_vreinterpret_s16_p16(
11401	// CHECK: ret <4 x i16> %a
// Wraps vreinterpret_s16_p16 (poly16x4_t -> int16x4_t); the CHECK line above expects %a returned directly.
11402	int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
11403	return vreinterpret_s16_p16(a);
11404	}
11405
11406	// CHECK-LABEL: @test_vreinterpret_s32_s8(
11407	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
11408	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_s8 (int8x8_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11409	int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
11410	return vreinterpret_s32_s8(a);
11411	}
11412
11413	// CHECK-LABEL: @test_vreinterpret_s32_s16(
11414	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
11415	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_s16 (int16x4_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11416	int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
11417	return vreinterpret_s32_s16(a);
11418	}
11419
11420	// CHECK-LABEL: @test_vreinterpret_s32_s64(
11421	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
11422	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_s64 (int64x1_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11423	int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
11424	return vreinterpret_s32_s64(a);
11425	}
11426
11427	// CHECK-LABEL: @test_vreinterpret_s32_u8(
11428	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
11429	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_u8 (uint8x8_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11430	int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
11431	return vreinterpret_s32_u8(a);
11432	}
11433
11434	// CHECK-LABEL: @test_vreinterpret_s32_u16(
11435	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
11436	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_u16 (uint16x4_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11437	int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
11438	return vreinterpret_s32_u16(a);
11439	}
11440
11441	// CHECK-LABEL: @test_vreinterpret_s32_u32(
11442	// CHECK: ret <2 x i32> %a
// Wraps vreinterpret_s32_u32 (uint32x2_t -> int32x2_t); the CHECK line above expects %a returned directly.
11443	int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
11444	return vreinterpret_s32_u32(a);
11445	}
11446
11447	// CHECK-LABEL: @test_vreinterpret_s32_u64(
11448	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
11449	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_u64 (uint64x1_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11450	int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
11451	return vreinterpret_s32_u64(a);
11452	}
11453
11454	// CHECK-LABEL: @test_vreinterpret_s32_f16(
11455	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
11456	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_f16 (float16x4_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11457	int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
11458	return vreinterpret_s32_f16(a);
11459	}
11460
11461	// CHECK-LABEL: @test_vreinterpret_s32_f32(
11462	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
11463	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_f32 (float32x2_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11464	int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
11465	return vreinterpret_s32_f32(a);
11466	}
11467
11468	// CHECK-LABEL: @test_vreinterpret_s32_p8(
11469	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
11470	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_p8 (poly8x8_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11471	int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
11472	return vreinterpret_s32_p8(a);
11473	}
11474
11475	// CHECK-LABEL: @test_vreinterpret_s32_p16(
11476	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
11477	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_s32_p16 (poly16x4_t -> int32x2_t); the CHECK lines above expect a bitcast of %a.
11478	int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
11479	return vreinterpret_s32_p16(a);
11480	}
11481
11482	// CHECK-LABEL: @test_vreinterpret_s64_s8(
11483	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
11484	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_s8 (int8x8_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11485	int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
11486	return vreinterpret_s64_s8(a);
11487	}
11488
11489	// CHECK-LABEL: @test_vreinterpret_s64_s16(
11490	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
11491	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_s16 (int16x4_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11492	int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
11493	return vreinterpret_s64_s16(a);
11494	}
11495
11496	// CHECK-LABEL: @test_vreinterpret_s64_s32(
11497	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
11498	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_s32 (int32x2_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11499	int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
11500	return vreinterpret_s64_s32(a);
11501	}
11502
11503	// CHECK-LABEL: @test_vreinterpret_s64_u8(
11504	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
11505	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_u8 (uint8x8_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11506	int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
11507	return vreinterpret_s64_u8(a);
11508	}
11509
11510	// CHECK-LABEL: @test_vreinterpret_s64_u16(
11511	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
11512	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_u16 (uint16x4_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11513	int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
11514	return vreinterpret_s64_u16(a);
11515	}
11516
11517	// CHECK-LABEL: @test_vreinterpret_s64_u32(
11518	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
11519	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_u32 (uint32x2_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11520	int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
11521	return vreinterpret_s64_u32(a);
11522	}
11523
11524	// CHECK-LABEL: @test_vreinterpret_s64_u64(
11525	// CHECK: ret <1 x i64> %a
// Wraps vreinterpret_s64_u64 (uint64x1_t -> int64x1_t); the CHECK line above expects %a returned directly.
11526	int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
11527	return vreinterpret_s64_u64(a);
11528	}
11529
11530	// CHECK-LABEL: @test_vreinterpret_s64_f16(
11531	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
11532	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_f16 (float16x4_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11533	int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
11534	return vreinterpret_s64_f16(a);
11535	}
11536
11537	// CHECK-LABEL: @test_vreinterpret_s64_f32(
11538	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
11539	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_f32 (float32x2_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11540	int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
11541	return vreinterpret_s64_f32(a);
11542	}
11543
11544	// CHECK-LABEL: @test_vreinterpret_s64_p8(
11545	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
11546	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_p8 (poly8x8_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11547	int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
11548	return vreinterpret_s64_p8(a);
11549	}
11550
11551	// CHECK-LABEL: @test_vreinterpret_s64_p16(
11552	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
11553	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_s64_p16 (poly16x4_t -> int64x1_t); the CHECK lines above expect a bitcast of %a.
11554	int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
11555	return vreinterpret_s64_p16(a);
11556	}
11557
11558	// CHECK-LABEL: @test_vreinterpret_u8_s8(
11559	// CHECK: ret <8 x i8> %a
// Wraps vreinterpret_u8_s8 (int8x8_t -> uint8x8_t); the CHECK line above expects %a returned directly.
11560	uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
11561	return vreinterpret_u8_s8(a);
11562	}
11563
11564	// CHECK-LABEL: @test_vreinterpret_u8_s16(
11565	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11566	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_s16 (int16x4_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11567	uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
11568	return vreinterpret_u8_s16(a);
11569	}
11570
11571	// CHECK-LABEL: @test_vreinterpret_u8_s32(
11572	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11573	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_s32 (int32x2_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11574	uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
11575	return vreinterpret_u8_s32(a);
11576	}
11577
11578	// CHECK-LABEL: @test_vreinterpret_u8_s64(
11579	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11580	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_s64 (int64x1_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11581	uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
11582	return vreinterpret_u8_s64(a);
11583	}
11584
11585	// CHECK-LABEL: @test_vreinterpret_u8_u16(
11586	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11587	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_u16 (uint16x4_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11588	uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
11589	return vreinterpret_u8_u16(a);
11590	}
11591
11592	// CHECK-LABEL: @test_vreinterpret_u8_u32(
11593	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11594	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_u32 (uint32x2_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11595	uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
11596	return vreinterpret_u8_u32(a);
11597	}
11598
11599	// CHECK-LABEL: @test_vreinterpret_u8_u64(
11600	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11601	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_u64 (uint64x1_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11602	uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
11603	return vreinterpret_u8_u64(a);
11604	}
11605
11606	// CHECK-LABEL: @test_vreinterpret_u8_f16(
11607	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
11608	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_f16 (float16x4_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11609	uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
11610	return vreinterpret_u8_f16(a);
11611	}
11612
11613	// CHECK-LABEL: @test_vreinterpret_u8_f32(
11614	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
11615	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_f32 (float32x2_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11616	uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
11617	return vreinterpret_u8_f32(a);
11618	}
11619
11620	// CHECK-LABEL: @test_vreinterpret_u8_p8(
11621	// CHECK: ret <8 x i8> %a
// Wraps vreinterpret_u8_p8 (poly8x8_t -> uint8x8_t); the CHECK line above expects %a returned directly.
11622	uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
11623	return vreinterpret_u8_p8(a);
11624	}
11625
11626	// CHECK-LABEL: @test_vreinterpret_u8_p16(
11627	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11628	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_u8_p16 (poly16x4_t -> uint8x8_t); the CHECK lines above expect a bitcast of %a.
11629	uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
11630	return vreinterpret_u8_p16(a);
11631	}
11632
11633	// CHECK-LABEL: @test_vreinterpret_u16_s8(
11634	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
11635	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_s8 (int8x8_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11636	uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
11637	return vreinterpret_u16_s8(a);
11638	}
11639
11640	// CHECK-LABEL: @test_vreinterpret_u16_s16(
11641	// CHECK: ret <4 x i16> %a
// Wraps vreinterpret_u16_s16 (int16x4_t -> uint16x4_t); the CHECK line above expects %a returned directly.
11642	uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
11643	return vreinterpret_u16_s16(a);
11644	}
11645
11646	// CHECK-LABEL: @test_vreinterpret_u16_s32(
11647	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
11648	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_s32 (int32x2_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11649	uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
11650	return vreinterpret_u16_s32(a);
11651	}
11652
11653	// CHECK-LABEL: @test_vreinterpret_u16_s64(
11654	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
11655	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_s64 (int64x1_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11656	uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
11657	return vreinterpret_u16_s64(a);
11658	}
11659
11660	// CHECK-LABEL: @test_vreinterpret_u16_u8(
11661	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
11662	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_u8 (uint8x8_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11663	uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
11664	return vreinterpret_u16_u8(a);
11665	}
11666
11667	// CHECK-LABEL: @test_vreinterpret_u16_u32(
11668	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
11669	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_u32 (uint32x2_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11670	uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
11671	return vreinterpret_u16_u32(a);
11672	}
11673
11674	// CHECK-LABEL: @test_vreinterpret_u16_u64(
11675	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
11676	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_u64 (uint64x1_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11677	uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
11678	return vreinterpret_u16_u64(a);
11679	}
11680
11681	// CHECK-LABEL: @test_vreinterpret_u16_f16(
11682	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
11683	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_f16 (float16x4_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11684	uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
11685	return vreinterpret_u16_f16(a);
11686	}
11687
11688	// CHECK-LABEL: @test_vreinterpret_u16_f32(
11689	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
11690	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_f32 (float32x2_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11691	uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
11692	return vreinterpret_u16_f32(a);
11693	}
11694
11695	// CHECK-LABEL: @test_vreinterpret_u16_p8(
11696	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
11697	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_u16_p8 (poly8x8_t -> uint16x4_t); the CHECK lines above expect a bitcast of %a.
11698	uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
11699	return vreinterpret_u16_p8(a);
11700	}
11701
11702	// CHECK-LABEL: @test_vreinterpret_u16_p16(
11703	// CHECK: ret <4 x i16> %a
// Wraps vreinterpret_u16_p16 (poly16x4_t -> uint16x4_t); the CHECK line above expects %a returned directly.
11704	uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
11705	return vreinterpret_u16_p16(a);
11706	}
11707
11708	// CHECK-LABEL: @test_vreinterpret_u32_s8(
11709	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
11710	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_s8 (int8x8_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11711	uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
11712	return vreinterpret_u32_s8(a);
11713	}
11714
11715	// CHECK-LABEL: @test_vreinterpret_u32_s16(
11716	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
11717	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_s16 (int16x4_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11718	uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
11719	return vreinterpret_u32_s16(a);
11720	}
11721
11722	// CHECK-LABEL: @test_vreinterpret_u32_s32(
11723	// CHECK: ret <2 x i32> %a
// Wraps vreinterpret_u32_s32 (int32x2_t -> uint32x2_t); the CHECK line above expects %a returned directly.
11724	uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
11725	return vreinterpret_u32_s32(a);
11726	}
11727
11728	// CHECK-LABEL: @test_vreinterpret_u32_s64(
11729	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
11730	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_s64 (int64x1_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11731	uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
11732	return vreinterpret_u32_s64(a);
11733	}
11734
11735	// CHECK-LABEL: @test_vreinterpret_u32_u8(
11736	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
11737	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_u8 (uint8x8_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11738	uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
11739	return vreinterpret_u32_u8(a);
11740	}
11741
11742	// CHECK-LABEL: @test_vreinterpret_u32_u16(
11743	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
11744	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_u16 (uint16x4_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11745	uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
11746	return vreinterpret_u32_u16(a);
11747	}
11748
11749	// CHECK-LABEL: @test_vreinterpret_u32_u64(
11750	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
11751	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_u64 (uint64x1_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11752	uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
11753	return vreinterpret_u32_u64(a);
11754	}
11755
11756	// CHECK-LABEL: @test_vreinterpret_u32_f16(
11757	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
11758	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_f16 (float16x4_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11759	uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
11760	return vreinterpret_u32_f16(a);
11761	}
11762
11763	// CHECK-LABEL: @test_vreinterpret_u32_f32(
11764	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
11765	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_f32 (float32x2_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11766	uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
11767	return vreinterpret_u32_f32(a);
11768	}
11769
11770	// CHECK-LABEL: @test_vreinterpret_u32_p8(
11771	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
11772	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_p8 (poly8x8_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11773	uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
11774	return vreinterpret_u32_p8(a);
11775	}
11776
11777	// CHECK-LABEL: @test_vreinterpret_u32_p16(
11778	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
11779	// CHECK: ret <2 x i32> [[TMP0]]
// Wraps vreinterpret_u32_p16 (poly16x4_t -> uint32x2_t); the CHECK lines above expect a bitcast of %a.
11780	uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
11781	return vreinterpret_u32_p16(a);
11782	}
11783
11784	// CHECK-LABEL: @test_vreinterpret_u64_s8(
11785	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
11786	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_s8 (int8x8_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11787	uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
11788	return vreinterpret_u64_s8(a);
11789	}
11790
11791	// CHECK-LABEL: @test_vreinterpret_u64_s16(
11792	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
11793	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_s16 (int16x4_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11794	uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
11795	return vreinterpret_u64_s16(a);
11796	}
11797
11798	// CHECK-LABEL: @test_vreinterpret_u64_s32(
11799	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
11800	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_s32 (int32x2_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11801	uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
11802	return vreinterpret_u64_s32(a);
11803	}
11804
11805	// CHECK-LABEL: @test_vreinterpret_u64_s64(
11806	// CHECK: ret <1 x i64> %a
// Wraps vreinterpret_u64_s64 (int64x1_t -> uint64x1_t); the CHECK line above expects %a returned directly.
11807	uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
11808	return vreinterpret_u64_s64(a);
11809	}
11810
11811	// CHECK-LABEL: @test_vreinterpret_u64_u8(
11812	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
11813	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_u8 (uint8x8_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11814	uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
11815	return vreinterpret_u64_u8(a);
11816	}
11817
11818	// CHECK-LABEL: @test_vreinterpret_u64_u16(
11819	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
11820	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_u16 (uint16x4_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11821	uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
11822	return vreinterpret_u64_u16(a);
11823	}
11824
11825	// CHECK-LABEL: @test_vreinterpret_u64_u32(
11826	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
11827	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_u32 (uint32x2_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11828	uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
11829	return vreinterpret_u64_u32(a);
11830	}
11831
11832	// CHECK-LABEL: @test_vreinterpret_u64_f16(
11833	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
11834	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_f16 (float16x4_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11835	uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
11836	return vreinterpret_u64_f16(a);
11837	}
11838
11839	// CHECK-LABEL: @test_vreinterpret_u64_f32(
11840	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
11841	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_f32 (float32x2_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11842	uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
11843	return vreinterpret_u64_f32(a);
11844	}
11845
11846	// CHECK-LABEL: @test_vreinterpret_u64_p8(
11847	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
11848	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_p8 (poly8x8_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11849	uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
11850	return vreinterpret_u64_p8(a);
11851	}
11852
11853	// CHECK-LABEL: @test_vreinterpret_u64_p16(
11854	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
11855	// CHECK: ret <1 x i64> [[TMP0]]
// Wraps vreinterpret_u64_p16 (poly16x4_t -> uint64x1_t); the CHECK lines above expect a bitcast of %a.
11856	uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
11857	return vreinterpret_u64_p16(a);
11858	}
11859
11860	// CHECK-LABEL: @test_vreinterpret_f16_s8(
11861	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
11862	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_s8 (int8x8_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11863	float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
11864	return vreinterpret_f16_s8(a);
11865	}
11866
11867	// CHECK-LABEL: @test_vreinterpret_f16_s16(
11868	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
11869	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_s16 (int16x4_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11870	float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
11871	return vreinterpret_f16_s16(a);
11872	}
11873
11874	// CHECK-LABEL: @test_vreinterpret_f16_s32(
11875	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
11876	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_s32 (int32x2_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11877	float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
11878	return vreinterpret_f16_s32(a);
11879	}
11880
11881	// CHECK-LABEL: @test_vreinterpret_f16_s64(
11882	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
11883	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_s64 (int64x1_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11884	float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
11885	return vreinterpret_f16_s64(a);
11886	}
11887
11888	// CHECK-LABEL: @test_vreinterpret_f16_u8(
11889	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
11890	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_u8 (uint8x8_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11891	float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
11892	return vreinterpret_f16_u8(a);
11893	}
11894
11895	// CHECK-LABEL: @test_vreinterpret_f16_u16(
11896	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
11897	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_u16 (uint16x4_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11898	float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
11899	return vreinterpret_f16_u16(a);
11900	}
11901
11902	// CHECK-LABEL: @test_vreinterpret_f16_u32(
11903	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
11904	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_u32 (uint32x2_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11905	float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
11906	return vreinterpret_f16_u32(a);
11907	}
11908
11909	// CHECK-LABEL: @test_vreinterpret_f16_u64(
11910	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
11911	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_u64 (uint64x1_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11912	float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
11913	return vreinterpret_f16_u64(a);
11914	}
11915
11916	// CHECK-LABEL: @test_vreinterpret_f16_f32(
11917	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
11918	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_f32 (float32x2_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11919	float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
11920	return vreinterpret_f16_f32(a);
11921	}
11922
11923	// CHECK-LABEL: @test_vreinterpret_f16_p8(
11924	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
11925	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_p8 (poly8x8_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11926	float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
11927	return vreinterpret_f16_p8(a);
11928	}
11929
11930	// CHECK-LABEL: @test_vreinterpret_f16_p16(
11931	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
11932	// CHECK: ret <4 x half> [[TMP0]]
// Wraps vreinterpret_f16_p16 (poly16x4_t -> float16x4_t); the CHECK lines above expect a bitcast of %a.
11933	float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
11934	return vreinterpret_f16_p16(a);
11935	}
11936
11937	// CHECK-LABEL: @test_vreinterpret_f32_s8(
11938	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
11939	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_s8 (int8x8_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11940	float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
11941	return vreinterpret_f32_s8(a);
11942	}
11943
11944	// CHECK-LABEL: @test_vreinterpret_f32_s16(
11945	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
11946	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_s16 (int16x4_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11947	float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
11948	return vreinterpret_f32_s16(a);
11949	}
11950
11951	// CHECK-LABEL: @test_vreinterpret_f32_s32(
11952	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
11953	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_s32 (int32x2_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11954	float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
11955	return vreinterpret_f32_s32(a);
11956	}
11957
11958	// CHECK-LABEL: @test_vreinterpret_f32_s64(
11959	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
11960	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_s64 (int64x1_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11961	float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
11962	return vreinterpret_f32_s64(a);
11963	}
11964
11965	// CHECK-LABEL: @test_vreinterpret_f32_u8(
11966	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
11967	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_u8 (uint8x8_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11968	float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
11969	return vreinterpret_f32_u8(a);
11970	}
11971
11972	// CHECK-LABEL: @test_vreinterpret_f32_u16(
11973	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
11974	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_u16 (uint16x4_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11975	float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
11976	return vreinterpret_f32_u16(a);
11977	}
11978
11979	// CHECK-LABEL: @test_vreinterpret_f32_u32(
11980	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
11981	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_u32 (uint32x2_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11982	float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
11983	return vreinterpret_f32_u32(a);
11984	}
11985
11986	// CHECK-LABEL: @test_vreinterpret_f32_u64(
11987	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
11988	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_u64 (uint64x1_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11989	float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
11990	return vreinterpret_f32_u64(a);
11991	}
11992
11993	// CHECK-LABEL: @test_vreinterpret_f32_f16(
11994	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
11995	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_f16 (float16x4_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
11996	float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
11997	return vreinterpret_f32_f16(a);
11998	}
11999
12000	// CHECK-LABEL: @test_vreinterpret_f32_p8(
12001	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
12002	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_p8 (poly8x8_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
12003	float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
12004	return vreinterpret_f32_p8(a);
12005	}
12006
12007	// CHECK-LABEL: @test_vreinterpret_f32_p16(
12008	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
12009	// CHECK: ret <2 x float> [[TMP0]]
// Wraps vreinterpret_f32_p16 (poly16x4_t -> float32x2_t); the CHECK lines above expect a bitcast of %a.
12010	float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
12011	return vreinterpret_f32_p16(a);
12012	}
12013
12014	// CHECK-LABEL: @test_vreinterpret_p8_s8(
12015	// CHECK: ret <8 x i8> %a
// Wraps vreinterpret_p8_s8 (int8x8_t -> poly8x8_t); the CHECK line above expects %a returned directly.
12016	poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
12017	return vreinterpret_p8_s8(a);
12018	}
12019
12020	// CHECK-LABEL: @test_vreinterpret_p8_s16(
12021	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12022	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_s16 (int16x4_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12023	poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
12024	return vreinterpret_p8_s16(a);
12025	}
12026
12027	// CHECK-LABEL: @test_vreinterpret_p8_s32(
12028	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12029	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_s32 (int32x2_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12030	poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
12031	return vreinterpret_p8_s32(a);
12032	}
12033
12034	// CHECK-LABEL: @test_vreinterpret_p8_s64(
12035	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12036	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_s64 (int64x1_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12037	poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
12038	return vreinterpret_p8_s64(a);
12039	}
12040
12041	// CHECK-LABEL: @test_vreinterpret_p8_u8(
12042	// CHECK: ret <8 x i8> %a
// Wraps vreinterpret_p8_u8 (uint8x8_t -> poly8x8_t); the CHECK line above expects %a returned directly.
12043	poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
12044	return vreinterpret_p8_u8(a);
12045	}
12046
12047	// CHECK-LABEL: @test_vreinterpret_p8_u16(
12048	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12049	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_u16 (uint16x4_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12050	poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
12051	return vreinterpret_p8_u16(a);
12052	}
12053
12054	// CHECK-LABEL: @test_vreinterpret_p8_u32(
12055	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12056	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_u32 (uint32x2_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12057	poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
12058	return vreinterpret_p8_u32(a);
12059	}
12060
12061	// CHECK-LABEL: @test_vreinterpret_p8_u64(
12062	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12063	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_u64 (uint64x1_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12064	poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
12065	return vreinterpret_p8_u64(a);
12066	}
12067
12068	// CHECK-LABEL: @test_vreinterpret_p8_f16(
12069	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
12070	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_f16 (float16x4_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12071	poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
12072	return vreinterpret_p8_f16(a);
12073	}
12074
12075	// CHECK-LABEL: @test_vreinterpret_p8_f32(
12076	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
12077	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_f32 (float32x2_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12078	poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
12079	return vreinterpret_p8_f32(a);
12080	}
12081
12082	// CHECK-LABEL: @test_vreinterpret_p8_p16(
12083	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12084	// CHECK: ret <8 x i8> [[TMP0]]
// Wraps vreinterpret_p8_p16 (poly16x4_t -> poly8x8_t); the CHECK lines above expect a bitcast of %a.
12085	poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
12086	return vreinterpret_p8_p16(a);
12087	}
12088
12089	// CHECK-LABEL: @test_vreinterpret_p16_s8(
12090	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
12091	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_s8 (int8x8_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12092	poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
12093	return vreinterpret_p16_s8(a);
12094	}
12095
12096	// CHECK-LABEL: @test_vreinterpret_p16_s16(
12097	// CHECK: ret <4 x i16> %a
// Wraps vreinterpret_p16_s16 (int16x4_t -> poly16x4_t); the CHECK line above expects %a returned directly.
12098	poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
12099	return vreinterpret_p16_s16(a);
12100	}
12101
12102	// CHECK-LABEL: @test_vreinterpret_p16_s32(
12103	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
12104	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_s32 (int32x2_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12105	poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
12106	return vreinterpret_p16_s32(a);
12107	}
12108
12109	// CHECK-LABEL: @test_vreinterpret_p16_s64(
12110	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
12111	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_s64 (int64x1_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12112	poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
12113	return vreinterpret_p16_s64(a);
12114	}
12115
12116	// CHECK-LABEL: @test_vreinterpret_p16_u8(
12117	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
12118	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_u8 (uint8x8_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12119	poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
12120	return vreinterpret_p16_u8(a);
12121	}
12122
12123	// CHECK-LABEL: @test_vreinterpret_p16_u16(
12124	// CHECK: ret <4 x i16> %a
// Wraps vreinterpret_p16_u16 (uint16x4_t -> poly16x4_t); the CHECK line above expects %a returned directly.
12125	poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
12126	return vreinterpret_p16_u16(a);
12127	}
12128
12129	// CHECK-LABEL: @test_vreinterpret_p16_u32(
12130	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
12131	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_u32 (uint32x2_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12132	poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
12133	return vreinterpret_p16_u32(a);
12134	}
12135
12136	// CHECK-LABEL: @test_vreinterpret_p16_u64(
12137	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
12138	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_u64 (uint64x1_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12139	poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
12140	return vreinterpret_p16_u64(a);
12141	}
12142
12143	// CHECK-LABEL: @test_vreinterpret_p16_f16(
12144	// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
12145	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_f16 (float16x4_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12146	poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
12147	return vreinterpret_p16_f16(a);
12148	}
12149
12150	// CHECK-LABEL: @test_vreinterpret_p16_f32(
12151	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
12152	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_f32 (float32x2_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12153	poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
12154	return vreinterpret_p16_f32(a);
12155	}
12156
12157	// CHECK-LABEL: @test_vreinterpret_p16_p8(
12158	// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
12159	// CHECK: ret <4 x i16> [[TMP0]]
// Wraps vreinterpret_p16_p8 (poly8x8_t -> poly16x4_t); the CHECK lines above expect a bitcast of %a.
12160	poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
12161	return vreinterpret_p16_p8(a);
12162	}
12163
12164	// CHECK-LABEL: @test_vreinterpretq_s8_s16(
12165	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12166	// CHECK: ret <16 x i8> [[TMP0]]
// Wraps vreinterpretq_s8_s16 (int16x8_t -> int8x16_t); the CHECK lines above expect a bitcast of %a.
12167	int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
12168	return vreinterpretq_s8_s16(a);
12169	}
12170
12171	// CHECK-LABEL: @test_vreinterpretq_s8_s32(
12172	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12173	// CHECK: ret <16 x i8> [[TMP0]]
// Wraps vreinterpretq_s8_s32 (int32x4_t -> int8x16_t); the CHECK lines above expect a bitcast of %a.
12174	int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
12175	return vreinterpretq_s8_s32(a);
12176	}
12177
12178	// CHECK-LABEL: @test_vreinterpretq_s8_s64(
12179	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12180	// CHECK: ret <16 x i8> [[TMP0]]
// Wraps vreinterpretq_s8_s64 (int64x2_t -> int8x16_t); the CHECK lines above expect a bitcast of %a.
12181	int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
12182	return vreinterpretq_s8_s64(a);
12183	}
12184
12185	// CHECK-LABEL: @test_vreinterpretq_s8_u8(
12186	// CHECK: ret <16 x i8> %a
// Wraps vreinterpretq_s8_u8 (uint8x16_t -> int8x16_t); the CHECK line above expects %a returned directly.
12187	int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
12188	return vreinterpretq_s8_u8(a);
12189	}
12190
// Expected IR for vreinterpretq_s8_u16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12191	// CHECK-LABEL: @test_vreinterpretq_s8_u16(
12192	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12193	// CHECK: ret <16 x i8> [[TMP0]]
12194	int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
12195	return vreinterpretq_s8_u16(a);
12196	}
12197
// Expected IR for vreinterpretq_s8_u32: %a is bitcast from <4 x i32> to <16 x i8> and the result is returned.
12198	// CHECK-LABEL: @test_vreinterpretq_s8_u32(
12199	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12200	// CHECK: ret <16 x i8> [[TMP0]]
12201	int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
12202	return vreinterpretq_s8_u32(a);
12203	}
12204
// Expected IR for vreinterpretq_s8_u64: %a is bitcast from <2 x i64> to <16 x i8> and the result is returned.
12205	// CHECK-LABEL: @test_vreinterpretq_s8_u64(
12206	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12207	// CHECK: ret <16 x i8> [[TMP0]]
12208	int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
12209	return vreinterpretq_s8_u64(a);
12210	}
12211
// Expected IR for vreinterpretq_s8_f16: %a is bitcast from <8 x half> to <16 x i8> and the result is returned.
12212	// CHECK-LABEL: @test_vreinterpretq_s8_f16(
12213	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
12214	// CHECK: ret <16 x i8> [[TMP0]]
12215	int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
12216	return vreinterpretq_s8_f16(a);
12217	}
12218
// Expected IR for vreinterpretq_s8_f32: %a is bitcast from <4 x float> to <16 x i8> and the result is returned.
12219	// CHECK-LABEL: @test_vreinterpretq_s8_f32(
12220	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
12221	// CHECK: ret <16 x i8> [[TMP0]]
12222	int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
12223	return vreinterpretq_s8_f32(a);
12224	}
12225
// Expected IR for vreinterpretq_s8_p8: %a is returned directly as <16 x i8>.
12226	// CHECK-LABEL: @test_vreinterpretq_s8_p8(
12227	// CHECK: ret <16 x i8> %a
12228	int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
12229	return vreinterpretq_s8_p8(a);
12230	}
12231
// Expected IR for vreinterpretq_s8_p16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12232	// CHECK-LABEL: @test_vreinterpretq_s8_p16(
12233	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12234	// CHECK: ret <16 x i8> [[TMP0]]
12235	int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
12236	return vreinterpretq_s8_p16(a);
12237	}
12238
// Expected IR for vreinterpretq_s16_s8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12239	// CHECK-LABEL: @test_vreinterpretq_s16_s8(
12240	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12241	// CHECK: ret <8 x i16> [[TMP0]]
12242	int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
12243	return vreinterpretq_s16_s8(a);
12244	}
12245
// Expected IR for vreinterpretq_s16_s32: %a is bitcast from <4 x i32> to <8 x i16> and the result is returned.
12246	// CHECK-LABEL: @test_vreinterpretq_s16_s32(
12247	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12248	// CHECK: ret <8 x i16> [[TMP0]]
12249	int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
12250	return vreinterpretq_s16_s32(a);
12251	}
12252
// Expected IR for vreinterpretq_s16_s64: %a is bitcast from <2 x i64> to <8 x i16> and the result is returned.
12253	// CHECK-LABEL: @test_vreinterpretq_s16_s64(
12254	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12255	// CHECK: ret <8 x i16> [[TMP0]]
12256	int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
12257	return vreinterpretq_s16_s64(a);
12258	}
12259
// Expected IR for vreinterpretq_s16_u8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12260	// CHECK-LABEL: @test_vreinterpretq_s16_u8(
12261	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12262	// CHECK: ret <8 x i16> [[TMP0]]
12263	int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
12264	return vreinterpretq_s16_u8(a);
12265	}
12266
// Expected IR for vreinterpretq_s16_u16: %a is returned directly as <8 x i16>.
12267	// CHECK-LABEL: @test_vreinterpretq_s16_u16(
12268	// CHECK: ret <8 x i16> %a
12269	int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
12270	return vreinterpretq_s16_u16(a);
12271	}
12272
// Expected IR for vreinterpretq_s16_u32: %a is bitcast from <4 x i32> to <8 x i16> and the result is returned.
12273	// CHECK-LABEL: @test_vreinterpretq_s16_u32(
12274	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12275	// CHECK: ret <8 x i16> [[TMP0]]
12276	int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
12277	return vreinterpretq_s16_u32(a);
12278	}
12279
// Expected IR for vreinterpretq_s16_u64: %a is bitcast from <2 x i64> to <8 x i16> and the result is returned.
12280	// CHECK-LABEL: @test_vreinterpretq_s16_u64(
12281	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12282	// CHECK: ret <8 x i16> [[TMP0]]
12283	int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
12284	return vreinterpretq_s16_u64(a);
12285	}
12286
// Expected IR for vreinterpretq_s16_f16: %a is bitcast from <8 x half> to <8 x i16> and the result is returned.
12287	// CHECK-LABEL: @test_vreinterpretq_s16_f16(
12288	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
12289	// CHECK: ret <8 x i16> [[TMP0]]
12290	int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
12291	return vreinterpretq_s16_f16(a);
12292	}
12293
// Expected IR for vreinterpretq_s16_f32: %a is bitcast from <4 x float> to <8 x i16> and the result is returned.
12294	// CHECK-LABEL: @test_vreinterpretq_s16_f32(
12295	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
12296	// CHECK: ret <8 x i16> [[TMP0]]
12297	int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
12298	return vreinterpretq_s16_f32(a);
12299	}
12300
// Expected IR for vreinterpretq_s16_p8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12301	// CHECK-LABEL: @test_vreinterpretq_s16_p8(
12302	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12303	// CHECK: ret <8 x i16> [[TMP0]]
12304	int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
12305	return vreinterpretq_s16_p8(a);
12306	}
12307
// Expected IR for vreinterpretq_s16_p16: %a is returned directly as <8 x i16>.
12308	// CHECK-LABEL: @test_vreinterpretq_s16_p16(
12309	// CHECK: ret <8 x i16> %a
12310	int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
12311	return vreinterpretq_s16_p16(a);
12312	}
12313
// Expected IR for vreinterpretq_s32_s8: %a is bitcast from <16 x i8> to <4 x i32> and the result is returned.
12314	// CHECK-LABEL: @test_vreinterpretq_s32_s8(
12315	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12316	// CHECK: ret <4 x i32> [[TMP0]]
12317	int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
12318	return vreinterpretq_s32_s8(a);
12319	}
12320
// Expected IR for vreinterpretq_s32_s16: %a is bitcast from <8 x i16> to <4 x i32> and the result is returned.
12321	// CHECK-LABEL: @test_vreinterpretq_s32_s16(
12322	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12323	// CHECK: ret <4 x i32> [[TMP0]]
12324	int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
12325	return vreinterpretq_s32_s16(a);
12326	}
12327
// Expected IR for vreinterpretq_s32_s64: %a is bitcast from <2 x i64> to <4 x i32> and the result is returned.
12328	// CHECK-LABEL: @test_vreinterpretq_s32_s64(
12329	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12330	// CHECK: ret <4 x i32> [[TMP0]]
12331	int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
12332	return vreinterpretq_s32_s64(a);
12333	}
12334
// Expected IR for vreinterpretq_s32_u8: %a is bitcast from <16 x i8> to <4 x i32> and the result is returned.
12335	// CHECK-LABEL: @test_vreinterpretq_s32_u8(
12336	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12337	// CHECK: ret <4 x i32> [[TMP0]]
12338	int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
12339	return vreinterpretq_s32_u8(a);
12340	}
12341
// Expected IR for vreinterpretq_s32_u16: %a is bitcast from <8 x i16> to <4 x i32> and the result is returned.
12342	// CHECK-LABEL: @test_vreinterpretq_s32_u16(
12343	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12344	// CHECK: ret <4 x i32> [[TMP0]]
12345	int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
12346	return vreinterpretq_s32_u16(a);
12347	}
12348
// Expected IR for vreinterpretq_s32_u32: %a is returned directly as <4 x i32>.
12349	// CHECK-LABEL: @test_vreinterpretq_s32_u32(
12350	// CHECK: ret <4 x i32> %a
12351	int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
12352	return vreinterpretq_s32_u32(a);
12353	}
12354
// Expected IR for vreinterpretq_s32_u64: %a is bitcast from <2 x i64> to <4 x i32> and the result is returned.
12355	// CHECK-LABEL: @test_vreinterpretq_s32_u64(
12356	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12357	// CHECK: ret <4 x i32> [[TMP0]]
12358	int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
12359	return vreinterpretq_s32_u64(a);
12360	}
12361
// Expected IR for vreinterpretq_s32_f16: %a is bitcast from <8 x half> to <4 x i32> and the result is returned.
12362	// CHECK-LABEL: @test_vreinterpretq_s32_f16(
12363	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
12364	// CHECK: ret <4 x i32> [[TMP0]]
12365	int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
12366	return vreinterpretq_s32_f16(a);
12367	}
12368
// Expected IR for vreinterpretq_s32_f32: %a is bitcast from <4 x float> to <4 x i32> and the result is returned.
12369	// CHECK-LABEL: @test_vreinterpretq_s32_f32(
12370	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
12371	// CHECK: ret <4 x i32> [[TMP0]]
12372	int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
12373	return vreinterpretq_s32_f32(a);
12374	}
12375
// Expected IR for vreinterpretq_s32_p8: %a is bitcast from <16 x i8> to <4 x i32> and the result is returned.
12376	// CHECK-LABEL: @test_vreinterpretq_s32_p8(
12377	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12378	// CHECK: ret <4 x i32> [[TMP0]]
12379	int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
12380	return vreinterpretq_s32_p8(a);
12381	}
12382
// Expected IR for vreinterpretq_s32_p16: %a is bitcast from <8 x i16> to <4 x i32> and the result is returned.
12383	// CHECK-LABEL: @test_vreinterpretq_s32_p16(
12384	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12385	// CHECK: ret <4 x i32> [[TMP0]]
12386	int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
12387	return vreinterpretq_s32_p16(a);
12388	}
12389
// Expected IR for vreinterpretq_s64_s8: %a is bitcast from <16 x i8> to <2 x i64> and the result is returned.
12390	// CHECK-LABEL: @test_vreinterpretq_s64_s8(
12391	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12392	// CHECK: ret <2 x i64> [[TMP0]]
12393	int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
12394	return vreinterpretq_s64_s8(a);
12395	}
12396
// Expected IR for vreinterpretq_s64_s16: %a is bitcast from <8 x i16> to <2 x i64> and the result is returned.
12397	// CHECK-LABEL: @test_vreinterpretq_s64_s16(
12398	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12399	// CHECK: ret <2 x i64> [[TMP0]]
12400	int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
12401	return vreinterpretq_s64_s16(a);
12402	}
12403
// Expected IR for vreinterpretq_s64_s32: %a is bitcast from <4 x i32> to <2 x i64> and the result is returned.
12404	// CHECK-LABEL: @test_vreinterpretq_s64_s32(
12405	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12406	// CHECK: ret <2 x i64> [[TMP0]]
12407	int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
12408	return vreinterpretq_s64_s32(a);
12409	}
12410
// Expected IR for vreinterpretq_s64_u8: %a is bitcast from <16 x i8> to <2 x i64> and the result is returned.
12411	// CHECK-LABEL: @test_vreinterpretq_s64_u8(
12412	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12413	// CHECK: ret <2 x i64> [[TMP0]]
12414	int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
12415	return vreinterpretq_s64_u8(a);
12416	}
12417
// Expected IR for vreinterpretq_s64_u16: %a is bitcast from <8 x i16> to <2 x i64> and the result is returned.
12418	// CHECK-LABEL: @test_vreinterpretq_s64_u16(
12419	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12420	// CHECK: ret <2 x i64> [[TMP0]]
12421	int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
12422	return vreinterpretq_s64_u16(a);
12423	}
12424
// Expected IR for vreinterpretq_s64_u32: %a is bitcast from <4 x i32> to <2 x i64> and the result is returned.
12425	// CHECK-LABEL: @test_vreinterpretq_s64_u32(
12426	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12427	// CHECK: ret <2 x i64> [[TMP0]]
12428	int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
12429	return vreinterpretq_s64_u32(a);
12430	}
12431
// Expected IR for vreinterpretq_s64_u64: %a is returned directly as <2 x i64>.
12432	// CHECK-LABEL: @test_vreinterpretq_s64_u64(
12433	// CHECK: ret <2 x i64> %a
12434	int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
12435	return vreinterpretq_s64_u64(a);
12436	}
12437
// Expected IR for vreinterpretq_s64_f16: %a is bitcast from <8 x half> to <2 x i64> and the result is returned.
12438	// CHECK-LABEL: @test_vreinterpretq_s64_f16(
12439	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
12440	// CHECK: ret <2 x i64> [[TMP0]]
12441	int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
12442	return vreinterpretq_s64_f16(a);
12443	}
12444
// Expected IR for vreinterpretq_s64_f32: %a is bitcast from <4 x float> to <2 x i64> and the result is returned.
12445	// CHECK-LABEL: @test_vreinterpretq_s64_f32(
12446	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
12447	// CHECK: ret <2 x i64> [[TMP0]]
12448	int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
12449	return vreinterpretq_s64_f32(a);
12450	}
12451
// Expected IR for vreinterpretq_s64_p8: %a is bitcast from <16 x i8> to <2 x i64> and the result is returned.
12452	// CHECK-LABEL: @test_vreinterpretq_s64_p8(
12453	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12454	// CHECK: ret <2 x i64> [[TMP0]]
12455	int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
12456	return vreinterpretq_s64_p8(a);
12457	}
12458
// Expected IR for vreinterpretq_s64_p16: %a is bitcast from <8 x i16> to <2 x i64> and the result is returned.
12459	// CHECK-LABEL: @test_vreinterpretq_s64_p16(
12460	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12461	// CHECK: ret <2 x i64> [[TMP0]]
12462	int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
12463	return vreinterpretq_s64_p16(a);
12464	}
12465
// Expected IR for vreinterpretq_u8_s8: %a is returned directly as <16 x i8>.
12466	// CHECK-LABEL: @test_vreinterpretq_u8_s8(
12467	// CHECK: ret <16 x i8> %a
12468	uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
12469	return vreinterpretq_u8_s8(a);
12470	}
12471
// Expected IR for vreinterpretq_u8_s16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12472	// CHECK-LABEL: @test_vreinterpretq_u8_s16(
12473	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12474	// CHECK: ret <16 x i8> [[TMP0]]
12475	uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
12476	return vreinterpretq_u8_s16(a);
12477	}
12478
// Expected IR for vreinterpretq_u8_s32: %a is bitcast from <4 x i32> to <16 x i8> and the result is returned.
12479	// CHECK-LABEL: @test_vreinterpretq_u8_s32(
12480	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12481	// CHECK: ret <16 x i8> [[TMP0]]
12482	uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
12483	return vreinterpretq_u8_s32(a);
12484	}
12485
// Expected IR for vreinterpretq_u8_s64: %a is bitcast from <2 x i64> to <16 x i8> and the result is returned.
12486	// CHECK-LABEL: @test_vreinterpretq_u8_s64(
12487	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12488	// CHECK: ret <16 x i8> [[TMP0]]
12489	uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
12490	return vreinterpretq_u8_s64(a);
12491	}
12492
// Expected IR for vreinterpretq_u8_u16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12493	// CHECK-LABEL: @test_vreinterpretq_u8_u16(
12494	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12495	// CHECK: ret <16 x i8> [[TMP0]]
12496	uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
12497	return vreinterpretq_u8_u16(a);
12498	}
12499
// Expected IR for vreinterpretq_u8_u32: %a is bitcast from <4 x i32> to <16 x i8> and the result is returned.
12500	// CHECK-LABEL: @test_vreinterpretq_u8_u32(
12501	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12502	// CHECK: ret <16 x i8> [[TMP0]]
12503	uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
12504	return vreinterpretq_u8_u32(a);
12505	}
12506
// Expected IR for vreinterpretq_u8_u64: %a is bitcast from <2 x i64> to <16 x i8> and the result is returned.
12507	// CHECK-LABEL: @test_vreinterpretq_u8_u64(
12508	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12509	// CHECK: ret <16 x i8> [[TMP0]]
12510	uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
12511	return vreinterpretq_u8_u64(a);
12512	}
12513
// Expected IR for vreinterpretq_u8_f16: %a is bitcast from <8 x half> to <16 x i8> and the result is returned.
12514	// CHECK-LABEL: @test_vreinterpretq_u8_f16(
12515	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
12516	// CHECK: ret <16 x i8> [[TMP0]]
12517	uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
12518	return vreinterpretq_u8_f16(a);
12519	}
12520
// Expected IR for vreinterpretq_u8_f32: %a is bitcast from <4 x float> to <16 x i8> and the result is returned.
12521	// CHECK-LABEL: @test_vreinterpretq_u8_f32(
12522	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
12523	// CHECK: ret <16 x i8> [[TMP0]]
12524	uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
12525	return vreinterpretq_u8_f32(a);
12526	}
12527
// Expected IR for vreinterpretq_u8_p8: %a is returned directly as <16 x i8>.
12528	// CHECK-LABEL: @test_vreinterpretq_u8_p8(
12529	// CHECK: ret <16 x i8> %a
12530	uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
12531	return vreinterpretq_u8_p8(a);
12532	}
12533
// Expected IR for vreinterpretq_u8_p16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12534	// CHECK-LABEL: @test_vreinterpretq_u8_p16(
12535	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12536	// CHECK: ret <16 x i8> [[TMP0]]
12537	uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
12538	return vreinterpretq_u8_p16(a);
12539	}
12540
// Expected IR for vreinterpretq_u16_s8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12541	// CHECK-LABEL: @test_vreinterpretq_u16_s8(
12542	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12543	// CHECK: ret <8 x i16> [[TMP0]]
12544	uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
12545	return vreinterpretq_u16_s8(a);
12546	}
12547
// Expected IR for vreinterpretq_u16_s16: %a is returned directly as <8 x i16>.
12548	// CHECK-LABEL: @test_vreinterpretq_u16_s16(
12549	// CHECK: ret <8 x i16> %a
12550	uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
12551	return vreinterpretq_u16_s16(a);
12552	}
12553
// Expected IR for vreinterpretq_u16_s32: %a is bitcast from <4 x i32> to <8 x i16> and the result is returned.
12554	// CHECK-LABEL: @test_vreinterpretq_u16_s32(
12555	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12556	// CHECK: ret <8 x i16> [[TMP0]]
12557	uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
12558	return vreinterpretq_u16_s32(a);
12559	}
12560
// Expected IR for vreinterpretq_u16_s64: %a is bitcast from <2 x i64> to <8 x i16> and the result is returned.
12561	// CHECK-LABEL: @test_vreinterpretq_u16_s64(
12562	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12563	// CHECK: ret <8 x i16> [[TMP0]]
12564	uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
12565	return vreinterpretq_u16_s64(a);
12566	}
12567
// Expected IR for vreinterpretq_u16_u8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12568	// CHECK-LABEL: @test_vreinterpretq_u16_u8(
12569	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12570	// CHECK: ret <8 x i16> [[TMP0]]
12571	uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
12572	return vreinterpretq_u16_u8(a);
12573	}
12574
// Expected IR for vreinterpretq_u16_u32: %a is bitcast from <4 x i32> to <8 x i16> and the result is returned.
12575	// CHECK-LABEL: @test_vreinterpretq_u16_u32(
12576	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
12577	// CHECK: ret <8 x i16> [[TMP0]]
12578	uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
12579	return vreinterpretq_u16_u32(a);
12580	}
12581
// Expected IR for vreinterpretq_u16_u64: %a is bitcast from <2 x i64> to <8 x i16> and the result is returned.
12582	// CHECK-LABEL: @test_vreinterpretq_u16_u64(
12583	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
12584	// CHECK: ret <8 x i16> [[TMP0]]
12585	uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
12586	return vreinterpretq_u16_u64(a);
12587	}
12588
// Expected IR for vreinterpretq_u16_f16: %a is bitcast from <8 x half> to <8 x i16> and the result is returned.
12589	// CHECK-LABEL: @test_vreinterpretq_u16_f16(
12590	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
12591	// CHECK: ret <8 x i16> [[TMP0]]
12592	uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
12593	return vreinterpretq_u16_f16(a);
12594	}
12595
// Expected IR for vreinterpretq_u16_f32: %a is bitcast from <4 x float> to <8 x i16> and the result is returned.
12596	// CHECK-LABEL: @test_vreinterpretq_u16_f32(
12597	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
12598	// CHECK: ret <8 x i16> [[TMP0]]
12599	uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
12600	return vreinterpretq_u16_f32(a);
12601	}
12602
// Expected IR for vreinterpretq_u16_p8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12603	// CHECK-LABEL: @test_vreinterpretq_u16_p8(
12604	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12605	// CHECK: ret <8 x i16> [[TMP0]]
12606	uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
12607	return vreinterpretq_u16_p8(a);
12608	}
12609
// Expected IR for vreinterpretq_u16_p16: %a is returned directly as <8 x i16>.
12610	// CHECK-LABEL: @test_vreinterpretq_u16_p16(
12611	// CHECK: ret <8 x i16> %a
12612	uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
12613	return vreinterpretq_u16_p16(a);
12614	}
12615
// Expected IR for vreinterpretq_u32_s8: %a is bitcast from <16 x i8> to <4 x i32> and the result is returned.
12616	// CHECK-LABEL: @test_vreinterpretq_u32_s8(
12617	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12618	// CHECK: ret <4 x i32> [[TMP0]]
12619	uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
12620	return vreinterpretq_u32_s8(a);
12621	}
12622
// Expected IR for vreinterpretq_u32_s16: %a is bitcast from <8 x i16> to <4 x i32> and the result is returned.
12623	// CHECK-LABEL: @test_vreinterpretq_u32_s16(
12624	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12625	// CHECK: ret <4 x i32> [[TMP0]]
12626	uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
12627	return vreinterpretq_u32_s16(a);
12628	}
12629
// Expected IR for vreinterpretq_u32_s32: %a is returned directly as <4 x i32>.
12630	// CHECK-LABEL: @test_vreinterpretq_u32_s32(
12631	// CHECK: ret <4 x i32> %a
12632	uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
12633	return vreinterpretq_u32_s32(a);
12634	}
12635
// Expected IR for vreinterpretq_u32_s64: %a is bitcast from <2 x i64> to <4 x i32> and the result is returned.
12636	// CHECK-LABEL: @test_vreinterpretq_u32_s64(
12637	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12638	// CHECK: ret <4 x i32> [[TMP0]]
12639	uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
12640	return vreinterpretq_u32_s64(a);
12641	}
12642
// Expected IR for vreinterpretq_u32_u8: %a is bitcast from <16 x i8> to <4 x i32> and the result is returned.
12643	// CHECK-LABEL: @test_vreinterpretq_u32_u8(
12644	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12645	// CHECK: ret <4 x i32> [[TMP0]]
12646	uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
12647	return vreinterpretq_u32_u8(a);
12648	}
12649
// Expected IR for vreinterpretq_u32_u16: %a is bitcast from <8 x i16> to <4 x i32> and the result is returned.
12650	// CHECK-LABEL: @test_vreinterpretq_u32_u16(
12651	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12652	// CHECK: ret <4 x i32> [[TMP0]]
12653	uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
12654	return vreinterpretq_u32_u16(a);
12655	}
12656
// Expected IR for vreinterpretq_u32_u64: %a is bitcast from <2 x i64> to <4 x i32> and the result is returned.
12657	// CHECK-LABEL: @test_vreinterpretq_u32_u64(
12658	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
12659	// CHECK: ret <4 x i32> [[TMP0]]
12660	uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
12661	return vreinterpretq_u32_u64(a);
12662	}
12663
// Expected IR for vreinterpretq_u32_f16: %a is bitcast from <8 x half> to <4 x i32> and the result is returned.
12664	// CHECK-LABEL: @test_vreinterpretq_u32_f16(
12665	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
12666	// CHECK: ret <4 x i32> [[TMP0]]
12667	uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
12668	return vreinterpretq_u32_f16(a);
12669	}
12670
// Expected IR for vreinterpretq_u32_f32: %a is bitcast from <4 x float> to <4 x i32> and the result is returned.
12671	// CHECK-LABEL: @test_vreinterpretq_u32_f32(
12672	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
12673	// CHECK: ret <4 x i32> [[TMP0]]
12674	uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
12675	return vreinterpretq_u32_f32(a);
12676	}
12677
// Expected IR for vreinterpretq_u32_p8: %a is bitcast from <16 x i8> to <4 x i32> and the result is returned.
12678	// CHECK-LABEL: @test_vreinterpretq_u32_p8(
12679	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
12680	// CHECK: ret <4 x i32> [[TMP0]]
12681	uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
12682	return vreinterpretq_u32_p8(a);
12683	}
12684
// Expected IR for vreinterpretq_u32_p16: %a is bitcast from <8 x i16> to <4 x i32> and the result is returned.
12685	// CHECK-LABEL: @test_vreinterpretq_u32_p16(
12686	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
12687	// CHECK: ret <4 x i32> [[TMP0]]
12688	uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
12689	return vreinterpretq_u32_p16(a);
12690	}
12691
// Expected IR for vreinterpretq_u64_s8: %a is bitcast from <16 x i8> to <2 x i64> and the result is returned.
12692	// CHECK-LABEL: @test_vreinterpretq_u64_s8(
12693	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12694	// CHECK: ret <2 x i64> [[TMP0]]
12695	uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
12696	return vreinterpretq_u64_s8(a);
12697	}
12698
// Expected IR for vreinterpretq_u64_s16: %a is bitcast from <8 x i16> to <2 x i64> and the result is returned.
12699	// CHECK-LABEL: @test_vreinterpretq_u64_s16(
12700	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12701	// CHECK: ret <2 x i64> [[TMP0]]
12702	uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
12703	return vreinterpretq_u64_s16(a);
12704	}
12705
// Expected IR for vreinterpretq_u64_s32: %a is bitcast from <4 x i32> to <2 x i64> and the result is returned.
12706	// CHECK-LABEL: @test_vreinterpretq_u64_s32(
12707	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12708	// CHECK: ret <2 x i64> [[TMP0]]
12709	uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
12710	return vreinterpretq_u64_s32(a);
12711	}
12712
// Expected IR for vreinterpretq_u64_s64: %a is returned directly as <2 x i64>.
12713	// CHECK-LABEL: @test_vreinterpretq_u64_s64(
12714	// CHECK: ret <2 x i64> %a
12715	uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
12716	return vreinterpretq_u64_s64(a);
12717	}
12718
// Expected IR for vreinterpretq_u64_u8: %a is bitcast from <16 x i8> to <2 x i64> and the result is returned.
12719	// CHECK-LABEL: @test_vreinterpretq_u64_u8(
12720	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12721	// CHECK: ret <2 x i64> [[TMP0]]
12722	uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
12723	return vreinterpretq_u64_u8(a);
12724	}
12725
// Expected IR for vreinterpretq_u64_u16: %a is bitcast from <8 x i16> to <2 x i64> and the result is returned.
12726	// CHECK-LABEL: @test_vreinterpretq_u64_u16(
12727	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12728	// CHECK: ret <2 x i64> [[TMP0]]
12729	uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
12730	return vreinterpretq_u64_u16(a);
12731	}
12732
// Expected IR for vreinterpretq_u64_u32: %a is bitcast from <4 x i32> to <2 x i64> and the result is returned.
12733	// CHECK-LABEL: @test_vreinterpretq_u64_u32(
12734	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
12735	// CHECK: ret <2 x i64> [[TMP0]]
12736	uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
12737	return vreinterpretq_u64_u32(a);
12738	}
12739
// Expected IR for vreinterpretq_u64_f16: %a is bitcast from <8 x half> to <2 x i64> and the result is returned.
12740	// CHECK-LABEL: @test_vreinterpretq_u64_f16(
12741	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
12742	// CHECK: ret <2 x i64> [[TMP0]]
12743	uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
12744	return vreinterpretq_u64_f16(a);
12745	}
12746
// Expected IR for vreinterpretq_u64_f32: %a is bitcast from <4 x float> to <2 x i64> and the result is returned.
12747	// CHECK-LABEL: @test_vreinterpretq_u64_f32(
12748	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
12749	// CHECK: ret <2 x i64> [[TMP0]]
12750	uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
12751	return vreinterpretq_u64_f32(a);
12752	}
12753
// Expected IR for vreinterpretq_u64_p8: %a is bitcast from <16 x i8> to <2 x i64> and the result is returned.
12754	// CHECK-LABEL: @test_vreinterpretq_u64_p8(
12755	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
12756	// CHECK: ret <2 x i64> [[TMP0]]
12757	uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
12758	return vreinterpretq_u64_p8(a);
12759	}
12760
// Expected IR for vreinterpretq_u64_p16: %a is bitcast from <8 x i16> to <2 x i64> and the result is returned.
12761	// CHECK-LABEL: @test_vreinterpretq_u64_p16(
12762	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
12763	// CHECK: ret <2 x i64> [[TMP0]]
12764	uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
12765	return vreinterpretq_u64_p16(a);
12766	}
12767
// Expected IR for vreinterpretq_f16_s8: %a is bitcast from <16 x i8> to <8 x half> and the result is returned.
12768	// CHECK-LABEL: @test_vreinterpretq_f16_s8(
12769	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
12770	// CHECK: ret <8 x half> [[TMP0]]
12771	float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
12772	return vreinterpretq_f16_s8(a);
12773	}
12774
// Expected IR for vreinterpretq_f16_s16: %a is bitcast from <8 x i16> to <8 x half> and the result is returned.
12775	// CHECK-LABEL: @test_vreinterpretq_f16_s16(
12776	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
12777	// CHECK: ret <8 x half> [[TMP0]]
12778	float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
12779	return vreinterpretq_f16_s16(a);
12780	}
12781
// Expected IR for vreinterpretq_f16_s32: %a is bitcast from <4 x i32> to <8 x half> and the result is returned.
12782	// CHECK-LABEL: @test_vreinterpretq_f16_s32(
12783	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
12784	// CHECK: ret <8 x half> [[TMP0]]
12785	float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
12786	return vreinterpretq_f16_s32(a);
12787	}
12788
// Expected IR for vreinterpretq_f16_s64: %a is bitcast from <2 x i64> to <8 x half> and the result is returned.
12789	// CHECK-LABEL: @test_vreinterpretq_f16_s64(
12790	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
12791	// CHECK: ret <8 x half> [[TMP0]]
12792	float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
12793	return vreinterpretq_f16_s64(a);
12794	}
12795
// Expected IR for vreinterpretq_f16_u8: %a is bitcast from <16 x i8> to <8 x half> and the result is returned.
12796	// CHECK-LABEL: @test_vreinterpretq_f16_u8(
12797	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
12798	// CHECK: ret <8 x half> [[TMP0]]
12799	float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
12800	return vreinterpretq_f16_u8(a);
12801	}
12802
// Expected IR for vreinterpretq_f16_u16: %a is bitcast from <8 x i16> to <8 x half> and the result is returned.
12803	// CHECK-LABEL: @test_vreinterpretq_f16_u16(
12804	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
12805	// CHECK: ret <8 x half> [[TMP0]]
12806	float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
12807	return vreinterpretq_f16_u16(a);
12808	}
12809
// Expected IR for vreinterpretq_f16_u32: %a is bitcast from <4 x i32> to <8 x half> and the result is returned.
12810	// CHECK-LABEL: @test_vreinterpretq_f16_u32(
12811	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
12812	// CHECK: ret <8 x half> [[TMP0]]
12813	float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
12814	return vreinterpretq_f16_u32(a);
12815	}
12816
// Expected IR for vreinterpretq_f16_u64: %a is bitcast from <2 x i64> to <8 x half> and the result is returned.
12817	// CHECK-LABEL: @test_vreinterpretq_f16_u64(
12818	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
12819	// CHECK: ret <8 x half> [[TMP0]]
12820	float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
12821	return vreinterpretq_f16_u64(a);
12822	}
12823
// Expected IR for vreinterpretq_f16_f32: %a is bitcast from <4 x float> to <8 x half> and the result is returned.
12824	// CHECK-LABEL: @test_vreinterpretq_f16_f32(
12825	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
12826	// CHECK: ret <8 x half> [[TMP0]]
12827	float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
12828	return vreinterpretq_f16_f32(a);
12829	}
12830
// Expected IR for vreinterpretq_f16_p8: %a is bitcast from <16 x i8> to <8 x half> and the result is returned.
12831	// CHECK-LABEL: @test_vreinterpretq_f16_p8(
12832	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
12833	// CHECK: ret <8 x half> [[TMP0]]
12834	float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
12835	return vreinterpretq_f16_p8(a);
12836	}
12837
// Expected IR for vreinterpretq_f16_p16: %a is bitcast from <8 x i16> to <8 x half> and the result is returned.
12838	// CHECK-LABEL: @test_vreinterpretq_f16_p16(
12839	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
12840	// CHECK: ret <8 x half> [[TMP0]]
12841	float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
12842	return vreinterpretq_f16_p16(a);
12843	}
12844
// Expected IR for vreinterpretq_f32_s8: %a is bitcast from <16 x i8> to <4 x float> and the result is returned.
12845	// CHECK-LABEL: @test_vreinterpretq_f32_s8(
12846	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
12847	// CHECK: ret <4 x float> [[TMP0]]
12848	float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
12849	return vreinterpretq_f32_s8(a);
12850	}
12851
// Expected IR for vreinterpretq_f32_s16: %a is bitcast from <8 x i16> to <4 x float> and the result is returned.
12852	// CHECK-LABEL: @test_vreinterpretq_f32_s16(
12853	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
12854	// CHECK: ret <4 x float> [[TMP0]]
12855	float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
12856	return vreinterpretq_f32_s16(a);
12857	}
12858
// Expected IR for vreinterpretq_f32_s32: %a is bitcast from <4 x i32> to <4 x float> and the result is returned.
12859	// CHECK-LABEL: @test_vreinterpretq_f32_s32(
12860	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
12861	// CHECK: ret <4 x float> [[TMP0]]
12862	float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
12863	return vreinterpretq_f32_s32(a);
12864	}
12865
// Expected IR for vreinterpretq_f32_s64: %a is bitcast from <2 x i64> to <4 x float> and the result is returned.
12866	// CHECK-LABEL: @test_vreinterpretq_f32_s64(
12867	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
12868	// CHECK: ret <4 x float> [[TMP0]]
12869	float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
12870	return vreinterpretq_f32_s64(a);
12871	}
12872
// Expected IR for vreinterpretq_f32_u8: %a is bitcast from <16 x i8> to <4 x float> and the result is returned.
12873	// CHECK-LABEL: @test_vreinterpretq_f32_u8(
12874	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
12875	// CHECK: ret <4 x float> [[TMP0]]
12876	float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
12877	return vreinterpretq_f32_u8(a);
12878	}
12879
// Expected IR for vreinterpretq_f32_u16: %a is bitcast from <8 x i16> to <4 x float> and the result is returned.
12880	// CHECK-LABEL: @test_vreinterpretq_f32_u16(
12881	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
12882	// CHECK: ret <4 x float> [[TMP0]]
12883	float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
12884	return vreinterpretq_f32_u16(a);
12885	}
12886
// Expected IR for vreinterpretq_f32_u32: %a is bitcast from <4 x i32> to <4 x float> and the result is returned.
12887	// CHECK-LABEL: @test_vreinterpretq_f32_u32(
12888	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
12889	// CHECK: ret <4 x float> [[TMP0]]
12890	float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
12891	return vreinterpretq_f32_u32(a);
12892	}
12893
// Expected IR for vreinterpretq_f32_u64: %a is bitcast from <2 x i64> to <4 x float> and the result is returned.
12894	// CHECK-LABEL: @test_vreinterpretq_f32_u64(
12895	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
12896	// CHECK: ret <4 x float> [[TMP0]]
12897	float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
12898	return vreinterpretq_f32_u64(a);
12899	}
12900
// Expected IR for vreinterpretq_f32_f16: %a is bitcast from <8 x half> to <4 x float> and the result is returned.
12901	// CHECK-LABEL: @test_vreinterpretq_f32_f16(
12902	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
12903	// CHECK: ret <4 x float> [[TMP0]]
12904	float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
12905	return vreinterpretq_f32_f16(a);
12906	}
12907
// Expected IR for vreinterpretq_f32_p8: %a is bitcast from <16 x i8> to <4 x float> and the result is returned.
12908	// CHECK-LABEL: @test_vreinterpretq_f32_p8(
12909	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
12910	// CHECK: ret <4 x float> [[TMP0]]
12911	float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
12912	return vreinterpretq_f32_p8(a);
12913	}
12914
// Expected IR for vreinterpretq_f32_p16: %a is bitcast from <8 x i16> to <4 x float> and the result is returned.
12915	// CHECK-LABEL: @test_vreinterpretq_f32_p16(
12916	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
12917	// CHECK: ret <4 x float> [[TMP0]]
12918	float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
12919	return vreinterpretq_f32_p16(a);
12920	}
12921
// Expected IR for vreinterpretq_p8_s8: %a is returned directly as <16 x i8>.
12922	// CHECK-LABEL: @test_vreinterpretq_p8_s8(
12923	// CHECK: ret <16 x i8> %a
12924	poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
12925	return vreinterpretq_p8_s8(a);
12926	}
12927
// Expected IR for vreinterpretq_p8_s16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12928	// CHECK-LABEL: @test_vreinterpretq_p8_s16(
12929	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12930	// CHECK: ret <16 x i8> [[TMP0]]
12931	poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
12932	return vreinterpretq_p8_s16(a);
12933	}
12934
// Expected IR for vreinterpretq_p8_s32: %a is bitcast from <4 x i32> to <16 x i8> and the result is returned.
12935	// CHECK-LABEL: @test_vreinterpretq_p8_s32(
12936	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12937	// CHECK: ret <16 x i8> [[TMP0]]
12938	poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
12939	return vreinterpretq_p8_s32(a);
12940	}
12941
// Expected IR for vreinterpretq_p8_s64: %a is bitcast from <2 x i64> to <16 x i8> and the result is returned.
12942	// CHECK-LABEL: @test_vreinterpretq_p8_s64(
12943	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12944	// CHECK: ret <16 x i8> [[TMP0]]
12945	poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
12946	return vreinterpretq_p8_s64(a);
12947	}
12948
// Expected IR for vreinterpretq_p8_u8: %a is returned directly as <16 x i8>.
12949	// CHECK-LABEL: @test_vreinterpretq_p8_u8(
12950	// CHECK: ret <16 x i8> %a
12951	poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
12952	return vreinterpretq_p8_u8(a);
12953	}
12954
// Expected IR for vreinterpretq_p8_u16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12955	// CHECK-LABEL: @test_vreinterpretq_p8_u16(
12956	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12957	// CHECK: ret <16 x i8> [[TMP0]]
12958	poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
12959	return vreinterpretq_p8_u16(a);
12960	}
12961
// Expected IR for vreinterpretq_p8_u32: %a is bitcast from <4 x i32> to <16 x i8> and the result is returned.
12962	// CHECK-LABEL: @test_vreinterpretq_p8_u32(
12963	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12964	// CHECK: ret <16 x i8> [[TMP0]]
12965	poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
12966	return vreinterpretq_p8_u32(a);
12967	}
12968
// Expected IR for vreinterpretq_p8_u64: %a is bitcast from <2 x i64> to <16 x i8> and the result is returned.
12969	// CHECK-LABEL: @test_vreinterpretq_p8_u64(
12970	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12971	// CHECK: ret <16 x i8> [[TMP0]]
12972	poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
12973	return vreinterpretq_p8_u64(a);
12974	}
12975
// Expected IR for vreinterpretq_p8_f16: %a is bitcast from <8 x half> to <16 x i8> and the result is returned.
12976	// CHECK-LABEL: @test_vreinterpretq_p8_f16(
12977	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
12978	// CHECK: ret <16 x i8> [[TMP0]]
12979	poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
12980	return vreinterpretq_p8_f16(a);
12981	}
12982
// Expected IR for vreinterpretq_p8_f32: %a is bitcast from <4 x float> to <16 x i8> and the result is returned.
12983	// CHECK-LABEL: @test_vreinterpretq_p8_f32(
12984	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
12985	// CHECK: ret <16 x i8> [[TMP0]]
12986	poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
12987	return vreinterpretq_p8_f32(a);
12988	}
12989
// Expected IR for vreinterpretq_p8_p16: %a is bitcast from <8 x i16> to <16 x i8> and the result is returned.
12990	// CHECK-LABEL: @test_vreinterpretq_p8_p16(
12991	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12992	// CHECK: ret <16 x i8> [[TMP0]]
12993	poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
12994	return vreinterpretq_p8_p16(a);
12995	}
12996
// Expected IR for vreinterpretq_p16_s8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
12997	// CHECK-LABEL: @test_vreinterpretq_p16_s8(
12998	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
12999	// CHECK: ret <8 x i16> [[TMP0]]
13000	poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
13001	return vreinterpretq_p16_s8(a);
13002	}
13003
// Expected IR for vreinterpretq_p16_s16: %a is returned directly as <8 x i16>.
13004	// CHECK-LABEL: @test_vreinterpretq_p16_s16(
13005	// CHECK: ret <8 x i16> %a
13006	poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
13007	return vreinterpretq_p16_s16(a);
13008	}
13009
// Expected IR for vreinterpretq_p16_s32: %a is bitcast from <4 x i32> to <8 x i16> and the result is returned.
13010	// CHECK-LABEL: @test_vreinterpretq_p16_s32(
13011	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
13012	// CHECK: ret <8 x i16> [[TMP0]]
13013	poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
13014	return vreinterpretq_p16_s32(a);
13015	}
13016
// Expected IR for vreinterpretq_p16_s64: %a is bitcast from <2 x i64> to <8 x i16> and the result is returned.
13017	// CHECK-LABEL: @test_vreinterpretq_p16_s64(
13018	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
13019	// CHECK: ret <8 x i16> [[TMP0]]
13020	poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
13021	return vreinterpretq_p16_s64(a);
13022	}
13023
// Expected IR for vreinterpretq_p16_u8: %a is bitcast from <16 x i8> to <8 x i16> and the result is returned.
13024	// CHECK-LABEL: @test_vreinterpretq_p16_u8(
13025	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
13026	// CHECK: ret <8 x i16> [[TMP0]]
13027	poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
13028	return vreinterpretq_p16_u8(a);
13029	}
13030
// Expected IR for vreinterpretq_p16_u16: %a is returned directly as <8 x i16>.
13031	// CHECK-LABEL: @test_vreinterpretq_p16_u16(
13032	// CHECK: ret <8 x i16> %a
13033	poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
13034	return vreinterpretq_p16_u16(a);
13035	}
13036
// Expected IR for vreinterpretq_p16_u32: %a is bitcast from <4 x i32> to <8 x i16> and the result is returned.
13037	// CHECK-LABEL: @test_vreinterpretq_p16_u32(
13038	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
13039	// CHECK: ret <8 x i16> [[TMP0]]
13040	poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
13041	return vreinterpretq_p16_u32(a);
13042	}
13043
13044	// CHECK-LABEL: @test_vreinterpretq_p16_u64(
13045	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
13046	// CHECK: ret <8 x i16> [[TMP0]]
// Test wrapper: returns vreinterpretq_p16_u64(a). The directives above expect a single
// bitcast from <2 x i64> to <8 x i16>.
13047	poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
13048	return vreinterpretq_p16_u64(a);
13049	}
13050
13051	// CHECK-LABEL: @test_vreinterpretq_p16_f16(
13052	// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
13053	// CHECK: ret <8 x i16> [[TMP0]]
// Test wrapper: returns vreinterpretq_p16_f16(a). The directives above expect a single
// bitcast from <8 x half> to <8 x i16>.
13054	poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
13055	return vreinterpretq_p16_f16(a);
13056	}
13057
13058	// CHECK-LABEL: @test_vreinterpretq_p16_f32(
13059	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
13060	// CHECK: ret <8 x i16> [[TMP0]]
// Test wrapper: returns vreinterpretq_p16_f32(a). The directives above expect a single
// bitcast from <4 x float> to <8 x i16>.
13061	poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
13062	return vreinterpretq_p16_f32(a);
13063	}
13064
13065	// CHECK-LABEL: @test_vreinterpretq_p16_p8(
13066	// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
13067	// CHECK: ret <8 x i16> [[TMP0]]
// Test wrapper: returns vreinterpretq_p16_p8(a). The directives above expect a single
// bitcast from <16 x i8> to <8 x i16>.
13068	poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
13069	return vreinterpretq_p16_p8(a);
13070	}
13071
13072	// CHECK-LABEL: @test_vrev16_s8(
13073	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
13074	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev16_s8(a). The expected shufflevector mask above
// (1,0,3,2,5,4,7,6) swaps each adjacent pair of the eight i8 lanes.
13075	int8x8_t test_vrev16_s8(int8x8_t a) {
13076	return vrev16_s8(a);
13077	}
13078
13079	// CHECK-LABEL: @test_vrev16_u8(
13080	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
13081	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev16_u8(a). The expected shufflevector mask above
// (1,0,3,2,5,4,7,6) swaps each adjacent pair of the eight i8 lanes.
13082	uint8x8_t test_vrev16_u8(uint8x8_t a) {
13083	return vrev16_u8(a);
13084	}
13085
13086	// CHECK-LABEL: @test_vrev16_p8(
13087	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
13088	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev16_p8(a). The expected shufflevector mask above
// (1,0,3,2,5,4,7,6) swaps each adjacent pair of the eight i8 lanes.
13089	poly8x8_t test_vrev16_p8(poly8x8_t a) {
13090	return vrev16_p8(a);
13091	}
13092
13093	// CHECK-LABEL: @test_vrev16q_s8(
13094	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
13095	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev16q_s8(a). The expected shufflevector mask above swaps each
// adjacent pair across all sixteen i8 lanes (1,0,3,2,...,15,14).
13096	int8x16_t test_vrev16q_s8(int8x16_t a) {
13097	return vrev16q_s8(a);
13098	}
13099
13100	// CHECK-LABEL: @test_vrev16q_u8(
13101	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
13102	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev16q_u8(a). The expected shufflevector mask above swaps each
// adjacent pair across all sixteen i8 lanes (1,0,3,2,...,15,14).
13103	uint8x16_t test_vrev16q_u8(uint8x16_t a) {
13104	return vrev16q_u8(a);
13105	}
13106
13107	// CHECK-LABEL: @test_vrev16q_p8(
13108	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
13109	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev16q_p8(a). The expected shufflevector mask above swaps each
// adjacent pair across all sixteen i8 lanes (1,0,3,2,...,15,14).
13110	poly8x16_t test_vrev16q_p8(poly8x16_t a) {
13111	return vrev16q_p8(a);
13112	}
13113
13114	// CHECK-LABEL: @test_vrev32_s8(
13115	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
13116	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev32_s8(a). The expected shufflevector mask above
// (3,2,1,0,7,6,5,4) reverses the i8 lanes within each group of four.
13117	int8x8_t test_vrev32_s8(int8x8_t a) {
13118	return vrev32_s8(a);
13119	}
13120
13121	// CHECK-LABEL: @test_vrev32_s16(
13122	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
13123	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev32_s16(a). The expected shufflevector mask above (1,0,3,2)
// swaps each adjacent pair of the four i16 lanes.
13124	int16x4_t test_vrev32_s16(int16x4_t a) {
13125	return vrev32_s16(a);
13126	}
13127
13128	// CHECK-LABEL: @test_vrev32_u8(
13129	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
13130	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev32_u8(a). The expected shufflevector mask above
// (3,2,1,0,7,6,5,4) reverses the i8 lanes within each group of four.
13131	uint8x8_t test_vrev32_u8(uint8x8_t a) {
13132	return vrev32_u8(a);
13133	}
13134
13135	// CHECK-LABEL: @test_vrev32_u16(
13136	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
13137	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev32_u16(a). The expected shufflevector mask above (1,0,3,2)
// swaps each adjacent pair of the four i16 lanes.
13138	uint16x4_t test_vrev32_u16(uint16x4_t a) {
13139	return vrev32_u16(a);
13140	}
13141
13142	// CHECK-LABEL: @test_vrev32_p8(
13143	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
13144	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev32_p8(a). The expected shufflevector mask above
// (3,2,1,0,7,6,5,4) reverses the i8 lanes within each group of four.
13145	poly8x8_t test_vrev32_p8(poly8x8_t a) {
13146	return vrev32_p8(a);
13147	}
13148
13149	// CHECK-LABEL: @test_vrev32_p16(
13150	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
13151	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev32_p16(a). The expected shufflevector mask above (1,0,3,2)
// swaps each adjacent pair of the four i16 lanes.
13152	poly16x4_t test_vrev32_p16(poly16x4_t a) {
13153	return vrev32_p16(a);
13154	}
13155
13156	// CHECK-LABEL: @test_vrev32q_s8(
13157	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
13158	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev32q_s8(a). The expected shufflevector mask above reverses the
// i8 lanes within each of the four groups of four (3,2,1,0, 7,6,5,4, ...).
13159	int8x16_t test_vrev32q_s8(int8x16_t a) {
13160	return vrev32q_s8(a);
13161	}
13162
13163	// CHECK-LABEL: @test_vrev32q_s16(
13164	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
13165	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev32q_s16(a). The expected shufflevector mask above
// (1,0,3,2,5,4,7,6) swaps each adjacent pair of the eight i16 lanes.
13166	int16x8_t test_vrev32q_s16(int16x8_t a) {
13167	return vrev32q_s16(a);
13168	}
13169
13170	// CHECK-LABEL: @test_vrev32q_u8(
13171	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
13172	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev32q_u8(a). The expected shufflevector mask above reverses the
// i8 lanes within each of the four groups of four (3,2,1,0, 7,6,5,4, ...).
13173	uint8x16_t test_vrev32q_u8(uint8x16_t a) {
13174	return vrev32q_u8(a);
13175	}
13176
13177	// CHECK-LABEL: @test_vrev32q_u16(
13178	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
13179	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev32q_u16(a). The expected shufflevector mask above
// (1,0,3,2,5,4,7,6) swaps each adjacent pair of the eight i16 lanes.
13180	uint16x8_t test_vrev32q_u16(uint16x8_t a) {
13181	return vrev32q_u16(a);
13182	}
13183
13184	// CHECK-LABEL: @test_vrev32q_p8(
13185	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
13186	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev32q_p8(a). The expected shufflevector mask above reverses the
// i8 lanes within each of the four groups of four (3,2,1,0, 7,6,5,4, ...).
13187	poly8x16_t test_vrev32q_p8(poly8x16_t a) {
13188	return vrev32q_p8(a);
13189	}
13190
13191	// CHECK-LABEL: @test_vrev32q_p16(
13192	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
13193	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev32q_p16(a). The expected shufflevector mask above
// (1,0,3,2,5,4,7,6) swaps each adjacent pair of the eight i16 lanes.
13194	poly16x8_t test_vrev32q_p16(poly16x8_t a) {
13195	return vrev32q_p16(a);
13196	}
13197
13198	// CHECK-LABEL: @test_vrev64_s8(
13199	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
13200	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_s8(a). The expected shufflevector mask above
// (7,6,5,4,3,2,1,0) reverses all eight i8 lanes.
13201	int8x8_t test_vrev64_s8(int8x8_t a) {
13202	return vrev64_s8(a);
13203	}
13204
13205	// CHECK-LABEL: @test_vrev64_s16(
13206	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
13207	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_s16(a). The expected shufflevector mask above (3,2,1,0)
// reverses all four i16 lanes.
13208	int16x4_t test_vrev64_s16(int16x4_t a) {
13209	return vrev64_s16(a);
13210	}
13211
13212	// CHECK-LABEL: @test_vrev64_s32(
13213	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
13214	// CHECK: ret <2 x i32> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_s32(a). The expected shufflevector mask above (1,0)
// swaps the two i32 lanes.
13215	int32x2_t test_vrev64_s32(int32x2_t a) {
13216	return vrev64_s32(a);
13217	}
13218
13219	// CHECK-LABEL: @test_vrev64_u8(
13220	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
13221	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_u8(a). The expected shufflevector mask above
// (7,6,5,4,3,2,1,0) reverses all eight i8 lanes.
13222	uint8x8_t test_vrev64_u8(uint8x8_t a) {
13223	return vrev64_u8(a);
13224	}
13225
13226	// CHECK-LABEL: @test_vrev64_u16(
13227	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
13228	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_u16(a). The expected shufflevector mask above (3,2,1,0)
// reverses all four i16 lanes.
13229	uint16x4_t test_vrev64_u16(uint16x4_t a) {
13230	return vrev64_u16(a);
13231	}
13232
13233	// CHECK-LABEL: @test_vrev64_u32(
13234	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
13235	// CHECK: ret <2 x i32> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_u32(a). The expected shufflevector mask above (1,0)
// swaps the two i32 lanes.
13236	uint32x2_t test_vrev64_u32(uint32x2_t a) {
13237	return vrev64_u32(a);
13238	}
13239
13240	// CHECK-LABEL: @test_vrev64_p8(
13241	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
13242	// CHECK: ret <8 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_p8(a). The expected shufflevector mask above
// (7,6,5,4,3,2,1,0) reverses all eight i8 lanes.
13243	poly8x8_t test_vrev64_p8(poly8x8_t a) {
13244	return vrev64_p8(a);
13245	}
13246
13247	// CHECK-LABEL: @test_vrev64_p16(
13248	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
13249	// CHECK: ret <4 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_p16(a). The expected shufflevector mask above (3,2,1,0)
// reverses all four i16 lanes.
13250	poly16x4_t test_vrev64_p16(poly16x4_t a) {
13251	return vrev64_p16(a);
13252	}
13253
13254	// CHECK-LABEL: @test_vrev64_f32(
13255	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
13256	// CHECK: ret <2 x float> [[SHUFFLE_I]]
// Test wrapper: returns vrev64_f32(a). The expected shufflevector above operates on
// <2 x float> with mask (1,0), swapping the two lanes.
13257	float32x2_t test_vrev64_f32(float32x2_t a) {
13258	return vrev64_f32(a);
13259	}
13260
13261	// CHECK-LABEL: @test_vrev64q_s8(
13262	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
13263	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_s8(a). The expected shufflevector mask above reverses the
// i8 lanes within each group of eight (7..0, then 15..8).
13264	int8x16_t test_vrev64q_s8(int8x16_t a) {
13265	return vrev64q_s8(a);
13266	}
13267
13268	// CHECK-LABEL: @test_vrev64q_s16(
13269	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
13270	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_s16(a). The expected shufflevector mask above
// (3,2,1,0,7,6,5,4) reverses the i16 lanes within each group of four.
13271	int16x8_t test_vrev64q_s16(int16x8_t a) {
13272	return vrev64q_s16(a);
13273	}
13274
13275	// CHECK-LABEL: @test_vrev64q_s32(
13276	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
13277	// CHECK: ret <4 x i32> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_s32(a). The expected shufflevector mask above (1,0,3,2)
// swaps each adjacent pair of the four i32 lanes.
13278	int32x4_t test_vrev64q_s32(int32x4_t a) {
13279	return vrev64q_s32(a);
13280	}
13281
13282	// CHECK-LABEL: @test_vrev64q_u8(
13283	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
13284	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_u8(a). The expected shufflevector mask above reverses the
// i8 lanes within each group of eight (7..0, then 15..8).
13285	uint8x16_t test_vrev64q_u8(uint8x16_t a) {
13286	return vrev64q_u8(a);
13287	}
13288
13289	// CHECK-LABEL: @test_vrev64q_u16(
13290	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
13291	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_u16(a). The expected shufflevector mask above
// (3,2,1,0,7,6,5,4) reverses the i16 lanes within each group of four.
13292	uint16x8_t test_vrev64q_u16(uint16x8_t a) {
13293	return vrev64q_u16(a);
13294	}
13295
13296	// CHECK-LABEL: @test_vrev64q_u32(
13297	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
13298	// CHECK: ret <4 x i32> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_u32(a). The expected shufflevector mask above (1,0,3,2)
// swaps each adjacent pair of the four i32 lanes.
13299	uint32x4_t test_vrev64q_u32(uint32x4_t a) {
13300	return vrev64q_u32(a);
13301	}
13302
13303	// CHECK-LABEL: @test_vrev64q_p8(
13304	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
13305	// CHECK: ret <16 x i8> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_p8(a). The expected shufflevector mask above reverses the
// i8 lanes within each group of eight (7..0, then 15..8).
13306	poly8x16_t test_vrev64q_p8(poly8x16_t a) {
13307	return vrev64q_p8(a);
13308	}
13309
13310	// CHECK-LABEL: @test_vrev64q_p16(
13311	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
13312	// CHECK: ret <8 x i16> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_p16(a). The expected shufflevector mask above
// (3,2,1,0,7,6,5,4) reverses the i16 lanes within each group of four.
13313	poly16x8_t test_vrev64q_p16(poly16x8_t a) {
13314	return vrev64q_p16(a);
13315	}
13316
13317	// CHECK-LABEL: @test_vrev64q_f32(
13318	// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
13319	// CHECK: ret <4 x float> [[SHUFFLE_I]]
// Test wrapper: returns vrev64q_f32(a). The expected shufflevector above operates on
// <4 x float> with mask (1,0,3,2), swapping each adjacent pair of lanes.
13320	float32x4_t test_vrev64q_f32(float32x4_t a) {
13321	return vrev64q_f32(a);
13322	}
13323
13324	// CHECK-LABEL: @test_vrhadd_s8(
13325	// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
13326	// CHECK: ret <8 x i8> [[VRHADD_V_I]]
// Test wrapper: returns vrhadd_s8(a, b). The directives above expect one call to
// @llvm.arm.neon.vrhadds.v8i8 on %a and %b.
13327	int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
13328	return vrhadd_s8(a, b);
13329	}
13330
13331	// CHECK-LABEL: @test_vrhadd_s16(
13332	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13333	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13334	// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
13335	// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
13336	// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
// Test wrapper: returns vrhadd_s16(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhadds.v4i16 whose result is returned directly.
13337	int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
13338	return vrhadd_s16(a, b);
13339	}
13340
13341	// CHECK-LABEL: @test_vrhadd_s32(
13342	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13343	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13344	// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
13345	// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
13346	// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
// Test wrapper: returns vrhadd_s32(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhadds.v2i32 whose result is returned directly.
13347	int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
13348	return vrhadd_s32(a, b);
13349	}
13350
13351	// CHECK-LABEL: @test_vrhadd_u8(
13352	// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
13353	// CHECK: ret <8 x i8> [[VRHADD_V_I]]
// Test wrapper: returns vrhadd_u8(a, b). The directives above expect one call to the
// unsigned intrinsic @llvm.arm.neon.vrhaddu.v8i8 on %a and %b.
13354	uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
13355	return vrhadd_u8(a, b);
13356	}
13357
13358	// CHECK-LABEL: @test_vrhadd_u16(
13359	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13360	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13361	// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
13362	// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
13363	// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
// Test wrapper: returns vrhadd_u16(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhaddu.v4i16 whose result is returned directly.
13364	uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
13365	return vrhadd_u16(a, b);
13366	}
13367
13368	// CHECK-LABEL: @test_vrhadd_u32(
13369	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13370	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13371	// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
13372	// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
13373	// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
// Test wrapper: returns vrhadd_u32(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhaddu.v2i32 whose result is returned directly.
13374	uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
13375	return vrhadd_u32(a, b);
13376	}
13377
13378	// CHECK-LABEL: @test_vrhaddq_s8(
13379	// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
13380	// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
// Test wrapper: returns vrhaddq_s8(a, b). The directives above expect one call to
// @llvm.arm.neon.vrhadds.v16i8 on %a and %b.
13381	int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
13382	return vrhaddq_s8(a, b);
13383	}
13384
13385	// CHECK-LABEL: @test_vrhaddq_s16(
13386	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13387	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13388	// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
13389	// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
13390	// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
// Test wrapper: returns vrhaddq_s16(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhadds.v8i16 whose result is returned directly.
13391	int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
13392	return vrhaddq_s16(a, b);
13393	}
13394
13395	// CHECK-LABEL: @test_vrhaddq_s32(
13396	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13397	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13398	// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
13399	// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
13400	// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
// Test wrapper: returns vrhaddq_s32(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhadds.v4i32 whose result is returned directly.
13401	int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
13402	return vrhaddq_s32(a, b);
13403	}
13404
13405	// CHECK-LABEL: @test_vrhaddq_u8(
13406	// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
13407	// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
// Test wrapper: returns vrhaddq_u8(a, b). The directives above expect one call to the
// unsigned intrinsic @llvm.arm.neon.vrhaddu.v16i8 on %a and %b.
13408	uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
13409	return vrhaddq_u8(a, b);
13410	}
13411
13412	// CHECK-LABEL: @test_vrhaddq_u16(
13413	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13414	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13415	// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
13416	// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
13417	// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
// Test wrapper: returns vrhaddq_u16(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhaddu.v8i16 whose result is returned directly.
13418	uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
13419	return vrhaddq_u16(a, b);
13420	}
13421
13422	// CHECK-LABEL: @test_vrhaddq_u32(
13423	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13424	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13425	// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
13426	// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
13427	// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
// Test wrapper: returns vrhaddq_u32(a, b). The directives above expect a call to
// @llvm.arm.neon.vrhaddu.v4i32 whose result is returned directly.
13428	uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
13429	return vrhaddq_u32(a, b);
13430	}
13431
13432	// CHECK-LABEL: @test_vrshl_s8(
13433	// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
13434	// CHECK: ret <8 x i8> [[VRSHL_V_I]]
// Test wrapper: returns vrshl_s8(a, b). The directives above expect one call to
// @llvm.arm.neon.vrshifts.v8i8 on %a and %b.
13435	int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
13436	return vrshl_s8(a, b);
13437	}
13438
13439	// CHECK-LABEL: @test_vrshl_s16(
13440	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13441	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13442	// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
13443	// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
13444	// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
// Test wrapper: returns vrshl_s16(a, b). The directives above expect a call to
// @llvm.arm.neon.vrshifts.v4i16 whose result is returned directly.
13445	int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
13446	return vrshl_s16(a, b);
13447	}
13448
13449	// CHECK-LABEL: @test_vrshl_s32(
13450	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13451	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13452	// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
13453	// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
13454	// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
// Test wrapper: returns vrshl_s32(a, b). The directives above expect a call to
// @llvm.arm.neon.vrshifts.v2i32 whose result is returned directly.
13455	int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
13456	return vrshl_s32(a, b);
13457	}
13458
13459	// CHECK-LABEL: @test_vrshl_s64(
13460	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13461	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
13462	// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
13463	// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
13464	// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
// Test wrapper: returns vrshl_s64(a, b). The directives above expect a call to
// @llvm.arm.neon.vrshifts.v1i64 whose result is returned directly.
13465	int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
13466	return vrshl_s64(a, b);
13467	}
13468
13469	// CHECK-LABEL: @test_vrshl_u8(
13470	// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
13471	// CHECK: ret <8 x i8> [[VRSHL_V_I]]
// Test wrapper: returns vrshl_u8(a, b). Note that `b` is the signed int8x8_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v8i8.
13472	uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
13473	return vrshl_u8(a, b);
13474	}
13475
13476	// CHECK-LABEL: @test_vrshl_u16(
13477	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13478	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13479	// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
13480	// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
13481	// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
// Test wrapper: returns vrshl_u16(a, b). Note that `b` is the signed int16x4_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v4i16.
13482	uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
13483	return vrshl_u16(a, b);
13484	}
13485
13486	// CHECK-LABEL: @test_vrshl_u32(
13487	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13488	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13489	// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
13490	// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
13491	// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
// Test wrapper: returns vrshl_u32(a, b). Note that `b` is the signed int32x2_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v2i32.
13492	uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
13493	return vrshl_u32(a, b);
13494	}
13495
13496	// CHECK-LABEL: @test_vrshl_u64(
13497	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13498	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
13499	// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
13500	// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
13501	// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
// Test wrapper: returns vrshl_u64(a, b). Note that `b` is the signed int64x1_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v1i64.
13502	uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
13503	return vrshl_u64(a, b);
13504	}
13505
13506	// CHECK-LABEL: @test_vrshlq_s8(
13507	// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
13508	// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
// Test wrapper: returns vrshlq_s8(a, b). The directives above expect one call to
// @llvm.arm.neon.vrshifts.v16i8 on %a and %b.
13509	int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
13510	return vrshlq_s8(a, b);
13511	}
13512
13513	// CHECK-LABEL: @test_vrshlq_s16(
13514	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13515	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13516	// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
13517	// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
13518	// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
// Test wrapper: returns vrshlq_s16(a, b). The directives above expect a call to
// @llvm.arm.neon.vrshifts.v8i16 whose result is returned directly.
13519	int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
13520	return vrshlq_s16(a, b);
13521	}
13522
13523	// CHECK-LABEL: @test_vrshlq_s32(
13524	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13525	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13526	// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
13527	// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
13528	// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
// Test wrapper: returns vrshlq_s32(a, b). The directives above expect a call to
// @llvm.arm.neon.vrshifts.v4i32 whose result is returned directly.
13529	int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
13530	return vrshlq_s32(a, b);
13531	}
13532
13533	// CHECK-LABEL: @test_vrshlq_s64(
13534	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13535	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13536	// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
13537	// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
13538	// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
// Test wrapper: returns vrshlq_s64(a, b). The directives above expect a call to
// @llvm.arm.neon.vrshifts.v2i64 whose result is returned directly.
13539	int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
13540	return vrshlq_s64(a, b);
13541	}
13542
13543	// CHECK-LABEL: @test_vrshlq_u8(
13544	// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
13545	// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
// Test wrapper: returns vrshlq_u8(a, b). Note that `b` is the signed int8x16_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v16i8.
13546	uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
13547	return vrshlq_u8(a, b);
13548	}
13549
13550	// CHECK-LABEL: @test_vrshlq_u16(
13551	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13552	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13553	// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
13554	// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
13555	// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
// Test wrapper: returns vrshlq_u16(a, b). Note that `b` is the signed int16x8_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v8i16.
13556	uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
13557	return vrshlq_u16(a, b);
13558	}
13559
13560	// CHECK-LABEL: @test_vrshlq_u32(
13561	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13562	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13563	// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
13564	// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
13565	// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
// Test wrapper: returns vrshlq_u32(a, b). Note that `b` is the signed int32x4_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v4i32.
13566	uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
13567	return vrshlq_u32(a, b);
13568	}
13569
13570	// CHECK-LABEL: @test_vrshlq_u64(
13571	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13572	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13573	// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
13574	// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
13575	// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
// Test wrapper: returns vrshlq_u64(a, b). Note that `b` is the signed int64x2_t even though
// `a` is unsigned. The directives above expect a call to @llvm.arm.neon.vrshiftu.v2i64.
13576	uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
13577	return vrshlq_u64(a, b);
13578	}
13579
13580	// CHECK-LABEL: @test_vrshrn_n_s16(
13581	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13582	// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13583	// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13584	// CHECK: ret <8 x i8> [[VRSHRN_N1]]
// Test wrapper: returns vrshrn_n_s16(a, 1), narrowing int16x8_t to int8x8_t. In the expected
// IR above the literal 1 appears as an all -1 <8 x i16> operand to
// @llvm.arm.neon.vrshiftn.v8i8.
13585	int8x8_t test_vrshrn_n_s16(int16x8_t a) {
13586	return vrshrn_n_s16(a, 1);
13587	}
13588
13589	// CHECK-LABEL: @test_vrshrn_n_s32(
13590	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13591	// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13592	// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13593	// CHECK: ret <4 x i16> [[VRSHRN_N1]]
// Test wrapper: returns vrshrn_n_s32(a, 1), narrowing int32x4_t to int16x4_t. In the expected
// IR above the literal 1 appears as an all -1 <4 x i32> operand to
// @llvm.arm.neon.vrshiftn.v4i16.
13594	int16x4_t test_vrshrn_n_s32(int32x4_t a) {
13595	return vrshrn_n_s32(a, 1);
13596	}
13597
13598	// CHECK-LABEL: @test_vrshrn_n_s64(
13599	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13600	// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13601	// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
13602	// CHECK: ret <2 x i32> [[VRSHRN_N1]]
// Test wrapper: returns vrshrn_n_s64(a, 1), narrowing int64x2_t to int32x2_t. In the expected
// IR above the literal 1 appears as an all -1 <2 x i64> operand to
// @llvm.arm.neon.vrshiftn.v2i32.
13603	int32x2_t test_vrshrn_n_s64(int64x2_t a) {
13604	return vrshrn_n_s64(a, 1);
13605	}
13606
13607	// CHECK-LABEL: @test_vrshrn_n_u16(
13608	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13609	// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13610	// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13611	// CHECK: ret <8 x i8> [[VRSHRN_N1]]
// Test wrapper: returns vrshrn_n_u16(a, 1), narrowing uint16x8_t to uint8x8_t. In the expected
// IR above the literal 1 appears as an all -1 <8 x i16> operand to
// @llvm.arm.neon.vrshiftn.v8i8 (the intrinsic name carries no signed/unsigned suffix).
13612	uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
13613	return vrshrn_n_u16(a, 1);
13614	}
13615
13616	// CHECK-LABEL: @test_vrshrn_n_u32(
13617	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13618	// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13619	// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13620	// CHECK: ret <4 x i16> [[VRSHRN_N1]]
// Test wrapper: returns vrshrn_n_u32(a, 1), narrowing uint32x4_t to uint16x4_t. In the expected
// IR above the literal 1 appears as an all -1 <4 x i32> operand to
// @llvm.arm.neon.vrshiftn.v4i16 (the intrinsic name carries no signed/unsigned suffix).
13621	uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
13622	return vrshrn_n_u32(a, 1);
13623	}
13624
13625	// CHECK-LABEL: @test_vrshrn_n_u64(
13626	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13627	// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13628	// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
13629	// CHECK: ret <2 x i32> [[VRSHRN_N1]]
// Test wrapper: returns vrshrn_n_u64(a, 1), narrowing uint64x2_t to uint32x2_t. In the expected
// IR above the literal 1 appears as an all -1 <2 x i64> operand to
// @llvm.arm.neon.vrshiftn.v2i32 (the intrinsic name carries no signed/unsigned suffix).
13630	uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
13631	return vrshrn_n_u64(a, 1);
13632	}
13633
13634	// CHECK-LABEL: @test_vrshr_n_s8(
13635	// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13636	// CHECK: ret <8 x i8> [[VRSHR_N]]
// Test wrapper: returns vrshr_n_s8(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <8 x i8> operand to @llvm.arm.neon.vrshifts.v8i8.
13637	int8x8_t test_vrshr_n_s8(int8x8_t a) {
13638	return vrshr_n_s8(a, 1);
13639	}
13640
13641	// CHECK-LABEL: @test_vrshr_n_s16(
13642	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13643	// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13644	// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
13645	// CHECK: ret <4 x i16> [[VRSHR_N1]]
// Test wrapper: returns vrshr_n_s16(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <4 x i16> operand to @llvm.arm.neon.vrshifts.v4i16.
13646	int16x4_t test_vrshr_n_s16(int16x4_t a) {
13647	return vrshr_n_s16(a, 1);
13648	}
13649
13650	// CHECK-LABEL: @test_vrshr_n_s32(
13651	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13652	// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13653	// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
13654	// CHECK: ret <2 x i32> [[VRSHR_N1]]
// Test wrapper: returns vrshr_n_s32(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <2 x i32> operand to @llvm.arm.neon.vrshifts.v2i32.
13655	int32x2_t test_vrshr_n_s32(int32x2_t a) {
13656	return vrshr_n_s32(a, 1);
13657	}
13658
13659	// CHECK-LABEL: @test_vrshr_n_s64(
13660	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13661	// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13662	// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
13663	// CHECK: ret <1 x i64> [[VRSHR_N1]]
// Test wrapper: returns vrshr_n_s64(a, 1). In the expected IR above the literal 1 appears
// as a <1 x i64> <i64 -1> operand to @llvm.arm.neon.vrshifts.v1i64.
13664	int64x1_t test_vrshr_n_s64(int64x1_t a) {
13665	return vrshr_n_s64(a, 1);
13666	}
13667
13668	// CHECK-LABEL: @test_vrshr_n_u8(
13669	// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13670	// CHECK: ret <8 x i8> [[VRSHR_N]]
// Test wrapper: returns vrshr_n_u8(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <8 x i8> operand to @llvm.arm.neon.vrshiftu.v8i8.
13671	uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
13672	return vrshr_n_u8(a, 1);
13673	}
13674
13675	// CHECK-LABEL: @test_vrshr_n_u16(
13676	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13677	// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13678	// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
13679	// CHECK: ret <4 x i16> [[VRSHR_N1]]
// Test wrapper: returns vrshr_n_u16(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <4 x i16> operand to @llvm.arm.neon.vrshiftu.v4i16.
13680	uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
13681	return vrshr_n_u16(a, 1);
13682	}
13683
13684	// CHECK-LABEL: @test_vrshr_n_u32(
13685	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13686	// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13687	// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
13688	// CHECK: ret <2 x i32> [[VRSHR_N1]]
// Test wrapper: returns vrshr_n_u32(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <2 x i32> operand to @llvm.arm.neon.vrshiftu.v2i32.
13689	uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
13690	return vrshr_n_u32(a, 1);
13691	}
13692
13693	// CHECK-LABEL: @test_vrshr_n_u64(
13694	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13695	// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13696	// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
13697	// CHECK: ret <1 x i64> [[VRSHR_N1]]
// Test wrapper: returns vrshr_n_u64(a, 1). In the expected IR above the literal 1 appears
// as a <1 x i64> <i64 -1> operand to @llvm.arm.neon.vrshiftu.v1i64.
13698	uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
13699	return vrshr_n_u64(a, 1);
13700	}
13701
13702	// CHECK-LABEL: @test_vrshrq_n_s8(
13703	// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13704	// CHECK: ret <16 x i8> [[VRSHR_N]]
// Test wrapper: returns vrshrq_n_s8(a, 1). In the expected IR above the literal 1 appears
// as an all -1 <16 x i8> operand to @llvm.arm.neon.vrshifts.v16i8.
13705	int8x16_t test_vrshrq_n_s8(int8x16_t a) {
13706	return vrshrq_n_s8(a, 1);
13707	}
13708
// Codegen test: vrshrq_n_s16 with immediate 1. The expected IR bitcasts %a to
// <16 x i8> and back, then calls @llvm.arm.neon.vrshifts.v8i16 with an all -1
// second operand.
13709	// CHECK-LABEL: @test_vrshrq_n_s16(
13710	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13711	// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13712	// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13713	// CHECK: ret <8 x i16> [[VRSHR_N1]]
13714	int16x8_t test_vrshrq_n_s16(int16x8_t a) {
13715	return vrshrq_n_s16(a, 1);
13716	}
13717
// Codegen test: vrshrq_n_s32 with immediate 1. The expected IR bitcasts %a to
// <16 x i8> and back, then calls @llvm.arm.neon.vrshifts.v4i32 with an all -1
// second operand.
13718	// CHECK-LABEL: @test_vrshrq_n_s32(
13719	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13720	// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13721	// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13722	// CHECK: ret <4 x i32> [[VRSHR_N1]]
13723	int32x4_t test_vrshrq_n_s32(int32x4_t a) {
13724	return vrshrq_n_s32(a, 1);
13725	}
13726
// Codegen test: vrshrq_n_s64 with immediate 1. The expected IR bitcasts %a to
// <16 x i8> and back, then calls @llvm.arm.neon.vrshifts.v2i64 with an all -1
// second operand.
13727	// CHECK-LABEL: @test_vrshrq_n_s64(
13728	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13729	// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13730	// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
13731	// CHECK: ret <2 x i64> [[VRSHR_N1]]
13732	int64x2_t test_vrshrq_n_s64(int64x2_t a) {
13733	return vrshrq_n_s64(a, 1);
13734	}
13735
// Codegen test: vrshrq_n_u8 with immediate 1. The expected IR is a single call
// to @llvm.arm.neon.vrshiftu.v16i8 on %a with an all -1 second operand.
13736	// CHECK-LABEL: @test_vrshrq_n_u8(
13737	// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13738	// CHECK: ret <16 x i8> [[VRSHR_N]]
13739	uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
13740	return vrshrq_n_u8(a, 1);
13741	}
13742
// Codegen test: vrshrq_n_u16 with immediate 1. The expected IR bitcasts %a to
// <16 x i8> and back, then calls @llvm.arm.neon.vrshiftu.v8i16 with an all -1
// second operand.
13743	// CHECK-LABEL: @test_vrshrq_n_u16(
13744	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13745	// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13746	// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13747	// CHECK: ret <8 x i16> [[VRSHR_N1]]
13748	uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
13749	return vrshrq_n_u16(a, 1);
13750	}
13751
// Codegen test: vrshrq_n_u32 with immediate 1. The expected IR bitcasts %a to
// <16 x i8> and back, then calls @llvm.arm.neon.vrshiftu.v4i32 with an all -1
// second operand.
13752	// CHECK-LABEL: @test_vrshrq_n_u32(
13753	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13754	// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13755	// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13756	// CHECK: ret <4 x i32> [[VRSHR_N1]]
13757	uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
13758	return vrshrq_n_u32(a, 1);
13759	}
13760
// Codegen test: vrshrq_n_u64 with immediate 1. The expected IR bitcasts %a to
// <16 x i8> and back, then calls @llvm.arm.neon.vrshiftu.v2i64 with an all -1
// second operand.
13761	// CHECK-LABEL: @test_vrshrq_n_u64(
13762	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13763	// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13764	// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
13765	// CHECK: ret <2 x i64> [[VRSHR_N1]]
13766	uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
13767	return vrshrq_n_u64(a, 1);
13768	}
13769
// Codegen test: vrsqrte_f32. The expected IR bitcasts %a to <8 x i8>, then
// calls @llvm.arm.neon.vrsqrte.v2f32 directly on %a and returns the result.
13770	// CHECK-LABEL: @test_vrsqrte_f32(
13771	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
13772	// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a)
13773	// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
13774	float32x2_t test_vrsqrte_f32(float32x2_t a) {
13775	return vrsqrte_f32(a);
13776	}
13777
// Codegen test: vrsqrte_u32. The expected IR bitcasts %a to <8 x i8>, then
// calls @llvm.arm.neon.vrsqrte.v2i32 directly on %a and returns the result.
13778	// CHECK-LABEL: @test_vrsqrte_u32(
13779	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13780	// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
13781	// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
13782	uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
13783	return vrsqrte_u32(a);
13784	}
13785
// Codegen test: vrsqrteq_f32. The expected IR bitcasts %a to <16 x i8>, then
// calls @llvm.arm.neon.vrsqrte.v4f32 directly on %a and returns the result.
13786	// CHECK-LABEL: @test_vrsqrteq_f32(
13787	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
13788	// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a)
13789	// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
13790	float32x4_t test_vrsqrteq_f32(float32x4_t a) {
13791	return vrsqrteq_f32(a);
13792	}
13793
// Codegen test: vrsqrteq_u32. The expected IR bitcasts %a to <16 x i8>, then
// calls @llvm.arm.neon.vrsqrte.v4i32 directly on %a and returns the result.
13794	// CHECK-LABEL: @test_vrsqrteq_u32(
13795	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13796	// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
13797	// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
13798	uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
13799	return vrsqrteq_u32(a);
13800	}
13801
// Codegen test: vrsqrts_f32. The expected IR bitcasts both operands to
// <8 x i8>, calls @llvm.arm.neon.vrsqrts.v2f32 on %a and %b, bitcasts the
// result to <8 x i8>, and returns the call result itself.
13802	// CHECK-LABEL: @test_vrsqrts_f32(
13803	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
13804	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
13805	// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b)
13806	// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
13807	// CHECK: ret <2 x float> [[VRSQRTS_V2_I]]
13808	float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
13809	return vrsqrts_f32(a, b);
13810	}
13811
// Codegen test: vrsqrtsq_f32. The expected IR bitcasts both operands to
// <16 x i8>, calls @llvm.arm.neon.vrsqrts.v4f32 on %a and %b, bitcasts the
// result to <16 x i8>, and returns the call result itself.
13812	// CHECK-LABEL: @test_vrsqrtsq_f32(
13813	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
13814	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
13815	// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b)
13816	// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
13817	// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]]
13818	float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
13819	return vrsqrtsq_f32(a, b);
13820	}
13821
// Codegen test: vrsra_n_s8 with immediate 1. The expected IR calls
// @llvm.arm.neon.vrshifts.v8i8 on %b with an all -1 second operand and then
// adds the result to %a.
13822	// CHECK-LABEL: @test_vrsra_n_s8(
13823	// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13824	// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
13825	// CHECK: ret <8 x i8> [[VRSRA_N]]
13826	int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
13827	return vrsra_n_s8(a, b, 1);
13828	}
13829
// Codegen test: vrsra_n_s16 with immediate 1. The expected IR round-trips %a
// and %b through <8 x i8> bitcasts, calls @llvm.arm.neon.vrshifts.v4i16 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13830	// CHECK-LABEL: @test_vrsra_n_s16(
13831	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13832	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13833	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13834	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
13835	// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
13836	// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
13837	// CHECK: ret <4 x i16> [[VRSRA_N]]
13838	int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
13839	return vrsra_n_s16(a, b, 1);
13840	}
13841
// Codegen test: vrsra_n_s32 with immediate 1. The expected IR round-trips %a
// and %b through <8 x i8> bitcasts, calls @llvm.arm.neon.vrshifts.v2i32 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13842	// CHECK-LABEL: @test_vrsra_n_s32(
13843	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13844	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13845	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13846	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
13847	// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
13848	// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
13849	// CHECK: ret <2 x i32> [[VRSRA_N]]
13850	int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
13851	return vrsra_n_s32(a, b, 1);
13852	}
13853
// Codegen test: vrsra_n_s64 with immediate 1. The expected IR round-trips %a
// and %b through <8 x i8> bitcasts, calls @llvm.arm.neon.vrshifts.v1i64 on the
// %b copy with a <i64 -1> second operand, and adds the result to the %a copy.
13854	// CHECK-LABEL: @test_vrsra_n_s64(
13855	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13856	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
13857	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13858	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
13859	// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
13860	// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
13861	// CHECK: ret <1 x i64> [[VRSRA_N]]
13862	int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
13863	return vrsra_n_s64(a, b, 1);
13864	}
13865
// Codegen test: vrsra_n_u8 with immediate 1. The expected IR calls
// @llvm.arm.neon.vrshiftu.v8i8 on %b with an all -1 second operand and then
// adds the result to %a.
13866	// CHECK-LABEL: @test_vrsra_n_u8(
13867	// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13868	// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
13869	// CHECK: ret <8 x i8> [[VRSRA_N]]
13870	uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
13871	return vrsra_n_u8(a, b, 1);
13872	}
13873
// Codegen test: vrsra_n_u16 with immediate 1. The expected IR round-trips %a
// and %b through <8 x i8> bitcasts, calls @llvm.arm.neon.vrshiftu.v4i16 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13874	// CHECK-LABEL: @test_vrsra_n_u16(
13875	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
13876	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
13877	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
13878	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
13879	// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
13880	// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
13881	// CHECK: ret <4 x i16> [[VRSRA_N]]
13882	uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
13883	return vrsra_n_u16(a, b, 1);
13884	}
13885
// Codegen test: vrsra_n_u32 with immediate 1. The expected IR round-trips %a
// and %b through <8 x i8> bitcasts, calls @llvm.arm.neon.vrshiftu.v2i32 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13886	// CHECK-LABEL: @test_vrsra_n_u32(
13887	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
13888	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
13889	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
13890	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
13891	// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
13892	// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
13893	// CHECK: ret <2 x i32> [[VRSRA_N]]
13894	uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
13895	return vrsra_n_u32(a, b, 1);
13896	}
13897
// Codegen test: vrsra_n_u64 with immediate 1. The expected IR round-trips %a
// and %b through <8 x i8> bitcasts, calls @llvm.arm.neon.vrshiftu.v1i64 on the
// %b copy with a <i64 -1> second operand, and adds the result to the %a copy.
13898	// CHECK-LABEL: @test_vrsra_n_u64(
13899	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
13900	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
13901	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
13902	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
13903	// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
13904	// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
13905	// CHECK: ret <1 x i64> [[VRSRA_N]]
13906	uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
13907	return vrsra_n_u64(a, b, 1);
13908	}
13909
// Codegen test: vrsraq_n_s8 with immediate 1. The expected IR calls
// @llvm.arm.neon.vrshifts.v16i8 on %b with an all -1 second operand and then
// adds the result to %a.
13910	// CHECK-LABEL: @test_vrsraq_n_s8(
13911	// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13912	// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
13913	// CHECK: ret <16 x i8> [[VRSRA_N]]
13914	int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
13915	return vrsraq_n_s8(a, b, 1);
13916	}
13917
// Codegen test: vrsraq_n_s16 with immediate 1. The expected IR round-trips %a
// and %b through <16 x i8> bitcasts, calls @llvm.arm.neon.vrshifts.v8i16 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13918	// CHECK-LABEL: @test_vrsraq_n_s16(
13919	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13920	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13921	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13922	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13923	// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13924	// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
13925	// CHECK: ret <8 x i16> [[VRSRA_N]]
13926	int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
13927	return vrsraq_n_s16(a, b, 1);
13928	}
13929
// Codegen test: vrsraq_n_s32 with immediate 1. The expected IR round-trips %a
// and %b through <16 x i8> bitcasts, calls @llvm.arm.neon.vrshifts.v4i32 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13930	// CHECK-LABEL: @test_vrsraq_n_s32(
13931	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13932	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13933	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13934	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13935	// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13936	// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
13937	// CHECK: ret <4 x i32> [[VRSRA_N]]
13938	int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
13939	return vrsraq_n_s32(a, b, 1);
13940	}
13941
// Codegen test: vrsraq_n_s64 with immediate 1. The expected IR round-trips %a
// and %b through <16 x i8> bitcasts, calls @llvm.arm.neon.vrshifts.v2i64 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13942	// CHECK-LABEL: @test_vrsraq_n_s64(
13943	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13944	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13945	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13946	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13947	// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
13948	// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
13949	// CHECK: ret <2 x i64> [[VRSRA_N]]
13950	int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
13951	return vrsraq_n_s64(a, b, 1);
13952	}
13953
// Codegen test: vrsraq_n_u8 with immediate 1. The expected IR calls
// @llvm.arm.neon.vrshiftu.v16i8 on %b with an all -1 second operand and then
// adds the result to %a.
13954	// CHECK-LABEL: @test_vrsraq_n_u8(
13955	// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
13956	// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
13957	// CHECK: ret <16 x i8> [[VRSRA_N]]
13958	uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
13959	return vrsraq_n_u8(a, b, 1);
13960	}
13961
// Codegen test: vrsraq_n_u16 with immediate 1. The expected IR round-trips %a
// and %b through <16 x i8> bitcasts, calls @llvm.arm.neon.vrshiftu.v8i16 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13962	// CHECK-LABEL: @test_vrsraq_n_u16(
13963	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13964	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13965	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13966	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13967	// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
13968	// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
13969	// CHECK: ret <8 x i16> [[VRSRA_N]]
13970	uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
13971	return vrsraq_n_u16(a, b, 1);
13972	}
13973
// Codegen test: vrsraq_n_u32 with immediate 1. The expected IR round-trips %a
// and %b through <16 x i8> bitcasts, calls @llvm.arm.neon.vrshiftu.v4i32 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13974	// CHECK-LABEL: @test_vrsraq_n_u32(
13975	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13976	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13977	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13978	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13979	// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
13980	// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
13981	// CHECK: ret <4 x i32> [[VRSRA_N]]
13982	uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
13983	return vrsraq_n_u32(a, b, 1);
13984	}
13985
// Codegen test: vrsraq_n_u64 with immediate 1. The expected IR round-trips %a
// and %b through <16 x i8> bitcasts, calls @llvm.arm.neon.vrshiftu.v2i64 on the
// %b copy with an all -1 second operand, and adds the result to the %a copy.
13986	// CHECK-LABEL: @test_vrsraq_n_u64(
13987	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13988	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13989	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13990	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13991	// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
13992	// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
13993	// CHECK: ret <2 x i64> [[VRSRA_N]]
13994	uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
13995	return vrsraq_n_u64(a, b, 1);
13996	}
13997
// Codegen test: vrsubhn_s16. The expected IR bitcasts both <8 x i16> operands
// to <16 x i8>, then calls @llvm.arm.neon.vrsubhn.v8i8 on %a and %b and returns
// the <8 x i8> result.
13998	// CHECK-LABEL: @test_vrsubhn_s16(
13999	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14000	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14001	// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
14002	// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
14003	int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
14004	return vrsubhn_s16(a, b);
14005	}
14006
// Codegen test: vrsubhn_s32. The expected IR bitcasts both <4 x i32> operands
// to <16 x i8>, calls @llvm.arm.neon.vrsubhn.v4i16 on %a and %b, bitcasts the
// result to <8 x i8>, and returns the <4 x i16> call result.
14007	// CHECK-LABEL: @test_vrsubhn_s32(
14008	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14009	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14010	// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
14011	// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
14012	// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
14013	int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
14014	return vrsubhn_s32(a, b);
14015	}
14016
// Codegen test: vrsubhn_s64. The expected IR bitcasts both <2 x i64> operands
// to <16 x i8>, calls @llvm.arm.neon.vrsubhn.v2i32 on %a and %b, bitcasts the
// result to <8 x i8>, and returns the <2 x i32> call result.
14017	// CHECK-LABEL: @test_vrsubhn_s64(
14018	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14019	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14020	// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
14021	// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
14022	// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
14023	int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
14024	return vrsubhn_s64(a, b);
14025	}
14026
// Codegen test: vrsubhn_u16. The expected IR matches the signed s16 variant:
// two bitcasts to <16 x i8>, then a call to @llvm.arm.neon.vrsubhn.v8i8 on %a
// and %b.
14027	// CHECK-LABEL: @test_vrsubhn_u16(
14028	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14029	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14030	// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
14031	// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
14032	uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
14033	return vrsubhn_u16(a, b);
14034	}
14035
// Codegen test: vrsubhn_u32. The expected IR bitcasts both <4 x i32> operands
// to <16 x i8>, calls @llvm.arm.neon.vrsubhn.v4i16 on %a and %b, bitcasts the
// result to <8 x i8>, and returns the <4 x i16> call result.
14036	// CHECK-LABEL: @test_vrsubhn_u32(
14037	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14038	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14039	// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
14040	// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
14041	// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
14042	uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
14043	return vrsubhn_u32(a, b);
14044	}
14045
// Codegen test: vrsubhn_u64. The expected IR bitcasts both <2 x i64> operands
// to <16 x i8>, calls @llvm.arm.neon.vrsubhn.v2i32 on %a and %b, bitcasts the
// result to <8 x i8>, and returns the <2 x i32> call result.
14046	// CHECK-LABEL: @test_vrsubhn_u64(
14047	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14048	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14049	// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
14050	// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
14051	// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
14052	uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
14053	return vrsubhn_u64(a, b);
14054	}
14055
// Codegen test: vset_lane_u8 at lane 7. The expected IR is a single
// insertelement of %a into %b at index 7.
14056	// CHECK-LABEL: @test_vset_lane_u8(
14057	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
14058	// CHECK: ret <8 x i8> [[VSET_LANE]]
14059	uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
14060	return vset_lane_u8(a, b, 7);
14061	}
14062
// Codegen test: vset_lane_u16 at lane 3. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 3.
14063	// CHECK-LABEL: @test_vset_lane_u16(
14064	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14065	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14066	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
14067	// CHECK: ret <4 x i16> [[VSET_LANE]]
14068	uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
14069	return vset_lane_u16(a, b, 3);
14070	}
14071
// Codegen test: vset_lane_u32 at lane 1. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 1.
14072	// CHECK-LABEL: @test_vset_lane_u32(
14073	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14074	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14075	// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
14076	// CHECK: ret <2 x i32> [[VSET_LANE]]
14077	uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
14078	return vset_lane_u32(a, b, 1);
14079	}
14080
// Codegen test: vset_lane_s8 at lane 7. The expected IR is a single
// insertelement of %a into %b at index 7.
14081	// CHECK-LABEL: @test_vset_lane_s8(
14082	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
14083	// CHECK: ret <8 x i8> [[VSET_LANE]]
14084	int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
14085	return vset_lane_s8(a, b, 7);
14086	}
14087
// Codegen test: vset_lane_s16 at lane 3. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 3.
14088	// CHECK-LABEL: @test_vset_lane_s16(
14089	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14090	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14091	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
14092	// CHECK: ret <4 x i16> [[VSET_LANE]]
14093	int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
14094	return vset_lane_s16(a, b, 3);
14095	}
14096
// Codegen test: vset_lane_s32 at lane 1. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 1.
14097	// CHECK-LABEL: @test_vset_lane_s32(
14098	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14099	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14100	// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
14101	// CHECK: ret <2 x i32> [[VSET_LANE]]
14102	int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
14103	return vset_lane_s32(a, b, 1);
14104	}
14105
// Codegen test: vset_lane_p8 at lane 7. The expected IR is a single
// insertelement of %a into %b at index 7.
14106	// CHECK-LABEL: @test_vset_lane_p8(
14107	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
14108	// CHECK: ret <8 x i8> [[VSET_LANE]]
14109	poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
14110	return vset_lane_p8(a, b, 7);
14111	}
14112
// Codegen test: vset_lane_p16 at lane 3. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 3.
14113	// CHECK-LABEL: @test_vset_lane_p16(
14114	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14115	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14116	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
14117	// CHECK: ret <4 x i16> [[VSET_LANE]]
14118	poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
14119	return vset_lane_p16(a, b, 3);
14120	}
14121
// Codegen test: vset_lane_f32 at lane 1. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts the float %a at index 1.
14122	// CHECK-LABEL: @test_vset_lane_f32(
14123	// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
14124	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
14125	// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
14126	// CHECK: ret <2 x float> [[VSET_LANE]]
14127	float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
14128	return vset_lane_f32(a, b, 1);
14129	}
14130
// Codegen test: vset_lane_f16 at lane 1. The scalar is passed by pointer, so
// the expected IR first loads the half through %a. It then spills the scalar
// and the <4 x half> vector to stack temporaries, reloads them as i16 and
// <4 x i16>, performs the insertelement at index 1 on the integer vector, and
// converts the result back to <4 x half> through a third temporary.
// Every value capture uses the [[NAME:%.*]] form and every memory operand is a
// typed pointer (half*, i16*, <4 x half>*, <4 x i16>*), matching the store
// lines in this same sequence.
14131	// CHECK-LABEL: @test_vset_lane_f16(
14132	// CHECK: [[__REINT_246:%.*]] = alloca half, align 2
14133	// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
14134	// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
14135	// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
14136	// CHECK: store half [[TMP0]], half* [[__REINT_246]], align 2
14137	// CHECK: store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
14138	// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
14139	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
14140	// CHECK: [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
14141	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
14142	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
14143	// CHECK: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
14144	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1
14145	// CHECK: store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
14146	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
14147	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
14148	// CHECK: ret <4 x half> [[TMP8]]
14149	float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
14150	return vset_lane_f16(*a, b, 1);
14151	}
14152
// Codegen test: vsetq_lane_u8 at lane 15. The expected IR is a single
// insertelement of %a into %b at index 15.
14153	// CHECK-LABEL: @test_vsetq_lane_u8(
14154	// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
14155	// CHECK: ret <16 x i8> [[VSET_LANE]]
14156	uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
14157	return vsetq_lane_u8(a, b, 15);
14158	}
14159
// Codegen test: vsetq_lane_u16 at lane 7. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 7.
14160	// CHECK-LABEL: @test_vsetq_lane_u16(
14161	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14162	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14163	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
14164	// CHECK: ret <8 x i16> [[VSET_LANE]]
14165	uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
14166	return vsetq_lane_u16(a, b, 7);
14167	}
14168
// Codegen test: vsetq_lane_u32 at lane 3. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 3.
14169	// CHECK-LABEL: @test_vsetq_lane_u32(
14170	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14171	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14172	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
14173	// CHECK: ret <4 x i32> [[VSET_LANE]]
14174	uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
14175	return vsetq_lane_u32(a, b, 3);
14176	}
14177
// Codegen test: vsetq_lane_s8 at lane 15. The expected IR is a single
// insertelement of %a into %b at index 15.
14178	// CHECK-LABEL: @test_vsetq_lane_s8(
14179	// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
14180	// CHECK: ret <16 x i8> [[VSET_LANE]]
14181	int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
14182	return vsetq_lane_s8(a, b, 15);
14183	}
14184
// Codegen test: vsetq_lane_s16 at lane 7. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 7.
14185	// CHECK-LABEL: @test_vsetq_lane_s16(
14186	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14187	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14188	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
14189	// CHECK: ret <8 x i16> [[VSET_LANE]]
14190	int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
14191	return vsetq_lane_s16(a, b, 7);
14192	}
14193
// Codegen test: vsetq_lane_s32 at lane 3. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 3.
14194	// CHECK-LABEL: @test_vsetq_lane_s32(
14195	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14196	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14197	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
14198	// CHECK: ret <4 x i32> [[VSET_LANE]]
14199	int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
14200	return vsetq_lane_s32(a, b, 3);
14201	}
14202
// Codegen test: vsetq_lane_p8 at lane 15. The expected IR is a single
// insertelement of %a into %b at index 15.
14203	// CHECK-LABEL: @test_vsetq_lane_p8(
14204	// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
14205	// CHECK: ret <16 x i8> [[VSET_LANE]]
14206	poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
14207	return vsetq_lane_p8(a, b, 15);
14208	}
14209
// Codegen test: vsetq_lane_p16 at lane 7. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 7.
14210	// CHECK-LABEL: @test_vsetq_lane_p16(
14211	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14212	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14213	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
14214	// CHECK: ret <8 x i16> [[VSET_LANE]]
14215	poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
14216	return vsetq_lane_p16(a, b, 7);
14217	}
14218
// Codegen test: vsetq_lane_f32 at lane 3. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts the float %a at index 3.
14219	// CHECK-LABEL: @test_vsetq_lane_f32(
14220	// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
14221	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
14222	// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
14223	// CHECK: ret <4 x float> [[VSET_LANE]]
14224	float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
14225	return vsetq_lane_f32(a, b, 3);
14226	}
14227
// Codegen test: vsetq_lane_f16 at lane 3. The scalar is passed by pointer, so
// the expected IR first loads the half through %a. It then spills the scalar
// and the <8 x half> vector to stack temporaries, reloads them as i16 and
// <8 x i16>, performs the insertelement at index 3 on the integer vector, and
// converts the result back to <8 x half> through a third temporary.
// Every value capture uses the [[NAME:%.*]] form and every memory operand is a
// typed pointer (half*, i16*, <8 x half>*, <8 x i16>*), matching the store
// lines in this same sequence.
14228	// CHECK-LABEL: @test_vsetq_lane_f16(
14229	// CHECK: [[__REINT_248:%.*]] = alloca half, align 2
14230	// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
14231	// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
14232	// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2
14233	// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2
14234	// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
14235	// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
14236	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
14237	// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
14238	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
14239	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
14240	// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
14241	// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
14242	// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
14243	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
14244	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
14245	// CHECK: ret <8 x half> [[TMP8]]
14246	float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
14247	return vsetq_lane_f16(*a, b, 3);
14248	}
14249
// Codegen test: vset_lane_s64 at lane 0. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 0.
14250	// CHECK-LABEL: @test_vset_lane_s64(
14251	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14252	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14253	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
14254	// CHECK: ret <1 x i64> [[VSET_LANE]]
14255	int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
14256	return vset_lane_s64(a, b, 0);
14257	}
14258
// Codegen test: vset_lane_u64 at lane 0. The expected IR round-trips %b through
// an <8 x i8> bitcast, then inserts %a at index 0.
14259	// CHECK-LABEL: @test_vset_lane_u64(
14260	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14261	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14262	// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
14263	// CHECK: ret <1 x i64> [[VSET_LANE]]
14264	uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
14265	return vset_lane_u64(a, b, 0);
14266	}
14267
// Codegen test: vsetq_lane_s64 at lane 1. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 1.
14268	// CHECK-LABEL: @test_vsetq_lane_s64(
14269	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14270	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14271	// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
14272	// CHECK: ret <2 x i64> [[VSET_LANE]]
14273	int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
14274	return vsetq_lane_s64(a, b, 1);
14275	}
14276
// Codegen test: vsetq_lane_u64 at lane 1. The expected IR round-trips %b
// through a <16 x i8> bitcast, then inserts %a at index 1.
14277	// CHECK-LABEL: @test_vsetq_lane_u64(
14278	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14279	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14280	// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
14281	// CHECK: ret <2 x i64> [[VSET_LANE]]
14282	uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
14283	return vsetq_lane_u64(a, b, 1);
14284	}
14285
// Codegen test: vshl_s8. The expected IR is a single call to
// @llvm.arm.neon.vshifts.v8i8 on %a and %b.
14286	// CHECK-LABEL: @test_vshl_s8(
14287	// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
14288	// CHECK: ret <8 x i8> [[VSHL_V_I]]
14289	int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
14290	return vshl_s8(a, b);
14291	}
14292
// Codegen test: vshl_s16. The expected IR bitcasts both operands to <8 x i8>,
// calls @llvm.arm.neon.vshifts.v4i16 on %a and %b, bitcasts the result to
// <8 x i8>, and returns the call result itself.
14293	// CHECK-LABEL: @test_vshl_s16(
14294	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14295	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14296	// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
14297	// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
14298	// CHECK: ret <4 x i16> [[VSHL_V2_I]]
14299	int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
14300	return vshl_s16(a, b);
14301	}
14302
// Codegen test: vshl_s32. The expected IR bitcasts both operands to <8 x i8>,
// calls @llvm.arm.neon.vshifts.v2i32 on %a and %b, bitcasts the result to
// <8 x i8>, and returns the call result itself.
14303	// CHECK-LABEL: @test_vshl_s32(
14304	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14305	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14306	// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
14307	// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
14308	// CHECK: ret <2 x i32> [[VSHL_V2_I]]
14309	int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
14310	return vshl_s32(a, b);
14311	}
14312
// Codegen test: vshl_s64. The expected IR bitcasts both operands to <8 x i8>,
// calls @llvm.arm.neon.vshifts.v1i64 on %a and %b, bitcasts the result to
// <8 x i8>, and returns the call result itself.
14313	// CHECK-LABEL: @test_vshl_s64(
14314	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14315	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14316	// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
14317	// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
14318	// CHECK: ret <1 x i64> [[VSHL_V2_I]]
14319	int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
14320	return vshl_s64(a, b);
14321	}
14322
// Codegen test: vshl_u8 (unsigned value, signed int8x8_t shift operand). The
// expected IR is a single call to @llvm.arm.neon.vshiftu.v8i8 on %a and %b.
14323	// CHECK-LABEL: @test_vshl_u8(
14324	// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
14325	// CHECK: ret <8 x i8> [[VSHL_V_I]]
14326	uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
14327	return vshl_u8(a, b);
14328	}
14329
14330	// CHECK-LABEL: @test_vshl_u16(
14331	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14332	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14333	// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
14334	// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
14335	// CHECK: ret <4 x i16> [[VSHL_V2_I]]
// Calls vshl_u16(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v4i16 on %a and %b.
14336	uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
14337	return vshl_u16(a, b);
14338	}
14339
14340	// CHECK-LABEL: @test_vshl_u32(
14341	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14342	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14343	// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
14344	// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
14345	// CHECK: ret <2 x i32> [[VSHL_V2_I]]
// Calls vshl_u32(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v2i32 on %a and %b.
14346	uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
14347	return vshl_u32(a, b);
14348	}
14349
14350	// CHECK-LABEL: @test_vshl_u64(
14351	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14352	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14353	// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
14354	// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
14355	// CHECK: ret <1 x i64> [[VSHL_V2_I]]
// Calls vshl_u64(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v1i64 on %a and %b.
14356	uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
14357	return vshl_u64(a, b);
14358	}
14359
14360	// CHECK-LABEL: @test_vshlq_s8(
14361	// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
14362	// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
// Calls vshlq_s8(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vshifts.v16i8 on %a and %b.
14363	int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
14364	return vshlq_s8(a, b);
14365	}
14366
14367	// CHECK-LABEL: @test_vshlq_s16(
14368	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14369	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14370	// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
14371	// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
14372	// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
// Calls vshlq_s16(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vshifts.v8i16 on %a and %b.
14373	int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
14374	return vshlq_s16(a, b);
14375	}
14376
14377	// CHECK-LABEL: @test_vshlq_s32(
14378	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14379	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14380	// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
14381	// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
14382	// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
// Calls vshlq_s32(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vshifts.v4i32 on %a and %b.
14383	int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
14384	return vshlq_s32(a, b);
14385	}
14386
14387	// CHECK-LABEL: @test_vshlq_s64(
14388	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14389	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14390	// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
14391	// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
14392	// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
// Calls vshlq_s64(a, b); the FileCheck lines above expect a call to @llvm.arm.neon.vshifts.v2i64 on %a and %b.
14393	int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
14394	return vshlq_s64(a, b);
14395	}
14396
14397	// CHECK-LABEL: @test_vshlq_u8(
14398	// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
14399	// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
// Calls vshlq_u8(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v16i8 on %a and %b.
14400	uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
14401	return vshlq_u8(a, b);
14402	}
14403
14404	// CHECK-LABEL: @test_vshlq_u16(
14405	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14406	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14407	// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
14408	// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
14409	// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
// Calls vshlq_u16(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v8i16 on %a and %b.
14410	uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
14411	return vshlq_u16(a, b);
14412	}
14413
14414	// CHECK-LABEL: @test_vshlq_u32(
14415	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14416	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14417	// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
14418	// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
14419	// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
// Calls vshlq_u32(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v4i32 on %a and %b.
14420	uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
14421	return vshlq_u32(a, b);
14422	}
14423
14424	// CHECK-LABEL: @test_vshlq_u64(
14425	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14426	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14427	// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
14428	// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
14429	// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
// Calls vshlq_u64(a, b) with an unsigned first operand and a signed second operand;
// the FileCheck lines above expect a call to @llvm.arm.neon.vshiftu.v2i64 on %a and %b.
14430	uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
14431	return vshlq_u64(a, b);
14432	}
14433
14434	// CHECK-LABEL: @test_vshll_n_s8(
14435	// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
14436	// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14437	// CHECK: ret <8 x i16> [[VSHLL_N]]
// Calls vshll_n_s8(a, 1); the FileCheck lines above expect a sext of %a to <8 x i16> followed by shl by a splat of 1.
14438	int16x8_t test_vshll_n_s8(int8x8_t a) {
14439	return vshll_n_s8(a, 1);
14440	}
14441
14442	// CHECK-LABEL: @test_vshll_n_s16(
14443	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14444	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14445	// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
14446	// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
14447	// CHECK: ret <4 x i32> [[VSHLL_N]]
// Calls vshll_n_s16(a, 1); the FileCheck lines above expect a sext to <4 x i32> followed by shl by a splat of 1.
14448	int32x4_t test_vshll_n_s16(int16x4_t a) {
14449	return vshll_n_s16(a, 1);
14450	}
14451
14452	// CHECK-LABEL: @test_vshll_n_s32(
14453	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14454	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14455	// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
14456	// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
14457	// CHECK: ret <2 x i64> [[VSHLL_N]]
// Calls vshll_n_s32(a, 1); the FileCheck lines above expect a sext to <2 x i64> followed by shl by a splat of 1.
14458	int64x2_t test_vshll_n_s32(int32x2_t a) {
14459	return vshll_n_s32(a, 1);
14460	}
14461
14462	// CHECK-LABEL: @test_vshll_n_u8(
14463	// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
14464	// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14465	// CHECK: ret <8 x i16> [[VSHLL_N]]
// Calls vshll_n_u8(a, 1); the FileCheck lines above expect a zext of %a to <8 x i16> followed by shl by a splat of 1.
14466	uint16x8_t test_vshll_n_u8(uint8x8_t a) {
14467	return vshll_n_u8(a, 1);
14468	}
14469
14470	// CHECK-LABEL: @test_vshll_n_u16(
14471	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14472	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14473	// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
14474	// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
14475	// CHECK: ret <4 x i32> [[VSHLL_N]]
// Calls vshll_n_u16(a, 1); the FileCheck lines above expect a zext to <4 x i32> followed by shl by a splat of 1.
14476	uint32x4_t test_vshll_n_u16(uint16x4_t a) {
14477	return vshll_n_u16(a, 1);
14478	}
14479
14480	// CHECK-LABEL: @test_vshll_n_u32(
14481	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14482	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14483	// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
14484	// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
14485	// CHECK: ret <2 x i64> [[VSHLL_N]]
// Calls vshll_n_u32(a, 1); the FileCheck lines above expect a zext to <2 x i64> followed by shl by a splat of 1.
14486	uint64x2_t test_vshll_n_u32(uint32x2_t a) {
14487	return vshll_n_u32(a, 1);
14488	}
14489
14490	// CHECK-LABEL: @test_vshl_n_s8(
14491	// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14492	// CHECK: ret <8 x i8> [[VSHL_N]]
// Calls vshl_n_s8(a, 1); the FileCheck lines above expect a plain shl of <8 x i8> %a by a splat of 1.
14493	int8x8_t test_vshl_n_s8(int8x8_t a) {
14494	return vshl_n_s8(a, 1);
14495	}
14496
14497	// CHECK-LABEL: @test_vshl_n_s16(
14498	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14499	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14500	// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
14501	// CHECK: ret <4 x i16> [[VSHL_N]]
// Calls vshl_n_s16(a, 1); the FileCheck lines above expect a plain shl of a <4 x i16> value by a splat of 1.
14502	int16x4_t test_vshl_n_s16(int16x4_t a) {
14503	return vshl_n_s16(a, 1);
14504	}
14505
14506	// CHECK-LABEL: @test_vshl_n_s32(
14507	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14508	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14509	// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
14510	// CHECK: ret <2 x i32> [[VSHL_N]]
// Calls vshl_n_s32(a, 1); the FileCheck lines above expect a plain shl of a <2 x i32> value by a splat of 1.
14511	int32x2_t test_vshl_n_s32(int32x2_t a) {
14512	return vshl_n_s32(a, 1);
14513	}
14514
14515	// CHECK-LABEL: @test_vshl_n_s64(
14516	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14517	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14518	// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
14519	// CHECK: ret <1 x i64> [[VSHL_N]]
// Calls vshl_n_s64(a, 1); the FileCheck lines above expect a plain shl of a <1 x i64> value by <i64 1>.
14520	int64x1_t test_vshl_n_s64(int64x1_t a) {
14521	return vshl_n_s64(a, 1);
14522	}
14523
14524	// CHECK-LABEL: @test_vshl_n_u8(
14525	// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14526	// CHECK: ret <8 x i8> [[VSHL_N]]
// Calls vshl_n_u8(a, 1); the FileCheck lines above expect a plain shl of <8 x i8> %a by a splat of 1.
14527	uint8x8_t test_vshl_n_u8(uint8x8_t a) {
14528	return vshl_n_u8(a, 1);
14529	}
14530
14531	// CHECK-LABEL: @test_vshl_n_u16(
14532	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14533	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14534	// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
14535	// CHECK: ret <4 x i16> [[VSHL_N]]
// Calls vshl_n_u16(a, 1); the FileCheck lines above expect a plain shl of a <4 x i16> value by a splat of 1.
14536	uint16x4_t test_vshl_n_u16(uint16x4_t a) {
14537	return vshl_n_u16(a, 1);
14538	}
14539
14540	// CHECK-LABEL: @test_vshl_n_u32(
14541	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14542	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14543	// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
14544	// CHECK: ret <2 x i32> [[VSHL_N]]
// Calls vshl_n_u32(a, 1); the FileCheck lines above expect a plain shl of a <2 x i32> value by a splat of 1.
14545	uint32x2_t test_vshl_n_u32(uint32x2_t a) {
14546	return vshl_n_u32(a, 1);
14547	}
14548
14549	// CHECK-LABEL: @test_vshl_n_u64(
14550	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14551	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14552	// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
14553	// CHECK: ret <1 x i64> [[VSHL_N]]
// Calls vshl_n_u64(a, 1); the FileCheck lines above expect a plain shl of a <1 x i64> value by <i64 1>.
14554	uint64x1_t test_vshl_n_u64(uint64x1_t a) {
14555	return vshl_n_u64(a, 1);
14556	}
14557
14558	// CHECK-LABEL: @test_vshlq_n_s8(
14559	// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14560	// CHECK: ret <16 x i8> [[VSHL_N]]
// Calls vshlq_n_s8(a, 1); the FileCheck lines above expect a plain shl of <16 x i8> %a by a splat of 1.
14561	int8x16_t test_vshlq_n_s8(int8x16_t a) {
14562	return vshlq_n_s8(a, 1);
14563	}
14564
14565	// CHECK-LABEL: @test_vshlq_n_s16(
14566	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14567	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14568	// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14569	// CHECK: ret <8 x i16> [[VSHL_N]]
// Calls vshlq_n_s16(a, 1); the FileCheck lines above expect a plain shl of a <8 x i16> value by a splat of 1.
14570	int16x8_t test_vshlq_n_s16(int16x8_t a) {
14571	return vshlq_n_s16(a, 1);
14572	}
14573
14574	// CHECK-LABEL: @test_vshlq_n_s32(
14575	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14576	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14577	// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
14578	// CHECK: ret <4 x i32> [[VSHL_N]]
// Calls vshlq_n_s32(a, 1); the FileCheck lines above expect a plain shl of a <4 x i32> value by a splat of 1.
14579	int32x4_t test_vshlq_n_s32(int32x4_t a) {
14580	return vshlq_n_s32(a, 1);
14581	}
14582
14583	// CHECK-LABEL: @test_vshlq_n_s64(
14584	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14585	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14586	// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
14587	// CHECK: ret <2 x i64> [[VSHL_N]]
// Calls vshlq_n_s64(a, 1); the FileCheck lines above expect a plain shl of a <2 x i64> value by a splat of 1.
14588	int64x2_t test_vshlq_n_s64(int64x2_t a) {
14589	return vshlq_n_s64(a, 1);
14590	}
14591
14592	// CHECK-LABEL: @test_vshlq_n_u8(
14593	// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14594	// CHECK: ret <16 x i8> [[VSHL_N]]
// Calls vshlq_n_u8(a, 1); the FileCheck lines above expect a plain shl of <16 x i8> %a by a splat of 1.
14595	uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
14596	return vshlq_n_u8(a, 1);
14597	}
14598
14599	// CHECK-LABEL: @test_vshlq_n_u16(
14600	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14601	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14602	// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14603	// CHECK: ret <8 x i16> [[VSHL_N]]
// Calls vshlq_n_u16(a, 1); the FileCheck lines above expect a plain shl of a <8 x i16> value by a splat of 1.
14604	uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
14605	return vshlq_n_u16(a, 1);
14606	}
14607
14608	// CHECK-LABEL: @test_vshlq_n_u32(
14609	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14610	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14611	// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
14612	// CHECK: ret <4 x i32> [[VSHL_N]]
// Calls vshlq_n_u32(a, 1); the FileCheck lines above expect a plain shl of a <4 x i32> value by a splat of 1.
14613	uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
14614	return vshlq_n_u32(a, 1);
14615	}
14616
14617	// CHECK-LABEL: @test_vshlq_n_u64(
14618	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14619	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14620	// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
14621	// CHECK: ret <2 x i64> [[VSHL_N]]
// Calls vshlq_n_u64(a, 1); the FileCheck lines above expect a plain shl of a <2 x i64> value by a splat of 1.
14622	uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
14623	return vshlq_n_u64(a, 1);
14624	}
14625
14626	// CHECK-LABEL: @test_vshrn_n_s16(
14627	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14628	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14629	// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14630	// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
14631	// CHECK: ret <8 x i8> [[VSHRN_N]]
// Calls vshrn_n_s16(a, 1); the FileCheck lines above expect an ashr by a splat of 1 followed by a trunc from <8 x i16> to <8 x i8>.
14632	int8x8_t test_vshrn_n_s16(int16x8_t a) {
14633	return vshrn_n_s16(a, 1);
14634	}
14635
14636	// CHECK-LABEL: @test_vshrn_n_s32(
14637	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14638	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14639	// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
14640	// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
14641	// CHECK: ret <4 x i16> [[VSHRN_N]]
// Calls vshrn_n_s32(a, 1); the FileCheck lines above expect an ashr by a splat of 1 followed by a trunc from <4 x i32> to <4 x i16>.
14642	int16x4_t test_vshrn_n_s32(int32x4_t a) {
14643	return vshrn_n_s32(a, 1);
14644	}
14645
14646	// CHECK-LABEL: @test_vshrn_n_s64(
14647	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14648	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14649	// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
14650	// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
14651	// CHECK: ret <2 x i32> [[VSHRN_N]]
// Calls vshrn_n_s64(a, 1); the FileCheck lines above expect an ashr by a splat of 1 followed by a trunc from <2 x i64> to <2 x i32>.
14652	int32x2_t test_vshrn_n_s64(int64x2_t a) {
14653	return vshrn_n_s64(a, 1);
14654	}
14655
14656	// CHECK-LABEL: @test_vshrn_n_u16(
14657	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14658	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14659	// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14660	// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
14661	// CHECK: ret <8 x i8> [[VSHRN_N]]
// Calls vshrn_n_u16(a, 1); the FileCheck lines above expect an lshr by a splat of 1 followed by a trunc from <8 x i16> to <8 x i8>.
14662	uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
14663	return vshrn_n_u16(a, 1);
14664	}
14665
14666	// CHECK-LABEL: @test_vshrn_n_u32(
14667	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14668	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14669	// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
14670	// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
14671	// CHECK: ret <4 x i16> [[VSHRN_N]]
// Calls vshrn_n_u32(a, 1); the FileCheck lines above expect an lshr by a splat of 1 followed by a trunc from <4 x i32> to <4 x i16>.
14672	uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
14673	return vshrn_n_u32(a, 1);
14674	}
14675
14676	// CHECK-LABEL: @test_vshrn_n_u64(
14677	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14678	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14679	// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
14680	// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
14681	// CHECK: ret <2 x i32> [[VSHRN_N]]
// Calls vshrn_n_u64(a, 1); the FileCheck lines above expect an lshr by a splat of 1 followed by a trunc from <2 x i64> to <2 x i32>.
14682	uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
14683	return vshrn_n_u64(a, 1);
14684	}
14685
14686	// CHECK-LABEL: @test_vshr_n_s8(
14687	// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14688	// CHECK: ret <8 x i8> [[VSHR_N]]
// Calls vshr_n_s8(a, 1); the FileCheck lines above expect an ashr of <8 x i8> %a by a splat of 1.
14689	int8x8_t test_vshr_n_s8(int8x8_t a) {
14690	return vshr_n_s8(a, 1);
14691	}
14692
14693	// CHECK-LABEL: @test_vshr_n_s16(
14694	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14695	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14696	// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
14697	// CHECK: ret <4 x i16> [[VSHR_N]]
// Calls vshr_n_s16(a, 1); the FileCheck lines above expect an ashr of a <4 x i16> value by a splat of 1.
14698	int16x4_t test_vshr_n_s16(int16x4_t a) {
14699	return vshr_n_s16(a, 1);
14700	}
14701
14702	// CHECK-LABEL: @test_vshr_n_s32(
14703	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14704	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14705	// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
14706	// CHECK: ret <2 x i32> [[VSHR_N]]
// Calls vshr_n_s32(a, 1); the FileCheck lines above expect an ashr of a <2 x i32> value by a splat of 1.
14707	int32x2_t test_vshr_n_s32(int32x2_t a) {
14708	return vshr_n_s32(a, 1);
14709	}
14710
14711	// CHECK-LABEL: @test_vshr_n_s64(
14712	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14713	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14714	// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
14715	// CHECK: ret <1 x i64> [[VSHR_N]]
// Calls vshr_n_s64(a, 1); the FileCheck lines above expect an ashr of a <1 x i64> value by <i64 1>.
14716	int64x1_t test_vshr_n_s64(int64x1_t a) {
14717	return vshr_n_s64(a, 1);
14718	}
14719
14720	// CHECK-LABEL: @test_vshr_n_u8(
14721	// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14722	// CHECK: ret <8 x i8> [[VSHR_N]]
// Calls vshr_n_u8(a, 1); the FileCheck lines above expect an lshr of <8 x i8> %a by a splat of 1.
14723	uint8x8_t test_vshr_n_u8(uint8x8_t a) {
14724	return vshr_n_u8(a, 1);
14725	}
14726
14727	// CHECK-LABEL: @test_vshr_n_u16(
14728	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14729	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14730	// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
14731	// CHECK: ret <4 x i16> [[VSHR_N]]
// Calls vshr_n_u16(a, 1); the FileCheck lines above expect an lshr of a <4 x i16> value by a splat of 1.
14732	uint16x4_t test_vshr_n_u16(uint16x4_t a) {
14733	return vshr_n_u16(a, 1);
14734	}
14735
14736	// CHECK-LABEL: @test_vshr_n_u32(
14737	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14738	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14739	// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
14740	// CHECK: ret <2 x i32> [[VSHR_N]]
// Calls vshr_n_u32(a, 1); the FileCheck lines above expect an lshr of a <2 x i32> value by a splat of 1.
14741	uint32x2_t test_vshr_n_u32(uint32x2_t a) {
14742	return vshr_n_u32(a, 1);
14743	}
14744
14745	// CHECK-LABEL: @test_vshr_n_u64(
14746	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14747	// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14748	// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
14749	// CHECK: ret <1 x i64> [[VSHR_N]]
// Calls vshr_n_u64(a, 1); the FileCheck lines above expect an lshr of a <1 x i64> value by <i64 1>.
14750	uint64x1_t test_vshr_n_u64(uint64x1_t a) {
14751	return vshr_n_u64(a, 1);
14752	}
14753
14754	// CHECK-LABEL: @test_vshrq_n_s8(
14755	// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14756	// CHECK: ret <16 x i8> [[VSHR_N]]
// Calls vshrq_n_s8(a, 1); the FileCheck lines above expect an ashr of <16 x i8> %a by a splat of 1.
14757	int8x16_t test_vshrq_n_s8(int8x16_t a) {
14758	return vshrq_n_s8(a, 1);
14759	}
14760
14761	// CHECK-LABEL: @test_vshrq_n_s16(
14762	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14763	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14764	// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14765	// CHECK: ret <8 x i16> [[VSHR_N]]
// Calls vshrq_n_s16(a, 1); the FileCheck lines above expect an ashr of a <8 x i16> value by a splat of 1.
14766	int16x8_t test_vshrq_n_s16(int16x8_t a) {
14767	return vshrq_n_s16(a, 1);
14768	}
14769
14770	// CHECK-LABEL: @test_vshrq_n_s32(
14771	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14772	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14773	// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
14774	// CHECK: ret <4 x i32> [[VSHR_N]]
// Calls vshrq_n_s32(a, 1); the FileCheck lines above expect an ashr of a <4 x i32> value by a splat of 1.
14775	int32x4_t test_vshrq_n_s32(int32x4_t a) {
14776	return vshrq_n_s32(a, 1);
14777	}
14778
14779	// CHECK-LABEL: @test_vshrq_n_s64(
14780	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14781	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14782	// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
14783	// CHECK: ret <2 x i64> [[VSHR_N]]
// Calls vshrq_n_s64(a, 1); the FileCheck lines above expect an ashr of a <2 x i64> value by a splat of 1.
14784	int64x2_t test_vshrq_n_s64(int64x2_t a) {
14785	return vshrq_n_s64(a, 1);
14786	}
14787
14788	// CHECK-LABEL: @test_vshrq_n_u8(
14789	// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
14790	// CHECK: ret <16 x i8> [[VSHR_N]]
// Calls vshrq_n_u8(a, 1); the FileCheck lines above expect an lshr of <16 x i8> %a by a splat of 1.
14791	uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
14792	return vshrq_n_u8(a, 1);
14793	}
14794
14795	// CHECK-LABEL: @test_vshrq_n_u16(
14796	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14797	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14798	// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
14799	// CHECK: ret <8 x i16> [[VSHR_N]]
// Calls vshrq_n_u16(a, 1); the FileCheck lines above expect an lshr of a <8 x i16> value by a splat of 1.
14800	uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
14801	return vshrq_n_u16(a, 1);
14802	}
14803
14804	// CHECK-LABEL: @test_vshrq_n_u32(
14805	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14806	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14807	// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
14808	// CHECK: ret <4 x i32> [[VSHR_N]]
// Calls vshrq_n_u32(a, 1); the FileCheck lines above expect an lshr of a <4 x i32> value by a splat of 1.
14809	uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
14810	return vshrq_n_u32(a, 1);
14811	}
14812
14813	// CHECK-LABEL: @test_vshrq_n_u64(
14814	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14815	// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14816	// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
14817	// CHECK: ret <2 x i64> [[VSHR_N]]
// Calls vshrq_n_u64(a, 1); the FileCheck lines above expect an lshr of a <2 x i64> value by a splat of 1.
14818	uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
14819	return vshrq_n_u64(a, 1);
14820	}
14821
14822	// CHECK-LABEL: @test_vsli_n_s8(
14823	// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14824	// CHECK: ret <8 x i8> [[VSLI_N]]
// Calls vsli_n_s8(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v8i8 with %a, %b and a splat of 1.
14825	int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
14826	return vsli_n_s8(a, b, 1);
14827	}
14828
14829	// CHECK-LABEL: @test_vsli_n_s16(
14830	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14831	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14832	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14833	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14834	// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
14835	// CHECK: ret <4 x i16> [[VSLI_N2]]
// Calls vsli_n_s16(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v4i16 whose third operand is a splat of 1.
14836	int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
14837	return vsli_n_s16(a, b, 1);
14838	}
14839
14840	// CHECK-LABEL: @test_vsli_n_s32(
14841	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14842	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14843	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14844	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
14845	// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
14846	// CHECK: ret <2 x i32> [[VSLI_N2]]
// Calls vsli_n_s32(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v2i32 whose third operand is a splat of 1.
14847	int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
14848	return vsli_n_s32(a, b, 1);
14849	}
14850
14851	// CHECK-LABEL: @test_vsli_n_s64(
14852	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14853	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14854	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14855	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
14856	// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
14857	// CHECK: ret <1 x i64> [[VSLI_N2]]
// Calls vsli_n_s64(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v1i64 whose third operand is <i64 1>.
14858	int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
14859	return vsli_n_s64(a, b, 1);
14860	}
14861
14862	// CHECK-LABEL: @test_vsli_n_u8(
14863	// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14864	// CHECK: ret <8 x i8> [[VSLI_N]]
// Calls vsli_n_u8(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v8i8 with %a, %b and a splat of 1.
14865	uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
14866	return vsli_n_u8(a, b, 1);
14867	}
14868
14869	// CHECK-LABEL: @test_vsli_n_u16(
14870	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14871	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14872	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14873	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14874	// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
14875	// CHECK: ret <4 x i16> [[VSLI_N2]]
// Calls vsli_n_u16(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v4i16 whose third operand is a splat of 1.
14876	uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
14877	return vsli_n_u16(a, b, 1);
14878	}
14879
14880	// CHECK-LABEL: @test_vsli_n_u32(
14881	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14882	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
14883	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
14884	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
14885	// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
14886	// CHECK: ret <2 x i32> [[VSLI_N2]]
// Calls vsli_n_u32(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v2i32 whose third operand is a splat of 1.
14887	uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
14888	return vsli_n_u32(a, b, 1);
14889	}
14890
14891	// CHECK-LABEL: @test_vsli_n_u64(
14892	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14893	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
14894	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
14895	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
14896	// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
14897	// CHECK: ret <1 x i64> [[VSLI_N2]]
// Calls vsli_n_u64(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v1i64 whose third operand is <i64 1>.
14898	uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
14899	return vsli_n_u64(a, b, 1);
14900	}
14901
14902	// CHECK-LABEL: @test_vsli_n_p8(
14903	// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14904	// CHECK: ret <8 x i8> [[VSLI_N]]
// Calls vsli_n_p8(a, b, 1) on polynomial vectors; the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v8i8 with %a, %b and a splat of 1.
14905	poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
14906	return vsli_n_p8(a, b, 1);
14907	}
14908
14909	// CHECK-LABEL: @test_vsli_n_p16(
14910	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14911	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
14912	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
14913	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
14914	// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
14915	// CHECK: ret <4 x i16> [[VSLI_N2]]
// Calls vsli_n_p16(a, b, 1) on polynomial vectors; the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v4i16 whose third operand is a splat of 1.
14916	poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
14917	return vsli_n_p16(a, b, 1);
14918	}
14919
14920	// CHECK-LABEL: @test_vsliq_n_s8(
14921	// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14922	// CHECK: ret <16 x i8> [[VSLI_N]]
// Calls vsliq_n_s8(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v16i8 with %a, %b and a splat of 1.
14923	int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
14924	return vsliq_n_s8(a, b, 1);
14925	}
14926
14927	// CHECK-LABEL: @test_vsliq_n_s16(
14928	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14929	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14930	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14931	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14932	// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
14933	// CHECK: ret <8 x i16> [[VSLI_N2]]
// Calls vsliq_n_s16(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v8i16 whose third operand is a splat of 1.
14934	int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
14935	return vsliq_n_s16(a, b, 1);
14936	}
14937
14938	// CHECK-LABEL: @test_vsliq_n_s32(
14939	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14940	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14941	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14942	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14943	// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
14944	// CHECK: ret <4 x i32> [[VSLI_N2]]
// Calls vsliq_n_s32(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v4i32 whose third operand is a splat of 1.
14945	int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
14946	return vsliq_n_s32(a, b, 1);
14947	}
14948
14949	// CHECK-LABEL: @test_vsliq_n_s64(
14950	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14951	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14952	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14953	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
14954	// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
14955	// CHECK: ret <2 x i64> [[VSLI_N2]]
// Calls vsliq_n_s64(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v2i64 whose third operand is a splat of 1.
14956	int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
14957	return vsliq_n_s64(a, b, 1);
14958	}
14959
14960	// CHECK-LABEL: @test_vsliq_n_u8(
14961	// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
14962	// CHECK: ret <16 x i8> [[VSLI_N]]
// Calls vsliq_n_u8(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v16i8 with %a, %b and a splat of 1.
14963	uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
14964	return vsliq_n_u8(a, b, 1);
14965	}
14966
14967	// CHECK-LABEL: @test_vsliq_n_u16(
14968	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14969	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
14970	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
14971	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
14972	// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
14973	// CHECK: ret <8 x i16> [[VSLI_N2]]
// Calls vsliq_n_u16(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v8i16 whose third operand is a splat of 1.
14974	uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
14975	return vsliq_n_u16(a, b, 1);
14976	}
14977
14978	// CHECK-LABEL: @test_vsliq_n_u32(
14979	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14980	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
14981	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
14982	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
14983	// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
14984	// CHECK: ret <4 x i32> [[VSLI_N2]]
// Calls vsliq_n_u32(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v4i32 whose third operand is a splat of 1.
14985	uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
14986	return vsliq_n_u32(a, b, 1);
14987	}
14988
14989	// CHECK-LABEL: @test_vsliq_n_u64(
14990	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14991	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
14992	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
14993	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
14994	// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
14995	// CHECK: ret <2 x i64> [[VSLI_N2]]
// Calls vsliq_n_u64(a, b, 1); the FileCheck lines above expect a call to @llvm.arm.neon.vshiftins.v2i64 whose third operand is a splat of 1.
14996	uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
14997	return vsliq_n_u64(a, b, 1);
14998	}
14999
15000	// CHECK-LABEL: @test_vsliq_n_p8(
15001	// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
15002	// CHECK: ret <16 x i8> [[VSLI_N]]
15003	poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
15004	return vsliq_n_p8(a, b, 1); // immediate 1 -> expected IR passes a splat of 1 to llvm.arm.neon.vshiftins.v16i8
15005	}
15006
15007	// CHECK-LABEL: @test_vsliq_n_p16(
15008	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15009	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15010	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15011	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15012	// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
15013	// CHECK: ret <8 x i16> [[VSLI_N2]]
15014	poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
15015	return vsliq_n_p16(a, b, 1); // immediate 1 -> expected IR passes a splat of 1 to llvm.arm.neon.vshiftins.v8i16
15016	}
15017
15018	// CHECK-LABEL: @test_vsra_n_s8(
15019	// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
15020	// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
15021	// CHECK: ret <8 x i8> [[TMP0]]
15022	int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
15023	return vsra_n_s8(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<8 x i8>)
15024	}
15025
15026	// CHECK-LABEL: @test_vsra_n_s16(
15027	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15028	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15029	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15030	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15031	// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
15032	// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
15033	// CHECK: ret <4 x i16> [[TMP4]]
15034	int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
15035	return vsra_n_s16(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<4 x i16>)
15036	}
15037
15038	// CHECK-LABEL: @test_vsra_n_s32(
15039	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15040	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15041	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15042	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15043	// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
15044	// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
15045	// CHECK: ret <2 x i32> [[TMP4]]
15046	int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
15047	return vsra_n_s32(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<2 x i32>)
15048	}
15049
15050	// CHECK-LABEL: @test_vsra_n_s64(
15051	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
15052	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15053	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
15054	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15055	// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
15056	// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
15057	// CHECK: ret <1 x i64> [[TMP4]]
15058	int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
15059	return vsra_n_s64(a, b, 1); // expected IR: b is shifted with ashr by 1, then added to a (<1 x i64>)
15060	}
15061
15062	// CHECK-LABEL: @test_vsra_n_u8(
15063	// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
15064	// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
15065	// CHECK: ret <8 x i8> [[TMP0]]
15066	uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
15067	return vsra_n_u8(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<8 x i8>)
15068	}
15069
15070	// CHECK-LABEL: @test_vsra_n_u16(
15071	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15072	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15073	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15074	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15075	// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
15076	// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
15077	// CHECK: ret <4 x i16> [[TMP4]]
15078	uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
15079	return vsra_n_u16(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<4 x i16>)
15080	}
15081
15082	// CHECK-LABEL: @test_vsra_n_u32(
15083	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15084	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15085	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15086	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15087	// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
15088	// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
15089	// CHECK: ret <2 x i32> [[TMP4]]
15090	uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
15091	return vsra_n_u32(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<2 x i32>)
15092	}
15093
15094	// CHECK-LABEL: @test_vsra_n_u64(
15095	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
15096	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15097	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
15098	// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15099	// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
15100	// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
15101	// CHECK: ret <1 x i64> [[TMP4]]
15102	uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
15103	return vsra_n_u64(a, b, 1); // expected IR: b is shifted with lshr by 1, then added to a (<1 x i64>)
15104	}
15105
15106	// CHECK-LABEL: @test_vsraq_n_s8(
15107	// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
15108	// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
15109	// CHECK: ret <16 x i8> [[TMP0]]
15110	int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
15111	return vsraq_n_s8(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<16 x i8>)
15112	}
15113
15114	// CHECK-LABEL: @test_vsraq_n_s16(
15115	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15116	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15117	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15118	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15119	// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
15120	// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
15121	// CHECK: ret <8 x i16> [[TMP4]]
15122	int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
15123	return vsraq_n_s16(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<8 x i16>)
15124	}
15125
15126	// CHECK-LABEL: @test_vsraq_n_s32(
15127	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15128	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15129	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15130	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15131	// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
15132	// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
15133	// CHECK: ret <4 x i32> [[TMP4]]
15134	int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
15135	return vsraq_n_s32(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<4 x i32>)
15136	}
15137
15138	// CHECK-LABEL: @test_vsraq_n_s64(
15139	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15140	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15141	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
15142	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15143	// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
15144	// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
15145	// CHECK: ret <2 x i64> [[TMP4]]
15146	int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
15147	return vsraq_n_s64(a, b, 1); // expected IR: b is shifted with ashr by 1 per lane, then added to a (<2 x i64>)
15148	}
15149
15150	// CHECK-LABEL: @test_vsraq_n_u8(
15151	// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
15152	// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
15153	// CHECK: ret <16 x i8> [[TMP0]]
15154	uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
15155	return vsraq_n_u8(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<16 x i8>)
15156	}
15157
15158	// CHECK-LABEL: @test_vsraq_n_u16(
15159	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15160	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15161	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15162	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15163	// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
15164	// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
15165	// CHECK: ret <8 x i16> [[TMP4]]
15166	uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
15167	return vsraq_n_u16(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<8 x i16>)
15168	}
15169
15170	// CHECK-LABEL: @test_vsraq_n_u32(
15171	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15172	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15173	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15174	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15175	// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
15176	// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
15177	// CHECK: ret <4 x i32> [[TMP4]]
15178	uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
15179	return vsraq_n_u32(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<4 x i32>)
15180	}
15181
15182	// CHECK-LABEL: @test_vsraq_n_u64(
15183	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15184	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15185	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
15186	// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15187	// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
15188	// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
15189	// CHECK: ret <2 x i64> [[TMP4]]
15190	uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
15191	return vsraq_n_u64(a, b, 1); // expected IR: b is shifted with lshr by 1 per lane, then added to a (<2 x i64>)
15192	}
15193
15194	// CHECK-LABEL: @test_vsri_n_s8(
15195	// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15196	// CHECK: ret <8 x i8> [[VSLI_N]]
15197	int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
15198	return vsri_n_s8(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v8i8
15199	}
15200
15201	// CHECK-LABEL: @test_vsri_n_s16(
15202	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15203	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15204	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15205	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15206	// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
15207	// CHECK: ret <4 x i16> [[VSLI_N2]]
15208	int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
15209	return vsri_n_s16(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v4i16
15210	}
15211
15212	// CHECK-LABEL: @test_vsri_n_s32(
15213	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15214	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15215	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15216	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15217	// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
15218	// CHECK: ret <2 x i32> [[VSLI_N2]]
15219	int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
15220	return vsri_n_s32(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v2i32
15221	}
15222
15223	// CHECK-LABEL: @test_vsri_n_s64(
15224	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
15225	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15226	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
15227	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15228	// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
15229	// CHECK: ret <1 x i64> [[VSLI_N2]]
15230	int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
15231	return vsri_n_s64(a, b, 1); // immediate 1 -> expected IR passes <i64 -1> to llvm.arm.neon.vshiftins.v1i64
15232	}
15233
15234	// CHECK-LABEL: @test_vsri_n_u8(
15235	// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15236	// CHECK: ret <8 x i8> [[VSLI_N]]
15237	uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
15238	return vsri_n_u8(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v8i8
15239	}
15240
15241	// CHECK-LABEL: @test_vsri_n_u16(
15242	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15243	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15244	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15245	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15246	// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
15247	// CHECK: ret <4 x i16> [[VSLI_N2]]
15248	uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
15249	return vsri_n_u16(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v4i16
15250	}
15251
15252	// CHECK-LABEL: @test_vsri_n_u32(
15253	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15254	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15255	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15256	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15257	// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
15258	// CHECK: ret <2 x i32> [[VSLI_N2]]
15259	uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
15260	return vsri_n_u32(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v2i32
15261	}
15262
15263	// CHECK-LABEL: @test_vsri_n_u64(
15264	// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
15265	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15266	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
15267	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15268	// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
15269	// CHECK: ret <1 x i64> [[VSLI_N2]]
15270	uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
15271	return vsri_n_u64(a, b, 1); // immediate 1 -> expected IR passes <i64 -1> to llvm.arm.neon.vshiftins.v1i64
15272	}
15273
15274	// CHECK-LABEL: @test_vsri_n_p8(
15275	// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15276	// CHECK: ret <8 x i8> [[VSLI_N]]
15277	poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
15278	return vsri_n_p8(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v8i8
15279	}
15280
15281	// CHECK-LABEL: @test_vsri_n_p16(
15282	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15283	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15284	// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15285	// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15286	// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
15287	// CHECK: ret <4 x i16> [[VSLI_N2]]
15288	poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
15289	return vsri_n_p16(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v4i16
15290	}
15291
15292	// CHECK-LABEL: @test_vsriq_n_s8(
15293	// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15294	// CHECK: ret <16 x i8> [[VSLI_N]]
15295	int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
15296	return vsriq_n_s8(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v16i8
15297	}
15298
15299	// CHECK-LABEL: @test_vsriq_n_s16(
15300	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15301	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15302	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15303	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15304	// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
15305	// CHECK: ret <8 x i16> [[VSLI_N2]]
15306	int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
15307	return vsriq_n_s16(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v8i16
15308	}
15309
15310	// CHECK-LABEL: @test_vsriq_n_s32(
15311	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15312	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15313	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15314	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15315	// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
15316	// CHECK: ret <4 x i32> [[VSLI_N2]]
15317	int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
15318	return vsriq_n_s32(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v4i32
15319	}
15320
15321	// CHECK-LABEL: @test_vsriq_n_s64(
15322	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15323	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15324	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
15325	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15326	// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
15327	// CHECK: ret <2 x i64> [[VSLI_N2]]
15328	int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
15329	return vsriq_n_s64(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v2i64
15330	}
15331
15332	// CHECK-LABEL: @test_vsriq_n_u8(
15333	// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15334	// CHECK: ret <16 x i8> [[VSLI_N]]
15335	uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
15336	return vsriq_n_u8(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v16i8
15337	}
15338
15339	// CHECK-LABEL: @test_vsriq_n_u16(
15340	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15341	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15342	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15343	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15344	// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
15345	// CHECK: ret <8 x i16> [[VSLI_N2]]
15346	uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
15347	return vsriq_n_u16(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v8i16
15348	}
15349
15350	// CHECK-LABEL: @test_vsriq_n_u32(
15351	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15352	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15353	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15354	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15355	// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
15356	// CHECK: ret <4 x i32> [[VSLI_N2]]
15357	uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
15358	return vsriq_n_u32(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v4i32
15359	}
15360
15361	// CHECK-LABEL: @test_vsriq_n_u64(
15362	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
15363	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15364	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
15365	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15366	// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
15367	// CHECK: ret <2 x i64> [[VSLI_N2]]
15368	uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
15369	return vsriq_n_u64(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v2i64
15370	}
15371
15372	// CHECK-LABEL: @test_vsriq_n_p8(
15373	// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
15374	// CHECK: ret <16 x i8> [[VSLI_N]]
15375	poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
15376	return vsriq_n_p8(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v16i8
15377	}
15378
15379	// CHECK-LABEL: @test_vsriq_n_p16(
15380	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15381	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15382	// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15383	// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15384	// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
15385	// CHECK: ret <8 x i16> [[VSLI_N2]]
15386	poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
15387	return vsriq_n_p16(a, b, 1); // immediate 1 -> expected IR passes a splat of -1 to llvm.arm.neon.vshiftins.v8i16
15388	}
15389
15390	// CHECK-LABEL: @test_vst1q_u8(
15391	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
15392	// CHECK: ret void
15393	void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
15394	vst1q_u8(a, b); // expected IR: llvm.arm.neon.vst1.p0i8.v16i8 on %a directly (no pointer cast), trailing operand i32 1
15395	}
15396
15397	// CHECK-LABEL: @test_vst1q_u16(
15398	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15399	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15400	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15401	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
15402	// CHECK: ret void
15403	void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
15404	vst1q_u16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v8i16 with trailing operand i32 2
15405	}
15406
15407	// CHECK-LABEL: @test_vst1q_u32(
15408	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15409	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15410	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15411	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
15412	// CHECK: ret void
15413	void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
15414	vst1q_u32(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4i32 with trailing operand i32 4
15415	}
15416
15417	// CHECK-LABEL: @test_vst1q_u64(
15418	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15419	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15420	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15421	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
15422	// CHECK: ret void
15423	void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
15424	vst1q_u64(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v2i64 with trailing operand i32 4
15425	}
15426
15427	// CHECK-LABEL: @test_vst1q_s8(
15428	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
15429	// CHECK: ret void
15430	void test_vst1q_s8(int8_t * a, int8x16_t b) {
15431	vst1q_s8(a, b); // expected IR: llvm.arm.neon.vst1.p0i8.v16i8 on %a directly (no pointer cast), trailing operand i32 1
15432	}
15433
15434	// CHECK-LABEL: @test_vst1q_s16(
15435	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15436	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15437	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15438	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
15439	// CHECK: ret void
15440	void test_vst1q_s16(int16_t * a, int16x8_t b) {
15441	vst1q_s16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v8i16 with trailing operand i32 2
15442	}
15443
15444	// CHECK-LABEL: @test_vst1q_s32(
15445	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15446	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15447	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15448	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
15449	// CHECK: ret void
15450	void test_vst1q_s32(int32_t * a, int32x4_t b) {
15451	vst1q_s32(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4i32 with trailing operand i32 4
15452	}
15453
15454	// CHECK-LABEL: @test_vst1q_s64(
15455	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15456	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15457	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15458	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
15459	// CHECK: ret void
15460	void test_vst1q_s64(int64_t * a, int64x2_t b) {
15461	vst1q_s64(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v2i64 with trailing operand i32 4
15462	}
15463
15464	// CHECK-LABEL: @test_vst1q_f16(
15465	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
15466	// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
15467	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
15468	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* [[TMP0]], <8 x half> [[TMP2]], i32 2)
15469	// CHECK: ret void
15470	void test_vst1q_f16(float16_t * a, float16x8_t b) {
15471	vst1q_f16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v8f16 with trailing operand i32 2
15472	}
15473
15474	// CHECK-LABEL: @test_vst1q_f32(
15475	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
15476	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
15477	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
15478	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* [[TMP0]], <4 x float> [[TMP2]], i32 4)
15479	// CHECK: ret void
15480	void test_vst1q_f32(float32_t * a, float32x4_t b) {
15481	vst1q_f32(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4f32 with trailing operand i32 4
15482	}
15483
15484	// CHECK-LABEL: @test_vst1q_p8(
15485	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
15486	// CHECK: ret void
15487	void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
15488	vst1q_p8(a, b); // expected IR: llvm.arm.neon.vst1.p0i8.v16i8 on %a directly (no pointer cast), trailing operand i32 1
15489	}
15490
15491	// CHECK-LABEL: @test_vst1q_p16(
15492	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15493	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15494	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15495	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
15496	// CHECK: ret void
15497	void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
15498	vst1q_p16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v8i16 with trailing operand i32 2
15499	}
15500
15501	// CHECK-LABEL: @test_vst1_u8(
15502	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
15503	// CHECK: ret void
15504	void test_vst1_u8(uint8_t * a, uint8x8_t b) {
15505	vst1_u8(a, b); // expected IR: llvm.arm.neon.vst1.p0i8.v8i8 on %a directly (no pointer cast), trailing operand i32 1
15506	}
15507
15508	// CHECK-LABEL: @test_vst1_u16(
15509	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15510	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15511	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15512	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
15513	// CHECK: ret void
15514	void test_vst1_u16(uint16_t * a, uint16x4_t b) {
15515	vst1_u16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4i16 with trailing operand i32 2
15516	}
15517
15518	// CHECK-LABEL: @test_vst1_u32(
15519	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15520	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15521	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15522	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
15523	// CHECK: ret void
15524	void test_vst1_u32(uint32_t * a, uint32x2_t b) {
15525	vst1_u32(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v2i32 with trailing operand i32 4
15526	}
15527
15528	// CHECK-LABEL: @test_vst1_u64(
15529	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15530	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15531	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15532	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
15533	// CHECK: ret void
15534	void test_vst1_u64(uint64_t * a, uint64x1_t b) {
15535	vst1_u64(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v1i64 with trailing operand i32 4
15536	}
15537
15538	// CHECK-LABEL: @test_vst1_s8(
15539	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
15540	// CHECK: ret void
15541	void test_vst1_s8(int8_t * a, int8x8_t b) {
15542	vst1_s8(a, b); // expected IR: llvm.arm.neon.vst1.p0i8.v8i8 on %a directly (no pointer cast), trailing operand i32 1
15543	}
15544
15545	// CHECK-LABEL: @test_vst1_s16(
15546	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15547	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15548	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15549	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
15550	// CHECK: ret void
15551	void test_vst1_s16(int16_t * a, int16x4_t b) {
15552	vst1_s16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4i16 with trailing operand i32 2
15553	}
15554
15555	// CHECK-LABEL: @test_vst1_s32(
15556	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15557	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15558	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15559	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
15560	// CHECK: ret void
15561	void test_vst1_s32(int32_t * a, int32x2_t b) {
15562	vst1_s32(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v2i32 with trailing operand i32 4
15563	}
15564
15565	// CHECK-LABEL: @test_vst1_s64(
15566	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15567	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15568	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15569	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
15570	// CHECK: ret void
15571	void test_vst1_s64(int64_t * a, int64x1_t b) {
15572	vst1_s64(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v1i64 with trailing operand i32 4
15573	}
15574
15575	// CHECK-LABEL: @test_vst1_f16(
15576	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
15577	// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
15578	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
15579	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* [[TMP0]], <4 x half> [[TMP2]], i32 2)
15580	// CHECK: ret void
15581	void test_vst1_f16(float16_t * a, float16x4_t b) {
15582	vst1_f16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4f16 with trailing operand i32 2
15583	}
15584
15585	// CHECK-LABEL: @test_vst1_f32(
15586	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
15587	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
15588	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
15589	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* [[TMP0]], <2 x float> [[TMP2]], i32 4)
15590	// CHECK: ret void
15591	void test_vst1_f32(float32_t * a, float32x2_t b) {
15592	vst1_f32(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v2f32 with trailing operand i32 4
15593	}
15594
15595	// CHECK-LABEL: @test_vst1_p8(
15596	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
15597	// CHECK: ret void
15598	void test_vst1_p8(poly8_t * a, poly8x8_t b) {
15599	vst1_p8(a, b); // expected IR: llvm.arm.neon.vst1.p0i8.v8i8 on %a directly (no pointer cast), trailing operand i32 1
15600	}
15601
15602	// CHECK-LABEL: @test_vst1_p16(
15603	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15604	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15605	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15606	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
15607	// CHECK: ret void
15608	void test_vst1_p16(poly16_t * a, poly16x4_t b) {
15609	vst1_p16(a, b); // expected IR: pointer cast to i8*, then llvm.arm.neon.vst1.p0i8.v4i16 with trailing operand i32 2
15610	}
15611
15612	// CHECK-LABEL: @test_vst1q_lane_u8(
15613	// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
15614	// CHECK: store i8 [[TMP0]], i8* %a, align 1
15615	// CHECK: ret void
15616	void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
15617	vst1q_lane_u8(a, b, 15); // lane 15 -> expected IR: extractelement index 15, then a plain i8 store to %a with align 1
15618	}
15619
15620	// CHECK-LABEL: @test_vst1q_lane_u16(
15621	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15622	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15623	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15624	// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
15625	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
15626	// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
15627	// CHECK: ret void
15628	void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
15629	vst1q_lane_u16(a, b, 7); // lane 7 -> expected IR: extractelement index 7, then a plain i16 store with align 2
15630	}
15631
15632	// CHECK-LABEL: @test_vst1q_lane_u32(
15633	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15634	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15635	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15636	// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
15637	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
15638	// CHECK: store i32 [[TMP3]], i32* [[TMP4]], align 4
15639	// CHECK: ret void
15640	void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
15641	vst1q_lane_u32(a, b, 3); // lane 3 -> expected IR: extractelement index 3, then a plain i32 store with align 4
15642	}
15643
15644	// CHECK-LABEL: @test_vst1q_lane_u64(
15645	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15646	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15647	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15648	// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
15649	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
15650	// CHECK: ret void
15651	void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
15652	vst1q_lane_u64(a, b, 1);
15653	}
15654
// Expects vst1q_lane_s8(a, b, 15) to become an extractelement of lane 15 plus a plain i8 store (align 1).
15655	// CHECK-LABEL: @test_vst1q_lane_s8(
15656	// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
15657	// CHECK: store i8 [[TMP0]], i8* %a, align 1
15658	// CHECK: ret void
15659	void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
15660	vst1q_lane_s8(a, b, 15);
15661	}
15662
// Expects vst1q_lane_s16(a, b, 7) to become an extractelement of lane 7 plus a plain i16 store (align 2).
15663	// CHECK-LABEL: @test_vst1q_lane_s16(
15664	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15665	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15666	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15667	// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
15668	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
15669	// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
15670	// CHECK: ret void
15671	void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
15672	vst1q_lane_s16(a, b, 7);
15673	}
15674
// Expects vst1q_lane_s32(a, b, 3) to become an extractelement of lane 3 plus a plain i32 store (align 4).
15675	// CHECK-LABEL: @test_vst1q_lane_s32(
15676	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15677	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15678	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15679	// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
15680	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
15681	// CHECK: store i32 [[TMP3]], i32* [[TMP4]], align 4
15682	// CHECK: ret void
15683	void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
15684	vst1q_lane_s32(a, b, 3);
15685	}
15686
// Expects vst1q_lane_s64(a, b, 1) to select lane 1 with a shufflevector and store it via
// @llvm.arm.neon.vst1.p0i8.v1i64 (final operand i32 4) rather than a plain store.
15687	// CHECK-LABEL: @test_vst1q_lane_s64(
15688	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15689	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
15690	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
15691	// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
15692	// CHECK: call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
15693	// CHECK: ret void
15694	void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
15695	vst1q_lane_s64(a, b, 1);
15696	}
15697
// Expects vst1q_lane_f16(a, b, 7) to become an extractelement of lane 7 plus a plain half store (align 2).
15698	// CHECK-LABEL: @test_vst1q_lane_f16(
15699	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
15700	// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
15701	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
15702	// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
15703	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
15704	// CHECK: store half [[TMP3]], half* [[TMP4]], align 2
15705	// CHECK: ret void
15706	void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
15707	vst1q_lane_f16(a, b, 7);
15708	}
15709
// Expects vst1q_lane_f32(a, b, 3) to become an extractelement of lane 3 plus a plain float store (align 4).
15710	// CHECK-LABEL: @test_vst1q_lane_f32(
15711	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
15712	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
15713	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
15714	// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
15715	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
15716	// CHECK: store float [[TMP3]], float* [[TMP4]], align 4
15717	// CHECK: ret void
15718	void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
15719	vst1q_lane_f32(a, b, 3);
15720	}
15721
// Expects vst1q_lane_p8(a, b, 15) to become an extractelement of lane 15 plus a plain i8 store (align 1).
15722	// CHECK-LABEL: @test_vst1q_lane_p8(
15723	// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
15724	// CHECK: store i8 [[TMP0]], i8* %a, align 1
15725	// CHECK: ret void
15726	void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
15727	vst1q_lane_p8(a, b, 15);
15728	}
15729
// Expects vst1q_lane_p16(a, b, 7) to become an extractelement of lane 7 plus a plain i16 store (align 2).
15730	// CHECK-LABEL: @test_vst1q_lane_p16(
15731	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15732	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15733	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15734	// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
15735	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
15736	// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
15737	// CHECK: ret void
15738	void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
15739	vst1q_lane_p16(a, b, 7);
15740	}
15741
// Expects vst1_lane_u8(a, b, 7) to become an extractelement of lane 7 plus a plain i8 store (align 1).
15742	// CHECK-LABEL: @test_vst1_lane_u8(
15743	// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
15744	// CHECK: store i8 [[TMP0]], i8* %a, align 1
15745	// CHECK: ret void
15746	void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
15747	vst1_lane_u8(a, b, 7);
15748	}
15749
// Expects vst1_lane_u16(a, b, 3) to become an extractelement of lane 3 plus a plain i16 store (align 2).
15750	// CHECK-LABEL: @test_vst1_lane_u16(
15751	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15752	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15753	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15754	// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
15755	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
15756	// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
15757	// CHECK: ret void
15758	void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
15759	vst1_lane_u16(a, b, 3);
15760	}
15761
// Expects vst1_lane_u32(a, b, 1) to become an extractelement of lane 1 plus a plain i32 store (align 4).
15762	// CHECK-LABEL: @test_vst1_lane_u32(
15763	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15764	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15765	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15766	// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
15767	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
15768	// CHECK: store i32 [[TMP3]], i32* [[TMP4]], align 4
15769	// CHECK: ret void
15770	void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
15771	vst1_lane_u32(a, b, 1);
15772	}
15773
// Expects vst1_lane_u64(a, b, 0) to become an extractelement of lane 0 plus a plain i64 store (align 4).
15774	// CHECK-LABEL: @test_vst1_lane_u64(
15775	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15776	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15777	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15778	// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
15779	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
15780	// CHECK: store i64 [[TMP3]], i64* [[TMP4]], align 4
15781	// CHECK: ret void
15782	void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
15783	vst1_lane_u64(a, b, 0);
15784	}
15785
// Expects vst1_lane_s8(a, b, 7) to become an extractelement of lane 7 plus a plain i8 store (align 1).
15786	// CHECK-LABEL: @test_vst1_lane_s8(
15787	// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
15788	// CHECK: store i8 [[TMP0]], i8* %a, align 1
15789	// CHECK: ret void
15790	void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
15791	vst1_lane_s8(a, b, 7);
15792	}
15793
// Expects vst1_lane_s16(a, b, 3) to become an extractelement of lane 3 plus a plain i16 store (align 2).
15794	// CHECK-LABEL: @test_vst1_lane_s16(
15795	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15796	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15797	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15798	// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
15799	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
15800	// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
15801	// CHECK: ret void
15802	void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
15803	vst1_lane_s16(a, b, 3);
15804	}
15805
// Expects vst1_lane_s32(a, b, 1) to become an extractelement of lane 1 plus a plain i32 store (align 4).
15806	// CHECK-LABEL: @test_vst1_lane_s32(
15807	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
15808	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15809	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15810	// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
15811	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
15812	// CHECK: store i32 [[TMP3]], i32* [[TMP4]], align 4
15813	// CHECK: ret void
15814	void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
15815	vst1_lane_s32(a, b, 1);
15816	}
15817
// Expects vst1_lane_s64(a, b, 0) to become an extractelement of lane 0 plus a plain i64 store (align 4).
15818	// CHECK-LABEL: @test_vst1_lane_s64(
15819	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
15820	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15821	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15822	// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
15823	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
15824	// CHECK: store i64 [[TMP3]], i64* [[TMP4]], align 4
15825	// CHECK: ret void
15826	void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
15827	vst1_lane_s64(a, b, 0);
15828	}
15829
// Expects vst1_lane_f16(a, b, 3) to become an extractelement of lane 3 plus a plain half store (align 2).
15830	// CHECK-LABEL: @test_vst1_lane_f16(
15831	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
15832	// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
15833	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
15834	// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
15835	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
15836	// CHECK: store half [[TMP3]], half* [[TMP4]], align 2
15837	// CHECK: ret void
15838	void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
15839	vst1_lane_f16(a, b, 3);
15840	}
15841
// Expects vst1_lane_f32(a, b, 1) to become an extractelement of lane 1 plus a plain float store (align 4).
15842	// CHECK-LABEL: @test_vst1_lane_f32(
15843	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
15844	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
15845	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
15846	// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
15847	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
15848	// CHECK: store float [[TMP3]], float* [[TMP4]], align 4
15849	// CHECK: ret void
15850	void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
15851	vst1_lane_f32(a, b, 1);
15852	}
15853
// Expects vst1_lane_p8(a, b, 7) to become an extractelement of lane 7 plus a plain i8 store (align 1).
15854	// CHECK-LABEL: @test_vst1_lane_p8(
15855	// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
15856	// CHECK: store i8 [[TMP0]], i8* %a, align 1
15857	// CHECK: ret void
15858	void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
15859	vst1_lane_p8(a, b, 7);
15860	}
15861
// Expects vst1_lane_p16(a, b, 3) to become an extractelement of lane 3 plus a plain i16 store (align 2).
15862	// CHECK-LABEL: @test_vst1_lane_p16(
15863	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
15864	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15865	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15866	// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
15867	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
15868	// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
15869	// CHECK: ret void
15870	void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
15871	vst1_lane_p16(a, b, 3);
15872	}
15873
// Expects the uint8x16x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v16i8 (final operand i32 1).
15874	// CHECK-LABEL: @test_vst2q_u8(
15875	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
15876	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
15877	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
15878	// CHECK: [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
15879	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
15880	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
15881	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
15882	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
15883	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
15884	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
15885	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
15886	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
15887	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
15888	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
15889	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
15890	// CHECK: ret void
15891	void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
15892	vst2q_u8(a, b);
15893	}
15894
// Expects the uint16x8x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v8i16 (final operand i32 2).
15895	// CHECK-LABEL: @test_vst2q_u16(
15896	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
15897	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
15898	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
15899	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
15900	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
15901	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
15902	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
15903	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
15904	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
15905	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
15906	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
15907	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
15908	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
15909	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
15910	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
15911	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
15912	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
15913	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
15914	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
15915	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
15916	// CHECK: ret void
15917	void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
15918	vst2q_u16(a, b);
15919	}
15920
// Expects the uint32x4x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v4i32 (final operand i32 4).
15921	// CHECK-LABEL: @test_vst2q_u32(
15922	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
15923	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
15924	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
15925	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
15926	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
15927	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
15928	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
15929	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
15930	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
15931	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
15932	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
15933	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
15934	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
15935	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
15936	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
15937	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
15938	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
15939	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
15940	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
15941	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
15942	// CHECK: ret void
15943	void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
15944	vst2q_u32(a, b);
15945	}
15946
// Expects the int8x16x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v16i8 (final operand i32 1).
15947	// CHECK-LABEL: @test_vst2q_s8(
15948	// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
15949	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
15950	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
15951	// CHECK: [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
15952	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
15953	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
15954	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
15955	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
15956	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
15957	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
15958	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
15959	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
15960	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
15961	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
15962	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
15963	// CHECK: ret void
15964	void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
15965	vst2q_s8(a, b);
15966	}
15967
// Expects the int16x8x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v8i16 (final operand i32 2).
15968	// CHECK-LABEL: @test_vst2q_s16(
15969	// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
15970	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
15971	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
15972	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
15973	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
15974	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
15975	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
15976	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
15977	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
15978	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
15979	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
15980	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
15981	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
15982	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
15983	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
15984	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
15985	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
15986	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
15987	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
15988	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
15989	// CHECK: ret void
15990	void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
15991	vst2q_s16(a, b);
15992	}
15993
// Expects the int32x4x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v4i32 (final operand i32 4).
15994	// CHECK-LABEL: @test_vst2q_s32(
15995	// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
15996	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
15997	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
15998	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
15999	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16000	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
16001	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
16002	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16003	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16004	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
16005	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
16006	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
16007	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16008	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
16009	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
16010	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
16011	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16012	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16013	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16014	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
16015	// CHECK: ret void
16016	void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
16017	vst2q_s32(a, b);
16018	}
16019
// Expects the float16x8x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v8f16 (final operand i32 2).
16020	// CHECK-LABEL: @test_vst2q_f16(
16021	// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
16022	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
16023	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
16024	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
16025	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16026	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
16027	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
16028	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16029	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
16030	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
16031	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
16032	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
16033	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
16034	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
16035	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
16036	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
16037	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
16038	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
16039	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
16040	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2)
16041	// CHECK: ret void
16042	void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
16043	vst2q_f16(a, b);
16044	}
16045
// Expects the float32x4x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v4f32 (final operand i32 4).
16046	// CHECK-LABEL: @test_vst2q_f32(
16047	// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
16048	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
16049	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
16050	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
16051	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16052	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
16053	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
16054	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16055	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
16056	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
16057	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
16058	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
16059	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
16060	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
16061	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
16062	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
16063	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
16064	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
16065	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
16066	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
16067	// CHECK: ret void
16068	void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
16069	vst2q_f32(a, b);
16070	}
16071
// Expects the poly8x16x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v16i8 (final operand i32 1).
16072	// CHECK-LABEL: @test_vst2q_p8(
16073	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
16074	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
16075	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
16076	// CHECK: [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
16077	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16078	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
16079	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
16080	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16081	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
16082	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
16083	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
16084	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
16085	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
16086	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
16087	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
16088	// CHECK: ret void
16089	void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
16090	vst2q_p8(a, b);
16091	}
16092
// Expects the poly16x8x2_t argument to be stored from its [4 x i64] coerced form, copied into a
// temporary with a 32-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v8i16 (final operand i32 2).
16093	// CHECK-LABEL: @test_vst2q_p16(
16094	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
16095	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
16096	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
16097	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
16098	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16099	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
16100	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
16101	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16102	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16103	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
16104	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
16105	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
16106	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16107	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
16108	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
16109	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
16110	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16111	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16112	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16113	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
16114	// CHECK: ret void
16115	void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
16116	vst2q_p16(a, b);
16117	}
16118
// Expects the uint8x8x2_t argument to be stored from its [2 x i64] coerced form, copied into a
// temporary with a 16-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v8i8 (final operand i32 1).
16119	// CHECK-LABEL: @test_vst2_u8(
16120	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
16121	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
16122	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
16123	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
16124	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16125	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
16126	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
16127	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16128	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
16129	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
16130	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
16131	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
16132	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
16133	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
16134	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
16135	// CHECK: ret void
16136	void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
16137	vst2_u8(a, b);
16138	}
16139
// Expects the uint16x4x2_t argument to be stored from its [2 x i64] coerced form, copied into a
// temporary with a 16-byte memcpy, and both vectors reloaded and passed to
// @llvm.arm.neon.vst2.p0i8.v4i16 (final operand i32 2).
16140	// CHECK-LABEL: @test_vst2_u16(
16141	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
16142	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
16143	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
16144	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
16145	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16146	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
16147	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
16148	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16149	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16150	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
16151	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
16152	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
16153	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16154	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
16155	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
16156	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
16157	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16158	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16159	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16160	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
16161	// CHECK: ret void
16162	void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
16163	vst2_u16(a, b);
16164	}
16165
16166	// CHECK-LABEL: @test_vst2_u32(
16167	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
16168	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
16169	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
16170	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
16171	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16172	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
16173	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
16174	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16175	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16176	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
16177	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
16178	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
16179	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
16180	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
16181	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
16182	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
16183	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
16184	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
16185	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
16186	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
16187	// CHECK: ret void
// Codegen test for vst2_u32: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v2i32 whose last operand is i32 4.
16188	void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
16189	vst2_u32(a, b);
16190	}
16191
16192	// CHECK-LABEL: @test_vst2_u64(
16193	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
16194	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
16195	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
16196	// CHECK: [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
16197	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16198	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
16199	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
16200	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16201	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
16202	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
16203	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
16204	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
16205	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
16206	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
16207	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
16208	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
16209	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
16210	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
16211	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
16212	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
16213	// CHECK: ret void
// Codegen test for vst2_u64: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v1i64 whose last operand is i32 4.
16214	void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
16215	vst2_u64(a, b);
16216	}
16217
16218	// CHECK-LABEL: @test_vst2_s8(
16219	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
16220	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
16221	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
16222	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
16223	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16224	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
16225	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
16226	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16227	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
16228	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
16229	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
16230	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
16231	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
16232	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
16233	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
16234	// CHECK: ret void
// Codegen test for vst2_s8: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect %a to be
// passed directly to @llvm.arm.neon.vst2.p0i8.v8i8, with last operand i32 1.
16235	void test_vst2_s8(int8_t * a, int8x8x2_t b) {
16236	vst2_s8(a, b);
16237	}
16238
16239	// CHECK-LABEL: @test_vst2_s16(
16240	// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
16241	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
16242	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
16243	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
16244	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16245	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
16246	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
16247	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16248	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16249	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
16250	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
16251	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
16252	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16253	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
16254	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
16255	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
16256	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16257	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16258	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16259	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
16260	// CHECK: ret void
// Codegen test for vst2_s16: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v4i16 whose last operand is i32 2.
16261	void test_vst2_s16(int16_t * a, int16x4x2_t b) {
16262	vst2_s16(a, b);
16263	}
16264
16265	// CHECK-LABEL: @test_vst2_s32(
16266	// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
16267	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
16268	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
16269	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
16270	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16271	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
16272	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
16273	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16274	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16275	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
16276	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
16277	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
16278	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
16279	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
16280	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
16281	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
16282	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
16283	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
16284	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
16285	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
16286	// CHECK: ret void
// Codegen test for vst2_s32: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v2i32 whose last operand is i32 4.
16287	void test_vst2_s32(int32_t * a, int32x2x2_t b) {
16288	vst2_s32(a, b);
16289	}
16290
16291	// CHECK-LABEL: @test_vst2_s64(
16292	// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
16293	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
16294	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
16295	// CHECK: [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
16296	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16297	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
16298	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
16299	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16300	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
16301	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
16302	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
16303	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
16304	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
16305	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
16306	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
16307	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
16308	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
16309	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
16310	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
16311	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
16312	// CHECK: ret void
// Codegen test for vst2_s64: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v1i64 whose last operand is i32 4.
16313	void test_vst2_s64(int64_t * a, int64x1x2_t b) {
16314	vst2_s64(a, b);
16315	}
16316
16317	// CHECK-LABEL: @test_vst2_f16(
16318	// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
16319	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
16320	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
16321	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
16322	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16323	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
16324	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
16325	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16326	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
16327	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
16328	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
16329	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
16330	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
16331	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
16332	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
16333	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
16334	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
16335	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
16336	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
16337	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
16338	// CHECK: ret void
// Codegen test for vst2_f16: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v4f16 whose last operand is i32 2.
16339	void test_vst2_f16(float16_t * a, float16x4x2_t b) {
16340	vst2_f16(a, b);
16341	}
16342
16343	// CHECK-LABEL: @test_vst2_f32(
16344	// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
16345	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
16346	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
16347	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
16348	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16349	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
16350	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
16351	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16352	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
16353	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
16354	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
16355	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
16356	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
16357	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
16358	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
16359	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
16360	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
16361	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
16362	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
16363	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
16364	// CHECK: ret void
// Codegen test for vst2_f32: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v2f32 whose last operand is i32 4.
16365	void test_vst2_f32(float32_t * a, float32x2x2_t b) {
16366	vst2_f32(a, b);
16367	}
16368
16369	// CHECK-LABEL: @test_vst2_p8(
16370	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
16371	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
16372	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
16373	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
16374	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16375	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
16376	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
16377	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16378	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
16379	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
16380	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
16381	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
16382	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
16383	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
16384	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
16385	// CHECK: ret void
// Codegen test for vst2_p8: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect %a to be
// passed directly to @llvm.arm.neon.vst2.p0i8.v8i8, with last operand i32 1.
16386	void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
16387	vst2_p8(a, b);
16388	}
16389
16390	// CHECK-LABEL: @test_vst2_p16(
16391	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
16392	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
16393	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
16394	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
16395	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16396	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
16397	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
16398	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16399	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16400	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
16401	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
16402	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
16403	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16404	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
16405	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
16406	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
16407	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16408	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16409	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16410	// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
16411	// CHECK: ret void
// Codegen test for vst2_p16: forwards the pointer and the two-vector struct
// to the intrinsic unchanged. The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2.p0i8.v4i16 whose last operand is i32 2.
16412	void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
16413	vst2_p16(a, b);
16414	}
16415
16416	// CHECK-LABEL: @test_vst2q_lane_u16(
16417	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
16418	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
16419	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
16420	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
16421	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16422	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
16423	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
16424	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16425	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16426	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
16427	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
16428	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
16429	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16430	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
16431	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
16432	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
16433	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16434	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16435	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16436	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
16437	// CHECK: ret void
// Codegen test for vst2q_lane_u16 with the constant 7 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v8i16 ending in the operands i32 7, i32 2.
16438	void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
16439	vst2q_lane_u16(a, b, 7);
16440	}
16441
16442	// CHECK-LABEL: @test_vst2q_lane_u32(
16443	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
16444	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
16445	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
16446	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
16447	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16448	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
16449	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
16450	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16451	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16452	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
16453	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
16454	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
16455	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16456	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
16457	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
16458	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
16459	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16460	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16461	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16462	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
16463	// CHECK: ret void
// Codegen test for vst2q_lane_u32 with the constant 3 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v4i32 ending in the operands i32 3, i32 4.
16464	void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
16465	vst2q_lane_u32(a, b, 3);
16466	}
16467
16468	// CHECK-LABEL: @test_vst2q_lane_s16(
16469	// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
16470	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
16471	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
16472	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
16473	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16474	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
16475	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
16476	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16477	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16478	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
16479	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
16480	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
16481	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16482	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
16483	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
16484	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
16485	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16486	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16487	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16488	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
16489	// CHECK: ret void
// Codegen test for vst2q_lane_s16 with the constant 7 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v8i16 ending in the operands i32 7, i32 2.
16490	void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
16491	vst2q_lane_s16(a, b, 7);
16492	}
16493
16494	// CHECK-LABEL: @test_vst2q_lane_s32(
16495	// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
16496	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
16497	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
16498	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
16499	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16500	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
16501	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
16502	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16503	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16504	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
16505	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
16506	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
16507	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16508	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
16509	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
16510	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
16511	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16512	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16513	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16514	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
16515	// CHECK: ret void
// Codegen test for vst2q_lane_s32 with the constant 3 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v4i32 ending in the operands i32 3, i32 4.
16516	void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
16517	vst2q_lane_s32(a, b, 3);
16518	}
16519
16520	// CHECK-LABEL: @test_vst2q_lane_f16(
16521	// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
16522	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
16523	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
16524	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
16525	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16526	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
16527	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
16528	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16529	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
16530	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
16531	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
16532	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
16533	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
16534	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
16535	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
16536	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
16537	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
16538	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
16539	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
16540	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
16541	// CHECK: ret void
// Codegen test for vst2q_lane_f16 with the constant 7 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v8f16 ending in the operands i32 7, i32 2.
16542	void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
16543	vst2q_lane_f16(a, b, 7);
16544	}
16545
16546	// CHECK-LABEL: @test_vst2q_lane_f32(
16547	// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
16548	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
16549	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
16550	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
16551	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16552	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
16553	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
16554	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16555	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
16556	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
16557	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
16558	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
16559	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
16560	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
16561	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
16562	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
16563	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
16564	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
16565	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
16566	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
16567	// CHECK: ret void
// Codegen test for vst2q_lane_f32 with the constant 3 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v4f32 ending in the operands i32 3, i32 4.
16568	void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
16569	vst2q_lane_f32(a, b, 3);
16570	}
16571
16572	// CHECK-LABEL: @test_vst2q_lane_p16(
16573	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
16574	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
16575	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
16576	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
16577	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
16578	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
16579	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
16580	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
16581	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16582	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
16583	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
16584	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
16585	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16586	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
16587	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
16588	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
16589	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16590	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16591	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16592	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
16593	// CHECK: ret void
// Codegen test for vst2q_lane_p16 with the constant 7 as its third argument.
// The FileCheck directives above expect a call to
// @llvm.arm.neon.vst2lane.p0i8.v8i16 ending in the operands i32 7, i32 2.
16594	void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
16595	vst2q_lane_p16(a, b, 7);
16596	}
16597
16598	// CHECK-LABEL: @test_vst2_lane_u8(
16599	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
16600	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
16601	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
16602	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
16603	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16604	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
16605	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
16606	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16607	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
16608	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
16609	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
16610	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
16611	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
16612	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
16613	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
16614	// CHECK: ret void
16615	void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) { // Codegen test: forwards a and b to vst2_lane_u8.
16616	vst2_lane_u8(a, b, 7); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v8i8 call on %a with the constant 7 passed through as `i32 7`.
16617	}
16618
16619	// CHECK-LABEL: @test_vst2_lane_u16(
16620	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
16621	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
16622	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
16623	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
16624	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16625	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
16626	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
16627	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16628	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16629	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
16630	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
16631	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
16632	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16633	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
16634	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
16635	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
16636	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16637	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16638	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16639	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
16640	// CHECK: ret void
16641	void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) { // Codegen test: forwards a and b to vst2_lane_u16.
16642	vst2_lane_u16(a, b, 3); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v4i16 call with the constant 3 passed through as `i32 3`.
16643	}
16644
16645	// CHECK-LABEL: @test_vst2_lane_u32(
16646	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
16647	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
16648	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
16649	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
16650	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16651	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
16652	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
16653	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16654	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16655	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
16656	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
16657	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
16658	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
16659	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
16660	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
16661	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
16662	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
16663	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
16664	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
16665	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
16666	// CHECK: ret void
16667	void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) { // Codegen test: forwards a and b to vst2_lane_u32.
16668	vst2_lane_u32(a, b, 1); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v2i32 call with the constant 1 passed through as `i32 1`.
16669	}
16670
16671	// CHECK-LABEL: @test_vst2_lane_s8(
16672	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
16673	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
16674	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
16675	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
16676	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16677	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
16678	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
16679	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16680	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
16681	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
16682	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
16683	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
16684	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
16685	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
16686	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
16687	// CHECK: ret void
16688	void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) { // Codegen test: forwards a and b to vst2_lane_s8.
16689	vst2_lane_s8(a, b, 7); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v8i8 call on %a with the constant 7 passed through as `i32 7`.
16690	}
16691
16692	// CHECK-LABEL: @test_vst2_lane_s16(
16693	// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
16694	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
16695	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
16696	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
16697	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16698	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
16699	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
16700	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16701	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16702	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
16703	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
16704	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
16705	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16706	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
16707	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
16708	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
16709	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16710	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16711	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16712	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
16713	// CHECK: ret void
16714	void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) { // Codegen test: forwards a and b to vst2_lane_s16.
16715	vst2_lane_s16(a, b, 3); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v4i16 call with the constant 3 passed through as `i32 3`.
16716	}
16717
16718	// CHECK-LABEL: @test_vst2_lane_s32(
16719	// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
16720	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
16721	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
16722	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
16723	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16724	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
16725	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
16726	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16727	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16728	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
16729	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
16730	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
16731	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
16732	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
16733	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
16734	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
16735	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
16736	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
16737	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
16738	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
16739	// CHECK: ret void
16740	void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) { // Codegen test: forwards a and b to vst2_lane_s32.
16741	vst2_lane_s32(a, b, 1); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v2i32 call with the constant 1 passed through as `i32 1`.
16742	}
16743
16744	// CHECK-LABEL: @test_vst2_lane_f16(
16745	// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
16746	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
16747	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
16748	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
16749	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16750	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
16751	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
16752	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16753	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
16754	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
16755	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
16756	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
16757	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
16758	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
16759	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
16760	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
16761	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
16762	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
16763	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
16764	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
16765	// CHECK: ret void
16766	void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) { // Codegen test: forwards a and b to vst2_lane_f16.
16767	vst2_lane_f16(a, b, 3); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v4f16 call with the constant 3 passed through as `i32 3`.
16768	}
16769
16770	// CHECK-LABEL: @test_vst2_lane_f32(
16771	// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
16772	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
16773	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
16774	// CHECK: [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
16775	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16776	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
16777	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
16778	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16779	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
16780	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
16781	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
16782	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
16783	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
16784	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
16785	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
16786	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
16787	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
16788	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
16789	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
16790	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
16791	// CHECK: ret void
16792	void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) { // Codegen test: forwards a and b to vst2_lane_f32.
16793	vst2_lane_f32(a, b, 1); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v2f32 call with the constant 1 passed through as `i32 1`.
16794	}
16795
16796	// CHECK-LABEL: @test_vst2_lane_p8(
16797	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
16798	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
16799	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
16800	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
16801	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16802	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
16803	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
16804	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16805	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
16806	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
16807	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
16808	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
16809	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
16810	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
16811	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
16812	// CHECK: ret void
16813	void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) { // Codegen test: forwards a and b to vst2_lane_p8.
16814	vst2_lane_p8(a, b, 7); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v8i8 call on %a with the constant 7 passed through as `i32 7`.
16815	}
16816
16817	// CHECK-LABEL: @test_vst2_lane_p16(
16818	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
16819	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
16820	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
16821	// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
16822	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
16823	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
16824	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
16825	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
16826	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16827	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
16828	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
16829	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
16830	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16831	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
16832	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
16833	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
16834	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
16835	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16836	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
16837	// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
16838	// CHECK: ret void
16839	void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) { // Codegen test: forwards a and b to vst2_lane_p16.
16840	vst2_lane_p16(a, b, 3); // The FileCheck patterns above expect a @llvm.arm.neon.vst2lane.p0i8.v4i16 call with the constant 3 passed through as `i32 3`.
16841	}
16842
16843	// CHECK-LABEL: @test_vst3q_u8(
16844	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
16845	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
16846	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
16847	// CHECK: [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
16848	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
16849	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
16850	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
16851	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
16852	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
16853	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
16854	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
16855	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
16856	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
16857	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
16858	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
16859	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
16860	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
16861	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
16862	// CHECK: ret void
16863	void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) { // Codegen test: forwards a and b to vst3q_u8.
16864	vst3q_u8(a, b); // The FileCheck patterns above expect a @llvm.arm.neon.vst3.p0i8.v16i8 call taking %a and the three <16 x i8> values loaded from the copy of b.
16865	}
16866
16867	// CHECK-LABEL: @test_vst3q_u16(
16868	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
16869	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
16870	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
16871	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
16872	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
16873	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
16874	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
16875	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
16876	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16877	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
16878	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
16879	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
16880	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16881	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
16882	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
16883	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
16884	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16885	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
16886	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
16887	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
16888	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
16889	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16890	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16891	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
16892	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
16893	// CHECK: ret void
16894	void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) { // Codegen test: forwards a and b to vst3q_u16.
16895	vst3q_u16(a, b); // The FileCheck patterns above expect a @llvm.arm.neon.vst3.p0i8.v8i16 call taking the i8*-cast %a and three <8 x i16> values loaded from the copy of b.
16896	}
16897
16898	// CHECK-LABEL: @test_vst3q_u32(
16899	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
16900	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
16901	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
16902	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
16903	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
16904	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
16905	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
16906	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
16907	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16908	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
16909	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
16910	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
16911	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16912	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
16913	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
16914	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
16915	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
16916	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
16917	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
16918	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
16919	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
16920	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
16921	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
16922	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
16923	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
16924	// CHECK: ret void
16925	void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) { // Codegen test: forwards a and b to vst3q_u32.
16926	vst3q_u32(a, b); // The FileCheck patterns above expect a @llvm.arm.neon.vst3.p0i8.v4i32 call taking the i8*-cast %a and three <4 x i32> values loaded from the copy of b.
16927	}
16928
16929	// CHECK-LABEL: @test_vst3q_s8(
16930	// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
16931	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
16932	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
16933	// CHECK: [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
16934	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
16935	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
16936	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
16937	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
16938	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
16939	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
16940	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
16941	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
16942	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
16943	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
16944	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
16945	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
16946	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
16947	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
16948	// CHECK: ret void
16949	void test_vst3q_s8(int8_t * a, int8x16x3_t b) { // Codegen test: forwards a and b to vst3q_s8.
16950	vst3q_s8(a, b); // The FileCheck patterns above expect a @llvm.arm.neon.vst3.p0i8.v16i8 call taking %a and the three <16 x i8> values loaded from the copy of b.
16951	}
16952
16953	// CHECK-LABEL: @test_vst3q_s16(
16954	// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
16955	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
16956	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
16957	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
16958	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
16959	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
16960	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
16961	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
16962	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
16963	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
16964	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
16965	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
16966	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16967	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
16968	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
16969	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
16970	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
16971	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
16972	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
16973	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
16974	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
16975	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16976	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
16977	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
16978	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
16979	// CHECK: ret void
16980	void test_vst3q_s16(int16_t * a, int16x8x3_t b) { // Codegen test: forwards a and b to vst3q_s16.
16981	vst3q_s16(a, b); // The FileCheck patterns above expect a @llvm.arm.neon.vst3.p0i8.v8i16 call taking the i8*-cast %a and three <8 x i16> values loaded from the copy of b.
16982	}
16983
16984	// CHECK-LABEL: @test_vst3q_s32(
16985	// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
16986	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
16987	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
16988	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
16989	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
16990	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
16991	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
16992	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
16993	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
16994	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
16995	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
16996	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
16997	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
16998	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
16999	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
17000	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
17001	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
17002	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
17003	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
17004	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
17005	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
17006	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
17007	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
17008	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
17009	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
17010	// CHECK: ret void
17011	void test_vst3q_s32(int32_t * a, int32x4x3_t b) { // Codegen test: forwards a and b to vst3q_s32.
17012	vst3q_s32(a, b); // The FileCheck patterns above expect a @llvm.arm.neon.vst3.p0i8.v4i32 call taking the i8*-cast %a and three <4 x i32> values loaded from the copy of b.
17013	}
17014
17015	// CHECK-LABEL: @test_vst3q_f16(
17016	// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
17017	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
17018	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
17019	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
17020	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17021	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
17022	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
17023	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17024	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
17025	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
17026	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
17027	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
17028	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
17029	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
17030	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
17031	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
17032	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
17033	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
17034	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
17035	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
17036	// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
17037	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
17038	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
17039	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
17040	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2)
17041	// CHECK: ret void
17042	void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
17043	vst3q_f16(a, b);
17044	}
17045
17046	// CHECK-LABEL: @test_vst3q_f32(
17047	// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
17048	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
17049	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
17050	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
17051	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17052	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
17053	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
17054	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17055	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
17056	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
17057	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
17058	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
17059	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
17060	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
17061	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
17062	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
17063	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
17064	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
17065	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
17066	// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
17067	// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
17068	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
17069	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
17070	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
17071	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
17072	// CHECK: ret void
17073	void test_vst3q_f32(float32_t * a, float32x4x3_t b) { // Forwards to vst3q_f32; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v4f32 fed by the three <4 x float> members of b.
17074	vst3q_f32(a, b);
17075	}
17076
17077	// CHECK-LABEL: @test_vst3q_p8(
17078	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
17079	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
17080	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
17081	// CHECK: [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
17082	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17083	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
17084	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
17085	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17086	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
17087	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
17088	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
17089	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
17090	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
17091	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
17092	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
17093	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
17094	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
17095	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
17096	// CHECK: ret void
17097	void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) { // Forwards to vst3q_p8; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v16i8 taking %a directly and the three <16 x i8> members of b.
17098	vst3q_p8(a, b);
17099	}
17100
17101	// CHECK-LABEL: @test_vst3q_p16(
17102	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
17103	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
17104	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
17105	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
17106	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17107	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
17108	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
17109	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17110	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17111	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
17112	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
17113	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
17114	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
17115	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
17116	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
17117	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
17118	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
17119	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
17120	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
17121	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
17122	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
17123	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
17124	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
17125	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
17126	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
17127	// CHECK: ret void
17128	void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) { // Forwards to vst3q_p16; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v8i16 fed by the three <8 x i16> members of b.
17129	vst3q_p16(a, b);
17130	}
17131
17132	// CHECK-LABEL: @test_vst3_u8(
17133	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
17134	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
17135	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
17136	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
17137	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17138	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
17139	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
17140	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17141	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
17142	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
17143	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
17144	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
17145	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
17146	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
17147	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
17148	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
17149	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
17150	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
17151	// CHECK: ret void
17152	void test_vst3_u8(uint8_t * a, uint8x8x3_t b) { // Forwards to vst3_u8; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v8i8 taking %a directly and the three <8 x i8> members of b.
17153	vst3_u8(a, b);
17154	}
17155
17156	// CHECK-LABEL: @test_vst3_u16(
17157	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
17158	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
17159	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
17160	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
17161	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17162	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
17163	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
17164	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17165	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17166	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
17167	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
17168	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
17169	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
17170	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
17171	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
17172	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
17173	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
17174	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
17175	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
17176	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
17177	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
17178	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
17179	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
17180	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
17181	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
17182	// CHECK: ret void
17183	void test_vst3_u16(uint16_t * a, uint16x4x3_t b) { // Forwards to vst3_u16; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v4i16 fed by the three <4 x i16> members of b.
17184	vst3_u16(a, b);
17185	}
17186
17187	// CHECK-LABEL: @test_vst3_u32(
17188	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
17189	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
17190	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
17191	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
17192	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17193	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
17194	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
17195	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17196	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
17197	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
17198	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
17199	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
17200	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
17201	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
17202	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
17203	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
17204	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
17205	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
17206	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
17207	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
17208	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
17209	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
17210	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
17211	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
17212	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
17213	// CHECK: ret void
17214	void test_vst3_u32(uint32_t * a, uint32x2x3_t b) { // Forwards to vst3_u32; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v2i32 fed by the three <2 x i32> members of b.
17215	vst3_u32(a, b);
17216	}
17217
17218	// CHECK-LABEL: @test_vst3_u64(
17219	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
17220	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
17221	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
17222	// CHECK: [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
17223	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17224	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
17225	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
17226	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17227	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
17228	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
17229	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
17230	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
17231	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
17232	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
17233	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
17234	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
17235	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
17236	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
17237	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
17238	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
17239	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
17240	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
17241	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
17242	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
17243	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
17244	// CHECK: ret void
17245	void test_vst3_u64(uint64_t * a, uint64x1x3_t b) { // Forwards to vst3_u64; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v1i64 fed by the three <1 x i64> members of b.
17246	vst3_u64(a, b);
17247	}
17248
17249	// CHECK-LABEL: @test_vst3_s8(
17250	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
17251	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
17252	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
17253	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
17254	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17255	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
17256	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
17257	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17258	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
17259	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
17260	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
17261	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
17262	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
17263	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
17264	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
17265	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
17266	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
17267	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
17268	// CHECK: ret void
17269	void test_vst3_s8(int8_t * a, int8x8x3_t b) { // Forwards to vst3_s8; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v8i8 taking %a directly and the three <8 x i8> members of b.
17270	vst3_s8(a, b);
17271	}
17272
17273	// CHECK-LABEL: @test_vst3_s16(
17274	// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
17275	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
17276	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
17277	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
17278	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17279	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
17280	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
17281	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17282	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17283	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
17284	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
17285	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
17286	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
17287	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
17288	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
17289	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
17290	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
17291	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
17292	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
17293	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
17294	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
17295	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
17296	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
17297	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
17298	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
17299	// CHECK: ret void
17300	void test_vst3_s16(int16_t * a, int16x4x3_t b) { // Forwards to vst3_s16; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v4i16 fed by the three <4 x i16> members of b.
17301	vst3_s16(a, b);
17302	}
17303
17304	// CHECK-LABEL: @test_vst3_s32(
17305	// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
17306	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
17307	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
17308	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
17309	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17310	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
17311	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
17312	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17313	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
17314	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
17315	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
17316	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
17317	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
17318	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
17319	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
17320	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
17321	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
17322	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
17323	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
17324	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
17325	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
17326	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
17327	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
17328	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
17329	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
17330	// CHECK: ret void
17331	void test_vst3_s32(int32_t * a, int32x2x3_t b) { // Forwards to vst3_s32; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v2i32 fed by the three <2 x i32> members of b.
17332	vst3_s32(a, b);
17333	}
17334
17335	// CHECK-LABEL: @test_vst3_s64(
17336	// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
17337	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
17338	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
17339	// CHECK: [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
17340	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17341	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
17342	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
17343	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17344	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
17345	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
17346	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
17347	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
17348	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
17349	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
17350	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
17351	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
17352	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
17353	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
17354	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
17355	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
17356	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
17357	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
17358	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
17359	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
17360	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
17361	// CHECK: ret void
17362	void test_vst3_s64(int64_t * a, int64x1x3_t b) { // Forwards to vst3_s64; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v1i64 fed by the three <1 x i64> members of b.
17363	vst3_s64(a, b);
17364	}
17365
17366	// CHECK-LABEL: @test_vst3_f16(
17367	// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
17368	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
17369	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
17370	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
17371	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17372	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
17373	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
17374	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17375	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
17376	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
17377	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
17378	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
17379	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
17380	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
17381	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
17382	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
17383	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
17384	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
17385	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
17386	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
17387	// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
17388	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
17389	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
17390	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
17391	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2)
17392	// CHECK: ret void
17393	void test_vst3_f16(float16_t * a, float16x4x3_t b) { // Forwards to vst3_f16; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v4f16 fed by the three <4 x half> members of b.
17394	vst3_f16(a, b);
17395	}
17396
17397	// CHECK-LABEL: @test_vst3_f32(
17398	// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
17399	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
17400	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
17401	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
17402	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17403	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
17404	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
17405	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17406	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
17407	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
17408	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
17409	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
17410	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
17411	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
17412	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
17413	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
17414	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
17415	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
17416	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
17417	// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
17418	// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
17419	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
17420	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
17421	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
17422	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
17423	// CHECK: ret void
17424	void test_vst3_f32(float32_t * a, float32x2x3_t b) { // Forwards to vst3_f32; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v2f32 fed by the three <2 x float> members of b.
17425	vst3_f32(a, b);
17426	}
17427
17428	// CHECK-LABEL: @test_vst3_p8(
17429	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
17430	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
17431	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
17432	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
17433	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17434	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
17435	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
17436	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17437	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
17438	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
17439	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
17440	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
17441	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
17442	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
17443	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
17444	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
17445	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
17446	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
17447	// CHECK: ret void
17448	void test_vst3_p8(poly8_t * a, poly8x8x3_t b) { // Forwards to vst3_p8; the directives above expect a call to @llvm.arm.neon.vst3.p0i8.v8i8 taking %a directly and the three <8 x i8> members of b.
17449	vst3_p8(a, b);
17450	}
17451
// vst3_p16: the directives below expect the coerced [3 x i64] argument to be
// stored into the [[B]] alloca and copied into [[__S1]] with memcpy, %a to be
// bitcast to i8*, and each loaded <4 x i16> member to round-trip through
// <8 x i8> before being passed to @llvm.arm.neon.vst3.p0i8.v4i16.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17452	// CHECK-LABEL: @test_vst3_p16(
17453	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
17454	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
17455	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
17456	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
17457	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17458	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
17459	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
17460	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17461	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17462	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
17463	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
17464	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
17465	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
17466	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
17467	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
17468	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
17469	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
17470	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
17471	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
17472	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
17473	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
17474	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
17475	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
17476	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
17477	// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
17478	// CHECK: ret void
17479	void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
17480	vst3_p16(a, b);
17481	}
17482
// vst3q_lane_u16 with lane constant 7: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <8 x i16> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v8i16
// call, whose i32 7 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17483	// CHECK-LABEL: @test_vst3q_lane_u16(
17484	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
17485	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
17486	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
17487	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
17488	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17489	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
17490	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
17491	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17492	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17493	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
17494	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
17495	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
17496	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
17497	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
17498	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
17499	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
17500	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
17501	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
17502	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
17503	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
17504	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
17505	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
17506	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
17507	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
17508	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
17509	// CHECK: ret void
17510	void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
17511	vst3q_lane_u16(a, b, 7);
17512	}
17513
// vst3q_lane_u32 with lane constant 3: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <4 x i32> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v4i32
// call, whose i32 3 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 4 is presumably the alignment operand - confirm.
17514	// CHECK-LABEL: @test_vst3q_lane_u32(
17515	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
17516	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
17517	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
17518	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
17519	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17520	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
17521	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
17522	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17523	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
17524	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
17525	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
17526	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
17527	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
17528	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
17529	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
17530	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
17531	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
17532	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
17533	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
17534	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
17535	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
17536	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
17537	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
17538	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
17539	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
17540	// CHECK: ret void
17541	void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
17542	vst3q_lane_u32(a, b, 3);
17543	}
17544
// vst3q_lane_s16 with lane constant 7: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <8 x i16> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v8i16
// call, whose i32 7 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17545	// CHECK-LABEL: @test_vst3q_lane_s16(
17546	// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
17547	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
17548	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
17549	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
17550	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17551	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
17552	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
17553	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17554	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17555	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
17556	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
17557	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
17558	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
17559	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
17560	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
17561	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
17562	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
17563	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
17564	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
17565	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
17566	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
17567	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
17568	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
17569	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
17570	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
17571	// CHECK: ret void
17572	void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
17573	vst3q_lane_s16(a, b, 7);
17574	}
17575
// vst3q_lane_s32 with lane constant 3: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <4 x i32> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v4i32
// call, whose i32 3 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 4 is presumably the alignment operand - confirm.
17576	// CHECK-LABEL: @test_vst3q_lane_s32(
17577	// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
17578	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
17579	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
17580	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
17581	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17582	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
17583	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
17584	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17585	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
17586	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
17587	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
17588	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
17589	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
17590	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
17591	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
17592	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
17593	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
17594	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
17595	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
17596	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
17597	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
17598	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
17599	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
17600	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
17601	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
17602	// CHECK: ret void
17603	void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
17604	vst3q_lane_s32(a, b, 3);
17605	}
17606
// vst3q_lane_f16 with lane constant 7: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <8 x half> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v8f16
// call, whose i32 7 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17607	// CHECK-LABEL: @test_vst3q_lane_f16(
17608	// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
17609	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
17610	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
17611	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
17612	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17613	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
17614	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
17615	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17616	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
17617	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
17618	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
17619	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
17620	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
17621	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
17622	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
17623	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
17624	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
17625	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
17626	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
17627	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
17628	// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
17629	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
17630	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
17631	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
17632	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
17633	// CHECK: ret void
17634	void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
17635	vst3q_lane_f16(a, b, 7);
17636	}
17637
// vst3q_lane_f32 with lane constant 3: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <4 x float> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v4f32
// call, whose i32 3 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 4 is presumably the alignment operand - confirm.
17638	// CHECK-LABEL: @test_vst3q_lane_f32(
17639	// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
17640	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
17641	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
17642	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
17643	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17644	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
17645	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
17646	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17647	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
17648	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
17649	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
17650	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
17651	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
17652	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
17653	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
17654	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
17655	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
17656	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
17657	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
17658	// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
17659	// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
17660	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
17661	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
17662	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
17663	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
17664	// CHECK: ret void
17665	void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
17666	vst3q_lane_f32(a, b, 3);
17667	}
17668
// vst3q_lane_p16 with lane constant 7: the directives below expect the coerced
// [6 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <8 x i16> member to
// round-trip through <16 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v8i16
// call, whose i32 7 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17669	// CHECK-LABEL: @test_vst3q_lane_p16(
17670	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
17671	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
17672	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
17673	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
17674	// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
17675	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
17676	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
17677	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
17678	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17679	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
17680	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
17681	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
17682	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
17683	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
17684	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
17685	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
17686	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
17687	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
17688	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
17689	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
17690	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
17691	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
17692	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
17693	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
17694	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
17695	// CHECK: ret void
17696	void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
17697	vst3q_lane_p16(a, b, 7);
17698	}
17699
// vst3_lane_u8 with lane constant 7: the directives below expect the coerced
// [3 x i64] argument to be stored into the [[B]] alloca, copied into [[__S1]]
// with memcpy, and its three <8 x i8> members loaded and passed with %a to
// @llvm.arm.neon.vst3lane.p0i8.v8i8; the i32 7 operand matches the lane passed
// in the C call.
// NOTE(review): the trailing i32 1 is presumably the alignment operand - confirm.
17700	// CHECK-LABEL: @test_vst3_lane_u8(
17701	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
17702	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
17703	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
17704	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
17705	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17706	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
17707	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
17708	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17709	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
17710	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
17711	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
17712	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
17713	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
17714	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
17715	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
17716	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
17717	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
17718	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
17719	// CHECK: ret void
17720	void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
17721	vst3_lane_u8(a, b, 7);
17722	}
17723
// vst3_lane_u16 with lane constant 3: the directives below expect the coerced
// [3 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <4 x i16> member to
// round-trip through <8 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v4i16
// call, whose i32 3 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17724	// CHECK-LABEL: @test_vst3_lane_u16(
17725	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
17726	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
17727	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
17728	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
17729	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17730	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
17731	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
17732	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17733	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17734	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
17735	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
17736	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
17737	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
17738	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
17739	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
17740	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
17741	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
17742	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
17743	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
17744	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
17745	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
17746	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
17747	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
17748	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
17749	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
17750	// CHECK: ret void
17751	void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
17752	vst3_lane_u16(a, b, 3);
17753	}
17754
// vst3_lane_u32 with lane constant 1: the directives below expect the coerced
// [3 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <2 x i32> member to
// round-trip through <8 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v2i32
// call, whose i32 1 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 4 is presumably the alignment operand - confirm.
17755	// CHECK-LABEL: @test_vst3_lane_u32(
17756	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
17757	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
17758	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
17759	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
17760	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17761	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
17762	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
17763	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17764	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
17765	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
17766	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
17767	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
17768	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
17769	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
17770	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
17771	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
17772	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
17773	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
17774	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
17775	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
17776	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
17777	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
17778	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
17779	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
17780	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
17781	// CHECK: ret void
17782	void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
17783	vst3_lane_u32(a, b, 1);
17784	}
17785
// vst3_lane_s8 with lane constant 7: the directives below expect the coerced
// [3 x i64] argument to be stored into the [[B]] alloca, copied into [[__S1]]
// with memcpy, and its three <8 x i8> members loaded and passed with %a to
// @llvm.arm.neon.vst3lane.p0i8.v8i8; the i32 7 operand matches the lane passed
// in the C call.
// NOTE(review): the trailing i32 1 is presumably the alignment operand - confirm.
17786	// CHECK-LABEL: @test_vst3_lane_s8(
17787	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
17788	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
17789	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
17790	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
17791	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17792	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
17793	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
17794	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17795	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
17796	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
17797	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
17798	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
17799	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
17800	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
17801	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
17802	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
17803	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
17804	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
17805	// CHECK: ret void
17806	void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
17807	vst3_lane_s8(a, b, 7);
17808	}
17809
// vst3_lane_s16 with lane constant 3: the directives below expect the coerced
// [3 x i64] argument to be stored into the [[B]] alloca and copied into [[__S1]]
// with memcpy, %a to be bitcast to i8*, and each loaded <4 x i16> member to
// round-trip through <8 x i8> before the @llvm.arm.neon.vst3lane.p0i8.v4i16
// call, whose i32 3 operand matches the lane passed in the C call.
// NOTE(review): the trailing i32 2 is presumably the alignment operand - confirm.
17810	// CHECK-LABEL: @test_vst3_lane_s16(
17811	// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
17812	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
17813	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
17814	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
17815	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17816	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
17817	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
17818	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17819	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17820	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
17821	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
17822	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
17823	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
17824	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
17825	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
17826	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
17827	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
17828	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
17829	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
17830	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
17831	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
17832	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
17833	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
17834	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
17835	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
17836	// CHECK: ret void
17837	void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
17838	vst3_lane_s16(a, b, 3);
17839	}
17840
17841	// CHECK-LABEL: @test_vst3_lane_s32(
17842	// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
17843	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
17844	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
17845	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
17846	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17847	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
17848	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
17849	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17850	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
17851	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
17852	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
17853	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
17854	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
17855	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
17856	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
17857	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
17858	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
17859	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
17860	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
17861	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
17862	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
17863	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
17864	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
17865	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
17866	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
17867	// CHECK: ret void
17868	void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) { // Codegen test for vst3_lane_s32: the expected IR above ends in one @llvm.arm.neon.vst3lane.p0i8.v2i32 call on b's three <2 x i32> members.
17869	vst3_lane_s32(a, b, 1); // lane literal 1 must reappear as the 'i32 1' operand; %a is matched by name, so keep the body a bare call
17870	}
17871
17872	// CHECK-LABEL: @test_vst3_lane_f16(
17873	// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
17874	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
17875	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
17876	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
17877	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17878	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
17879	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
17880	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17881	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
17882	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
17883	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
17884	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
17885	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
17886	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
17887	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
17888	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
17889	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
17890	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
17891	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
17892	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
17893	// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
17894	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
17895	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
17896	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
17897	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2)
17898	// CHECK: ret void
17899	void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) { // Codegen test for vst3_lane_f16: the expected IR above ends in one @llvm.arm.neon.vst3lane.p0i8.v4f16 call on b's three <4 x half> members.
17900	vst3_lane_f16(a, b, 3); // lane literal 3 must reappear as the 'i32 3' operand; %a is matched by name, so keep the body a bare call
17901	}
17902
17903	// CHECK-LABEL: @test_vst3_lane_f32(
17904	// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
17905	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
17906	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
17907	// CHECK: [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
17908	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17909	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
17910	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
17911	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17912	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
17913	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
17914	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
17915	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
17916	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
17917	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
17918	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
17919	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
17920	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
17921	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
17922	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
17923	// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
17924	// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
17925	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
17926	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
17927	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
17928	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
17929	// CHECK: ret void
17930	void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) { // Codegen test for vst3_lane_f32: the expected IR above ends in one @llvm.arm.neon.vst3lane.p0i8.v2f32 call on b's three <2 x float> members.
17931	vst3_lane_f32(a, b, 1); // lane literal 1 must reappear as the 'i32 1' operand; %a is matched by name, so keep the body a bare call
17932	}
17933
17934	// CHECK-LABEL: @test_vst3_lane_p8(
17935	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
17936	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
17937	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
17938	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
17939	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17940	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
17941	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
17942	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17943	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
17944	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
17945	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
17946	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
17947	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
17948	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
17949	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
17950	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
17951	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
17952	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
17953	// CHECK: ret void
17954	void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) { // Codegen test for vst3_lane_p8: the expected IR above ends in one @llvm.arm.neon.vst3lane.p0i8.v8i8 call on b's three <8 x i8> members.
17955	vst3_lane_p8(a, b, 7); // lane literal 7 must reappear as the 'i32 7' operand; the expected call takes 'i8* %a' directly, so keep the body a bare call
17956	}
17957
17958	// CHECK-LABEL: @test_vst3_lane_p16(
17959	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
17960	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
17961	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
17962	// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
17963	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
17964	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
17965	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
17966	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
17967	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
17968	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
17969	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
17970	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
17971	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
17972	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
17973	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
17974	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
17975	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
17976	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
17977	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
17978	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
17979	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
17980	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
17981	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
17982	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
17983	// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
17984	// CHECK: ret void
17985	void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) { // Codegen test for vst3_lane_p16: the expected IR above ends in one @llvm.arm.neon.vst3lane.p0i8.v4i16 call on b's three <4 x i16> members.
17986	vst3_lane_p16(a, b, 3); // lane literal 3 must reappear as the 'i32 3' operand; %a is matched by name, so keep the body a bare call
17987	}
17988
17989	// CHECK-LABEL: @test_vst4q_u8(
17990	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
17991	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
17992	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
17993	// CHECK: [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
17994	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
17995	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
17996	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
17997	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
17998	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
17999	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
18000	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
18001	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
18002	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
18003	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
18004	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
18005	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
18006	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
18007	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
18008	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
18009	// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
18010	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
18011	// CHECK: ret void
18012	void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) { // Codegen test for vst4q_u8: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v16i8 call on b's four <16 x i8> members.
18013	vst4q_u8(a, b); // the expected call takes 'i8* %a' directly (no bitcast); %a is matched by name, so keep the body a bare call
18014	}
18015
18016	// CHECK-LABEL: @test_vst4q_u16(
18017	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
18018	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
18019	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
18020	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
18021	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18022	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
18023	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
18024	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18025	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18026	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18027	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
18028	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18029	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18030	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18031	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18032	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18033	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18034	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18035	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
18036	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
18037	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18038	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18039	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
18040	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
18041	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18042	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18043	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18044	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18045	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18046	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
18047	// CHECK: ret void
18048	void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) { // Codegen test for vst4q_u16: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v8i16 call on b's four <8 x i16> members.
18049	vst4q_u16(a, b); // the expected IR bitcasts %a to i8* before the call; %a is matched by name, so keep the body a bare call
18050	}
18051
18052	// CHECK-LABEL: @test_vst4q_u32(
18053	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
18054	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
18055	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
18056	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
18057	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18058	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
18059	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
18060	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18061	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
18062	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18063	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
18064	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
18065	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18066	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18067	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
18068	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
18069	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18070	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18071	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
18072	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
18073	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
18074	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18075	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
18076	// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
18077	// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
18078	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18079	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18080	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
18081	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
18082	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
18083	// CHECK: ret void
18084	void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) { // Codegen test for vst4q_u32: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v4i32 call on b's four <4 x i32> members.
18085	vst4q_u32(a, b); // the expected IR bitcasts %a to i8* before the call; %a is matched by name, so keep the body a bare call
18086	}
18087
18088	// CHECK-LABEL: @test_vst4q_s8(
18089	// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
18090	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
18091	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
18092	// CHECK: [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
18093	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18094	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
18095	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
18096	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18097	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
18098	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
18099	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
18100	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
18101	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
18102	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
18103	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
18104	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
18105	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
18106	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
18107	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
18108	// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
18109	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
18110	// CHECK: ret void
18111	void test_vst4q_s8(int8_t * a, int8x16x4_t b) { // Codegen test for vst4q_s8: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v16i8 call on b's four <16 x i8> members.
18112	vst4q_s8(a, b); // the expected call takes 'i8* %a' directly (no bitcast); %a is matched by name, so keep the body a bare call
18113	}
18114
18115	// CHECK-LABEL: @test_vst4q_s16(
18116	// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
18117	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
18118	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
18119	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
18120	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18121	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
18122	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
18123	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18124	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18125	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18126	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
18127	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18128	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18129	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18130	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18131	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18132	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18133	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18134	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
18135	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
18136	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18137	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18138	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
18139	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
18140	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18141	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18142	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18143	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18144	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18145	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
18146	// CHECK: ret void
18147	void test_vst4q_s16(int16_t * a, int16x8x4_t b) { // Codegen test for vst4q_s16: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v8i16 call on b's four <8 x i16> members.
18148	vst4q_s16(a, b); // the expected IR bitcasts %a to i8* before the call; %a is matched by name, so keep the body a bare call
18149	}
18150
18151	// CHECK-LABEL: @test_vst4q_s32(
18152	// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
18153	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
18154	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
18155	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
18156	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18157	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
18158	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
18159	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18160	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
18161	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18162	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
18163	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
18164	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18165	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18166	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
18167	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
18168	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18169	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18170	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
18171	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
18172	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
18173	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18174	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
18175	// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
18176	// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
18177	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18178	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18179	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
18180	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
18181	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
18182	// CHECK: ret void
18183	void test_vst4q_s32(int32_t * a, int32x4x4_t b) { // Codegen test for vst4q_s32: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v4i32 call on b's four <4 x i32> members.
18184	vst4q_s32(a, b); // the expected IR bitcasts %a to i8* before the call; %a is matched by name, so keep the body a bare call
18185	}
18186
18187	// CHECK-LABEL: @test_vst4q_f16(
18188	// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
18189	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
18190	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
18191	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
18192	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18193	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
18194	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
18195	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18196	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
18197	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18198	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
18199	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
18200	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
18201	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18202	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
18203	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
18204	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
18205	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18206	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
18207	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
18208	// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
18209	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18210	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
18211	// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
18212	// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
18213	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
18214	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
18215	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
18216	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
18217	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2)
18218	// CHECK: ret void
18219	void test_vst4q_f16(float16_t * a, float16x8x4_t b) { // Codegen test for vst4q_f16: the expected IR above ends in one @llvm.arm.neon.vst4.p0i8.v8f16 call on b's four <8 x half> members.
18220	vst4q_f16(a, b); // the expected IR bitcasts %a to i8* before the call; %a is matched by name, so keep the body a bare call
18221	}
18222
18223	// CHECK-LABEL: @test_vst4q_f32(
18224	// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
18225	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
18226	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
18227	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
18228	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18229	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
18230	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
18231	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18232	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
18233	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18234	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
18235	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
18236	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
18237	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18238	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
18239	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
18240	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
18241	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18242	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
18243	// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
18244	// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
18245	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18246	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
18247	// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
18248	// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
18249	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
18250	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
18251	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
18252	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
18253	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
18254	// CHECK: ret void
18255	void test_vst4q_f32(float32_t * a, float32x4x4_t b) { // IR-generation test wrapper: forwards pointer a and the float32x4x4_t aggregate b to vst4q_f32
18256	vst4q_f32(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v4f32 whose last operand is i32 4
18257	} // keep the parameter name a unchanged: the patterns above match %a literally
18258
18259	// CHECK-LABEL: @test_vst4q_p8(
18260	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
18261	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
18262	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
18263	// CHECK: [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
18264	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18265	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
18266	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
18267	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18268	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
18269	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
18270	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
18271	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
18272	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
18273	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
18274	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
18275	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
18276	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
18277	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
18278	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
18279	// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
18280	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
18281	// CHECK: ret void
18282	void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) { // IR-generation test wrapper: forwards pointer a and the poly8x16x4_t aggregate b to vst4q_p8
18283	vst4q_p8(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v16i8 taking i8* %a directly, with last operand i32 1
18284	} // keep the parameter name a unchanged: the patterns above match %a literally
18285
18286	// CHECK-LABEL: @test_vst4q_p16(
18287	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
18288	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
18289	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
18290	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
18291	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18292	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
18293	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
18294	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18295	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18296	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18297	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
18298	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18299	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18300	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18301	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18302	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18303	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18304	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18305	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
18306	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
18307	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18308	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18309	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
18310	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
18311	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18312	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18313	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18314	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18315	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18316	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
18317	// CHECK: ret void
18318	void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) { // IR-generation test wrapper: forwards pointer a and the poly16x8x4_t aggregate b to vst4q_p16
18319	vst4q_p16(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v8i16 whose last operand is i32 2
18320	} // keep the parameter name a unchanged: the patterns above match %a literally
18321
18322	// CHECK-LABEL: @test_vst4_u8(
18323	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
18324	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
18325	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
18326	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18327	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18328	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
18329	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
18330	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18331	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18332	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
18333	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18334	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18335	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18336	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18337	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18338	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
18339	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
18340	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18341	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
18342	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
18343	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
18344	// CHECK: ret void
18345	void test_vst4_u8(uint8_t * a, uint8x8x4_t b) { // IR-generation test wrapper: forwards pointer a and the uint8x8x4_t aggregate b to vst4_u8
18346	vst4_u8(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v8i8 taking i8* %a directly, with last operand i32 1
18347	} // keep the parameter name a unchanged: the patterns above match %a literally
18348
18349	// CHECK-LABEL: @test_vst4_u16(
18350	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
18351	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
18352	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
18353	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18354	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18355	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
18356	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
18357	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18358	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18359	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
18360	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
18361	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18362	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18363	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
18364	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18365	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18366	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18367	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
18368	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
18369	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
18370	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
18371	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
18372	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
18373	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
18374	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
18375	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18376	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18377	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
18378	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
18379	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
18380	// CHECK: ret void
18381	void test_vst4_u16(uint16_t * a, uint16x4x4_t b) { // IR-generation test wrapper: forwards pointer a and the uint16x4x4_t aggregate b to vst4_u16
18382	vst4_u16(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v4i16 whose last operand is i32 2
18383	} // keep the parameter name a unchanged: the patterns above match %a literally
18384
18385	// CHECK-LABEL: @test_vst4_u32(
18386	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
18387	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
18388	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
18389	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
18390	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18391	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
18392	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
18393	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18394	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
18395	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
18396	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
18397	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
18398	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18399	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
18400	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
18401	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
18402	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18403	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
18404	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
18405	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
18406	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
18407	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
18408	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
18409	// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
18410	// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
18411	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18412	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18413	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
18414	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
18415	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
18416	// CHECK: ret void
18417	void test_vst4_u32(uint32_t * a, uint32x2x4_t b) { // IR-generation test wrapper: forwards pointer a and the uint32x2x4_t aggregate b to vst4_u32
18418	vst4_u32(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v2i32 whose last operand is i32 4
18419	} // keep the parameter name a unchanged: the patterns above match %a literally
18420
18421	// CHECK-LABEL: @test_vst4_u64(
18422	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
18423	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
18424	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
18425	// CHECK: [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
18426	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18427	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
18428	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
18429	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18430	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
18431	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
18432	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
18433	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
18434	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
18435	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
18436	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
18437	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
18438	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
18439	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
18440	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
18441	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
18442	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
18443	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
18444	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
18445	// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
18446	// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
18447	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
18448	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
18449	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
18450	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
18451	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
18452	// CHECK: ret void
18453	void test_vst4_u64(uint64_t * a, uint64x1x4_t b) { // IR-generation test wrapper: forwards pointer a and the uint64x1x4_t aggregate b to vst4_u64
18454	vst4_u64(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v1i64 whose last operand is i32 4
18455	} // keep the parameter name a unchanged: the patterns above match %a literally
18456
18457	// CHECK-LABEL: @test_vst4_s8(
18458	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
18459	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
18460	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
18461	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18462	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18463	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
18464	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
18465	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18466	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
18467	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
18468	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18469	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
18470	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18471	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18472	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
18473	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
18474	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
18475	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
18476	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
18477	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
18478	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
18479	// CHECK: ret void
18480	void test_vst4_s8(int8_t * a, int8x8x4_t b) { // IR-generation test wrapper: forwards pointer a and the int8x8x4_t aggregate b to vst4_s8
18481	vst4_s8(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v8i8 taking i8* %a directly, with last operand i32 1
18482	} // keep the parameter name a unchanged: the patterns above match %a literally
18483
18484	// CHECK-LABEL: @test_vst4_s16(
18485	// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
18486	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
18487	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
18488	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18489	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18490	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
18491	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
18492	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18493	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18494	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
18495	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
18496	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18497	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18498	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
18499	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18500	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18501	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18502	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
18503	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
18504	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
18505	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
18506	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
18507	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
18508	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
18509	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
18510	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18511	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18512	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
18513	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
18514	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
18515	// CHECK: ret void
18516	void test_vst4_s16(int16_t * a, int16x4x4_t b) { // IR-generation test wrapper: forwards pointer a and the int16x4x4_t aggregate b to vst4_s16
18517	vst4_s16(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v4i16 whose last operand is i32 2
18518	} // keep the parameter name a unchanged: the patterns above match %a literally
18519
18520	// CHECK-LABEL: @test_vst4_s32(
18521	// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
18522	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
18523	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
18524	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
18525	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18526	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
18527	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
18528	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18529	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
18530	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
18531	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
18532	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
18533	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18534	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
18535	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
18536	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
18537	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18538	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
18539	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
18540	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
18541	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
18542	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
18543	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
18544	// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
18545	// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
18546	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18547	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18548	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
18549	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
18550	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
18551	// CHECK: ret void
18552	void test_vst4_s32(int32_t * a, int32x2x4_t b) { // IR-generation test wrapper: forwards pointer a and the int32x2x4_t aggregate b to vst4_s32
18553	vst4_s32(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v2i32 whose last operand is i32 4
18554	} // keep the parameter name a unchanged: the patterns above match %a literally
18555
18556	// CHECK-LABEL: @test_vst4_s64(
18557	// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
18558	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
18559	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
18560	// CHECK: [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
18561	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18562	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
18563	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
18564	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18565	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
18566	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
18567	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
18568	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
18569	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
18570	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
18571	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
18572	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
18573	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
18574	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
18575	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
18576	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
18577	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
18578	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
18579	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
18580	// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
18581	// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
18582	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
18583	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
18584	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
18585	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
18586	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
18587	// CHECK: ret void
18588	void test_vst4_s64(int64_t * a, int64x1x4_t b) { // IR-generation test wrapper: forwards pointer a and the int64x1x4_t aggregate b to vst4_s64
18589	vst4_s64(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v1i64 whose last operand is i32 4
18590	} // keep the parameter name a unchanged: the patterns above match %a literally
18591
18592	// CHECK-LABEL: @test_vst4_f16(
18593	// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
18594	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
18595	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
18596	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
18597	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18598	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
18599	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
18600	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18601	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
18602	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
18603	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
18604	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
18605	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
18606	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
18607	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
18608	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
18609	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
18610	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
18611	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
18612	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
18613	// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
18614	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
18615	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
18616	// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
18617	// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
18618	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
18619	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
18620	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
18621	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
18622	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2)
18623	// CHECK: ret void
18624	void test_vst4_f16(float16_t * a, float16x4x4_t b) { // IR-generation test wrapper: forwards pointer a and the float16x4x4_t aggregate b to vst4_f16
18625	vst4_f16(a, b); // the FileCheck patterns above expect a call to @llvm.arm.neon.vst4.p0i8.v4f16 whose last operand is i32 2
18626	} // keep the parameter name a unchanged: the patterns above match %a literally
18627
18628	// CHECK-LABEL: @test_vst4_f32(
18629	// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
18630	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
18631	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
18632	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
18633	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18634	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
18635	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
18636	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18637	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
18638	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
18639	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
18640	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
18641	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
18642	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
18643	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
18644	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
18645	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
18646	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
18647	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
18648	// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
18649	// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
18650	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
18651	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
18652	// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
18653	// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
18654	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
18655	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
18656	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
18657	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
18658	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
18659	// CHECK: ret void
// Codegen test: forwards (a, b) to the vst4_f32 intrinsic and returns nothing.
// The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4.p0i8.v2f32 whose last operand is i32 4.
18660	void test_vst4_f32(float32_t * a, float32x2x4_t b) {
18661	vst4_f32(a, b);
18662	}
18663
18664	// CHECK-LABEL: @test_vst4_p8(
18665	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
18666	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
18667	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
18668	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18669	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18670	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
18671	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
18672	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18673	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
18674	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
18675	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18676	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
18677	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18678	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18679	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
18680	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
18681	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
18682	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
18683	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
18684	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
18685	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
18686	// CHECK: ret void
// Codegen test: forwards (a, b) to the vst4_p8 intrinsic and returns nothing.
// The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4.p0i8.v8i8 that takes i8* %a directly and ends in i32 1.
18687	void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
18688	vst4_p8(a, b);
18689	}
18690
18691	// CHECK-LABEL: @test_vst4_p16(
18692	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
18693	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
18694	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
18695	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18696	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18697	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
18698	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
18699	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18700	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18701	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
18702	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
18703	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18704	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18705	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
18706	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18707	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18708	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18709	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
18710	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
18711	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
18712	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
18713	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
18714	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
18715	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
18716	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
18717	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18718	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18719	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
18720	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
18721	// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
18722	// CHECK: ret void
// Codegen test: forwards (a, b) to the vst4_p16 intrinsic and returns nothing.
// The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4.p0i8.v4i16 whose last operand is i32 2.
18723	void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
18724	vst4_p16(a, b);
18725	}
18726
18727	// CHECK-LABEL: @test_vst4q_lane_u16(
18728	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
18729	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
18730	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
18731	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
18732	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18733	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
18734	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
18735	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18736	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18737	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18738	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
18739	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18740	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18741	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18742	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18743	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18744	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18745	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18746	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
18747	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
18748	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18749	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
18750	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
18751	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
18752	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18753	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18754	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18755	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18756	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18757	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
18758	// CHECK: ret void
// Codegen test: calls vst4q_lane_u16 on (a, b) with the constant 7 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v8i16 whose trailing operands are i32 7, i32 2.
18759	void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
18760	vst4q_lane_u16(a, b, 7);
18761	}
18762
18763	// CHECK-LABEL: @test_vst4q_lane_u32(
18764	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
18765	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
18766	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
18767	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
18768	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18769	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
18770	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
18771	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18772	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
18773	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18774	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
18775	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
18776	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18777	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18778	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
18779	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
18780	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18781	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18782	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
18783	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
18784	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
18785	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
18786	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
18787	// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
18788	// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
18789	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18790	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18791	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
18792	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
18793	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
18794	// CHECK: ret void
// Codegen test: calls vst4q_lane_u32 on (a, b) with the constant 3 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v4i32 whose trailing operands are i32 3, i32 4.
18795	void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
18796	vst4q_lane_u32(a, b, 3);
18797	}
18798
18799	// CHECK-LABEL: @test_vst4q_lane_s16(
18800	// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
18801	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
18802	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
18803	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
18804	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18805	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
18806	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
18807	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18808	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18809	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18810	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
18811	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18812	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18813	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18814	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18815	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18816	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18817	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18818	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
18819	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
18820	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18821	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
18822	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
18823	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
18824	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18825	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18826	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18827	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18828	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18829	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
18830	// CHECK: ret void
// Codegen test: calls vst4q_lane_s16 on (a, b) with the constant 7 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v8i16 whose trailing operands are i32 7, i32 2.
18831	void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
18832	vst4q_lane_s16(a, b, 7);
18833	}
18834
18835	// CHECK-LABEL: @test_vst4q_lane_s32(
18836	// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
18837	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
18838	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
18839	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
18840	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18841	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
18842	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
18843	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18844	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
18845	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18846	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
18847	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
18848	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18849	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18850	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
18851	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
18852	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18853	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18854	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
18855	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
18856	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
18857	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
18858	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
18859	// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
18860	// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
18861	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18862	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18863	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
18864	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
18865	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
18866	// CHECK: ret void
// Codegen test: calls vst4q_lane_s32 on (a, b) with the constant 3 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v4i32 whose trailing operands are i32 3, i32 4.
18867	void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
18868	vst4q_lane_s32(a, b, 3);
18869	}
18870
18871	// CHECK-LABEL: @test_vst4q_lane_f16(
18872	// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
18873	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
18874	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
18875	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
18876	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18877	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
18878	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
18879	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18880	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
18881	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18882	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
18883	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
18884	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
18885	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18886	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
18887	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
18888	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
18889	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18890	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
18891	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
18892	// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
18893	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
18894	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
18895	// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
18896	// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
18897	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
18898	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
18899	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
18900	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
18901	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2)
18902	// CHECK: ret void
// Codegen test: calls vst4q_lane_f16 on (a, b) with the constant 7 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v8f16 whose trailing operands are i32 7, i32 2.
18903	void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
18904	vst4q_lane_f16(a, b, 7);
18905	}
18906
18907	// CHECK-LABEL: @test_vst4q_lane_f32(
18908	// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
18909	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
18910	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
18911	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
18912	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18913	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
18914	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
18915	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18916	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
18917	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18918	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
18919	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
18920	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
18921	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18922	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
18923	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
18924	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
18925	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18926	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
18927	// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
18928	// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
18929	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
18930	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
18931	// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
18932	// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
18933	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
18934	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
18935	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
18936	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
18937	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
18938	// CHECK: ret void
// Codegen test: calls vst4q_lane_f32 on (a, b) with the constant 3 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v4f32 whose trailing operands are i32 3, i32 4.
18939	void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
18940	vst4q_lane_f32(a, b, 3);
18941	}
18942
18943	// CHECK-LABEL: @test_vst4q_lane_p16(
18944	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
18945	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
18946	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
18947	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
18948	// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
18949	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
18950	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
18951	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
18952	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
18953	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18954	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
18955	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18956	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18957	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18958	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18959	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18960	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18961	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18962	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
18963	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
18964	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
18965	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
18966	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
18967	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
18968	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
18969	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18970	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18971	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
18972	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
18973	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
18974	// CHECK: ret void
// Codegen test: calls vst4q_lane_p16 on (a, b) with the constant 7 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v8i16 whose trailing operands are i32 7, i32 2.
18975	void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
18976	vst4q_lane_p16(a, b, 7);
18977	}
18978
18979	// CHECK-LABEL: @test_vst4_lane_u8(
18980	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
18981	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
18982	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
18983	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18984	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
18985	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
18986	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
18987	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
18988	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18989	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
18990	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18991	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18992	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18993	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18994	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18995	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
18996	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
18997	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
18998	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
18999	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
19000	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
19001	// CHECK: ret void
// Codegen test: calls vst4_lane_u8 on (a, b) with the constant 7 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v8i8 that takes i8* %a directly and whose
// trailing operands are i32 7, i32 1.
19002	void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
19003	vst4_lane_u8(a, b, 7);
19004	}
19005
19006	// CHECK-LABEL: @test_vst4_lane_u16(
19007	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
19008	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
19009	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
19010	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19011	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19012	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
19013	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
19014	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19015	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
19016	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
19017	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
19018	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19019	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19020	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
19021	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19022	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19023	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19024	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
19025	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19026	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19027	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19028	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
19029	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
19030	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
19031	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
19032	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19033	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19034	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19035	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
19036	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
19037	// CHECK: ret void
// Codegen test: calls vst4_lane_u16 on (a, b) with the constant 3 as third
// argument. The FileCheck patterns preceding this function expect a call to
// @llvm.arm.neon.vst4lane.p0i8.v4i16 whose trailing operands are i32 3, i32 2.
19038	void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
19039	vst4_lane_u16(a, b, 3);
19040	}
19041
19042	// CHECK-LABEL: @test_vst4_lane_u32(
19043	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
19044	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
19045	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
19046	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
19047	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19048	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
19049	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
19050	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19051	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
19052	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
19053	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
19054	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19055	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19056	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
19057	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19058	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19059	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19060	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
19061	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
19062	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
19063	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
19064	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
19065	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
19066	// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
19067	// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
19068	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19069	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19070	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
19071	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
19072	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
19073	// CHECK: ret void
19074	void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) { // Codegen test: forwards a and b to vst4_lane_u32 with the constant 1 as its last argument.
19075	vst4_lane_u32(a, b, 1); // the expectations above match this 1 as the `i32 1` operand of the @llvm.arm.neon.vst4lane.p0i8.v2i32 call
19076	}
19077
19078	// CHECK-LABEL: @test_vst4_lane_s8(
19079	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
19080	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
19081	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
19082	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
19083	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19084	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
19085	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
19086	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19087	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
19088	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
19089	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19090	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
19091	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19092	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19093	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
19094	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
19095	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
19096	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
19097	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
19098	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
19099	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
19100	// CHECK: ret void
19101	void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) { // Codegen test: forwards a and b to vst4_lane_s8 with the constant 7 as its last argument.
19102	vst4_lane_s8(a, b, 7); // the expectations above match this 7 as the `i32 7` operand of the @llvm.arm.neon.vst4lane.p0i8.v8i8 call, with %a passed directly
19103	}
19104
19105	// CHECK-LABEL: @test_vst4_lane_s16(
19106	// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
19107	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
19108	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
19109	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19110	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19111	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
19112	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
19113	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19114	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
19115	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
19116	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
19117	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19118	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19119	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
19120	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19121	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19122	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19123	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
19124	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19125	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19126	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19127	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
19128	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
19129	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
19130	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
19131	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19132	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19133	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19134	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
19135	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
19136	// CHECK: ret void
19137	void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) { // Codegen test: forwards a and b to vst4_lane_s16 with the constant 3 as its last argument.
19138	vst4_lane_s16(a, b, 3); // the expectations above match this 3 as the `i32 3` operand of the @llvm.arm.neon.vst4lane.p0i8.v4i16 call
19139	}
19140
19141	// CHECK-LABEL: @test_vst4_lane_s32(
19142	// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
19143	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
19144	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
19145	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
19146	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19147	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
19148	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
19149	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19150	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
19151	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
19152	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
19153	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19154	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19155	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
19156	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19157	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19158	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19159	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
19160	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
19161	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
19162	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
19163	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
19164	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
19165	// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
19166	// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
19167	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19168	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19169	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
19170	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
19171	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
19172	// CHECK: ret void
19173	void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) { // Codegen test: forwards a and b to vst4_lane_s32 with the constant 1 as its last argument.
19174	vst4_lane_s32(a, b, 1); // the expectations above match this 1 as the `i32 1` operand of the @llvm.arm.neon.vst4lane.p0i8.v2i32 call
19175	}
19176
19177	// CHECK-LABEL: @test_vst4_lane_f16(
19178	// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
19179	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
19180	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
19181	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
19182	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19183	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
19184	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
19185	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19186	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
19187	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
19188	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
19189	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
19190	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
19191	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
19192	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
19193	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
19194	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
19195	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
19196	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
19197	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
19198	// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
19199	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
19200	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
19201	// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
19202	// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
19203	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
19204	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
19205	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
19206	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
19207	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2)
19208	// CHECK: ret void
19209	void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) { // Codegen test: forwards a and b to vst4_lane_f16 with the constant 3 as its last argument.
19210	vst4_lane_f16(a, b, 3); // the expectations above match this 3 as the `i32 3` operand of the @llvm.arm.neon.vst4lane.p0i8.v4f16 call
19211	}
19212
19213	// CHECK-LABEL: @test_vst4_lane_f32(
19214	// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
19215	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
19216	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
19217	// CHECK: [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
19218	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19219	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
19220	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
19221	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19222	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
19223	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
19224	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
19225	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
19226	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
19227	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
19228	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
19229	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
19230	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
19231	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
19232	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
19233	// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
19234	// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
19235	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
19236	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
19237	// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
19238	// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
19239	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
19240	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
19241	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
19242	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
19243	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
19244	// CHECK: ret void
19245	void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) { // Codegen test: forwards a and b to vst4_lane_f32 with the constant 1 as its last argument.
19246	vst4_lane_f32(a, b, 1); // the expectations above match this 1 as the `i32 1` operand of the @llvm.arm.neon.vst4lane.p0i8.v2f32 call
19247	}
19248
19249	// CHECK-LABEL: @test_vst4_lane_p8(
19250	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
19251	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
19252	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
19253	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
19254	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19255	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
19256	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
19257	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19258	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
19259	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
19260	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19261	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
19262	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19263	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19264	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
19265	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
19266	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
19267	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
19268	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
19269	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
19270	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
19271	// CHECK: ret void
19272	void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) { // Codegen test: forwards a and b to vst4_lane_p8 with the constant 7 as its last argument.
19273	vst4_lane_p8(a, b, 7); // the expectations above match this 7 as the `i32 7` operand of the @llvm.arm.neon.vst4lane.p0i8.v8i8 call, with %a passed directly
19274	}
19275
19276	// CHECK-LABEL: @test_vst4_lane_p16(
19277	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
19278	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
19279	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
19280	// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19281	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
19282	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
19283	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
19284	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
19285	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
19286	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
19287	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
19288	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19289	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19290	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
19291	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19292	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19293	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19294	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
19295	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19296	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19297	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19298	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
19299	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
19300	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
19301	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
19302	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19303	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19304	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19305	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
19306	// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
19307	// CHECK: ret void
19308	void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) { // Codegen test: forwards a and b to vst4_lane_p16 with the constant 3 as its last argument.
19309	vst4_lane_p16(a, b, 3); // the expectations above match this 3 as the `i32 3` operand of the @llvm.arm.neon.vst4lane.p0i8.v4i16 call
19310	}
19311
19312	// CHECK-LABEL: @test_vsub_s8(
19313	// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
19314	// CHECK: ret <8 x i8> [[SUB_I]]
19315	int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) { // Codegen test: returns the result of vsub_s8(a, b).
19316	return vsub_s8(a, b); // expected IR above: a single `sub <8 x i8> %a, %b` whose result is returned
19317	}
19318
19319	// CHECK-LABEL: @test_vsub_s16(
19320	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
19321	// CHECK: ret <4 x i16> [[SUB_I]]
19322	int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) { // Codegen test: returns the result of vsub_s16(a, b).
19323	return vsub_s16(a, b); // expected IR above: a single `sub <4 x i16> %a, %b` whose result is returned
19324	}
19325
19326	// CHECK-LABEL: @test_vsub_s32(
19327	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
19328	// CHECK: ret <2 x i32> [[SUB_I]]
19329	int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) { // Codegen test: returns the result of vsub_s32(a, b).
19330	return vsub_s32(a, b); // expected IR above: a single `sub <2 x i32> %a, %b` whose result is returned
19331	}
19332
19333	// CHECK-LABEL: @test_vsub_s64(
19334	// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
19335	// CHECK: ret <1 x i64> [[SUB_I]]
19336	int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) { // Codegen test: returns the result of vsub_s64(a, b).
19337	return vsub_s64(a, b); // expected IR above: a single `sub <1 x i64> %a, %b` whose result is returned
19338	}
19339
19340	// CHECK-LABEL: @test_vsub_f32(
19341	// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b
19342	// CHECK: ret <2 x float> [[SUB_I]]
19343	float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) { // Codegen test: returns the result of vsub_f32(a, b).
19344	return vsub_f32(a, b); // expected IR above: a single `fsub <2 x float> %a, %b` (floating-point subtract) whose result is returned
19345	}
19346
19347	// CHECK-LABEL: @test_vsub_u8(
19348	// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
19349	// CHECK: ret <8 x i8> [[SUB_I]]
19350	uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) { // Codegen test: returns the result of vsub_u8(a, b).
19351	return vsub_u8(a, b); // expected IR above: a single `sub <8 x i8> %a, %b`, the same pattern as the signed variant
19352	}
19353
19354	// CHECK-LABEL: @test_vsub_u16(
19355	// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
19356	// CHECK: ret <4 x i16> [[SUB_I]]
19357	uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) { // Codegen test: returns the result of vsub_u16(a, b).
19358	return vsub_u16(a, b); // expected IR above: a single `sub <4 x i16> %a, %b`, the same pattern as the signed variant
19359	}
19360
19361	// CHECK-LABEL: @test_vsub_u32(
19362	// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
19363	// CHECK: ret <2 x i32> [[SUB_I]]
19364	uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) { // Codegen test: returns the result of vsub_u32(a, b).
19365	return vsub_u32(a, b); // expected IR above: a single `sub <2 x i32> %a, %b`, the same pattern as the signed variant
19366	}
19367
19368	// CHECK-LABEL: @test_vsub_u64(
19369	// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
19370	// CHECK: ret <1 x i64> [[SUB_I]]
19371	uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) { // Codegen test: returns the result of vsub_u64(a, b).
19372	return vsub_u64(a, b); // expected IR above: a single `sub <1 x i64> %a, %b`, the same pattern as the signed variant
19373	}
19374
19375	// CHECK-LABEL: @test_vsubq_s8(
19376	// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
19377	// CHECK: ret <16 x i8> [[SUB_I]]
19378	int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) { // Codegen test: returns the result of vsubq_s8(a, b).
19379	return vsubq_s8(a, b); // expected IR above: a single `sub <16 x i8> %a, %b` whose result is returned
19380	}
19381
19382	// CHECK-LABEL: @test_vsubq_s16(
19383	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
19384	// CHECK: ret <8 x i16> [[SUB_I]]
19385	int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) { // Codegen test: returns the result of vsubq_s16(a, b).
19386	return vsubq_s16(a, b); // expected IR above: a single `sub <8 x i16> %a, %b` whose result is returned
19387	}
19388
19389	// CHECK-LABEL: @test_vsubq_s32(
19390	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
19391	// CHECK: ret <4 x i32> [[SUB_I]]
19392	int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) { // Codegen test: returns the result of vsubq_s32(a, b).
19393	return vsubq_s32(a, b); // expected IR above: a single `sub <4 x i32> %a, %b` whose result is returned
19394	}
19395
19396	// CHECK-LABEL: @test_vsubq_s64(
19397	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
19398	// CHECK: ret <2 x i64> [[SUB_I]]
19399	int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) { // Codegen test: returns the result of vsubq_s64(a, b).
19400	return vsubq_s64(a, b); // expected IR above: a single `sub <2 x i64> %a, %b` whose result is returned
19401	}
19402
19403	// CHECK-LABEL: @test_vsubq_f32(
19404	// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b
19405	// CHECK: ret <4 x float> [[SUB_I]]
19406	float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) { // Codegen test: returns the result of vsubq_f32(a, b).
19407	return vsubq_f32(a, b); // expected IR above: a single `fsub <4 x float> %a, %b` (floating-point subtract) whose result is returned
19408	}
19409
19410	// CHECK-LABEL: @test_vsubq_u8(
19411	// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
19412	// CHECK: ret <16 x i8> [[SUB_I]]
19413	uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) { // Codegen test: returns the result of vsubq_u8(a, b).
19414	return vsubq_u8(a, b); // expected IR above: a single `sub <16 x i8> %a, %b`, the same pattern as the signed variant
19415	}
19416
19417	// CHECK-LABEL: @test_vsubq_u16(
19418	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
19419	// CHECK: ret <8 x i16> [[SUB_I]]
19420	uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) { // Codegen test: returns the result of vsubq_u16(a, b).
19421	return vsubq_u16(a, b); // expected IR above: a single `sub <8 x i16> %a, %b`, the same pattern as the signed variant
19422	}
19423
19424	// CHECK-LABEL: @test_vsubq_u32(
19425	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
19426	// CHECK: ret <4 x i32> [[SUB_I]]
19427	uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) { // Codegen test: returns the result of vsubq_u32(a, b).
19428	return vsubq_u32(a, b); // expected IR above: a single `sub <4 x i32> %a, %b`, the same pattern as the signed variant
19429	}
19430
19431	// CHECK-LABEL: @test_vsubq_u64(
19432	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
19433	// CHECK: ret <2 x i64> [[SUB_I]]
19434	uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) { // Codegen test: returns the result of vsubq_u64(a, b).
19435	return vsubq_u64(a, b); // expected IR above: a single `sub <2 x i64> %a, %b`, the same pattern as the signed variant
19436	}
19437
19438	// CHECK-LABEL: @test_vsubhn_s16(
19439	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
19440	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
19441	// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
19442	// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
19443	// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
19444	// CHECK: ret <8 x i8> [[VSUBHN2_I]]
19445	int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { // Codegen test: returns the result of vsubhn_s16(a, b), whose elements are half as wide as the inputs'.
19446	return vsubhn_s16(a, b); // expected IR above: sub <8 x i16> %a, %b; lshr every lane by 8; trunc to <8 x i8>
19447	}
19448
19449	// CHECK-LABEL: @test_vsubhn_s32(
19450	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
19451	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
19452	// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
19453	// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
19454	// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
19455	// CHECK: ret <4 x i16> [[VSUBHN2_I]]
19456	int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { // Codegen test: returns the result of vsubhn_s32(a, b), whose elements are half as wide as the inputs'.
19457	return vsubhn_s32(a, b); // expected IR above: sub <4 x i32> %a, %b; lshr every lane by 16; trunc to <4 x i16>
19458	}
19459
19460	// CHECK-LABEL: @test_vsubhn_s64(
19461	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
19462	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
19463	// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
19464	// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
19465	// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
19466	// CHECK: ret <2 x i32> [[VSUBHN2_I]]
19467	int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { // Codegen test: returns the result of vsubhn_s64(a, b), whose elements are half as wide as the inputs'.
19468	return vsubhn_s64(a, b); // expected IR above: sub <2 x i64> %a, %b; lshr every lane by 32; trunc to <2 x i32>
19469	}
19470
19471	// CHECK-LABEL: @test_vsubhn_u16(
19472	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
19473	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
19474	// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
19475	// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
19476	// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
19477	// CHECK: ret <8 x i8> [[VSUBHN2_I]]
19478	uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { // Codegen test: returns the result of vsubhn_u16(a, b), whose elements are half as wide as the inputs'.
19479	return vsubhn_u16(a, b); // expected IR above: sub <8 x i16> %a, %b; lshr every lane by 8; trunc to <8 x i8> (same pattern as the signed variant)
19480	}
19481
19482	// CHECK-LABEL: @test_vsubhn_u32(
19483	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
19484	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
19485	// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
19486	// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
19487	// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
19488	// CHECK: ret <4 x i16> [[VSUBHN2_I]]
19489	uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { // Codegen test: returns the result of vsubhn_u32(a, b), whose elements are half as wide as the inputs'.
19490	return vsubhn_u32(a, b); // expected IR above: sub <4 x i32> %a, %b; lshr every lane by 16; trunc to <4 x i16> (same pattern as the signed variant)
19491	}
19492
19493	// CHECK-LABEL: @test_vsubhn_u64(
19494	// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
19495	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
19496	// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
19497	// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
19498	// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
19499	// CHECK: ret <2 x i32> [[VSUBHN2_I]]
19500	uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { // Codegen test: returns the result of vsubhn_u64(a, b), whose elements are half as wide as the inputs'.
19501	return vsubhn_u64(a, b); // expected IR above: sub <2 x i64> %a, %b; lshr every lane by 32; trunc to <2 x i32> (same pattern as the signed variant)
19502	}
19503
19504	// CHECK-LABEL: @test_vsubl_s8(
19505	// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
19506	// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
19507	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
19508	// CHECK: ret <8 x i16> [[SUB_I]]
19509	int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { // Codegen test: returns the result of vsubl_s8(a, b), whose elements are twice as wide as the inputs'.
19510	return vsubl_s8(a, b); // expected IR above: sext %a and %b to <8 x i16>, then sub the widened values
19511	}
19512
19513	// CHECK-LABEL: @test_vsubl_s16(
19514	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
19515	// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
19516	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19517	// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
19518	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
19519	// CHECK: ret <4 x i32> [[SUB_I]]
19520	int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { // Codegen test: returns the result of vsubl_s16(a, b), whose elements are twice as wide as the inputs'.
19521	return vsubl_s16(a, b); // expected IR above: sext %a and %b to <4 x i32>, then sub (each sext is preceded by a bitcast of its operand to <8 x i8>)
19522	}
19523
19524	// CHECK-LABEL: @test_vsubl_s32(
19525	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
19526	// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
19527	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
19528	// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
19529	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
19530	// CHECK: ret <2 x i64> [[SUB_I]]
19531	int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { // Codegen test: returns the result of vsubl_s32(a, b), whose elements are twice as wide as the inputs'.
19532	return vsubl_s32(a, b); // expected IR above: sext %a and %b to <2 x i64>, then sub (each sext is preceded by a bitcast of its operand to <8 x i8>)
19533	}
19534
19535	// CHECK-LABEL: @test_vsubl_u8(
19536	// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
19537	// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
19538	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
19539	// CHECK: ret <8 x i16> [[SUB_I]]
19540	uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { // Codegen test: returns the result of vsubl_u8(a, b), whose elements are twice as wide as the inputs'.
19541	return vsubl_u8(a, b); // expected IR above: zext (not sext) %a and %b to <8 x i16>, then sub the widened values
19542	}
19543
19544	// CHECK-LABEL: @test_vsubl_u16(
19545	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
19546	// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
19547	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19548	// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
19549	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
19550	// CHECK: ret <4 x i32> [[SUB_I]]
19551	uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) { // Codegen test: returns the result of vsubl_u16(a, b), whose elements are twice as wide as the inputs'.
19552	return vsubl_u16(a, b); // expected IR above: zext %a and %b to <4 x i32>, then sub (each zext is preceded by a bitcast of its operand to <8 x i8>)
19553	}
19554
19555	// CHECK-LABEL: @test_vsubl_u32(
19556	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
19557	// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
19558	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
19559	// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
19560	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
19561	// CHECK: ret <2 x i64> [[SUB_I]]
19562	uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { // Codegen test: returns the result of vsubl_u32(a, b), whose elements are twice as wide as the inputs'.
19563	return vsubl_u32(a, b); // expected IR above: zext %a and %b to <2 x i64>, then sub (each zext is preceded by a bitcast of its operand to <8 x i8>)
19564	}
19565
19566	// CHECK-LABEL: @test_vsubw_s8(
19567	// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
19568	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
19569	// CHECK: ret <8 x i16> [[SUB_I]]
// Codegen test for vsubw_s8. Per the expected IR above, only b is widened
// (sign-extended from <8 x i8> to <8 x i16>); it is then subtracted from the
// already-wide %a.
19570	int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
19571	return vsubw_s8(a, b);
19572	}
19573
19574	// CHECK-LABEL: @test_vsubw_s16(
19575	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19576	// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
19577	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
19578	// CHECK: ret <4 x i32> [[SUB_I]]
// Codegen test for vsubw_s16. Per the expected IR above, only b is widened
// (sign-extended from <4 x i16> to <4 x i32>); it is then subtracted from the
// already-wide %a.
19579	int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
19580	return vsubw_s16(a, b);
19581	}
19582
19583	// CHECK-LABEL: @test_vsubw_s32(
19584	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
19585	// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
19586	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
19587	// CHECK: ret <2 x i64> [[SUB_I]]
// Codegen test for vsubw_s32. Per the expected IR above, only b is widened
// (sign-extended from <2 x i32> to <2 x i64>); it is then subtracted from the
// already-wide %a.
19588	int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
19589	return vsubw_s32(a, b);
19590	}
19591
19592	// CHECK-LABEL: @test_vsubw_u8(
19593	// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
19594	// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
19595	// CHECK: ret <8 x i16> [[SUB_I]]
// Codegen test for vsubw_u8. Per the expected IR above, only b is widened
// (zero-extended from <8 x i8> to <8 x i16>); it is then subtracted from the
// already-wide %a.
19596	uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
19597	return vsubw_u8(a, b);
19598	}
19599
19600	// CHECK-LABEL: @test_vsubw_u16(
19601	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
19602	// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
19603	// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
19604	// CHECK: ret <4 x i32> [[SUB_I]]
// Codegen test for vsubw_u16. Per the expected IR above, only b is widened
// (zero-extended from <4 x i16> to <4 x i32>); it is then subtracted from the
// already-wide %a.
19605	uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
19606	return vsubw_u16(a, b);
19607	}
19608
19609	// CHECK-LABEL: @test_vsubw_u32(
19610	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
19611	// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
19612	// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
19613	// CHECK: ret <2 x i64> [[SUB_I]]
// Codegen test for vsubw_u32. Per the expected IR above, only b is widened
// (zero-extended from <2 x i32> to <2 x i64>); it is then subtracted from the
// already-wide %a.
19614	uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
19615	return vsubw_u32(a, b);
19616	}
19617
19618	// CHECK-LABEL: @test_vtbl1_u8(
19619	// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
19620	// CHECK: ret <8 x i8> [[VTBL1_I]]
// Codegen test for vtbl1_u8. The expected IR above is a single call to
// @llvm.arm.neon.vtbl1 taking %a and %b in that order, whose result is
// returned directly.
19621	uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
19622	return vtbl1_u8(a, b);
19623	}
19624
19625	// CHECK-LABEL: @test_vtbl1_s8(
19626	// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
19627	// CHECK: ret <8 x i8> [[VTBL1_I]]
// Codegen test for vtbl1_s8. The expected IR above is the same
// @llvm.arm.neon.vtbl1(%a, %b) call as the unsigned variant; the signed
// element type does not appear in the IR (<8 x i8> throughout).
19628	int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
19629	return vtbl1_s8(a, b);
19630	}
19631
19632	// CHECK-LABEL: @test_vtbl1_p8(
19633	// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
19634	// CHECK: ret <8 x i8> [[VTBL1_I]]
// Codegen test for vtbl1_p8. The first argument is a poly8x8_t and the second
// a uint8x8_t; the expected IR above is still a single
// @llvm.arm.neon.vtbl1(%a, %b) call on <8 x i8> values.
19635	poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
19636	return vtbl1_p8(a, b);
19637	}
19638
19639	// CHECK-LABEL: @test_vtbl2_u8(
19640	// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
19641	// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
19642	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
19643	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19644	// CHECK: store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
19645	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
19646	// CHECK: [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
19647	// CHECK: [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
19648	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
19649	// CHECK: [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
19650	// CHECK: store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
19651	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
19652	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19653	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19654	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
19655	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19656	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19657	// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
19658	// CHECK: ret <8 x i8> [[VTBL2_I]]
// Codegen test for vtbl2_u8. Per the expected IR above, the coerced struct
// argument `a` is stored to a local, copied into a second local, and its two
// <8 x i8> members (array indices 0 and 1) are loaded and passed, followed by
// %b, to @llvm.arm.neon.vtbl2.
19659	uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
19660	return vtbl2_u8(a, b);
19661	}
19662
19663	// CHECK-LABEL: @test_vtbl2_s8(
19664	// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
19665	// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
19666	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
19667	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19668	// CHECK: store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
19669	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
19670	// CHECK: [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
19671	// CHECK: [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
19672	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
19673	// CHECK: [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
19674	// CHECK: store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
19675	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
19676	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19677	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19678	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
19679	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19680	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19681	// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
19682	// CHECK: ret <8 x i8> [[VTBL2_I]]
// Codegen test for vtbl2_s8. Per the expected IR above, the two <8 x i8>
// members of the int8x8x2_t argument `a` (array indices 0 and 1) are loaded
// from a local copy and passed, followed by %b, to @llvm.arm.neon.vtbl2.
19683	int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
19684	return vtbl2_s8(a, b);
19685	}
19686
19687	// CHECK-LABEL: @test_vtbl2_p8(
19688	// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
19689	// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
19690	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
19691	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19692	// CHECK: store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
19693	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
19694	// CHECK: [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
19695	// CHECK: [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
19696	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
19697	// CHECK: [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
19698	// CHECK: store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
19699	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
19700	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19701	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19702	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
19703	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19704	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19705	// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
19706	// CHECK: ret <8 x i8> [[VTBL2_I]]
// Codegen test for vtbl2_p8. Per the expected IR above, the two <8 x i8>
// members of the poly8x8x2_t argument `a` (array indices 0 and 1) are loaded
// from a local copy and passed, followed by %b, to @llvm.arm.neon.vtbl2.
19707	poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
19708	return vtbl2_p8(a, b);
19709	}
19710
19711	// CHECK-LABEL: @test_vtbl3_u8(
19712	// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
19713	// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
19714	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
19715	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19716	// CHECK: store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
19717	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
19718	// CHECK: [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
19719	// CHECK: [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
19720	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
19721	// CHECK: [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
19722	// CHECK: store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
19723	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
19724	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19725	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19726	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
19727	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19728	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19729	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
19730	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19731	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19732	// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
19733	// CHECK: ret <8 x i8> [[VTBL3_I]]
// Codegen test for vtbl3_u8. Per the expected IR above, the three <8 x i8>
// members of the uint8x8x3_t argument `a` (array indices 0, 1 and 2) are
// loaded from a local copy and passed, followed by %b, to
// @llvm.arm.neon.vtbl3.
19734	uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
19735	return vtbl3_u8(a, b);
19736	}
19737
19738	// CHECK-LABEL: @test_vtbl3_s8(
19739	// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
19740	// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
19741	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
19742	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19743	// CHECK: store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
19744	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
19745	// CHECK: [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
19746	// CHECK: [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
19747	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
19748	// CHECK: [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
19749	// CHECK: store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
19750	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
19751	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19752	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19753	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
19754	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19755	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19756	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
19757	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19758	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19759	// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
19760	// CHECK: ret <8 x i8> [[VTBL3_I]]
// Codegen test for vtbl3_s8. Per the expected IR above, the three <8 x i8>
// members of the int8x8x3_t argument `a` (array indices 0, 1 and 2) are
// loaded from a local copy and passed, followed by %b, to
// @llvm.arm.neon.vtbl3.
19761	int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
19762	return vtbl3_s8(a, b);
19763	}
19764
19765	// CHECK-LABEL: @test_vtbl3_p8(
19766	// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
19767	// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
19768	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
19769	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19770	// CHECK: store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
19771	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
19772	// CHECK: [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
19773	// CHECK: [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
19774	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
19775	// CHECK: [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
19776	// CHECK: store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
19777	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
19778	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19779	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19780	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
19781	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19782	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19783	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
19784	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19785	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19786	// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
19787	// CHECK: ret <8 x i8> [[VTBL3_I]]
// Codegen test for vtbl3_p8. Per the expected IR above, the three <8 x i8>
// members of the poly8x8x3_t argument `a` (array indices 0, 1 and 2) are
// loaded from a local copy and passed, followed by %b, to
// @llvm.arm.neon.vtbl3.
19788	poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
19789	return vtbl3_p8(a, b);
19790	}
19791
19792	// CHECK-LABEL: @test_vtbl4_u8(
19793	// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
19794	// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
19795	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
19796	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
19797	// CHECK: store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
19798	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
19799	// CHECK: [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
19800	// CHECK: [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
19801	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
19802	// CHECK: [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
19803	// CHECK: store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
19804	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
19805	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19806	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19807	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
19808	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19809	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19810	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
19811	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19812	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19813	// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
19814	// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
19815	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
19816	// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
19817	// CHECK: ret <8 x i8> [[VTBL4_I]]
// Codegen test for vtbl4_u8. Per the expected IR above, the four <8 x i8>
// members of the uint8x8x4_t argument `a` (array indices 0 through 3) are
// loaded from a local copy and passed, followed by %b, to
// @llvm.arm.neon.vtbl4.
19818	uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
19819	return vtbl4_u8(a, b);
19820	}
19821
19822	// CHECK-LABEL: @test_vtbl4_s8(
19823	// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
19824	// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
19825	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
19826	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
19827	// CHECK: store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
19828	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
19829	// CHECK: [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
19830	// CHECK: [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
19831	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
19832	// CHECK: [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
19833	// CHECK: store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
19834	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
19835	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19836	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19837	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
19838	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19839	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19840	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
19841	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19842	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19843	// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
19844	// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
19845	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
19846	// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
19847	// CHECK: ret <8 x i8> [[VTBL4_I]]
// Codegen test for vtbl4_s8. Per the expected IR above, the four <8 x i8>
// members of the int8x8x4_t argument `a` (array indices 0 through 3) are
// loaded from a local copy and passed, followed by %b, to
// @llvm.arm.neon.vtbl4.
19848	int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
19849	return vtbl4_s8(a, b);
19850	}
19851
19852	// CHECK-LABEL: @test_vtbl4_p8(
19853	// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
19854	// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
19855	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
19856	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
19857	// CHECK: store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
19858	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
19859	// CHECK: [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
19860	// CHECK: [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
19861	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
19862	// CHECK: [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
19863	// CHECK: store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
19864	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
19865	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19866	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19867	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
19868	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19869	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19870	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
19871	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19872	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19873	// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
19874	// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
19875	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
19876	// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
19877	// CHECK: ret <8 x i8> [[VTBL4_I]]
// Codegen test for vtbl4_p8. Per the expected IR above, the four <8 x i8>
// members of the poly8x8x4_t argument `a` (array indices 0 through 3) are
// loaded from a local copy and passed, followed by %b, to
// @llvm.arm.neon.vtbl4.
19878	poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
19879	return vtbl4_p8(a, b);
19880	}
19881
19882	// CHECK-LABEL: @test_vtbx1_u8(
19883	// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
19884	// CHECK: ret <8 x i8> [[VTBX1_I]]
// Codegen test for vtbx1_u8. The expected IR above is a single call to
// @llvm.arm.neon.vtbx1 taking %a, %b and %c in that order, whose result is
// returned directly.
19885	uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
19886	return vtbx1_u8(a, b, c);
19887	}
19888
19889	// CHECK-LABEL: @test_vtbx1_s8(
19890	// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
19891	// CHECK: ret <8 x i8> [[VTBX1_I]]
// Codegen test for vtbx1_s8. The expected IR above is the same
// @llvm.arm.neon.vtbx1(%a, %b, %c) call as the unsigned variant, on
// <8 x i8> values.
19892	int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
19893	return vtbx1_s8(a, b, c);
19894	}
19895
19896	// CHECK-LABEL: @test_vtbx1_p8(
19897	// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
19898	// CHECK: ret <8 x i8> [[VTBX1_I]]
// Codegen test for vtbx1_p8. The first two arguments are poly8x8_t and the
// third is uint8x8_t; the expected IR above is still a single
// @llvm.arm.neon.vtbx1(%a, %b, %c) call on <8 x i8> values.
19899	poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
19900	return vtbx1_p8(a, b, c);
19901	}
19902
19903	// CHECK-LABEL: @test_vtbx2_u8(
19904	// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
19905	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
19906	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
19907	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19908	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19909	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
19910	// CHECK: [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
19911	// CHECK: [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
19912	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
19913	// CHECK: [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
19914	// CHECK: store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
19915	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
19916	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19917	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19918	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
19919	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19920	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19921	// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
19922	// CHECK: ret <8 x i8> [[VTBX2_I]]
// Codegen test for vtbx2_u8. Per the expected IR above, the two <8 x i8>
// members of the uint8x8x2_t argument `b` (array indices 0 and 1) are loaded
// from a local copy and passed between %a and %c to @llvm.arm.neon.vtbx2.
19923	uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
19924	return vtbx2_u8(a, b, c);
19925	}
19926
19927	// CHECK-LABEL: @test_vtbx2_s8(
19928	// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
19929	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
19930	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
19931	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19932	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19933	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
19934	// CHECK: [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
19935	// CHECK: [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
19936	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
19937	// CHECK: [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
19938	// CHECK: store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
19939	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
19940	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19941	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19942	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
19943	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19944	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19945	// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
19946	// CHECK: ret <8 x i8> [[VTBX2_I]]
// Codegen test for vtbx2_s8. Per the expected IR above, the two <8 x i8>
// members of the int8x8x2_t argument `b` (array indices 0 and 1) are loaded
// from a local copy and passed between %a and %c to @llvm.arm.neon.vtbx2.
19947	int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
19948	return vtbx2_s8(a, b, c);
19949	}
19950
19951	// CHECK-LABEL: @test_vtbx2_p8(
19952	// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
19953	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
19954	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
19955	// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19956	// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19957	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
19958	// CHECK: [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
19959	// CHECK: [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
19960	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
19961	// CHECK: [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
19962	// CHECK: store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
19963	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
19964	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19965	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19966	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
19967	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19968	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19969	// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
19970	// CHECK: ret <8 x i8> [[VTBX2_I]]
// Codegen test for vtbx2_p8. Per the expected IR above, the two <8 x i8>
// members of the poly8x8x2_t argument `b` (array indices 0 and 1) are loaded
// from a local copy and passed between %a and %c to @llvm.arm.neon.vtbx2.
19971	poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
19972	return vtbx2_p8(a, b, c);
19973	}
19974
19975	// CHECK-LABEL: @test_vtbx3_u8(
19976	// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
19977	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
19978	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
19979	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19980	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19981	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
19982	// CHECK: [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
19983	// CHECK: [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
19984	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
19985	// CHECK: [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
19986	// CHECK: store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
19987	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
19988	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
19989	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
19990	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
19991	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
19992	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
19993	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
19994	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
19995	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
19996	// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
19997	// CHECK: ret <8 x i8> [[VTBX3_I]]
// Codegen test for vtbx3_u8. Per the expected IR above, the three <8 x i8>
// members of the uint8x8x3_t argument `b` (array indices 0, 1 and 2) are
// loaded from a local copy and passed between %a and %c to
// @llvm.arm.neon.vtbx3.
19998	uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
19999	return vtbx3_u8(a, b, c);
20000	}
20001
20002	// CHECK-LABEL: @test_vtbx3_s8(
20003	// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
20004	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
20005	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
20006	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
20007	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
20008	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
20009	// CHECK: [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
20010	// CHECK: [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
20011	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
20012	// CHECK: [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
20013	// CHECK: store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
20014	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
20015	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
20016	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
20017	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
20018	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
20019	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
20020	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
20021	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
20022	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
20023	// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
20024	// CHECK: ret <8 x i8> [[VTBX3_I]]
// Codegen fixture for vtbx3_s8: forwards a, b and c unchanged and returns the
// intrinsic's result. The FileCheck directives above expect the three <8 x i8>
// members of b to be loaded and passed, between %a and %c, to
// @llvm.arm.neon.vtbx3.
20025	int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
20026	return vtbx3_s8(a, b, c);
20027	}
20028
20029	// CHECK-LABEL: @test_vtbx3_p8(
20030	// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
20031	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
20032	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
20033	// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
20034	// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
20035	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
20036	// CHECK: [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
20037	// CHECK: [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
20038	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
20039	// CHECK: [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
20040	// CHECK: store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
20041	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
20042	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
20043	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
20044	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
20045	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
20046	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
20047	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
20048	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
20049	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
20050	// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
20051	// CHECK: ret <8 x i8> [[VTBX3_I]]
// Codegen fixture for vtbx3_p8: forwards a, b and c unchanged and returns the
// intrinsic's result (note c is uint8x8_t while a and the result are poly8x8_t).
// The FileCheck directives above expect the three <8 x i8> members of b to be
// loaded and passed, between %a and %c, to @llvm.arm.neon.vtbx3.
20052	poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
20053	return vtbx3_p8(a, b, c);
20054	}
20055
20056	// CHECK-LABEL: @test_vtbx4_u8(
20057	// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
20058	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
20059	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
20060	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
20061	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20062	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
20063	// CHECK: [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
20064	// CHECK: [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
20065	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
20066	// CHECK: [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
20067	// CHECK: store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
20068	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
20069	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
20070	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
20071	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
20072	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
20073	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
20074	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
20075	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
20076	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
20077	// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
20078	// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
20079	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
20080	// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
20081	// CHECK: ret <8 x i8> [[VTBX4_I]]
// Codegen fixture for vtbx4_u8: forwards a, b and c unchanged and returns the
// intrinsic's result. The FileCheck directives above expect the four <8 x i8>
// members of b to be loaded and passed, between %a and %c, to
// @llvm.arm.neon.vtbx4.
20082	uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
20083	return vtbx4_u8(a, b, c);
20084	}
20085
20086	// CHECK-LABEL: @test_vtbx4_s8(
20087	// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
20088	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
20089	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
20090	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
20091	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20092	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
20093	// CHECK: [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
20094	// CHECK: [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
20095	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
20096	// CHECK: [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
20097	// CHECK: store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
20098	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
20099	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
20100	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
20101	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
20102	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
20103	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
20104	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
20105	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
20106	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
20107	// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
20108	// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
20109	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
20110	// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
20111	// CHECK: ret <8 x i8> [[VTBX4_I]]
// Codegen fixture for vtbx4_s8: forwards a, b and c unchanged and returns the
// intrinsic's result. The FileCheck directives above expect the four <8 x i8>
// members of b to be loaded and passed, between %a and %c, to
// @llvm.arm.neon.vtbx4.
20112	int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
20113	return vtbx4_s8(a, b, c);
20114	}
20115
20116	// CHECK-LABEL: @test_vtbx4_p8(
20117	// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
20118	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
20119	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
20120	// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
20121	// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20122	// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
20123	// CHECK: [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
20124	// CHECK: [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
20125	// CHECK: [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
20126	// CHECK: [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
20127	// CHECK: store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
20128	// CHECK: [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
20129	// CHECK: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
20130	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
20131	// CHECK: [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
20132	// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
20133	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
20134	// CHECK: [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
20135	// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
20136	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
20137	// CHECK: [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
20138	// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
20139	// CHECK: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
20140	// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
20141	// CHECK: ret <8 x i8> [[VTBX4_I]]
// Codegen fixture for vtbx4_p8: forwards a, b and c unchanged and returns the
// intrinsic's result (note c is uint8x8_t while a and the result are poly8x8_t).
// The FileCheck directives above expect the four <8 x i8> members of b to be
// loaded and passed, between %a and %c, to @llvm.arm.neon.vtbx4.
20142	poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
20143	return vtbx4_p8(a, b, c);
20144	}
20145
20146	// CHECK: @test_vtrn_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20147	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
20148	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20149	// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
20150	// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !3
20151	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20152	// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
20153	// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !3
20154	// CHECK: ret void
// Codegen fixture for vtrn_s8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,8,2,10,4,12,6,14> and <1,9,3,11,5,13,7,15>) to be stored
// back-to-back through the sret result pointer.
20155	int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
20156	return vtrn_s8(a, b);
20157	}
20158
20159	// CHECK: @test_vtrn_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20160	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
20161	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20162	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20163	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20164	// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
20165	// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !6
20166	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20167	// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
20168	// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !6
20169	// CHECK: ret void
// Codegen fixture for vtrn_s16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,4,2,6> and <1,5,3,7>) to be stored back-to-back through the sret
// result pointer.
20170	int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
20171	return vtrn_s16(a, b);
20172	}
20173
20174	// CHECK: @test_vtrn_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20175	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
20176	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20177	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20178	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
20179	// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
20180	// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !alias.scope !9
20181	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
20182	// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
20183	// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !alias.scope !9
20184	// CHECK: ret void
// Codegen fixture for vtrn_s32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,2> and <1,3>) to be stored back-to-back through the sret result
// pointer.
20185	int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
20186	return vtrn_s32(a, b);
20187	}
20188
20189	// CHECK: @test_vtrn_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20190	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
20191	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20192	// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
20193	// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !12
20194	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20195	// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
20196	// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !12
20197	// CHECK: ret void
// Codegen fixture for vtrn_u8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,8,2,10,4,12,6,14> and <1,9,3,11,5,13,7,15>) to be stored
// back-to-back through the sret result pointer.
20198	uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
20199	return vtrn_u8(a, b);
20200	}
20201
20202	// CHECK: @test_vtrn_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20203	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
20204	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20205	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20206	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20207	// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
20208	// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !15
20209	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20210	// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
20211	// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !15
20212	// CHECK: ret void
// Codegen fixture for vtrn_u16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,4,2,6> and <1,5,3,7>) to be stored back-to-back through the sret
// result pointer.
20213	uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
20214	return vtrn_u16(a, b);
20215	}
20216
20217	// CHECK: @test_vtrn_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20218	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
20219	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20220	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20221	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
20222	// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
20223	// CHECK: store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]], !alias.scope !18
20224	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
20225	// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
20226	// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !alias.scope !18
20227	// CHECK: ret void
// Codegen fixture for vtrn_u32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,2> and <1,3>) to be stored back-to-back through the sret result
// pointer.
20228	uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
20229	return vtrn_u32(a, b);
20230	}
20231
20232	// CHECK: @test_vtrn_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20233	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
20234	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
20235	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
20236	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
20237	// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
20238	// CHECK: store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]], !alias.scope !21
20239	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
20240	// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
20241	// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !alias.scope !21
20242	// CHECK: ret void
// Codegen fixture for vtrn_f32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,2> and <1,3>) to be stored back-to-back through the sret result
// pointer.
20243	float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
20244	return vtrn_f32(a, b);
20245	}
20246
20247	// CHECK: @test_vtrn_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20248	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
20249	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20250	// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
20251	// CHECK: store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]], !alias.scope !24
20252	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20253	// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
20254	// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !alias.scope !24
20255	// CHECK: ret void
// Codegen fixture for vtrn_p8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,8,2,10,4,12,6,14> and <1,9,3,11,5,13,7,15>) to be stored
// back-to-back through the sret result pointer.
20256	poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
20257	return vtrn_p8(a, b);
20258	}
20259
20260	// CHECK: @test_vtrn_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20261	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
20262	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20263	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20264	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20265	// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
20266	// CHECK: store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]], !alias.scope !27
20267	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20268	// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
20269	// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !alias.scope !27
20270	// CHECK: ret void
// Codegen fixture for vtrn_p16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,4,2,6> and <1,5,3,7>) to be stored back-to-back through the sret
// result pointer.
20271	poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
20272	return vtrn_p16(a, b);
20273	}
20274
20275	// CHECK: @test_vtrnq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20276	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
20277	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20278	// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
20279	// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !30
20280	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20281	// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
20282	// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !30
20283	// CHECK: ret void
// Codegen fixture for vtrnq_s8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two 16-lane shufflevector results of
// %a and %b (masks <0,16,2,18,...,14,30> and <1,17,3,19,...,15,31>) to be
// stored back-to-back through the sret result pointer.
20284	int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
20285	return vtrnq_s8(a, b);
20286	}
20287
20288	// CHECK: @test_vtrnq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20289	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
20290	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20291	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20292	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20293	// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
20294	// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !33
20295	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20296	// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
20297	// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !33
20298	// CHECK: ret void
// Codegen fixture for vtrnq_s16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,8,2,10,4,12,6,14> and <1,9,3,11,5,13,7,15>) to be stored
// back-to-back through the sret result pointer.
20299	int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
20300	return vtrnq_s16(a, b);
20301	}
20302
20303	// CHECK: @test_vtrnq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20304	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
20305	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20306	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20307	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
20308	// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
20309	// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !alias.scope !36
20310	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
20311	// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
20312	// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !alias.scope !36
20313	// CHECK: ret void
// Codegen fixture for vtrnq_s32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,4,2,6> and <1,5,3,7>) to be stored back-to-back through the sret
// result pointer.
20314	int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
20315	return vtrnq_s32(a, b);
20316	}
20317
20318	// CHECK: @test_vtrnq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20319	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
20320	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20321	// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
20322	// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !39
20323	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20324	// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
20325	// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !39
20326	// CHECK: ret void
// Codegen fixture for vtrnq_u8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two 16-lane shufflevector results of
// %a and %b (masks <0,16,2,18,...,14,30> and <1,17,3,19,...,15,31>) to be
// stored back-to-back through the sret result pointer.
20327	uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
20328	return vtrnq_u8(a, b);
20329	}
20330
20331	// CHECK: @test_vtrnq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20332	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
20333	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20334	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20335	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20336	// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
20337	// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !42
20338	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20339	// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
20340	// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !42
20341	// CHECK: ret void
// Codegen fixture for vtrnq_u16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,8,2,10,4,12,6,14> and <1,9,3,11,5,13,7,15>) to be stored
// back-to-back through the sret result pointer.
20342	uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
20343	return vtrnq_u16(a, b);
20344	}
20345
20346	// CHECK: @test_vtrnq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20347	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
20348	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20349	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20350	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
20351	// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
20352	// CHECK: store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]], !alias.scope !45
20353	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
20354	// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
20355	// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !alias.scope !45
20356	// CHECK: ret void
// Codegen fixture for vtrnq_u32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,4,2,6> and <1,5,3,7>) to be stored back-to-back through the sret
// result pointer.
20357	uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
20358	return vtrnq_u32(a, b);
20359	}
20360
20361	// CHECK: @test_vtrnq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20362	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
20363	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
20364	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
20365	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
20366	// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
20367	// CHECK: store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]], !alias.scope !48
20368	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
20369	// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
20370	// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !alias.scope !48
20371	// CHECK: ret void
// Codegen fixture for vtrnq_f32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,4,2,6> and <1,5,3,7>) to be stored back-to-back through the sret
// result pointer.
20372	float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
20373	return vtrnq_f32(a, b);
20374	}
20375
20376	// CHECK: @test_vtrnq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20377	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
20378	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20379	// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
20380	// CHECK: store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]], !alias.scope !51
20381	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20382	// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
20383	// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !alias.scope !51
20384	// CHECK: ret void
// Codegen fixture for vtrnq_p8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two 16-lane shufflevector results of
// %a and %b (masks <0,16,2,18,...,14,30> and <1,17,3,19,...,15,31>) to be
// stored back-to-back through the sret result pointer.
20385	poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
20386	return vtrnq_p8(a, b);
20387	}
20388
20389	// CHECK: @test_vtrnq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20390	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
20391	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20392	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20393	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20394	// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
20395	// CHECK: store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]], !alias.scope !54
20396	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20397	// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
20398	// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !alias.scope !54
20399	// CHECK: ret void
// Codegen fixture for vtrnq_p16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect two shufflevector results of %a and %b
// (masks <0,8,2,10,4,12,6,14> and <1,9,3,11,5,13,7,15>) to be stored
// back-to-back through the sret result pointer.
20400	poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
20401	return vtrnq_p16(a, b);
20402	}
20403
20404	// CHECK-LABEL: @test_vtst_s8(
20405	// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
20406	// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
20407	// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
20408	// CHECK: ret <8 x i8> [[VTST_I]]
// Codegen fixture for vtst_s8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect an `and` of %a and %b, an `icmp ne`
// against zeroinitializer, and a sext of the <8 x i1> result to <8 x i8>.
20409	uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
20410	return vtst_s8(a, b);
20411	}
20412
20413	// CHECK-LABEL: @test_vtst_s16(
20414	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20415	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20416	// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
20417	// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
20418	// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
20419	// CHECK: ret <4 x i16> [[VTST_I]]
// Codegen fixture for vtst_s16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect an `and` of %a and %b, an `icmp ne`
// against zeroinitializer, and a sext of the <4 x i1> result to <4 x i16>.
20420	uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
20421	return vtst_s16(a, b);
20422	}
20423
20424	// CHECK-LABEL: @test_vtst_s32(
20425	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20426	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20427	// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
20428	// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
20429	// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
20430	// CHECK: ret <2 x i32> [[VTST_I]]
// Codegen fixture for vtst_s32: returns the intrinsic's result for a and b.
// The FileCheck directives above expect an `and` of %a and %b, an `icmp ne`
// against zeroinitializer, and a sext of the <2 x i1> result to <2 x i32>.
20431	uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
20432	return vtst_s32(a, b);
20433	}
20434
20435	// CHECK-LABEL: @test_vtst_u8(
20436	// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
20437	// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
20438	// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
20439	// CHECK: ret <8 x i8> [[VTST_I]]
// Codegen fixture for vtst_u8: returns the intrinsic's result for a and b.
// The FileCheck directives above expect an `and` of %a and %b, an `icmp ne`
// against zeroinitializer, and a sext of the <8 x i1> result to <8 x i8>.
20440	uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
20441	return vtst_u8(a, b);
20442	}
20443
20444	// CHECK-LABEL: @test_vtst_u16(
20445	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20446	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20447	// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
20448	// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
20449	// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
20450	// CHECK: ret <4 x i16> [[VTST_I]]
// Codegen fixture for vtst_u16: returns the intrinsic's result for a and b.
// The FileCheck directives above expect an `and` of %a and %b, an `icmp ne`
// against zeroinitializer, and a sext of the <4 x i1> result to <4 x i16>.
20451	uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
20452	return vtst_u16(a, b);
20453	}
20454
20455	// CHECK-LABEL: @test_vtst_u32(
20456	// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20457	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20458	// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
20459	// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
20460	// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
20461	// CHECK: ret <2 x i32> [[VTST_I]]
// Test wrapper: forwards a and b to vtst_u32 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20462	uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
20463	return vtst_u32(a, b);
20464	}
20465
20466	// CHECK-LABEL: @test_vtst_p8(
20467	// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
20468	// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
20469	// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
20470	// CHECK: ret <8 x i8> [[VTST_I]]
// Test wrapper: forwards a and b to vtst_p8 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20471	uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
20472	return vtst_p8(a, b);
20473	}
20474
20475	// CHECK-LABEL: @test_vtst_p16(
20476	// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20477	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20478	// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
20479	// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
20480	// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
20481	// CHECK: ret <4 x i16> [[VTST_I]]
// Test wrapper: forwards a and b to vtst_p16 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20482	uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
20483	return vtst_p16(a, b);
20484	}
20485
20486	// CHECK-LABEL: @test_vtstq_s8(
20487	// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
20488	// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
20489	// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
20490	// CHECK: ret <16 x i8> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_s8 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20491	uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
20492	return vtstq_s8(a, b);
20493	}
20494
20495	// CHECK-LABEL: @test_vtstq_s16(
20496	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20497	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20498	// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
20499	// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
20500	// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
20501	// CHECK: ret <8 x i16> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_s16 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20502	uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
20503	return vtstq_s16(a, b);
20504	}
20505
20506	// CHECK-LABEL: @test_vtstq_s32(
20507	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20508	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20509	// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
20510	// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
20511	// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
20512	// CHECK: ret <4 x i32> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_s32 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20513	uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
20514	return vtstq_s32(a, b);
20515	}
20516
20517	// CHECK-LABEL: @test_vtstq_u8(
20518	// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
20519	// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
20520	// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
20521	// CHECK: ret <16 x i8> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_u8 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20522	uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
20523	return vtstq_u8(a, b);
20524	}
20525
20526	// CHECK-LABEL: @test_vtstq_u16(
20527	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20528	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20529	// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
20530	// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
20531	// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
20532	// CHECK: ret <8 x i16> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_u16 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20533	uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
20534	return vtstq_u16(a, b);
20535	}
20536
20537	// CHECK-LABEL: @test_vtstq_u32(
20538	// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20539	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20540	// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
20541	// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
20542	// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
20543	// CHECK: ret <4 x i32> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_u32 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20544	uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
20545	return vtstq_u32(a, b);
20546	}
20547
20548	// CHECK-LABEL: @test_vtstq_p8(
20549	// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
20550	// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
20551	// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
20552	// CHECK: ret <16 x i8> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_p8 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20553	uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
20554	return vtstq_p8(a, b);
20555	}
20556
20557	// CHECK-LABEL: @test_vtstq_p16(
20558	// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20559	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20560	// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
20561	// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
20562	// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
20563	// CHECK: ret <8 x i16> [[VTST_I]]
// Test wrapper: forwards a and b to vtstq_p16 and returns its result unchanged.
// The FileCheck directives above expect and, icmp ne zeroinitializer, then sext.
20564	uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
20565	return vtstq_p16(a, b);
20566	}
20567
20568	// CHECK: @test_vuzp_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20569	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
20570	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20571	// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20572	// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !57
20573	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20574	// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
20575	// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !57
20576	// CHECK: ret void
// Test wrapper: returns vuzp_s8(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20577	int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
20578	return vuzp_s8(a, b);
20579	}
20580
20581	// CHECK: @test_vuzp_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20582	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
20583	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20584	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20585	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20586	// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
20587	// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !60
20588	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20589	// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
20590	// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !60
20591	// CHECK: ret void
// Test wrapper: returns vuzp_s16(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20592	int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
20593	return vuzp_s16(a, b);
20594	}
20595
20596	// CHECK: @test_vuzp_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20597	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
20598	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20599	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20600	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
20601	// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
20602	// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !alias.scope !63
20603	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
20604	// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
20605	// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !alias.scope !63
20606	// CHECK: ret void
// Test wrapper: returns vuzp_s32(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20607	int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
20608	return vuzp_s32(a, b);
20609	}
20610
20611	// CHECK: @test_vuzp_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20612	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
20613	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20614	// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20615	// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !66
20616	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20617	// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
20618	// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !66
20619	// CHECK: ret void
// Test wrapper: returns vuzp_u8(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20620	uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
20621	return vuzp_u8(a, b);
20622	}
20623
20624	// CHECK: @test_vuzp_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20625	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
20626	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20627	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20628	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20629	// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
20630	// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !69
20631	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20632	// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
20633	// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !69
20634	// CHECK: ret void
// Test wrapper: returns vuzp_u16(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20635	uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
20636	return vuzp_u16(a, b);
20637	}
20638
20639	// CHECK: @test_vuzp_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20640	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
20641	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20642	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20643	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
20644	// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
20645	// CHECK: store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]], !alias.scope !72
20646	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
20647	// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
20648	// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !alias.scope !72
20649	// CHECK: ret void
// Test wrapper: returns vuzp_u32(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20650	uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
20651	return vuzp_u32(a, b);
20652	}
20653
20654	// CHECK: @test_vuzp_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20655	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
20656	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
20657	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
20658	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
20659	// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
20660	// CHECK: store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]], !alias.scope !75
20661	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
20662	// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
20663	// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !alias.scope !75
20664	// CHECK: ret void
// Test wrapper: returns vuzp_f32(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20665	float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
20666	return vuzp_f32(a, b);
20667	}
20668
20669	// CHECK: @test_vuzp_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20670	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
20671	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20672	// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20673	// CHECK: store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]], !alias.scope !78
20674	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20675	// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
20676	// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !alias.scope !78
20677	// CHECK: ret void
// Test wrapper: returns vuzp_p8(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20678	poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
20679	return vuzp_p8(a, b);
20680	}
20681
20682	// CHECK: @test_vuzp_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20683	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
20684	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20685	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20686	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20687	// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
20688	// CHECK: store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]], !alias.scope !81
20689	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20690	// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
20691	// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !alias.scope !81
20692	// CHECK: ret void
// Test wrapper: returns vuzp_p16(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20693	poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
20694	return vuzp_p16(a, b);
20695	}
20696
20697	// CHECK: @test_vuzpq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20698	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
20699	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20700	// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
20701	// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !84
20702	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20703	// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
20704	// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !84
20705	// CHECK: ret void
// Test wrapper: returns vuzpq_s8(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20706	int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
20707	return vuzpq_s8(a, b);
20708	}
20709
20710	// CHECK: @test_vuzpq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20711	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8*
20712	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20713	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20714	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20715	// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20716	// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !87
20717	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20718	// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
20719	// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !87
20720	// CHECK: ret void
// Test wrapper: returns vuzpq_s16(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20721	int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
20722	return vuzpq_s16(a, b);
20723	}
20724
20725	// CHECK: @test_vuzpq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20726	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
20727	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20728	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20729	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
20730	// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
20731	// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !alias.scope !90
20732	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
20733	// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
20734	// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !alias.scope !90
20735	// CHECK: ret void
// Test wrapper: returns vuzpq_s32(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20736	int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
20737	return vuzpq_s32(a, b);
20738	}
20739
20740	// CHECK: @test_vuzpq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20741	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
20742	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20743	// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
20744	// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !93
20745	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20746	// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
20747	// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !93
20748	// CHECK: ret void
// Test wrapper: returns vuzpq_u8(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20749	uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
20750	return vuzpq_u8(a, b);
20751	}
20752
20753	// CHECK: @test_vuzpq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20754	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
20755	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20756	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20757	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20758	// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20759	// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !96
20760	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20761	// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
20762	// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !96
20763	// CHECK: ret void
// Test wrapper: returns vuzpq_u16(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20764	uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
20765	return vuzpq_u16(a, b);
20766	}
20767
20768	// CHECK: @test_vuzpq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20769	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
20770	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20771	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20772	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
20773	// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
20774	// CHECK: store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]], !alias.scope !99
20775	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
20776	// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
20777	// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !alias.scope !99
20778	// CHECK: ret void
// Test wrapper: returns vuzpq_u32(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20779	uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
20780	return vuzpq_u32(a, b);
20781	}
20782
20783	// CHECK: @test_vuzpq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20784	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
20785	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
20786	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
20787	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
20788	// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
20789	// CHECK: store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]], !alias.scope !102
20790	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
20791	// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
20792	// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !alias.scope !102
20793	// CHECK: ret void
// Test wrapper: returns vuzpq_f32(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20794	float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
20795	return vuzpq_f32(a, b);
20796	}
20797
20798	// CHECK: @test_vuzpq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20799	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
20800	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20801	// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
20802	// CHECK: store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]], !alias.scope !105
20803	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20804	// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
20805	// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !alias.scope !105
20806	// CHECK: ret void
// Test wrapper: returns vuzpq_p8(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20807	poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
20808	return vuzpq_p8(a, b);
20809	}
20810
20811	// CHECK: @test_vuzpq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20812	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
20813	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20814	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20815	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20816	// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20817	// CHECK: store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]], !alias.scope !108
20818	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20819	// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
20820	// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !alias.scope !108
20821	// CHECK: ret void
// Test wrapper: returns vuzpq_p16(a, b) unchanged. The FileCheck directives above
// expect two shufflevector results (even lanes, then odd lanes) stored via sret.
20822	poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
20823	return vuzpq_p16(a, b);
20824	}
20825
20826	// CHECK: @test_vzip_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20827	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8*
20828	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20829	// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
20830	// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !111
20831	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20832	// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
20833	// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !111
20834	// CHECK: ret void
// Test wrapper: returns vzip_s8(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20835	int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
20836	return vzip_s8(a, b);
20837	}
20838
20839	// CHECK: @test_vzip_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20840	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8*
20841	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20842	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20843	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20844	// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
20845	// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !114
20846	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20847	// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
20848	// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !114
20849	// CHECK: ret void
// Test wrapper: returns vzip_s16(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20850	int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
20851	return vzip_s16(a, b);
20852	}
20853
20854	// CHECK: @test_vzip_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20855	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8*
20856	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20857	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20858	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
20859	// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
20860	// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !alias.scope !117
20861	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
20862	// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
20863	// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !alias.scope !117
20864	// CHECK: ret void
// Test wrapper: returns vzip_s32(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20865	int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
20866	return vzip_s32(a, b);
20867	}
20868
20869	// CHECK: @test_vzip_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20870	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8*
20871	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20872	// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
20873	// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !120
20874	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20875	// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
20876	// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !120
20877	// CHECK: ret void
// Test wrapper: returns vzip_u8(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20878	uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
20879	return vzip_u8(a, b);
20880	}
20881
20882	// CHECK: @test_vzip_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20883	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8*
20884	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20885	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20886	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20887	// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
20888	// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !123
20889	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20890	// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
20891	// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !123
20892	// CHECK: ret void
// Test wrapper: returns vzip_u16(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20893	uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
20894	return vzip_u16(a, b);
20895	}
20896
20897	// CHECK: @test_vzip_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20898	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8*
20899	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
20900	// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
20901	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
20902	// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
20903	// CHECK: store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]], !alias.scope !126
20904	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
20905	// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
20906	// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !alias.scope !126
20907	// CHECK: ret void
// Test wrapper: returns vzip_u32(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20908	uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
20909	return vzip_u32(a, b);
20910	}
20911
20912	// CHECK: @test_vzip_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20913	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8*
20914	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
20915	// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
20916	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
20917	// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
20918	// CHECK: store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]], !alias.scope !129
20919	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
20920	// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
20921	// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !alias.scope !129
20922	// CHECK: ret void
// Test wrapper: returns vzip_f32(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20923	float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
20924	return vzip_f32(a, b);
20925	}
20926
20927	// CHECK: @test_vzip_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20928	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8*
20929	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
20930	// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
20931	// CHECK: store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]], !alias.scope !132
20932	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
20933	// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
20934	// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !alias.scope !132
20935	// CHECK: ret void
// Test wrapper: returns vzip_p8(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20936	poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
20937	return vzip_p8(a, b);
20938	}
20939
20940	// CHECK: @test_vzip_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20941	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8*
20942	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
20943	// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
20944	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
20945	// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
20946	// CHECK: store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]], !alias.scope !135
20947	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
20948	// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
20949	// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !alias.scope !135
20950	// CHECK: ret void
// Test wrapper: returns vzip_p16(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20951	poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
20952	return vzip_p16(a, b);
20953	}
20954
20955	// CHECK: @test_vzipq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20956	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8*
20957	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
20958	// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
20959	// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !138
20960	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
20961	// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
20962	// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !138
20963	// CHECK: ret void
// Test wrapper: returns vzipq_s8(a, b) unchanged. The FileCheck directives above
// expect two interleaving shufflevector results (low halves, then high halves)
// stored via sret.
20964	int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
20965	return vzipq_s8(a, b);
20966	}
20967
20968	// CHECK: @test_vzipq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20969	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x8x2_t [[AGG_RESULT]] to i8*
20970	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
20971	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
20972	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
20973	// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
20974	// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !141
20975	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
20976	// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
20977	// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !141
20978	// CHECK: ret void
// Test wrapper: returns the result of vzipq_s16(a, b) unchanged.
// The CHECK lines above expect two <8 x i16> shufflevectors of %a and %b,
// with masks <0,8,1,9,2,10,3,11> (low halves interleaved) and
// <4,12,5,13,6,14,7,15> (high halves interleaved), each stored through a
// pointer derived from the sret result, then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
20979	int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
20980	return vzipq_s16(a, b);
20981	}
20982
20983	// CHECK: @test_vzipq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20984	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8*
20985	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
20986	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
20987	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
20988	// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
20989	// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !alias.scope !144
20990	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
20991	// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
20992	// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !alias.scope !144
20993	// CHECK: ret void
// Test wrapper: returns the result of vzipq_s32(a, b) unchanged.
// The CHECK lines above expect two <4 x i32> shufflevectors of %a and %b,
// with masks <0,4,1,5> (low halves interleaved) and <2,6,3,7> (high halves
// interleaved), each stored through a pointer derived from the sret result,
// then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
20994	int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
20995	return vzipq_s32(a, b);
20996	}
20997
20998	// CHECK: @test_vzipq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
20999	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8*
21000	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
21001	// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
21002	// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !147
21003	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
21004	// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
21005	// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !147
21006	// CHECK: ret void
// Test wrapper: returns the result of vzipq_u8(a, b) unchanged.
// The CHECK lines above expect two <16 x i8> shufflevectors of %a and %b,
// with masks <0,16,1,17,...,7,23> (low halves interleaved) and
// <8,24,9,25,...,15,31> (high halves interleaved), each stored through a
// pointer derived from the sret result, then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
21007	uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
21008	return vzipq_u8(a, b);
21009	}
21010
21011	// CHECK: @test_vzipq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
21012	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8*
21013	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
21014	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
21015	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
21016	// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
21017	// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !150
21018	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
21019	// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
21020	// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !150
21021	// CHECK: ret void
// Test wrapper: returns the result of vzipq_u16(a, b) unchanged.
// The CHECK lines above expect two <8 x i16> shufflevectors of %a and %b,
// with masks <0,8,1,9,2,10,3,11> (low halves interleaved) and
// <4,12,5,13,6,14,7,15> (high halves interleaved), each stored through a
// pointer derived from the sret result, then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
21022	uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
21023	return vzipq_u16(a, b);
21024	}
21025
21026	// CHECK: @test_vzipq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
21027	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8*
21028	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
21029	// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
21030	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
21031	// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
21032	// CHECK: store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]], !alias.scope !153
21033	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
21034	// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
21035	// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !alias.scope !153
21036	// CHECK: ret void
// Test wrapper: returns the result of vzipq_u32(a, b) unchanged.
// The CHECK lines above expect two <4 x i32> shufflevectors of %a and %b,
// with masks <0,4,1,5> (low halves interleaved) and <2,6,3,7> (high halves
// interleaved), each stored through a pointer derived from the sret result,
// then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
21037	uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
21038	return vzipq_u32(a, b);
21039	}
21040
21041	// CHECK: @test_vzipq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
21042	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8*
21043	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
21044	// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
21045	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
21046	// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
21047	// CHECK: store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]], !alias.scope !156
21048	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
21049	// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
21050	// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !alias.scope !156
21051	// CHECK: ret void
// Test wrapper: returns the result of vzipq_f32(a, b) unchanged.
// The CHECK lines above expect two <4 x float> shufflevectors of %a and %b,
// with masks <0,4,1,5> (low halves interleaved) and <2,6,3,7> (high halves
// interleaved), each stored through a pointer derived from the sret result,
// then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
21052	float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
21053	return vzipq_f32(a, b);
21054	}
21055
21056	// CHECK: @test_vzipq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
21057	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8*
21058	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
21059	// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
21060	// CHECK: store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]], !alias.scope !159
21061	// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
21062	// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
21063	// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !alias.scope !159
21064	// CHECK: ret void
// Test wrapper: returns the result of vzipq_p8(a, b) unchanged.
// The CHECK lines above expect two <16 x i8> shufflevectors of %a and %b,
// with masks <0,16,1,17,...,7,23> (low halves interleaved) and
// <8,24,9,25,...,15,31> (high halves interleaved), each stored through a
// pointer derived from the sret result, then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
21065	poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
21066	return vzipq_p8(a, b);
21067	}
21068
21069	// CHECK: @test_vzipq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]],
21070	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8*
21071	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
21072	// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
21073	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
21074	// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
21075	// CHECK: store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]], !alias.scope !162
21076	// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
21077	// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
21078	// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !alias.scope !162
21079	// CHECK: ret void
// Test wrapper: returns the result of vzipq_p16(a, b) unchanged.
// The CHECK lines above expect two <8 x i16> shufflevectors of %a and %b,
// with masks <0,8,1,9,2,10,3,11> (low halves interleaved) and
// <4,12,5,13,6,14,7,15> (high halves interleaved), each stored through a
// pointer derived from the sret result, then `ret void`.
// NOTE(review): the CHECK lines match this exact IR, so restructuring the
// body may break the test.
21080	poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
21081	return vzipq_p16(a, b);
21082	}
21083

Clang Project