aarch64-neon-ldst-one.c source code [clang_source_code/test/CodeGen/aarch64-neon-ldst-one.c]

1	// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
2	// RUN: -disable-O0-optnone -fallow-half-arguments-and-returns -emit-llvm -o - %s \
3	// RUN: | opt -S -mem2reg | FileCheck %s
4
5	#include <arm_neon.h>
6
// Expected IR for vld1q_dup_u8: one scalar i8 load broadcast to every lane of <16 x i8>.
7	// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
8	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
9	// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
10	// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
11	// CHECK: ret <16 x i8> [[LANE]]
12	uint8x16_t test_vld1q_dup_u8(uint8_t *a) {
13	return vld1q_dup_u8(a);
14	}
15
// Expected IR for vld1q_dup_u16: one scalar i16 load broadcast to every lane of <8 x i16>.
16	// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
17	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
18	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
19	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
20	// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
21	// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
22	// CHECK: ret <8 x i16> [[LANE]]
23	uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
24	return vld1q_dup_u16(a);
25	}
26
// Expected IR for vld1q_dup_u32: one scalar i32 load broadcast to every lane of <4 x i32>.
27	// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
28	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
29	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
30	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
31	// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
32	// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
33	// CHECK: ret <4 x i32> [[LANE]]
34	uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
35	return vld1q_dup_u32(a);
36	}
37
// Expected IR for vld1q_dup_u64: one scalar i64 load broadcast to every lane of <2 x i64>.
38	// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
39	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
40	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
41	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
42	// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
43	// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
44	// CHECK: ret <2 x i64> [[LANE]]
45	uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
46	return vld1q_dup_u64(a);
47	}
48
// Expected IR for vld1q_dup_s8: one scalar i8 load broadcast to every lane of <16 x i8>.
49	// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
50	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
51	// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
52	// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
53	// CHECK: ret <16 x i8> [[LANE]]
54	int8x16_t test_vld1q_dup_s8(int8_t *a) {
55	return vld1q_dup_s8(a);
56	}
57
// Expected IR for vld1q_dup_s16: one scalar i16 load broadcast to every lane of <8 x i16>.
58	// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
59	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
60	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
61	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
62	// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
63	// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
64	// CHECK: ret <8 x i16> [[LANE]]
65	int16x8_t test_vld1q_dup_s16(int16_t *a) {
66	return vld1q_dup_s16(a);
67	}
68
// Expected IR for vld1q_dup_s32: one scalar i32 load broadcast to every lane of <4 x i32>.
69	// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
70	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
71	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
72	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
73	// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
74	// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
75	// CHECK: ret <4 x i32> [[LANE]]
76	int32x4_t test_vld1q_dup_s32(int32_t *a) {
77	return vld1q_dup_s32(a);
78	}
79
// Expected IR for vld1q_dup_s64: one scalar i64 load broadcast to every lane of <2 x i64>.
80	// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
81	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
82	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
83	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
84	// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
85	// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
86	// CHECK: ret <2 x i64> [[LANE]]
87	int64x2_t test_vld1q_dup_s64(int64_t *a) {
88	return vld1q_dup_s64(a);
89	}
90
// Expected IR for vld1q_dup_f16: one scalar half load broadcast to every lane of <8 x half>.
91	// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
92	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
93	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
94	// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]]
95	// CHECK: [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
96	// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
97	// CHECK: ret <8 x half> [[LANE]]
98	float16x8_t test_vld1q_dup_f16(float16_t *a) {
99	return vld1q_dup_f16(a);
100	}
101
// Expected IR for vld1q_dup_f32: one scalar float load broadcast to every lane of <4 x float>.
102	// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
103	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
104	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
105	// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
106	// CHECK: [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
107	// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
108	// CHECK: ret <4 x float> [[LANE]]
109	float32x4_t test_vld1q_dup_f32(float32_t *a) {
110	return vld1q_dup_f32(a);
111	}
112
// Expected IR for vld1q_dup_f64: one scalar double load broadcast to every lane of <2 x double>.
113	// CHECK-LABEL: define <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
114	// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
115	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
116	// CHECK: [[TMP2:%.*]] = load double, double* [[TMP1]]
117	// CHECK: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
118	// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
119	// CHECK: ret <2 x double> [[LANE]]
120	float64x2_t test_vld1q_dup_f64(float64_t *a) {
121	return vld1q_dup_f64(a);
122	}
123
// Expected IR for vld1q_dup_p8: one scalar i8 load broadcast to every lane of <16 x i8>.
124	// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
125	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
126	// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
127	// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
128	// CHECK: ret <16 x i8> [[LANE]]
129	poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
130	return vld1q_dup_p8(a);
131	}
132
// Expected IR for vld1q_dup_p16: one scalar i16 load broadcast to every lane of <8 x i16>.
133	// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
134	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
135	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
136	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
137	// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
138	// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
139	// CHECK: ret <8 x i16> [[LANE]]
140	poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
141	return vld1q_dup_p16(a);
142	}
143
// Expected IR for vld1q_dup_p64: one scalar i64 load broadcast to every lane of <2 x i64>.
144	// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
145	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
146	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
147	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
148	// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
149	// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
150	// CHECK: ret <2 x i64> [[LANE]]
151	poly64x2_t test_vld1q_dup_p64(poly64_t *a) {
152	return vld1q_dup_p64(a);
153	}
154
// Expected IR for vld1_dup_u8: one scalar i8 load broadcast to every lane of <8 x i8>.
155	// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #1 {
156	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
157	// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
158	// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
159	// CHECK: ret <8 x i8> [[LANE]]
160	uint8x8_t test_vld1_dup_u8(uint8_t *a) {
161	return vld1_dup_u8(a);
162	}
163
// Expected IR for vld1_dup_u16: one scalar i16 load broadcast to every lane of <4 x i16>.
164	// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #1 {
165	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
166	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
167	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
168	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
169	// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
170	// CHECK: ret <4 x i16> [[LANE]]
171	uint16x4_t test_vld1_dup_u16(uint16_t *a) {
172	return vld1_dup_u16(a);
173	}
174
// Expected IR for vld1_dup_u32: one scalar i32 load broadcast to every lane of <2 x i32>.
175	// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #1 {
176	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
177	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
178	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
179	// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
180	// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
181	// CHECK: ret <2 x i32> [[LANE]]
182	uint32x2_t test_vld1_dup_u32(uint32_t *a) {
183	return vld1_dup_u32(a);
184	}
185
// Expected IR for vld1_dup_u64: one scalar i64 load placed in the single lane of <1 x i64>.
186	// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #1 {
187	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
188	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
189	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
190	// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
191	// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
192	// CHECK: ret <1 x i64> [[LANE]]
193	uint64x1_t test_vld1_dup_u64(uint64_t *a) {
194	return vld1_dup_u64(a);
195	}
196
// Expected IR for vld1_dup_s8: one scalar i8 load broadcast to every lane of <8 x i8>.
197	// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #1 {
198	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
199	// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
200	// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
201	// CHECK: ret <8 x i8> [[LANE]]
202	int8x8_t test_vld1_dup_s8(int8_t *a) {
203	return vld1_dup_s8(a);
204	}
205
// Expected IR for vld1_dup_s16: one scalar i16 load broadcast to every lane of <4 x i16>.
206	// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #1 {
207	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
208	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
209	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
210	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
211	// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
212	// CHECK: ret <4 x i16> [[LANE]]
213	int16x4_t test_vld1_dup_s16(int16_t *a) {
214	return vld1_dup_s16(a);
215	}
216
// Expected IR for vld1_dup_s32: one scalar i32 load broadcast to every lane of <2 x i32>.
217	// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #1 {
218	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
219	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
220	// CHECK: [[TMP2:%.*]] = load i32, i32* [[TMP1]]
221	// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
222	// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
223	// CHECK: ret <2 x i32> [[LANE]]
224	int32x2_t test_vld1_dup_s32(int32_t *a) {
225	return vld1_dup_s32(a);
226	}
227
// Expected IR for vld1_dup_s64: one scalar i64 load placed in the single lane of <1 x i64>.
228	// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #1 {
229	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
230	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
231	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
232	// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
233	// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
234	// CHECK: ret <1 x i64> [[LANE]]
235	int64x1_t test_vld1_dup_s64(int64_t *a) {
236	return vld1_dup_s64(a);
237	}
238
// Expected IR for vld1_dup_f16: one scalar half load broadcast to every lane of <4 x half>.
239	// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #1 {
240	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
241	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
242	// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]]
243	// CHECK: [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
244	// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
245	// CHECK: ret <4 x half> [[LANE]]
246	float16x4_t test_vld1_dup_f16(float16_t *a) {
247	return vld1_dup_f16(a);
248	}
249
// Expected IR for vld1_dup_f32: one scalar float load broadcast to every lane of <2 x float>.
250	// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #1 {
251	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
252	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
253	// CHECK: [[TMP2:%.*]] = load float, float* [[TMP1]]
254	// CHECK: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
255	// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
256	// CHECK: ret <2 x float> [[LANE]]
257	float32x2_t test_vld1_dup_f32(float32_t *a) {
258	return vld1_dup_f32(a);
259	}
260
// Expected IR for vld1_dup_f64: one scalar double load placed in the single lane of <1 x double>.
261	// CHECK-LABEL: define <1 x double> @test_vld1_dup_f64(double* %a) #1 {
262	// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
263	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
264	// CHECK: [[TMP2:%.*]] = load double, double* [[TMP1]]
265	// CHECK: [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
266	// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
267	// CHECK: ret <1 x double> [[LANE]]
268	float64x1_t test_vld1_dup_f64(float64_t *a) {
269	return vld1_dup_f64(a);
270	}
271
// Expected IR for vld1_dup_p8: one scalar i8 load broadcast to every lane of <8 x i8>.
272	// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #1 {
273	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
274	// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
275	// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
276	// CHECK: ret <8 x i8> [[LANE]]
277	poly8x8_t test_vld1_dup_p8(poly8_t *a) {
278	return vld1_dup_p8(a);
279	}
280
// Expected IR for vld1_dup_p16: one scalar i16 load broadcast to every lane of <4 x i16>.
281	// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #1 {
282	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
283	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
284	// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]]
285	// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
286	// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
287	// CHECK: ret <4 x i16> [[LANE]]
288	poly16x4_t test_vld1_dup_p16(poly16_t *a) {
289	return vld1_dup_p16(a);
290	}
291
// Expected IR for vld1_dup_p64: one scalar i64 load placed in the single lane of <1 x i64>.
292	// CHECK-LABEL: define <1 x i64> @test_vld1_dup_p64(i64* %a) #1 {
293	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
294	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
295	// CHECK: [[TMP2:%.*]] = load i64, i64* [[TMP1]]
296	// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
297	// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
298	// CHECK: ret <1 x i64> [[LANE]]
299	poly64x1_t test_vld1_dup_p64(poly64_t *a) {
300	return vld1_dup_p64(a);
301	}
302
// Expected IR for vld2q_dup_u64: the ld2r intrinsic result is stored into __ret, copied (32 bytes) to retval and returned by value.
303	// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #2 {
304	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
305	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
306	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
307	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
308	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
309	// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
310	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
311	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
312	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
313	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
314	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
315	// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
316	// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
317	uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
318	return vld2q_dup_u64(a);
319	}
320
// Expected IR for vld2q_dup_s64: the ld2r intrinsic result is stored into __ret, copied (32 bytes) to retval and returned by value.
321	// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #2 {
322	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
323	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
324	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
325	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
326	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
327	// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
328	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
329	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
330	// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
331	// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
332	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
333	// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
334	// CHECK: ret %struct.int64x2x2_t [[TMP6]]
335	int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
336	return vld2q_dup_s64(a);
337	}
338
// Expected IR for vld2q_dup_f64: the ld2r intrinsic result is stored into __ret, copied (32 bytes) to retval and returned by value.
339	// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #2 {
340	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
341	// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
342	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
343	// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
344	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
345	// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
346	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
347	// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
348	// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
349	// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
350	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
351	// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
352	// CHECK: ret %struct.float64x2x2_t [[TMP6]]
353	float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
354	return vld2q_dup_f64(a);
355	}
356
// Expected IR for vld2q_dup_p64: the ld2r intrinsic result is stored into __ret, copied (32 bytes) to retval and returned by value.
357	// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #2 {
358	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
359	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
360	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
361	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
362	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
363	// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
364	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
365	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
366	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
367	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
368	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
369	// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
370	// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
371	poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
372	return vld2q_dup_p64(a);
373	}
374
// Expected IR for vld2_dup_f64: the ld2r intrinsic result is stored into __ret, copied (16 bytes) to retval and returned by value.
375	// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #2 {
376	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
377	// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
378	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
379	// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
380	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
381	// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
382	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
383	// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
384	// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
385	// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
386	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
387	// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
388	// CHECK: ret %struct.float64x1x2_t [[TMP6]]
389	float64x1x2_t test_vld2_dup_f64(float64_t *a) {
390	return vld2_dup_f64(a);
391	}
392
// Expected IR for vld2_dup_p64: the ld2r intrinsic result is stored into __ret, copied (16 bytes) to retval and returned by value.
393	// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #2 {
394	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
395	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
396	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
397	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
398	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
399	// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
400	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
401	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
402	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
403	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
404	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
405	// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
406	// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
407	poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
408	return vld2_dup_p64(a);
409	}
410
// Expected IR for vld3q_dup_u64: the ld3r intrinsic result is stored into __ret, copied (48 bytes) to retval and returned by value.
411	// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #2 {
412	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
413	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
414	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
415	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
416	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
417	// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
418	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
419	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
420	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
421	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
422	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
423	// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
424	// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
425	uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
426	return vld3q_dup_u64(a);
427	// [{{x[0-9]+|sp}}]
428	}
429
// Expected IR for vld3q_dup_s64: the ld3r intrinsic result is stored into __ret, copied (48 bytes) to retval and returned by value.
430	// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #2 {
431	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
432	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
433	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
434	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
435	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
436	// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
437	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
438	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
439	// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
440	// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
441	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
442	// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
443	// CHECK: ret %struct.int64x2x3_t [[TMP6]]
444	int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
445	return vld3q_dup_s64(a);
446	// [{{x[0-9]+|sp}}]
447	}
448
// Expected IR for vld3q_dup_f64: the ld3r intrinsic result is stored into __ret, copied (48 bytes) to retval and returned by value.
449	// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #2 {
450	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
451	// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
452	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
453	// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
454	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
455	// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]])
456	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
457	// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
458	// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
459	// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
460	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
461	// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
462	// CHECK: ret %struct.float64x2x3_t [[TMP6]]
463	float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
464	return vld3q_dup_f64(a);
465	// [{{x[0-9]+|sp}}]
466	}
467
// Expected IR for vld3q_dup_p64: the ld3r intrinsic result is stored into __ret, copied (48 bytes) to retval and returned by value.
468	// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #2 {
469	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
470	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
471	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
472	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
473	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
474	// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
475	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
476	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
477	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
478	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
479	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
480	// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
481	// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
482	poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
483	return vld3q_dup_p64(a);
484	// [{{x[0-9]+|sp}}]
485	}
486
// Expected IR for vld3_dup_f64: the ld3r intrinsic result is stored into __ret, copied (24 bytes) to retval and returned by value.
487	// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #2 {
488	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
489	// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
490	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
491	// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
492	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
493	// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]])
494	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
495	// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
496	// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
497	// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
498	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
499	// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
500	// CHECK: ret %struct.float64x1x3_t [[TMP6]]
501	float64x1x3_t test_vld3_dup_f64(float64_t *a) {
502	return vld3_dup_f64(a);
503	// [{{x[0-9]+|sp}}]
504	}
505
// Expected IR for vld3_dup_p64: the ld3r intrinsic result is stored into __ret, copied (24 bytes) to retval and returned by value.
506	// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #2 {
507	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
508	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
509	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
510	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
511	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
512	// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
513	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
514	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
515	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
516	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
517	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
518	// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
519	// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
520	poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
521	return vld3_dup_p64(a);
522	// [{{x[0-9]+|sp}}]
523	}
524
// Expected IR for vld4q_dup_u64: the ld4r intrinsic result is stored into __ret, copied (64 bytes) to retval and returned by value.
525	// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #2 {
526	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
527	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
528	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
529	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
530	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
531	// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
532	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
533	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
534	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
535	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
536	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
537	// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
538	// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
539	uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
540	return vld4q_dup_u64(a);
541	}
542
// Expected IR for vld4q_dup_s64: the ld4r intrinsic result is stored into __ret, copied (64 bytes) to retval and returned by value.
543	// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #2 {
544	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
545	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
546	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
547	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
548	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
549	// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
550	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
551	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
552	// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
553	// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
554	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
555	// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
556	// CHECK: ret %struct.int64x2x4_t [[TMP6]]
557	int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
558	return vld4q_dup_s64(a);
559	}
560
// Expected IR for vld4q_dup_f64: the ld4r intrinsic result is stored into __ret, copied (64 bytes) to retval and returned by value.
561	// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #2 {
562	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
563	// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
564	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
565	// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
566	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
567	// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]])
568	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
569	// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
570	// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
571	// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
572	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
573	// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
574	// CHECK: ret %struct.float64x2x4_t [[TMP6]]
575	float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
576	return vld4q_dup_f64(a);
577	}
578
// Test: vld4q_dup_p64(a) is expected to lower to a call to @llvm.aarch64.neon.ld4r.v2i64.p0i64,
// with the four-vector result copied out through the returned poly64x2x4_t.
579	// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #2 {
580	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
581	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
582	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
583	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
584	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
585	// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
586	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
587	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
588	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
589	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
590	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
591	// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
592	// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
593	poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
594	return vld4q_dup_p64(a);
595	}
596
// Test: vld4_dup_f64(a) is expected to lower to a call to @llvm.aarch64.neon.ld4r.v1f64.p0f64,
// with the four-vector result copied out through the returned float64x1x4_t.
597	// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #2 {
598	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
599	// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
600	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
601	// CHECK: [[TMP1:%.*]] = bitcast double* %a to i8*
602	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
603	// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]])
604	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
605	// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
606	// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
607	// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
608	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
609	// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
610	// CHECK: ret %struct.float64x1x4_t [[TMP6]]
611	float64x1x4_t test_vld4_dup_f64(float64_t *a) {
612	return vld4_dup_f64(a);
613	}
614
// Test: vld4_dup_p64(a) is expected to lower to a call to @llvm.aarch64.neon.ld4r.v1i64.p0i64,
// with the four-vector result copied out through the returned poly64x1x4_t.
615	// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #2 {
616	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
617	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
618	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
619	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
620	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
621	// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
622	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
623	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
624	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
625	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
626	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
627	// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
628	// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
629	poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
630	return vld4_dup_p64(a);
631	}
632
// Test: vld1q_lane_u8(a, b, 15) is expected to lower to a scalar load through %a followed by an
// insertelement of the loaded value into lane 15 of %b.
633	// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
634	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
635	// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
636	// CHECK: ret <16 x i8> [[VLD1_LANE]]
637	uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) {
638	return vld1q_lane_u8(a, b, 15);
639	}
640
// Test: vld1q_lane_u16(a, b, 7) is expected to lower to a scalar i16 load through the pointer
// argument followed by an insertelement into lane 7 of the vector argument.
641	// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
642	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
643	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
644	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
645	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
646	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
647	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
648	// CHECK: ret <8 x i16> [[VLD1_LANE]]
649	uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) {
650	return vld1q_lane_u16(a, b, 7);
651	}
652
// Test: vld1q_lane_u32(a, b, 3) is expected to lower to a scalar i32 load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
653	// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
654	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
655	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
656	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
657	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
658	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]]
659	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
660	// CHECK: ret <4 x i32> [[VLD1_LANE]]
661	uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) {
662	return vld1q_lane_u32(a, b, 3);
663	}
664
// Test: vld1q_lane_u64(a, b, 1) is expected to lower to a scalar i64 load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
665	// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
666	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
667	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
668	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
669	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
670	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
671	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
672	// CHECK: ret <2 x i64> [[VLD1_LANE]]
673	uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) {
674	return vld1q_lane_u64(a, b, 1);
675	}
676
// Test: vld1q_lane_s8(a, b, 15) is expected to lower to a scalar load through %a followed by an
// insertelement of the loaded value into lane 15 of %b.
677	// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
678	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
679	// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
680	// CHECK: ret <16 x i8> [[VLD1_LANE]]
681	int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) {
682	return vld1q_lane_s8(a, b, 15);
683	}
684
// Test: vld1q_lane_s16(a, b, 7) is expected to lower to a scalar i16 load through the pointer
// argument followed by an insertelement into lane 7 of the vector argument.
685	// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
686	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
687	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
688	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
689	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
690	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
691	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
692	// CHECK: ret <8 x i16> [[VLD1_LANE]]
693	int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) {
694	return vld1q_lane_s16(a, b, 7);
695	}
696
// Test: vld1q_lane_s32(a, b, 3) is expected to lower to a scalar i32 load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
697	// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
698	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
699	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
700	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
701	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
702	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]]
703	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
704	// CHECK: ret <4 x i32> [[VLD1_LANE]]
705	int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) {
706	return vld1q_lane_s32(a, b, 3);
707	}
708
// Test: vld1q_lane_s64(a, b, 1) is expected to lower to a scalar i64 load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
709	// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
710	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
711	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
712	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
713	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
714	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
715	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
716	// CHECK: ret <2 x i64> [[VLD1_LANE]]
717	int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) {
718	return vld1q_lane_s64(a, b, 1);
719	}
720
// Test: vld1q_lane_f16(a, b, 7) is expected to lower to a scalar half load through the pointer
// argument followed by an insertelement into lane 7 of the vector argument.
721	// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
722	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
723	// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
724	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
725	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
726	// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]]
727	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
728	// CHECK: ret <8 x half> [[VLD1_LANE]]
729	float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) {
730	return vld1q_lane_f16(a, b, 7);
731	}
732
// Test: vld1q_lane_f32(a, b, 3) is expected to lower to a scalar float load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
733	// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
734	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
735	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
736	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
737	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
738	// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]]
739	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
740	// CHECK: ret <4 x float> [[VLD1_LANE]]
741	float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) {
742	return vld1q_lane_f32(a, b, 3);
743	}
744
// Test: vld1q_lane_f64(a, b, 1) is expected to lower to a scalar double load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
745	// CHECK-LABEL: define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 {
746	// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
747	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
748	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
749	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
750	// CHECK: [[TMP4:%.*]] = load double, double* [[TMP3]]
751	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
752	// CHECK: ret <2 x double> [[VLD1_LANE]]
753	float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) {
754	return vld1q_lane_f64(a, b, 1);
755	}
756
// Test: vld1q_lane_p8(a, b, 15) is expected to lower to a scalar load through %a followed by an
// insertelement of the loaded value into lane 15 of %b.
757	// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
758	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
759	// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
760	// CHECK: ret <16 x i8> [[VLD1_LANE]]
761	poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) {
762	return vld1q_lane_p8(a, b, 15);
763	}
764
// Test: vld1q_lane_p16(a, b, 7) is expected to lower to a scalar i16 load through the pointer
// argument followed by an insertelement into lane 7 of the vector argument.
765	// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
766	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
767	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
768	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
769	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
770	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
771	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
772	// CHECK: ret <8 x i16> [[VLD1_LANE]]
773	poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) {
774	return vld1q_lane_p16(a, b, 7);
775	}
776
// Test: vld1q_lane_p64(a, b, 1) is expected to lower to a scalar i64 load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
777	// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
778	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
779	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
780	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
781	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
782	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
783	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
784	// CHECK: ret <2 x i64> [[VLD1_LANE]]
785	poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) {
786	return vld1q_lane_p64(a, b, 1);
787	}
788
// Test: vld1_lane_u8(a, b, 7) is expected to lower to a scalar load through %a followed by an
// insertelement of the loaded value into lane 7 of %b.
789	// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #1 {
790	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
791	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
792	// CHECK: ret <8 x i8> [[VLD1_LANE]]
793	uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) {
794	return vld1_lane_u8(a, b, 7);
795	}
796
// Test: vld1_lane_u16(a, b, 3) is expected to lower to a scalar i16 load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
797	// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #1 {
798	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
799	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
800	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
801	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
802	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
803	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
804	// CHECK: ret <4 x i16> [[VLD1_LANE]]
805	uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) {
806	return vld1_lane_u16(a, b, 3);
807	}
808
// Test: vld1_lane_u32(a, b, 1) is expected to lower to a scalar i32 load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
809	// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #1 {
810	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
811	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
812	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
813	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
814	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]]
815	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
816	// CHECK: ret <2 x i32> [[VLD1_LANE]]
817	uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) {
818	return vld1_lane_u32(a, b, 1);
819	}
820
// Test: vld1_lane_u64(a, b, 0) is expected to lower to a scalar i64 load through the pointer
// argument followed by an insertelement into lane 0 of the vector argument.
821	// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #1 {
822	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
823	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
824	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
825	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
826	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
827	// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
828	// CHECK: ret <1 x i64> [[VLD1_LANE]]
829	uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) {
830	return vld1_lane_u64(a, b, 0);
831	}
832
// Test: vld1_lane_s8(a, b, 7) is expected to lower to a scalar load through %a followed by an
// insertelement of the loaded value into lane 7 of %b.
833	// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #1 {
834	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
835	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
836	// CHECK: ret <8 x i8> [[VLD1_LANE]]
837	int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) {
838	return vld1_lane_s8(a, b, 7);
839	}
840
// Test: vld1_lane_s16(a, b, 3) is expected to lower to a scalar i16 load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
841	// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #1 {
842	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
843	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
844	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
845	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
846	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
847	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
848	// CHECK: ret <4 x i16> [[VLD1_LANE]]
849	int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) {
850	return vld1_lane_s16(a, b, 3);
851	}
852
// Test: vld1_lane_s32(a, b, 1) is expected to lower to a scalar i32 load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
853	// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #1 {
854	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
855	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
856	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
857	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
858	// CHECK: [[TMP4:%.*]] = load i32, i32* [[TMP3]]
859	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
860	// CHECK: ret <2 x i32> [[VLD1_LANE]]
861	int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) {
862	return vld1_lane_s32(a, b, 1);
863	}
864
// Test: vld1_lane_s64(a, b, 0) is expected to lower to a scalar i64 load through the pointer
// argument followed by an insertelement into lane 0 of the vector argument.
865	// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #1 {
866	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
867	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
868	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
869	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
870	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
871	// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
872	// CHECK: ret <1 x i64> [[VLD1_LANE]]
873	int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) {
874	return vld1_lane_s64(a, b, 0);
875	}
876
// Test: vld1_lane_f16(a, b, 3) is expected to lower to a scalar half load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
877	// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #1 {
878	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
879	// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
880	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
881	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
882	// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]]
883	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
884	// CHECK: ret <4 x half> [[VLD1_LANE]]
885	float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) {
886	return vld1_lane_f16(a, b, 3);
887	}
888
// Test: vld1_lane_f32(a, b, 1) is expected to lower to a scalar float load through the pointer
// argument followed by an insertelement into lane 1 of the vector argument.
889	// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #1 {
890	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
891	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
892	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
893	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
894	// CHECK: [[TMP4:%.*]] = load float, float* [[TMP3]]
895	// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
896	// CHECK: ret <2 x float> [[VLD1_LANE]]
897	float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) {
898	return vld1_lane_f32(a, b, 1);
899	}
900
// Test: vld1_lane_f64(a, b, 0) is expected to lower to a scalar double load through the pointer
// argument followed by an insertelement into lane 0 of the vector argument.
901	// CHECK-LABEL: define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #1 {
902	// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
903	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
904	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
905	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
906	// CHECK: [[TMP4:%.*]] = load double, double* [[TMP3]]
907	// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
908	// CHECK: ret <1 x double> [[VLD1_LANE]]
909	float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) {
910	return vld1_lane_f64(a, b, 0);
911	}
912
// Test: vld1_lane_p8(a, b, 7) is expected to lower to a scalar load through %a followed by an
// insertelement of the loaded value into lane 7 of %b.
913	// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #1 {
914	// CHECK: [[TMP0:%.*]] = load i8, i8* %a
915	// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
916	// CHECK: ret <8 x i8> [[VLD1_LANE]]
917	poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) {
918	return vld1_lane_p8(a, b, 7);
919	}
920
// Test: vld1_lane_p16(a, b, 3) is expected to lower to a scalar i16 load through the pointer
// argument followed by an insertelement into lane 3 of the vector argument.
921	// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #1 {
922	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
923	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
924	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
925	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
926	// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]]
927	// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
928	// CHECK: ret <4 x i16> [[VLD1_LANE]]
929	poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) {
930	return vld1_lane_p16(a, b, 3);
931	}
932
// Test: vld1_lane_p64(a, b, 0) is expected to lower to a scalar i64 load through the pointer
// argument followed by an insertelement into lane 0 of the vector argument.
933	// CHECK-LABEL: define <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #1 {
934	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
935	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
936	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
937	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
938	// CHECK: [[TMP4:%.*]] = load i64, i64* [[TMP3]]
939	// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
940	// CHECK: ret <1 x i64> [[VLD1_LANE]]
941	poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
942	return vld1_lane_p64(a, b, 0);
943	}
944
// Test: vld2q_lane_s8(ptr, src, 15) is expected to lower to a call to
// @llvm.aarch64.neon.ld2lane.v16i8.p0i8 with lane index 15, fed by the two vectors read back from
// a local copy of src, with the result copied out through the returned int8x16x2_t.
945	// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #2 {
946	// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
947	// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
948	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
949	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
950	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0
951	// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
952	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
953	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
954	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
955	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
956	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
957	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
958	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
959	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
960	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
961	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
962	// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
963	// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
964	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
965	// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
966	// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
967	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
968	// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
969	// CHECK: ret %struct.int8x16x2_t [[TMP8]]
970	int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
971	return vld2q_lane_s8(ptr, src, 15);
972	}
973
// Test: vld2q_lane_u8(ptr, src, 15) is expected to lower to a call to
// @llvm.aarch64.neon.ld2lane.v16i8.p0i8 with lane index 15, fed by the two vectors read back from
// a local copy of src, with the result copied out through the returned uint8x16x2_t.
974	// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #2 {
975	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
976	// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
977	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
978	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
979	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0
980	// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
981	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
982	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
983	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
984	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
985	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
986	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
987	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
988	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
989	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
990	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
991	// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
992	// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
993	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
994	// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
995	// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
996	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
997	// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
998	// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
999	uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
1000	return vld2q_lane_u8(ptr, src, 15);
1001	}
1002
// Test: vld2q_lane_p8(ptr, src, 15) is expected to lower to a call to
// @llvm.aarch64.neon.ld2lane.v16i8.p0i8 with lane index 15, fed by the two vectors read back from
// a local copy of src, with the result copied out through the returned poly8x16x2_t.
1003	// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #2 {
1004	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
1005	// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
1006	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
1007	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
1008	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0
1009	// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
1010	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
1011	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
1012	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1013	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
1014	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
1015	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
1016	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
1017	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
1018	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
1019	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
1020	// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
1021	// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
1022	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
1023	// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
1024	// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
1025	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
1026	// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
1027	// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
1028	poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
1029	return vld2q_lane_p8(ptr, src, 15);
1030	}
1031
// Test: vld3q_lane_s8(ptr, src, 15) is expected to lower to a call to
// @llvm.aarch64.neon.ld3lane.v16i8.p0i8 with lane index 15, fed by the three vectors read back
// from a local copy of src, with the result copied out through the returned int8x16x3_t.
1032	// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #2 {
1033	// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
1034	// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
1035	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
1036	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
1037	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0
1038	// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
1039	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
1040	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
1041	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
1042	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
1043	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1044	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
1045	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
1046	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1047	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
1048	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
1049	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1050	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
1051	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
1052	// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
1053	// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
1054	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
1055	// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
1056	// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
1057	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
1058	// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
1059	// CHECK: ret %struct.int8x16x3_t [[TMP9]]
1060	int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
1061	return vld3q_lane_s8(ptr, src, 15);
1062	}
1063
1064	// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #2 {
1065	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
1066	// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
1067	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
1068	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
1069	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0
1070	// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
1071	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
1072	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
1073	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
1074	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
1075	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
1076	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
1077	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
1078	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
1079	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
1080	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
1081	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
1082	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
1083	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
1084	// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
1085	// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
1086	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
1087	// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
1088	// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
1089	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
1090	// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
1091	// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
// Codegen test for vld3q_lane_u8 with the constant lane index 15. The
// FileCheck directives above expect the three vectors of src to be loaded
// and passed to @llvm.aarch64.neon.ld3lane.v16i8.p0i8 with lane operand
// i64 15 and the incoming pointer passed through unchanged.
1092	uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
1093	return vld3q_lane_u8(ptr, src, 15);
1094	}
1095
1096	// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
1097	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
1098	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
1099	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
1100	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
1101	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
1102	// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1103	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
1104	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
1105	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1106	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
1107	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1108	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
1109	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
1110	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
1111	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1112	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
1113	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
1114	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
1115	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1116	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1117	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1118	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
1119	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
1120	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
1121	// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
1122	// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
1123	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1124	// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
1125	// CHECK: ret %struct.uint16x8x2_t [[TMP13]]
// Codegen test for vld2q_lane_u16 with the constant lane index 7. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v8i16.p0i8 with lane operand
// i64 7.
1126	uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
1127	return vld2q_lane_u16(a, b, 7);
1128	}
1129
1130	// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
1131	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
1132	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
1133	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
1134	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
1135	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
1136	// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
1137	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
1138	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
1139	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1140	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
1141	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
1142	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
1143	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
1144	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
1145	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1146	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
1147	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
1148	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
1149	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1150	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1151	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1152	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
1153	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
1154	// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
1155	// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
1156	// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
1157	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1158	// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
1159	// CHECK: ret %struct.uint32x4x2_t [[TMP13]]
// Codegen test for vld2q_lane_u32 with the constant lane index 3. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v4i32.p0i8 with lane operand
// i64 3.
1160	uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
1161	return vld2q_lane_u32(a, b, 3);
1162	}
1163
1164	// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
1165	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
1166	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
1167	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
1168	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
1169	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
1170	// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
1171	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
1172	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
1173	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1174	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
1175	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
1176	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
1177	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
1178	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
1179	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1180	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
1181	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
1182	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
1183	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1184	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1185	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1186	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
1187	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
1188	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
1189	// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
1190	// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
1191	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1192	// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
1193	// CHECK: ret %struct.uint64x2x2_t [[TMP13]]
// Codegen test for vld2q_lane_u64 with the constant lane index 1. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v2i64.p0i8 with lane operand
// i64 1.
1194	uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
1195	return vld2q_lane_u64(a, b, 1);
1196	}
1197
1198	// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
1199	// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
1200	// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
1201	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
1202	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
1203	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
1204	// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1205	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
1206	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
1207	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1208	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
1209	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1210	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
1211	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
1212	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
1213	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1214	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
1215	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
1216	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
1217	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1218	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1219	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1220	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
1221	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
1222	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
1223	// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
1224	// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
1225	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1226	// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
1227	// CHECK: ret %struct.int16x8x2_t [[TMP13]]
// Codegen test for vld2q_lane_s16 with the constant lane index 7. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v8i16.p0i8 with lane operand
// i64 7.
1228	int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
1229	return vld2q_lane_s16(a, b, 7);
1230	}
1231
1232	// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
1233	// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
1234	// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
1235	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
1236	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
1237	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
1238	// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
1239	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
1240	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
1241	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1242	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
1243	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
1244	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
1245	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
1246	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
1247	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1248	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
1249	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
1250	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
1251	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1252	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1253	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1254	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
1255	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
1256	// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
1257	// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
1258	// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
1259	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1260	// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
1261	// CHECK: ret %struct.int32x4x2_t [[TMP13]]
// Codegen test for vld2q_lane_s32 with the constant lane index 3. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v4i32.p0i8 with lane operand
// i64 3.
1262	int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
1263	return vld2q_lane_s32(a, b, 3);
1264	}
1265
1266	// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
1267	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
1268	// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
1269	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
1270	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
1271	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
1272	// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
1273	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
1274	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
1275	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1276	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
1277	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
1278	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
1279	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
1280	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
1281	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1282	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
1283	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
1284	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
1285	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1286	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1287	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1288	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
1289	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
1290	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
1291	// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
1292	// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
1293	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1294	// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
1295	// CHECK: ret %struct.int64x2x2_t [[TMP13]]
// Codegen test for vld2q_lane_s64 with the constant lane index 1. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v2i64.p0i8 with lane operand
// i64 1.
1296	int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
1297	return vld2q_lane_s64(a, b, 1);
1298	}
1299
1300	// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #2 {
1301	// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
1302	// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
1303	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
1304	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
1305	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
1306	// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
1307	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
1308	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
1309	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1310	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
1311	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
1312	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
1313	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
1314	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
1315	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
1316	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
1317	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
1318	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
1319	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
1320	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
1321	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
1322	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0i8(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, i8* [[TMP3]])
1323	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half> }*
1324	// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]]
1325	// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
1326	// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
1327	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1328	// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
1329	// CHECK: ret %struct.float16x8x2_t [[TMP13]]
// Codegen test for vld2q_lane_f16 with the constant lane index 7. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v8f16.p0i8 with lane operand
// i64 7.
1330	float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
1331	return vld2q_lane_f16(a, b, 7);
1332	}
1333
1334	// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #2 {
1335	// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
1336	// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
1337	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
1338	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
1339	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
1340	// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
1341	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
1342	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
1343	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1344	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
1345	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
1346	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
1347	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
1348	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
1349	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
1350	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
1351	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
1352	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
1353	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
1354	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
1355	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
1356	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
1357	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
1358	// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
1359	// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
1360	// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
1361	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1362	// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
1363	// CHECK: ret %struct.float32x4x2_t [[TMP13]]
// Codegen test for vld2q_lane_f32 with the constant lane index 3. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v4f32.p0i8 with lane operand
// i64 3.
1364	float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
1365	return vld2q_lane_f32(a, b, 3);
1366	}
1367
1368	// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #2 {
1369	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
1370	// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
1371	// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
1372	// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
1373	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
1374	// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
1375	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
1376	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
1377	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1378	// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
1379	// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
1380	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
1381	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
1382	// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
1383	// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
1384	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
1385	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
1386	// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
1387	// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
1388	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
1389	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
1390	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]])
1391	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }*
1392	// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
1393	// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
1394	// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
1395	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1396	// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
1397	// CHECK: ret %struct.float64x2x2_t [[TMP13]]
// Codegen test for vld2q_lane_f64 with the constant lane index 1. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v2f64.p0i8 with lane operand
// i64 1.
1398	float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
1399	return vld2q_lane_f64(a, b, 1);
1400	}
1401
1402	// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
1403	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
1404	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
1405	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
1406	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
1407	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
1408	// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1409	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
1410	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
1411	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1412	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
1413	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1414	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
1415	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
1416	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
1417	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1418	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
1419	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
1420	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
1421	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1422	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1423	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1424	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
1425	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
1426	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
1427	// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
1428	// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
1429	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1430	// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
1431	// CHECK: ret %struct.poly16x8x2_t [[TMP13]]
// Codegen test for vld2q_lane_p16 with the constant lane index 7. The
// FileCheck directives above expect the pointer to be cast to i8* and the
// call to lower to @llvm.aarch64.neon.ld2lane.v8i16.p0i8 with lane operand
// i64 7.
1432	poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
1433	return vld2q_lane_p16(a, b, 7);
1434	}
1435
1436	// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
1437	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
1438	// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
1439	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
1440	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
1441	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
1442	// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
1443	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
1444	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
1445	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
1446	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
1447	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
1448	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
1449	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
1450	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
1451	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
1452	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
1453	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
1454	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
1455	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
1456	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
1457	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
1458	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
1459	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
1460	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
1461	// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
1462	// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
1463	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
1464	// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
1465	// CHECK: ret %struct.poly64x2x2_t [[TMP13]]
// Calls vld2q_lane_p64(a, b, 1). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v2i64.p0i8 with lane operand i64 1.
1466	poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
1467	return vld2q_lane_p64(a, b, 1);
1468	}
1469
1470	// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
1471	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
1472	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
1473	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
1474	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
1475	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
1476	// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
1477	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
1478	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
1479	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1480	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
1481	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
1482	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
1483	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1484	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
1485	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
1486	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1487	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
1488	// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
1489	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
1490	// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
1491	// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
1492	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
1493	// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
1494	// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
// Calls vld2_lane_u8(a, b, 7). The FileCheck directives preceding this
// function expect the call to become @llvm.aarch64.neon.ld2lane.v8i8.p0i8
// with lane operand i64 7 and %a passed directly as the pointer argument.
1495	uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
1496	return vld2_lane_u8(a, b, 7);
1497	}
1498
1499	// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
1500	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
1501	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
1502	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
1503	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
1504	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
1505	// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
1506	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
1507	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
1508	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1509	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
1510	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1511	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
1512	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
1513	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
1514	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1515	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
1516	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
1517	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
1518	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1519	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1520	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1521	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
1522	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
1523	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
1524	// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
1525	// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
1526	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1527	// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
1528	// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
// Calls vld2_lane_u16(a, b, 3). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v4i16.p0i8 with lane operand i64 3.
1529	uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
1530	return vld2_lane_u16(a, b, 3);
1531	}
1532
1533	// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
1534	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
1535	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
1536	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
1537	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
1538	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
1539	// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
1540	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
1541	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
1542	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1543	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
1544	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
1545	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
1546	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
1547	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
1548	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1549	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
1550	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
1551	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
1552	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1553	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1554	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1555	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
1556	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
1557	// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
1558	// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
1559	// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
1560	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1561	// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
1562	// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
// Calls vld2_lane_u32(a, b, 1). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v2i32.p0i8 with lane operand i64 1.
1563	uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
1564	return vld2_lane_u32(a, b, 1);
1565	}
1566
1567	// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
1568	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
1569	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
1570	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
1571	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
1572	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
1573	// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
1574	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
1575	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
1576	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1577	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
1578	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
1579	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
1580	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
1581	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1582	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1583	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
1584	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
1585	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1586	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1587	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1588	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1589	// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
1590	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
1591	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
1592	// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
1593	// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
1594	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1595	// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
1596	// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
// Calls vld2_lane_u64(a, b, 0). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v1i64.p0i8 with lane operand i64 0.
1597	uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
1598	return vld2_lane_u64(a, b, 0);
1599	}
1600
1601	// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
1602	// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
1603	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
1604	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
1605	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
1606	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
1607	// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
1608	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
1609	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
1610	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1611	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
1612	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
1613	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
1614	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1615	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
1616	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
1617	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1618	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
1619	// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
1620	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
1621	// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
1622	// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
1623	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
1624	// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
1625	// CHECK: ret %struct.int8x8x2_t [[TMP8]]
// Calls vld2_lane_s8(a, b, 7). The FileCheck directives preceding this
// function expect the call to become @llvm.aarch64.neon.ld2lane.v8i8.p0i8
// with lane operand i64 7 and %a passed directly as the pointer argument.
1626	int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
1627	return vld2_lane_s8(a, b, 7);
1628	}
1629
1630	// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
1631	// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
1632	// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
1633	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
1634	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
1635	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
1636	// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
1637	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
1638	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
1639	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1640	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
1641	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1642	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
1643	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
1644	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
1645	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1646	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
1647	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
1648	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
1649	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1650	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1651	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1652	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
1653	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
1654	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
1655	// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
1656	// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
1657	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1658	// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
1659	// CHECK: ret %struct.int16x4x2_t [[TMP13]]
// Calls vld2_lane_s16(a, b, 3). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v4i16.p0i8 with lane operand i64 3.
1660	int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
1661	return vld2_lane_s16(a, b, 3);
1662	}
1663
1664	// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
1665	// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
1666	// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
1667	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
1668	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
1669	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
1670	// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
1671	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
1672	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
1673	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1674	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
1675	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
1676	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
1677	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
1678	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
1679	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
1680	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
1681	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
1682	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
1683	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
1684	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
1685	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
1686	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
1687	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
1688	// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
1689	// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
1690	// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
1691	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1692	// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
1693	// CHECK: ret %struct.int32x2x2_t [[TMP13]]
// Calls vld2_lane_s32(a, b, 1). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v2i32.p0i8 with lane operand i64 1.
1694	int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
1695	return vld2_lane_s32(a, b, 1);
1696	}
1697
1698	// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
1699	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
1700	// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
1701	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
1702	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
1703	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
1704	// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
1705	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
1706	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
1707	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1708	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
1709	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
1710	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
1711	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
1712	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1713	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1714	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
1715	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
1716	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1717	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1718	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1719	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1720	// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
1721	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
1722	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
1723	// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
1724	// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
1725	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1726	// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
1727	// CHECK: ret %struct.int64x1x2_t [[TMP13]]
// Calls vld2_lane_s64(a, b, 0). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v1i64.p0i8 with lane operand i64 0.
1728	int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
1729	return vld2_lane_s64(a, b, 0);
1730	}
1731
1732	// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #2 {
1733	// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
1734	// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
1735	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
1736	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
1737	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
1738	// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
1739	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
1740	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
1741	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1742	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
1743	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
1744	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
1745	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
1746	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
1747	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
1748	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
1749	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
1750	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
1751	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
1752	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
1753	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
1754	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0i8(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, i8* [[TMP3]])
1755	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half> }*
1756	// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]]
1757	// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
1758	// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
1759	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1760	// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
1761	// CHECK: ret %struct.float16x4x2_t [[TMP13]]
// Calls vld2_lane_f16(a, b, 3). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v4f16.p0i8 with lane operand i64 3.
1762	float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
1763	return vld2_lane_f16(a, b, 3);
1764	}
1765
1766	// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #2 {
1767	// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
1768	// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
1769	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
1770	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
1771	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
1772	// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
1773	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
1774	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
1775	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1776	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
1777	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
1778	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
1779	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
1780	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
1781	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
1782	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
1783	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
1784	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
1785	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
1786	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
1787	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
1788	// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
1789	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
1790	// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
1791	// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
1792	// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
1793	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1794	// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
1795	// CHECK: ret %struct.float32x2x2_t [[TMP13]]
// Calls vld2_lane_f32(a, b, 1). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v2f32.p0i8 with lane operand i64 1.
1796	float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
1797	return vld2_lane_f32(a, b, 1);
1798	}
1799
1800	// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #2 {
1801	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
1802	// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
1803	// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
1804	// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
1805	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
1806	// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
1807	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
1808	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
1809	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1810	// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
1811	// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
1812	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
1813	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
1814	// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
1815	// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
1816	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
1817	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
1818	// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
1819	// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
1820	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
1821	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
1822	// CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]])
1823	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }*
1824	// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
1825	// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
1826	// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
1827	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1828	// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
1829	// CHECK: ret %struct.float64x1x2_t [[TMP13]]
// Calls vld2_lane_f64(a, b, 0). The FileCheck directives preceding this
// function expect %a to be bitcast to i8* and the call to become
// @llvm.aarch64.neon.ld2lane.v1f64.p0i8 with lane operand i64 0.
1830	float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
1831	return vld2_lane_f64(a, b, 0);
1832	}
1833
1834	// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
1835	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
1836	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
1837	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
1838	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
1839	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
1840	// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
1841	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
1842	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
1843	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1844	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
1845	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
1846	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
1847	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1848	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
1849	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
1850	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1851	// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
1852	// CHECK: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
1853	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
1854	// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
1855	// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
1856	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
1857	// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
1858	// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
// Calls vld2_lane_p8(a, b, 7). The FileCheck directives preceding this
// function expect the call to become @llvm.aarch64.neon.ld2lane.v8i8.p0i8
// with lane operand i64 7 and %a passed directly as the pointer argument.
1859	poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
1860	return vld2_lane_p8(a, b, 7);
1861	}
1862
1863	// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
1864	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
1865	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
1866	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
1867	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
1868	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
1869	// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
1870	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
1871	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
1872	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1873	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
1874	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1875	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
1876	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
1877	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
1878	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
1879	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
1880	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
1881	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
1882	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
1883	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
1884	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
1885	// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
1886	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
1887	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
1888	// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
1889	// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
1890	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1891	// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
1892	// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
1893	poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) { // Codegen test: passes a and b straight to vld2_lane_p16; the FileCheck patterns above pin the resulting IR.
1894	return vld2_lane_p16(a, b, 3); // lane 3 is the highest lane of the <4 x i16> vectors named in the expected IR
1895	}
1896
1897	// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
1898	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
1899	// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
1900	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
1901	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
1902	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
1903	// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
1904	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
1905	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
1906	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
1907	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
1908	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
1909	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
1910	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
1911	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1912	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
1913	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
1914	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
1915	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1916	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
1917	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
1918	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
1919	// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
1920	// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
1921	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
1922	// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
1923	// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
1924	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
1925	// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
1926	// CHECK: ret %struct.poly64x1x2_t [[TMP13]]
1927	poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) { // Codegen test: passes a and b straight to vld2_lane_p64; the FileCheck patterns above pin the resulting IR.
1928	return vld2_lane_p64(a, b, 0); // lane 0 is the only lane of the <1 x i64> vectors named in the expected IR
1929	}
1930
1931	// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
1932	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
1933	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
1934	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
1935	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
1936	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
1937	// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
1938	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
1939	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
1940	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
1941	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
1942	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
1943	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1944	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
1945	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
1946	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
1947	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1948	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
1949	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
1950	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
1951	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1952	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
1953	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
1954	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
1955	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
1956	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
1957	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
1958	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
1959	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
1960	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
1961	// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
1962	// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
1963	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
1964	// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
1965	// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
1966	uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_u16; the FileCheck patterns above pin the resulting IR.
1967	return vld3q_lane_u16(a, b, 7); // lane 7 is the highest lane of the <8 x i16> vectors named in the expected IR
1968	}
1969
1970	// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
1971	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
1972	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
1973	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
1974	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
1975	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
1976	// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
1977	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
1978	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
1979	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
1980	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
1981	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
1982	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
1983	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
1984	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
1985	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
1986	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
1987	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
1988	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
1989	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
1990	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
1991	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
1992	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
1993	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
1994	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
1995	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
1996	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
1997	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
1998	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
1999	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
2000	// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
2001	// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
2002	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2003	// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
2004	// CHECK: ret %struct.uint32x4x3_t [[TMP16]]
2005	uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_u32; the FileCheck patterns above pin the resulting IR.
2006	return vld3q_lane_u32(a, b, 3); // lane 3 is the highest lane of the <4 x i32> vectors named in the expected IR
2007	}
2008
2009	// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
2010	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
2011	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
2012	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
2013	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
2014	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
2015	// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
2016	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
2017	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
2018	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2019	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
2020	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
2021	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
2022	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
2023	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
2024	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2025	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
2026	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
2027	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
2028	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2029	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
2030	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
2031	// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
2032	// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2033	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2034	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2035	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2036	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
2037	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
2038	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
2039	// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
2040	// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
2041	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2042	// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
2043	// CHECK: ret %struct.uint64x2x3_t [[TMP16]]
2044	uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_u64; the FileCheck patterns above pin the resulting IR.
2045	return vld3q_lane_u64(a, b, 1); // lane 1 is the highest lane of the <2 x i64> vectors named in the expected IR
2046	}
2047
2048	// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
2049	// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
2050	// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
2051	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
2052	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
2053	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
2054	// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
2055	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
2056	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
2057	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2058	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
2059	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
2060	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
2061	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
2062	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
2063	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2064	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
2065	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
2066	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
2067	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2068	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
2069	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
2070	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
2071	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2072	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2073	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2074	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2075	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
2076	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
2077	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
2078	// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
2079	// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
2080	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2081	// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
2082	// CHECK: ret %struct.int16x8x3_t [[TMP16]]
2083	int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_s16; the FileCheck patterns above pin the resulting IR.
2084	return vld3q_lane_s16(a, b, 7); // lane 7 is the highest lane of the <8 x i16> vectors named in the expected IR
2085	}
2086
2087	// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
2088	// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
2089	// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
2090	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
2091	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
2092	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
2093	// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
2094	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
2095	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
2096	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2097	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
2098	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
2099	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
2100	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
2101	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
2102	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
2103	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
2104	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
2105	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
2106	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
2107	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
2108	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
2109	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
2110	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
2111	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
2112	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
2113	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
2114	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
2115	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
2116	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
2117	// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
2118	// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
2119	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2120	// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
2121	// CHECK: ret %struct.int32x4x3_t [[TMP16]]
2122	int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_s32; the FileCheck patterns above pin the resulting IR.
2123	return vld3q_lane_s32(a, b, 3); // lane 3 is the highest lane of the <4 x i32> vectors named in the expected IR
2124	}
2125
2126	// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
2127	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
2128	// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
2129	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
2130	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
2131	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
2132	// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
2133	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
2134	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
2135	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2136	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
2137	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
2138	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
2139	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
2140	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
2141	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2142	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
2143	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
2144	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
2145	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2146	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
2147	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
2148	// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
2149	// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2150	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2151	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2152	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2153	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
2154	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
2155	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
2156	// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
2157	// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
2158	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2159	// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
2160	// CHECK: ret %struct.int64x2x3_t [[TMP16]]
2161	int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_s64; the FileCheck patterns above pin the resulting IR.
2162	return vld3q_lane_s64(a, b, 1); // lane 1 is the highest lane of the <2 x i64> vectors named in the expected IR
2163	}
2164
2165	// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #2 {
2166	// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
2167	// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
2168	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
2169	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
2170	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
2171	// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
2172	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
2173	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
2174	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2175	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
2176	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
2177	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
2178	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
2179	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
2180	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
2181	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
2182	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
2183	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
2184	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
2185	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
2186	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
2187	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
2188	// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
2189	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
2190	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
2191	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
2192	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0i8(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, i8* [[TMP3]])
2193	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half> }*
2194	// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]]
2195	// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
2196	// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
2197	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2198	// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
2199	// CHECK: ret %struct.float16x8x3_t [[TMP16]]
2200	float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_f16; the FileCheck patterns above pin the resulting IR.
2201	return vld3q_lane_f16(a, b, 7); // lane 7 is the highest lane of the <8 x half> vectors named in the expected IR
2202	}
2203
2204	// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #2 {
2205	// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
2206	// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
2207	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
2208	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
2209	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
2210	// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
2211	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
2212	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
2213	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2214	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
2215	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
2216	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
2217	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
2218	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
2219	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
2220	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
2221	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
2222	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
2223	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
2224	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
2225	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
2226	// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
2227	// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
2228	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
2229	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
2230	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
2231	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]])
2232	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
2233	// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
2234	// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
2235	// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
2236	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2237	// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
2238	// CHECK: ret %struct.float32x4x3_t [[TMP16]]
2239	float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) { // Codegen test: passes a and b straight to vld3q_lane_f32; the FileCheck patterns above pin the resulting IR.
2240	return vld3q_lane_f32(a, b, 3); // lane 3 is the highest lane of the <4 x float> vectors named in the expected IR
2241	}
2242
2243	// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #2 {
2244	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
2245	// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
2246	// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
2247	// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
2248	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
2249	// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
2250	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
2251	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
2252	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2253	// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
2254	// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
2255	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
2256	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
2257	// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
2258	// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
2259	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
2260	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
2261	// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
2262	// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
2263	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
2264	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
2265	// CHECK: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
2266	// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
2267	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
2268	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
2269	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
2270	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]])
2271	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }*
2272	// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
2273	// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
2274	// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
2275	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2276	// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
2277	// CHECK: ret %struct.float64x2x3_t [[TMP16]]
2278	float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
	// Codegen test for vld3q_lane_f64 with constant lane index 1. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v2f64.p0i8
	// with lane operand `i64 1`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2279	return vld3q_lane_f64(a, b, 1);
2280	}
2281
2282	// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
2283	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
2284	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
2285	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
2286	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
2287	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
2288	// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
2289	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
2290	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
2291	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2292	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
2293	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
2294	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
2295	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
2296	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
2297	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
2298	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
2299	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
2300	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
2301	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
2302	// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
2303	// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
2304	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
2305	// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
2306	// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
2307	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
2308	// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
2309	// CHECK: ret %struct.poly8x16x3_t [[TMP9]]
2310	poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
	// Codegen test for vld3q_lane_p8 with constant lane index 15. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v16i8.p0i8
	// with lane operand `i64 15` and %a passed as the pointer operand. Keep the
	// body a single intrinsic call: the directives describe the emitted IR
	// instruction by instruction.
2311	return vld3q_lane_p8(a, b, 15);
2312	}
2313
2314	// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
2315	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
2316	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
2317	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
2318	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
2319	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
2320	// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
2321	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
2322	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
2323	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2324	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
2325	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
2326	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
2327	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
2328	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
2329	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2330	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
2331	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
2332	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
2333	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2334	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
2335	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
2336	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
2337	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2338	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2339	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2340	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2341	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
2342	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
2343	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
2344	// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
2345	// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
2346	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2347	// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
2348	// CHECK: ret %struct.poly16x8x3_t [[TMP16]]
2349	poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
	// Codegen test for vld3q_lane_p16 with constant lane index 7. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v8i16.p0i8
	// with lane operand `i64 7`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2350	return vld3q_lane_p16(a, b, 7);
2351	}
2352
2353	// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
2354	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
2355	// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
2356	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
2357	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
2358	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
2359	// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
2360	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
2361	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
2362	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
2363	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
2364	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
2365	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
2366	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
2367	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
2368	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
2369	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
2370	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
2371	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
2372	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
2373	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
2374	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
2375	// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
2376	// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
2377	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
2378	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
2379	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
2380	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
2381	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
2382	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
2383	// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
2384	// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
2385	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
2386	// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
2387	// CHECK: ret %struct.poly64x2x3_t [[TMP16]]
2388	poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
	// Codegen test for vld3q_lane_p64 with constant lane index 1. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v2i64.p0i8
	// with lane operand `i64 1`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2389	return vld3q_lane_p64(a, b, 1);
2390	}
2391
2392	// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
2393	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
2394	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
2395	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
2396	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
2397	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
2398	// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
2399	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
2400	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
2401	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2402	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
2403	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
2404	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
2405	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
2406	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
2407	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
2408	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
2409	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
2410	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
2411	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
2412	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
2413	// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
2414	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
2415	// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
2416	// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
2417	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
2418	// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
2419	// CHECK: ret %struct.uint8x8x3_t [[TMP9]]
2420	uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
	// Codegen test for vld3_lane_u8 with constant lane index 7. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v8i8.p0i8
	// with lane operand `i64 7` and %a passed as the pointer operand. Keep the
	// body a single intrinsic call: the directives describe the emitted IR
	// instruction by instruction.
2421	return vld3_lane_u8(a, b, 7);
2422	}
2423
2424	// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
2425	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
2426	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
2427	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
2428	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
2429	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
2430	// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
2431	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
2432	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
2433	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2434	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
2435	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
2436	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
2437	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
2438	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
2439	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2440	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
2441	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
2442	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
2443	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2444	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
2445	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
2446	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
2447	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2448	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2449	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2450	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2451	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
2452	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
2453	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
2454	// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
2455	// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
2456	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2457	// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
2458	// CHECK: ret %struct.uint16x4x3_t [[TMP16]]
2459	uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
	// Codegen test for vld3_lane_u16 with constant lane index 3. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v4i16.p0i8
	// with lane operand `i64 3`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2460	return vld3_lane_u16(a, b, 3);
2461	}
2462
2463	// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
2464	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
2465	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
2466	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
2467	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
2468	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
2469	// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
2470	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
2471	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
2472	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2473	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
2474	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
2475	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
2476	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
2477	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
2478	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2479	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
2480	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
2481	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
2482	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2483	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
2484	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
2485	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
2486	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2487	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2488	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2489	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2490	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
2491	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
2492	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
2493	// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
2494	// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
2495	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2496	// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
2497	// CHECK: ret %struct.uint32x2x3_t [[TMP16]]
2498	uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
	// Codegen test for vld3_lane_u32 with constant lane index 1. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v2i32.p0i8
	// with lane operand `i64 1`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2499	return vld3_lane_u32(a, b, 1);
2500	}
2501
2502	// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
2503	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
2504	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
2505	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
2506	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
2507	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
2508	// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
2509	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
2510	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
2511	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2512	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
2513	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
2514	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
2515	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
2516	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
2517	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2518	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
2519	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
2520	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
2521	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2522	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
2523	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
2524	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
2525	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2526	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2527	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2528	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2529	// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
2530	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
2531	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
2532	// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
2533	// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
2534	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2535	// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
2536	// CHECK: ret %struct.uint64x1x3_t [[TMP16]]
2537	uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
	// Codegen test for vld3_lane_u64 with constant lane index 0. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v1i64.p0i8
	// with lane operand `i64 0`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2538	return vld3_lane_u64(a, b, 0);
2539	}
2540
2541	// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
2542	// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
2543	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
2544	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
2545	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
2546	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
2547	// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
2548	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
2549	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
2550	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2551	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
2552	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
2553	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
2554	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
2555	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
2556	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
2557	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
2558	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
2559	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
2560	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
2561	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
2562	// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
2563	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
2564	// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
2565	// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
2566	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
2567	// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
2568	// CHECK: ret %struct.int8x8x3_t [[TMP9]]
2569	int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
	// Codegen test for vld3_lane_s8 with constant lane index 7. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v8i8.p0i8
	// with lane operand `i64 7` and %a passed as the pointer operand. Keep the
	// body a single intrinsic call: the directives describe the emitted IR
	// instruction by instruction.
2570	return vld3_lane_s8(a, b, 7);
2571	}
2572
2573	// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
2574	// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
2575	// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
2576	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
2577	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
2578	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
2579	// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
2580	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
2581	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
2582	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2583	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
2584	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
2585	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
2586	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
2587	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
2588	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2589	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
2590	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
2591	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
2592	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2593	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
2594	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
2595	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
2596	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2597	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2598	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2599	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2600	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
2601	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
2602	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
2603	// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
2604	// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
2605	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2606	// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
2607	// CHECK: ret %struct.int16x4x3_t [[TMP16]]
2608	int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
	// Codegen test for vld3_lane_s16 with constant lane index 3. The FileCheck
	// directives above expect a call to @llvm.aarch64.neon.ld3lane.v4i16.p0i8
	// with lane operand `i64 3`. Keep the body a single intrinsic call: the
	// directives describe the emitted IR instruction by instruction.
2609	return vld3_lane_s16(a, b, 3);
2610	}
2611
2612	// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
2613	// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
2614	// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
2615	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
2616	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
2617	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
2618	// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
2619	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
2620	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
2621	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2622	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
2623	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
2624	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
2625	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
2626	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
2627	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
2628	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
2629	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
2630	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
2631	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
2632	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
2633	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
2634	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
2635	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
2636	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
2637	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
2638	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
2639	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
2640	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
2641	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
2642	// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
2643	// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
2644	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2645	// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
2646	// CHECK: ret %struct.int32x2x3_t [[TMP16]]
2647	int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_s32.
2648	return vld3_lane_s32(a, b, 1);  // Lane 1 corresponds to the `i64 1` operand of the ld3lane.v2i32 call in the expected IR above.
2649	}
2650
2651	// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
2652	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
2653	// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
2654	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
2655	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
2656	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
2657	// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
2658	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
2659	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
2660	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2661	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
2662	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
2663	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
2664	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
2665	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
2666	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2667	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
2668	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
2669	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
2670	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2671	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
2672	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
2673	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
2674	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2675	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2676	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2677	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2678	// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
2679	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
2680	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
2681	// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
2682	// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
2683	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2684	// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
2685	// CHECK: ret %struct.int64x1x3_t [[TMP16]]
2686	int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_s64.
2687	return vld3_lane_s64(a, b, 0);  // Lane 0 corresponds to the `i64 0` operand of the ld3lane.v1i64 call in the expected IR above.
2688	}
2689
2690	// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #2 {
2691	// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
2692	// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
2693	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
2694	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
2695	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
2696	// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
2697	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
2698	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
2699	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2700	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
2701	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
2702	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
2703	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
2704	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
2705	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
2706	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
2707	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
2708	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
2709	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
2710	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
2711	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
2712	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
2713	// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
2714	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
2715	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
2716	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
2717	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0i8(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, i8* [[TMP3]])
2718	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half> }*
2719	// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]]
2720	// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
2721	// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
2722	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2723	// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
2724	// CHECK: ret %struct.float16x4x3_t [[TMP16]]
2725	float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_f16.
2726	return vld3_lane_f16(a, b, 3);  // Lane 3 corresponds to the `i64 3` operand of the ld3lane.v4f16 call in the expected IR above.
2727	}
2728
2729	// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #2 {
2730	// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
2731	// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
2732	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
2733	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
2734	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
2735	// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
2736	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
2737	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
2738	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2739	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
2740	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
2741	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
2742	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
2743	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
2744	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
2745	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
2746	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
2747	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
2748	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
2749	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
2750	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
2751	// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
2752	// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
2753	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
2754	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
2755	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
2756	// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
2757	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
2758	// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
2759	// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
2760	// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
2761	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2762	// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
2763	// CHECK: ret %struct.float32x2x3_t [[TMP16]]
2764	float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_f32.
2765	return vld3_lane_f32(a, b, 1);  // Lane 1 corresponds to the `i64 1` operand of the ld3lane.v2f32 call in the expected IR above.
2766	}
2767
2768	// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #2 {
2769	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
2770	// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
2771	// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
2772	// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
2773	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
2774	// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
2775	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
2776	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
2777	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2778	// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
2779	// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
2780	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
2781	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
2782	// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
2783	// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
2784	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
2785	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
2786	// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
2787	// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
2788	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
2789	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
2790	// CHECK: [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
2791	// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
2792	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
2793	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
2794	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
2795	// CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]])
2796	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }*
2797	// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
2798	// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
2799	// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
2800	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2801	// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
2802	// CHECK: ret %struct.float64x1x3_t [[TMP16]]
2803	float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_f64.
2804	return vld3_lane_f64(a, b, 0);  // Lane 0 corresponds to the `i64 0` operand of the ld3lane.v1f64 call in the expected IR above.
2805	}
2806
2807	// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
2808	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
2809	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
2810	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
2811	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
2812	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
2813	// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
2814	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
2815	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
2816	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2817	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
2818	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
2819	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
2820	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
2821	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
2822	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
2823	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
2824	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
2825	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
2826	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
2827	// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
2828	// CHECK: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
2829	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
2830	// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
2831	// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
2832	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
2833	// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
2834	// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
2835	poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_p8.
2836	return vld3_lane_p8(a, b, 7);  // Lane 7 corresponds to the `i64 7` operand of the ld3lane.v8i8 call in the expected IR above.
2837	}
2838
2839	// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
2840	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
2841	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
2842	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
2843	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
2844	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
2845	// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
2846	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
2847	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
2848	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2849	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
2850	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
2851	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
2852	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
2853	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
2854	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
2855	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
2856	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
2857	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
2858	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
2859	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
2860	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
2861	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
2862	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
2863	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
2864	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
2865	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
2866	// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
2867	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
2868	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
2869	// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
2870	// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
2871	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2872	// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
2873	// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
2874	poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_p16.
2875	return vld3_lane_p16(a, b, 3);  // Lane 3 corresponds to the `i64 3` operand of the ld3lane.v4i16 call in the expected IR above.
2876	}
2877
2878	// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
2879	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
2880	// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
2881	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
2882	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
2883	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
2884	// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
2885	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
2886	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
2887	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
2888	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
2889	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
2890	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
2891	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
2892	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
2893	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
2894	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
2895	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
2896	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
2897	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
2898	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
2899	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
2900	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
2901	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
2902	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
2903	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
2904	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
2905	// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
2906	// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
2907	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
2908	// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
2909	// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
2910	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
2911	// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
2912	// CHECK: ret %struct.poly64x1x3_t [[TMP16]]
2913	poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld3_lane_p64.
2914	return vld3_lane_p64(a, b, 0);  // Lane 0 corresponds to the `i64 0` operand of the ld3lane.v1i64 call in the expected IR above.
2915	}
2916
2917	// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
2918	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
2919	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
2920	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
2921	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
2922	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
2923	// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
2924	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
2925	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
2926	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
2927	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
2928	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2929	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
2930	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
2931	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2932	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
2933	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
2934	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2935	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
2936	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
2937	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2938	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
2939	// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
2940	// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
2941	// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
2942	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
2943	// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
2944	// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
2945	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
2946	// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
2947	// CHECK: ret %struct.uint8x16x4_t [[TMP10]]
2948	uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld4q_lane_u8.
2949	return vld4q_lane_u8(a, b, 15);  // Lane 15 corresponds to the `i64 15` operand of the ld4lane.v16i8 call in the expected IR above.
2950	}
2951
2952	// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
2953	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
2954	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
2955	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
2956	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
2957	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
2958	// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
2959	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
2960	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
2961	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
2962	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
2963	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
2964	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2965	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
2966	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
2967	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
2968	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2969	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
2970	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
2971	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
2972	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2973	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
2974	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
2975	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
2976	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2977	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
2978	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
2979	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
2980	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
2981	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
2982	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
2983	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
2984	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
2985	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
2986	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
2987	// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
2988	// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
2989	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
2990	// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
2991	// CHECK: ret %struct.uint16x8x4_t [[TMP19]]
2992	uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {  // Codegen test wrapper: forwards a and b unchanged to vld4q_lane_u16.
2993	return vld4q_lane_u16(a, b, 7);  // Lane 7 corresponds to the `i64 7` operand of the ld4lane.v8i16 call in the expected IR above.
2994	}
2995
2996	// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
2997	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
2998	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
2999	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
3000	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
3001	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
3002	// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
3003	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
3004	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
3005	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3006	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
3007	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
3008	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
3009	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
3010	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
3011	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
3012	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
3013	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
3014	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
3015	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
3016	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
3017	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
3018	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
3019	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
3020	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
3021	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
3022	// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
3023	// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
3024	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
3025	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
3026	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
3027	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
3028	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
3029	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
3030	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
3031	// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
3032	// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
3033	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3034	// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
3035	// CHECK: ret %struct.uint32x4x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_u32 with constant lane index 3.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v4i32.p0i8 with lane operand i64 3.
3036	uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
3037	return vld4q_lane_u32(a, b, 3);
3038	}
3039
3040	// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
3041	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
3042	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
3043	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
3044	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
3045	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
3046	// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
3047	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
3048	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
3049	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3050	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
3051	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
3052	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
3053	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
3054	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
3055	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
3056	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
3057	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
3058	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
3059	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
3060	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
3061	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
3062	// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
3063	// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
3064	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
3065	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
3066	// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
3067	// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
3068	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
3069	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
3070	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
3071	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
3072	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
3073	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
3074	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
3075	// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
3076	// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
3077	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3078	// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
3079	// CHECK: ret %struct.uint64x2x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_u64 with constant lane index 1.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v2i64.p0i8 with lane operand i64 1.
3080	uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
3081	return vld4q_lane_u64(a, b, 1);
3082	}
3083
3084	// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
3085	// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
3086	// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
3087	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
3088	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
3089	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
3090	// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
3091	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
3092	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
3093	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3094	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
3095	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
3096	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
3097	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
3098	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
3099	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
3100	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
3101	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
3102	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
3103	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
3104	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
3105	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
3106	// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
3107	// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
3108	// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
3109	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
3110	// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
3111	// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
3112	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
3113	// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
3114	// CHECK: ret %struct.int8x16x4_t [[TMP10]]
// Test driver: forwards a and b to vld4q_lane_s8 with constant lane index 15.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v16i8.p0i8 with lane operand i64 15; in this
// 8-bit variant %a is passed to the intrinsic directly, with no pointer bitcast.
3115	int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
3116	return vld4q_lane_s8(a, b, 15);
3117	}
3118
3119	// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
3120	// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
3121	// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
3122	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
3123	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
3124	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
3125	// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
3126	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
3127	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
3128	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3129	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
3130	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
3131	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
3132	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
3133	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
3134	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
3135	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
3136	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
3137	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
3138	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
3139	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
3140	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
3141	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
3142	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
3143	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
3144	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
3145	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
3146	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
3147	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
3148	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
3149	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
3150	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
3151	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
3152	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
3153	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
3154	// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
3155	// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
3156	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3157	// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
3158	// CHECK: ret %struct.int16x8x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_s16 with constant lane index 7.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v8i16.p0i8 with lane operand i64 7.
3159	int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
3160	return vld4q_lane_s16(a, b, 7);
3161	}
3162
3163	// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
3164	// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
3165	// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
3166	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
3167	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
3168	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
3169	// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
3170	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
3171	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
3172	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3173	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
3174	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
3175	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
3176	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
3177	// CHECK: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
3178	// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
3179	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
3180	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
3181	// CHECK: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
3182	// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
3183	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
3184	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
3185	// CHECK: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
3186	// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
3187	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
3188	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
3189	// CHECK: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
3190	// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
3191	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
3192	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
3193	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
3194	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
3195	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
3196	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
3197	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
3198	// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
3199	// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
3200	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3201	// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
3202	// CHECK: ret %struct.int32x4x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_s32 with constant lane index 3.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v4i32.p0i8 with lane operand i64 3.
3203	int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
3204	return vld4q_lane_s32(a, b, 3);
3205	}
3206
3207	// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
3208	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
3209	// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
3210	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
3211	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
3212	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
3213	// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
3214	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
3215	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
3216	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3217	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
3218	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
3219	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
3220	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
3221	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
3222	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
3223	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
3224	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
3225	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
3226	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
3227	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
3228	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
3229	// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
3230	// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
3231	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
3232	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
3233	// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
3234	// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
3235	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
3236	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
3237	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
3238	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
3239	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
3240	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
3241	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
3242	// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
3243	// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
3244	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3245	// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
3246	// CHECK: ret %struct.int64x2x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_s64 with constant lane index 1.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v2i64.p0i8 with lane operand i64 1.
3247	int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
3248	return vld4q_lane_s64(a, b, 1);
3249	}
3250
3251	// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #2 {
3252	// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
3253	// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
3254	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
3255	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
3256	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
3257	// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
3258	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
3259	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
3260	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3261	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
3262	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
3263	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
3264	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
3265	// CHECK: [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
3266	// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
3267	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
3268	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
3269	// CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
3270	// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
3271	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
3272	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
3273	// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
3274	// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
3275	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
3276	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
3277	// CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
3278	// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
3279	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
3280	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
3281	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
3282	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
3283	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0i8(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, i8* [[TMP3]])
3284	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }*
3285	// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]]
3286	// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
3287	// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
3288	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3289	// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
3290	// CHECK: ret %struct.float16x8x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_f16 with constant lane index 7.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v8f16.p0i8 with lane operand i64 7.
3291	float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
3292	return vld4q_lane_f16(a, b, 7);
3293	}
3294
3295	// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #2 {
3296	// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
3297	// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
3298	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
3299	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
3300	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
3301	// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
3302	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
3303	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
3304	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3305	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
3306	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
3307	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
3308	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
3309	// CHECK: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
3310	// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
3311	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
3312	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
3313	// CHECK: [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
3314	// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
3315	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
3316	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
3317	// CHECK: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
3318	// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
3319	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
3320	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
3321	// CHECK: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
3322	// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
3323	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
3324	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
3325	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
3326	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
3327	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]])
3328	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
3329	// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
3330	// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
3331	// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
3332	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3333	// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
3334	// CHECK: ret %struct.float32x4x4_t [[TMP19]]
// Test driver: forwards a and b to vld4q_lane_f32 with constant lane index 3.
// The expectation lines above pin the lowering to a call to
// @llvm.aarch64.neon.ld4lane.v4f32.p0i8 with lane operand i64 3.
3335	float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
3336	return vld4q_lane_f32(a, b, 3);
3337	}
3338
3339	// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #2 {
3340	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
3341	// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
3342	// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
3343	// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
3344	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
3345	// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
3346	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
3347	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
3348	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3349	// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
3350	// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
3351	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
3352	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
3353	// CHECK: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
3354	// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
3355	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
3356	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
3357	// CHECK: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
3358	// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
3359	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
3360	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
3361	// CHECK: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
3362	// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
3363	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
3364	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
3365	// CHECK: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
3366	// CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
3367	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
3368	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
3369	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
3370	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
3371	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]])
3372	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
3373	// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
3374	// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
3375	// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
3376	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3377	// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
3378	// CHECK: ret %struct.float64x2x4_t [[TMP19]]
3379	float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) { // Codegen test for vld4q_lane_f64 on a float64x2x4_t (four <2 x double> vectors).
3380	return vld4q_lane_f64(a, b, 1); // Lane 1 is the last element of <2 x double>; the patterns above expect it as the "i64 1" operand of ld4lane.v2f64.
3381	}
3382
3383	// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
3384	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
3385	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
3386	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
3387	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
3388	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
3389	// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
3390	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
3391	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
3392	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3393	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
3394	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
3395	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
3396	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
3397	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
3398	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
3399	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
3400	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
3401	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
3402	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
3403	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
3404	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
3405	// CHECK: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
3406	// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
3407	// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
3408	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
3409	// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
3410	// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
3411	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
3412	// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
3413	// CHECK: ret %struct.poly8x16x4_t [[TMP10]]
3414	poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) { // Codegen test for vld4q_lane_p8 on a poly8x16x4_t (four <16 x i8> vectors).
3415	return vld4q_lane_p8(a, b, 15); // Lane 15 is the last element of <16 x i8>; the patterns above expect %a to reach ld4lane.v16i8 directly.
3416	}
3417
3418	// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
3419	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
3420	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
3421	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
3422	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
3423	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
3424	// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
3425	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
3426	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
3427	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3428	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
3429	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
3430	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
3431	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
3432	// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
3433	// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
3434	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
3435	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
3436	// CHECK: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
3437	// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
3438	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
3439	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
3440	// CHECK: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
3441	// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
3442	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
3443	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
3444	// CHECK: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
3445	// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
3446	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
3447	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
3448	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
3449	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
3450	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
3451	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
3452	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
3453	// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
3454	// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
3455	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3456	// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
3457	// CHECK: ret %struct.poly16x8x4_t [[TMP19]]
3458	poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) { // Codegen test for vld4q_lane_p16 on a poly16x8x4_t (four <8 x i16> vectors).
3459	return vld4q_lane_p16(a, b, 7); // Lane 7 is the last element of <8 x i16>; the patterns above expect it as the "i64 7" operand of ld4lane.v8i16.
3460	}
3461
3462	// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
3463	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
3464	// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
3465	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
3466	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
3467	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
3468	// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
3469	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
3470	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
3471	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
3472	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
3473	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
3474	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
3475	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
3476	// CHECK: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
3477	// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
3478	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
3479	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
3480	// CHECK: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
3481	// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
3482	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
3483	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
3484	// CHECK: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
3485	// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
3486	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
3487	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
3488	// CHECK: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
3489	// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
3490	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
3491	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
3492	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
3493	// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
3494	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
3495	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
3496	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
3497	// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
3498	// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
3499	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
3500	// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
3501	// CHECK: ret %struct.poly64x2x4_t [[TMP19]]
3502	poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) { // Codegen test for vld4q_lane_p64 on a poly64x2x4_t (four <2 x i64> vectors).
3503	return vld4q_lane_p64(a, b, 1); // Lane 1 is the last element of <2 x i64>; the patterns above expect it as the "i64 1" operand of ld4lane.v2i64.
3504	}
3505
3506	// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
3507	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
3508	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
3509	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
3510	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
3511	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
3512	// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
3513	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
3514	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
3515	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3516	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
3517	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
3518	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
3519	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
3520	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
3521	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
3522	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
3523	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
3524	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
3525	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
3526	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
3527	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
3528	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
3529	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
3530	// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
3531	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
3532	// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
3533	// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
3534	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
3535	// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
3536	// CHECK: ret %struct.uint8x8x4_t [[TMP10]]
3537	uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) { // Codegen test for vld4_lane_u8 on a uint8x8x4_t (four <8 x i8> vectors).
3538	return vld4_lane_u8(a, b, 7); // Lane 7 is the last element of <8 x i8>; the patterns above expect %a to reach ld4lane.v8i8 directly.
3539	}
3540
3541	// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
3542	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
3543	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
3544	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
3545	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
3546	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
3547	// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
3548	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
3549	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
3550	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3551	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
3552	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
3553	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
3554	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
3555	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
3556	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3557	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
3558	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
3559	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
3560	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3561	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
3562	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
3563	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
3564	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3565	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
3566	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
3567	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
3568	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3569	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3570	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3571	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3572	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3573	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
3574	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
3575	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
3576	// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
3577	// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
3578	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3579	// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
3580	// CHECK: ret %struct.uint16x4x4_t [[TMP19]]
3581	uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) { // Codegen test for vld4_lane_u16 on a uint16x4x4_t (four <4 x i16> vectors).
3582	return vld4_lane_u16(a, b, 3); // Lane 3 is the last element of <4 x i16>; the patterns above expect it as the "i64 3" operand of ld4lane.v4i16.
3583	}
3584
3585	// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
3586	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
3587	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
3588	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
3589	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
3590	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
3591	// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
3592	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
3593	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
3594	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3595	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
3596	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
3597	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
3598	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
3599	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
3600	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
3601	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
3602	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
3603	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
3604	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
3605	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
3606	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
3607	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
3608	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
3609	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
3610	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
3611	// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
3612	// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
3613	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
3614	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
3615	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
3616	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
3617	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
3618	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
3619	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
3620	// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
3621	// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
3622	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3623	// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
3624	// CHECK: ret %struct.uint32x2x4_t [[TMP19]]
3625	uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) { // Codegen test for vld4_lane_u32 on a uint32x2x4_t (four <2 x i32> vectors).
3626	return vld4_lane_u32(a, b, 1); // Lane 1 is the last element of <2 x i32>; the patterns above expect it as the "i64 1" operand of ld4lane.v2i32.
3627	}
3628
3629	// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
3630	// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
3631	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
3632	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
3633	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
3634	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
3635	// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
3636	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
3637	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
3638	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3639	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
3640	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
3641	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
3642	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
3643	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
3644	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3645	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
3646	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
3647	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
3648	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3649	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
3650	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
3651	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
3652	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3653	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
3654	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
3655	// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
3656	// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3657	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3658	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3659	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3660	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3661	// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
3662	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
3663	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
3664	// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
3665	// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
3666	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3667	// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
3668	// CHECK: ret %struct.uint64x1x4_t [[TMP19]]
3669	uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) { // Codegen test for vld4_lane_u64 on a uint64x1x4_t (four <1 x i64> vectors).
3670	return vld4_lane_u64(a, b, 0); // Lane 0 is the only element of <1 x i64>; the patterns above expect it as the "i64 0" operand of ld4lane.v1i64.
3671	}
3672
3673	// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
3674	// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
3675	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
3676	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
3677	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
3678	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
3679	// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
3680	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
3681	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
3682	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3683	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
3684	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
3685	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
3686	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
3687	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
3688	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
3689	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
3690	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
3691	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
3692	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
3693	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
3694	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
3695	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
3696	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
3697	// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
3698	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
3699	// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
3700	// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
3701	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
3702	// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
3703	// CHECK: ret %struct.int8x8x4_t [[TMP10]]
3704	int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) { // Codegen test for vld4_lane_s8 on an int8x8x4_t (four <8 x i8> vectors).
3705	return vld4_lane_s8(a, b, 7); // Lane 7 is the last element of <8 x i8>; the patterns above expect %a to reach ld4lane.v8i8 directly.
3706	}
3707
3708	// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
3709	// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
3710	// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
3711	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
3712	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
3713	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
3714	// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
3715	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
3716	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
3717	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3718	// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
3719	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
3720	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
3721	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
3722	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
3723	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
3724	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
3725	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
3726	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
3727	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
3728	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
3729	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
3730	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
3731	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
3732	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
3733	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
3734	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
3735	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
3736	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
3737	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
3738	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
3739	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
3740	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
3741	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
3742	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
3743	// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
3744	// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
3745	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3746	// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
3747	// CHECK: ret %struct.int16x4x4_t [[TMP19]]
// Test driver: calls vld4_lane_s16 on pointer `a` and the four-vector struct
// `b` with constant lane index 3 (the last lane of a 4-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v4i16 call, so the body must stay exactly as written.
3748	int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
3749	return vld4_lane_s16(a, b, 3);
3750	}
3751
3752	// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
3753	// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
3754	// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
3755	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
3756	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
3757	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
3758	// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
3759	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
3760	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
3761	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3762	// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
3763	// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
3764	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
3765	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
3766	// CHECK: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
3767	// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
3768	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
3769	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
3770	// CHECK: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
3771	// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
3772	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
3773	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
3774	// CHECK: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
3775	// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
3776	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
3777	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
3778	// CHECK: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
3779	// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
3780	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
3781	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
3782	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
3783	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
3784	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
3785	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
3786	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
3787	// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
3788	// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
3789	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3790	// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
3791	// CHECK: ret %struct.int32x2x4_t [[TMP19]]
// Test driver: calls vld4_lane_s32 on pointer `a` and the four-vector struct
// `b` with constant lane index 1 (the last lane of a 2-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v2i32 call, so the body must stay exactly as written.
3792	int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
3793	return vld4_lane_s32(a, b, 1);
3794	}
3795
3796	// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
3797	// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
3798	// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
3799	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
3800	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
3801	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
3802	// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
3803	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
3804	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
3805	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3806	// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
3807	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
3808	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
3809	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
3810	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
3811	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
3812	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
3813	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
3814	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
3815	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
3816	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
3817	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
3818	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
3819	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
3820	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
3821	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
3822	// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
3823	// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
3824	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
3825	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
3826	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
3827	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
3828	// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
3829	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
3830	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
3831	// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
3832	// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
3833	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3834	// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
3835	// CHECK: ret %struct.int64x1x4_t [[TMP19]]
// Test driver: calls vld4_lane_s64 on pointer `a` and the four-vector struct
// `b` with constant lane index 0 (the only lane of a 1-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v1i64 call, so the body must stay exactly as written.
3836	int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
3837	return vld4_lane_s64(a, b, 0);
3838	}
3839
3840	// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #2 {
3841	// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
3842	// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
3843	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
3844	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
3845	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
3846	// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
3847	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
3848	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
3849	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3850	// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
3851	// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
3852	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
3853	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
3854	// CHECK: [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
3855	// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
3856	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
3857	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
3858	// CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
3859	// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
3860	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
3861	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
3862	// CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
3863	// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
3864	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
3865	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
3866	// CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
3867	// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
3868	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
3869	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
3870	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
3871	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
3872	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0i8(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, i8* [[TMP3]])
3873	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }*
3874	// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]]
3875	// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
3876	// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
3877	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3878	// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
3879	// CHECK: ret %struct.float16x4x4_t [[TMP19]]
// Test driver: calls vld4_lane_f16 on pointer `a` and the four-vector struct
// `b` with constant lane index 3 (the last lane of a 4-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v4f16 call, so the body must stay exactly as written.
3880	float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
3881	return vld4_lane_f16(a, b, 3);
3882	}
3883
3884	// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #2 {
3885	// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
3886	// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
3887	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
3888	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
3889	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
3890	// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
3891	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
3892	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
3893	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3894	// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
3895	// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
3896	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
3897	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
3898	// CHECK: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
3899	// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
3900	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
3901	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
3902	// CHECK: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
3903	// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
3904	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
3905	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
3906	// CHECK: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
3907	// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
3908	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
3909	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
3910	// CHECK: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
3911	// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
3912	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
3913	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
3914	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
3915	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
3916	// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]])
3917	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
3918	// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
3919	// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
3920	// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
3921	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3922	// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
3923	// CHECK: ret %struct.float32x2x4_t [[TMP19]]
// Test driver: calls vld4_lane_f32 on pointer `a` and the four-vector struct
// `b` with constant lane index 1 (the last lane of a 2-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v2f32 call, so the body must stay exactly as written.
3924	float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
3925	return vld4_lane_f32(a, b, 1);
3926	}
3927
3928	// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #2 {
3929	// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
3930	// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
3931	// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
3932	// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
3933	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
3934	// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
3935	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
3936	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
3937	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3938	// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
3939	// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
3940	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
3941	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
3942	// CHECK: [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
3943	// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
3944	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
3945	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
3946	// CHECK: [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
3947	// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
3948	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
3949	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
3950	// CHECK: [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
3951	// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
3952	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
3953	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
3954	// CHECK: [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
3955	// CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
3956	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
3957	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
3958	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
3959	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
3960	// CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, i8* [[TMP3]])
3961	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
3962	// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
3963	// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
3964	// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
3965	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
3966	// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
3967	// CHECK: ret %struct.float64x1x4_t [[TMP19]]
// Test driver: calls vld4_lane_f64 on pointer `a` and the four-vector struct
// `b` with constant lane index 0 (the only lane of a 1-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v1f64 call, so the body must stay exactly as written.
3968	float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
3969	return vld4_lane_f64(a, b, 0);
3970	}
3971
3972	// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
3973	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
3974	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
3975	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
3976	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
3977	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
3978	// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
3979	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
3980	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
3981	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
3982	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
3983	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
3984	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
3985	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
3986	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
3987	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
3988	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
3989	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
3990	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
3991	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
3992	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
3993	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
3994	// CHECK: [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
3995	// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
3996	// CHECK: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
3997	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
3998	// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
3999	// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
4000	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
4001	// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
4002	// CHECK: ret %struct.poly8x8x4_t [[TMP10]]
// Test driver: calls vld4_lane_p8 on pointer `a` and the four-vector struct
// `b` with constant lane index 7 (the last lane of an 8-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v8i8 call, which takes %a directly with no pointer bitcast, so the
// body must stay exactly as written.
4003	poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
4004	return vld4_lane_p8(a, b, 7);
4005	}
4006
4007	// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
4008	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
4009	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
4010	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
4011	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
4012	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
4013	// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
4014	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
4015	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
4016	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
4017	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
4018	// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
4019	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
4020	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
4021	// CHECK: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
4022	// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
4023	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
4024	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
4025	// CHECK: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
4026	// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
4027	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
4028	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
4029	// CHECK: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
4030	// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
4031	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
4032	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
4033	// CHECK: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
4034	// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
4035	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
4036	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
4037	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
4038	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
4039	// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
4040	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
4041	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
4042	// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
4043	// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
4044	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
4045	// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
4046	// CHECK: ret %struct.poly16x4x4_t [[TMP19]]
// Test driver: calls vld4_lane_p16 on pointer `a` and the four-vector struct
// `b` with constant lane index 3 (the last lane of a 4-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v4i16 call, so the body must stay exactly as written.
4047	poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
4048	return vld4_lane_p16(a, b, 3);
4049	}
4050
4051	// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
4052	// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
4053	// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
4054	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
4055	// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
4056	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
4057	// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
4058	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
4059	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
4060	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
4061	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
4062	// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
4063	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
4064	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
4065	// CHECK: [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
4066	// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
4067	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
4068	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
4069	// CHECK: [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
4070	// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
4071	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
4072	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
4073	// CHECK: [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
4074	// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
4075	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
4076	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
4077	// CHECK: [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
4078	// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
4079	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
4080	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
4081	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
4082	// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
4083	// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
4084	// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
4085	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
4086	// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
4087	// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
4088	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
4089	// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
4090	// CHECK: ret %struct.poly64x1x4_t [[TMP19]]
// Test driver: calls vld4_lane_p64 on pointer `a` and the four-vector struct
// `b` with constant lane index 0 (the only lane of a 1-element vector) and
// returns the result. The FileCheck directives above pin the expected
// ld4lane.v1i64 call, so the body must stay exactly as written.
4091	poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
4092	return vld4_lane_p64(a, b, 0);
4093	}
4094
4095	// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
4096	// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
4097	// CHECK: store i8 [[TMP0]], i8* %a
4098	// CHECK: ret void
// Test driver: calls vst1q_lane_u8 to store lane 15 (the last lane of a
// 16-element vector) of `b` through `a`. The FileCheck directives above
// expect a single extractelement at index 15 followed by an i8 store to %a.
4099	void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) {
4100	vst1q_lane_u8(a, b, 15);
4101	}
4102
4103	// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
4104	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4105	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4106	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4107	// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
4108	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
4109	// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
4110	// CHECK: ret void
// Test driver: calls vst1q_lane_u16 to store lane 7 (the last lane of an
// 8-element vector) of `b` through `a`. The FileCheck directives above
// expect an extractelement at index 7 and an i16 store through the
// round-tripped pointer.
4111	void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) {
4112	vst1q_lane_u16(a, b, 7);
4113	}
4114
4115	// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
4116	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4117	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4118	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4119	// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
4120	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
4121	// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
4122	// CHECK: ret void
4123	void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
4124	vst1q_lane_u32(a, b, 3);
4125	}
4126
// vst1q_lane_u64, lane 1: the expected IR extracts that element from %b and
// writes it with an ordinary scalar store through %a.
4127	// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
4128	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4129	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4130	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4131	// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
4132	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
4133	// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
4134	// CHECK: ret void
4135	void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
4136	vst1q_lane_u64(a, b, 1);
4137	}
4138
4139	// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
4140	// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
4141	// CHECK: store i8 [[TMP0]], i8* %a
4142	// CHECK: ret void
// Signed 8-bit variant: stores lane 15 of the 16 x i8 vector b through a;
// the expectations above pin this to an extractelement plus a scalar i8 store.
4143	void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
4144	vst1q_lane_s8(a, b, 15);
4145	}
4146
// vst1q_lane_s16, lane 7: the expected IR extracts that element from %b and
// writes it with an ordinary scalar store through %a.
4147	// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
4148	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4149	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4150	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4151	// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
4152	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
4153	// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
4154	// CHECK: ret void
4155	void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
4156	vst1q_lane_s16(a, b, 7);
4157	}
4158
// vst1q_lane_s32, lane 3: the expected IR extracts that element from %b and
// writes it with an ordinary scalar store through %a.
4159	// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
4160	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4161	// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4162	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4163	// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
4164	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
4165	// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
4166	// CHECK: ret void
4167	void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
4168	vst1q_lane_s32(a, b, 3);
4169	}
4170
// vst1q_lane_s64, lane 1: the expected IR extracts that element from %b and
// writes it with an ordinary scalar store through %a.
4171	// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
4172	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4173	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4174	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4175	// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
4176	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
4177	// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
4178	// CHECK: ret void
4179	void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
4180	vst1q_lane_s64(a, b, 1);
4181	}
4182
// vst1q_lane_f16, lane 7: the expected IR extracts that half element from %b
// and writes it with an ordinary scalar store through %a.
4183	// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
4184	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
4185	// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
4186	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
4187	// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
4188	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
4189	// CHECK: store half [[TMP3]], half* [[TMP4]]
4190	// CHECK: ret void
4191	void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
4192	vst1q_lane_f16(a, b, 7);
4193	}
4194
// vst1q_lane_f32, lane 3: the expected IR extracts that float element from %b
// and writes it with an ordinary scalar store through %a.
4195	// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
4196	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4197	// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
4198	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
4199	// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
4200	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
4201	// CHECK: store float [[TMP3]], float* [[TMP4]]
4202	// CHECK: ret void
4203	void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
4204	vst1q_lane_f32(a, b, 3);
4205	}
4206
// vst1q_lane_f64, lane 1: the expected IR extracts that double element from %b
// and writes it with an ordinary scalar store through %a.
4207	// CHECK-LABEL: define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
4208	// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
4209	// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
4210	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
4211	// CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
4212	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
4213	// CHECK: store double [[TMP3]], double* [[TMP4]]
4214	// CHECK: ret void
4215	void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
4216	vst1q_lane_f64(a, b, 1);
4217	}
4218
4219	// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
4220	// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
4221	// CHECK: store i8 [[TMP0]], i8* %a
4222	// CHECK: ret void
// Polynomial 8-bit variant: stores lane 15 of the 16 x i8 vector b through a;
// the expectations above pin this to an extractelement plus a scalar i8 store.
4223	void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
4224	vst1q_lane_p8(a, b, 15);
4225	}
4226
// vst1q_lane_p16, lane 7: the expected IR extracts that element from %b and
// writes it with an ordinary scalar store through %a.
4227	// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
4228	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4229	// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4230	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4231	// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
4232	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
4233	// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
4234	// CHECK: ret void
4235	void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
4236	vst1q_lane_p16(a, b, 7);
4237	}
4238
// vst1q_lane_p64, lane 1: the expected IR extracts that element from %b and
// writes it with an ordinary scalar store through %a.
4239	// CHECK-LABEL: define void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
4240	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4241	// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4242	// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4243	// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
4244	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
4245	// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
4246	// CHECK: ret void
4247	void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
4248	vst1q_lane_p64(a, b, 1);
4249	}
4250
4251	// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #1 {
4252	// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
4253	// CHECK: store i8 [[TMP0]], i8* %a
4254	// CHECK: ret void
// 64-bit vector variant: stores lane 7 of the 8 x i8 vector b through a;
// the expectations above pin this to an extractelement plus a scalar i8 store.
4255	void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
4256	vst1_lane_u8(a, b, 7);
4257	}
4258
// vst1_lane_u16, lane 3 of a 64-bit vector: the expected IR extracts that
// element from %b and writes it with an ordinary scalar store through %a.
4259	// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #1 {
4260	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4261	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4262	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4263	// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
4264	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
4265	// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
4266	// CHECK: ret void
4267	void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
4268	vst1_lane_u16(a, b, 3);
4269	}
4270
// vst1_lane_u32, lane 1 of a 64-bit vector: the expected IR extracts that
// element from %b and writes it with an ordinary scalar store through %a.
4271	// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #1 {
4272	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4273	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4274	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4275	// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
4276	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
4277	// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
4278	// CHECK: ret void
4279	void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
4280	vst1_lane_u32(a, b, 1);
4281	}
4282
// vst1_lane_u64, lane 0 of a single-element vector: the expected IR extracts
// that element from %b and writes it with a scalar store through %a.
4283	// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #1 {
4284	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4285	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4286	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4287	// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
4288	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
4289	// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
4290	// CHECK: ret void
4291	void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
4292	vst1_lane_u64(a, b, 0);
4293	}
4294
4295	// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #1 {
4296	// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
4297	// CHECK: store i8 [[TMP0]], i8* %a
4298	// CHECK: ret void
// Signed 8-bit, 64-bit vector variant: stores lane 7 of the 8 x i8 vector b
// through a; the expectations above pin this to extractelement + scalar store.
4299	void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
4300	vst1_lane_s8(a, b, 7);
4301	}
4302
// vst1_lane_s16, lane 3 of a 64-bit vector: the expected IR extracts that
// element from %b and writes it with an ordinary scalar store through %a.
4303	// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #1 {
4304	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4305	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4306	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4307	// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
4308	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
4309	// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
4310	// CHECK: ret void
4311	void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
4312	vst1_lane_s16(a, b, 3);
4313	}
4314
// vst1_lane_s32, lane 1 of a 64-bit vector: the expected IR extracts that
// element from %b and writes it with an ordinary scalar store through %a.
4315	// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #1 {
4316	// CHECK: [[TMP0:%.*]] = bitcast i32* %a to i8*
4317	// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4318	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4319	// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
4320	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
4321	// CHECK: store i32 [[TMP3]], i32* [[TMP4]]
4322	// CHECK: ret void
4323	void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
4324	vst1_lane_s32(a, b, 1);
4325	}
4326
// vst1_lane_s64, lane 0 of a single-element vector: the expected IR extracts
// that element from %b and writes it with a scalar store through %a.
4327	// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #1 {
4328	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4329	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4330	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4331	// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
4332	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
4333	// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
4334	// CHECK: ret void
4335	void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
4336	vst1_lane_s64(a, b, 0);
4337	}
4338
// vst1_lane_f16, lane 3 of a 64-bit vector: the expected IR extracts that half
// element from %b and writes it with an ordinary scalar store through %a.
4339	// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #1 {
4340	// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
4341	// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
4342	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
4343	// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
4344	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
4345	// CHECK: store half [[TMP3]], half* [[TMP4]]
4346	// CHECK: ret void
4347	void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
4348	vst1_lane_f16(a, b, 3);
4349	}
4350
// vst1_lane_f32, lane 1 of a 64-bit vector: the expected IR extracts that
// float element from %b and writes it with a scalar store through %a.
4351	// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #1 {
4352	// CHECK: [[TMP0:%.*]] = bitcast float* %a to i8*
4353	// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
4354	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
4355	// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
4356	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
4357	// CHECK: store float [[TMP3]], float* [[TMP4]]
4358	// CHECK: ret void
4359	void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
4360	vst1_lane_f32(a, b, 1);
4361	}
4362
// vst1_lane_f64, lane 0 of a single-element vector: the expected IR extracts
// that double element from %b and writes it with a scalar store through %a.
4363	// CHECK-LABEL: define void @test_vst1_lane_f64(double* %a, <1 x double> %b) #1 {
4364	// CHECK: [[TMP0:%.*]] = bitcast double* %a to i8*
4365	// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
4366	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
4367	// CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
4368	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
4369	// CHECK: store double [[TMP3]], double* [[TMP4]]
4370	// CHECK: ret void
4371	void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
4372	vst1_lane_f64(a, b, 0);
4373	}
4374
4375	// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #1 {
4376	// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
4377	// CHECK: store i8 [[TMP0]], i8* %a
4378	// CHECK: ret void
// Polynomial 8-bit, 64-bit vector variant: stores lane 7 of the 8 x i8 vector
// b through a; the expectations above pin this to extractelement + scalar store.
4379	void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
4380	vst1_lane_p8(a, b, 7);
4381	}
4382
// vst1_lane_p16, lane 3 of a 64-bit vector: the expected IR extracts that
// element from %b and writes it with an ordinary scalar store through %a.
4383	// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #1 {
4384	// CHECK: [[TMP0:%.*]] = bitcast i16* %a to i8*
4385	// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4386	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4387	// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
4388	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
4389	// CHECK: store i16 [[TMP3]], i16* [[TMP4]]
4390	// CHECK: ret void
4391	void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
4392	vst1_lane_p16(a, b, 3);
4393	}
4394
// vst1_lane_p64, lane 0 of a single-element vector: the expected IR extracts
// that element from %b and writes it with a scalar store through %a.
4395	// CHECK-LABEL: define void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #1 {
4396	// CHECK: [[TMP0:%.*]] = bitcast i64* %a to i8*
4397	// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4398	// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4399	// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
4400	// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
4401	// CHECK: store i64 [[TMP3]], i64* [[TMP4]]
4402	// CHECK: ret void
4403	void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
4404	vst1_lane_p64(a, b, 0);
4405	}
4406
// vst2q_lane_u8, lane 15: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and %a.
4407	// CHECK-LABEL: define void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #2 {
4408	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
4409	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
4410	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
4411	// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
4412	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
4413	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
4414	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4415	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
4416	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
4417	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
4418	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
4419	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
4420	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
4421	// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
4422	// CHECK: ret void
4423	void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
4424	vst2q_lane_u8(a, b, 15);
4425	}
4426
// vst2q_lane_u16, lane 7: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4427	// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
4428	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
4429	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
4430	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
4431	// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
4432	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
4433	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
4434	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4435	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
4436	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
4437	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
4438	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
4439	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4440	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
4441	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
4442	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
4443	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4444	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4445	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4446	// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
4447	// CHECK: ret void
4448	void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
4449	vst2q_lane_u16(a, b, 7);
4450	}
4451
// vst2q_lane_u32, lane 3: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4452	// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
4453	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
4454	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
4455	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
4456	// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
4457	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
4458	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
4459	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4460	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
4461	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
4462	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
4463	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
4464	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4465	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
4466	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
4467	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
4468	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4469	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4470	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4471	// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
4472	// CHECK: ret void
4473	void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
4474	vst2q_lane_u32(a, b, 3);
4475	}
4476
// vst2q_lane_u64, lane 1: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4477	// CHECK-LABEL: define void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
4478	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
4479	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
4480	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
4481	// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
4482	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
4483	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
4484	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4485	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
4486	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
4487	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
4488	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
4489	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4490	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
4491	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
4492	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
4493	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4494	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4495	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4496	// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
4497	// CHECK: ret void
4498	void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
4499	vst2q_lane_u64(a, b, 1);
4500	}
4501
// vst2q_lane_s8, lane 15: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and %a.
4502	// CHECK-LABEL: define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #2 {
4503	// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
4504	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
4505	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
4506	// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
4507	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
4508	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
4509	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4510	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
4511	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
4512	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
4513	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
4514	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
4515	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
4516	// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
4517	// CHECK: ret void
4518	void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
4519	vst2q_lane_s8(a, b, 15);
4520	}
4521
// vst2q_lane_s16, lane 7: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4522	// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
4523	// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
4524	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
4525	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
4526	// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
4527	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
4528	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
4529	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4530	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
4531	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
4532	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
4533	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
4534	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4535	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
4536	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
4537	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
4538	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4539	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4540	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4541	// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
4542	// CHECK: ret void
4543	void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) {
4544	vst2q_lane_s16(a, b, 7);
4545	}
4546
// vst2q_lane_s32, lane 3: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4547	// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
4548	// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
4549	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
4550	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
4551	// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
4552	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
4553	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
4554	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4555	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
4556	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
4557	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
4558	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
4559	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
4560	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
4561	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
4562	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
4563	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
4564	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
4565	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
4566	// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
4567	// CHECK: ret void
4568	void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) {
4569	vst2q_lane_s32(a, b, 3);
4570	}
4571
// vst2q_lane_s64, lane 1: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4572	// CHECK-LABEL: define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
4573	// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
4574	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
4575	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
4576	// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
4577	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
4578	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
4579	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4580	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
4581	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
4582	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
4583	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
4584	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4585	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
4586	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
4587	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
4588	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4589	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4590	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4591	// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
4592	// CHECK: ret void
4593	void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) {
4594	vst2q_lane_s64(a, b, 1);
4595	}
4596
// vst2q_lane_f16, lane 7: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4597	// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #2 {
4598	// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
4599	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
4600	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
4601	// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
4602	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
4603	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
4604	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4605	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
4606	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
4607	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
4608	// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
4609	// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
4610	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
4611	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
4612	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
4613	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
4614	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
4615	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
4616	// CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0i8(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, i8* [[TMP2]])
4617	// CHECK: ret void
4618	void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) {
4619	vst2q_lane_f16(a, b, 7);
4620	}
4621
// vst2q_lane_f32, lane 3: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4622	// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #2 {
4623	// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
4624	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
4625	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
4626	// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
4627	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
4628	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
4629	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4630	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
4631	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
4632	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
4633	// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
4634	// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
4635	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
4636	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
4637	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
4638	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
4639	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
4640	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
4641	// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
4642	// CHECK: ret void
4643	void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) {
4644	vst2q_lane_f32(a, b, 3);
4645	}
4646
// vst2q_lane_f64, lane 1: the coerced two-vector argument is copied into a
// local, both vectors are reloaded, and they are passed to the st2lane
// intrinsic together with the lane index and the i8*-cast destination.
4647	// CHECK-LABEL: define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #2 {
4648	// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
4649	// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
4650	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
4651	// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
4652	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
4653	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
4654	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4655	// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
4656	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
4657	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
4658	// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
4659	// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
4660	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
4661	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
4662	// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
4663	// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
4664	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
4665	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
4666	// CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]])
4667	// CHECK: ret void
4668	void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) {
4669	vst2q_lane_f64(a, b, 1);
4670	}
4671
4672	// CHECK-LABEL: define void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #2 {
4673	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
4674	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
4675	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
4676	// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
4677	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
4678	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
4679	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4680	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
4681	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
4682	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
4683	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
4684	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
4685	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
4686	// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
4687	// CHECK: ret void
4688	void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) {
4689	vst2q_lane_p8(a, b, 15);  // lane 15 = last element of each <16 x i8> in b
4690	}
4691
4692	// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
4693	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
4694	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
4695	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
4696	// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
4697	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
4698	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
4699	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4700	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
4701	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
4702	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
4703	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
4704	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
4705	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
4706	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
4707	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
4708	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
4709	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
4710	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
4711	// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
4712	// CHECK: ret void
4713	void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
4714	vst2q_lane_p16(a, b, 7);  // lane 7 = last element of each <8 x i16> in b
4715	}
4716
4717	// CHECK-LABEL: define void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
4718	// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
4719	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
4720	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
4721	// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
4722	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
4723	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
4724	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
4725	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
4726	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
4727	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
4728	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
4729	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
4730	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
4731	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
4732	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
4733	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
4734	// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
4735	// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
4736	// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
4737	// CHECK: ret void
4738	void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
4739	vst2q_lane_p64(a, b, 1);  // lane 1 = last element of each <2 x i64> in b
4740	}
4741
4742	// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
4743	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
4744	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
4745	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
4746	// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
4747	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
4748	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
4749	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4750	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
4751	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
4752	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
4753	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
4754	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
4755	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
4756	// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
4757	// CHECK: ret void
4758	void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) {
4759	vst2_lane_u8(a, b, 7);  // lane 7 = last element of each <8 x i8> in b
4760	}
4761
4762	// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
4763	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
4764	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
4765	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
4766	// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
4767	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
4768	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
4769	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4770	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
4771	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
4772	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
4773	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
4774	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4775	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
4776	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
4777	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
4778	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4779	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4780	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4781	// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
4782	// CHECK: ret void
4783	void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) {
4784	vst2_lane_u16(a, b, 3);  // lane 3 = last element of each <4 x i16> in b
4785	}
4786
4787	// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
4788	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
4789	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
4790	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
4791	// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
4792	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
4793	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
4794	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4795	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
4796	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
4797	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
4798	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
4799	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4800	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
4801	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
4802	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
4803	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4804	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4805	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4806	// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
4807	// CHECK: ret void
4808	void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) {
4809	vst2_lane_u32(a, b, 1);  // lane 1 = last element of each <2 x i32> in b
4810	}
4811
4812	// CHECK-LABEL: define void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
4813	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
4814	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
4815	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
4816	// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
4817	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
4818	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
4819	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4820	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
4821	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
4822	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
4823	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
4824	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4825	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
4826	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
4827	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
4828	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4829	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4830	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4831	// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
4832	// CHECK: ret void
4833	void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) {
4834	vst2_lane_u64(a, b, 0);  // lane 0 = only element of each <1 x i64> in b
4835	}
4836
4837	// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
4838	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
4839	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
4840	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
4841	// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
4842	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
4843	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
4844	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4845	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
4846	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
4847	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
4848	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
4849	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
4850	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
4851	// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
4852	// CHECK: ret void
4853	void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) {
4854	vst2_lane_s8(a, b, 7);  // lane 7 = last element of each <8 x i8> in b
4855	}
4856
4857	// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
4858	// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
4859	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
4860	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
4861	// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
4862	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
4863	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
4864	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4865	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
4866	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
4867	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
4868	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
4869	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
4870	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
4871	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
4872	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
4873	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
4874	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
4875	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
4876	// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
4877	// CHECK: ret void
4878	void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) {
4879	vst2_lane_s16(a, b, 3);  // lane 3 = last element of each <4 x i16> in b
4880	}
4881
4882	// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
4883	// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
4884	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
4885	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
4886	// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
4887	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
4888	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
4889	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4890	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
4891	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
4892	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
4893	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
4894	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
4895	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
4896	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
4897	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
4898	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
4899	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
4900	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
4901	// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
4902	// CHECK: ret void
4903	void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) {
4904	vst2_lane_s32(a, b, 1);  // lane 1 = last element of each <2 x i32> in b
4905	}
4906
4907	// CHECK-LABEL: define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
4908	// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
4909	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
4910	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
4911	// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
4912	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
4913	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
4914	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4915	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
4916	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
4917	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
4918	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
4919	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
4920	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
4921	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
4922	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
4923	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
4924	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
4925	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
4926	// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
4927	// CHECK: ret void
4928	void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) {
4929	vst2_lane_s64(a, b, 0);  // lane 0 = only element of each <1 x i64> in b
4930	}
4931
4932	// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #2 {
4933	// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
4934	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
4935	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
4936	// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
4937	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
4938	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
4939	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4940	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
4941	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
4942	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
4943	// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
4944	// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
4945	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
4946	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
4947	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
4948	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
4949	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
4950	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
4951	// CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0i8(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, i8* [[TMP2]])
4952	// CHECK: ret void
4953	void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) {
4954	vst2_lane_f16(a, b, 3);  // lane 3 = last element of each <4 x half> in b
4955	}
4956
4957	// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #2 {
4958	// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
4959	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
4960	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
4961	// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
4962	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
4963	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
4964	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4965	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
4966	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
4967	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
4968	// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
4969	// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
4970	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
4971	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
4972	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
4973	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
4974	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
4975	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
4976	// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]])
4977	// CHECK: ret void
4978	void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) {
4979	vst2_lane_f32(a, b, 1);  // lane 1 = last element of each <2 x float> in b
4980	}
4981
4982	// CHECK-LABEL: define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #2 {
4983	// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
4984	// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
4985	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
4986	// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
4987	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
4988	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
4989	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
4990	// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
4991	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
4992	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
4993	// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
4994	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
4995	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
4996	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
4997	// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
4998	// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
4999	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
5000	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
5001	// CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]])
5002	// CHECK: ret void
5003	void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) {
5004	vst2_lane_f64(a, b, 0);  // lane 0 = only element of each <1 x double> in b
5005	}
5006
5007	// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
5008	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
5009	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
5010	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
5011	// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
5012	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
5013	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
5014	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
5015	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
5016	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
5017	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5018	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
5019	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
5020	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5021	// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
5022	// CHECK: ret void
5023	void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) {
5024	vst2_lane_p8(a, b, 7);  // lane 7 = last element of each <8 x i8> in b
5025	}
5026
5027	// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
5028	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
5029	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
5030	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
5031	// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
5032	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
5033	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
5034	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
5035	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5036	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
5037	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
5038	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5039	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5040	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
5041	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
5042	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5043	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5044	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5045	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5046	// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
5047	// CHECK: ret void
5048	void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) {
5049	vst2_lane_p16(a, b, 3);  // lane 3 = last element of each <4 x i16> in b
5050	}
5051
5052	// CHECK-LABEL: define void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
5053	// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
5054	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
5055	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
5056	// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
5057	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
5058	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
5059	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
5060	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5061	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
5062	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
5063	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
5064	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5065	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
5066	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
5067	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
5068	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5069	// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5070	// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5071	// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
5072	// CHECK: ret void
5073	void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) {
5074	vst2_lane_p64(a, b, 0);  // lane 0 = only element of each <1 x i64> in b
5075	}
5076
// vst3q_lane_u8, lane 15: the three <16 x i8> members of b go to st3lane.v16i8
// unchanged, and %a is used directly as the i8* destination (no bitcast).
5077	// CHECK-LABEL: define void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
5078	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
5079	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
5080	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
5081	// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
5082	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
5083	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
5084	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5085	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
5086	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
5087	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
5088	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
5089	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
5090	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
5091	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
5092	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
5093	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
5094	// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
5095	// CHECK: ret void
5096	void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) {
5097	vst3q_lane_u8(a, b, 15);
5098	}
5099
// vst3q_lane_u16, lane 7: each <8 x i16> member of b round-trips through
// <16 x i8> before reaching st3lane.v8i16; %a is bitcast to i8* for the store.
5100	// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
5101	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
5102	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
5103	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
5104	// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
5105	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
5106	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
5107	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5108	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5109	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
5110	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
5111	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5112	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5113	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
5114	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
5115	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5116	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5117	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
5118	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
5119	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5120	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5121	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5122	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5123	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5124	// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
5125	// CHECK: ret void
5126	void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
5127	vst3q_lane_u16(a, b, 7);
5128	}
5129
// vst3q_lane_u32, lane 3: each <4 x i32> member of b round-trips through
// <16 x i8> before reaching st3lane.v4i32; %a is bitcast to i8* for the store.
5130	// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
5131	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
5132	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
5133	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
5134	// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
5135	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
5136	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
5137	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5138	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
5139	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
5140	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
5141	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5142	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
5143	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
5144	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
5145	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5146	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5147	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
5148	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
5149	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
5150	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5151	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
5152	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5153	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5154	// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
5155	// CHECK: ret void
5156	void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
5157	vst3q_lane_u32(a, b, 3);
5158	}
5159
// vst3q_lane_u64, lane 1: each <2 x i64> member of b round-trips through
// <16 x i8> before reaching st3lane.v2i64; %a is bitcast to i8* for the store.
5160	// CHECK-LABEL: define void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
5161	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
5162	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
5163	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
5164	// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
5165	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
5166	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
5167	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5168	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5169	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
5170	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
5171	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
5172	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5173	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
5174	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
5175	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
5176	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5177	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
5178	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
5179	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
5180	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5181	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5182	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5183	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5184	// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
5185	// CHECK: ret void
5186	void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
5187	vst3q_lane_u64(a, b, 1);
5188	}
5189
// vst3q_lane_s8, lane 15: the three <16 x i8> members of b go to st3lane.v16i8
// unchanged, and %a is used directly as the i8* destination (no bitcast).
5190	// CHECK-LABEL: define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
5191	// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
5192	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
5193	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
5194	// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
5195	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
5196	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
5197	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5198	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
5199	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
5200	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
5201	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
5202	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
5203	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
5204	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
5205	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
5206	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
5207	// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
5208	// CHECK: ret void
5209	void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
5210	vst3q_lane_s8(a, b, 15);
5211	}
5212
// vst3q_lane_s16, lane 7: each <8 x i16> member of b round-trips through
// <16 x i8> before reaching st3lane.v8i16; %a is bitcast to i8* for the store.
5213	// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
5214	// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
5215	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
5216	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
5217	// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
5218	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
5219	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
5220	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5221	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5222	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
5223	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
5224	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5225	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5226	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
5227	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
5228	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5229	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5230	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
5231	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
5232	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5233	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5234	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5235	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5236	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5237	// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
5238	// CHECK: ret void
5239	void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
5240	vst3q_lane_s16(a, b, 7);
5241	}
5242
// vst3q_lane_s32, lane 3: each <4 x i32> member of b round-trips through
// <16 x i8> before reaching st3lane.v4i32; %a is bitcast to i8* for the store.
5243	// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
5244	// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
5245	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
5246	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
5247	// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
5248	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
5249	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
5250	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5251	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
5252	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
5253	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
5254	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5255	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
5256	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
5257	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
5258	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5259	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5260	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
5261	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
5262	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
5263	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5264	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
5265	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5266	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5267	// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
5268	// CHECK: ret void
5269	void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
5270	vst3q_lane_s32(a, b, 3);
5271	}
5272
// vst3q_lane_s64, lane 1: each <2 x i64> member of b round-trips through
// <16 x i8> before reaching st3lane.v2i64; %a is bitcast to i8* for the store.
5273	// CHECK-LABEL: define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
5274	// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
5275	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
5276	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
5277	// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
5278	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
5279	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
5280	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5281	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5282	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
5283	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
5284	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
5285	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5286	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
5287	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
5288	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
5289	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5290	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
5291	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
5292	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
5293	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5294	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5295	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5296	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5297	// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
5298	// CHECK: ret void
5299	void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
5300	vst3q_lane_s64(a, b, 1);
5301	}
5302
// vst3q_lane_f16, lane 7: each <8 x half> member of b round-trips through
// <16 x i8> before reaching st3lane.v8f16; %a is bitcast to i8* for the store.
5303	// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #2 {
5304	// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
5305	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
5306	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
5307	// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
5308	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
5309	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
5310	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5311	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
5312	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
5313	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
5314	// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
5315	// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
5316	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
5317	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
5318	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
5319	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5320	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
5321	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
5322	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
5323	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5324	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
5325	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
5326	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
5327	// CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0i8(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, i8* [[TMP2]])
5328	// CHECK: ret void
5329	void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
5330	vst3q_lane_f16(a, b, 7);
5331	}
5332
// vst3q_lane_f32, lane 3: each <4 x float> member of b round-trips through
// <16 x i8> before reaching st3lane.v4f32; %a is bitcast to i8* for the store.
5333	// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #2 {
5334	// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
5335	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
5336	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
5337	// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
5338	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
5339	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
5340	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5341	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
5342	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
5343	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
5344	// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
5345	// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
5346	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
5347	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
5348	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
5349	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5350	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
5351	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
5352	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
5353	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5354	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
5355	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5356	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5357	// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
5358	// CHECK: ret void
5359	void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
5360	vst3q_lane_f32(a, b, 3);
5361	}
5362
// vst3q_lane_f64, lane 1: each <2 x double> member of b round-trips through
// <16 x i8> before reaching st3lane.v2f64; %a is bitcast to i8* for the store.
5363	// CHECK-LABEL: define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #2 {
5364	// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
5365	// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
5366	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
5367	// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
5368	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
5369	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
5370	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5371	// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
5372	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
5373	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
5374	// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
5375	// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
5376	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
5377	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
5378	// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
5379	// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
5380	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
5381	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
5382	// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
5383	// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
5384	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
5385	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
5386	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
5387	// CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
5388	// CHECK: ret void
5389	void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
5390	vst3q_lane_f64(a, b, 1);
5391	}
5392
// vst3q_lane_p8, lane 15: the three <16 x i8> members of b go to st3lane.v16i8
// unchanged, and %a is used directly as the i8* destination (no bitcast).
5393	// CHECK-LABEL: define void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
5394	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
5395	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
5396	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
5397	// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
5398	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
5399	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
5400	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5401	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
5402	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
5403	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
5404	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
5405	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
5406	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
5407	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
5408	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
5409	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
5410	// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
5411	// CHECK: ret void
5412	void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
5413	vst3q_lane_p8(a, b, 15);
5414	}
5415
// vst3q_lane_p16, lane 7: each <8 x i16> member of b round-trips through
// <16 x i8> before reaching st3lane.v8i16; %a is bitcast to i8* for the store.
5416	// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
5417	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
5418	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
5419	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
5420	// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
5421	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
5422	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
5423	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5424	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5425	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
5426	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
5427	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5428	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5429	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
5430	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
5431	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5432	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5433	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
5434	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
5435	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5436	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5437	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5438	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5439	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5440	// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
5441	// CHECK: ret void
5442	void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
5443	vst3q_lane_p16(a, b, 7);
5444	}
5445
5446	// CHECK-LABEL: define void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
5447	// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
5448	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
5449	// CHECK: [[COERCE_DIVE:%.]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t [[B]], i32 0, i32 0
5450	// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
5451	// CHECK: [[TMP0:%.]] = bitcast %struct.poly64x2x3_t [[__S1]] to i8*
5452	// CHECK: [[TMP1:%.]] = bitcast %struct.poly64x2x3_t [[B]] to i8*
5453	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
5454	// CHECK: [[TMP2:%.]] = bitcast i64 %a to i8*
5455	// CHECK: [[VAL:%.]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t [[__S1]], i32 0, i32 0
5456	// CHECK: [[ARRAYIDX:%.]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>] [[VAL]], i64 0, i64 0
5457	// CHECK: [[TMP3:%.]] = load <2 x i64>, <2 x i64> [[ARRAYIDX]], align 16
5458	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5459	// CHECK: [[VAL1:%.]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t [[__S1]], i32 0, i32 0
5460	// CHECK: [[ARRAYIDX2:%.]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>] [[VAL1]], i64 0, i64 1
5461	// CHECK: [[TMP5:%.]] = load <2 x i64>, <2 x i64> [[ARRAYIDX2]], align 16
5462	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5463	// CHECK: [[VAL3:%.]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t [[__S1]], i32 0, i32 0
5464	// CHECK: [[ARRAYIDX4:%.]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>] [[VAL3]], i64 0, i64 2
5465	// CHECK: [[TMP7:%.]] = load <2 x i64>, <2 x i64> [[ARRAYIDX4]], align 16
5466	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5467	// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5468	// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5469	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5470	// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
5471	// CHECK: ret void
// Codegen test for vst3q_lane_p64: forwards a and b unchanged with the
// constant lane index 1. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v2i64.p0i8 whose lane operand is i64 1.
5472	void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
5473	vst3q_lane_p64(a, b, 1);
5474	}
5475
5476	// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
5477	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
5478	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
5479	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
5480	// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
5481	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
5482	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
5483	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5484	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
5485	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
5486	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5487	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
5488	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
5489	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5490	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
5491	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
5492	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
5493	// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
5494	// CHECK: ret void
// Codegen test for vst3_lane_u8: forwards a and b unchanged with the
// constant lane index 7. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v8i8.p0i8 that takes %a directly (no pointer
// bitcast) and whose lane operand is i64 7.
5495	void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
5496	vst3_lane_u8(a, b, 7);
5497	}
5498
5499	// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
5500	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
5501	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
5502	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
5503	// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
5504	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
5505	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
5506	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5507	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5508	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
5509	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
5510	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5511	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5512	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
5513	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
5514	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5515	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5516	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
5517	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
5518	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
5519	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5520	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5521	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5522	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5523	// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
5524	// CHECK: ret void
// Codegen test for vst3_lane_u16: forwards a and b unchanged with the
// constant lane index 3. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v4i16.p0i8 whose lane operand is i64 3.
5525	void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
5526	vst3_lane_u16(a, b, 3);
5527	}
5528
5529	// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
5530	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
5531	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
5532	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
5533	// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
5534	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
5535	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
5536	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5537	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
5538	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
5539	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
5540	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5541	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
5542	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
5543	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
5544	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5545	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5546	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
5547	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
5548	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
5549	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5550	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
5551	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5552	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5553	// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
5554	// CHECK: ret void
// Codegen test for vst3_lane_u32: forwards a and b unchanged with the
// constant lane index 1. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v2i32.p0i8 whose lane operand is i64 1.
5555	void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
5556	vst3_lane_u32(a, b, 1);
5557	}
5558
5559	// CHECK-LABEL: define void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
5560	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
5561	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
5562	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
5563	// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
5564	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
5565	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
5566	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5567	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5568	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
5569	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
5570	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
5571	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5572	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
5573	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
5574	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
5575	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5576	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
5577	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
5578	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
5579	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
5580	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5581	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5582	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
5583	// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
5584	// CHECK: ret void
// Codegen test for vst3_lane_u64: forwards a and b unchanged with the
// constant lane index 0. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v1i64.p0i8 whose lane operand is i64 0.
5585	void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
5586	vst3_lane_u64(a, b, 0);
5587	}
5588
5589	// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
5590	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
5591	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
5592	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
5593	// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
5594	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
5595	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
5596	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5597	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
5598	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
5599	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5600	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
5601	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
5602	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5603	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
5604	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
5605	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
5606	// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
5607	// CHECK: ret void
// Codegen test for vst3_lane_s8: forwards a and b unchanged with the
// constant lane index 7. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v8i8.p0i8 that takes %a directly (no pointer
// bitcast) and whose lane operand is i64 7.
5608	void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
5609	vst3_lane_s8(a, b, 7);
5610	}
5611
5612	// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
5613	// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
5614	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
5615	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
5616	// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
5617	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
5618	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
5619	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5620	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5621	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
5622	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
5623	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5624	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5625	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
5626	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
5627	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5628	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5629	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
5630	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
5631	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
5632	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5633	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5634	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5635	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5636	// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
5637	// CHECK: ret void
// Codegen test for vst3_lane_s16: forwards a and b unchanged with the
// constant lane index 3. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v4i16.p0i8 whose lane operand is i64 3.
5638	void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
5639	vst3_lane_s16(a, b, 3);
5640	}
5641
5642	// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
5643	// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
5644	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
5645	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
5646	// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
5647	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
5648	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
5649	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5650	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
5651	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
5652	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
5653	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5654	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
5655	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
5656	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
5657	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5658	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5659	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
5660	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
5661	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
5662	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5663	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
5664	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5665	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5666	// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
5667	// CHECK: ret void
// Codegen test for vst3_lane_s32: forwards a and b unchanged with the
// constant lane index 1. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v2i32.p0i8 whose lane operand is i64 1.
5668	void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
5669	vst3_lane_s32(a, b, 1);
5670	}
5671
5672	// CHECK-LABEL: define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
5673	// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
5674	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
5675	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
5676	// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
5677	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
5678	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
5679	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5680	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5681	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
5682	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
5683	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
5684	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5685	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
5686	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
5687	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
5688	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5689	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
5690	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
5691	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
5692	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
5693	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5694	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5695	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
5696	// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
5697	// CHECK: ret void
// Codegen test for vst3_lane_s64: forwards a and b unchanged with the
// constant lane index 0. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v1i64.p0i8 whose lane operand is i64 0.
5698	void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
5699	vst3_lane_s64(a, b, 0);
5700	}
5701
5702	// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #2 {
5703	// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
5704	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
5705	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
5706	// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
5707	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
5708	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
5709	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5710	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
5711	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
5712	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
5713	// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
5714	// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
5715	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
5716	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
5717	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
5718	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
5719	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
5720	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
5721	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
5722	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
5723	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
5724	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
5725	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
5726	// CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0i8(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, i8* [[TMP2]])
5727	// CHECK: ret void
// Codegen test for vst3_lane_f16: forwards a and b unchanged with the
// constant lane index 3. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v4f16.p0i8 whose lane operand is i64 3.
5728	void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
5729	vst3_lane_f16(a, b, 3);
5730	}
5731
5732	// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #2 {
5733	// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
5734	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
5735	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
5736	// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
5737	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
5738	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
5739	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5740	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
5741	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
5742	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
5743	// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
5744	// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
5745	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
5746	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
5747	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
5748	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
5749	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
5750	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
5751	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
5752	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
5753	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
5754	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
5755	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
5756	// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
5757	// CHECK: ret void
// Codegen test for vst3_lane_f32: forwards a and b unchanged with the
// constant lane index 1. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v2f32.p0i8 whose lane operand is i64 1.
5758	void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
5759	vst3_lane_f32(a, b, 1);
5760	}
5761
5762	// CHECK-LABEL: define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #2 {
5763	// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
5764	// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
5765	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
5766	// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
5767	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
5768	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
5769	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5770	// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
5771	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
5772	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
5773	// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
5774	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
5775	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
5776	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
5777	// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
5778	// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
5779	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
5780	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
5781	// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
5782	// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
5783	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
5784	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
5785	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
5786	// CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
5787	// CHECK: ret void
// Codegen test for vst3_lane_f64: forwards a and b unchanged with the
// constant lane index 0. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v1f64.p0i8 whose lane operand is i64 0.
5788	void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
5789	vst3_lane_f64(a, b, 0);
5790	}
5791
5792	// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
5793	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
5794	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
5795	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
5796	// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
5797	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
5798	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
5799	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5800	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
5801	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
5802	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5803	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
5804	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
5805	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5806	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
5807	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
5808	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
5809	// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
5810	// CHECK: ret void
// Codegen test for vst3_lane_p8: forwards a and b unchanged with the
// constant lane index 7. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v8i8.p0i8 that takes %a directly (no pointer
// bitcast) and whose lane operand is i64 7.
5811	void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
5812	vst3_lane_p8(a, b, 7);
5813	}
5814
5815	// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
5816	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
5817	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
5818	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
5819	// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
5820	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
5821	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
5822	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5823	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5824	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
5825	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
5826	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5827	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
5828	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
5829	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
5830	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5831	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5832	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
5833	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
5834	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
5835	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5836	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
5837	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5838	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5839	// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
5840	// CHECK: ret void
// Codegen test for vst3_lane_p16: forwards a and b unchanged with the
// constant lane index 3. The FileCheck expectations above require a call to
// @llvm.aarch64.neon.st3lane.v4i16.p0i8 whose lane operand is i64 3.
5841	void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) {
5842	vst3_lane_p16(a, b, 3);
5843	}
5844
5845	// CHECK-LABEL: define void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
5846	// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
5847	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
5848	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
5849	// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
5850	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
5851	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
5852	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
5853	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5854	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
5855	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
5856	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
5857	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
5858	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
5859	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
5860	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
5861	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
5862	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
5863	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
5864	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
5865	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
5866	// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
5867	// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
5868	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
5869	// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
5870	// CHECK: ret void
// Test input: calls vst3_lane_p64 on a and b with the constant lane index 0.
// The FileCheck directives preceding this function expect a call to
// @llvm.aarch64.neon.st3lane.v1i64.p0i8 with lane operand i64 0.
5871	void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) {
5872	vst3_lane_p64(a, b, 0);
5873	}
5874
// vst4q_lane_u8, lane 15: the four operands are already <16 x i8> and %a is
// already i8*, so the expected st4lane.v16i8 call uses them without any bitcast.
5875	// CHECK-LABEL: define void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
5876	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
5877	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
5878	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
5879	// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
5880	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
5881	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
5882	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
5883	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
5884	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
5885	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
5886	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
5887	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
5888	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
5889	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
5890	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
5891	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
5892	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
5893	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
5894	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
5895	// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
5896	// CHECK: ret void
5897	void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
5898	vst4q_lane_u8(a, b, 15);
5899	}
5900
// vst4q_lane_u16, lane 7: the i16* destination is cast to i8*, and each
// <8 x i16> operand round-trips through <16 x i8> before the st4lane.v8i16 call.
5901	// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
5902	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
5903	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
5904	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
5905	// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
5906	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
5907	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
5908	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
5909	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
5910	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
5911	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
5912	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5913	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
5914	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
5915	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
5916	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5917	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5918	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
5919	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
5920	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
5921	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5922	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
5923	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
5924	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
5925	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
5926	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
5927	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5928	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5929	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
5930	// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
5931	// CHECK: ret void
5932	void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
5933	vst4q_lane_u16(a, b, 7);
5934	}
5935
// vst4q_lane_u32, lane 3: the i32* destination is cast to i8*, and each
// <4 x i32> operand round-trips through <16 x i8> before the st4lane.v4i32 call.
5936	// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
5937	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
5938	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
5939	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
5940	// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
5941	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
5942	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
5943	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
5944	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
5945	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
5946	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
5947	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5948	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
5949	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
5950	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
5951	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5952	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5953	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
5954	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
5955	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
5956	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5957	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
5958	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
5959	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
5960	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
5961	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
5962	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5963	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5964	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
5965	// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
5966	// CHECK: ret void
5967	void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
5968	vst4q_lane_u32(a, b, 3);
5969	}
5970
// vst4q_lane_u64, lane 1: the i64* destination is cast to i8*, and each
// <2 x i64> operand round-trips through <16 x i8> before the st4lane.v2i64 call.
5971	// CHECK-LABEL: define void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
5972	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
5973	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
5974	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
5975	// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
5976	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
5977	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
5978	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
5979	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
5980	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
5981	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
5982	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
5983	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
5984	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
5985	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
5986	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
5987	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
5988	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
5989	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
5990	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
5991	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
5992	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
5993	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
5994	// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
5995	// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
5996	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
5997	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
5998	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
5999	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
6000	// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
6001	// CHECK: ret void
6002	void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
6003	vst4q_lane_u64(a, b, 1);
6004	}
6005
// vst4q_lane_s8, lane 15: the four operands are already <16 x i8> and %a is
// already i8*, so the expected st4lane.v16i8 call uses them without any bitcast.
6006	// CHECK-LABEL: define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
6007	// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
6008	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
6009	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
6010	// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
6011	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
6012	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
6013	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6014	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
6015	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
6016	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
6017	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
6018	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
6019	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
6020	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
6021	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
6022	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
6023	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
6024	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
6025	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
6026	// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
6027	// CHECK: ret void
6028	void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) {
6029	vst4q_lane_s8(a, b, 15);
6030	}
6031
// vst4q_lane_s16, lane 7: the i16* destination is cast to i8*, and each
// <8 x i16> operand round-trips through <16 x i8> before the st4lane.v8i16 call.
6032	// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
6033	// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
6034	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
6035	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
6036	// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
6037	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
6038	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
6039	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6040	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
6041	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6042	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
6043	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6044	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
6045	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6046	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
6047	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6048	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6049	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6050	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
6051	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6052	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6053	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
6054	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
6055	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
6056	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6057	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
6058	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6059	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6060	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6061	// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
6062	// CHECK: ret void
6063	void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) {
6064	vst4q_lane_s16(a, b, 7);
6065	}
6066
// vst4q_lane_s32, lane 3: the i32* destination is cast to i8*, and each
// <4 x i32> operand round-trips through <16 x i8> before the st4lane.v4i32 call.
6067	// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
6068	// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
6069	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
6070	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
6071	// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
6072	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
6073	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
6074	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6075	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
6076	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6077	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
6078	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
6079	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
6080	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6081	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
6082	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
6083	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
6084	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6085	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
6086	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
6087	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
6088	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
6089	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
6090	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
6091	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
6092	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
6093	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
6094	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
6095	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
6096	// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
6097	// CHECK: ret void
6098	void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) {
6099	vst4q_lane_s32(a, b, 3);
6100	}
6101
// vst4q_lane_s64, lane 1: the i64* destination is cast to i8*, and each
// <2 x i64> operand round-trips through <16 x i8> before the st4lane.v2i64 call.
6102	// CHECK-LABEL: define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
6103	// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
6104	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
6105	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
6106	// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
6107	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
6108	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
6109	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6110	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
6111	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
6112	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
6113	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
6114	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
6115	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
6116	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
6117	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
6118	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
6119	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
6120	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
6121	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
6122	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
6123	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
6124	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
6125	// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
6126	// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
6127	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
6128	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
6129	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
6130	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
6131	// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
6132	// CHECK: ret void
6133	void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) {
6134	vst4q_lane_s64(a, b, 1);
6135	}
6136
// vst4q_lane_f16, lane 7: the half* destination is cast to i8*, and each
// <8 x half> operand round-trips through <16 x i8> before the st4lane.v8f16 call.
6137	// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #2 {
6138	// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
6139	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
6140	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
6141	// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
6142	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
6143	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
6144	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6145	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
6146	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6147	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
6148	// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
6149	// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
6150	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6151	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
6152	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
6153	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
6154	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6155	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
6156	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
6157	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
6158	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
6159	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
6160	// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
6161	// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
6162	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
6163	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
6164	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
6165	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
6166	// CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0i8(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, i8* [[TMP2]])
6167	// CHECK: ret void
6168	void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) {
6169	vst4q_lane_f16(a, b, 7);
6170	}
6171
// vst4q_lane_f32, lane 3: the float* destination is cast to i8*, and each
// <4 x float> operand round-trips through <16 x i8> before the st4lane.v4f32 call.
6172	// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #2 {
6173	// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
6174	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
6175	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
6176	// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
6177	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
6178	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
6179	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6180	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
6181	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6182	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
6183	// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
6184	// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
6185	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6186	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
6187	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
6188	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
6189	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6190	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
6191	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
6192	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
6193	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
6194	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
6195	// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
6196	// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
6197	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
6198	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
6199	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
6200	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
6201	// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
6202	// CHECK: ret void
6203	void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) {
6204	vst4q_lane_f32(a, b, 3);
6205	}
6206
// vst4q_lane_f64, lane 1: the double* destination is cast to i8*, and each
// <2 x double> operand round-trips through <16 x i8> before the st4lane.v2f64 call.
6207	// CHECK-LABEL: define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #2 {
6208	// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
6209	// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
6210	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
6211	// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
6212	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
6213	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
6214	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6215	// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
6216	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
6217	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
6218	// CHECK: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
6219	// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
6220	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
6221	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
6222	// CHECK: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
6223	// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
6224	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
6225	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
6226	// CHECK: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
6227	// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
6228	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
6229	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
6230	// CHECK: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
6231	// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
6232	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
6233	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
6234	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
6235	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
6236	// CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, i8* [[TMP2]])
6237	// CHECK: ret void
6238	void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) {
6239	vst4q_lane_f64(a, b, 1);
6240	}
6241
6242	// CHECK-LABEL: define void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
6243	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
6244	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
6245	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
6246	// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
6247	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
6248	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
6249	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6250	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
6251	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
6252	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
6253	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
6254	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
6255	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
6256	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
6257	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
6258	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
6259	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
6260	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
6261	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
6262	// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
6263	// CHECK: ret void
6264	void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) { // FileCheck subject: the body is a single vst4q_lane_p8 call
6265	vst4q_lane_p8(a, b, 15); // lane 15 is the last index of a <16 x i8>; the directives above expect it as the `i64 15` operand of the st4lane.v16i8 call, with %a passed without a cast
6266	}
6267
6268	// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
6269	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
6270	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
6271	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
6272	// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
6273	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
6274	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
6275	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6276	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
6277	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6278	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
6279	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6280	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
6281	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6282	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
6283	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6284	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6285	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6286	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
6287	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6288	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6289	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
6290	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
6291	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
6292	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6293	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
6294	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6295	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6296	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6297	// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
6298	// CHECK: ret void
6299	void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) { // FileCheck subject: the body is a single vst4q_lane_p16 call
6300	vst4q_lane_p16(a, b, 7); // lane 7 is the last index of an <8 x i16>; the directives above expect it as the `i64 7` operand of the st4lane.v8i16 call
6301	}
6302
6303	// CHECK-LABEL: define void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
6304	// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
6305	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
6306	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
6307	// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
6308	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
6309	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
6310	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
6311	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
6312	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
6313	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
6314	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
6315	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
6316	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
6317	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
6318	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
6319	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
6320	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
6321	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
6322	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
6323	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
6324	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
6325	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
6326	// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
6327	// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
6328	// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
6329	// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
6330	// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
6331	// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
6332	// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
6333	// CHECK: ret void
6334	void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) { // FileCheck subject: the body is a single vst4q_lane_p64 call
6335	vst4q_lane_p64(a, b, 1); // lane 1 is the last index of a <2 x i64>; the directives above expect it as the `i64 1` operand of the st4lane.v2i64 call
6336	}
6337
6338	// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
6339	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
6340	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
6341	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
6342	// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
6343	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
6344	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
6345	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6346	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6347	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
6348	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6349	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6350	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
6351	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6352	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6353	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
6354	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6355	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
6356	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
6357	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
6358	// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
6359	// CHECK: ret void
6360	void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) { // FileCheck subject: the body is a single vst4_lane_u8 call
6361	vst4_lane_u8(a, b, 7); // lane 7 is the last index of an <8 x i8>; the directives above expect it as the `i64 7` operand of the st4lane.v8i8 call, with %a passed without a cast
6362	}
6363
6364	// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
6365	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
6366	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
6367	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
6368	// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
6369	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
6370	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
6371	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6372	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
6373	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6374	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
6375	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6376	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
6377	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6378	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
6379	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6380	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6381	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6382	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
6383	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6384	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6385	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
6386	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
6387	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
6388	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6389	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
6390	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6391	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6392	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6393	// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
6394	// CHECK: ret void
6395	void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) { // FileCheck subject: the body is a single vst4_lane_u16 call
6396	vst4_lane_u16(a, b, 3); // lane 3 is the last index of a <4 x i16>; the directives above expect it as the `i64 3` operand of the st4lane.v4i16 call
6397	}
6398
6399	// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
6400	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
6401	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
6402	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
6403	// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
6404	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
6405	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
6406	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6407	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
6408	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6409	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
6410	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
6411	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
6412	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6413	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
6414	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
6415	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
6416	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6417	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
6418	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
6419	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
6420	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
6421	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
6422	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
6423	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
6424	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
6425	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
6426	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
6427	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
6428	// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
6429	// CHECK: ret void
6430	void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) { // FileCheck subject: the body is a single vst4_lane_u32 call
6431	vst4_lane_u32(a, b, 1); // lane 1 is the last index of a <2 x i32>; the directives above expect it as the `i64 1` operand of the st4lane.v2i32 call
6432	}
6433
6434	// CHECK-LABEL: define void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
6435	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
6436	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
6437	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
6438	// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
6439	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
6440	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
6441	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6442	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
6443	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
6444	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
6445	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
6446	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
6447	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
6448	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
6449	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
6450	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
6451	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
6452	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
6453	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
6454	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
6455	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
6456	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
6457	// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
6458	// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
6459	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
6460	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
6461	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
6462	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
6463	// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
6464	// CHECK: ret void
6465	void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) { // FileCheck subject: the body is a single vst4_lane_u64 call
6466	vst4_lane_u64(a, b, 0); // lane 0 is the only index of a <1 x i64>; the directives above expect it as the `i64 0` operand of the st4lane.v1i64 call
6467	}
6468
6469	// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
6470	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
6471	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
6472	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
6473	// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
6474	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
6475	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
6476	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6477	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6478	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
6479	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6480	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6481	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
6482	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6483	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6484	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
6485	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6486	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
6487	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
6488	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
6489	// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
6490	// CHECK: ret void
6491	void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) { // FileCheck subject: the body is a single vst4_lane_s8 call
6492	vst4_lane_s8(a, b, 7); // lane 7 is the last index of an <8 x i8>; the directives above expect it as the `i64 7` operand of the st4lane.v8i8 call, with %a passed without a cast
6493	}
6494
6495	// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
6496	// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
6497	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
6498	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
6499	// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
6500	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
6501	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
6502	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6503	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
6504	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6505	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
6506	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6507	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
6508	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6509	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
6510	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6511	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6512	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6513	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
6514	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6515	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6516	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
6517	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
6518	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
6519	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6520	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
6521	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6522	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6523	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6524	// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
6525	// CHECK: ret void
6526	void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) { // FileCheck subject: the body is a single vst4_lane_s16 call
6527	vst4_lane_s16(a, b, 3); // lane 3 is the last index of a <4 x i16>; the directives above expect it as the `i64 3` operand of the st4lane.v4i16 call
6528	}
6529
6530	// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
6531	// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
6532	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
6533	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
6534	// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
6535	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
6536	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
6537	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6538	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
6539	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6540	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
6541	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
6542	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
6543	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6544	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
6545	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
6546	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
6547	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6548	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
6549	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
6550	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
6551	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
6552	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
6553	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
6554	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
6555	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
6556	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
6557	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
6558	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
6559	// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
6560	// CHECK: ret void
6561	void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) { // FileCheck subject: the body is a single vst4_lane_s32 call
6562	vst4_lane_s32(a, b, 1); // lane 1 is the last index of a <2 x i32>; the directives above expect it as the `i64 1` operand of the st4lane.v2i32 call
6563	}
6564
6565	// CHECK-LABEL: define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
6566	// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
6567	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
6568	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
6569	// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
6570	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
6571	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
6572	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6573	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
6574	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
6575	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
6576	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
6577	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
6578	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
6579	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
6580	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
6581	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
6582	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
6583	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
6584	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
6585	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
6586	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
6587	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
6588	// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
6589	// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
6590	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
6591	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
6592	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
6593	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
6594	// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
6595	// CHECK: ret void
6596	void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) { // FileCheck subject: the body is a single vst4_lane_s64 call
6597	vst4_lane_s64(a, b, 0); // lane 0 is the only index of a <1 x i64>; the directives above expect it as the `i64 0` operand of the st4lane.v1i64 call
6598	}
6599
6600	// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #2 {
6601	// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
6602	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
6603	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
6604	// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
6605	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
6606	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
6607	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6608	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
6609	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6610	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
6611	// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
6612	// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
6613	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6614	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
6615	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
6616	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
6617	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6618	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
6619	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
6620	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
6621	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
6622	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
6623	// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
6624	// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
6625	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
6626	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
6627	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
6628	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
6629	// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0i8(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, i8* [[TMP2]])
6630	// CHECK: ret void
6631	void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) { // FileCheck subject: the body is a single vst4_lane_f16 call
6632	vst4_lane_f16(a, b, 3); // lane 3 is the last index of a <4 x half>; the directives above expect it as the `i64 3` operand of the st4lane.v4f16 call
6633	}
6634
6635	// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #2 {
6636	// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
6637	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
6638	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
6639	// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
6640	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
6641	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
6642	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6643	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
6644	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6645	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
6646	// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
6647	// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
6648	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6649	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
6650	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
6651	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
6652	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6653	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
6654	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
6655	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
6656	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
6657	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
6658	// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
6659	// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
6660	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
6661	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
6662	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
6663	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
6664	// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
6665	// CHECK: ret void
// Invokes the vst4_lane_f32 intrinsic on pointer `a` and aggregate `b` with
// the constant lane index 1. The FileCheck directives preceding this function
// expect a call to @llvm.aarch64.neon.st4lane.v2f32.p0i8 whose `i64 1`
// operand mirrors the lane index passed here, so keep the two in sync.
6666	void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
6667	vst4_lane_f32(a, b, 1);
6668	}
6669
6670	// CHECK-LABEL: define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #2 {
6671	// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
6672	// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
6673	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
6674	// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
6675	// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
6676	// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
6677	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6678	// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
6679	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
6680	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
6681	// CHECK: [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
6682	// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
6683	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
6684	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
6685	// CHECK: [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
6686	// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
6687	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
6688	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
6689	// CHECK: [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
6690	// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
6691	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
6692	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
6693	// CHECK: [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
6694	// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
6695	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
6696	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
6697	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
6698	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
6699	// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]])
6700	// CHECK: ret void
// Invokes the vst4_lane_f64 intrinsic on pointer `a` and aggregate `b` with
// the constant lane index 0. The FileCheck directives preceding this function
// expect a call to @llvm.aarch64.neon.st4lane.v1f64.p0i8 whose `i64 0`
// operand mirrors the lane index passed here, so keep the two in sync.
6701	void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
6702	vst4_lane_f64(a, b, 0);
6703	}
6704
6705	// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
6706	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
6707	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
6708	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
6709	// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
6710	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
6711	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
6712	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6713	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6714	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
6715	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6716	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6717	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
6718	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6719	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6720	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
6721	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6722	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
6723	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
6724	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
6725	// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
6726	// CHECK: ret void
// Invokes the vst4_lane_p8 intrinsic on pointer `a` and aggregate `b` with
// the constant lane index 7. The FileCheck directives preceding this function
// expect a call to @llvm.aarch64.neon.st4lane.v8i8.p0i8 that takes `%a`
// directly and whose `i64 7` operand mirrors the lane index passed here.
6727	void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
6728	vst4_lane_p8(a, b, 7);
6729	}
6730
6731	// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
6732	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
6733	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
6734	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
6735	// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
6736	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
6737	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
6738	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6739	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
6740	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6741	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
6742	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6743	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
6744	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6745	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
6746	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6747	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6748	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6749	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
6750	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6751	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6752	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
6753	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
6754	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
6755	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6756	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
6757	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6758	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6759	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6760	// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
6761	// CHECK: ret void
// Invokes the vst4_lane_p16 intrinsic on pointer `a` and aggregate `b` with
// the constant lane index 3. The FileCheck directives preceding this function
// expect a call to @llvm.aarch64.neon.st4lane.v4i16.p0i8 whose `i64 3`
// operand mirrors the lane index passed here, so keep the two in sync.
6762	void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
6763	vst4_lane_p16(a, b, 3);
6764	}
6765
6766	// CHECK-LABEL: define void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
6767	// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
6768	// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
6769	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
6770	// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
6771	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
6772	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
6773	// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
6774	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
6775	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
6776	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
6777	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
6778	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
6779	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
6780	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
6781	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
6782	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
6783	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
6784	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
6785	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
6786	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
6787	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
6788	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
6789	// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
6790	// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
6791	// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
6792	// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
6793	// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
6794	// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
6795	// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
6796	// CHECK: ret void
// Invokes the vst4_lane_p64 intrinsic on pointer `a` and aggregate `b` with
// the constant lane index 0. The FileCheck directives preceding this function
// expect a call to @llvm.aarch64.neon.st4lane.v1i64.p0i8 whose `i64 0`
// operand mirrors the lane index passed here, so keep the two in sync.
6797	void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) {
6798	vst4_lane_p64(a, b, 0);
6799	}
6800
6801	// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="128"
6802	// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="64"
6803	// CHECK: attributes #2 ={{.*}}"min-legal-vector-width"="0"
6804

Clang Project