arm-neon-vld.c source code [clang_source_code/test/CodeGen/arm-neon-vld.c]

1	// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
2	// RUN: -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
3	// RUN: FileCheck -check-prefixes=CHECK,CHECK-A64 %s
4	// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
5	// RUN: -target-feature +fp16 -S -disable-O0-optnone -emit-llvm -o - %s | \
6	// RUN: opt -S -mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
7
8	#include <arm_neon.h>
9
// vld1_f16_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
10	// CHECK-LABEL: @test_vld1_f16_x2(
11	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
12	// CHECK-A32: %struct.float16x4x2_t* noalias sret [[RETVAL:%.*]],
13	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
14	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
15	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
16	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF:(half|i16)]]*
17	// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x2.v4f16.p0f16|arm.neon.vld1x2.v4i16.p0i16}}([[HALF]]* [[TMP2]])
18	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]> }*
19	// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
20	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
21	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
22	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
23	// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
24	// CHECK-A64: ret %struct.float16x4x2_t [[TMP6]]
25	// CHECK-A32: ret void
26	float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
27	return vld1_f16_x2(a);
28	}
29
// vld1_f16_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
30	// CHECK-LABEL: @test_vld1_f16_x3(
31	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
32	// CHECK-A32: %struct.float16x4x3_t* noalias sret [[RETVAL:%.*]],
33	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
34	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
35	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
36	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
37	// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x3.v4f16.p0f16|arm.neon.vld1x3.v4i16.p0i16}}([[HALF]]* [[TMP2]])
38	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
39	// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
40	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
41	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
42	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
43	// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
44	// CHECK-A64: ret %struct.float16x4x3_t [[TMP6]]
45	// CHECK-A32: ret void
46	float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
47	return vld1_f16_x3(a);
48	}
49
// vld1_f16_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
50	// CHECK-LABEL: @test_vld1_f16_x4(
51	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
52	// CHECK-A32: %struct.float16x4x4_t* noalias sret [[RETVAL:%.*]],
53	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
54	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
55	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
56	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
57	// CHECK: [[VLD1XN:%.*]] = call { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x4.v4f16.p0f16|arm.neon.vld1x4.v4i16.p0i16}}([[HALF]]* [[TMP2]])
58	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
59	// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD1XN]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
60	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
61	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
62	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
63	// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
64	// CHECK-A64: ret %struct.float16x4x4_t [[TMP6]]
65	// CHECK-A32: ret void
66	float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
67	return vld1_f16_x4(a);
68	}
69
// vld1_f32_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
70	// CHECK-LABEL: @test_vld1_f32_x2(
71	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
72	// CHECK-A32: %struct.float32x2x2_t* noalias sret [[RETVAL:%.*]],
73	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
74	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
75	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
76	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
77	// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2f32.p0f32(float* [[TMP2]])
78	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
79	// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
80	// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
81	// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
82	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
83	// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
84	// CHECK-A64: ret %struct.float32x2x2_t [[TMP6]]
85	// CHECK-A32: ret void
86	float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
87	return vld1_f32_x2(a);
88	}
89
// vld1_f32_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
90	// CHECK-LABEL: @test_vld1_f32_x3(
91	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
92	// CHECK-A32: %struct.float32x2x3_t* noalias sret [[RETVAL:%.*]],
93	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
94	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
95	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
96	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
97	// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2f32.p0f32(float* [[TMP2]])
98	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
99	// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
100	// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
101	// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
102	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
103	// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
104	// CHECK-A64: ret %struct.float32x2x3_t [[TMP6]]
// CHECK-A32: ret void
105	float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
106	return vld1_f32_x3(a);
107	}
108
// vld1_f32_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
109	// CHECK-LABEL: @test_vld1_f32_x4(
110	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
111	// CHECK-A32: %struct.float32x2x4_t* noalias sret [[RETVAL:%.*]],
112	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
113	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
114	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
115	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
116	// CHECK: [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2f32.p0f32(float* [[TMP2]])
117	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
118	// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
119	// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
120	// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
121	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
122	// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
123	// CHECK-A64: ret %struct.float32x2x4_t [[TMP6]]
124	// CHECK-A32: ret void
125	float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
126	return vld1_f32_x4(a);
127	}
128
// vld1_p16_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
129	// CHECK-LABEL: @test_vld1_p16_x2(
130	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
131	// CHECK-A32: %struct.poly16x4x2_t* noalias sret [[RETVAL:%.*]],
132	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
133	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
134	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
135	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
136	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
137	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
138	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
139	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
140	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
141	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
142	// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
143	// CHECK-A64: ret %struct.poly16x4x2_t [[TMP6]]
144	// CHECK-A32: ret void
145	poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
146	return vld1_p16_x2(a);
147	}
148
// vld1_p16_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
149	// CHECK-LABEL: @test_vld1_p16_x3(
150	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
151	// CHECK-A32: %struct.poly16x4x3_t* noalias sret [[RETVAL:%.*]],
152	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
153	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
154	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
155	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
156	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
157	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
158	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
159	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
160	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
161	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
162	// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
163	// CHECK-A64: ret %struct.poly16x4x3_t [[TMP6]]
164	// CHECK-A32: ret void
165	poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
166	return vld1_p16_x3(a);
167	}
168
// vld1_p16_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
169	// CHECK-LABEL: @test_vld1_p16_x4(
170	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
171	// CHECK-A32: %struct.poly16x4x4_t* noalias sret [[RETVAL:%.*]],
172	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
173	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
174	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
175	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
176	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
177	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
178	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
179	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
180	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
181	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
182	// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
183	// CHECK-A64: ret %struct.poly16x4x4_t [[TMP6]]
184	// CHECK-A32: ret void
185	poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
186	return vld1_p16_x4(a);
187	}
188
// vld1_p8_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call taking %a directly; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
189	// CHECK-LABEL: @test_vld1_p8_x2(
190	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
191	// CHECK-A32: %struct.poly8x8x2_t* noalias sret [[RETVAL:%.*]],
192	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
193	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
194	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
195	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
196	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
197	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
198	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
199	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
200	// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
201	// CHECK-A64: ret %struct.poly8x8x2_t [[TMP4]]
202	// CHECK-A32: ret void
203	poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
204	return vld1_p8_x2(a);
205	}
206
// vld1_p8_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call taking %a directly; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
207	// CHECK-LABEL: @test_vld1_p8_x3(
208	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
209	// CHECK-A32: %struct.poly8x8x3_t* noalias sret [[RETVAL:%.*]],
210	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
211	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
212	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
213	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
214	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
215	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
216	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
217	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
218	// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
219	// CHECK-A64: ret %struct.poly8x8x3_t [[TMP4]]
220	// CHECK-A32: ret void
221	poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
222	return vld1_p8_x3(a);
223	}
224
// vld1_p8_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call taking %a directly; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
225	// CHECK-LABEL: @test_vld1_p8_x4(
226	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
227	// CHECK-A32: %struct.poly8x8x4_t* noalias sret [[RETVAL:%.*]],
228	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
229	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
230	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
231	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
232	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
233	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
234	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
235	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
236	// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
237	// CHECK-A64: ret %struct.poly8x8x4_t [[TMP4]]
238	// CHECK-A32: ret void
239	poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
240	return vld1_p8_x4(a);
241	}
242
// vld1_s16_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
243	// CHECK-LABEL: @test_vld1_s16_x2(
244	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
245	// CHECK-A32: %struct.int16x4x2_t* noalias sret [[RETVAL:%.*]],
246	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
247	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
248	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
249	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
250	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
251	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
252	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
253	// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
254	// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
255	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
256	// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
257	// CHECK-A64: ret %struct.int16x4x2_t [[TMP6]]
258	// CHECK-A32: ret void
259	int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
260	return vld1_s16_x2(a);
261	}
262
// vld1_s16_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
263	// CHECK-LABEL: @test_vld1_s16_x3(
264	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
265	// CHECK-A32: %struct.int16x4x3_t* noalias sret [[RETVAL:%.*]],
266	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
267	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
268	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
269	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
270	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
271	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
272	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
273	// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
274	// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
275	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
276	// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
277	// CHECK-A64: ret %struct.int16x4x3_t [[TMP6]]
278	// CHECK-A32: ret void
279	int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
280	return vld1_s16_x3(a);
281	}
282
// vld1_s16_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
283	// CHECK-LABEL: @test_vld1_s16_x4(
284	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
285	// CHECK-A32: %struct.int16x4x4_t* noalias sret [[RETVAL:%.*]],
286	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
287	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
288	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
289	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
290	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
291	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
292	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
293	// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
294	// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
295	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
296	// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
297	// CHECK-A64: ret %struct.int16x4x4_t [[TMP6]]
298	// CHECK-A32: ret void
299	int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
300	return vld1_s16_x4(a);
301	}
302
// vld1_s32_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
303	// CHECK-LABEL: @test_vld1_s32_x2(
304	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
305	// CHECK-A32: %struct.int32x2x2_t* noalias sret [[RETVAL:%.*]],
306	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
307	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
308	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
309	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
310	// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i32.p0i32(i32* [[TMP2]])
311	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
312	// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
313	// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
314	// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
315	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
316	// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
317	// CHECK-A64: ret %struct.int32x2x2_t [[TMP6]]
318	// CHECK-A32: ret void
319	int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
320	return vld1_s32_x2(a);
321	}
322
// vld1_s32_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
323	// CHECK-LABEL: @test_vld1_s32_x3(
324	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
325	// CHECK-A32: %struct.int32x2x3_t* noalias sret [[RETVAL:%.*]],
326	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
327	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
328	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
329	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
330	// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i32.p0i32(i32* [[TMP2]])
331	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
332	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
333	// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
334	// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
335	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
336	// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
337	// CHECK-A64: ret %struct.int32x2x3_t [[TMP6]]
338	// CHECK-A32: ret void
339	int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
340	return vld1_s32_x3(a);
341	}
342
// vld1_s32_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
343	// CHECK-LABEL: @test_vld1_s32_x4(
344	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
345	// CHECK-A32: %struct.int32x2x4_t* noalias sret [[RETVAL:%.*]],
346	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
347	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
348	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
349	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
350	// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i32.p0i32(i32* [[TMP2]])
351	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
352	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
353	// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
354	// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
355	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
356	// CHECK-A64: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
357	// CHECK-A64: ret %struct.int32x2x4_t [[TMP6]]
358	// CHECK-A32: ret void
359	int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
360	return vld1_s32_x4(a);
361	}
362
// vld1_s64_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
363	// CHECK-LABEL: @test_vld1_s64_x2(
364	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
365	// CHECK-A32: %struct.int64x1x2_t* noalias sret [[RETVAL:%.*]],
366	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
367	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
368	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
369	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
370	// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v1i64.p0i64(i64* [[TMP2]])
371	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
372	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
373	// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
374	// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
375	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
376	// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
377	// CHECK-A64: ret %struct.int64x1x2_t [[TMP6]]
378	// CHECK-A32: ret void
379	int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
380	return vld1_s64_x2(a);
381	}
382
// vld1_s64_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
383	// CHECK-LABEL: @test_vld1_s64_x3(
384	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
385	// CHECK-A32: %struct.int64x1x3_t* noalias sret [[RETVAL:%.*]],
386	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
387	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
388	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
389	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
390	// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v1i64.p0i64(i64* [[TMP2]])
391	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
392	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
393	// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
394	// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
395	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
396	// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
397	// CHECK-A64: ret %struct.int64x1x3_t [[TMP6]]
398	// CHECK-A32: ret void
399	int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
400	return vld1_s64_x3(a);
401	}
402
// vld1_s64_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
403	// CHECK-LABEL: @test_vld1_s64_x4(
404	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
405	// CHECK-A32: %struct.int64x1x4_t* noalias sret [[RETVAL:%.*]],
406	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
407	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
408	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
409	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
410	// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v1i64.p0i64(i64* [[TMP2]])
411	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
412	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
413	// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
414	// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
415	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
416	// CHECK-A64: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
417	// CHECK-A64: ret %struct.int64x1x4_t [[TMP6]]
418	// CHECK-A32: ret void
419	int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
420	return vld1_s64_x4(a);
421	}
422
// vld1_s8_x2: expect one ld1x2 (AArch64) / vld1x2 (ARM32) intrinsic call taking %a directly; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
423	// CHECK-LABEL: @test_vld1_s8_x2(
424	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
425	// CHECK-A32: %struct.int8x8x2_t* noalias sret [[RETVAL:%.*]],
426	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
427	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
428	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
429	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
430	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
431	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
432	// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
433	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
434	// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
435	// CHECK-A64: ret %struct.int8x8x2_t [[TMP4]]
436	// CHECK-A32: ret void
437	int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
438	return vld1_s8_x2(a);
439	}
440
// vld1_s8_x3: expect one ld1x3 (AArch64) / vld1x3 (ARM32) intrinsic call taking %a directly; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
441	// CHECK-LABEL: @test_vld1_s8_x3(
442	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
443	// CHECK-A32: %struct.int8x8x3_t* noalias sret [[RETVAL:%.*]],
444	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
445	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
446	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
447	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
448	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
449	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
450	// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
451	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
452	// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
453	// CHECK-A64: ret %struct.int8x8x3_t [[TMP4]]
454	// CHECK-A32: ret void
455	int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
456	return vld1_s8_x3(a);
457	}
458
// vld1_s8_x4: expect one ld1x4 (AArch64) / vld1x4 (ARM32) intrinsic call taking %a directly; AArch64 returns the struct by value, ARM32 writes it through the sret pointer.
459	// CHECK-LABEL: @test_vld1_s8_x4(
460	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
461	// CHECK-A32: %struct.int8x8x4_t* noalias sret [[RETVAL:%.*]],
462	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
463	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
464	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
465	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
466	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
467	// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
468	// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
469	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
470	// CHECK-A64: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
471	// CHECK-A64: ret %struct.int8x8x4_t [[TMP4]]
472	// CHECK-A32: ret void
473	int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
474	return vld1_s8_x4(a);
475	}
476
477	// CHECK-LABEL: @test_vld1_u16_x2(
478	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
479	// CHECK-A32: %struct.uint16x4x2_t* noalias sret [[RETVAL:%.*]],
480	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
481	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
482	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
483	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
484	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4i16.p0i16(i16* [[TMP2]])
485	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
486	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
487	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
488	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
489	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
490	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
491	// CHECK-A64: ret %struct.uint16x4x2_t [[TMP6]]
492	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u16_x2 and returns its
// uint16x4x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 / arm vld1x2 NEON intrinsic yielding two <4 x i16> vectors,
// returned by value on A64 and through the sret pointer on A32.
493	uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
494	return vld1_u16_x2(a);
495	}
496
497	// CHECK-LABEL: @test_vld1_u16_x3(
498	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
499	// CHECK-A32: %struct.uint16x4x3_t* noalias sret [[RETVAL:%.*]],
500	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
501	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
502	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
503	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
504	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4i16.p0i16(i16* [[TMP2]])
505	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
506	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
507	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
508	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
509	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
510	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
511	// CHECK-A64: ret %struct.uint16x4x3_t [[TMP6]]
512	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u16_x3 and returns its
// uint16x4x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 / arm vld1x3 NEON intrinsic yielding three <4 x i16> vectors,
// returned by value on A64 and through the sret pointer on A32.
513	uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
514	return vld1_u16_x3(a);
515	}
516
517	// CHECK-LABEL: @test_vld1_u16_x4(
518	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
519	// CHECK-A32: %struct.uint16x4x4_t* noalias sret [[RETVAL:%.*]],
520	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
521	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
522	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
523	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
524	// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4i16.p0i16(i16* [[TMP2]])
525	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
526	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
527	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
528	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
529	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
530	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
531	// CHECK-A64: ret %struct.uint16x4x4_t [[TMP6]]
532	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u16_x4 and returns its
// uint16x4x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 / arm vld1x4 NEON intrinsic yielding four <4 x i16> vectors,
// returned by value on A64 and through the sret pointer on A32.
533	uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
534	return vld1_u16_x4(a);
535	}
536
537	// CHECK-LABEL: @test_vld1_u32_x2(
538	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
539	// CHECK-A32: %struct.uint32x2x2_t* noalias sret [[RETVAL:%.*]],
540	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
541	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
542	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
543	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
544	// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v2i32.p0i32(i32* [[TMP2]])
545	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
546	// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
547	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
548	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
549	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
550	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
551	// CHECK-A64: ret %struct.uint32x2x2_t [[TMP6]]
552	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u32_x2 and returns its
// uint32x2x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 / arm vld1x2 NEON intrinsic yielding two <2 x i32> vectors,
// returned by value on A64 and through the sret pointer on A32.
553	uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
554	return vld1_u32_x2(a);
555	}
556
557	// CHECK-LABEL: @test_vld1_u32_x3(
558	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
559	// CHECK-A32: %struct.uint32x2x3_t* noalias sret [[RETVAL:%.*]],
560	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
561	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
562	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
563	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
564	// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v2i32.p0i32(i32* [[TMP2]])
565	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
566	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
567	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
568	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
569	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
570	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
571	// CHECK-A64: ret %struct.uint32x2x3_t [[TMP6]]
572	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u32_x3 and returns its
// uint32x2x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 / arm vld1x3 NEON intrinsic yielding three <2 x i32> vectors,
// returned by value on A64 and through the sret pointer on A32.
573	uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
574	return vld1_u32_x3(a);
575	}
576
577	// CHECK-LABEL: @test_vld1_u32_x4(
578	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
579	// CHECK-A32: %struct.uint32x2x4_t* noalias sret [[RETVAL:%.*]],
580	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
581	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
582	// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*
583	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
584	// CHECK: [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v2i32.p0i32(i32* [[TMP2]])
585	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
586	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
587	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
588	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
589	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
590	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
591	// CHECK-A64: ret %struct.uint32x2x4_t [[TMP6]]
592	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u32_x4 and returns its
// uint32x2x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 / arm vld1x4 NEON intrinsic yielding four <2 x i32> vectors,
// returned by value on A64 and through the sret pointer on A32.
593	uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
594	return vld1_u32_x4(a);
595	}
596
597	// CHECK-LABEL: @test_vld1_u64_x2(
598	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
599	// CHECK-A32: %struct.uint64x1x2_t* noalias sret [[RETVAL:%.*]],
600	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
601	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
602	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
603	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
604	// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v1i64.p0i64(i64* [[TMP2]])
605	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
606	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
607	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
608	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
609	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
610	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
611	// CHECK-A64: ret %struct.uint64x1x2_t [[TMP6]]
612	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u64_x2 and returns its
// uint64x1x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 / arm vld1x2 NEON intrinsic yielding two <1 x i64> vectors,
// returned by value on A64 and through the sret pointer on A32.
613	uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
614	return vld1_u64_x2(a);
615	}
616
617	// CHECK-LABEL: @test_vld1_u64_x3(
618	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
619	// CHECK-A32: %struct.uint64x1x3_t* noalias sret [[RETVAL:%.*]],
620	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
621	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
622	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
623	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
624	// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v1i64.p0i64(i64* [[TMP2]])
625	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
626	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
627	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
628	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
629	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 24, i1 false)
630	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
631	// CHECK-A64: ret %struct.uint64x1x3_t [[TMP6]]
632	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u64_x3 and returns its
// uint64x1x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 / arm vld1x3 NEON intrinsic yielding three <1 x i64> vectors,
// returned by value on A64 and through the sret pointer on A32.
633	uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
634	return vld1_u64_x3(a);
635	}
636
637	// CHECK-LABEL: @test_vld1_u64_x4(
638	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
639	// CHECK-A32: %struct.uint64x1x4_t* noalias sret [[RETVAL:%.*]],
640	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
641	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
642	// CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8*
643	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
644	// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v1i64.p0i64(i64* [[TMP2]])
645	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
646	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
647	// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
648	// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
649	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 32, i1 false)
650	// CHECK-A64: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
651	// CHECK-A64: ret %struct.uint64x1x4_t [[TMP6]]
652	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u64_x4 and returns its
// uint64x1x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 / arm vld1x4 NEON intrinsic yielding four <1 x i64> vectors,
// returned by value on A64 and through the sret pointer on A32.
653	uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
654	return vld1_u64_x4(a);
655	}
656
657	// CHECK-LABEL: @test_vld1_u8_x2(
658	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
659	// CHECK-A32: %struct.uint8x8x2_t* noalias sret [[RETVAL:%.*]],
660	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
661	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
662	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a)
663	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
664	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
665	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
666	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
667	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 16, i1 false)
668	// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
669	// CHECK-A64: ret %struct.uint8x8x2_t [[TMP4]]
670	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u8_x2 and returns its uint8x8x2_t
// result. The FileCheck directives above expect a call to the aarch64 ld1x2 /
// arm vld1x2 NEON intrinsic yielding two <8 x i8> vectors, returned by value
// on A64 and through the sret pointer on A32.
671	uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
672	return vld1_u8_x2(a);
673	}
674
675	// CHECK-LABEL: @test_vld1_u8_x3(
676	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
677	// CHECK-A32: %struct.uint8x8x3_t* noalias sret [[RETVAL:%.*]],
678	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
679	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
680	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a)
681	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
682	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
683	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
684	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
685	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 24, i1 false)
686	// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
687	// CHECK-A64: ret %struct.uint8x8x3_t [[TMP4]]
688	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u8_x3 and returns its uint8x8x3_t
// result. The FileCheck directives above expect a call to the aarch64 ld1x3 /
// arm vld1x3 NEON intrinsic yielding three <8 x i8> vectors, returned by value
// on A64 and through the sret pointer on A32.
689	uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
690	return vld1_u8_x3(a);
691	}
692
693	// CHECK-LABEL: @test_vld1_u8_x4(
694	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
695	// CHECK-A32: %struct.uint8x8x4_t* noalias sret [[RETVAL:%.*]],
696	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
697	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
698	// CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a)
699	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
700	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
701	// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
702	// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
703	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64|i32}} 32, i1 false)
704	// CHECK-A64: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
705	// CHECK-A64: ret %struct.uint8x8x4_t [[TMP4]]
706	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1_u8_x4 and returns its uint8x8x4_t
// result. The FileCheck directives above expect a call to the aarch64 ld1x4 /
// arm vld1x4 NEON intrinsic yielding four <8 x i8> vectors, returned by value
// on A64 and through the sret pointer on A32.
707	uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
708	return vld1_u8_x4(a);
709	}
710
711	// CHECK-LABEL: @test_vld1q_f16_x2(
712	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
713	// CHECK-A32: %struct.float16x8x2_t* noalias sret [[RETVAL:%.*]],
714	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16|8}}
715	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
716	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
717	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
718	// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x2.v8f16.p0f16|arm.neon.vld1x2.v8i16.p0i16}}([[HALF]]* [[TMP2]])
719	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]> }*
720	// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
721	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
722	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
723	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
724	// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
725	// CHECK-A64: ret %struct.float16x8x2_t [[TMP6]]
726	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_f16_x2 and returns its
// float16x8x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 (v8f16) / arm vld1x2 (v8i16) NEON intrinsic yielding two
// eight-lane vectors, returned by value on A64 and via the sret pointer on A32.
727	float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
728	return vld1q_f16_x2(a);
729	}
730
731	// CHECK-LABEL: @test_vld1q_f16_x3(
732	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
733	// CHECK-A32: %struct.float16x8x3_t* noalias sret [[RETVAL:%.*]],
734	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16|8}}
735	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
736	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
737	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
738	// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x3.v8f16.p0f16|arm.neon.vld1x3.v8i16.p0i16}}([[HALF]]* [[TMP2]])
739	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
740	// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
741	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
742	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
743	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
744	// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
745	// CHECK-A64: ret %struct.float16x8x3_t [[TMP6]]
746	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_f16_x3 and returns its
// float16x8x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 (v8f16) / arm vld1x3 (v8i16) NEON intrinsic yielding three
// eight-lane vectors, returned by value on A64 and via the sret pointer on A32.
747	float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
748	return vld1q_f16_x3(a);
749	}
750
751	// CHECK-LABEL: @test_vld1q_f16_x4(
752	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
753	// CHECK-A32: %struct.float16x8x4_t* noalias sret [[RETVAL:%.*]],
754	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16|8}}
755	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
756	// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
757	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to [[HALF]]*
758	// CHECK: [[VLD1XN:%.*]] = call { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } @llvm.{{aarch64.neon.ld1x4.v8f16.p0f16|arm.neon.vld1x4.v8i16.p0i16}}([[HALF]]* [[TMP2]])
759	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
760	// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD1XN]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
761	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
762	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
763	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
764	// CHECK-A64: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
765	// CHECK-A64: ret %struct.float16x8x4_t [[TMP6]]
766	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_f16_x4 and returns its
// float16x8x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 (v8f16) / arm vld1x4 (v8i16) NEON intrinsic yielding four
// eight-lane vectors, returned by value on A64 and via the sret pointer on A32.
767	float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
768	return vld1q_f16_x4(a);
769	}
770
771	// CHECK-LABEL: @test_vld1q_f32_x2(
772	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
773	// CHECK-A32: %struct.float32x4x2_t* noalias sret [[RETVAL:%.*]],
774	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16|8}}
775	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
776	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
777	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
778	// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v4f32.p0f32(float* [[TMP2]])
779	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
780	// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
781	// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
782	// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
783	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
784	// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
785	// CHECK-A64: ret %struct.float32x4x2_t [[TMP6]]
786	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_f32_x2 and returns its
// float32x4x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 / arm vld1x2 NEON intrinsic yielding two <4 x float> vectors,
// returned by value on A64 and through the sret pointer on A32.
787	float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
788	return vld1q_f32_x2(a);
789	}
790
791	// CHECK-LABEL: @test_vld1q_f32_x3(
792	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
793	// CHECK-A32: %struct.float32x4x3_t* noalias sret [[RETVAL:%.*]],
794	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16|8}}
795	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
796	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
797	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
798	// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v4f32.p0f32(float* [[TMP2]])
799	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
800	// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
801	// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
802	// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
803	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
804	// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
805	// CHECK-A64: ret %struct.float32x4x3_t [[TMP6]]
806	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_f32_x3 and returns its
// float32x4x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 / arm vld1x3 NEON intrinsic yielding three <4 x float>
// vectors, returned by value on A64 and through the sret pointer on A32.
807	float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
808	return vld1q_f32_x3(a);
809	}
810
811	// CHECK-LABEL: @test_vld1q_f32_x4(
812	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
813	// CHECK-A32: %struct.float32x4x4_t* noalias sret [[RETVAL:%.*]],
814	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16|8}}
815	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
816	// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*
817	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
818	// CHECK: [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v4f32.p0f32(float* [[TMP2]])
819	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
820	// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
821	// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
822	// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
823	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
824	// CHECK-A64: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
825	// CHECK-A64: ret %struct.float32x4x4_t [[TMP6]]
826	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_f32_x4 and returns its
// float32x4x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 / arm vld1x4 NEON intrinsic yielding four <4 x float> vectors,
// returned by value on A64 and through the sret pointer on A32.
827	float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
828	return vld1q_f32_x4(a);
829	}
830
831	// CHECK-LABEL: @test_vld1q_p16_x2(
832	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
833	// CHECK-A32: %struct.poly16x8x2_t* noalias sret [[RETVAL:%.*]],
834	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16|8}}
835	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
836	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
837	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
838	// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
839	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
840	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
841	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
842	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
843	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
844	// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
845	// CHECK-A64: ret %struct.poly16x8x2_t [[TMP6]]
846	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_p16_x2 and returns its
// poly16x8x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 / arm vld1x2 NEON intrinsic yielding two <8 x i16> vectors,
// returned by value on A64 and through the sret pointer on A32.
847	poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
848	return vld1q_p16_x2(a);
849	}
850
851	// CHECK-LABEL: @test_vld1q_p16_x3(
852	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
853	// CHECK-A32: %struct.poly16x8x3_t* noalias sret [[RETVAL:%.*]],
854	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16|8}}
855	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
856	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
857	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
858	// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i16.p0i16(i16* [[TMP2]])
859	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
860	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
861	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
862	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
863	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 48, i1 false)
864	// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
865	// CHECK-A64: ret %struct.poly16x8x3_t [[TMP6]]
866	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_p16_x3 and returns its
// poly16x8x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 / arm vld1x3 NEON intrinsic yielding three <8 x i16> vectors,
// returned by value on A64 and through the sret pointer on A32.
867	poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
868	return vld1q_p16_x3(a);
869	}
870
871	// CHECK-LABEL: @test_vld1q_p16_x4(
872	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
873	// CHECK-A32: %struct.poly16x8x4_t* noalias sret [[RETVAL:%.*]],
874	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16|8}}
875	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
876	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
877	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
878	// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i16.p0i16(i16* [[TMP2]])
879	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
880	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
881	// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
882	// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
883	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 64, i1 false)
884	// CHECK-A64: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
885	// CHECK-A64: ret %struct.poly16x8x4_t [[TMP6]]
886	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_p16_x4 and returns its
// poly16x8x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 / arm vld1x4 NEON intrinsic yielding four <8 x i16> vectors,
// returned by value on A64 and through the sret pointer on A32.
887	poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
888	return vld1q_p16_x4(a);
889	}
890
891	// CHECK-LABEL: @test_vld1q_p8_x2(
892	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
893	// CHECK-A32: %struct.poly8x16x2_t* noalias sret [[RETVAL:%.*]],
894	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16|8}}
895	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
896	// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a)
897	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
898	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
899	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
900	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
901	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 32, i1 false)
902	// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
903	// CHECK-A64: ret %struct.poly8x16x2_t [[TMP4]]
904	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_p8_x2 and returns its
// poly8x16x2_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x2 / arm vld1x2 NEON intrinsic yielding two <16 x i8> vectors,
// returned by value on A64 and through the sret pointer on A32.
905	poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
906	return vld1q_p8_x2(a);
907	}
908
909	// CHECK-LABEL: @test_vld1q_p8_x3(
910	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
911	// CHECK-A32: %struct.poly8x16x3_t* noalias sret [[RETVAL:%.*]],
912	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16|8}}
913	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
914	// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a)
915	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
916	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
917	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
918	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
919	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 48, i1 false)
920	// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
921	// CHECK-A64: ret %struct.poly8x16x3_t [[TMP4]]
922	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_p8_x3 and returns its
// poly8x16x3_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x3 / arm vld1x3 NEON intrinsic yielding three <16 x i8> vectors,
// returned by value on A64 and through the sret pointer on A32.
923	poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
924	return vld1q_p8_x3(a);
925	}
926
927	// CHECK-LABEL: @test_vld1q_p8_x4(
928	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
929	// CHECK-A32: %struct.poly8x16x4_t* noalias sret [[RETVAL:%.*]],
930	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16|8}}
931	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
932	// CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a)
933	// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
934	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
935	// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
936	// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
937	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP2]], i8* align {{16|8}} [[TMP3]], {{i64|i32}} 64, i1 false)
938	// CHECK-A64: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
939	// CHECK-A64: ret %struct.poly8x16x4_t [[TMP4]]
940	// CHECK-A32: ret void
// Test wrapper: hands `a` straight to vld1q_p8_x4 and returns its
// poly8x16x4_t result. The FileCheck directives above expect a call to the
// aarch64 ld1x4 / arm vld1x4 NEON intrinsic yielding four <16 x i8> vectors,
// returned by value on A64 and through the sret pointer on A32.
941	poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
942	return vld1q_p8_x4(a);
943	}
944
945	// CHECK-LABEL: @test_vld1q_s16_x2(
946	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
947	// CHECK-A32: %struct.int16x8x2_t* noalias sret [[RETVAL:%.*]],
948	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16|8}}
949	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
950	// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*
951	// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
952	// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i16.p0i16(i16* [[TMP2]])
953	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
954	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
955	// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
956	// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
957	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align {{16|8}} [[TMP4]], i8* align {{16|8}} [[TMP5]], {{i64|i32}} 32, i1 false)
958	// CHECK-A64: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
959	// CHECK-A64: ret %struct.int16x8x2_t [[TMP6]]
960	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s16_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v8i16.p0i16 suffix, then a 32-byte
// memcpy of the result.
961	int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
962	return vld1q_s16_x2(a);
963	}
964
965	// CHECK-LABEL: @test_vld1q_s16_x3(
966	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
967	// CHECK-A32: %struct.int16x8x3_t* noalias sret [[RETVAL:%.*]],
968	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16\|8}}
969	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x8x3_t [[__RET]] to i8*
970	// CHECK: [[TMP1:%.]] = bitcast i16 %a to i8*
971	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
972	// CHECK: [[VLD1XN:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v8i16.p0i16(i16 [[TMP2]])
973	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
974	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
975	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x8x3_t [[RETVAL]] to i8*
976	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x8x3_t [[__RET]] to i8*
977	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
978	// CHECK-A64: [[TMP6:%.]] = load %struct.int16x8x3_t, %struct.int16x8x3_t [[RETVAL]], align 16
979	// CHECK-A64: ret %struct.int16x8x3_t [[TMP6]]
980	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s16_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v8i16.p0i16 suffix, then a 48-byte
// memcpy of the result.
981	int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
982	return vld1q_s16_x3(a);
983	}
984
985	// CHECK-LABEL: @test_vld1q_s16_x4(
986	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
987	// CHECK-A32: %struct.int16x8x4_t* noalias sret [[RETVAL:%.*]],
988	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16\|8}}
989	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x8x4_t [[__RET]] to i8*
990	// CHECK: [[TMP1:%.]] = bitcast i16 %a to i8*
991	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
992	// CHECK: [[VLD1XN:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v8i16.p0i16(i16 [[TMP2]])
993	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
994	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
995	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x8x4_t [[RETVAL]] to i8*
996	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x8x4_t [[__RET]] to i8*
997	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
998	// CHECK-A64: [[TMP6:%.]] = load %struct.int16x8x4_t, %struct.int16x8x4_t [[RETVAL]], align 16
999	// CHECK-A64: ret %struct.int16x8x4_t [[TMP6]]
1000	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s16_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v8i16.p0i16 suffix, then a 64-byte
// memcpy of the result.
1001	int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
1002	return vld1q_s16_x4(a);
1003	}
1004
1005	// CHECK-LABEL: @test_vld1q_s32_x2(
1006	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
1007	// CHECK-A32: %struct.int32x4x2_t* noalias sret [[RETVAL:%.*]],
1008	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16\|8}}
1009	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x4x2_t [[__RET]] to i8*
1010	// CHECK: [[TMP1:%.]] = bitcast i32 %a to i8*
1011	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1012	// CHECK: [[VLD1XN:%.]] = call { <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v4i32.p0i32(i32 [[TMP2]])
1013	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32> }*
1014	// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
1015	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x4x2_t [[RETVAL]] to i8*
1016	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x4x2_t [[__RET]] to i8*
1017	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
1018	// CHECK-A64: [[TMP6:%.]] = load %struct.int32x4x2_t, %struct.int32x4x2_t [[RETVAL]], align 16
1019	// CHECK-A64: ret %struct.int32x4x2_t [[TMP6]]
1020	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s32_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v4i32.p0i32 suffix, then a 32-byte
// memcpy of the result.
1021	int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
1022	return vld1q_s32_x2(a);
1023	}
1024
1025	// CHECK-LABEL: @test_vld1q_s32_x3(
1026	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
1027	// CHECK-A32: %struct.int32x4x3_t* noalias sret [[RETVAL:%.*]],
1028	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16\|8}}
1029	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x4x3_t [[__RET]] to i8*
1030	// CHECK: [[TMP1:%.]] = bitcast i32 %a to i8*
1031	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1032	// CHECK: [[VLD1XN:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v4i32.p0i32(i32 [[TMP2]])
1033	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
1034	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
1035	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x4x3_t [[RETVAL]] to i8*
1036	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x4x3_t [[__RET]] to i8*
1037	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
1038	// CHECK-A64: [[TMP6:%.]] = load %struct.int32x4x3_t, %struct.int32x4x3_t [[RETVAL]], align 16
1039	// CHECK-A64: ret %struct.int32x4x3_t [[TMP6]]
1040	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s32_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v4i32.p0i32 suffix, then a 48-byte
// memcpy of the result.
1041	int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
1042	return vld1q_s32_x3(a);
1043	}
1044
1045	// CHECK-LABEL: @test_vld1q_s32_x4(
1046	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
1047	// CHECK-A32: %struct.int32x4x4_t* noalias sret [[RETVAL:%.*]],
1048	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16\|8}}
1049	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x4x4_t [[__RET]] to i8*
1050	// CHECK: [[TMP1:%.]] = bitcast i32 %a to i8*
1051	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1052	// CHECK: [[VLD1XN:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v4i32.p0i32(i32 [[TMP2]])
1053	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
1054	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
1055	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x4x4_t [[RETVAL]] to i8*
1056	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x4x4_t [[__RET]] to i8*
1057	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
1058	// CHECK-A64: [[TMP6:%.]] = load %struct.int32x4x4_t, %struct.int32x4x4_t [[RETVAL]], align 16
1059	// CHECK-A64: ret %struct.int32x4x4_t [[TMP6]]
1060	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s32_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v4i32.p0i32 suffix, then a 64-byte
// memcpy of the result.
1061	int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
1062	return vld1q_s32_x4(a);
1063	}
1064
1065	// CHECK-LABEL: @test_vld1q_s64_x2(
1066	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
1067	// CHECK-A32: %struct.int64x2x2_t* noalias sret [[RETVAL:%.*]],
1068	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align {{16\|8}}
1069	// CHECK: [[TMP0:%.]] = bitcast %struct.int64x2x2_t [[__RET]] to i8*
1070	// CHECK: [[TMP1:%.]] = bitcast i64 %a to i8*
1071	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1072	// CHECK: [[VLD1XN:%.]] = call { <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v2i64.p0i64(i64 [[TMP2]])
1073	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i64>, <2 x i64> }*
1074	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
1075	// CHECK: [[TMP4:%.]] = bitcast %struct.int64x2x2_t [[RETVAL]] to i8*
1076	// CHECK: [[TMP5:%.]] = bitcast %struct.int64x2x2_t [[__RET]] to i8*
1077	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
1078	// CHECK-A64: [[TMP6:%.]] = load %struct.int64x2x2_t, %struct.int64x2x2_t [[RETVAL]], align 16
1079	// CHECK-A64: ret %struct.int64x2x2_t [[TMP6]]
1080	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s64_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v2i64.p0i64 suffix, then a 32-byte
// memcpy of the result.
1081	int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
1082	return vld1q_s64_x2(a);
1083	}
1084
1085	// CHECK-LABEL: @test_vld1q_s64_x3(
1086	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
1087	// CHECK-A32: %struct.int64x2x3_t* noalias sret [[RETVAL:%.*]],
1088	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align {{16\|8}}
1089	// CHECK: [[TMP0:%.]] = bitcast %struct.int64x2x3_t [[__RET]] to i8*
1090	// CHECK: [[TMP1:%.]] = bitcast i64 %a to i8*
1091	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1092	// CHECK: [[VLD1XN:%.]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v2i64.p0i64(i64 [[TMP2]])
1093	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
1094	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
1095	// CHECK: [[TMP4:%.]] = bitcast %struct.int64x2x3_t [[RETVAL]] to i8*
1096	// CHECK: [[TMP5:%.]] = bitcast %struct.int64x2x3_t [[__RET]] to i8*
1097	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
1098	// CHECK-A64: [[TMP6:%.]] = load %struct.int64x2x3_t, %struct.int64x2x3_t [[RETVAL]], align 16
1099	// CHECK-A64: ret %struct.int64x2x3_t [[TMP6]]
1100	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s64_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v2i64.p0i64 suffix, then a 48-byte
// memcpy of the result.
1101	int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
1102	return vld1q_s64_x3(a);
1103	}
1104
1105	// CHECK-LABEL: @test_vld1q_s64_x4(
1106	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
1107	// CHECK-A32: %struct.int64x2x4_t* noalias sret [[RETVAL:%.*]],
1108	// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align {{16\|8}}
1109	// CHECK: [[TMP0:%.]] = bitcast %struct.int64x2x4_t [[__RET]] to i8*
1110	// CHECK: [[TMP1:%.]] = bitcast i64 %a to i8*
1111	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1112	// CHECK: [[VLD1XN:%.]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v2i64.p0i64(i64 [[TMP2]])
1113	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
1114	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
1115	// CHECK: [[TMP4:%.]] = bitcast %struct.int64x2x4_t [[RETVAL]] to i8*
1116	// CHECK: [[TMP5:%.]] = bitcast %struct.int64x2x4_t [[__RET]] to i8*
1117	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
1118	// CHECK-A64: [[TMP6:%.]] = load %struct.int64x2x4_t, %struct.int64x2x4_t [[RETVAL]], align 16
1119	// CHECK-A64: ret %struct.int64x2x4_t [[TMP6]]
1120	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s64_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v2i64.p0i64 suffix, then a 64-byte
// memcpy of the result.
1121	int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
1122	return vld1q_s64_x4(a);
1123	}
1124
1125	// CHECK-LABEL: @test_vld1q_s8_x2(
1126	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
1127	// CHECK-A32: %struct.int8x16x2_t* noalias sret [[RETVAL:%.*]],
1128	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16\|8}}
1129	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x16x2_t [[__RET]] to i8*
1130	// CHECK: [[VLD1XN:%.]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v16i8.p0i8(i8 %a)
1131	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8> }*
1132	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
1133	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x16x2_t [[RETVAL]] to i8*
1134	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x16x2_t [[__RET]] to i8*
1135	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 32, i1 false)
1136	// CHECK-A64: [[TMP4:%.]] = load %struct.int8x16x2_t, %struct.int8x16x2_t [[RETVAL]], align 16
1137	// CHECK-A64: ret %struct.int8x16x2_t [[TMP4]]
1138	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s8_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v16i8.p0i8 suffix, taking %a with no
// intermediate bitcast, then a 32-byte memcpy of the result.
1139	int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
1140	return vld1q_s8_x2(a);
1141	}
1142
1143	// CHECK-LABEL: @test_vld1q_s8_x3(
1144	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
1145	// CHECK-A32: %struct.int8x16x3_t* noalias sret [[RETVAL:%.*]],
1146	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16\|8}}
1147	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x16x3_t [[__RET]] to i8*
1148	// CHECK: [[VLD1XN:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v16i8.p0i8(i8 %a)
1149	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
1150	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
1151	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x16x3_t [[RETVAL]] to i8*
1152	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x16x3_t [[__RET]] to i8*
1153	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 48, i1 false)
1154	// CHECK-A64: [[TMP4:%.]] = load %struct.int8x16x3_t, %struct.int8x16x3_t [[RETVAL]], align 16
1155	// CHECK-A64: ret %struct.int8x16x3_t [[TMP4]]
1156	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s8_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v16i8.p0i8 suffix, taking %a with no
// intermediate bitcast, then a 48-byte memcpy of the result.
1157	int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
1158	return vld1q_s8_x3(a);
1159	}
1160
1161	// CHECK-LABEL: @test_vld1q_s8_x4(
1162	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
1163	// CHECK-A32: %struct.int8x16x4_t* noalias sret [[RETVAL:%.*]],
1164	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16\|8}}
1165	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x16x4_t [[__RET]] to i8*
1166	// CHECK: [[VLD1XN:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v16i8.p0i8(i8 %a)
1167	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
1168	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
1169	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x16x4_t [[RETVAL]] to i8*
1170	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x16x4_t [[__RET]] to i8*
1171	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 64, i1 false)
1172	// CHECK-A64: [[TMP4:%.]] = load %struct.int8x16x4_t, %struct.int8x16x4_t [[RETVAL]], align 16
1173	// CHECK-A64: ret %struct.int8x16x4_t [[TMP4]]
1174	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_s8_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v16i8.p0i8 suffix, taking %a with no
// intermediate bitcast, then a 64-byte memcpy of the result.
1175	int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
1176	return vld1q_s8_x4(a);
1177	}
1178
1179	// CHECK-LABEL: @test_vld1q_u16_x2(
1180	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
1181	// CHECK-A32: %struct.uint16x8x2_t* noalias sret [[RETVAL:%.*]],
1182	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16\|8}}
1183	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x8x2_t [[__RET]] to i8*
1184	// CHECK: [[TMP1:%.]] = bitcast i16 %a to i8*
1185	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1186	// CHECK: [[VLD1XN:%.]] = call { <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v8i16.p0i16(i16 [[TMP2]])
1187	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16> }*
1188	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
1189	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x8x2_t [[RETVAL]] to i8*
1190	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x8x2_t [[__RET]] to i8*
1191	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
1192	// CHECK-A64: [[TMP6:%.]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t [[RETVAL]], align 16
1193	// CHECK-A64: ret %struct.uint16x8x2_t [[TMP6]]
1194	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u16_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v8i16.p0i16 suffix, then a 32-byte
// memcpy of the result.
1195	uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
1196	return vld1q_u16_x2(a);
1197	}
1198
1199	// CHECK-LABEL: @test_vld1q_u16_x3(
1200	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
1201	// CHECK-A32: %struct.uint16x8x3_t* noalias sret [[RETVAL:%.*]],
1202	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16\|8}}
1203	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x8x3_t [[__RET]] to i8*
1204	// CHECK: [[TMP1:%.]] = bitcast i16 %a to i8*
1205	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1206	// CHECK: [[VLD1XN:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v8i16.p0i16(i16 [[TMP2]])
1207	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
1208	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
1209	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x8x3_t [[RETVAL]] to i8*
1210	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x8x3_t [[__RET]] to i8*
1211	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
1212	// CHECK-A64: [[TMP6:%.]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t [[RETVAL]], align 16
1213	// CHECK-A64: ret %struct.uint16x8x3_t [[TMP6]]
1214	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u16_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v8i16.p0i16 suffix, then a 48-byte
// memcpy of the result.
1215	uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
1216	return vld1q_u16_x3(a);
1217	}
1218
1219	// CHECK-LABEL: @test_vld1q_u16_x4(
1220	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
1221	// CHECK-A32: %struct.uint16x8x4_t* noalias sret [[RETVAL:%.*]],
1222	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16\|8}}
1223	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x8x4_t [[__RET]] to i8*
1224	// CHECK: [[TMP1:%.]] = bitcast i16 %a to i8*
1225	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1226	// CHECK: [[VLD1XN:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v8i16.p0i16(i16 [[TMP2]])
1227	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
1228	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
1229	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x8x4_t [[RETVAL]] to i8*
1230	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x8x4_t [[__RET]] to i8*
1231	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
1232	// CHECK-A64: [[TMP6:%.]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t [[RETVAL]], align 16
1233	// CHECK-A64: ret %struct.uint16x8x4_t [[TMP6]]
1234	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u16_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v8i16.p0i16 suffix, then a 64-byte
// memcpy of the result.
1235	uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
1236	return vld1q_u16_x4(a);
1237	}
1238
1239	// CHECK-LABEL: @test_vld1q_u32_x2(
1240	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
1241	// CHECK-A32: %struct.uint32x4x2_t* noalias sret [[RETVAL:%.*]],
1242	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16\|8}}
1243	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x4x2_t [[__RET]] to i8*
1244	// CHECK: [[TMP1:%.]] = bitcast i32 %a to i8*
1245	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1246	// CHECK: [[VLD1XN:%.]] = call { <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v4i32.p0i32(i32 [[TMP2]])
1247	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32> }*
1248	// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
1249	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x4x2_t [[RETVAL]] to i8*
1250	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x4x2_t [[__RET]] to i8*
1251	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
1252	// CHECK-A64: [[TMP6:%.]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t [[RETVAL]], align 16
1253	// CHECK-A64: ret %struct.uint32x4x2_t [[TMP6]]
1254	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u32_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v4i32.p0i32 suffix, then a 32-byte
// memcpy of the result.
1255	uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
1256	return vld1q_u32_x2(a);
1257	}
1258
1259	// CHECK-LABEL: @test_vld1q_u32_x3(
1260	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
1261	// CHECK-A32: %struct.uint32x4x3_t* noalias sret [[RETVAL:%.*]],
1262	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16\|8}}
1263	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x4x3_t [[__RET]] to i8*
1264	// CHECK: [[TMP1:%.]] = bitcast i32 %a to i8*
1265	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1266	// CHECK: [[VLD1XN:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v4i32.p0i32(i32 [[TMP2]])
1267	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
1268	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
1269	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x4x3_t [[RETVAL]] to i8*
1270	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x4x3_t [[__RET]] to i8*
1271	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
1272	// CHECK-A64: [[TMP6:%.]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t [[RETVAL]], align 16
1273	// CHECK-A64: ret %struct.uint32x4x3_t [[TMP6]]
1274	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u32_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v4i32.p0i32 suffix, then a 48-byte
// memcpy of the result.
1275	uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
1276	return vld1q_u32_x3(a);
1277	}
1278
1279	// CHECK-LABEL: @test_vld1q_u32_x4(
1280	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
1281	// CHECK-A32: %struct.uint32x4x4_t* noalias sret [[RETVAL:%.*]],
1282	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16\|8}}
1283	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x4x4_t [[__RET]] to i8*
1284	// CHECK: [[TMP1:%.]] = bitcast i32 %a to i8*
1285	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1286	// CHECK: [[VLD1XN:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v4i32.p0i32(i32 [[TMP2]])
1287	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
1288	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
1289	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x4x4_t [[RETVAL]] to i8*
1290	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x4x4_t [[__RET]] to i8*
1291	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
1292	// CHECK-A64: [[TMP6:%.]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t [[RETVAL]], align 16
1293	// CHECK-A64: ret %struct.uint32x4x4_t [[TMP6]]
1294	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u32_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v4i32.p0i32 suffix, then a 64-byte
// memcpy of the result.
1295	uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
1296	return vld1q_u32_x4(a);
1297	}
1298
1299	// CHECK-LABEL: @test_vld1q_u64_x2(
1300	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
1301	// CHECK-A32: %struct.uint64x2x2_t* noalias sret [[RETVAL:%.*]],
1302	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align {{16\|8}}
1303	// CHECK: [[TMP0:%.]] = bitcast %struct.uint64x2x2_t [[__RET]] to i8*
1304	// CHECK: [[TMP1:%.]] = bitcast i64 %a to i8*
1305	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1306	// CHECK: [[VLD1XN:%.]] = call { <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v2i64.p0i64(i64 [[TMP2]])
1307	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i64>, <2 x i64> }*
1308	// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
1309	// CHECK: [[TMP4:%.]] = bitcast %struct.uint64x2x2_t [[RETVAL]] to i8*
1310	// CHECK: [[TMP5:%.]] = bitcast %struct.uint64x2x2_t [[__RET]] to i8*
1311	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
1312	// CHECK-A64: [[TMP6:%.]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t [[RETVAL]], align 16
1313	// CHECK-A64: ret %struct.uint64x2x2_t [[TMP6]]
1314	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u64_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v2i64.p0i64 suffix, then a 32-byte
// memcpy of the result.
1315	uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
1316	return vld1q_u64_x2(a);
1317	}
1318
1319	// CHECK-LABEL: @test_vld1q_u64_x3(
1320	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
1321	// CHECK-A32: %struct.uint64x2x3_t* noalias sret [[RETVAL:%.*]],
1322	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align {{16\|8}}
1323	// CHECK: [[TMP0:%.]] = bitcast %struct.uint64x2x3_t [[__RET]] to i8*
1324	// CHECK: [[TMP1:%.]] = bitcast i64 %a to i8*
1325	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1326	// CHECK: [[VLD1XN:%.]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v2i64.p0i64(i64 [[TMP2]])
1327	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
1328	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
1329	// CHECK: [[TMP4:%.]] = bitcast %struct.uint64x2x3_t [[RETVAL]] to i8*
1330	// CHECK: [[TMP5:%.]] = bitcast %struct.uint64x2x3_t [[__RET]] to i8*
1331	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
1332	// CHECK-A64: [[TMP6:%.]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t [[RETVAL]], align 16
1333	// CHECK-A64: ret %struct.uint64x2x3_t [[TMP6]]
1334	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u64_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v2i64.p0i64 suffix, then a 48-byte
// memcpy of the result.
1335	uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
1336	return vld1q_u64_x3(a);
1337	}
1338
1339	// CHECK-LABEL: @test_vld1q_u64_x4(
1340	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
1341	// CHECK-A32: %struct.uint64x2x4_t* noalias sret [[RETVAL:%.*]],
1342	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align {{16\|8}}
1343	// CHECK: [[TMP0:%.]] = bitcast %struct.uint64x2x4_t [[__RET]] to i8*
1344	// CHECK: [[TMP1:%.]] = bitcast i64 %a to i8*
1345	// CHECK: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1346	// CHECK: [[VLD1XN:%.]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v2i64.p0i64(i64 [[TMP2]])
1347	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
1348	// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
1349	// CHECK: [[TMP4:%.]] = bitcast %struct.uint64x2x4_t [[RETVAL]] to i8*
1350	// CHECK: [[TMP5:%.]] = bitcast %struct.uint64x2x4_t [[__RET]] to i8*
1351	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
1352	// CHECK-A64: [[TMP6:%.]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t [[RETVAL]], align 16
1353	// CHECK-A64: ret %struct.uint64x2x4_t [[TMP6]]
1354	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u64_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v2i64.p0i64 suffix, then a 64-byte
// memcpy of the result.
1355	uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
1356	return vld1q_u64_x4(a);
1357	}
1358
1359	// CHECK-LABEL: @test_vld1q_u8_x2(
1360	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
1361	// CHECK-A32: %struct.uint8x16x2_t* noalias sret [[RETVAL:%.*]],
1362	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16\|8}}
1363	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x16x2_t [[__RET]] to i8*
1364	// CHECK: [[VLD1XN:%.]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2\|arm.neon.vld1x2}}.v16i8.p0i8(i8 %a)
1365	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8> }*
1366	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
1367	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x16x2_t [[RETVAL]] to i8*
1368	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x16x2_t [[__RET]] to i8*
1369	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 32, i1 false)
1370	// CHECK-A64: [[TMP4:%.]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t [[RETVAL]], align 16
1371	// CHECK-A64: ret %struct.uint8x16x2_t [[TMP4]]
1372	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u8_x2(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x2 or
// arm.neon.vld1x2 intrinsic with the .v16i8.p0i8 suffix, taking %a with no
// intermediate bitcast, then a 32-byte memcpy of the result.
1373	uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
1374	return vld1q_u8_x2(a);
1375	}
1376
1377	// CHECK-LABEL: @test_vld1q_u8_x3(
1378	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
1379	// CHECK-A32: %struct.uint8x16x3_t* noalias sret [[RETVAL:%.*]],
1380	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16\|8}}
1381	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x16x3_t [[__RET]] to i8*
1382	// CHECK: [[VLD1XN:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3\|arm.neon.vld1x3}}.v16i8.p0i8(i8 %a)
1383	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
1384	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
1385	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x16x3_t [[RETVAL]] to i8*
1386	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x16x3_t [[__RET]] to i8*
1387	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 48, i1 false)
1388	// CHECK-A64: [[TMP4:%.]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t [[RETVAL]], align 16
1389	// CHECK-A64: ret %struct.uint8x16x3_t [[TMP4]]
1390	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u8_x3(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x3 or
// arm.neon.vld1x3 intrinsic with the .v16i8.p0i8 suffix, taking %a with no
// intermediate bitcast, then a 48-byte memcpy of the result.
1391	uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
1392	return vld1q_u8_x3(a);
1393	}
1394
1395	// CHECK-LABEL: @test_vld1q_u8_x4(
1396	// CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
1397	// CHECK-A32: %struct.uint8x16x4_t* noalias sret [[RETVAL:%.*]],
1398	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16\|8}}
1399	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x16x4_t [[__RET]] to i8*
1400	// CHECK: [[VLD1XN:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4\|arm.neon.vld1x4}}.v16i8.p0i8(i8 %a)
1401	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
1402	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
1403	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x16x4_t [[RETVAL]] to i8*
1404	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x16x4_t [[__RET]] to i8*
1405	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 64, i1 false)
1406	// CHECK-A64: [[TMP4:%.]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t [[RETVAL]], align 16
1407	// CHECK-A64: ret %struct.uint8x16x4_t [[TMP4]]
1408	// CHECK-A32: ret void
// Wrapper under test: returns vld1q_u8_x4(a) directly. The FileCheck
// directives above expect a call to the aarch64.neon.ld1x4 or
// arm.neon.vld1x4 intrinsic with the .v16i8.p0i8 suffix, taking %a with no
// intermediate bitcast, then a 64-byte memcpy of the result.
1409	uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
1410	return vld1q_u8_x4(a);
1411	}
1412
1413	// CHECK-LABEL: @test_vld2_dup_f16(
1414	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
1415	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
1416	// CHECK: [[TMP1:%.*]] = bitcast half* %src to i8*
1417	// CHECK-A64: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half*
1418	// CHECK-A64: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* [[TMP2]])
1419	// CHECK-A32: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* [[TMP1]], i32 2)
1420	// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]> }*
1421	// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD2]], { <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
1422	// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* %dest to i8*
1423	// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
1424	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64|i32}} 16, i1 false)
1425	// CHECK: ret void
1426	void test_vld2_dup_f16(float16x4x2_t *dest, const float16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1427	*dest = vld2_dup_f16(src); // store the 2-vector result through dest
1428	}
1429
1430	// CHECK-LABEL: @test_vld2_dup_f32(
1431	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
1432	// CHECK: [[TMP0:%.]] = bitcast %struct.float32x2x2_t [[__RET]] to i8*
1433	// CHECK: [[TMP1:%.]] = bitcast float %src to i8*
1434	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to float*
1435	// CHECK-A64: [[VLD2:%.]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float [[TMP2]])
1436	// CHECK-A32: [[VLD2:%.]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2dup.v2f32.p0i8(i8 [[TMP1]], i32 4)
1437	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x float>, <2 x float> }*
1438	// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
1439	// CHECK: [[TMP4:%.]] = bitcast %struct.float32x2x2_t %dest to i8*
1440	// CHECK: [[TMP5:%.]] = bitcast %struct.float32x2x2_t [[__RET]] to i8*
1441	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1442	// CHECK: ret void
1443	void test_vld2_dup_f32(float32x2x2_t *dest, const float32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1444	*dest = vld2_dup_f32(src); // store the 2-vector result through dest
1445	}
1446
1447	// CHECK-LABEL: @test_vld2_dup_p16(
1448	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
1449	// CHECK: [[TMP0:%.]] = bitcast %struct.poly16x4x2_t [[__RET]] to i8*
1450	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1451	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1452	// CHECK-A64: [[VLD2:%.]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16 [[TMP2]])
1453	// CHECK-A32: [[VLD2:%.]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1454	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16> }*
1455	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
1456	// CHECK: [[TMP4:%.]] = bitcast %struct.poly16x4x2_t %dest to i8*
1457	// CHECK: [[TMP5:%.]] = bitcast %struct.poly16x4x2_t [[__RET]] to i8*
1458	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1459	// CHECK: ret void
1460	void test_vld2_dup_p16(poly16x4x2_t *dest, const poly16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1461	*dest = vld2_dup_p16(src); // store the 2-vector result through dest
1462	}
1463
1464	// CHECK-LABEL: @test_vld2_dup_p8(
1465	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
1466	// CHECK: [[TMP0:%.]] = bitcast %struct.poly8x8x2_t [[__RET]] to i8*
1467	// CHECK-A64: [[VLD2:%.]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8 %src)
1468	// CHECK-A32: [[VLD2:%.]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8 %src, i32 1)
1469	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8> }*
1470	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
1471	// CHECK: [[TMP2:%.]] = bitcast %struct.poly8x8x2_t %dest to i8*
1472	// CHECK: [[TMP3:%.]] = bitcast %struct.poly8x8x2_t [[__RET]] to i8*
1473	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 16, i1 false)
1474	// CHECK: ret void
1475	void test_vld2_dup_p8(poly8x8x2_t *dest, poly8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1476	*dest = vld2_dup_p8(src); // store the 2-vector result through dest
1477	}
1478
1479	// CHECK-LABEL: @test_vld2_dup_s16(
1480	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
1481	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x4x2_t [[__RET]] to i8*
1482	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1483	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1484	// CHECK-A64: [[VLD2:%.]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16 [[TMP2]])
1485	// CHECK-A32: [[VLD2:%.]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1486	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16> }*
1487	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
1488	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x4x2_t %dest to i8*
1489	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x4x2_t [[__RET]] to i8*
1490	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1491	// CHECK: ret void
1492	void test_vld2_dup_s16(int16x4x2_t *dest, const int16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1493	*dest = vld2_dup_s16(src); // store the 2-vector result through dest
1494	}
1495
1496	// CHECK-LABEL: @test_vld2_dup_s32(
1497	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
1498	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x2x2_t [[__RET]] to i8*
1499	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
1500	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1501	// CHECK-A64: [[VLD2:%.]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32 [[TMP2]])
1502	// CHECK-A32: [[VLD2:%.]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2dup.v2i32.p0i8(i8 [[TMP1]], i32 4)
1503	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i32>, <2 x i32> }*
1504	// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
1505	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x2x2_t %dest to i8*
1506	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x2x2_t [[__RET]] to i8*
1507	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1508	// CHECK: ret void
1509	void test_vld2_dup_s32(int32x2x2_t *dest, const int32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1510	*dest = vld2_dup_s32(src); // store the 2-vector result through dest
1511	}
1512
1513	// CHECK-LABEL: @test_vld2_dup_s8(
1514	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
1515	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x8x2_t [[__RET]] to i8*
1516	// CHECK-A64: [[VLD2:%.]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8 %src)
1517	// CHECK-A32: [[VLD2:%.]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8 %src, i32 1)
1518	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8> }*
1519	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
1520	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x8x2_t %dest to i8*
1521	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x8x2_t [[__RET]] to i8*
1522	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 16, i1 false)
1523	// CHECK: ret void
1524	void test_vld2_dup_s8(int8x8x2_t *dest, int8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1525	*dest = vld2_dup_s8(src); // store the 2-vector result through dest
1526	}
1527
1528	// CHECK-LABEL: @test_vld2_dup_u16(
1529	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
1530	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x4x2_t [[__RET]] to i8*
1531	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1532	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1533	// CHECK-A64: [[VLD2:%.]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16 [[TMP2]])
1534	// CHECK-A32: [[VLD2:%.]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1535	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16> }*
1536	// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
1537	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x4x2_t %dest to i8*
1538	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x4x2_t [[__RET]] to i8*
1539	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1540	// CHECK: ret void
1541	void test_vld2_dup_u16(uint16x4x2_t *dest, const uint16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1542	*dest = vld2_dup_u16(src); // store the 2-vector result through dest
1543	}
1544
1545	// CHECK-LABEL: @test_vld2_dup_u32(
1546	// CHECK: entry:
1547	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
1548	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x2x2_t [[__RET]] to i8*
1549	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
1550	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1551	// CHECK-A64: [[VLD2:%.]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32 [[TMP2]])
1552	// CHECK-A32: [[VLD2:%.]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2dup.v2i32.p0i8(i8 [[TMP1]], i32 4)
1553	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i32>, <2 x i32> }*
1554	// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
1555	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x2x2_t %dest to i8*
1556	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x2x2_t [[__RET]] to i8*
1557	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1558	// CHECK: ret void
1559	void test_vld2_dup_u32(uint32x2x2_t *dest, const uint32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1560	*dest = vld2_dup_u32(src); // store the 2-vector result through dest
1561	}
1562
1563	// CHECK-LABEL: @test_vld2_dup_s64(
1564	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
1565	// CHECK: [[TMP0:%.]] = bitcast %struct.int64x1x2_t [[__RET]] to i8*
1566	// CHECK: [[TMP1:%.]] = bitcast i64 %src to i8*
1567	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1568	// CHECK-A64: [[VLD2:%.]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64 [[TMP2]])
1569	// CHECK-A32: [[VLD2:%.]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2dup.v1i64.p0i8(i8 [[TMP1]], i32 8)
1570	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <1 x i64>, <1 x i64> }*
1571	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
1572	// CHECK: [[TMP4:%.]] = bitcast %struct.int64x1x2_t %dest to i8*
1573	// CHECK: [[TMP5:%.]] = bitcast %struct.int64x1x2_t [[__RET]] to i8*
1574	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1575	// CHECK: ret void
1576	void test_vld2_dup_s64(int64x1x2_t *dest, const int64_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1577	*dest = vld2_dup_s64(src); // store the 2-vector result through dest
1578	}
1579
1580	// CHECK-LABEL: @test_vld2_dup_u64(
1581	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
1582	// CHECK: [[TMP0:%.]] = bitcast %struct.uint64x1x2_t [[__RET]] to i8*
1583	// CHECK: [[TMP1:%.]] = bitcast i64 %src to i8*
1584	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1585	// CHECK-A64: [[VLD2:%.]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64 [[TMP2]])
1586	// CHECK-A32: [[VLD2:%.]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2dup.v1i64.p0i8(i8 [[TMP1]], i32 8)
1587	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <1 x i64>, <1 x i64> }*
1588	// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
1589	// CHECK: [[TMP4:%.]] = bitcast %struct.uint64x1x2_t %dest to i8*
1590	// CHECK: [[TMP5:%.]] = bitcast %struct.uint64x1x2_t [[__RET]] to i8*
1591	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 16, i1 false)
1592	// CHECK: ret void
1593	void test_vld2_dup_u64(uint64x1x2_t *dest, const uint64_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1594	*dest = vld2_dup_u64(src); // store the 2-vector result through dest
1595	}
1596
1597	// CHECK-LABEL: @test_vld2_dup_u8(
1598	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
1599	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x8x2_t [[__RET]] to i8*
1600	// CHECK-A64: [[VLD2:%.]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8 %src)
1601	// CHECK-A32: [[VLD2:%.]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2dup.v8i8.p0i8(i8 %src, i32 1)
1602	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8> }*
1603	// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
1604	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x8x2_t %dest to i8*
1605	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x8x2_t [[__RET]] to i8*
1606	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 16, i1 false)
1607	// CHECK: ret void
1608	void test_vld2_dup_u8(uint8x8x2_t *dest, const uint8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1609	*dest = vld2_dup_u8(src); // store the 2-vector result through dest
1610	}
1611
1612	// CHECK-LABEL: @test_vld3_dup_f16(
1613	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
1614	// CHECK: [[TMP0:%.]] = bitcast %struct.float16x4x3_t [[__RET]] to i8*
1615	// CHECK: [[TMP1:%.]] = bitcast half %src to i8*
1616	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to half*
1617	// CHECK-A64: [[VLD3:%.]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half [[TMP2]])
1618	// CHECK-A32: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1619	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
1620	// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD3]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
1621	// CHECK: [[TMP4:%.]] = bitcast %struct.float16x4x3_t %dest to i8*
1622	// CHECK: [[TMP5:%.]] = bitcast %struct.float16x4x3_t [[__RET]] to i8*
1623	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1624	// CHECK: ret void
1625	void test_vld3_dup_f16(float16x4x3_t *dest, float16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1626	*dest = vld3_dup_f16(src); // store the 3-vector result through dest
1627	}
1628
1629	// CHECK-LABEL: @test_vld3_dup_f32(
1630	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
1631	// CHECK: [[TMP0:%.]] = bitcast %struct.float32x2x3_t [[__RET]] to i8*
1632	// CHECK: [[TMP1:%.]] = bitcast float %src to i8*
1633	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to float*
1634	// CHECK-A64: [[VLD3:%.]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float [[TMP2]])
1635	// CHECK-A32: [[VLD3:%.]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3dup.v2f32.p0i8(i8 [[TMP1]], i32 4)
1636	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
1637	// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
1638	// CHECK: [[TMP4:%.]] = bitcast %struct.float32x2x3_t %dest to i8*
1639	// CHECK: [[TMP5:%.]] = bitcast %struct.float32x2x3_t [[__RET]] to i8*
1640	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1641	// CHECK: ret void
1642	void test_vld3_dup_f32(float32x2x3_t *dest, const float32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1643	*dest = vld3_dup_f32(src); // store the 3-vector result through dest
1644	}
1645
1646	// CHECK-LABEL: @test_vld3_dup_p16(
1647	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
1648	// CHECK: [[TMP0:%.]] = bitcast %struct.poly16x4x3_t [[__RET]] to i8*
1649	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1650	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1651	// CHECK-A64: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16 [[TMP2]])
1652	// CHECK-A32: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1653	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
1654	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
1655	// CHECK: [[TMP4:%.]] = bitcast %struct.poly16x4x3_t %dest to i8*
1656	// CHECK: [[TMP5:%.]] = bitcast %struct.poly16x4x3_t [[__RET]] to i8*
1657	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1658	// CHECK: ret void
1659	void test_vld3_dup_p16(poly16x4x3_t *dest, const poly16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1660	*dest = vld3_dup_p16(src); // store the 3-vector result through dest
1661	}
1662
1663	// CHECK-LABEL: @test_vld3_dup_p8(
1664	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
1665	// CHECK: [[TMP0:%.]] = bitcast %struct.poly8x8x3_t [[__RET]] to i8*
1666	// CHECK-A64: [[VLD3:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8 %src)
1667	// CHECK-A32: [[VLD3:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8 %src, i32 1)
1668	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
1669	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
1670	// CHECK: [[TMP2:%.]] = bitcast %struct.poly8x8x3_t %dest to i8*
1671	// CHECK: [[TMP3:%.]] = bitcast %struct.poly8x8x3_t [[__RET]] to i8*
1672	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 24, i1 false)
1673	// CHECK: ret void
1674	void test_vld3_dup_p8(poly8x8x3_t *dest, const poly8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1675	*dest = vld3_dup_p8(src); // store the 3-vector result through dest
1676	}
1677
1678	// CHECK-LABEL: @test_vld3_dup_s16(
1679	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
1680	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x4x3_t [[__RET]] to i8*
1681	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1682	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1683	// CHECK-A64: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16 [[TMP2]])
1684	// CHECK-A32: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1685	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
1686	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
1687	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x4x3_t %dest to i8*
1688	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x4x3_t [[__RET]] to i8*
1689	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1690	// CHECK: ret void
1691	void test_vld3_dup_s16(int16x4x3_t *dest, const int16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1692	*dest = vld3_dup_s16(src); // store the 3-vector result through dest
1693	}
1694
1695	// CHECK-LABEL: @test_vld3_dup_s32(
1696	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
1697	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x2x3_t [[__RET]] to i8*
1698	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
1699	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1700	// CHECK-A64: [[VLD3:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32 [[TMP2]])
1701	// CHECK-A32: [[VLD3:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3dup.v2i32.p0i8(i8 [[TMP1]], i32 4)
1702	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
1703	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
1704	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x2x3_t %dest to i8*
1705	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x2x3_t [[__RET]] to i8*
1706	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1707	// CHECK: ret void
1708	void test_vld3_dup_s32(int32x2x3_t *dest, const int32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1709	*dest = vld3_dup_s32(src); // store the 3-vector result through dest
1710	}
1711
1712	// CHECK-LABEL: @test_vld3_dup_s8(
1713	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
1714	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x8x3_t [[__RET]] to i8*
1715	// CHECK-A64: [[VLD3:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8 %src)
1716	// CHECK-A32: [[VLD3:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8 %src, i32 1)
1717	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
1718	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
1719	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x8x3_t %dest to i8*
1720	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x8x3_t [[__RET]] to i8*
1721	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 24, i1 false)
1722	// CHECK: ret void
1723	void test_vld3_dup_s8(int8x8x3_t *dest, const int8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1724	*dest = vld3_dup_s8(src); // store the 3-vector result through dest
1725	}
1726
1727	// CHECK-LABEL: @test_vld3_dup_u16(
1728	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
1729	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x4x3_t [[__RET]] to i8*
1730	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1731	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1732	// CHECK-A64: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16 [[TMP2]])
1733	// CHECK-A32: [[VLD3:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1734	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
1735	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
1736	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x4x3_t %dest to i8*
1737	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x4x3_t [[__RET]] to i8*
1738	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1739	// CHECK: ret void
1740	void test_vld3_dup_u16(uint16x4x3_t *dest, const uint16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1741	*dest = vld3_dup_u16(src); // store the 3-vector result through dest
1742	}
1743
1744	// CHECK-LABEL: @test_vld3_dup_u32(
1745	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
1746	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x2x3_t [[__RET]] to i8*
1747	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
1748	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1749	// CHECK-A64: [[VLD3:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32 [[TMP2]])
1750	// CHECK-A32: [[VLD3:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3dup.v2i32.p0i8(i8 [[TMP1]], i32 4)
1751	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
1752	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
1753	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x2x3_t %dest to i8*
1754	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x2x3_t [[__RET]] to i8*
1755	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1756	// CHECK: ret void
1757	void test_vld3_dup_u32(uint32x2x3_t *dest, const uint32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1758	*dest = vld3_dup_u32(src); // store the 3-vector result through dest
1759	}
1760
1761	// CHECK-LABEL: @test_vld3_dup_u8(
1762	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
1763	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x8x3_t [[__RET]] to i8*
1764	// CHECK-A64: [[VLD3:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8 %src)
1765	// CHECK-A32: [[VLD3:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3dup.v8i8.p0i8(i8 %src, i32 1)
1766	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
1767	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
1768	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x8x3_t %dest to i8*
1769	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x8x3_t [[__RET]] to i8*
1770	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 24, i1 false)
1771	// CHECK: ret void
1772	void test_vld3_dup_u8(uint8x8x3_t *dest, const uint8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1773	*dest = vld3_dup_u8(src); // store the 3-vector result through dest
1774	}
1775
1776	// CHECK-LABEL: @test_vld3_dup_s64(
1777	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
1778	// CHECK: [[TMP0:%.]] = bitcast %struct.int64x1x3_t [[__RET]] to i8*
1779	// CHECK: [[TMP1:%.]] = bitcast i64 %src to i8*
1780	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1781	// CHECK-A64: [[VLD3:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64 [[TMP2]])
1782	// CHECK-A32: [[VLD3:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3dup.v1i64.p0i8(i8 [[TMP1]], i32 8)
1783	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
1784	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
1785	// CHECK: [[TMP4:%.]] = bitcast %struct.int64x1x3_t %dest to i8*
1786	// CHECK: [[TMP5:%.]] = bitcast %struct.int64x1x3_t [[__RET]] to i8*
1787	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1788	// CHECK: ret void
1789	void test_vld3_dup_s64(int64x1x3_t *dest, const int64_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1790	*dest = vld3_dup_s64(src); // store the 3-vector result through dest
1791	}
1792
1793	// CHECK-LABEL: @test_vld3_dup_u64(
1794	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
1795	// CHECK: [[TMP0:%.]] = bitcast %struct.uint64x1x3_t [[__RET]] to i8*
1796	// CHECK: [[TMP1:%.]] = bitcast i64 %src to i8*
1797	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1798	// CHECK-A64: [[VLD3:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64 [[TMP2]])
1799	// CHECK-A32: [[VLD3:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3dup.v1i64.p0i8(i8 [[TMP1]], i32 8)
1800	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
1801	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
1802	// CHECK: [[TMP4:%.]] = bitcast %struct.uint64x1x3_t %dest to i8*
1803	// CHECK: [[TMP5:%.]] = bitcast %struct.uint64x1x3_t [[__RET]] to i8*
1804	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 24, i1 false)
1805	// CHECK: ret void
1806	void test_vld3_dup_u64(uint64x1x3_t *dest, const uint64_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1807	*dest = vld3_dup_u64(src); // store the 3-vector result through dest
1808	}
1809
1810	// CHECK-LABEL: @test_vld4_dup_f16(
1811	// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
1812	// CHECK: [[TMP0:%.]] = bitcast %struct.float16x4x4_t [[__RET]] to i8*
1813	// CHECK: [[TMP1:%.]] = bitcast half %src to i8*
1814	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to half*
1815	// CHECK-A64: [[VLD4:%.]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half [[TMP2]])
1816	// CHECK-A32: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1817	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }*
1818	// CHECK: store { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> } [[VLD4]], { <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]>, <4 x [[HALF]]> }* [[TMP3]]
1819	// CHECK: [[TMP4:%.]] = bitcast %struct.float16x4x4_t %dest to i8*
1820	// CHECK: [[TMP5:%.]] = bitcast %struct.float16x4x4_t [[__RET]] to i8*
1821	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1822	// CHECK: ret void
1823	void test_vld4_dup_f16(float16x4x4_t *dest, const float16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1824	*dest = vld4_dup_f16(src); // store the 4-vector result through dest
1825	}
1826
1827	// CHECK-LABEL: @test_vld4_dup_f32(
1828	// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
1829	// CHECK: [[TMP0:%.]] = bitcast %struct.float32x2x4_t [[__RET]] to i8*
1830	// CHECK: [[TMP1:%.]] = bitcast float %src to i8*
1831	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to float*
1832	// CHECK-A64: [[VLD4:%.]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float [[TMP2]])
1833	// CHECK-A32: [[VLD4:%.]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4dup.v2f32.p0i8(i8 [[TMP1]], i32 4)
1834	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
1835	// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
1836	// CHECK: [[TMP4:%.]] = bitcast %struct.float32x2x4_t %dest to i8*
1837	// CHECK: [[TMP5:%.]] = bitcast %struct.float32x2x4_t [[__RET]] to i8*
1838	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1839	// CHECK: ret void
1840	void test_vld4_dup_f32(float32x2x4_t *dest, const float32_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1841	*dest = vld4_dup_f32(src); // store the 4-vector result through dest
1842	}
1843
1844	// CHECK-LABEL: @test_vld4_dup_p16(
1845	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
1846	// CHECK: [[TMP0:%.]] = bitcast %struct.poly16x4x4_t [[__RET]] to i8*
1847	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1848	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1849	// CHECK-A64: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16 [[TMP2]])
1850	// CHECK-A32: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1851	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
1852	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
1853	// CHECK: [[TMP4:%.]] = bitcast %struct.poly16x4x4_t %dest to i8*
1854	// CHECK: [[TMP5:%.]] = bitcast %struct.poly16x4x4_t [[__RET]] to i8*
1855	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1856	// CHECK: ret void
1857	void test_vld4_dup_p16(poly16x4x4_t *dest, const poly16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1858	*dest = vld4_dup_p16(src); // store the 4-vector result through dest
1859	}
1860
1861	// CHECK-LABEL: @test_vld4_dup_p8(
1862	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
1863	// CHECK: [[TMP0:%.]] = bitcast %struct.poly8x8x4_t [[__RET]] to i8*
1864	// CHECK-A64: [[VLD4:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8 %src)
1865	// CHECK-A32: [[VLD4:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8 %src, i32 1)
1866	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
1867	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
1868	// CHECK: [[TMP2:%.]] = bitcast %struct.poly8x8x4_t %dest to i8*
1869	// CHECK: [[TMP3:%.]] = bitcast %struct.poly8x8x4_t [[__RET]] to i8*
1870	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 32, i1 false)
1871	// CHECK: ret void
1872	void test_vld4_dup_p8(poly8x8x4_t *dest, const poly8_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1873	*dest = vld4_dup_p8(src); // store the 4-vector result through dest
1874	}
1875
1876	// CHECK-LABEL: @test_vld4_dup_s16(
1877	// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
1878	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x4x4_t [[__RET]] to i8*
1879	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1880	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1881	// CHECK-A64: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16 [[TMP2]])
1882	// CHECK-A32: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1883	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
1884	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
1885	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x4x4_t %dest to i8*
1886	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x4x4_t [[__RET]] to i8*
1887	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1888	// CHECK: ret void
1889	void test_vld4_dup_s16(int16x4x4_t *dest, const int16_t *src) { // dest and src are pointers: the body writes through dest and src is the address passed to the intrinsic
1890	*dest = vld4_dup_s16(src); // store the 4-vector result through dest
1891	}
1892
1893	// CHECK-LABEL: @test_vld4_dup_s32(
1894	// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
1895	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x2x4_t [[__RET]] to i8*
1896	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
1897	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1898	// CHECK-A64: [[VLD4:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32 [[TMP2]])
1899	// CHECK-A32: [[VLD4:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4dup.v2i32.p0i8(i8 [[TMP1]], i32 4)
1900	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
1901	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
1902	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x2x4_t %dest to i8*
1903	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x2x4_t [[__RET]] to i8*
1904	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1905	// CHECK: ret void
1906	void test_vld4_dup_s32(int32x2x4_t dest, const int32_t src) {
1907	*dest = vld4_dup_s32(src);
1908	}
1909
1910	// CHECK-LABEL: @test_vld4_dup_s8(
1911	// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
1912	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x8x4_t [[__RET]] to i8*
1913	// CHECK-A64: [[VLD4:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8 %src)
1914	// CHECK-A32: [[VLD4:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8 %src, i32 1)
1915	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
1916	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
1917	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x8x4_t %dest to i8*
1918	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x8x4_t [[__RET]] to i8*
1919	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 32, i1 false)
1920	// CHECK: ret void
1921	void test_vld4_dup_s8(int8x8x4_t dest, const int8_t src) {
1922	*dest = vld4_dup_s8(src);
1923	}
1924
1925	// CHECK-LABEL: @test_vld4_dup_u16(
1926	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
1927	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x4x4_t [[__RET]] to i8*
1928	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
1929	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
1930	// CHECK-A64: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16 [[TMP2]])
1931	// CHECK-A32: [[VLD4:%.]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4dup.v4i16.p0i8(i8 [[TMP1]], i32 2)
1932	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
1933	// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
1934	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x4x4_t %dest to i8*
1935	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x4x4_t [[__RET]] to i8*
1936	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1937	// CHECK: ret void
1938	void test_vld4_dup_u16(uint16x4x4_t dest, const uint16_t src) {
1939	*dest = vld4_dup_u16(src);
1940	}
1941
1942	// CHECK-LABEL: @test_vld4_dup_u32(
1943	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
1944	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x2x4_t [[__RET]] to i8*
1945	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
1946	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
1947	// CHECK-A64: [[VLD4:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32 [[TMP2]])
1948	// CHECK-A32: [[VLD4:%.]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4dup.v2i32.p0i8(i8 [[TMP1]], i32 4)
1949	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
1950	// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
1951	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x2x4_t %dest to i8*
1952	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x2x4_t [[__RET]] to i8*
1953	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1954	// CHECK: ret void
1955	void test_vld4_dup_u32(uint32x2x4_t dest, const uint32_t src) {
1956	*dest = vld4_dup_u32(src);
1957	}
1958
1959	// CHECK-LABEL: @test_vld4_dup_u8(
1960	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
1961	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x8x4_t [[__RET]] to i8*
1962	// CHECK-A64: [[VLD4:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8 %src)
1963	// CHECK-A32: [[VLD4:%.]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4dup.v8i8.p0i8(i8 %src, i32 1)
1964	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
1965	// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
1966	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x8x4_t %dest to i8*
1967	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x8x4_t [[__RET]] to i8*
1968	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], {{i64\|i32}} 32, i1 false)
1969	// CHECK: ret void
1970	void test_vld4_dup_u8(uint8x8x4_t dest, const uint8_t src) {
1971	*dest = vld4_dup_u8(src);
1972	}
1973
1974	// CHECK-LABEL: @test_vld4_dup_s64(
1975	// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
1976	// CHECK: [[TMP0:%.]] = bitcast %struct.int64x1x4_t [[__RET]] to i8*
1977	// CHECK: [[TMP1:%.]] = bitcast i64 %src to i8*
1978	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1979	// CHECK-A64: [[VLD4:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64 [[TMP2]])
1980	// CHECK-A32: [[VLD4:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4dup.v1i64.p0i8(i8 [[TMP1]], i32 8)
1981	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
1982	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
1983	// CHECK: [[TMP4:%.]] = bitcast %struct.int64x1x4_t %dest to i8*
1984	// CHECK: [[TMP5:%.]] = bitcast %struct.int64x1x4_t [[__RET]] to i8*
1985	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
1986	// CHECK: ret void
1987	void test_vld4_dup_s64(int64x1x4_t dest, const int64_t src) {
1988	*dest = vld4_dup_s64(src);
1989	}
1990
1991	// CHECK-LABEL: @test_vld4_dup_u64(
1992	// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
1993	// CHECK: [[TMP0:%.]] = bitcast %struct.uint64x1x4_t [[__RET]] to i8*
1994	// CHECK: [[TMP1:%.]] = bitcast i64 %src to i8*
1995	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i64*
1996	// CHECK-A64: [[VLD4:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64 [[TMP2]])
1997	// CHECK-A32: [[VLD4:%.]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4dup.v1i64.p0i8(i8 [[TMP1]], i32 8)
1998	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
1999	// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
2000	// CHECK: [[TMP4:%.]] = bitcast %struct.uint64x1x4_t %dest to i8*
2001	// CHECK: [[TMP5:%.]] = bitcast %struct.uint64x1x4_t [[__RET]] to i8*
2002	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], {{i64\|i32}} 32, i1 false)
2003	// CHECK: ret void
2004	void test_vld4_dup_u64(uint64x1x4_t dest, const uint64_t src) {
2005	*dest = vld4_dup_u64(src);
2006	}
2007
2008	// CHECK-LABEL: @test_vld2q_dup_f16(
2009	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16\|8}}
2010	// CHECK: [[TMP0:%.]] = bitcast %struct.float16x8x2_t [[__RET]] to i8*
2011	// CHECK: [[TMP1:%.]] = bitcast half %src to i8*
2012	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to half*
2013	// CHECK-A64: [[VLD2:%.]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half [[TMP2]])
2014	// CHECK-A32: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2015	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]> }*
2016	// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD2]], { <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
2017	// CHECK: [[TMP4:%.]] = bitcast %struct.float16x8x2_t %dest to i8*
2018	// CHECK: [[TMP5:%.]] = bitcast %struct.float16x8x2_t [[__RET]] to i8*
2019	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2020	// CHECK: ret void
2021	void test_vld2q_dup_f16(float16x8x2_t dest, const float16_t src) {
2022	*dest = vld2q_dup_f16(src);
2023	}
2024
2025	// CHECK-LABEL: @test_vld2q_dup_f32(
2026	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16\|8}}
2027	// CHECK: [[TMP0:%.]] = bitcast %struct.float32x4x2_t [[__RET]] to i8*
2028	// CHECK: [[TMP1:%.]] = bitcast float %src to i8*
2029	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to float*
2030	// CHECK-A64: [[VLD2:%.]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float [[TMP2]])
2031	// CHECK-A32: [[VLD2:%.]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2dup.v4f32.p0i8(i8 [[TMP1]], i32 4)
2032	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x float>, <4 x float> }*
2033	// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
2034	// CHECK: [[TMP4:%.]] = bitcast %struct.float32x4x2_t %dest to i8*
2035	// CHECK: [[TMP5:%.]] = bitcast %struct.float32x4x2_t [[__RET]] to i8*
2036	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2037	// CHECK: ret void
2038	void test_vld2q_dup_f32(float32x4x2_t dest, const float32_t src) {
2039	*dest = vld2q_dup_f32(src);
2040	}
2041
2042	// CHECK-LABEL: @test_vld2q_dup_p16(
2043	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16\|8}}
2044	// CHECK: [[TMP0:%.]] = bitcast %struct.poly16x8x2_t [[__RET]] to i8*
2045	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2046	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2047	// CHECK-A64: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16 [[TMP2]])
2048	// CHECK-A32: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2049	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16> }*
2050	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
2051	// CHECK: [[TMP4:%.]] = bitcast %struct.poly16x8x2_t %dest to i8*
2052	// CHECK: [[TMP5:%.]] = bitcast %struct.poly16x8x2_t [[__RET]] to i8*
2053	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2054	// CHECK: ret void
2055	void test_vld2q_dup_p16(poly16x8x2_t dest, const poly16_t src) {
2056	*dest = vld2q_dup_p16(src);
2057	}
2058
2059	// CHECK-LABEL: @test_vld2q_dup_p8(
2060	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16\|8}}
2061	// CHECK: [[TMP0:%.]] = bitcast %struct.poly8x16x2_t [[__RET]] to i8*
2062	// CHECK-A64: [[VLD2:%.]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8 %src)
2063	// CHECK-A32: [[VLD2:%.]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8 %src, i32 1)
2064	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8> }*
2065	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
2066	// CHECK: [[TMP2:%.]] = bitcast %struct.poly8x16x2_t %dest to i8*
2067	// CHECK: [[TMP3:%.]] = bitcast %struct.poly8x16x2_t [[__RET]] to i8*
2068	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 32, i1 false)
2069	// CHECK: ret void
2070	void test_vld2q_dup_p8(poly8x16x2_t dest, const poly8_t src) {
2071	*dest = vld2q_dup_p8(src);
2072	}
2073
2074	// CHECK-LABEL: @test_vld2q_dup_s16(
2075	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16\|8}}
2076	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x8x2_t [[__RET]] to i8*
2077	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2078	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2079	// CHECK-A64: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16 [[TMP2]])
2080	// CHECK-A32: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2081	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16> }*
2082	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
2083	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x8x2_t %dest to i8*
2084	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x8x2_t [[__RET]] to i8*
2085	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2086	// CHECK: ret void
2087	void test_vld2q_dup_s16(int16x8x2_t dest, const int16_t src) {
2088	*dest = vld2q_dup_s16(src);
2089	}
2090
2091	// CHECK-LABEL: @test_vld2q_dup_s32(
2092	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16\|8}}
2093	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x4x2_t [[__RET]] to i8*
2094	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
2095	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
2096	// CHECK-A64: [[VLD2:%.]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32 [[TMP2]])
2097	// CHECK-A32: [[VLD2:%.]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2dup.v4i32.p0i8(i8 [[TMP1]], i32 4)
2098	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32> }*
2099	// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
2100	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x4x2_t %dest to i8*
2101	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x4x2_t [[__RET]] to i8*
2102	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2103	// CHECK: ret void
2104	void test_vld2q_dup_s32(int32x4x2_t dest, const int32_t src) {
2105	*dest = vld2q_dup_s32(src);
2106	}
2107
2108	// CHECK-LABEL: @test_vld2q_dup_s8(
2109	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16\|8}}
2110	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x16x2_t [[__RET]] to i8*
2111	// CHECK-A64: [[VLD2:%.]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8 %src)
2112	// CHECK-A32: [[VLD2:%.]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8 %src, i32 1)
2113	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8> }*
2114	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
2115	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x16x2_t %dest to i8*
2116	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x16x2_t [[__RET]] to i8*
2117	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 32, i1 false)
2118	// CHECK: ret void
2119	void test_vld2q_dup_s8(int8x16x2_t dest, const int8_t src) {
2120	*dest = vld2q_dup_s8(src);
2121	}
2122
2123	// CHECK-LABEL: @test_vld2q_dup_u16(
2124	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16\|8}}
2125	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x8x2_t [[__RET]] to i8*
2126	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2127	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2128	// CHECK-A64: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16 [[TMP2]])
2129	// CHECK-A32: [[VLD2:%.]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2130	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16> }*
2131	// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
2132	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x8x2_t %dest to i8*
2133	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x8x2_t [[__RET]] to i8*
2134	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2135	// CHECK: ret void
2136	void test_vld2q_dup_u16(uint16x8x2_t dest, const uint16_t src) {
2137	*dest = vld2q_dup_u16(src);
2138	}
2139
2140	// CHECK-LABEL: @test_vld2q_dup_u32(
2141	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16\|8}}
2142	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x4x2_t [[__RET]] to i8*
2143	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
2144	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
2145	// CHECK-A64: [[VLD2:%.]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32 [[TMP2]])
2146	// CHECK-A32: [[VLD2:%.]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2dup.v4i32.p0i8(i8 [[TMP1]], i32 4)
2147	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32> }*
2148	// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
2149	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x4x2_t %dest to i8*
2150	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x4x2_t [[__RET]] to i8*
2151	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 32, i1 false)
2152	// CHECK: ret void
2153	void test_vld2q_dup_u32(uint32x4x2_t dest, const uint32_t src) {
2154	*dest = vld2q_dup_u32(src);
2155	}
2156
2157	// CHECK-LABEL: @test_vld2q_dup_u8(
2158	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16\|8}}
2159	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x16x2_t [[__RET]] to i8*
2160	// CHECK-A64: [[VLD2:%.]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8 %src)
2161	// CHECK-A32: [[VLD2:%.]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2dup.v16i8.p0i8(i8 %src, i32 1)
2162	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8> }*
2163	// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
2164	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x16x2_t %dest to i8*
2165	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x16x2_t [[__RET]] to i8*
2166	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 32, i1 false)
2167	// CHECK: ret void
2168	void test_vld2q_dup_u8(uint8x16x2_t dest, const uint8_t src) {
2169	*dest = vld2q_dup_u8(src);
2170	}
2171
2172	// CHECK-LABEL: @test_vld3q_dup_f16(
2173	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16\|8}}
2174	// CHECK: [[TMP0:%.]] = bitcast %struct.float16x8x3_t [[__RET]] to i8*
2175	// CHECK: [[TMP1:%.]] = bitcast half %src to i8*
2176	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to half*
2177	// CHECK-A64: [[VLD3:%.]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half [[TMP2]])
2178	// CHECK-A32: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2179	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
2180	// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD3]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
2181	// CHECK: [[TMP4:%.]] = bitcast %struct.float16x8x3_t %dest to i8*
2182	// CHECK: [[TMP5:%.]] = bitcast %struct.float16x8x3_t [[__RET]] to i8*
2183	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2184	// CHECK: ret void
2185	void test_vld3q_dup_f16(float16x8x3_t dest, const float16_t src) {
2186	*dest = vld3q_dup_f16(src);
2187	}
2188
2189	// CHECK-LABEL: @test_vld3q_dup_f32(
2190	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16\|8}}
2191	// CHECK: [[TMP0:%.]] = bitcast %struct.float32x4x3_t [[__RET]] to i8*
2192	// CHECK: [[TMP1:%.]] = bitcast float %src to i8*
2193	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to float*
2194	// CHECK-A64: [[VLD3:%.]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float [[TMP2]])
2195	// CHECK-A32: [[VLD3:%.]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3dup.v4f32.p0i8(i8 [[TMP1]], i32 4)
2196	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
2197	// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
2198	// CHECK: [[TMP4:%.]] = bitcast %struct.float32x4x3_t %dest to i8*
2199	// CHECK: [[TMP5:%.]] = bitcast %struct.float32x4x3_t [[__RET]] to i8*
2200	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2201	// CHECK: ret void
2202	void test_vld3q_dup_f32(float32x4x3_t dest, const float32_t src) {
2203	*dest = vld3q_dup_f32(src);
2204	}
2205
2206	// CHECK-LABEL: @test_vld3q_dup_p16(
2207	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16\|8}}
2208	// CHECK: [[TMP0:%.]] = bitcast %struct.poly16x8x3_t [[__RET]] to i8*
2209	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2210	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2211	// CHECK-A64: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16 [[TMP2]])
2212	// CHECK-A32: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2213	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
2214	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
2215	// CHECK: [[TMP4:%.]] = bitcast %struct.poly16x8x3_t %dest to i8*
2216	// CHECK: [[TMP5:%.]] = bitcast %struct.poly16x8x3_t [[__RET]] to i8*
2217	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2218	// CHECK: ret void
2219	void test_vld3q_dup_p16(poly16x8x3_t dest, const poly16_t src) {
2220	*dest = vld3q_dup_p16(src);
2221	}
2222
2223	// CHECK-LABEL: @test_vld3q_dup_p8(
2224	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16\|8}}
2225	// CHECK: [[TMP0:%.]] = bitcast %struct.poly8x16x3_t [[__RET]] to i8*
2226	// CHECK-A64: [[VLD3:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8 %src)
2227	// CHECK-A32: [[VLD3:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8 %src, i32 1)
2228	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
2229	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
2230	// CHECK: [[TMP2:%.]] = bitcast %struct.poly8x16x3_t %dest to i8*
2231	// CHECK: [[TMP3:%.]] = bitcast %struct.poly8x16x3_t [[__RET]] to i8*
2232	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 48, i1 false)
2233	// CHECK: ret void
2234	void test_vld3q_dup_p8(poly8x16x3_t dest, const poly8_t src) {
2235	*dest = vld3q_dup_p8(src);
2236	}
2237
2238	// CHECK-LABEL: @test_vld3q_dup_s16(
2239	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16\|8}}
2240	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x8x3_t [[__RET]] to i8*
2241	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2242	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2243	// CHECK-A64: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16 [[TMP2]])
2244	// CHECK-A32: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2245	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
2246	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
2247	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x8x3_t %dest to i8*
2248	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x8x3_t [[__RET]] to i8*
2249	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2250	// CHECK: ret void
2251	void test_vld3q_dup_s16(int16x8x3_t dest, const int16_t src) {
2252	*dest = vld3q_dup_s16(src);
2253	}
2254
2255	// CHECK-LABEL: @test_vld3q_dup_s32(
2256	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16\|8}}
2257	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x4x3_t [[__RET]] to i8*
2258	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
2259	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
2260	// CHECK-A64: [[VLD3:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32 [[TMP2]])
2261	// CHECK-A32: [[VLD3:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3dup.v4i32.p0i8(i8 [[TMP1]], i32 4)
2262	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
2263	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
2264	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x4x3_t %dest to i8*
2265	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x4x3_t [[__RET]] to i8*
2266	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2267	// CHECK: ret void
2268	void test_vld3q_dup_s32(int32x4x3_t dest, const int32_t src) {
2269	*dest = vld3q_dup_s32(src);
2270	}
2271
2272	// CHECK-LABEL: @test_vld3q_dup_s8(
2273	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16\|8}}
2274	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x16x3_t [[__RET]] to i8*
2275	// CHECK-A64: [[VLD3:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8 %src)
2276	// CHECK-A32: [[VLD3:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8 %src, i32 1)
2277	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
2278	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
2279	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x16x3_t %dest to i8*
2280	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x16x3_t [[__RET]] to i8*
2281	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 48, i1 false)
2282	// CHECK: ret void
2283	void test_vld3q_dup_s8(int8x16x3_t dest, const int8_t src) {
2284	*dest = vld3q_dup_s8(src);
2285	}
2286
2287	// CHECK-LABEL: @test_vld3q_dup_u16(
2288	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16\|8}}
2289	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x8x3_t [[__RET]] to i8*
2290	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2291	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2292	// CHECK-A64: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16 [[TMP2]])
2293	// CHECK-A32: [[VLD3:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2294	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
2295	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
2296	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x8x3_t %dest to i8*
2297	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x8x3_t [[__RET]] to i8*
2298	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2299	// CHECK: ret void
2300	void test_vld3q_dup_u16(uint16x8x3_t dest, const uint16_t src) {
2301	*dest = vld3q_dup_u16(src);
2302	}
2303
2304	// CHECK-LABEL: @test_vld3q_dup_u32(
2305	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16\|8}}
2306	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x4x3_t [[__RET]] to i8*
2307	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
2308	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
2309	// CHECK-A64: [[VLD3:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32 [[TMP2]])
2310	// CHECK-A32: [[VLD3:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3dup.v4i32.p0i8(i8 [[TMP1]], i32 4)
2311	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
2312	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
2313	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x4x3_t %dest to i8*
2314	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x4x3_t [[__RET]] to i8*
2315	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 48, i1 false)
2316	// CHECK: ret void
2317	void test_vld3q_dup_u32(uint32x4x3_t dest, const uint32_t src) {
2318	*dest = vld3q_dup_u32(src);
2319	}
2320
2321	// CHECK-LABEL: @test_vld3q_dup_u8(
2322	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16\|8}}
2323	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x16x3_t [[__RET]] to i8*
2324	// CHECK-A64: [[VLD3:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8 %src)
2325	// CHECK-A32: [[VLD3:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3dup.v16i8.p0i8(i8 %src, i32 1)
2326	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
2327	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
2328	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x16x3_t %dest to i8*
2329	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x16x3_t [[__RET]] to i8*
2330	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 48, i1 false)
2331	// CHECK: ret void
2332	void test_vld3q_dup_u8(uint8x16x3_t dest, const uint8_t src) {
2333	*dest = vld3q_dup_u8(src);
2334	}
2335
2336	// CHECK-LABEL: @test_vld4q_dup_f16(
2337	// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16\|8}}
2338	// CHECK: [[TMP0:%.]] = bitcast %struct.float16x8x4_t [[__RET]] to i8*
2339	// CHECK: [[TMP1:%.]] = bitcast half %src to i8*
2340	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to half*
2341	// CHECK-A64: [[VLD4:%.]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half [[TMP2]])
2342	// CHECK-A32: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2343	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }*
2344	// CHECK: store { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> } [[VLD4]], { <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]>, <8 x [[HALF]]> }* [[TMP3]]
2345	// CHECK: [[TMP4:%.]] = bitcast %struct.float16x8x4_t %dest to i8*
2346	// CHECK: [[TMP5:%.]] = bitcast %struct.float16x8x4_t [[__RET]] to i8*
2347	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2348	// CHECK: ret void
2349	void test_vld4q_dup_f16(float16x8x4_t dest, const float16_t src) {
2350	*dest = vld4q_dup_f16(src);
2351	}
2352
2353	// CHECK-LABEL: @test_vld4q_dup_f32(
2354	// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16\|8}}
2355	// CHECK: [[TMP0:%.]] = bitcast %struct.float32x4x4_t [[__RET]] to i8*
2356	// CHECK: [[TMP1:%.]] = bitcast float %src to i8*
2357	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to float*
2358	// CHECK-A64: [[VLD4:%.]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float [[TMP2]])
2359	// CHECK-A32: [[VLD4:%.]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4dup.v4f32.p0i8(i8 [[TMP1]], i32 4)
2360	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
2361	// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
2362	// CHECK: [[TMP4:%.]] = bitcast %struct.float32x4x4_t %dest to i8*
2363	// CHECK: [[TMP5:%.]] = bitcast %struct.float32x4x4_t [[__RET]] to i8*
2364	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2365	// CHECK: ret void
2366	void test_vld4q_dup_f32(float32x4x4_t dest, const float32_t src) {
2367	*dest = vld4q_dup_f32(src);
2368	}
2369
2370	// CHECK-LABEL: @test_vld4q_dup_p16(
2371	// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16\|8}}
2372	// CHECK: [[TMP0:%.]] = bitcast %struct.poly16x8x4_t [[__RET]] to i8*
2373	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2374	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2375	// CHECK-A64: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16 [[TMP2]])
2376	// CHECK-A32: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2377	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
2378	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
2379	// CHECK: [[TMP4:%.]] = bitcast %struct.poly16x8x4_t %dest to i8*
2380	// CHECK: [[TMP5:%.]] = bitcast %struct.poly16x8x4_t [[__RET]] to i8*
2381	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2382	// CHECK: ret void
2383	void test_vld4q_dup_p16(poly16x8x4_t dest, const poly16_t src) {
2384	*dest = vld4q_dup_p16(src);
2385	}
2386
2387	// CHECK-LABEL: @test_vld4q_dup_p8(
2388	// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16\|8}}
2389	// CHECK: [[TMP0:%.]] = bitcast %struct.poly8x16x4_t [[__RET]] to i8*
2390	// CHECK-A64: [[VLD4:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8 %src)
2391	// CHECK-A32: [[VLD4:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8 %src, i32 1)
2392	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
2393	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
2394	// CHECK: [[TMP2:%.]] = bitcast %struct.poly8x16x4_t %dest to i8*
2395	// CHECK: [[TMP3:%.]] = bitcast %struct.poly8x16x4_t [[__RET]] to i8*
2396	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 64, i1 false)
2397	// CHECK: ret void
2398	void test_vld4q_dup_p8(poly8x16x4_t dest, const poly8_t src) {
2399	*dest = vld4q_dup_p8(src);
2400	}
2401
2402	// CHECK-LABEL: @test_vld4q_dup_s16(
2403	// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16\|8}}
2404	// CHECK: [[TMP0:%.]] = bitcast %struct.int16x8x4_t [[__RET]] to i8*
2405	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2406	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2407	// CHECK-A64: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16 [[TMP2]])
2408	// CHECK-A32: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2409	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
2410	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
2411	// CHECK: [[TMP4:%.]] = bitcast %struct.int16x8x4_t %dest to i8*
2412	// CHECK: [[TMP5:%.]] = bitcast %struct.int16x8x4_t [[__RET]] to i8*
2413	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2414	// CHECK: ret void
2415	void test_vld4q_dup_s16(int16x8x4_t dest, const int16_t src) {
2416	*dest = vld4q_dup_s16(src);
2417	}
2418
2419	// CHECK-LABEL: @test_vld4q_dup_s32(
2420	// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16\|8}}
2421	// CHECK: [[TMP0:%.]] = bitcast %struct.int32x4x4_t [[__RET]] to i8*
2422	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
2423	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
2424	// CHECK-A64: [[VLD4:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32 [[TMP2]])
2425	// CHECK-A32: [[VLD4:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4dup.v4i32.p0i8(i8 [[TMP1]], i32 4)
2426	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
2427	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
2428	// CHECK: [[TMP4:%.]] = bitcast %struct.int32x4x4_t %dest to i8*
2429	// CHECK: [[TMP5:%.]] = bitcast %struct.int32x4x4_t [[__RET]] to i8*
2430	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2431	// CHECK: ret void
2432	void test_vld4q_dup_s32(int32x4x4_t dest, const int32_t src) {
2433	*dest = vld4q_dup_s32(src);
2434	}
2435
2436	// CHECK-LABEL: @test_vld4q_dup_s8(
2437	// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16\|8}}
2438	// CHECK: [[TMP0:%.]] = bitcast %struct.int8x16x4_t [[__RET]] to i8*
2439	// CHECK-A64: [[VLD4:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8 %src)
2440	// CHECK-A32: [[VLD4:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8 %src, i32 1)
2441	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
2442	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
2443	// CHECK: [[TMP2:%.]] = bitcast %struct.int8x16x4_t %dest to i8*
2444	// CHECK: [[TMP3:%.]] = bitcast %struct.int8x16x4_t [[__RET]] to i8*
2445	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 64, i1 false)
2446	// CHECK: ret void
2447	void test_vld4q_dup_s8(int8x16x4_t dest, const int8_t src) {
2448	*dest = vld4q_dup_s8(src);
2449	}
2450
2451	// CHECK-LABEL: @test_vld4q_dup_u16(
2452	// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16\|8}}
2453	// CHECK: [[TMP0:%.]] = bitcast %struct.uint16x8x4_t [[__RET]] to i8*
2454	// CHECK: [[TMP1:%.]] = bitcast i16 %src to i8*
2455	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i16*
2456	// CHECK-A64: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16 [[TMP2]])
2457	// CHECK-A32: [[VLD4:%.]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4dup.v8i16.p0i8(i8 [[TMP1]], i32 2)
2458	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
2459	// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
2460	// CHECK: [[TMP4:%.]] = bitcast %struct.uint16x8x4_t %dest to i8*
2461	// CHECK: [[TMP5:%.]] = bitcast %struct.uint16x8x4_t [[__RET]] to i8*
2462	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2463	// CHECK: ret void
2464	void test_vld4q_dup_u16(uint16x8x4_t dest, const uint16_t src) {
2465	*dest = vld4q_dup_u16(src);
2466	}
2467
2468	// CHECK-LABEL: @test_vld4q_dup_u32(
2469	// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16\|8}}
2470	// CHECK: [[TMP0:%.]] = bitcast %struct.uint32x4x4_t [[__RET]] to i8*
2471	// CHECK: [[TMP1:%.]] = bitcast i32 %src to i8*
2472	// CHECK-A64: [[TMP2:%.]] = bitcast i8 [[TMP1]] to i32*
2473	// CHECK-A64: [[VLD4:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32 [[TMP2]])
2474	// CHECK-A32: [[VLD4:%.]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4dup.v4i32.p0i8(i8 [[TMP1]], i32 4)
2475	// CHECK: [[TMP3:%.]] = bitcast i8 [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
2476	// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
2477	// CHECK: [[TMP4:%.]] = bitcast %struct.uint32x4x4_t %dest to i8*
2478	// CHECK: [[TMP5:%.]] = bitcast %struct.uint32x4x4_t [[__RET]] to i8*
2479	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP4]], i8* align {{16\|8}} [[TMP5]], {{i64\|i32}} 64, i1 false)
2480	// CHECK: ret void
2481	void test_vld4q_dup_u32(uint32x4x4_t dest, const uint32_t src) {
2482	*dest = vld4q_dup_u32(src);
2483	}
2484
2485	// CHECK-LABEL: @test_vld4q_dup_u8(
2486	// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16\|8}}
2487	// CHECK: [[TMP0:%.]] = bitcast %struct.uint8x16x4_t [[__RET]] to i8*
2488	// CHECK-A64: [[VLD4:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8 %src)
2489	// CHECK-A32: [[VLD4:%.]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4dup.v16i8.p0i8(i8 %src, i32 1)
2490	// CHECK: [[TMP1:%.]] = bitcast i8 [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
2491	// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
2492	// CHECK: [[TMP2:%.]] = bitcast %struct.uint8x16x4_t %dest to i8*
2493	// CHECK: [[TMP3:%.]] = bitcast %struct.uint8x16x4_t [[__RET]] to i8*
2494	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64\|i32}}(i8* align {{16\|8}} [[TMP2]], i8* align {{16\|8}} [[TMP3]], {{i64\|i32}} 64, i1 false)
2495	// CHECK: ret void
2496	void test_vld4q_dup_u8(uint8x16x4_t dest, const uint8_t src) {
2497	*dest = vld4q_dup_u8(src);
2498	}
2499

Clang Project