arm-neon-vst.c source code [clang_source_code/test/CodeGen/arm-neon-vst.c]

1	// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
2	// RUN: -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | \
3	// RUN: FileCheck -check-prefixes=CHECK,CHECK-A64 %s
4	// RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \
5	// RUN: -target-feature +fp16 -S -disable-O0-optnone -emit-llvm -o - %s | \
6	// RUN: opt -S -mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
7
8	#include <arm_neon.h>
9
10	// CHECK-LABEL: @test_vst1_f16_x2(
11	// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
12	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
13	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
14	// CHECK-A64: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
15	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x half>]* %coerce.dive to [2 x i64]*
16	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
17	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
18	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
19	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
20	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
21	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
22	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
23	// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
24	// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
25	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
26	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
27	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
28	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
29	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF:(half|i16)]]>
30	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
31	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
32	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f16.p0f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], half* [[TMP9]])
33	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
34	// CHECK: ret void
// Function under test: hands `a` and the float16x4x2_t value `b` directly to
// vst1_f16_x2 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x2.v4f16.p0f16 for the arm64 triple and to
// @llvm.arm.neon.vst1x2.p0i16.v4i16 for the armv8 triple.
35	void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
36	vst1_f16_x2(a, b);
37	}
38
39	// CHECK-LABEL: @test_vst1_f16_x3(
40	// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
41	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
42	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
43	// CHECK-A64: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
44	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x half>]* %coerce.dive to [3 x i64]*
45	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
46	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
47	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
48	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
49	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
50	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
51	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
52	// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
53	// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
54	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
55	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
56	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
57	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
58	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
59	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
60	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
61	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
62	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
63	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
64	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
65	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
66	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f16.p0f16(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], half* [[TMP12]])
67	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
68	// CHECK: ret void
// Function under test: hands `a` and the float16x4x3_t value `b` directly to
// vst1_f16_x3 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x3.v4f16.p0f16 for the arm64 triple and to
// @llvm.arm.neon.vst1x3.p0i16.v4i16 for the armv8 triple.
69	void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
70	vst1_f16_x3(a, b);
71	}
72
73	// CHECK-LABEL: @test_vst1_f16_x4(
74	// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
75	// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
76	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
77	// CHECK-A64: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
78	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x half>]* %coerce.dive to [4 x i64]*
79	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
80	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
81	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
82	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
83	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
84	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
85	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
86	// CHECK: [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
87	// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
88	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
89	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
90	// CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
91	// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
92	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
93	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
94	// CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
95	// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
96	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
97	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
98	// CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
99	// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
100	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
101	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
102	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
103	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x [[HALF]]>
104	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
105	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f16.p0f16(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], half* [[TMP15]])
106	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
107	// CHECK: ret void
// Function under test: hands `a` and the float16x4x4_t value `b` directly to
// vst1_f16_x4 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x4.v4f16.p0f16 for the arm64 triple and to
// @llvm.arm.neon.vst1x4.p0i16.v4i16 for the armv8 triple.
108	void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
109	vst1_f16_x4(a, b);
110	}
111
112	// CHECK-LABEL: @test_vst1_f32_x2(
113	// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
114	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
115	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
116	// CHECK-A64: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
117	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x float>]* %coerce.dive to [2 x i64]*
118	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
119	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
120	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
121	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
122	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
123	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
124	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
125	// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
126	// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
127	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
128	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
129	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
130	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
131	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
132	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
133	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
134	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
135	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0f32.v2f32(float* [[TMP9]], <2 x float> [[TMP7]], <2 x float> [[TMP8]])
136	// CHECK: ret void
// Function under test: hands `a` and the float32x2x2_t value `b` directly to
// vst1_f32_x2 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x2.v2f32.p0f32 for the arm64 triple and to
// @llvm.arm.neon.vst1x2.p0f32.v2f32 for the armv8 triple.
137	void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
138	vst1_f32_x2(a, b);
139	}
140
141	// CHECK-LABEL: @test_vst1_f32_x3(
142	// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
143	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
144	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
145	// CHECK-A64: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
146	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x float>]* %coerce.dive to [3 x i64]*
147	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
148	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
149	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
150	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
151	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
152	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
153	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
154	// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
155	// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
156	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
157	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
158	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
159	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
160	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
161	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
162	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
163	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
164	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
165	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
166	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
167	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
168	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
169	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0f32.v2f32(float* [[TMP12]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]])
170	// CHECK: ret void
// Function under test: hands `a` and the float32x2x3_t value `b` directly to
// vst1_f32_x3 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x3.v2f32.p0f32 for the arm64 triple and to
// @llvm.arm.neon.vst1x3.p0f32.v2f32 for the armv8 triple.
171	void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
172	vst1_f32_x3(a, b);
173	}
174
175	// CHECK-LABEL: @test_vst1_f32_x4(
176	// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
177	// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
178	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
179	// CHECK-A64: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
180	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x float>]* %coerce.dive to [4 x i64]*
181	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
182	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
183	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
184	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
185	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
186	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
187	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
188	// CHECK: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
189	// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
190	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
191	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
192	// CHECK: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
193	// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
194	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
195	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
196	// CHECK: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
197	// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
198	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
199	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
200	// CHECK: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
201	// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
202	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
203	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
204	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
205	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
206	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
207	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
208	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0f32.v2f32(float* [[TMP15]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]])
209	// CHECK: ret void
// Function under test: hands `a` and the float32x2x4_t value `b` directly to
// vst1_f32_x4 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x4.v2f32.p0f32 for the arm64 triple and to
// @llvm.arm.neon.vst1x4.p0f32.v2f32 for the armv8 triple.
210	void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
211	vst1_f32_x4(a, b);
212	}
213
214	// CHECK-LABEL: @test_vst1_p16_x2(
215	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
216	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
217	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
218	// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
219	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
220	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
221	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
222	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
223	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
224	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
225	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
226	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
227	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
228	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
229	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
230	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
231	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
232	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
233	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
234	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
235	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
236	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
237	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
238	// CHECK: ret void
// Function under test: hands `a` and the poly16x4x2_t value `b` directly to
// vst1_p16_x2 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x2.v4i16.p0i16 for the arm64 triple and to
// @llvm.arm.neon.vst1x2.p0i16.v4i16 for the armv8 triple.
239	void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
240	vst1_p16_x2(a, b);
241	}
242
243	// CHECK-LABEL: @test_vst1_p16_x3(
244	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
245	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
246	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
247	// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
248	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
249	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
250	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
251	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
252	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
253	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
254	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
255	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
256	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
257	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
258	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
259	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
260	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
261	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
262	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
263	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
264	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
265	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
266	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
267	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
268	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
269	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
270	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
271	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
272	// CHECK: ret void
// Function under test: hands `a` and the poly16x4x3_t value `b` directly to
// vst1_p16_x3 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x3.v4i16.p0i16 for the arm64 triple and to
// @llvm.arm.neon.vst1x3.p0i16.v4i16 for the armv8 triple.
273	void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
274	vst1_p16_x3(a, b);
275	}
276
277	// CHECK-LABEL: @test_vst1_p16_x4(
278	// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
279	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
280	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
281	// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
282	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
283	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
284	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
285	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
286	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
287	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
288	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
289	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
290	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
291	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
292	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
293	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
294	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
295	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
296	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
297	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
298	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
299	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
300	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
301	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
302	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
303	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
304	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
305	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
306	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
307	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
308	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
309	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
310	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
311	// CHECK: ret void
// Function under test: hands `a` and the poly16x4x4_t value `b` directly to
// vst1_p16_x4 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x4.v4i16.p0i16 for the arm64 triple and to
// @llvm.arm.neon.vst1x4.p0i16.v4i16 for the armv8 triple.
312	void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
313	vst1_p16_x4(a, b);
314	}
315
316	// CHECK-LABEL: @test_vst1_p8_x2(
317	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
318	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
319	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
320	// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
321	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
322	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
323	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
324	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
325	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
326	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
327	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
328	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
329	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
330	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
331	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
332	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
333	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
334	// CHECK: ret void
// Function under test: hands `a` and the poly8x8x2_t value `b` directly to
// vst1_p8_x2 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x2.v8i8.p0i8 for the arm64 triple and to
// @llvm.arm.neon.vst1x2.p0i8.v8i8 for the armv8 triple, with %a passed
// to the call as-is (no pointer bitcast is expected for this element type).
335	void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
336	vst1_p8_x2(a, b);
337	}
338
339	// CHECK-LABEL: @test_vst1_p8_x3(
340	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
341	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
342	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
343	// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
344	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
345	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
346	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
347	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
348	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
349	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
350	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
351	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
352	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
353	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
354	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
355	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
356	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
357	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
358	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
359	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
360	// CHECK: ret void
// Function under test: hands `a` and the poly8x8x3_t value `b` directly to
// vst1_p8_x3 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x3.v8i8.p0i8 for the arm64 triple and to
// @llvm.arm.neon.vst1x3.p0i8.v8i8 for the armv8 triple, with %a passed
// to the call as-is (no pointer bitcast is expected for this element type).
361	void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
362	vst1_p8_x3(a, b);
363	}
364
365	// CHECK-LABEL: @test_vst1_p8_x4(
366	// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
367	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
368	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
369	// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
370	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
371	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
372	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
373	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
374	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
375	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
376	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
377	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
378	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
379	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
380	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
381	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
382	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
383	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
384	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
385	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
386	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
387	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
388	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
389	// CHECK: ret void
// Function under test: hands `a` and the poly8x8x4_t value `b` directly to
// vst1_p8_x4 with no extra locals. The expectations above pin the lowering
// to @llvm.aarch64.neon.st1x4.v8i8.p0i8 for the arm64 triple and to
// @llvm.arm.neon.vst1x4.p0i8.v8i8 for the armv8 triple, with %a passed
// to the call as-is (no pointer bitcast is expected for this element type).
390	void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
391	vst1_p8_x4(a, b);
392	}
393
394	// CHECK-LABEL: @test_vst1_s16_x2(
395	// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
396	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
397	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
398	// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
399	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
400	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
401	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
402	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
403	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
404	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
405	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
406	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
407	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
408	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
409	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
410	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
411	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
412	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
413	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
414	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
415	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
416	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
417	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
418	// CHECK: ret void
// Passes both parameters straight to vst1_s16_x2. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x2.v4i16.p0i16 under
// the A64 prefix and to @llvm.arm.neon.vst1x2.p0i16.v4i16 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
419	void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
420	vst1_s16_x2(a, b);
421	}
422
423	// CHECK-LABEL: @test_vst1_s16_x3(
424	// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
425	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
426	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
427	// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
428	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
429	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
430	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
431	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
432	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
433	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
434	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
435	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
436	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
437	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
438	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
439	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
440	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
441	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
442	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
443	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
444	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
445	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
446	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
447	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
448	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
449	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
450	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
451	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
452	// CHECK: ret void
// Passes both parameters straight to vst1_s16_x3. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x3.v4i16.p0i16 under
// the A64 prefix and to @llvm.arm.neon.vst1x3.p0i16.v4i16 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
453	void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
454	vst1_s16_x3(a, b);
455	}
456
457	// CHECK-LABEL: @test_vst1_s16_x4(
458	// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
459	// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
460	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
461	// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
462	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
463	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
464	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
465	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
466	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
467	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
468	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
469	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
470	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
471	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
472	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
473	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
474	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
475	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
476	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
477	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
478	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
479	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
480	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
481	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
482	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
483	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
484	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
485	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
486	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
487	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
488	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
489	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
490	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
491	// CHECK: ret void
// Passes both parameters straight to vst1_s16_x4. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x4.v4i16.p0i16 under
// the A64 prefix and to @llvm.arm.neon.vst1x4.p0i16.v4i16 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
492	void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
493	vst1_s16_x4(a, b);
494	}
495
496	// CHECK-LABEL: @test_vst1_s32_x2(
497	// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
498	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
499	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
500	// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
501	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i32>]* %coerce.dive to [2 x i64]*
502	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
503	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
504	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
505	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
506	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
507	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
508	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
509	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
510	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
511	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
512	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
513	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
514	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
515	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
516	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
517	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
518	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
519	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* [[TMP9]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
520	// CHECK: ret void
// Passes both parameters straight to vst1_s32_x2. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x2.v2i32.p0i32 under
// the A64 prefix and to @llvm.arm.neon.vst1x2.p0i32.v2i32 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
521	void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
522	vst1_s32_x2(a, b);
523	}
524
525	// CHECK-LABEL: @test_vst1_s32_x3(
526	// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
527	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
528	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
529	// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
530	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i32>]* %coerce.dive to [3 x i64]*
531	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
532	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
533	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
534	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
535	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
536	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
537	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
538	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
539	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
540	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
541	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
542	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
543	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
544	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
545	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
546	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
547	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
548	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
549	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
550	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
551	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
552	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
553	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
554	// CHECK: ret void
// Passes both parameters straight to vst1_s32_x3. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x3.v2i32.p0i32 under
// the A64 prefix and to @llvm.arm.neon.vst1x3.p0i32.v2i32 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
555	void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
556	vst1_s32_x3(a, b);
557	}
558
559	// CHECK-LABEL: @test_vst1_s32_x4(
560	// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
561	// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
562	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
563	// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
564	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i32>]* %coerce.dive to [4 x i64]*
565	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
566	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
567	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
568	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
569	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
570	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
571	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
572	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
573	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
574	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
575	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
576	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
577	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
578	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
579	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
580	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
581	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
582	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
583	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
584	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
585	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
586	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
587	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
588	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
589	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
590	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
591	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
592	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* [[TMP15]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
593	// CHECK: ret void
// Passes both parameters straight to vst1_s32_x4. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x4.v2i32.p0i32 under
// the A64 prefix and to @llvm.arm.neon.vst1x4.p0i32.v2i32 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
594	void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
595	vst1_s32_x4(a, b);
596	}
597
598	// CHECK-LABEL: @test_vst1_s64_x2(
599	// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
600	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
601	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
602	// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
603	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <1 x i64>]* %coerce.dive to [2 x i64]*
604	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
605	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
606	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
607	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
608	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
609	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
610	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
611	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
612	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
613	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
614	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
615	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
616	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
617	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
618	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
619	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
620	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
621	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* [[TMP9]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
622	// CHECK: ret void
// Passes both parameters straight to vst1_s64_x2. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x2.v1i64.p0i64 under
// the A64 prefix and to @llvm.arm.neon.vst1x2.p0i64.v1i64 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
623	void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
624	vst1_s64_x2(a, b);
625	}
626
627	// CHECK-LABEL: @test_vst1_s64_x3(
628	// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
629	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
630	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
631	// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
632	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <1 x i64>]* %coerce.dive to [3 x i64]*
633	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
634	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
635	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
636	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
637	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
638	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
639	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
640	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
641	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
642	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
643	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
644	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
645	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
646	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
647	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
648	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
649	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
650	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
651	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
652	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
653	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
654	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
655	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* [[TMP12]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
656	// CHECK: ret void
// Passes both parameters straight to vst1_s64_x3. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x3.v1i64.p0i64 under
// the A64 prefix and to @llvm.arm.neon.vst1x3.p0i64.v1i64 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
657	void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
658	vst1_s64_x3(a, b);
659	}
660
661	// CHECK-LABEL: @test_vst1_s64_x4(
662	// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
663	// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
664	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
665	// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
666	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <1 x i64>]* %coerce.dive to [4 x i64]*
667	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
668	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
669	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
670	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
671	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
672	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
673	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
674	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
675	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
676	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
677	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
678	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
679	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
680	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
681	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
682	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
683	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
684	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
685	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
686	// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
687	// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
688	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
689	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
690	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
691	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
692	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
693	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
694	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* [[TMP15]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
695	// CHECK: ret void
// Passes both parameters straight to vst1_s64_x4. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x4.v1i64.p0i64 under
// the A64 prefix and to @llvm.arm.neon.vst1x4.p0i64.v1i64 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
696	void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
697	vst1_s64_x4(a, b);
698	}
699
700	// CHECK-LABEL: @test_vst1_s8_x2(
701	// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
702	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
703	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
704	// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
705	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
706	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
707	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
708	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
709	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
710	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
711	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
712	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
713	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
714	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
715	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
716	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
717	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
718	// CHECK: ret void
// Passes both parameters straight to vst1_s8_x2. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x2.v8i8.p0i8 under
// the A64 prefix and to @llvm.arm.neon.vst1x2.p0i8.v8i8 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
719	void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
720	vst1_s8_x2(a, b);
721	}
722
723	// CHECK-LABEL: @test_vst1_s8_x3(
724	// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
725	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
726	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
727	// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
728	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
729	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
730	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
731	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
732	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
733	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
734	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
735	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
736	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
737	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
738	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
739	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
740	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
741	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
742	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
743	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
744	// CHECK: ret void
// Passes both parameters straight to vst1_s8_x3. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x3.v8i8.p0i8 under
// the A64 prefix and to @llvm.arm.neon.vst1x3.p0i8.v8i8 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
745	void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
746	vst1_s8_x3(a, b);
747	}
748
749	// CHECK-LABEL: @test_vst1_s8_x4(
750	// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
751	// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
752	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
753	// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
754	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
755	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
756	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
757	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
758	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
759	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
760	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
761	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
762	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
763	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
764	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
765	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
766	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
767	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
768	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
769	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
770	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
771	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
772	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
773	// CHECK: ret void
// Passes both parameters straight to vst1_s8_x4. The FileCheck patterns
// above expect this to lower to @llvm.aarch64.neon.st1x4.v8i8.p0i8 under
// the A64 prefix and to @llvm.arm.neon.vst1x4.p0i8.v8i8 under A32.
// NOTE(review): presumably the body must stay a lone call so the emitted IR
// keeps matching those patterns; confirm before editing.
774	void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
775	vst1_s8_x4(a, b);
776	}
777
778	// CHECK-LABEL: @test_vst1_u16_x2(
779	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
780	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
781	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
782	// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
783	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i16>]* %coerce.dive to [2 x i64]*
784	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
785	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
786	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
787	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
788	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
789	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
790	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
791	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
792	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
793	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
794	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
795	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
796	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
797	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
798	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
799	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
800	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
801	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v4i16(i16* [[TMP9]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
802	// CHECK: ret void
// Codegen fixture for vst1_u16_x2: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x2.v4i16.p0i16 (A64 prefix) or
// @llvm.arm.neon.vst1x2.p0i16.v4i16 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
803	void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
804	vst1_u16_x2(a, b);
805	}
806
807	// CHECK-LABEL: @test_vst1_u16_x3(
808	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
809	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
810	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
811	// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
812	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i16>]* %coerce.dive to [3 x i64]*
813	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
814	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
815	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
816	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
817	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
818	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
819	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
820	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
821	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
822	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
823	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
824	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
825	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
826	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
827	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
828	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
829	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
830	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
831	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
832	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
833	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
834	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
835	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v4i16(i16* [[TMP12]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
836	// CHECK: ret void
// Codegen fixture for vst1_u16_x3: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x3.v4i16.p0i16 (A64 prefix) or
// @llvm.arm.neon.vst1x3.p0i16.v4i16 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
837	void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
838	vst1_u16_x3(a, b);
839	}
840
841	// CHECK-LABEL: @test_vst1_u16_x4(
842	// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
843	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
844	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
845	// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
846	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i16>]* %coerce.dive to [4 x i64]*
847	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
848	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
849	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
850	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
851	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
852	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
853	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
854	// CHECK: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
855	// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
856	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
857	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
858	// CHECK: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
859	// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
860	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
861	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
862	// CHECK: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
863	// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
864	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
865	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
866	// CHECK: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
867	// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
868	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
869	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
870	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
871	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
872	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
873	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
874	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v4i16(i16* [[TMP15]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
875	// CHECK: ret void
// Codegen fixture for vst1_u16_x4: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x4.v4i16.p0i16 (A64 prefix) or
// @llvm.arm.neon.vst1x4.p0i16.v4i16 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
876	void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
877	vst1_u16_x4(a, b);
878	}
879
880	// CHECK-LABEL: @test_vst1_u32_x2(
881	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
882	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
883	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
884	// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
885	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i32>]* %coerce.dive to [2 x i64]*
886	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
887	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
888	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
889	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
890	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
891	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
892	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
893	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
894	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
895	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
896	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
897	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
898	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
899	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
900	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
901	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
902	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
903	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v2i32(i32* [[TMP9]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
904	// CHECK: ret void
// Codegen fixture for vst1_u32_x2: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x2.v2i32.p0i32 (A64 prefix) or
// @llvm.arm.neon.vst1x2.p0i32.v2i32 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
905	void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
906	vst1_u32_x2(a, b);
907	}
908
909	// CHECK-LABEL: @test_vst1_u32_x3(
910	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
911	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
912	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
913	// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
914	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i32>]* %coerce.dive to [3 x i64]*
915	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
916	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
917	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
918	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
919	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
920	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
921	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
922	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
923	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
924	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
925	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
926	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
927	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
928	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
929	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
930	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
931	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
932	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
933	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
934	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
935	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
936	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
937	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v2i32(i32* [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
938	// CHECK: ret void
// Codegen fixture for vst1_u32_x3: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x3.v2i32.p0i32 (A64 prefix) or
// @llvm.arm.neon.vst1x3.p0i32.v2i32 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
939	void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
940	vst1_u32_x3(a, b);
941	}
942
943	// CHECK-LABEL: @test_vst1_u32_x4(
944	// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
945	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
946	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
947	// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
948	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i32>]* %coerce.dive to [4 x i64]*
949	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
950	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
951	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
952	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
953	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
954	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
955	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
956	// CHECK: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
957	// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
958	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
959	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
960	// CHECK: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
961	// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
962	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
963	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
964	// CHECK: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
965	// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
966	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
967	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
968	// CHECK: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
969	// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
970	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
971	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
972	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
973	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
974	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
975	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
976	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v2i32(i32* [[TMP15]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]])
977	// CHECK: ret void
// Codegen fixture for vst1_u32_x4: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x4.v2i32.p0i32 (A64 prefix) or
// @llvm.arm.neon.vst1x4.p0i32.v2i32 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
978	void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
979	vst1_u32_x4(a, b);
980	}
981
982	// CHECK-LABEL: @test_vst1_u64_x2(
983	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
984	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
985	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
986	// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
987	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <1 x i64>]* %coerce.dive to [2 x i64]*
988	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
989	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
990	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
991	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
992	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
993	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
994	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
995	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
996	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
997	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
998	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
999	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1000	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
1001	// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
1002	// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
1003	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
1004	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
1005	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v1i64(i64* [[TMP9]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]])
1006	// CHECK: ret void
// Codegen fixture for vst1_u64_x2: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x2.v1i64.p0i64 (A64 prefix) or
// @llvm.arm.neon.vst1x2.p0i64.v1i64 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
1007	void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
1008	vst1_u64_x2(a, b);
1009	}
1010
1011	// CHECK-LABEL: @test_vst1_u64_x3(
1012	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
1013	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
1014	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
1015	// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
1016	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <1 x i64>]* %coerce.dive to [3 x i64]*
1017	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
1018	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
1019	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
1020	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
1021	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1022	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
1023	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1024	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1025	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
1026	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
1027	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1028	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1029	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
1030	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
1031	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1032	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
1033	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
1034	// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
1035	// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
1036	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
1037	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
1038	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
1039	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v1i64(i64* [[TMP12]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]])
1040	// CHECK: ret void
// Codegen fixture for vst1_u64_x3: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x3.v1i64.p0i64 (A64 prefix) or
// @llvm.arm.neon.vst1x3.p0i64.v1i64 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
1041	void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
1042	vst1_u64_x3(a, b);
1043	}
1044
1045	// CHECK-LABEL: @test_vst1_u64_x4(
1046	// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
1047	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
1048	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
1049	// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
1050	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <1 x i64>]* %coerce.dive to [4 x i64]*
1051	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1052	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
1053	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
1054	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
1055	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1056	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1057	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1058	// CHECK: [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
1059	// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
1060	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1061	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1062	// CHECK: [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
1063	// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
1064	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1065	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1066	// CHECK: [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
1067	// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
1068	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
1069	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1070	// CHECK: [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
1071	// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
1072	// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
1073	// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
1074	// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
1075	// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
1076	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
1077	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
1078	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v1i64(i64* [[TMP15]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]])
1079	// CHECK: ret void
// Codegen fixture for vst1_u64_x4: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x4.v1i64.p0i64 (A64 prefix) or
// @llvm.arm.neon.vst1x4.p0i64.v1i64 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
1080	void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
1081	vst1_u64_x4(a, b);
1082	}
1083
1084	// CHECK-LABEL: @test_vst1_u8_x2(
1085	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
1086	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
1087	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
1088	// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
1089	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i8>]* %coerce.dive to [2 x i64]*
1090	// CHECK-A32: store [2 x i64] %b.coerce, [2 x i64]* [[COERCE_DIVE_TMP]], align 8
1091	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
1092	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
1093	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 16, i1 false)
1094	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
1095	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1096	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1097	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
1098	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1099	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1100	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
1101	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
1102	// CHECK: ret void
// Codegen fixture for vst1_u8_x2: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x2.v8i8.p0i8 (A64 prefix) or
// @llvm.arm.neon.vst1x2.p0i8.v8i8 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
1103	void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
1104	vst1_u8_x2(a, b);
1105	}
1106
1107	// CHECK-LABEL: @test_vst1_u8_x3(
1108	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
1109	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
1110	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
1111	// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
1112	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i8>]* %coerce.dive to [3 x i64]*
1113	// CHECK-A32: store [3 x i64] %b.coerce, [3 x i64]* [[COERCE_DIVE_TMP]], align 8
1114	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
1115	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
1116	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 24, i1 false)
1117	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
1118	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1119	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1120	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
1121	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1122	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1123	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
1124	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1125	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
1126	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
1127	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
1128	// CHECK: ret void
// Codegen fixture for vst1_u8_x3: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x3.v8i8.p0i8 (A64 prefix) or
// @llvm.arm.neon.vst1x3.p0i8.v8i8 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
1129	void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
1130	vst1_u8_x3(a, b);
1131	}
1132
1133	// CHECK-LABEL: @test_vst1_u8_x4(
1134	// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
1135	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
1136	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
1137	// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
1138	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i8>]* %coerce.dive to [4 x i64]*
1139	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1140	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
1141	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
1142	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], {{i64|i32}} 32, i1 false)
1143	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1144	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1145	// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
1146	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1147	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1148	// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
1149	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1150	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1151	// CHECK: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
1152	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
1153	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1154	// CHECK: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
1155	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
1156	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
1157	// CHECK: ret void
// Codegen fixture for vst1_u8_x4: forwards a and b to the intrinsic unchanged.
// The FileCheck directives above expect the call to be emitted as
// @llvm.aarch64.neon.st1x4.v8i8.p0i8 (A64 prefix) or
// @llvm.arm.neon.vst1x4.p0i8.v8i8 (A32 prefix).
// NOTE(review): those directives match IR values named %a and %b.coerce,
// presumably derived from the parameter names -- confirm before renaming.
1158	void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
1159	vst1_u8_x4(a, b);
1160	}
1161
1162	// CHECK-LABEL: @test_vst1q_f16_x2(
1163	// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN:(16|8)]]
1164	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN]]
1165	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
1166	// CHECK-A64: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
1167	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x half>]* %coerce.dive to [4 x i64]*
1168	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1169	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
1170	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
1171	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1172	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
1173	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
1174	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1175	// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
1176	// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
1177	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
1178	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1179	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
1180	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
1181	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
1182	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
1183	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
1184	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8f16.p0f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], half* [[TMP9]])
1185	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
1186	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_f16_x2. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x2.v8f16.p0f16 under
// the A64 prefix and to llvm.arm.neon.vst1x2.p0i16.v8i16 under the A32 prefix.
1187	void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
1188	vst1q_f16_x2(a, b);
1189	}
1190
1191	// CHECK-LABEL: @test_vst1q_f16_x3(
1192	// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
1193	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]]
1194	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
1195	// CHECK-A64: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
1196	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x half>]* %coerce.dive to [6 x i64]*
1197	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1198	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
1199	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
1200	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1201	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
1202	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
1203	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1204	// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
1205	// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
1206	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
1207	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1208	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
1209	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
1210	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
1211	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1212	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align [[QALIGN]]
1213	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
1214	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
1215	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
1216	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
1217	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
1218	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8f16.p0f16(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], half* [[TMP12]])
1219	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
1220	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_f16_x3. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x3.v8f16.p0f16 under
// the A64 prefix and to llvm.arm.neon.vst1x3.p0i16.v8i16 under the A32 prefix.
1221	void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
1222	vst1q_f16_x3(a, b);
1223	}
1224
1225	// CHECK-LABEL: @test_vst1q_f16_x4(
1226	// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
1227	// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]]
1228	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
1229	// CHECK-A64: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
1230	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x half>]* %coerce.dive to [8 x i64]*
1231	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1232	// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
1233	// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
1234	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1235	// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
1236	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1237	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1238	// CHECK: [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align [[QALIGN]]
1239	// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
1240	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1241	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1242	// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align [[QALIGN]]
1243	// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
1244	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1245	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1246	// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align [[QALIGN]]
1247	// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
1248	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
1249	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1250	// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align [[QALIGN]]
1251	// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
1252	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]>
1253	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]>
1254	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]>
1255	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x [[HALF]]>
1256	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to [[HALF]]*
1257	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8f16.p0f16(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], half* [[TMP15]])
1258	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
1259	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_f16_x4. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x4.v8f16.p0f16 under
// the A64 prefix and to llvm.arm.neon.vst1x4.p0i16.v8i16 under the A32 prefix.
1260	void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
1261	vst1q_f16_x4(a, b);
1262	}
1263
1264	// CHECK-LABEL: @test_vst1q_f32_x2(
1265	// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
1266	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]]
1267	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
1268	// CHECK-A64: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
1269	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x float>]* %coerce.dive to [4 x i64]*
1270	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1271	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
1272	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
1273	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1274	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
1275	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
1276	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1277	// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
1278	// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
1279	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
1280	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1281	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
1282	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
1283	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
1284	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
1285	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
1286	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
1287	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0f32.v4f32(float* [[TMP9]], <4 x float> [[TMP7]], <4 x float> [[TMP8]])
1288	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_f32_x2. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x2.v4f32.p0f32 under
// the A64 prefix and to llvm.arm.neon.vst1x2.p0f32.v4f32 under the A32 prefix.
1289	void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
1290	vst1q_f32_x2(a, b);
1291	}
1292
1293	// CHECK-LABEL: @test_vst1q_f32_x3(
1294	// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
1295	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]]
1296	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
1297	// CHECK-A64: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
1298	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x float>]* %coerce.dive to [6 x i64]*
1299	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1300	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
1301	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
1302	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1303	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
1304	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
1305	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1306	// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
1307	// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
1308	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
1309	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1310	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
1311	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
1312	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
1313	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1314	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align [[QALIGN]]
1315	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
1316	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
1317	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
1318	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
1319	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
1320	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
1321	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0f32.v4f32(float* [[TMP12]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]])
1322	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_f32_x3. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x3.v4f32.p0f32 under
// the A64 prefix and to llvm.arm.neon.vst1x3.p0f32.v4f32 under the A32 prefix.
1323	void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
1324	vst1q_f32_x3(a, b);
1325	}
1326
1327	// CHECK-LABEL: @test_vst1q_f32_x4(
1328	// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
1329	// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
1330	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
1331	// CHECK-A64: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
1332	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x float>]* %coerce.dive to [8 x i64]*
1333	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1334	// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
1335	// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
1336	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1337	// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
1338	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
1339	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1340	// CHECK: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align [[QALIGN]]
1341	// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
1342	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
1343	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1344	// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align [[QALIGN]]
1345	// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
1346	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
1347	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1348	// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align [[QALIGN]]
1349	// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
1350	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
1351	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1352	// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align [[QALIGN]]
1353	// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
1354	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
1355	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
1356	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
1357	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
1358	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
1359	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
1360	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0f32.v4f32(float* [[TMP15]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]])
1361	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_f32_x4. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x4.v4f32.p0f32 under
// the A64 prefix and to llvm.arm.neon.vst1x4.p0f32.v4f32 under the A32 prefix.
1362	void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
1363	vst1q_f32_x4(a, b);
1364	}
1365
1366	// CHECK-LABEL: @test_vst1q_p16_x2(
1367	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
1368	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
1369	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
1370	// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1371	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
1372	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1373	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
1374	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
1375	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1376	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1377	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
1378	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1379	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1380	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1381	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
1382	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1383	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1384	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1385	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1386	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1387	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
1388	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
1389	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
1390	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_p16_x2. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x2.v8i16.p0i16 under
// the A64 prefix and to llvm.arm.neon.vst1x2.p0i16.v8i16 under the A32 prefix.
1391	void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
1392	vst1q_p16_x2(a, b);
1393	}
1394
1395	// CHECK-LABEL: @test_vst1q_p16_x3(
1396	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
1397	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
1398	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
1399	// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
1400	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
1401	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1402	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
1403	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
1404	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1405	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1406	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
1407	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1408	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1409	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1410	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
1411	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1412	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1413	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1414	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
1415	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1416	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
1417	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
1418	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1419	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1420	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
1421	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
1422	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
1423	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
1424	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_p16_x3. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x3.v8i16.p0i16 under
// the A64 prefix and to llvm.arm.neon.vst1x3.p0i16.v8i16 under the A32 prefix.
1425	void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
1426	vst1q_p16_x3(a, b);
1427	}
1428
1429	// CHECK-LABEL: @test_vst1q_p16_x4(
1430	// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
1431	// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
1432	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
1433	// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
1434	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
1435	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1436	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
1437	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
1438	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1439	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1440	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
1441	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1442	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1443	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1444	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
1445	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1446	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1447	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1448	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
1449	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1450	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
1451	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
1452	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
1453	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1454	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
1455	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
1456	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1457	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1458	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
1459	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
1460	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
1461	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
1462	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
1463	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_p16_x4. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x4.v8i16.p0i16 under
// the A64 prefix and to llvm.arm.neon.vst1x4.p0i16.v8i16 under the A32 prefix.
1464	void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
1465	vst1q_p16_x4(a, b);
1466	}
1467
1468	// CHECK-LABEL: @test_vst1q_p8_x2(
1469	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
1470	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
1471	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
1472	// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
1473	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
1474	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1475	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
1476	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
1477	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1478	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
1479	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1480	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1481	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
1482	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1483	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1484	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
1485	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
1486	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_p8_x2. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x2.v16i8.p0i8 under
// the A64 prefix and to llvm.arm.neon.vst1x2.p0i8.v16i8 under the A32 prefix,
// with %a passed directly as the i8* operand.
1487	void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
1488	vst1q_p8_x2(a, b);
1489	}
1490
1491	// CHECK-LABEL: @test_vst1q_p8_x3(
1492	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
1493	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
1494	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
1495	// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
1496	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
1497	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1498	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
1499	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
1500	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1501	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
1502	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1503	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1504	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
1505	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1506	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1507	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
1508	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1509	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
1510	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
1511	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
1512	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_p8_x3. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x3.v16i8.p0i8 under
// the A64 prefix and to llvm.arm.neon.vst1x3.p0i8.v16i8 under the A32 prefix,
// with %a passed directly as the i8* operand.
1513	void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
1514	vst1q_p8_x3(a, b);
1515	}
1516
1517	// CHECK-LABEL: @test_vst1q_p8_x4(
1518	// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
1519	// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
1520	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
1521	// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
1522	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
1523	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1524	// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
1525	// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
1526	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1527	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
1528	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1529	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1530	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
1531	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1532	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1533	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
1534	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1535	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
1536	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
1537	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1538	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
1539	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
1540	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
1541	// CHECK: ret void
// Test body: passes a and b unchanged to vst1q_p8_x4. The FileCheck
// directives above expect a call to llvm.aarch64.neon.st1x4.v16i8.p0i8 under
// the A64 prefix and to llvm.arm.neon.vst1x4.p0i8.v16i8 under the A32 prefix,
// with %a passed directly as the i8* operand.
1542	void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
1543	vst1q_p8_x4(a, b);
1544	}
1545
1546	// CHECK-LABEL: @test_vst1q_s16_x2(
1547	// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
1548	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
1549	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
1550	// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1551	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
1552	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1553	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
1554	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
1555	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1556	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1557	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
1558	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1559	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1560	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1561	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
1562	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1563	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1564	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1565	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1566	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1567	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
1568	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
1569	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
1570	// CHECK: ret void
// Exercises vst1q_s16_x2(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x2.v8i16.p0i16
// (arm64 RUN line) or llvm.arm.neon.vst1x2.p0i16.v8i16 (armv8 RUN line), with
// `a` as the pointer operand and the two <8 x i16> members of `b` as vectors.
1571	void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
1572	vst1q_s16_x2(a, b);
1573	}
1574
1575	// CHECK-LABEL: @test_vst1q_s16_x3(
1576	// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
1577	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
1578	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
1579	// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
1580	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
1581	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1582	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
1583	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
1584	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1585	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1586	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
1587	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1588	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1589	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1590	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
1591	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1592	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1593	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1594	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
1595	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1596	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
1597	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
1598	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1599	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1600	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
1601	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
1602	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
1603	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
1604	// CHECK: ret void
// Exercises vst1q_s16_x3(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x3.v8i16.p0i16
// (arm64 RUN line) or llvm.arm.neon.vst1x3.p0i16.v8i16 (armv8 RUN line), with
// `a` as the pointer operand and the three <8 x i16> members of `b` as vectors.
1605	void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
1606	vst1q_s16_x3(a, b);
1607	}
1608
1609	// CHECK-LABEL: @test_vst1q_s16_x4(
1610	// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
1611	// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
1612	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
1613	// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
1614	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
1615	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1616	// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
1617	// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
1618	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1619	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1620	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
1621	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1622	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1623	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1624	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
1625	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1626	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1627	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1628	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
1629	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1630	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
1631	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
1632	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
1633	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1634	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
1635	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
1636	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1637	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1638	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
1639	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
1640	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
1641	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
1642	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
1643	// CHECK: ret void
// Exercises vst1q_s16_x4(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x4.v8i16.p0i16
// (arm64 RUN line) or llvm.arm.neon.vst1x4.p0i16.v8i16 (armv8 RUN line), with
// `a` as the pointer operand and the four <8 x i16> members of `b` as vectors.
1644	void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
1645	vst1q_s16_x4(a, b);
1646	}
1647
1648	// CHECK-LABEL: @test_vst1q_s32_x2(
1649	// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
1650	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
1651	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
1652	// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
1653	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i32>]* %coerce.dive to [4 x i64]*
1654	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1655	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
1656	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
1657	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1658	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
1659	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
1660	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1661	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
1662	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
1663	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
1664	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1665	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
1666	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
1667	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
1668	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
1669	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
1670	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
1671	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* [[TMP9]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
1672	// CHECK: ret void
// Exercises vst1q_s32_x2(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x2.v4i32.p0i32
// (arm64 RUN line) or llvm.arm.neon.vst1x2.p0i32.v4i32 (armv8 RUN line), with
// `a` as the pointer operand and the two <4 x i32> members of `b` as vectors.
1673	void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
1674	vst1q_s32_x2(a, b);
1675	}
1676
1677	// CHECK-LABEL: @test_vst1q_s32_x3(
1678	// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
1679	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
1680	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
1681	// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
1682	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i32>]* %coerce.dive to [6 x i64]*
1683	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1684	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
1685	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
1686	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1687	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
1688	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
1689	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1690	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
1691	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
1692	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
1693	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1694	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
1695	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
1696	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
1697	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1698	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
1699	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
1700	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
1701	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
1702	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
1703	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
1704	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
1705	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* [[TMP12]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
1706	// CHECK: ret void
// Exercises vst1q_s32_x3(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x3.v4i32.p0i32
// (arm64 RUN line) or llvm.arm.neon.vst1x3.p0i32.v4i32 (armv8 RUN line), with
// `a` as the pointer operand and the three <4 x i32> members of `b` as vectors.
1707	void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
1708	vst1q_s32_x3(a, b);
1709	}
1710
1711	// CHECK-LABEL: @test_vst1q_s32_x4(
1712	// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
1713	// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
1714	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
1715	// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
1716	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i32>]* %coerce.dive to [8 x i64]*
1717	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1718	// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
1719	// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
1720	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1721	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
1722	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
1723	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1724	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
1725	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
1726	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
1727	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1728	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
1729	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
1730	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
1731	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1732	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
1733	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
1734	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
1735	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1736	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align [[QALIGN]]
1737	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
1738	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
1739	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
1740	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
1741	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
1742	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
1743	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
1744	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* [[TMP15]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
1745	// CHECK: ret void
// Exercises vst1q_s32_x4(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x4.v4i32.p0i32
// (arm64 RUN line) or llvm.arm.neon.vst1x4.p0i32.v4i32 (armv8 RUN line), with
// `a` as the pointer operand and the four <4 x i32> members of `b` as vectors.
1746	void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
1747	vst1q_s32_x4(a, b);
1748	}
1749
1750	// CHECK-LABEL: @test_vst1q_s64_x2(
1751	// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
1752	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]]
1753	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
1754	// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
1755	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i64>]* %coerce.dive to [4 x i64]*
1756	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1757	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
1758	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
1759	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1760	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1761	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
1762	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1763	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
1764	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
1765	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
1766	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1767	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
1768	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
1769	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
1770	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
1771	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
1772	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
1773	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* [[TMP9]], <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
1774	// CHECK: ret void
// Exercises vst1q_s64_x2(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x2.v2i64.p0i64
// (arm64 RUN line) or llvm.arm.neon.vst1x2.p0i64.v2i64 (armv8 RUN line), with
// `a` as the pointer operand and the two <2 x i64> members of `b` as vectors.
1775	void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
1776	vst1q_s64_x2(a, b);
1777	}
1778
1779	// CHECK-LABEL: @test_vst1q_s64_x3(
1780	// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
1781	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]]
1782	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
1783	// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
1784	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i64>]* %coerce.dive to [6 x i64]*
1785	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1786	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
1787	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
1788	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1789	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1790	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
1791	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1792	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
1793	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
1794	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
1795	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1796	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
1797	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
1798	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
1799	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1800	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
1801	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
1802	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
1803	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
1804	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
1805	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
1806	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
1807	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* [[TMP12]], <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
1808	// CHECK: ret void
// Exercises vst1q_s64_x3(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x3.v2i64.p0i64
// (arm64 RUN line) or llvm.arm.neon.vst1x3.p0i64.v2i64 (armv8 RUN line), with
// `a` as the pointer operand and the three <2 x i64> members of `b` as vectors.
1809	void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
1810	vst1q_s64_x3(a, b);
1811	}
1812
1813	// CHECK-LABEL: @test_vst1q_s64_x4(
1814	// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
1815	// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]]
1816	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
1817	// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
1818	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i64>]* %coerce.dive to [8 x i64]*
1819	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1820	// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
1821	// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
1822	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1823	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
1824	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
1825	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1826	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
1827	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
1828	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
1829	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1830	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
1831	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
1832	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
1833	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1834	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
1835	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
1836	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
1837	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1838	// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align [[QALIGN]]
1839	// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
1840	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
1841	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
1842	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
1843	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
1844	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
1845	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
1846	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* [[TMP15]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
1847	// CHECK: ret void
// Exercises vst1q_s64_x4(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x4.v2i64.p0i64
// (arm64 RUN line) or llvm.arm.neon.vst1x4.p0i64.v2i64 (armv8 RUN line), with
// `a` as the pointer operand and the four <2 x i64> members of `b` as vectors.
1848	void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
1849	vst1q_s64_x4(a, b);
1850	}
1851
1852	// CHECK-LABEL: @test_vst1q_s8_x2(
1853	// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
1854	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]]
1855	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
1856	// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
1857	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
1858	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1859	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
1860	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
1861	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1862	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
1863	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1864	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1865	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
1866	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1867	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1868	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
1869	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
1870	// CHECK: ret void
// Exercises vst1q_s8_x2(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x2.v16i8.p0i8
// (arm64 RUN line) or llvm.arm.neon.vst1x2.p0i8.v16i8 (armv8 RUN line), with
// %a passed directly as the i8* operand and the two <16 x i8> members of `b`
// as vectors.
1871	void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
1872	vst1q_s8_x2(a, b);
1873	}
1874
1875	// CHECK-LABEL: @test_vst1q_s8_x3(
1876	// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
1877	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]]
1878	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
1879	// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
1880	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
1881	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1882	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
1883	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
1884	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1885	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1886	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1887	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1888	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1889	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1890	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1891	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
1892	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1893	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
1894	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
1895	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
1896	// CHECK: ret void
// Exercises vst1q_s8_x3(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x3.v16i8.p0i8
// (arm64 RUN line) or llvm.arm.neon.vst1x3.p0i8.v16i8 (armv8 RUN line), with
// %a passed directly as the i8* operand and the three <16 x i8> members of
// `b` as vectors.
1897	void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
1898	vst1q_s8_x3(a, b);
1899	}
1900
1901	// CHECK-LABEL: @test_vst1q_s8_x4(
1902	// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
1903	// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]]
1904	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
1905	// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
1906	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
1907	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
1908	// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
1909	// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
1910	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
1911	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1912	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1913	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
1914	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1915	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1916	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
1917	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1918	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1919	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
1920	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
1921	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
1922	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
1923	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
1924	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
1925	// CHECK: ret void
// Exercises vst1q_s8_x4(a, b). The FileCheck directives preceding this
// function expect the call to lower to llvm.aarch64.neon.st1x4.v16i8.p0i8
// (arm64 RUN line) or llvm.arm.neon.vst1x4.p0i8.v16i8 (armv8 RUN line), with
// %a passed directly as the i8* operand and the four <16 x i8> members of `b`
// as vectors.
1926	void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
1927	vst1q_s8_x4(a, b);
1928	}
1929
1930	// CHECK-LABEL: @test_vst1q_u16_x2(
1931	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
1932	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]]
1933	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
1934	// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
1935	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <8 x i16>]* %coerce.dive to [4 x i64]*
1936	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
1937	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
1938	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
1939	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
1940	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1941	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
1942	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1943	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1944	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1945	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
1946	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1947	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1948	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1949	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1950	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1951	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
1952	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
1953	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i16.v8i16(i16* [[TMP9]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
1954	// CHECK: ret void
1955	void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) { // Codegen test: passes a and b straight through to vst1q_u16_x2.
1956	vst1q_u16_x2(a, b); // Directives above expect llvm.aarch64.neon.st1x2.v8i16.p0i16 (A64 run) or llvm.arm.neon.vst1x2.p0i16.v8i16 (A32 run).
1957	}
1958
1959	// CHECK-LABEL: @test_vst1q_u16_x3(
1960	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
1961	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]]
1962	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
1963	// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
1964	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <8 x i16>]* %coerce.dive to [6 x i64]*
1965	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
1966	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
1967	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
1968	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
1969	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
1970	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1971	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
1972	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
1973	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
1974	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1975	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
1976	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
1977	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
1978	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
1979	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
1980	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
1981	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
1982	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
1983	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
1984	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
1985	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
1986	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
1987	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i16.v8i16(i16* [[TMP12]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
1988	// CHECK: ret void
1989	void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) { // Codegen test: passes a and b straight through to vst1q_u16_x3.
1990	vst1q_u16_x3(a, b); // Directives above expect llvm.aarch64.neon.st1x3.v8i16.p0i16 (A64 run) or llvm.arm.neon.vst1x3.p0i16.v8i16 (A32 run).
1991	}
1992
1993	// CHECK-LABEL: @test_vst1q_u16_x4(
1994	// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
1995	// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]]
1996	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
1997	// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
1998	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <8 x i16>]* %coerce.dive to [8 x i64]*
1999	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
2000	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
2001	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
2002	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
2003	// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
2004	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2005	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2006	// CHECK: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align [[QALIGN]]
2007	// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
2008	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2009	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2010	// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align [[QALIGN]]
2011	// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
2012	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2013	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2014	// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align [[QALIGN]]
2015	// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
2016	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
2017	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
2018	// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align [[QALIGN]]
2019	// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
2020	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
2021	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
2022	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
2023	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
2024	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
2025	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
2026	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i16.v8i16(i16* [[TMP15]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
2027	// CHECK: ret void
2028	void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) { // Codegen test: passes a and b straight through to vst1q_u16_x4.
2029	vst1q_u16_x4(a, b); // Directives above expect llvm.aarch64.neon.st1x4.v8i16.p0i16 (A64 run) or llvm.arm.neon.vst1x4.p0i16.v8i16 (A32 run).
2030	}
2031
2032	// CHECK-LABEL: @test_vst1q_u32_x2(
2033	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
2034	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]]
2035	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
2036	// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
2037	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <4 x i32>]* %coerce.dive to [4 x i64]*
2038	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
2039	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
2040	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
2041	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
2042	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
2043	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
2044	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2045	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
2046	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
2047	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
2048	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2049	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
2050	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
2051	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
2052	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
2053	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
2054	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
2055	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i32.v4i32(i32* [[TMP9]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
2056	// CHECK: ret void
2057	void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) { // Codegen test: passes a and b straight through to vst1q_u32_x2.
2058	vst1q_u32_x2(a, b); // Directives above expect llvm.aarch64.neon.st1x2.v4i32.p0i32 (A64 run) or llvm.arm.neon.vst1x2.p0i32.v4i32 (A32 run).
2059	}
2060
2061	// CHECK-LABEL: @test_vst1q_u32_x3(
2062	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
2063	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
2064	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
2065	// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
2066	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <4 x i32>]* %coerce.dive to [6 x i64]*
2067	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
2068	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
2069	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
2070	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
2071	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
2072	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
2073	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2074	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
2075	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
2076	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
2077	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2078	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
2079	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
2080	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
2081	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2082	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
2083	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
2084	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
2085	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
2086	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
2087	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
2088	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
2089	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i32.v4i32(i32* [[TMP12]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
2090	// CHECK: ret void
2091	void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) { // Codegen test: passes a and b straight through to vst1q_u32_x3.
2092	vst1q_u32_x3(a, b); // Directives above expect llvm.aarch64.neon.st1x3.v4i32.p0i32 (A64 run) or llvm.arm.neon.vst1x3.p0i32.v4i32 (A32 run).
2093	}
2094
2095	// CHECK-LABEL: @test_vst1q_u32_x4(
2096	// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
2097	// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
2098	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
2099	// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
2100	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <4 x i32>]* %coerce.dive to [8 x i64]*
2101	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
2102	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
2103	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
2104	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
2105	// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
2106	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
2107	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2108	// CHECK: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align [[QALIGN]]
2109	// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
2110	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
2111	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2112	// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align [[QALIGN]]
2113	// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
2114	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
2115	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2116	// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align [[QALIGN]]
2117	// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
2118	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
2119	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
2120	// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align [[QALIGN]]
2121	// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
2122	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
2123	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
2124	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
2125	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
2126	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
2127	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
2128	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i32.v4i32(i32* [[TMP15]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
2129	// CHECK: ret void
2130	void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) { // Codegen test: passes a and b straight through to vst1q_u32_x4.
2131	vst1q_u32_x4(a, b); // Directives above expect llvm.aarch64.neon.st1x4.v4i32.p0i32 (A64 run) or llvm.arm.neon.vst1x4.p0i32.v4i32 (A32 run).
2132	}
2133
2134	// CHECK-LABEL: @test_vst1q_u64_x2(
2135	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
2136	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
2137	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
2138	// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
2139	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <2 x i64>]* %coerce.dive to [4 x i64]*
2140	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
2141	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
2142	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
2143	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
2144	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
2145	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
2146	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2147	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
2148	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
2149	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
2150	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2151	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
2152	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
2153	// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
2154	// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
2155	// CHECK-DAG: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
2156	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
2157	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i64.v2i64(i64* [[TMP9]], <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
2158	// CHECK: ret void
2159	void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) { // Codegen test: passes a and b straight through to vst1q_u64_x2.
2160	vst1q_u64_x2(a, b); // Directives above expect llvm.aarch64.neon.st1x2.v2i64.p0i64 (A64 run) or llvm.arm.neon.vst1x2.p0i64.v2i64 (A32 run).
2161	}
2162
2163	// CHECK-LABEL: @test_vst1q_u64_x3(
2164	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
2165	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
2166	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
2167	// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
2168	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <2 x i64>]* %coerce.dive to [6 x i64]*
2169	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
2170	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
2171	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
2172	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
2173	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
2174	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
2175	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2176	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
2177	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
2178	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
2179	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2180	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
2181	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
2182	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
2183	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2184	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
2185	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
2186	// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
2187	// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
2188	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
2189	// CHECK-DAG: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
2190	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
2191	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i64.v2i64(i64* [[TMP12]], <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
2192	// CHECK: ret void
2193	void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) { // Codegen test: passes a and b straight through to vst1q_u64_x3.
2194	vst1q_u64_x3(a, b); // Directives above expect llvm.aarch64.neon.st1x3.v2i64.p0i64 (A64 run) or llvm.arm.neon.vst1x3.p0i64.v2i64 (A32 run).
2195	}
2196
2197	// CHECK-LABEL: @test_vst1q_u64_x4(
2198	// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
2199	// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
2200	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
2201	// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
2202	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <2 x i64>]* %coerce.dive to [8 x i64]*
2203	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
2204	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
2205	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
2206	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
2207	// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
2208	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
2209	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2210	// CHECK: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align [[QALIGN]]
2211	// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
2212	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
2213	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2214	// CHECK: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align [[QALIGN]]
2215	// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
2216	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
2217	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2218	// CHECK: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align [[QALIGN]]
2219	// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
2220	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
2221	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
2222	// CHECK: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align [[QALIGN]]
2223	// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
2224	// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
2225	// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
2226	// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
2227	// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
2228	// CHECK-DAG: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
2229	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
2230	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i64.v2i64(i64* [[TMP15]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
2231	// CHECK: ret void
2232	void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) { // Codegen test: passes a and b straight through to vst1q_u64_x4.
2233	vst1q_u64_x4(a, b); // Directives above expect llvm.aarch64.neon.st1x4.v2i64.p0i64 (A64 run) or llvm.arm.neon.vst1x4.p0i64.v2i64 (A32 run).
2234	}
2235
2236	// CHECK-LABEL: @test_vst1q_u8_x2(
2237	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
2238	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
2239	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
2240	// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
2241	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [2 x <16 x i8>]* %coerce.dive to [4 x i64]*
2242	// CHECK-A32: store [4 x i64] %b.coerce, [4 x i64]* [[COERCE_DIVE_TMP]], align 8
2243	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
2244	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
2245	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 32, i1 false)
2246	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
2247	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2248	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
2249	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
2250	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2251	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
2252	// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
2253	// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
2254	// CHECK: ret void
2255	void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) { // Codegen test: passes a and b straight through to vst1q_u8_x2.
2256	vst1q_u8_x2(a, b); // Directives above expect llvm.aarch64.neon.st1x2.v16i8.p0i8 (A64 run) or llvm.arm.neon.vst1x2.p0i8.v16i8 (A32 run).
2257	}
2258
2259	// CHECK-LABEL: @test_vst1q_u8_x3(
2260	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
2261	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
2262	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
2263	// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
2264	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [3 x <16 x i8>]* %coerce.dive to [6 x i64]*
2265	// CHECK-A32: store [6 x i64] %b.coerce, [6 x i64]* [[COERCE_DIVE_TMP]], align 8
2266	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
2267	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
2268	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 48, i1 false)
2269	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
2270	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2271	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
2272	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
2273	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2274	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
2275	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
2276	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2277	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
2278	// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
2279	// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
2280	// CHECK: ret void
2281	void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) { // Codegen test: passes a and b straight through to vst1q_u8_x3.
2282	vst1q_u8_x3(a, b); // Directives above expect llvm.aarch64.neon.st1x3.v16i8.p0i8 (A64 run) or llvm.arm.neon.vst1x3.p0i8.v16i8 (A32 run).
2283	}
2284
2285	// CHECK-LABEL: @test_vst1q_u8_x4(
2286	// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
2287	// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
2288	// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
2289	// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
2290	// CHECK-A32: [[COERCE_DIVE_TMP:%.*]] = bitcast [4 x <16 x i8>]* %coerce.dive to [8 x i64]*
2291	// CHECK-A32: store [8 x i64] %b.coerce, [8 x i64]* [[COERCE_DIVE_TMP]], align 8
2292	// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
2293	// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
2294	// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}(i8* align [[QALIGN]] [[TMP0]], i8* align [[QALIGN]] [[TMP1]], {{i64|i32}} 64, i1 false)
2295	// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2296	// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
2297	// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align [[QALIGN]]
2298	// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2299	// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
2300	// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align [[QALIGN]]
2301	// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2302	// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
2303	// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align [[QALIGN]]
2304	// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
2305	// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
2306	// CHECK: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align [[QALIGN]]
2307	// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
2308	// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
2309	// CHECK: ret void
2310	void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) { // Codegen test: passes a and b straight through to vst1q_u8_x4.
2311	vst1q_u8_x4(a, b); // Directives above expect llvm.aarch64.neon.st1x4.v16i8.p0i8 (A64 run) or llvm.arm.neon.vst1x4.p0i8.v16i8 (A32 run).
2312	}
2313

Clang Project