; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s

; Even though general vector types are not supported in PTX, we can still
; optimize loads/stores with pseudo-vector instructions of the form:
;
; ld.v2.f32 {%f0, %f1}, [%r0]
;
; which will load two floats at once into scalar registers.
;
; Every function in this file loads a vector through %a, multiplies it by
; itself and stores the result back through %a. The checks only cover the
; load instruction(s); the stores are not checked here.
;
; CHECK-LABEL is used for the ".func" lines so that each function's checks are
; confined to that function's own output and cannot be satisfied by a later
; function.

; <2 x float>: checks for one ld.v2.f32 into two %f registers.
define void @foo(<2 x float>* %a) {
; CHECK-LABEL: .func foo
; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <2 x float>, <2 x float>* %a
  %t2 = fmul <2 x float> %t1, %t1
  store <2 x float> %t2, <2 x float>* %a
  ret void
}

; <4 x float>: checks for one ld.v4.f32 into four %f registers.
define void @foo2(<4 x float>* %a) {
; CHECK-LABEL: .func foo2
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <4 x float>, <4 x float>* %a
  %t2 = fmul <4 x float> %t1, %t1
  store <4 x float> %t2, <4 x float>* %a
  ret void
}

; <8 x float>: checks for two ld.v4.f32 instructions on consecutive lines
; (the second is matched with CHECK-NEXT).
define void @foo3(<8 x float>* %a) {
; CHECK-LABEL: .func foo3
; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
  %t1 = load <8 x float>, <8 x float>* %a
  %t2 = fmul <8 x float> %t1, %t1
  store <8 x float> %t2, <8 x float>* %a
  ret void
}



; <2 x i32>: checks for one ld.v2.u32 into two %r registers.
define void @foo4(<2 x i32>* %a) {
; CHECK-LABEL: .func foo4
; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <2 x i32>, <2 x i32>* %a
  %t2 = mul <2 x i32> %t1, %t1
  store <2 x i32> %t2, <2 x i32>* %a
  ret void
}

; <4 x i32>: checks for one ld.v4.u32 into four %r registers.
define void @foo5(<4 x i32>* %a) {
; CHECK-LABEL: .func foo5
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <4 x i32>, <4 x i32>* %a
  %t2 = mul <4 x i32> %t1, %t1
  store <4 x i32> %t2, <4 x i32>* %a
  ret void
}
; <8 x i32>: loads the vector through %a, multiplies it by itself and stores it
; back. Checks for two ld.v4.u32 instructions on consecutive lines (the second
; is matched with CHECK-NEXT). CHECK-LABEL confines these checks to this
; function's own output.
define void @foo6(<8 x i32>* %a) {
; CHECK-LABEL: .func foo6
; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
  %t1 = load <8 x i32>, <8 x i32>* %a
  %t2 = mul <8 x i32> %t1, %t1
  store <8 x i32> %t2, <8 x i32>* %a
  ret void
}