xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/tests_data/saxpy.ll (revision b6fb3261f9314811a0f4371741dbb8839866f948)
; Target description for this module.
; Layout string: little-endian ("e"), i64 aligned to 64 bits, 16- and 32-bit
; vectors aligned to their own size, native integer widths 16/32/64.
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"
3
; Three-field i32 records (named for CUDA's uint3 / dim3).
%struct.uint3 = type { i32, i32, i32 }
%struct.dim3  = type { i32, i32, i32 }

; Thread-geometry variables. They are declared here only (external, address
; space 1); nothing in this module defines them. Both kernels below read
; field 0 of each through an addrspacecast to the generic address space.
; NOTE(review): NVPTX code normally obtains these values from the
; llvm.nvvm.read.ptx.sreg.* intrinsics instead of globals - confirm how these
; symbols are meant to be resolved.
@blockIdx  = external addrspace(1) global %struct.uint3
@blockDim  = external addrspace(1) global %struct.dim3
@threadIdx = external addrspace(1) global %struct.uint3
10
; float expf(float f)
; Thin forwarding wrapper: passes %f to @__nv_expf and returns its result.
; The argument is first spilled to a stack slot and reloaded before the call.
; Function Attrs: alwaysinline nounwind readnone
define float @expf(float %f) #0 {
entry:
  %f.slot = alloca float, align 4
  store float %f, float* %f.slot, align 4
  %f.val = load float, float* %f.slot, align 4
  %result = call float @__nv_expf(float %f.val)
  ret float %result
}
20
; Declaration only: the body of @__nv_expf is not part of this module and must
; be supplied at link time (presumably by NVIDIA libdevice, judging by the
; __nv_ prefix - confirm). Called from @expf.
declare float @__nv_expf(float) #1
22
; void cuda_saxpy(i32* n, float* a, float* x, float* y)
; Listed as a kernel in !nvvm.annotations.
; Computes idx = blockIdx[0] * blockDim[0] + threadIdx[0] and, when
; idx < n[0] (signed compare), stores a[0] * x[idx] + y[idx] into y[idx].
; Every value is kept in a stack slot and reloaded at each use.
; Function Attrs: nounwind
define void @cuda_saxpy(i32* %n, float* %a, float* %x, float* %y) #2 {
entry:
  ; Stack homes for the four pointer arguments and the element index.
  %n.slot = alloca i32*, align 8
  %a.slot = alloca float*, align 8
  %x.slot = alloca float*, align 8
  %y.slot = alloca float*, align 8
  %idx.slot = alloca i32, align 4
  store i32* %n, i32** %n.slot, align 8
  store float* %a, float** %a.slot, align 8
  store float* %x, float** %x.slot, align 8
  store float* %y, float** %y.slot, align 8

  ; idx = blockIdx[0] * blockDim[0] + threadIdx[0]
  %bid = load i32, i32* getelementptr inbounds (%struct.uint3, %struct.uint3* addrspacecast (%struct.uint3 addrspace(1)* @blockIdx to %struct.uint3*), i32 0, i32 0), align 4
  %bdim = load i32, i32* getelementptr inbounds (%struct.dim3, %struct.dim3* addrspacecast (%struct.dim3 addrspace(1)* @blockDim to %struct.dim3*), i32 0, i32 0), align 4
  %block.base = mul i32 %bid, %bdim
  %tid = load i32, i32* getelementptr inbounds (%struct.uint3, %struct.uint3* addrspacecast (%struct.uint3 addrspace(1)* @threadIdx to %struct.uint3*), i32 0, i32 0), align 4
  %gidx = add i32 %block.base, %tid
  store i32 %gidx, i32* %idx.slot, align 4

  ; Bounds guard: skip the update unless idx < n[0].
  %idx.chk = load i32, i32* %idx.slot, align 4
  %n.ptr = load i32*, i32** %n.slot, align 8
  %n.elem0 = getelementptr inbounds i32, i32* %n.ptr, i64 0
  %n.val = load i32, i32* %n.elem0, align 4
  %in.range = icmp slt i32 %idx.chk, %n.val
  br i1 %in.range, label %update, label %done

update:                                           ; preds = %entry
  ; a[0]
  %a.ptr = load float*, float** %a.slot, align 8
  %a.elem0 = getelementptr inbounds float, float* %a.ptr, i64 0
  %a.val = load float, float* %a.elem0, align 4
  ; x[idx]
  %idx.x = load i32, i32* %idx.slot, align 4
  %idx.x64 = sext i32 %idx.x to i64
  %x.ptr = load float*, float** %x.slot, align 8
  %x.elem = getelementptr inbounds float, float* %x.ptr, i64 %idx.x64
  %x.val = load float, float* %x.elem, align 4
  %ax = fmul float %a.val, %x.val
  ; y[idx]
  %idx.y = load i32, i32* %idx.slot, align 4
  %idx.y64 = sext i32 %idx.y to i64
  %y.ptr = load float*, float** %y.slot, align 8
  %y.elem = getelementptr inbounds float, float* %y.ptr, i64 %idx.y64
  %y.val = load float, float* %y.elem, align 4
  %axpy = fadd float %ax, %y.val
  ; y[idx] = a[0] * x[idx] + y[idx]
  %idx.out = load i32, i32* %idx.slot, align 4
  %idx.out64 = sext i32 %idx.out to i64
  %y.out.ptr = load float*, float** %y.slot, align 8
  %y.out.elem = getelementptr inbounds float, float* %y.out.ptr, i64 %idx.out64
  store float %axpy, float* %y.out.elem, align 4
  br label %done

done:                                             ; preds = %update, %entry
  ret void
}
74
; void cuda_saxpy_s(i32* n, float* a, float* x, float* y)
; Listed as a kernel in !nvvm.annotations.
; Same computation as @cuda_saxpy, with one addition: a barrier call is issued
; unconditionally in the entry block, after the index is computed and before
; the bounds check. When idx < n[0] (signed compare) the kernel stores
; a[0] * x[idx] + y[idx] into y[idx], where
; idx = blockIdx[0] * blockDim[0] + threadIdx[0].
;
; The barrier is emitted through @llvm.nvvm.barrier0. The previously used
; @llvm.cuda.syncthreads is no longer an LLVM intrinsic, so a call to it is not
; lowered to a barrier instruction by current NVPTX backends.
; Function Attrs: nounwind
define void @cuda_saxpy_s(i32* %n, float* %a, float* %x, float* %y) #2 {
entry:
  ; Stack homes for the four pointer arguments and the element index.
  %n.addr = alloca i32*, align 8
  %a.addr = alloca float*, align 8
  %x.addr = alloca float*, align 8
  %y.addr = alloca float*, align 8
  %i = alloca i32, align 4
  store i32* %n, i32** %n.addr, align 8
  store float* %a, float** %a.addr, align 8
  store float* %x, float** %x.addr, align 8
  store float* %y, float** %y.addr, align 8
  ; i = blockIdx[0] * blockDim[0] + threadIdx[0]
  %0 = load i32, i32* getelementptr inbounds (%struct.uint3, %struct.uint3* addrspacecast (%struct.uint3 addrspace(1)* @blockIdx to %struct.uint3*), i32 0, i32 0), align 4
  %1 = load i32, i32* getelementptr inbounds (%struct.dim3, %struct.dim3* addrspacecast (%struct.dim3 addrspace(1)* @blockDim to %struct.dim3*), i32 0, i32 0), align 4
  %mul = mul i32 %0, %1
  %2 = load i32, i32* getelementptr inbounds (%struct.uint3, %struct.uint3* addrspacecast (%struct.uint3 addrspace(1)* @threadIdx to %struct.uint3*), i32 0, i32 0), align 4
  %add = add i32 %mul, %2
  store i32 %add, i32* %i, align 4
  ; Barrier placed ahead of the conditional branch, so it is reached on every
  ; path through the function.
  call void @llvm.nvvm.barrier0()
  ; Bounds guard: skip the update unless i < n[0].
  %3 = load i32, i32* %i, align 4
  %4 = load i32*, i32** %n.addr, align 8
  %arrayidx = getelementptr inbounds i32, i32* %4, i64 0
  %5 = load i32, i32* %arrayidx, align 4
  %cmp = icmp slt i32 %3, %5
  br i1 %cmp, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  ; a[0]
  %6 = load float*, float** %a.addr, align 8
  %arrayidx1 = getelementptr inbounds float, float* %6, i64 0
  %7 = load float, float* %arrayidx1, align 4
  ; x[i]
  %8 = load i32, i32* %i, align 4
  %idxprom = sext i32 %8 to i64
  %9 = load float*, float** %x.addr, align 8
  %arrayidx2 = getelementptr inbounds float, float* %9, i64 %idxprom
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %7, %10
  ; y[i]
  %11 = load i32, i32* %i, align 4
  %idxprom4 = sext i32 %11 to i64
  %12 = load float*, float** %y.addr, align 8
  %arrayidx5 = getelementptr inbounds float, float* %12, i64 %idxprom4
  %13 = load float, float* %arrayidx5, align 4
  %add6 = fadd float %mul3, %13
  ; y[i] = a[0] * x[i] + y[i]
  %14 = load i32, i32* %i, align 4
  %idxprom7 = sext i32 %14 to i64
  %15 = load float*, float** %y.addr, align 8
  %arrayidx8 = getelementptr inbounds float, float* %15, i64 %idxprom7
  store float %add6, float* %arrayidx8, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
}

; Barrier intrinsic called from @cuda_saxpy_s.
; Function Attrs: nounwind
declare void @llvm.nvvm.barrier0() #3
130
; Attribute groups referenced by the definitions and declarations above.
; #0: @expf - always inlined, nounwind, readnone, plus the shared string attributes.
attributes #0 = { alwaysinline nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #1: the external @__nv_expf declaration - string attributes only.
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #2: the two kernels (@cuda_saxpy, @cuda_saxpy_s) - nounwind plus the string attributes.
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; #3: the barrier intrinsic declaration.
attributes #3 = { nounwind }
135
; Named metadata.
; !nvvm.annotations lists the functions tagged as kernels (!0 and !1 below).
!nvvm.annotations = !{!0, !1}
; !llvm.ident records the producer string (!2).
!llvm.ident = !{!2}

; Each annotation node is: function pointer, the string "kernel", i32 1.
!0 = !{void (i32*, float*, float*, float*)* @cuda_saxpy, !"kernel", i32 1}
!1 = !{void (i32*, float*, float*, float*)* @cuda_saxpy_s, !"kernel", i32 1}
!2 = !{!"clang version xla-trunk (trunk r203011)"}
142