; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT

target triple = "nvptx64-unknown-cuda"

;; Make sure we are generating proper instruction sequences for fused ops.
;; The FAST prefix covers the invocation that passes -fp-contract=fast; the
;; DEFAULT prefix covers the invocation that passes no -fp-contract option.
;;
;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
;; we do not form fma.rn at the PTX level and explicitly generate add.rn
;; for all adds to prevent ptxas from fusing the ops.
;; t0 is a multiply whose result feeds an add.  With contraction enabled the
;; pair is expected to become a single fma.rn.f32; without it both operations
;; are expected to stay separate, each carrying an explicit .rn rounding mode.
;; FAST-LABEL: @t0
;; DEFAULT-LABEL: @t0
define float @t0(float %a, float %b, float %c) {
;; FAST: fma.rn.f32
;; DEFAULT: mul.rn.f32
;; DEFAULT: add.rn.f32
  %v0 = fmul float %a, %b
  %v1 = fadd float %v0, %c
  ret float %v1
}

;; FAST-LABEL: @t1
;; DEFAULT-LABEL: @t1
define float @t1(float %a, float %b) {
;; There is no multiply here, so we cannot form an fma in either mode.
;; When contraction is not enabled, make sure we explicitly emit add.rn.f32
;; to prevent ptxas from fusing this add with anything else.  When
;; contraction is enabled, a plain add.f32 (no rounding modifier) is expected.
;; FAST: add.f32
;; DEFAULT: add.rn.f32
  %v1 = fadd float %a, %b
  ret float %v1
}