Home
last modified time | relevance | path

Searched defs:Engine0 (Results 1 – 4 of 4) sorted by relevance

/aosp_15_r20/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/
H A Dsoftmax.h24 __device__ __forceinline__ void thread_reduce_(Tensor<Engine0, Layout0> const &tensor, Tensor<Engin… in thread_reduce_()
39 __device__ __forceinline__ void quad_allreduce_(Tensor<Engine0, Layout0> &dst, Tensor<Engine1, Layo… in quad_allreduce_()
48 __device__ __forceinline__ void reduce_(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Lay… in reduce_()
54 __device__ __forceinline__ void reduce_max(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, … in reduce_max()
60 __device__ __forceinline__ void reduce_sum(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, … in reduce_sum()
67 __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, … in scale_apply_exp2()
93 __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1… in max_scale_exp2_sum()
H A Drotary.h22 __forceinline__ __device__ void copy_rotary_interleaved(Tensor<Engine0, Layout0> const &S, in copy_rotary_interleaved()
85 __forceinline__ __device__ void copy_rotary_contiguous(Tensor<Engine0, Layout0> const &S, in copy_rotary_contiguous()
H A Dflash_bwd_preprocess_kernel.h24 inline __device__ void dot_do_o(Tensor<Engine0, Layout0> const &do_, Tensor<Engine0, Layout0> const… in dot_do_o()
H A Dmask.h85 Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &idx_rowcol, in apply_mask_causal_w_idx()