Home
last modified time | relevance | path

Searched defs:Engine1 (Results 1 – 4 of 4) sorted by relevance

/aosp_15_r20/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/
H A D softmax.h 24 …inline__ void thread_reduce_(Tensor<Engine0, Layout0> const &tensor, Tensor<Engine1, Layout1> &sum… in thread_reduce_()
39 __device__ __forceinline__ void quad_allreduce_(Tensor<Engine0, Layout0> &dst, Tensor<Engine1, Layo… in quad_allreduce_()
48 __device__ __forceinline__ void reduce_(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Lay… in reduce_()
54 __device__ __forceinline__ void reduce_max(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, … in reduce_max()
60 __device__ __forceinline__ void reduce_sum(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, … in reduce_sum()
67 __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, … in scale_apply_exp2()
93 __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1 in max_scale_exp2_sum()
H A D rotary.h 23 Tensor<Engine1, Layout1> &D, in copy_rotary_interleaved()
86 Tensor<Engine1, Layout1> &D, in copy_rotary_contiguous()
H A D mask.h 85 Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &idx_rowcol, in apply_mask_causal_w_idx()
H A D flash_bwd_preprocess_kernel.h 25 Tensor<Engine1, Layout1> &dP_sum, const int gdP_col_stride, const float scale) { in dot_do_o()