Searched defs:Engine0 (Results 1 – 4 of 4) sorted by relevance
/aosp_15_r20/external/pytorch/aten/src/ATen/native/transformers/cuda/flash_attn/ |
H A D | softmax.h | 24 __device__ __forceinline__ void thread_reduce_(Tensor<Engine0, Layout0> const &tensor, Tensor<Engin… in thread_reduce_() 39 __device__ __forceinline__ void quad_allreduce_(Tensor<Engine0, Layout0> &dst, Tensor<Engine1, Layo… in quad_allreduce_() 48 __device__ __forceinline__ void reduce_(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, Lay… in reduce_() 54 __device__ __forceinline__ void reduce_max(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, … in reduce_max() 60 __device__ __forceinline__ void reduce_sum(Tensor<Engine0, Layout0> const& tensor, Tensor<Engine1, … in reduce_sum() 67 __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, … in scale_apply_exp2() 93 __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0> &tensor, Tensor<Engine1… in max_scale_exp2_sum()
|
H A D | rotary.h | 22 __forceinline__ __device__ void copy_rotary_interleaved(Tensor<Engine0, Layout0> const &S, in copy_rotary_interleaved() 85 __forceinline__ __device__ void copy_rotary_contiguous(Tensor<Engine0, Layout0> const &S, in copy_rotary_contiguous()
|
H A D | flash_bwd_preprocess_kernel.h | 24 inline __device__ void dot_do_o(Tensor<Engine0, Layout0> const &do_, Tensor<Engine0, Layout0> const… in dot_do_o()
|
H A D | mask.h | 85 Tensor<Engine0, Layout0> &tensor, Tensor<Engine1, Layout1> const &idx_rowcol, in apply_mask_causal_w_idx()
|