1@/***************************************************************************** 2@* 3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4@* 5@* Licensed under the Apache License, Version 2.0 (the "License"); 6@* you may not use this file except in compliance with the License. 7@* You may obtain a copy of the License at: 8@* 9@* http://www.apache.org/licenses/LICENSE-2.0 10@* 11@* Unless required by applicable law or agreed to in writing, software 12@* distributed under the License is distributed on an "AS IS" BASIS, 13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14@* See the License for the specific language governing permissions and 15@* limitations under the License. 16@* 17@*****************************************************************************/ 18@/** 19@******************************************************************************* 20@* @file 21@* ihevc_intra_pred_chroma_ver_neon.s 22@* 23@* @brief 24@* contains function definitions for intra prediction dc filtering. 
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_ver_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma intra prediction, vertical mode
@*
@* @par description:
@*  reads 2 * nt bytes starting at pu1_ref + (4 * nt) + 2 and stores that same
@*  byte row at pu1_dst + (row * dst_strd) for each row of the block.
@*  the data is handled as plain bytes (presumably interleaved u/v samples -
@*  confirm against the c reference implementation).
@*
@*  three code paths exist, selected on nt:
@*      nt <  8 : 4 rows of  8 bytes
@*      nt == 8 : 8 rows of 16 bytes
@*      nt >  8 : 32 bytes per row, 4 rows per loop iteration
@*  so only nt = 4, 8 and 16 produce a complete (2 * nt) x nt block.
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the reference samples
@*
@* @param[in] src_strd
@*  integer source stride (not read by this function)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] nt
@*  size of transform block (passed on the stack)
@*
@* @param[in] mode
@*  prediction mode (passed on the stack, not read by this function)
@*
@* @returns
@*  none
@*
@* @remarks
@*  only r0-r3, r4-r6, r8, r10, r11, d0 and d20-d23 are written. the neon
@*  registers used are caller-saved under the aapcs, so none are preserved.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_ver(uword8 *pu1_ref,
@                                 word32 src_strd,
@                                 uword8 *pu1_dst,
@                                 word32 dst_strd,
@                                 word32 nt,
@                                 word32 mode)
@**************variables vs registers*****************************************
@r0  => *pu1_ref
@r1  => src_strd        (unused)
@r2  => *pu1_dst        (row 0, then advanced by 4 rows at a time)
@r3  => dst_strd
@r4  => nt              (row counter in the 32-byte path)
@r5  => pu1_dst + 1 * dst_strd
@r6  => &pu1_ref[4 * nt + 2]
@r8  => pu1_dst + 2 * dst_strd
@r10 => pu1_dst + 3 * dst_strd
@r11 => 4 * dst_strd

@stack contents from #40 (after the prologue has pushed 10 registers)
@   nt
@   mode

.equ    nt_offset,      40          @ 10 pushed registers * 4 bytes

.text
.p2align 4

.globl  ihevc_intra_pred_chroma_ver_a9q

.type   ihevc_intra_pred_chroma_ver_a9q, %function

ihevc_intra_pred_chroma_ver_a9q:

    stmfd       sp!, {r4-r12, r14}          @ frame size must match nt_offset

    ldr         r4, [sp, #nt_offset]        @ r4 = nt

    @ set-up shared by every block size (none of these touch the flags)
    add         r6, r0, r4, lsl #2          @ r6 = pu1_ref + 4 * nt
    add         r6, r6, #2                  @ r6 = &pu1_ref[4 * nt + 2]
    add         r5, r2, r3                  @ r5  = dst row 1
    add         r8, r5, r3                  @ r8  = dst row 2
    add         r10, r8, r3                 @ r10 = dst row 3
    lsl         r11, r3, #2                 @ r11 = 4 * dst_strd

    cmp         r4, #8
    beq         .Lchroma_ver_blk_8
    blt         .Lchroma_ver_blk_4

    @ nt > 8 : 32 bytes per row, four rows per iteration
    vld1.8      {d20, d21, d22, d23}, [r6]  @ the row to replicate

.Lchroma_ver_blk_16_loop:
    vst1.8      {d20, d21, d22, d23}, [r2], r11
    vst1.8      {d20, d21, d22, d23}, [r5], r11
    vst1.8      {d20, d21, d22, d23}, [r8], r11
    vst1.8      {d20, d21, d22, d23}, [r10], r11

    subs        r4, r4, #4                  @ four rows done
    bgt         .Lchroma_ver_blk_16_loop    @ bgt (not bne) so the loop always
                                            @ terminates, whatever nt is

    b           .Lchroma_ver_end

.Lchroma_ver_blk_8:
    @ nt == 8 : 16 bytes per row, eight rows, fully unrolled
    vld1.8      {d20, d21}, [r6]            @ only the 16 bytes that are stored

    vst1.8      {d20, d21}, [r2], r11       @ rows 0-3, pointers move to rows 4-7
    vst1.8      {d20, d21}, [r5], r11
    vst1.8      {d20, d21}, [r8], r11
    vst1.8      {d20, d21}, [r10], r11

    vst1.8      {d20, d21}, [r2]            @ rows 4-7
    vst1.8      {d20, d21}, [r5]
    vst1.8      {d20, d21}, [r8]
    vst1.8      {d20, d21}, [r10]

    b           .Lchroma_ver_end

.Lchroma_ver_blk_4:
    @ nt < 8 : 8 bytes per row, four rows
    vld1.8      {d0}, [r6]

    vst1.8      {d0}, [r2]
    vst1.8      {d0}, [r5]
    vst1.8      {d0}, [r8]
    vst1.8      {d0}, [r10]

.Lchroma_ver_end:
    ldmfd       sp!, {r4-r12, r15}          @ restore registers and return

.size   ihevc_intra_pred_chroma_ver_a9q, . - ihevc_intra_pred_chroma_ver_a9q