一 C语言实现
/* Forward declarations: both helpers are defined after this function. */
static inline void pixel_avg2_w16_altivec(uint8_t *dst, intptr_t i_dst,
                                          uint8_t *src1, intptr_t i_src1,
                                          uint8_t *src2, int i_height);
static inline void pixel_avg2_w4_altivec(uint8_t *dst, intptr_t i_dst,
                                         uint8_t *src1, intptr_t i_src1,
                                         uint8_t *src2, int i_height);

/*
 * Rounded average of two 20-pixel-wide blocks, written to dst.
 *
 * The work is split into a 16-column part and a 4-column part, each
 * delegated to the matching helper.
 *
 * dst      destination block
 * i_dst    distance in bytes between consecutive dst rows
 * src1     first source block
 * i_src1   distance in bytes between consecutive source rows; there is no
 *          separate src2 stride, the helpers advance src2 by i_src1 as well
 * src2     second source block
 * i_height number of rows to process
 */
static inline void pixel_avg2_w20_altivec(uint8_t *dst, intptr_t i_dst,
                                          uint8_t *src1, intptr_t i_src1,
                                          uint8_t *src2, int i_height)
{
    /* Columns 0..15. */
    pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
    /* Columns 16..19. */
    pixel_avg2_w4_altivec(dst + 16, i_dst, src1 + 16, i_src1, src2 + 16, i_height);
}
static?inline?void?pixel_avg2_w16_altivec(uint8_t?*dst,?intptr_t?i_dst,
uint8_t?*src1,?intptr_t?i_src1,?
uint8_t?*src2,?int?i_height)
{
????vec_u8_t?src1v,?src2v;
????for?(int?y?=?0;?y?<?i_height;?y++)
????{
????????src1v?=?vec_vsx_ld(0,?src1);
????????src2v?=?vec_vsx_ld(0,?src2);
????????src1v?=?vec_avg(src1v,?srcv2);
????????vec_st(src1v,?0,?dst);
????????dst?+=?i_dst;
????????src1?+=?i_src1;
????????src2?+=?i_src1;?
????}
}
/*
 * Rounded average of two 4-pixel-wide blocks, computed in plain scalar C.
 *
 * Each output byte is (src1[x] + src2[x] + 1) >> 1.
 *
 * dst      destination block
 * i_dst    distance in bytes between consecutive dst rows
 * src1     first source block
 * i_src1   distance in bytes between consecutive rows of BOTH sources
 *          (the function has no separate src2 stride parameter)
 * src2     second source block
 * i_height number of rows to process; nothing is done when it is <= 0
 */
static inline void pixel_avg2_w4_altivec(uint8_t *dst, intptr_t i_dst,
                                         uint8_t *src1, intptr_t i_src1,
                                         uint8_t *src2, int i_height)
{
    for (int y = 0; y < i_height; y++)
    {
        for (int x = 0; x < 4; x++)
            dst[x] = (src1[x] + src2[x] + 1) >> 1;

        dst += i_dst;
        src1 += i_src1;
        /* Intentional: src2 shares the src1 stride. */
        src2 += i_src1;
    }
}
二 汇编实现
// Prologue excerpt only; the stack layout it relies on is described below.
function pixel_avg2_w20_neon
    ldr ip, [sp, #4]    // ip = i_height, read before the push moves sp
    push {lr}           // save lr; sp decreases by 4
    sub r1, r1, #16     // r1 = r1 - 16
    ldr lr, [sp, #4]    // lr = src2, which was at [sp] before the push
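栈布局(高地址在上,栈向低地址增长):
| 栈上内容 | 说明 |
| --- | --- |
| i_height | 函数入口时位于 sp + 4 |
| src2 | 函数入口时 sp 指向的位置,此时 sp + 4 就是 i_height |
| lr(由 push {lr} 压入) | push 之后 sp 指向这里,此时 sp + 4 就是 src2 |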
// pixel_avg2_w20_neon: rounded average of two 20-byte-wide blocks,
// two rows per loop iteration.
//
// Register use as read from the code below:
//   r0 = dst (store address), r2 = src1, lr = src2, ip = i_height,
//   r3 = post-increment applied to both source pointers after each row.
// NOTE(review): r1 is presumably i_dst and r3 i_src1, matching the C
// prototype under the ARM procedure call standard -- confirm.
function pixel_avg2_w20_neon
    ldr         ip,  [sp, #4]           // ip = i_height, read before the push moves sp
    push        {lr}                    // save lr on the stack; sp decreases by 4
    sub         r1,  r1,  #16           // r1 -= 16: the 16-byte store below already advances r0 by 16
    ldr         lr,  [sp, #4]           // lr = src2 (it was at [sp] before the push)
avg2_w20_loop:
    subs        ip,  ip,  #2            // ip -= 2 and set flags; two rows are handled per iteration
    vld1.64     {d0-d2},  [r2], r3      // row A: load 24 bytes from src1 (r2) into d0-d2, then r2 += r3
    vld1.64     {d4-d6},  [lr], r3      // row A: load 24 bytes from src2 (lr) into d4-d6, then lr += r3
    vrhadd.u8   q0,  q0,  q2            // q0 = (q0 + q2 + 1) >> 1 per byte; q0 = d0:d1, q2 = d4:d5
    // The instruction above produces the rounded average of bytes 0-15 (128 bits).
    vrhadd.u8   d2,  d2,  d6            // d2 = (d2 + d6 + 1) >> 1 per byte
    // The instruction above averages the trailing 8 bytes (16-23); only 16-19 are stored later.
    vld1.64     {d4-d6},  [r2], r3      // row B: load 24 bytes from src1 into d4-d6, then r2 += r3
    vld1.64     {d16-d18},[lr], r3      // row B: load 24 bytes from src2 into d16-d18, then lr += r3
    vrhadd.u8   q2,  q2,  q8            // q2 = (q2 + q8 + 1) >> 1 per byte
    // Same as for row A: q2 = d4:d5, q8 = d16:d17, 16 bytes averaged.
    vst1.64     {d0-d1},  [r0,:128]!    // row A: store 16 bytes at r0 (128-bit aligned), then r0 += 16
    vrhadd.u8   d6,  d6,  d18           // row B: d6 = (d6 + d18 + 1) >> 1, the trailing 8 bytes
    vst1.32     {d2[0]},  [r0,:32], r1  // row A: store the low 32 bits of d2 (bytes 16-19) at r0
    // The store goes to [r0]; r1 is a post-increment, so afterwards r0 += r1.
    // Together with the earlier +16 this moves r0 forward by the original r1,
    // i.e. to the start of the next destination row.
    // (d2[1] would select the high 32 bits of d2; it is not used here.)
    vst1.64     {d4-d5},  [r0,:128]!    // row B: store the 16 averaged bytes d4-d5, then r0 += 16
    vst1.32     {d6[0]},  [r0,:32], r1  // row B: store the low 32 bits of d6 (bytes 16-19), then r0 += r1
    // Two rows were produced above, which is why ip is decremented by 2.
    bgt         avg2_w20_loop           // loop while ip > 0 (flags from subs; NEON ops leave them untouched)
    pop         {pc}                    // pop the saved lr into pc, i.e. return
endfunc