From f6fed079c91dfc85deb834038dcc71b9695fecad Mon Sep 17 00:00:00 2001
From: Waldemar Brodkorb
Date: Wed, 16 Oct 2013 13:09:30 +0200
Subject: add arm optimized dts decoding from raspberry pi developers

---
 package/ffmpeg/Makefile                            |   2 +-
 .../ffmpeg/patches/patch-libavcodec_arm_Makefile   |  12 ++
 .../patches/patch-libavcodec_arm_fft_init_arm_c    |  27 +++
 .../ffmpeg/src/libavcodec/arm/synth_filter_vfp.S   | 206 +++++++++++++++++++++
 4 files changed, 246 insertions, 1 deletion
 create mode 100644 package/ffmpeg/patches/patch-libavcodec_arm_Makefile
 create mode 100644 package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c
 create mode 100644 package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S

(limited to 'package')

diff --git a/package/ffmpeg/Makefile b/package/ffmpeg/Makefile
index 045b9edec..9c4cea01c 100644
--- a/package/ffmpeg/Makefile
+++ b/package/ffmpeg/Makefile
@@ -5,7 +5,7 @@ include ${TOPDIR}/rules.mk
 
 PKG_NAME:= ffmpeg
 PKG_VERSION:= 2.0.2
-PKG_RELEASE:= 1
+PKG_RELEASE:= 2
 PKG_MD5SUM:= 6c5cfed204d8a108325d1fc439ab734a
 PKG_DESCR:= record, convert and stream audio & video
 PKG_SECTION:= libs
diff --git a/package/ffmpeg/patches/patch-libavcodec_arm_Makefile b/package/ffmpeg/patches/patch-libavcodec_arm_Makefile
new file mode 100644
index 000000000..f504f60f9
--- /dev/null
+++ b/package/ffmpeg/patches/patch-libavcodec_arm_Makefile
@@ -0,0 +1,12 @@
+--- ffmpeg-2.0.1.orig/libavcodec/arm/Makefile 2013-08-11 01:23:24.000000000 +0200
++++ ffmpeg-2.0.1/libavcodec/arm/Makefile 2013-10-14 17:47:19.000000000 +0200
+@@ -52,7 +52,8 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER) +
+                                           arm/vp8dsp_init_armv6.o \
+                                           arm/vp8dsp_armv6.o
+ 
+-VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
++VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
++                                          arm/synth_filter_vfp.o
+ 
+ NEON-OBJS += arm/fmtconvert_neon.o
+ 
diff --git a/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c b/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c
new file mode 100644
index 000000000..dc562a061
--- /dev/null
+++ b/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c
@@ -0,0 +1,27 @@
+--- ffmpeg-2.0.2.orig/libavcodec/arm/fft_init_arm.c 2013-10-08 19:52:31.000000000 +0200
++++ ffmpeg-2.0.2/libavcodec/arm/fft_init_arm.c 2013-10-16 12:59:04.000000000 +0200
+@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FF
+ 
+ void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
+ 
++void ff_synth_filter_float_vfp(FFTContext *imdct,
++                               float *synth_buf_ptr, int *synth_buf_offset,
++                               float synth_buf2[32], const float window[512],
++                               float out[32], const float in[32],
++                               float scale);
++
+ void ff_synth_filter_float_neon(FFTContext *imdct,
+                                 float *synth_buf_ptr, int *synth_buf_offset,
+                                 float synth_buf2[32], const float window[512],
+@@ -71,6 +77,11 @@ av_cold void ff_synth_filter_init_arm(Sy
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
++    /* The VFP routine switches FPSCR into short-vector mode, which is
++     * deprecated from VFPv3 onwards, so only select it on pre-VFPv3 cores. */
++    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
++        s->synth_filter_float = ff_synth_filter_float_vfp;
++
+     if (have_neon(cpu_flags))
+         s->synth_filter_float = ff_synth_filter_float_neon;
+ }
diff --git a/package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S b/package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S
new file mode 100644
index 000000000..451fe5ce9
--- /dev/null
+++ b/package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Author: Ben Avison
+ */
+
+#include "libavutil/arm/asm.S"
+
+IMDCT           .req    r0
+ORIG_P_SB       .req    r1
+P_SB_OFF        .req    r2
+I               .req    r0
+P_SB2_UP        .req    r1
+OLDFPSCR        .req    r2
+P_SB2_DN        .req    r3
+P_WIN_DN        .req    r4
+P_OUT_DN        .req    r5
+P_SB            .req    r6
+J_WRAP          .req    r7
+P_WIN_UP        .req    r12
+P_OUT_UP        .req    r14
+
+SCALE           .req    s0
+SBUF_DAT_REV0   .req    s4
+SBUF_DAT_REV1   .req    s5
+SBUF_DAT_REV2   .req    s6
+SBUF_DAT_REV3   .req    s7
+VA0             .req    s8
+VA3             .req    s11
+VB0             .req    s12
+VB3             .req    s15
+VC0             .req    s8
+VC3             .req    s11
+VD0             .req    s12
+VD3             .req    s15
+SBUF_DAT0       .req    s16
+SBUF_DAT1       .req    s17
+SBUF_DAT2       .req    s18
+SBUF_DAT3       .req    s19
+SBUF_DAT_ALT0   .req    s20
+SBUF_DAT_ALT1   .req    s21
+SBUF_DAT_ALT2   .req    s22
+SBUF_DAT_ALT3   .req    s23
+WIN_DN_DAT0     .req    s24
+WIN_UP_DAT0     .req    s28
+
+
+.macro inner_loop  half, tail, head
+ .if (OFFSET & (64*4)) == 0                @ even numbered call
+        SBUF_DAT_THIS0 .req SBUF_DAT0
+        SBUF_DAT_THIS1 .req SBUF_DAT1
+        SBUF_DAT_THIS2 .req SBUF_DAT2
+        SBUF_DAT_THIS3 .req SBUF_DAT3
+  .ifnc "\head",""
+        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
+        vldr    d9, [P_SB, #OFFSET+8]
+  .endif
+ .else
+        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
+        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
+        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
+        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
+  .ifnc "\head",""
+        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
+        vldr    d11, [P_SB, #OFFSET+8]
+  .endif
+ .endif
+ .ifnc "\tail",""
+  .ifc "\half","ab"
+        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
+  .else
+        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
+  .endif
+ .endif
+ .ifnc "\head",""
+        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
+        vldr    d15, [P_WIN_UP, #OFFSET+8]
+        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
+        vldr    d13, [P_WIN_DN, #OFFSET+8]
+        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0    @ REV3..REV0 receive THIS0..THIS3, i.e. the four values in reverse order
+        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
+        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
+        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
+  .ifc "\half","ab"
+        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
+  .else
+        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
+  .endif
+        teq     J_WRAP, #J
+        bne     2f             @ strongly predictable, so better than cond exec in this case
+        sub     P_SB, P_SB, #512*4         @ wrap: step back by 512 floats, the modulus applied to the offset in the prologue
+2:
+  .set J, J - 64
+  .set OFFSET, OFFSET + 64*4
+ .endif
+        .unreq  SBUF_DAT_THIS0
+        .unreq  SBUF_DAT_THIS1
+        .unreq  SBUF_DAT_THIS2
+        .unreq  SBUF_DAT_THIS3
+.endm
+
+
+/* void ff_synth_filter_float_vfp(FFTContext *imdct,
+ *                                float *synth_buf_ptr, int *synth_buf_offset,
+ *                                float synth_buf2[32], const float window[512],
+ *                                float out[32], const float in[32], float scale)
+ */
+function ff_synth_filter_float_vfp, export=1
+        push    {r3-r7,lr}                 @ r3 (synth_buf2) is saved so it can be reloaded from the stack after the call
+        vpush   {s16-s31}
+        ldr     lr, [P_SB_OFF]
+        add     a2, ORIG_P_SB, lr, LSL #2  @ calculate synth_buf to pass to imdct_half
+        mov     P_SB, a2                   @ and keep a copy for ourselves
+        bic     J_WRAP, lr, #63            @ mangled to make testing for wrap easier in inner loop
+        sub     lr, lr, #32
+        and     lr, lr, #512-32
+        str     lr, [P_SB_OFF]             @ rotate offset, modulo buffer size, ready for next call
+        ldr     a3, [sp, #(16+6+2)*4]      @ fetch in from stack, to pass to imdct_half
+VFP     vmov    s16, SCALE                 @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
+        bl      ff_imdct_half_c
+VFP     vmov    SCALE, s16
+
+        vmrs    OLDFPSCR, FPSCR
+        ldr     lr, =0x03030000            @ RunFast mode, short vectors of length 4, stride 1
+        vmsr    FPSCR, lr
+        ldr     P_SB2_DN, [sp, #16*4]      @ synth_buf2: the r3 slot, directly above the 16 vpushed words
+        ldr     P_WIN_DN, [sp, #(16+6+0)*4] @ window: first stack argument (16 VFP words + 6 core registers were pushed)
+        ldr     P_OUT_DN, [sp, #(16+6+1)*4] @ out: second stack argument
+NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]   @ scale from the stack; presumably the non-hardfp ABI case - confirm against asm.S
+
+#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
+        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
+        add     P_SB2_UP, P_SB2_DN, #16*4
+        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
+        add     P_OUT_UP, P_OUT_DN, #16*4
+        add     P_SB2_DN, P_SB2_DN, #16*4
+        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
+        add     P_OUT_DN, P_OUT_DN, #16*4
+        mov     I, #4                      @ 4 passes; each stores 4 floats upwards and 4 floats downwards in out
+1:
+        vldmia  P_SB2_UP!, {VB0-VB3}
+        vldmdb  P_SB2_DN!, {VA0-VA3}
+ .set J, 512 - 64
+ .set OFFSET, -IMM_OFF_SKEW
+        inner_loop  ab,, head
+ .rept 7
+        inner_loop  ab, tail, head
+ .endr
+        inner_loop  ab, tail
+        add     P_WIN_UP, P_WIN_UP, #4*4
+        sub     P_WIN_DN, P_WIN_DN, #4*4
+        vmul.f  VB0, VB0, SCALE            @ SCALE treated as scalar
+        add     P_SB, P_SB, #(512+4)*4
+        subs    I, I, #1
+        vmul.f  VA0, VA0, SCALE
+        vstmia  P_OUT_UP!, {VB0-VB3}
+        vstmdb  P_OUT_DN!, {VA0-VA3}
+        bne     1b
+
+        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
+        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
+        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
+        mov     I, #4                      @ 4 passes; each stores 4 floats upwards and 4 floats downwards in synth_buf2
+1:
+        vldr.d  d4, zero                   @ d4 = VC0
+        vldr.d  d5, zero
+        vldr.d  d6, zero                   @ d6 = VD0
+        vldr.d  d7, zero
+ .set J, 512 - 64
+ .set OFFSET, -IMM_OFF_SKEW
+        inner_loop  cd,, head
+ .rept 7
+        inner_loop  cd, tail, head
+ .endr
+        inner_loop  cd, tail
+        add     P_WIN_UP, P_WIN_UP, #4*4
+        sub     P_WIN_DN, P_WIN_DN, #4*4
+        add     P_SB, P_SB, #(512+4)*4
+        subs    I, I, #1
+        vstmia  P_SB2_UP!, {VC0-VC3}
+        vstmdb  P_SB2_DN!, {VD0-VD3}
+        bne     1b
+
+        vmsr    FPSCR, OLDFPSCR            @ put back the FPSCR value read before short-vector mode was enabled
+        vpop    {s16-s31}
+        pop     {r3-r7,pc}
+endfunc
+
+        .align  3
+zero:   .word   0, 0
-- 
cgit v1.2.3