diff options
| author | Waldemar Brodkorb <wbx@openadk.org> | 2013-10-16 13:09:30 +0200 | 
|---|---|---|
| committer | Waldemar Brodkorb <wbx@openadk.org> | 2013-10-16 13:09:30 +0200 | 
| commit | f6fed079c91dfc85deb834038dcc71b9695fecad (patch) | |
| tree | f95126508d654472d46de0fb407f27a267b99e05 /package/ffmpeg | |
| parent | b8f7ebdccbb4520fb8196976e5378e866cf49f86 (diff) | |
add arm optimized dts decoding from raspberry pi developers
Diffstat (limited to 'package/ffmpeg')
| -rw-r--r-- | package/ffmpeg/Makefile | 2 | ||||
| -rw-r--r-- | package/ffmpeg/patches/patch-libavcodec_arm_Makefile | 12 | ||||
| -rw-r--r-- | package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c | 25 | ||||
| -rw-r--r-- | package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c.orig | 22 | ||||
| -rw-r--r-- | package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S | 206 | 
5 files changed, 266 insertions, 1 deletions
| diff --git a/package/ffmpeg/Makefile b/package/ffmpeg/Makefile index 045b9edec..9c4cea01c 100644 --- a/package/ffmpeg/Makefile +++ b/package/ffmpeg/Makefile @@ -5,7 +5,7 @@ include ${TOPDIR}/rules.mk  PKG_NAME:=		ffmpeg  PKG_VERSION:=		2.0.2 -PKG_RELEASE:=		1 +PKG_RELEASE:=		2  PKG_MD5SUM:=		6c5cfed204d8a108325d1fc439ab734a  PKG_DESCR:=		record, convert and stream audio & video  PKG_SECTION:=		libs diff --git a/package/ffmpeg/patches/patch-libavcodec_arm_Makefile b/package/ffmpeg/patches/patch-libavcodec_arm_Makefile new file mode 100644 index 000000000..f504f60f9 --- /dev/null +++ b/package/ffmpeg/patches/patch-libavcodec_arm_Makefile @@ -0,0 +1,12 @@ +--- ffmpeg-2.0.1.orig/libavcodec/arm/Makefile	2013-08-11 01:23:24.000000000 +0200 ++++ ffmpeg-2.0.1/libavcodec/arm/Makefile	2013-10-14 17:47:19.000000000 +0200 +@@ -52,7 +52,8 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER)       + +                                           arm/vp8dsp_init_armv6.o       \ +                                           arm/vp8dsp_armv6.o +  +-VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o ++VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o \ ++					  arm/synth_filter_vfp.o	 +  + NEON-OBJS                              += arm/fmtconvert_neon.o +  diff --git a/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c b/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c new file mode 100644 index 000000000..dc562a061 --- /dev/null +++ b/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c @@ -0,0 +1,25 @@ +--- ffmpeg-2.0.2.orig/libavcodec/arm/fft_init_arm.c	2013-10-08 19:52:31.000000000 +0200 ++++ ffmpeg-2.0.2/libavcodec/arm/fft_init_arm.c	2013-10-16 12:59:04.000000000 +0200 +@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FF +  + void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); +  ++void ff_synth_filter_float_vfp(FFTContext *imdct, ++                               float *synth_buf_ptr, int *synth_buf_offset, ++                               float synth_buf2[32], const float window[512], ++                               float out[32], const float in[32], ++                               float scale); ++ + void ff_synth_filter_float_neon(FFTContext *imdct, +                                 float *synth_buf_ptr, int *synth_buf_offset, +                                 float synth_buf2[32], const float window[512], +@@ -71,6 +77,9 @@ av_cold void ff_synth_filter_init_arm(Sy + { +     int cpu_flags = av_get_cpu_flags(); +  ++    if (have_vfp(cpu_flags)) ++        s->synth_filter_float = ff_synth_filter_float_vfp; ++ +     if (have_neon(cpu_flags)) +         s->synth_filter_float = ff_synth_filter_float_neon; + } diff --git a/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c.orig b/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c.orig new file mode 100644 index 000000000..a7d9ff0ac --- /dev/null +++ b/package/ffmpeg/patches/patch-libavcodec_arm_fft_init_arm_c.orig @@ -0,0 +1,22 @@ +--- ffmpeg-2.0.1.orig/libavcodec/arm/fft_init_arm.c	2013-08-11 01:23:24.000000000 +0200 ++++ ffmpeg-2.0.1/libavcodec/arm/fft_init_arm.c	2013-10-14 17:45:32.000000000 +0200 +@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FF +  + void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); +  ++void ff_synth_filter_float_vfp(FFTContext *imdct, ++                               float *synth_buf_ptr, int *synth_buf_offset, ++                               float synth_buf2[32], const float window[512], ++                               float out[32], const float in[32], ++                               float scale); ++ + void ff_synth_filter_float_neon(FFTContext *imdct, +                                 float *synth_buf_ptr, int *synth_buf_offset, +                                 float synth_buf2[32], const float window[512], +@@ -73,5 +79,6 @@ av_cold void ff_synth_filter_init_arm(Sy +  +     if (have_neon(cpu_flags)) +         s->synth_filter_float = ff_synth_filter_float_neon; ++    s->synth_filter_float = ff_synth_filter_float_vfp; + } + #endif diff --git a/package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S b/package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S new file mode 100644 index 000000000..451fe5ce9 --- /dev/null +++ b/package/ffmpeg/src/libavcodec/arm/synth_filter_vfp.S @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2013 RISC OS Open Ltd + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Ben Avison <bavison@riscosopen.org> + */ + +#include "libavutil/arm/asm.S" + +IMDCT         .req    r0 +ORIG_P_SB     .req    r1 +P_SB_OFF      .req    r2 +I             .req    r0 +P_SB2_UP      .req    r1 +OLDFPSCR      .req    r2 +P_SB2_DN      .req    r3 +P_WIN_DN      .req    r4 +P_OUT_DN      .req    r5 +P_SB          .req    r6 +J_WRAP        .req    r7 +P_WIN_UP      .req    r12 +P_OUT_UP      .req    r14 + +SCALE         .req    s0 +SBUF_DAT_REV0 .req    s4 +SBUF_DAT_REV1 .req    s5 +SBUF_DAT_REV2 .req    s6 +SBUF_DAT_REV3 .req    s7 +VA0           .req    s8 +VA3           .req    s11 +VB0           .req    s12 +VB3           .req    s15 +VC0           .req    s8 +VC3           .req    s11 +VD0           .req    s12 +VD3           .req    s15 +SBUF_DAT0     .req    s16 +SBUF_DAT1     .req    s17 +SBUF_DAT2     .req    s18 +SBUF_DAT3     .req    s19 +SBUF_DAT_ALT0 .req    s20 +SBUF_DAT_ALT1 .req    s21 +SBUF_DAT_ALT2 .req    s22 +SBUF_DAT_ALT3 .req    s23 +WIN_DN_DAT0   .req    s24 +WIN_UP_DAT0   .req    s28 + + +.macro inner_loop  half, tail, head + .if (OFFSET & (64*4)) == 0                @ even numbered call +        SBUF_DAT_THIS0 .req SBUF_DAT0 +        SBUF_DAT_THIS1 .req SBUF_DAT1 +        SBUF_DAT_THIS2 .req SBUF_DAT2 +        SBUF_DAT_THIS3 .req SBUF_DAT3 +  .ifnc "\head","" +        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT +        vldr    d9, [P_SB, #OFFSET+8] +  .endif + .else +        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0 +        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1 +        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2 +        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3 +  .ifnc "\head","" +        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT +        vldr    d11, [P_SB, #OFFSET+8] +  .endif + .endif + .ifnc "\tail","" +  .ifc "\half","ab" +        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors +  .else +        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors +  .endif + .endif + .ifnc "\head","" +        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT +        vldr    d15, [P_WIN_UP, #OFFSET+8] +        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT +        vldr    d13, [P_WIN_DN, #OFFSET+8] +        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0 +        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1 +        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2 +        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3 +  .ifc "\half","ab" +        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0 +  .else +        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0 +  .endif +        teq     J_WRAP, #J +        bne     2f             @ strongly predictable, so better than cond exec in this case +        sub     P_SB, P_SB, #512*4 +2: +  .set J, J - 64 +  .set OFFSET, OFFSET + 64*4 + .endif +        .unreq  SBUF_DAT_THIS0 +        .unreq  SBUF_DAT_THIS1 +        .unreq  SBUF_DAT_THIS2 +        .unreq  SBUF_DAT_THIS3 +.endm + + +/* void ff_synth_filter_float_vfp(FFTContext *imdct, + *                                float *synth_buf_ptr, int *synth_buf_offset, + *                                float synth_buf2[32], const float window[512], + *                                float out[32], const float in[32], float scale) + */ +function ff_synth_filter_float_vfp, export=1 +        push    {r3-r7,lr} +        vpush   {s16-s31} +        ldr     lr, [P_SB_OFF] +        add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half +        mov     P_SB, a2                  @ and keep a copy for ourselves +        bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop +        sub     lr, lr, #32 +        and     lr, lr, #512-32 +        str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call +        ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half +VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case +        bl      ff_imdct_half_c +VFP     vmov    SCALE, s16 + +        vmrs    OLDFPSCR, FPSCR +        ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1 +        vmsr    FPSCR, lr +        ldr     P_SB2_DN, [sp, #16*4] +        ldr     P_WIN_DN, [sp, #(16+6+0)*4] +        ldr     P_OUT_DN, [sp, #(16+6+1)*4] +NOVFP   vldr    SCALE, [sp, #(16+6+3)*4] + +#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */ +        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range +        add     P_SB2_UP, P_SB2_DN, #16*4 +        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW +        add     P_OUT_UP, P_OUT_DN, #16*4 +        add     P_SB2_DN, P_SB2_DN, #16*4 +        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW +        add     P_OUT_DN, P_OUT_DN, #16*4 +        mov     I, #4 +1: +        vldmia  P_SB2_UP!, {VB0-VB3} +        vldmdb  P_SB2_DN!, {VA0-VA3} + .set J, 512 - 64 + .set OFFSET, -IMM_OFF_SKEW +        inner_loop  ab,, head + .rept 7 +        inner_loop  ab, tail, head + .endr +        inner_loop  ab, tail +        add     P_WIN_UP, P_WIN_UP, #4*4 +        sub     P_WIN_DN, P_WIN_DN, #4*4 +        vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar +        add     P_SB, P_SB, #(512+4)*4 +        subs    I, I, #1 +        vmul.f  VA0, VA0, SCALE +        vstmia  P_OUT_UP!, {VB0-VB3} +        vstmdb  P_OUT_DN!, {VA0-VA3} +        bne     1b + +        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4 +        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4 +        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4 +        mov     I, #4 +1: +        vldr.d  d4, zero             @ d4 = VC0 +        vldr.d  d5, zero +        vldr.d  d6, zero             @ d6 = VD0 +        vldr.d  d7, zero + .set J, 512 - 64 + .set OFFSET, -IMM_OFF_SKEW +        inner_loop  cd,, head + .rept 7 +        inner_loop  cd, tail, head + .endr +        inner_loop  cd, tail +        add     P_WIN_UP, P_WIN_UP, #4*4 +        sub     P_WIN_DN, P_WIN_DN, #4*4 +        add     P_SB, P_SB, #(512+4)*4 +        subs    I, I, #1 +        vstmia  P_SB2_UP!, {VC0-VC3} +        vstmdb  P_SB2_DN!, {VD0-VD3} +        bne     1b + +        vmsr    FPSCR, OLDFPSCR +        vpop    {s16-s31} +        pop     {r3-r7,pc} +endfunc + +        .align  3 +zero:   .word   0, 0 | 
