/*
 * Copyright (c) 2013 RISC OS Open Ltd
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Author: Ben Avison <bavison@riscosopen.org>
 */

#include "libavutil/arm/asm.S"

IMDCT         .req    r0
ORIG_P_SB     .req    r1
P_SB_OFF      .req    r2
I             .req    r0
P_SB2_UP      .req    r1
OLDFPSCR      .req    r2
P_SB2_DN      .req    r3
P_WIN_DN      .req    r4
P_OUT_DN      .req    r5
P_SB          .req    r6
J_WRAP        .req    r7
P_WIN_UP      .req    r12
P_OUT_UP      .req    r14

SCALE         .req    s0
SBUF_DAT_REV0 .req    s4
SBUF_DAT_REV1 .req    s5
SBUF_DAT_REV2 .req    s6
SBUF_DAT_REV3 .req    s7
VA0           .req    s8
VA3           .req    s11
VB0           .req    s12
VB3           .req    s15
VC0           .req    s8
VC3           .req    s11
VD0           .req    s12
VD3           .req    s15
SBUF_DAT0     .req    s16
SBUF_DAT1     .req    s17
SBUF_DAT2     .req    s18
SBUF_DAT3     .req    s19
SBUF_DAT_ALT0 .req    s20
SBUF_DAT_ALT1 .req    s21
SBUF_DAT_ALT2 .req    s22
SBUF_DAT_ALT3 .req    s23
WIN_DN_DAT0   .req    s24
WIN_UP_DAT0   .req    s28


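@ inner_loop handles one 64-float stride of the windowing loop for a group of
@ four output samples, software-pipelined across invocations: the "tail" part
@ finishes the multiply-accumulate on the reversed data set up by the previous
@ invocation, while the "head" part loads the next 4-float chunk of synth_buf
@ together with the up/down window taps, issues the forward accumulate, and
@ steps P_SB back by 512 floats when the ring buffer wraps (tracked via
@ J_WRAP). Successive invocations alternate between the SBUF_DAT and
@ SBUF_DAT_ALT banks so a load for the next stride need not wait on
@ arithmetic that still reads the previous chunk.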
.macro inner_loop  half, tail, head
 .if (OFFSET & (64*4)) == 0                @ even numbered call
        SBUF_DAT_THIS0 .req SBUF_DAT0
        SBUF_DAT_THIS1 .req SBUF_DAT1
        SBUF_DAT_THIS2 .req SBUF_DAT2
        SBUF_DAT_THIS3 .req SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f             @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4
2:
  .set J, J - 64
  .set OFFSET, OFFSET + 64*4
 .endif
        .unreq  SBUF_DAT_THIS0
        .unreq  SBUF_DAT_THIS1
        .unreq  SBUF_DAT_THIS2
        .unreq  SBUF_DAT_THIS3
.endm


/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
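/* For orientation only: the computation performed below corresponds roughly
 * to the scalar sketch that follows (cf. the generic C implementation in
 * libavcodec/synth_filter.c). The indexing shown is illustrative and the
 * assembly is authoritative; here the a/b and c/d accumulations are done in
 * two separate passes, four outputs at a time using VFP short vectors.
 *
 *     float *synth_buf = synth_buf_ptr + *synth_buf_offset;
 *     ff_imdct_half_c(imdct, synth_buf, in);     // 32 fresh samples into the ring buffer
 *     for (i = 0; i < 16; i++) {
 *         float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
 *         for (j = 0; j < 512; j += 64) {        // synth_buf indices wrap modulo 512
 *             a -= window[i + j     ] * synth_buf[15 - i + j];
 *             b += window[i + j + 16] * synth_buf[     i + j];
 *             c += window[i + j + 32] * synth_buf[16 + i + j];
 *             d += window[i + j + 48] * synth_buf[31 - i + j];
 *         }
 *         out[i] = a * scale;   out[i + 16] = b * scale;
 *         synth_buf2[i] = c;    synth_buf2[i + 16] = d;
 *     }
 *     *synth_buf_offset = (*synth_buf_offset - 32) & 511;
 */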
function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                  @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
        ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
        bl      ff_imdct_half_c
VFP     vmov    SCALE, s16

        vmrs    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1
        vmsr    FPSCR, lr
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets and exploit the full immediate offset range
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4
1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  ab,, head
 .rept 7
        inner_loop  ab, tail, head
 .endr
        inner_loop  ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
1:
        vldr.d  d4, zero             @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero             @ d6 = VD0
        vldr.d  d7, zero
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  cd,, head
 .rept 7
        inner_loop  cd, tail, head
 .endr
        inner_loop  cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        vmsr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
endfunc

        .align  3
zero:   .word   0, 0