Newer
Older
monitord / lame-3.97 / libmp3lame / i386 / fftsse.nas
@root root on 23 Jan 2012 15 KB Migration from SVN revision 455
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA

; GOGO-no-coda
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

%include "nasm.h"

	globaldef fht_SSE
	globaldef fft_side_SSE
	externdef costab_fft
	externdef sintab_fft

	segment_data
	align 16
Q_MMPP	dd	0x0,0x0,0x80000000,0x80000000
Q_MPMP	dd	0x0,0x80000000,0x0,0x80000000
Q_002	dd	0.02236068, 0.02236068, 0.02236068, 0.02236068
D_SQRT2	dd	1.414213562,1.414213562
S_025	dd	0.25
S_05	DD	0.5
S_00005	DD	0.0005

	segment_code
;------------------------------------------------------------------------
;	by K. SAKAI
;	99/08/18	PIII 23k[clk]
;	99/08/19	命令順序入れ換え PIII 22k[clk]
;	99/08/20	bit reversal を旧午後から移植した PIII 17k[clk]
;	99/08/23	一部 unroll PIII 14k[clk]
;	99/11/12	clean up
;
;void fht_SSE(float *fz, int n);
	align 16
fht_SSE:
	push	ebx
	push	esi
	push	edi
	push	ebp
%assign _P 4*4

	;2つ目のループ
	mov	eax,[esp+_P+4]	;eax=fz
	mov	ebp,[esp+_P+8]	;=n
	shl	ebp,2
	add	ebp,eax		; fn  = fz + n, この関数終了まで不変

	xor	ecx,ecx		; ecx=k=0
	xor	eax,eax
	mov	al,4		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
	xor	edx,edx
	mov	dl,12		; =k3=3*k1
	jmp	short .lp2

	align	16
.lp2:				; do{
	add	cl,2		; k  += 2;
	shl	eax,2
	shl	edx,2

	mov	esi,[esp+_P+4]	;esi=fi=fz
	mov	edi,eax
	shr	edi,1
	add	edi,esi		; edi=gi=fi+ki/2

; たかだか2並列しか期待できない部分はFPUのほうが速い。
	movss	xmm7,[D_SQRT2]
	jmp	short .lp20

	align	16
.lp20:				; do{
;                       f0     = fi[0 ] + fi[k1];
;                       f2     = fi[k2] + fi[k3];
;                       f1     = fi[0 ] - fi[k1];
;                       f3     = fi[k2] - fi[k3];
;                       fi[0 ] = f0     + f2;
;                       fi[k1] = f1     + f3;
;                       fi[k2] = f0     - f2;
;                       fi[k3] = f1     - f3;
	fld	dword [esi]
	fadd	dword [esi+eax]
	fld	dword [esi+eax*2]
	fadd	dword [esi+edx]

	fld	dword [esi]
	fsub	dword [esi+eax]
	fld	dword [esi+eax*2]
	fsub	dword [esi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [esi+eax]
	fsubp	st1,st0
	fstp	dword [esi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [esi]
	fsubp	st1,st0
	fstp	dword [esi+eax*2]

	lea	esi,[esi + eax*4]	; = fi += (k1 * 4);
;	add	esi,eax
;	add	esi,edx
;                       g0     = gi[0 ] + gi[k1];
;                       g2     = SQRT2  * gi[k2];
;                       g1     = gi[0 ] - gi[k1];
;                       g3     = SQRT2  * gi[k3];
;                       gi[0 ] = g0     + g2;
;                       gi[k2] = g0     - g2;
;                       gi[k1] = g1     + g3;
;                       gi[k3] = g1     - g3;
	fld	dword [edi]
	fadd	dword [edi+eax]
	fld	dword [D_SQRT2]
	fmul	dword [edi+eax*2]

	fld	dword [edi]
	fsub	dword [edi+eax]
	fld	dword [D_SQRT2]
	fmul	dword [edi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [edi+eax]
	fsubp	st1,st0
	fstp	dword [edi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [edi]
	fsubp	st1,st0
	fstp	dword [edi+eax*2]

	lea	edi,[edi + eax*4]	; = gi += (k1 * 4);
	cmp	esi,ebp
	jl	near .lp20		; while (fi<fn);

;               i = 1; //for (i=1;i<kx;i++){
;                       c1 = 1.0*t_c - 0.0*t_s;
;                       s1 = 0.0*t_c + 1.0*t_s;
	movss	xmm6,[costab_fft + ecx*4]
	movss	xmm1,[sintab_fft + ecx*4]
	shufps	xmm6,xmm1,0x00	; = {s1, s1, c1, c1}
	shufps	xmm6,xmm6,0x28	; = {+c1, +s1, +s1, +c1}
;                       c2 = c1*c1 - s1*s1;
;                       s2 = c1*s1 + s1*c1;
	movaps	xmm4,xmm6
	movaps	xmm7,xmm6
	unpcklps	xmm4,xmm4	; = {s1, s1, c1, c1}
	shufps	xmm7,xmm7,0x41
	mulps	xmm4,xmm6	; = {s1*c1, s1*s1, c1*s1, c1*c1}
	xorps	xmm7,[Q_MMPP]	; = {-s1, -c1, +c1, +s1}
	movhlps	xmm3,xmm4
	xorps	xmm3,[Q_MPMP]
	subps	xmm4,xmm3	; = {--, --, s2, c2}
	movlhps	xmm4,xmm4	; = {+s2, +c2, +s2, +c2}
	movaps	xmm5,xmm4
	shufps	xmm5,xmm5,0x11
	xorps	xmm5,[Q_MPMP]	; = {-c2, +s2, -c2, +s2}
	mov	esi,[esp+_P+4]	; = fz
	lea	edi,[esi + eax - 4]	; edi = gi = fz +k1-i
	add	esi,4		; esi = fi = fz + i
	jmp	short .lp21

	align	16
.lp21:				; do{
;                               a       = c2*fi[k1] + s2*gi[k1];
;                               b       = s2*fi[k1] - c2*gi[k1];
;                               c       = c2*fi[k3] + s2*gi[k3];
;                               d       = s2*fi[k3] - c2*gi[k3];
;                               f0      = fi[0 ]        + a;
;                               g0      = gi[0 ]        + b;
;                               f2      = fi[k1 * 2]    + c;
;                               g2      = gi[k1 * 2]    + d;
;                               f1      = fi[0 ]        - a;
;                               g1      = gi[0 ]        - b;
;                               f3      = fi[k1 * 2]    - c;
;                               g3      = gi[k1 * 2]    - d;

	movss	xmm0,[esi + eax]	; = fi[k1]
	movss	xmm2,[esi + edx]	; = fi[k3]
	shufps	xmm0,xmm2,0x00	; = {fi[k3], fi[k3], fi[k1], fi[k1]}
	movss	xmm1,[edi + eax]	; = fi[k1]
	movss	xmm3,[edi + edx]	; = fi[k3]
	shufps	xmm1,xmm3,0x00	; = {gi[k3], gi[k3], gi[k1], gi[k1]}
	movss	xmm2,[esi]		; = fi[0]
	mulps	xmm0,xmm4		; *= {+s2, +c2, +s2, +c2}
	movss	xmm3,[esi + eax*2]	; = fi[k2]
	unpcklps	xmm2,xmm3	; = {--, --, fi[k2], fi[0]}
	mulps	xmm1,xmm5		; *= {-c2, +s2, -c2, +s2}
	movss	xmm3,[edi + eax*2]	; = gi[k2]
	addps	xmm0,xmm1		; = {d, c, b, a}
	movss	xmm1,[edi]		; = gi[0]
	unpcklps	xmm1,xmm3	; = {--,  --, gi[k2], gi[0]}
	unpcklps	xmm2,xmm1	; = {gi[k2], fi[k2], gi[0], fi[0]}
	movaps	xmm1,xmm2
	addps	xmm1,xmm0	; = {g2, f2, g0, f0}
	subps	xmm2,xmm0	; = {g3, f3, g1, f1}

;                               a       = c1*f2     + s1*g3;
;                               c       = s1*g2     + c1*f3;
;                               b       = s1*f2     - c1*g3;
;                               d       = c1*g2     - s1*f3;
;                               fi[0 ]  = f0        + a;
;                               gi[0 ]  = g0        + c;
;                               gi[k1]  = g1        + b;
;                               fi[k1]  = f1        + d;
;                               fi[k1 * 2]  = f0    - a;
;                               gi[k1 * 2]  = g0    - c;
;                               gi[k3]      = g1    - b;
;                               fi[k3]      = f1    - d;
	movaps	xmm3,xmm1
	movhlps	xmm1,xmm1	; = {g2, f2, g2, f2}
	shufps	xmm3,xmm2,0x14	; = {f1, g1, g0, f0}
	mulps	xmm1,xmm6	; *= {+c1, +s1, +s1, +c1}
	shufps	xmm2,xmm2,0xBB	; = {f3, g3, f3, g3}
	mulps	xmm2,xmm7	; *= {-s1, -c1, +c1, +s1}
	addps	xmm1,xmm2	; = {d, b, c, a}
	movaps	xmm2,xmm3
	addps	xmm3,xmm1	; = {fi[k1], gi[k1], gi[0], fi[0]}
	subps	xmm2,xmm1	; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
	movhlps	xmm0,xmm3
	movss	[esi],xmm3
	shufps	xmm3,xmm3,0x55
	movss	[edi+eax],xmm0
	shufps	xmm0,xmm0,0x55
	movss	[edi],xmm3
	movss	[esi+eax],xmm0
	movhlps	xmm0,xmm2
	movss	[esi+eax*2],xmm2
	shufps	xmm2,xmm2,0x55
	movss	[edi+edx],xmm0
	shufps	xmm0,xmm0,0x55
	movss	[edi+eax*2],xmm2
	lea	edi,[edi + eax*4] ; gi += (k1 * 4);
	movss	[esi+edx],xmm0
	lea	esi,[esi + eax*4] ; fi += (k1 * 4);
	cmp	esi,ebp
	jl	near .lp21		; while (fi<fn);
; unroll前のdo loopは43+4命令

; 最内周ではないforループをunrollingした
; kx=   2,   8,  32,  128
; k4=  16,  64, 256, 1024
;       0, 6/2,30/2,126/2
; at here
;	xmm6 = {--, --, s1, c1}
;               c3 = c1; s3 = s1;
	xor	ebx,ebx
	mov	bl,4*4		; = i = 4
	cmp	ebx,eax		; i < k1
	jnl	near .F22

	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
	jmp	short .F220

	align	16
;               for (i=4;i<k1;i+=4){ // for (i=2;i<k1/2;i+=2){
.lp22:
	shufps	xmm6,xmm6,0x69	; xmm6 = {c3, s3, s3, c3}
.F220:
; at here, xmm6 is {c3, s3, s3, c3}
;                       c1 = c3*t_c - s3*t_s;
;                       s1 = c3*t_s + s3*t_c;
	movss	xmm0,[costab_fft + ecx*4]
	movss	xmm1,[sintab_fft + ecx*4]
	shufps	xmm0,xmm1,0x00	; = {t_s, t_s, t_c, t_c}
	mulps	xmm6,xmm0
	movhlps	xmm4,xmm6
	xorps	xmm4,[Q_MPMP]
	subps	xmm6,xmm4	; = {--, --, s1, c1}

;                       c3 = c1*t_c - s1*t_s;
;                       s3 = s1*t_c + c1*t_s;
	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
	mulps	xmm0,xmm6
	movhlps	xmm3,xmm0
	xorps	xmm3,[Q_MPMP]
	subps	xmm0,xmm3	; = {--, --, s3, c3}

	unpcklps	xmm6,xmm0	; xmm6 = {s3, s1, c3, c1}
	shufps	xmm6,xmm6,0xB4	; xmm6 = {s1, s3, c3, c1}

;                       c2 = c1*c1 - s1*s1;
;                       c4 = c3*c3 - s3*s3;
;                       s4 = s3*c3 + s3*c3;
;                       s2 = s1*c1 + s1*c1;
	movaps	xmm7,xmm6
	movaps	xmm4,xmm6
	shufps	xmm7,xmm7,0x14
	shufps	xmm4,xmm4,0xEB
	xorps	xmm4,[Q_MMPP]	; = {-c3,-c1, s3, s1}
	mulps	xmm7,xmm6
	mulps	xmm4,xmm6
	shufps	xmm4,xmm4,0x1B
	addps	xmm7,xmm4	; xmm7 = {s2, s4, c4, c2}

;                       fi = fz +i;
;                       gi = fz +k1-i;
	mov	edi,[esp+_P+4]	; = fz
	mov	esi,ebx
	shr	esi,1
	sub	edi,esi		; edi = fz - i/2
	lea	esi,[edi + ebx]	; esi = fi = fz +i/2
	add	edi,eax		; edi = gi = fz +k1-i/2
	sub	edi,4
;                       do{
.lp220:
; unroll後のdo loopは51+4命令
;                               a       = c2*fi[k1  ] + s2*gi[k1  ];
;                               e       = c4*fi[k1+1] + s4*gi[k1-1];
;                               f       = s4*fi[k1+1] - c4*gi[k1-1];
;                               b       = s2*fi[k1  ] - c2*gi[k1  ];
;                               c       = c2*fi[k3  ] + s2*gi[k3  ];
;                               g       = c4*fi[k3+1] + s4*gi[k3-1];
;                               h       = s4*fi[k3+1] - c4*gi[k3-1];
;                               d       = s2*fi[k3  ] - c2*gi[k3  ];

	movaps	xmm4,xmm7	; xmm7 = {s2, s4, c4, c2}
	shufps	xmm4,xmm4,0x1B
	xorps	xmm4,[Q_MMPP]
	movlps	xmm0,[esi+eax]
	movlps	xmm1,[edi+eax]
	movlps	xmm2,[esi+edx]
	movlps	xmm3,[edi+edx]
	shufps	xmm0,xmm0,0x14
	shufps	xmm1,xmm1,0x41
	shufps	xmm2,xmm2,0x14
	shufps	xmm3,xmm3,0x41
	mulps	xmm0,xmm7
	mulps	xmm1,xmm4
	mulps	xmm2,xmm7
	mulps	xmm3,xmm4
	addps	xmm0,xmm1	; xmm0 = {b, f, e, a}
	addps	xmm2,xmm3	; xmm2 = {d, h, g, c}
;17

;                               f0      = fi[0   ]    + a;
;                               f4      = fi[0 +1]    + e;
;                               g4      = gi[0 -1]    + f;
;                               g0      = gi[0   ]    + b;
;                               f1      = fi[0   ]    - a;
;                               f5      = fi[0 +1]    - e;
;                               g5      = gi[0 -1]    - f;
;                               g1      = gi[0   ]    - b;
;                               f2      = fi[k2  ]    + c;
;                               f6      = fi[k2+1]    + g;
;                               g6      = gi[k2-1]    + h;
;                               g2      = gi[k2  ]    + d;
;                               f3      = fi[k2  ]    - c;
;                               f7      = fi[k2+1]    - g;
;                               g7      = gi[k2-1]    - h;
;                               g3      = gi[k2  ]    - d;
	movlps	xmm4,[esi      ]
	movhps	xmm4,[edi      ]
	movaps	xmm1,xmm4
	subps	xmm1,xmm0	; xmm1 = {g1, g5, f5, f1}
	movlps	xmm5,[esi+eax*2]
	movhps	xmm5,[edi+eax*2]
	movaps	xmm3,xmm5
	subps	xmm3,xmm2	; xmm3 = {g3, g7, f7, f3}
	addps	xmm0,xmm4	; xmm0 = {g0, g4, f4, f0}
	addps	xmm2,xmm5	; xmm2 = {g2, g6, f6, f2}
;10

;                               a       = c1*f2     + s1*g3;	順*順 + 逆*逆
;                               e       = c3*f6     + s3*g7;
;                               g       = s3*g6     + c3*f7;
;                               c       = s1*g2     + c1*f3;
;                               d       = c1*g2     - s1*f3;	順*逆 - 逆*順
;                               h       = c3*g6     - s3*f7;
;                               f       = s3*f6     - c3*g7;
;                               b       = s1*f2     - c1*g3;

	movaps	xmm5,xmm6	; xmm6 = {s1, s3, c3, c1}
	shufps	xmm5,xmm5,0x1B	; = {c1, c3, s3, s1}
	movaps	xmm4,xmm2
	mulps	xmm4,xmm6
	shufps	xmm2,xmm2,0x1B	; xmm2 = {f2, f6, g6, g2}
	mulps	xmm2,xmm6
	mulps	xmm5,xmm3
	mulps	xmm3,xmm6
	shufps	xmm3,xmm3,0x1B
	addps	xmm4,xmm3	; = {c, g, e, a}
	subps	xmm2,xmm5	; = {b, f, h, d}
;10

;                               fi[0   ]  = f0        + a;
;                               fi[0 +1]  = f4        + e;
;                               gi[0 -1]  = g4        + g;
;                               gi[0   ]  = g0        + c;
;                               fi[k2  ]  = f0        - a;
;                               fi[k2+1]  = f4        - e;
;                               gi[k2-1]  = g4        - g;
;                               gi[k2  ]  = g0        - c;
;                               fi[k1  ]  = f1        + d;
;                               fi[k1+1]  = f5        + h;
;                               gi[k1-1]  = g5        + f;
;                               gi[k1  ]  = g1        + b;
;                               fi[k3  ]  = f1        - d;
;                               fi[k3+1]  = f5        - h;
;                               gi[k3-1]  = g5        - f;
;                               gi[k3  ]  = g1        - b;
	movaps	xmm3,xmm0
	subps	xmm0,xmm4
	movlps	[esi+eax*2],xmm0
	movhps	[edi+eax*2],xmm0
	addps	xmm4,xmm3
	movlps	[esi      ],xmm4
	movhps	[edi      ],xmm4

	movaps	xmm5,xmm1
	subps	xmm1,xmm2
	movlps	[esi+edx  ],xmm1
	movhps	[edi+edx  ],xmm1
	addps	xmm2,xmm5
	movlps	[esi+eax  ],xmm2
	movhps	[edi+eax  ],xmm2
; 14
;                               gi     += k4;
;                               fi     += k4;
	lea	edi,[edi + eax*4] ; gi += (k1 * 4);
	lea	esi,[esi + eax*4] ; fi += (k1 * 4);
	cmp	esi,ebp
	jl	near .lp220		; while (fi<fn);
;                       } while (fi<fn);

	add	ebx,4*4		; i+= 4
	cmp	ebx,eax		; i < k1
	jl	near .lp22
;               }
.F22:

	cmp	eax,[esp+_P+8]	; while ((k1 * 4)<n);
	jl	near .lp2

	pop	ebp
	pop	edi
	pop	esi
	pop	ebx
	ret

;------------------------------------------------------------------------
;	99/11/12	Initial version for SSE by K. SAKAI, 4300clk@P3
; This routine is very slow when wsamp_r_int is not aligned to 16byte boundary.
;
;void fft_side_SSE( float in[2][1024], int s, float *ret)
;        energy = (in[0][512] - in[1][512])^2;
;        energy = (in[0][1024-s] - in[1][1024-s])^2;
;        for (i=s,j=1024-s;i<512;i++,j--){
;                a = in[0][i] - in[1][i];
;                energy += a*a;
;                b = in[0][j-1] - in[1][j-1];
;                energy += b*b;
;        }
;        *ret = energy * 0.25;

	align	16
fft_side_SSE:
	mov	ecx,[esp+8]	; = i = s
	mov	edx,1024
	sub	edx,ecx		; = j
	mov	eax,[esp+4]	; = in
	movss	xmm7,[eax+1024*0*4+512*4]
	movss	xmm1,[eax+1024*1*4+512*4]
	subss	xmm7,xmm1
	mulss	xmm7,xmm7
	movss	xmm2,[eax+1024*0*4+edx*4]
	movss	xmm3,[eax+1024*1*4+edx*4]
	subss	xmm2,xmm3
	mulss	xmm2,xmm2
	addss	xmm7,xmm2

	test	cl,1
	jz	.even

.odd:	dec	edx
	movss	xmm0,[eax+1024*0*4+ecx*4]
	movss	xmm1,[eax+1024*1*4+ecx*4]
	inc	ecx
	movss	xmm2,[eax+1024*0*4+edx*4]
	movss	xmm3,[eax+1024*1*4+edx*4]
	cmp	ecx,edx
	subss	xmm0,xmm1
	subss	xmm2,xmm3
	mulss	xmm0,xmm0
	mulss	xmm2,xmm2
	addss	xmm7,xmm0
	addss	xmm7,xmm2
	je	near .exit1

.even:	test	cl,2
	jz	.f0
	sub	edx,2
	movlps	xmm0,[eax+1024*0*4+ecx*4]
	movlps	xmm1,[eax+1024*1*4+ecx*4]
	add	ecx,2
	movhps	xmm0,[eax+1024*0*4+edx*4]
	movhps	xmm1,[eax+1024*1*4+edx*4]
	cmp	ecx,edx
	subps	xmm0,xmm1
	mulps	xmm0,xmm0
	addps	xmm7,xmm0
	je	.exit4
	jmp	short .f0

	align	16
.f0:
.lp0:
	sub	edx,4
	movaps	xmm0,[eax+1024*0*4+ecx*4]
	movaps	xmm1,[eax+1024*1*4+ecx*4]
	add	ecx,4
	subps	xmm0,xmm1
	mulps	xmm0,xmm0
	addps	xmm7,xmm0
	movaps	xmm2,[eax+1024*0*4+edx*4]
	movaps	xmm3,[eax+1024*1*4+edx*4]
	cmp	ecx,edx
	subps	xmm2,xmm3
	mulps	xmm2,xmm2
	addps	xmm7,xmm2
	jne	.lp0

.exit4:	movhlps	xmm6,xmm7
	addps	xmm7,xmm6
	movaps	xmm6,xmm7
	shufps	xmm6,xmm6,01010101B
	addss	xmm7,xmm6

.exit1:	mulss	xmm7,[S_025]
	mov	eax,[esp+12]
	movss	[eax],xmm7
	ret


	end