; Newer
; Older
; monitord / lame-3.97 / libmp3lame / i386 / choose_table.nas
; @root root on 23 Jan 2012 12 KB Migration from SVN revision 455
; new count bit routine
;	part of this code originated from
;	new GOGO-no-coda (1999, 2000)
;	Copyright (C) 1999 shigeo
;	modified by Keiichi SAKAI

%include "nasm.h"

	globaldef	choose_table_MMX
	globaldef	MMX_masking

	externdef	largetbl
	externdef	t1l
	externdef	table23
	externdef	table56

	segment_data
	align	16
; Packed 16-bit constants, each 8 bytes, loaded into MMX registers by the
; code below.
; Four words of 14: pcmpgtw threshold in the ESC path (word > 14).
D14_14_14_14	dd	0x000E000E, 0x000E000E
; Four words of 0xfff0: paddusw with this maps a word w to 0xfff0+min(w,15).
D15_15_15_15	dd	0xfff0fff0, 0xfff0fff0
; pmaddwd weights {16, 1} per dword: word pair (x, y) becomes x*16 + y.
mul_add		dd	0x00010010, 0x00010010
; pmaddwd weights {3, 1} per dword: x*3 + y, used to index table23.
mul_add23	dd	0x00010003, 0x00010003
; pmaddwd weights {4, 1} per dword: x*4 + y, used to index table56.
mul_add56	dd	0x00010004, 0x00010004
; tableDEF: 256 entries of two dwords each, read by the from3 loop as
; [tableDEF + (x*16 + y)*8] when entered through table_MMX.L_case_8_15
; (base table number 13).  Layout of one entry, as consumed by from3:
;   dword 0, bits 31..16 -> added into sum1 (result = base + 1)
;   dword 0, bits 15..0  -> added into sum2 (result = base + 2)
;   dword 1              -> added into sum  (result = base + 0)
; NOTE(review): the values are presumably per-pair code lengths in bits for
; three candidate tables -- confirm against the C tables.
; The label now carries a colon so NASM does not treat it as an orphan label.
tableDEF:
	dd	0x00010003,0x01,0x00050005,0x05,0x00070006,0x07,0x00090008,0x08,0x000a0008, 0x09
	dd	0x000a0009,0x0a,0x000b000a,0x0a,0x000b000a,0x0b,0x000c000a,0x0a,0x000c000b, 0x0b
	dd	0x000c000b,0x0c,0x000d000c,0x0c,0x000d000c,0x0d,0x000d000c,0x0d,0x000e000d, 0x0e
	dd	0x000b000e,0x0e,0x00040005,0x04,0x00060005,0x06,0x00080007,0x08,0x00090008, 0x09
	dd	0x000a0009,0x0a,0x000b0009,0x0a,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0b
	dd	0x000c000b,0x0b,0x000c000b,0x0c,0x000d000c,0x0c,0x000e000c,0x0d,0x000d000c, 0x0e
	dd	0x000e000d,0x0e,0x000b000d,0x0e,0x00070006,0x07,0x00080007,0x08,0x00090007, 0x09
	dd	0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
	dd	0x000d000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000c,0x0d,0x000d000c, 0x0d
	dd	0x000e000d,0x0e,0x000e000d,0x0f,0x000c000d,0x0f,0x00090007,0x08,0x00090008, 0x09
	dd	0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
	dd	0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000c,0x0d,0x000e000c, 0x0d
	dd	0x000e000c,0x0d,0x000f000d,0x0e,0x000f000d,0x0f,0x000d000d,0x0f,0x000a0008, 0x09
	dd	0x000a0008,0x09,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
	dd	0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0c,0x000e000b,0x0d,0x000e000c, 0x0d
	dd	0x000e000c,0x0e,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d,0x0f,0x000c000d, 0x10
	dd	0x000a0009,0x0a,0x000a0009,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
	dd	0x000d000a,0x0c,0x000d000b,0x0d,0x000e000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
	dd	0x000e000c,0x0e,0x000f000c,0x0d,0x000f000d,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
	dd	0x000d000e,0x10,0x000b000a,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
	dd	0x000d000a,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
	dd	0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
	dd	0x0010000e,0x10,0x000d000e,0x10,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0c
	dd	0x000c000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0e,0x000e000c, 0x0e
	dd	0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0f,0x000f000c,0x0f,0x000f000d, 0x0f
	dd	0x0011000d,0x10,0x0011000d,0x12,0x000d000e,0x12,0x000b000a,0x0a,0x000c000a, 0x0a
	dd	0x000c000a,0x0b,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
	dd	0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000d, 0x0f
	dd	0x0010000d,0x0f,0x0010000e,0x10,0x0010000e,0x11,0x000d000e,0x11,0x000c000a, 0x0b
	dd	0x000c000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
	dd	0x000e000c,0x0d,0x000f000c,0x0f,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
	dd	0x0010000d,0x10,0x000f000d,0x10,0x0010000e,0x10,0x000f000e,0x12,0x000e000e, 0x11
	dd	0x000c000b,0x0b,0x000d000b,0x0c,0x000c000b,0x0c,0x000d000b,0x0d,0x000e000c, 0x0d
	dd	0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0e,0x0010000d, 0x0f
	dd	0x0010000d,0x10,0x0010000d,0x0f,0x0011000d,0x10,0x0011000e,0x11,0x0010000f, 0x12
	dd	0x000d000e,0x13,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b, 0x0d
	dd	0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0e,0x0010000c,0x0e,0x0010000d, 0x0f
	dd	0x0010000d,0x0f,0x0010000d,0x0f,0x0010000d,0x10,0x0010000e,0x11,0x000f000e, 0x11
	dd	0x0010000e,0x11,0x000e000f,0x12,0x000d000c,0x0c,0x000e000c,0x0d,0x000e000b, 0x0d
	dd	0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0f,0x000f000d,0x0e,0x000f000d, 0x0f
	dd	0x000f000d,0x10,0x0011000d,0x10,0x0010000d,0x11,0x0010000d,0x11,0x0010000e, 0x11
	dd	0x0010000e,0x12,0x0012000f,0x12,0x000e000f,0x12,0x000f000c,0x0d,0x000e000c, 0x0d
	dd	0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
	dd	0x0010000d,0x10,0x0010000d,0x10,0x0012000e,0x10,0x0011000e,0x10,0x0011000e, 0x11
	dd	0x0011000e,0x12,0x0013000e,0x11,0x0011000f,0x12,0x000e000f,0x12,0x000e000d, 0x0e
	dd	0x000f000d,0x0e,0x000d000d,0x0e,0x000e000d,0x0f,0x0010000d,0x0f,0x0010000d, 0x0f
	dd	0x000f000d,0x11,0x0010000d,0x10,0x0010000e,0x10,0x0011000e,0x13,0x0012000e, 0x11
	dd	0x0011000e,0x11,0x0013000f,0x11,0x0011000f,0x13,0x0010000e,0x12,0x000e000f, 0x12
	dd	0x000b000d,0x0d,0x000b000d,0x0e,0x000b000d,0x0f,0x000c000d,0x10,0x000c000d, 0x10
	dd	0x000d000d,0x10,0x000d000d,0x11,0x000d000e,0x10,0x000e000e,0x11,0x000e000e, 0x11
	dd	0x000e000e,0x12,0x000e000e,0x12,0x000e000f,0x15,0x000e000f,0x14,0x000e000f, 0x15
	dd	0x000c000f,0x12

; tableABC: 128 entries of two dwords each, with the same per-entry layout
; that from3 expects (dword 0 = sum1:sum2 packed as high:low 16 bits,
; dword 1 = sum).  Two users share it with a 16-entry row stride:
;   table_MMX.L_case_67 reads [tableABC       + (x*16 + y)*8], base table 10
;   table_MMX.L_case_45 reads [tableABC + 9*8 + (x*16 + y)*8], base table 7
; so the second set lives in the row slots starting at column 9.
; The label now carries a colon so NASM does not treat it as an orphan label.
tableABC:
	dd	0x00020004,0x1,0x00040004,0x4,0x00060006,0x7,0x00080008,0x9,0x00090009,0xa,0x000a000a,0xa
	dd	0x0009000a,0xa,0x000a000a,0xb,0x00000000,0x0,0x00020003,0x1,0x00040004,0x4,0x00070006,0x7
	dd	0x00090007,0x9,0x00090009,0x9,0x000a000a,0xa,0x00000000,0x0,0x00040004,0x4,0x00050005,0x6
	dd	0x00060006,0x8,0x00080007,0x9,0x000a0009,0xa,0x000a0009,0xb,0x0009000a,0xa,0x000a000a,0xa
	dd	0x00000000,0x0,0x00040004,0x4,0x00040005,0x6,0x00060006,0x8,0x000a0007,0x9,0x000a0008,0x9
	dd	0x000a000a,0xa,0x00000000,0x0,0x00060006,0x7,0x00070006,0x8,0x00080007,0x9,0x00090008,0xa
	dd	0x000a0009,0xb,0x000b000a,0xc,0x000a0009,0xb,0x000a000a,0xb,0x00000000,0x0,0x00070005,0x7
	dd	0x00060006,0x7,0x00080007,0x9,0x000a0008,0xa,0x000a0009,0xa,0x000b000a,0xb,0x00000000,0x0
	dd	0x00080007,0x8,0x00080007,0x9,0x00090008,0xa,0x000b0008,0xb,0x000a0009,0xc,0x000c000a,0xc
	dd	0x000a000a,0xb,0x000b000a,0xc,0x00000000,0x0,0x00090007,0x8,0x000a0007,0x9,0x000a0008,0xa
	dd	0x000b0009,0xb,0x000b0009,0xb,0x000c000a,0xb,0x00000000,0x0,0x00090008,0x9,0x000a0008,0xa
	dd	0x000a0009,0xb,0x000b0009,0xc,0x000b000a,0xc,0x000c000a,0xc,0x000b000a,0xc,0x000c000b,0xc
	dd	0x00000000,0x0,0x00090008,0x8,0x00090008,0x9,0x000a0009,0xa,0x000b0009,0xb,0x000c000a,0xb
	dd	0x000c000b,0xc,0x00000000,0x0,0x00090009,0xa,0x000a0009,0xb,0x000b000a,0xc,0x000c000a,0xc
	dd	0x000c000a,0xd,0x000d000b,0xd,0x000c000a,0xc,0x000d000b,0xd,0x00000000,0x0,0x000a0009,0x9
	dd	0x000a0009,0xa,0x000b000a,0xb,0x000b000a,0xc,0x000d000b,0xc,0x000d000b,0xc,0x00000000,0x0
	dd	0x00090009,0x9,0x00090009,0xa,0x00090009,0xb,0x000a000a,0xc,0x000b000a,0xc,0x000c000b,0xc
	dd	0x000c000b,0xd,0x000c000c,0xd,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
	dd	0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x0009000a,0xa,0x0009000a,0xa
	dd	0x000a000a,0xb,0x000b000b,0xc,0x000c000b,0xc,0x000c000b,0xd,0x000c000b,0xd,0x000c000c,0xd
	dd	0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
	dd	0x0,0x00000000, 0x0,0x00000000

; linbits32: 13 entries of two dwords, read as [linbits32 + eax*8] in the
; ESC path of choose_table_MMX, where eax = bsr(max - 15) (0..12).
; Each dword repeats one 16-bit multiplier in both halves; pmaddwd applies it
; to the per-lane counts of values greater than 14:
;   dword 0 -> product lands in the low 16-bit sum  (sum2)
;   dword 1 -> product is shifted into the high 16-bit sum (sum)
; The label now carries a colon so NASM does not treat it as an orphan label.
linbits32:
	dd	0x00040004,0x10001,0x00040004,0x20002,0x00040004,0x30003,0x00040004,0x40004
	dd	0x00050005,0x60006,0x00060006,0x60006,0x00070007,0x80008,0x00080008,0x80008
	dd	0x00090009,0xa000a,0x000b000b,0xa000a,0x000b000b,0xd000d,0x000d000d,0xd000d
	dd	0x000d000d,0xd000d


; choose_table_H: 13 words, read as [choose_table_H + eax*2] in the ESC path
; of choose_table_MMX with eax = bsr(max - 15) (0..12).
;   low  byte = table number returned when sum <= sum2
;   high byte = table number returned otherwise (selected with shr eax,8)
; The label now carries a colon so NASM does not treat it as an orphan label.
choose_table_H:
	dw	0x1810, 0x1811, 0x1812, 0x1813, 0x1914, 0x1a14, 0x1b15, 0x1c15
	dw	0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17

; Dispatch table for choose_table_MMX when the maximum value is 0..15.
; Indexed as [choose_jump_table_L + max*4]; one code address per value.
choose_jump_table_L:
	dd	table_MMX.L_case_0	; max = 0
	dd	table_MMX.L_case_1	; max = 1
	dd	table_MMX.L_case_2	; max = 2
	dd	table_MMX.L_case_3	; max = 3
	dd	table_MMX.L_case_45	; max = 4
	dd	table_MMX.L_case_45	; max = 5
	dd	table_MMX.L_case_67	; max = 6
	dd	table_MMX.L_case_67	; max = 7
	dd	table_MMX.L_case_8_15	; max = 8
	dd	table_MMX.L_case_8_15	; max = 9
	dd	table_MMX.L_case_8_15	; max = 10
	dd	table_MMX.L_case_8_15	; max = 11
	dd	table_MMX.L_case_8_15	; max = 12
	dd	table_MMX.L_case_8_15	; max = 13
	dd	table_MMX.L_case_8_15	; max = 14
	dd	table_MMX.L_case_8_15	; max = 15

	segment_code
;
; use MMX
;

	align	16
;-----------------------------------------------------------------------
; int choose_table(int *ix, int *end, int *s)
;
; Stack arguments (32-bit, read relative to esp on entry):
;   [esp+4]  = ix   (begin of the dword array)
;   [esp+8]  = end  (one past the last dword)
;   [esp+12] = s    (pointer to an int that receives / accumulates a sum)
; Return (eax):
;   max == 0           -> 0, *s untouched (table_MMX.L_case_0)
;   1 <= max <= 15     -> table number chosen by the table_MMX.L_case_* code
;   15 < max <= 8191+15-> table number from choose_table_H, *s += smaller sum
;   max > 8191+15      -> -1, *s = max
; The array is walked 8 bytes (one pair) at a time with a negative offset in
; ecx that counts up to zero, so the byte length is expected to be a
; non-zero multiple of 8 -- NOTE(review): confirm with the callers.
; ebx and esi are saved and restored; emms is executed before every return.
;-----------------------------------------------------------------------
choose_table_MMX:
	mov	ecx,[esp+4]	;ecx = begin
	mov	edx,[esp+8]	;edx = end
	sub	ecx,edx		;ecx = begin-end(should be minus)
	test	ecx,8		; odd number of pairs?  (flags survive the MMX ops below)
 	pxor	mm0,mm0		;mm0=[0:0]
	movq	mm1,[edx+ecx]	; mm1 = first pair (seed for the running maximum)
	jz	.lp		; even pair count: the loop re-reads the first pair, harmless for a max

	add	ecx,8		; odd pair count: first pair already in mm1, skip it
	jz	.exit		; it was the only pair

	align	4
; Running per-word unsigned maximum of two pairs per iteration:
; max(a,b) = a + saturating_sub(b,a).
.lp:
	movq	mm4,[edx+ecx]
	movq	mm5,[edx+ecx+8]
	add	ecx,16		; sets ZF for the jnz below; MMX ops do not touch flags
	psubusw	mm4,mm0	; strictly this ought to be a dword operation,
	psubusw	mm5,mm1	; but no such instruction exists :-p
	paddw	mm0,mm4 ; the values handled here are at most 8191+15, so word ops are fine
	paddw	mm1,mm5
	jnz	.lp
.exit:
	psubusw	mm1,mm0	; this too ought to be a dword operation
	paddw	mm0,mm1	; mm0 = max(mm0, mm1) per word

	movq	mm4,mm0
	punpckhdq	mm4,mm4	; bring the high dword down
	psubusw	mm4,mm0	; this too ought to be a dword operation
	paddw	mm0,mm4	; low dword of mm0 = max over both dword lanes
	movd	eax,mm0		; eax = maximum value

	cmp	eax,15
	ja	.with_ESC
	; max in 0..15: dispatch.  The handlers rely on edx = end and on the
	; stack still holding only the return address and the arguments.
	jmp	[choose_jump_table_L+eax*4]

; max > 8191+15: report failure.
.with_ESC1:
	emms
	mov	ecx, [esp+12]	; *s
	mov	[ecx], eax	; *s = max
	or	eax,-1		; return -1
	ret

; 15 < max: two candidate tables, selected by the bit position of max-15.
.with_ESC:
	cmp	eax, 8191+15
	ja	.with_ESC1

	sub	eax,15
	push	ebx
	push	esi
	bsr	eax, eax	; eax = index of highest set bit of max-15 (0..12)
; _P = bytes pushed since entry (ebx, esi); added to the argument offsets.
%assign _P 4*2
	movq    mm5, [D15_15_15_15]
	movq	mm6, [D14_14_14_14]
	movq	mm3, [mul_add]

	mov	ecx, [esp+_P+4]		; = ix
;	mov	edx, [esp+_P+8]		; = end  (edx still holds end from above)
	sub	ecx, edx	; ecx = negative byte offset from end

	xor	esi, esi	; sum = 0
	test    ecx, 8		; odd number of pairs?
	pxor	mm7, mm7	; linbits_sum: per-lane count of values exceeding 14
	jz	.H_dual_lp1

	; Odd pair count: handle the first pair alone.
	movq	mm0, [edx+ecx]
	add	ecx,8		; sets ZF for the jz below; nothing in between touches flags
	packssdw	mm0,mm7	; two dwords -> two words (upper two words zero)
	movq	mm2, mm0
	paddusw	mm0, mm5	; mm0 = min(ix, 15)+0xfff0
	pcmpgtw	mm2, mm6	; greater than 14?  (all-ones word if so)
	psubw	mm7, mm2	; when greater than 14, linbits_sum++
	pmaddwd	mm0, mm3	; {0, 0, y, x}*{1, 16, 1, 16}
	movd	ebx, mm0	; ebx = x*16 + y - (16*16+16): the 0xfff0 bias acts as -16 per word
	mov	esi, [largetbl+ebx*4+(16*16+16)*4]	; displacement cancels the bias

	jz	.H_dual_exit	; that was the only pair

	align   4
; Two pairs per iteration; esi accumulates largetbl entries.
.H_dual_lp1:
	movq	mm0, [edx+ecx]
	movq	mm1, [edx+ecx+8]
	packssdw	mm0,mm1	; four dwords -> four words
	movq	mm2, mm0
	paddusw	mm0, mm5	; mm0 = min(ix, 15)+0xfff0
	pcmpgtw	mm2, mm6	; greater than 14?
	pmaddwd	mm0, mm3	; {y, x, y, x}*{1, 16, 1, 16}
	movd	ebx, mm0	; index of the first pair
	punpckhdq	mm0,mm0
	add	esi, [largetbl+ebx*4+(16*16+16)*4]
	movd	ebx, mm0	; index of the second pair
	add	esi, [largetbl+ebx*4+(16*16+16)*4]
	add	ecx, 16		; sets ZF for the jnz below
	psubw	mm7, mm2	; when greater than 14, linbits_sum++
	jnz	.H_dual_lp1

.H_dual_exit:
	; Fold the four lane counts into one dword and duplicate it so that
	; both dwords of the linbits32 entry multiply the same counts.
	pmov	mm1,mm7
	punpckhdq	mm7,mm7
	paddd	mm7,mm1
	punpckldq	mm7,mm7

	pmaddwd	mm7, [linbits32+eax*8]	; linbits
	mov	ax, [choose_table_H+eax*2]	; al / ah = the two candidate table numbers

	movd	ecx, mm7	; product with the first linbits32 dword
	punpckhdq	mm7,mm7
	movd	edx,mm7		; product with the second linbits32 dword
	emms
	shl	edx, 16		; goes into the high 16-bit sum
	add	ecx, edx

	add	ecx, esi	; add the packed sums accumulated from largetbl

	pop	esi
	pop	ebx

	mov	edx, ecx
	and	ecx, 0xffff	; ecx = sum2
	shr	edx, 16	; edx = sum

	cmp	edx, ecx
	jle	.chooseE_s1	; sum <= sum2: keep sum and the low-byte table
	mov	edx, ecx	; otherwise take sum2 ...
	shr	eax, 8		; ... and the high-byte table
.chooseE_s1:
	mov	ecx, [esp+12] ; *s
	and	eax, 0xff	; return value = selected table number
	add	[ecx], edx	; *s += smaller sum
	ret

; max == 0: reached via choose_jump_table_L with eax = 0.
; Leaves MMX state, returns eax (0) unchanged and does not touch *s.
table_MMX.L_case_0:
	emms
	ret

; max == 1: reached via choose_jump_table_L with edx = end and the stack
; holding only the return address and the three arguments.
; For every pair (x, y) adds the byte t1l[x*2 + y] to *s, then returns 1.
; ebx is preserved; MMX state is released first since only integer code follows.
table_MMX.L_case_1:
	emms
	push	ebx			; callee-saved scratch; shifts arg offsets by 4
	mov	eax, [esp+4+12]		; eax = s
	mov	ecx, [esp+4+4]		; ecx = ix
	sub	ecx, edx		; ecx = ix - end (negative byte offset)
.next_pair:
	mov	ebx, [edx+ecx]		; ebx = x
	add	ebx, ebx		; ebx = x*2
	add	ebx, [edx+ecx+4]	; ebx = x*2 + y
	movzx	ebx, byte [t1l+ebx]	; table byte for this pair
	add	[eax], ebx		; *s += table byte
	add	ecx, 8			; advance one pair; ZF set when end is reached
	jnz	.next_pair
	pop	ebx
	mov	eax, 1			; return table number 1
	ret

;-----------------------------------------------------------------------
; table_MMX.L_case_45 / L_case_67 / L_case_8_15, shared tail "from3".
; Reached via choose_jump_table_L with edx = end and the stack holding only
; the return address and the arguments.  Each entry pushes its base table
; number and loads ecx with the lookup table to use, then from3 sums three
; candidates at once and returns the number of the smallest one:
;   eax = base + 0  if the dword-1 sum is smallest
;   eax = base + 1  if the high-16 sum of dword 0 is smaller
;   eax = base + 2  if the low-16 sum of dword 0 is smaller still
; and adds the winning sum to *s.  ebx is preserved.
; Local labels carry colons so NASM does not flag them as orphan labels.
;-----------------------------------------------------------------------
table_MMX.L_case_45:
	push	dword 7			; base table number
	mov	ecx, tableABC+9*8	; second set inside tableABC (offset 9 entries)
	jmp	from3

table_MMX.L_case_67:
	push	dword 10		; base table number
	mov	ecx, tableABC
	jmp	from3

table_MMX.L_case_8_15:
	push	dword 13		; base table number
	mov	ecx, tableDEF
from3:
	mov	eax,[esp+8]	;eax = *begin  (+4 for the pushed table number)
;	mov	edx,[esp+12]	;edx = *end  (not needed: edx still holds end)

	push	ebx
	sub	eax, edx	; eax = negative byte offset from end

	movq	mm5,[mul_add]	; pmaddwd weights {16, 1}
	pxor	mm2,mm2	;mm2 = sum

	test	eax, 8
	jz	.choose3_lp1
; odd length
	movq	mm0,[edx+eax]	;mm0 = ix[0] | ix[1]
	add	eax,8		; sets ZF for the jz below; MMX ops do not touch flags
	packssdw	mm0,mm2

	pmaddwd	mm0,mm5
	movd	ebx,mm0		; ebx = x*16 + y

	movq	mm2,  [ecx+ebx*8]	; first entry becomes the initial sums

	jz	.choose3_exit

	align	4
.choose3_lp1:
	movq	mm0,[edx+eax]
	movq	mm1,[edx+eax+8]
	add	eax,16		; sets ZF for the jnz below
	packssdw	mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
	pmaddwd	mm0,mm5
	movd	ebx,mm0		; index of the first pair
	punpckhdq	mm0,mm0
	paddd	mm2, [ecx+ebx*8]
	movd	ebx,mm0		; index of the second pair
	paddd	mm2, [ecx+ebx*8]
	jnz	.choose3_lp1
.choose3_exit:
;	xor	eax,eax		(not needed: eax is 0 on both ways into this label)
	movd	ebx, mm2	; low dword = sum1:sum2 packed as high:low 16 bits
	punpckhdq	mm2,mm2
	mov	ecx, ebx
	and	ecx, 0xffff	; ecx = sum2
	shr	ebx, 16	; ebx = sum1
	movd	edx, mm2	; edx = sum

	cmp	edx, ebx
	jle	.choose3_s1
	mov	edx, ebx	; sum1 is smaller
	inc	eax		; eax = 1
.choose3_s1:
	emms
	pop	ebx
	cmp	edx, ecx
	jle	.choose3_s2
	mov	edx, ecx	; sum2 is smallest
	mov	eax, 2
.choose3_s2:
	pop	ecx		; base table number pushed at entry
	add	eax, ecx	; return value = base + winner index
	mov	ecx, [esp+12] ; *s
	add	[ecx], edx	; *s += smallest sum
	ret

;-----------------------------------------------------------------------
; table_MMX.L_case_2 / L_case_3, shared tail "from2".
; Reached via choose_jump_table_L with edx = end and the stack holding only
; the return address and the arguments.  Each entry pushes its base table
; number, loads ecx with the lookup table and mm5 with the pmaddwd weights
; that turn a pair (x, y) into the table index.  Table entries are single
; dwords holding two 16-bit sums:
;   high 16 bits -> sum1, result = base
;   low  16 bits -> sum2, result = base + 1
; The smaller sum is added to *s and the matching table number is returned.
; ebx and edi are preserved.
; NOTE(review): pmov is not defined in this file; presumably a nasm.h macro
; for movq -- confirm.
; Local labels carry colons so NASM does not flag them as orphan labels.
;-----------------------------------------------------------------------
table_MMX.L_case_2:
	push	dword 2		; base table number
	mov	ecx,table23
	pmov	mm5,[mul_add23]	; weights {3, 1}: index = x*3 + y
	jmp	from2
table_MMX.L_case_3:
	push	dword 5		; base table number
	mov	ecx,table56
	pmov	mm5,[mul_add56]	; weights {4, 1}: index = x*4 + y
from2:
	mov	eax,[esp+8]	;eax = *begin  (+4 for the pushed table number)
;	mov	edx,[esp+12]	;edx = *end  (not needed: edx still holds end)
	push	ebx
	push	edi

	sub	eax, edx	; eax = negative byte offset from end
	xor	edi, edi	; edi = packed sum1:sum2
	test	eax, 8
	jz	.choose2_lp1
; odd length
	movq	mm0,[edx+eax]	;mm0 = ix[0] | ix[1]
	pxor	mm2,mm2		;mm2 = zero, used only as the upper half for packssdw
	packssdw	mm0,mm2

	pmaddwd	mm0,mm5
	movd	ebx,mm0		; ebx = table index of the first pair

	mov	edi,  [ecx+ebx*4]

	add	eax,8
	jz	.choose2_exit	; that was the only pair

	align	4
.choose2_lp1:
	movq	mm0,[edx+eax]
	movq	mm1,[edx+eax+8]
	packssdw	mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
	pmaddwd	mm0,mm5
	movd	ebx,mm0		; index of the first pair
	punpckhdq	mm0,mm0
	add	edi, [ecx+ebx*4]
	movd	ebx, mm0	; index of the second pair
	add	edi, [ecx+ebx*4]
	add	eax,16
	jnc	.choose2_lp1	; carry out of the add = offset reached zero (end)
.choose2_exit:
	mov	ecx, edi
	pop	edi
	pop	ebx
	pop	eax ; table num.
	emms

	mov	edx, ecx
	and	ecx, 0xffff	; ecx = sum2
	shr	edx, 16	; edx = sum1

	cmp	edx, ecx
	jle	.choose2_s1
	mov	edx, ecx	; sum2 is smaller
	inc	eax		; select base + 1
.choose2_s1:
	mov	ecx, [esp+12] ; *s
	add	[ecx], edx	; *s += smaller sum
	ret

	end