; new count bit routine
; part of this code is origined from
; new GOGO-no-coda (1999, 2000)
; Copyright (C) 1999 shigeo
; modified by Keiichi SAKAI
%include "nasm.h"
globaldef choose_table_MMX
globaldef MMX_masking
externdef largetbl
externdef t1l
externdef table23
externdef table56
segment_data
align 16
D14_14_14_14 dd 0x000E000E, 0x000E000E
D15_15_15_15 dd 0xfff0fff0, 0xfff0fff0
mul_add dd 0x00010010, 0x00010010
mul_add23 dd 0x00010003, 0x00010003
mul_add56 dd 0x00010004, 0x00010004
tableDEF
dd 0x00010003,0x01,0x00050005,0x05,0x00070006,0x07,0x00090008,0x08,0x000a0008, 0x09
dd 0x000a0009,0x0a,0x000b000a,0x0a,0x000b000a,0x0b,0x000c000a,0x0a,0x000c000b, 0x0b
dd 0x000c000b,0x0c,0x000d000c,0x0c,0x000d000c,0x0d,0x000d000c,0x0d,0x000e000d, 0x0e
dd 0x000b000e,0x0e,0x00040005,0x04,0x00060005,0x06,0x00080007,0x08,0x00090008, 0x09
dd 0x000a0009,0x0a,0x000b0009,0x0a,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0b
dd 0x000c000b,0x0b,0x000c000b,0x0c,0x000d000c,0x0c,0x000e000c,0x0d,0x000d000c, 0x0e
dd 0x000e000d,0x0e,0x000b000d,0x0e,0x00070006,0x07,0x00080007,0x08,0x00090007, 0x09
dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
dd 0x000d000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000c,0x0d,0x000d000c, 0x0d
dd 0x000e000d,0x0e,0x000e000d,0x0f,0x000c000d,0x0f,0x00090007,0x08,0x00090008, 0x09
dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
dd 0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000c,0x0d,0x000e000c, 0x0d
dd 0x000e000c,0x0d,0x000f000d,0x0e,0x000f000d,0x0f,0x000d000d,0x0f,0x000a0008, 0x09
dd 0x000a0008,0x09,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
dd 0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0c,0x000e000b,0x0d,0x000e000c, 0x0d
dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d,0x0f,0x000c000d, 0x10
dd 0x000a0009,0x0a,0x000a0009,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
dd 0x000d000a,0x0c,0x000d000b,0x0d,0x000e000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
dd 0x000e000c,0x0e,0x000f000c,0x0d,0x000f000d,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
dd 0x000d000e,0x10,0x000b000a,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
dd 0x000d000a,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
dd 0x0010000e,0x10,0x000d000e,0x10,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0c
dd 0x000c000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0e,0x000e000c, 0x0e
dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0f,0x000f000c,0x0f,0x000f000d, 0x0f
dd 0x0011000d,0x10,0x0011000d,0x12,0x000d000e,0x12,0x000b000a,0x0a,0x000c000a, 0x0a
dd 0x000c000a,0x0b,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000d, 0x0f
dd 0x0010000d,0x0f,0x0010000e,0x10,0x0010000e,0x11,0x000d000e,0x11,0x000c000a, 0x0b
dd 0x000c000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
dd 0x000e000c,0x0d,0x000f000c,0x0f,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
dd 0x0010000d,0x10,0x000f000d,0x10,0x0010000e,0x10,0x000f000e,0x12,0x000e000e, 0x11
dd 0x000c000b,0x0b,0x000d000b,0x0c,0x000c000b,0x0c,0x000d000b,0x0d,0x000e000c, 0x0d
dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0e,0x0010000d, 0x0f
dd 0x0010000d,0x10,0x0010000d,0x0f,0x0011000d,0x10,0x0011000e,0x11,0x0010000f, 0x12
dd 0x000d000e,0x13,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b, 0x0d
dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0e,0x0010000c,0x0e,0x0010000d, 0x0f
dd 0x0010000d,0x0f,0x0010000d,0x0f,0x0010000d,0x10,0x0010000e,0x11,0x000f000e, 0x11
dd 0x0010000e,0x11,0x000e000f,0x12,0x000d000c,0x0c,0x000e000c,0x0d,0x000e000b, 0x0d
dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0f,0x000f000d,0x0e,0x000f000d, 0x0f
dd 0x000f000d,0x10,0x0011000d,0x10,0x0010000d,0x11,0x0010000d,0x11,0x0010000e, 0x11
dd 0x0010000e,0x12,0x0012000f,0x12,0x000e000f,0x12,0x000f000c,0x0d,0x000e000c, 0x0d
dd 0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
dd 0x0010000d,0x10,0x0010000d,0x10,0x0012000e,0x10,0x0011000e,0x10,0x0011000e, 0x11
dd 0x0011000e,0x12,0x0013000e,0x11,0x0011000f,0x12,0x000e000f,0x12,0x000e000d, 0x0e
dd 0x000f000d,0x0e,0x000d000d,0x0e,0x000e000d,0x0f,0x0010000d,0x0f,0x0010000d, 0x0f
dd 0x000f000d,0x11,0x0010000d,0x10,0x0010000e,0x10,0x0011000e,0x13,0x0012000e, 0x11
dd 0x0011000e,0x11,0x0013000f,0x11,0x0011000f,0x13,0x0010000e,0x12,0x000e000f, 0x12
dd 0x000b000d,0x0d,0x000b000d,0x0e,0x000b000d,0x0f,0x000c000d,0x10,0x000c000d, 0x10
dd 0x000d000d,0x10,0x000d000d,0x11,0x000d000e,0x10,0x000e000e,0x11,0x000e000e, 0x11
dd 0x000e000e,0x12,0x000e000e,0x12,0x000e000f,0x15,0x000e000f,0x14,0x000e000f, 0x15
dd 0x000c000f,0x12
tableABC
dd 0x00020004,0x1,0x00040004,0x4,0x00060006,0x7,0x00080008,0x9,0x00090009,0xa,0x000a000a,0xa
dd 0x0009000a,0xa,0x000a000a,0xb,0x00000000,0x0,0x00020003,0x1,0x00040004,0x4,0x00070006,0x7
dd 0x00090007,0x9,0x00090009,0x9,0x000a000a,0xa,0x00000000,0x0,0x00040004,0x4,0x00050005,0x6
dd 0x00060006,0x8,0x00080007,0x9,0x000a0009,0xa,0x000a0009,0xb,0x0009000a,0xa,0x000a000a,0xa
dd 0x00000000,0x0,0x00040004,0x4,0x00040005,0x6,0x00060006,0x8,0x000a0007,0x9,0x000a0008,0x9
dd 0x000a000a,0xa,0x00000000,0x0,0x00060006,0x7,0x00070006,0x8,0x00080007,0x9,0x00090008,0xa
dd 0x000a0009,0xb,0x000b000a,0xc,0x000a0009,0xb,0x000a000a,0xb,0x00000000,0x0,0x00070005,0x7
dd 0x00060006,0x7,0x00080007,0x9,0x000a0008,0xa,0x000a0009,0xa,0x000b000a,0xb,0x00000000,0x0
dd 0x00080007,0x8,0x00080007,0x9,0x00090008,0xa,0x000b0008,0xb,0x000a0009,0xc,0x000c000a,0xc
dd 0x000a000a,0xb,0x000b000a,0xc,0x00000000,0x0,0x00090007,0x8,0x000a0007,0x9,0x000a0008,0xa
dd 0x000b0009,0xb,0x000b0009,0xb,0x000c000a,0xb,0x00000000,0x0,0x00090008,0x9,0x000a0008,0xa
dd 0x000a0009,0xb,0x000b0009,0xc,0x000b000a,0xc,0x000c000a,0xc,0x000b000a,0xc,0x000c000b,0xc
dd 0x00000000,0x0,0x00090008,0x8,0x00090008,0x9,0x000a0009,0xa,0x000b0009,0xb,0x000c000a,0xb
dd 0x000c000b,0xc,0x00000000,0x0,0x00090009,0xa,0x000a0009,0xb,0x000b000a,0xc,0x000c000a,0xc
dd 0x000c000a,0xd,0x000d000b,0xd,0x000c000a,0xc,0x000d000b,0xd,0x00000000,0x0,0x000a0009,0x9
dd 0x000a0009,0xa,0x000b000a,0xb,0x000b000a,0xc,0x000d000b,0xc,0x000d000b,0xc,0x00000000,0x0
dd 0x00090009,0x9,0x00090009,0xa,0x00090009,0xb,0x000a000a,0xc,0x000b000a,0xc,0x000c000b,0xc
dd 0x000c000b,0xd,0x000c000c,0xd,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x0009000a,0xa,0x0009000a,0xa
dd 0x000a000a,0xb,0x000b000b,0xc,0x000c000b,0xc,0x000c000b,0xd,0x000c000b,0xd,0x000c000c,0xd
dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
dd 0x0,0x00000000, 0x0,0x00000000
linbits32
dd 0x00040004,0x10001,0x00040004,0x20002,0x00040004,0x30003,0x00040004,0x40004
dd 0x00050005,0x60006,0x00060006,0x60006,0x00070007,0x80008,0x00080008,0x80008
dd 0x00090009,0xa000a,0x000b000b,0xa000a,0x000b000b,0xd000d,0x000d000d,0xd000d
dd 0x000d000d,0xd000d
choose_table_H
dw 0x1810, 0x1811, 0x1812, 0x1813, 0x1914, 0x1a14, 0x1b15, 0x1c15
dw 0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17
choose_jump_table_L:
dd table_MMX.L_case_0
dd table_MMX.L_case_1
dd table_MMX.L_case_2
dd table_MMX.L_case_3
dd table_MMX.L_case_45
dd table_MMX.L_case_45
dd table_MMX.L_case_67
dd table_MMX.L_case_67
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
dd table_MMX.L_case_8_15
segment_code
;
; use MMX
;
align 16
; int choose_table(int *ix, int *end, int *s)
choose_table_MMX:
mov ecx,[esp+4] ;ecx = begin
mov edx,[esp+8] ;edx = end
sub ecx,edx ;ecx = begin-end(should be minus)
test ecx,8
pxor mm0,mm0 ;mm0=[0:0]
movq mm1,[edx+ecx]
jz .lp
add ecx,8
jz .exit
align 4
.lp:
movq mm4,[edx+ecx]
movq mm5,[edx+ecx+8]
add ecx,16
psubusw mm4,mm0 ; 本当は dword でないといけないのだが
psubusw mm5,mm1 ; そんなコマンドはない :-p
paddw mm0,mm4 ; が, ここで扱う値の範囲は 8191+15 以下なので問題ない
paddw mm1,mm5
jnz .lp
.exit:
psubusw mm1,mm0 ; これも本当は dword でないといけない
paddw mm0,mm1
movq mm4,mm0
punpckhdq mm4,mm4
psubusw mm4,mm0 ; これも本当は dword でないといけない
paddw mm0,mm4
movd eax,mm0
cmp eax,15
ja .with_ESC
jmp [choose_jump_table_L+eax*4]
.with_ESC1:
emms
mov ecx, [esp+12] ; *s
mov [ecx], eax
or eax,-1
ret
.with_ESC:
cmp eax, 8191+15
ja .with_ESC1
sub eax,15
push ebx
push esi
bsr eax, eax
%assign _P 4*2
movq mm5, [D15_15_15_15]
movq mm6, [D14_14_14_14]
movq mm3, [mul_add]
mov ecx, [esp+_P+4] ; = ix
; mov edx, [esp+_P+8] ; = end
sub ecx, edx
xor esi, esi ; sum = 0
test ecx, 8
pxor mm7, mm7 ; linbits_sum, 14を越えたものの数
jz .H_dual_lp1
movq mm0, [edx+ecx]
add ecx,8
packssdw mm0,mm7
movq mm2, mm0
paddusw mm0, mm5 ; mm0 = min(ix, 15)+0xfff0
pcmpgtw mm2, mm6 ; 14より大きいか?
psubw mm7, mm2 ; 14より大きいとき linbits_sum++;
pmaddwd mm0, mm3 ; {0, 0, y, x}*{1, 16, 1, 16}
movd ebx, mm0
mov esi, [largetbl+ebx*4+(16*16+16)*4]
jz .H_dual_exit
align 4
.H_dual_lp1:
movq mm0, [edx+ecx]
movq mm1, [edx+ecx+8]
packssdw mm0,mm1
movq mm2, mm0
paddusw mm0, mm5 ; mm0 = min(ix, 15)+0xfff0
pcmpgtw mm2, mm6 ; 14より大きいか?
pmaddwd mm0, mm3 ; {y, x, y, x}*{1, 16, 1, 16}
movd ebx, mm0
punpckhdq mm0,mm0
add esi, [largetbl+ebx*4+(16*16+16)*4]
movd ebx, mm0
add esi, [largetbl+ebx*4+(16*16+16)*4]
add ecx, 16
psubw mm7, mm2 ; 14より大きいとき linbits_sum++;
jnz .H_dual_lp1
.H_dual_exit:
pmov mm1,mm7
punpckhdq mm7,mm7
paddd mm7,mm1
punpckldq mm7,mm7
pmaddwd mm7, [linbits32+eax*8] ; linbits
mov ax, [choose_table_H+eax*2]
movd ecx, mm7
punpckhdq mm7,mm7
movd edx,mm7
emms
shl edx, 16
add ecx, edx
add ecx, esi
pop esi
pop ebx
mov edx, ecx
and ecx, 0xffff ; ecx = sum2
shr edx, 16 ; edx = sum
cmp edx, ecx
jle .chooseE_s1
mov edx, ecx
shr eax, 8
.chooseE_s1:
mov ecx, [esp+12] ; *s
and eax, 0xff
add [ecx], edx
ret
table_MMX.L_case_0:
emms
ret
table_MMX.L_case_1:
emms
mov eax, [esp+12] ; *s
mov ecx, [esp+4] ; *ix
sub ecx, edx
push ebx
.lp:
mov ebx, [edx+ecx]
add ebx, ebx
add ebx, [edx+ecx+4]
movzx ebx, byte [ebx+t1l]
add [eax], ebx
add ecx, 8
jnz .lp
pop ebx
mov eax, 1
ret
table_MMX.L_case_45:
push dword 7
mov ecx, tableABC+9*8
jmp from3
table_MMX.L_case_67:
push dword 10
mov ecx, tableABC
jmp from3
table_MMX.L_case_8_15:
push dword 13
mov ecx, tableDEF
from3:
mov eax,[esp+8] ;eax = *begin
; mov edx,[esp+12] ;edx = *end
push ebx
sub eax, edx
movq mm5,[mul_add]
pxor mm2,mm2 ;mm2 = sum
test eax, 8
jz .choose3_lp1
; odd length
movq mm0,[edx+eax] ;mm0 = ix[0] | ix[1]
add eax,8
packssdw mm0,mm2
pmaddwd mm0,mm5
movd ebx,mm0
movq mm2, [ecx+ebx*8]
jz .choose3_exit
align 4
.choose3_lp1
movq mm0,[edx+eax]
movq mm1,[edx+eax+8]
add eax,16
packssdw mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
pmaddwd mm0,mm5
movd ebx,mm0
punpckhdq mm0,mm0
paddd mm2, [ecx+ebx*8]
movd ebx,mm0
paddd mm2, [ecx+ebx*8]
jnz .choose3_lp1
.choose3_exit
; xor eax,eax
movd ebx, mm2
punpckhdq mm2,mm2
mov ecx, ebx
and ecx, 0xffff ; ecx = sum2
shr ebx, 16 ; ebx = sum1
movd edx, mm2 ; edx = sum
cmp edx, ebx
jle .choose3_s1
mov edx, ebx
inc eax
.choose3_s1:
emms
pop ebx
cmp edx, ecx
jle .choose3_s2
mov edx, ecx
mov eax, 2
.choose3_s2:
pop ecx
add eax, ecx
mov ecx, [esp+12] ; *s
add [ecx], edx
ret
table_MMX.L_case_2:
push dword 2
mov ecx,table23
pmov mm5,[mul_add23]
jmp from2
table_MMX.L_case_3:
push dword 5
mov ecx,table56
pmov mm5,[mul_add56]
from2:
mov eax,[esp+8] ;eax = *begin
; mov edx,[esp+12] ;edx = *end
push ebx
push edi
sub eax, edx
xor edi, edi
test eax, 8
jz .choose2_lp1
; odd length
movq mm0,[edx+eax] ;mm0 = ix[0] | ix[1]
pxor mm2,mm2 ;mm2 = sum
packssdw mm0,mm2
pmaddwd mm0,mm5
movd ebx,mm0
mov edi, [ecx+ebx*4]
add eax,8
jz .choose2_exit
align 4
.choose2_lp1
movq mm0,[edx+eax]
movq mm1,[edx+eax+8]
packssdw mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
pmaddwd mm0,mm5
movd ebx,mm0
punpckhdq mm0,mm0
add edi, [ecx+ebx*4]
movd ebx, mm0
add edi, [ecx+ebx*4]
add eax,16
jnc .choose2_lp1
.choose2_exit
mov ecx, edi
pop edi
pop ebx
pop eax ; table num.
emms
mov edx, ecx
and ecx, 0xffff ; ecx = sum2
shr edx, 16 ; edx = sum1
cmp edx, ecx
jle .choose2_s1
mov edx, ecx
inc eax
.choose2_s1:
mov ecx, [esp+12] ; *s
add [ecx], edx
ret
end