; dxx-rebirth/texmap/tmap_per.asm

;THE COMPUTER CODE CONTAINED HEREIN IS THE SOLE PROPERTY OF PARALLAX
;SOFTWARE CORPORATION ("PARALLAX").  PARALLAX, IN DISTRIBUTING THE CODE TO
;END-USERS, AND SUBJECT TO ALL OF THE TERMS AND CONDITIONS HEREIN, GRANTS A
;ROYALTY-FREE, PERPETUAL LICENSE TO SUCH END-USERS FOR USE BY SUCH END-USERS
;IN USING, DISPLAYING,  AND CREATING DERIVATIVE WORKS THEREOF, SO LONG AS
;SUCH USE, DISPLAY OR CREATION IS FOR NON-COMMERCIAL, ROYALTY OR REVENUE
;FREE PURPOSES.  IN NO EVENT SHALL THE END-USER USE THE COMPUTER CODE
;CONTAINED HEREIN FOR REVENUE-BEARING PURPOSES.  THE END-USER UNDERSTANDS
;AND AGREES TO THE TERMS HEREIN AND ACCEPTS THE SAME BY USE OF THIS FILE.
;COPYRIGHT 1993-1998 PARALLAX SOFTWARE CORPORATION.  ALL RIGHTS RESERVED.
;
; $Source: /cvsroot/dxx-rebirth/d1x-rebirth/texmap/tmap_per.asm,v $
; $Revision: 1.1.1.1 $
; $Author: zicodxx $
; $Date: 2006/03/17 19:46:03 $
;
; Perspective texture mapper inner loop.
;
; $Log: tmap_per.asm,v $
; Revision 1.1.1.1  2006/03/17 19:46:03  zicodxx
; initial import
;
; Revision 1.1.1.1  1999/06/14 22:14:01  donut
; Import of d1x 1.37 source.
;
; Revision 1.26  1995/02/20  18:22:55  john
; Put all the externs in the assembly modules into tmap_inc.asm.
; Also, moved all the C versions of the inner loops into a new module,
; scanline.c.
;
; Revision 1.25  1995/02/20  17:09:08  john
; Added code so that you can build the tmapper with no assembly!
;
; Revision 1.24  1995/01/10  09:32:07  mike
; mostly fix garbage at end of scanline, but slow down by 1-4%.
;
; Revision 1.23  1994/12/02  23:29:57  mike
; optimizations.
;
; Revision 1.22  1994/11/30  00:57:00  mike
; optimization.
;
; Revision 1.21  1994/11/21  13:57:42  mike
; fix right side shear bug
;
; Revision 1.20  1994/11/12  16:41:09  mike
; jae -> ja.
;
; Revision 1.19  1994/10/27  19:40:00  john
; Made lighting table lookup be _gr_fade_table[eax] instead
; of fs:[eax], which gets rig of a segment override that
; supposedly costs 1 clock on a 486.  Mainly, I wanted to verify
; that the only reason we need selectors is for the source texture
; data .
;
; Revision 1.18  1994/05/03  11:08:32  mike
; Trap divide overflows.
;
; Revision 1.17  1994/04/21  15:03:41  mike
; make faster.
;
; Revision 1.16  1994/04/08  16:46:57  john
; Made 32 fade levels. Hacked.
;
; Revision 1.15  1994/03/31  08:35:18  mike
; Fix quantized-by-4 bug in inner loop.
;
; Revision 1.14  1994/03/14  17:41:14  mike
; Fix bug in unlighted version.
;
; Revision 1.13  1994/03/14  15:45:14  mike
; streamline code.
;
; Revision 1.12  1994/01/14  14:01:58  mike
; *** empty log message ***
;
; Revision 1.11  1993/12/18  14:43:44  john
; Messed around with doing 1/z, the u*(1/z) and v*(1/z)
; (Went from 23 fps to 21 fps... not good! )
;
; Revision 1.10  1993/12/17  16:14:17  john
; Split lighted/nonlighted, so there is no cmp lighting
; in the inner loop.
;
; Revision 1.9  1993/12/17  12:34:29  john
; Made leftover bytes use linear approx instead of correct...
; should save about 8 divides per scanline on average.
; Also, took out anti-aliasing code and rearranged to
; order of some instructions to help on 486 pipelining.
; (The anti-aliasing code did *not* look good, so I
; figure there was no reason to keep it in. )
;
; Revision 1.8  1993/12/16  18:37:52  mike
; Align some stuff on 4 byte boundaries.
;
; Revision 1.7  1993/11/30  08:44:18  john
; Made selector set check for < 64*64 bitmaps.
;
; Revision 1.6  1993/11/23  17:25:26  john
; Added safety "and eax, 0fffh" in lighting lookup.
;
; Revision 1.5  1993/11/23  15:08:52  mike
; Fixed lighting bug.
;
; Revision 1.4  1993/11/23  14:38:50  john
; optimized NORMAL code by switching EBX and ESI, so BH can be used in
; the lighting process.
;
; Revision 1.3  1993/11/23  14:30:53  john
; Made the perspective tmapper do 1/8 divides; added lighting.
;
; Revision 1.2  1993/11/22  10:24:59  mike
; *** empty log message ***
;
; Revision 1.1  1993/09/08  17:29:53  mike
; Initial revision
;
;
;

; Everything below is assembled as 32-bit code.
[BITS 32]

; The scanline routine is exported under two spellings (with and without a
; leading underscore); both labels are placed on the same address below.
global	_asm_tmap_scanline_per
global	asm_tmap_scanline_per

; Per the revision log above, the extern declarations shared by the assembly
; modules live in this include file (the _fx_* inputs, _pixptr, _gr_fade_table,
; _y_pointers, _write_buffer, ... are not declared in this file).
%include        "tmap_inc.asm"

; ------------------------------------------------------------------------------
; Module data.  The first group is exported (global); the second group is
; private scratch storage used by the span loops in this file.
; ------------------------------------------------------------------------------
[SECTION .data]
align 4
    ;extern _per2_flag;:dword
%ifdef __LINUX__
; Cater for linux ELF compilers...
; Under __LINUX__ the underscore-prefixed names used throughout this file are
; rewritten by the preprocessor to the un-prefixed spelling, and the extra
; label "x" (an alias of _x, defined below) is exported as well.
global x
%define _loop_count loop_count
%define _new_end new_end
%define _scan_doubling_flag scan_doubling_flag
%define _linear_if_far_flag linear_if_far_flag
%endif

	global _x
	global _loop_count
        global _new_end
	global _scan_doubling_flag
	global _linear_if_far_flag

;	 global _max_ecx
;	 global _min_ecx

    mem_edx dd 0		; only referenced from commented-out code in this file
    x:				; alias of _x (same address)
    _x		dd	0	; not read or written by the code in this file
    _loop_count dd	0	; iterations remaining: pixels-1 in the per-pixel loops,
				; number of 2^NBITS-pixel spans in the span loops

;    _max_ecx	 dd	 0
;    _min_ecx	 dd	 55555555h
    _new_end     dd      1       ; if set, use new, but slower, way of finishing off extra pixels on scanline, 01/10/95 --MK

    _scan_doubling_flag dd 0	; not referenced by the code in this file
    _linear_if_far_flag dd 0	; not referenced by the code in this file

;---------- local variables
align 4
    req_base    dd	0	; not referenced by the code in this file
    req_size    dd	0	; not referenced by the code in this file
    U0          dd	0	; u/z at the start of the current span (PDIV result)
    U1          dd	0	; not referenced by the code in this file (U1 is kept in ebx)
    V0          dd	0	; v/z at the start of the current span (PDIV result)
    V1          dd	0	; not referenced by the code in this file (V1 is kept in ebp)
    num_left_over   dd	0	; pixels remaining after the last whole 2^NBITS span
    DU1         dd	0	; _fx_du_dx << NBITS : u step per span
    DV1         dd	0	; _fx_dv_dx << NBITS : v step per span
    DZ1         dd	0	; _fx_dz_dx << NBITS : z step per span

[SECTION .text]

; --------------------------------------------------------------------------------------------------
; asm_tmap_scanline_per / _asm_tmap_scanline_per
;
; Draws one perspective-correct textured scanline.  Takes no register or stack
; arguments: all inputs are read from global variables.  All general registers
; are preserved (pusha here, popa at _none_to_do).
;
; Enter (globals read by this routine):
;	_fx_xleft	left x coordinate (used as an integer; negative values are clamped to 0)
;	_fx_xright	right x coordinate (used as an integer)
;	_fx_y		y coordinate, used as an index into _y_pointers
;	_pixptr		address of source pixel map
;	_fx_u		fixed point initial u coordinate
;	_fx_v		fixed point initial v coordinate
;	_fx_z		fixed point initial z coordinate
;	_fx_du_dx	fixed point du/dx
;	_fx_dv_dx	fixed point dv/dx
;	_fx_dz_dx	fixed point dz/dx
;	_fx_l, _fx_dl_dx	lighting value and its per-pixel delta
;	_per2_flag, _Lighting_on, _Transparency_on	mode switches
;
; Side effects on the inputs: _fx_l and _fx_dl_dx are shifted right by 8 IN
; PLACE on every call, and _fx_l is advanced as pixels are lit.
;
; Reference implementation of what the loops compute:

;   for (x = (int) xleft; x <= (int) xright; x++) {
;      _setcolor(read_pixel_from_tmap(srcb,((int) (u/z)) & 63,((int) (v/z)) & 63));
;      _setpixel(x,y);
;
;      u += du_dx;
;      v += dv_dx;
;      z += dz_dx;
;   }


align	16
_asm_tmap_scanline_per:
asm_tmap_scanline_per:
;        push    es
	pusha

;---------------------------- setup for loop ---------------------------------
; Setup for loop:	_loop_count  iterations = (int) xright - (int) xleft
;	edi	destination pointer = _y_pointers[y] + _write_buffer + x
; NOTE: fx_xright and fx_xleft changed from fix to int by mk on 12/01/94.
; (esi is NOT set up here; the texture base is added from _pixptr at each fetch.)

; set edi = address of first pixel to modify
	mov	edi,[_fx_y]
;        mov     es,[_pixel_data_selector]       ; selector[0*2]

	mov	edi,[_y_pointers+edi*4]	; per-row offset looked up from a dword table

	mov	ebx,[_fx_xleft]
	test	ebx, ebx
	jns	ebx_ok
	xor	ebx, ebx	; clamp a negative left edge to column 0
ebx_ok:	add	edi,[_write_buffer]
	add	edi,ebx

; set _loop_count = # of iterations
; (the loops use dec/jns, so _loop_count+1 pixels are drawn)
	mov	eax,[_fx_xright]
	sub	eax,ebx
	js	near _none_to_do	; right edge is left of the (clamped) left edge: nothing to draw
	mov	[_loop_count],eax

; lighting values are passed in fixed point, but need to be in 8 bit integer, 8 bit fraction so we can easily
; get the integer by reading %bh
	sar	dword [_fx_l], 8
	sar	dword [_fx_dl_dx],8	; the jns below tests the sign of this shifted result
	jns	dl_dx_ok
	inc	dword [_fx_dl_dx]	; round towards 0 for negative deltas
dl_dx_ok:

; set initial values
	mov	ebx,[_fx_u]
	mov	ebp,[_fx_v]
	mov	ecx,[_fx_z]

; dispatch: _per2_flag clear -> one divide pair per pixel (tmap_loop);
; otherwise one divide pair per 2^NBITS pixels, lit or unlit variant.
	test	dword [_per2_flag],-1
	je	tmap_loop

	test	dword [_Lighting_on], -1
        je     near _tmap_loop_fast_nolight
        jmp     _tmap_loop_fast
;tmap_loop_fast_nolight_jumper:
;    jmp tmap_loop_fast_nolight

;================ PERSPECTIVE TEXTURE MAP INNER LOOPS ========================
;
; Register roles inside the loops:
;	eax	dividend / quotient, then the texel value
;	ebx	u on entry, scratch once the loop is running
;	ecx	z (the divisor)
;	edx	upper half of the dividend
;	ebp	v
;	esi	u while the loop is running
;	edi	destination pixel pointer

;-------------------- NORMAL PERSPECTIVE TEXTURE MAP LOOP -----------------
; Exact version: two divides for every pixel.  Lighting is selected at run
; time through _Lighting_on.  Draws _loop_count+1 pixels.
tmap_loop:
	mov	esi, ebx		; park u in esi so ebx can be reused

	align	4
tmap_loop0:

; texel row = (v / z) & 63
	mov	eax, ebp
	cdq				; edx:eax = sign-extended v
	idiv	ecx
	and	eax, 3fh
	mov	ebx, eax

; texel column = low 6 bits of (u / z); ebx becomes row*64 + column
	mov	eax, esi
	cdq				; edx:eax = sign-extended u
	idiv	ecx
	shl	eax, 26			; low 6 bits of the quotient -> top of eax
	shld	ebx, eax, 6

; fetch the texel, zero-extended so ah is free for the light level
	add	ebx, [_pixptr]
	test	dword [_Lighting_on], -1
	movzx	eax, byte [ebx]		; does not disturb the flags of the test
	je	NoLight1

; lighting: table index = light level * 256 + texel
	mov	ebx, [_fx_l]
	mov	ah, bh			; integer part of the light value
	add	ebx, [_fx_dl_dx]
	mov	[_fx_l], ebx		; write the stepped light value back
	mov	al, [_gr_fade_table+eax]

; colour 255 is transparent: leave the destination byte alone
NoLight1:
	cmp	al, 255
	je	skip1
	mov	[edi], al
skip1:
	inc	edi

; step the interpolants; z goes last because its zero flag is tested
	add	esi, [_fx_du_dx]
	add	ebp, [_fx_dv_dx]
	add	ecx, [_fx_dz_dx]
	je	_div_0_abort		; next divide would be by zero

	dec	dword [_loop_count]
	jns	tmap_loop0

; Common exit for every path in this file: undo the pusha and return.
_none_to_do:
	popa
	ret

; Reached when z steps to exactly zero.  Simply stops drawing the scanline;
; kept as a separate label so the condition can be trapped if desired.
_div_0_abort:
	jmp	_none_to_do

;-------------------------- PER/4 TMAPPER ----------------
;
; Outline of the span algorithm used by the "fast" loops below.  The outline
; was written for 4-pixel spans; the code as built uses 2^NBITS pixels per
; span (16 with NBITS = 4), so read every "4" below as 2^NBITS.
;
;	x = x1
;	U0 = u/w; V0 = v/w;
;	while ( 1 )
;		u += du_dx*4; v+= dv_dx*4
;		U1 = u/w; V1 = v/w;
;		DUDX = (U1-U0)/4; DVDX = (V1-V0)/4;
;
;	; Pixel 0
;		pixels = texmap[V0*64+U0];
;		U0 += DUDX; V0 += DVDX
;	; Pixel 1
;		pixels = (pixels<<8)+texmap[V0*64+U0];
;		U0 += DUDX; V0 += DVDX
;	; Pixel 2
;		pixels = (pixels<<8)+texmap[V0*64+U0];
;		U0 += DUDX; V0 += DVDX
;	; Pixel 3
;		pixels = (pixels<<8)+texmap[V0*64+U0];
;
;		screen[x] = pixel
;		x += 4;
;		U0 = U1; V0 = V1

NBITS equ 4	; 2^NBITS pixels plotted per divide
ZSHIFT equ 4	; precision used in PDIV macro


; PDIV is not an actual macro in this file: the five-instruction sequence
; below is written out inline at every place marked "PDIV" or "Find fixed".
;PDIV MACRO
; Returns EAX/ECX in 16.16 format in EAX. Trashes EDX
;          sig bits   6.3
;	mov	edx,eax
;	shl	eax,ZSHIFT
;	sar	edx,32-ZSHIFT
;	idiv	ecx	; eax = (v/z)
;   shl	eax, 16-ZSHIFT
;ENDM

global _tmap_loop_fast

; -------------------------------------- Start of Getting Dword Aligned ----------------------------------------------
; Lit fast path, phase 1.  Entered with ebx = u, ebp = v, ecx = z and
; edi = destination.  Pixels are drawn one at a time, with exact divides,
; until edi is a multiple of 4; control then moves on to DwordAligned1.
; If the scanline runs out first, the routine exits.

_tmap_loop_fast:
	mov	esi, ebx		; u is carried in esi while ebx is scratch

	align	4
NotDwordAligned1:
	test	edi, 11b
	jz	DwordAligned1		; destination is dword aligned: switch to spans

; texel row = (v / z) & 63
	mov	eax, ebp
	cdq				; edx:eax = sign-extended v
	idiv	ecx
	and	eax, 3fh
	mov	ebx, eax

; texel column = low 6 bits of (u / z); ebx becomes row*64 + column
	mov	eax, esi
	cdq				; edx:eax = sign-extended u
	idiv	ecx
	shl	eax, 26
	shld	ebx, eax, 6

; fetch the texel, zero-extended
	add	ebx, [_pixptr]
	movzx	eax, byte [ebx]

; the light value is stepped for every pixel, drawn or not
	mov	ebx, [_fx_l]
	mov	ah, bh			; light level goes into the table index
	add	ebx, [_fx_dl_dx]
	mov	[_fx_l], ebx

; colour 255 is transparent: neither lit nor written
	cmp	al, 255
	je	skip2
	mov	al, [_gr_fade_table+eax]
	mov	[edi], al
skip2:
	inc	edi

; step the interpolants; z goes last because its zero flag is tested
	add	esi, [_fx_du_dx]
	add	ebp, [_fx_dv_dx]
	add	ecx, [_fx_dz_dx]
	je	_div_0_abort		; next divide would be by zero

	dec	dword [_loop_count]
	jns	NotDwordAligned1

	jmp	_none_to_do

; -------------------------------------- End of Getting Dword Aligned ----------------------------------------------

; Lit fast path, phase 2: the destination is now dword aligned.  Splits the
; remaining pixel count into whole 2^NBITS-pixel spans plus a remainder,
; computes the texture coordinates of the first span start, and pre-scales
; the per-pixel deltas to per-span deltas.
; In:  esi = u, ebp = v, ecx = z, [_loop_count] = remaining pixels - 1
; Out: ebx = u, [_loop_count] = number of spans, [num_left_over] = remainder,
;      [U0]/[V0] = u/z and v/z at the span start, [DU1]/[DV1]/[DZ1] = span deltas
DwordAligned1:

	mov	eax, [_loop_count]
	mov	ebx, esi	; get fx_u [pentium pipelining]
	inc	eax		; eax = number of pixels still to draw
	mov	esi, eax
	and	esi, (1 << NBITS) - 1	; esi = pixels mod 2^NBITS
	sar	eax, NBITS		; eax = number of whole spans; ZF is set when that is 0
	mov	[num_left_over], esi	; mov leaves the flags of the sar intact for the je
	je	near tmap_loop	; there are no 2^NBITS chunks, do divide/pixel for whole scanline
				; (_loop_count is still the original per-pixel count on that path)
	mov	[_loop_count], eax	; _loop_count = pixels / NPIXS

; compute initial v coordinate
; (inline PDIV: edx:eax = v << ZSHIFT, divide by z, rescale the quotient)
	mov	eax,ebp	; get v
	mov	edx,ebp
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT
	mov	[V0], eax

; compute initial u coordinate
	mov	eax,ebx	; get u
	mov	edx,ebx
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT
	mov	[U0], eax

; Set deltas to NPIXS pixel increments
	mov	eax, [_fx_du_dx]
	shl	eax, NBITS
	mov	[DU1], eax
	mov	eax, [_fx_dv_dx]
	shl	eax, NBITS
	mov	[DV1], eax
	mov	eax, [_fx_dz_dx]
	shl	eax, NBITS
	mov	[DZ1], eax

	align	4
; ----------------------------------------------------------------------------
; Lit fast path, phase 3: draw [_loop_count] spans of 2^NBITS pixels each.
; Per span: step u, v, z by the span deltas, divide once to get the texture
; coordinates at the END of the span (U1, V1), then interpolate linearly from
; the saved start coordinates (U0, V0) across the span.
;
; In/out across iterations:
;	ebx = u, ebp = v, ecx = z, edi = destination pointer
;	[U0], [V0] = u/z, v/z at the start of the next span
;	[_fx_l]    = running light value (8.8)
; Inside the span body (between the pushes and the pops):
;	esi = packed texture position, v in the high word and u in the low word,
;	      each as 6 integer bits : 10 fraction bits
;	edx = packed per-pixel step in the same layout
;	ebx = light value, ebp = light step
;	ecx = pixel accumulator (opaque variant only)
; ----------------------------------------------------------------------------
TopOfLoop4:
	add	ebx, [DU1]
	add	ebp, [DV1]
	add	ecx, [DZ1]
	je	near _div_0_abort	; would be dividing by 0, so abort (nothing pushed yet)

; Done with ebx, ebp, ecx until next iteration
	push	ebx
	push	ecx
	push	ebp
	push	edi

; Find fixed U1 (inline PDIV)
	mov	eax, ebx
	mov	edx,ebx
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT
	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1 (inline PDIV)
	mov	eax, ebp
	mov	edx, ebp
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)

	mov	ecx, [U0]	; ecx = U0 until pop's
	mov	edi, [V0]	; edi = V0 until pop's

	shl	eax, 16-ZSHIFT
	mov	ebp, eax	; ebp = V1 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax

; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6	; (U1-U0) / 2^NBITS, as 6:10
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0) / 2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

; Save the U1 and V1 so we don't have to divide on the next iteration
	mov	[U0], ebx
	mov	[V0], ebp

	pop	edi	; Restore EDI before using it

; LIGHTING CODE
	mov	ebx, [_fx_l]
	mov	ebp, [_fx_dl_dx]

	test	dword [_Transparency_on],-1
	je	near no_trans1

; repproc1: draw two pixels, honouring transparency (texel 255 is skipped).
; The light value is stepped for EVERY pixel, including transparent ones, so
; that lighting stays in step with the screen position - the same rule the
; per-pixel loops, the opaque variant below and the left-over code follow.
; (Previously the step sat before the skip label and was jumped over for
; transparent texels, so the light ramp stalled across them.)
%macro repproc1 0
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
        add     eax, [_pixptr]
        movzx   eax, byte [eax]    ; get pixel from source bitmap
	cmp	al,255
	je	%%skipa1
	mov	ah, bh	; form lighting table lookup value
	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
	mov	[edi],al
%%skipa1:
	add	ebx, ebp	; update lighting value (also for skipped pixels)
	inc	edi

; Do odd pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
        add     eax,[_pixptr]
        movzx   eax, byte [eax]    ; get pixel from source bitmap
	cmp	al,255
	je	%%skipa2
	mov	ah, bh	; form lighting table lookup value
	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
	mov	[edi],al
%%skipa2:
	add	ebx, ebp	; update lighting value (also for skipped pixels)
	inc	edi
%endmacro


; 2 pixels per expansion, 2^(NBITS-1) expansions = one full span
%rep (2 << (NBITS-2))
;	local	skip3,no_trans1
;	local	skipa1,skipa2
    repproc1
%endrep

jmp	cont1

; -------------------------------------------------------
no_trans1:

; repproc2: light two pixels with no transparency test and collect them in
; cl / ch, then rotate ecx by 16.  Two expansions leave four pixels in ecx in
; memory order, ready for a single dword store.
%macro repproc2 0
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
        add     eax,[_pixptr]
        movzx   eax, byte [eax]    ; get pixel from source bitmap
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value
	mov	cl, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer

; Do odd pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
        add     eax,[_pixptr]
        movzx   eax, byte [eax]    ; get pixel from source bitmap
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value
	mov	ch, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer

; ----- This is about 1% faster than the above, and could probably be optimized more.
; ----- Problem is, it gets the u,v coordinates backwards.  What you would need to do
; ----- is switch the packing of the u,v coordinates above (about 95 lines up).
;----------;	mov	eax, esi
;----------;	shr	ax, 10
;----------;	rol	eax, 6
;----------;	mov	dx, ax
;----------;	add	esi, mem_edx
;----------;	mov	dl, es:[edx]
;----------;	mov	dh, bh
;----------;	add	ebx, ebp
;----------;	mov	cl, _gr_fade_table[edx]
;----------;
;----------;	mov	eax, esi
;----------;	shr	ax, 10
;----------;	rol	eax, 6
;----------;	mov	dx, ax
;----------;	add	esi, mem_edx
;----------;	mov	dl, es:[edx]
;----------;	mov	dh, bh
;----------;	add	ebx, ebp
;----------;	mov	ch, _gr_fade_table[edx]

	ror	ecx, 16	; move to next double dest pixel position
%endmacro

; 4 pixels per expansion, 2^(NBITS-2) expansions = one full span
%rep (1 << (NBITS-2))

    repproc2
    repproc2

	mov 	[edi],ecx	; Draw 4 pixels to display
	add 	edi,4
%endrep
;; pop edx
cont1:

; -------------------------------------------------------

; LIGHTING CODE
	mov	[_fx_l], ebx	; keep the running light value for the next span / the tail
	pop	ebp		; restore v
	pop	ecx		; restore z
	pop	ebx		; restore u
	dec	dword [_loop_count]
	jnz	near TopOfLoop4

EndOfLoop4:
	test	dword [num_left_over], -1
	je	near _none_to_do	; scanline was a whole number of spans

; ----------------------------------------- Start of LeftOver Pixels ------------------------------------------
; Lit fast path, phase 4: draw the [num_left_over] pixels (1 .. 2^NBITS-1)
; that remain after the last whole span.
;
; In: ebx = u, ebp = v, ecx = z at the end of the last span, edi = destination,
;     [U0]/[V0] = u/z, v/z at that point, [_fx_l] = running light value.
;
; Two strategies:
;   * NewDoEndPixels - exact divide per pixel.  Chosen when _new_end is set
;     and z would change by a quarter or more (or go negative) over a full
;     span, i.e. when linear interpolation would be visibly wrong.
;   * the code below - extrapolate a full span ahead, divide once, and
;     interpolate linearly, drawing only the pixels that are left.
DoEndPixels:
	push	ecx

	mov	eax, ecx
	lea	eax, [eax*2+eax]	; eax = 3*z

	add	ecx, [DZ1]		; ecx = z one full span ahead
	js	notokhere
	shl	ecx,2			; ecx = 4*(z + DZ1)
	cmp	eax, ecx
	pop	ecx			; pop does not disturb the flags of the cmp
	jl	okhere			; 3*z < 4*(z+DZ1): z changes little, interpolate
	jmp	bah_bah
notokhere:
	pop	ecx
bah_bah:
        test    dword [_new_end],-1
	jne	near NewDoEndPixels
okhere:

	add	ebx, [DU1]
	add	ebp, [DV1]
	add	ecx, [DZ1]
	je	near _div_0_abort
	jns	dep_cont

; z went negative.
; this can happen because we added DZ1 to the current z, but dz1 represents dz for perhaps 16 pixels
; though we might only plot one more pixel.
; Back off by 1/2, then 1/4, ... of the span deltas until z is positive again.
; The shift count has to live in cl, so z is moved to esi for the duration
; (esi is free here; it is reloaded below).  Keeping z in ecx while also using
; cl as the counter would let each one overwrite the other.
	mov	esi, ecx	; esi = overshot z
	mov	ecx, 1		; cl = shift count: 1, 2, ... NBITS-1

dep_loop:	mov	eax, [DU1]
	sar	eax, cl
	sub	ebx, eax

	mov	eax, [DV1]
	sar	eax, cl
	sub	ebp, eax

	mov	eax, [DZ1]
	sar	eax, cl
	sub	esi, eax
	je	near _div_0_abort
	jns	dep_z_restore

	inc	ecx
	cmp	ecx, NBITS
	jne	dep_loop

dep_z_restore:
	mov	ecx, esi	; z back where the divides expect it (may still be
				; negative if the back-off ran out; clamped below)

dep_cont:
	push	edi	; use edi as a temporary variable

; Clamp the divisor from below.  With z >= 2^(ZSHIFT+1) the quotient of a
; 32-bit value shifted left by ZSHIFT always fits in 32 bits, so the idivs
; below cannot raise a divide error.
	cmp	ecx,1 << (ZSHIFT+1)
	jg	ecx_ok
	mov	ecx, 1 << (ZSHIFT+1)
ecx_ok:

; Find fixed U1
	mov	eax, ebx
	;PDIV
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT

	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1
	mov	eax, ebp
	;PDIV
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT

	mov	ebp, eax	; ebp = V1 until pop's

	mov	ecx, [U0]	; ecx = U0 until pop's
	mov	edi, [V0]	; edi = V0 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax

; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6	; (U1-U0) / 2^NBITS, as 6:10
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0) / 2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

	pop	edi	; Restore EDI before using it

	mov	ecx, [num_left_over]	; ecx = pixels still to draw (non-zero here)

; LIGHTING CODE
	mov	ebx, [_fx_l]
	mov	ebp, [_fx_dl_dx]

; ITERATION is the destination offset of the even pixel of each expansion.
; It is a preprocessor variable from the start (as in the unlit tail), since
; it is stepped with %assign inside the %rep below.
%assign ITERATION 0

; repproc3: draw up to two pixels at [edi+ITERATION] and [edi+ITERATION+1].
; edi is not advanced; ecx counts the remaining pixels and the routine exits
; through _none_to_do as soon as it reaches zero.
%macro repproc3 0
; Do even pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
        add     eax,[_pixptr]
        movzx   eax, byte [eax]    ; get pixel from source bitmap
	add	esi, edx	; inc u,v
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value
	cmp	al,255
	je	%%skip4
	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
	mov	[edi+ITERATION], al	; write pixel
%%skip4:	dec	ecx
	jz	near _none_to_do

; Do odd pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
        shld    ax,si,6 ; shift in u, shifting up v
        add     eax,[_pixptr]
        movzx   eax, byte [eax]    ; get pixel from source bitmap
	add	esi, edx	; inc u,v
	mov	ah, bh	; form lighting table lookup value
	add	ebx, ebp	; update lighting value (ebp = [_fx_dl_dx])
	cmp	al,255
	je	%%skip5
	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
	mov	[edi+ITERATION+1], al	; write pixel
%%skip5:	dec	ecx
	jz	near _none_to_do
%endmacro

; 2 pixels per expansion, 2^(NBITS-1) expansions: room for a full span, although
; at most 2^NBITS-1 pixels are ever left over.
%rep (1 << (NBITS-1))
	;local	skip4, skip5
    repproc3
%assign ITERATION  ITERATION + 2

%endrep

; Should never get here!!!!
	int	3
	jmp	_none_to_do

; ----------------------------------------- End of LeftOver Pixels ------------------------------------------

; --BUGGY NEW--NewDoEndPixels:
; --BUGGY NEW--	mov	eax, num_left_over
; --BUGGY NEW--	and	num_left_over, 3
; --BUGGY NEW--	shr	eax, 2
; --BUGGY NEW--	je	NDEP_1
; --BUGGY NEW-- mov	_loop_count, eax
; --BUGGY NEW--
; --BUGGY NEW--; do 4 pixels per hunk, not 16, so div deltas by 4 (16/4=4)
; --BUGGY NEW-- shr DU1,2
; --BUGGY NEW-- shr DV1,2
; --BUGGY NEW-- shr DZ1,2
; --BUGGY NEW--
; --BUGGY NEW--NDEP_TopOfLoop4:
; --BUGGY NEW--	add	ebx, DU1
; --BUGGY NEW--	add	ebp, DV1
; --BUGGY NEW--	add	ecx, DZ1
; --BUGGY NEW--	je	_div_0_abort	; would be dividing by 0, so abort
; --BUGGY NEW--
; --BUGGY NEW--; Done with ebx, ebp, ecx until next iteration
; --BUGGY NEW--	push	ebx
; --BUGGY NEW--	push	ecx
; --BUGGY NEW--	push	ebp
; --BUGGY NEW--	push	edi
; --BUGGY NEW--
; --BUGGY NEW--; Find fixed U1
; --BUGGY NEW--	mov	eax, ebx
; --BUGGY NEW--	mov	edx,ebx
; --BUGGY NEW--	shl	eax,(ZSHIFT-2)
; --BUGGY NEW--	sar	edx,32-(ZSHIFT-2)
; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
; --BUGGY NEW--	shl	eax, 16-(ZSHIFT-2)
; --BUGGY NEW--	mov	ebx, eax	; ebx = U1 until pop's
; --BUGGY NEW--
; --BUGGY NEW--; Find fixed V1
; --BUGGY NEW--	mov	eax, ebp
; --BUGGY NEW--	mov	edx, ebp
; --BUGGY NEW--	shl	eax,(ZSHIFT-2)
; --BUGGY NEW--	sar	edx,32-(ZSHIFT-2)
; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
; --BUGGY NEW--
; --BUGGY NEW--	mov	ecx, U0	; ecx = U0 until pop's
; --BUGGY NEW--	mov	edi, V0	; edi = V0 until pop's
; --BUGGY NEW--
; --BUGGY NEW--	shl	eax, 16-(ZSHIFT-2)
; --BUGGY NEW--	mov	ebp, eax	; ebp = V1 until pop's
; --BUGGY NEW--
; --BUGGY NEW--; Make ESI =  V0:U0 in 6:10,6:10 format
; --BUGGY NEW--	mov	eax, ecx
; --BUGGY NEW--	shr	eax, 6
; --BUGGY NEW--	mov	esi, edi
; --BUGGY NEW--	shl	esi, 10
; --BUGGY NEW--	mov	si, ax
; --BUGGY NEW--
; --BUGGY NEW--; Make EDX = DV:DU in 6:10,6:10 format
; --BUGGY NEW--	mov	eax, ebx
; --BUGGY NEW--	sub	eax, ecx
; --BUGGY NEW--	sar	eax, (NBITS-2)+6
; --BUGGY NEW--	mov	edx, ebp
; --BUGGY NEW--	sub	edx, edi
; --BUGGY NEW--	shl	edx, 10-(NBITS-2)	; EDX = V1-V0/ 4 in 6:10 int:frac
; --BUGGY NEW--	mov	dx, ax	; put delta u in low word
; --BUGGY NEW--
; --BUGGY NEW--; Save the U1 and V1 so we don't have to divide on the next iteration
; --BUGGY NEW--	mov	U0, ebx
; --BUGGY NEW--	mov	V0, ebp
; --BUGGY NEW--
; --BUGGY NEW--	pop	edi	; Restore EDI before using it
; --BUGGY NEW--
; --BUGGY NEW--; LIGHTING CODE
; --BUGGY NEW--	mov	ebx, _fx_l
; --BUGGY NEW--	mov	ebp, _fx_dl_dx
; --BUGGY NEW--
; --BUGGY NEW--;**	test	_Transparency_on,-1
; --BUGGY NEW--;**	je	NDEP_no_trans1
; --BUGGY NEW--
; --BUGGY NEW--        REPT 2
; --BUGGY NEW--	local	NDEP_skipa1, NDEP_skipa2
; --BUGGY NEW--
; --BUGGY NEW--	mov	eax, esi	; get u,v
; --BUGGY NEW--	shr	eax, 26	; shift out all but int(v)
; --BUGGY NEW--	shld	ax,si,6	; shift in u, shifting up v
; --BUGGY NEW--	add	esi, edx	; inc u,v
; --BUGGY NEW--	mov 	al, es:[eax]	; get pixel from source bitmap
; --BUGGY NEW--	cmp	al,255
; --BUGGY NEW--	je	NDEP_skipa1
; --BUGGY NEW--	mov	ah, bh	; form lighting table lookup value
; --BUGGY NEW--	add	ebx, ebp	; update lighting value
; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
; --BUGGY NEW--	mov	[edi],al
; --BUGGY NEW--NDEP_skipa1:
; --BUGGY NEW--	inc	edi
; --BUGGY NEW--
; --BUGGY NEW--; Do odd pixel
; --BUGGY NEW--	mov	eax, esi	; get u,v
; --BUGGY NEW--	shr	eax, 26	; shift out all but int(v)
; --BUGGY NEW--	shld	ax,si,6	; shift in u, shifting up v
; --BUGGY NEW--	add	esi, edx	; inc u,v
; --BUGGY NEW--	mov 	al, es:[eax]	; get pixel from source bitmap
; --BUGGY NEW--	cmp	al,255
; --BUGGY NEW--	je	NDEP_skipa2
; --BUGGY NEW--	mov	ah, bh	; form lighting table lookup value
; --BUGGY NEW--	add	ebx, ebp	; update lighting value
; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
; --BUGGY NEW--	mov	[edi],al
; --BUGGY NEW--NDEP_skipa2:
; --BUGGY NEW--	inc	edi
; --BUGGY NEW--
; --BUGGY NEW--        ENDM
; --BUGGY NEW--
; --BUGGY NEW--	mov	_fx_l, ebx
; --BUGGY NEW--	pop	ebp
; --BUGGY NEW--	pop	ecx
; --BUGGY NEW--	pop	ebx
; --BUGGY NEW-- dec	_loop_count
; --BUGGY NEW--	jnz	NDEP_TopOfLoop4
; --BUGGY NEW--
; --BUGGY NEW--	test	num_left_over, -1
; --BUGGY NEW--	je	_none_to_do
; --BUGGY NEW--
; --BUGGY NEW--NDEP_1:
; --BUGGY NEW--	mov	esi,ebx
; --BUGGY NEW--
; --BUGGY NEW--	align	4
; --BUGGY NEW--NDEP_loop:
; --BUGGY NEW--
; --BUGGY NEW--; compute v coordinate
; --BUGGY NEW--	mov	eax, ebp	; get v
; --BUGGY NEW--	mov	edx, eax
; --BUGGY NEW--	sar	edx, 31
; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
; --BUGGY NEW--
; --BUGGY NEW--	and	eax,3fh	; mask with height-1
; --BUGGY NEW--	mov	ebx,eax
; --BUGGY NEW--
; --BUGGY NEW--; compute u coordinate
; --BUGGY NEW--	mov	eax, 	esi	; get u
; --BUGGY NEW--	mov	edx, eax
; --BUGGY NEW--	sar	edx, 31
; --BUGGY NEW--	idiv	ecx	; eax = (u/z)
; --BUGGY NEW--
; --BUGGY NEW--	shl 	eax,26
; --BUGGY NEW--	shld 	ebx,eax,6	; esi = v*64+u
; --BUGGY NEW--
; --BUGGY NEW--; read 1  pixel
; --BUGGY NEW--	xor	eax, eax
; --BUGGY NEW--	mov	al, es:[ebx]	; get pixel from source bitmap
; --BUGGY NEW--
; --BUGGY NEW--; lighting code
; --BUGGY NEW--	mov	ebx, _fx_l	; get temp copy of lighting value
; --BUGGY NEW--	mov	ah, bh	; get lighting level
; --BUGGY NEW--	add	ebx, _fx_dl_dx	; update lighting value
; --BUGGY NEW--	mov	_fx_l, ebx	; save temp copy of lighting value
; --BUGGY NEW--
; --BUGGY NEW--; transparency check
; --BUGGY NEW--	cmp	al,255
; --BUGGY NEW--	je	NDEP_skip2	; this pixel is transparent, so don't write it (or light it)
; --BUGGY NEW--
; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat pixel thru lighting tables
; --BUGGY NEW--
; --BUGGY NEW--; write 1 pixel
; --BUGGY NEW--	mov	[edi],al
; --BUGGY NEW--NDEP_skip2:	inc	edi
; --BUGGY NEW--
; --BUGGY NEW--; update deltas
; --BUGGY NEW--	add	ebp,_fx_dv_dx
; --BUGGY NEW--	add	esi,_fx_du_dx
; --BUGGY NEW--	add	ecx,_fx_dz_dx
; --BUGGY NEW--	je	_div_0_abort	; would be dividing by 0, so abort
; --BUGGY NEW--
; --BUGGY NEW--	dec	num_left_over
; --BUGGY NEW--	jne	NDEP_loop
; --BUGGY NEW--
; --BUGGY NEW--	jmp	_none_to_do

; Exact tail for the lit fast path: draws the [num_left_over] remaining pixels
; with a pair of divides per pixel, so the end of the scanline stays correct
; even when z changes quickly.
; In: ebx = u, ebp = v, ecx = z, edi = destination, [num_left_over] > 0.
NewDoEndPixels:
	mov	esi, ebx		; u is carried in esi while ebx is scratch

	align	4
NDEP_loop:

; texel row = (v / z) & 63
	mov	eax, ebp
	cdq				; edx:eax = sign-extended v
	idiv	ecx
	and	eax, 3fh
	mov	ebx, eax

; texel column = low 6 bits of (u / z); ebx becomes row*64 + column
	mov	eax, esi
	cdq				; edx:eax = sign-extended u
	idiv	ecx
	shl	eax, 26
	shld	ebx, eax, 6

; fetch the texel, zero-extended
	add	ebx, [_pixptr]
	movzx	eax, byte [ebx]

; the light value is stepped for every pixel, drawn or not
	mov	ebx, [_fx_l]
	mov	ah, bh			; light level goes into the table index
	add	ebx, [_fx_dl_dx]
	mov	[_fx_l], ebx

; colour 255 is transparent: neither lit nor written
	cmp	al, 255
	je	NDEP_skip2
	mov	al, [_gr_fade_table+eax]
	mov	[edi], al
NDEP_skip2:
	inc	edi

; step the interpolants; z goes last because its zero flag is tested
	add	esi, [_fx_du_dx]
	add	ebp, [_fx_dv_dx]
	add	ecx, [_fx_dz_dx]
	je	near _div_0_abort	; next divide would be by zero

	dec	dword [num_left_over]
	jne	NDEP_loop

	jmp	_none_to_do

; ==================================================== No Lighting Code ======================================================
; Unlit fast path, phase 1.  Entered with ebx = u, ebp = v, ecx = z and
; edi = destination.  Pixels are drawn one at a time, with exact divides,
; until edi is a multiple of 4; control then moves on to
; DwordAligned1_nolight.  If the scanline runs out first, the routine exits.
global _tmap_loop_fast_nolight
_tmap_loop_fast_nolight:
	mov	esi, ebx		; u is carried in esi while ebx is scratch

	align	4
NotDwordAligned1_nolight:
	test	edi, 11b
	jz	DwordAligned1_nolight	; destination is dword aligned: switch to spans

; texel row = (v / z) & 63
	mov	eax, ebp
	cdq				; edx:eax = sign-extended v
	idiv	ecx
	and	eax, 3fh
	mov	ebx, eax

; texel column = low 6 bits of (u / z); ebx becomes row*64 + column
	mov	eax, esi
	cdq				; edx:eax = sign-extended u
	idiv	ecx
	shl	eax, 26
	shld	ebx, eax, 6

; fetch the texel; only al is needed since there is no table lookup
	add	ebx, [_pixptr]
	mov	al, [ebx]

; colour 255 is transparent: leave the destination byte alone
	cmp	al, 255
	je	skip6
	mov	[edi], al
skip6:
	inc	edi

; step the interpolants; z goes last because its zero flag is tested
	add	esi, [_fx_du_dx]
	add	ebp, [_fx_dv_dx]
	add	ecx, [_fx_dz_dx]
	je	near _div_0_abort	; next divide would be by zero

	dec	dword [_loop_count]
	jns	NotDwordAligned1_nolight
	jmp	_none_to_do

; Unlit fast path, phase 2: the destination is now dword aligned.  Splits the
; remaining pixel count into whole 2^NBITS-pixel spans plus a remainder,
; computes the texture coordinates of the first span start, and pre-scales
; the per-pixel deltas to per-span deltas.
; In:  esi = u, ebp = v, ecx = z, [_loop_count] = remaining pixels - 1
; Out: ebx = u, [_loop_count] = number of spans, [num_left_over] = remainder,
;      [U0]/[V0] = u/z and v/z at the span start, [DU1]/[DV1]/[DZ1] = span deltas
DwordAligned1_nolight:
	mov	ebx,esi		; u back into ebx

	mov	eax, [_loop_count]
	inc	eax		; eax = number of pixels still to draw
	mov	[num_left_over], eax	; provisional; the whole spans are subtracted below
	shr	eax, NBITS	; eax = number of whole spans

	test	eax, -1
        je      near tmap_loop       ; no 2^NBITS chunks, do divide/pixel for whole scanline
				; (_loop_count is still the original per-pixel count on that path)

	mov	[_loop_count], eax	; _loop_count = pixels / NPIXS
	shl	eax, NBITS
	sub	[num_left_over], eax	; num_left_over = pixels - spans * 2^NBITS

; compute initial v coordinate
	mov	eax,ebp	; get v
	;PDIV
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT

	mov	[V0], eax

; compute initial u coordinate
	mov	eax,ebx	; get u
	;PDIV
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT

	mov	[U0], eax

; Set deltas to NPIXS pixel increments
	mov	eax, [_fx_du_dx]
	shl	eax, NBITS
	mov	[DU1], eax
	mov	eax, [_fx_dv_dx]
	shl	eax, NBITS
	mov	[DV1], eax
	mov	eax, [_fx_dz_dx]
	shl	eax, NBITS
	mov	[DZ1], eax

	align	4
; ----------------------------------------------------------------------------
; Unlit fast path, phase 3: draw [_loop_count] spans of 2^NBITS pixels each.
; Per span: step u, v, z by the span deltas, divide once to get the texture
; coordinates at the END of the span (U1, V1), then interpolate linearly from
; the saved start coordinates (U0, V0) across the span.
;
; In/out across iterations:
;	ebx = u, ebp = v, ecx = z, edi = destination pointer
;	[U0], [V0] = u/z, v/z at the start of the next span
; Inside the span body (between the pushes and the pops):
;	esi = packed texture position, v in the high word and u in the low word,
;	      each as 6 integer bits : 10 fraction bits
;	edx = packed per-pixel step in the same layout
;	ecx = four-pixel accumulator
; ----------------------------------------------------------------------------
TopOfLoop4_nolight:
	add	ebx, [DU1]
	add	ebp, [DV1]
	add	ecx, [DZ1]
        je      near _div_0_abort	; would be dividing by 0 (nothing pushed yet)

; Done with ebx, ebp, ecx until next iteration
	push	ebx
	push	ecx
	push	ebp
	push	edi

; Find fixed U1
	mov	eax, ebx
	;PDIV
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT

	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1
	mov	eax, ebp
	;PDIV
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT

	mov	ebp, eax	; ebp = V1 until pop's

	mov	ecx, [U0]	; ecx = U0 until pop's
	mov	edi, [V0]	; edi = V0 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax

; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6	; (U1-U0) / 2^NBITS, as 6:10
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0) / 2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

; Save the U1 and V1 so we don't have to divide on the next iteration
	mov	[U0], ebx
	mov	[V0], ebp

	pop	edi	; Restore EDI before using it

; repproc4: fetch four texels into ecx (pixel 0 in the lowest byte, pixel 3
; in the highest, i.e. memory order) and write them at [edi .. edi+3].
; With _Transparency_on clear the four bytes go out as one dword store.
; With it set, bytes equal to 255 are skipped individually, and the whole
; group is skipped at once when all four are 255 (ecx = -1).
%macro repproc4 0
; Do 1 pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
	add	esi, edx	; inc u,v
        add     eax,[_pixptr]
        mov     cl, [eax]    ; load into buffer register

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
        add     eax,[_pixptr]
        mov     ch, [eax]    ; load into buffer register
	add	esi, edx	; inc u,v
	ror	ecx, 16	; move to next dest pixel

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
        add     eax,[_pixptr]
        mov     cl, [eax]    ; load into buffer register
	add	esi, edx	; inc u,v

	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
        add     eax,[_pixptr]
        mov     ch, [eax]    ; load into buffer register
	add	esi, edx	; inc u,v
	ror	ecx, 16 ;-- can get rid of this, just write in different order below -- 	; move to next dest pixel

	test	dword [_Transparency_on],-1
	je	%%no_trans2
	cmp	ecx,-1	; all four texels transparent?
	je	%%skip7

	cmp	cl,255
	je	%%skip1q
	mov	[edi],cl
%%skip1q:

	cmp	ch,255
	je	%%skip2q
	mov	[edi+1],ch
%%skip2q:
	ror	ecx,16	; bring pixels 2 and 3 down into cl / ch

	cmp	cl,255
        je      %%skip3q
	mov	[edi+2],cl
%%skip3q:


	cmp	ch,255
	je	%%skip4q
	mov	[edi+3],ch
%%skip4q:

	jmp	%%skip7
%%no_trans2:
	mov 	[edi],ecx	; Draw 4 pixels to display
%%skip7:	add 	edi,4
%endmacro

; 4 pixels per expansion, 2^(NBITS-2) expansions = one full span
%rep (1 << (NBITS-2))
	;local	skip7, no_trans2, skip1q, skip2q, skip3q, skip4q
    repproc4

%endrep

	pop	ebp	; restore v
	pop	ecx	; restore z
	pop	ebx	; restore u
	dec	dword [_loop_count]
        jnz     near TopOfLoop4_nolight

EndOfLoop4_nolight:

	test	dword [num_left_over], -1
        je      near _none_to_do	; scanline was a whole number of spans

; Unlit fast path, phase 4: draw the [num_left_over] pixels (1 .. 2^NBITS-1)
; that remain after the last whole span.  u, v, z are extrapolated one full
; span ahead, divided once, and the texture coordinates are interpolated
; linearly from [U0]/[V0]; only the pixels that are left get drawn.
; In: ebx = u, ebp = v, ecx = z at the end of the last span, edi = destination.
DoEndPixels_nolight:
	add	ebx, [DU1]
	add	ebp, [DV1]
	add	ecx, [DZ1]
        je      near _div_0_abort
	push	edi	; use edi as a temporary variable

; Clamp the divisor from below, exactly as the lit tail does.  DZ1 is the z
; step for a whole span, so the extrapolated z can come out tiny or negative
; even though only a few pixels remain.  With z >= 2^(ZSHIFT+1) the quotient
; of a 32-bit value shifted left by ZSHIFT always fits in 32 bits, so the
; idivs below cannot raise a divide error.
	cmp	ecx, 1 << (ZSHIFT+1)
	jg	ecx_ok_nolight
	mov	ecx, 1 << (ZSHIFT+1)
ecx_ok_nolight:

; Find fixed U1 (inline PDIV)
	mov	eax, ebx
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (u/z)
	shl	eax, 16-ZSHIFT
	mov	ebx, eax	; ebx = U1 until pop's

; Find fixed V1 (inline PDIV)
	mov	eax, ebp
	mov	edx,eax
	shl	eax,ZSHIFT
	sar	edx,32-ZSHIFT
	idiv	ecx	; eax = (v/z)
	shl	eax, 16-ZSHIFT
	mov	ebp, eax	; ebp = V1 until pop's

	mov	ecx, [U0]	; ecx = U0 until pop's
	mov	edi, [V0]	; edi = V0 until pop's

; Make ESI =  V0:U0 in 6:10,6:10 format
	mov	eax, ecx
	shr	eax, 6
	mov	esi, edi
	shl	esi, 10
	mov	si, ax

; Make EDX = DV:DU in 6:10,6:10 format
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6	; (U1-U0) / 2^NBITS, as 6:10
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS	; EDX = (V1-V0) / 2^NBITS in 6:10 int:frac
	mov	dx, ax	; put delta u in low word

	pop	edi	; Restore EDI before using it

	mov	ecx, [num_left_over]	; ecx = pixels still to draw (non-zero here)

; ITERATION is the destination offset of the pixel drawn by each expansion.
%assign ITERATION 0

; repproc5: draw one pixel at [edi+ITERATION] unless it is transparent (255).
; edi is not advanced; ecx counts the remaining pixels and the routine exits
; through _none_to_do as soon as it reaches zero.
%macro repproc5 0
; Do 1 pixel
	mov	eax, esi	; get u,v
	shr	eax, 26	; shift out all but int(v)
	shld	ax,si,6	; shift in u, shifting up v
        add     eax,[_pixptr]
        movzx   eax, byte [eax]    ; load into buffer register
	add	esi, edx	; inc u,v
	cmp	al,255
	je	%%skip8
	mov	[edi+ITERATION], al	; write pixel
%%skip8:	dec	ecx
        jz      near _none_to_do
%endmacro

; 1 pixel per expansion, 2^NBITS expansions: room for a full span, although
; at most 2^NBITS-1 pixels are ever left over.
%rep (1 << NBITS)
	;local	skip8
	repproc5
%assign ITERATION  ITERATION + 1
%endrep

; Should never get here!!!!!
	int	3
	jmp	_none_to_do