dxx-rebirth/texmap/tmapppc.a

1100 lines
31 KiB
Plaintext
Executable file

# -- tmap_per.1 = 8.88, 6/27/95
# -- tmap_per.3 = 8.99, 6/27/95
# -- tmap_per.4 = 9.10, 6/28/95
# -- Make more use of r0 and other volatile registers.
# -- Use fewer registers and only save/restore what's necessary.
# -- Use stmw instead of all the stws in epilog, prolog.
# -- mtctr loads "magical" counter register, then use bdnz _label
# -- Superopt from Freeware guys, ftp from prep.ai.mit.edu, look in /pub/gnu or something like that.
# -- At DoEndPixels, you write up to 15 slow pixels. Do chunks of 4.
# MakeFunction -- added by allender 6/21/95, taken from the MPW PPC assembler examples.
# Invoke it at the start of a routine: it exports the routine's symbols, emits the
# two-word descriptor (code address, TOC base) and opens the routine's code section
# with the FUNCTION directive used for debug information.
        MACRO
        MakeFunction &fnName
                EXPORT   &fnName[DS]            ; descriptor symbol
                EXPORT   .&fnName[PR]           ; code symbol
                TC       &fnName[TC], &fnName[DS]
                CSECT    &fnName[DS]
                DC.L     .&fnName[PR]           ; descriptor word 0: address of the code
                DC.L     TOC[tc0]               ; descriptor word 1: TOC base
                FUNCTION .&fnName[PR]
                CSECT    .&fnName[PR]           ; code after the macro call assembles here
        ENDM
# -------------------------------------------------------------
# ppc_pix_l -- plot one pixel: lighting, no transparency.
# Reads:   U0/V0 (texture coordinates), edx/esi (per-pixel du/dv), es (texture base),
#          tr0/tr1 (lighting value / per-pixel lighting delta), r3 (fade table),
#          edi (destination pointer).
# Updates: U0, V0, tr0, edi.  eax and tr2 are scratch.
        macro
        ppc_pix_l
        mr      eax, V0                 # index starts out as v
        rlwimi  eax, U0, 26, 16, 21     # insert six bits of u directly below the v field
        rlwinm  eax, eax, 22, 20, 31    # shift down by 10, keep the 12-bit v:u texel index
        lbzx    tr2, es, eax            # tr2 = texel
        add     U0, U0, edx             # u += du
        rlwimi  tr2, tr0, 24, 16, 23    # lighting byte goes just above the texel byte
        add     V0, V0, esi             # v += dv
        lbzx    eax, r3, tr2            # fade-table lookup of lighting:texel
        stb     eax, 0(edi)             # write the pixel...
        addi    edi, edi, 1             # ...and step the destination
        add     tr0, tr0, tr1           # lighting += lighting delta
        endm
# -------------------------------------------------------------
# ppc_pix_lt -- plot one pixel: lighting and transparency.
# Reads:   U0/V0, edx/esi (per-pixel du/dv), es (texture base), tr0/tr1 (lighting / delta),
#          r3 (fade table), edi (destination pointer).
# A texel equal to 0xff leaves the destination byte untouched; U0, V0, tr0 and edi
# advance either way.  eax and tr2 are scratch, cr0 is overwritten.
        macro
        ppc_pix_lt
        mr      eax, V0                 # index starts out as v
        rlwimi  eax, U0, 26, 16, 21     # insert six bits of u directly below the v field
        rlwinm  eax, eax, 22, 20, 31    # shift down by 10, keep the 12-bit v:u texel index
        lbzx    tr2, es, eax            # tr2 = texel
        add     U0, U0, edx             # u += du
        cmpwi   cr0, tr2, 0xff          # 0xff marks a transparent texel
        add     V0, V0, esi             # v += dv
        beq     @skip
        rlwimi  tr2, tr0, 24, 16, 23    # lighting byte goes just above the texel byte
        lbzx    eax, r3, tr2            # fade-table lookup of lighting:texel
        stb     eax, 0(edi)
@skip:  addi    edi, edi, 1             # step the destination whether or not we stored
        add     tr0, tr0, tr1           # lighting += lighting delta
        endm
# -------------------------------------------------------------
# ppc_pix_t -- plot one pixel: no lighting, transparency.
# Reads:   U0/V0 (texture coordinates), edx/esi (per-pixel du/dv), es (texture base),
#          edi (destination pointer).
# A texel equal to 0xff leaves the destination byte untouched; U0, V0 and edi advance
# either way.  eax and tr2 are scratch, cr0 is overwritten.
#
# The lit macros merge a lighting byte from tr0 into tr2 before storing.  That merge
# used to be copied here as well, but this macro stores only the low byte of tr2 and
# tr0 holds no lighting value on the no-lighting path, so the instruction did nothing
# and has been dropped.
        macro
        ppc_pix_t
        mr      eax, V0                 # index starts out as v
        rlwimi  eax, U0, 26, 16, 21     # insert six bits of u directly below the v field
        rlwinm  eax, eax, 22, 20, 31    # shift down by 10, keep the 12-bit v:u texel index
        lbzx    tr2, eax, es            # tr2 = texel
        add     U0, U0, edx             # u += du
        cmpwi   cr0, tr2, 0xff          # 0xff marks a transparent texel
        add     V0, V0, esi             # v += dv
        beq     @skip
        stb     tr2, 0(edi)             # unlit: the texel is the pixel
@skip:  addi    edi, edi, 1             # step the destination whether or not we stored
        endm
# -------------------------------------------------------------
# Plot one pixel
# Lighting & Transparency.
# Decrements r_num_left_over. If goes 0, branches to _none_to_do.
# Added by allender 6/21/95 -- globals are reached through the TOC, so each one needs
# a TOC entry.  We could probably help ourselves immensely if we passed parameters
# where possible.
        toc
# drawing surface / clip window
                tc      gr_fade_table[TC], gr_fade_table
                tc      write_buffer[TC], write_buffer
                tc      window_left[TC], window_left
                tc      window_right[TC], window_right
                tc      window_top[TC], window_top
                tc      window_bottom[TC], window_bottom
                tc      window_width[TC], window_width
                tc      bytes_per_row[TC], bytes_per_row
                tc      window_height[TC], window_height
                tc      y_pointers[TC], y_pointers
# mapper mode flags and tables
                tc      per2_flag[TC], per2_flag
                tc      tmap_flat_cthru_table[TC], tmap_flat_cthru_table
                tc      tmap_flat_color[TC], tmap_flat_color
                tc      tmap_flat_shade_value[TC], tmap_flat_shade_value
                tc      dither_intensity_lighting[TC], dither_intensity_lighting
                tc      Lighting_on[TC], Lighting_on
                tc      pixel_data_selector[TC], pixel_data_selector
                tc      gr_fade_table_selector[TC], gr_fade_table_selector
                tc      Transparency_on[TC], Transparency_on
# per-scanline inputs
                tc      fx_u[TC], fx_u
                tc      fx_v[TC], fx_v
                tc      fx_z[TC], fx_z
                tc      fx_l[TC], fx_l
                tc      fx_du_dx[TC], fx_du_dx
                tc      fx_dv_dx[TC], fx_dv_dx
                tc      fx_dz_dx[TC], fx_dz_dx
                tc      fx_dl_dx[TC], fx_dl_dx
                tc      fx_y[TC], fx_y
                tc      fx_xleft[TC], fx_xleft
                tc      fx_xright[TC], fx_xright
                tc      pixptr[TC], pixptr
# Added by allender 6/21/95 -- named constants describing the stack frame that the
# prologue pushes.  With the values below spaceToSave = 24 + 32 + 0 + 4*19 + 0 + 12 = 144.
linkageArea:            set     24      ; constant comes from the PowerPC Runtime Architecture Document
CalleesParams:          set     32      ; always leave space for GPRs 3-10
CalleesLocalVars:       set     0       ; none
numGPRs:                set     19      ; number of saved GPRs (r13-r31, all of which this routine uses)
numFPRs:                set     0       ; number of saved FPRs (none used)
stack_pad:              set     12      ; padding to maintain quadword alignment
spaceToSave:            set     linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs + stack_pad
# Declare asm_tmap_scanline_per: the macro exports the symbols, emits the descriptor
# and switches to the routine's code section, so the code that follows belongs to it.
MakeFunction asm_tmap_scanline_per
align 4
# include 'tmap_inc.a'
import gr_fade_table
import write_buffer
import window_left
import window_right
import window_top
import window_bottom
import window_width
import bytes_per_row
import window_height
import y_pointers
#_lighting_tables equ _gr_fade_table
#write_buffer equ _write_buffer
#max_window_width equ 320
#num_iters = max_window_width
#
# if num_iters and 1
#num_iters = num_iters + 1
# endif
import per2_flag
import tmap_flat_cthru_table
import tmap_flat_color
import tmap_flat_shade_value
import dither_intensity_lighting
import Lighting_on
import pixel_data_selector
import gr_fade_table_selector
import Transparency_on
import fx_u
import fx_v
import fx_z
import fx_l
import fx_du_dx
import fx_dv_dx
import fx_dz_dx
import fx_dl_dx
import fx_y
import fx_xleft
import fx_xright
import pixptr
# include ('stdhdr.s')
#MWA -- not used num_left_over .long 0
# The routine takes no parameters, so the parameter registers are free for our own use.
r_num_left_over:        equ     r4      # pixels left over after the 16-pixel groups
# PowerPC equates for 80x86 compatibility (names follow the x86 original)
eax:                    equ     r13     # scratch: texel index / pixel value
ebx:                    equ     r14     # clamped xleft during setup, then u
ecx:                    equ     r15     # z
edx:                    equ     r16     # per-pixel du inside the 16-pixel loops
ebp:                    equ     r17     # v
esi:                    equ     r18     # lighting value in the one-divide-per-pixel loops,
                                        # per-pixel dv inside the 16-pixel loops
edi:                    equ     r19     # destination pixel pointer
es:                     equ     r20     # base address of the source texture (pixptr)
tr0:                    equ     r21     # temporaries; in the lit 16-pixel loop
tr1:                    equ     r22     #   tr0 = lighting value, tr1 = lighting delta
tr2:                    equ     r23
r_loop_count:           equ     r24     # loop counter (pixels, or 16-pixel groups)
U0:                     equ     r25     # u at the start of the current 16-pixel group
U1:                     equ     r26     # u at the end of the current 16-pixel group
V0:                     equ     r27     # v at the start of the current 16-pixel group
V1:                     equ     r28     # v at the end of the current 16-pixel group
DU1:                    equ     r29     # u step (per pixel, later per 16 pixels)
DV1:                    equ     r30     # v step (per pixel, later per 16 pixels)
DZ1:                    equ     r31     # z step (per pixel, later per 16 pixels)
#MWA csect texmap
# --------------------------------------------------------------------------------------------------
# Enter:
# _xleft fixed point left x coordinate
# _xright fixed point right x coordinate
# _y fixed point y coordinate
# _pixptr address of source pixel map
# _u fixed point initial u coordinate
# _v fixed point initial v coordinate
# _z fixed point initial z coordinate
# _du_dx fixed point du/dx
# _dv_dx fixed point dv/dx
# _dz_dx fixed point dz/dx
# for (x = (int) xleft; x <= (int) xright; x++) {
#     _setcolor(read_pixel_from_tmap(srcb, ((int) (u/z)) & 63, ((int) (v/z)) & 63));
#     _setpixel(x, y);
#
#     u += du_dx;
#     v += dv_dx;
#     z += dz_dx;
# }
# ---------------------------------- PROLOGUE ----------------------------------
# Called routine's responsibilities:
#   1. Save the Link Register.  (Probably not needed here -- the routine makes no calls.)
#   2. Save any nonvolatile Condition Register fields used.  (Only cr0 is used.)
#   3. Save the nonvolatile floating-point registers used.  (None as of 6/20/95.)
#   4. Save the nonvolatile general-purpose registers used (r13-r31).
#   5. Store the back chain and drop the stack pointer by the size of the frame.
        align   4
#asm_tmap_scanline_per_:
        mflr    r0                      ; get the link register...
        stw     r0, 8(SP)               ; ...and park it in the caller's linkage area
        stmw    r13, -76(SP)            ; r13..r31 into consecutive words: r13 at -76(SP) up to r31 at -4(SP)
        stwu    SP, -spaceToSave(SP)    ; push the frame, skipping the space just written
                                        ; and whatever the caller might have saved
#---------------------------- setup for loop ---------------------------------
# When this section branches away:
#   es           = pixptr (base of the source texture)
#   edi          = y_pointers[fx_y] + max(fx_xleft, 0) + write_buffer  (first destination byte)
#   r_loop_count = fx_xright - max(fx_xleft, 0); a negative count skips the scanline
#   ebx/ebp/ecx  = fx_u / fx_v / fx_z
#   DU1/DV1/DZ1  = fx_du_dx / fx_dv_dx / fx_dz_dx
# NOTE: fx_xright and fx_xleft changed from fix to int by mk on 12/01/94.
        lwz     es, pixptr[TC](RTOC)
        lwz     es, 0(es)                       # es = pixptr
# edi = address of the first pixel to modify
        lwz     edi, fx_y[TC](RTOC)
        lwz     edi, 0(edi)                     # edi = fx_y
        slwi    edi, edi, 2                     # scale to a word index
        lwz     r3, y_pointers[TC](RTOC)
        lwzx    edi, r3, edi                    # edi = y_pointers[fx_y]
        lwz     ebx, fx_xleft[TC](RTOC)
        lwz     ebx, 0(ebx)                     # ebx = fx_xleft
        cmpwi   cr0, ebx, 0
        bgt     ebx_ok                          # positive: use as is
        li      ebx, 0                          # zero or negative: start at column 0
ebx_ok:
        lwz     tr0, write_buffer[TC](RTOC)
        add     edi, edi, ebx                   # + starting column
        lwz     tr0, 0(tr0)
        add     edi, edi, tr0                   # + write_buffer
# r_loop_count = number of iterations
        lwz     eax, fx_xright[TC](RTOC)
        lwz     eax, 0(eax)                     # eax = fx_xright
        sub.    eax, eax, ebx                   # eax = xright - xleft, cr0 = its sign
        mr      r_loop_count, eax
        blt     _none_to_do                     # right edge left of the left edge: nothing to draw
# The x86 original shifted fx_l and fx_dl_dx right by 8 here (rounding negative deltas
# towards 0) so the integer part could be read from %bh.  That is not needed on the
# PowerPC and we could use the precision, so both stay in full fixed point.
# Initial u, v, z and their per-pixel deltas
        lwz     ebx, fx_u[TC](RTOC)
        lwz     ebx, 0(ebx)                     # ebx = fx_u
        lwz     ebp, fx_v[TC](RTOC)
        lwz     ebp, 0(ebp)                     # ebp = fx_v
        lwz     ecx, fx_z[TC](RTOC)
        lwz     ecx, 0(ecx)                     # ecx = fx_z
        lwz     tr0, fx_dv_dx[TC](RTOC)
        lwz     DV1, 0(tr0)
        lwz     tr0, fx_du_dx[TC](RTOC)
        lwz     DU1, 0(tr0)
        lwz     tr0, fx_dz_dx[TC](RTOC)
        lwz     DZ1, 0(tr0)
# Choose the inner loop: per2_flag clear -> one divide per pixel; otherwise the
# 16-pixels-per-divide loops, with or without lighting.
        lwz     tr0, per2_flag[TC](RTOC)
        lwz     tr0, 0(tr0)
        cmpwi   cr0, tr0, 0                     # per2_flag == 0 ?
        lwz     tr0, Lighting_on[TC](RTOC)      # fetch the next address early; cr0 is untouched
        beq     tmap_slow
        lwz     tr0, 0(tr0)
        cmpwi   cr0, tr0, 0                     # Lighting_on == 0 ?
        beq     tmap_loop_fast_nolight
        b       tmap_loop_fast
#================ PERSPECTIVE TEXTURE MAP INNER LOOPS ========================
#
# Register usage in the one-divide-per-pixel loops:
#   ebx  u                      ebp  v                  ecx  z
#   es   source texture base    edi  destination pixel pointer
#   esi  lighting value         tr2  lighting delta     r3   fade table
#   tr0, tr1  scratch (quotients, texel, pixel)
#   r_loop_count  pixels to plot minus one (each loop runs r_loop_count+1 times)
#-------------------- NORMAL PERSPECTIVE TEXTURE MAP LOOP -----------------
# tmap_slow_from_fast: entered from DwordAligned1 when fewer than 16 pixels remain.
# r_num_left_over then holds the number of pixels still to plot (1..15).  The loops
# below plot r_loop_count+1 pixels, so the count has to be one less than that; copying
# r_num_left_over unchanged drew one pixel past the right edge of the scanline.
# esi, tr2 and r3 were set up by tmap_loop_fast and esi has already been advanced over
# the alignment pixels, so we join tmap_slow after its loads rather than restarting
# the lighting from fx_l.
tmap_slow_from_fast:
        subi    r_loop_count, r_num_left_over, 1
        b       tmap_slow_lit
tmap_slow:
        lwz     tr0, Lighting_on[TC](RTOC)      # test _Lighting_on, -1
        lwzx    tr0, r0, tr0
        mr.     tr0, tr0
        beq     tmap_slow_NoLight               # je NoLight1
        lwz     tr0, fx_l[TC](RTOC)
        lwzx    esi, r0, tr0                    # use esi for lighting value
        lwz     tr0, fx_dl_dx[TC](RTOC)
        lwzx    tr2, r0, tr0                    # tr2 is delta lighting value
        lwz     r3, gr_fade_table[TC](RTOC)     # r3 is fade table pointer
tmap_slow_lit:
        lwz     tr0, Transparency_on[TC](RTOC)
        lwzx    tr0, r0, tr0
        mr.     tr0, tr0
        beq     tmap_slow_no_transparency
# ---------- Yes transparency. Yes lighting. ----------
# NOTE(review): the 0xff test below is made on the pixel AFTER the fade-table lookup,
# whereas ppc_pix_lt tests the raw texel -- confirm the fade table maps only 0xff to 0xff.
tmap_loop0:
        divw    tr0, ebp, ecx                   # compute v coordinate
        divw    tr1, ebx, ecx                   # compute u coordinate
        andi.   tr1, tr1, 0x3f                  # u wrapped to 0..63
        rlwimi  tr1, tr0, 6, 20, 25             # v (0..63) goes in the six bits above u
        lbzx    tr1, es, tr1                    # get pixel from source bitmap
        rlwimi  tr1, esi, 24, 16, 23            # lighting byte goes just above the texel byte
        lbzx    tr1, r3, tr1                    # xlate lighting:pixel through lighting tables
        add     esi, esi, tr2                   # update lighting value
        cmpwi   cr0, tr1, 0xff                  # check for transparency
        beq     skip1
        stb     tr1, 0(edi)
skip1:  addi    edi, edi, 1
        add.    ecx, ecx, DZ1                   # z += dz; cr0 is tested by the beq below
        add     ebp, ebp, DV1                   # v += dv
        add     ebx, ebx, DU1                   # u += du
        beq     _div_0_abort                    # would be dividing by 0, so abort
        addic.  r_loop_count, r_loop_count, -1
        bge     tmap_loop0                      # falls through to _none_to_do when done
_none_to_do:
# added by allender 6/21/95 -- from MPW example and PPC developers book
; EPILOGUE - return sequence; every exit from the routine comes through here
        addic   SP, SP, spaceToSave     ; pop the frame
        lmw     r13, -76(SP)            ; reload r13..r31 from the words the prologue filled
                                        ; (r13 at -76(SP) up to r31 at -4(SP))
        lwz     r0, 8(SP)
        mtlr    r0                      ; restore the link register
        blr                             ; return through it
# We detected a z=0 condition, which seems pretty bogus, don't you think?
# So, we abort, but maybe we want to know about it.
_div_0_abort:
        b       _none_to_do
# ---------- No transparency. Yes lighting. ----------
# One divide pair per pixel; every pixel is stored.  Expects esi = lighting value,
# tr2 = lighting delta, r3 = fade table.  Plots r_loop_count+1 pixels.
tmap_slow_no_transparency:
tmap_loop0a:
        divw    tr0, ebp, ecx                   # tr0 = v / z
        divw    tr1, ebx, ecx                   # tr1 = u / z
        andi.   tr1, tr1, 0x3f                  # u wrapped to 0..63
        rlwimi  tr1, tr0, 6, 20, 25             # pack v:u into the low 12 bits
        lbzx    tr1, es, tr1                    # read the source texel
        rlwimi  tr1, esi, 24, 16, 23            # lighting byte goes just above the texel byte
        lbzx    tr1, r3, tr1                    # fade-table lookup of lighting:texel
        add     esi, esi, tr2                   # lighting += lighting delta
        stb     tr1, 0(edi)
        addi    edi, edi, 1
        add     ebx, ebx, DU1                   # u += du
        add     ebp, ebp, DV1                   # v += dv
        add.    ecx, ecx, DZ1                   # z += dz; cr0 feeds the beq below
        beq     _div_0_abort                    # next divide would be by 0, so abort
        addic.  r_loop_count, r_loop_count, -1
        bge     tmap_loop0a
        b       _none_to_do
# ---------- Yes transparency. No lighting. ----------
# (Note: We don't know for sure there is transparency, but, except for debugging, if we
# aren't lighting, we _do_ have transparency.)
# One divide pair per pixel; texels equal to 0xff are skipped.  Plots r_loop_count+1 pixels.
tmap_slow_NoLight:
tmap_loop0_nolight:
        divw    tr0, ebp, ecx                   # tr0 = v / z
        divw    tr1, ebx, ecx                   # tr1 = u / z
        andi.   tr1, tr1, 0x3f                  # u wrapped to 0..63
        rlwimi  tr1, tr0, 6, 20, 25             # pack v:u into the low 12 bits
        lbzx    tr1, es, tr1                    # read the source texel
        cmpwi   cr0, tr1, 0xff                  # transparent?
        beq     skip1a
        stb     tr1, 0(edi)
skip1a: addi    edi, edi, 1
        add     ebx, ebx, DU1                   # u += du
        add     ebp, ebp, DV1                   # v += dv
        add.    ecx, ecx, DZ1                   # z += dz; cr0 feeds the beq below
        beq     _div_0_abort                    # next divide would be by 0, so abort
        addic.  r_loop_count, r_loop_count, -1
        bge     tmap_loop0_nolight
        b       _none_to_do
#-------------------------- PER/4 TMAPPER ----------------
#
# x = x1
# U0 = u/w; V0 = v/w;
# while ( 1 )
#     u += du_dx*4; v += dv_dx*4
#     U1 = u/w; V1 = v/w;
#     DUDX = (U1-U0)/4; DVDX = (V1-V0)/4;
#
#     // Pixel 0
#     pixels = texmap[V0*64+U0];
#     U0 += DUDX; V0 += DVDX
#     // Pixel 1
#     pixels = (pixels<<8)+texmap[V0*64+U0];
#     U0 += DUDX; V0 += DVDX
#     // Pixel 2
#     pixels = (pixels<<8)+texmap[V0*64+U0];
#     U0 += DUDX; V0 += DVDX
#     // Pixel 3
#     pixels = (pixels<<8)+texmap[V0*64+U0];
#
#     screen[x] = pixel
#     x += 4;
#     U0 = U1; V0 = V1
# Tuning constants for the multi-pixel-per-divide loops.
# Note: If you change NBITS, you must change the number of calls to the ppc_pix macros
# (and the iteration counts of the unrolled loops) to match.
NBITS:                  equ     4       # 2^NBITS pixels plotted per divide
NBITS_mask:             equ     15      # 2^NBITS-1
NBITS_shl_minus_2:      equ     4       # 2 ^ (NBITS-2)
ZSHIFT:                 equ     4       # precision used in PDIV macro
DIV_SHIFT:              equ     4       # Used to be 8...overflowed, smaller less likely to overflow
# -------------------------------------- Start of Getting Dword Aligned ----------------------------------------------
# tmap_loop_fast: lit path.  Plots single pixels, one divide pair each, until edi is
# 4-byte aligned, then continues at DwordAligned1.
#   ebx  u      ebp  v      ecx  z      es  texture base    edi  destination
#   esi  lighting value (fx_l)      tr2  lighting delta (fx_dl_dx)      r3  fade table
# Each pixel is lit with the current lighting value and the delta is added afterwards,
# as in the one-divide-per-pixel loops and the 16-pixel loop; adding it first lit
# every alignment pixel one step ahead of the rest of the scanline.
        export  tmap_loop_fast
tmap_loop_fast:
        lwz     esi, fx_l[TC](RTOC)
        lwz     r3, gr_fade_table[TC](RTOC)
        lwz     tr2, fx_dl_dx[TC](RTOC)
        lwzx    esi, r0, esi
        lwzx    tr2, r0, tr2
# This is a hack! If we allow zero pixels to be plotted for alignment, the code later hangs.
        andi.   tr0, edi, 3                     # DEBUG HACK!!  already aligned on entry?
        beq     skip_test                       # DEBUG HACK!!  then plot one pixel regardless
NotDwordAligned1:
        andi.   tr0, edi, 3                     # low two address bits clear?
        beq     DwordAligned1
skip_test:                                      # DEBUG HACK!!
        divw    tr0, ebp, ecx                   # tr0: v coordinate
        divw    tr1, ebx, ecx                   # tr1: u coordinate
        rlwimi  tr1, tr0, 6, 20, 25             # get v:u in low 12 bits, but garbage above
        andi.   tr1, tr1, 0xfff                 # preserve only 12 bit index
        lbzx    tr1, es, tr1                    # get pixel from source bitmap
        rlwimi  tr1, esi, 24, 16, 23            # lighting byte goes just above the texel byte
        add     esi, esi, tr2                   # fx_l += fx_dl_dx, after this pixel used it
        lbzx    tr0, tr1, r3                    # xlate lighting:pixel through lighting tables
        cmpwi   cr0, tr0, 0xff                  # transparent pixel?
        beq     skip2                           # yes, skip
        stb     tr0, 0(edi)
skip2:  addi    edi, edi, 1
# update u, v, z
        add.    ecx, ecx, DZ1                   # z += dz; cr0 feeds the beq below
        add     ebx, ebx, DU1                   # u += du
        add     ebp, ebp, DV1                   # v += dv
        beq     _div_0_abort                    # would be dividing by 0, so abort
        addic.  r_loop_count, r_loop_count, -1
        bge     NotDwordAligned1
        b       _none_to_do
# -------------------------------------- End of Getting Dword Aligned ----------------------------------------------
# DwordAligned1: edi is aligned.  Split the remaining pixels into 16-pixel groups
# (r_loop_count) plus a remainder (r_num_left_over), compute the starting U0/V0 and
# scale the u, v, z deltas to one group.
DwordAligned1:
        addi    r_loop_count, r_loop_count, 1           # count-1 -> number of pixels still to plot
        andi.   r_num_left_over, r_loop_count, NBITS_mask
        srwi.   r_loop_count, r_loop_count, NBITS       # number of whole 16-pixel groups
        beq     tmap_slow_from_fast                     # none: finish one pixel at a time
# compute initial u, v coordinates
        slwi    eax, ebp, DIV_SHIFT
        divw    V0, eax, ecx
        slwi    V0, V0, 16 - DIV_SHIFT                  # V0 = v/z with 16 fraction bits
        slwi    eax, ebx, DIV_SHIFT
        divw    U0, eax, ecx
        slwi    U0, U0, 16 - DIV_SHIFT                  # U0 = u/z with 16 fraction bits
# Set deltas to NPIXS pixel increments
        lwz     tr0, fx_du_dx[TC](RTOC)
        lwzx    tr1, r0, tr0
        slwi    DU1, tr1, NBITS
        lwz     tr0, fx_dv_dx[TC](RTOC)
        lwzx    tr1, r0, tr0
        slwi    DV1, tr1, NBITS
        lwz     tr0, fx_dz_dx[TC](RTOC)
        lwzx    tr1, r0, tr0
        slwi    DZ1, tr1, NBITS
# LIGHTING CODE
# esi has been stepped once per alignment pixel and tr2 still holds fx_dl_dx, so take
# the lighting state from them.  Reloading fx_l from memory threw those steps away
# and restarted the lighting at its scanline-start value part-way along the span.
        mr      tr0, esi                                # lighting value for the next pixel
        mr      tr1, tr2                                # per-pixel lighting delta
# Inside this loop, tr0 = fx_l, tr1 = fx_dl_dx
# TopOfLoop4: one pass per 16-pixel group (lit path).  Steps u, v, z by a whole group,
# divides once to get the texture coordinates at the end of the group (U1, V1), and
# interpolates linearly from U0/V0 across the 16 pixels.  tr0 = lighting value,
# tr1 = per-pixel lighting delta, r3 = fade table, es = texture base, edi = destination.
TopOfLoop4:
add. ecx, ecx, DZ1 # z += 16-pixel dz; cr0 is tested by the beq below
add ebx, ebx, DU1 # u += 16-pixel du
add ebp, ebp, DV1 # v += 16-pixel dv
beq _div_0_abort # would be dividing by 0, so abort
# Find fixed U1, V1
slwi eax, ebx, DIV_SHIFT
divw U1, eax, ecx
slwi eax, ebp, DIV_SHIFT
divw V1, eax, ecx
slwi U1, U1, 16 - DIV_SHIFT
slwi V1, V1, 16 - DIV_SHIFT
# PPC: Make %esi be dv, %edx be du (per pixel: the difference over the group, shifted right by NBITS)
sub esi, V1, V0
sub edx, U1, U0
srawi esi, esi, NBITS
srawi edx, edx, NBITS
lwz eax, Transparency_on[TC](RTOC) # address of Transparency_on
lwzx eax, r0, eax # its value
mr. eax, eax
bne yes_trans1 # transparency on: use the ppc_pix_lt version
# Plot 16 pixels. (2^NBITS)
# Four iterations of four pixels.  The loop is software-pipelined: each section computes
# a pixel into r6 and the following section's stbu stores it.  stbu writes at edi+1 and
# updates edi, hence the subi before the loop and the addi after it.
li r5, 4 # do 4 times...
mtctr r5
subi edi, edi, 1
# -----------------------------------------------
pix_loop1:
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
add U0, U0, edx # u0 = u0 + du
lbzx tr2, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
rlwimi tr2, tr0, 24, 16, 23 # mask lighting value (%bh) above pixel value (%al)
lbzx r6, r3, tr2 # xlate lighting:pixel through lighting tables
# -----------------------------------------------
mr eax, V0
add tr0, tr0, tr1 # fx_l += fx_dl_dx
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
stbu r6, 1(edi) # store the pixel computed in the previous section
add U0, U0, edx # u0 = u0 + du
lbzx tr2, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
rlwimi tr2, tr0, 24, 16, 23 # mask lighting value (%bh) above pixel value (%al)
lbzx r6, r3, tr2 # xlate lighting:pixel through lighting tables
add tr0, tr0, tr1 # fx_l += fx_dl_dx
# -----------------------------------------------
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
stbu r6, 1(edi) # store the pixel computed in the previous section
add U0, U0, edx # u0 = u0 + du
lbzx tr2, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
rlwimi tr2, tr0, 24, 16, 23 # mask lighting value (%bh) above pixel value (%al)
lbzx r6, r3, tr2 # xlate lighting:pixel through lighting tables
add tr0, tr0, tr1 # fx_l += fx_dl_dx
# -----------------------------------------------
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
stbu r6, 1(edi) # store the pixel computed in the previous section
add U0, U0, edx # u0 = u0 + du
lbzx tr2, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
rlwimi tr2, tr0, 24, 16, 23 # mask lighting value (%bh) above pixel value (%al)
lbzx r6, r3, tr2 # xlate lighting:pixel through lighting tables
add tr0, tr0, tr1 # fx_l += fx_dl_dx
# -----------------------------------------------
stbu r6, 1(edi) # store the fourth pixel of this iteration
bdnz pix_loop1
addi edi, edi, 1 # undo the subi: edi points at the next unwritten pixel again
cont1:
addic. r_loop_count, r_loop_count, -1 # one 16-pixel group done
mr U0, U1 # the end of this group is the start of the next
mr V0, V1
bne TopOfLoop4
EndOfLoop4:
mr. r_num_left_over, r_num_left_over
beq _none_to_do # no remainder: done
b DoEndPixels
; -------------------------------------------------------
# Transparency on: same 16 pixels, one ppc_pix_lt expansion per pixel.
yes_trans1:
# Plot 16 pixels. (2^NBITS)
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
ppc_pix_lt
b+ cont1
# ----------------------------------------- Start of LeftOver Pixels ------------------------------------------
# DoEndPixels: plot the r_num_left_over (1..15) pixels that remain after the last whole
# 16-pixel group of the lit path.  u, v, z are stepped by a full group, one divide
# pair gives U1/V1, and the leftover pixels are interpolated from U0/V0 towards them.
# Texels equal to 0xff are always skipped here.
DoEndPixels:
        add.    ecx, ecx, DZ1                   # z += 16-pixel dz; cr0 is tested below
        add     ebx, ebx, DU1                   # u += 16-pixel du
        add     ebp, ebp, DV1                   # v += 16-pixel dv
        beq     _div_0_abort                    # z == 0: abort
        bgt+    dep_cont                        # z > 0: carry on
# z went negative.
# This can happen because we added DZ1 to the current z, but DZ1 represents dz for 16
# pixels though we might only plot one more pixel.
# Instead of converting the ugly x86 code for this case, just abort if z went negative.
# It hardly ever does and we shipped shareware that way...
        b       _none_to_do
dep_cont:
        slwi    eax, ebx, DIV_SHIFT
        divw    U1, eax, ecx
        slwi    U1, U1, 16 - DIV_SHIFT
        slwi    eax, ebp, DIV_SHIFT
        divw    V1, eax, ecx
        slwi    V1, V1, 16 - DIV_SHIFT
        sub     esi, V1, V0
        sub     edx, U1, U0
        srawi   esi, esi, NBITS                 # per-pixel dv
        srawi   edx, edx, NBITS                 # per-pixel du
# On entry tr0 already holds the lighting value for the first leftover pixel (the
# 16-pixel loop steps it after every pixel), so each pixel is lit first and the delta
# added afterwards, exactly as ppc_pix_lt does.  Adding the delta first lit every
# leftover pixel one step ahead, the last one a step past the end of the span.
leftover_loop:
        mr      eax, V0
        rlwimi  eax, U0, 26, 16, 21             # Now, eax has v:u, but it's 10 bits too high and garbage above it
        rlwinm  eax, eax, 22, 20, 31            # Shift right 10 bits, keep only the low 12 bits
        lbzx    tr2, eax, es                    # get source pixel
        add     U0, U0, edx                     # u0 = u0 + du
        add     V0, V0, esi                     # v0 = v0 + dv
        cmpwi   cr0, tr2, 0xff                  # transparent pixel?
        beq     skipa1
        rlwimi  tr2, tr0, 24, 16, 23            # lighting byte goes just above the texel byte
        lbzx    eax, r3, tr2                    # xlate lighting:pixel through lighting tables
        stb     eax, 0(edi)
skipa1: addi    edi, edi, 1
        add     tr0, tr0, tr1                   # fx_l += fx_dl_dx, after this pixel used it
        addic.  r_num_left_over, r_num_left_over, -1
        bne     leftover_loop
        b       _none_to_do
# nol_tmap_slow_from_fast: entered from nol_DwordAligned1 when fewer than 16 pixels
# remain.  r_num_left_over then holds the number of pixels still to plot (1..15).  The
# loop below plots r_loop_count+1 pixels, so the count has to be one less than that;
# copying r_num_left_over unchanged drew one pixel past the right edge of the scanline.
nol_tmap_slow_from_fast:
        subi    r_loop_count, r_num_left_over, 1
# ---------- Yes transparency. No lighting. ----------
# (Note: We don't know for sure there is transparency, but, except for debugging, if we
# aren't lighting, we _do_ have transparency.)
nol_tmap_slow_NoLight:
nol_tmap_loop0_nolight:
        divw    tr0, ebp, ecx                   # compute v coordinate
        divw    tr1, ebx, ecx                   # compute u coordinate
        andi.   tr1, tr1, 0x3f                  # u wrapped to 0..63
        rlwimi  tr1, tr0, 6, 20, 25             # pack v:u into the low 12 bits
        lbzx    tr1, es, tr1                    # get pixel from source bitmap
        cmpwi   cr0, tr1, 0xff                  # check for transparency
        beq     nol_skip1a
        stb     tr1, 0(edi)
nol_skip1a:
        addi    edi, edi, 1
        add.    ecx, ecx, DZ1                   # z += dz; cr0 feeds the beq below
        add     ebp, ebp, DV1                   # v += dv
        add     ebx, ebx, DU1                   # u += du
        beq     _div_0_abort                    # would be dividing by 0, so abort
        addic.  r_loop_count, r_loop_count, -1
        bge     nol_tmap_loop0_nolight
        b       _none_to_do
# -------------------------------------- Start of Getting Dword Aligned ----------------------------------------------
# tmap_loop_fast_nolight: unlit path.  Plots single pixels, one divide pair each,
# until edi is 4-byte aligned, then continues at nol_DwordAligned1.
#   ebx  u      ebp  v      ecx  z      es  texture base    edi  destination
# Texels equal to 0xff are skipped.
        export  tmap_loop_fast_nolight
tmap_loop_fast_nolight:
# This is a hack! If we allow zero pixels to be plotted for alignment, the code later hangs.
        andi.   tr0, edi, 3                     # DEBUG HACK!!  already aligned on entry?
        beq     nol_skip_test                   # DEBUG HACK!!  then plot one pixel regardless
nol_NotDwordAligned1:
        andi.   tr0, edi, 3                     # low two address bits clear?
        beq     nol_DwordAligned1
nol_skip_test:                                  # DEBUG HACK!!
        divw    tr0, ebp, ecx                   # tr0 = v / z
        divw    tr1, ebx, ecx                   # tr1 = u / z
        rlwimi  tr1, tr0, 6, 20, 25             # v:u in the low 12 bits, garbage above
        andi.   tr1, tr1, 0xfff                 # keep only the 12-bit index
        lbzx    tr1, es, tr1                    # read the source texel
        cmpwi   cr0, tr1, 0xff                  # transparent?
        beq     nol_skip2
        stb     tr1, 0(edi)
nol_skip2:
        addi    edi, edi, 1
# step u, v, z
        add     ebx, ebx, DU1                   # u += du
        add     ebp, ebp, DV1                   # v += dv
        add.    ecx, ecx, DZ1                   # z += dz; cr0 feeds the beq below
        beq     _div_0_abort                    # next divide would be by 0, so abort
        addic.  r_loop_count, r_loop_count, -1
        bge     nol_NotDwordAligned1
        b       _none_to_do
# -------------------------------------- End of Getting Dword Aligned ----------------------------------------------
# nol_DwordAligned1: edi is aligned.  Split the remaining pixels into 16-pixel groups
# (r_loop_count) plus a remainder (r_num_left_over), compute the starting U0/V0 and
# scale the u, v, z deltas to one group.
nol_DwordAligned1:
        addi    r_loop_count, r_loop_count, 1           # count-1 -> number of pixels still to plot
        andi.   r_num_left_over, r_loop_count, NBITS_mask
        srwi.   r_loop_count, r_loop_count, NBITS       # number of whole 16-pixel groups
        beq     nol_tmap_slow_from_fast                 # none: finish one pixel at a time
# starting texture coordinates
        slwi    eax, ebp, DIV_SHIFT
        divw    V0, eax, ecx
        slwi    V0, V0, 16 - DIV_SHIFT                  # V0 = v/z with 16 fraction bits
        slwi    eax, ebx, DIV_SHIFT
        divw    U0, eax, ecx
        slwi    U0, U0, 16 - DIV_SHIFT                  # U0 = u/z with 16 fraction bits
# deltas become per-group (2^NBITS pixel) increments
        lwz     tr0, fx_du_dx[TC](RTOC)
        lwz     tr1, 0(tr0)
        slwi    DU1, tr1, NBITS
        lwz     tr0, fx_dv_dx[TC](RTOC)
        lwz     tr1, 0(tr0)
        slwi    DV1, tr1, NBITS
        lwz     tr0, fx_dz_dx[TC](RTOC)
        lwz     tr1, 0(tr0)
        slwi    DZ1, tr1, NBITS
# nol_TopOfLoop4: one pass per 16-pixel group (unlit path).  Steps u, v, z by a whole
# group, divides once to get the texture coordinates at the end of the group (U1, V1),
# and interpolates linearly from U0/V0 across the 16 pixels.
nol_TopOfLoop4:
add. ecx, ecx, DZ1 # z += 16-pixel dz; cr0 is tested by the beq below
add ebx, ebx, DU1 # u += 16-pixel du
add ebp, ebp, DV1 # v += 16-pixel dv
beq _div_0_abort # would be dividing by 0, so abort
# Find fixed U1, V1
slwi eax, ebx, DIV_SHIFT
divw U1, eax, ecx
slwi eax, ebp, DIV_SHIFT
divw V1, eax, ecx
slwi U1, U1, 16 - DIV_SHIFT
slwi V1, V1, 16 - DIV_SHIFT
# PPC: Make %esi be dv, %edx be du (per pixel: the difference over the group, shifted right by NBITS)
sub esi, V1, V0
sub edx, U1, U0
srawi esi, esi, NBITS
srawi edx, edx, NBITS
lwz eax, Transparency_on[TC](RTOC) # address of Transparency_on
lwzx eax, r0, eax # its value
mr. eax, eax
bne nol_yes_trans1 # transparency on: use the ppc_pix_t version
# Plot 16 pixels. (2^NBITS)
# Four iterations of four pixels.  The loop is software-pipelined: each section loads a
# texel into r6 and the following section's stbu stores it.  stbu writes at edi+1 and
# updates edi, hence the subi before the loop and the addi after it.
li r5, 4 # do 4 times...
mtctr r5
subi edi, edi, 1
# -----------------------------------------------
nol_pix_loop1:
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
add U0, U0, edx # u0 = u0 + du
lbzx r6, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
# -----------------------------------------------
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
stbu r6, 1(edi) # store the texel loaded in the previous section
add U0, U0, edx # u0 = u0 + du
lbzx r6, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
# -----------------------------------------------
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
stbu r6, 1(edi) # store the texel loaded in the previous section
add U0, U0, edx # u0 = u0 + du
lbzx r6, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
# -----------------------------------------------
mr eax, V0
rlwimi eax, U0, 26, 16, 21 # Now, eax has v:u, but it's 10 bits too high and garbage above it
rlwinm eax, eax, 22, 20, 31 # Shift right 10 bits, mask out high garbage (preserve only low 12 bits)
stbu r6, 1(edi) # store the texel loaded in the previous section
add U0, U0, edx # u0 = u0 + du
lbzx r6, eax, es # get source pixel
add V0, V0, esi # v0 = v0 + dv
# -----------------------------------------------
stbu r6, 1(edi) # store the fourth pixel of this iteration
bdnz nol_pix_loop1
addi edi, edi, 1 # undo the subi: edi points at the next unwritten pixel again
nol_cont1:
addic. r_loop_count, r_loop_count, -1 # one 16-pixel group done
mr U0, U1 # the end of this group is the start of the next
mr V0, V1
bne nol_TopOfLoop4
nol_EndOfLoop4:
mr. r_num_left_over, r_num_left_over
beq _none_to_do # no remainder: done
b nol_DoEndPixels
; -------------------------------------------------------
# Transparency on: same 16 pixels, one ppc_pix_t expansion per pixel.
nol_yes_trans1:
# Plot 16 pixels. (2^NBITS)
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
ppc_pix_t
b+ nol_cont1
# ----------------------------------------- Start of LeftOver Pixels ------------------------------------------
# nol_DoEndPixels: plot the r_num_left_over pixels that remain after the last whole
# 16-pixel group of the unlit path.  u, v, z are stepped by a full group, one divide
# pair gives U1/V1, and the leftover pixels are interpolated from U0/V0 towards them.
# Texels equal to 0xff are skipped.
nol_DoEndPixels:
        add.    ecx, ecx, DZ1                   # z += 16-pixel dz; cr0 is tested below
        add     ebx, ebx, DU1                   # u += 16-pixel du
        add     ebp, ebp, DV1                   # v += 16-pixel dv
        beq     _div_0_abort                    # z == 0: abort
        blt     _none_to_do                     # z went negative: give up on the leftovers
        slwi    eax, ebx, DIV_SHIFT
        divw    U1, eax, ecx
        slwi    U1, U1, 16 - DIV_SHIFT
        slwi    eax, ebp, DIV_SHIFT
        divw    V1, eax, ecx
        slwi    V1, V1, 16 - DIV_SHIFT
        sub     esi, V1, V0
        sub     edx, U1, U0
        srawi   esi, esi, NBITS                 # per-pixel dv
        srawi   edx, edx, NBITS                 # per-pixel du
nol_leftover_loop:
        mr      eax, V0                         # index starts out as v
        rlwimi  eax, U0, 26, 16, 21             # insert six bits of u directly below the v field
        rlwinm  eax, eax, 22, 20, 31            # shift down by 10, keep the 12-bit v:u texel index
        lbzx    tr2, es, eax                    # tr2 = texel
        add     U0, U0, edx                     # u += du
        add     V0, V0, esi                     # v += dv
        cmpwi   cr0, tr2, 0xff                  # transparent?
        beq     nol_skipa1
        stb     tr2, 0(edi)
nol_skipa1:
        addi    edi, edi, 1
        addic.  r_num_left_over, r_num_left_over, -1
        bne     nol_leftover_loop
        b       _none_to_do