dxx-rebirth/texmap/tmapppro.S

611 lines
20 KiB
ArmAsm

/// $Id: tmapppro.S,v 1.6 2003-12-08 21:21:16 btb Exp $
/// tmap_scanline_per - Pentium-Pro-optimized assembly version
/// written by Brian Raiter, Mar 1998.
/// lighting roundoff error fixed by Matt Mueller, July 1999
/// The gist of the algorithm is as follows (note that this is
/// pseudocode, not actual C):
///
/// int u = fx_u;
/// int v = fx_v;
/// int z = fx_z;
/// int l = fx_l;
/// int x, ubyz, vbyz;
/// byte texmap[64][64] = pixptr;
/// byte framebuffer[][bytes_per_row] = write_buffer;
/// byte lightingtable[][256] = gr_fade_table;
/// byte c;
///
/// for (x = fx_xleft ; x <= fx_xright ; ++x) {
/// ubyz = (u / z) & 63;
/// vbyz = (v / z) & 63;
/// c = texmap[vbyz][ubyz];
/// if (c != TRANSPARENT_COLOR)
/// framebuffer[fx_y][x] = lightingtable[l / 65536][c];
/// u += fx_du_dx;
/// v += fx_dv_dx;
/// z += fx_dz_dx;
/// l += fx_dl_dx;
/// }
///
/// The global variable Transparency_on is zero when it is known that
/// there are no transparencies involved, so in that case we use a
/// different loop that skips the transparency test.
///
/// The actual algorithm used here only does the division calculations
/// every fourth pixel, and linearly interpolates the other three.
/// Something along the lines of:
///
/// /* Initial values as before */
/// int ubyz0, vbyz0, ubyz4, vbyz4, du1, dv1, i;
///
/// ubyz0 = u / z;
/// vbyz0 = v / z;
/// for (x = fx_xleft ; x <= fx_xright - 3 ; x += 4) {
/// u += fx_du_dx * 4;
/// v += fx_dv_dx * 4;
/// z += fx_dz_dx * 4;
/// ubyz4 = u / z;
/// vbyz4 = v / z;
/// du1 = (ubyz4 - ubyz0) / 4;
/// dv1 = (vbyz4 - vbyz0) / 4;
/// ubyz = ubyz0;
/// vbyz = vbyz0;
/// for (i = 0 ; i < 4 ; ++i) {
/// c = texmap[vbyz & 63][ubyz & 63];
/// if (c != TRANSPARENT_COLOR)
/// framebuffer[fx_y][x + i] = lightingtable[l / 65536][c];
/// ubyz += du1;
/// vbyz += dv1;
/// l += fx_dl_dx;
/// }
/// ubyz0 = ubyz4;
/// vbyz0 = vbyz4;
/// }
/// for ( ; x <= fx_xright ; ++x) {
/// /* Finish off remaining 0-3 pixels */
/// }
///
/// So much for the basic overview.
///
/// In this version, the PPro's floating-point unit is pressed into
/// service to do the actual divisions, so that 1/z can be calculated
/// first, and the resulting reciprocal multiplied with u and v. These
/// two products are then stored back out as integers. This keeps us
/// down to doing only one division every four pixels, during which
/// other integer instructions can be overlapped.
///
/// The algorithm actually divides 64 by z, so that the rounded-off
/// products will effectively be stored with six fraction bits. This
/// allows the algorithm to correct for minor floating-point roundoff
/// errors. Two fraction bits are kept during the interpolation of the
/// three middle pixels, which hopefully increases the accuracy of the
/// approximations.
///
/// We only need the lowest six (integral) bits of u/z and v/z for
/// each pixptr offset, so we only need eight bits of each fourth
/// pair of values to figure the interpolation. Together with the two
/// fractional bits we keep for extra precision flavor, this makes ten
/// bits for each value, or twenty to store the full pair. To simplify
/// the interpolation, the pair is packed into a single 32-bit
/// register like so:
///
///   3      2       1
///   1      4       6       8       0
///   vvVVVVVVvv____________uuUUUUUUuu
///     \v&63/                \u&63/
///
/// The unused bits between the u and v values permit the packed
/// values to be added/subtracted without the u values spilling over
/// into the v values. Then, the instructions "bswap %eax ; roll $6,
/// %eax ; andl $0x0FFF, %eax" will right-justify the desired values
/// into a pixptr offset.
///
/// The FP stack is loaded up with the values of u, v, and z,
/// converted to floats. %ebp is used to hold the fade-table row offset
/// derived from l, %esi is set to pixptr, and %edi points to our
/// current position in write_buffer.
// Short alias for the external fade (lighting) table symbol. It is used
// as the displacement of every fadetbl(%edx,%ebp) lookup in this file.
.equ fadetbl, _gr_fade_table
// The following macro encapsulates the floating-point instructions
// that put the results of a prior division to use and prepare for the
// next division. At the beginning of the macro, the FP stack contains
// (from top to bottom): 64/z, z, u, v. The macro computes (64*u)/z,
// which is stored in ubyz4, and (64*v)/z, which is stored in vbyz4.
// Simultaneous with this, the macro adds dudx to u, dvdx to v, and
// dzdx to z, and finally puts 64 back onto the stack. At the end of
// the macro, the stack contains: 64, z, u, v.
//
// The macro does not start the next division itself; callers that
// need another 64/z follow it with "fdiv %st(1)".
//
// Reads:    dudx, dvdx, dzdx, flt64 (32-bit floats in memory)
// Writes:   ubyz4, vbyz4 (32-bit integers in memory)
// FP stack: four entries in, four entries out; st(4) and st(5) are
//           used as temporaries while the macro runs.
//
// In the trace below "u/z" and "v/z" stand for (64*u)/z and (64*v)/z,
// and a prime marks a value that has already been stepped.
.macro DoFPCalcs 0 // The FP stack after each instruction:
//                                             64/z z    u    v
fst %st(4) //                                  64/z z    u    v    64/z
fxch %st(2) //                                 u    z    64/z v    64/z
fmul %st, %st(4) // st(4) = (64 * u) / z       u    z    64/z v    u/z
fadds (dudx) // u += dudx                      u'   z    64/z v    u/z
fxch %st(3) //                                 v    z    64/z u'   u/z
fmul %st, %st(2) // st(2) = (64 * v) / z       v    z    v/z  u'   u/z
fadds (dvdx) // v += dvdx                      v'   z    v/z  u'   u/z
fxch %st(1) //                                 z    v'   v/z  u'   u/z
fadds (dzdx) // z += dzdx                      z'   v'   v/z  u'   u/z
fxch %st(2) //                                 v/z  v'   z'   u'   u/z
flds (flt64) // push 64.0                      64   v/z  v'   z'   u'   u/z
fxch %st(5) //                                 u/z  v/z  v'   z'   u'   64
fistpl (ubyz4) // ubyz4 = round(64*u/z), pop   v/z  v'   z'   u'   64
fistpl (vbyz4) // vbyz4 = round(64*v/z), pop   v'   z'   u'   64
fxch %st(3) //                                 64   z'   u'   v'
// (ready to start the next division)
.endm
// Symbol naming. The code in this file refers to every C global through
// an underscore-prefixed name. When __ELF__ is defined, each of those
// names is equated to the un-prefixed symbol, and the entry point is
// exported without a leading underscore. Otherwise the prefixed names
// are used directly and the entry point is exported with the prefix.
#ifdef __ELF__
.equ _pixptr, pixptr
.equ _gr_fade_table, gr_fade_table
.equ _write_buffer, write_buffer
.equ _bytes_per_row, bytes_per_row
.equ _fx_xleft, fx_xleft
.equ _fx_xright, fx_xright
.equ _fx_y, fx_y
.equ _fx_u, fx_u
.equ _fx_v, fx_v
.equ _fx_z, fx_z
.equ _fx_l, fx_l
.equ _fx_du_dx, fx_du_dx
.equ _fx_dv_dx, fx_dv_dx
.equ _fx_dz_dx, fx_dz_dx
.equ _fx_dl_dx, fx_dl_dx
.equ _Transparency_on, Transparency_on
.globl asm_ppro_tmap_scanline_per
#else
.globl _asm_ppro_tmap_scanline_per
#endif
// Globals defined elsewhere that this routine reads.
.extern _pixptr, _gr_fade_table, _write_buffer
.extern _bytes_per_row, _fx_xleft, _fx_xright, _fx_y
.extern _fx_u, _fx_v, _fx_z, _fx_l
.extern _fx_du_dx, _fx_dv_dx, _fx_dz_dx, _fx_dl_dx
.extern _Transparency_on
// File-private scratch variables (declarations left disabled; the names
// below match the labels defined in the .data section of this file).
//.local dudx, dvdx, dzdx, dldx, l
//.local ubyz4, vbyz4, uvzero
//.local lastquartet, lastpixel, ctlwd
//.local flt64
// Scratch storage for asm_ppro_tmap_scanline_per. Everything here is
// static and is rewritten on each call, so the routine is not
// reentrant.
.data
.balign 4
dudx: .long 0 // 4 * fx_du_dx as a float (u step per four pixels)
dvdx: .long 0 // 4 * fx_dv_dx as a float (v step per four pixels)
dzdx: .long 0 // 4 * fx_dz_dx as a float (z step per four pixels)
dldx: .long 0 // 4 * fx_dl_dx as an integer (l step per four pixels)
l: .long 0 // the current l value, unshifted (same scale as fx_l)
ubyz4: .long 0 // round(64 * u / z) at the end of the current quartet
vbyz4: .long 0 // round(64 * v / z) at the end of the current quartet
uvzero: .long 0 // packed u/z and v/z values for pixel 0 of the quartet
lastquartet: .long 0 // frame-buffer address where the 4-pixel loop stops
lastpixel: .long 0 // frame-buffer address where drawing stops entirely
flt64: .long 0x42800000 // 64.0 (what we divide z into)
ctlwd: .long 0 // the caller's FPU control word, reloaded on exit
.text
.balign 4

//
// void asm_ppro_tmap_scanline_per(void)
//
// Draws one perspective-corrected, lit, texture-mapped scanline into
// write_buffer, covering pixels fx_xleft..fx_xright (inclusive) of row
// fx_y. Nothing is drawn when fx_xright < fx_xleft.
//
// Inputs (all read from globals): pixptr, gr_fade_table, write_buffer,
//	bytes_per_row, fx_y, fx_xleft, fx_xright, fx_u, fx_v, fx_z, fx_l,
//	fx_du_dx, fx_dv_dx, fx_dz_dx, fx_dl_dx, Transparency_on.
// Output: bytes stored into write_buffer; no return value.
//
// Register roles once drawing has started:
//	%esi	base of the texture map (pixptr)
//	%edi	address of the next frame-buffer pixel to write
//	%ebp	(l >> 8) & 0x7F00, the fade-table row offset
//	%edx	zero-extended byte used as an index (texel value)
//	%eax, %ebx, %ecx	packed uv values and other scratch
//
// Preserves %ebx, %esi, %edi and %ebp. Clobbers %eax, %ecx, %edx and
// the flags. The x87 stack is left at the depth it had on entry and the
// caller's FPU control word is reloaded before returning. The routine
// keeps its state in static variables, so it is not reentrant.
//

#ifdef __ELF__
asm_ppro_tmap_scanline_per:
#else
_asm_ppro_tmap_scanline_per:
#endif

// Save every callee-saved register that is overwritten below. %ebx is
// included: it is used as scratch throughout, and the 32-bit x86 C
// calling conventions require it to survive the call.

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

// Kick the FPU into the lowest precision (still enough for our needs)
// so as to speed up fdiv. Bits 8-9 of the control word select the
// precision; clearing both selects single precision. The caller's
// control word is left in ctlwd so that it can be reloaded on exit.

	fnstcw	(ctlwd)
	movw	(ctlwd), %ax
	movl	%eax, %ebx
	andb	$0xFC, %bh		// clear the precision-control bits
	movw	%bx, (ctlwd)
	fldcw	(ctlwd)
	movw	%ax, (ctlwd)		// keep the original for LeaveNow

// Multiply dudx, dvdx, and dzdx by four, and store locally, converted
// into floating point.

	movl	(_fx_du_dx), %eax
	sall	$2, %eax
	movl	%eax, (dudx)
	movl	(_fx_dv_dx), %eax
	sall	$2, %eax
	movl	%eax, (dvdx)
	movl	(_fx_dz_dx), %eax
	sall	$2, %eax
	movl	%eax, (dzdx)
	fildl	(dudx)
	fildl	(dvdx)
	fildl	(dzdx)
	fxch	%st(2)
	fstps	(dudx)
	fstps	(dvdx)
	fstps	(dzdx)

// bytes_per_row * fx_y is the offset for the current scanline. (We do
// this now before we start the first FP division.)

	movl	(_bytes_per_row), %eax
	xorl	%edx, %edx
	mull	(_fx_y)

// Push v, u, z, and 64.0 onto the FPU stack, and then start
// calculating the first 64 / z.

	fildl	(_fx_v)
	fildl	(_fx_u)
	fildl	(_fx_z)
	flds	(flt64)
	fdiv	%st(1)

// Meanwhile, get l and dldx (the latter multiplied by four, because l
// is only stepped once per quartet). Both are stored unshifted. The
// shift that lines l up with a fade-table row is applied right before
// each table access instead, because pre-dividing l and dldx caused
// visible rounding errors. -MM

	movl	(_fx_l), %edx
	movl	%edx, (l)
	movl	(_fx_dl_dx), %edx
	sall	$2, %edx
	movl	%edx, (dldx)

// Store pixptr, the pointer to our 64x64 texture map, in %esi. Store
// write_buffer, the pointer to our frame buffer, in %edi. Then offset
// %edi so that it points to pixel [fx_y][fx_xleft]. Calculate a
// pointer to [fx_y][fx_xright + 1] (lastpixel) so we know when to stop
// drawing. Also calculate lastquartet = lastpixel - (count & 3), where
// count is the number of pixels to draw, so we know when to stop
// drawing four pixels at a time.

	movl	(_pixptr), %esi
	movl	(_write_buffer), %edi
	movl	(_fx_xright), %ecx
	addl	%eax, %edi		// %edi = start of row fx_y
	incl	%ecx
	addl	%edi, %ecx
	movl	%ecx, (lastpixel)
	addl	(_fx_xleft), %edi
	movl	%ecx, %eax
	subl	%edi, %eax		// %eax = pixel count (signed)
	jle	LeaveNow		// nothing to draw
	andl	$3, %eax
	subl	%eax, %ecx
	movl	%ecx, (lastquartet)

// Calculate round(64 * u / z) and round(64 * v / z), store, and
// increment u, v, and z. Then start calculating the second 64 / z.

	DoFPCalcs
	fdiv	%st(1)

// Get our u/z and v/z values, lop off the bits we don't care
// about, pack, and store in uvzero.

	movl	(ubyz4), %eax
	incl	%eax
	andl	$0x3FF0, %eax
	shrl	$4, %eax
	movl	(vbyz4), %ebx
	incl	%ebx
	andl	$0x3FF0, %ebx
	shll	$18, %ebx
	orl	%eax, %ebx
	movl	%ebx, (uvzero)

// Are there at least four pixels to draw? If not, skip to the epilog
// code. (%ecx still holds lastquartet here.)

	cmpl	%ecx, %edi
	je	LastBits

// Do we need to test for transparencies?

	testl	$(~0), (_Transparency_on)
	jnz	LoopTransOn

// If not, then use the simpler loop here.

	.balign 4
LoopTransOff:

// While the FPU is busy dividing, the latest u/z and v/z values are
// retrieved, packed, and stored in uvzero (to be used again in the
// next iteration). The old uvzero value, which contains the uv values
// for pixel 0, gets subtracted from the new uvzero value to
// determine the total change in u/z and v/z across the four pixels,
// and this is divided by 4 to get the average. This average is then
// used to estimate the values for pixels 1, 2, and 3. The old uvzero
// value is used immediately to calculate pixel 0, while %eax, %ebx, and
// %ecx are entrusted with the uv values for pixels 1, 2, and 3
// respectively, while %edx is our "cleansed" register for using byte
// values as memory pointer offsets. %ebp is loaded with the high byte
// of l, forming half of the offset for the fade table lookup. (The
// pixel from the texture-map bitmap supplies the other half.) Each
// value is used to set its pixel as follows (assuming %eax holds our
// packed uv value):
//
// a:	bswapl	%eax			/ move u and v to the
// b:	roll	$6, %eax		/   far right
// c:	andl	$0x0FFF, %eax		/ mask off extra bits
// d:	movb	(%esi,%eax), %dl	/ get texture-map pixel
// e:	movb	fadetbl(%edx,%ebp), %dl	/ correct for lighting
// f:	movb	%dl, (%edi)		/ write to frame buffer
//
// The above is done four times, once for each pixel. Some of the
// calculations may appear to be interleaved haphazardly, but the PPro
// seems to like it this way.
//
// Bit 12 is set in the new packed value before the subtraction so that
// a borrow out of the u field is absorbed by the unused middle bits
// and never reaches the v field.

	DoFPCalcs
	fdiv	%st(1)

	xorl	%edx, %edx
	movl	(uvzero), %eax		// %eax = uv for pixel 0
	bswapl	%eax			// 0 a
	roll	$6, %eax		// 0 b
	andl	$0x0FFF, %eax		// 0 c
	movb	(%esi,%eax), %dl	// 0 d
	movl	(l), %ebp
	movl	(dldx), %ecx
	addl	%ebp, %ecx
	movl	%ecx, (l)		// l += dldx for the next quartet
	sarl	$8, %ebp
	andl	$0x7F00, %ebp		// %ebp = fade-table row offset
	movb	fadetbl(%edx,%ebp), %dl	// 0 e
	movl	(vbyz4), %ebx
	incl	%ebx
	andl	$0x3FF0, %ebx
	movl	(ubyz4), %ecx
	shll	$18, %ebx
	incl	%ecx
	andl	$0x3FF0, %ecx
	shrl	$4, %ecx
	movl	(uvzero), %eax
	orl	%ebx, %ecx
	movl	%ecx, (uvzero)		// becomes pixel 0 of next quartet
	orl	$0x1000, %ecx		// guard bit against u borrow
	subl	%eax, %ecx
	shrl	$2, %ecx		// %ecx = per-pixel uv step
	movb	%dl, (%edi)		// 0 f
	lea	(%eax,%ecx,2), %ebx	// %ebx = uv for pixel 2
	addl	%ecx, %eax		// %eax = uv for pixel 1
	bswapl	%eax			// 1 a
	roll	$6, %eax		// 1 b
	addl	%ebx, %ecx		// %ecx = uv for pixel 3
	bswapl	%ebx			// 2 a
	roll	$6, %ebx		// 2 b
	bswapl	%ecx			// 3 a
	andl	$0x0FFF, %eax		// 1 c
	andl	$0x0FFF, %ebx		// 2 c
	roll	$6, %ecx		// 3 b
	movb	(%esi,%eax), %dl	// 1 d
	movb	fadetbl(%edx,%ebp), %al	// 1 e
	movb	(%esi,%ebx), %dl	// 2 d
	movb	fadetbl(%edx,%ebp), %bl	// 2 e
	movb	%al, 1(%edi)		// 1 f
	andl	$0x0FFF, %ecx		// 3 c
	movb	%bl, 2(%edi)		// 2 f
	movb	(%esi,%ecx), %dl	// 3 d
	movb	fadetbl(%edx,%ebp), %cl	// 3 e
	movb	%cl, 3(%edi)		// 3 f

// %edi and lastquartet are addresses, so the loop bound is tested with
// an unsigned condition.

	addl	$4, %edi
	cmpl	(lastquartet), %edi
	jb	LoopTransOff

// Are there any pixels left at all?

	cmpl	(lastpixel), %edi
	jne	LastBits
	jmp	LeaveNow

	.balign 4
LoopTransOn:

// This is similar to the LoopTransOff loop, the big change being that
// each value retrieved from the texture map is tested against 255,
// the transparent "color". A value of 255 in the texture map means to
// let the existing value for that pixel in write_buffer go by
// unchanged. Thus the code for each pixel looks something like this
// instead:
//
// a:	bswapl	%eax			/ move u and v to the
// b:	roll	$6, %eax		/   far right
// c:	andl	$0x0FFF, %eax		/ mask off extra bits
// d:	movb	(%esi,%eax), %dl	/ get texture-map pixel
// e:	cmpb	$255, %dl		/ is pixel transparent?
// f:	sbbb	%ah, %ah		/ yes:%ah=00, no:%ah=FF
// g:	movb	fadetbl(%edx,%ebp), %dl	/ correct for lighting
// h:	movb	(%edi), %al		/ get current pixel
// i:	xorb	%al, %dl		/ combine the two
// j:	andb	%dl, %ah		/ use %ah as a mask to
// k:	xorb	%ah, %al		/   select which pixel
// l:	movb	%al, (%edi)		/ write to frame buffer
//
// When the texture-map value is 255, the code simply writes the
// original frame-buffer value back out again; otherwise the new pixel
// is written instead. The ands and xors used to accomplish this bulk
// up the code, but on the whole it is better than having four
// unpredictable jumps in the loop.

	DoFPCalcs
	fdiv	%st(1)

	movl	(uvzero), %eax		// %eax = uv for pixel 0
	bswapl	%eax			// 0 a
	movl	(dldx), %ecx
	movl	(l), %ebp
	addl	%ebp, %ecx
	roll	$6, %eax		// 0 b
	andl	$0x0FFF, %eax		// 0 c
	xorl	%edx, %edx
	movb	(%esi,%eax), %dl	// 0 d
	cmpb	$255, %dl		// 0 e
	sbbb	%ah, %ah		// 0 f
	movl	%ecx, (l)		// l += dldx for the next quartet
	sarl	$8, %ebp
	andl	$0x7F00, %ebp		// %ebp = fade-table row offset
	movb	fadetbl(%edx,%ebp), %dl	// 0 g
	movb	(%edi), %al		// 0 h
	xorb	%al, %dl		// 0 i
	andb	%dl, %ah		// 0 j
	xorb	%ah, %al		// 0 k
	movb	%al, (%edi)		// 0 l
	movl	(vbyz4), %ebx
	movl	(ubyz4), %ecx
	incl	%ebx
	andl	$0x3FF0, %ebx
	incl	%ecx
	andl	$0x3FF0, %ecx
	shll	$18, %ebx
	shrl	$4, %ecx
	orl	%ebx, %ecx
	movl	(uvzero), %eax
	movl	%ecx, (uvzero)		// becomes pixel 0 of next quartet
	orl	$0x1000, %ecx		// guard bit against u borrow
	subl	%eax, %ecx
	shrl	$2, %ecx		// %ecx = per-pixel uv step
	lea	(%eax,%ecx,2), %ebx	// %ebx = uv for pixel 2
	addl	%ecx, %eax		// %eax = uv for pixel 1
	bswapl	%eax			// 1 a
	roll	$6, %eax		// 1 b
	addl	%ebx, %ecx		// %ecx = uv for pixel 3
	bswapl	%ebx			// 2 a
	roll	$6, %ebx		// 2 b
	andl	$0x0FFF, %eax		// 1 c
	movb	(%esi,%eax), %dl	// 1 d
	cmpb	$255, %dl		// 1 e
	sbbb	%ah, %ah		// 1 f
	bswapl	%ecx			// 3 a
	movb	1(%edi), %al		// 1 h
	movb	fadetbl(%edx,%ebp), %dl	// 1 g
	roll	$6, %ecx		// 3 b
	andl	$0x0FFF, %ebx		// 2 c
	xorb	%al, %dl		// 1 i
	andb	%dl, %ah		// 1 j
	movb	(%esi,%ebx), %dl	// 2 d
	cmpb	$255, %dl		// 2 e
	sbbb	%bh, %bh		// 2 f
	movb	fadetbl(%edx,%ebp), %dl	// 2 g
	andl	$0x0FFF, %ecx		// 3 c
	movb	2(%edi), %bl		// 2 h
	xorb	%bl, %dl		// 2 i
	andb	%dl, %bh		// 2 j
	movb	(%esi,%ecx), %dl	// 3 d
	cmpb	$255, %dl		// 3 e
	sbbb	%ch, %ch		// 3 f
	movb	3(%edi), %cl		// 3 h
	movb	fadetbl(%edx,%ebp), %dl	// 3 g
	xorb	%cl, %dl		// 3 i
	andb	%dl, %ch		// 3 j
	xorb	%ah, %al		// 1 k
	movb	%al, 1(%edi)		// 1 l
	xorb	%bh, %bl		// 2 k
	movb	%bl, 2(%edi)		// 2 l
	xorb	%ch, %cl		// 3 k
	movb	%cl, 3(%edi)		// 3 l

// Address comparison, hence the unsigned condition.

	addl	$4, %edi
	cmpl	(lastquartet), %edi
	jb	LoopTransOn

// Quit if there are none at all left.

	cmpl	(lastpixel), %edi
	je	LeaveNow

LastBits:

// Here we finish off the last one-to-three pixels assigned to us.
// Rather than calculating values for all four pixels, we just divide
// the difference by four and keep adding this average into the value
// as needed. (This code is not particularly optimized, by the way,
// since it represents such a minuscule amount of the running time.)
//
// No further fdiv is started after this DoFPCalcs; the four values it
// leaves on the FP stack are simply discarded at LeaveNow.
//
// NOTE(review): this tail always skips texels equal to 255, even when
// Transparency_on is zero and the preceding quartets were drawn by
// LoopTransOff without that test.

	DoFPCalcs

	movl	(l), %ebp
	sarl	$8, %ebp
	andl	$0x7F00, %ebp		// %ebp = fade-table row offset
	movl	(ubyz4), %eax
	incl	%eax
	andl	$0x3FF0, %eax
	shrl	$4, %eax
	movl	(vbyz4), %ecx
	incl	%ecx
	andl	$0x3FF0, %ecx
	shll	$18, %ecx
	orl	%eax, %ecx
	movl	(uvzero), %ebx		// %ebx = uv for the current pixel
	orl	$0x1000, %ecx		// guard bit against u borrow
	subl	%ebx, %ecx
	shrl	$2, %ecx		// %ecx = per-pixel uv step
	xorl	%edx, %edx

LoopLastBits:
	movl	%ebx, %eax
	bswapl	%eax
	roll	$6, %eax
	andl	$0x0FFF, %eax
	movb	(%esi,%eax), %dl	// fetch the texel
	cmpb	$255, %dl
	je	LetPixelBy		// transparent: keep the old pixel
	movb	fadetbl(%edx,%ebp), %dl
	movb	%dl, (%edi)
LetPixelBy:
	incl	%edi
	addl	%ecx, %ebx
	cmpl	(lastpixel), %edi
	jb	LoopLastBits		// addresses: unsigned compare

LeaveNow:

// We're done! Restore the saved registers (in reverse push order), pop
// the four values left on the FP stack, reload the caller's FPU
// control word, and we are so out of here.

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	fcompp
	fcompp
	fldcw	(ctlwd)
	ret