/*
 *  desktop -- The 3dfx Desktop Demo 
 *  COPYRIGHT 3DFX INTERACTIVE, INC. 1999
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "basics.h"
/*#include "cpudetect.h"*/
#include "lighting.h"

// Build switch: when USE_ASM is defined the x86 inline-assembly versions of
// the directional lighting routines are compiled and InitializeLighting()
// may select them; when it is left undefined only the C versions are built.
//#define USE_ASM

// Numerator of the point-light distance attenuation used by
// PointLightDiffuse() and PointLightDiffuseAndSpecular(): the value
// FALL_OFF_DIST_FACTOR / (squared distance to the light) is passed through
// CLAMP(..., 0.0f, 1.0f) and multiplied into the lighting terms.
#define FALL_OFF_DIST_FACTOR  (1000.0f)

// floating-point constants
// They live in memory (rather than being literals) because the assembly
// routines load them as memory operands, and the C routines read the bit
// pattern of ONE.
// NOTE(review): they are labelled "local" but are not declared static --
// confirm no other file relies on them before changing the linkage.
const float ZERO = 0.0f;
const float ONE = 1.0f;
const float ONE_HALF = 0.5f;       // 0.5f*f term of the inverse square root
const float THREE_HALVES = 1.5f;   // 1.5f*y term of the inverse square root

// global variables

// ambient term added to the directional diffuse factor; set by SetLightAmbient()
float gLightAmbient = 0.25f;
// directional light direction, written (and normalized) by DirLightXforms()
Vector gXformedLightDir = {0.0f, 0.0f, 1.0f, 0.0f};
// camera direction used for the specular term, written (and normalized) by DirLightXforms()
Vector gXformedCameraDir = {0.0f, 0.0f, 1.0f, 0.0f};
// point light position, written by PointLightXforms()
Vector gXformedLightPos = {1.0f, 1.0f, 1.0f, 1.0f};

// function prototypes
// Each directional lighting routine exists in two flavours: *_ASM (defined
// only when USE_ASM is defined) and *_C (always defined).  Callers go through
// the DirLightDiffuse / DirLightDiffuseAndSpecular function pointers below.
float DirLightDiffuse_ASM(Vector unit_norm);
float DirLightDiffuse_C(Vector unit_norm);
void DirLightDiffuseAndSpecular_ASM(Vector unit_norm, float *diffuse, float *specular);
void DirLightDiffuseAndSpecular_C(Vector unit_norm, float *diffuse, float *specular);

// function pointers
// Entry points for directional lighting.  They start out pointing at the
// assembly versions when USE_ASM is defined and at the C versions otherwise;
// InitializeLighting() reassigns them (with USE_ASM, based on the CPU's CMOV
// support flag).
#ifdef USE_ASM
float (*DirLightDiffuse)(Vector unit_norm) = DirLightDiffuse_ASM;
void (*DirLightDiffuseAndSpecular)(Vector unit_norm, float *diffuse, float *specular) = DirLightDiffuseAndSpecular_ASM;
#else
float (*DirLightDiffuse)(Vector unit_norm) = DirLightDiffuse_C;
void (*DirLightDiffuseAndSpecular)(Vector unit_norm, float *diffuse, float *specular) = DirLightDiffuseAndSpecular_C;
#endif

// Points DirLightDiffuse and DirLightDiffuseAndSpecular at the
// implementations to use from now on.
//   cpu_caps - CPU feature flags; only b_CMOV_support is read, and only in
//              builds that define USE_ASM
// Without USE_ASM the C routines are always installed.  With USE_ASM the
// assembly routines are installed when b_CMOV_support is set and the C
// routines are the fallback.
void InitializeLighting(const CPUcaps *cpu_caps)
{
#ifndef USE_ASM
	// portable build: there is nothing to choose between
	DirLightDiffuse = DirLightDiffuse_C;
	DirLightDiffuseAndSpecular = DirLightDiffuseAndSpecular_C;
#else
	if (!cpu_caps->b_CMOV_support)
	{
		// the assembly needs fcomi/fcmov, so fall back to plain C
		DirLightDiffuse = DirLightDiffuse_C;
		DirLightDiffuseAndSpecular = DirLightDiffuseAndSpecular_C;
	}
	else
	{
		DirLightDiffuse = DirLightDiffuse_ASM;
		DirLightDiffuseAndSpecular = DirLightDiffuseAndSpecular_ASM;
	}
#endif // !USE_ASM
}

// Sets the ambient term (gLightAmbient) that the directional lighting
// routines add to the diffuse factor.
//   ambient - new ambient value; stored as given, no range check is done here
void SetLightAmbient(float ambient)
{
	gLightAmbient = ambient;
}

// Assembly version of the directional-light diffuse factor.
//
// this only works on PPro and PII because of the fcomi and fcmovb/fcmovnb
// instructions which aren't available on Pentium or AMD K6
// (InitializeLighting() only installs it when the CMOV support flag is set)
//
// Computes min(1.0f, gLightAmbient + max(0.0f, gXformedLightDir . unit_norm))
// using the X, Y and Z components only.  The function is naked (no prologue
// or epilogue): the unit_norm pointer is read from [esp + 4] and the result
// is left on top of the FPU stack when the function returns.
//
// The indented comments after each instruction list the FPU stack from st(0)
// downwards; the numbers in parentheses appear to be the cycles remaining
// before that value is ready (the author's scheduling notes).
#ifdef USE_ASM
__declspec(naked) float DirLightDiffuse_ASM(Vector unit_norm)
{
	__asm
	{
		mov			eax, [esp + 4] // eax = unit_norm

		fld			dword ptr [gXformedLightDir]
		// dir[X]
		fmul		dword ptr [eax]
		// dir[X]*norm[X] (2)
		fld			dword ptr [gXformedLightDir + 4]
		// dir[Y]
		// dir[X]*norm[X] (1)
		fmul		dword ptr [eax + 4]
		// dir[Y]*norm[Y] (2)
		// dir[X]*norm[X] (0)
		fld			dword ptr [gXformedLightDir + 8]
		// dir[Z]
		// dir[Y]*norm[Y] (1)
		// dir[X]*norm[X] (0)
		fmul		dword ptr [eax + 8]
		// dir[Z]*norm[Z] (2)
		// dir[Y]*norm[Y] (0)
		// dir[X]*norm[X] (0)
		fxch		st(2)
		// dir[X]*norm[X] (0)
		// dir[Y]*norm[Y] (0)
		// dir[Z]*norm[Z] (2)
		faddp		st(1), st
		// dir[X]*norm[X] + dir[Y]*norm[Y] (2)
		// dir[Z]*norm[Z] (1)

		// ******** stall 2 cycles ********
		faddp		st(1), st
		// dot (2)
		fld			dword ptr [ZERO]
		// 0.0f
		// dot (1)

		// ******** stall 1 cycle ********
		// st = (0.0f < dot) ? dot : 0.0f, i.e. max(0.0f, dot)
		fcomi		st, st(1)
		fcmovb	st, st(1)

		// diffuse
		// dot
		fxch		st(1)
		// dot
		// diffuse
		fstp		st
		// diffuse

		fadd		dword ptr [gLightAmbient]
		// ambient + diffuse (2)
		fld			dword ptr [ONE]
		// 1.0f
		// ambient + diffuse (1)

		// ******** stall 1 cycle ********
		// st = (1.0f < sum) ? 1.0f : sum, i.e. min(1.0f, ambient + diffuse)
		fcomi		st, st(1)
		fcmovnb	st, st(1)
		// clamped diffuse
		// ambient + diffuse
		fxch		st(1)
		// ambient + diffuse
		// clamped diffuse
		fstp		st
		// clamped diffuse

		ret
	}
}
#endif

// Portable C version of the directional-light diffuse factor.
//   unit_norm - surface normal (expected to be unit length; not checked here)
// Returns min(1.0f, gLightAmbient + max(0.0f, dot)) where dot is
// DotProduct(gXformedLightDir, unit_norm) -- the same expression that
// DirLightDiffuse_ASM evaluates, so both implementations behind the
// DirLightDiffuse pointer agree.
//
// The comparisons are made on the float values.  The earlier code compared
// IEEE bit patterns through (int *) casts, which violates strict aliasing,
// depends on an arithmetic right shift of a negative int, and overflows a
// signed int (ending up returning 1.0f) when dot + ambient is in [-1, 0).
float DirLightDiffuse_C(Vector unit_norm)
{
	float diffuse;

	// calculate the diffuse factor
	diffuse = DotProduct(gXformedLightDir, unit_norm);

	// a normal facing away from the light contributes nothing
	if (diffuse < 0.0f)
	{
		diffuse = 0.0f;
	}

	// add the ambient term and keep the result at or below full intensity
	diffuse += gLightAmbient;
	if (diffuse > 1.0f)
	{
		diffuse = 1.0f;
	}

	return diffuse;
}

// Assembly version of the directional-light diffuse and specular factors.
//
// Uses fcomi/fcomip/fcmovb/fcmovnb, so it needs the same CPU support as
// DirLightDiffuse_ASM.  The function is naked: arguments are read straight
// from the stack ([esp + 4] = unit_norm, [esp + 8] = diffuse,
// [esp + 12] = specular) and nothing is returned in a register.
//
// Steps:
//   1. dot = gXformedLightDir . unit_norm (X, Y, Z only)
//   2. *diffuse = min(1.0f, gLightAmbient + max(0.0f, dot))
//   3. refl = 2*dot*unit_norm - gXformedLightDir
//   4. s = gXformedCameraDir . refl; if s is not greater than 0.0f,
//      *specular = 0 and the function returns
//   5. otherwise *specular = (s * approx(1/|refl|))^8, using the integer
//      "magic constant" inverse square root with one refinement step
//
// The indented comments after each instruction list the FPU stack from st(0)
// downwards; the numbers in parentheses appear to be the cycles remaining
// before that value is ready (the author's scheduling notes).
#ifdef USE_ASM
__declspec(naked) void DirLightDiffuseAndSpecular_ASM(Vector unit_norm, float *diffuse, float *specular)
{
	__asm
	{
		mov			eax, [esp + 4] // eax = unit_norm
		mov			ecx, [esp + 8] // ecx = diffuse

		fld			dword ptr [gXformedLightDir]
		// dir[X]
		fmul		dword ptr [eax]
		// dir[X]*norm[X] (2)
		fld			dword ptr [gXformedLightDir + 4]
		// dir[Y]
		// dir[X]*norm[X] (1)
		fmul		dword ptr [eax + 4]
		// dir[Y]*norm[Y] (2)
		// dir[X]*norm[X] (0)
		fld			dword ptr [gXformedLightDir + 8]
		// dir[Z]
		// dir[Y]*norm[Y] (1)
		// dir[X]*norm[X] (0)
		fmul		dword ptr [eax + 8]
		// dir[Z]*norm[Z] (2)
		// dir[Y]*norm[Y] (0)
		// dir[X]*norm[X] (0)
		fxch		st(2)
		// dir[X]*norm[X] (0)
		// dir[Y]*norm[Y] (0)
		// dir[Z]*norm[Z] (2)
		faddp		st(1), st
		// dir[X]*norm[X] + dir[Y]*norm[Y] (2)
		// dir[Z]*norm[Z] (1)

		// ******** stall 2 cycles ********
		faddp		st(1), st
		// dot (2)
		fld			dword ptr [ZERO]
		// 0.0f
		// dot (1)

		// ******** stall 1 cycle ********
		// st = max(0.0f, dot); the unclamped dot stays in st(1) because the
		// reflection vector below is built from it
		fcomi		st, st(1)
		fcmovb	st, st(1)
		// diffuse
		// dot


		fadd		dword ptr [gLightAmbient]
		// ambient + diffuse (2)
		// dot
		fld			dword ptr [ONE]
		// 1.0f
		// ambient + diffuse (1)
		// dot

		// ******** stall 1 cycle ********
		// st = min(1.0f, ambient + diffuse)
		fcomi		st, st(1)
		fcmovnb	st, st(1)
		// clamped diffuse
		// ambient + diffuse
		// dot
		fxch		st(1)
		// ambient + diffuse
		// clamped diffuse
		// dot
		fstp		st
		// clamped diffuse
		// dot


		fxch		st(1)
		// dot
		// diffuse
		fadd		st, st
		// 2*dot = dot+dot (2)
		// diffuse
		fld			dword ptr [eax]
		// norm[X]
		// 2*dot (1)
		// diffuse
		// ******** stall 1 cycle ********
		fmul		st, st(1)
		// 2*dot*norm[X] (2)
		// 2*dot (0)
		// diffuse
		fld			dword ptr [eax + 4]
		// norm[Y]
		// 2*dot*norm[X] (1)
		// 2*dot (0)
		// diffuse
		fmul		st, st(2)
		// 2*dot*norm[Y] (2)
		// 2*dot*norm[X] (0)
		// 2*dot (0)
		// diffuse
		fxch		st(1)
		// 2*dot*norm[X] (0)
		// 2*dot*norm[Y] (2)
		// 2*dot (0)
		// diffuse
		fsub		dword ptr [gXformedLightDir]
		// refl[X] = 2*dot*norm[X] - gXformedLightDir[X] (2)
		// 2*dot*norm[Y] (1)
		// 2*dot (0)
		// diffuse
		fxch		st(2)
		// 2*dot (0)
		// 2*dot*norm[Y] (1)
		// refl[X] (2)
		// diffuse
		fmul		dword ptr [eax + 8]
		// 2*dot*norm[Z] (2)
		// 2*dot*norm[Y] (0)
		// refl[X] (1)
		// diffuse
		fxch		st(1)
		// 2*dot*norm[Y] (0)
		// 2*dot*norm[Z] (2)
		// refl[X] (1)
		// diffuse
		fsub		dword ptr [gXformedLightDir + 4]
		// refl[Y] = 2*dot*norm[Y] - gXformedLightDir[Y] (2)
		// 2*dot*norm[Z] (1)
		// refl[X] (0)
		// diffuse
		fld			dword ptr [gXformedCameraDir]
		// cam[X]
		// refl[Y] (1)
		// 2*dot*norm[Z] (0)
		// refl[X] (0)
		// diffuse
		fmul		st, st(3)
		// refl[X]*cam[X] (2)
		// refl[Y] (0)
		// 2*dot*norm[Z] (0)
		// refl[X] (0)
		// diffuse
		fxch		st(2)
		// 2*dot*norm[Z] (0)
		// refl[Y] (0)
		// refl[X]*cam[X] (2)
		// refl[X] (0)
		// diffuse
		fsub		dword ptr [gXformedLightDir + 8]
		// refl[Z] = 2*dot*norm[Z] - gXformedLightDir[Z] (2)
		// refl[Y] (0)
		// refl[X]*cam[X] (1)
		// refl[X] (0)
		// diffuse
		fxch		st(2)
		// refl[X]*cam[X] (1)
		// refl[Y] (0)
		// refl[Z] (2)
		// refl[X] (0)
		// diffuse
		fld			dword ptr [gXformedCameraDir + 4]
		// cam[Y]
		// refl[X]*cam[X] (0)
		// refl[Y] (0)
		// refl[Z] (1)
		// refl[X] (0)
		// diffuse
		fmul		st, st(2)
		// refl[Y]*cam[Y] (2)
		// refl[X]*cam[X] (0)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// diffuse
		fld			dword ptr [gXformedCameraDir + 8]
		// cam[Z]
		// refl[Y]*cam[Y] (1)
		// refl[X]*cam[X] (0)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// diffuse
		fmul		st, st(4)
		// refl[Z]*cam[Z] (2)
		// refl[Y]*cam[Y] (0)
		// refl[X]*cam[X] (0)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// diffuse
		fxch		st(2)
		// refl[X]*cam[X] (0)
		// refl[Y]*cam[Y] (0)
		// refl[Z]*cam[Z] (2)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// diffuse
		faddp		st(1), st
		// refl[X]*cam[X] + refl[Y]*cam[Y] (2)
		// refl[Z]*cam[Z] (1)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// diffuse
		fxch		st(5)
		// diffuse
		// refl[Z]*cam[Z] (1)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// refl[X]*cam[X] + refl[Y]*cam[Y] (2)
		// write the clamped diffuse factor to *diffuse and pop it
		fstp		dword ptr [ecx]
		// refl[Z]*cam[Z] (0)
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// refl[X]*cam[X] + refl[Y]*cam[Y] (0)
		faddp		st(4), st
		// refl[Y] (0)
		// refl[Z] (0)
		// refl[X] (0)
		// dot = refl[X]*cam[X] + refl[Y]*cam[Y] + refl[Z]*cam[Z] (2)
		fmul		st, st
		// refl[Y]*refl[Y] (2)
		// refl[Z] (0)
		// refl[X] (0)
		// dot = refl[X]*cam[X] + refl[Y]*cam[Y] + refl[Z]*cam[Z] (1)
		fxch		st(2)
		// refl[X] (0)
		// refl[Z] (0)
		// refl[Y]*refl[Y] (2)
		// dot = refl[X]*cam[X] + refl[Y]*cam[Y] + refl[Z]*cam[Z] (1)
		fld			dword ptr [ZERO]
		// 0.0f
		// refl[X] (0)
		// refl[Z] (0)
		// refl[Y]*refl[Y] (1)
		// dot = refl[X]*cam[X] + refl[Y]*cam[Y] + refl[Z]*cam[Z] (0)

		// compare 0.0f with the specular dot product and pop the 0.0f;
		// "below" means 0.0f < dot
		fcomip	st, st(4)

		// unit_norm is not read again, so eax is reused for the output pointer
		mov			eax, [esp + 12] // eax = specular
		jb			CALC_SPECULAR

		// free the stack
		// (four values remain; each fcompp pops two of them)
		fcompp
		fcompp

		mov			dword ptr [eax], 0
		jmp			END

CALC_SPECULAR:
		// refl[X] (0)
		// refl[Z] (0)
		// refl[Y]*refl[Y] (0)
		// dot
		fmul		st, st
		// refl[X]*refl[X] (2)
		// refl[Z] (0)
		// refl[Y]*refl[Y] (0)
		// dot
		fxch		st(1)
		// refl[Z] (0)
		// refl[X]*refl[X] (2)
		// refl[Y]*refl[Y] (0)
		// dot
		// ******** stall 1 cycle (consecutive fmuls) ********
		fmul		st, st
		// refl[Z]*refl[Z] (2)
		// refl[X]*refl[X] (0)
		// refl[Y]*refl[Y] (0)
		// dot
		fxch		st(1)
		// refl[X]*refl[X] (0)
		// refl[Z]*refl[Z] (2)
		// refl[Y]*refl[Y] (0)
		// dot
		faddp		st(2), st
		// refl[Z]*refl[Z] (1)
		// refl[X]*refl[X] + refl[Y]*refl[Y] (2)
		// dot
		// ******** stall 2 cycles ********
		faddp		st(1), st
		// mag_sqr = refl[X]*refl[X] + refl[Y]*refl[Y] + refl[Z]*refl[Z] (2)
		// dot
		// calculate the approximate inverse squareroot as follows:
		//	float fsqrt_inv(float f)
		//	{
		//		long i;
		//		float x2, y;
		//		x2 = 0.5f*f;
		//		i = *(long *)&f;
		//		i = 0x5f3759df - (i>>1);
		//		y = *(float *)&i;
		//		// repeat this iteration for more accuracy
		//		y = 1.5f*y - (x2*y * y*y);
		//		return y;
		//	}
		// ******** stall 3 cycles ********
		// the stack slot of the unit_norm argument ([esp + 4]) is reused as
		// scratch memory to move the value between the FPU and integer registers
		fst			dword ptr [esp + 4]
		fmul		dword ptr [ONE_HALF]
		// x2 = 0.5f*mag_sqr (2)
		// dot

		// ecx (the diffuse pointer) is free again: *diffuse was stored above
		mov			ecx, 0x5f3759df
		mov			edx, [esp + 4]

		// logical shift; mag_sqr is a sum of squares, so its sign bit is clear
		shr			edx, 1

		sub			ecx, edx

		// [esp + 4] now holds y, the initial estimate
		mov			[esp + 4], ecx

		fmul		dword ptr [esp + 4]
		// x2*y (2)
		// dot
		fld			dword ptr [esp + 4]
		// y
		// x2*y (1)
		// dot
		fmul		st, st
		// y*y (2)
		// x2*y (0)
		// dot
		fld			dword ptr [THREE_HALVES]
		// 1.5f
		// y*y (1)
		// x2*y (0)
		// dot
		fmul		dword ptr [esp + 4]
		// 1.5f*y (2)
		// y*y (0)
		// x2*y (0)
		// dot
		fxch		st(2)
		// x2*y (0)
		// y*y (0)
		// 1.5f*y (2)
		// dot
		// ******** stall 1 cycle (consecutive fmuls) ********
		fmulp		st(1), st
		// x2*y * y*y (0)
		// 1.5f*y (1)
		// dot
		// ******** stall 2 cycles ********
		fsubp		st(1), st
		// y = 1.5f*y - (x2*y * y*y) (2)
		// dot
		// ******** stall 2 cycles ********
		fmulp		st(1), st
		// specular = dot*mag_inv (2)
		// ******** stall 2 cycles ********
		fmul		st, st
		// specular^2 (2)
		// ******** stall 2 cycles ********
		fmul		st, st
		// specular^4 (2)
		// ******** stall 2 cycles ********
		fmul		st, st
		// specular^8 (2)
		// ******** stall 3 cycles ********
		// write the result to *specular and leave the FPU stack empty
		fstp		dword ptr [eax]

END:

		ret
	}
}
#endif

// Portable C version of the directional-light diffuse and specular factors.
//   unit_norm - surface normal (expected to be unit length; not checked here)
//   diffuse   - out: min(1.0f, gLightAmbient + max(0.0f, N.L)), where N.L is
//               DotProduct(gXformedLightDir, unit_norm)
//   specular  - out: 0.0f when the reflected light points away from the
//               camera, otherwise (R.V / |R|)^8 with
//               R = 2*(N.L)*unit_norm - gXformedLightDir and
//               V = gXformedCameraDir; 1/|R| comes from fsqrt_inv()
//
// All comparisons are made on float values.  The earlier code inspected the
// IEEE bit patterns through (int *) casts, which violates strict aliasing,
// depends on an arithmetic right shift of a negative int, and can overflow a
// signed int.  The results follow what DirLightDiffuseAndSpecular_ASM computes.
void DirLightDiffuseAndSpecular_C(Vector unit_norm, float *diffuse, float *specular)
{
	Vector reflection;
	float n_dot_l, factor, diff, spec;

	// calculate the diffuse factor
	n_dot_l = DotProduct(gXformedLightDir, unit_norm);

	// find the reflection of the light about the normal
	// (built from the unclamped dot product)
	factor = n_dot_l + n_dot_l;
	reflection[X] = factor*unit_norm[X] - gXformedLightDir[X];
	reflection[Y] = factor*unit_norm[Y] - gXformedLightDir[Y];
	reflection[Z] = factor*unit_norm[Z] - gXformedLightDir[Z];

	// calculate the specular factor
	spec = DotProduct(gXformedCameraDir, reflection);

	// diffuse: drop the negative part, add ambient, limit to full intensity
	diff = (n_dot_l > 0.0f) ? n_dot_l : 0.0f;
	diff += gLightAmbient;
	if (diff > 1.0f)
	{
		diff = 1.0f;
	}

	if (spec > 0.0f)
	{
		// divide by the length of the reflection vector, then raise to the 8th power
		spec *= fsqrt_inv(reflection[X]*reflection[X] + reflection[Y]*reflection[Y] + reflection[Z]*reflection[Z]);
		spec *= spec;
		spec *= spec;
		spec *= spec; // exponential ^8
	}
	else
	{
		spec = 0.0f;
	}

	*diffuse = diff;
	*specular = spec;
}

// Updates gXformedLightDir and gXformedCameraDir for an object that has been
// rotated by rot_mat.
//   rot_mat    - rotation matrix applied to the object
//   light_dir  - light direction to transform
//   camera_dir - camera direction to transform
// Both directions are multiplied by the transpose of rot_mat (the inverse of
// a rotation matrix is just its transpose) and the results are normalized.
void DirLightXforms(Matrix rot_mat, Vector light_dir, Vector camera_dir)
{
	Matrix inv_rot;

	// inverse rotation
	TransposeMatrix(inv_rot, rot_mat);

	// rotate both directions
	MatMultVec3x4_3(gXformedLightDir, inv_rot, light_dir);
	MatMultVec3x4_3(gXformedCameraDir, inv_rot, camera_dir);

	// the lighting routines work with normalized directions
	Normalize(gXformedLightDir);
	Normalize(gXformedCameraDir);
}

// Diffuse factor of the point light at gXformedLightPos for one surface point.
//   point     - position being lit
//   unit_norm - surface normal at that position (expected to be unit length;
//               not checked here)
// Returns 0.0f when the normal does not face the light.  Otherwise returns
// the dot product of the approximately normalized direction to the light and
// the normal, scaled by CLAMP(FALL_OFF_DIST_FACTOR / distance^2, 0.0f, 1.0f)
// and limited to at most 1.0f with MIN.
//
// The facing test compares the float value; the earlier code tested the sign
// bit through an (int *) cast, which violates strict aliasing.
float PointLightDiffuse(Vector point, Vector unit_norm)
{
	Vector dir;
	float diffuse, dist_sqr, factor;

	// direction from the point towards the light
	dir[X] = gXformedLightPos[X] - point[X];
	dir[Y] = gXformedLightPos[Y] - point[Y];
	dir[Z] = gXformedLightPos[Z] - point[Z];
	FastApproxNormalize(dir);

	diffuse = DotProduct(dir, unit_norm);

	// written as !(x > 0) so that a NaN also ends up unlit
	if (!(diffuse > 0.0f))
	{
		return 0.0f;
	}

	// distance attenuation
	dist_sqr = SQR(gXformedLightPos[X] - point[X]) + SQR(gXformedLightPos[Y] - point[Y]) + SQR(gXformedLightPos[Z] - point[Z]);
	factor = CLAMP(/*SQR*/(FALL_OFF_DIST_FACTOR/dist_sqr), 0.0f, 1.0f);
	diffuse *= factor;

	return MIN(diffuse, 1.0f);
}

// Diffuse and specular factors of the point light at gXformedLightPos for
// one surface point.
//   point     - position being lit
//   unit_norm - surface normal at that position (expected to be unit length;
//               not checked here)
//   diffuse   - out: 0.0f when the normal faces away from the light,
//               otherwise N.L scaled by the distance attenuation and limited
//               to at most 1.0f
//   specular  - out: 0.0f when the reflected light points away from the
//               camera, otherwise (R.V / |R|)^8 scaled by the distance
//               attenuation (which is 0.0f when the normal faces away)
// The distance attenuation is
// CLAMP(FALL_OFF_DIST_FACTOR / distance^2, 0.0f, 1.0f).
//
// The sign tests compare float values; the earlier code tested sign bits
// through (int *) casts, which violates strict aliasing.
void PointLightDiffuseAndSpecular(Vector point, Vector unit_norm, float *diffuse, float *specular)
{
	Vector dir, reflection;
	float n_dot_l, factor, dist_sqr, dist_factor, diff, spec;

	// direction from the point towards the light
	dir[X] = gXformedLightPos[X] - point[X];
	dir[Y] = gXformedLightPos[Y] - point[Y];
	dir[Z] = gXformedLightPos[Z] - point[Z];
	FastApproxNormalize(dir);

	n_dot_l = DotProduct(dir, unit_norm);

	// find the reflection of the light about the normal
	factor = n_dot_l + n_dot_l;
	reflection[X] = factor*unit_norm[X] - dir[X];
	reflection[Y] = factor*unit_norm[Y] - dir[Y];
	reflection[Z] = factor*unit_norm[Z] - dir[Z];

	// calculate the specular factor
	spec = DotProduct(gXformedCameraDir, reflection);

	if (n_dot_l < 0.0f)
	{
		// facing away: no diffuse light, and the zero attenuation also
		// removes any specular contribution below
		diff = 0.0f;
		dist_factor = 0.0f;
	}
	else
	{
		dist_sqr = SQR(gXformedLightPos[X] - point[X]) + SQR(gXformedLightPos[Y] - point[Y]) + SQR(gXformedLightPos[Z] - point[Z]);
		dist_factor = CLAMP(/*SQR*/(FALL_OFF_DIST_FACTOR/dist_sqr), 0.0f, 1.0f);
		diff = n_dot_l;
		diff *= dist_factor;

		diff = MIN(diff, 1.0f);
	}

	if (spec > 0.0f)
	{
		// divide by the length of the reflection vector, then raise to the 8th power
		spec *= fsqrt_inv(reflection[X]*reflection[X] + reflection[Y]*reflection[Y] + reflection[Z]*reflection[Z]);
		spec *= spec;
		spec *= spec;
		spec *= spec; // exponential ^8
		spec *= dist_factor;
	}
	else
	{
		spec = 0.0f;
	}

	*diffuse = diff;
	*specular = spec;
}

// Updates gXformedLightPos, the point light position used by the
// PointLight* routines and LightDiffuse().
//   mat       - matrix whose inverse is applied to the light position
//   light_pos - light position to transform
// The inverse is computed with InvertMatrix3x4() and applied with
// MatMultVec3x4_4().
void PointLightXforms(Matrix mat, Vector light_pos)
{
	Matrix inv_mat;

	InvertMatrix3x4(inv_mat, mat);
	MatMultVec3x4_4(gXformedLightPos, inv_mat, light_pos);
}

// Combined diffuse factor of the directional light and the point light.
//   point     - position being lit
//   unit_norm - surface normal at that position (expected to be unit length;
//               not checked here)
// Returns the non-negative part of DotProduct(gXformedLightDir, unit_norm),
// plus -- when the normal faces the point light -- the point-light term
// (dir . unit_norm) * fsqrt_inv(|dir|^2) * CLAMP(1.0f / |dir|^2, 0.0f, 1.0f),
// where dir runs from point to gXformedLightPos.  The sum is limited to at
// most 1.0f with MIN.  No ambient term is added here.
// NOTE(review): the attenuation numerator here is 1.0f, not
// FALL_OFF_DIST_FACTOR as in the PointLight* routines -- confirm this is
// intentional.
//
// The sign tests compare float values (the earlier code used (int *) casts,
// which violates strict aliasing), and the squared distance, previously
// computed twice from the same differences, is computed once.
float LightDiffuse(Vector point, Vector unit_norm)
{
	Vector dir;
	float diffuse, dist_sqr, factor;

	// calculate the directional diffuse factor
	diffuse = DotProduct(gXformedLightDir, unit_norm);

	// written as !(x > 0) so that a NaN is also discarded
	if (!(diffuse > 0.0f))
	{
		diffuse = 0.0f;
	}

	// unnormalized direction from the point towards the point light
	dir[X] = gXformedLightPos[X] - point[X];
	dir[Y] = gXformedLightPos[Y] - point[Y];
	dir[Z] = gXformedLightPos[Z] - point[Z];

	factor = DotProduct(dir, unit_norm);

	if (factor > 0.0f)
	{
		dist_sqr = dir[X]*dir[X] + dir[Y]*dir[Y] + dir[Z]*dir[Z];
		// fsqrt_inv() turns the unnormalized dot product into a cosine;
		// the CLAMP term is the distance attenuation
		diffuse += fsqrt_inv(dist_sqr)*factor*CLAMP(/*SQR*/(1.0f/dist_sqr), 0.0f, 1.0f);
	}

	return MIN(diffuse, 1.0f);
}
