#include <core/defines.h>
#include <gfx/xbox/nx/render.h>
#include "gfx/xbox/p_nxparticleflat.h"

extern DWORD PixelShader0;

namespace Nx
{

/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
CXboxParticleFlat::CXboxParticleFlat()
{
	// The destructor unconditionally deletes these three pointers, so a default-constructed
	// instance must start with them null (deleting a null pointer is a no-op). Leaving them
	// uninitialized would make destruction undefined behaviour.
	mp_particle_array	= NULL;
	mp_vertices			= NULL;
	mp_engine_particle	= NULL;

	// No storage has been allocated, so report no capacity and no live particles. This also
	// keeps plat_render() from touching the null pointers above.
	m_max_particles		= 0;
	m_num_particles		= 0;
}



/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
CXboxParticleFlat::CXboxParticleFlat( uint32 checksum, int max_particles, uint32 texture_checksum, uint32 blendmode_checksum, int fix, int num_segments, float split )
{
	// Record identity and capacity; the system starts with no live particles.
	// (num_segments and split are accepted but not used by this constructor.)
	m_checksum			= checksum;
	m_max_particles		= max_particles;
	m_num_particles		= 0;

	// One CParticleEntry per potential particle.
	mp_particle_array	= new CParticleEntry[max_particles];

	// Position storage: three floats (x, y, z) per potential particle.
	mp_vertices			= new float[max_particles * 3];

	// Engine-side particle system of the flat type.
	mp_engine_particle	= new NxXbox::sParticleSystem( max_particles,
													   NxXbox::PARTICLE_TYPE_FLAT,
													   texture_checksum,
													   blendmode_checksum,
													   fix );

	// Start, mid and end colors all default to mid-grey at full alpha.
	m_start_color.b		= 128;
	m_start_color.g		= 128;
	m_start_color.r		= 128;
	m_start_color.a		= 255;

	m_mid_color.b		= 128;
	m_mid_color.g		= 128;
	m_mid_color.r		= 128;
	m_mid_color.a		= 255;

	m_end_color.b		= 128;
	m_end_color.g		= 128;
	m_end_color.r		= 128;
	m_end_color.a		= 255;

	// A negative mid time means "no mid color"; plat_render() only uses the mid color
	// when this value is >= 0.
	m_mid_time			= -1.0f;
}



/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
CXboxParticleFlat::~CXboxParticleFlat()
{
	// Array storage first: per-particle entries, then the packed x/y/z positions.
	delete[] mp_particle_array;
	delete[] mp_vertices;

	// Finally the engine-side particle system object.
	delete mp_engine_particle;
}



/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
void CXboxParticleFlat::plat_get_position( int entry, int list, float * x, float * y, float * z )
{
	// Positions are packed as consecutive x, y, z triples, so particle 'entry' starts at
	// float index entry * 3. The 'list' argument is not used here.
	const int base = entry * 3;

	*x = mp_vertices[base];
	*y = mp_vertices[base + 1];
	*z = mp_vertices[base + 2];
}



/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
void CXboxParticleFlat::plat_set_position( int entry, int list, float x, float y, float z )
{
	// Overwrite the packed x, y, z triple belonging to particle 'entry'.
	// The 'list' argument is not used here.
	const int base = entry * 3;

	mp_vertices[base]		= x;
	mp_vertices[base + 1]	= y;
	mp_vertices[base + 2]	= z;
}



/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
void CXboxParticleFlat::plat_add_position( int entry, int list, float x, float y, float z )
{
	// Offset the packed x, y, z triple belonging to particle 'entry' by the given delta.
	// The 'list' argument is not used here.
	const int base = entry * 3;

	mp_vertices[base]		+= x;
	mp_vertices[base + 1]	+= y;
	mp_vertices[base + 2]	+= z;
}



/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
int CXboxParticleFlat::plat_get_num_particle_colors( void )
{
	// This implementation always reports a single particle color.
	return 1;
}
int CXboxParticleFlat::plat_get_num_vertex_lists( void )
{
	// This implementation always reports a single vertex list.
	return 1;
}

// Note these are r/b reversed for direct uploading to Xbox GPU.
// Start color setters. 'entry' is ignored: each call writes the single m_start_color member.
// The red setter writes .b and the blue setter writes .r (channels are stored r/b swapped).
void CXboxParticleFlat::plat_set_sr( int entry, uint8 value )
{
	m_start_color.b = value;
}

void CXboxParticleFlat::plat_set_sg( int entry, uint8 value )
{
	m_start_color.g = value;
}

void CXboxParticleFlat::plat_set_sb( int entry, uint8 value )
{
	m_start_color.r = value;
}

void CXboxParticleFlat::plat_set_sa( int entry, uint8 value )
{
	m_start_color.a = value;
}
// Mid color setters. 'entry' is ignored: each call writes the single m_mid_color member.
// The red setter writes .b and the blue setter writes .r (channels are stored r/b swapped).
void CXboxParticleFlat::plat_set_mr( int entry, uint8 value )
{
	m_mid_color.b = value;
}

void CXboxParticleFlat::plat_set_mg( int entry, uint8 value )
{
	m_mid_color.g = value;
}

void CXboxParticleFlat::plat_set_mb( int entry, uint8 value )
{
	m_mid_color.r = value;
}

void CXboxParticleFlat::plat_set_ma( int entry, uint8 value )
{
	m_mid_color.a = value;
}
// End color setters. 'entry' is ignored: each call writes the single m_end_color member.
// The red setter writes .b and the blue setter writes .r (channels are stored r/b swapped).
void CXboxParticleFlat::plat_set_er( int entry, uint8 value )
{
	m_end_color.b = value;
}

void CXboxParticleFlat::plat_set_eg( int entry, uint8 value )
{
	m_end_color.g = value;
}

void CXboxParticleFlat::plat_set_eb( int entry, uint8 value )
{
	m_end_color.r = value;
}

void CXboxParticleFlat::plat_set_ea( int entry, uint8 value )
{
	m_end_color.a = value;
}


		



#if 1
/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
void CXboxParticleFlat::plat_render( void )
{
	// Draw the particles.
	if( m_num_particles > 0 )
	{
		// Used to figure the right and up vectors for creating screen-aligned particle quads.
		D3DXMATRIX *p_matrix = (D3DXMATRIX*)&NxXbox::EngineGlobals.view_matrix;

		// Concatenate p_matrix with the emmission angle to create the direction.
		Mth::Vector up( 0.0f, 1.0f, 0.0f );

		// Get the 'right' vector as the cross product of camera 'at and world 'up'.
		Mth::Vector at( p_matrix->m[0][2], p_matrix->m[1][2], p_matrix->m[2][2] );
		Mth::Vector screen_right	= Mth::CrossProduct( at, up );
		Mth::Vector screen_up		= Mth::CrossProduct( screen_right, at );

		screen_right.Normalize();
		screen_up.Normalize();

		int				lp;
		CParticleEntry	*p_particle;
		float			*p_v;

		// Calculate space needed.
		DWORD dwords_per_particle	= 32;
		DWORD dword_count			= dwords_per_particle * m_num_particles;

		// Submit particle material.
		mp_engine_particle->mp_material->Submit();
		
		// Set up correct vertex and pixel shader.
		NxXbox::set_vertex_shader( ParticleFlatVS );
		NxXbox::set_pixel_shader( PixelShader0 );
		
		// Load up the combined world->view_projection matrix.
		XGMATRIX	temp_matrix;
		XGMATRIX	dest_matrix;
		XGMATRIX	projMatrix;
		XGMATRIX	viewMatrix;
		XGMATRIX	worldMatrix;
		
		// Projection matrix.
		XGMatrixTranspose( &projMatrix, &NxXbox::EngineGlobals.projection_matrix );
	
		// View matrix.
		XGMatrixTranspose( &viewMatrix, &NxXbox::EngineGlobals.view_matrix );
		viewMatrix.m[3][0] = 0.0f;
		viewMatrix.m[3][1] = 0.0f;
		viewMatrix.m[3][2] = 0.0f;
		viewMatrix.m[3][3] = 1.0f;

		// World space transformation matrix, set to be a translation matrix corresponding to the emitter position.
		XGMatrixTranslation( &worldMatrix, m_pos[0], m_pos[1], m_pos[2] );
		XGMatrixTranspose( &worldMatrix, &worldMatrix );

		// Calculate composite world->view->projection matrix.
		XGMatrixMultiply( &temp_matrix, &viewMatrix, &worldMatrix );
		XGMatrixMultiply( &dest_matrix, &projMatrix, &temp_matrix );

		// Load up the combined world, camera & projection matrix.
		D3DDevice_SetVertexShaderConstantFast( 0, (void*)&dest_matrix, 4 );

		float vector_upload[8];
		vector_upload[0]	= screen_right[X];
		vector_upload[1]	= screen_right[Y];
		vector_upload[2]	= screen_right[Z];
		vector_upload[4]	= screen_up[X];
		vector_upload[5]	= screen_up[Y];
		vector_upload[6]	= screen_up[Z];
		D3DDevice_SetVertexShaderConstantFast( 4, (void*)( &vector_upload[0] ), 2 );

		static float vconstants[32]	= {	0.0f,  0.0f, 1.0f, 1.0f,		// Vert tex coords in C8 through C11
										1.0f,  0.0f, 1.0f, 1.0f,
										1.0f,  1.0f, 1.0f, 1.0f,
										0.0f,  1.0f, 1.0f, 1.0f,
									   -1.0f,  1.0f, 1.0f, 1.0f,		// Vert w/h multipliers in C12 through C15
										1.0f,  1.0f, 1.0f, 1.0f,
										1.0f, -1.0f, 1.0f, 1.0f,
									   -1.0f, -1.0f, 1.0f, 1.0f };
		D3DDevice_SetVertexShaderConstantFast( 8, (void*)( &vconstants[0] ), 8 );

		// Obtain push buffer lock.
		// Note that p_push is returned as a pointer to write-combined memory. Writes to write-combined memory should be
		// consecutive and in increasing order. Reads should be avoided. Additionally, any CPU reads from memory or the
		// L2 cache can force expensive partial flushes of the 32-byte write-combine cache.
		DWORD *p_push; 
		p_push = D3DDevice_BeginPush( dword_count );

		// Set up loop variables here, since we be potentially enetering the loop more than once.
		lp			= 0;
		p_particle	= mp_particle_array;
		p_v			= mp_vertices;

		for( ; lp < m_num_particles; lp++, p_particle++, p_v += 3 )
		{
			// Calculate the interpolator ( 1.0f / particle_life ).
			float terp	= p_particle->m_time * ReciprocalEstimateNR_ASM( p_particle->m_life );

			// Separate interpolator for color.
			float col_terp;

			Mth::Vector	pos( p_v[0], p_v[1], p_v[2] );
			Image::RGBA *p_col0;
			Image::RGBA *p_col1;
		
			if( m_mid_time >= 0.0f )
			{
				if( terp < m_mid_time )
				{
					p_col0		= &m_start_color;
					p_col1		= &m_mid_color;

					// Adjust interpolation for this half of the color blend.
					col_terp	= terp / m_mid_time;
				}
				else
				{
					p_col0		= &m_mid_color;
					p_col1		= &m_end_color;

					// Adjust interpolation for this half of the color blend.
					col_terp	= ( terp - m_mid_time ) / ( 1.0f - m_mid_time );
				}
			}
			else
			{
				// No mid color specified.
				p_col0		= &m_start_color;
				p_col1		= &m_end_color;

				// Color interpoltor value is the same as the regular interpolator.
				col_terp	= terp;
			}

			// We're going to be loading constants.
			p_push[0]	= D3DPUSH_ENCODE( D3DPUSH_SET_TRANSFORM_CONSTANT_LOAD, 1 );

			// Specify the starting register (physical registers are offset by 96 from the D3D logical register).
			p_push[1]	= 96 + 16;

			// Specify the number of DWORDS to load. 12 DWORDS for 3 constants.
			p_push[2]	= D3DPUSH_ENCODE( D3DPUSH_SET_TRANSFORM_CONSTANT, 12 );

			// Load position.
			p_push[3]	= *((DWORD*)&pos[X] );
			p_push[4]	= *((DWORD*)&pos[Y] );
			p_push[5]	= *((DWORD*)&pos[Z] );

			// Load start and end width and height.
			p_push[7]	= *((DWORD*)&p_particle->m_sw );
			p_push[8]	= *((DWORD*)&p_particle->m_sh );
			p_push[9]	= *((DWORD*)&p_particle->m_ew );
			p_push[10]	= *((DWORD*)&p_particle->m_eh );

			// Load size and color interpolators.
			p_push[11]	= *((DWORD*)&terp );
			p_push[12]	= *((DWORD*)&col_terp );

			p_push		+= 15;

			p_push[0]	= D3DPUSH_ENCODE( D3DPUSH_SET_BEGIN_END, 1 );
			p_push[1]	= D3DPT_QUADLIST;
			p_push		+= 2;

			// NOTE: A maximum of 2047 DWORDs can be specified to D3DPUSH_ENCODE. If there is more than 2047 DWORDs of vertex
			// data, simply split the data into multiple D3DPUSH_ENCODE( D3DPUSH_INLINE_ARRAY ) sections.
			p_push[0]	= D3DPUSH_ENCODE( D3DPUSH_NOINCREMENT_FLAG | D3DPUSH_INLINE_ARRAY, 12 );
			++p_push;

			// Now we can start the actual vertex data.
			p_push[0]	= *((DWORD*)p_col0 );
			p_push[1]	= *((DWORD*)p_col1 );
			p_push[2]	= 0x00000000UL;

			p_push[3]	= *((DWORD*)p_col0 );
			p_push[4]	= *((DWORD*)p_col1 );
			p_push[5]	= 0x00010001UL;

			p_push[6]	= *((DWORD*)p_col0 );
			p_push[7]	= *((DWORD*)p_col1 );
			p_push[8]	= 0x00020002UL;
			
			p_push[9]	= *((DWORD*)p_col0 );
			p_push[10]	= *((DWORD*)p_col1 );
			p_push[11]	= 0x00030003UL;

			p_push		+= 12;

			// End of vertex data for this particle.
			p_push[0] = D3DPUSH_ENCODE( D3DPUSH_SET_BEGIN_END, 1 );
			p_push[1] = 0;
			p_push += 2;
		}
		D3DDevice_EndPush( p_push );
	}

	// Deal with the Ps2 specific extensions.
	if( m_emit_rate > 0.0f )
	{
		m_emit_rate_fractional += ( m_emit_rate * ( 1.0f / 60.0f ));

		if( m_emit_rate_fractional >= 1.0f )
		{
			// This should actually deal with fractional values by accumulating them.
			emit( Ftoi_ASM( m_emit_rate_fractional ));
			m_emit_rate_fractional -= (float)Ftoi_ASM( m_emit_rate_fractional );
		}
	}
}

#else
/******************************************************************/
/*                                                                */
/*                                                                */
/******************************************************************/
// Alternative plat_render that builds every quad on the CPU and submits plain
// position / diffuse / texcoord vertices through the push buffer.
// This definition sits in the #else branch of an '#if 1', so it is currently compiled out.
// Unlike the active variant it performs no emit-rate update.
void CXboxParticleFlat::plat_render( void )
{
	// Draw the particles.
	if( m_num_particles > 0 )
	{
		// Used to figure the right and up vectors for creating screen-aligned particle quads.
		D3DXMATRIX *p_matrix = (D3DXMATRIX*)&NxXbox::EngineGlobals.view_matrix;

		// World 'up' reference used to derive the billboard axes.
		Mth::Vector up( 0.0f, 1.0f, 0.0f );

		// Get the 'right' vector as the cross product of camera 'at' and world 'up'.
		Mth::Vector at( p_matrix->m[0][2], p_matrix->m[1][2], p_matrix->m[2][2] );
		Mth::Vector screen_right	= Mth::CrossProduct( at, up );
		Mth::Vector screen_up		= Mth::CrossProduct( screen_right, at );

		screen_right.Normalize();
		screen_up.Normalize();

		int				lp;
		CParticleEntry	*p_particle;
		float			*p_v;

		// Submit particle material.
		mp_engine_particle->mp_material->Submit();
		
		// Set up correct vertex and pixel shader.
		NxXbox::set_vertex_shader( D3DFVF_XYZ | D3DFVF_DIFFUSE | D3DFVF_TEX1 | D3DFVF_TEXCOORDSIZE2( 0 ));
		NxXbox::set_pixel_shader( PixelShader0 );
		
		// Each particle is four vertices of six DWORDs (x, y, z, color, u, v), as written below.
		DWORD dwords_per_particle	= 24;
		DWORD dword_count			= dwords_per_particle * m_num_particles;

		// Obtain push buffer lock.
		// The additional number (+5 is minimum) is to reserve enough overhead for the encoding parameters. It can safely be more, but no less.
		DWORD *p_push; 
		p_push = D3DDevice_BeginPush( dword_count + ( dword_count / 2047 ) + 16 );

		// Note that p_push is returned as a pointer to write-combined memory. Writes to write-combined memory should be
		// consecutive and in increasing order. Reads should be avoided. Additionally, any CPU reads from memory or the
		// L2 cache can force expensive partial flushes of the 32-byte write-combine cache.
		p_push[0]	= D3DPUSH_ENCODE( D3DPUSH_SET_BEGIN_END, 1 );
		p_push[1]	= D3DPT_QUADLIST;
		p_push		+= 2;

		// Set up loop variables here, since we may potentially enter the inner loop more than once.
		lp			= 0;
		p_particle	= mp_particle_array;
		p_v			= mp_vertices;

		// Each pass of this outer loop emits one inline-array section; dword_count tracks the
		// vertex DWORDs still to be written across all sections.
		while( dword_count > 0 )
		{
			int dwords_written = 0;

			// NOTE: A maximum of 2047 DWORDs can be specified to D3DPUSH_ENCODE. If there is more than 2047 DWORDs of vertex
			// data, simply split the data into multiple D3DPUSH_ENCODE( D3DPUSH_INLINE_ARRAY ) sections.
			// A full section is sized to the largest whole number of particles that fits in 2047 DWORDs.
			p_push[0] = D3DPUSH_ENCODE( D3DPUSH_NOINCREMENT_FLAG | D3DPUSH_INLINE_ARRAY, ( dword_count > 2047 ) ? ((int)( 2047 / dwords_per_particle )) * dwords_per_particle: dword_count );
			++p_push;
		
			for( ; lp < m_num_particles; lp++, p_particle++, p_v += 3 )
			{
				// Check to see if writing another particle will take us over the edge.
				if(( dwords_written + dwords_per_particle ) > 2047 )
				{
					break;
				}
				
				// Calculate the interpolator ( time * ( 1.0f / particle_life )) and the current width and height.
				float terp	= p_particle->m_time * ReciprocalEstimateNR_ASM( p_particle->m_life );
				float w		= p_particle->m_sw + (( p_particle->m_ew - p_particle->m_sw ) * terp );
				float h		= p_particle->m_sh + (( p_particle->m_eh - p_particle->m_sh ) * terp );

				// Todo: Move hook to matrix/emitter code to cut down on per particle calculation.
				Mth::Vector	pos( p_v[0] + m_pos[X], p_v[1] + m_pos[Y], p_v[2] + m_pos[Z] );
				Mth::Vector	ss_right, ss_up;
				Mth::Vector tmp;
		
				// Scale the billboard axes by the particle's current half-extents.
				ss_right	= screen_right * w;
				ss_up		= screen_up * h;

				Image::RGBA color;
				Image::RGBA *p_col0;
				Image::RGBA *p_col1;
		
				// From here on terp is reused as the color interpolator.
				if( m_mid_time >= 0.0f )
				{
					if( terp < m_mid_time )
					{
						p_col0 = &m_start_color;
						p_col1 = &m_mid_color;

						// Adjust interpolation for this half of the color blend.
						terp = terp / m_mid_time;
					}
					else
					{
						p_col0 = &m_mid_color;
						p_col1 = &m_end_color;

						// Adjust interpolation for this half of the color blend.
						terp = ( terp - m_mid_time ) / ( 1.0f - m_mid_time );
					}
				}
				else
				{
					// No mid color specified.
					p_col0 = &m_start_color;
					p_col1 = &m_end_color;
				}

				// NOTE(review): the post-increments have no effect; neither pointer is read again
				// before being reassigned on the next iteration.
				Image::RGBA start	= *p_col0++;
				Image::RGBA end		= *p_col1++;

				// Use fixed point math to avoid _ftol2 calls.
				int f_terp	= Ftoi_ASM( terp * 4096.0f );
				color.r		= ((((int)start.r ) * 4096 ) + (((int)end.r - (int)start.r ) * f_terp )) / 4096;
				color.g		= ((((int)start.g ) * 4096 ) + (((int)end.g - (int)start.g ) * f_terp )) / 4096;
				color.b		= ((((int)start.b ) * 4096 ) + (((int)end.b - (int)start.b ) * f_terp )) / 4096;
				color.a		= ((((int)start.a ) * 4096 ) + (((int)end.a - (int)start.a ) * f_terp )) / 4096;
		
				// Corner 0: -right, +up, texcoord ( 0, 0 ). 0x3F800000 is the bit pattern of 1.0f.
				tmp			= pos - ss_right + ss_up;
				p_push[0]	= *((DWORD*)&tmp[X] );
				p_push[1]	= *((DWORD*)&tmp[Y] );
				p_push[2]	= *((DWORD*)&tmp[Z] );
				p_push[3]	= *((DWORD*)&color );
				p_push[4]	= 0x00000000UL;
				p_push[5]	= 0x00000000UL;
	
				// Corner 1: +right, +up, texcoord ( 1, 0 ).
				tmp			= pos + ss_right + ss_up;		
				p_push[6]	= *((DWORD*)&tmp[X] );
				p_push[7]	= *((DWORD*)&tmp[Y] );
				p_push[8]	= *((DWORD*)&tmp[Z] );
				p_push[9]	= *((DWORD*)&color );
				p_push[10]	= 0x3F800000UL;
				p_push[11]	= 0x00000000UL;

				// Corner 2: +right, -up, texcoord ( 1, 1 ).
				tmp			= pos + ss_right - ss_up;		
				p_push[12]	= *((DWORD*)&tmp[X] );
				p_push[13]	= *((DWORD*)&tmp[Y] );
				p_push[14]	= *((DWORD*)&tmp[Z] );
				p_push[15]	= *((DWORD*)&color );
				p_push[16]	= 0x3F800000UL;
				p_push[17]	= 0x3F800000UL;
			
				// Corner 3: -right, -up, texcoord ( 0, 1 ).
				tmp			= pos - ss_right - ss_up;		
				p_push[18]	= *((DWORD*)&tmp[X] );
				p_push[19]	= *((DWORD*)&tmp[Y] );
				p_push[20]	= *((DWORD*)&tmp[Z] );
				p_push[21]	= *((DWORD*)&color );
				p_push[22]	= 0x00000000UL;
				p_push[23]	= 0x3F800000UL;

				p_push		+= 24;

				dwords_written	+= dwords_per_particle;
				dword_count		-= dwords_per_particle;
			}
		}

		// Close the primitive opened before the loop and release the push buffer.
		p_push[0] = D3DPUSH_ENCODE( D3DPUSH_SET_BEGIN_END, 1 );
		p_push[1] = 0;
		p_push += 2;
		D3DDevice_EndPush( p_push );
	}
}
#endif



} // Nx