#include "precompiled.h"

#include "lib/byte_order.h"
#include "lib/res/mem.h"
#include "tex_codec.h"

// NOTE: the convention is bottom-up for DDS, but there's no way to tell.

// defs modified from ddraw header


// the structures below mirror the on-disk DDS header byte for byte,
// so they are packed to 1-byte alignment until the matching pop.
#pragma pack(push, 1)

// DDPIXELFORMAT.dwFlags
// (dds_decode tags a DXT1 texture with TEX_ALPHA iff this bit is set.)
#define DDPF_ALPHAPIXELS	0x00000001	

// pixel format description embedded in DDSURFACEDESC2.
// 8 u32 fields = 32 bytes; dds_decode rejects files whose dwSize field
// differs from sizeof(DDPIXELFORMAT).
typedef struct
{
	u32 dwSize;                       // size of structure (32)
	u32 dwFlags;                      // indicates which fields are valid
	u32 dwFourCC;                     // (DDPF_FOURCC) FOURCC code, "DXTn"
	u32 dwReserved1[5];               // reserved
}
DDPIXELFORMAT;

// surface capability words embedded in DDSURFACEDESC2.
// none of the code in this file reads them; the struct only
// reserves the 16 bytes they occupy in the header.
typedef struct
{
	u32 dwCaps[4];
}
DDSCAPS2;

// DDSURFACEDESC2.dwFlags
// (each bit indicates that the corresponding header field is valid.
// dds_decode requires HEIGHT, WIDTH and PIXELFORMAT; MIPMAPCOUNT is
// optional and the mipmap count is treated as 0 if it is absent.)
#define DDSD_HEIGHT	        0x00000002	
#define DDSD_WIDTH	        0x00000004	
#define DDSD_PIXELFORMAT	0x00001000	
#define DDSD_MIPMAPCOUNT	0x00020000	

// DDS surface description; dds_decode reads it at file offset 4
// (directly after the "DDS " magic value).
// 31 u32-sized slots = 124 bytes; dds_decode rejects files whose dwSize
// field differs from sizeof(DDSURFACEDESC2).
typedef struct
{
	u32 dwSize;                       // size of structure (124)
	u32 dwFlags;                      // indicates which fields are valid
	u32 dwHeight;                     // height of main image (pixels)
	u32 dwWidth;                      // width of main image (pixels)
	u32 dwLinearSize;                 // (DDSD_LINEARSIZE): total image size
	u32 dwDepth;                      // (DDSD_DEPTH) vol. textures: vol. depth
	u32 dwMipMapCount;                // (DDSD_MIPMAPCOUNT) total # levels
	u32 dwReserved1[11];              // reserved
	DDPIXELFORMAT ddpfPixelFormat;    // pixel format description of the surface
	DDSCAPS2 ddsCaps;                 // direct draw surface capabilities
	u32 dwReserved2;                  // reserved
}
DDSURFACEDESC2;

// restore the packing that was in effect before the header structs.
#pragma pack(pop)


// pixel colors are stored as uint[4], indexed by the enumerators below.
// uint rather than u8 protects from overflow during calculations
// (e.g. the weighted sums in mix_2_3), and padding to an even size is
// a bit more efficient. the color mixing helpers only touch R, G and B;
// the A slot is read by write_pixel solely for DXT1a.
enum RGBA {	R, G, B, A };


// dst = 2/3*c0 + 1/3*c1 for the R, G and B components, rounded to
// nearest. dst[A] is left untouched.
static inline void mix_2_3(uint dst[4], uint c0[4], uint c1[4])
{
	dst[R] = (2*c0[R] + c1[R] + 1) / 3;
	dst[G] = (2*c0[G] + c1[G] + 1) / 3;
	dst[B] = (2*c0[B] + c1[B] + 1) / 3;
}

// dst = (c0 + c1) / 2 for the R, G and B components (truncating).
// dst[A] is left untouched.
static inline void mix_avg(uint dst[4], uint c0[4], uint c1[4])
{
	dst[R] = (c0[R] + c1[R]) / 2;
	dst[G] = (c0[G] + c1[G]) / 2;
	dst[B] = (c0[B] + c1[B]) / 2;
}

// treat <tbl> as a packed array of <bit_width>-bit entries (entry 0 in
// the least significant bits) and return entry number <idx>.
static inline uint access_bit_tbl(u32 tbl, uint idx, uint bit_width)
{
	const uint shift = idx*bit_width;
	const uint mask  = (1u << bit_width)-1;
	const uint entry = tbl >> shift;
	return entry & mask;
}

// 64-bit variant of access_bit_tbl: return entry <idx> of the packed
// table <tbl>, whose entries are <bit_width> bits wide (entry 0 in the
// least significant bits).
static inline uint access_bit_tbl64(u64 tbl, uint idx, uint bit_width)
{
	const uint shift = idx*bit_width;
	const uint mask  = (1u << bit_width)-1;
	// truncate to uint only after shifting the entry down.
	const uint entry = (uint)(tbl >> shift);
	return entry & mask;
}

// extract a range of bits and expand to 8 bits (by replicating
// MS bits - see http://www.mindcontrol.org/~hplus/graphics/expand-bits.html ;
// this is also the algorithm used by graphics cards when decompressing S3TC).
// used to convert 565 to 32bpp RGB.
//
// c: packed source value.
// bits_below: number of bits below (less significant than) the field.
// num_bits: width of the field; must be in [4,8] so that the filler
//   can be taken from the field itself.
// returns: the field scaled to 0..255 (all-ones maps to exactly 255).
static inline uint unpack_to_8(u16 c, uint bits_below, uint num_bits)
{
	const uint num_filler_bits = 8-num_bits;
	const uint field = bits(c, bits_below, bits_below+num_bits-1);
	// the low-order filler consists of the field's num_filler_bits most
	// significant bits, so everything below them must be shifted out.
	// (shifting by num_filler_bits instead would leave the wrong number
	// of bits: too few for 5-bit fields, overlapping ones for 6-bit.)
	const uint filler = field >> (num_bits-num_filler_bits);
	return (field << num_filler_bits) | filler;
}


// for efficiency, we precalculate as much as possible about a block
// and store it here. filled in by block_precalc (via precalc_alpha and
// precalc_color) and consumed by write_pixel.
struct S3tcBlock
{
	// the 4 color choices for each pixel (RGBA)
	// (indexed by the 2-bit selector taken from c_selectors.)
	uint c[4][4];	// c[i][RGBA_component]

	// (DXT5 only) the 8 alpha choices
	// (indexed by the 3-bit selector taken from a_bits.)
	u8 dxt5_a_tbl[8];

	// alpha block; interpretation depends on dxt.
	// DXT3: 16 4-bit alpha values. DXT5: 16 3-bit selectors into
	// dxt5_a_tbl (the two leading reference bytes are already shifted out).
	// for other formats it holds the first 8 block bytes and is not used.
	u64 a_bits;

	// table of 2-bit color selectors
	// (one per pixel; pixel 0 in the least significant bits.)
	u32 c_selectors;
};


static void precalc_alpha(int dxt, const u8* a_block, S3tcBlock* b)
{
	// read block contents
	const uint a0 = a_block[0], a1 = a_block[1];
	b->a_bits = read_le64(a_block);	// see below

	if(dxt == 5)
	{
		// skip a0,a1 bytes (data is little endian)
		b->a_bits >>= 16;

		const bool is_dxt5_special_combination = (a0 <= a1);
		u8* a = b->dxt5_a_tbl;	// shorthand
		if(is_dxt5_special_combination)
		{
			a[0] = a0;
			a[1] = a1;
			a[2] = (4*a0 + 1*a1 + 2)/5;
			a[3] = (3*a0 + 2*a1 + 2)/5;
			a[4] = (2*a0 + 3*a1 + 2)/5;
			a[5] = (1*a0 + 4*a1 + 2)/5;
			a[6] = 0;
			a[7] = 255;
		}
		else
		{
			a[0] = a0;
			a[1] = a1;
			a[2] = (6*a0 + 1*a1 + 3)/7;
			a[3] = (5*a0 + 2*a1 + 3)/7;
			a[4] = (4*a0 + 3*a1 + 3)/7;
			a[5] = (3*a0 + 4*a1 + 3)/7;
			a[6] = (2*a0 + 5*a1 + 3)/7;
			a[7] = (1*a0 + 6*a1 + 3)/7;
		}
	}
}


// read the 8-byte color block at <c_block> and build the table of
// 4 colors (b->c) that the 2-bit selectors (b->c_selectors) choose from.
//
// dxt: 1, DXT1A, 3 or 5. only the DXT1 variants have the "special
//   combination" (3 colors + black) when the first reference color
//   is <= the second.
// c_block: 2 little-endian 565 reference colors followed by a 32-bit
//   selector table.
// b: receives c[][] (all 4 components of all 4 entries) and c_selectors.
static void precalc_color(int dxt, const u8* c_block, S3tcBlock* b)
{
	// read block contents
	// .. S3TC reference colors (565 format). the color table is generated
	//    from some combination of these, depending on their ordering.
	u16 rc[2];
	for(int i = 0; i < 2; i++)
		rc[i] = read_le16(c_block + 2*i);
	// .. table of 2-bit color selectors
	b->c_selectors = read_le32(c_block+4);

	const bool is_dxt1_special_combination =
		(dxt == 1 || dxt == DXT1A) && rc[0] <= rc[1];

	// all entries are opaque unless overridden below. this must be set
	// for every entry because write_pixel reads c[selector][A] for DXT1a
	// regardless of which entry is selected; the mix_* helpers only
	// write R, G and B and thus leave it intact.
	for(int i = 0; i < 4; i++)
		b->c[i][A] = 255;

	// c0 and c1 are the values of rc[], converted to 32bpp
	for(int i = 0; i < 2; i++)
	{
		b->c[i][R] = unpack_to_8(rc[i], 11, 5);
		b->c[i][G] = unpack_to_8(rc[i],  5, 6);
		b->c[i][B] = unpack_to_8(rc[i],  0, 5);
	}

	// c2 and c3 are combinations of c0 and c1:
	if(is_dxt1_special_combination)
	{
		mix_avg(b->c[2], b->c[0], b->c[1]);			// c2 = (c0+c1)/2
		for(int i = 0; i < 3; i++) b->c[3][i] = 0;	// c3 = black
		b->c[3][A] = (dxt == DXT1A)? 0 : 255;		// (transparent iff DXT1a)
	}
	else
	{
		mix_2_3(b->c[2], b->c[0], b->c[1]);			// c2 = 2/3*c0 + 1/3*c1
		mix_2_3(b->c[3], b->c[1], b->c[0]);			// c3 = 1/3*c0 + 2/3*c1
	}
}


static void block_precalc(int dxt, const u8* block, S3tcBlock* b)
{
	// (careful, 'dxt != 1' doesn't work)
	const u8* a_block = block;
	const u8* c_block = (dxt == 3 || dxt == 5)? block+8 : block;

	precalc_alpha(dxt, a_block, b);
	precalc_color(dxt, c_block, b);
}


static void write_pixel(int dxt, uint pixel_idx, const S3tcBlock* b, u8* out)
{
	debug_assert(pixel_idx < 16);

	// pixel index -> color selector (2 bit) -> color
	const uint c_selector = access_bit_tbl(b->c_selectors, pixel_idx, 2);
	const uint* c = b->c[c_selector];
	for(int i = 0; i < 3; i++)
		out[i] = c[i];

	// if no alpha, done
	if(dxt == 1)
		return;

	uint a;
	if(dxt == 3)
	{
		// table of 4-bit alpha entries
		a = access_bit_tbl64(b->a_bits, pixel_idx, 4);
		a |= a << 4; // expand to 8 bits (replicate high into low!)
	}
	else if(dxt == 5)
	{
		// pixel index -> alpha selector (3 bit) -> alpha
		const uint a_selector = access_bit_tbl64(b->a_bits, pixel_idx, 3);
		a = b->dxt5_a_tbl[a_selector];
	}
	// (dxt == DXT1A)
	else
		a = c[A];
	out[A] = a;
}



// in ogl_emulate_dds:	debug_assert(compressedimageSize == blocks * (dxt1? 8 : 16));


// replace the S3TC-compressed image in <t> with its decompressed
// equivalent: 24 bpp RGB for opaque DXT1, 32 bpp RGBA for DXT1a, DXT3
// and DXT5. on success, t's memory handle, offset, bpp and flags are
// updated (the old image memory is freed) and 0 is returned;
// ERR_NO_MEM is returned if the output buffer cannot be allocated,
// in which case <t> is left unchanged.
//
// note: this code is grossly inefficient (mostly due to splitting it up
// into function calls for readability). that's because it's only used to
// emulate hardware S3TC support - if that isn't available, everything will
// be dog-slow anyway due to increased vmem usage.
static int dds_decompress(Tex* t)
{
	int dxt = t->flags & TEX_DXT;
	debug_assert(dxt == 1 || dxt == 3 || dxt == 5);
	// DXT1a can only be told apart from DXT1 via TEX_ALPHA.
	// this must not be applied to DXT3/DXT5: dds_decode always sets
	// TEX_ALPHA for them, and treating them as DXT1a would decode their
	// 16-byte blocks with the 8-byte DXT1 layout.
	if(dxt == 1 && (t->flags & TEX_ALPHA))
		dxt = DXT1A;
	// due to the above, dxt == 1 is the only non-alpha case.
	// note: adding or stripping alpha channels during transform is not
	// our job; we merely output the same pixel format as given
	// (tex.cpp's plain transform could cover it, if ever needed).
	const uint bpp = (dxt != 1)? 32 : 24;

	// note: 1x1 images are legitimate (e.g. in mipmaps). they report their
	// width as such for glTexImage, but the S3TC data is padded to
	// 4x4 pixel block boundaries.
	const uint blocks_w = (uint)(round_up(t->w, 4) / 4);
	const uint blocks_h = (uint)(round_up(t->h, 4) / 4);
	const uint blocks = blocks_w * blocks_h;
	// (calculate in size_t to avoid wrapping around in uint arithmetic)
	const size_t img_size = (size_t)blocks * 16 * bpp/8;
	Handle hm;
	void* img_data = mem_alloc(img_size, 64*KiB, 0, &hm);
	if(!img_data)
		return ERR_NO_MEM;

	const u8* s3tc_data = (const u8*)tex_get_data(t);
	// size of one compressed 4x4 block [bytes]
	const size_t s3tc_block_size = 16 * t->bpp/8;
	// note: do not use tex_img_size! we must take into account padding
	// to 4x4 blocks, which is relevant for high mipmap levels (e.g. 2x2).
	const size_t s3tc_size = blocks * s3tc_block_size;

	for(uint block_y = 0; block_y < blocks_h; block_y++)
		for(uint block_x = 0; block_x < blocks_w; block_x++)
		{
			S3tcBlock b;
			block_precalc(dxt, s3tc_data, &b);
			s3tc_data += s3tc_block_size;

			uint pixel_idx = 0;
			for(int y = 0; y < 4; y++)
			{
				// start of this block's row y in the output image, whose
				// rows are blocks_w*4 pixels wide.
				u8* out = (u8*)img_data + ((block_y*4+y)*blocks_w*4 + block_x*4) * bpp/8;
				for(int x = 0; x < 4; x++)
				{
					write_pixel(dxt, pixel_idx, &b, out);
					out += bpp/8;
					pixel_idx++;
				}
			}
		}	// for block_x


	// all input blocks must have been consumed exactly.
	debug_assert(tex_get_data(t) == s3tc_data - s3tc_size);

	// hand the decompressed image over to <t>.
	mem_free_h(t->hm);
	t->hm  = hm;
	t->ofs = 0;
	t->bpp = bpp;
	t->flags &= ~TEX_DXT;
	return 0;
}


// codec transform hook. the only transform implemented here is
// decompression, i.e. the texture is DXT-compressed and <transforms>
// contains TEX_DXT. returns dds_decompress' result in that case and
// TEX_CODEC_CANNOT_HANDLE otherwise.
static int dds_transform(Tex* t, uint transforms)
{
	const bool tex_is_dxt    = (t->flags & TEX_DXT) != 0;
	const bool dxt_requested = (transforms & TEX_DXT) != 0;

	// bail if we can't help:
	// - texture is DXT but no DXT change was requested (unsupported;
	//   there are no flags we can change while compressed);
	// - compression was requested (not implemented);
	// - neither is DXT (nothing we can do).
	if(!tex_is_dxt || !dxt_requested)
		return TEX_CODEC_CANNOT_HANDLE;

	// requesting decompression
	return dds_decompress(t);
}


static bool dds_is_hdr(const u8* file)
{
	return *(u32*)file == FOURCC('D','D','S',' ');
}


// return true iff <ext> equals "dds" (compared case-insensitively).
static bool dds_is_ext(const char* ext)
{
	return stricmp(ext, "dds") == 0;
}


// return the total header size: the 4-byte magic value plus the
// surface description. the file contents are not examined.
static size_t dds_hdr_size(const u8* UNUSED(file))
{
	const size_t magic_size = 4;
	return magic_size + sizeof(DDSURFACEDESC2);
}


// parse the DDS header at the start of <da> and store width, height,
// bpp and flags in <t>. the image data itself is not touched.
//
// returns 0 on success. otherwise, <t> is left unmodified and one of
// ERR_TEX_INVALID_SIZE (dimensions not a multiple of 4),
// ERR_UNKNOWN_FORMAT (FOURCC is not DXT1/3/5),
// ERR_INCOMPLETE_HEADER (width/height/pixel format not flagged as valid)
// or ERR_CORRUPTED (structure size fields are wrong) is returned.
static int dds_decode(DynArray* da, Tex* t)
{
	u8* file = da->base;

	// the surface description directly follows the 4-byte magic value.
	const DDSURFACEDESC2* hdr = (const DDSURFACEDESC2*)(file+4);
	const DDPIXELFORMAT* pf = &hdr->ddpfPixelFormat;

	const u32 sd_size  = read_le32(&hdr->dwSize);
	const u32 sd_flags = read_le32(&hdr->dwFlags);
	const u32 h        = read_le32(&hdr->dwHeight);
	const u32 w        = read_le32(&hdr->dwWidth);
	const u32 pf_size  = read_le32(&pf->dwSize);
	const u32 pf_flags = read_le32(&pf->dwFlags);
	// not converted because it is only compared against FOURCC,
	// which takes care of endian conversion.
	const u32 fourcc   = pf->dwFourCC;

	// fields that aren't indicated as valid are treated as zero.
	const u32 mipmaps =
		(sd_flags & DDSD_MIPMAPCOUNT)? read_le32(&hdr->dwMipMapCount) : 0;

	// we use these fields and therefore require them to be present.
	// note: we can't guess image dimensions if not specified -
	//       the image isn't necessarily square.
	const u32 sd_req_flags = DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT;

	// determine flags and bpp.
	// we store DXT format (one of {1,3,5}) in flags & TEX_DXT.
	//
	// unfortunately there are problems with some DDS headers:
	// - DXTex doesn't set the required dwPitchOrLinearSize field,
	//   so it cannot be relied upon; it's needed by OpenGL and is
	//   instead calculated from w, h, and bpp.
	// - pf_flags & DDPF_ALPHAPIXELS can only be used to check for
	//   DXT1a (the only way to detect it); some DXT3 files have been
	//   observed that don't have it set. DXT3/5 are therefore always
	//   marked as having alpha.
	int bpp = 0;
	int flags = 0;
	if(fourcc == FOURCC('D','X','T','1'))
	{
		bpp = 4;
		flags = 1;
		if(pf_flags & DDPF_ALPHAPIXELS)
			flags |= TEX_ALPHA;
	}
	else if(fourcc == FOURCC('D','X','T','3'))
	{
		bpp = 8;
		flags = 3 | TEX_ALPHA;
	}
	else if(fourcc == FOURCC('D','X','T','5'))
	{
		bpp = 8;
		flags = 5 | TEX_ALPHA;
	}
	// (any other FOURCC leaves the DXT bits clear, which is detected below.)

	if(mipmaps != 0)
		flags |= TEX_MIPMAPS;

	// sanity checks
	// .. dimensions not padded to S3TC block size
	if((w % 4) != 0 || (h % 4) != 0)
		return ERR_TEX_INVALID_SIZE;
	// .. unknown FOURCC
	if(!(flags & TEX_DXT))
		return ERR_UNKNOWN_FORMAT;
	// .. missing required field(s)
	if((sd_flags & sd_req_flags) != sd_req_flags)
		return ERR_INCOMPLETE_HEADER;
	// .. structure sizes don't match what we expect
	if(pf_size != sizeof(DDPIXELFORMAT))
		return ERR_CORRUPTED;
	if(sd_size != sizeof(DDSURFACEDESC2))
		return ERR_CORRUPTED;

	t->w     = w;
	t->h     = h;
	t->bpp   = bpp;
	t->flags = flags;
	return 0;
}


// codec encode hook. writing DDS files is not supported, so this
// always declines.
static int dds_encode(Tex* UNUSED(t), DynArray* UNUSED(da))
{
	// note: this particular code is required. ERR_NOT_IMPLEMENTED et al.
	// would break tex_write, which assumes that either this value,
	// 0 or an error is returned.
	return TEX_CODEC_CANNOT_HANDLE;
}

// NOTE(review): presumably hooks the dds_* functions above into the
// texture codec list - confirm against the macro in tex_codec.h.
TEX_CODEC_REGISTER(dds);