commit efdca4b5bb59afb5d1a6d728f02684001044ad1d Author: Pavel Krajcevski Date: Fri Aug 24 15:56:45 2012 -0400 Initial commit with a few modifications diff --git a/BPTCEncoder/CMakeLists.txt b/BPTCEncoder/CMakeLists.txt new file mode 100644 index 0000000..c053b96 --- /dev/null +++ b/BPTCEncoder/CMakeLists.txt @@ -0,0 +1,57 @@ +INCLUDE_DIRECTORIES(${TexC_SOURCE_DIR}/BPTCEncoder/include) + +INCLUDE(CheckCXXSourceCompiles) + +SET(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) +IF(CMAKE_COMPILER_IS_GNUCC) + SET(CMAKE_REQUIRED_FLAGS -msse4.1) + CHECK_CXX_SOURCE_COMPILES("config/testsse4.1.cpp" HAS_SSE_41) + + IF(HAS_SSE_41) + SET(CMAKE_REQUIRED_FLAGS -msse4.2) + CHECK_CXX_SOURCE_COMPILES("config/testsse4.2.cpp" HAS_SSE_POPCNT) + ENDIF(HAS_SSE_41) + +ELSEIF(MSVC) +ENDIF() +SET(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + +CONFIGURE_FILE( + "config/BC7Config.h.in" + "src/BC7Config.h" +) + +IF(CMAKE_COMPILER_IS_GNUCC) + ADD_DEFINITIONS(-fasm-blocks) +ENDIF(CMAKE_COMPILER_IS_GNUCC) + +SET( HEADERS + src/BC7CompressionMode.h + src/BC7IntTypes.h + src/BitStream.h + src/RGBAEndpoints.h +) + +SET( SOURCES + src/BC7Compressor.cpp + src/RGBAEndpoints.cpp +) + +IF( HAS_SSE_41 ) + SET( HEADERS + ${HEADERS} + src/RGBAEndpointsSIMD.h + src/BC7CompressionModeSIMD.h + ) + + SET( SOURCES + ${SOURCES} + src/BC7CompressorSIMD.cpp + src/RGBAEndpointsSIMD.cpp + ) +ENDIF( HAS_SSE_41 ) + +ADD_LIBRARY( BPTCEncoder + ${SOURCES} + ${SIMD_SOURCES} +) diff --git a/BPTCEncoder/config/BC7Config.h.in b/BPTCEncoder/config/BC7Config.h.in new file mode 100644 index 0000000..158812d --- /dev/null +++ b/BPTCEncoder/config/BC7Config.h.in @@ -0,0 +1,8 @@ +// Copyright (c) 2012 Pavel Krajcevski +// All Rights Reserved + +// BC7Config.h.in -- This file contains variables that are introduced +// explicitly by the CMake build process. + +// Do we have the proper popcnt instruction defined? 
+#define HAS_SSE_POPCNT @HAS_SSE_POPCNT@ diff --git a/BPTCEncoder/config/testsse4.1.cpp b/BPTCEncoder/config/testsse4.1.cpp new file mode 100644 index 0000000..2792820 --- /dev/null +++ b/BPTCEncoder/config/testsse4.1.cpp @@ -0,0 +1,10 @@ +#include <smmintrin.h> + +int main() { + const __m128 fv = _mm_set1_ps(1.0f); + const __m128 fv2 = _mm_set1_ps(2.0f); + + const __m128 ans = _mm_blend_ps(fv, fv2, 2); + + return ((int *)(&ans))[0]; +} diff --git a/BPTCEncoder/include/BC7Compressor.h b/BPTCEncoder/include/BC7Compressor.h new file mode 100755 index 0000000..22eb2ce --- /dev/null +++ b/BPTCEncoder/include/BC7Compressor.h @@ -0,0 +1,61 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +namespace BC7C +{ + // This is the error metric that is applied to our error measurement algorithm + // in order to bias calculation towards results that are more in-line with + // how the Human Visual System works. Uniform error means that each color + // channel is treated equally. 
For a while, the widely accepted non-uniform metric + // has been to give red 30%, green 59% and blue 11% weight when computing the error + // between two pixels. + enum ErrorMetric + { + eErrorMetric_Uniform, // Treats r, g, and b channels equally + eErrorMetric_Nonuniform, // { 0.3, 0.59, 0.11 } + + kNumErrorMetrics + }; + + // Sets the error metric to be the one specified. + void SetErrorMetric(ErrorMetric e); + + // Retrieves a float4 pointer for the r, g, b, a weights for each color channel, in + // that order, based on the current error metric. + const float *GetErrorMetric(); + + // Returns the enumeration for the current error metric. + ErrorMetric GetErrorMetricEnum(); + + // Sets the number of steps that we use to perform simulated annealing. In general, a + // larger number produces better results. The default is set to 50. This metric works + // on a logarithmic scale -- twice the value will double the compute time, but only + // decrease the error by two times a factor. + void SetQualityLevel(int q); + int GetQualityLevel(); + + // Compress the image given as RGBA data to BC7 format. Width and Height are the dimensions of + // the image in pixels. + void CompressImageBC7(const unsigned char *inBuf, unsigned char *outBuf, int width, int height); + + // Compress the image given as RGBA data to BC7 format using an algorithm optimized for SIMD + // enabled platforms. Width and Height are the dimensions of the image in pixels. + void CompressImageBC7SIMD(const unsigned char* inBuf, unsigned char* outBuf, int width, int height); + + // Decompress the image given as BC7 data to R8G8B8A8 format. Width and Height are the dimensions of the image in pixels. 
+ void DecompressImageBC7SIMD(const unsigned char* inBuf, unsigned char* outBuf, int width, int height); +} diff --git a/BPTCEncoder/src/BC7CompressionMode.h b/BPTCEncoder/src/BC7CompressionMode.h new file mode 100755 index 0000000..94c53e8 --- /dev/null +++ b/BPTCEncoder/src/BC7CompressionMode.h @@ -0,0 +1,191 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __BC7_COMPRESSIONMODE_SIMD_H__ +#define __BC7_COMPRESSIONMODE_SIMD_H__ + +#include "RGBAEndpoints.h" + +// Forward Declarations +class BitStream; +const int kMaxEndpoints = 3; + +static const int kPBits[4][2] = { + { 0, 0 }, + { 0, 1 }, + { 1, 0 }, + { 1, 1 } +}; + +// Abstract class that outlines all of the different settings for BC7 compression modes +// Note that at the moment, we only support modes 0-3, so we don't deal with alpha channels. 
+class BC7CompressionMode { +public: + + static const int kMaxNumSubsets = 3; + static const int kNumModes = 8; + + explicit BC7CompressionMode(int mode, bool opaque = true) : m_IsOpaque(opaque), m_Attributes(&(kModeAttributes[mode])), m_RotateMode(0), m_IndexMode(0) { } + ~BC7CompressionMode() { } + + static int NumUses[8]; + static void ResetNumUses() { memset(NumUses, 0, sizeof(NumUses)); } + double Compress(BitStream &stream, const int shapeIdx, const RGBACluster *clusters); + + // This switch controls the quality of the simulated annealing optimizer. We will not make + // more than this many steps regardless of how bad the error is. Higher values will produce + // better quality results but will run slower. Default is 50. + static int MaxAnnealingIterations; // This is a setting + static const int kMaxAnnealingIterations = 256; // This is a limit + + enum EPBitType { + ePBitType_Shared, + ePBitType_NotShared, + ePBitType_None + }; + + static struct Attributes { + int modeNumber; + int numPartitionBits; + int numSubsets; + int numBitsPerIndex; + int numBitsPerAlpha; + int colorChannelPrecision; + int alphaChannelPrecision; + bool hasRotation; + bool hasIdxMode; + EPBitType pbitType; + } kModeAttributes[kNumModes]; + + static const Attributes *GetAttributesForMode(int mode) { + if(mode < 0 || mode >= 8) return NULL; + return &kModeAttributes[mode]; + } + +private: + + const Attributes *const m_Attributes; + + int m_RotateMode; + int m_IndexMode; + + void SetIndexMode(int mode) { m_IndexMode = mode; } + void SetRotationMode(int mode) { m_RotateMode = mode; } + + int GetRotationMode() const { return m_Attributes->hasRotation? 
m_RotateMode : 0; } + + int GetModeNumber() const { return m_Attributes->modeNumber; } + int GetNumberOfPartitionBits() const { return m_Attributes->numPartitionBits; } + int GetNumberOfSubsets() const { return m_Attributes->numSubsets; } + + int GetNumberOfBitsPerIndex(int indexMode = -1) const { + if(indexMode < 0) indexMode = m_IndexMode; + if(indexMode == 0) + return m_Attributes->numBitsPerIndex; + else + return m_Attributes->numBitsPerAlpha; + } + + int GetNumberOfBitsPerAlpha(int indexMode = -1) const { + if(indexMode < 0) indexMode = m_IndexMode; + if(indexMode == 0) + return m_Attributes->numBitsPerAlpha; + else + return m_Attributes->numBitsPerIndex; + } + + // If we handle alpha separately, then we will consider the alpha channel + // to be not used whenever we do any calculations... + int GetAlphaChannelPrecision() const { + if(m_Attributes->hasRotation) return 0; + else return m_Attributes->alphaChannelPrecision; + } + + RGBAVector GetErrorMetric() const { + const float *w = BC7C::GetErrorMetric(); + switch(GetRotationMode()) { + default: + case 0: return RGBAVector(w[0], w[1], w[2], w[3]); + case 1: return RGBAVector(w[3], w[1], w[2], w[0]); + case 2: return RGBAVector(w[0], w[3], w[2], w[1]); + case 3: return RGBAVector(w[0], w[1], w[3], w[2]); + } + } + + EPBitType GetPBitType() const { return m_Attributes->pbitType; } + + unsigned int GetQuantizationMask() const { + const int maskSeed = 0x80000000; + return ( + (maskSeed >> (24 + m_Attributes->colorChannelPrecision - 1) & 0xFF) | + (maskSeed >> (16 + m_Attributes->colorChannelPrecision - 1) & 0xFF00) | + (maskSeed >> (8 + m_Attributes->colorChannelPrecision - 1) & 0xFF0000) | + (maskSeed >> (GetAlphaChannelPrecision() - 1) & 0xFF000000) + ); + } + + int GetNumPbitCombos() const { + switch(GetPBitType()) { + case ePBitType_Shared: return 2; + case ePBitType_NotShared: return 4; + default: + case ePBitType_None: return 1; + } + } + + const int *GetPBitCombo(int idx) const { + switch(GetPBitType()) { 
+ case ePBitType_Shared: return (idx)? kPBits[3] : kPBits[0]; + case ePBitType_NotShared: return kPBits[idx % 4]; + default: + case ePBitType_None: return kPBits[0]; + } + } + + double OptimizeEndpointsForCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const; + + struct VisitedState { + RGBAVector p1; + RGBAVector p2; + int pBitCombo; + }; + + void PickBestNeighboringEndpoints( + const RGBACluster &cluster, + const RGBAVector &p1, const RGBAVector &p2, + const int curPbitCombo, + RGBAVector &np1, RGBAVector &np2, + int &nPbitCombo, + const VisitedState *visitedStates, + int nVisited, + float stepSz = 1.0f + ) const; + + bool AcceptNewEndpointError(double newError, double oldError, float temp) const; + + double CompressSingleColor(const RGBAVector &p, RGBAVector &p1, RGBAVector &p2, int &bestPbitCombo) const; + double CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const; + double CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int *alphaIndices) const; + + void ClampEndpointsToGrid(RGBAVector &p1, RGBAVector &p2, int &bestPBitCombo) const; + + const double m_IsOpaque; +}; + +extern const uint32 kBC7InterpolationValues[4][16][2]; + +#endif // __BC7_COMPRESSIONMODE_SIMD_H__ diff --git a/BPTCEncoder/src/BC7CompressionModeSIMD.h b/BPTCEncoder/src/BC7CompressionModeSIMD.h new file mode 100755 index 0000000..66b9d6f --- /dev/null +++ b/BPTCEncoder/src/BC7CompressionModeSIMD.h @@ -0,0 +1,153 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. 
Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __BC7_COMPRESSIONMODE_H__ +#define __BC7_COMPRESSIONMODE_H__ + +#include "BC7IntTypes.h" +#include "RGBAEndpointsSIMD.h" + +// Forward Declarations +class BitStream; + +static const int kPBits[4][2] = { + { 0, 0 }, + { 0, 1 }, + { 1, 0 }, + { 1, 1 } +}; + +// Abstract class that outlines all of the different settings for BC7 compression modes +// Note that at the moment, we only support modes 0-3, so we don't deal with alpha channels. +class BC7CompressionModeSIMD { +public: + + static const int kMaxNumSubsets = 3; + static const int kNumModes = 8; + + enum EPBitType { + ePBitType_Shared, + ePBitType_NotShared, + ePBitType_None + }; + + BC7CompressionModeSIMD(int mode, double err) : m_EstimatedError(err), m_Attributes(&(kModeAttributes[mode])) { } + ~BC7CompressionModeSIMD() { } + + static int NumUses[8]; + static void ResetNumUses() { memset(NumUses, 0, sizeof(NumUses)); } + + double Compress(BitStream &stream, const int shapeIdx, const RGBAClusterSIMD *clusters) const; + + // This switch controls the quality of the simulated annealing optimizer. We will not make + // more than this many steps regardless of how bad the error is. Higher values will produce + // better quality results but will run slower. Default is 50. 
+ static int MaxAnnealingIterations; // This is a setting + +private: + + static struct Attributes { + int modeNumber; + int numPartitionBits; + int numSubsets; + int numBitsPerIndex; + int redChannelPrecision; + int greenChannelPrecision; + int blueChannelPrecision; + int alphaChannelPrecision; + EPBitType pbitType; + } kModeAttributes[kNumModes]; + +protected: + const Attributes *const m_Attributes; + + int GetModeNumber() const { return m_Attributes->modeNumber; } + int GetNumberOfPartitionBits() const { return m_Attributes->numPartitionBits; } + int GetNumberOfSubsets() const { return m_Attributes->numSubsets; } + int GetNumberOfBitsPerIndex() const { return m_Attributes->numBitsPerIndex; } + + int GetRedChannelPrecision() const { return m_Attributes->redChannelPrecision; } + int GetGreenChannelPrecision() const { return m_Attributes->greenChannelPrecision; } + int GetBlueChannelPrecision() const { return m_Attributes->blueChannelPrecision; } + int GetAlphaChannelPrecision() const { return m_Attributes->alphaChannelPrecision; } + + EPBitType GetPBitType() const { return m_Attributes->pbitType; } + + // !SPEED! Add this to the attributes lookup table + void GetQuantizationMask(__m128i &mask) const { + const int maskSeed = 0x80000000; + mask = _mm_set_epi32( + (GetAlphaChannelPrecision() > 0)? (maskSeed >> (24 + GetAlphaChannelPrecision() - 1) & 0xFF) : 0xFF, + (maskSeed >> (24 + GetBlueChannelPrecision() - 1) & 0xFF), + (maskSeed >> (24 + GetGreenChannelPrecision() - 1) & 0xFF), + (maskSeed >> (24 + GetRedChannelPrecision() - 1) & 0xFF) + ); + } + + int GetNumPbitCombos() const { + switch(GetPBitType()) { + case ePBitType_Shared: return 2; + case ePBitType_NotShared: return 4; + default: + case ePBitType_None: return 1; + } + } + + const int *GetPBitCombo(int idx) const { + switch(GetPBitType()) { + case ePBitType_Shared: return (idx)? 
kPBits[3] : kPBits[0]; + case ePBitType_NotShared: return kPBits[idx % 4]; + default: + case ePBitType_None: return kPBits[0]; + } + } + + double OptimizeEndpointsForCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const; + + struct VisitedState { + RGBAVectorSIMD p1; + RGBAVectorSIMD p2; + int pBitCombo; + }; + + void PickBestNeighboringEndpoints( + const RGBAClusterSIMD &cluster, + const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, + const int curPbitCombo, + RGBAVectorSIMD &np1, RGBAVectorSIMD &np2, + int &nPbitCombo, + const __m128 &stepVec + ) const; + + bool AcceptNewEndpointError(float newError, float oldError, float temp) const; + + double CompressSingleColor(const RGBAVectorSIMD &p, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPbitCombo) const; + double CompressCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const; + + void ClampEndpointsToGrid(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPBitCombo) const; + + int GetSubsetForIndex(int idx, const int shapeIdx) const; + int GetAnchorIndexForSubset(int subset, const int shapeIdx) const; + + double GetEstimatedError() const { return m_EstimatedError; } + const double m_EstimatedError; +}; + +extern const __m128i kBC7InterpolationValuesSIMD[4][16][2]; +extern const uint32 kBC7InterpolationValuesScalar[4][16][2]; + +#endif // __BC7_COMPRESSIONMODE_H__ diff --git a/BPTCEncoder/src/BC7Compressor.cpp b/BPTCEncoder/src/BC7Compressor.cpp new file mode 100755 index 0000000..098fc25 --- /dev/null +++ b/BPTCEncoder/src/BC7Compressor.cpp @@ -0,0 +1,1925 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above 
copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "BC7IntTypes.h" +#include "BC7Compressor.h" +#include "BC7CompressionMode.h" +#include "BCLookupTables.h" +#include "RGBAEndpoints.h" +#include "BitStream.h" + +#include +#include +#include +#include +#include +#include + +static const uint32 kNumShapes2 = 64; +static const uint16 kShapeMask2[kNumShapes2] = { + 0xcccc, 0x8888, 0xeeee, 0xecc8, 0xc880, 0xfeec, 0xfec8, 0xec80, + 0xc800, 0xffec, 0xfe80, 0xe800, 0xffe8, 0xff00, 0xfff0, 0xf000, + 0xf710, 0x008e, 0x7100, 0x08ce, 0x008c, 0x7310, 0x3100, 0x8cce, + 0x088c, 0x3110, 0x6666, 0x366c, 0x17e8, 0x0ff0, 0x718e, 0x399c, + 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, + 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x0660, + 0x0272, 0x04e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, + 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0x0fcc, 0x7744, 0xee22 +}; + +static const int kAnchorIdx2[kNumShapes2] = { + 15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15, + 15, 2, 8, 2, 2, 8, 8,15, + 2, 8, 2, 2, 8, 8, 2, 2, + 15,15, 6, 8, 2, 8,15,15, + 2, 8, 2, 2, 2,15,15, 6, + 6, 2, 6, 8,15,15, 2, 2, + 15,15,15,15,15, 2, 2, 15 +}; + +static const uint32 kNumShapes3 = 64; +static const uint16 kShapeMask3[kNumShapes3][2] = { + { 0xfecc, 0xf600 }, { 0xffc8, 
0x7300 }, { 0xff90, 0x3310 }, { 0xecce, 0x00ce }, { 0xff00, 0xcc00 }, { 0xcccc, 0xcc00 }, { 0xffcc, 0x00cc }, { 0xffcc, 0x3300 }, + { 0xff00, 0xf000 }, { 0xfff0, 0xf000 }, { 0xfff0, 0xff00 }, { 0xcccc, 0x8888 }, { 0xeeee, 0x8888 }, { 0xeeee, 0xcccc }, { 0xffec, 0xec80 }, { 0x739c, 0x7310 }, + { 0xfec8, 0xc800 }, { 0x39ce, 0x3100 }, { 0xfff0, 0xccc0 }, { 0xfccc, 0x0ccc }, { 0xeeee, 0xee00 }, { 0xff88, 0x7700 }, { 0xeec0, 0xcc00 }, { 0x7730, 0x3300 }, + { 0x0cee, 0x00cc }, { 0xffcc, 0xfc88 }, { 0x6ff6, 0x0660 }, { 0xff60, 0x6600 }, { 0xcbbc, 0xc88c }, { 0xf966, 0xf900 }, { 0xceec, 0x0cc0 }, { 0xff10, 0x7310 }, + { 0xff80, 0xec80 }, { 0xccce, 0x08ce }, { 0xeccc, 0xec80 }, { 0x6666, 0x4444 }, { 0x0ff0, 0x0f00 }, { 0x6db6, 0x4924 }, { 0x6bd6, 0x4294 }, { 0xcf3c, 0x0c30 }, + { 0xc3fc, 0x03c0 }, { 0xffaa, 0xff00 }, { 0xff00, 0x5500 }, { 0xfcfc, 0xcccc }, { 0xcccc, 0x0c0c }, { 0xf6f6, 0x6666 }, { 0xaffa, 0x0ff0 }, { 0xfff0, 0x5550 }, + { 0xfaaa, 0xf000 }, { 0xeeee, 0x0e0e }, { 0xf8f8, 0x8888 }, { 0xfff0, 0x9990 }, { 0xeeee, 0xe00e }, { 0x8ff8, 0x8888 }, { 0xf666, 0xf000 }, { 0xff00, 0x9900 }, + { 0xff66, 0xff00 }, { 0xcccc, 0xc00c }, { 0xcffc, 0xcccc }, { 0xf000, 0x9000 }, { 0x8888, 0x0808 }, { 0xfefe, 0xeeee }, { 0xfffa, 0xfff0 }, { 0x7bde, 0x7310 } +}; + +static const uint32 kWMValues[] = { 0x32b92180, 0x32ba3080, 0x31103200, 0x28103c80, 0x32bb3080, 0x25903600, 0x3530b900, 0x3b32b180, 0x34b5b980 }; +static const uint32 kNumWMVals = sizeof(kWMValues) / sizeof(kWMValues[0]); +static uint32 gWMVal = -1; + +static const int kAnchorIdx3[2][kNumShapes3] = { + { 3, 3,15,15, 8, 3,15,15, + 8, 8, 6, 6, 6, 5, 3, 3, + 3, 3, 8,15, 3, 3, 6,10, + 5, 8, 8, 6, 8, 5,15,15, + 8,15, 3, 5, 6,10, 8,15, + 15, 3,15, 5,15,15,15,15, + 3,15, 5, 5, 5, 8, 5,10, + 5,10, 8,13,15,12, 3, 3 }, + + { 15, 8, 8, 3,15,15, 3, 8, + 15,15,15,15,15,15,15, 8, + 15, 8,15, 3,15, 8,15, 8, + 3,15, 6,10,15,15,10, 8, + 15, 3,15,10,10, 8, 9,10, + 6,15, 8,15, 3, 6, 6, 8, + 15, 3,15,15,15,15,15,15, + 15,15,15,15, 3,15,15, 
8 } +}; + +static int GetSubsetForIndex(int idx, const int shapeIdx, const int nSubsets) { + int subset = 0; + + switch(nSubsets) { + case 2: + { + subset = !!((1 << idx) & kShapeMask2[shapeIdx]); + } + break; + + case 3: + { + if(1 << idx & kShapeMask3[shapeIdx][0]) + subset = 1 + !!((1 << idx) & kShapeMask3[shapeIdx][1]); + else + subset = 0; + } + break; + + default: + break; + } + + return subset; +} + +static int GetAnchorIndexForSubset(int subset, const int shapeIdx, const int nSubsets) { + + int anchorIdx = 0; + switch(subset) { + case 1: + { + if(nSubsets == 2) { + anchorIdx = kAnchorIdx2[shapeIdx]; + } + else { + anchorIdx = kAnchorIdx3[0][shapeIdx]; + } + } + break; + + case 2: + { + assert(nSubsets == 3); + anchorIdx = kAnchorIdx3[1][shapeIdx]; + } + break; + + default: + break; + } + + return anchorIdx; +} + +static int GetPointMaskForSubset(int subset, const int shapeIdx, const int nSubsets) { + int mask = 0xFFFF; + + assert(subset < nSubsets); + + switch(nSubsets) { + case 2: + { + mask = (subset)? kShapeMask2[shapeIdx] : ~(kShapeMask2[shapeIdx]); + } + break; + + case 3: + { + switch(subset) { + default: + case 0: + { + mask = ~(kShapeMask3[shapeIdx][0]); + } + break; + + case 1: + { + mask = ~(~(kShapeMask3[shapeIdx][0]) | kShapeMask3[shapeIdx][1]); + } + break; + + case 2: + { + mask = kShapeMask3[shapeIdx][1]; + } + break; + } + } + break; + + default: + break; + } + + return mask; +} + +#ifndef min +#define min(a, b) (((a) > (b))? (b) : (a)) +#endif + +#ifndef max +#define max(a, b) (((a) > (b))? 
(a) : (b)) +#endif + +template +static void insert(T* buf, int bufSz, T newVal, int idx = 0) { + int safeIdx = min(bufSz-1, max(idx, 0)); + for(int i = bufSz - 1; i > safeIdx; i--) { + buf[i] = buf[i-1]; + } + buf[safeIdx] = newVal; +} + +template +static inline void swap(T &a, T &b) { T t = a; a = b; b = t; } + +const uint32 kBC7InterpolationValues[4][16][2] = { + { {64, 0}, {33, 31}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {43, 21}, {21, 43}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64} } +}; + +int BC7CompressionMode::MaxAnnealingIterations = 50; // This is a setting. +int BC7CompressionMode::NumUses[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +BC7CompressionMode::Attributes BC7CompressionMode::kModeAttributes[kNumModes] = { + { 0, 4, 3, 3, 0, 4, 0, false, false, BC7CompressionMode::ePBitType_NotShared }, + { 1, 6, 2, 3, 0, 6, 0, false, false, BC7CompressionMode::ePBitType_Shared }, + { 2, 6, 3, 2, 0, 5, 0, false, false, BC7CompressionMode::ePBitType_None }, + { 3, 6, 2, 2, 0, 7, 0, false, false, BC7CompressionMode::ePBitType_NotShared }, + { 4, 0, 1, 2, 3, 5, 6, true, true, BC7CompressionMode::ePBitType_None }, + { 5, 0, 1, 2, 2, 7, 8, true, false, BC7CompressionMode::ePBitType_None }, + { 6, 0, 1, 4, 0, 7, 7, false, false, BC7CompressionMode::ePBitType_NotShared }, + { 7, 6, 2, 2, 0, 5, 5, false, false, BC7CompressionMode::ePBitType_NotShared }, +}; + +void BC7CompressionMode::ClampEndpointsToGrid(RGBAVector &p1, RGBAVector &p2, int &bestPBitCombo) const { + const int nPbitCombos = GetNumPbitCombos(); + const bool hasPbits = nPbitCombos > 1; + const uint32 qmask = GetQuantizationMask(); + + ClampEndpoints(p1, p2); + + // !SPEED! This can be faster. 
+ float minDist = FLT_MAX; + RGBAVector bp1, bp2; + for(int i = 0; i < nPbitCombos; i++) { + + uint32 qp1, qp2; + if(hasPbits) { + qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]); + qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]); + } + else { + qp1 = p1.ToPixel(qmask); + qp2 = p2.ToPixel(qmask); + } + + uint8 *pqp1 = (uint8 *)&qp1; + uint8 *pqp2 = (uint8 *)&qp2; + + RGBAVector np1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3])); + RGBAVector np2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3])); + + RGBAVector d1 = np1 - p1; + RGBAVector d2 = np2 - p2; + float dist = (d1 * d1) + (d2 * d2); + if(dist < minDist) { + minDist = dist; + bp1 = np1; bp2 = np2; + bestPBitCombo = i; + } + } + + p1 = bp1; + p2 = bp2; +} + +double BC7CompressionMode::CompressSingleColor(const RGBAVector &p, RGBAVector &p1, RGBAVector &p2, int &bestPbitCombo) const { + + const uint32 pixel = p.ToPixel(); + + uint32 bestDist = 0xFF; + bestPbitCombo = -1; + + for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) { + + const int *pbitCombo = GetPBitCombo(pbi); + + uint32 dist = 0x0; + uint32 bestValI[kNumColorChannels] = { -1, -1, -1, -1 }; + uint32 bestValJ[kNumColorChannels] = { -1, -1, -1, -1 }; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + + const uint8 val = (pixel >> (ci * 8)) & 0xFF; + int nBits = ci == 3? GetAlphaChannelPrecision() : m_Attributes->colorChannelPrecision; + + // If we don't handle this channel, then we don't need to + // worry about how well we interpolate. + if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; continue; } + + const int nPossVals = (1 << nBits); + int possValsH[256]; + int possValsL[256]; + + // Do we have a pbit? 
+ const bool havepbit = GetPBitType() != ePBitType_None; + if(havepbit) + nBits++; + + for(int i = 0; i < nPossVals; i++) { + + int vh = i, vl = i; + if(havepbit) { + vh <<= 1; + vl <<= 1; + + vh |= pbitCombo[1]; + vl |= pbitCombo[0]; + } + + possValsH[i] = (vh << (8 - nBits)); + possValsH[i] |= (possValsH[i] >> nBits); + + possValsL[i] = (vl << (8 - nBits)); + possValsL[i] |= (possValsL[i] >> nBits); + } + + const uint32 interpVal0 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][0]; + const uint32 interpVal1 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][1]; + + // Find the closest interpolated val that to the given val... + uint32 bestChannelDist = 0xFF; + for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++) + for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) { + + const uint32 v1 = possValsL[i]; + const uint32 v2 = possValsH[j]; + + const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6; + const uint32 err = (combo > val)? combo - val : val - combo; + + if(err < bestChannelDist) { + bestChannelDist = err; + bestValI[ci] = v1; + bestValJ[ci] = v2; + } + } + + dist = max(bestChannelDist, dist); + } + + if(dist < bestDist) { + bestDist = dist; + bestPbitCombo = pbi; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + p1.c[ci] = float(bestValI[ci]); + p2.c[ci] = float(bestValJ[ci]); + } + } + } + + return bestDist; +} + +// Fast random number generator. See more information at +// http://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/ +static uint32 g_seed = uint32(time(NULL)); +static inline uint32 fastrand() { + g_seed = (214013 * g_seed + 2531011); + return (g_seed>>16) & RAND_MAX; +} + +static const int kNumStepDirections = 8; +static const RGBADir kStepDirections[kNumStepDirections] = { + + // For pBit changes, we have 8 possible directions. 
+ RGBADir(RGBAVector(1.0f, 1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, 1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(1.0f, -1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, -1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(1.0f, 1.0f, -1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, 1.0f, -1.0f, 0.0f)), + RGBADir(RGBAVector(1.0f, -1.0f, -1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, -1.0f, -1.0f, 0.0f)) +}; + +static void ChangePointForDirWithoutPbitChange(RGBAVector &v, int dir, const float step[kNumColorChannels]) { + if(dir % 2) { + v.x -= step[0]; + } + else { + v.x += step[0]; + } + + if(((dir / 2) % 2)) { + v.y -= step[1]; + } + else { + v.y += step[1]; + } + + if(((dir / 4) % 2)) { + v.z -= step[2]; + } + else { + v.z += step[2]; + } + + if(((dir / 8) % 2)) { + v.a -= step[3]; + } + else { + v.a += step[3]; + } +} + +static void ChangePointForDirWithPbitChange(RGBAVector &v, int dir, int oldPbit, const float step[kNumColorChannels]) { + if(dir % 2 && oldPbit == 0) { + v.x -= step[0]; + } + else if(!(dir % 2) && oldPbit == 1) { + v.x += step[0]; + } + + if(((dir / 2) % 2) && oldPbit == 0) { + v.y -= step[1]; + } + else if(!((dir / 2) % 2) && oldPbit == 1) { + v.y += step[1]; + } + + if(((dir / 4) % 2) && oldPbit == 0) { + v.z -= step[2]; + } + else if(!((dir / 4) % 2) && oldPbit == 1) { + v.z += step[2]; + } + + if(((dir / 8) % 2) && oldPbit == 0) { + v.a -= step[3]; + } + else if(!((dir / 8) % 2) && oldPbit == 1) { + v.a += step[3]; + } +} + +void BC7CompressionMode::PickBestNeighboringEndpoints(const RGBACluster &cluster, const RGBAVector &p1, const RGBAVector &p2, const int curPbitCombo, RGBAVector &np1, RGBAVector &np2, int &nPbitCombo, const VisitedState *visitedStates, int nVisited, float stepSz) const { + + // !SPEED! There might be a way to make this faster since we're working + // with floating point values that are powers of two. We should be able + // to just set the proper bits in the exponent and leave the mantissa to 0. 
+ float step[kNumColorChannels] = { + stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)), + stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)), + stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)), + stepSz * float(1 << (8 - GetAlphaChannelPrecision())) + }; + + if(m_IsOpaque) { + step[(GetRotationMode() + 3) % kNumColorChannels] = 0.0f; + } + + // First, let's figure out the new pbit combo... if there's no pbit then we don't need + // to worry about it. + const bool hasPbits = GetPBitType() != ePBitType_None; + if(hasPbits) { + + // If there is a pbit, then we must change it, because those will provide the closest values + // to the current point. + if(GetPBitType() == ePBitType_Shared) + nPbitCombo = (curPbitCombo + 1) % 2; + else { + // Not shared... p1 needs to change and p2 needs to change... which means that + // combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 and vice + // versa... + nPbitCombo = 3 - curPbitCombo; + } + + assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1); + assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1); + } + + bool visited = true; + int infLoopPrevent = -1; + while(visited && ++infLoopPrevent < 16) { + for(int pt = 0; pt < 2; pt++) { + + const RGBAVector &p = (pt)? p1 : p2; + RGBAVector &np = (pt)? np1 : np2; + + np = p; + if(hasPbits) + ChangePointForDirWithPbitChange(np, fastrand() % 16, GetPBitCombo(curPbitCombo)[pt], step); + else + ChangePointForDirWithoutPbitChange(np, fastrand() % 16, step); + + for(int i = 0; i < kNumColorChannels; i++) { + np.c[i] = min(max(np.c[i], 0.0f), 255.0f); + } + } + + visited = false; + for(int i = 0; i < nVisited; i++) { + visited = visited || ( + visitedStates[i].p1 == np1 && + visitedStates[i].p2 == np2 && + visitedStates[i].pBitCombo == nPbitCombo + ); + } + } +} + +// Fast generation of floats between 0 and 1. 
It generates a float +// whose exponent forces the value to be between 1 and 2, then it +// populates the mantissa with a random assortment of bits, and returns +// the bytes interpreted as a float. This prevents two things: 1, a +// division, and 2, a cast from an integer to a float. + +#define COMPILE_ASSERT(x) extern int __compile_assert_[(int)(x)]; +COMPILE_ASSERT(RAND_MAX == 0x7FFF) + +static inline float frand() { + const uint16 r = fastrand(); + + // RAND_MAX is 0x7FFF, which offers 15 bits + // of precision. Therefore, we move the bits + // into the top of the 23 bit mantissa, and + // repeat the most significant bits of r in + // the least significant of the mantissa + const uint32 m = (r << 8) | (r >> 7); + const uint32 flt = (127 << 23) | m; + return *(reinterpret_cast(&flt)) - 1.0f; +} + +bool BC7CompressionMode::AcceptNewEndpointError(double newError, double oldError, float temp) const { + + // Always accept better endpoints. + if(newError < oldError) + return true; + + const double p = exp((0.1f * (oldError - newError)) / temp); + const double r = frand(); + + return r < p; +} + +double BC7CompressionMode::OptimizeEndpointsForCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const { + + const int nBuckets = (1 << GetNumberOfBitsPerIndex()); + const int nPbitCombos = GetNumPbitCombos(); + const uint32 qmask = GetQuantizationMask(); + + // Here we use simulated annealing to traverse the space of clusters to find the best possible endpoints. + double curError = cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo), bestIndices); + int curPbitCombo = bestPbitCombo; + double bestError = curError; + + // Clamp endpoints to the grid... 
+ uint32 qp1, qp2; + if(GetPBitType() != ePBitType_None) { + qp1 = p1.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[0]); + qp2 = p2.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[1]); + } + else { + qp1 = p1.ToPixel(qmask); + qp2 = p2.ToPixel(qmask); + } + + uint8 *pqp1 = (uint8 *)&qp1; + uint8 *pqp2 = (uint8 *)&qp2; + + p1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3])); + p2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3])); + + RGBAVector bp1 = p1, bp2 = p2; + + assert(curError == cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo))); + + int lastVisitedState = 0; + VisitedState visitedStates[kMaxAnnealingIterations]; + + visitedStates[lastVisitedState].p1 = p1; + visitedStates[lastVisitedState].p2 = p2; + visitedStates[lastVisitedState].pBitCombo = curPbitCombo; + lastVisitedState++; + + const int maxEnergy = MaxAnnealingIterations; + + for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) { + + float temp = float(energy) / float(maxEnergy-1); + + int indices[kMaxNumDataPoints]; + RGBAVector np1, np2; + int nPbitCombo; + + PickBestNeighboringEndpoints(cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, visitedStates, lastVisitedState); + + double error = cluster.QuantizedError(np1, np2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(nPbitCombo), indices); + if(AcceptNewEndpointError(error, curError, temp)) { + curError = error; + p1 = np1; + p2 = np2; + curPbitCombo = nPbitCombo; + } + + if(error < bestError) { + memcpy(bestIndices, indices, sizeof(indices)); + bp1 = np1; + bp2 = np2; + bestPbitCombo = nPbitCombo; + bestError = error; + + visitedStates[lastVisitedState].p1 = np1; + visitedStates[lastVisitedState].p2 = np2; + visitedStates[lastVisitedState].pBitCombo = nPbitCombo; + lastVisitedState++; + + // Restart... 
+ energy = 0; + } + } + + p1 = bp1; + p2 = bp2; + + return bestError; +} + +double BC7CompressionMode::CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int *alphaIndices) const { + + assert(GetModeNumber() == 4 || GetModeNumber() == 5); + assert(GetNumberOfSubsets() == 1); + assert(cluster.GetNumPoints() == kMaxNumDataPoints); + assert(m_Attributes->alphaChannelPrecision > 0); + + // If all the points are the same in the cluster, then we need to figure out what the best + // approximation to this point is.... + if(cluster.AllSamePoint()) { + + assert(!"We should only be using this function in modes 4 & 5 that have a single subset, in which case single colors should have been detected much earlier."); + + const RGBAVector &p = cluster.GetPoint(0); + int dummyPbit = 0; + double bestErr = CompressSingleColor(p, p1, p2, dummyPbit); + + // We're assuming all indices will be index 1... + for(int i = 0; i < cluster.GetNumPoints(); i++) { + bestIndices[i] = 1; + alphaIndices[i] = 1; + } + + return bestErr; + } + + RGBACluster rgbCluster; + float alphaVals[kMaxNumDataPoints]; + + float alphaMin = FLT_MAX, alphaMax = -FLT_MAX; + for(int i = 0; i < cluster.GetNumPoints(); i++) { + + RGBAVector v = cluster.GetPoint(i); + switch(GetRotationMode()) { + default: + case 0: + // Do nothing + break; + + case 1: + swap(v.r, v.a); + break; + + case 2: + swap(v.g, v.a); + break; + + case 3: + swap(v.b, v.a); + break; + } + + alphaVals[i] = v.a; + v.a = 255.0f; + + alphaMin = min(alphaVals[i], alphaMin); + alphaMax = max(alphaVals[i], alphaMax); + + rgbCluster.AddPoint(v); + } + + int dummyPbit = 0; + RGBAVector rgbp1, rgbp2; + double rgbError = CompressCluster(rgbCluster, rgbp1, rgbp2, bestIndices, dummyPbit); + + float a1 = alphaMin, a2 = alphaMax; + double alphaError = DBL_MAX; + + typedef uint32 tInterpPair[2]; + typedef tInterpPair tInterpLevel[16]; + const tInterpLevel *interpVals = kBC7InterpolationValues + (GetNumberOfBitsPerAlpha() - 
1); + const float weight = GetErrorMetric().a; + + const int nBuckets = (1 << GetNumberOfBitsPerAlpha()); + + // If they're the same, then we can get them exactly. + if(a1 == a2) + { + const uint8 step = 1 << (8-GetAlphaChannelPrecision()); + const uint8 a1be = uint8(a1); + const uint8 a2be = uint8(a2); + const uint8 a1b = ::QuantizeChannel(a1be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + const uint8 a2b = ::QuantizeChannel(a2be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + + // Mode 5 has 8 bits of precision for alpha. + if(GetModeNumber() == 5) { + + assert(a1 == float(a1b)); + assert(a2 == float(a2b)); + + for(int i = 0; i < kMaxNumDataPoints; i++) + alphaIndices[i] = 0; + + alphaError = 0.0; + } + else { + assert(GetModeNumber() == 4); + + // Mode 4 can be treated like the 6 channel of DXT1 compression. + if(Optimal6CompressDXT1[a1be][0][0]) { + a1 = float((Optimal6CompressDXT1[a1be][1][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4)); + a2 = float((Optimal6CompressDXT1[a2be][1][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4)); + } + else { + a1 = float((Optimal6CompressDXT1[a1be][0][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4)); + a2 = float((Optimal6CompressDXT1[a2be][0][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4)); + } + + if(m_IndexMode == 1) { + for(int i = 0; i < kMaxNumDataPoints; i++) + alphaIndices[i] = 1; + } + else { + for(int i = 0; i < kMaxNumDataPoints; i++) + alphaIndices[i] = 2; + } + + uint32 interp0 = (*interpVals)[alphaIndices[0] & 0xFF][0]; + uint32 interp1 = (*interpVals)[alphaIndices[0] & 0xFF][1]; + + const uint8 ip = (((uint32(a1) * interp0) + (uint32(a2) * interp1) + 32) >> 6) & 0xFF; + float pxError = weight * float((a1be > ip)? a1be - ip : ip - a1be); + pxError *= pxError; + alphaError = 16 * pxError; + } + } + else { + + float vals[1<<3]; + memset(vals, 0, sizeof(vals)); + + int buckets[kMaxNumDataPoints]; + + // Figure out initial positioning. 
+ for(int i = 0; i < nBuckets; i++) { + vals[i] = alphaMin + (float(i)/float(nBuckets-1)) * (alphaMax - alphaMin); + } + + // Assign each value to a bucket + for(int i = 0; i < kMaxNumDataPoints; i++) { + + float minDist = 255.0f; + for(int j = 0; j < nBuckets; j++) { + float dist = fabs(alphaVals[i] - vals[j]); + if(dist < minDist) { + minDist = dist; + buckets[i] = j; + } + } + } + + float npts[1 << 3]; + + // Do k-means + bool fixed = false; + while(!fixed) { + + memset(npts, 0, sizeof(npts)); + + float avg[1 << 3]; + memset(avg, 0, sizeof(avg)); + + // Calculate average of each cluster + for(int i = 0; i < nBuckets; i++) { + for(int j = 0; j < kMaxNumDataPoints; j++) { + + if(buckets[j] == i) { + avg[i] += alphaVals[j]; + npts[i] += 1.0f; + } + } + + if(npts[i] > 0.0f) + avg[i] /= npts[i]; + } + + // Did we change anything? + fixed = true; + for(int i = 0; i < nBuckets; i++) { + fixed = fixed && (avg[i] == vals[i]); + } + + // Reassign indices... + memcpy(vals, avg, sizeof(vals)); + + // Reassign each value to a bucket + for(int i = 0; i < kMaxNumDataPoints; i++) { + + float minDist = 255.0f; + for(int j = 0; j < nBuckets; j++) { + float dist = fabs(alphaVals[i] - vals[j]); + if(dist < minDist) { + minDist = dist; + buckets[i] = j; + } + } + } + } + + // Do least squares fit of vals. 
+ float asq = 0.0, bsq = 0.0, ab = 0.0; + float ax(0.0), bx(0.0); + for(int i = 0; i < nBuckets; i++) { + float a = float(nBuckets - 1 - i) / float(nBuckets - 1); + float b = float(i) / float(nBuckets - 1); + + float n = npts[i]; + float x = vals[i]; + + asq += n * a * a; + bsq += n * b * b; + ab += n * a * b; + + ax += x * a * n; + bx += x * b * n; + } + + float f = 1.0f / (asq * bsq - ab * ab); + a1 = f * (ax * bsq - bx * ab); + a2 = f * (bx * asq - ax * ab); + + // Clamp + a1 = min(255.0f, max(0.0f, a1)); + a2 = min(255.0f, max(0.0f, a2)); + + // Quantize + const uint8 a1b = ::QuantizeChannel(uint8(a1), (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + const uint8 a2b = ::QuantizeChannel(uint8(a2), (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + + // Compute error + for(int i = 0; i < kMaxNumDataPoints; i++) { + + uint8 val = uint8(alphaVals[i]); + + float minError = FLT_MAX; + int bestBucket = -1; + + for(int j = 0; j < nBuckets; j++) { + uint32 interp0 = (*interpVals)[j][0]; + uint32 interp1 = (*interpVals)[j][1]; + + const uint8 ip = (((uint32(a1b) * interp0) + (uint32(a2b) * interp1) + 32) >> 6) & 0xFF; + float pxError = weight * float((val > ip)? val - ip : ip - val); + pxError *= pxError; + + if(pxError < minError) { + minError = pxError; + bestBucket = j; + } + } + + alphaError += minError; + alphaIndices[i] = bestBucket; + } + } + + for(int i = 0; i < kNumColorChannels; i++) { + p1.c[i] = (i == (kNumColorChannels-1))? a1 : rgbp1.c[i]; + p2.c[i] = (i == (kNumColorChannels-1))? a2 : rgbp2.c[i]; + } + + return rgbError + alphaError; +} + +double BC7CompressionMode::CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const { + + // If all the points are the same in the cluster, then we need to figure out what the best + // approximation to this point is.... 
  if(cluster.AllSamePoint()) {
    const RGBAVector &p = cluster.GetPoint(0);
    double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);

    // We're assuming all indices will be index 1...
    for(int i = 0; i < cluster.GetNumPoints(); i++) {
      bestIndices[i] = 1;
    }

    return bestErr;
  }

  const int nBuckets = (1 << GetNumberOfBitsPerIndex());
  // NOTE(review): nPbitCombos is currently unused in this function.
  const int nPbitCombos = GetNumPbitCombos();
  const uint32 qmask = GetQuantizationMask();

#if 1
  // Seed the endpoints via PCA: project every point onto the cluster's
  // principal axis and take the extreme projections as p1/p2.
  RGBAVector avg = cluster.GetTotal() / float(cluster.GetNumPoints());
  RGBADir axis;
  ::GetPrincipalAxis(cluster.GetNumPoints(), cluster.GetPoints(), axis);

  float mindp = FLT_MAX, maxdp = -FLT_MAX;
  for(int i = 0 ; i < cluster.GetNumPoints(); i++) {
    float dp = (cluster.GetPoint(i) - avg) * axis;
    if(dp < mindp) mindp = dp;
    if(dp > maxdp) maxdp = dp;
  }

  p1 = avg + mindp * axis;
  p2 = avg + maxdp * axis;
#else
  // Cheaper alternative seed: axis-aligned bounding box corners.
  cluster.GetBoundingBox(p1, p2);
#endif

  ClampEndpoints(p1, p2);

  RGBAVector pts[1 << 4]; // At most 4 bits per index.
  int numPts[1<<4];
  assert(nBuckets <= 1 << 4);

  // Initial palette: evenly spaced interpolations between the endpoints.
  for(int i = 0; i < nBuckets; i++) {
    float s = (float(i) / float(nBuckets - 1));
    pts[i] = (1.0f - s) * p1 + s * p2;
  }

  assert(pts[0] == p1);
  assert(pts[nBuckets - 1] == p2);

  // Do k-means clustering...
  int bucketIdx[kMaxNumDataPoints];

  bool fixed = false;
  while(!fixed) {

    RGBAVector newPts[1 << 4];

    // Assign each of the existing points to one of the buckets...
    for(int i = 0; i < cluster.GetNumPoints(); i++) {

      int minBucket = -1;
      float minDist = FLT_MAX;
      for(int j = 0; j < nBuckets; j++) {
        RGBAVector v = cluster.GetPoint(i) - pts[j];
        float distSq = v * v;
        if(distSq < minDist)
        {
          minDist = distSq;
          minBucket = j;
        }
      }

      assert(minBucket >= 0);
      bucketIdx[i] = minBucket;
    }

    // Calculate new buckets based on centroids of clusters...
    for(int i = 0; i < nBuckets; i++) {

      numPts[i] = 0;
      newPts[i] = RGBAVector(0.0f);
      for(int j = 0; j < cluster.GetNumPoints(); j++) {
        if(bucketIdx[j] == i) {
          numPts[i]++;
          newPts[i] += cluster.GetPoint(j);
        }
      }

      // If there are no points in this cluster, then it should
      // remain the same as last time and avoid a divide by zero.
      if(0 != numPts[i])
        newPts[i] /= float(numPts[i]);
    }

    // If we haven't changed, then we're done.
    fixed = true;
    for(int i = 0; i < nBuckets; i++) {
      if(pts[i] != newPts[i])
        fixed = false;
    }

    // Assign the new points to be the old points.
    for(int i = 0; i < nBuckets; i++) {
      pts[i] = newPts[i];
    }
  }

  // If there's only one bucket filled, then just compress for that single color...
  int numBucketsFilled = 0, lastFilledBucket = -1;
  for(int i = 0; i < nBuckets; i++) {
    if(numPts[i] > 0) {
      numBucketsFilled++;
      lastFilledBucket = i;
    }
  }

  assert(numBucketsFilled > 0);
  if(1 == numBucketsFilled) {
    const RGBAVector &p = pts[lastFilledBucket];
    double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);

    // We're assuming all indices will be index 1...
    for(int i = 0; i < cluster.GetNumPoints(); i++) {
      bestIndices[i] = 1;
    }

    return bestErr;
  }

  // Now that we know the index of each pixel, we can assign the endpoints based on a least squares fit
  // of the clusters. For more information, take a look at this article by NVidia:
  // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf
  float asq = 0.0, bsq = 0.0, ab = 0.0;
  RGBAVector ax(0.0), bx(0.0);
  for(int i = 0; i < nBuckets; i++) {
    float a = float(nBuckets - 1 - i) / float(nBuckets - 1);
    float b = float(i) / float(nBuckets - 1);

    int n = numPts[i];
    RGBAVector x = pts[i];

    asq += float(n) * a * a;
    bsq += float(n) * b * b;
    ab += float(n) * a * b;

    ax += x * a * float(n);
    bx += x * b * float(n);
  }

  float f = 1.0f / (asq * bsq - ab * ab);
  p1 = f * (ax * bsq - bx * ab);
  p2 = f * (bx * asq - ax * ab);

  ClampEndpointsToGrid(p1, p2, bestPbitCombo);

  #ifdef _DEBUG
  // Sanity check: clamping an already-clamped pair must be a no-op.
  int pBitCombo = bestPbitCombo;
  RGBAVector tp1 = p1, tp2 = p2;
  ClampEndpointsToGrid(tp1, tp2, pBitCombo);

  assert(p1 == tp1);
  assert(p2 == tp2);
  assert(pBitCombo == bestPbitCombo);
  #endif

  assert(bestPbitCombo >= 0);

  return OptimizeEndpointsForCluster(cluster, p1, p2, bestIndices, bestPbitCombo);
}

// Compresses the given clusters with this mode and writes the resulting
// 128-bit block to the stream.  shapeIdx selects the partition; clusters must
// hold GetNumberOfSubsets() entries.  Returns the total quantized error.
double BC7CompressionMode::Compress(BitStream &stream, const int shapeIdx, const RGBACluster *clusters) {

  const int kModeNumber = GetModeNumber();
  const int nPartitionBits = GetNumberOfPartitionBits();
  const int nSubsets = GetNumberOfSubsets();

  // Mode #: encoded as (mode) zero bits followed by a one bit.
  stream.WriteBits(1 << kModeNumber, kModeNumber + 1);

  // Partition #
  assert((((1 << nPartitionBits) - 1) & shapeIdx) == shapeIdx);
  stream.WriteBits(shapeIdx, nPartitionBits);

  RGBAVector p1[kMaxNumSubsets], p2[kMaxNumSubsets];
  // -1 marks "not assigned"; the debug check below relies on this.
  int bestIndices[kMaxNumSubsets][kMaxNumDataPoints] = {
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }
  };
  int bestAlphaIndices[kMaxNumDataPoints] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
  int bestPbitCombo[kMaxNumSubsets] = {
-1, -1, -1 };
  int bestRotationMode = -1, bestIndexMode = -1;

  double totalErr = 0.0;
  for(int cidx = 0; cidx < nSubsets; cidx++) {
    int indices[kMaxNumDataPoints];

    if(m_Attributes->hasRotation) {

      assert(nSubsets == 1);

      int alphaIndices[kMaxNumDataPoints];

      // Try every rotation (and, for mode 4, both index modes) and keep the
      // combination with the smallest error.
      double bestError = DBL_MAX;
      for(int rotMode = 0; rotMode < 4; rotMode++) {

        SetRotationMode(rotMode);
        const int nIdxModes = kModeNumber == 4? 2 : 1;

        for(int idxMode = 0; idxMode < nIdxModes; idxMode++) {

          SetIndexMode(idxMode);

          RGBAVector v1, v2;
          double error = CompressCluster(clusters[cidx], v1, v2, indices, alphaIndices);
          if(error < bestError) {
            bestError = error;

            memcpy(bestIndices[cidx], indices, sizeof(indices));
            memcpy(bestAlphaIndices, alphaIndices, sizeof(alphaIndices));

            bestRotationMode = rotMode;
            bestIndexMode = idxMode;

            p1[cidx] = v1;
            p2[cidx] = v2;
          }
        }
      }

      totalErr += bestError;
    }
    else {
      // Compress this cluster
      totalErr += CompressCluster(clusters[cidx], p1[cidx], p2[cidx], indices, bestPbitCombo[cidx]);

      // Map the indices to their proper position: CompressCluster returns
      // them densely packed, but the block stores them per-pixel.
      int idx = 0;
      for(int i = 0; i < 16; i++) {
        int subs = GetSubsetForIndex(i, shapeIdx, GetNumberOfSubsets());
        if(subs == cidx) {
          bestIndices[cidx][i] = indices[idx++];
        }
      }
    }
  }

  // Writing 0 bits is a no-op, so these only emit for modes that have the
  // corresponding fields.
  stream.WriteBits(bestRotationMode, m_Attributes->hasRotation? 2 : 0);
  stream.WriteBits(bestIndexMode, m_Attributes->hasIdxMode? 1 : 0);

#ifdef _DEBUG
  // Every pixel must have been assigned an index by exactly one subset.
  for(int i = 0; i < kMaxNumDataPoints; i++) {

    int nSet = 0;
    for(int j = 0; j < nSubsets; j++) {
      if(bestIndices[j][i] >= 0)
        nSet++;
    }

    assert(nSet == 1);
  }
#endif

  // Get the quantization mask
  const uint32 qmask = GetQuantizationMask();

  // Quantize the points...
  uint32 pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets];
  for(int i = 0; i < nSubsets; i++) {
    switch(GetPBitType()) {
      default:
      case ePBitType_None:
        pixel1[i] = p1[i].ToPixel(qmask);
        pixel2[i] = p2[i].ToPixel(qmask);
        break;

      case ePBitType_Shared:
      case ePBitType_NotShared:
        pixel1[i] = p1[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[0]);
        pixel2[i] = p2[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[1]);
        break;
    }
  }

  // If the anchor index does not have 0 in the leading bit, then
  // we need to swap EVERYTHING.  (The decoder infers the dropped anchor bit
  // as zero, so the encoder must arrange for that to be true.)
  for(int sidx = 0; sidx < nSubsets; sidx++) {

    int anchorIdx = GetAnchorIndexForSubset(sidx, shapeIdx, nSubsets);
    assert(bestIndices[sidx][anchorIdx] != -1);

    const int nAlphaIndexBits = GetNumberOfBitsPerAlpha(bestIndexMode);
    const int nIndexBits = GetNumberOfBitsPerIndex(bestIndexMode);
    if(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)) {
      // Swap endpoints and complement every index so the palette ordering is
      // preserved.
      uint32 t = pixel1[sidx]; pixel1[sidx] = pixel2[sidx]; pixel2[sidx] = t;

      int nIndexVals = 1 << nIndexBits;
      for(int i = 0; i < 16; i++) {
        bestIndices[sidx][i] = (nIndexVals - 1) - bestIndices[sidx][i];
      }

      int nAlphaIndexVals = 1 << nAlphaIndexBits;
      if(m_Attributes->hasRotation) {
        for(int i = 0; i < 16; i++) {
          bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
        }
      }
    }

    if(m_Attributes->hasRotation && bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)) {
      // Alpha has its own anchor constraint: swap just the alpha bytes of the
      // endpoints and complement the alpha indices.
      uint8 * bp1 = (uint8 *)(&pixel1[sidx]);
      uint8 * bp2 = (uint8 *)(&pixel2[sidx]);
      uint8 t = bp1[3]; bp1[3] = bp2[3]; bp2[3] = t;

      int nAlphaIndexVals = 1 << nAlphaIndexBits;
      for(int i = 0; i < 16; i++) {
        bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
      }
    }

    assert(!(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)));
    assert(!m_Attributes->hasRotation || !(bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)));
  }

  // Get the quantized values: unpack each RGBA8 pixel into per-channel bytes.
  uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets];
  uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets];
  for(int i = 0; i < nSubsets; i++) {
    r1[i] = pixel1[i] & 0xFF;
    r2[i] = pixel2[i] & 0xFF;

    g1[i] = (pixel1[i] >> 8) & 0xFF;
    g2[i] = (pixel2[i] >> 8) & 0xFF;

    b1[i] = (pixel1[i] >> 16) & 0xFF;
    b2[i] = (pixel2[i] >> 16) & 0xFF;

    a1[i] = (pixel1[i] >> 24) & 0xFF;
    a2[i] = (pixel2[i] >> 24) & 0xFF;
  }

  // Write them out... (top bits only; the channels were quantized to the
  // mode's precision above).
  const int nRedBits = m_Attributes->colorChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits);
    stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits);
  }

  const int nGreenBits = m_Attributes->colorChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits);
    stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits);
  }

  const int nBlueBits = m_Attributes->colorChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits);
    stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits);
  }

  const int nAlphaBits = m_Attributes->alphaChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits);
    stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits);
  }

  // Write out the best pbits..
  if(GetPBitType() != ePBitType_None) {
    for(int s = 0; s < nSubsets; s++) {
      const int *pbits = GetPBitCombo(bestPbitCombo[s]);
      stream.WriteBits(pbits[0], 1);
      if(GetPBitType() != ePBitType_Shared)
        stream.WriteBits(pbits[1], 1);
    }
  }

  // If our index mode has changed, then we need to write the alpha indices first.
  if(m_Attributes->hasIdxMode && bestIndexMode == 1) {

    assert(m_Attributes->hasRotation);

    for(int i = 0; i < 16; i++) {
      const int idx = bestAlphaIndices[i];
      assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
      assert(GetNumberOfBitsPerAlpha(bestIndexMode) == 2);
      assert(idx >= 0 && idx < (1 << 2));
      assert(i != 0 || !(idx >> 1) || !"Leading bit of anchor index is not zero!");
      // The anchor pixel's leading bit is implicit, so it writes one fewer bit.
      stream.WriteBits(idx, (i == 0)? 1 : 2);
    }

    for(int i = 0; i < 16; i++) {
      const int idx = bestIndices[0][i];
      assert(GetSubsetForIndex(i, shapeIdx, nSubsets) == 0);
      assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
      assert(GetNumberOfBitsPerIndex(bestIndexMode) == 3);
      assert(idx >= 0 && idx < (1 << 3));
      assert(i != 0 || !(idx >> 2) || !"Leading bit of anchor index is not zero!");
      stream.WriteBits(idx, (i == 0)? 2 : 3);
    }
  }
  else {
    for(int i = 0; i < 16; i++) {
      const int subs = GetSubsetForIndex(i, shapeIdx, nSubsets);
      const int idx = bestIndices[subs][i];
      const int anchorIdx = GetAnchorIndexForSubset(subs, shapeIdx, nSubsets);
      const int nBitsForIdx = GetNumberOfBitsPerIndex(bestIndexMode);
      assert(idx >= 0 && idx < (1 << nBitsForIdx));
      assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
      stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
    }

    if(m_Attributes->hasRotation) {
      for(int i = 0; i < 16; i++) {
        const int idx = bestAlphaIndices[i];
        const int anchorIdx = 0;
        const int nBitsForIdx = GetNumberOfBitsPerAlpha(bestIndexMode);
        assert(idx >= 0 && idx < (1 << nBitsForIdx));
        assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
        stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
      }
    }
  }
  // A BC7 block is exactly 128 bits.
  assert(stream.GetBitsWritten() == 128);
  return totalErr;
}

namespace BC7C
{
  // Function prototypes
  static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock);
  static void CompressBC7Block(const uint32 *block, uint8 *outBuf);

  // Quality knob: clamped lower bound is 0; effectively caps the number of
  // annealing iterations (see CompressImageBC7).
  static int gQualityLevel = 50;
  void SetQualityLevel(int q) {
    gQualityLevel = max(0, q);
  }
  int GetQualityLevel() { return gQualityLevel; }

  // Returns true if the entire block is a single color.
  static bool AllOneColor(const uint32 block[16]) {
    const uint32 pixel = block[0];
    for(int i = 1; i < 16; i++) {
      if( block[i] != pixel )
        return false;
    }

    return true;
  }

  // Write out a transparent block.
  static void WriteTransparentBlock(BitStream &stream) {
    // Use mode 6: an all-zero mode-6 block decodes to transparent black.
    stream.WriteBits(1 << 6, 7);
    stream.WriteBits(0, 128-7);
    assert(stream.GetBitsWritten() == 128);
  }

  // Compresses a single color optimally and outputs the result.
  static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) {

    stream.WriteBits(1 << 5, 6); // Mode 5
    stream.WriteBits(0, 2); // No rotation bits.

    uint8 r = pixel & 0xFF;
    uint8 g = (pixel >> 8) & 0xFF;
    uint8 b = (pixel >> 16) & 0xFF;
    uint8 a = (pixel >> 24) & 0xFF;

    // Red endpoints (precomputed optimal 7-bit endpoint pair per 8-bit value).
    stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7);
    stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7);

    // Green endpoints
    stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7);
    stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7);

    // Blue endpoints
    stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7);
    stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7);

    // Alpha endpoints... are just the same.
    stream.WriteBits(a, 8);
    stream.WriteBits(a, 8);

    // Color indices are 1 for each pixel...
    // Anchor index is 0, so 1 bit for the first pixel, then
    // 01 for each following pixel giving the sequence of 31 bits:
    // ...010101011
    stream.WriteBits(0xaaaaaaab, 31);

    // Alpha indices...
    stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31);
  }

  // Bookkeeping for which mode the most recent block/compression chose.
  static int gModeChosen = -1;
  static int gBestMode = -1;

  // Compress an image using BC7 compression. Use the inBuf parameter to point to an image in
  // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels.
  // The buffer pointed to by outBuf should be large enough to store the compressed image. This
  // implementation has a 4:1 compression ratio.
  void CompressImageBC7(const uint8* inBuf, uint8* outBuf, int width, int height)
  {
    uint32 block[16];
    BC7CompressionMode::ResetNumUses();
    BC7CompressionMode::MaxAnnealingIterations = min(BC7CompressionMode::kMaxAnnealingIterations, GetQualityLevel());

    // Walk the image in 4x4 blocks; each block compresses to 16 bytes.
    for(int j = 0; j < height; j += 4, inBuf += width * 4 * 4)
    {
      for(int i = 0; i < width; i += 4)
      {
        ExtractBlock(inBuf + i * 4, width, block);
        CompressBC7Block(block, outBuf);

        BC7CompressionMode::NumUses[gBestMode]++;

        outBuf += 16;
      }
    }
  }

  // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter
  // specifies the size of the image in pixels.
  static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock)
  {
    // Copy one 4-pixel (16-byte) row at a time; width*4 is the image stride.
    for(int j = 0; j < 4; j++)
    {
      memcpy(&colorBlock[j * 4], inPtr, 4 * 4);
      inPtr += width * 4;
    }
  }

  // Tries the two-subset modes (1, 3 and — for non-opaque blocks — 7) on the
  // given shape, writes the best 16-byte block to outBuf, records the winner
  // in gModeChosen, and returns its error.  Early-outs on a perfect match.
  static double CompressTwoClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {

    uint8 tempBuf1[16];
    BitStream tmpStream1(tempBuf1, 128, 0);
    BC7CompressionMode compressor1(1, opaque);

    double bestError = compressor1.Compress(tmpStream1, shapeIdx, clusters);
    memcpy(outBuf, tempBuf1, 16);
    gModeChosen = 1;
    if(bestError == 0.0) {
      return 0.0;
    }

    uint8 tempBuf3[16];
    BitStream tmpStream3(tempBuf3, 128, 0);
    BC7CompressionMode compressor3(3, opaque);

    double error;
    if((error = compressor3.Compress(tmpStream3, shapeIdx, clusters)) < bestError) {
      gModeChosen = 3;
      bestError = error;
      memcpy(outBuf, tempBuf3, 16);
      if(bestError == 0.0) {
        return 0.0;
      }
    }

    // Mode 3 offers more precision for RGB data. Mode 7 is really only if we have alpha.
    if(!opaque)
    {
      uint8 tempBuf7[16];
      BitStream tmpStream7(tempBuf7, 128, 0);
      BC7CompressionMode compressor7(7, opaque);
      if((error = compressor7.Compress(tmpStream7, shapeIdx, clusters)) < bestError) {
        gModeChosen = 7;
        memcpy(outBuf, tempBuf7, 16);
        return error;
      }
    }

    return bestError;
  }

  // Tries the three-subset modes (0 and 2) on the given shape.  Mode 0 only
  // encodes 16 partitions, so it is skipped for shapeIdx >= 16.
  static double CompressThreeClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {

    uint8 tempBuf0[16];
    BitStream tmpStream0(tempBuf0, 128, 0);

    uint8 tempBuf2[16];
    BitStream tmpStream2(tempBuf2, 128, 0);

    BC7CompressionMode compressor0(0, opaque);
    BC7CompressionMode compressor2(2, opaque);

    // NOTE(review): when shapeIdx >= 16, tempBuf0 is never written, yet it is
    // memcpy'd into outBuf below.  In practice mode 2 then always wins (its
    // error is < DBL_MAX) and overwrites outBuf — confirm that invariant.
    double error, bestError = (shapeIdx < 16)? compressor0.Compress(tmpStream0, shapeIdx, clusters) : DBL_MAX;
    gModeChosen = 0;
    memcpy(outBuf, tempBuf0, 16);
    if(bestError == 0.0) {
      return 0.0;
    }

    if((error = compressor2.Compress(tmpStream2, shapeIdx, clusters)) < bestError) {
      gModeChosen = 2;
      memcpy(outBuf, tempBuf2, 16);
      return error;
    }

    return bestError;
  }

  // Splits the 16 points into two clusters according to the two-subset
  // partition table entry for shapeIdx (set bit => cluster 1).
  static void PopulateTwoClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
    const uint16 shape = kShapeMask2[shapeIdx];
    for(int pt = 0; pt < kMaxNumDataPoints; pt++) {

      const RGBAVector &p = points.GetPoint(pt);

      if((1 << pt) & shape)
        clusters[1].AddPoint(p);
      else
        clusters[0].AddPoint(p);
    }

    assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
    assert((clusters[0].GetPointBitString() ^ clusters[1].GetPointBitString()) == 0xFFFF);
    assert((shape & clusters[1].GetPointBitString()) == shape);
  }

  // Splits the 16 points into three clusters using the two-level three-subset
  // partition masks for shapeIdx.
  static void PopulateThreeClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
    for(int pt = 0; pt < kMaxNumDataPoints; pt++) {

      const RGBAVector &p = points.GetPoint(pt);

      if((1 << pt) & kShapeMask3[shapeIdx][0]) {
        if((1 << pt) & kShapeMask3[shapeIdx][1])
          clusters[2].AddPoint(p);
        else
          clusters[1].AddPoint(p);
      }
      else
        clusters[0].AddPoint(p);
    }

    assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
    assert(!(clusters[2].GetPointBitString() & clusters[1].GetPointBitString()));
    assert(!(clusters[0].GetPointBitString() & clusters[2].GetPointBitString()));
  }

  // Cheap error estimate for a cluster under a two-subset mode: quantized
  // error against the bounding-box endpoints with 8 palette entries.  The
  // small 0.0001 bias keeps a non-degenerate cluster from reporting exactly 0.
  static double EstimateTwoClusterError(RGBACluster &c) {
    RGBAVector Min, Max, v;
    c.GetBoundingBox(Min, Max);
    v = Max - Min;
    if(v * v == 0) {
      return 0.0;
    }

    const float *w = BC7C::GetErrorMetric();
    return 0.0001 + c.QuantizedError(Min, Max, 8, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3]));
  }

  // Same as above but with 4 palette entries, matching three-subset modes.
  static double EstimateThreeClusterError(RGBACluster &c) {
    RGBAVector Min, Max, v;
    c.GetBoundingBox(Min, Max);
    v = Max - Min;
    if(v * v == 0) {
      return 0.0;
    }

    const float *w = BC7C::GetErrorMetric();
    return 0.0001 + c.QuantizedError(Min, Max, 4, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3]));
  }

  // Compress a single block: fast paths for constant-color and fully
  // transparent blocks, then shape estimation followed by trying modes
  // 6 (always), 4/5 (non-opaque), the best two-subset shape and — for opaque
  // blocks — the best three-subset shape.  Writes 16 bytes to outBuf and
  // records the winning mode in gBestMode.
  static void CompressBC7Block(const uint32 *block, uint8 *outBuf) {

    // All a single color?
    if(AllOneColor(block)) {
      BitStream bStrm(outBuf, 128, 0);
      CompressOptimalColorBC7(*block, bStrm);
      gBestMode = 5;
      return;
    }

    RGBACluster blockCluster;
    bool opaque = true;
    bool transparent = true;

    for(int i = 0; i < kMaxNumDataPoints; i++) {
      RGBAVector p = RGBAVector(i, block[i]);
      blockCluster.AddPoint(p);
      if(fabs(p.a - 255.0f) > 1e-10)
        opaque = false;

      if(p.a > 0.0f)
        transparent = false;
    }

    // The whole block is transparent?
    if(transparent) {
      BitStream bStrm(outBuf, 128, 0);
      WriteTransparentBlock(bStrm);
      gBestMode = 6;
      return;
    }

    // First we must figure out which shape to use. To do this, simply
    // see which shape has the smallest sum of minimum bounding spheres.
    double bestError[2] = { DBL_MAX, DBL_MAX };
    int bestShapeIdx[2] = { -1, -1 };
    RGBACluster bestClusters[2][3];

    for(int i = 0; i < kNumShapes2; i++)
    {
      RGBACluster clusters[2];
      PopulateTwoClustersForShape(blockCluster, i, clusters);

      double err = 0.0;
      for(int ci = 0; ci < 2; ci++) {
        err += EstimateTwoClusterError(clusters[ci]);
      }

      // If it's small, we'll take it!
      if(err < 1e-9) {
        CompressTwoClusters(i, clusters, outBuf, opaque);
        gBestMode = gModeChosen;
        return;
      }

      if(err < bestError[0]) {
        bestError[0] = err;
        bestShapeIdx[0] = i;
        bestClusters[0][0] = clusters[0];
        bestClusters[0][1] = clusters[1];
      }
    }

    // There are not 3 subset blocks that support alpha, so only check these
    // if the entire block is opaque.
    if(opaque) {
      for(int i = 0; i < kNumShapes3; i++) {

        RGBACluster clusters[3];
        PopulateThreeClustersForShape(blockCluster, i, clusters);

        double err = 0.0;
        for(int ci = 0; ci < 3; ci++) {
          err += EstimateThreeClusterError(clusters[ci]);
        }

        // If it's small, we'll take it!
        if(err < 1e-9) {
          CompressThreeClusters(i, clusters, outBuf, opaque);
          gBestMode = gModeChosen;
          return;
        }

        if(err < bestError[1]) {
          bestError[1] = err;
          bestShapeIdx[1] = i;
          bestClusters[1][0] = clusters[0];
          bestClusters[1][1] = clusters[1];
          bestClusters[1][2] = clusters[2];
        }
      }
    }

    uint8 tempBuf1[16], tempBuf2[16];

    // Mode 6 is the single-subset baseline; tempBuf1 always holds the best
    // block found so far, outBuf only receives it at the end (or on早 exit).
    BitStream tempStream1 (tempBuf1, 128, 0);
    BC7CompressionMode compressor(6, opaque);
    double best = compressor.Compress(tempStream1, 0, &blockCluster);
    gBestMode = 6;
    if(best == 0.0f) {
      memcpy(outBuf, tempBuf1, 16);
      return;
    }

    // Check modes 4 and 5 if the block isn't opaque...
    if(!opaque) {
      for(int mode = 4; mode <= 5; mode++) {

        BitStream tempStream2(tempBuf2, 128, 0);
        BC7CompressionMode compressorTry(mode, opaque);

        double error = compressorTry.Compress(tempStream2, 0, &blockCluster);
        if(error < best) {

          gBestMode = mode;
          best = error;

          if(best == 0.0f) {
            memcpy(outBuf, tempBuf2, 16);
            return;
          }
          else {
            memcpy(tempBuf1, tempBuf2, 16);
          }
        }
      }
    }

    double error = CompressTwoClusters(bestShapeIdx[0], bestClusters[0], tempBuf2, opaque);
    if(error < best) {

      gBestMode = gModeChosen;
      best = error;

      if(error == 0.0f) {
        memcpy(outBuf, tempBuf2, 16);
        return;
      }
      else {
        memcpy(tempBuf1, tempBuf2, 16);
      }
    }

    if(opaque) {
      if(CompressThreeClusters(bestShapeIdx[1], bestClusters[1], tempBuf2, opaque) < best) {

        gBestMode = gModeChosen;
        memcpy(outBuf, tempBuf2, 16);

        return;
      }
    }

    memcpy(outBuf, tempBuf1, 16);
  }

  // Decodes a 16-byte BC7 block into 16 RGBA8 pixels.  (Body continues past
  // this chunk.)
  static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]) {

    BitStreamReadOnly strm(block);

    // The mode number is the count of leading zero bits before the first one.
    uint32 mode = 0;
while(!strm.ReadBit()) { + mode++; + } + + const BC7CompressionMode::Attributes *attrs = BC7CompressionMode::GetAttributesForMode(mode); + const uint32 nSubsets = attrs->numSubsets; + + uint32 idxMode = 0; + uint32 rotMode = 0; + uint32 shapeIdx = 0; + if ( nSubsets > 1 ) { + shapeIdx = strm.ReadBits(mode == 0? 4 : 6); + } + else if( attrs->hasRotation ) { + rotMode = strm.ReadBits(2); + if( attrs->hasIdxMode ) + idxMode = strm.ReadBit(); + } + + assert(idxMode < 2); + assert(rotMode < 4); + assert(shapeIdx < ((mode == 0)? 16 : 64)); + + uint32 cp = attrs->colorChannelPrecision; + const uint32 shift = 8 - cp; + + uint8 eps[3][2][4]; + for(uint32 ch = 0; ch < 3; ch++) + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 ep = 0; ep < 2; ep++) + eps[i][ep][ch] = strm.ReadBits(cp) << shift; + + uint32 ap = attrs->alphaChannelPrecision; + const uint32 ash = 8 - ap; + + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 ep = 0; ep < 2; ep++) + eps[i][ep][3] = strm.ReadBits(ap) << ash; + + // Handle pbits + switch(attrs->pbitType) { + case BC7CompressionMode::ePBitType_None: + // Do nothing. + break; + + case BC7CompressionMode::ePBitType_Shared: + + cp += 1; + ap += 1; + + for(uint32 i = 0; i < nSubsets; i++) { + + uint32 pbit = strm.ReadBit(); + + for(uint32 j = 0; j < 2; j++) + for(uint32 ch = 0; ch < kNumColorChannels; ch++) { + const uint32 prec = ch == 3? ap : cp; + eps[i][j][ch] |= pbit << (8-prec); + } + } + break; + + case BC7CompressionMode::ePBitType_NotShared: + + cp += 1; + ap += 1; + + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 j = 0; j < 2; j++) { + + uint32 pbit = strm.ReadBit(); + + for(uint32 ch = 0; ch < kNumColorChannels; ch++) { + const uint32 prec = ch == 3? ap : cp; + eps[i][j][ch] |= pbit << (8-prec); + } + } + break; + } + + // Quantize endpoints... + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 j = 0; j < 2; j++) + for(uint32 ch = 0; ch < kNumColorChannels; ch++) { + const uint32 prec = ch == 3? 
ap : cp; + eps[i][j][ch] |= eps[i][j][ch] >> prec; + } + + // Figure out indices... + uint32 alphaIndices[kMaxNumDataPoints]; + uint32 colorIndices[kMaxNumDataPoints]; + + int nBitsPerAlpha = attrs->numBitsPerAlpha; + int nBitsPerColor = attrs->numBitsPerIndex; + + uint32 idxPrec = attrs->numBitsPerIndex; + for(int i = 0; i < kMaxNumDataPoints; i++) { + uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); + + int idx = 0; + if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) { + idx = strm.ReadBits(idxPrec - 1); + } + else { + idx = strm.ReadBits(idxPrec); + } + colorIndices[i] = idx; + } + + idxPrec = attrs->numBitsPerAlpha; + if(idxPrec == 0) { + memcpy(alphaIndices, colorIndices, sizeof(alphaIndices)); + } + else { + for(int i = 0; i < kMaxNumDataPoints; i++) { + uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); + + int idx = 0; + if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) { + idx = strm.ReadBits(idxPrec - 1); + } + else { + idx = strm.ReadBits(idxPrec); + } + alphaIndices[i] = idx; + } + + if(idxMode) { + for(int i = 0; i < kMaxNumDataPoints; i++) { + swap(alphaIndices[i], colorIndices[i]); + } + + swap(nBitsPerAlpha, nBitsPerColor); + } + } + + assert(strm.GetBitsRead() == 128); + + // Get final colors by interpolating... 
+ for(int i = 0; i < kMaxNumDataPoints; i++) { + + const uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); + uint32 &pixel = outBuf[i]; + + pixel = 0; + for(int ch = 0; ch < 3; ch++) { + uint32 i0 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][0]; + uint32 i1 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][1]; + + const uint8 ip = (((uint32(eps[subset][0][ch]) * i0) + (uint32(eps[subset][1][ch]) * i1) + 32) >> 6) & 0xFF; + pixel |= ip << (8*ch); + } + + if(attrs->alphaChannelPrecision > 0) { + uint32 i0 = kBC7InterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][0]; + uint32 i1 = kBC7InterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][1]; + + const uint8 ip = (((uint32(eps[subset][0][3]) * i0) + (uint32(eps[subset][1][3]) * i1) + 32) >> 6) & 0xFF; + pixel |= ip << 24; + } + else { + pixel |= 0xFF000000; + } + + // Swap colors if necessary... + uint8 *pb = (uint8 *)&pixel; + switch(rotMode) { + default: + case 0: + // Do nothing + break; + + case 1: + swap(pb[0], pb[3]); + break; + + case 2: + swap(pb[1], pb[3]); + break; + + case 3: + swap(pb[2], pb[3]); + break; + } + } + } + + // Convert the image from a BC7 buffer to a RGBA8 buffer + void DecompressImageBC7(const uint8 *inBuf, uint8* outBuf, int width, int height) { + + int blockIdx = 0; + for(int j = 0; j < height; j += 4, outBuf += width * 3 * 4) + { + for(int i = 0; i < width; i += 4) + { + uint32 pixels[16]; + DecompressBC7Block(inBuf + (16*(blockIdx++)), pixels); + + memcpy(outBuf, pixels, 4 * sizeof(uint32)); + memcpy(outBuf + (width * 4), pixels + 4, 4 * sizeof(uint32)); + memcpy(outBuf + 2*(width * 4), pixels + 8, 4 * sizeof(uint32)); + memcpy(outBuf + 3*(width * 4), pixels + 12, 4 * sizeof(uint32)); + outBuf += 16; + } + } + } +} diff --git a/BPTCEncoder/src/BC7CompressorSIMD.cpp b/BPTCEncoder/src/BC7CompressorSIMD.cpp new file mode 100755 index 0000000..ba543d6 --- /dev/null +++ b/BPTCEncoder/src/BC7CompressorSIMD.cpp @@ -0,0 +1,1270 @@ 
+//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "BC7IntTypes.h" +#include "BC7Compressor.h" +#include "BC7CompressionModeSIMD.h" +#include "RGBAEndpointsSIMD.h" +#include "BCLookupTables.h" +#include "BitStream.h" + +#ifdef _MSC_VER +#define ALIGN_SSE __declspec( align(16) ) +#else +#define ALIGN_SSE __attribute__((aligned(16))) +#endif + +static const uint32 kNumShapes2 = 64; +static const uint16 kShapeMask2[kNumShapes2] = { + 0xcccc, 0x8888, 0xeeee, 0xecc8, 0xc880, 0xfeec, 0xfec8, 0xec80, + 0xc800, 0xffec, 0xfe80, 0xe800, 0xffe8, 0xff00, 0xfff0, 0xf000, + 0xf710, 0x008e, 0x7100, 0x08ce, 0x008c, 0x7310, 0x3100, 0x8cce, + 0x088c, 0x3110, 0x6666, 0x366c, 0x17e8, 0x0ff0, 0x718e, 0x399c, + 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, + 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x0660, + 0x0272, 0x04e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, + 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0x0fcc, 0x7744, 0xee22 +}; + +static 
const int kAnchorIdx2[kNumShapes2] = { + 15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15, + 15, 2, 8, 2, 2, 8, 8,15, + 2, 8, 2, 2, 8, 8, 2, 2, + 15,15, 6, 8, 2, 8,15,15, + 2, 8, 2, 2, 2,15,15, 6, + 6, 2, 6, 8,15,15, 2, 2, + 15,15,15,15,15, 2, 2, 15 +}; + +static const uint32 kNumShapes3 = 64; +static const uint16 kShapeMask3[kNumShapes3][2] = { + { 0xfecc, 0xf600 }, { 0xffc8, 0x7300 }, { 0xff90, 0x3310 }, { 0xecce, 0x00ce }, { 0xff00, 0xcc00 }, { 0xcccc, 0xcc00 }, { 0xffcc, 0x00cc }, { 0xffcc, 0x3300 }, + { 0xff00, 0xf000 }, { 0xfff0, 0xf000 }, { 0xfff0, 0xff00 }, { 0xcccc, 0x8888 }, { 0xeeee, 0x8888 }, { 0xeeee, 0xcccc }, { 0xffec, 0xec80 }, { 0x739c, 0x7310 }, + { 0xfec8, 0xc800 }, { 0x39ce, 0x3100 }, { 0xfff0, 0xccc0 }, { 0xfccc, 0x0ccc }, { 0xeeee, 0xee00 }, { 0xff88, 0x7700 }, { 0xeec0, 0xcc00 }, { 0x7730, 0x3300 }, + { 0x0cee, 0x00cc }, { 0xffcc, 0xfc88 }, { 0x6ff6, 0x0660 }, { 0xff60, 0x6600 }, { 0xcbbc, 0xc88c }, { 0xf966, 0xf900 }, { 0xceec, 0x0cc0 }, { 0xff10, 0x7310 }, + { 0xff80, 0xec80 }, { 0xccce, 0x08ce }, { 0xeccc, 0xec80 }, { 0x6666, 0x4444 }, { 0x0ff0, 0x0f00 }, { 0x6db6, 0x4924 }, { 0x6bd6, 0x4294 }, { 0xcf3c, 0x0c30 }, + { 0xc3fc, 0x03c0 }, { 0xffaa, 0xff00 }, { 0xff00, 0x5500 }, { 0xfcfc, 0xcccc }, { 0xcccc, 0x0c0c }, { 0xf6f6, 0x6666 }, { 0xaffa, 0x0ff0 }, { 0xfff0, 0x5550 }, + { 0xfaaa, 0xf000 }, { 0xeeee, 0x0e0e }, { 0xf8f8, 0x8888 }, { 0xfff0, 0x9990 }, { 0xeeee, 0xe00e }, { 0x8ff8, 0x8888 }, { 0xf666, 0xf000 }, { 0xff00, 0x9900 }, + { 0xff66, 0xff00 }, { 0xcccc, 0xc00c }, { 0xcffc, 0xcccc }, { 0xf000, 0x9000 }, { 0x8888, 0x0808 }, { 0xfefe, 0xeeee }, { 0xfffa, 0xfff0 }, { 0x7bde, 0x7310 } +}; + +static const uint32 kWMValues[] = { 0x32b92180, 0x32ba3080, 0x31103200, 0x28103c80, 0x32bb3080, 0x25903600, 0x3530b900, 0x3b32b180, 0x34b5b980 }; +static const uint32 kNumWMVals = sizeof(kWMValues) / sizeof(kWMValues[0]); +static uint32 gWMVal = -1; + +static const int kAnchorIdx3[2][kNumShapes3] = { + { 3, 3,15,15, 8, 3,15,15, + 8, 8, 6, 6, 6, 
5, 3, 3, + 3, 3, 8,15, 3, 3, 6,10, + 5, 8, 8, 6, 8, 5,15,15, + 8,15, 3, 5, 6,10, 8,15, + 15, 3,15, 5,15,15,15,15, + 3,15, 5, 5, 5, 8, 5,10, + 5,10, 8,13,15,12, 3, 3 }, + + { 15, 8, 8, 3,15,15, 3, 8, + 15,15,15,15,15,15,15, 8, + 15, 8,15, 3,15, 8,15, 8, + 3,15, 6,10,15,15,10, 8, + 15, 3,15,10,10, 8, 9,10, + 6,15, 8,15, 3, 6, 6, 8, + 15, 3,15,15,15,15,15,15, + 15,15,15,15, 3,15,15, 8 } +}; + +const uint32 kBC7InterpolationValuesScalar[4][16][2] = { + { {64, 0}, {33, 31}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {43, 21}, {21, 43}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64} } +}; + +static const ALIGN_SSE uint32 kZeroVector[4] = { 0, 0, 0, 0 }; +const __m128i kBC7InterpolationValuesSIMD[4][16][2] = { + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector)}, + { _mm_set1_epi32(33), _mm_set1_epi32(31) }, + { *((const __m128i *)kZeroVector), _mm_set1_epi32(64) }, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector)}, + { _mm_set1_epi32(43), _mm_set1_epi32(21)}, + { _mm_set1_epi32(21), _mm_set1_epi32(43)}, + { *((const __m128i *)kZeroVector), _mm_set1_epi32(64)}, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector) }, + { _mm_set1_epi32(55), _mm_set1_epi32(9) }, + { _mm_set1_epi32(46), _mm_set1_epi32(18)}, + { _mm_set1_epi32(37), _mm_set1_epi32(27)}, + { _mm_set1_epi32(27), _mm_set1_epi32(37)}, + { _mm_set1_epi32(18), _mm_set1_epi32(46)}, + { _mm_set1_epi32(9), _mm_set1_epi32(55)}, + { *((const __m128i *)kZeroVector), _mm_set1_epi32(64)}, + 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector)}, + { _mm_set1_epi32(60), _mm_set1_epi32(4)}, + { 
_mm_set1_epi32(55), _mm_set1_epi32(9)},
+ { _mm_set1_epi32(51), _mm_set1_epi32(13)},
+ { _mm_set1_epi32(47), _mm_set1_epi32(17)},
+ { _mm_set1_epi32(43), _mm_set1_epi32(21)},
+ { _mm_set1_epi32(38), _mm_set1_epi32(26)},
+ { _mm_set1_epi32(34), _mm_set1_epi32(30)},
+ { _mm_set1_epi32(30), _mm_set1_epi32(34)},
+ { _mm_set1_epi32(26), _mm_set1_epi32(38)},
+ { _mm_set1_epi32(21), _mm_set1_epi32(43)},
+ { _mm_set1_epi32(17), _mm_set1_epi32(47)},
+ { _mm_set1_epi32(13), _mm_set1_epi32(51)},
+ { _mm_set1_epi32(9), _mm_set1_epi32(55)},
+ { _mm_set1_epi32(4), _mm_set1_epi32(60)},
+ { *((const __m128i *)kZeroVector), _mm_set1_epi32(64)}
+ } // 4-bit index row: each (i0, i1) weight pair sums to 64, mirroring kBC7InterpolationValuesScalar above.
+};
+
+// Mask selecting the low byte of each 32-bit lane.
+static const ALIGN_SSE uint32 kByteValMask[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
+// Per-lane absolute difference of unsigned bytes (|a - b| via max/min + saturating
+// subtract), masked so only the low byte of each 32-bit lane survives.
+static inline __m128i sad(const __m128i &a, const __m128i &b) {
+ const __m128i maxab = _mm_max_epu8(a, b);
+ const __m128i minab = _mm_min_epu8(a, b);
+ return _mm_and_si128( *((const __m128i *)kByteValMask), _mm_subs_epu8( maxab, minab ) );
+}
+
+// NOTE(review): the header names on the #include lines below were lost when this
+// text was extracted (angle-bracket content stripped) — restore from upstream.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef max
+template
+// NOTE(review): the template parameter list (e.g. typename T) was stripped from this text.
+static T max(const T &a, const T &b) {
+ return (a > b)? a : b;
+}
+#endif
+
+#ifndef min
+template
+static T min(const T &a, const T &b) {
+ return (a < b)? a : b;
+}
+#endif
+
+int BC7CompressionModeSIMD::MaxAnnealingIterations = 50; // This is a setting. Upper bound on the annealing loop in OptimizeEndpointsForCluster.
+int BC7CompressionModeSIMD::NumUses[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; // Per-mode counters; presumably usage statistics — use not visible here, TODO confirm.
+
+// Attribute row for each BC7 mode number (index == mode). Column meanings are
+// defined by the Attributes struct in BC7CompressionModeSIMD.h; from the accessors
+// used below they include partition bits, subset count, index bits, per-channel
+// precision, and the pbit type. NOTE(review): confirm exact column order against the header.
+BC7CompressionModeSIMD::Attributes BC7CompressionModeSIMD::kModeAttributes[kNumModes] = {
+ { 0, 4, 3, 3, 4, 4, 4, 0, BC7CompressionModeSIMD::ePBitType_NotShared },
+ { 1, 6, 2, 3, 6, 6, 6, 0, BC7CompressionModeSIMD::ePBitType_Shared },
+ { 2, 6, 3, 2, 5, 5, 5, 0, BC7CompressionModeSIMD::ePBitType_None },
+ { 3, 6, 2, 2, 7, 7, 7, 0, BC7CompressionModeSIMD::ePBitType_NotShared },
+ { 0 }, // Mode 4 not supported
+ { 0 }, // Mode 5 not supported
+ { 6, 0, 1, 4, 7, 7, 7, 7, BC7CompressionModeSIMD::ePBitType_NotShared },
+ { 7, 6, 2, 2, 5, 5, 5, 5, BC7CompressionModeSIMD::ePBitType_NotShared },
+};
+
+// Snaps endpoints p1/p2 onto this mode's representable (quantized) color grid,
+// trying every pbit combination and keeping the one whose round-tripped
+// endpoints are closest to the originals; the winner is returned through
+// bestPBitCombo. (Search body continues past this chunk.)
+void BC7CompressionModeSIMD::ClampEndpointsToGrid(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPBitCombo) const {
+ const int nPbitCombos = GetNumPbitCombos();
+ const bool hasPbits = nPbitCombos > 1;
+ __m128i qmask;
+ GetQuantizationMask(qmask);
+
+ ClampEndpoints(p1, p2);
+
+ // !SPEED! This can be faster. We're searching through all possible
+ // pBit combos to find the best one. Instead, we should be seeing what
+ // the pBit type is for this compression mode and finding the closest
+ // quantization.
+ float minDist = FLT_MAX; + RGBAVectorSIMD bp1, bp2; + for(int i = 0; i < nPbitCombos; i++) { + + __m128i qp1, qp2; + if(hasPbits) { + qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]); + qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]); + } + else { + qp1 = p1.ToPixel(qmask); + qp2 = p2.ToPixel(qmask); + } + + RGBAVectorSIMD np1 = RGBAVectorSIMD( _mm_cvtepi32_ps( qp1 ) ); + RGBAVectorSIMD np2 = RGBAVectorSIMD( _mm_cvtepi32_ps( qp2 ) ); + + RGBAVectorSIMD d1 = np1 - p1; + RGBAVectorSIMD d2 = np2 - p2; + float dist = (d1 * d1) + (d2 * d2); + if(dist < minDist) { + minDist = dist; + bp1 = np1; bp2 = np2; + bestPBitCombo = i; + } + } + + p1 = bp1; + p2 = bp2; +} + +int BC7CompressionModeSIMD::GetSubsetForIndex(int idx, const int shapeIdx) const { + int subset = 0; + + const int nSubsets = GetNumberOfSubsets(); + switch(nSubsets) { + case 2: + { + subset = !!((1 << idx) & kShapeMask2[shapeIdx]); + } + break; + + case 3: + { + if(1 << idx & kShapeMask3[shapeIdx][0]) + subset = 1 + !!((1 << idx) & kShapeMask3[shapeIdx][1]); + else + subset = 0; + } + break; + + default: + break; + } + + return subset; +} + +int BC7CompressionModeSIMD::GetAnchorIndexForSubset(int subset, const int shapeIdx) const { + + const int nSubsets = GetNumberOfSubsets(); + int anchorIdx = 0; + + switch(subset) { + case 1: + { + if(nSubsets == 2) { + anchorIdx = kAnchorIdx2[shapeIdx]; + } + else { + anchorIdx = kAnchorIdx3[0][shapeIdx]; + } + } + break; + + case 2: + { + assert(nSubsets == 3); + anchorIdx = kAnchorIdx3[1][shapeIdx]; + } + break; + + default: + break; + } + + return anchorIdx; +} + +double BC7CompressionModeSIMD::CompressSingleColor(const RGBAVectorSIMD &p, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPbitCombo) const { + + // Our pixel to compress... 
+ const __m128i pixel = p.ToPixel(*((const __m128i *)kByteValMask)); + + uint32 bestDist = 0xFF; + bestPbitCombo = -1; + + for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) { + + const int *pbitCombo = GetPBitCombo(pbi); + + uint32 dist = 0x0; + uint32 bestValI[kNumColorChannels] = { -1, -1, -1, -1 }; + uint32 bestValJ[kNumColorChannels] = { -1, -1, -1, -1 }; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + + const uint8 val = ((uint8 *)(&pixel))[4*ci]; + int nBits = 0; + switch(ci) { + case 0: nBits = GetRedChannelPrecision(); break; + case 1: nBits = GetGreenChannelPrecision(); break; + case 2: nBits = GetBlueChannelPrecision(); break; + case 3: nBits = GetAlphaChannelPrecision(); break; + } + + // If we don't handle this channel, then we don't need to + // worry about how well we interpolate. + if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; continue; } + + const int nPossVals = (1 << nBits); + int possValsH[256]; + int possValsL[256]; + + // Do we have a pbit? + const bool havepbit = GetPBitType() != ePBitType_None; + if(havepbit) + nBits++; + + for(int i = 0; i < nPossVals; i++) { + + int vh = i, vl = i; + if(havepbit) { + vh <<= 1; + vl <<= 1; + + vh |= pbitCombo[1]; + vl |= pbitCombo[0]; + } + + possValsH[i] = (vh << (8 - nBits)); + possValsH[i] |= (possValsH[i] >> nBits); + + possValsL[i] = (vl << (8 - nBits)); + possValsL[i] |= (possValsL[i] >> nBits); + } + + const uint32 interpVal0 = kBC7InterpolationValuesScalar[GetNumberOfBitsPerIndex() - 1][1][0]; + const uint32 interpVal1 = kBC7InterpolationValuesScalar[GetNumberOfBitsPerIndex() - 1][1][1]; + + // Find the closest interpolated val that to the given val... + uint32 bestChannelDist = 0xFF; + for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++) + for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) { + + const uint32 v1 = possValsL[i]; + const uint32 v2 = possValsH[j]; + + const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6; + const uint32 err = (combo > val)? 
combo - val : val - combo; + + if(err < bestChannelDist) { + bestChannelDist = err; + bestValI[ci] = v1; + bestValJ[ci] = v2; + } + } + + dist = max(bestChannelDist, dist); + } + + if(dist < bestDist) { + bestDist = dist; + bestPbitCombo = pbi; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + p1.c[ci] = float(bestValI[ci]); + p2.c[ci] = float(bestValJ[ci]); + } + } + } + + return bestDist; +} + +static const ALIGN_SSE uint32 kOneVec[4] = { 1, 1, 1, 1 }; + +// Fast random number generator. See more information at +// http://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/ +static uint32 g_seed = uint32(time(NULL)); +static inline uint32 fastrand() { + g_seed = (214013 * g_seed + 2531011); + return (g_seed>>16) & RAND_MAX; +} + +static __m128i cur_seed = _mm_set1_epi32( int(time(NULL)) ); +static inline __m128i rand_dir() +{ + // static const __m128i mult = _mm_set_epi32( 214013, 17405, 214013, 69069 ); + // static const __m128i gadd = _mm_set_epi32( 2531011, 10395331, 13737667, 1 ); + static const ALIGN_SSE uint32 mult[4] = { 214013, 17405, 214013, 0 }; + static const ALIGN_SSE uint32 gadd[4] = { 2531011, 10395331, 13737667, 0 }; + static const ALIGN_SSE uint32 masklo[4] = { RAND_MAX, RAND_MAX, RAND_MAX, RAND_MAX }; + + cur_seed = _mm_mullo_epi32( *((const __m128i *)mult), cur_seed ); + cur_seed = _mm_add_epi32( *((const __m128i *)gadd), cur_seed ); + + const __m128i resShift = _mm_srai_epi32( cur_seed, 16 ); + const __m128i result = _mm_and_si128( resShift, *((const __m128i *)kOneVec) ); + + return result; +} + +// Fast generation of floats between 0 and 1. It generates a float +// whose exponent forces the value to be between 1 and 2, then it +// populates the mantissa with a random assortment of bits, and returns +// the bytes interpreted as a float. This prevents two things: 1, a +// division, and 2, a cast from an integer to a float. 
+
+#define COMPILE_ASSERT(x) extern int __compile_assert_[(int)(x)];
+COMPILE_ASSERT(RAND_MAX == 0x7FFF) // frand() below depends on a 15-bit RAND_MAX. NOTE(review): true for MSVC, but glibc uses 0x7FFFFFFF, so this fails to compile there.
+
+// Returns a random float in [0, 1): builds the bits of a float in [1, 2)
+// (fixed exponent 127, random mantissa) and subtracts 1 — avoiding both a
+// divide and an int-to-float conversion (see comment preceding this chunk).
+static inline float frand() {
+ const uint16 r = fastrand();
+
+ // RAND_MAX is 0x7FFF, which offers 15 bits
+ // of precision. Therefore, we move the bits
+ // into the top of the 23 bit mantissa, and
+ // repeat the most significant bits of r in
+ // the least significant of the mantissa
+ const uint32 m = (r << 8) | (r >> 7);
+ const uint32 flt = (127 << 23) | m; // NOTE(review): reinterpret_cast below lost its type argument (angle-bracket content stripped from this text).
+ return *(reinterpret_cast(&flt)) - 1.0f;
+}
+
+static const ALIGN_SSE uint32 kSevenVec[4] = { 7, 7, 7, 7 };
+static const ALIGN_SSE uint32 kNegOneVec[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+static const ALIGN_SSE uint32 kFloatSignBit[4] = { 0x40000000, 0x40000000, 0x40000000, 0x40000000 }; // NOTE(review): despite the name, 0x40000000 is the top exponent bit, not the IEEE-754 sign bit (0x80000000) — confirm intent where OptimizeEndpointsForCluster builds step sizes.
+
+// Nudges each channel of v by +/- stepVec, per-channel direction chosen
+// randomly by rand_dir() (blend picks stepVec or its negation per lane).
+static void ChangePointForDirWithoutPbitChange(RGBAVectorSIMD &v, const __m128 &stepVec) {
+
+ const __m128i dirBool = rand_dir();
+ const __m128i cmp = _mm_cmpeq_epi32( dirBool, *((const __m128i *)kZeroVector) );
+
+ const __m128 negStepVec = _mm_sub_ps( _mm_castsi128_ps( *((const __m128i *)kZeroVector) ), stepVec );
+ const __m128 step = _mm_blendv_ps( negStepVec, stepVec, _mm_castsi128_ps( cmp ) );
+ v.vec = _mm_add_ps( v.vec, step );
+}
+
+// Like the function above, but biases the random step direction using the old
+// pbit: lanes may only step down when the pbit was 0 (shouldDec) or up when it
+// was 1 (shouldInc), so the move cooperates with the simultaneous pbit flip.
+static void ChangePointForDirWithPbitChange(RGBAVectorSIMD &v, int oldPbit, const __m128 &stepVec) {
+
+ const __m128i pBitVec = _mm_set1_epi32( oldPbit );
+ const __m128i cmpPBit = _mm_cmpeq_epi32( pBitVec, *((const __m128i *)kZeroVector) );
+ const __m128i notCmpPBit = _mm_xor_si128( cmpPBit, *((const __m128i *)kNegOneVec) );
+
+ const __m128i dirBool = rand_dir();
+ const __m128i cmpDir = _mm_cmpeq_epi32( dirBool, *((const __m128i *)kOneVec) );
+ const __m128i notCmpDir = _mm_xor_si128( cmpDir, *((const __m128i *)kNegOneVec) );
+
+ const __m128i shouldDec = _mm_and_si128( cmpDir, cmpPBit );
+ const __m128i shouldInc = _mm_and_si128( notCmpDir, notCmpPBit );
+
+ const __m128 decStep = _mm_blendv_ps( _mm_castsi128_ps( *((const __m128i *)kZeroVector)
), stepVec, _mm_castsi128_ps( shouldDec ) ); + v.vec = _mm_sub_ps( v.vec, decStep ); + + const __m128 incStep = _mm_blendv_ps( _mm_castsi128_ps( *((const __m128i *)kZeroVector) ), stepVec, _mm_castsi128_ps( shouldInc ) ); + v.vec = _mm_add_ps( v.vec, incStep ); +} + +void BC7CompressionModeSIMD::PickBestNeighboringEndpoints(const RGBAClusterSIMD &cluster, const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, const int curPbitCombo, RGBAVectorSIMD &np1, RGBAVectorSIMD &np2, int &nPbitCombo, const __m128 &stepVec) const { + + np1 = p1; + np2 = p2; + + // First, let's figure out the new pbit combo... if there's no pbit then we don't need + // to worry about it. + const EPBitType pBitType = GetPBitType(); + if(pBitType != ePBitType_None) { + + // If there is a pbit, then we must change it, because those will provide the closest values + // to the current point. + if(pBitType == ePBitType_Shared) + nPbitCombo = (curPbitCombo + 1) % 2; + else { + // Not shared... p1 needs to change and p2 needs to change... which means that + // combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 and vice + // versa... 
+ nPbitCombo = 3 - curPbitCombo; // flip both pbits: combo 0 <-> 3, combo 1 <-> 2
+ }
+
+ assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1);
+ assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1);
+
+ const int *pBitCombo = GetPBitCombo(curPbitCombo);
+ ChangePointForDirWithPbitChange(np1, pBitCombo[0], stepVec);
+ ChangePointForDirWithPbitChange(np2, pBitCombo[1], stepVec);
+ }
+ else {
+ ChangePointForDirWithoutPbitChange(np1, stepVec);
+ ChangePointForDirWithoutPbitChange(np2, stepVec);
+ }
+
+ ClampEndpoints(np1, np2);
+}
+
+// Metropolis-style acceptance test for simulated annealing: an improvement
+// (newError < oldError) gives p > 1 and is always accepted; a regression is
+// accepted with probability exp(0.15 * (old - new) / temp).
+// NOTE(review): callers pass temp = energy/(maxEnergy-1), which STARTS at 0
+// (division yields +/-Inf; exp then gives Inf or 0, so behavior relies on IEEE
+// semantics) and GROWS over time — the inverse of a conventional cooling
+// schedule. Confirm this is intended.
+bool BC7CompressionModeSIMD::AcceptNewEndpointError(float newError, float oldError, float temp) const {
+
+ const float p = exp((0.15f * (oldError - newError)) / temp);
+ // const double r = (double(rand()) / double(RAND_MAX));
+ const float r = frand();
+
+ return r < p;
+}
+
+// Refines the endpoints p1/p2 (and pbit combo / per-pixel indices) for one
+// cluster by simulated annealing, starting from the caller's initial guess;
+// returns the best quantized error found. Body continues past this chunk.
+double BC7CompressionModeSIMD::OptimizeEndpointsForCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const {
+
+ const int nBuckets = (1 << GetNumberOfBitsPerIndex());
+ const int nPbitCombos = GetNumPbitCombos();
+ __m128i qmask;
+ GetQuantizationMask(qmask);
+
+ // Here we use simulated annealing to traverse the space of clusters to find the best possible endpoints.
+ float curError = cluster.QuantizedError(p1, p2, nBuckets, qmask, GetPBitCombo(bestPbitCombo), bestIndices); + int curPbitCombo = bestPbitCombo; + float bestError = curError; + RGBAVectorSIMD bp1 = p1, bp2 = p2; + + assert(curError == cluster.QuantizedError(p1, p2, nBuckets, qmask, GetPBitCombo(bestPbitCombo))); + + __m128i precVec = _mm_setr_epi32( GetRedChannelPrecision(), GetGreenChannelPrecision(), GetBlueChannelPrecision(), GetAlphaChannelPrecision() ); + const __m128i precMask = _mm_xor_si128( _mm_cmpeq_epi32( precVec, *((const __m128i *)kZeroVector) ), *((const __m128i *)kNegOneVec) ); + precVec = _mm_sub_epi32( *((const __m128i *)kSevenVec), precVec ); + precVec = _mm_slli_epi32( precVec, 23 ); + precVec = _mm_or_si128( precVec, *((const __m128i *)kFloatSignBit) ); + + //__m128 stepSzVec = _mm_set1_ps(1.0f); + //__m128 stepVec = _mm_mul_ps( stepSzVec, _mm_castsi128_ps( _mm_and_si128( precMask, precVec ) ) ); + __m128 stepVec = _mm_castsi128_ps( _mm_and_si128( precMask, precVec ) ); + + const int maxEnergy = MaxAnnealingIterations; + for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) { + + float temp = float(energy) / float(maxEnergy-1); + + __m128i indices[kMaxNumDataPoints/4]; + RGBAVectorSIMD np1, np2; + int nPbitCombo; + + PickBestNeighboringEndpoints(cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, stepVec); + + float error = cluster.QuantizedError(np1, np2, nBuckets, qmask, GetPBitCombo(nPbitCombo), indices); + if(AcceptNewEndpointError(error, curError, temp)) { + curError = error; + p1 = np1; + p2 = np2; + curPbitCombo = nPbitCombo; + } + + if(error < bestError) { + memcpy(bestIndices, indices, sizeof(indices)); + bp1 = np1; + bp2 = np2; + bestPbitCombo = nPbitCombo; + bestError = error; + + // Restart... 
+ energy = 0; + } + } + + p1 = bp1; + p2 = bp2; + + return bestError; +} + +double BC7CompressionModeSIMD::CompressCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const { + + // If all the points are the same in the cluster, then we need to figure out what the best + // approximation to this point is.... + if(cluster.AllSamePoint()) { + const RGBAVectorSIMD &p = cluster.GetPoint(0); + double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo); + + // We're assuming all indices will be index 1... + for(int i = 0; i < 4; i++) { + bestIndices[i] = _mm_set1_epi32(1); + } + + return bestErr; + } + + const int nBuckets = (1 << GetNumberOfBitsPerIndex()); + const int nPbitCombos = GetNumPbitCombos(); + + RGBAVectorSIMD avg = cluster.GetTotal() / float(cluster.GetNumPoints()); + RGBADirSIMD axis; + ::GetPrincipalAxis(cluster, axis); + + float mindp = FLT_MAX, maxdp = -FLT_MAX; + for(int i = 0 ; i < cluster.GetNumPoints(); i++) { + float dp = (cluster.GetPoint(i) - avg) * axis; + if(dp < mindp) mindp = dp; + if(dp > maxdp) maxdp = dp; + } + + RGBAVectorSIMD pts[1 << 4]; // At most 4 bits per index. + float numPts[1<<4]; + assert(nBuckets <= 1 << 4); + + p1 = avg + mindp * axis; + p2 = avg + maxdp * axis; + + ClampEndpoints(p1, p2); + + for(int i = 0; i < nBuckets; i++) { + float s = (float(i) / float(nBuckets - 1)); + pts[i] = (1.0f - s) * p1 + s * p2; + } + + assert(pts[0] == p1); + assert(pts[nBuckets - 1] == p2); + + // Do k-means clustering... + int bucketIdx[kMaxNumDataPoints]; + + bool fixed = false; + while(!fixed) { + + RGBAVectorSIMD newPts[1 << 4]; + + // Assign each of the existing points to one of the buckets... 
+ for(int i = 0; i < cluster.GetNumPoints(); i++) { + + int minBucket = -1; + float minDist = FLT_MAX; + for(int j = 0; j < nBuckets; j++) { + RGBAVectorSIMD v = cluster.GetPoint(i) - pts[j]; + float distSq = v * v; + if(distSq < minDist) + { + minDist = distSq; + minBucket = j; + } + } + + assert(minBucket >= 0); + bucketIdx[i] = minBucket; + } + + // Calculate new buckets based on centroids of clusters... + for(int i = 0; i < nBuckets; i++) { + + numPts[i] = 0.0f; + newPts[i] = RGBAVectorSIMD(0.0f); + for(int j = 0; j < cluster.GetNumPoints(); j++) { + if(bucketIdx[j] == i) { + numPts[i] += 1.0f; + newPts[i] += cluster.GetPoint(j); + } + } + + // If there are no points in this cluster, then it should + // remain the same as last time and avoid a divide by zero. + if(0.0f != numPts[i]) + newPts[i] /= numPts[i]; + } + + // If we haven't changed, then we're done. + fixed = true; + for(int i = 0; i < nBuckets; i++) { + if(pts[i] != newPts[i]) + fixed = false; + } + + // Assign the new points to be the old points. + for(int i = 0; i < nBuckets; i++) { + pts[i] = newPts[i]; + } + } + + // If there's only one bucket filled, then just compress for that single color... + int numBucketsFilled = 0, lastFilledBucket = -1; + for(int i = 0; i < nBuckets; i++) { + if(numPts[i] > 0.0f) { + numBucketsFilled++; + lastFilledBucket = i; + } + } + + assert(numBucketsFilled > 0); + if(1 == numBucketsFilled) { + const RGBAVectorSIMD &p = pts[lastFilledBucket]; + double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo); + + // We're assuming all indices will be index 1... + for(int i = 0; i < 4; i++) { + bestIndices[i] = _mm_set1_epi32(1); + } + + return bestErr; + } + + // Now that we know the index of each pixel, we can assign the endpoints based on a least squares fit + // of the clusters. 
For more information, take a look at this article by NVidia: + // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf + float asq = 0.0, bsq = 0.0, ab = 0.0; + RGBAVectorSIMD ax(0.0f), bx(0.0f); + for(int i = 0; i < nBuckets; i++) { + float a = float(nBuckets - 1 - i) / float(nBuckets - 1); + float b = float(i) / float(nBuckets - 1); + + float n = numPts[i]; + RGBAVectorSIMD x = pts[i]; + + asq += n * a * a; + bsq += n * b * b; + ab += n * a * b; + + ax += x * a * n; + bx += x * b * n; + } + + float f = 1.0f / (asq * bsq - ab * ab); + p1 = f * (ax * bsq - bx * ab); + p2 = f * (bx * asq - ax * ab); + + ClampEndpointsToGrid(p1, p2, bestPbitCombo); + + #ifdef _DEBUG + int pBitCombo = bestPbitCombo; + RGBAVectorSIMD tp1 = p1, tp2 = p2; + ClampEndpointsToGrid(tp1, tp2, pBitCombo); + + assert(p1 == tp1); + assert(p2 == tp2); + assert(pBitCombo == bestPbitCombo); + #endif + + assert(bestPbitCombo >= 0); + + return OptimizeEndpointsForCluster(cluster, p1, p2, bestIndices, bestPbitCombo); +} + +double BC7CompressionModeSIMD::Compress(BitStream &stream, const int shapeIdx, const RGBAClusterSIMD *clusters) const { + + const int kModeNumber = GetModeNumber(); + const int nPartitionBits = GetNumberOfPartitionBits(); + const int nSubsets = GetNumberOfSubsets(); + + // Mode # + stream.WriteBits(1 << kModeNumber, kModeNumber + 1); + + // Partition # + assert((((1 << nPartitionBits) - 1) & shapeIdx) == shapeIdx); + stream.WriteBits(shapeIdx, nPartitionBits); + + RGBAVectorSIMD p1[kMaxNumSubsets], p2[kMaxNumSubsets]; + int bestIndices[kMaxNumSubsets][kMaxNumDataPoints] = { + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } + }; + int bestPbitCombo[kMaxNumSubsets] = { -1, -1, -1 }; + + double totalErr = 0.0; + for(int cidx = 0; cidx < nSubsets; cidx++) { + ALIGN_SSE int 
indices[kMaxNumDataPoints]; + + // Compress this cluster + totalErr += CompressCluster(clusters[cidx], p1[cidx], p2[cidx], (__m128i *)indices, bestPbitCombo[cidx]); + + // !SPEED! We can precompute the subsets for each index based on the shape. This + // isn't the bottleneck for the compressor, but it could prove to be a little + // faster... + + // Map the indices to their proper position. + int idx = 0; + for(int i = 0; i < 16; i++) { + int subs = GetSubsetForIndex(i, shapeIdx); + if(subs == cidx) { + bestIndices[cidx][i] = indices[idx++]; + } + } + } + +#ifdef _DEBUG + for(int i = 0; i < kMaxNumDataPoints; i++) { + + int nSet = 0; + for(int j = 0; j < nSubsets; j++) { + if(bestIndices[j][i] >= 0) + nSet++; + } + + assert(nSet == 1); + } +#endif + + // Get the quantization mask + __m128i qmask; + GetQuantizationMask(qmask); + + //Quantize the points... + __m128i pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets]; + for(int i = 0; i < nSubsets; i++) { + switch(GetPBitType()) { + default: + case ePBitType_None: + pixel1[i] = p1[i].ToPixel(qmask); + pixel2[i] = p2[i].ToPixel(qmask); + break; + + case ePBitType_Shared: + case ePBitType_NotShared: + pixel1[i] = p1[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[0]); + pixel2[i] = p2[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[1]); + break; + } + } + + // If the anchor index does not have 0 in the leading bit, then + // we need to swap EVERYTHING. 
+ for(int sidx = 0; sidx < nSubsets; sidx++) { + + int anchorIdx = GetAnchorIndexForSubset(sidx, shapeIdx); + assert(bestIndices[sidx][anchorIdx] != -1); + + int nIndexBits = GetNumberOfBitsPerIndex(); + if(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)) { + __m128i t = pixel1[sidx]; pixel1[sidx] = pixel2[sidx]; pixel2[sidx] = t; + + int nIndexVals = 1 << nIndexBits; + for(int i = 0; i < 16; i++) { + bestIndices[sidx][i] = (nIndexVals - 1) - bestIndices[sidx][i]; + } + } + + assert(!(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1))); + } + + // Get the quantized values... + uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets]; + uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets]; + for(int i = 0; i < nSubsets; i++) { + r1[i] = ((uint8 *)(&(pixel1[i])))[0]; + r2[i] = ((uint8 *)(&(pixel2[i])))[0]; + + g1[i] = ((uint8 *)(&(pixel1[i])))[4]; + g2[i] = ((uint8 *)(&(pixel2[i])))[4]; + + b1[i] = ((uint8 *)(&(pixel1[i])))[8]; + b2[i] = ((uint8 *)(&(pixel2[i])))[8]; + + a1[i] = ((uint8 *)(&(pixel1[i])))[12]; + a2[i] = ((uint8 *)(&(pixel2[i])))[12]; + } + + // Write them out... 
+ const int nRedBits = GetRedChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits); + stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits); + } + + const int nGreenBits = GetGreenChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits); + stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits); + } + + const int nBlueBits = GetBlueChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits); + stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits); + } + + const int nAlphaBits = GetAlphaChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits); + stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits); + } + + // Write out the best pbits.. + if(GetPBitType() != ePBitType_None) { + for(int s = 0; s < nSubsets; s++) { + const int *pbits = GetPBitCombo(bestPbitCombo[s]); + stream.WriteBits(pbits[0], 1); + if(GetPBitType() != ePBitType_Shared) + stream.WriteBits(pbits[1], 1); + } + } + + for(int i = 0; i < 16; i++) { + const int subs = GetSubsetForIndex(i, shapeIdx); + const int idx = bestIndices[subs][i]; + const int anchorIdx = GetAnchorIndexForSubset(subs, shapeIdx); + const int nBitsForIdx = GetNumberOfBitsPerIndex(); + assert(idx >= 0 && idx < (1 << nBitsForIdx)); + assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!"); + stream.WriteBits(idx, (i == anchorIdx)? 
nBitsForIdx - 1 : nBitsForIdx); + } + + assert(stream.GetBitsWritten() == 128); + return totalErr; +} + +namespace BC7C +{ + static ErrorMetric gErrorMetric = eErrorMetric_Uniform; + void SetErrorMetric(ErrorMetric e) { gErrorMetric = e; } + + ALIGN_SSE const float kErrorMetrics[kNumErrorMetrics][kNumColorChannels] = { + { 1.0f, 1.0f, 1.0f, 1.0f }, + { sqrtf(0.3f), sqrtf(0.56f), sqrtf(0.11f), 1.0f } + }; + + const float *GetErrorMetric() { return kErrorMetrics[GetErrorMetricEnum()]; } + ErrorMetric GetErrorMetricEnum() { return gErrorMetric; } + + // Function prototypes + static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock); + static void CompressBC7Block(const uint32 *block, uint8 *outBuf); + + // Returns true if the entire block is a single color. + static bool AllOneColor(const uint32 block[16]) { + const uint32 pixel = block[0]; + for(int i = 1; i < 16; i++) { + if( block[i] != pixel ) + return false; + } + + return true; + } + + // Write out a transparent block. + static void WriteTransparentBlock(BitStream &stream) { + // Use mode 6 + stream.WriteBits(1 << 6, 7); + stream.WriteBits(0, 128-7); + assert(stream.GetBitsWritten() == 128); + } + + // Compresses a single color optimally and outputs the result. + static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) { + + stream.WriteBits(1 << 5, 6); // Mode 5 + stream.WriteBits(0, 2); // No rotation bits. + + uint8 r = pixel & 0xFF; + uint8 g = (pixel >> 8) & 0xFF; + uint8 b = (pixel >> 16) & 0xFF; + uint8 a = (pixel >> 24) & 0xFF; + + // Red endpoints + stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7); + stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7); + + // Green endpoints + stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7); + stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7); + + // Blue endpoints + stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7); + stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7); + + // Alpha endpoints... are just the same. 
+ stream.WriteBits(a, 8); + stream.WriteBits(a, 8); + + // Color indices are 1 for each pixel... + // Anchor index is 0, so 1 bit for the first pixel, then + // 01 for each following pixel giving the sequence of 31 bits: + // ...010101011 + stream.WriteBits(0xaaaaaaab, 31); + + // Alpha indices... + stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31); + } + + // Compress an image using BC7 compression. Use the inBuf parameter to point to an image in + // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels. + // The buffer pointed to by outBuf should be large enough to store the compressed image. This + // implementation has an 4:1 compression ratio. + void CompressImageBC7SIMD(const uint8* inBuf, uint8* outBuf, int width, int height) + { + ALIGN_SSE uint32 block[16]; + + _MM_SET_ROUNDING_MODE( _MM_ROUND_TOWARD_ZERO ); + BC7CompressionModeSIMD::ResetNumUses(); + + BC7CompressionModeSIMD::MaxAnnealingIterations = GetQualityLevel(); + + for(int j = 0; j < height; j += 4, inBuf += width * 4 * 4) + { + for(int i = 0; i < width; i += 4) + { + ExtractBlock(inBuf + i * 4, width, block); + CompressBC7Block(block, outBuf); + outBuf += 16; + } + } + } + + // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter + // specifies the size of the image in pixels. + static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock) + { + // Compute the stride. + const int stride = width * 4; + + // Copy the first row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)colorBlock, _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the second row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 4), _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the third row of pixels from inPtr into colorBlock. 
+ _mm_store_si128((__m128i*)(colorBlock + 8), _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the forth row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 12), _mm_load_si128((__m128i*)inPtr)); + } + + static double CompressTwoClusters(int shapeIdx, const RGBAClusterSIMD *clusters, uint8 *outBuf, double estimatedError) { + + uint8 tempBuf1[16]; + BitStream tmpStream1(tempBuf1, 128, 0); + BC7CompressionModeSIMD compressor1(1, estimatedError); + + double bestError = compressor1.Compress(tmpStream1, shapeIdx, clusters); + memcpy(outBuf, tempBuf1, 16); + if(bestError == 0.0) { + return 0.0; + } + + uint8 tempBuf3[16]; + BitStream tmpStream3(tempBuf3, 128, 0); + BC7CompressionModeSIMD compressor3(3, estimatedError); + + double error; + if((error = compressor3.Compress(tmpStream3, shapeIdx, clusters)) < bestError) { + bestError = error; + memcpy(outBuf, tempBuf3, 16); + if(bestError == 0.0) { + return 0.0; + } + } + + // Mode 3 offers more precision for RGB data. Mode 7 is really only if we have alpha. + //uint8 tempBuf7[16]; + //BitStream tmpStream7(tempBuf7, 128, 0); + //BC7CompressionModeSIMD compressor7(7, estimatedError); + //if((error = compressor7.Compress(tmpStream7, shapeIdx, clusters)) < bestError) { + // memcpy(outBuf, tempBuf7, 16); + // return error; + //} + + return bestError; + } + + static double CompressThreeClusters(int shapeIdx, const RGBAClusterSIMD *clusters, uint8 *outBuf, double estimatedError) { + + uint8 tempBuf0[16]; + BitStream tmpStream0(tempBuf0, 128, 0); + + uint8 tempBuf2[16]; + BitStream tmpStream2(tempBuf2, 128, 0); + + BC7CompressionModeSIMD compressor0(0, estimatedError); + BC7CompressionModeSIMD compressor2(2, estimatedError); + + double error, bestError = (shapeIdx < 16)? 
compressor0.Compress(tmpStream0, shapeIdx, clusters) : DBL_MAX; + memcpy(outBuf, tempBuf0, 16); + if(bestError == 0.0) { + return 0.0; + } + + if((error = compressor2.Compress(tmpStream2, shapeIdx, clusters)) < bestError) { + memcpy(outBuf, tempBuf2, 16); + return error; + } + + return bestError; + } + + static void PopulateTwoClustersForShape(const RGBAClusterSIMD &points, int shapeIdx, RGBAClusterSIMD *clusters) { + const uint16 shape = kShapeMask2[shapeIdx]; + for(int pt = 0; pt < kMaxNumDataPoints; pt++) { + + const RGBAVectorSIMD &p = points.GetPoint(pt); + + if((1 << pt) & shape) + clusters[1].AddPoint(p, pt); + else + clusters[0].AddPoint(p, pt); + } + + assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString())); + assert((clusters[0].GetPointBitString() ^ clusters[1].GetPointBitString()) == 0xFFFF); + assert((shape & clusters[1].GetPointBitString()) == shape); + } + + static void PopulateThreeClustersForShape(const RGBAClusterSIMD &points, int shapeIdx, RGBAClusterSIMD *clusters) { + for(int pt = 0; pt < kMaxNumDataPoints; pt++) { + + const RGBAVectorSIMD &p = points.GetPoint(pt); + + if((1 << pt) & kShapeMask3[shapeIdx][0]) { + if((1 << pt) & kShapeMask3[shapeIdx][1]) + clusters[2].AddPoint(p, pt); + else + clusters[1].AddPoint(p, pt); + } + else + clusters[0].AddPoint(p, pt); + } + + assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString())); + assert(!(clusters[2].GetPointBitString() & clusters[1].GetPointBitString())); + assert(!(clusters[0].GetPointBitString() & clusters[2].GetPointBitString())); + } + + static double EstimateTwoClusterError(RGBAClusterSIMD &c) { + RGBAVectorSIMD Min, Max, v; + c.GetBoundingBox(Min, Max); + v = Max - Min; + if(v * v == 0) { + return 0.0; + } + + return 0.0001 + c.QuantizedError(Min, Max, 8, _mm_set1_epi32(0xFF)); + } + + static double EstimateThreeClusterError(RGBAClusterSIMD &c) { + RGBAVectorSIMD Min, Max, v; + c.GetBoundingBox(Min, Max); + v = Max - Min; + if(v * v == 0) { + 
return 0.0; + } + + return 0.0001 + c.QuantizedError(Min, Max, 4, _mm_set1_epi32(0xFF)); + } + + // Compress a single block. + void CompressBC7Block(const uint32 *block, uint8 *outBuf) { + + // All a single color? + if(AllOneColor(block)) { + BitStream bStrm(outBuf, 128, 0); + CompressOptimalColorBC7(*((const uint32 *)block), bStrm); + return; + } + + RGBAClusterSIMD blockCluster; + bool opaque = true; + bool transparent = true; + + for(int i = 0; i < kMaxNumDataPoints; i++) { + RGBAVectorSIMD p = RGBAVectorSIMD(block[i]); + blockCluster.AddPoint(p, i); + if(fabs(p.a - 255.0f) > 1e-10) + opaque = false; + + if(p.a > 0.0f) + transparent = false; + } + + // The whole block is transparent? + if(transparent) { + BitStream bStrm(outBuf, 128, 0); + WriteTransparentBlock(bStrm); + return; + } + + // First we must figure out which shape to use. To do this, simply + // see which shape has the smallest sum of minimum bounding spheres. + double bestError[2] = { DBL_MAX, DBL_MAX }; + int bestShapeIdx[2] = { -1, -1 }; + RGBAClusterSIMD bestClusters[2][3]; + + for(int i = 0; i < kNumShapes2; i++) + { + RGBAClusterSIMD clusters[2]; + PopulateTwoClustersForShape(blockCluster, i, clusters); + + double err = 0.0; + for(int ci = 0; ci < 2; ci++) { + err += EstimateTwoClusterError(clusters[ci]); + } + + // If it's small, we'll take it! + if(err < 1e-9) { + CompressTwoClusters(i, clusters, outBuf, err); + return; + } + + if(err < bestError[0]) { + bestError[0] = err; + bestShapeIdx[0] = i; + bestClusters[0][0] = clusters[0]; + bestClusters[0][1] = clusters[1]; + } + } + + // There are not 3 subset blocks that support alpha... + if(opaque) { + for(int i = 0; i < kNumShapes3; i++) { + + RGBAClusterSIMD clusters[3]; + PopulateThreeClustersForShape(blockCluster, i, clusters); + + double err = 0.0; + for(int ci = 0; ci < 3; ci++) { + err += EstimateThreeClusterError(clusters[ci]); + } + + // If it's small, we'll take it! 
+ if(err < 1e-9) { + CompressThreeClusters(i, clusters, outBuf, err); + return; + } + + if(err < bestError[1]) { + bestError[1] = err; + bestShapeIdx[1] = i; + bestClusters[1][0] = clusters[0]; + bestClusters[1][1] = clusters[1]; + bestClusters[1][2] = clusters[2]; + } + } + } + + if(opaque) { + + uint8 tempBuf1[16]; + uint8 tempBuf2[16]; + + BitStream tempStream1 (tempBuf1, 128, 0); + BC7CompressionModeSIMD compressor(6, DBL_MAX); + double best = compressor.Compress(tempStream1, 0, &blockCluster); + if(best == 0.0f) { + memcpy(outBuf, tempBuf1, 16); + return; + } + + double error = DBL_MAX; + if((error = CompressTwoClusters(bestShapeIdx[0], bestClusters[0], tempBuf2, bestError[0])) < best) { + best = error; + if(error == 0.0f) { + memcpy(outBuf, tempBuf2, 16); + return; + } + else { + memcpy(tempBuf1, tempBuf2, 16); + } + } + + if(CompressThreeClusters(bestShapeIdx[1], bestClusters[1], tempBuf2, bestError[1]) < best) { + memcpy(outBuf, tempBuf2, 16); + return; + } + + memcpy(outBuf, tempBuf1, 16); + } + else { + assert(!"Don't support alpha yet!"); + } + } +} diff --git a/BPTCEncoder/src/BC7IntTypes.h b/BPTCEncoder/src/BC7IntTypes.h new file mode 100644 index 0000000..3f32c10 --- /dev/null +++ b/BPTCEncoder/src/BC7IntTypes.h @@ -0,0 +1,31 @@ + +// Copyright 2012 (c) Pavel Krajcevski +// BC7IntTypes.h + +// This file contains all of the various platform definitions for fixed width integers +// on various platforms. + +// !FIXME! Still needs to be tested on Windows platforms. 
+ + +#ifdef _MSC_VER +typedef __int16 int16; +typedef __uint16 uint16; +typedef __int32 int32; +typedef __uint32 uint32; +typedef __int8 int8; +typedef __uint8 uint8; + +#else + +#include + +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; + +#endif diff --git a/BPTCEncoder/src/BCLookupTables.h b/BPTCEncoder/src/BCLookupTables.h new file mode 100755 index 0000000..37ec72b --- /dev/null +++ b/BPTCEncoder/src/BCLookupTables.h @@ -0,0 +1,945 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +// Each value from 0 to 255 can be exactly interpolated between two other values +// with 7 bit precision. BC7 Mode 5 gives us this precision, so we can use look-up +// tables to speed up this precision by allowing every value to be 1/3 of the way +// between the two colors specified. 
+/* + UINT nbits = 7; + UINT lastNum = -1; + UINT vals[255]; + UINT valIdx = 0; + for(UINT i = 0; i < 256; i++) { + UINT num = (i >> (8 - nbits)); + num <<= (8-nbits); + num |= i >> nbits; + + if(num != lastNum) { + lastNum = num; + vals[valIdx++] = num; + } + } + + for(UINT i = 0; i < 256; i++) { + + UINT mindist = 0xFFFFFFFF; + UINT minj = 0, mink = 0; + + UINT tableEntry[2] = { 0, 0 }; + + mindist = 0xFFFFFFFF; + minj = 0, mink = 0; + + for(UINT j = 0; j < valIdx; j++) { + for(UINT k = 0; k < valIdx ; k++) { + + UINT combo = (43 * vals[j] + 21 * vals[k] + 32) >> 6; + UINT dist = ((i > combo) ? i - combo : combo - i); + if( dist < mindist ) + { + mindist = dist; + minj = j; + mink = k; + } + } + } + + assert(mindist == 0); + + tableEntry[0] = vals[minj]; + tableEntry[1] = vals[mink]; + + wchar_t tableEntryStr[256]; + swprintf(tableEntryStr, 256, L"{ 0x%02x, 0x%02x },\n", + tableEntry[0] >> (8 - nbits), + tableEntry[1] >> (8 - nbits) + ); + OutputDebugString(tableEntryStr); + } +*/ +static unsigned char Optimal7CompressBC7Mode5[256][2] = { + { 0x00, 0x00 }, + { 0x00, 0x01 }, + { 0x00, 0x03 }, + { 0x00, 0x04 }, + { 0x00, 0x06 }, + { 0x00, 0x07 }, + { 0x00, 0x09 }, + { 0x00, 0x0a }, + { 0x00, 0x0c }, + { 0x00, 0x0d }, + { 0x00, 0x0f }, + { 0x00, 0x10 }, + { 0x00, 0x12 }, + { 0x00, 0x14 }, + { 0x00, 0x15 }, + { 0x00, 0x17 }, + { 0x00, 0x18 }, + { 0x00, 0x1a }, + { 0x00, 0x1b }, + { 0x00, 0x1d }, + { 0x00, 0x1e }, + { 0x00, 0x20 }, + { 0x00, 0x21 }, + { 0x00, 0x23 }, + { 0x00, 0x24 }, + { 0x00, 0x26 }, + { 0x00, 0x27 }, + { 0x00, 0x29 }, + { 0x00, 0x2a }, + { 0x00, 0x2c }, + { 0x00, 0x2d }, + { 0x00, 0x2f }, + { 0x00, 0x30 }, + { 0x00, 0x32 }, + { 0x00, 0x34 }, + { 0x00, 0x35 }, + { 0x00, 0x37 }, + { 0x00, 0x38 }, + { 0x00, 0x3a }, + { 0x00, 0x3b }, + { 0x00, 0x3d }, + { 0x00, 0x3e }, + { 0x00, 0x40 }, + { 0x00, 0x41 }, + { 0x00, 0x42 }, + { 0x00, 0x44 }, + { 0x00, 0x45 }, + { 0x00, 0x47 }, + { 0x00, 0x48 }, + { 0x00, 0x4a }, + { 0x00, 0x4b }, + { 0x00, 0x4d }, + { 
0x00, 0x4e }, + { 0x00, 0x50 }, + { 0x00, 0x52 }, + { 0x00, 0x53 }, + { 0x00, 0x55 }, + { 0x00, 0x56 }, + { 0x00, 0x58 }, + { 0x00, 0x59 }, + { 0x00, 0x5b }, + { 0x00, 0x5c }, + { 0x00, 0x5e }, + { 0x00, 0x5f }, + { 0x00, 0x61 }, + { 0x00, 0x62 }, + { 0x00, 0x64 }, + { 0x00, 0x65 }, + { 0x00, 0x67 }, + { 0x00, 0x68 }, + { 0x00, 0x6a }, + { 0x00, 0x6b }, + { 0x00, 0x6d }, + { 0x00, 0x6e }, + { 0x00, 0x70 }, + { 0x00, 0x72 }, + { 0x00, 0x73 }, + { 0x00, 0x75 }, + { 0x00, 0x76 }, + { 0x00, 0x78 }, + { 0x00, 0x79 }, + { 0x00, 0x7b }, + { 0x00, 0x7c }, + { 0x00, 0x7e }, + { 0x00, 0x7f }, + { 0x01, 0x7f }, + { 0x02, 0x7e }, + { 0x03, 0x7e }, + { 0x03, 0x7f }, + { 0x04, 0x7f }, + { 0x05, 0x7e }, + { 0x06, 0x7e }, + { 0x06, 0x7f }, + { 0x07, 0x7f }, + { 0x08, 0x7e }, + { 0x09, 0x7e }, + { 0x09, 0x7f }, + { 0x0a, 0x7f }, + { 0x0b, 0x7e }, + { 0x0c, 0x7e }, + { 0x0c, 0x7f }, + { 0x0d, 0x7f }, + { 0x0e, 0x7e }, + { 0x0f, 0x7d }, + { 0x0f, 0x7f }, + { 0x10, 0x7e }, + { 0x11, 0x7e }, + { 0x11, 0x7f }, + { 0x12, 0x7f }, + { 0x13, 0x7e }, + { 0x14, 0x7e }, + { 0x14, 0x7f }, + { 0x15, 0x7f }, + { 0x16, 0x7e }, + { 0x17, 0x7e }, + { 0x17, 0x7f }, + { 0x18, 0x7f }, + { 0x19, 0x7e }, + { 0x1a, 0x7e }, + { 0x1a, 0x7f }, + { 0x1b, 0x7f }, + { 0x1c, 0x7e }, + { 0x1d, 0x7e }, + { 0x1d, 0x7f }, + { 0x1e, 0x7f }, + { 0x1f, 0x7e }, + { 0x20, 0x7e }, + { 0x20, 0x7f }, + { 0x21, 0x7f }, + { 0x22, 0x7e }, + { 0x23, 0x7e }, + { 0x23, 0x7f }, + { 0x24, 0x7f }, + { 0x25, 0x7e }, + { 0x26, 0x7e }, + { 0x26, 0x7f }, + { 0x27, 0x7f }, + { 0x28, 0x7e }, + { 0x29, 0x7e }, + { 0x29, 0x7f }, + { 0x2a, 0x7f }, + { 0x2b, 0x7e }, + { 0x2c, 0x7e }, + { 0x2c, 0x7f }, + { 0x2d, 0x7f }, + { 0x2e, 0x7e }, + { 0x2f, 0x7d }, + { 0x2f, 0x7f }, + { 0x30, 0x7e }, + { 0x31, 0x7e }, + { 0x31, 0x7f }, + { 0x32, 0x7f }, + { 0x33, 0x7e }, + { 0x34, 0x7e }, + { 0x34, 0x7f }, + { 0x35, 0x7f }, + { 0x36, 0x7e }, + { 0x37, 0x7e }, + { 0x37, 0x7f }, + { 0x38, 0x7f }, + { 0x39, 0x7e }, + { 0x3a, 0x7e }, + { 0x3a, 0x7f }, + { 
0x3b, 0x7f }, + { 0x3c, 0x7e }, + { 0x3d, 0x7e }, + { 0x3d, 0x7f }, + { 0x3e, 0x7f }, + { 0x3f, 0x7e }, + { 0x40, 0x7d }, + { 0x40, 0x7e }, + { 0x41, 0x7e }, + { 0x41, 0x7f }, + { 0x42, 0x7f }, + { 0x43, 0x7e }, + { 0x44, 0x7e }, + { 0x44, 0x7f }, + { 0x45, 0x7f }, + { 0x46, 0x7e }, + { 0x47, 0x7e }, + { 0x47, 0x7f }, + { 0x48, 0x7f }, + { 0x49, 0x7e }, + { 0x4a, 0x7e }, + { 0x4a, 0x7f }, + { 0x4b, 0x7f }, + { 0x4c, 0x7e }, + { 0x4d, 0x7d }, + { 0x4d, 0x7f }, + { 0x4e, 0x7e }, + { 0x4f, 0x7e }, + { 0x4f, 0x7f }, + { 0x50, 0x7f }, + { 0x51, 0x7e }, + { 0x52, 0x7e }, + { 0x52, 0x7f }, + { 0x53, 0x7f }, + { 0x54, 0x7e }, + { 0x55, 0x7e }, + { 0x55, 0x7f }, + { 0x56, 0x7f }, + { 0x57, 0x7e }, + { 0x58, 0x7e }, + { 0x58, 0x7f }, + { 0x59, 0x7f }, + { 0x5a, 0x7e }, + { 0x5b, 0x7e }, + { 0x5b, 0x7f }, + { 0x5c, 0x7f }, + { 0x5d, 0x7e }, + { 0x5e, 0x7e }, + { 0x5e, 0x7f }, + { 0x5f, 0x7f }, + { 0x60, 0x7e }, + { 0x61, 0x7e }, + { 0x61, 0x7f }, + { 0x62, 0x7f }, + { 0x63, 0x7e }, + { 0x64, 0x7e }, + { 0x64, 0x7f }, + { 0x65, 0x7f }, + { 0x66, 0x7e }, + { 0x67, 0x7e }, + { 0x67, 0x7f }, + { 0x68, 0x7f }, + { 0x69, 0x7e }, + { 0x6a, 0x7e }, + { 0x6a, 0x7f }, + { 0x6b, 0x7f }, + { 0x6c, 0x7e }, + { 0x6d, 0x7d }, + { 0x6d, 0x7f }, + { 0x6e, 0x7e }, + { 0x6f, 0x7e }, + { 0x6f, 0x7f }, + { 0x70, 0x7f }, + { 0x71, 0x7e }, + { 0x72, 0x7e }, + { 0x72, 0x7f }, + { 0x73, 0x7f }, + { 0x74, 0x7e }, + { 0x75, 0x7e }, + { 0x75, 0x7f }, + { 0x76, 0x7f }, + { 0x77, 0x7e }, + { 0x78, 0x7e }, + { 0x78, 0x7f }, + { 0x79, 0x7f }, + { 0x7a, 0x7e }, + { 0x7b, 0x7e }, + { 0x7b, 0x7f }, + { 0x7c, 0x7f }, + { 0x7d, 0x7e }, + { 0x7e, 0x7e }, + { 0x7e, 0x7f }, + { 0x7f, 0x7f } +}; + +// For each value, we give the best possible compression range for that value with 5 bits. +// The first value says whether or not it's +// 1 - the midpoint of two other values, or +// 0 - 1/3 of the way in between two other values. 
+// If the first value is 0 or 1 then the last two values are the range between which the
+// value should be interpolated. If the first value is 0, then it should be interpolated
+// one third of the way from the second to third value...
+//
+// The following tables were generated with the following program:
+/*
+	UINT nbits = 5;
+	UINT lastNum = -1;
+	UINT vals[255];
+	UINT valIdx = 0;
+	for(UINT i = 0; i < 256; i++) {
+		UINT num = (i >> (8 - nbits));
+		num <<= (8-nbits);
+		num |= i >> nbits;
+
+		if(num != lastNum) {
+			lastNum = num;
+			vals[valIdx++] = num;
+		}
+	}
+
+	for(UINT i = 0; i < 256; i++) {
+
+		UINT mindist = 0xFFFFFFFF;
+		UINT minj = 0, mink = 0;
+
+		UINT tableEntry[2][4] = { {1, 0, 0, 0xFFFFFFFF}, {0, 0, 0, 0xFFFFFFFF} };
+
+		for(UINT j = 0; j < valIdx; j++) {
+			for(UINT k = j; k < valIdx ; k++) {
+
+				UINT combo = (vals[j] + vals[k]) / 2;
+				UINT dist = ((i > combo) ? i - combo : combo - i);
+				if( dist < mindist )
+				{
+					mindist = dist;
+					minj = j;
+					mink = k;
+				}
+			}
+		}
+
+		tableEntry[0][1] = vals[minj];
+		tableEntry[0][2] = vals[mink];
+		tableEntry[0][3] = mindist;
+
+		mindist = 0xFFFFFFFF;
+		minj = 0, mink = 0;
+
+		for(UINT j = 0; j < valIdx; j++) {
+			for(UINT k = j; k < valIdx ; k++) {
+
+				UINT combo = (2 * vals[j] + vals[k]) / 3;
+				UINT dist = ((i > combo) ?
i - combo : combo - i); + if( dist < mindist ) + { + mindist = dist; + minj = j; + mink = k; + } + } + } + + tableEntry[1][1] = vals[minj]; + tableEntry[1][2] = vals[mink]; + tableEntry[1][3] = mindist; + + wchar_t tableEntryStr[256]; + if(tableEntry[1][3] > tableEntry[0][3]) { + swprintf(tableEntryStr, 256, L"{ { %d, 0x%02x, 0x%02x }, { %d, 0x%02x, 0x%02x } },\n", + tableEntry[0][0], + tableEntry[0][1] >> (8 - nbits), + tableEntry[0][2] >> (8 - nbits), + tableEntry[1][0], + tableEntry[1][1] >> (8 - nbits), + tableEntry[1][2] >> (8 - nbits) + ); + } + else { + swprintf(tableEntryStr, 256, L"{ { %d, 0x%02x, 0x%02x }, { %d, 0x%02x, 0x%02x } },\n", + tableEntry[1][0], + tableEntry[1][1] >> (8 - nbits), + tableEntry[1][2] >> (8 - nbits), + tableEntry[0][0], + tableEntry[0][1] >> (8 - nbits), + tableEntry[0][2] >> (8 - nbits) + ); + } + OutputDebugString(tableEntryStr); + } +*/ +static unsigned char Optimal5CompressDXT1[256][2][3] = { + { { 0, 0x00, 0x00 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x00 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x01 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x01 }, { 1, 0x00, 0x01 } }, + { { 1, 0x00, 0x01 }, { 0, 0x00, 0x02 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x01, 0x02 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x04 }, { 1, 0x00, 0x03 } }, + { { 1, 0x00, 0x03 }, { 0, 0x00, 0x04 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x02, 0x03 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x07 }, { 1, 0x00, 0x05 } }, + { { 1, 0x00, 0x05 }, { 0, 0x00, 0x07 } }, + { { 0, 0x01, 0x06 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x06 } }, 
+ { { 0, 0x00, 0x09 }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x09 }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0a }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0a }, { 1, 0x00, 0x07 } }, + { { 1, 0x00, 0x07 }, { 0, 0x00, 0x0a } }, + { { 0, 0x02, 0x07 }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0b }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0b }, { 1, 0x01, 0x07 } }, + { { 0, 0x01, 0x0a }, { 1, 0x01, 0x07 } }, + { { 0, 0x00, 0x0c }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0c }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0d }, { 1, 0x02, 0x07 } }, + { { 1, 0x02, 0x07 }, { 0, 0x00, 0x0d } }, + { { 1, 0x00, 0x09 }, { 0, 0x00, 0x0e } }, + { { 0, 0x00, 0x0e }, { 1, 0x00, 0x09 } }, + { { 0, 0x00, 0x0e }, { 1, 0x03, 0x07 } }, + { { 0, 0x02, 0x0b }, { 1, 0x03, 0x07 } }, + { { 0, 0x00, 0x0f }, { 1, 0x00, 0x0a } }, + { { 0, 0x00, 0x0f }, { 1, 0x00, 0x0a } }, + { { 0, 0x01, 0x0e }, { 1, 0x00, 0x0a } }, + { { 0, 0x00, 0x10 }, { 1, 0x00, 0x0b } }, + { { 1, 0x00, 0x0b }, { 0, 0x00, 0x10 } }, + { { 0, 0x00, 0x11 }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x11 }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x02, 0x0f }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x13 }, { 1, 0x00, 0x0d } }, + { { 1, 0x00, 0x0d }, { 0, 0x00, 0x13 } }, + { { 0, 0x01, 0x12 }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x14 }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x14 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x15 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x15 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x16 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x16 }, { 1, 0x00, 0x0f } }, + { { 1, 0x00, 0x0f }, { 0, 0x00, 0x16 } }, + { { 0, 0x02, 0x13 }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x17 }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x17 }, { 1, 0x01, 0x0f } }, + { { 0, 0x01, 0x16 }, { 1, 0x01, 0x0f } }, + { { 0, 0x00, 0x18 }, { 1, 0x00, 0x10 } }, + { { 0, 0x00, 0x18 }, { 1, 0x00, 0x10 } }, + { { 0, 0x00, 0x19 }, { 1, 0x02, 0x0f } }, + { { 1, 0x02, 0x0f 
}, { 0, 0x00, 0x19 } }, + { { 1, 0x00, 0x11 }, { 0, 0x00, 0x1a } }, + { { 0, 0x00, 0x1a }, { 1, 0x00, 0x11 } }, + { { 0, 0x00, 0x1a }, { 1, 0x03, 0x0f } }, + { { 0, 0x02, 0x17 }, { 1, 0x03, 0x0f } }, + { { 0, 0x00, 0x1b }, { 1, 0x00, 0x12 } }, + { { 0, 0x00, 0x1b }, { 1, 0x00, 0x12 } }, + { { 0, 0x01, 0x1a }, { 1, 0x00, 0x12 } }, + { { 0, 0x00, 0x1c }, { 1, 0x00, 0x13 } }, + { { 1, 0x00, 0x13 }, { 0, 0x00, 0x1c } }, + { { 0, 0x00, 0x1d }, { 1, 0x00, 0x13 } }, + { { 0, 0x00, 0x1d }, { 1, 0x00, 0x13 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x02, 0x1b }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1f }, { 1, 0x00, 0x15 } }, + { { 1, 0x00, 0x15 }, { 0, 0x00, 0x1f } }, + { { 0, 0x01, 0x1e }, { 1, 0x00, 0x15 } }, + { { 0, 0x04, 0x18 }, { 1, 0x00, 0x15 } }, + { { 0, 0x01, 0x1f }, { 1, 0x00, 0x16 } }, + { { 0, 0x01, 0x1f }, { 1, 0x00, 0x16 } }, + { { 0, 0x01, 0x1f }, { 1, 0x00, 0x16 } }, + { { 0, 0x02, 0x1e }, { 1, 0x00, 0x16 } }, + { { 0, 0x02, 0x1e }, { 1, 0x00, 0x17 } }, + { { 1, 0x00, 0x17 }, { 0, 0x02, 0x1e } }, + { { 0, 0x02, 0x1f }, { 1, 0x00, 0x17 } }, + { { 0, 0x04, 0x1b }, { 1, 0x00, 0x17 } }, + { { 0, 0x03, 0x1e }, { 1, 0x01, 0x17 } }, + { { 0, 0x03, 0x1e }, { 1, 0x01, 0x17 } }, + { { 0, 0x04, 0x1c }, { 1, 0x00, 0x18 } }, + { { 0, 0x03, 0x1f }, { 1, 0x00, 0x18 } }, + { { 0, 0x03, 0x1f }, { 1, 0x02, 0x17 } }, + { { 1, 0x02, 0x17 }, { 0, 0x03, 0x1f } }, + { { 1, 0x00, 0x19 }, { 0, 0x04, 0x1e } }, + { { 0, 0x04, 0x1e }, { 1, 0x00, 0x19 } }, + { { 0, 0x04, 0x1e }, { 1, 0x03, 0x17 } }, + { { 0, 0x06, 0x1b }, { 1, 0x03, 0x17 } }, + { { 0, 0x04, 0x1f }, { 1, 0x00, 0x1a } }, + { { 0, 0x04, 0x1f }, { 1, 0x00, 0x1a } }, + { { 0, 0x05, 0x1e }, { 1, 0x00, 0x1a } }, + { { 0, 0x08, 0x18 }, { 1, 0x00, 0x1b } }, + { { 1, 0x00, 0x1b }, { 0, 0x05, 0x1f } }, + { { 0, 0x05, 0x1f }, { 1, 0x00, 0x1b } }, + { { 0, 0x05, 0x1f }, { 1, 0x00, 0x1b } }, + { { 0, 0x06, 0x1e }, { 1, 0x00, 0x1c 
} }, + { { 0, 0x06, 0x1e }, { 1, 0x00, 0x1c } }, + { { 0, 0x06, 0x1e }, { 1, 0x00, 0x1c } }, + { { 0, 0x06, 0x1f }, { 1, 0x00, 0x1c } }, + { { 0, 0x08, 0x1b }, { 1, 0x00, 0x1d } }, + { { 1, 0x00, 0x1d }, { 0, 0x07, 0x1e } }, + { { 0, 0x07, 0x1e }, { 1, 0x00, 0x1d } }, + { { 0, 0x08, 0x1c }, { 1, 0x00, 0x1d } }, + { { 0, 0x07, 0x1f }, { 1, 0x00, 0x1e } }, + { { 0, 0x07, 0x1f }, { 1, 0x00, 0x1e } }, + { { 0, 0x07, 0x1f }, { 1, 0x00, 0x1e } }, + { { 0, 0x08, 0x1e }, { 1, 0x00, 0x1e } }, + { { 0, 0x08, 0x1e }, { 1, 0x00, 0x1f } }, + { { 1, 0x00, 0x1f }, { 0, 0x08, 0x1e } }, + { { 0, 0x0a, 0x1b }, { 1, 0x00, 0x1f } }, + { { 0, 0x08, 0x1f }, { 1, 0x00, 0x1f } }, + { { 0, 0x08, 0x1f }, { 1, 0x01, 0x1f } }, + { { 0, 0x09, 0x1e }, { 1, 0x01, 0x1f } }, + { { 0, 0x0c, 0x18 }, { 1, 0x04, 0x1c } }, + { { 0, 0x09, 0x1f }, { 1, 0x04, 0x1c } }, + { { 0, 0x09, 0x1f }, { 1, 0x02, 0x1f } }, + { { 1, 0x02, 0x1f }, { 0, 0x09, 0x1f } }, + { { 1, 0x04, 0x1d }, { 0, 0x0a, 0x1e } }, + { { 0, 0x0a, 0x1e }, { 1, 0x04, 0x1d } }, + { { 0, 0x0a, 0x1e }, { 1, 0x03, 0x1f } }, + { { 0, 0x0a, 0x1f }, { 1, 0x03, 0x1f } }, + { { 0, 0x0c, 0x1b }, { 1, 0x04, 0x1e } }, + { { 0, 0x0b, 0x1e }, { 1, 0x04, 0x1e } }, + { { 0, 0x0b, 0x1e }, { 1, 0x04, 0x1e } }, + { { 0, 0x0c, 0x1c }, { 1, 0x04, 0x1f } }, + { { 1, 0x04, 0x1f }, { 0, 0x0b, 0x1f } }, + { { 0, 0x0b, 0x1f }, { 1, 0x04, 0x1f } }, + { { 0, 0x0b, 0x1f }, { 1, 0x04, 0x1f } }, + { { 0, 0x0c, 0x1e }, { 1, 0x05, 0x1f } }, + { { 0, 0x0c, 0x1e }, { 1, 0x05, 0x1f } }, + { { 0, 0x0c, 0x1e }, { 1, 0x05, 0x1f } }, + { { 0, 0x0e, 0x1b }, { 1, 0x05, 0x1f } }, + { { 0, 0x0c, 0x1f }, { 1, 0x06, 0x1f } }, + { { 1, 0x06, 0x1f }, { 0, 0x0c, 0x1f } }, + { { 0, 0x0d, 0x1e }, { 1, 0x06, 0x1f } }, + { { 0, 0x10, 0x18 }, { 1, 0x06, 0x1f } }, + { { 0, 0x0d, 0x1f }, { 1, 0x07, 0x1f } }, + { { 0, 0x0d, 0x1f }, { 1, 0x07, 0x1f } }, + { { 0, 0x0d, 0x1f }, { 1, 0x07, 0x1f } }, + { { 0, 0x0e, 0x1e }, { 1, 0x07, 0x1f } }, + { { 0, 0x0e, 0x1e }, { 1, 0x08, 0x1f } }, + { { 1, 0x08, 
0x1f }, { 0, 0x0e, 0x1e } }, + { { 0, 0x0e, 0x1f }, { 1, 0x08, 0x1f } }, + { { 0, 0x10, 0x1b }, { 1, 0x08, 0x1f } }, + { { 0, 0x0f, 0x1e }, { 1, 0x09, 0x1f } }, + { { 0, 0x0f, 0x1e }, { 1, 0x09, 0x1f } }, + { { 0, 0x10, 0x1c }, { 1, 0x0c, 0x1c } }, + { { 0, 0x0f, 0x1f }, { 1, 0x0c, 0x1c } }, + { { 0, 0x0f, 0x1f }, { 1, 0x0a, 0x1f } }, + { { 1, 0x0a, 0x1f }, { 0, 0x0f, 0x1f } }, + { { 1, 0x0c, 0x1d }, { 0, 0x10, 0x1e } }, + { { 0, 0x10, 0x1e }, { 1, 0x0c, 0x1d } }, + { { 0, 0x10, 0x1e }, { 1, 0x0b, 0x1f } }, + { { 0, 0x12, 0x1b }, { 1, 0x0b, 0x1f } }, + { { 0, 0x10, 0x1f }, { 1, 0x0c, 0x1e } }, + { { 0, 0x10, 0x1f }, { 1, 0x0c, 0x1e } }, + { { 0, 0x11, 0x1e }, { 1, 0x0c, 0x1e } }, + { { 0, 0x14, 0x18 }, { 1, 0x0c, 0x1f } }, + { { 1, 0x0c, 0x1f }, { 0, 0x11, 0x1f } }, + { { 0, 0x11, 0x1f }, { 1, 0x0c, 0x1f } }, + { { 0, 0x11, 0x1f }, { 1, 0x0c, 0x1f } }, + { { 0, 0x12, 0x1e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x12, 0x1e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x12, 0x1e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x12, 0x1f }, { 1, 0x0d, 0x1f } }, + { { 0, 0x14, 0x1b }, { 1, 0x0e, 0x1f } }, + { { 1, 0x0e, 0x1f }, { 0, 0x13, 0x1e } }, + { { 0, 0x13, 0x1e }, { 1, 0x0e, 0x1f } }, + { { 0, 0x14, 0x1c }, { 1, 0x0e, 0x1f } }, + { { 0, 0x13, 0x1f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x13, 0x1f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x13, 0x1f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x14, 0x1e }, { 1, 0x0f, 0x1f } }, + { { 0, 0x14, 0x1e }, { 1, 0x10, 0x1f } }, + { { 1, 0x10, 0x1f }, { 0, 0x14, 0x1e } }, + { { 0, 0x16, 0x1b }, { 1, 0x10, 0x1f } }, + { { 0, 0x14, 0x1f }, { 1, 0x10, 0x1f } }, + { { 0, 0x14, 0x1f }, { 1, 0x11, 0x1f } }, + { { 0, 0x15, 0x1e }, { 1, 0x11, 0x1f } }, + { { 0, 0x18, 0x18 }, { 1, 0x14, 0x1c } }, + { { 0, 0x15, 0x1f }, { 1, 0x14, 0x1c } }, + { { 0, 0x15, 0x1f }, { 1, 0x12, 0x1f } }, + { { 1, 0x12, 0x1f }, { 0, 0x15, 0x1f } }, + { { 1, 0x14, 0x1d }, { 0, 0x16, 0x1e } }, + { { 0, 0x16, 0x1e }, { 1, 0x14, 0x1d } }, + { { 0, 0x16, 0x1e }, { 1, 0x13, 0x1f } }, + { { 0, 0x16, 0x1f }, { 1, 0x13, 
0x1f } }, + { { 0, 0x18, 0x1b }, { 1, 0x14, 0x1e } }, + { { 0, 0x17, 0x1e }, { 1, 0x14, 0x1e } }, + { { 0, 0x17, 0x1e }, { 1, 0x14, 0x1e } }, + { { 0, 0x18, 0x1c }, { 1, 0x14, 0x1f } }, + { { 1, 0x14, 0x1f }, { 0, 0x17, 0x1f } }, + { { 0, 0x17, 0x1f }, { 1, 0x14, 0x1f } }, + { { 0, 0x17, 0x1f }, { 1, 0x14, 0x1f } }, + { { 0, 0x18, 0x1e }, { 1, 0x15, 0x1f } }, + { { 0, 0x18, 0x1e }, { 1, 0x15, 0x1f } }, + { { 0, 0x18, 0x1e }, { 1, 0x15, 0x1f } }, + { { 0, 0x1a, 0x1b }, { 1, 0x15, 0x1f } }, + { { 0, 0x18, 0x1f }, { 1, 0x16, 0x1f } }, + { { 1, 0x16, 0x1f }, { 0, 0x18, 0x1f } }, + { { 0, 0x19, 0x1e }, { 1, 0x16, 0x1f } }, + { { 0, 0x19, 0x1e }, { 1, 0x16, 0x1f } }, + { { 0, 0x19, 0x1f }, { 1, 0x17, 0x1f } }, + { { 0, 0x19, 0x1f }, { 1, 0x17, 0x1f } }, + { { 0, 0x19, 0x1f }, { 1, 0x17, 0x1f } }, + { { 0, 0x1a, 0x1e }, { 1, 0x17, 0x1f } }, + { { 0, 0x1a, 0x1e }, { 1, 0x18, 0x1f } }, + { { 1, 0x18, 0x1f }, { 0, 0x1a, 0x1e } }, + { { 0, 0x1a, 0x1f }, { 1, 0x18, 0x1f } }, + { { 0, 0x1a, 0x1f }, { 1, 0x18, 0x1f } }, + { { 0, 0x1b, 0x1e }, { 1, 0x19, 0x1f } }, + { { 0, 0x1b, 0x1e }, { 1, 0x19, 0x1f } }, + { { 0, 0x1c, 0x1c }, { 1, 0x1c, 0x1c } }, + { { 0, 0x1b, 0x1f }, { 1, 0x1c, 0x1c } }, + { { 0, 0x1b, 0x1f }, { 1, 0x1a, 0x1f } }, + { { 1, 0x1a, 0x1f }, { 0, 0x1b, 0x1f } }, + { { 1, 0x1c, 0x1d }, { 0, 0x1c, 0x1e } }, + { { 0, 0x1c, 0x1e }, { 1, 0x1c, 0x1d } }, + { { 0, 0x1c, 0x1e }, { 1, 0x1b, 0x1f } }, + { { 1, 0x1b, 0x1f }, { 0, 0x1c, 0x1f } }, + { { 0, 0x1c, 0x1f }, { 1, 0x1c, 0x1e } }, + { { 0, 0x1c, 0x1f }, { 1, 0x1c, 0x1e } }, + { { 0, 0x1d, 0x1e }, { 1, 0x1c, 0x1e } }, + { { 0, 0x1d, 0x1e }, { 1, 0x1c, 0x1f } }, + { { 1, 0x1c, 0x1f }, { 0, 0x1d, 0x1f } }, + { { 0, 0x1d, 0x1f }, { 1, 0x1c, 0x1f } }, + { { 0, 0x1d, 0x1f }, { 1, 0x1c, 0x1f } }, + { { 0, 0x1e, 0x1e }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1e }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1e }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1f }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1f }, { 1, 0x1e, 0x1f } }, + { { 1, 
0x1e, 0x1f }, { 0, 0x1e, 0x1f } }, + { { 1, 0x1e, 0x1f }, { 0, 0x1e, 0x1f } }, + { { 0, 0x1f, 0x1f }, { 1, 0x1e, 0x1f } }, + { { 0, 0x1f, 0x1f }, { 1, 0x1f, 0x1f } }, + { { 0, 0x1f, 0x1f }, { 1, 0x1f, 0x1f } } +}; + +static unsigned char Optimal6CompressDXT1[256][2][3] = { + { { 0, 0x00, 0x00 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x01 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x04 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x07 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x09 }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0a }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0b }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0b }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0c }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0d }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0e }, { 1, 0x00, 0x09 } }, + { { 0, 0x00, 0x0e }, { 1, 0x00, 0x09 } }, + { { 0, 0x00, 0x0f }, { 1, 0x00, 0x0a } }, + { { 0, 0x00, 0x10 }, { 1, 0x00, 0x0a } }, + { { 0, 0x01, 0x0f }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x11 }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x13 }, { 1, 0x00, 0x0c } }, + { { 0, 0x03, 0x0e }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x14 }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x15 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x16 }, { 1, 0x00, 0x0e } }, + { { 0, 0x04, 0x0f }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x17 }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x18 }, { 1, 0x00, 0x10 } }, + { { 0, 0x00, 0x19 }, { 1, 0x00, 0x10 } }, + { { 0, 0x06, 0x0e }, { 1, 0x00, 0x11 } }, + { { 0, 0x00, 0x1a }, { 1, 0x00, 0x11 } }, + { { 0, 0x00, 0x1b }, { 1, 0x00, 0x12 } }, + { { 0, 0x00, 0x1c }, { 1, 0x00, 0x12 } }, + { { 0, 0x07, 0x0f }, { 1, 0x00, 0x13 } }, + { { 0, 
0x00, 0x1d }, { 1, 0x00, 0x13 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1f }, { 1, 0x00, 0x14 } }, + { { 0, 0x09, 0x0e }, { 1, 0x00, 0x15 } }, + { { 0, 0x00, 0x20 }, { 1, 0x00, 0x15 } }, + { { 0, 0x00, 0x21 }, { 1, 0x00, 0x16 } }, + { { 0, 0x02, 0x1e }, { 1, 0x00, 0x16 } }, + { { 0, 0x00, 0x22 }, { 1, 0x00, 0x17 } }, + { { 0, 0x00, 0x23 }, { 1, 0x00, 0x17 } }, + { { 0, 0x00, 0x24 }, { 1, 0x00, 0x18 } }, + { { 0, 0x03, 0x1f }, { 1, 0x00, 0x18 } }, + { { 0, 0x00, 0x25 }, { 1, 0x00, 0x19 } }, + { { 0, 0x00, 0x26 }, { 1, 0x00, 0x19 } }, + { { 0, 0x00, 0x27 }, { 1, 0x00, 0x1a } }, + { { 0, 0x05, 0x1e }, { 1, 0x00, 0x1a } }, + { { 0, 0x00, 0x28 }, { 1, 0x00, 0x1b } }, + { { 0, 0x00, 0x29 }, { 1, 0x00, 0x1b } }, + { { 0, 0x00, 0x2a }, { 1, 0x00, 0x1c } }, + { { 0, 0x06, 0x1f }, { 1, 0x00, 0x1c } }, + { { 0, 0x00, 0x2b }, { 1, 0x00, 0x1d } }, + { { 0, 0x00, 0x2c }, { 1, 0x00, 0x1d } }, + { { 0, 0x00, 0x2d }, { 1, 0x00, 0x1e } }, + { { 0, 0x08, 0x1e }, { 1, 0x00, 0x1e } }, + { { 0, 0x00, 0x2e }, { 1, 0x00, 0x1f } }, + { { 0, 0x00, 0x2f }, { 1, 0x00, 0x1f } }, + { { 0, 0x01, 0x2e }, { 1, 0x01, 0x1f } }, + { { 0, 0x00, 0x30 }, { 1, 0x00, 0x20 } }, + { { 0, 0x00, 0x31 }, { 1, 0x02, 0x1f } }, + { { 0, 0x00, 0x32 }, { 1, 0x00, 0x21 } }, + { { 0, 0x02, 0x2f }, { 1, 0x03, 0x1f } }, + { { 0, 0x00, 0x33 }, { 1, 0x00, 0x22 } }, + { { 0, 0x00, 0x34 }, { 1, 0x04, 0x1f } }, + { { 0, 0x00, 0x35 }, { 1, 0x00, 0x23 } }, + { { 0, 0x04, 0x2e }, { 1, 0x05, 0x1f } }, + { { 0, 0x00, 0x36 }, { 1, 0x00, 0x24 } }, + { { 0, 0x00, 0x37 }, { 1, 0x06, 0x1f } }, + { { 0, 0x00, 0x38 }, { 1, 0x00, 0x25 } }, + { { 0, 0x05, 0x2f }, { 1, 0x07, 0x1f } }, + { { 0, 0x00, 0x39 }, { 1, 0x00, 0x26 } }, + { { 0, 0x00, 0x3a }, { 1, 0x08, 0x1f } }, + { { 0, 0x00, 0x3b }, { 1, 0x00, 0x27 } }, + { { 0, 0x07, 0x2e }, { 1, 0x09, 0x1f } }, + { { 0, 0x00, 0x3c }, { 1, 0x00, 0x28 } }, + { { 0, 0x00, 0x3d }, { 1, 0x0a, 0x1f } }, + { { 0, 0x00, 0x3e }, { 1, 0x00, 0x29 } }, + { { 0, 0x08, 0x2f }, { 1, 
0x0b, 0x1f } }, + { { 0, 0x00, 0x3f }, { 1, 0x00, 0x2a } }, + { { 0, 0x01, 0x3e }, { 1, 0x0c, 0x1f } }, + { { 0, 0x01, 0x3f }, { 1, 0x00, 0x2b } }, + { { 0, 0x0a, 0x2e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x02, 0x3e }, { 1, 0x00, 0x2c } }, + { { 0, 0x02, 0x3f }, { 1, 0x0e, 0x1f } }, + { { 0, 0x03, 0x3e }, { 1, 0x00, 0x2d } }, + { { 0, 0x0b, 0x2f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x03, 0x3f }, { 1, 0x00, 0x2e } }, + { { 0, 0x04, 0x3e }, { 1, 0x00, 0x2e } }, + { { 0, 0x04, 0x3f }, { 1, 0x00, 0x2f } }, + { { 0, 0x0d, 0x2e }, { 1, 0x00, 0x2f } }, + { { 0, 0x05, 0x3e }, { 1, 0x00, 0x30 } }, + { { 0, 0x05, 0x3f }, { 1, 0x00, 0x30 } }, + { { 0, 0x06, 0x3e }, { 1, 0x00, 0x31 } }, + { { 0, 0x0e, 0x2f }, { 1, 0x00, 0x31 } }, + { { 0, 0x06, 0x3f }, { 1, 0x00, 0x32 } }, + { { 0, 0x07, 0x3e }, { 1, 0x00, 0x32 } }, + { { 0, 0x07, 0x3f }, { 1, 0x00, 0x33 } }, + { { 0, 0x10, 0x2d }, { 1, 0x00, 0x33 } }, + { { 0, 0x08, 0x3e }, { 1, 0x00, 0x34 } }, + { { 0, 0x08, 0x3f }, { 1, 0x00, 0x34 } }, + { { 0, 0x09, 0x3e }, { 1, 0x00, 0x35 } }, + { { 0, 0x10, 0x30 }, { 1, 0x00, 0x35 } }, + { { 0, 0x09, 0x3f }, { 1, 0x00, 0x36 } }, + { { 0, 0x0a, 0x3e }, { 1, 0x00, 0x36 } }, + { { 0, 0x0a, 0x3f }, { 1, 0x00, 0x37 } }, + { { 0, 0x10, 0x33 }, { 1, 0x00, 0x37 } }, + { { 0, 0x0b, 0x3e }, { 1, 0x00, 0x38 } }, + { { 0, 0x0b, 0x3f }, { 1, 0x00, 0x38 } }, + { { 0, 0x0c, 0x3e }, { 1, 0x00, 0x39 } }, + { { 0, 0x10, 0x36 }, { 1, 0x00, 0x39 } }, + { { 0, 0x0c, 0x3f }, { 1, 0x00, 0x3a } }, + { { 0, 0x0d, 0x3e }, { 1, 0x00, 0x3a } }, + { { 0, 0x0d, 0x3f }, { 1, 0x00, 0x3b } }, + { { 0, 0x10, 0x39 }, { 1, 0x00, 0x3b } }, + { { 0, 0x0e, 0x3e }, { 1, 0x00, 0x3c } }, + { { 0, 0x0e, 0x3f }, { 1, 0x00, 0x3c } }, + { { 0, 0x0f, 0x3e }, { 1, 0x00, 0x3d } }, + { { 0, 0x10, 0x3c }, { 1, 0x00, 0x3d } }, + { { 0, 0x0f, 0x3f }, { 1, 0x00, 0x3e } }, + { { 0, 0x18, 0x2e }, { 1, 0x00, 0x3e } }, + { { 0, 0x10, 0x3e }, { 1, 0x00, 0x3f } }, + { { 0, 0x10, 0x3f }, { 1, 0x00, 0x3f } }, + { { 0, 0x11, 0x3e }, { 1, 0x01, 0x3f } }, + { 
{ 0, 0x19, 0x2f }, { 1, 0x10, 0x30 } }, + { { 0, 0x11, 0x3f }, { 1, 0x02, 0x3f } }, + { { 0, 0x12, 0x3e }, { 1, 0x10, 0x31 } }, + { { 0, 0x12, 0x3f }, { 1, 0x03, 0x3f } }, + { { 0, 0x1b, 0x2e }, { 1, 0x10, 0x32 } }, + { { 0, 0x13, 0x3e }, { 1, 0x04, 0x3f } }, + { { 0, 0x13, 0x3f }, { 1, 0x10, 0x33 } }, + { { 0, 0x14, 0x3e }, { 1, 0x05, 0x3f } }, + { { 0, 0x1c, 0x2f }, { 1, 0x10, 0x34 } }, + { { 0, 0x14, 0x3f }, { 1, 0x06, 0x3f } }, + { { 0, 0x15, 0x3e }, { 1, 0x10, 0x35 } }, + { { 0, 0x15, 0x3f }, { 1, 0x07, 0x3f } }, + { { 0, 0x1e, 0x2e }, { 1, 0x10, 0x36 } }, + { { 0, 0x16, 0x3e }, { 1, 0x08, 0x3f } }, + { { 0, 0x16, 0x3f }, { 1, 0x10, 0x37 } }, + { { 0, 0x17, 0x3e }, { 1, 0x09, 0x3f } }, + { { 0, 0x1f, 0x2f }, { 1, 0x10, 0x38 } }, + { { 0, 0x17, 0x3f }, { 1, 0x0a, 0x3f } }, + { { 0, 0x18, 0x3e }, { 1, 0x10, 0x39 } }, + { { 0, 0x18, 0x3f }, { 1, 0x0b, 0x3f } }, + { { 0, 0x20, 0x2f }, { 1, 0x10, 0x3a } }, + { { 0, 0x19, 0x3e }, { 1, 0x0c, 0x3f } }, + { { 0, 0x19, 0x3f }, { 1, 0x10, 0x3b } }, + { { 0, 0x1a, 0x3e }, { 1, 0x0d, 0x3f } }, + { { 0, 0x20, 0x32 }, { 1, 0x10, 0x3c } }, + { { 0, 0x1a, 0x3f }, { 1, 0x0e, 0x3f } }, + { { 0, 0x1b, 0x3e }, { 1, 0x10, 0x3d } }, + { { 0, 0x1b, 0x3f }, { 1, 0x0f, 0x3f } }, + { { 0, 0x20, 0x35 }, { 1, 0x10, 0x3e } }, + { { 0, 0x1c, 0x3e }, { 1, 0x10, 0x3e } }, + { { 0, 0x1c, 0x3f }, { 1, 0x10, 0x3f } }, + { { 0, 0x1d, 0x3e }, { 1, 0x10, 0x3f } }, + { { 0, 0x20, 0x38 }, { 1, 0x11, 0x3f } }, + { { 0, 0x1d, 0x3f }, { 1, 0x11, 0x3f } }, + { { 0, 0x1e, 0x3e }, { 1, 0x12, 0x3f } }, + { { 0, 0x1e, 0x3f }, { 1, 0x12, 0x3f } }, + { { 0, 0x20, 0x3b }, { 1, 0x13, 0x3f } }, + { { 0, 0x1f, 0x3e }, { 1, 0x13, 0x3f } }, + { { 0, 0x1f, 0x3f }, { 1, 0x14, 0x3f } }, + { { 0, 0x20, 0x3d }, { 1, 0x14, 0x3f } }, + { { 0, 0x20, 0x3e }, { 1, 0x15, 0x3f } }, + { { 0, 0x20, 0x3f }, { 1, 0x15, 0x3f } }, + { { 0, 0x29, 0x2e }, { 1, 0x16, 0x3f } }, + { { 0, 0x21, 0x3e }, { 1, 0x16, 0x3f } }, + { { 0, 0x21, 0x3f }, { 1, 0x17, 0x3f } }, + { { 0, 0x22, 0x3e }, 
{ 1, 0x17, 0x3f } }, + { { 0, 0x2a, 0x2f }, { 1, 0x18, 0x3f } }, + { { 0, 0x22, 0x3f }, { 1, 0x18, 0x3f } }, + { { 0, 0x23, 0x3e }, { 1, 0x19, 0x3f } }, + { { 0, 0x23, 0x3f }, { 1, 0x19, 0x3f } }, + { { 0, 0x2c, 0x2e }, { 1, 0x1a, 0x3f } }, + { { 0, 0x24, 0x3e }, { 1, 0x1a, 0x3f } }, + { { 0, 0x24, 0x3f }, { 1, 0x1b, 0x3f } }, + { { 0, 0x25, 0x3e }, { 1, 0x1b, 0x3f } }, + { { 0, 0x2d, 0x2f }, { 1, 0x1c, 0x3f } }, + { { 0, 0x25, 0x3f }, { 1, 0x1c, 0x3f } }, + { { 0, 0x26, 0x3e }, { 1, 0x1d, 0x3f } }, + { { 0, 0x26, 0x3f }, { 1, 0x1d, 0x3f } }, + { { 1, 0x1e, 0x3f }, { 0, 0x26, 0x3f } }, + { { 0, 0x27, 0x3e }, { 1, 0x1e, 0x3f } }, + { { 0, 0x27, 0x3f }, { 1, 0x1f, 0x3f } }, + { { 0, 0x28, 0x3e }, { 1, 0x1f, 0x3f } }, + { { 1, 0x20, 0x3f }, { 0, 0x28, 0x3e } }, + { { 0, 0x28, 0x3f }, { 1, 0x20, 0x3f } }, + { { 0, 0x29, 0x3e }, { 1, 0x21, 0x3f } }, + { { 0, 0x29, 0x3f }, { 1, 0x30, 0x30 } }, + { { 0, 0x30, 0x31 }, { 1, 0x22, 0x3f } }, + { { 0, 0x2a, 0x3e }, { 1, 0x30, 0x31 } }, + { { 0, 0x2a, 0x3f }, { 1, 0x23, 0x3f } }, + { { 0, 0x2b, 0x3e }, { 1, 0x30, 0x32 } }, + { { 0, 0x30, 0x34 }, { 1, 0x24, 0x3f } }, + { { 0, 0x2b, 0x3f }, { 1, 0x30, 0x33 } }, + { { 0, 0x2c, 0x3e }, { 1, 0x25, 0x3f } }, + { { 0, 0x2c, 0x3f }, { 1, 0x30, 0x34 } }, + { { 0, 0x30, 0x37 }, { 1, 0x26, 0x3f } }, + { { 0, 0x2d, 0x3e }, { 1, 0x30, 0x35 } }, + { { 0, 0x2d, 0x3f }, { 1, 0x27, 0x3f } }, + { { 0, 0x2e, 0x3e }, { 1, 0x30, 0x36 } }, + { { 0, 0x30, 0x3a }, { 1, 0x28, 0x3f } }, + { { 0, 0x2e, 0x3f }, { 1, 0x30, 0x37 } }, + { { 0, 0x2f, 0x3e }, { 1, 0x29, 0x3f } }, + { { 0, 0x2f, 0x3f }, { 1, 0x30, 0x38 } }, + { { 0, 0x30, 0x3d }, { 1, 0x2a, 0x3f } }, + { { 0, 0x30, 0x3e }, { 1, 0x30, 0x39 } }, + { { 1, 0x2b, 0x3f }, { 0, 0x30, 0x3e } }, + { { 0, 0x30, 0x3f }, { 1, 0x30, 0x3a } }, + { { 0, 0x31, 0x3e }, { 1, 0x2c, 0x3f } }, + { { 0, 0x31, 0x3f }, { 1, 0x30, 0x3b } }, + { { 1, 0x2d, 0x3f }, { 0, 0x31, 0x3f } }, + { { 0, 0x32, 0x3e }, { 1, 0x30, 0x3c } }, + { { 0, 0x32, 0x3f }, { 1, 0x2e, 0x3f } 
}, + { { 0, 0x33, 0x3e }, { 1, 0x30, 0x3d } }, + { { 1, 0x2f, 0x3f }, { 0, 0x33, 0x3e } }, + { { 0, 0x33, 0x3f }, { 1, 0x30, 0x3e } }, + { { 0, 0x34, 0x3e }, { 1, 0x30, 0x3e } }, + { { 0, 0x34, 0x3f }, { 1, 0x30, 0x3f } }, + { { 0, 0x34, 0x3f }, { 1, 0x30, 0x3f } }, + { { 0, 0x35, 0x3e }, { 1, 0x31, 0x3f } }, + { { 0, 0x35, 0x3f }, { 1, 0x31, 0x3f } }, + { { 0, 0x36, 0x3e }, { 1, 0x32, 0x3f } }, + { { 0, 0x36, 0x3e }, { 1, 0x32, 0x3f } }, + { { 0, 0x36, 0x3f }, { 1, 0x33, 0x3f } }, + { { 0, 0x37, 0x3e }, { 1, 0x33, 0x3f } }, + { { 0, 0x37, 0x3f }, { 1, 0x34, 0x3f } }, + { { 0, 0x37, 0x3f }, { 1, 0x34, 0x3f } }, + { { 0, 0x38, 0x3e }, { 1, 0x35, 0x3f } }, + { { 0, 0x38, 0x3f }, { 1, 0x35, 0x3f } }, + { { 0, 0x39, 0x3e }, { 1, 0x36, 0x3f } }, + { { 0, 0x39, 0x3e }, { 1, 0x36, 0x3f } }, + { { 0, 0x39, 0x3f }, { 1, 0x37, 0x3f } }, + { { 0, 0x3a, 0x3e }, { 1, 0x37, 0x3f } }, + { { 0, 0x3a, 0x3f }, { 1, 0x38, 0x3f } }, + { { 0, 0x3a, 0x3f }, { 1, 0x38, 0x3f } }, + { { 0, 0x3b, 0x3e }, { 1, 0x39, 0x3f } }, + { { 0, 0x3b, 0x3f }, { 1, 0x39, 0x3f } }, + { { 0, 0x3c, 0x3e }, { 1, 0x3a, 0x3f } }, + { { 0, 0x3c, 0x3e }, { 1, 0x3a, 0x3f } }, + { { 0, 0x3c, 0x3f }, { 1, 0x3b, 0x3f } }, + { { 0, 0x3d, 0x3e }, { 1, 0x3b, 0x3f } }, + { { 0, 0x3d, 0x3f }, { 1, 0x3c, 0x3f } }, + { { 0, 0x3d, 0x3f }, { 1, 0x3c, 0x3f } }, + { { 0, 0x3e, 0x3e }, { 1, 0x3d, 0x3f } }, + { { 0, 0x3e, 0x3f }, { 1, 0x3d, 0x3f } }, + { { 1, 0x3e, 0x3f }, { 0, 0x3e, 0x3f } }, + { { 0, 0x3f, 0x3f }, { 1, 0x3e, 0x3f } }, + { { 0, 0x3f, 0x3f }, { 1, 0x3f, 0x3f } } +}; \ No newline at end of file diff --git a/BPTCEncoder/src/BitStream.h b/BPTCEncoder/src/BitStream.h new file mode 100755 index 0000000..d3975ae --- /dev/null +++ b/BPTCEncoder/src/BitStream.h @@ -0,0 +1,115 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this 
+// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __BITSTREAM_H__ +#define __BITSTREAM_H__ + +class BitStream { +public: + BitStream(unsigned char *ptr, int nBits, int start_offset) : + m_BitsWritten(0), + m_NumBits(nBits), + m_NumBytes((nBits + start_offset + 7) >> 3), + m_CurByte(ptr), + m_NextBit(start_offset % 8), + done(false) + { } + + int GetBitsWritten() const { return m_BitsWritten; } + + ~BitStream() { } + void WriteBitsR(unsigned int val, unsigned int nBits) { + for(unsigned int i = 0; i < nBits; i++) { + WriteBit((val >> (nBits - i - 1)) & 1); + } + } + + void WriteBits(unsigned int val, unsigned int nBits) { + for(unsigned int i = 0; i < nBits; i++) { + WriteBit((val >> i) & 1); + } + } + +private: + void WriteBit(int b) { + + if(done) return; + + const unsigned int mask = 1 << m_NextBit++; + + // clear the bit + *m_CurByte &= ~mask; + + // Write the bit, if necessary + if(b) *m_CurByte |= mask; + + // Next byte? 
+ if(m_NextBit >= 8) { + m_CurByte += 1; + m_NextBit = 0; + } + + done = done || ++m_BitsWritten >= m_NumBits; + } + + int m_BitsWritten; + int m_NextBit; + const int m_NumBytes; + const int m_NumBits; + unsigned char *m_CurByte; + + bool done; +}; + +class BitStreamReadOnly { +public: + BitStreamReadOnly(const unsigned char *ptr) : + m_BitsRead(0), + m_CurByte(ptr), + m_NextBit(0) + { } + + int GetBitsRead() const { return m_BitsRead; } + + ~BitStreamReadOnly() { } + + int ReadBit() { + + int bit = *m_CurByte >> m_NextBit++; + while(m_NextBit >= 8) { + m_NextBit -= 8; + m_CurByte++; + } + + m_BitsRead++; + return bit & 1; + } + + unsigned int ReadBits(unsigned int nBits) { + unsigned int ret = 0; + for(unsigned int i = 0; i < nBits; i++) { + ret |= (ReadBit() & 1) << i; + } + return ret; + } + +private: + int m_BitsRead; + int m_NextBit; + const unsigned char *m_CurByte; +}; +#endif //__BITSTREAM_H__ \ No newline at end of file diff --git a/BPTCEncoder/src/RGBAEndpoints.cpp b/BPTCEncoder/src/RGBAEndpoints.cpp new file mode 100755 index 0000000..0b7c541 --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpoints.cpp @@ -0,0 +1,509 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "BC7IntTypes.h" +#include "RGBAEndpoints.h" +#include "BC7Compressor.h" +#include "BC7CompressionMode.h" + +#include +#include +#include +#include + +#ifndef min +template +static T min(const T &a, const T &b) { + return (a > b)? b : a; +} +#endif + +#ifndef max +template +static T max(const T &a, const T &b) { + return (a > b)? a : b; +} +#endif + +static const double kPi = 3.141592653589793238462643383279502884197; +static const float kFloatConversion[256] = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, + 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, + 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, + 48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f, + 64.0f, 65.0f, 66.0f, 67.0f, 68.0f, 69.0f, 70.0f, 71.0f, 72.0f, 73.0f, 74.0f, 75.0f, 76.0f, 77.0f, 78.0f, 79.0f, + 80.0f, 81.0f, 82.0f, 83.0f, 84.0f, 85.0f, 86.0f, 87.0f, 88.0f, 89.0f, 90.0f, 91.0f, 92.0f, 93.0f, 94.0f, 95.0f, + 96.0f, 97.0f, 98.0f, 99.0f, 100.0f, 101.0f, 102.0f, 103.0f, 104.0f, 105.0f, 106.0f, 107.0f, 108.0f, 109.0f, 110.0f, 111.0f, + 112.0f, 113.0f, 114.0f, 115.0f, 116.0f, 117.0f, 118.0f, 119.0f, 120.0f, 121.0f, 122.0f, 123.0f, 124.0f, 125.0f, 126.0f, 127.0f, + 128.0f, 129.0f, 130.0f, 131.0f, 132.0f, 133.0f, 134.0f, 135.0f, 136.0f, 137.0f, 138.0f, 139.0f, 140.0f, 141.0f, 142.0f, 143.0f, + 144.0f, 145.0f, 146.0f, 147.0f, 148.0f, 149.0f, 150.0f, 151.0f, 152.0f, 153.0f, 154.0f, 155.0f, 156.0f, 157.0f, 158.0f, 159.0f, + 160.0f, 161.0f, 162.0f, 163.0f, 164.0f, 165.0f, 166.0f, 167.0f, 168.0f, 169.0f, 170.0f, 171.0f, 
172.0f, 173.0f, 174.0f, 175.0f, + 176.0f, 177.0f, 178.0f, 179.0f, 180.0f, 181.0f, 182.0f, 183.0f, 184.0f, 185.0f, 186.0f, 187.0f, 188.0f, 189.0f, 190.0f, 191.0f, + 192.0f, 193.0f, 194.0f, 195.0f, 196.0f, 197.0f, 198.0f, 199.0f, 200.0f, 201.0f, 202.0f, 203.0f, 204.0f, 205.0f, 206.0f, 207.0f, + 208.0f, 209.0f, 210.0f, 211.0f, 212.0f, 213.0f, 214.0f, 215.0f, 216.0f, 217.0f, 218.0f, 219.0f, 220.0f, 221.0f, 222.0f, 223.0f, + 224.0f, 225.0f, 226.0f, 227.0f, 228.0f, 229.0f, 230.0f, 231.0f, 232.0f, 233.0f, 234.0f, 235.0f, 236.0f, 237.0f, 238.0f, 239.0f, + 240.0f, 241.0f, 242.0f, 243.0f, 244.0f, 245.0f, 246.0f, 247.0f, 248.0f, 249.0f, 250.0f, 251.0f, 252.0f, 253.0f, 254.0f, 255.0f +}; + +/////////////////////////////////////////////////////////////////////////////// +// +// Static helper functions +// +/////////////////////////////////////////////////////////////////////////////// +static inline uint32 CountBitsInMask(uint8 n) { + +#if _WIN64 + if(!n) return 0; // no bits set + if(!(n & (n-1))) return 1; // power of two + + uint32 c; + for(c = 0; n; c++) { + n &= n - 1; + } + return c; +#else + + __asm + { + mov eax, 8 + movzx ecx, n + bsf ecx, ecx + sub eax, ecx + } + +#endif +} + +template +static inline void clamp(ty &x, const ty &min, const ty &max) { + x = (x < min)? min : ((x > max)? max : x); +} + +// absolute distance. It turns out the compiler does a much +// better job of optimizing this than we can, since we can't +// translate the values to/from registers +static uint8 sad(uint8 a, uint8 b) { +#if 0 + __asm + { + movzx eax, a + movzx ecx, b + sub eax, ecx + jns done + neg eax +done: + } +#else + //const INT d = a - b; + //const INT mask = d >> 31; + //return (d ^ mask) - mask; + + // return abs(a - b); + + return (a > b)? 
a - b : b - a; + +#endif +} + +/////////////////////////////////////////////////////////////////////////////// +// +// RGBAVector implementation +// +/////////////////////////////////////////////////////////////////////////////// + +uint8 QuantizeChannel(const uint8 val, const uint8 mask, const int pBit) { + + // If the mask is all the bits, then we can just return the value. + if(mask == 0xFF) { + return val; + } + + uint32 prec = CountBitsInMask(mask); + const uint32 step = 1 << (8 - prec); + + assert(step-1 == uint8(~mask)); + + uint32 lval = val & mask; + uint32 hval = lval + step; + + if(pBit >= 0) { + prec++; + lval |= !!(pBit) << (8 - prec); + hval |= !!(pBit) << (8 - prec); + } + + if(lval > val) { + lval -= step; + hval -= step; + } + + lval |= lval >> prec; + hval |= hval >> prec; + + if(sad(val, lval) < sad(val, hval)) + return lval; + else + return hval; +} + +uint32 RGBAVector::ToPixel(const uint32 channelMask, const int pBit) const { + uint32 ret = 0; + uint8 *pRet = (uint8 *)&ret; + + const uint8 *channelMaskBytes = (const uint8 *)&channelMask; + + pRet[0] = QuantizeChannel(uint32(r + 0.5) & 0xFF, channelMaskBytes[0], pBit); + pRet[1] = QuantizeChannel(uint32(g + 0.5) & 0xFF, channelMaskBytes[1], pBit); + pRet[2] = QuantizeChannel(uint32(b + 0.5) & 0xFF, channelMaskBytes[2], pBit); + pRet[3] = QuantizeChannel(uint32(a + 0.5) & 0xFF, channelMaskBytes[3], pBit); + + return ret; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// RGBAMatrix implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBAMatrix &RGBAMatrix::operator *=(const RGBAMatrix &mat) { + *this = ((*this) * mat); + return (*this); +} + +RGBAMatrix RGBAMatrix::operator *(const RGBAMatrix &mat) const { + + RGBAMatrix result; + + for(int i = 0; i < 4; i++) { + for(int j = 0; j < 4; j++) { + + result(i, j) = 0.0f; + for(int k = 0; k < 4; k++) { + result(i, j) += m[i*4 + k] * mat.m[k*4 + j]; + } + } + 
} + + return result; +} + +RGBAVector RGBAMatrix::operator *(const RGBAVector &p) const { + return RGBAVector ( + p.x * m1 + p.y * m2 + p.z * m3 + p.w * m4, + p.x * m5 + p.y * m6 + p.z * m7 + p.w * m8, + p.x * m9 + p.y * m10 + p.z * m11 + p.w * m12, + p.x * m13 + p.y * m14 + p.z * m15 + p.w * m16 + ); +} + +RGBAMatrix RGBAMatrix::RotateX(float rad) { + RGBAMatrix result; + result.m6 = result.m11 = cos(rad); + result.m10 = sin(rad); + result.m7 = -result.m10; + return result; +} + +RGBAMatrix RGBAMatrix::RotateY(float rad) { + RGBAMatrix result; + result.m1 = result.m11 = cos(rad); + result.m3 = sin(rad); + result.m9 = -result.m3; + return result; +} + +RGBAMatrix RGBAMatrix::RotateZ(float rad) { + RGBAMatrix result; + result.m1 = result.m6 = cos(rad); + result.m5 = sin(rad); + result.m2 = -result.m5; + return result; +} + +RGBAMatrix RGBAMatrix::Translate(const RGBAVector &t) { + RGBAMatrix result; + result.m4 = t.x; + result.m8 = t.y; + result.m12 = t.z; + result.m16 = t.w; + return result; +} + +bool RGBAMatrix::Identity() { + for(int i = 0; i < 4; i++) { + for(int j = 0; j < 4; j++) { + + if(i == j) { + if(fabs(m[i*4 + j] - 1.0f) > 1e-5) + return false; + } + else { + if(fabs(m[i*4 + j]) > 1e-5) + return false; + } + } + } + + return true; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Cluster implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBACluster::RGBACluster(const RGBACluster &left, const RGBACluster &right) { + *this = left; + for(int i = 0; i < right.m_NumPoints; i++) { + const RGBAVector &p = right.m_DataPoints[i]; + AddPoint(p); + } + + m_PrincipalAxisCached = false; +} + +void RGBACluster::AddPoint(const RGBAVector &p) { + assert(m_NumPoints < kMaxNumDataPoints); + m_Total += p; + m_DataPoints[m_NumPoints++] = p; + m_PointBitString |= 1 << p.GetIdx(); + + for(int i = 0; i < kNumColorChannels; i++) { + m_Min.c[i] = min(p.c[i], m_Min.c[i]); + 
m_Max.c[i] = max(p.c[i], m_Max.c[i]); + } +} + +void RGBACluster::GetPrincipalAxis(RGBADir &axis) { + + if(m_PrincipalAxisCached) { + axis = m_PrincipalAxis; + return; + } + + RGBAVector avg = m_Total / float(m_NumPoints); + ::GetPrincipalAxis(m_NumPoints, m_DataPoints, m_PrincipalAxis); + m_PrincipalAxisCached = true; + + GetPrincipalAxis(axis); +} + +double RGBACluster::QuantizedError(const RGBAVector &p1, const RGBAVector &p2, uint8 nBuckets, uint32 bitMask, const RGBAVector &errorMetricVec, const int pbits[2], int *indices) const { + + // nBuckets should be a power of two. + assert(nBuckets == 3 || !(nBuckets & (nBuckets - 1))); + + const uint8 indexPrec = (nBuckets == 3)? 3 : 8-CountBitsInMask(~(nBuckets - 1)); + + typedef uint32 tInterpPair[2]; + typedef tInterpPair tInterpLevel[16]; + const tInterpLevel *interpVals = (nBuckets == 3)? kBC7InterpolationValues : kBC7InterpolationValues + (indexPrec - 1); + + assert(indexPrec >= 2 && indexPrec <= 4); + + uint32 qp1, qp2; + if(pbits) { + qp1 = p1.ToPixel(bitMask, pbits[0]); + qp2 = p2.ToPixel(bitMask, pbits[1]); + } + else { + qp1 = p1.ToPixel(bitMask); + qp2 = p2.ToPixel(bitMask); + } + + uint8 *pqp1 = (uint8 *)&qp1; + uint8 *pqp2 = (uint8 *)&qp2; + + float totalError = 0.0; + for(int i = 0; i < m_NumPoints; i++) { + + const uint32 pixel = m_DataPoints[i].ToPixel(); + const uint8 *pb = (const uint8 *)(&pixel); + + float minError = FLT_MAX; + int bestBucket = -1; + for(int j = 0; j < nBuckets; j++) { + + uint32 interp0 = (*interpVals)[j][0]; + uint32 interp1 = (*interpVals)[j][1]; + + RGBAVector errorVec (0.0f); + for(int k = 0; k < kNumColorChannels; k++) { + const uint8 ip = (((uint32(pqp1[k]) * interp0) + (uint32(pqp2[k]) * interp1) + 32) >> 6) & 0xFF; + const uint8 dist = sad(pb[k], ip); + errorVec.c[k] = kFloatConversion[dist]; + } + + errorVec *= errorMetricVec; + float error = errorVec * errorVec; + if(error < minError) { + minError = error; + bestBucket = j; + } + + // Conceptually, once the error starts 
growing, it doesn't stop growing (we're moving + // farther away from the reference point along the line). Hence we can early out here. + // However, quanitzation artifacts mean that this is not ALWAYS the case, so we do suffer + // about 0.01 RMS error. + else if(error > minError) { + break; + } + } + + totalError += minError; + + assert(bestBucket >= 0); + if(indices) indices[i] = bestBucket; + } + + return totalError; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Utility function implementation +// +/////////////////////////////////////////////////////////////////////////////// + +void ClampEndpoints(RGBAVector &p1, RGBAVector &p2) { + clamp(p1.r, 0.0f, 255.0f); + clamp(p1.g, 0.0f, 255.0f); + clamp(p1.b, 0.0f, 255.0f); + clamp(p1.a, 0.0f, 255.0f); + + clamp(p2.r, 0.0f, 255.0f); + clamp(p2.g, 0.0f, 255.0f); + clamp(p2.b, 0.0f, 255.0f); + clamp(p2.a, 0.0f, 255.0f); +} + +void GetPrincipalAxis(int nPts, const RGBAVector *pts, RGBADir &axis) { + + assert(nPts > 0); + assert(nPts <= kMaxNumDataPoints); + + RGBAVector avg (0.0f); + for(int i = 0; i < nPts; i++) { + avg += pts[i]; + } + avg /= float(nPts); + + // We use these vectors for calculating the covariance matrix... + RGBAVector toPts[kMaxNumDataPoints]; + RGBAVector toPtsMax(-FLT_MAX); + for(int i = 0; i < nPts; i++) { + toPts[i] = pts[i] - avg; + + for(int j = 0; j < kNumColorChannels; j++) { + toPtsMax.c[j] = max(toPtsMax.c[j], toPts[i].c[j]); + } + } + + // Generate a list of unique points... + RGBAVector upts[kMaxNumDataPoints]; + int uptsIdx = 0; + for(int i = 0; i < nPts; i++) { + + bool hasPt = false; + for(int j = 0; j < uptsIdx; j++) { + if(upts[j] == pts[i]) + hasPt = true; + } + + if(!hasPt) { + upts[uptsIdx++] = pts[i]; + } + } + + assert(uptsIdx > 0); + + if(uptsIdx == 1) { + axis.r = axis.g = axis.b = axis.a = 0.0f; + return; + } + // Collinear? 
+ else { + + RGBADir dir (upts[1] - upts[0]); + bool collinear = true; + for(int i = 2; i < nPts; i++) { + RGBAVector v = (upts[i] - upts[0]); + if(fabs(fabs(v*dir) - v.Length()) > 1e-7) { + collinear = false; + break; + } + } + + if(collinear) { + axis = dir; + return; + } + } + + RGBAMatrix covMatrix; + + // Compute covariance. + for(int i = 0; i < kNumColorChannels; i++) { + for(int j = 0; j <= i; j++) { + + float sum = 0.0; + for(int k = 0; k < nPts; k++) { + sum += toPts[k].c[i] * toPts[k].c[j]; + } + + covMatrix(i, j) = sum / kFloatConversion[kNumColorChannels - 1]; + covMatrix(j, i) = covMatrix(i, j); + } + } + + // !SPEED! Find eigenvectors by using the power method. This is good because the + // matrix is only 4x4, which allows us to use SIMD... + RGBAVector b = toPtsMax; + assert(b.Length() > 0); + b /= b.Length(); + + bool fixed = false; + int infLoopPrevention = 0; + const int kMaxNumIterations = 200; + while(!fixed && ++infLoopPrevention < kMaxNumIterations) { + + RGBAVector newB = covMatrix * b; + + // !HACK! If the principal eigenvector of the covariance matrix + // converges to zero, that means that the points lie equally + // spaced on a sphere in this space. In this (extremely rare) + // situation, just choose a point and use it as the principal + // direction. 
+ const float newBlen = newB.Length(); + if(newBlen < 1e-10) { + axis = toPts[0]; + return; + } + + newB /= newB.Length(); + + if(fabs(1.0f - (b * newB)) < 1e-5) + fixed = true; + + b = newB; + } + + assert(infLoopPrevention < kMaxNumIterations); + axis = b; +} diff --git a/BPTCEncoder/src/RGBAEndpoints.h b/BPTCEncoder/src/RGBAEndpoints.h new file mode 100755 index 0000000..f84700d --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpoints.h @@ -0,0 +1,354 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. 
+// +//-------------------------------------------------------------------------------------- + +#ifndef __RGBA_ENDPOINTS_H__ +#define __RGBA_ENDPOINTS_H__ + +#include "BC7IntTypes.h" +#include +#include +#include + +static const int kNumColorChannels = 4; +static const int kMaxNumDataPoints = 16; + +class RGBAVector { + +public: + union { + struct { float r, g, b, a; }; + struct { float x, y, z, w; }; + float c[4]; + }; + + uint32 GetIdx() const { return idx; } + + RGBAVector() : r(-1.0), g(-1.0), b(-1.0), a(-1.0) { } + RGBAVector(uint32 _idx, uint32 pixel) : + r(float(pixel & 0xFF)), + g(float((pixel >> 8) & 0xFF)), + b(float((pixel >> 16) & 0xFF)), + a(float((pixel >> 24) & 0xFF)), + idx(_idx) + { } + + RGBAVector(float _r, float _g, float _b, float _a) : + r(_r), g(_g), b(_b), a(_a) { } + + explicit RGBAVector(float cc) : r(cc), g(cc), b(cc), a(cc) { } + + RGBAVector &operator =(const RGBAVector &other) { + this->idx = other.idx; + memcpy(c, other.c, sizeof(c)); + return (*this); + } + + RGBAVector operator +(const RGBAVector &p) const { + return RGBAVector(r + p.r, g + p.g, b + p.b, a + p.a); + } + + RGBAVector &operator +=(const RGBAVector &p) { + r += p.r; g += p.g; b += p.b; a += p.a; + return *this; + } + + RGBAVector operator -(const RGBAVector &p) const { + return RGBAVector(r - p.r, g - p.g, b - p.b, a - p.a); + } + + RGBAVector &operator -=(const RGBAVector &p) { + r -= p.r; g -= p.g; b -= p.b; a -= p.a; + return *this; + } + + RGBAVector operator /(const float s) const { + return RGBAVector(r / s, g / s, b / s, a / s); + } + + RGBAVector &operator /=(const float s) { + r /= s; g /= s; b /= s; a /= s; + return *this; + } + + float operator *(const RGBAVector &p) const { + return r * p.r + g * p.g + b * p.b + a * p.a; + } + + float Length() const { + return sqrt((*this) * (*this)); + } + + RGBAVector &operator *=(const RGBAVector &v) { + r *= v.r; g *= v.g; b *= v.b; a *= v.a; + return *this; + } + + RGBAVector operator *(const float s) const { + return 
RGBAVector(r * s, g * s, b * s, a * s); + } + + friend RGBAVector operator *(const float s, const RGBAVector &p) { + return RGBAVector(p.r * s, p.g * s, p.b * s, p.a * s); + } + + RGBAVector &operator *=(const float s) { + r *= s; g *= s; b *= s; a *= s; + return *this; + } + + float &operator [](const int i) { + return c[i]; + } + + friend bool operator ==(const RGBAVector &rhs, const RGBAVector &lhs) { + const RGBAVector d = rhs - lhs; + return fabs(d.r) < 1e-7 && fabs(d.g) < 1e-7 && fabs(d.b) < 1e-7 && fabs(d.a) < 1e-7; + } + + friend bool operator !=(const RGBAVector &rhs, const RGBAVector &lhs) { + return !(rhs == lhs); + } + + operator float *() { + return c; + } + + RGBAVector Cross(const RGBAVector &rhs) { + return RGBAVector( + rhs.y * z - y * rhs.z, + rhs.z * x - z * rhs.x, + rhs.x * y - x * rhs.y, + 1.0f + ); + } + + // Quantize this point. + uint32 ToPixel(const uint32 channelMask = 0xFFFFFFFF, const int pBit = -1) const; + +private: + uint32 idx; +}; + +class RGBAMatrix { +private: + union { + float m[kNumColorChannels*kNumColorChannels]; + struct { + float m1, m2, m3, m4; + float m5, m6, m7, m8; + float m9, m10, m11, m12; + float m13, m14, m15, m16; + }; + }; + + RGBAMatrix(const float *arr) { + memcpy(m, arr, sizeof(m)); + } + +public: + + RGBAMatrix() : + m1(1.0f), m2(0.0f), m3(0.0f), m4(0.0f), + m5(0.0f), m6(1.0f), m7(0.0f), m8(0.0f), + m9(0.0f), m10(0.0f), m11(1.0f), m12(0.0f), + m13(0.0f), m14(0.0f), m15(0.0f), m16(1.0f) + { } + + RGBAMatrix &operator =(const RGBAMatrix &other) { + memcpy(m, other.m, sizeof(m)); + return (*this); + } + + RGBAMatrix operator +(const RGBAMatrix &p) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] + p.m[i]; + return RGBAMatrix(newm); + } + + RGBAMatrix &operator +=(const RGBAMatrix &p) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] += p.m[i]; + return *this; + } + + RGBAMatrix operator -(const RGBAMatrix 
&p) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] - p.m[i]; + return RGBAMatrix(newm); + } + + RGBAMatrix &operator -=(const RGBAMatrix &p) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] -= p.m[i]; + return *this; + } + + RGBAMatrix operator /(const float s) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] / s; + return RGBAMatrix(newm); + } + + RGBAMatrix &operator /=(const float s) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] /= s; + return *this; + } + + RGBAMatrix operator *(const float s) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] * s; + return RGBAMatrix(newm); + } + + RGBAMatrix operator *(const double s) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = float(double(m[i]) * s); + return RGBAMatrix(newm); + } + + friend RGBAMatrix operator *(const float s, const RGBAMatrix &p) { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = p.m[i] * s; + return RGBAMatrix(newm); + } + + friend RGBAMatrix operator *(const double s, const RGBAMatrix &p) { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = float(double(p.m[i]) * s); + return RGBAMatrix(newm); + } + + RGBAMatrix &operator *=(const float s) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] *= s; + return *this; + } + + float &operator ()(const int i, const int j) { + return (*this)[i*4 + j]; + } + + float &operator [](const int i) { + return m[i]; + } + + friend bool operator ==(const RGBAMatrix &rhs, const RGBAMatrix &lhs) { + const RGBAMatrix d = 
rhs - lhs; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) + if(d.m[i] > 1e-10) + return false; + return true; + } + + operator float *() { + return m; + } + + RGBAVector operator *(const RGBAVector &p) const; + RGBAMatrix operator *(const RGBAMatrix &mat) const; + RGBAMatrix &operator *=(const RGBAMatrix &mat); + static RGBAMatrix RotateX(float rad); + static RGBAMatrix RotateY(float rad); + static RGBAMatrix RotateZ(float rad); + static RGBAMatrix Translate(const RGBAVector &t); + bool Identity(); +}; + +class RGBADir : public RGBAVector { +public: + RGBADir() : RGBAVector() { } + RGBADir(const RGBAVector &p) : RGBAVector(p) { + *this /= Length(); + } +}; + +// Makes sure that the values of the endpoints lie between 0 and 1. +extern void ClampEndpoints(RGBAVector &p1, RGBAVector &p2); + +class RGBACluster { +public: + + RGBACluster() : + m_NumPoints(0), m_Total(0), + m_PointBitString(0), + m_Min(FLT_MAX), + m_Max(-FLT_MAX), + m_PrincipalAxisCached(false) + { } + + RGBACluster(const RGBACluster &c) : + m_NumPoints(c.m_NumPoints), + m_Total(c.m_Total), + m_PointBitString(c.m_PointBitString), + m_Min(c.m_Min), + m_Max(c.m_Max), + m_PrincipalAxisCached(false) + { + memcpy(this->m_DataPoints, c.m_DataPoints, m_NumPoints * sizeof(RGBAVector)); + } + + RGBACluster(const RGBACluster &left, const RGBACluster &right); + RGBACluster(const RGBAVector &p) : + m_NumPoints(1), + m_Total(p), + m_PointBitString(0), + m_Min(p), m_Max(p), + m_PrincipalAxisCached(false) + { + m_DataPoints[0] = p; + m_PointBitString |= (1 << p.GetIdx()); + } + + RGBAVector GetTotal() const { return m_Total; } + const RGBAVector &GetPoint(int idx) const { return m_DataPoints[idx]; } + int GetNumPoints() const { return m_NumPoints; } + RGBAVector GetAvg() const { return m_Total / float(m_NumPoints); } + const RGBAVector *GetPoints() const { return m_DataPoints; } + + void AddPoint(const RGBAVector &p); + + void GetBoundingBox(RGBAVector &Min, RGBAVector &Max) const { + Min = m_Min, Max = 
m_Max; + } + + // Returns the error if we were to quantize the colors right now with the given number of buckets and bit mask. + double QuantizedError(const RGBAVector &p1, const RGBAVector &p2, uint8 nBuckets, uint32 bitMask, const RGBAVector &errorMetricVec, const int pbits[2] = NULL, int *indices = NULL) const; + + // Returns the principal axis for this point cluster. + void GetPrincipalAxis(RGBADir &axis); + + bool AllSamePoint() const { return m_Max == m_Min; } + int GetPointBitString() const { return m_PointBitString; } + +private: + + // The number of points in the cluster. + int m_NumPoints; + + RGBAVector m_Total; + + // The points in the cluster. + RGBAVector m_DataPoints[kMaxNumDataPoints]; + + RGBAVector m_Min, m_Max; + int m_PointBitString; + + RGBADir m_PrincipalAxis; + bool m_PrincipalAxisCached; +}; + +extern uint8 QuantizeChannel(const uint8 val, const uint8 mask, const int pBit = -1); +extern void GetPrincipalAxis(int nPts, const RGBAVector *pts, RGBADir &axis); + +#endif //__RGBA_ENDPOINTS_H__ diff --git a/BPTCEncoder/src/RGBAEndpointsSIMD.cpp b/BPTCEncoder/src/RGBAEndpointsSIMD.cpp new file mode 100755 index 0000000..7625bee --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpointsSIMD.cpp @@ -0,0 +1,420 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." 
+// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
+// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
+// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not
+// assume any responsibility for any errors which may appear in this software nor any
+// responsibility to update it.
+//
+//--------------------------------------------------------------------------------------
+
+#include "BC7Config.h"
+#include "RGBAEndpointsSIMD.h"
+#include "BC7Compressor.h"
+#include "BC7CompressionModeSIMD.h"
+
+#include
+#include
+
+#ifndef HAS_SSE_POPCNT
+static inline uint32 popcnt32(uint32 x) {
+ uint32 m1 = 0x55555555;
+ uint32 m2 = 0x33333333;
+ uint32 m3 = 0x0f0f0f0f;
+ x -= (x>>1) & m1;
+ x = (x&m2) + ((x>>2)&m2);
+ x = (x+(x>>4))&m3;
+ x += x>>8;
+ return (x+(x>>16)) & 0x3f;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// RGBAVectorSIMD implementation
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/* Original scalar implementation:
+
+ // If the mask is all the bits, then we can just return the value.
+ if(mask == 0xFF) {
+ return val;
+ }
+
+ uint32 prec = CountBitsInMask(mask);
+ const uint32 step = 1 << (8 - prec);
+
+ assert(step-1 == uint8(~mask));
+
+ uint32 lval = val & mask;
+ uint32 hval = lval + step;
+
+ if(pBit >= 0) {
+ prec++;
+ lval |= !!(pBit) << (8 - prec);
+ hval |= !!(pBit) << (8 - prec);
+ }
+
+ if(lval > val) {
+ lval -= step;
+ hval -= step;
+ }
+
+ lval |= lval >> prec;
+ hval |= hval >> prec;
+
+ if(sad(val, lval) < sad(val, hval))
+ return lval;
+ else
+ return hval;
+*/
+
+// !TODO! AVX2 supports an instruction known as vsllv, which shifts a vector
+// by the values stored in another vector. I.e. 
you can do something like this:
+//
+// __m128i shiftVals = _mm_set_epi32(1, 2, 3, 4);
+// __m128i someVector = _mm_set1_epi32(1) ;
+// __m128i shifted = _mm_sllv_epi32 (someVector, shiftVals);
+//
+// and the result will be the same as _mm_set_epi32(2, 4, 8, 16);
+//
+// This is useful because our color channels may have different precisions
+// when we're quantizing them, such as for BC7 modes 4 and 5. Hence, we would
+// want to do our quantization as accurately as possible, but currently it would
+// be very hard to vectorize.
+
+#ifdef _MSC_VER
+#define ALIGN_SSE __declspec ( align(16) )
+#else
+#define ALIGN_SSE __attribute__((aligned(16)))
+#endif
+
+// Constants. There are two ways to specify them: either by using the _mm_set*
+// intrinsics, or by defining them as aligned arrays. You want to do the former
+// when you use them infrequently, and the latter when you use them multiple times
+// in a short time frame (like in an inner loop)
+static const __m128 kZero = _mm_set1_ps(0.0f);
+static const __m128 kByteMax = _mm_set1_ps(255.0f);
+static const __m128 kHalfVector = _mm_set1_ps(0.5f);
+static const __m128i kOneVector = _mm_set1_epi32(1);
+static const __m128i kZeroVector = _mm_set1_epi32(0);
+static const ALIGN_SSE uint32 kThirtyTwoVector[4] = { 32, 32, 32, 32 };
+static const __m128i kByteValMask = _mm_set_epi32(0xFF, 0xFF, 0xFF, 0xFF);
+
+static inline __m128i sad(const __m128i &a, const __m128i &b) {
+ const __m128i maxab = _mm_max_epu8(a, b);
+ const __m128i minab = _mm_min_epu8(a, b);
+ return _mm_and_si128( kByteValMask, _mm_subs_epu8( maxab, minab ) );
+}
+
+__m128i RGBAVectorSIMD::ToPixel(const __m128i &qmask) const {
+
+ // !SPEED! We should figure out a way to get rid of these scalar operations. 
+#ifdef HAS_SSE_POPCNT
+ const uint32 prec = _mm_popcnt_u32(((uint32 *)(&qmask))[0]);
+#else
+ const uint32 prec = popcnt32(((uint32 *)(&qmask))[0]);
+#endif
+
+ assert(r >= 0.0f && r <= 255.0f);
+ assert(g >= 0.0f && g <= 255.0f);
+ assert(b >= 0.0f && b <= 255.0f);
+ assert(a >= 0.0f && a <= 255.0f);
+ assert(((uint32 *)(&qmask))[3] == 0xFF || ((uint32 *)(&qmask))[3] == ((uint32 *)(&qmask))[0]);
+ assert(((uint32 *)(&qmask))[2] == ((uint32 *)(&qmask))[1] && ((uint32 *)(&qmask))[0] == ((uint32 *)(&qmask))[1]);
+
+ const __m128i val = _mm_cvtps_epi32( _mm_add_ps(kHalfVector, vec) );
+
+ const __m128i step = _mm_slli_epi32( kOneVector, 8 - prec );
+ const __m128i &mask = qmask;
+
+ __m128i lval = _mm_and_si128(val, mask);
+ __m128i hval = _mm_add_epi32(lval, step);
+
+ const __m128i lvalShift = _mm_srli_epi32(lval, prec);
+ const __m128i hvalShift = _mm_srli_epi32(hval, prec);
+
+ lval = _mm_or_si128(lval, lvalShift);
+ hval = _mm_or_si128(hval, hvalShift);
+
+ const __m128i lvald = _mm_sub_epi32( val, lval );
+ const __m128i hvald = _mm_sub_epi32( hval, val );
+
+ const __m128i vd = _mm_cmplt_epi32(lvald, hvald);
+ __m128i ans = _mm_blendv_epi8(hval, lval, vd);
+
+ const __m128i chanExact = _mm_cmpeq_epi32(mask, kByteValMask);
+ ans = _mm_blendv_epi8( ans, val, chanExact );
+ return ans;
+}
+
+__m128i RGBAVectorSIMD::ToPixel(const __m128i &qmask, const int pBit) const {
+
+ // !SPEED! We should figure out a way to get rid of these scalar operations. 
+#ifdef HAS_SSE_POPCNT
+ const uint32 prec = _mm_popcnt_u32(((uint32 *)(&qmask))[0]);
+#else
+ const uint32 prec = popcnt32(((uint32 *)(&qmask))[0]);
+#endif
+
+ assert(r >= 0.0f && r <= 255.0f);
+ assert(g >= 0.0f && g <= 255.0f);
+ assert(b >= 0.0f && b <= 255.0f);
+ assert(a >= 0.0f && a <= 255.0f);
+ assert(((uint32 *)(&qmask))[3] == 0xFF || ((uint32 *)(&qmask))[3] == ((uint32 *)(&qmask))[0]);
+ assert(((uint32 *)(&qmask))[2] == ((uint32 *)(&qmask))[1] && ((uint32 *)(&qmask))[0] == ((uint32 *)(&qmask))[1]);
+
+ const __m128i val = _mm_cvtps_epi32( _mm_add_ps(kHalfVector, vec) );
+ const __m128i pbit = _mm_set1_epi32(!!pBit);
+
+ const __m128i &mask = qmask; // _mm_set_epi32(alphaMask, channelMask, channelMask, channelMask);
+ const __m128i step = _mm_slli_epi32( kOneVector, 8 - prec );
+
+ __m128i lval = _mm_and_si128( val, mask );
+ __m128i hval = _mm_add_epi32( lval, step );
+
+ const __m128i pBitShifted = _mm_slli_epi32(pbit, 7 - prec);
+ lval = _mm_or_si128(lval, pBitShifted );
+ hval = _mm_or_si128(hval, pBitShifted);
+
+ // In the next few lines we make sure that after adding the pbit that val is
+ // still in between lval and hval. If it isn't, then we subtract a
+ // step from both. Now, val should be larger than lval and less than
+ // hval, but certain situations make this not always the case (e.g. val
+ // is 0, precision is 4 bits, and pbit is 1). Hence, we add back the
+ // step if it goes below zero, making it equivalent to hval and so it
+ // doesn't matter which we choose. 
+ { + __m128i cmp = _mm_cmpgt_epi32(lval, val); + cmp = _mm_mullo_epi32(cmp, step); + lval = _mm_add_epi32(lval, cmp); + hval = _mm_add_epi32(hval, cmp); + + cmp = _mm_cmplt_epi32(lval, kZeroVector); + cmp = _mm_mullo_epi32(cmp, step); + lval = _mm_sub_epi32(lval, cmp); + } + + const __m128i lvalShift = _mm_srli_epi32(lval, prec + 1); + const __m128i hvalShift = _mm_srli_epi32(hval, prec + 1); + + lval = _mm_or_si128(lval, lvalShift); + hval = _mm_or_si128(hval, hvalShift); + + const __m128i lvald = _mm_sub_epi32( val, lval ); + const __m128i hvald = _mm_sub_epi32( hval, val ); + + const __m128i vd = _mm_cmplt_epi32(lvald, hvald); + __m128i ans = _mm_blendv_epi8(hval, lval, vd); + + const __m128i chanExact = _mm_cmpeq_epi32(mask, kByteValMask); + ans = _mm_blendv_epi8( ans, val, chanExact ); + return ans; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// RGBAMatrixSIMD implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBAVectorSIMD RGBAMatrixSIMD::operator *(const RGBAVectorSIMD &p) const { + + __m128 xVec = _mm_set1_ps( p.x ); + __m128 yVec = _mm_set1_ps( p.y ); + __m128 zVec = _mm_set1_ps( p.z ); + __m128 wVec = _mm_set1_ps( p.w ); + + __m128 vec1 = _mm_mul_ps( xVec, col[0] ); + __m128 vec2 = _mm_mul_ps( yVec, col[1] ); + __m128 vec3 = _mm_mul_ps( zVec, col[2] ); + __m128 vec4 = _mm_mul_ps( wVec, col[3] ); + + return RGBAVectorSIMD( _mm_add_ps( _mm_add_ps( vec1, vec2 ), _mm_add_ps( vec3, vec4 ) ) ); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Cluster implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBAClusterSIMD::RGBAClusterSIMD(const RGBAClusterSIMD &left, const RGBAClusterSIMD &right) { + + assert(!(left.m_PointBitString & right.m_PointBitString)); + + *this = left; + for(int i = 0; i < right.m_NumPoints; i++) { + + const RGBAVectorSIMD &p = 
right.m_DataPoints[i]; + + assert(m_NumPoints < kMaxNumDataPoints); + m_Total += p; + m_DataPoints[m_NumPoints++] = p; + + m_Min.vec = _mm_min_ps(m_Min.vec, p.vec); + m_Max.vec = _mm_max_ps(m_Max.vec, p.vec); + } + + m_PointBitString = left.m_PointBitString | right.m_PointBitString; + m_PrincipalAxisCached = false; +} + +void RGBAClusterSIMD::AddPoint(const RGBAVectorSIMD &p, int idx) { + assert(m_NumPoints < kMaxNumDataPoints); + m_Total += p; + m_DataPoints[m_NumPoints++] = p; + m_PointBitString |= 1 << idx; + + m_Min.vec = _mm_min_ps(m_Min.vec, p.vec); + m_Max.vec = _mm_max_ps(m_Max.vec, p.vec); +} + +float RGBAClusterSIMD::QuantizedError(const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, const uint8 nBuckets, const __m128i &bitMask, const int pbits[2], __m128i *indices) const { + + // nBuckets should be a power of two. + assert(!(nBuckets & (nBuckets - 1))); + + const uint8 indexPrec = 8-_mm_popcnt_u32(~(nBuckets - 1) & 0xFF); + assert(indexPrec >= 2 && indexPrec <= 4); + + typedef __m128i tInterpPair[2]; + typedef tInterpPair tInterpLevel[16]; + const tInterpLevel *interpVals = kBC7InterpolationValuesSIMD + (indexPrec - 1); + + __m128i qp1, qp2; + if(pbits) { + qp1 = p1.ToPixel(bitMask, pbits[0]); + qp2 = p2.ToPixel(bitMask, pbits[1]); + } + else { + qp1 = p1.ToPixel(bitMask); + qp2 = p2.ToPixel(bitMask); + } + + __m128 errorMetricVec = _mm_load_ps( BC7C::GetErrorMetric() ); + + __m128 totalError = kZero; + for(int i = 0; i < m_NumPoints; i++) { + + const __m128i pixel = m_DataPoints[i].ToPixel( kByteValMask ); + + __m128 minError = _mm_set1_ps(FLT_MAX); + __m128i bestBucket = _mm_set1_epi32(-1); + for(int j = 0; j < nBuckets; j++) { + + const __m128i jVec = _mm_set1_epi32(j); + const __m128i interp0 = (*interpVals)[j][0]; + const __m128i interp1 = (*interpVals)[j][1]; + + const __m128i ip0 = _mm_mullo_epi32( qp1, interp0 ); + const __m128i ip1 = _mm_mullo_epi32( qp2, interp1 ); + const __m128i ip = _mm_add_epi32( *((const __m128i *)kThirtyTwoVector), 
_mm_add_epi32( ip0, ip1 ) );
+ const __m128i dist = sad( _mm_and_si128( _mm_srli_epi32( ip, 6 ), kByteValMask ), pixel );
+ __m128 errorVec = _mm_cvtepi32_ps( dist );
+
+ errorVec = _mm_mul_ps( errorVec, errorMetricVec );
+ errorVec = _mm_mul_ps( errorVec, errorVec );
+ errorVec = _mm_hadd_ps( errorVec, errorVec );
+ errorVec = _mm_hadd_ps( errorVec, errorVec );
+
+ const __m128 cmp = _mm_cmple_ps( errorVec, minError );
+ minError = _mm_blendv_ps( minError, errorVec, cmp );
+ bestBucket = _mm_blendv_epi8( bestBucket, jVec, _mm_castps_si128( cmp ) );
+
+ // Conceptually, once the error starts growing, it doesn't stop growing (we're moving
+ // farther away from the reference point along the line). Hence we can early out here.
+ // However, quantization artifacts mean that this is not ALWAYS the case, so we do suffer
+ // about 0.01 RMS error.
+ if(!((uint8 *)(&cmp))[0])
+ break;
+ }
+
+ totalError = _mm_add_ps(totalError, minError);
+ if(indices) ((uint32 *)indices)[i] = ((uint32 *)(&bestBucket))[0];
+ }
+
+ return ((float *)(&totalError))[0];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Utility function implementation
+//
+///////////////////////////////////////////////////////////////////////////////
+
+void ClampEndpoints(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2) {
+ p1.vec = _mm_min_ps( kByteMax, _mm_max_ps( p1.vec, kZero ) );
+ p2.vec = _mm_min_ps( kByteMax, _mm_max_ps( p2.vec, kZero ) );
+}
+
+void GetPrincipalAxis(const RGBAClusterSIMD &c, RGBADirSIMD &axis) {
+
+ if(c.GetNumPoints() == 2) {
+ axis = c.GetPoint(1) - c.GetPoint(0);
+ return;
+ }
+
+ RGBAVectorSIMD avg = c.GetTotal();
+ avg /= float(c.GetNumPoints());
+
+ // We use these vectors for calculating the covariance matrix... 
+ RGBAVectorSIMD toPts[kMaxNumDataPoints]; + RGBAVectorSIMD toPtsMax(-FLT_MAX); + for(int i = 0; i < c.GetNumPoints(); i++) { + toPts[i] = c.GetPoint(i) - avg; + toPtsMax.vec = _mm_max_ps(toPtsMax.vec, toPts[i].vec); + } + + RGBAMatrixSIMD covMatrix; + + // Compute covariance. + const float fNumPoints = float(c.GetNumPoints()); + for(int i = 0; i < kNumColorChannels; i++) { + for(int j = 0; j <= i; j++) { + + float sum = 0.0; + for(int k = 0; k < c.GetNumPoints(); k++) { + sum += toPts[k].c[i] * toPts[k].c[j]; + } + + covMatrix(i, j) = sum / fNumPoints; + covMatrix(j, i) = covMatrix(i, j); + } + } + + // !SPEED! Find eigenvectors by using the power method. This is good because the + // matrix is only 4x4, which allows us to use SIMD... + RGBAVectorSIMD b = toPtsMax; + assert(b.Length() > 0); + b /= b.Length(); + + RGBAVectorSIMD newB = covMatrix * b; + + // !HACK! If the principal eigenvector of the covariance matrix + // converges to zero, that means that the points lie equally + // spaced on a sphere in this space. In this (extremely rare) + // situation, just choose a point and use it as the principal + // direction. + const float newBlen = newB.Length(); + if(newBlen < 1e-10) { + axis = toPts[0]; + return; + } + + for(int i = 0; i < 8; i++) { + newB = covMatrix * b; + newB.Normalize(); + b = newB; + } + + axis = b; +} diff --git a/BPTCEncoder/src/RGBAEndpointsSIMD.h b/BPTCEncoder/src/RGBAEndpointsSIMD.h new file mode 100755 index 0000000..b93f8a5 --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpointsSIMD.h @@ -0,0 +1,374 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. 
Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __RGBA_SIMD_ENDPOINTS_H__ +#define __RGBA_SIMD_ENDPOINTS_H__ + +#include "BC7IntTypes.h" +#include +#include +#include + +#include + +static const int kNumColorChannels = 4; +static const int kMaxNumDataPoints = 16; +static const __m128 kEpsilonSIMD = _mm_set1_ps(1e-8f); + +class RGBAVectorSIMD { + +public: + union { + struct { float r, g, b, a; }; + struct { float x, y, z, w; }; + float c[4]; + __m128 vec; + }; + + RGBAVectorSIMD() : r(-1.0), g(-1.0), b(-1.0), a(-1.0) { } + RGBAVectorSIMD(uint32 pixel) : + r(float(pixel & 0xFF)), + g(float((pixel >> 8) & 0xFF)), + b(float((pixel >> 16) & 0xFF)), + a(float((pixel >> 24) & 0xFF)) + { } + + explicit RGBAVectorSIMD(float _r, float _g, float _b, float _a) : + r(_r), g(_g), b(_b), a(_a) { } + + explicit RGBAVectorSIMD(float cc) : r(cc), g(cc), b(cc), a(cc) { } + + RGBAVectorSIMD (const __m128 &newVec) : vec(newVec) { } + RGBAVectorSIMD (const RGBAVectorSIMD &other) : vec(other.vec) { } + + RGBAVectorSIMD operator +(const RGBAVectorSIMD &p) const { + return RGBAVectorSIMD( _mm_add_ps(this->vec, p.vec) ); + } + + RGBAVectorSIMD &operator +=(const RGBAVectorSIMD &p) { + this->vec = _mm_add_ps(this->vec, p.vec); + return *this; + } + + RGBAVectorSIMD operator -(const RGBAVectorSIMD &p) const { + return RGBAVectorSIMD( _mm_sub_ps(this->vec, 
p.vec) ); + } + + RGBAVectorSIMD &operator -=(const RGBAVectorSIMD &p) { + this->vec = _mm_sub_ps(this->vec, p.vec); + return *this; + } + + RGBAVectorSIMD operator /(const float s) const { + return RGBAVectorSIMD( _mm_div_ps(this->vec, _mm_set1_ps(s) ) ); + } + + RGBAVectorSIMD &operator /=(const float s) { + this->vec = _mm_div_ps(this->vec, _mm_set1_ps(s) ); + return *this; + } + + float operator *(const RGBAVectorSIMD &p) const { + __m128 mul = _mm_mul_ps(this->vec, p.vec); + mul = _mm_hadd_ps(mul, mul); + mul = _mm_hadd_ps(mul, mul); + return ((float *)(&mul))[0]; + } + + void Normalize() { + __m128 rsqrt = _mm_rsqrt_ps( _mm_set1_ps( (*this) * (*this) ) ); + vec = _mm_mul_ps( vec, rsqrt ); + } + + float Length() const { + return sqrt((*this) * (*this)); + } + + RGBAVectorSIMD &operator *=(const RGBAVectorSIMD &v) { + this->vec = _mm_mul_ps(this->vec, v.vec); + return *this; + } + + RGBAVectorSIMD operator *(const float s) const { + return RGBAVectorSIMD( _mm_mul_ps( this->vec, _mm_set1_ps(s) ) ); + } + + friend RGBAVectorSIMD operator *(const float s, const RGBAVectorSIMD &p) { + return RGBAVectorSIMD( _mm_mul_ps( p.vec, _mm_set1_ps(s) ) ); + } + + RGBAVectorSIMD &operator *=(const float s) { + this->vec = _mm_mul_ps( this->vec, _mm_set1_ps(s) ); + return *this; + } + + float &operator [](const int i) { + return c[i]; + } + + friend bool operator ==(const RGBAVectorSIMD &rhs, const RGBAVectorSIMD &lhs) { + __m128 d = _mm_sub_ps(rhs.vec, lhs.vec); + d = _mm_mul_ps(d, d); + __m128 cmp = _mm_cmpgt_ps(d, kEpsilonSIMD); + cmp = _mm_hadd_ps(cmp, cmp); + cmp = _mm_hadd_ps(cmp, cmp); + return ((float *)(&cmp))[0] == 0.0f; + } + + friend bool operator !=(const RGBAVectorSIMD &rhs, const RGBAVectorSIMD &lhs) { + return !(rhs == lhs); + } + + operator float *() { + return c; + } + + // Quantize this point. 
+ __m128i ToPixel(const __m128i &channelMask, const int pBit) const; + __m128i ToPixel(const __m128i &channelMask) const; +}; + +class RGBAMatrixSIMD { +private: + union { + float m[kNumColorChannels*kNumColorChannels]; + struct { + float m1, m5, m9, m13; + float m2, m6, m10, m14; + float m3, m7, m11, m15; + float m4, m8, m12, m16; + }; + __m128 col[kNumColorChannels]; + }; + + RGBAMatrixSIMD(const float *arr) { + memcpy(m, arr, sizeof(m)); + } + + RGBAMatrixSIMD(const __m128 newcol[kNumColorChannels]) { + for(int i = 0; i < kNumColorChannels; i++) + col[i] = newcol[i]; + } + +public: + + RGBAMatrixSIMD() : + m1(1.0f), m2(0.0f), m3(0.0f), m4(0.0f), + m5(0.0f), m6(1.0f), m7(0.0f), m8(0.0f), + m9(0.0f), m10(0.0f), m11(1.0f), m12(0.0f), + m13(0.0f), m14(0.0f), m15(0.0f), m16(1.0f) + { } + + RGBAMatrixSIMD &operator =(const RGBAMatrixSIMD &other) { + memcpy(m, other.m, sizeof(m)); + return (*this); + } + + RGBAMatrixSIMD operator +(const RGBAMatrixSIMD &p) const { + RGBAMatrixSIMD newm; + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_add_ps(col[i], p.col[i]); + } + return newm; + } + + RGBAMatrixSIMD &operator +=(const RGBAMatrixSIMD &p) { + for(int i = 0; i < kNumColorChannels; i++) { + col[i] = _mm_add_ps( col[i], p.col[i] ); + } + return *this; + } + + RGBAMatrixSIMD operator -(const RGBAMatrixSIMD &p) const { + RGBAMatrixSIMD newm; + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_sub_ps( col[i], p.col[i] ); + } + return newm; + } + + RGBAMatrixSIMD &operator -=(const RGBAMatrixSIMD &p) { + for(int i = 0; i < kNumColorChannels; i++) { + col[i] = _mm_sub_ps( col[i], p.col[i] ); + } + return *this; + } + + RGBAMatrixSIMD operator /(const float s) const { + __m128 f = _mm_set1_ps(s); + RGBAMatrixSIMD newm; + + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_div_ps( col[i], f ); + } + + return newm; + } + + RGBAMatrixSIMD &operator /=(const float s) { + + __m128 f = _mm_set1_ps(s); + + for(int i = 0; i < 
kNumColorChannels; i++) { + col[i] = _mm_div_ps(col[i], f); + } + + return *this; + } + + RGBAMatrixSIMD operator *(const float s) const { + __m128 f = _mm_set1_ps(s); + + RGBAMatrixSIMD newm; + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_mul_ps( col[i], f ); + } + return newm; + } + + friend RGBAMatrixSIMD operator *(const float s, const RGBAMatrixSIMD &p) { + __m128 f = _mm_set1_ps(s); + RGBAMatrixSIMD newm; + + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_mul_ps( p.col[i], f ); + } + return newm; + } + + RGBAMatrixSIMD &operator *=(const float s) { + __m128 f = _mm_set1_ps(s); + for(int i = 0; i < kNumColorChannels; i++) + col[i] = _mm_mul_ps(col[i], f); + return *this; + } + + float &operator ()(const int i, const int j) { + return (*this)[j*4 + i]; + } + + float &operator [](const int i) { + return m[i]; + } + + friend bool operator ==(const RGBAMatrixSIMD &rhs, const RGBAMatrixSIMD &lhs) { + + __m128 sum = _mm_set1_ps(0.0f); + for(int i = 0; i < kNumColorChannels; i++) { + __m128 d = _mm_sub_ps(rhs.col[i], lhs.col[i]); + d = _mm_mul_ps(d, d); + __m128 cmp = _mm_cmpgt_ps(d, kEpsilonSIMD); + cmp = _mm_hadd_ps(cmp, cmp); + cmp = _mm_hadd_ps(cmp, cmp); + sum = _mm_add_ps(sum, cmp); + } + + if(((float *)(&sum))[0] != 0) + return false; + else + return true; + } + + operator float *() { + return m; + } + + RGBAVectorSIMD operator *(const RGBAVectorSIMD &p) const; +}; + +class RGBADirSIMD : public RGBAVectorSIMD { +public: + RGBADirSIMD() : RGBAVectorSIMD() { } + RGBADirSIMD(const RGBAVectorSIMD &p) : RGBAVectorSIMD(p) { + this->Normalize(); + } +}; + +// Makes sure that the values of the endpoints lie between 0 and 1. 
+extern void ClampEndpoints(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2); + +class RGBAClusterSIMD { +public: + + RGBAClusterSIMD() : + m_NumPoints(0), m_Total(0.0f), + m_PointBitString(0), + m_Min(FLT_MAX), + m_Max(-FLT_MAX), + m_PrincipalAxisCached(false) + { } + + RGBAClusterSIMD(const RGBAClusterSIMD &c) : + m_NumPoints(c.m_NumPoints), + m_Total(c.m_Total), + m_PointBitString(c.m_PointBitString), + m_Min(c.m_Min), + m_Max(c.m_Max), + m_PrincipalAxisCached(false) + { + memcpy(this->m_DataPoints, c.m_DataPoints, m_NumPoints * sizeof(RGBAVectorSIMD)); + } + + RGBAClusterSIMD(const RGBAClusterSIMD &left, const RGBAClusterSIMD &right); + RGBAClusterSIMD(const RGBAVectorSIMD &p, int idx) : + m_NumPoints(1), + m_Total(p), + m_PointBitString(0), + m_Min(p), m_Max(p), + m_PrincipalAxisCached(false) + { + m_DataPoints[0] = p; + m_PointBitString |= (1 << idx); + } + + RGBAVectorSIMD GetTotal() const { return m_Total; } + const RGBAVectorSIMD &GetPoint(int idx) const { return m_DataPoints[idx]; } + int GetNumPoints() const { return m_NumPoints; } + RGBAVectorSIMD GetAvg() const { return m_Total / float(m_NumPoints); } + + void AddPoint(const RGBAVectorSIMD &p, int idx); + + void GetBoundingBox(RGBAVectorSIMD &Min, RGBAVectorSIMD &Max) const { + Min = m_Min, Max = m_Max; + } + + // Returns the error if we were to quantize the colors right now with the given number of buckets and bit mask. + float QuantizedError(const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, const uint8 nBuckets, const __m128i &bitMask, const int pbits[2] = NULL, __m128i *indices = NULL) const; + + bool AllSamePoint() const { return m_Max == m_Min; } + int GetPointBitString() const { return m_PointBitString; } + +private: + + // The number of points in the cluster. + int m_NumPoints; + + RGBAVectorSIMD m_Total; + + // The points in the cluster. 
+ RGBAVectorSIMD m_DataPoints[kMaxNumDataPoints]; + + RGBAVectorSIMD m_Min, m_Max; + int m_PointBitString; + + RGBADirSIMD m_PrincipalAxis; + bool m_PrincipalAxisCached; +}; + +extern void GetPrincipalAxis(const RGBAClusterSIMD &c, RGBADirSIMD &axis); + +#endif //__RGBA_SIMD_ENDPOINTS_H__ diff --git a/CLTool/CMakeLists.txt b/CLTool/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/CLTool/StopWatch.cpp b/CLTool/StopWatch.cpp new file mode 100755 index 0000000..b8e2510 --- /dev/null +++ b/CLTool/StopWatch.cpp @@ -0,0 +1,106 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "StopWatch.h" +#include + +// Initialize member variables. +StopWatch::StopWatch() : + frequency(0), + start(0), + stop(0), + affinityMask(0) +{ + // Initialize the performance counter frequency. 
+ LARGE_INTEGER perfQuery; + BOOL supported = QueryPerformanceFrequency(&perfQuery); + assert(supported == TRUE); + this->frequency = perfQuery.QuadPart; +} + +// Start the stopwatch. +void StopWatch::Start() +{ + // MSDN recommends setting the thread affinity to avoid bugs in the BIOS and HAL. + // Create an affinity mask for the current processor. + affinityMask = (DWORD_PTR)1 << GetCurrentProcessorNumber(); + HANDLE currThread = GetCurrentThread(); + DWORD_PTR prevAffinityMask = SetThreadAffinityMask(currThread, affinityMask); + assert(prevAffinityMask != 0); + + // Query the performance counter. + LARGE_INTEGER perfQuery; + BOOL result = QueryPerformanceCounter(&perfQuery); + assert(result); + start = perfQuery.QuadPart; + + // Restore the thread's affinity mask. + prevAffinityMask = SetThreadAffinityMask(currThread, prevAffinityMask); + assert(prevAffinityMask != 0); +} + +// Stop the stopwatch. +void StopWatch::Stop() +{ + // MSDN recommends setting the thread affinity to avoid bugs in the BIOS and HAL. + // Use the affinity mask that was created in the Start function. + HANDLE currThread = GetCurrentThread(); + DWORD_PTR prevAffinityMask = SetThreadAffinityMask(currThread, affinityMask); + assert(prevAffinityMask != 0); + + // Query the performance counter. + LARGE_INTEGER perfQuery; + BOOL result = QueryPerformanceCounter(&perfQuery); + assert(result); + stop = perfQuery.QuadPart; + + // Restore the thread's affinity mask. + prevAffinityMask = SetThreadAffinityMask(currThread, prevAffinityMask); + assert(prevAffinityMask != 0); +} + +// Reset the stopwatch. +void StopWatch::Reset() +{ + start = 0; + stop = 0; + affinityMask = 0; +} + +// Get the elapsed time in seconds. +double StopWatch::TimeInSeconds() const +{ + // Return the elapsed time in seconds. + assert((stop - start) > 0); + return double(stop - start) / double(frequency); +} + +// Get the elapsed time in milliseconds. 
+double StopWatch::TimeInMilliseconds() const +{ + // Return the elapsed time in milliseconds. + assert((stop - start) > 0); + return double(stop - start) / double(frequency) * 1000.0; +} + +// Get the elapsed time in microseconds. +double StopWatch::TimeInMicroseconds() const +{ + // Return the elapsed time in microseconds. + assert((stop - start) > 0); + return double(stop - start) / double(frequency) * 1000000.0; +} diff --git a/CLTool/StopWatch.h b/CLTool/StopWatch.h new file mode 100755 index 0000000..76573de --- /dev/null +++ b/CLTool/StopWatch.h @@ -0,0 +1,41 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#pragma once + +#include "Windows.h" + +// A simple stopwatch class using Windows' high-resolution performance counters. 
+class StopWatch +{ +public: + StopWatch(); + + void Start(); + void Stop(); + void Reset(); + + double TimeInSeconds() const; + double TimeInMilliseconds() const; + double TimeInMicroseconds() const; + +private: + LONGLONG frequency; + LONGLONG start; + LONGLONG stop; + DWORD_PTR affinityMask; +}; diff --git a/CLTool/main.cpp b/CLTool/main.cpp new file mode 100755 index 0000000..2dab017 --- /dev/null +++ b/CLTool/main.cpp @@ -0,0 +1,2126 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "DXUT.h" +#include "DXUTcamera.h" +#include "DXUTgui.h" +#include "DXUTsettingsDlg.h" +#include "SDKmisc.h" +#include "SDKMesh.h" + +#include "DXTCompressorDLL.h" // DXT compressor DLL. +#include "BC7CompressorDLL.h" // BC7 compressor DLL. + +#include "StopWatch.h" // Timer. +#include "TaskMgrTBB.h" // TBB task manager. + +#include +#include + +#define ALIGN16(x) __declspec(align(16)) x +#define ALIGN32(x) __declspec(align(32)) x + +// DXT compressor type. 
+enum ECompressorType +{ + eCompType_DXT1, + eCompType_DXT5, + eCompType_BC7, + + kNumCompressorTypes +}; + +const TCHAR *kCompressorTypeStr[kNumCompressorTypes] = { + _T("DXT1/BC1"), + _T("DXT5/BC3"), + _T("BC7"), +}; + +enum EInstructionSet +{ + eInstrSet_Scalar + , eInstrSet_SSE + , eInstrSet_AVX2 + + , kNumInstructionSets +}; + +const TCHAR *kInstructionSetStr[kNumInstructionSets] = { + _T("Scalar"), + _T("SSE"), + _T("AVX2"), +}; + +enum EThreadMode +{ + eThreadMode_None, + eThreadMode_TBB, + eThreadMode_Win32, + + kNumThreadModes +}; + +const TCHAR *kThreadModeStr[kNumThreadModes] = { + _T("None"), + _T("TBB"), + _T("Win32") +}; + +static BOOL g_DXT1Available = TRUE; +static BOOL g_AVX2Available = FALSE; +static BOOL g_DX11Available = FALSE; + +const struct ECompressionScheme { + const ECompressorType type; + const EInstructionSet instrSet; + const EThreadMode threadMode; + const BOOL &availabilityOverride; +} kCompressionSchemes[] = { + { eCompType_DXT1, eInstrSet_Scalar, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_Scalar, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_Scalar, eThreadMode_Win32, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_SSE, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_SSE, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_SSE, eThreadMode_Win32, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_Scalar, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_Scalar, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_Scalar, eThreadMode_Win32, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_SSE, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_SSE, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_SSE, eThreadMode_Win32, g_DXT1Available }, + { eCompType_BC7, eInstrSet_Scalar, eThreadMode_None, g_DX11Available }, + { eCompType_BC7, eInstrSet_Scalar, eThreadMode_Win32, g_DX11Available }, + { 
eCompType_BC7, eInstrSet_SSE, eThreadMode_None, g_DX11Available }, + { eCompType_BC7, eInstrSet_SSE, eThreadMode_Win32, g_DX11Available }, + { eCompType_DXT1, eInstrSet_AVX2, eThreadMode_None, g_AVX2Available }, + { eCompType_DXT1, eInstrSet_AVX2, eThreadMode_TBB, g_AVX2Available }, + { eCompType_DXT1, eInstrSet_AVX2, eThreadMode_Win32, g_AVX2Available }, + { eCompType_DXT5, eInstrSet_AVX2, eThreadMode_None, g_AVX2Available }, + { eCompType_DXT5, eInstrSet_AVX2, eThreadMode_TBB, g_AVX2Available }, + { eCompType_DXT5, eInstrSet_AVX2, eThreadMode_Win32, g_AVX2Available }, +}; +const int kNumCompressionSchemes = sizeof(kCompressionSchemes) / sizeof(kCompressionSchemes[0]); +const ECompressionScheme *gCompressionScheme = kCompressionSchemes; + +// Textured vertex. +struct Vertex +{ + D3DXVECTOR3 position; + D3DXVECTOR2 texCoord; +}; + +// Global variables +CDXUTDialogResourceManager gDialogResourceManager; // manager for shared resources of dialogs +CD3DSettingsDlg gD3DSettingsDlg; // Device settings dialog +CDXUTDialog gHUD; // manages the 3D +CDXUTDialog gSampleUI; // dialog for sample specific controls +bool gShowHelp = false; // If true, it renders the UI control text +CDXUTTextHelper* gTxtHelper = NULL; +double gCompTime = 0.0; +double gCompRate = 0.0; +int gBlocksPerTask = 256; +int gFrameNum = 0; +int gFrameDelay = 100; +int gTexWidth = 0; +int gTexHeight = 0; +double gError = 0.0; + +#ifdef REPORT_RMSE +static const WCHAR *kErrorStr = L"Root Mean Squared Error"; +#else +static const WCHAR *kErrorStr = L"Peak Signal/Noise Ratio"; +#endif + +ID3D11DepthStencilState* gDepthStencilState = NULL; +UINT gStencilReference = 0; +ID3D11InputLayout* gVertexLayout = NULL; +ID3D11Buffer* gVertexBuffer = NULL; +ID3D11Buffer* gQuadVB = NULL; +ID3D11Buffer* gIndexBuffer = NULL; +ID3D11VertexShader* gVertexShader = NULL; +ID3D11PixelShader* gRenderFramePS = NULL; +ID3D11PixelShader* gRenderTexturePS = NULL; +ID3D11SamplerState* gSamPoint = NULL; +ID3D11ShaderResourceView* 
gUncompressedSRV = NULL; // Shader resource view for the uncompressed texture resource. +ID3D11ShaderResourceView* gCompressedSRV = NULL; // Shader resource view for the compressed texture resource. +ID3D11ShaderResourceView* gErrorSRV = NULL; // Shader resource view for the error texture. + +// Win32 thread API +const int kMaxWinThreads = 16; + +enum EThreadState { + eThreadState_WaitForData, + eThreadState_DataLoaded, + eThreadState_Running, + eThreadState_Done +}; + +typedef void (* CompressionFunc)(const BYTE* inBuf, BYTE* outBuf, int width, int height); + +struct WinThreadData { + EThreadState state; + int threadIdx; + const BYTE *inBuf; + BYTE *outBuf; + int width; + int height; + void (*cmpFunc)(const BYTE* inBuf, BYTE* outBuf, int width, int height); + + // Defaults.. + WinThreadData() : + state(eThreadState_Done), + threadIdx(-1), + inBuf(NULL), + outBuf(NULL), + width(-1), + height(-1), + cmpFunc(NULL) + { } + +} gWinThreadData[kMaxWinThreads]; + +HANDLE gWinThreadWorkEvent[kMaxWinThreads]; +HANDLE gWinThreadStartEvent = NULL; +HANDLE gWinThreadDoneEvent = NULL; +int gNumWinThreads = 0; +DWORD gNumProcessors = 1; // We have at least one processor. 
+DWORD dwThreadIdArray[kMaxWinThreads]; +HANDLE hThreadArray[kMaxWinThreads]; + +// UI control IDs +#define IDC_TOGGLEFULLSCREEN 1 +#define IDC_TOGGLEREF 2 +#define IDC_CHANGEDEVICE 3 +#define IDC_UNCOMPRESSEDTEXT 4 +#define IDC_COMPRESSEDTEXT 5 +#define IDC_ERRORTEXT 6 +#define IDC_SIZETEXT 7 +#define IDC_TIMETEXT 8 +#define IDC_RATETEXT 9 +#define IDC_TBB 10 +#define IDC_SIMD 11 +#define IDC_COMPRESSOR 12 +#define IDC_BLOCKSPERTASKTEXT 13 +#define IDC_BLOCKSPERTASK 14 +#define IDC_LOADTEXTURE 15 +#define IDC_RECOMPRESS 16 +#define IDC_RMSETEXT 17 + +// Forward declarations +bool CALLBACK ModifyDeviceSettings( DXUTDeviceSettings* pDeviceSettings, void* pUserContext ); +void CALLBACK OnFrameMove( double fTime, float fElapsedTime, void* pUserContext ); +LRESULT CALLBACK MsgProc( HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lParam, bool* pbNoFurtherProcessing, + void* pUserContext ); +void CALLBACK OnKeyboard( UINT nChar, bool bKeyDown, bool bAltDown, void* pUserContext ); +void CALLBACK OnGUIEvent( UINT nEvent, int nControlID, CDXUTControl* pControl, void* pUserContext ); + +bool CALLBACK IsD3D11DeviceAcceptable(const CD3D11EnumAdapterInfo *AdapterInfo, UINT Output, const CD3D11EnumDeviceInfo *DeviceInfo, + DXGI_FORMAT BackBufferFormat, bool bWindowed, void* pUserContext ); +HRESULT CALLBACK OnD3D11CreateDevice( ID3D11Device* pd3dDevice, const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, + void* pUserContext ); +HRESULT CALLBACK OnD3D11ResizedSwapChain( ID3D11Device* pd3dDevice, IDXGISwapChain* pSwapChain, + const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, void* pUserContext ); +void CALLBACK OnD3D11ReleasingSwapChain( void* pUserContext ); +void CALLBACK OnD3D11DestroyDevice( void* pUserContext ); +void CALLBACK OnD3D11FrameRender( ID3D11Device* pd3dDevice, ID3D11DeviceContext* pd3dImmediateContext, double fTime, + float fElapsedTime, void* pUserContext ); + +void InitApp(); +void RenderText(); + +void UpdateBlockSlider(); +void UpdateCompressionAlgorithms(); +void 
UpdateThreadingMode(); +void UpdateCompressionModes(); +void UpdateAllowedSettings(); + +void SetCompressionScheme(EInstructionSet instrSet, ECompressorType compType, EThreadMode threadMode); + +HRESULT CreateTextures(LPTSTR file); +void DestroyTextures(); +HRESULT LoadTexture(LPTSTR file); +HRESULT PadTexture(ID3D11ShaderResourceView** textureSRV); +HRESULT SaveTexture(ID3D11ShaderResourceView* textureSRV, LPTSTR file); +HRESULT CompressTexture(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView** compressedSRV); +HRESULT ComputeError(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView* compressedSRV, ID3D11ShaderResourceView** errorSRV); +HRESULT RecompressTexture(); + +void ComputeRMSE(const BYTE *errorData, const INT width, const INT height); + +void InitWin32Threads(); +void DestroyThreads(); + +void StoreDepthStencilState(); +void RestoreDepthStencilState(); +HRESULT DisableDepthTest(); + +namespace DXTC +{ + VOID CompressImageDXT(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + + VOID CompressImageDXTNoThread(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + VOID CompressImageDXTTBB(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + VOID CompressImageDXTWIN(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + + DWORD WINAPI CompressImageDXTWinThread( LPVOID lpParam ); +} + +#ifdef ENABLE_AVX2 +#ifdef _M_X64 +/* On x64, we can't have inline assembly in C files, see avxtest.asm */ +extern "C" int __stdcall supports_AVX2(); + +#else ifdef WIN32 +/* AVX2 instructions require 64 bit mode. */ +extern "C" int __stdcall supports_AVX2() { + return 0; +} +#endif // _M_X64 +#endif // ENABLE_AVX2 + +int WINAPI wWinMain( HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLine, int nCmdShow ) +{ + // Enable run-time memory check for debug builds. 
+#if defined(DEBUG) | defined(_DEBUG) + _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); +#endif + +#ifdef ENABLE_AVX2 + g_AVX2Available = supports_AVX2(); +#endif + + // Make sure that the event array is set to null... + memset(gWinThreadWorkEvent, 0, sizeof(gWinThreadWorkEvent)); + + // Figure out how many cores there are on this machine + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + gNumProcessors = sysinfo.dwNumberOfProcessors; + + // Make sure all of our threads are empty. + for(int i = 0; i < kMaxWinThreads; i++) { + hThreadArray[i] = NULL; + } + + // Set DXUT callbacks + DXUTSetCallbackDeviceChanging( ModifyDeviceSettings ); + DXUTSetCallbackMsgProc( MsgProc ); + DXUTSetCallbackKeyboard( OnKeyboard ); + DXUTSetCallbackFrameMove( OnFrameMove ); + DXUTSetCallbackD3D11DeviceAcceptable( IsD3D11DeviceAcceptable ); + DXUTSetCallbackD3D11DeviceCreated( OnD3D11CreateDevice ); + DXUTSetCallbackD3D11SwapChainResized( OnD3D11ResizedSwapChain ); + DXUTSetCallbackD3D11FrameRender( OnD3D11FrameRender ); + DXUTSetCallbackD3D11SwapChainReleasing( OnD3D11ReleasingSwapChain ); + DXUTSetCallbackD3D11DeviceDestroyed( OnD3D11DestroyDevice ); + + InitApp(); + + DXUTInit( true, true, NULL ); + DXUTSetCursorSettings( true, true ); + DXUTCreateWindow( L"Fast Texture Compressor" ); + + // Try to create a device with DX11 feature set + DXUTCreateDevice (D3D_FEATURE_LEVEL_11_0, true, 1280, 1024 ); + + // If we don't have an adequate driver, then we revert to DX10 feature set... + DXUTDeviceSettings settings = DXUTGetDeviceSettings(); + if(settings.d3d11.DriverType == D3D_DRIVER_TYPE_UNKNOWN || settings.d3d11.DriverType == D3D_DRIVER_TYPE_NULL) { + DXUTCreateDevice(D3D_FEATURE_LEVEL_10_1, true, 1280, 1024); + + // !HACK! Force enumeration here in order to relocate hardware with new feature level + DXUTGetD3D11Enumeration(true); + DXUTCreateDevice(D3D_FEATURE_LEVEL_10_1, true, 1280, 1024); + + const TCHAR *noDx11msg = _T("Your hardware does not seem to support DX11. 
BC7 Compression is disabled."); + MessageBox(NULL, noDx11msg, _T("Error"), MB_OK); + } + else { + g_DX11Available = TRUE; + } + + // Now that we know what things are allowed, update the available options. + UpdateAllowedSettings(); + + DXUTMainLoop(); + + // Destroy all of the threads... + DestroyThreads(); + + return DXUTGetExitCode(); +} + +// Initialize the app +void InitApp() +{ + // Initialize dialogs + gD3DSettingsDlg.Init(&gDialogResourceManager); + gHUD.Init(&gDialogResourceManager); + gSampleUI.Init(&gDialogResourceManager); + + gHUD.SetCallback(OnGUIEvent); + int x = 0; + int y = 10; + gHUD.AddButton(IDC_TOGGLEFULLSCREEN, L"Toggle full screen", x, y, 170, 23); + gHUD.AddButton(IDC_TOGGLEREF, L"Toggle REF (F3)", x, y += 26, 170, 23, VK_F3); + gHUD.AddButton(IDC_CHANGEDEVICE, L"Change device (F2)", x, y += 26, 170, 23, VK_F2); + + gSampleUI.SetCallback(OnGUIEvent); + x = 0; + y = 0; + gSampleUI.AddStatic(IDC_UNCOMPRESSEDTEXT, L"Uncompressed", x, y, 125, 22); + gSampleUI.AddStatic(IDC_COMPRESSEDTEXT, L"Compressed", x, y, 125, 22); + gSampleUI.AddStatic(IDC_ERRORTEXT, L"Error", x, y, 125, 22); + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Texture Size: %d x %d", gTexWidth, gTexHeight); + gSampleUI.AddStatic(IDC_SIZETEXT, wstr, x, y, 125, 22); + swprintf_s(wstr, MAX_PATH, L"%s: %.2f", kErrorStr, gError); + gSampleUI.AddStatic(IDC_RMSETEXT, wstr, x, y, 125, 22); + swprintf_s(wstr, MAX_PATH, L"Compression Time: %0.2f ms", gCompTime); + gSampleUI.AddStatic(IDC_TIMETEXT, wstr, x, y, 125, 22); + swprintf_s(wstr, MAX_PATH, L"Compression Rate: %0.2f Mp/s", gCompRate); + gSampleUI.AddStatic(IDC_RATETEXT, wstr, x, y, 125, 22); + gSampleUI.AddComboBox(IDC_TBB, x, y, 95, 22); + gSampleUI.AddComboBox(IDC_SIMD, x, y, 140, 22); + gSampleUI.AddComboBox(IDC_COMPRESSOR, x, y, 105, 22); + swprintf_s(wstr, MAX_PATH, L"Blocks Per Task: %d", gBlocksPerTask); + gSampleUI.AddStatic(IDC_BLOCKSPERTASKTEXT, wstr, x, y, 125, 22); + gSampleUI.AddSlider(IDC_BLOCKSPERTASK, x, y, 
256, 22, 1, 512, gBlocksPerTask); + gSampleUI.AddButton(IDC_LOADTEXTURE, L"Load Texture", x, y, 125, 22); + gSampleUI.AddButton(IDC_RECOMPRESS, L"Recompress", x, y, 125, 22); +} + +// Called right before creating a D3D11 device, allowing the app to modify the device settings as needed +bool CALLBACK ModifyDeviceSettings( DXUTDeviceSettings* pDeviceSettings, void* pUserContext ) +{ + // Uncomment this to get debug information from D3D11 + //pDeviceSettings->d3d11.CreateFlags |= D3D11_CREATE_DEVICE_DEBUG; + + // For the first device created if its a REF device, optionally display a warning dialog box + static bool s_bFirstTime = true; + if( s_bFirstTime ) + { + s_bFirstTime = false; + if( ( DXUT_D3D11_DEVICE == pDeviceSettings->ver && + pDeviceSettings->d3d11.DriverType == D3D_DRIVER_TYPE_REFERENCE ) ) + { + DXUTDisplaySwitchingToREFWarning( pDeviceSettings->ver ); + } + } + + return true; +} + +// Handle updates to the scene. +void CALLBACK OnFrameMove( double fTime, float fElapsedTime, void* pUserContext ) +{ + +} + +// Render the help and statistics text +void RenderText() +{ + UINT nBackBufferHeight = ( DXUTIsAppRenderingWithD3D9() ) ? 
DXUTGetD3D9BackBufferSurfaceDesc()->Height : + DXUTGetDXGIBackBufferSurfaceDesc()->Height; + + gTxtHelper->Begin(); + gTxtHelper->SetInsertionPos( 2, 0 ); + gTxtHelper->SetForegroundColor( D3DXCOLOR( 1.0f, 1.0f, 0.0f, 1.0f ) ); + gTxtHelper->DrawTextLine( DXUTGetFrameStats( false ) ); + gTxtHelper->DrawTextLine( DXUTGetDeviceStats() ); + + // Draw help + if( gShowHelp ) + { + gTxtHelper->SetInsertionPos( 2, nBackBufferHeight - 20 * 6 ); + gTxtHelper->SetForegroundColor( D3DXCOLOR( 1.0f, 0.75f, 0.0f, 1.0f ) ); + gTxtHelper->DrawTextLine( L"Controls:" ); + + gTxtHelper->SetInsertionPos( 20, nBackBufferHeight - 20 * 5 ); + gTxtHelper->DrawTextLine( L"Hide help: F1\n" + L"Quit: ESC\n" ); + } + else + { + gTxtHelper->SetForegroundColor( D3DXCOLOR( 1.0f, 1.0f, 1.0f, 1.0f ) ); + gTxtHelper->DrawTextLine( L"Press F1 for help" ); + } + + gTxtHelper->End(); +} + +// Handle messages to the application +LRESULT CALLBACK MsgProc( HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lParam, bool* pbNoFurtherProcessing, + void* pUserContext ) +{ + // Pass messages to dialog resource manager calls so GUI state is updated correctly + *pbNoFurtherProcessing = gDialogResourceManager.MsgProc( hWnd, uMsg, wParam, lParam ); + if( *pbNoFurtherProcessing ) + return 0; + + // Pass messages to settings dialog if its active + if( gD3DSettingsDlg.IsActive() ) + { + gD3DSettingsDlg.MsgProc( hWnd, uMsg, wParam, lParam ); + return 0; + } + + // Give the dialogs a chance to handle the message first + *pbNoFurtherProcessing = gHUD.MsgProc( hWnd, uMsg, wParam, lParam ); + if( *pbNoFurtherProcessing ) + return 0; + *pbNoFurtherProcessing = gSampleUI.MsgProc( hWnd, uMsg, wParam, lParam ); + if( *pbNoFurtherProcessing ) + return 0; + + return 0; +} + +// Handle key presses +void CALLBACK OnKeyboard( UINT nChar, bool bKeyDown, bool bAltDown, void* pUserContext ) +{ + if( bKeyDown ) + { + switch( nChar ) + { + case VK_F1: + gShowHelp = !gShowHelp; break; + } + } +} + +// Handles the GUI events +void CALLBACK 
OnGUIEvent( UINT nEvent, int nControlID, CDXUTControl* pControl, void* pUserContext ) +{ + switch( nControlID ) + { + case IDC_TOGGLEFULLSCREEN: + { + DXUTToggleFullScreen(); + break; + } + case IDC_TOGGLEREF: + { + DXUTToggleREF(); + break; + } + case IDC_CHANGEDEVICE: + { + gD3DSettingsDlg.SetActive( !gD3DSettingsDlg.IsActive() ); + break; + } + + case IDC_TIMETEXT: + { + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Compression Time: %0.2f ms", gCompTime); + gSampleUI.GetStatic(IDC_TIMETEXT)->SetText(wstr); + break; + } + case IDC_RATETEXT: + { + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Compression Rate: %0.2f Mp/s", gCompRate); + gSampleUI.GetStatic(IDC_RATETEXT)->SetText(wstr); + break; + } + + case IDC_RMSETEXT: + { + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"%s: %.2f", kErrorStr, gError); + gSampleUI.GetStatic(IDC_RMSETEXT)->SetText(wstr); + break; + } + + case IDC_TBB: + { + // Shut down all previous threading abilities. + DestroyThreads(); + + EInstructionSet instrSet = gCompressionScheme->instrSet; + ECompressorType compType = gCompressionScheme->type; + + EThreadMode newMode = (EThreadMode)(INT_PTR)gSampleUI.GetComboBox(IDC_TBB)->GetSelectedData(); + + switch(newMode) { + case eThreadMode_TBB: + + // Initialize the TBB task manager. + gTaskMgr.Init(); + + break; + + case eThreadMode_Win32: + + InitWin32Threads(); + + break; + + case eThreadMode_None: + // Do nothing, our threads are fine. + break; + } + + SetCompressionScheme(instrSet, compType, newMode); + UpdateAllowedSettings(); + + // Recompress the texture. + RecompressTexture(); + + break; + } + + case IDC_SIMD: + { + EThreadMode threadMode = gCompressionScheme->threadMode; + ECompressorType compType = gCompressionScheme->type; + + EInstructionSet newInstrSet = (EInstructionSet)(INT_PTR)gSampleUI.GetComboBox(IDC_SIMD)->GetSelectedData(); + + // If we selected AVX2, then the total number of blocks when using AVX2 changes, so we need + // to reflect that in the slider. 
+ UpdateBlockSlider(); + + SetCompressionScheme(newInstrSet, compType, threadMode); + UpdateAllowedSettings(); + + // Recompress the texture. + RecompressTexture(); + + break; + } + case IDC_COMPRESSOR: + { + EThreadMode threadMode = gCompressionScheme->threadMode; + EInstructionSet instrSet = gCompressionScheme->instrSet; + ECompressorType newCompType = (ECompressorType)(INT_PTR)gSampleUI.GetComboBox(IDC_COMPRESSOR)->GetSelectedData(); + + SetCompressionScheme(instrSet, newCompType, threadMode); + UpdateAllowedSettings(); + + // Recompress the texture. + RecompressTexture(); + + break; + } + case IDC_BLOCKSPERTASK: + { + gBlocksPerTask = gSampleUI.GetSlider(IDC_BLOCKSPERTASK)->GetValue(); + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Blocks Per Task: %d", gBlocksPerTask); + gSampleUI.GetStatic(IDC_BLOCKSPERTASKTEXT)->SetText(wstr); + + // Recompress the texture. + RecompressTexture(); + + break; + } + case IDC_LOADTEXTURE: + { + // Store the current working directory. + TCHAR workingDirectory[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, workingDirectory); + + // Open a file dialog. + OPENFILENAME openFileName; + WCHAR file[MAX_PATH]; + file[0] = 0; + ZeroMemory(&openFileName, sizeof(OPENFILENAME)); + openFileName.lStructSize = sizeof(OPENFILENAME); + openFileName.lpstrFile = file; + openFileName.nMaxFile = MAX_PATH; + openFileName.lpstrFilter = L"DDS\0*.dds\0\0"; + openFileName.nFilterIndex = 1; + openFileName.lpstrInitialDir = NULL; + openFileName.Flags = OFN_PATHMUSTEXIST | OFN_FILEMUSTEXIST; + if(GetOpenFileName(&openFileName)) + { + CreateTextures(openFileName.lpstrFile); + } + + // Restore the working directory. GetOpenFileName changes the current working directory which causes problems with relative paths to assets. + SetCurrentDirectory(workingDirectory); + + break; + } + case IDC_RECOMPRESS: + { + // Recompress the texture. 
+ RecompressTexture(); + + break; + } + } +} + +// Reject any D3D11 devices that aren't acceptable by returning false +bool CALLBACK IsD3D11DeviceAcceptable( const CD3D11EnumAdapterInfo *AdapterInfo, UINT Output, const CD3D11EnumDeviceInfo *DeviceInfo, + DXGI_FORMAT BackBufferFormat, bool bWindowed, void* pUserContext ) +{ + return true; +} + +// Find and compile the specified shader +HRESULT CompileShaderFromFile( WCHAR* szFileName, LPCSTR szEntryPoint, LPCSTR szShaderModel, ID3DBlob** ppBlobOut ) +{ + HRESULT hr = S_OK; + + // find the file + WCHAR str[MAX_PATH]; + V_RETURN( DXUTFindDXSDKMediaFileCch( str, MAX_PATH, szFileName ) ); + + DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS; +#if defined( DEBUG ) || defined( _DEBUG ) + // Set the D3DCOMPILE_DEBUG flag to embed debug information in the shaders. + // Setting this flag improves the shader debugging experience, but still allows + // the shaders to be optimized and to run exactly the way they will run in + // the release configuration of this program. 
+ dwShaderFlags |= D3DCOMPILE_DEBUG; +#endif + + ID3DBlob* pErrorBlob; + hr = D3DX11CompileFromFile( str, NULL, NULL, szEntryPoint, szShaderModel, + dwShaderFlags, 0, NULL, ppBlobOut, &pErrorBlob, NULL ); + if( FAILED(hr) ) + { + if( pErrorBlob != NULL ) + OutputDebugStringA( (char*)pErrorBlob->GetBufferPointer() ); + SAFE_RELEASE( pErrorBlob ); + return hr; + } + SAFE_RELEASE( pErrorBlob ); + + return S_OK; +} + +// Create any D3D11 resources that aren't dependent on the back buffer +HRESULT CALLBACK OnD3D11CreateDevice( ID3D11Device* pd3dDevice, const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, + void* pUserContext ) +{ + HRESULT hr; + + ID3D11DeviceContext* pd3dImmediateContext = DXUTGetD3D11DeviceContext(); + V_RETURN(gDialogResourceManager.OnD3D11CreateDevice(pd3dDevice, pd3dImmediateContext)); + V_RETURN(gD3DSettingsDlg.OnD3D11CreateDevice(pd3dDevice)); + gTxtHelper = new CDXUTTextHelper(pd3dDevice, pd3dImmediateContext, &gDialogResourceManager, 15); + + // Create a vertex shader. + ID3DBlob* vertexShaderBuffer = NULL; + V_RETURN(CompileShaderFromFile(L"FastTextureCompressor\\FastTextureCompressor.hlsl", "PassThroughVS", "vs_4_0", &vertexShaderBuffer)); + V_RETURN(pd3dDevice->CreateVertexShader(vertexShaderBuffer->GetBufferPointer(), vertexShaderBuffer->GetBufferSize(), NULL, &gVertexShader)); + + // Create a pixel shader that renders the composite frame. + ID3DBlob* pixelShaderBuffer = NULL; + V_RETURN(CompileShaderFromFile(L"FastTextureCompressor\\FastTextureCompressor.hlsl", "RenderFramePS", "ps_4_0", &pixelShaderBuffer)); + V_RETURN(pd3dDevice->CreatePixelShader(pixelShaderBuffer->GetBufferPointer(), pixelShaderBuffer->GetBufferSize(), NULL, &gRenderFramePS)); + + // Create a pixel shader that renders the error texture. 
+ V_RETURN(CompileShaderFromFile(L"FastTextureCompressor\\FastTextureCompressor.hlsl", "RenderTexturePS", "ps_4_0", &pixelShaderBuffer)); + V_RETURN(pd3dDevice->CreatePixelShader(pixelShaderBuffer->GetBufferPointer(), pixelShaderBuffer->GetBufferSize(), NULL, &gRenderTexturePS)); + + // Create our vertex input layout + const D3D11_INPUT_ELEMENT_DESC layout[] = + { + { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0 } + }; + + V_RETURN(pd3dDevice->CreateInputLayout(layout, ARRAYSIZE(layout), vertexShaderBuffer->GetBufferPointer(), vertexShaderBuffer->GetBufferSize(), &gVertexLayout)); + + SAFE_RELEASE(vertexShaderBuffer); + SAFE_RELEASE(pixelShaderBuffer); + + // Create a vertex buffer for three textured quads. + D3DXVECTOR2 quadSize(0.32f, 0.32f); + D3DXVECTOR2 quadOrigin(-0.66f, -0.0f); + Vertex tripleQuadVertices[18]; + ZeroMemory(tripleQuadVertices, sizeof(tripleQuadVertices)); + for(int i = 0; i < 18; i += 6) + { + tripleQuadVertices[i].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + tripleQuadVertices[i].texCoord = D3DXVECTOR2(0.0f, 0.0f); + + tripleQuadVertices[i + 1].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + tripleQuadVertices[i + 1].texCoord = D3DXVECTOR2(1.0f, 0.0f); + + tripleQuadVertices[i + 2].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + tripleQuadVertices[i + 2].texCoord = D3DXVECTOR2(1.0f, 1.0f); + + tripleQuadVertices[i + 3].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + tripleQuadVertices[i + 3].texCoord = D3DXVECTOR2(1.0f, 1.0f); + + tripleQuadVertices[i + 4].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + tripleQuadVertices[i + 4].texCoord = D3DXVECTOR2(0.0f, 1.0f); + + tripleQuadVertices[i + 5].position = D3DXVECTOR3(quadOrigin.x - 
quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + tripleQuadVertices[i + 5].texCoord = D3DXVECTOR2(0.0f, 0.0f); + + quadOrigin.x += 0.66f; + } + + D3D11_BUFFER_DESC bufDesc; + ZeroMemory(&bufDesc, sizeof(bufDesc)); + bufDesc.Usage = D3D11_USAGE_DEFAULT; + bufDesc.ByteWidth = sizeof(tripleQuadVertices); + bufDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + bufDesc.CPUAccessFlags = 0; + D3D11_SUBRESOURCE_DATA data; + ZeroMemory(&data, sizeof(data)); + data.pSysMem = tripleQuadVertices; + V_RETURN(pd3dDevice->CreateBuffer(&bufDesc, &data, &gVertexBuffer)); + + // Create a vertex buffer for a single textured quad. + quadSize = D3DXVECTOR2(1.0f, 1.0f); + quadOrigin = D3DXVECTOR2(0.0f, 0.0f); + Vertex singleQuadVertices[6]; + singleQuadVertices[0].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + singleQuadVertices[0].texCoord = D3DXVECTOR2(0.0f, 0.0f); + singleQuadVertices[1].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + singleQuadVertices[1].texCoord = D3DXVECTOR2(1.0f, 0.0f); + singleQuadVertices[2].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + singleQuadVertices[2].texCoord = D3DXVECTOR2(1.0f, 1.0f); + singleQuadVertices[3].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + singleQuadVertices[3].texCoord = D3DXVECTOR2(1.0f, 1.0f); + singleQuadVertices[4].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + singleQuadVertices[4].texCoord = D3DXVECTOR2(0.0f, 1.0f); + singleQuadVertices[5].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + singleQuadVertices[5].texCoord = D3DXVECTOR2(0.0f, 0.0f); + + ZeroMemory(&bufDesc, sizeof(bufDesc)); + bufDesc.Usage = D3D11_USAGE_DEFAULT; + bufDesc.ByteWidth = sizeof(singleQuadVertices); + bufDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + bufDesc.CPUAccessFlags = 0; + ZeroMemory(&data, sizeof(data)); + data.pSysMem = 
singleQuadVertices; + V_RETURN(pd3dDevice->CreateBuffer(&bufDesc, &data, &gQuadVB)); + + // Create a sampler state + D3D11_SAMPLER_DESC SamDesc; + SamDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; + SamDesc.AddressU = D3D11_TEXTURE_ADDRESS_WRAP; + SamDesc.AddressV = D3D11_TEXTURE_ADDRESS_WRAP; + SamDesc.AddressW = D3D11_TEXTURE_ADDRESS_WRAP; + SamDesc.MipLODBias = 0.0f; + SamDesc.MaxAnisotropy = 1; + SamDesc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; + SamDesc.BorderColor[0] = SamDesc.BorderColor[1] = SamDesc.BorderColor[2] = SamDesc.BorderColor[3] = 0; + SamDesc.MinLOD = 0; + SamDesc.MaxLOD = D3D11_FLOAT32_MAX; + V_RETURN(pd3dDevice->CreateSamplerState(&SamDesc, &gSamPoint)); + + // Load and initialize the textures. + WCHAR path[MAX_PATH]; + V_RETURN(DXUTFindDXSDKMediaFileCch(path, MAX_PATH, L"Images\\texture.dds")); + V_RETURN(CreateTextures(path)); + + return S_OK; +} + +// Create any D3D11 resources that depend on the back buffer +HRESULT CALLBACK OnD3D11ResizedSwapChain( ID3D11Device* pd3dDevice, IDXGISwapChain* pSwapChain, + const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, void* pUserContext ) +{ + HRESULT hr; + V_RETURN( gDialogResourceManager.OnD3D11ResizedSwapChain( pd3dDevice, pBackBufferSurfaceDesc ) ); + V_RETURN( gD3DSettingsDlg.OnD3D11ResizedSwapChain( pd3dDevice, pBackBufferSurfaceDesc ) ); + + gHUD.SetLocation( pBackBufferSurfaceDesc->Width - 170, 0 ); + gHUD.SetSize( 170, 170 ); + + gSampleUI.SetLocation( 0, 0 ); + gSampleUI.SetSize( pBackBufferSurfaceDesc->Width, pBackBufferSurfaceDesc->Height ); + + int oneThirdWidth = int(gSampleUI.GetWidth() / 3.0f); + int oneThirdHeight = int(gSampleUI.GetHeight() / 3.0f); + int x = 20; + int y = oneThirdHeight - 20; + gSampleUI.GetStatic(IDC_UNCOMPRESSEDTEXT)->SetLocation(x, y); + gSampleUI.GetStatic(IDC_COMPRESSEDTEXT)->SetLocation(x += oneThirdWidth, y); + gSampleUI.GetStatic(IDC_ERRORTEXT)->SetLocation(x += oneThirdWidth, y); + x = gSampleUI.GetWidth() - 276; + y = gSampleUI.GetHeight() - 216; + 
gSampleUI.GetStatic(IDC_SIZETEXT)->SetLocation(x, y); + gSampleUI.GetStatic(IDC_RMSETEXT)->SetLocation(x, y += 26); + gSampleUI.GetStatic(IDC_TIMETEXT)->SetLocation(x, y += 26); + gSampleUI.GetStatic(IDC_RATETEXT)->SetLocation(x, y += 26); + gSampleUI.GetComboBox(IDC_SIMD)->SetLocation(x, y += 26); + gSampleUI.GetComboBox(IDC_COMPRESSOR)->SetLocation(x + 150, y); + gSampleUI.GetStatic(IDC_BLOCKSPERTASKTEXT)->SetLocation(x, y += 26); + gSampleUI.GetComboBox(IDC_TBB)->SetLocation(x + 160, y); + gSampleUI.GetSlider(IDC_BLOCKSPERTASK)->SetLocation(x, y += 26); + gSampleUI.GetButton(IDC_LOADTEXTURE)->SetLocation(x, y += 26); + gSampleUI.GetButton(IDC_RECOMPRESS)->SetLocation(x + 131, y); + + return S_OK; +} + +// Render the scene using the D3D11 device +void CALLBACK OnD3D11FrameRender( ID3D11Device* pd3dDevice, ID3D11DeviceContext* pd3dImmediateContext, double fTime, + float fElapsedTime, void* pUserContext ) +{ + // Recompress the texture gFrameDelay frames after the app has started. This produces more accurate timing of the + // compression algorithm. + if(gFrameNum == gFrameDelay) + { + RecompressTexture(); + gFrameNum++; + } + else if(gFrameNum < gFrameDelay) + { + gFrameNum++; + } + + // If the settings dialog is being shown, then render it instead of rendering the app's scene + if( gD3DSettingsDlg.IsActive() ) + { + gD3DSettingsDlg.OnRender( fElapsedTime ); + return; + } + + // Clear the render target and depth stencil + float ClearColor[4] = { 0.02f, 0.02f, 0.02f, 1.0f }; + ID3D11RenderTargetView* pRTV = DXUTGetD3D11RenderTargetView(); + pd3dImmediateContext->ClearRenderTargetView( pRTV, ClearColor ); + ID3D11DepthStencilView* pDSV = DXUTGetD3D11DepthStencilView(); + pd3dImmediateContext->ClearDepthStencilView( pDSV, D3D11_CLEAR_DEPTH, 1.0, 0 ); + + // Set the input layout. + pd3dImmediateContext->IASetInputLayout( gVertexLayout ); + + // Set the vertex buffer. 
+ UINT stride = sizeof( Vertex ); + UINT offset = 0; + pd3dImmediateContext->IASetVertexBuffers( 0, 1, &gVertexBuffer, &stride, &offset ); + + // Set the primitive topology + pd3dImmediateContext->IASetPrimitiveTopology( D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST ); + + // Set the shaders + pd3dImmediateContext->VSSetShader( gVertexShader, NULL, 0 ); + pd3dImmediateContext->PSSetShader( gRenderFramePS, NULL, 0 ); + + // Set the texture sampler. + pd3dImmediateContext->PSSetSamplers( 0, 1, &gSamPoint ); + + // Render the uncompressed texture. + pd3dImmediateContext->PSSetShaderResources( 0, 1, &gUncompressedSRV ); + pd3dImmediateContext->Draw( 6, 0 ); + + // Render the compressed texture. + pd3dImmediateContext->PSSetShaderResources( 0, 1, &gCompressedSRV ); + pd3dImmediateContext->Draw( 6, 6 ); + + // Render the error texture. + pd3dImmediateContext->PSSetShaderResources( 0, 1, &gErrorSRV ); + pd3dImmediateContext->Draw( 6, 12 ); + + DXUT_BeginPerfEvent( DXUT_PERFEVENTCOLOR, L"HUD / Stats" ); + HRESULT hr; + V(gHUD.OnRender( fElapsedTime )); + V(gSampleUI.OnRender( fElapsedTime )); + RenderText(); + DXUT_EndPerfEvent(); +} + +// Release D3D11 resources created in OnD3D11ResizedSwapChain +void CALLBACK OnD3D11ReleasingSwapChain( void* pUserContext ) +{ + gDialogResourceManager.OnD3D11ReleasingSwapChain(); +} + +// Release D3D11 resources created in OnD3D11CreateDevice +void CALLBACK OnD3D11DestroyDevice( void* pUserContext ) +{ + gDialogResourceManager.OnD3D11DestroyDevice(); + gD3DSettingsDlg.OnD3D11DestroyDevice(); + //CDXUTDirectionWidget::StaticOnD3D11DestroyDevice(); + DXUTGetGlobalResourceCache().OnDestroyDevice(); + SAFE_DELETE( gTxtHelper ); + + SAFE_RELEASE( gVertexLayout ); + SAFE_RELEASE( gVertexBuffer ); + SAFE_RELEASE( gQuadVB ); + SAFE_RELEASE( gIndexBuffer ); + SAFE_RELEASE( gVertexShader ); + SAFE_RELEASE( gRenderFramePS ); + SAFE_RELEASE( gRenderTexturePS ); + SAFE_RELEASE( gSamPoint ); + + DestroyTextures(); +} + +// Free previously allocated texture 
resources and create new texture resources. +HRESULT CreateTextures(LPTSTR file) +{ + // Destroy any previously created textures. + DestroyTextures(); + + // Load the uncompressed texture. + HRESULT hr; + V_RETURN(LoadTexture(file)); + + // Compress the texture. + V_RETURN(CompressTexture(gUncompressedSRV, &gCompressedSRV)); + + // Compute the error in the compressed texture. + V_RETURN(ComputeError(gUncompressedSRV, gCompressedSRV, &gErrorSRV)); + + return S_OK; +} + +// Destroy texture resources. +void DestroyTextures() +{ + SAFE_RELEASE(gErrorSRV); + SAFE_RELEASE(gCompressedSRV); + SAFE_RELEASE(gUncompressedSRV); +} + +// This functions loads a texture and prepares it for compression. The compressor only works on texture +// dimensions that are divisible by 4. Textures that are not divisible by 4 are resized and padded with the edge values. +HRESULT LoadTexture(LPTSTR file) +{ + // Load the uncrompressed texture. + // The loadInfo structure disables mipmapping by setting MipLevels to 1. + D3DX11_IMAGE_LOAD_INFO loadInfo; + ZeroMemory(&loadInfo, sizeof(D3DX11_IMAGE_LOAD_INFO)); + loadInfo.Width = D3DX11_DEFAULT; + loadInfo.Height = D3DX11_DEFAULT; + loadInfo.Depth = D3DX11_DEFAULT; + loadInfo.FirstMipLevel = D3DX11_DEFAULT; + loadInfo.MipLevels = 1; + loadInfo.Usage = (D3D11_USAGE) D3DX11_DEFAULT; + loadInfo.BindFlags = D3D11_BIND_SHADER_RESOURCE; + loadInfo.CpuAccessFlags = D3DX11_DEFAULT; + loadInfo.MiscFlags = D3DX11_DEFAULT; + loadInfo.Format = DXGI_FORMAT_R8G8B8A8_UNORM_SRGB; + loadInfo.Filter = D3DX11_FILTER_POINT | D3DX11_FILTER_SRGB; + loadInfo.MipFilter = D3DX11_DEFAULT; + loadInfo.pSrcInfo = NULL; + HRESULT hr; + V_RETURN(D3DX11CreateShaderResourceViewFromFile(DXUTGetD3D11Device(), file, &loadInfo, NULL, &gUncompressedSRV, NULL)); + + // Pad the texture. + V_RETURN(PadTexture(&gUncompressedSRV)); + + // Query the texture description. 
+ ID3D11Texture2D* tex; + gUncompressedSRV->GetResource((ID3D11Resource**)&tex); + D3D11_TEXTURE2D_DESC texDesc; + tex->GetDesc(&texDesc); + SAFE_RELEASE(tex); + + // Update the UI's texture width and height. + gTexWidth = texDesc.Width; + gTexHeight = texDesc.Height; + + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Texture Size: %d x %d", gTexWidth, gTexHeight); + gSampleUI.GetStatic(IDC_SIZETEXT)->SetText(wstr); + // gSampleUI.SendEvent(IDC_SIZETEXT, true, gSampleUI.GetStatic(IDC_SIZETEXT)); + + UpdateBlockSlider(); + + return S_OK; +} + +void SetCompressionScheme(EInstructionSet instrSet, ECompressorType compType, EThreadMode threadMode) { + + bool foundMatch = false; + for(int i = 0; i < kNumCompressionSchemes; i++) { + bool match = true; + match = match && kCompressionSchemes[i].instrSet == instrSet; + match = match && kCompressionSchemes[i].type == compType; + match = match && kCompressionSchemes[i].threadMode == threadMode; + if(match) { + gCompressionScheme = &(kCompressionSchemes[i]); + foundMatch = true; + break; + } + } + + if(!foundMatch) { + OutputDebugString(L"ERROR: Did not find match for compression scheme, not changing.\n"); + } +} + +void UpdateCompressionModes() { + + CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_COMPRESSOR); + comboBox->RemoveAllItems(); + + // If we're updating the compression modes, then see + // what we currently have selected and keep everything else constant. 
+ EThreadMode currThreadMode = gCompressionScheme->threadMode; + EInstructionSet currInstrSet = gCompressionScheme->instrSet; + + bool added[kNumCompressorTypes]; + memset(added, 0, sizeof(added)); + + for(int i = 0; i < kNumCompressionSchemes; i++) { + + bool match = kCompressionSchemes[i].instrSet == currInstrSet; + match = match && kCompressionSchemes[i].threadMode == currThreadMode; + match = match && kCompressionSchemes[i].availabilityOverride; + + if(match) { + ECompressorType compType = kCompressionSchemes[i].type; + if(!added[compType]) { + comboBox->AddItem(kCompressorTypeStr[compType], (void*)(INT_PTR)compType); + added[compType] = true; + } + } + } + + comboBox->SetSelectedByData((void *)(INT_PTR)(gCompressionScheme->type)); +} + +void UpdateCompressionAlgorithms() { + + CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_SIMD); + comboBox->RemoveAllItems(); + + // If we're updating the compression algorithms, then see + // what we currently have selected and keep everything else constant. 
+ EThreadMode currThreadMode = gCompressionScheme->threadMode; + ECompressorType currType = gCompressionScheme->type; + + bool added[kNumInstructionSets]; + memset(added, 0, sizeof(added)); + + for(int i = 0; i < kNumCompressionSchemes; i++) { + + bool match = kCompressionSchemes[i].type == currType; + match = match && kCompressionSchemes[i].threadMode == currThreadMode; + match = match && kCompressionSchemes[i].availabilityOverride; + + if(match) { + EInstructionSet instrSet = kCompressionSchemes[i].instrSet; + if(!added[instrSet]) { + comboBox->AddItem(kInstructionSetStr[instrSet], (void*)(INT_PTR)instrSet); + added[instrSet] = true; + } + } + } + + comboBox->SetSelectedByData((void *)(INT_PTR)(gCompressionScheme->instrSet)); +} + +void UpdateThreadingMode() { + + CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_TBB); + comboBox->RemoveAllItems(); + + // If we're updating the compression algorithms, then see + // what we currently have selected and keep everything else constant. + EInstructionSet currInstrSet = gCompressionScheme->instrSet; + ECompressorType currType = gCompressionScheme->type; + + bool added[kNumThreadModes]; + memset(added, 0, sizeof(added)); + + for(int i = 0; i < kNumCompressionSchemes; i++) { + + bool match = kCompressionSchemes[i].type == currType; + match = match && kCompressionSchemes[i].instrSet == currInstrSet; + match = match && kCompressionSchemes[i].availabilityOverride; + + if(match) { + EThreadMode threadMode = kCompressionSchemes[i].threadMode; + if(!added[threadMode]) { + comboBox->AddItem(kThreadModeStr[threadMode], (void*)(INT_PTR)threadMode); + added[threadMode] = true; + } + } + } + + comboBox->SetSelectedByData((void *)(INT_PTR)(gCompressionScheme->threadMode)); +} + +void UpdateAllowedSettings() { + UpdateCompressionModes(); + UpdateCompressionAlgorithms(); + UpdateThreadingMode(); +} + +void UpdateBlockSlider() { + + int blockRows = gTexHeight / 4; + int blockCols = gTexWidth / 4; + if(gCompressionScheme->instrSet == 
eInstrSet_AVX2) { + blockCols /= 2; + } + + int numBlocks = blockRows * blockCols; + int blksPerProc = numBlocks / gNumProcessors; + + gSampleUI.GetSlider(IDC_BLOCKSPERTASK)->SetRange(1, blksPerProc); +} + +// Pad the texture to dimensions that are divisible by 4. +HRESULT PadTexture(ID3D11ShaderResourceView** textureSRV) +{ + // Query the texture description. + ID3D11Texture2D* tex; + (*textureSRV)->GetResource((ID3D11Resource**)&tex); + D3D11_TEXTURE2D_DESC texDesc; + tex->GetDesc(&texDesc); + + // Exit if the texture dimensions are divisible by 4. + if((texDesc.Width % 4 == 0) && (texDesc.Height % 4 == 0)) + { + SAFE_RELEASE(tex); + return S_OK; + } + + // Compute the size of the padded texture. + UINT padWidth = texDesc.Width / 4 * 4 + 4; + UINT padHeight = texDesc.Height / 4 * 4 + 4; + + // Create a buffer for the padded texels. + BYTE* padTexels = new BYTE[padWidth * padHeight * 4]; + + // Create a staging resource for the texture. + HRESULT hr; + ID3D11Device* device = DXUTGetD3D11Device(); + D3D11_TEXTURE2D_DESC stgTexDesc; + memcpy(&stgTexDesc, &texDesc, sizeof(D3D11_TEXTURE2D_DESC)); + stgTexDesc.Usage = D3D11_USAGE_STAGING; + stgTexDesc.BindFlags = 0; + stgTexDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + ID3D11Texture2D* stgTex; + V_RETURN(device->CreateTexture2D(&stgTexDesc, NULL, &stgTex)); + + // Copy the texture into the staging resource. + ID3D11DeviceContext* deviceContext = DXUTGetD3D11DeviceContext(); + deviceContext->CopyResource(stgTex, tex); + + // Map the staging resource. + D3D11_MAPPED_SUBRESOURCE texData; + V_RETURN(deviceContext->Map(stgTex, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ_WRITE, 0, &texData)); + + // Copy the beginning of each row. 
+ BYTE* texels = (BYTE*)texData.pData; + for(UINT row = 0; row < stgTexDesc.Height; row++) + { + UINT rowStart = row * texData.RowPitch; + UINT padRowStart = row * padWidth * 4; + memcpy(padTexels + padRowStart, texels + rowStart, stgTexDesc.Width * 4); + + // Pad the end of each row. + if(padWidth > stgTexDesc.Width) + { + BYTE* padVal = texels + rowStart + (stgTexDesc.Width - 1) * 4; + for(UINT padCol = stgTexDesc.Width; padCol < padWidth; padCol++) + { + UINT padColStart = padCol * 4; + memcpy(padTexels + padRowStart + padColStart, padVal, 4); + } + } + } + + // Pad the end of each column. + if(padHeight > stgTexDesc.Height) + { + UINT lastRow = (stgTexDesc.Height - 1); + UINT lastRowStart = lastRow * padWidth * 4; + BYTE* padVal = padTexels + lastRowStart; + for(UINT padRow = stgTexDesc.Height; padRow < padHeight; padRow++) + { + UINT padRowStart = padRow * padWidth * 4; + memcpy(padTexels + padRowStart, padVal, padWidth * 4); + } + } + + // Unmap the staging resources. + deviceContext->Unmap(stgTex, D3D11CalcSubresource(0, 0, 1)); + + // Create a padded texture. + D3D11_TEXTURE2D_DESC padTexDesc; + memcpy(&padTexDesc, &texDesc, sizeof(D3D11_TEXTURE2D_DESC)); + padTexDesc.Width = padWidth; + padTexDesc.Height = padHeight; + D3D11_SUBRESOURCE_DATA padTexData; + ZeroMemory(&padTexData, sizeof(D3D11_SUBRESOURCE_DATA)); + padTexData.pSysMem = padTexels; + padTexData.SysMemPitch = padWidth * sizeof(BYTE) * 4; + ID3D11Texture2D* padTex; + V_RETURN(device->CreateTexture2D(&padTexDesc, &padTexData, &padTex)); + + // Delete the padded texel buffer. + delete [] padTexels; + + // Release the shader resource view for the texture. + SAFE_RELEASE(*textureSRV); + + // Create a shader resource view for the padded texture. 
+ D3D11_SHADER_RESOURCE_VIEW_DESC padTexSRVDesc; + padTexSRVDesc.Format = padTexDesc.Format; + padTexSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + padTexSRVDesc.Texture2D.MipLevels = padTexDesc.MipLevels; + padTexSRVDesc.Texture2D.MostDetailedMip = padTexDesc.MipLevels - 1; + V_RETURN(device->CreateShaderResourceView(padTex, &padTexSRVDesc, textureSRV)); + + // Release resources. + SAFE_RELEASE(padTex); + SAFE_RELEASE(stgTex); + SAFE_RELEASE(tex); + + return S_OK; +} + +// Save a texture to a file. +HRESULT SaveTexture(ID3D11ShaderResourceView* textureSRV, LPTSTR file) +{ + // Get the texture resource. + ID3D11Resource* texRes; + textureSRV->GetResource(&texRes); + if(texRes == NULL) + { + return E_POINTER; + } + + // Save the texture to a file. + HRESULT hr; + V_RETURN(D3DX11SaveTextureToFile(DXUTGetD3D11DeviceContext(), texRes, D3DX11_IFF_DDS, file)); + + // Release the texture resources. + SAFE_RELEASE(texRes); + + return S_OK; +} + +// Compress a texture. +HRESULT CompressTexture(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView** compressedSRV) +{ + // Query the texture description of the uncompressed texture. + ID3D11Resource* uncompRes; + gUncompressedSRV->GetResource(&uncompRes); + D3D11_TEXTURE2D_DESC uncompTexDesc; + ((ID3D11Texture2D*)uncompRes)->GetDesc(&uncompTexDesc); + + // Create a 2D texture for the compressed texture. 
+ HRESULT hr; + ID3D11Texture2D* compTex; + D3D11_TEXTURE2D_DESC compTexDesc; + memcpy(&compTexDesc, &uncompTexDesc, sizeof(D3D11_TEXTURE2D_DESC)); + + switch(gCompressionScheme->type) { + default: + case eCompType_DXT1: + compTexDesc.Format = DXGI_FORMAT_BC1_UNORM_SRGB; + break; + case eCompType_DXT5: + compTexDesc.Format = DXGI_FORMAT_BC3_UNORM_SRGB; + break; + case eCompType_BC7: + compTexDesc.Format = DXGI_FORMAT_BC7_UNORM_SRGB; + break; + } + + ID3D11Device* device = DXUTGetD3D11Device(); + V_RETURN(device->CreateTexture2D(&compTexDesc, NULL, &compTex)); + + // Create a shader resource view for the compressed texture. + SAFE_RELEASE(*compressedSRV); + D3D11_SHADER_RESOURCE_VIEW_DESC compSRVDesc; + compSRVDesc.Format = compTexDesc.Format; + compSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + compSRVDesc.Texture2D.MipLevels = compTexDesc.MipLevels; + compSRVDesc.Texture2D.MostDetailedMip = compTexDesc.MipLevels - 1; + V_RETURN(device->CreateShaderResourceView(compTex, &compSRVDesc, compressedSRV)); + + // Create a staging resource for the compressed texture. + compTexDesc.Usage = D3D11_USAGE_STAGING; + compTexDesc.BindFlags = 0; + compTexDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + ID3D11Texture2D* compStgTex; + V_RETURN(device->CreateTexture2D(&compTexDesc, NULL, &compStgTex)); + + // Create a staging resource for the uncompressed texture. + uncompTexDesc.Usage = D3D11_USAGE_STAGING; + uncompTexDesc.BindFlags = 0; + uncompTexDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + ID3D11Texture2D* uncompStgTex; + V_RETURN(device->CreateTexture2D(&uncompTexDesc, NULL, &uncompStgTex)); + + // Copy the uncompressed texture into the staging resource. + ID3D11DeviceContext* deviceContext = DXUTGetD3D11DeviceContext(); + deviceContext->CopyResource(uncompStgTex, uncompRes); + + // Map the staging resources. 
+ D3D11_MAPPED_SUBRESOURCE uncompData; + V_RETURN(deviceContext->Map(uncompStgTex, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ_WRITE, 0, &uncompData)); + D3D11_MAPPED_SUBRESOURCE compData; + V_RETURN(deviceContext->Map(compStgTex, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ_WRITE, 0, &compData)); + + // Time the compression. + StopWatch stopWatch; + stopWatch.Start(); + + const int kNumCompressions = 1; + for(int cmpNum = 0; cmpNum < kNumCompressions; cmpNum++) { + + // Compress the uncompressed texels. + DXTC::CompressImageDXT((BYTE*)uncompData.pData, (BYTE*)compData.pData, uncompTexDesc.Width, uncompTexDesc.Height); + } + + // Update the compression time. + stopWatch.Stop(); + gCompTime = stopWatch.TimeInMilliseconds(); + gSampleUI.SendEvent(IDC_TIMETEXT, true, gSampleUI.GetStatic(IDC_TIMETEXT)); + + // Compute the compression rate. + INT numPixels = compTexDesc.Width * compTexDesc.Height * kNumCompressions; + gCompRate = (double)numPixels / stopWatch.TimeInSeconds() / 1000000.0; + gSampleUI.SendEvent(IDC_RATETEXT, true, gSampleUI.GetStatic(IDC_RATETEXT)); + stopWatch.Reset(); + + // Unmap the staging resources. + deviceContext->Unmap(compStgTex, D3D11CalcSubresource(0, 0, 1)); + deviceContext->Unmap(uncompStgTex, D3D11CalcSubresource(0, 0, 1)); + + // Copy the staging resourse into the compressed texture. + deviceContext->CopyResource(compTex, compStgTex); + + // Release resources. + SAFE_RELEASE(uncompStgTex); + SAFE_RELEASE(compStgTex); + SAFE_RELEASE(compTex); + SAFE_RELEASE(uncompRes); + + return S_OK; +} + +#define CHECK_WIN_THREAD_FUNC(x) \ + do { \ + if(NULL == (x)) { \ + wchar_t wstr[256]; \ + swprintf_s(wstr, L"Error detected from call %s at line %d of main.cpp", _T(#x), __LINE__); \ + ReportWinThreadError(wstr); \ + } \ + } \ + while(0) + +void ReportWinThreadError(const wchar_t *str) { + + // Retrieve the system error message for the last-error code. 
+ LPVOID lpMsgBuf; + LPVOID lpDisplayBuf; + DWORD dw = GetLastError(); + + FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + dw, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR) &lpMsgBuf, + 0, NULL ); + + // Display the error message. + + lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT, + (lstrlen((LPCTSTR) lpMsgBuf) + lstrlen((LPCTSTR)str) + 40) * sizeof(TCHAR)); + StringCchPrintf((LPTSTR)lpDisplayBuf, + LocalSize(lpDisplayBuf) / sizeof(TCHAR), + TEXT("%s failed with error %d: %s"), + str, dw, lpMsgBuf); + MessageBox(NULL, (LPCTSTR) lpDisplayBuf, TEXT("Error"), MB_OK); + + // Free error-handling buffer allocations. + + LocalFree(lpMsgBuf); + LocalFree(lpDisplayBuf); +} + +void InitWin32Threads() { + + // Already initialized? + if(gNumWinThreads > 0) { + return; + } + + SetLastError(0); + + gNumWinThreads = gNumProcessors; + if(gNumWinThreads >= MAXIMUM_WAIT_OBJECTS) + gNumWinThreads = MAXIMUM_WAIT_OBJECTS; + + // Create the synchronization events. + for(int i = 0; i < gNumWinThreads; i++) { + CHECK_WIN_THREAD_FUNC(gWinThreadWorkEvent[i] = CreateEvent(NULL, FALSE, FALSE, NULL)); + } + + CHECK_WIN_THREAD_FUNC(gWinThreadStartEvent = CreateEvent(NULL, TRUE, FALSE, NULL)); + CHECK_WIN_THREAD_FUNC(gWinThreadDoneEvent = CreateEvent(NULL, TRUE, FALSE, NULL)); + + // Create threads + for(int threadIdx = 0; threadIdx < gNumWinThreads; threadIdx++) { + gWinThreadData[threadIdx].state = eThreadState_WaitForData; + CHECK_WIN_THREAD_FUNC(hThreadArray[threadIdx] = CreateThread(NULL, 0, DXTC::CompressImageDXTWinThread, &gWinThreadData[threadIdx], 0, &dwThreadIdArray[threadIdx])); + } +} + +void DestroyThreads() { + + switch(gCompressionScheme->threadMode) { + case eThreadMode_TBB: + { + // Shutdown the TBB task manager. + gTaskMgr.Shutdown(); + } + break; + + case eThreadMode_Win32: + { + // Release all windows threads that may be active... 
+ for(int i=0; i < gNumWinThreads; i++) { + gWinThreadData[i].state = eThreadState_Done; + } + + // Send the event for the threads to start. + CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadDoneEvent)); + CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadStartEvent)); + + // Wait for all the threads to finish.... + DWORD dwWaitRet = WaitForMultipleObjects(gNumWinThreads, hThreadArray, TRUE, INFINITE); + if(WAIT_FAILED == dwWaitRet) + ReportWinThreadError(L"DestroyThreads() -- WaitForMultipleObjects"); + + // !HACK! This doesn't actually do anything. There is either a bug in the + // Intel compiler or the windows run-time that causes the threads to not + // be cleaned up properly if the following two lines of code are not present. + // Since we're passing INFINITE to WaitForMultipleObjects, that function will + // never time out and per-microsoft spec, should never give this return value... + // Even with these lines, the bug does not consistently disappear unless you + // clean and rebuild. Heigenbug? + // + // If we compile with MSVC, then the following two lines are not necessary. + else if(WAIT_TIMEOUT == dwWaitRet) + OutputDebugString(L"DestroyThreads() -- WaitForMultipleObjects -- TIMEOUT"); + + // Reset the start event + CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadStartEvent)); + CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadDoneEvent)); + + // Close all thread handles. + for(int i=0; i < gNumWinThreads; i++) { + CHECK_WIN_THREAD_FUNC(CloseHandle(hThreadArray[i])); + } + + for(int i =0; i < kMaxWinThreads; i++ ){ + hThreadArray[i] = NULL; + } + + // Close all event handles... 
+ CHECK_WIN_THREAD_FUNC(CloseHandle(gWinThreadDoneEvent)); + gWinThreadDoneEvent = NULL; + + CHECK_WIN_THREAD_FUNC(CloseHandle(gWinThreadStartEvent)); + gWinThreadStartEvent = NULL; + + for(int i = 0; i < gNumWinThreads; i++) { + CHECK_WIN_THREAD_FUNC(CloseHandle(gWinThreadWorkEvent[i])); + } + + for(int i = 0; i < kMaxWinThreads; i++) { + gWinThreadWorkEvent[i] = NULL; + } + + gNumWinThreads = 0; + } + break; + + case eThreadMode_None: + // Do nothing. + break; + } +} + +static inline DXGI_FORMAT GetNonSRGBFormat(DXGI_FORMAT f) { + switch(f) { + case DXGI_FORMAT_BC1_UNORM_SRGB: return DXGI_FORMAT_BC1_UNORM; + case DXGI_FORMAT_BC3_UNORM_SRGB: return DXGI_FORMAT_BC3_UNORM; + case DXGI_FORMAT_BC7_UNORM_SRGB: return DXGI_FORMAT_BC7_UNORM; + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: return DXGI_FORMAT_R8G8B8A8_UNORM; + default: assert(!"Unknown format!"); + } + return DXGI_FORMAT_R8G8B8A8_UNORM; +} + +// Compute an "error" texture that represents the absolute difference in color between an +// uncompressed texture and a compressed texture. +HRESULT ComputeError(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView* compressedSRV, ID3D11ShaderResourceView** errorSRV) +{ + HRESULT hr; + + // Query the texture description of the uncompressed texture. + ID3D11Resource* uncompRes; + gUncompressedSRV->GetResource(&uncompRes); + D3D11_TEXTURE2D_DESC uncompTexDesc; + ((ID3D11Texture2D*)uncompRes)->GetDesc(&uncompTexDesc); + + // Query the texture description of the uncompressed texture. + ID3D11Resource* compRes; + gCompressedSRV->GetResource(&compRes); + D3D11_TEXTURE2D_DESC compTexDesc; + ((ID3D11Texture2D*)compRes)->GetDesc(&compTexDesc); + + // Create a 2D resource without gamma correction for the two textures. 
+ compTexDesc.Format = GetNonSRGBFormat(compTexDesc.Format); + uncompTexDesc.Format = GetNonSRGBFormat(uncompTexDesc.Format); + + ID3D11Device* device = DXUTGetD3D11Device(); + + ID3D11Texture2D* uncompTex; + device->CreateTexture2D(&uncompTexDesc, NULL, &uncompTex); + + ID3D11Texture2D* compTex; + device->CreateTexture2D(&compTexDesc, NULL, &compTex); + + // Create a shader resource view for the two textures. + D3D11_SHADER_RESOURCE_VIEW_DESC compSRVDesc; + compSRVDesc.Format = compTexDesc.Format; + compSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + compSRVDesc.Texture2D.MipLevels = compTexDesc.MipLevels; + compSRVDesc.Texture2D.MostDetailedMip = compTexDesc.MipLevels - 1; + ID3D11ShaderResourceView *compSRV; + V_RETURN(device->CreateShaderResourceView(compTex, &compSRVDesc, &compSRV)); + + D3D11_SHADER_RESOURCE_VIEW_DESC uncompSRVDesc; + uncompSRVDesc.Format = uncompTexDesc.Format; + uncompSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + uncompSRVDesc.Texture2D.MipLevels = uncompTexDesc.MipLevels; + uncompSRVDesc.Texture2D.MostDetailedMip = uncompTexDesc.MipLevels - 1; + ID3D11ShaderResourceView *uncompSRV; + V_RETURN(device->CreateShaderResourceView(uncompTex, &uncompSRVDesc, &uncompSRV)); + + // Create a 2D texture for the error texture. + ID3D11Texture2D* errorTex; + D3D11_TEXTURE2D_DESC errorTexDesc; + memcpy(&errorTexDesc, &uncompTexDesc, sizeof(D3D11_TEXTURE2D_DESC)); + errorTexDesc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE; + V_RETURN(device->CreateTexture2D(&errorTexDesc, NULL, &errorTex)); + + // Create a render target view for the error texture. + D3D11_RENDER_TARGET_VIEW_DESC errorRTVDesc; + errorRTVDesc.Format = errorTexDesc.Format; + errorRTVDesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; + errorRTVDesc.Texture2D.MipSlice = 0; + ID3D11RenderTargetView* errorRTV; + V_RETURN(device->CreateRenderTargetView(errorTex, &errorRTVDesc, &errorRTV)); + + // Create a shader resource view for the error texture. 
+ D3D11_SHADER_RESOURCE_VIEW_DESC errorSRVDesc; + errorSRVDesc.Format = errorTexDesc.Format; + errorSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + errorSRVDesc.Texture2D.MipLevels = errorTexDesc.MipLevels; + errorSRVDesc.Texture2D.MostDetailedMip = errorTexDesc.MipLevels - 1; + V_RETURN(device->CreateShaderResourceView(errorTex, &errorSRVDesc, errorSRV)); + + // Create a query for the GPU operations... + D3D11_QUERY_DESC GPUQueryDesc; + GPUQueryDesc.Query = D3D11_QUERY_EVENT; + GPUQueryDesc.MiscFlags = 0; + +#ifdef _DEBUG + D3D11_QUERY_DESC OcclusionQueryDesc; + OcclusionQueryDesc.Query = D3D11_QUERY_OCCLUSION; + OcclusionQueryDesc.MiscFlags = 0; + + D3D11_QUERY_DESC StatsQueryDesc; + StatsQueryDesc.Query = D3D11_QUERY_PIPELINE_STATISTICS; + StatsQueryDesc.MiscFlags = 0; +#endif + + ID3D11Query *GPUQuery; + V_RETURN(device->CreateQuery(&GPUQueryDesc, &GPUQuery)); + + ID3D11DeviceContext* deviceContext = DXUTGetD3D11DeviceContext(); + + deviceContext->CopyResource(compTex, compRes); + deviceContext->CopyResource(uncompTex, uncompRes); + +#ifdef _DEBUG + ID3D11Query *OcclusionQuery, *StatsQuery; + V_RETURN(device->CreateQuery(&OcclusionQueryDesc, &OcclusionQuery)); + V_RETURN(device->CreateQuery(&StatsQueryDesc, &StatsQuery)); + + deviceContext->Begin(OcclusionQuery); + deviceContext->Begin(StatsQuery); +#endif + + // Set the viewport to a 1:1 mapping of pixels to texels. + D3D11_VIEWPORT viewport; + viewport.Width = (FLOAT)errorTexDesc.Width; + viewport.Height = (FLOAT)errorTexDesc.Height; + viewport.MinDepth = 0; + viewport.MaxDepth = 1; + viewport.TopLeftX = 0; + viewport.TopLeftY = 0; + deviceContext->RSSetViewports(1, &viewport); + + // Bind the render target view of the error texture. + ID3D11RenderTargetView* RTV[1] = { errorRTV }; + deviceContext->OMSetRenderTargets(1, RTV, NULL); + + // Clear the render target. + FLOAT color[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; + deviceContext->ClearRenderTargetView(errorRTV, color); + + // Set the input layout. 
+ deviceContext->IASetInputLayout(gVertexLayout); + + // Set vertex buffer + UINT stride = sizeof(Vertex); + UINT offset = 0; + deviceContext->IASetVertexBuffers(0, 1, &gQuadVB, &stride, &offset); + + // Set the primitive topology + deviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + + // Set the shaders + deviceContext->VSSetShader(gVertexShader, NULL, 0); + deviceContext->PSSetShader(gRenderTexturePS, NULL, 0); + + // Set the texture sampler. + deviceContext->PSSetSamplers(0, 1, &gSamPoint); + + // Bind the textures. + ID3D11ShaderResourceView* SRV[2] = { compSRV, uncompSRV}; + deviceContext->PSSetShaderResources(0, 2, SRV); + + // Store the depth/stencil state. + StoreDepthStencilState(); + + // Disable depth testing. + V_RETURN(DisableDepthTest()); + + // Render a quad. + deviceContext->Draw(6, 0); + + // Restore the depth/stencil state. + RestoreDepthStencilState(); + + // Reset the render target. + RTV[0] = DXUTGetD3D11RenderTargetView(); + deviceContext->OMSetRenderTargets(1, RTV, DXUTGetD3D11DepthStencilView()); + + // Reset the viewport. + viewport.Width = (FLOAT)DXUTGetDXGIBackBufferSurfaceDesc()->Width; + viewport.Height = (FLOAT)DXUTGetDXGIBackBufferSurfaceDesc()->Height; + deviceContext->RSSetViewports(1, &viewport); + + deviceContext->End(GPUQuery); +#ifdef _DEBUG + deviceContext->End(OcclusionQuery); + deviceContext->End(StatsQuery); +#endif + + BOOL finishedGPU = false; + + // If we do not have a d3d 11 context, we will still hit this line and try to + // finish using the GPU. If this happens this enters an infinite loop. 
+ int infLoopPrevention = 0; + while(!finishedGPU && ++infLoopPrevention < 10000) { + HRESULT ret; + V_RETURN(ret = deviceContext->GetData(GPUQuery, &finishedGPU, sizeof(BOOL), 0)); + if(ret != S_OK) + Sleep(1); + } + +#ifdef _DEBUG + UINT64 nPixelsWritten = 0; + deviceContext->GetData(OcclusionQuery, (void *)&nPixelsWritten, sizeof(UINT64), 0); + + D3D11_QUERY_DATA_PIPELINE_STATISTICS stats; + deviceContext->GetData(StatsQuery, (void *)&stats, sizeof(D3D11_QUERY_DATA_PIPELINE_STATISTICS), 0); + + TCHAR nPixelsWrittenMsg[256]; + _stprintf(nPixelsWrittenMsg, _T("Pixels rendered during error computation: %d\n"), nPixelsWritten); + OutputDebugString(nPixelsWrittenMsg); +#endif + + // Create a copy of the error texture that is accessible by the CPU + ID3D11Texture2D* errorTexCopy; + D3D11_TEXTURE2D_DESC errorTexCopyDesc; + memcpy(&errorTexCopyDesc, &uncompTexDesc, sizeof(D3D11_TEXTURE2D_DESC)); + errorTexCopyDesc.Usage = D3D11_USAGE_STAGING; + errorTexCopyDesc.BindFlags = 0; + errorTexCopyDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + V_RETURN(device->CreateTexture2D(&errorTexCopyDesc, NULL, &errorTexCopy)); + + // Copy the error texture into the copy.... + deviceContext->CopyResource(errorTexCopy, errorTex); + + // Map the staging resource. + D3D11_MAPPED_SUBRESOURCE errorData; + V_RETURN(deviceContext->Map(errorTexCopy, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ, 0, &errorData)); + + // Calculate PSNR + ComputeRMSE((const BYTE *)(errorData.pData), errorTexCopyDesc.Width, errorTexCopyDesc.Height); + gSampleUI.SendEvent(IDC_RMSETEXT, true, gSampleUI.GetStatic(IDC_RMSETEXT)); + + // Unmap the staging resources. + deviceContext->Unmap(errorTexCopy, D3D11CalcSubresource(0, 0, 1)); + + // Release resources. 
+ SAFE_RELEASE(errorRTV); + SAFE_RELEASE(errorTex); + SAFE_RELEASE(errorTexCopy); + SAFE_RELEASE(uncompRes); + SAFE_RELEASE(compRes); + SAFE_RELEASE(GPUQuery); + +#ifdef _DEBUG + SAFE_RELEASE(OcclusionQuery); + SAFE_RELEASE(StatsQuery); +#endif + + SAFE_RELEASE(compSRV); + SAFE_RELEASE(uncompSRV); + SAFE_RELEASE(compTex); + SAFE_RELEASE(uncompTex); + + return S_OK; +} + +// Recompresses the already loaded texture and recomputes the error. +HRESULT RecompressTexture() +{ + // Destroy any previously created textures. + SAFE_RELEASE(gErrorSRV); + SAFE_RELEASE(gCompressedSRV); + + // Compress the texture. + HRESULT hr; + V_RETURN(CompressTexture(gUncompressedSRV, &gCompressedSRV)); + + // Compute the error in the compressed texture. + V_RETURN(ComputeError(gUncompressedSRV, gCompressedSRV, &gErrorSRV)); + + return S_OK; +} + +// Store the depth-stencil state. +void StoreDepthStencilState() +{ + DXUTGetD3D11DeviceContext()->OMGetDepthStencilState(&gDepthStencilState, &gStencilReference); +} + +// Restore the depth-stencil state. +void RestoreDepthStencilState() +{ + DXUTGetD3D11DeviceContext()->OMSetDepthStencilState(gDepthStencilState, gStencilReference); +} + +// Disable depth testing. 
+HRESULT DisableDepthTest() +{ + D3D11_DEPTH_STENCIL_DESC depStenDesc; + ZeroMemory(&depStenDesc, sizeof(D3D11_DEPTH_STENCIL_DESC)); + depStenDesc.DepthEnable = FALSE; + depStenDesc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL; + depStenDesc.DepthFunc = D3D11_COMPARISON_LESS; + depStenDesc.StencilEnable = FALSE; + depStenDesc.StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK; + depStenDesc.StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK; + depStenDesc.FrontFace.StencilFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.FrontFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.FrontFace.StencilPassOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.FrontFace.StencilFunc = D3D11_COMPARISON_ALWAYS; + depStenDesc.BackFace.StencilFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.BackFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.BackFace.StencilPassOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.BackFace.StencilFunc = D3D11_COMPARISON_ALWAYS; + ID3D11DepthStencilState* depStenState; + HRESULT hr; + V_RETURN(DXUTGetD3D11Device()->CreateDepthStencilState(&depStenDesc, &depStenState)); + + DXUTGetD3D11DeviceContext()->OMSetDepthStencilState(depStenState, 0); + + SAFE_RELEASE(depStenState); + + return S_OK; +} + +void ComputeRMSE(const BYTE *errorData, const INT width, const INT height) { + + const float *w = BC7C::GetErrorMetric(); + + const double wr = w[0]; + const double wg = w[1]; + const double wb = w[2]; + + double MSE = 0.0; + for(int i = 0; i < width; i++) { + for(int j = 0; j < height; j++) { + const INT pixel = ((const INT *)errorData)[j * width + i]; + + double dr = double(pixel & 0xFF) * wr; + double dg = double((pixel >> 8) & 0xFF) * wg; + double db = double((pixel >> 16) & 0xFF) * wb; + + const double pixelMSE = (double(dr) * double(dr)) + (double(dg) * double(dg)) + (double(db) * double(db)); + MSE += pixelMSE; + } + } + + MSE /= (double(width) * double(height)); +#ifdef REPORT_RMSE + gError = sqrt(MSE); +#else + double MAXI = (255.0 * wr) * (255.0 * wr) 
+ (255.0 * wg) * (255.0 * wg) + (255.0 * wb) * (255.0 * wb); + gError= 10 * log10(MAXI/MSE); +#endif +} + +namespace DXTC +{ + VOID CompressImageDXT(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) { + + // If we aren't multi-cored, then just run everything serially. + if(gNumProcessors <= 1) { + CompressImageDXTNoThread(inBuf, outBuf, width, height); + return; + } + + switch(gCompressionScheme->threadMode) { + case eThreadMode_None: + CompressImageDXTNoThread(inBuf, outBuf, width, height); + break; + case eThreadMode_TBB: + CompressImageDXTTBB(inBuf, outBuf, width, height); + break; + case eThreadMode_Win32: + CompressImageDXTWIN(inBuf, outBuf, width, height); + break; + } + } + + CompressionFunc GetCompressionFunc() { + switch(gCompressionScheme->instrSet) + { + case eInstrSet_SSE: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: return DXTC::CompressImageDXT1SSE2; + case eCompType_DXT5: return DXTC::CompressImageDXT5SSE2; + case eCompType_BC7: return BC7C::CompressImageBC7SIMD; + } + } + break; + + case eInstrSet_Scalar: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: return DXTC::CompressImageDXT1; + case eCompType_DXT5: return DXTC::CompressImageDXT5; + case eCompType_BC7: return BC7C::CompressImageBC7; + } + } + break; + +#ifdef ENABLE_AVX2 + case eInstrSet_AVX2: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: return DXTC::CompressImageDXT1AVX2; + case eCompType_DXT5: return DXTC::CompressImageDXT5AVX2; + } + } +#endif + } + return NULL; + } + + void CompressImageDXTNoThread(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) { + + CompressionFunc cmpFunc = GetCompressionFunc(); + + if(cmpFunc == NULL) { + OutputDebugString(L"DXTC::CompressImageDXTNoThread -- Compression Scheme not implemented!\n"); + return; + } + + // Do the compression. + (*cmpFunc)(inBuf, outBuf, width, height); + } + + // Use the TBB task manager to compress an image with DXT compression. 
+ VOID CompressImageDXTTBB(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) + { + // Initialize the data. + DXTTaskData data; + data.inBuf = inBuf; + data.outBuf = outBuf; + data.width = width; + data.height = height; + data.numBlocks = width * height / 16; + if(gCompressionScheme->instrSet == eInstrSet_AVX2) { + data.numBlocks = width * height / 32; + } + data.kBlocksPerTask = gBlocksPerTask; + + // Compute the task count. + UINT taskCount = (UINT)ceil((float)data.numBlocks / gBlocksPerTask); + + // Create the task set. + TASKSETFUNC taskFunc = NULL; + switch(gCompressionScheme->instrSet) + { + case eInstrSet_SSE: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: taskFunc = DXTC::CompressImageDXT1SSE2Task; break; + case eCompType_DXT5: taskFunc = DXTC::CompressImageDXT5SSE2Task; break; + } + } + break; + + case eInstrSet_Scalar: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: taskFunc = DXTC::CompressImageDXT1Task; break; + case eCompType_DXT5: taskFunc = DXTC::CompressImageDXT5Task; break; + } + } + break; + +#ifdef ENABLE_AVX2 + case eInstrSet_AVX2: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: taskFunc = DXTC::CompressImageDXT1AVX2Task; break; + case eCompType_DXT5: taskFunc = DXTC::CompressImageDXT5AVX2Task; break; + } + } + break; +#endif + } + + TASKSETHANDLE taskSet; + gTaskMgr.CreateTaskSet(taskFunc, &data, taskCount, NULL, 0, "Fast Texture Compression", &taskSet); + if(taskSet == TASKSETHANDLE_INVALID) + { + return; + } + + // Wait for the task set. + gTaskMgr.WaitForSet(taskSet); + + // Release the task set. 
+ gTaskMgr.ReleaseHandle(taskSet); + taskSet = TASKSETHANDLE_INVALID; + } + + int GetBlocksPerLoop() { + if(gCompressionScheme->instrSet == eInstrSet_AVX2) + return 2; + return 1; + } + + int GetBytesPerBlock() { + switch(gCompressionScheme->type) { + default: + case eCompType_DXT1: + return 8; + + case eCompType_DXT5: + case eCompType_BC7: + return 16; + } + } + + VOID CompressImageDXTWIN(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) { + + const int numThreads = gNumWinThreads; + const int blocksPerLoop = GetBlocksPerLoop(); + const int bytesPerBlock = GetBytesPerBlock(); + + // We want to split the data evenly among all threads. + const int kNumPixels = width * height; + const int kNumBlocks = kNumPixels >> (3 + blocksPerLoop); + const int kBlocksPerRow = width >> (1 + blocksPerLoop); + + const int kBlocksPerThread = kNumBlocks / numThreads; + const int kBlocksPerColumn = height >> 2; + const int kBlockRowsPerThread = kBlocksPerThread / kBlocksPerRow; + const int kBlockColsPerThread = kBlocksPerThread % kBlocksPerRow; + const int kOffsetPerThread = kBlockRowsPerThread * width * 4 * 4 + kBlockColsPerThread * 4 * 4 * (blocksPerLoop); + const int kHeightPerThread = (blocksPerLoop * 16 * kBlocksPerThread) / width; + + CompressionFunc cmpFunc = GetCompressionFunc(); + if(cmpFunc == NULL) { + OutputDebugString(L"DXTC::CompressImageDXTNoThread -- Compression Scheme not implemented!\n"); + return; + } + + // Load the threads. + for(int threadIdx = 0; threadIdx < numThreads; threadIdx++) { + + WinThreadData *data = &gWinThreadData[threadIdx]; + data->inBuf = inBuf + (threadIdx * kOffsetPerThread); + data->outBuf = outBuf + (threadIdx * kBlocksPerThread * blocksPerLoop * bytesPerBlock); + data->width = width; + data->height = kHeightPerThread; + data->cmpFunc = cmpFunc; + data->state = eThreadState_DataLoaded; + data->threadIdx = threadIdx; + } + + // Send the event for the threads to start. 
        // Kick the workers: clear the "done" gate, then raise the start
        // event that every worker thread is blocked on.
        CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadDoneEvent));
        CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadStartEvent));

        // Wait for all the threads to finish
        if(WAIT_FAILED == WaitForMultipleObjects(numThreads, gWinThreadWorkEvent, TRUE, INFINITE))
            ReportWinThreadError(TEXT("CompressImageDXTWIN -- WaitForMultipleObjects"));

        // Reset the start event
        CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadStartEvent));
        CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadDoneEvent));
    }

    // Worker-thread entry point for the Win32 threading path. Each thread
    // waits on the shared start event, compresses the slice described by its
    // WinThreadData, signals its per-thread work event, then blocks on the
    // done gate until the main thread has reset the start event.
    // NOTE(review): data->state is read and written by both the main thread
    // and this worker without explicit synchronization -- confirm the
    // event handshake makes those accesses safe.
    DWORD WINAPI CompressImageDXTWinThread( LPVOID lpParam ) {
        WinThreadData *data = (WinThreadData *)lpParam;

        // Loop until the main thread flips our state to eThreadState_Done.
        while(data->state != eThreadState_Done) {

            if(WAIT_FAILED == WaitForSingleObject(gWinThreadStartEvent, INFINITE))
                ReportWinThreadError(TEXT("CompressImageDXTWinThread -- WaitForSingleObject"));

            // Re-check after waking: the start event may have been raised to
            // shut the pool down rather than to submit work.
            if(data->state == eThreadState_Done)
                break;

            data->state = eThreadState_Running;
            // Compress this thread's slice of the image.
            (*(data->cmpFunc))(data->inBuf, data->outBuf, data->width, data->height);

            data->state = eThreadState_WaitForData;

            // Signal our work-done event and atomically wait on the done
            // gate so we cannot loop back around before the main thread has
            // reset the start event.
            HANDLE workEvent = gWinThreadWorkEvent[data->threadIdx];
            if(WAIT_FAILED == SignalObjectAndWait(workEvent, gWinThreadDoneEvent, INFINITE, FALSE))
                ReportWinThreadError(TEXT("CompressImageDXTWinThread -- SignalObjectAndWait"));
        }

        return 0;
    }
}
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c296081
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,4 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+PROJECT(TexC)
+
+ADD_SUBDIRECTORY(BPTCEncoder)
diff --git a/QtGUI/CMakeLists.txt b/QtGUI/CMakeLists.txt
new file mode 100644
index 0000000..e69de29