commit efdca4b5bb59afb5d1a6d728f02684001044ad1d Author: Pavel Krajcevski Date: Fri Aug 24 15:56:45 2012 -0400 Initial commit with a few modifications diff --git a/BPTCEncoder/CMakeLists.txt b/BPTCEncoder/CMakeLists.txt new file mode 100644 index 0000000..c053b96 --- /dev/null +++ b/BPTCEncoder/CMakeLists.txt @@ -0,0 +1,57 @@ +INCLUDE_DIRECTORIES(${TexC_SOURCE_DIR}/BPTCEncoder/include) + +INCLUDE(CheckCXXSourceCompiles) + +SET(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) +IF(CMAKE_COMPILER_IS_GNUCC) + SET(CMAKE_REQUIRED_FLAGS -msse4.1) + CHECK_CXX_SOURCE_COMPILES("config/testsse4.1.cpp" HAS_SSE_41) + + IF(HAS_SSE_41) + SET(CMAKE_REQUIRED_FLAGS -msse4.2) + CHECK_CXX_SOURCE_COMPILES("config/testsse4.2.cpp" HAS_SSE_POPCNT) + ENDIF(HAS_SSE_41) + +ELSEIF(MSVC) +ENDIF() +SET(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + +CONFIGURE_FILE( + "config/BC7Config.h.in" + "src/BC7Config.h" +) + +IF(CMAKE_COMPILER_IS_GNUCC) + ADD_DEFINITIONS(-fasm-blocks) +ENDIF(CMAKE_COMPILER_IS_GNUCC) + +SET( HEADERS + src/BC7CompressionMode.h + src/BC7IntTypes.h + src/BitStream.h + src/RGBAEndpoints.h +) + +SET( SOURCES + src/BC7Compressor.cpp + src/RGBAEndpoints.cpp +) + +IF( HAS_SSE_41 ) + SET( HEADERS + ${HEADERS} + src/RGBAEndpointsSIMD.h + src/BC7CompressionModeSIMD.h + ) + + SET( SOURCES + ${SOURCES} + src/BC7CompressorSIMD.cpp + src/RGBAEndpointsSIMD.cpp + ) +ENDIF( HAS_SSE_41 ) + +ADD_LIBRARY( BPTCEncoder + ${SOURCES} + ${SIMD_SOURCES} +) diff --git a/BPTCEncoder/config/BC7Config.h.in b/BPTCEncoder/config/BC7Config.h.in new file mode 100644 index 0000000..158812d --- /dev/null +++ b/BPTCEncoder/config/BC7Config.h.in @@ -0,0 +1,8 @@ +// Copyright (c) 2012 Pavel Krajcevski +// All Rights Reserved + +// BC7Config.h.in -- This file contains variables that are introduced +// explicitly by the CMake build process. + +// Do we have the proper popcnt instruction defined? 
+#define HAS_SSE_POPCNT @HAS_SSE_POPCNT@ diff --git a/BPTCEncoder/config/testsse4.1.cpp b/BPTCEncoder/config/testsse4.1.cpp new file mode 100644 index 0000000..2792820 --- /dev/null +++ b/BPTCEncoder/config/testsse4.1.cpp @@ -0,0 +1,10 @@ +#include <smmintrin.h> + +int main() { + const __m128 fv = _mm_set1_ps(1.0f); + const __m128 fv2 = _mm_set1_ps(2.0f); + + const __m128 ans = _mm_blend_ps(fv, fv2, 2); + + return ((int *)(&ans))[0]; +} diff --git a/BPTCEncoder/include/BC7Compressor.h b/BPTCEncoder/include/BC7Compressor.h new file mode 100755 index 0000000..22eb2ce --- /dev/null +++ b/BPTCEncoder/include/BC7Compressor.h @@ -0,0 +1,61 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +namespace BC7C +{ + // This is the error metric that is applied to our error measurement algorithm + // in order to bias calculation towards results that are more in-line with + // how the Human Visual System works. Uniform error means that each color + // channel is treated equally. 
For a while, the widely accepted non-uniform metric + // has been to give red 30%, green 59% and blue 11% weight when computing the error + // between two pixels. + enum ErrorMetric + { + eErrorMetric_Uniform, // Treats r, g, and b channels equally + eErrorMetric_Nonuniform, // { 0.3, 0.59, 0.11 } + + kNumErrorMetrics + }; + + // Sets the error metric to be the one specified. + void SetErrorMetric(ErrorMetric e); + + // Retrieves a float4 pointer for the r, g, b, a weights for each color channel, in + // that order, based on the current error metric. + const float *GetErrorMetric(); + + // Returns the enumeration for the current error metric. + ErrorMetric GetErrorMetricEnum(); + + // Sets the number of steps that we use to perform simulated annealing. In general, a + // larger number produces better results. The default is set to 50. This metric works + // on a logarithmic scale -- twice the value will double the compute time, but only + // decrease the error by two times a factor. + void SetQualityLevel(int q); + int GetQualityLevel(); + + // Compress the image given as RGBA data to BC7 format. Width and Height are the dimensions of + // the image in pixels. + void CompressImageBC7(const unsigned char *inBuf, unsigned char *outBuf, int width, int height); + + // Compress the image given as RGBA data to BC7 format using an algorithm optimized for SIMD + // enabled platforms. Width and Height are the dimensions of the image in pixels. + void CompressImageBC7SIMD(const unsigned char* inBuf, unsigned char* outBuf, int width, int height); + + // Decompress the image given as BC7 data to R8G8B8A8 format. Width and Height are the dimensions of the image in pixels. 
+ void DecompressImageBC7SIMD(const unsigned char* inBuf, unsigned char* outBuf, int width, int height); +} diff --git a/BPTCEncoder/src/BC7CompressionMode.h b/BPTCEncoder/src/BC7CompressionMode.h new file mode 100755 index 0000000..94c53e8 --- /dev/null +++ b/BPTCEncoder/src/BC7CompressionMode.h @@ -0,0 +1,191 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __BC7_COMPRESSIONMODE_SIMD_H__ +#define __BC7_COMPRESSIONMODE_SIMD_H__ + +#include "RGBAEndpoints.h" + +// Forward Declarations +class BitStream; +const int kMaxEndpoints = 3; + +static const int kPBits[4][2] = { + { 0, 0 }, + { 0, 1 }, + { 1, 0 }, + { 1, 1 } +}; + +// Abstract class that outlines all of the different settings for BC7 compression modes +// Note that at the moment, we only support modes 0-3, so we don't deal with alpha channels. 
+class BC7CompressionMode { +public: + + static const int kMaxNumSubsets = 3; + static const int kNumModes = 8; + + explicit BC7CompressionMode(int mode, bool opaque = true) : m_IsOpaque(opaque), m_Attributes(&(kModeAttributes[mode])), m_RotateMode(0), m_IndexMode(0) { } + ~BC7CompressionMode() { } + + static int NumUses[8]; + static void ResetNumUses() { memset(NumUses, 0, sizeof(NumUses)); } + double Compress(BitStream &stream, const int shapeIdx, const RGBACluster *clusters); + + // This switch controls the quality of the simulated annealing optimizer. We will not make + // more than this many steps regardless of how bad the error is. Higher values will produce + // better quality results but will run slower. Default is 50. + static int MaxAnnealingIterations; // This is a setting + static const int kMaxAnnealingIterations = 256; // This is a limit + + enum EPBitType { + ePBitType_Shared, + ePBitType_NotShared, + ePBitType_None + }; + + static struct Attributes { + int modeNumber; + int numPartitionBits; + int numSubsets; + int numBitsPerIndex; + int numBitsPerAlpha; + int colorChannelPrecision; + int alphaChannelPrecision; + bool hasRotation; + bool hasIdxMode; + EPBitType pbitType; + } kModeAttributes[kNumModes]; + + static const Attributes *GetAttributesForMode(int mode) { + if(mode < 0 || mode >= 8) return NULL; + return &kModeAttributes[mode]; + } + +private: + + const Attributes *const m_Attributes; + + int m_RotateMode; + int m_IndexMode; + + void SetIndexMode(int mode) { m_IndexMode = mode; } + void SetRotationMode(int mode) { m_RotateMode = mode; } + + int GetRotationMode() const { return m_Attributes->hasRotation? 
m_RotateMode : 0; } + + int GetModeNumber() const { return m_Attributes->modeNumber; } + int GetNumberOfPartitionBits() const { return m_Attributes->numPartitionBits; } + int GetNumberOfSubsets() const { return m_Attributes->numSubsets; } + + int GetNumberOfBitsPerIndex(int indexMode = -1) const { + if(indexMode < 0) indexMode = m_IndexMode; + if(indexMode == 0) + return m_Attributes->numBitsPerIndex; + else + return m_Attributes->numBitsPerAlpha; + } + + int GetNumberOfBitsPerAlpha(int indexMode = -1) const { + if(indexMode < 0) indexMode = m_IndexMode; + if(indexMode == 0) + return m_Attributes->numBitsPerAlpha; + else + return m_Attributes->numBitsPerIndex; + } + + // If we handle alpha separately, then we will consider the alpha channel + // to be not used whenever we do any calculations... + int GetAlphaChannelPrecision() const { + if(m_Attributes->hasRotation) return 0; + else return m_Attributes->alphaChannelPrecision; + } + + RGBAVector GetErrorMetric() const { + const float *w = BC7C::GetErrorMetric(); + switch(GetRotationMode()) { + default: + case 0: return RGBAVector(w[0], w[1], w[2], w[3]); + case 1: return RGBAVector(w[3], w[1], w[2], w[0]); + case 2: return RGBAVector(w[0], w[3], w[2], w[1]); + case 3: return RGBAVector(w[0], w[1], w[3], w[2]); + } + } + + EPBitType GetPBitType() const { return m_Attributes->pbitType; } + + unsigned int GetQuantizationMask() const { + const int maskSeed = 0x80000000; + return ( + (maskSeed >> (24 + m_Attributes->colorChannelPrecision - 1) & 0xFF) | + (maskSeed >> (16 + m_Attributes->colorChannelPrecision - 1) & 0xFF00) | + (maskSeed >> (8 + m_Attributes->colorChannelPrecision - 1) & 0xFF0000) | + (maskSeed >> (GetAlphaChannelPrecision() - 1) & 0xFF000000) + ); + } + + int GetNumPbitCombos() const { + switch(GetPBitType()) { + case ePBitType_Shared: return 2; + case ePBitType_NotShared: return 4; + default: + case ePBitType_None: return 1; + } + } + + const int *GetPBitCombo(int idx) const { + switch(GetPBitType()) { 
+ case ePBitType_Shared: return (idx)? kPBits[3] : kPBits[0]; + case ePBitType_NotShared: return kPBits[idx % 4]; + default: + case ePBitType_None: return kPBits[0]; + } + } + + double OptimizeEndpointsForCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const; + + struct VisitedState { + RGBAVector p1; + RGBAVector p2; + int pBitCombo; + }; + + void PickBestNeighboringEndpoints( + const RGBACluster &cluster, + const RGBAVector &p1, const RGBAVector &p2, + const int curPbitCombo, + RGBAVector &np1, RGBAVector &np2, + int &nPbitCombo, + const VisitedState *visitedStates, + int nVisited, + float stepSz = 1.0f + ) const; + + bool AcceptNewEndpointError(double newError, double oldError, float temp) const; + + double CompressSingleColor(const RGBAVector &p, RGBAVector &p1, RGBAVector &p2, int &bestPbitCombo) const; + double CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const; + double CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int *alphaIndices) const; + + void ClampEndpointsToGrid(RGBAVector &p1, RGBAVector &p2, int &bestPBitCombo) const; + + const double m_IsOpaque; +}; + +extern const uint32 kBC7InterpolationValues[4][16][2]; + +#endif // __BC7_COMPRESSIONMODE_SIMD_H__ diff --git a/BPTCEncoder/src/BC7CompressionModeSIMD.h b/BPTCEncoder/src/BC7CompressionModeSIMD.h new file mode 100755 index 0000000..66b9d6f --- /dev/null +++ b/BPTCEncoder/src/BC7CompressionModeSIMD.h @@ -0,0 +1,153 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. 
Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __BC7_COMPRESSIONMODE_H__ +#define __BC7_COMPRESSIONMODE_H__ + +#include "BC7IntTypes.h" +#include "RGBAEndpointsSIMD.h" + +// Forward Declarations +class BitStream; + +static const int kPBits[4][2] = { + { 0, 0 }, + { 0, 1 }, + { 1, 0 }, + { 1, 1 } +}; + +// Abstract class that outlines all of the different settings for BC7 compression modes +// Note that at the moment, we only support modes 0-3, so we don't deal with alpha channels. +class BC7CompressionModeSIMD { +public: + + static const int kMaxNumSubsets = 3; + static const int kNumModes = 8; + + enum EPBitType { + ePBitType_Shared, + ePBitType_NotShared, + ePBitType_None + }; + + BC7CompressionModeSIMD(int mode, double err) : m_EstimatedError(err), m_Attributes(&(kModeAttributes[mode])) { } + ~BC7CompressionModeSIMD() { } + + static int NumUses[8]; + static void ResetNumUses() { memset(NumUses, 0, sizeof(NumUses)); } + + double Compress(BitStream &stream, const int shapeIdx, const RGBAClusterSIMD *clusters) const; + + // This switch controls the quality of the simulated annealing optimizer. We will not make + // more than this many steps regardless of how bad the error is. Higher values will produce + // better quality results but will run slower. Default is 50. 
+ static int MaxAnnealingIterations; // This is a setting + +private: + + static struct Attributes { + int modeNumber; + int numPartitionBits; + int numSubsets; + int numBitsPerIndex; + int redChannelPrecision; + int greenChannelPrecision; + int blueChannelPrecision; + int alphaChannelPrecision; + EPBitType pbitType; + } kModeAttributes[kNumModes]; + +protected: + const Attributes *const m_Attributes; + + int GetModeNumber() const { return m_Attributes->modeNumber; } + int GetNumberOfPartitionBits() const { return m_Attributes->numPartitionBits; } + int GetNumberOfSubsets() const { return m_Attributes->numSubsets; } + int GetNumberOfBitsPerIndex() const { return m_Attributes->numBitsPerIndex; } + + int GetRedChannelPrecision() const { return m_Attributes->redChannelPrecision; } + int GetGreenChannelPrecision() const { return m_Attributes->greenChannelPrecision; } + int GetBlueChannelPrecision() const { return m_Attributes->blueChannelPrecision; } + int GetAlphaChannelPrecision() const { return m_Attributes->alphaChannelPrecision; } + + EPBitType GetPBitType() const { return m_Attributes->pbitType; } + + // !SPEED! Add this to the attributes lookup table + void GetQuantizationMask(__m128i &mask) const { + const int maskSeed = 0x80000000; + mask = _mm_set_epi32( + (GetAlphaChannelPrecision() > 0)? (maskSeed >> (24 + GetAlphaChannelPrecision() - 1) & 0xFF) : 0xFF, + (maskSeed >> (24 + GetBlueChannelPrecision() - 1) & 0xFF), + (maskSeed >> (24 + GetGreenChannelPrecision() - 1) & 0xFF), + (maskSeed >> (24 + GetRedChannelPrecision() - 1) & 0xFF) + ); + } + + int GetNumPbitCombos() const { + switch(GetPBitType()) { + case ePBitType_Shared: return 2; + case ePBitType_NotShared: return 4; + default: + case ePBitType_None: return 1; + } + } + + const int *GetPBitCombo(int idx) const { + switch(GetPBitType()) { + case ePBitType_Shared: return (idx)? 
kPBits[3] : kPBits[0]; + case ePBitType_NotShared: return kPBits[idx % 4]; + default: + case ePBitType_None: return kPBits[0]; + } + } + + double OptimizeEndpointsForCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const; + + struct VisitedState { + RGBAVectorSIMD p1; + RGBAVectorSIMD p2; + int pBitCombo; + }; + + void PickBestNeighboringEndpoints( + const RGBAClusterSIMD &cluster, + const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, + const int curPbitCombo, + RGBAVectorSIMD &np1, RGBAVectorSIMD &np2, + int &nPbitCombo, + const __m128 &stepVec + ) const; + + bool AcceptNewEndpointError(float newError, float oldError, float temp) const; + + double CompressSingleColor(const RGBAVectorSIMD &p, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPbitCombo) const; + double CompressCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const; + + void ClampEndpointsToGrid(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPBitCombo) const; + + int GetSubsetForIndex(int idx, const int shapeIdx) const; + int GetAnchorIndexForSubset(int subset, const int shapeIdx) const; + + double GetEstimatedError() const { return m_EstimatedError; } + const double m_EstimatedError; +}; + +extern const __m128i kBC7InterpolationValuesSIMD[4][16][2]; +extern const uint32 kBC7InterpolationValuesScalar[4][16][2]; + +#endif // __BC7_COMPRESSIONMODE_H__ diff --git a/BPTCEncoder/src/BC7Compressor.cpp b/BPTCEncoder/src/BC7Compressor.cpp new file mode 100755 index 0000000..098fc25 --- /dev/null +++ b/BPTCEncoder/src/BC7Compressor.cpp @@ -0,0 +1,1925 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above 
copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "BC7IntTypes.h" +#include "BC7Compressor.h" +#include "BC7CompressionMode.h" +#include "BCLookupTables.h" +#include "RGBAEndpoints.h" +#include "BitStream.h" + +#include +#include +#include +#include +#include +#include + +static const uint32 kNumShapes2 = 64; +static const uint16 kShapeMask2[kNumShapes2] = { + 0xcccc, 0x8888, 0xeeee, 0xecc8, 0xc880, 0xfeec, 0xfec8, 0xec80, + 0xc800, 0xffec, 0xfe80, 0xe800, 0xffe8, 0xff00, 0xfff0, 0xf000, + 0xf710, 0x008e, 0x7100, 0x08ce, 0x008c, 0x7310, 0x3100, 0x8cce, + 0x088c, 0x3110, 0x6666, 0x366c, 0x17e8, 0x0ff0, 0x718e, 0x399c, + 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, + 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x0660, + 0x0272, 0x04e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, + 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0x0fcc, 0x7744, 0xee22 +}; + +static const int kAnchorIdx2[kNumShapes2] = { + 15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15, + 15, 2, 8, 2, 2, 8, 8,15, + 2, 8, 2, 2, 8, 8, 2, 2, + 15,15, 6, 8, 2, 8,15,15, + 2, 8, 2, 2, 2,15,15, 6, + 6, 2, 6, 8,15,15, 2, 2, + 15,15,15,15,15, 2, 2, 15 +}; + +static const uint32 kNumShapes3 = 64; +static const uint16 kShapeMask3[kNumShapes3][2] = { + { 0xfecc, 0xf600 }, { 0xffc8, 
0x7300 }, { 0xff90, 0x3310 }, { 0xecce, 0x00ce }, { 0xff00, 0xcc00 }, { 0xcccc, 0xcc00 }, { 0xffcc, 0x00cc }, { 0xffcc, 0x3300 }, + { 0xff00, 0xf000 }, { 0xfff0, 0xf000 }, { 0xfff0, 0xff00 }, { 0xcccc, 0x8888 }, { 0xeeee, 0x8888 }, { 0xeeee, 0xcccc }, { 0xffec, 0xec80 }, { 0x739c, 0x7310 }, + { 0xfec8, 0xc800 }, { 0x39ce, 0x3100 }, { 0xfff0, 0xccc0 }, { 0xfccc, 0x0ccc }, { 0xeeee, 0xee00 }, { 0xff88, 0x7700 }, { 0xeec0, 0xcc00 }, { 0x7730, 0x3300 }, + { 0x0cee, 0x00cc }, { 0xffcc, 0xfc88 }, { 0x6ff6, 0x0660 }, { 0xff60, 0x6600 }, { 0xcbbc, 0xc88c }, { 0xf966, 0xf900 }, { 0xceec, 0x0cc0 }, { 0xff10, 0x7310 }, + { 0xff80, 0xec80 }, { 0xccce, 0x08ce }, { 0xeccc, 0xec80 }, { 0x6666, 0x4444 }, { 0x0ff0, 0x0f00 }, { 0x6db6, 0x4924 }, { 0x6bd6, 0x4294 }, { 0xcf3c, 0x0c30 }, + { 0xc3fc, 0x03c0 }, { 0xffaa, 0xff00 }, { 0xff00, 0x5500 }, { 0xfcfc, 0xcccc }, { 0xcccc, 0x0c0c }, { 0xf6f6, 0x6666 }, { 0xaffa, 0x0ff0 }, { 0xfff0, 0x5550 }, + { 0xfaaa, 0xf000 }, { 0xeeee, 0x0e0e }, { 0xf8f8, 0x8888 }, { 0xfff0, 0x9990 }, { 0xeeee, 0xe00e }, { 0x8ff8, 0x8888 }, { 0xf666, 0xf000 }, { 0xff00, 0x9900 }, + { 0xff66, 0xff00 }, { 0xcccc, 0xc00c }, { 0xcffc, 0xcccc }, { 0xf000, 0x9000 }, { 0x8888, 0x0808 }, { 0xfefe, 0xeeee }, { 0xfffa, 0xfff0 }, { 0x7bde, 0x7310 } +}; + +static const uint32 kWMValues[] = { 0x32b92180, 0x32ba3080, 0x31103200, 0x28103c80, 0x32bb3080, 0x25903600, 0x3530b900, 0x3b32b180, 0x34b5b980 }; +static const uint32 kNumWMVals = sizeof(kWMValues) / sizeof(kWMValues[0]); +static uint32 gWMVal = -1; + +static const int kAnchorIdx3[2][kNumShapes3] = { + { 3, 3,15,15, 8, 3,15,15, + 8, 8, 6, 6, 6, 5, 3, 3, + 3, 3, 8,15, 3, 3, 6,10, + 5, 8, 8, 6, 8, 5,15,15, + 8,15, 3, 5, 6,10, 8,15, + 15, 3,15, 5,15,15,15,15, + 3,15, 5, 5, 5, 8, 5,10, + 5,10, 8,13,15,12, 3, 3 }, + + { 15, 8, 8, 3,15,15, 3, 8, + 15,15,15,15,15,15,15, 8, + 15, 8,15, 3,15, 8,15, 8, + 3,15, 6,10,15,15,10, 8, + 15, 3,15,10,10, 8, 9,10, + 6,15, 8,15, 3, 6, 6, 8, + 15, 3,15,15,15,15,15,15, + 15,15,15,15, 3,15,15, 
8 } +}; + +static int GetSubsetForIndex(int idx, const int shapeIdx, const int nSubsets) { + int subset = 0; + + switch(nSubsets) { + case 2: + { + subset = !!((1 << idx) & kShapeMask2[shapeIdx]); + } + break; + + case 3: + { + if(1 << idx & kShapeMask3[shapeIdx][0]) + subset = 1 + !!((1 << idx) & kShapeMask3[shapeIdx][1]); + else + subset = 0; + } + break; + + default: + break; + } + + return subset; +} + +static int GetAnchorIndexForSubset(int subset, const int shapeIdx, const int nSubsets) { + + int anchorIdx = 0; + switch(subset) { + case 1: + { + if(nSubsets == 2) { + anchorIdx = kAnchorIdx2[shapeIdx]; + } + else { + anchorIdx = kAnchorIdx3[0][shapeIdx]; + } + } + break; + + case 2: + { + assert(nSubsets == 3); + anchorIdx = kAnchorIdx3[1][shapeIdx]; + } + break; + + default: + break; + } + + return anchorIdx; +} + +static int GetPointMaskForSubset(int subset, const int shapeIdx, const int nSubsets) { + int mask = 0xFFFF; + + assert(subset < nSubsets); + + switch(nSubsets) { + case 2: + { + mask = (subset)? kShapeMask2[shapeIdx] : ~(kShapeMask2[shapeIdx]); + } + break; + + case 3: + { + switch(subset) { + default: + case 0: + { + mask = ~(kShapeMask3[shapeIdx][0]); + } + break; + + case 1: + { + mask = ~(~(kShapeMask3[shapeIdx][0]) | kShapeMask3[shapeIdx][1]); + } + break; + + case 2: + { + mask = kShapeMask3[shapeIdx][1]; + } + break; + } + } + break; + + default: + break; + } + + return mask; +} + +#ifndef min +#define min(a, b) (((a) > (b))? (b) : (a)) +#endif + +#ifndef max +#define max(a, b) (((a) > (b))? 
(a) : (b)) +#endif + +template +static void insert(T* buf, int bufSz, T newVal, int idx = 0) { + int safeIdx = min(bufSz-1, max(idx, 0)); + for(int i = bufSz - 1; i > safeIdx; i--) { + buf[i] = buf[i-1]; + } + buf[safeIdx] = newVal; +} + +template +static inline void swap(T &a, T &b) { T t = a; a = b; b = t; } + +const uint32 kBC7InterpolationValues[4][16][2] = { + { {64, 0}, {33, 31}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {43, 21}, {21, 43}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64} } +}; + +int BC7CompressionMode::MaxAnnealingIterations = 50; // This is a setting. +int BC7CompressionMode::NumUses[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +BC7CompressionMode::Attributes BC7CompressionMode::kModeAttributes[kNumModes] = { + { 0, 4, 3, 3, 0, 4, 0, false, false, BC7CompressionMode::ePBitType_NotShared }, + { 1, 6, 2, 3, 0, 6, 0, false, false, BC7CompressionMode::ePBitType_Shared }, + { 2, 6, 3, 2, 0, 5, 0, false, false, BC7CompressionMode::ePBitType_None }, + { 3, 6, 2, 2, 0, 7, 0, false, false, BC7CompressionMode::ePBitType_NotShared }, + { 4, 0, 1, 2, 3, 5, 6, true, true, BC7CompressionMode::ePBitType_None }, + { 5, 0, 1, 2, 2, 7, 8, true, false, BC7CompressionMode::ePBitType_None }, + { 6, 0, 1, 4, 0, 7, 7, false, false, BC7CompressionMode::ePBitType_NotShared }, + { 7, 6, 2, 2, 0, 5, 5, false, false, BC7CompressionMode::ePBitType_NotShared }, +}; + +void BC7CompressionMode::ClampEndpointsToGrid(RGBAVector &p1, RGBAVector &p2, int &bestPBitCombo) const { + const int nPbitCombos = GetNumPbitCombos(); + const bool hasPbits = nPbitCombos > 1; + const uint32 qmask = GetQuantizationMask(); + + ClampEndpoints(p1, p2); + + // !SPEED! This can be faster. 
+ float minDist = FLT_MAX; + RGBAVector bp1, bp2; + for(int i = 0; i < nPbitCombos; i++) { + + uint32 qp1, qp2; + if(hasPbits) { + qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]); + qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]); + } + else { + qp1 = p1.ToPixel(qmask); + qp2 = p2.ToPixel(qmask); + } + + uint8 *pqp1 = (uint8 *)&qp1; + uint8 *pqp2 = (uint8 *)&qp2; + + RGBAVector np1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3])); + RGBAVector np2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3])); + + RGBAVector d1 = np1 - p1; + RGBAVector d2 = np2 - p2; + float dist = (d1 * d1) + (d2 * d2); + if(dist < minDist) { + minDist = dist; + bp1 = np1; bp2 = np2; + bestPBitCombo = i; + } + } + + p1 = bp1; + p2 = bp2; +} + +double BC7CompressionMode::CompressSingleColor(const RGBAVector &p, RGBAVector &p1, RGBAVector &p2, int &bestPbitCombo) const { + + const uint32 pixel = p.ToPixel(); + + uint32 bestDist = 0xFF; + bestPbitCombo = -1; + + for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) { + + const int *pbitCombo = GetPBitCombo(pbi); + + uint32 dist = 0x0; + uint32 bestValI[kNumColorChannels] = { -1, -1, -1, -1 }; + uint32 bestValJ[kNumColorChannels] = { -1, -1, -1, -1 }; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + + const uint8 val = (pixel >> (ci * 8)) & 0xFF; + int nBits = ci == 3? GetAlphaChannelPrecision() : m_Attributes->colorChannelPrecision; + + // If we don't handle this channel, then we don't need to + // worry about how well we interpolate. + if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; continue; } + + const int nPossVals = (1 << nBits); + int possValsH[256]; + int possValsL[256]; + + // Do we have a pbit? 
+ const bool havepbit = GetPBitType() != ePBitType_None; + if(havepbit) + nBits++; + + for(int i = 0; i < nPossVals; i++) { + + int vh = i, vl = i; + if(havepbit) { + vh <<= 1; + vl <<= 1; + + vh |= pbitCombo[1]; + vl |= pbitCombo[0]; + } + + possValsH[i] = (vh << (8 - nBits)); + possValsH[i] |= (possValsH[i] >> nBits); + + possValsL[i] = (vl << (8 - nBits)); + possValsL[i] |= (possValsL[i] >> nBits); + } + + const uint32 interpVal0 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][0]; + const uint32 interpVal1 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][1]; + + // Find the closest interpolated val that to the given val... + uint32 bestChannelDist = 0xFF; + for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++) + for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) { + + const uint32 v1 = possValsL[i]; + const uint32 v2 = possValsH[j]; + + const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6; + const uint32 err = (combo > val)? combo - val : val - combo; + + if(err < bestChannelDist) { + bestChannelDist = err; + bestValI[ci] = v1; + bestValJ[ci] = v2; + } + } + + dist = max(bestChannelDist, dist); + } + + if(dist < bestDist) { + bestDist = dist; + bestPbitCombo = pbi; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + p1.c[ci] = float(bestValI[ci]); + p2.c[ci] = float(bestValJ[ci]); + } + } + } + + return bestDist; +} + +// Fast random number generator. See more information at +// http://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/ +static uint32 g_seed = uint32(time(NULL)); +static inline uint32 fastrand() { + g_seed = (214013 * g_seed + 2531011); + return (g_seed>>16) & RAND_MAX; +} + +static const int kNumStepDirections = 8; +static const RGBADir kStepDirections[kNumStepDirections] = { + + // For pBit changes, we have 8 possible directions. 
+ RGBADir(RGBAVector(1.0f, 1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, 1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(1.0f, -1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, -1.0f, 1.0f, 0.0f)), + RGBADir(RGBAVector(1.0f, 1.0f, -1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, 1.0f, -1.0f, 0.0f)), + RGBADir(RGBAVector(1.0f, -1.0f, -1.0f, 0.0f)), + RGBADir(RGBAVector(-1.0f, -1.0f, -1.0f, 0.0f)) +}; + +static void ChangePointForDirWithoutPbitChange(RGBAVector &v, int dir, const float step[kNumColorChannels]) { + if(dir % 2) { + v.x -= step[0]; + } + else { + v.x += step[0]; + } + + if(((dir / 2) % 2)) { + v.y -= step[1]; + } + else { + v.y += step[1]; + } + + if(((dir / 4) % 2)) { + v.z -= step[2]; + } + else { + v.z += step[2]; + } + + if(((dir / 8) % 2)) { + v.a -= step[3]; + } + else { + v.a += step[3]; + } +} + +static void ChangePointForDirWithPbitChange(RGBAVector &v, int dir, int oldPbit, const float step[kNumColorChannels]) { + if(dir % 2 && oldPbit == 0) { + v.x -= step[0]; + } + else if(!(dir % 2) && oldPbit == 1) { + v.x += step[0]; + } + + if(((dir / 2) % 2) && oldPbit == 0) { + v.y -= step[1]; + } + else if(!((dir / 2) % 2) && oldPbit == 1) { + v.y += step[1]; + } + + if(((dir / 4) % 2) && oldPbit == 0) { + v.z -= step[2]; + } + else if(!((dir / 4) % 2) && oldPbit == 1) { + v.z += step[2]; + } + + if(((dir / 8) % 2) && oldPbit == 0) { + v.a -= step[3]; + } + else if(!((dir / 8) % 2) && oldPbit == 1) { + v.a += step[3]; + } +} + +void BC7CompressionMode::PickBestNeighboringEndpoints(const RGBACluster &cluster, const RGBAVector &p1, const RGBAVector &p2, const int curPbitCombo, RGBAVector &np1, RGBAVector &np2, int &nPbitCombo, const VisitedState *visitedStates, int nVisited, float stepSz) const { + + // !SPEED! There might be a way to make this faster since we're working + // with floating point values that are powers of two. We should be able + // to just set the proper bits in the exponent and leave the mantissa to 0. 
+ float step[kNumColorChannels] = { + stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)), + stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)), + stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)), + stepSz * float(1 << (8 - GetAlphaChannelPrecision())) + }; + + if(m_IsOpaque) { + step[(GetRotationMode() + 3) % kNumColorChannels] = 0.0f; + } + + // First, let's figure out the new pbit combo... if there's no pbit then we don't need + // to worry about it. + const bool hasPbits = GetPBitType() != ePBitType_None; + if(hasPbits) { + + // If there is a pbit, then we must change it, because those will provide the closest values + // to the current point. + if(GetPBitType() == ePBitType_Shared) + nPbitCombo = (curPbitCombo + 1) % 2; + else { + // Not shared... p1 needs to change and p2 needs to change... which means that + // combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 and vice + // versa... + nPbitCombo = 3 - curPbitCombo; + } + + assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1); + assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1); + } + + bool visited = true; + int infLoopPrevent = -1; + while(visited && ++infLoopPrevent < 16) { + for(int pt = 0; pt < 2; pt++) { + + const RGBAVector &p = (pt)? p1 : p2; + RGBAVector &np = (pt)? np1 : np2; + + np = p; + if(hasPbits) + ChangePointForDirWithPbitChange(np, fastrand() % 16, GetPBitCombo(curPbitCombo)[pt], step); + else + ChangePointForDirWithoutPbitChange(np, fastrand() % 16, step); + + for(int i = 0; i < kNumColorChannels; i++) { + np.c[i] = min(max(np.c[i], 0.0f), 255.0f); + } + } + + visited = false; + for(int i = 0; i < nVisited; i++) { + visited = visited || ( + visitedStates[i].p1 == np1 && + visitedStates[i].p2 == np2 && + visitedStates[i].pBitCombo == nPbitCombo + ); + } + } +} + +// Fast generation of floats between 0 and 1. 
It generates a float +// whose exponent forces the value to be between 1 and 2, then it +// populates the mantissa with a random assortment of bits, and returns +// the bytes interpreted as a float. This prevents two things: 1, a +// division, and 2, a cast from an integer to a float. + +#define COMPILE_ASSERT(x) extern int __compile_assert_[(int)(x)]; +COMPILE_ASSERT(RAND_MAX == 0x7FFF) + +static inline float frand() { + const uint16 r = fastrand(); + + // RAND_MAX is 0x7FFF, which offers 15 bits + // of precision. Therefore, we move the bits + // into the top of the 23 bit mantissa, and + // repeat the most significant bits of r in + // the least significant of the mantissa + const uint32 m = (r << 8) | (r >> 7); + const uint32 flt = (127 << 23) | m; + return *(reinterpret_cast(&flt)) - 1.0f; +} + +bool BC7CompressionMode::AcceptNewEndpointError(double newError, double oldError, float temp) const { + + // Always accept better endpoints. + if(newError < oldError) + return true; + + const double p = exp((0.1f * (oldError - newError)) / temp); + const double r = frand(); + + return r < p; +} + +double BC7CompressionMode::OptimizeEndpointsForCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const { + + const int nBuckets = (1 << GetNumberOfBitsPerIndex()); + const int nPbitCombos = GetNumPbitCombos(); + const uint32 qmask = GetQuantizationMask(); + + // Here we use simulated annealing to traverse the space of clusters to find the best possible endpoints. + double curError = cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo), bestIndices); + int curPbitCombo = bestPbitCombo; + double bestError = curError; + + // Clamp endpoints to the grid... 
+ uint32 qp1, qp2; + if(GetPBitType() != ePBitType_None) { + qp1 = p1.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[0]); + qp2 = p2.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[1]); + } + else { + qp1 = p1.ToPixel(qmask); + qp2 = p2.ToPixel(qmask); + } + + uint8 *pqp1 = (uint8 *)&qp1; + uint8 *pqp2 = (uint8 *)&qp2; + + p1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3])); + p2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3])); + + RGBAVector bp1 = p1, bp2 = p2; + + assert(curError == cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo))); + + int lastVisitedState = 0; + VisitedState visitedStates[kMaxAnnealingIterations]; + + visitedStates[lastVisitedState].p1 = p1; + visitedStates[lastVisitedState].p2 = p2; + visitedStates[lastVisitedState].pBitCombo = curPbitCombo; + lastVisitedState++; + + const int maxEnergy = MaxAnnealingIterations; + + for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) { + + float temp = float(energy) / float(maxEnergy-1); + + int indices[kMaxNumDataPoints]; + RGBAVector np1, np2; + int nPbitCombo; + + PickBestNeighboringEndpoints(cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, visitedStates, lastVisitedState); + + double error = cluster.QuantizedError(np1, np2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(nPbitCombo), indices); + if(AcceptNewEndpointError(error, curError, temp)) { + curError = error; + p1 = np1; + p2 = np2; + curPbitCombo = nPbitCombo; + } + + if(error < bestError) { + memcpy(bestIndices, indices, sizeof(indices)); + bp1 = np1; + bp2 = np2; + bestPbitCombo = nPbitCombo; + bestError = error; + + visitedStates[lastVisitedState].p1 = np1; + visitedStates[lastVisitedState].p2 = np2; + visitedStates[lastVisitedState].pBitCombo = nPbitCombo; + lastVisitedState++; + + // Restart... 
+ energy = 0; + } + } + + p1 = bp1; + p2 = bp2; + + return bestError; +} + +double BC7CompressionMode::CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int *alphaIndices) const { + + assert(GetModeNumber() == 4 || GetModeNumber() == 5); + assert(GetNumberOfSubsets() == 1); + assert(cluster.GetNumPoints() == kMaxNumDataPoints); + assert(m_Attributes->alphaChannelPrecision > 0); + + // If all the points are the same in the cluster, then we need to figure out what the best + // approximation to this point is.... + if(cluster.AllSamePoint()) { + + assert(!"We should only be using this function in modes 4 & 5 that have a single subset, in which case single colors should have been detected much earlier."); + + const RGBAVector &p = cluster.GetPoint(0); + int dummyPbit = 0; + double bestErr = CompressSingleColor(p, p1, p2, dummyPbit); + + // We're assuming all indices will be index 1... + for(int i = 0; i < cluster.GetNumPoints(); i++) { + bestIndices[i] = 1; + alphaIndices[i] = 1; + } + + return bestErr; + } + + RGBACluster rgbCluster; + float alphaVals[kMaxNumDataPoints]; + + float alphaMin = FLT_MAX, alphaMax = -FLT_MAX; + for(int i = 0; i < cluster.GetNumPoints(); i++) { + + RGBAVector v = cluster.GetPoint(i); + switch(GetRotationMode()) { + default: + case 0: + // Do nothing + break; + + case 1: + swap(v.r, v.a); + break; + + case 2: + swap(v.g, v.a); + break; + + case 3: + swap(v.b, v.a); + break; + } + + alphaVals[i] = v.a; + v.a = 255.0f; + + alphaMin = min(alphaVals[i], alphaMin); + alphaMax = max(alphaVals[i], alphaMax); + + rgbCluster.AddPoint(v); + } + + int dummyPbit = 0; + RGBAVector rgbp1, rgbp2; + double rgbError = CompressCluster(rgbCluster, rgbp1, rgbp2, bestIndices, dummyPbit); + + float a1 = alphaMin, a2 = alphaMax; + double alphaError = DBL_MAX; + + typedef uint32 tInterpPair[2]; + typedef tInterpPair tInterpLevel[16]; + const tInterpLevel *interpVals = kBC7InterpolationValues + (GetNumberOfBitsPerAlpha() - 
1); + const float weight = GetErrorMetric().a; + + const int nBuckets = (1 << GetNumberOfBitsPerAlpha()); + + // If they're the same, then we can get them exactly. + if(a1 == a2) + { + const uint8 step = 1 << (8-GetAlphaChannelPrecision()); + const uint8 a1be = uint8(a1); + const uint8 a2be = uint8(a2); + const uint8 a1b = ::QuantizeChannel(a1be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + const uint8 a2b = ::QuantizeChannel(a2be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + + // Mode 5 has 8 bits of precision for alpha. + if(GetModeNumber() == 5) { + + assert(a1 == float(a1b)); + assert(a2 == float(a2b)); + + for(int i = 0; i < kMaxNumDataPoints; i++) + alphaIndices[i] = 0; + + alphaError = 0.0; + } + else { + assert(GetModeNumber() == 4); + + // Mode 4 can be treated like the 6 channel of DXT1 compression. + if(Optimal6CompressDXT1[a1be][0][0]) { + a1 = float((Optimal6CompressDXT1[a1be][1][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4)); + a2 = float((Optimal6CompressDXT1[a2be][1][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4)); + } + else { + a1 = float((Optimal6CompressDXT1[a1be][0][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4)); + a2 = float((Optimal6CompressDXT1[a2be][0][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4)); + } + + if(m_IndexMode == 1) { + for(int i = 0; i < kMaxNumDataPoints; i++) + alphaIndices[i] = 1; + } + else { + for(int i = 0; i < kMaxNumDataPoints; i++) + alphaIndices[i] = 2; + } + + uint32 interp0 = (*interpVals)[alphaIndices[0] & 0xFF][0]; + uint32 interp1 = (*interpVals)[alphaIndices[0] & 0xFF][1]; + + const uint8 ip = (((uint32(a1) * interp0) + (uint32(a2) * interp1) + 32) >> 6) & 0xFF; + float pxError = weight * float((a1be > ip)? a1be - ip : ip - a1be); + pxError *= pxError; + alphaError = 16 * pxError; + } + } + else { + + float vals[1<<3]; + memset(vals, 0, sizeof(vals)); + + int buckets[kMaxNumDataPoints]; + + // Figure out initial positioning. 
+ for(int i = 0; i < nBuckets; i++) { + vals[i] = alphaMin + (float(i)/float(nBuckets-1)) * (alphaMax - alphaMin); + } + + // Assign each value to a bucket + for(int i = 0; i < kMaxNumDataPoints; i++) { + + float minDist = 255.0f; + for(int j = 0; j < nBuckets; j++) { + float dist = fabs(alphaVals[i] - vals[j]); + if(dist < minDist) { + minDist = dist; + buckets[i] = j; + } + } + } + + float npts[1 << 3]; + + // Do k-means + bool fixed = false; + while(!fixed) { + + memset(npts, 0, sizeof(npts)); + + float avg[1 << 3]; + memset(avg, 0, sizeof(avg)); + + // Calculate average of each cluster + for(int i = 0; i < nBuckets; i++) { + for(int j = 0; j < kMaxNumDataPoints; j++) { + + if(buckets[j] == i) { + avg[i] += alphaVals[j]; + npts[i] += 1.0f; + } + } + + if(npts[i] > 0.0f) + avg[i] /= npts[i]; + } + + // Did we change anything? + fixed = true; + for(int i = 0; i < nBuckets; i++) { + fixed = fixed && (avg[i] == vals[i]); + } + + // Reassign indices... + memcpy(vals, avg, sizeof(vals)); + + // Reassign each value to a bucket + for(int i = 0; i < kMaxNumDataPoints; i++) { + + float minDist = 255.0f; + for(int j = 0; j < nBuckets; j++) { + float dist = fabs(alphaVals[i] - vals[j]); + if(dist < minDist) { + minDist = dist; + buckets[i] = j; + } + } + } + } + + // Do least squares fit of vals. 
+ float asq = 0.0, bsq = 0.0, ab = 0.0; + float ax(0.0), bx(0.0); + for(int i = 0; i < nBuckets; i++) { + float a = float(nBuckets - 1 - i) / float(nBuckets - 1); + float b = float(i) / float(nBuckets - 1); + + float n = npts[i]; + float x = vals[i]; + + asq += n * a * a; + bsq += n * b * b; + ab += n * a * b; + + ax += x * a * n; + bx += x * b * n; + } + + float f = 1.0f / (asq * bsq - ab * ab); + a1 = f * (ax * bsq - bx * ab); + a2 = f * (bx * asq - ax * ab); + + // Clamp + a1 = min(255.0f, max(0.0f, a1)); + a2 = min(255.0f, max(0.0f, a2)); + + // Quantize + const uint8 a1b = ::QuantizeChannel(uint8(a1), (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + const uint8 a2b = ::QuantizeChannel(uint8(a2), (((char)0x80) >> (GetAlphaChannelPrecision() - 1))); + + // Compute error + for(int i = 0; i < kMaxNumDataPoints; i++) { + + uint8 val = uint8(alphaVals[i]); + + float minError = FLT_MAX; + int bestBucket = -1; + + for(int j = 0; j < nBuckets; j++) { + uint32 interp0 = (*interpVals)[j][0]; + uint32 interp1 = (*interpVals)[j][1]; + + const uint8 ip = (((uint32(a1b) * interp0) + (uint32(a2b) * interp1) + 32) >> 6) & 0xFF; + float pxError = weight * float((val > ip)? val - ip : ip - val); + pxError *= pxError; + + if(pxError < minError) { + minError = pxError; + bestBucket = j; + } + } + + alphaError += minError; + alphaIndices[i] = bestBucket; + } + } + + for(int i = 0; i < kNumColorChannels; i++) { + p1.c[i] = (i == (kNumColorChannels-1))? a1 : rgbp1.c[i]; + p2.c[i] = (i == (kNumColorChannels-1))? a2 : rgbp2.c[i]; + } + + return rgbError + alphaError; +} + +double BC7CompressionMode::CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const { + + // If all the points are the same in the cluster, then we need to figure out what the best + // approximation to this point is.... 
  if(cluster.AllSamePoint()) {
    const RGBAVector &p = cluster.GetPoint(0);
    double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);

    // We're assuming all indices will be index 1...
    for(int i = 0; i < cluster.GetNumPoints(); i++) {
      bestIndices[i] = 1;
    }

    return bestErr;
  }

  const int nBuckets = (1 << GetNumberOfBitsPerIndex());
  // NOTE(review): nPbitCombos is currently unused in this function.
  const int nPbitCombos = GetNumPbitCombos();
  const uint32 qmask = GetQuantizationMask();

#if 1
  // Seed the endpoints via PCA: project every point onto the cluster's
  // principal axis and take the extreme projections as p1/p2.
  RGBAVector avg = cluster.GetTotal() / float(cluster.GetNumPoints());
  RGBADir axis;
  ::GetPrincipalAxis(cluster.GetNumPoints(), cluster.GetPoints(), axis);

  float mindp = FLT_MAX, maxdp = -FLT_MAX;
  for(int i = 0 ; i < cluster.GetNumPoints(); i++) {
    float dp = (cluster.GetPoint(i) - avg) * axis;
    if(dp < mindp) mindp = dp;
    if(dp > maxdp) maxdp = dp;
  }

  p1 = avg + mindp * axis;
  p2 = avg + maxdp * axis;
#else
  // Cheaper alternative seed: axis-aligned bounding box corners.
  cluster.GetBoundingBox(p1, p2);
#endif

  ClampEndpoints(p1, p2);

  RGBAVector pts[1 << 4]; // At most 4 bits per index.
  int numPts[1<<4];
  assert(nBuckets <= 1 << 4);

  // Initial palette: evenly spaced interpolations between the endpoints.
  for(int i = 0; i < nBuckets; i++) {
    float s = (float(i) / float(nBuckets - 1));
    pts[i] = (1.0f - s) * p1 + s * p2;
  }

  assert(pts[0] == p1);
  assert(pts[nBuckets - 1] == p2);

  // Do k-means clustering...
  int bucketIdx[kMaxNumDataPoints];

  bool fixed = false;
  while(!fixed) {

    RGBAVector newPts[1 << 4];

    // Assign each of the existing points to one of the buckets...
    for(int i = 0; i < cluster.GetNumPoints(); i++) {

      int minBucket = -1;
      float minDist = FLT_MAX;
      for(int j = 0; j < nBuckets; j++) {
        RGBAVector v = cluster.GetPoint(i) - pts[j];
        float distSq = v * v;
        if(distSq < minDist)
        {
          minDist = distSq;
          minBucket = j;
        }
      }

      assert(minBucket >= 0);
      bucketIdx[i] = minBucket;
    }

    // Calculate new buckets based on centroids of clusters...
    for(int i = 0; i < nBuckets; i++) {

      numPts[i] = 0;
      newPts[i] = RGBAVector(0.0f);
      for(int j = 0; j < cluster.GetNumPoints(); j++) {
        if(bucketIdx[j] == i) {
          numPts[i]++;
          newPts[i] += cluster.GetPoint(j);
        }
      }

      // If there are no points in this cluster, then it should
      // remain the same as last time and avoid a divide by zero.
      if(0 != numPts[i])
        newPts[i] /= float(numPts[i]);
    }

    // If we haven't changed, then we're done.
    fixed = true;
    for(int i = 0; i < nBuckets; i++) {
      if(pts[i] != newPts[i])
        fixed = false;
    }

    // Assign the new points to be the old points.
    for(int i = 0; i < nBuckets; i++) {
      pts[i] = newPts[i];
    }
  }

  // If there's only one bucket filled, then just compress for that single color...
  int numBucketsFilled = 0, lastFilledBucket = -1;
  for(int i = 0; i < nBuckets; i++) {
    if(numPts[i] > 0) {
      numBucketsFilled++;
      lastFilledBucket = i;
    }
  }

  assert(numBucketsFilled > 0);
  if(1 == numBucketsFilled) {
    const RGBAVector &p = pts[lastFilledBucket];
    double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);

    // We're assuming all indices will be index 1...
    for(int i = 0; i < cluster.GetNumPoints(); i++) {
      bestIndices[i] = 1;
    }

    return bestErr;
  }

  // Now that we know the index of each pixel, we can assign the endpoints based on a least squares fit
  // of the clusters. For more information, take a look at this article by NVidia:
  // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf
  float asq = 0.0, bsq = 0.0, ab = 0.0;
  RGBAVector ax(0.0), bx(0.0);
  for(int i = 0; i < nBuckets; i++) {
    float a = float(nBuckets - 1 - i) / float(nBuckets - 1);
    float b = float(i) / float(nBuckets - 1);

    int n = numPts[i];
    RGBAVector x = pts[i];

    asq += float(n) * a * a;
    bsq += float(n) * b * b;
    ab += float(n) * a * b;

    ax += x * a * float(n);
    bx += x * b * float(n);
  }

  float f = 1.0f / (asq * bsq - ab * ab);
  p1 = f * (ax * bsq - bx * ab);
  p2 = f * (bx * asq - ax * ab);

  ClampEndpointsToGrid(p1, p2, bestPbitCombo);

  #ifdef _DEBUG
  // Sanity check: clamping an already-clamped pair must be a no-op.
  int pBitCombo = bestPbitCombo;
  RGBAVector tp1 = p1, tp2 = p2;
  ClampEndpointsToGrid(tp1, tp2, pBitCombo);

  assert(p1 == tp1);
  assert(p2 == tp2);
  assert(pBitCombo == bestPbitCombo);
  #endif

  assert(bestPbitCombo >= 0);

  return OptimizeEndpointsForCluster(cluster, p1, p2, bestIndices, bestPbitCombo);
}

// Compresses the given clusters with this mode and writes the resulting
// 128-bit block to the stream.  shapeIdx selects the partition; clusters must
// hold GetNumberOfSubsets() entries.  Returns the total quantized error.
double BC7CompressionMode::Compress(BitStream &stream, const int shapeIdx, const RGBACluster *clusters) {

  const int kModeNumber = GetModeNumber();
  const int nPartitionBits = GetNumberOfPartitionBits();
  const int nSubsets = GetNumberOfSubsets();

  // Mode #: encoded as (mode) zero bits followed by a one bit.
  stream.WriteBits(1 << kModeNumber, kModeNumber + 1);

  // Partition #
  assert((((1 << nPartitionBits) - 1) & shapeIdx) == shapeIdx);
  stream.WriteBits(shapeIdx, nPartitionBits);

  RGBAVector p1[kMaxNumSubsets], p2[kMaxNumSubsets];
  // -1 marks "not assigned"; the debug check below relies on this.
  int bestIndices[kMaxNumSubsets][kMaxNumDataPoints] = {
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }
  };
  int bestAlphaIndices[kMaxNumDataPoints] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
  int bestPbitCombo[kMaxNumSubsets] = {
-1, -1, -1 };
  int bestRotationMode = -1, bestIndexMode = -1;

  double totalErr = 0.0;
  for(int cidx = 0; cidx < nSubsets; cidx++) {
    int indices[kMaxNumDataPoints];

    if(m_Attributes->hasRotation) {

      assert(nSubsets == 1);

      int alphaIndices[kMaxNumDataPoints];

      // Try every rotation (and, for mode 4, both index modes) and keep the
      // combination with the smallest error.
      double bestError = DBL_MAX;
      for(int rotMode = 0; rotMode < 4; rotMode++) {

        SetRotationMode(rotMode);
        const int nIdxModes = kModeNumber == 4? 2 : 1;

        for(int idxMode = 0; idxMode < nIdxModes; idxMode++) {

          SetIndexMode(idxMode);

          RGBAVector v1, v2;
          double error = CompressCluster(clusters[cidx], v1, v2, indices, alphaIndices);
          if(error < bestError) {
            bestError = error;

            memcpy(bestIndices[cidx], indices, sizeof(indices));
            memcpy(bestAlphaIndices, alphaIndices, sizeof(alphaIndices));

            bestRotationMode = rotMode;
            bestIndexMode = idxMode;

            p1[cidx] = v1;
            p2[cidx] = v2;
          }
        }
      }

      totalErr += bestError;
    }
    else {
      // Compress this cluster
      totalErr += CompressCluster(clusters[cidx], p1[cidx], p2[cidx], indices, bestPbitCombo[cidx]);

      // Map the indices to their proper position: CompressCluster returns
      // them densely packed, but the block stores them per-pixel.
      int idx = 0;
      for(int i = 0; i < 16; i++) {
        int subs = GetSubsetForIndex(i, shapeIdx, GetNumberOfSubsets());
        if(subs == cidx) {
          bestIndices[cidx][i] = indices[idx++];
        }
      }
    }
  }

  // Writing 0 bits is a no-op, so these only emit for modes that have the
  // corresponding fields.
  stream.WriteBits(bestRotationMode, m_Attributes->hasRotation? 2 : 0);
  stream.WriteBits(bestIndexMode, m_Attributes->hasIdxMode? 1 : 0);

#ifdef _DEBUG
  // Every pixel must have been assigned an index by exactly one subset.
  for(int i = 0; i < kMaxNumDataPoints; i++) {

    int nSet = 0;
    for(int j = 0; j < nSubsets; j++) {
      if(bestIndices[j][i] >= 0)
        nSet++;
    }

    assert(nSet == 1);
  }
#endif

  // Get the quantization mask
  const uint32 qmask = GetQuantizationMask();

  // Quantize the points...
  uint32 pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets];
  for(int i = 0; i < nSubsets; i++) {
    switch(GetPBitType()) {
      default:
      case ePBitType_None:
        pixel1[i] = p1[i].ToPixel(qmask);
        pixel2[i] = p2[i].ToPixel(qmask);
        break;

      case ePBitType_Shared:
      case ePBitType_NotShared:
        pixel1[i] = p1[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[0]);
        pixel2[i] = p2[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[1]);
        break;
    }
  }

  // If the anchor index does not have 0 in the leading bit, then
  // we need to swap EVERYTHING.  (The decoder infers the dropped anchor bit
  // as zero, so the encoder must arrange for that to be true.)
  for(int sidx = 0; sidx < nSubsets; sidx++) {

    int anchorIdx = GetAnchorIndexForSubset(sidx, shapeIdx, nSubsets);
    assert(bestIndices[sidx][anchorIdx] != -1);

    const int nAlphaIndexBits = GetNumberOfBitsPerAlpha(bestIndexMode);
    const int nIndexBits = GetNumberOfBitsPerIndex(bestIndexMode);
    if(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)) {
      // Swap endpoints and complement every index so the palette ordering is
      // preserved.
      uint32 t = pixel1[sidx]; pixel1[sidx] = pixel2[sidx]; pixel2[sidx] = t;

      int nIndexVals = 1 << nIndexBits;
      for(int i = 0; i < 16; i++) {
        bestIndices[sidx][i] = (nIndexVals - 1) - bestIndices[sidx][i];
      }

      int nAlphaIndexVals = 1 << nAlphaIndexBits;
      if(m_Attributes->hasRotation) {
        for(int i = 0; i < 16; i++) {
          bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
        }
      }
    }

    if(m_Attributes->hasRotation && bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)) {
      // Alpha has its own anchor constraint: swap just the alpha bytes of the
      // endpoints and complement the alpha indices.
      uint8 * bp1 = (uint8 *)(&pixel1[sidx]);
      uint8 * bp2 = (uint8 *)(&pixel2[sidx]);
      uint8 t = bp1[3]; bp1[3] = bp2[3]; bp2[3] = t;

      int nAlphaIndexVals = 1 << nAlphaIndexBits;
      for(int i = 0; i < 16; i++) {
        bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
      }
    }

    assert(!(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)));
    assert(!m_Attributes->hasRotation || !(bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)));
  }

  // Get the quantized values: unpack each RGBA8 pixel into per-channel bytes.
  uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets];
  uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets];
  for(int i = 0; i < nSubsets; i++) {
    r1[i] = pixel1[i] & 0xFF;
    r2[i] = pixel2[i] & 0xFF;

    g1[i] = (pixel1[i] >> 8) & 0xFF;
    g2[i] = (pixel2[i] >> 8) & 0xFF;

    b1[i] = (pixel1[i] >> 16) & 0xFF;
    b2[i] = (pixel2[i] >> 16) & 0xFF;

    a1[i] = (pixel1[i] >> 24) & 0xFF;
    a2[i] = (pixel2[i] >> 24) & 0xFF;
  }

  // Write them out... (top bits only; the channels were quantized to the
  // mode's precision above).
  const int nRedBits = m_Attributes->colorChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits);
    stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits);
  }

  const int nGreenBits = m_Attributes->colorChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits);
    stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits);
  }

  const int nBlueBits = m_Attributes->colorChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits);
    stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits);
  }

  const int nAlphaBits = m_Attributes->alphaChannelPrecision;
  for(int i = 0; i < nSubsets; i++) {
    stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits);
    stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits);
  }

  // Write out the best pbits..
  if(GetPBitType() != ePBitType_None) {
    for(int s = 0; s < nSubsets; s++) {
      const int *pbits = GetPBitCombo(bestPbitCombo[s]);
      stream.WriteBits(pbits[0], 1);
      if(GetPBitType() != ePBitType_Shared)
        stream.WriteBits(pbits[1], 1);
    }
  }

  // If our index mode has changed, then we need to write the alpha indices first.
  if(m_Attributes->hasIdxMode && bestIndexMode == 1) {

    assert(m_Attributes->hasRotation);

    for(int i = 0; i < 16; i++) {
      const int idx = bestAlphaIndices[i];
      assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
      assert(GetNumberOfBitsPerAlpha(bestIndexMode) == 2);
      assert(idx >= 0 && idx < (1 << 2));
      assert(i != 0 || !(idx >> 1) || !"Leading bit of anchor index is not zero!");
      // The anchor pixel's leading bit is implicit, so it writes one fewer bit.
      stream.WriteBits(idx, (i == 0)? 1 : 2);
    }

    for(int i = 0; i < 16; i++) {
      const int idx = bestIndices[0][i];
      assert(GetSubsetForIndex(i, shapeIdx, nSubsets) == 0);
      assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
      assert(GetNumberOfBitsPerIndex(bestIndexMode) == 3);
      assert(idx >= 0 && idx < (1 << 3));
      assert(i != 0 || !(idx >> 2) || !"Leading bit of anchor index is not zero!");
      stream.WriteBits(idx, (i == 0)? 2 : 3);
    }
  }
  else {
    for(int i = 0; i < 16; i++) {
      const int subs = GetSubsetForIndex(i, shapeIdx, nSubsets);
      const int idx = bestIndices[subs][i];
      const int anchorIdx = GetAnchorIndexForSubset(subs, shapeIdx, nSubsets);
      const int nBitsForIdx = GetNumberOfBitsPerIndex(bestIndexMode);
      assert(idx >= 0 && idx < (1 << nBitsForIdx));
      assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
      stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
    }

    if(m_Attributes->hasRotation) {
      for(int i = 0; i < 16; i++) {
        const int idx = bestAlphaIndices[i];
        const int anchorIdx = 0;
        const int nBitsForIdx = GetNumberOfBitsPerAlpha(bestIndexMode);
        assert(idx >= 0 && idx < (1 << nBitsForIdx));
        assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
        stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
      }
    }
  }
  // A BC7 block is exactly 128 bits.
  assert(stream.GetBitsWritten() == 128);
  return totalErr;
}

namespace BC7C
{
  // Function prototypes
  static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock);
  static void CompressBC7Block(const uint32 *block, uint8 *outBuf);

  // Quality knob: clamped lower bound is 0; effectively caps the number of
  // annealing iterations (see CompressImageBC7).
  static int gQualityLevel = 50;
  void SetQualityLevel(int q) {
    gQualityLevel = max(0, q);
  }
  int GetQualityLevel() { return gQualityLevel; }

  // Returns true if the entire block is a single color.
  static bool AllOneColor(const uint32 block[16]) {
    const uint32 pixel = block[0];
    for(int i = 1; i < 16; i++) {
      if( block[i] != pixel )
        return false;
    }

    return true;
  }

  // Write out a transparent block.
  static void WriteTransparentBlock(BitStream &stream) {
    // Use mode 6: an all-zero mode-6 block decodes to transparent black.
    stream.WriteBits(1 << 6, 7);
    stream.WriteBits(0, 128-7);
    assert(stream.GetBitsWritten() == 128);
  }

  // Compresses a single color optimally and outputs the result.
  static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) {

    stream.WriteBits(1 << 5, 6); // Mode 5
    stream.WriteBits(0, 2); // No rotation bits.

    uint8 r = pixel & 0xFF;
    uint8 g = (pixel >> 8) & 0xFF;
    uint8 b = (pixel >> 16) & 0xFF;
    uint8 a = (pixel >> 24) & 0xFF;

    // Red endpoints (precomputed optimal 7-bit endpoint pair per 8-bit value).
    stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7);
    stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7);

    // Green endpoints
    stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7);
    stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7);

    // Blue endpoints
    stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7);
    stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7);

    // Alpha endpoints... are just the same.
    stream.WriteBits(a, 8);
    stream.WriteBits(a, 8);

    // Color indices are 1 for each pixel...
    // Anchor index is 0, so 1 bit for the first pixel, then
    // 01 for each following pixel giving the sequence of 31 bits:
    // ...010101011
    stream.WriteBits(0xaaaaaaab, 31);

    // Alpha indices...
    stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31);
  }

  // Bookkeeping for which mode the most recent block/compression chose.
  static int gModeChosen = -1;
  static int gBestMode = -1;

  // Compress an image using BC7 compression. Use the inBuf parameter to point to an image in
  // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels.
  // The buffer pointed to by outBuf should be large enough to store the compressed image. This
  // implementation has a 4:1 compression ratio.
  void CompressImageBC7(const uint8* inBuf, uint8* outBuf, int width, int height)
  {
    uint32 block[16];
    BC7CompressionMode::ResetNumUses();
    BC7CompressionMode::MaxAnnealingIterations = min(BC7CompressionMode::kMaxAnnealingIterations, GetQualityLevel());

    // Walk the image in 4x4 blocks; each block compresses to 16 bytes.
    for(int j = 0; j < height; j += 4, inBuf += width * 4 * 4)
    {
      for(int i = 0; i < width; i += 4)
      {
        ExtractBlock(inBuf + i * 4, width, block);
        CompressBC7Block(block, outBuf);

        BC7CompressionMode::NumUses[gBestMode]++;

        outBuf += 16;
      }
    }
  }

  // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter
  // specifies the size of the image in pixels.
  static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock)
  {
    // Copy one 4-pixel (16-byte) row at a time; width*4 is the image stride.
    for(int j = 0; j < 4; j++)
    {
      memcpy(&colorBlock[j * 4], inPtr, 4 * 4);
      inPtr += width * 4;
    }
  }

  // Tries the two-subset modes (1, 3 and — for non-opaque blocks — 7) on the
  // given shape, writes the best 16-byte block to outBuf, records the winner
  // in gModeChosen, and returns its error.  Early-outs on a perfect match.
  static double CompressTwoClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {

    uint8 tempBuf1[16];
    BitStream tmpStream1(tempBuf1, 128, 0);
    BC7CompressionMode compressor1(1, opaque);

    double bestError = compressor1.Compress(tmpStream1, shapeIdx, clusters);
    memcpy(outBuf, tempBuf1, 16);
    gModeChosen = 1;
    if(bestError == 0.0) {
      return 0.0;
    }

    uint8 tempBuf3[16];
    BitStream tmpStream3(tempBuf3, 128, 0);
    BC7CompressionMode compressor3(3, opaque);

    double error;
    if((error = compressor3.Compress(tmpStream3, shapeIdx, clusters)) < bestError) {
      gModeChosen = 3;
      bestError = error;
      memcpy(outBuf, tempBuf3, 16);
      if(bestError == 0.0) {
        return 0.0;
      }
    }

    // Mode 3 offers more precision for RGB data. Mode 7 is really only if we have alpha.
    if(!opaque)
    {
      uint8 tempBuf7[16];
      BitStream tmpStream7(tempBuf7, 128, 0);
      BC7CompressionMode compressor7(7, opaque);
      if((error = compressor7.Compress(tmpStream7, shapeIdx, clusters)) < bestError) {
        gModeChosen = 7;
        memcpy(outBuf, tempBuf7, 16);
        return error;
      }
    }

    return bestError;
  }

  // Tries the three-subset modes (0 and 2) on the given shape.  Mode 0 only
  // encodes 16 partitions, so it is skipped for shapeIdx >= 16.
  static double CompressThreeClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {

    uint8 tempBuf0[16];
    BitStream tmpStream0(tempBuf0, 128, 0);

    uint8 tempBuf2[16];
    BitStream tmpStream2(tempBuf2, 128, 0);

    BC7CompressionMode compressor0(0, opaque);
    BC7CompressionMode compressor2(2, opaque);

    // NOTE(review): when shapeIdx >= 16, tempBuf0 is never written, yet it is
    // memcpy'd into outBuf below.  In practice mode 2 then always wins (its
    // error is < DBL_MAX) and overwrites outBuf — confirm that invariant.
    double error, bestError = (shapeIdx < 16)? compressor0.Compress(tmpStream0, shapeIdx, clusters) : DBL_MAX;
    gModeChosen = 0;
    memcpy(outBuf, tempBuf0, 16);
    if(bestError == 0.0) {
      return 0.0;
    }

    if((error = compressor2.Compress(tmpStream2, shapeIdx, clusters)) < bestError) {
      gModeChosen = 2;
      memcpy(outBuf, tempBuf2, 16);
      return error;
    }

    return bestError;
  }

  // Splits the 16 points into two clusters according to the two-subset
  // partition table entry for shapeIdx (set bit => cluster 1).
  static void PopulateTwoClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
    const uint16 shape = kShapeMask2[shapeIdx];
    for(int pt = 0; pt < kMaxNumDataPoints; pt++) {

      const RGBAVector &p = points.GetPoint(pt);

      if((1 << pt) & shape)
        clusters[1].AddPoint(p);
      else
        clusters[0].AddPoint(p);
    }

    assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
    assert((clusters[0].GetPointBitString() ^ clusters[1].GetPointBitString()) == 0xFFFF);
    assert((shape & clusters[1].GetPointBitString()) == shape);
  }

  // Splits the 16 points into three clusters using the two-level three-subset
  // partition masks for shapeIdx.
  static void PopulateThreeClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
    for(int pt = 0; pt < kMaxNumDataPoints; pt++) {

      const RGBAVector &p = points.GetPoint(pt);

      if((1 << pt) & kShapeMask3[shapeIdx][0]) {
        if((1 << pt) & kShapeMask3[shapeIdx][1])
          clusters[2].AddPoint(p);
        else
          clusters[1].AddPoint(p);
      }
      else
        clusters[0].AddPoint(p);
    }

    assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
    assert(!(clusters[2].GetPointBitString() & clusters[1].GetPointBitString()));
    assert(!(clusters[0].GetPointBitString() & clusters[2].GetPointBitString()));
  }

  // Cheap error estimate for a cluster under a two-subset mode: quantized
  // error against the bounding-box endpoints with 8 palette entries.  The
  // small 0.0001 bias keeps a non-degenerate cluster from reporting exactly 0.
  static double EstimateTwoClusterError(RGBACluster &c) {
    RGBAVector Min, Max, v;
    c.GetBoundingBox(Min, Max);
    v = Max - Min;
    if(v * v == 0) {
      return 0.0;
    }

    const float *w = BC7C::GetErrorMetric();
    return 0.0001 + c.QuantizedError(Min, Max, 8, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3]));
  }

  // Same as above but with 4 palette entries, matching three-subset modes.
  static double EstimateThreeClusterError(RGBACluster &c) {
    RGBAVector Min, Max, v;
    c.GetBoundingBox(Min, Max);
    v = Max - Min;
    if(v * v == 0) {
      return 0.0;
    }

    const float *w = BC7C::GetErrorMetric();
    return 0.0001 + c.QuantizedError(Min, Max, 4, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3]));
  }

  // Compress a single block: fast paths for constant-color and fully
  // transparent blocks, then shape estimation followed by trying modes
  // 6 (always), 4/5 (non-opaque), the best two-subset shape and — for opaque
  // blocks — the best three-subset shape.  Writes 16 bytes to outBuf and
  // records the winning mode in gBestMode.
  static void CompressBC7Block(const uint32 *block, uint8 *outBuf) {

    // All a single color?
    if(AllOneColor(block)) {
      BitStream bStrm(outBuf, 128, 0);
      CompressOptimalColorBC7(*block, bStrm);
      gBestMode = 5;
      return;
    }

    RGBACluster blockCluster;
    bool opaque = true;
    bool transparent = true;

    for(int i = 0; i < kMaxNumDataPoints; i++) {
      RGBAVector p = RGBAVector(i, block[i]);
      blockCluster.AddPoint(p);
      if(fabs(p.a - 255.0f) > 1e-10)
        opaque = false;

      if(p.a > 0.0f)
        transparent = false;
    }

    // The whole block is transparent?
    if(transparent) {
      BitStream bStrm(outBuf, 128, 0);
      WriteTransparentBlock(bStrm);
      gBestMode = 6;
      return;
    }

    // First we must figure out which shape to use. To do this, simply
    // see which shape has the smallest sum of minimum bounding spheres.
    double bestError[2] = { DBL_MAX, DBL_MAX };
    int bestShapeIdx[2] = { -1, -1 };
    RGBACluster bestClusters[2][3];

    for(int i = 0; i < kNumShapes2; i++)
    {
      RGBACluster clusters[2];
      PopulateTwoClustersForShape(blockCluster, i, clusters);

      double err = 0.0;
      for(int ci = 0; ci < 2; ci++) {
        err += EstimateTwoClusterError(clusters[ci]);
      }

      // If it's small, we'll take it!
      if(err < 1e-9) {
        CompressTwoClusters(i, clusters, outBuf, opaque);
        gBestMode = gModeChosen;
        return;
      }

      if(err < bestError[0]) {
        bestError[0] = err;
        bestShapeIdx[0] = i;
        bestClusters[0][0] = clusters[0];
        bestClusters[0][1] = clusters[1];
      }
    }

    // There are not 3 subset blocks that support alpha, so only check these
    // if the entire block is opaque.
    if(opaque) {
      for(int i = 0; i < kNumShapes3; i++) {

        RGBACluster clusters[3];
        PopulateThreeClustersForShape(blockCluster, i, clusters);

        double err = 0.0;
        for(int ci = 0; ci < 3; ci++) {
          err += EstimateThreeClusterError(clusters[ci]);
        }

        // If it's small, we'll take it!
        if(err < 1e-9) {
          CompressThreeClusters(i, clusters, outBuf, opaque);
          gBestMode = gModeChosen;
          return;
        }

        if(err < bestError[1]) {
          bestError[1] = err;
          bestShapeIdx[1] = i;
          bestClusters[1][0] = clusters[0];
          bestClusters[1][1] = clusters[1];
          bestClusters[1][2] = clusters[2];
        }
      }
    }

    uint8 tempBuf1[16], tempBuf2[16];

    // Mode 6 is the single-subset baseline; tempBuf1 always holds the best
    // block found so far, outBuf only receives it at the end (or on早 exit).
    BitStream tempStream1 (tempBuf1, 128, 0);
    BC7CompressionMode compressor(6, opaque);
    double best = compressor.Compress(tempStream1, 0, &blockCluster);
    gBestMode = 6;
    if(best == 0.0f) {
      memcpy(outBuf, tempBuf1, 16);
      return;
    }

    // Check modes 4 and 5 if the block isn't opaque...
    if(!opaque) {
      for(int mode = 4; mode <= 5; mode++) {

        BitStream tempStream2(tempBuf2, 128, 0);
        BC7CompressionMode compressorTry(mode, opaque);

        double error = compressorTry.Compress(tempStream2, 0, &blockCluster);
        if(error < best) {

          gBestMode = mode;
          best = error;

          if(best == 0.0f) {
            memcpy(outBuf, tempBuf2, 16);
            return;
          }
          else {
            memcpy(tempBuf1, tempBuf2, 16);
          }
        }
      }
    }

    double error = CompressTwoClusters(bestShapeIdx[0], bestClusters[0], tempBuf2, opaque);
    if(error < best) {

      gBestMode = gModeChosen;
      best = error;

      if(error == 0.0f) {
        memcpy(outBuf, tempBuf2, 16);
        return;
      }
      else {
        memcpy(tempBuf1, tempBuf2, 16);
      }
    }

    if(opaque) {
      if(CompressThreeClusters(bestShapeIdx[1], bestClusters[1], tempBuf2, opaque) < best) {

        gBestMode = gModeChosen;
        memcpy(outBuf, tempBuf2, 16);

        return;
      }
    }

    memcpy(outBuf, tempBuf1, 16);
  }

  // Decodes a 16-byte BC7 block into 16 RGBA8 pixels.  (Body continues past
  // this chunk.)
  static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]) {

    BitStreamReadOnly strm(block);

    // The mode number is the count of leading zero bits before the first one.
    uint32 mode = 0;
while(!strm.ReadBit()) { + mode++; + } + + const BC7CompressionMode::Attributes *attrs = BC7CompressionMode::GetAttributesForMode(mode); + const uint32 nSubsets = attrs->numSubsets; + + uint32 idxMode = 0; + uint32 rotMode = 0; + uint32 shapeIdx = 0; + if ( nSubsets > 1 ) { + shapeIdx = strm.ReadBits(mode == 0? 4 : 6); + } + else if( attrs->hasRotation ) { + rotMode = strm.ReadBits(2); + if( attrs->hasIdxMode ) + idxMode = strm.ReadBit(); + } + + assert(idxMode < 2); + assert(rotMode < 4); + assert(shapeIdx < ((mode == 0)? 16 : 64)); + + uint32 cp = attrs->colorChannelPrecision; + const uint32 shift = 8 - cp; + + uint8 eps[3][2][4]; + for(uint32 ch = 0; ch < 3; ch++) + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 ep = 0; ep < 2; ep++) + eps[i][ep][ch] = strm.ReadBits(cp) << shift; + + uint32 ap = attrs->alphaChannelPrecision; + const uint32 ash = 8 - ap; + + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 ep = 0; ep < 2; ep++) + eps[i][ep][3] = strm.ReadBits(ap) << ash; + + // Handle pbits + switch(attrs->pbitType) { + case BC7CompressionMode::ePBitType_None: + // Do nothing. + break; + + case BC7CompressionMode::ePBitType_Shared: + + cp += 1; + ap += 1; + + for(uint32 i = 0; i < nSubsets; i++) { + + uint32 pbit = strm.ReadBit(); + + for(uint32 j = 0; j < 2; j++) + for(uint32 ch = 0; ch < kNumColorChannels; ch++) { + const uint32 prec = ch == 3? ap : cp; + eps[i][j][ch] |= pbit << (8-prec); + } + } + break; + + case BC7CompressionMode::ePBitType_NotShared: + + cp += 1; + ap += 1; + + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 j = 0; j < 2; j++) { + + uint32 pbit = strm.ReadBit(); + + for(uint32 ch = 0; ch < kNumColorChannels; ch++) { + const uint32 prec = ch == 3? ap : cp; + eps[i][j][ch] |= pbit << (8-prec); + } + } + break; + } + + // Quantize endpoints... + for(uint32 i = 0; i < nSubsets; i++) + for(uint32 j = 0; j < 2; j++) + for(uint32 ch = 0; ch < kNumColorChannels; ch++) { + const uint32 prec = ch == 3? 
ap : cp; + eps[i][j][ch] |= eps[i][j][ch] >> prec; + } + + // Figure out indices... + uint32 alphaIndices[kMaxNumDataPoints]; + uint32 colorIndices[kMaxNumDataPoints]; + + int nBitsPerAlpha = attrs->numBitsPerAlpha; + int nBitsPerColor = attrs->numBitsPerIndex; + + uint32 idxPrec = attrs->numBitsPerIndex; + for(int i = 0; i < kMaxNumDataPoints; i++) { + uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); + + int idx = 0; + if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) { + idx = strm.ReadBits(idxPrec - 1); + } + else { + idx = strm.ReadBits(idxPrec); + } + colorIndices[i] = idx; + } + + idxPrec = attrs->numBitsPerAlpha; + if(idxPrec == 0) { + memcpy(alphaIndices, colorIndices, sizeof(alphaIndices)); + } + else { + for(int i = 0; i < kMaxNumDataPoints; i++) { + uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); + + int idx = 0; + if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) { + idx = strm.ReadBits(idxPrec - 1); + } + else { + idx = strm.ReadBits(idxPrec); + } + alphaIndices[i] = idx; + } + + if(idxMode) { + for(int i = 0; i < kMaxNumDataPoints; i++) { + swap(alphaIndices[i], colorIndices[i]); + } + + swap(nBitsPerAlpha, nBitsPerColor); + } + } + + assert(strm.GetBitsRead() == 128); + + // Get final colors by interpolating... 
+ for(int i = 0; i < kMaxNumDataPoints; i++) { + + const uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); + uint32 &pixel = outBuf[i]; + + pixel = 0; + for(int ch = 0; ch < 3; ch++) { + uint32 i0 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][0]; + uint32 i1 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][1]; + + const uint8 ip = (((uint32(eps[subset][0][ch]) * i0) + (uint32(eps[subset][1][ch]) * i1) + 32) >> 6) & 0xFF; + pixel |= ip << (8*ch); + } + + if(attrs->alphaChannelPrecision > 0) { + uint32 i0 = kBC7InterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][0]; + uint32 i1 = kBC7InterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][1]; + + const uint8 ip = (((uint32(eps[subset][0][3]) * i0) + (uint32(eps[subset][1][3]) * i1) + 32) >> 6) & 0xFF; + pixel |= ip << 24; + } + else { + pixel |= 0xFF000000; + } + + // Swap colors if necessary... + uint8 *pb = (uint8 *)&pixel; + switch(rotMode) { + default: + case 0: + // Do nothing + break; + + case 1: + swap(pb[0], pb[3]); + break; + + case 2: + swap(pb[1], pb[3]); + break; + + case 3: + swap(pb[2], pb[3]); + break; + } + } + } + + // Convert the image from a BC7 buffer to a RGBA8 buffer + void DecompressImageBC7(const uint8 *inBuf, uint8* outBuf, int width, int height) { + + int blockIdx = 0; + for(int j = 0; j < height; j += 4, outBuf += width * 3 * 4) + { + for(int i = 0; i < width; i += 4) + { + uint32 pixels[16]; + DecompressBC7Block(inBuf + (16*(blockIdx++)), pixels); + + memcpy(outBuf, pixels, 4 * sizeof(uint32)); + memcpy(outBuf + (width * 4), pixels + 4, 4 * sizeof(uint32)); + memcpy(outBuf + 2*(width * 4), pixels + 8, 4 * sizeof(uint32)); + memcpy(outBuf + 3*(width * 4), pixels + 12, 4 * sizeof(uint32)); + outBuf += 16; + } + } + } +} diff --git a/BPTCEncoder/src/BC7CompressorSIMD.cpp b/BPTCEncoder/src/BC7CompressorSIMD.cpp new file mode 100755 index 0000000..ba543d6 --- /dev/null +++ b/BPTCEncoder/src/BC7CompressorSIMD.cpp @@ -0,0 +1,1270 @@ 
+//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "BC7IntTypes.h" +#include "BC7Compressor.h" +#include "BC7CompressionModeSIMD.h" +#include "RGBAEndpointsSIMD.h" +#include "BCLookupTables.h" +#include "BitStream.h" + +#ifdef _MSC_VER +#define ALIGN_SSE __declspec( align(16) ) +#else +#define ALIGN_SSE __attribute__((aligned(16))) +#endif + +static const uint32 kNumShapes2 = 64; +static const uint16 kShapeMask2[kNumShapes2] = { + 0xcccc, 0x8888, 0xeeee, 0xecc8, 0xc880, 0xfeec, 0xfec8, 0xec80, + 0xc800, 0xffec, 0xfe80, 0xe800, 0xffe8, 0xff00, 0xfff0, 0xf000, + 0xf710, 0x008e, 0x7100, 0x08ce, 0x008c, 0x7310, 0x3100, 0x8cce, + 0x088c, 0x3110, 0x6666, 0x366c, 0x17e8, 0x0ff0, 0x718e, 0x399c, + 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, + 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x0660, + 0x0272, 0x04e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, + 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0x0fcc, 0x7744, 0xee22 +}; + +static 
const int kAnchorIdx2[kNumShapes2] = { + 15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15, + 15, 2, 8, 2, 2, 8, 8,15, + 2, 8, 2, 2, 8, 8, 2, 2, + 15,15, 6, 8, 2, 8,15,15, + 2, 8, 2, 2, 2,15,15, 6, + 6, 2, 6, 8,15,15, 2, 2, + 15,15,15,15,15, 2, 2, 15 +}; + +static const uint32 kNumShapes3 = 64; +static const uint16 kShapeMask3[kNumShapes3][2] = { + { 0xfecc, 0xf600 }, { 0xffc8, 0x7300 }, { 0xff90, 0x3310 }, { 0xecce, 0x00ce }, { 0xff00, 0xcc00 }, { 0xcccc, 0xcc00 }, { 0xffcc, 0x00cc }, { 0xffcc, 0x3300 }, + { 0xff00, 0xf000 }, { 0xfff0, 0xf000 }, { 0xfff0, 0xff00 }, { 0xcccc, 0x8888 }, { 0xeeee, 0x8888 }, { 0xeeee, 0xcccc }, { 0xffec, 0xec80 }, { 0x739c, 0x7310 }, + { 0xfec8, 0xc800 }, { 0x39ce, 0x3100 }, { 0xfff0, 0xccc0 }, { 0xfccc, 0x0ccc }, { 0xeeee, 0xee00 }, { 0xff88, 0x7700 }, { 0xeec0, 0xcc00 }, { 0x7730, 0x3300 }, + { 0x0cee, 0x00cc }, { 0xffcc, 0xfc88 }, { 0x6ff6, 0x0660 }, { 0xff60, 0x6600 }, { 0xcbbc, 0xc88c }, { 0xf966, 0xf900 }, { 0xceec, 0x0cc0 }, { 0xff10, 0x7310 }, + { 0xff80, 0xec80 }, { 0xccce, 0x08ce }, { 0xeccc, 0xec80 }, { 0x6666, 0x4444 }, { 0x0ff0, 0x0f00 }, { 0x6db6, 0x4924 }, { 0x6bd6, 0x4294 }, { 0xcf3c, 0x0c30 }, + { 0xc3fc, 0x03c0 }, { 0xffaa, 0xff00 }, { 0xff00, 0x5500 }, { 0xfcfc, 0xcccc }, { 0xcccc, 0x0c0c }, { 0xf6f6, 0x6666 }, { 0xaffa, 0x0ff0 }, { 0xfff0, 0x5550 }, + { 0xfaaa, 0xf000 }, { 0xeeee, 0x0e0e }, { 0xf8f8, 0x8888 }, { 0xfff0, 0x9990 }, { 0xeeee, 0xe00e }, { 0x8ff8, 0x8888 }, { 0xf666, 0xf000 }, { 0xff00, 0x9900 }, + { 0xff66, 0xff00 }, { 0xcccc, 0xc00c }, { 0xcffc, 0xcccc }, { 0xf000, 0x9000 }, { 0x8888, 0x0808 }, { 0xfefe, 0xeeee }, { 0xfffa, 0xfff0 }, { 0x7bde, 0x7310 } +}; + +static const uint32 kWMValues[] = { 0x32b92180, 0x32ba3080, 0x31103200, 0x28103c80, 0x32bb3080, 0x25903600, 0x3530b900, 0x3b32b180, 0x34b5b980 }; +static const uint32 kNumWMVals = sizeof(kWMValues) / sizeof(kWMValues[0]); +static uint32 gWMVal = -1; + +static const int kAnchorIdx3[2][kNumShapes3] = { + { 3, 3,15,15, 8, 3,15,15, + 8, 8, 6, 6, 6, 
5, 3, 3, + 3, 3, 8,15, 3, 3, 6,10, + 5, 8, 8, 6, 8, 5,15,15, + 8,15, 3, 5, 6,10, 8,15, + 15, 3,15, 5,15,15,15,15, + 3,15, 5, 5, 5, 8, 5,10, + 5,10, 8,13,15,12, 3, 3 }, + + { 15, 8, 8, 3,15,15, 3, 8, + 15,15,15,15,15,15,15, 8, + 15, 8,15, 3,15, 8,15, 8, + 3,15, 6,10,15,15,10, 8, + 15, 3,15,10,10, 8, 9,10, + 6,15, 8,15, 3, 6, 6, 8, + 15, 3,15,15,15,15,15,15, + 15,15,15,15, 3,15,15, 8 } +}; + +const uint32 kBC7InterpolationValuesScalar[4][16][2] = { + { {64, 0}, {33, 31}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {43, 21}, {21, 43}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0 }, + { {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64} } +}; + +static const ALIGN_SSE uint32 kZeroVector[4] = { 0, 0, 0, 0 }; +const __m128i kBC7InterpolationValuesSIMD[4][16][2] = { + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector)}, + { _mm_set1_epi32(33), _mm_set1_epi32(31) }, + { *((const __m128i *)kZeroVector), _mm_set1_epi32(64) }, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector)}, + { _mm_set1_epi32(43), _mm_set1_epi32(21)}, + { _mm_set1_epi32(21), _mm_set1_epi32(43)}, + { *((const __m128i *)kZeroVector), _mm_set1_epi32(64)}, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector) }, + { _mm_set1_epi32(55), _mm_set1_epi32(9) }, + { _mm_set1_epi32(46), _mm_set1_epi32(18)}, + { _mm_set1_epi32(37), _mm_set1_epi32(27)}, + { _mm_set1_epi32(27), _mm_set1_epi32(37)}, + { _mm_set1_epi32(18), _mm_set1_epi32(46)}, + { _mm_set1_epi32(9), _mm_set1_epi32(55)}, + { *((const __m128i *)kZeroVector), _mm_set1_epi32(64)}, + 0, 0, 0, 0, 0, 0, 0, 0 + }, + { + { _mm_set1_epi32(64), *((const __m128i *)kZeroVector)}, + { _mm_set1_epi32(60), _mm_set1_epi32(4)}, + { 
_mm_set1_epi32(55), _mm_set1_epi32(9)},
+ { _mm_set1_epi32(51), _mm_set1_epi32(13)},
+ { _mm_set1_epi32(47), _mm_set1_epi32(17)},
+ { _mm_set1_epi32(43), _mm_set1_epi32(21)},
+ { _mm_set1_epi32(38), _mm_set1_epi32(26)},
+ { _mm_set1_epi32(34), _mm_set1_epi32(30)},
+ { _mm_set1_epi32(30), _mm_set1_epi32(34)},
+ { _mm_set1_epi32(26), _mm_set1_epi32(38)},
+ { _mm_set1_epi32(21), _mm_set1_epi32(43)},
+ { _mm_set1_epi32(17), _mm_set1_epi32(47)},
+ { _mm_set1_epi32(13), _mm_set1_epi32(51)},
+ { _mm_set1_epi32(9), _mm_set1_epi32(55)},
+ { _mm_set1_epi32(4), _mm_set1_epi32(60)},
+ { *((const __m128i *)kZeroVector), _mm_set1_epi32(64)}
+ } // 4-bit index row: each (i0, i1) weight pair sums to 64, mirroring kBC7InterpolationValuesScalar above.
+};
+
+// Mask selecting the low byte of each 32-bit lane.
+static const ALIGN_SSE uint32 kByteValMask[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
+// Per-lane absolute difference of unsigned bytes (|a - b| via max/min + saturating
+// subtract), masked so only the low byte of each 32-bit lane survives.
+static inline __m128i sad(const __m128i &a, const __m128i &b) {
+ const __m128i maxab = _mm_max_epu8(a, b);
+ const __m128i minab = _mm_min_epu8(a, b);
+ return _mm_and_si128( *((const __m128i *)kByteValMask), _mm_subs_epu8( maxab, minab ) );
+}
+
+// NOTE(review): the header names on the #include lines below were lost when this
+// text was extracted (angle-bracket content stripped) — restore from upstream.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef max
+template
+// NOTE(review): the template parameter list (e.g. typename T) was stripped from this text.
+static T max(const T &a, const T &b) {
+ return (a > b)? a : b;
+}
+#endif
+
+#ifndef min
+template
+static T min(const T &a, const T &b) {
+ return (a < b)? a : b;
+}
+#endif
+
+int BC7CompressionModeSIMD::MaxAnnealingIterations = 50; // This is a setting. Upper bound on the annealing loop in OptimizeEndpointsForCluster.
+int BC7CompressionModeSIMD::NumUses[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; // Per-mode counters; presumably usage statistics — use not visible here, TODO confirm.
+
+// Attribute row for each BC7 mode number (index == mode). Column meanings are
+// defined by the Attributes struct in BC7CompressionModeSIMD.h; from the accessors
+// used below they include partition bits, subset count, index bits, per-channel
+// precision, and the pbit type. NOTE(review): confirm exact column order against the header.
+BC7CompressionModeSIMD::Attributes BC7CompressionModeSIMD::kModeAttributes[kNumModes] = {
+ { 0, 4, 3, 3, 4, 4, 4, 0, BC7CompressionModeSIMD::ePBitType_NotShared },
+ { 1, 6, 2, 3, 6, 6, 6, 0, BC7CompressionModeSIMD::ePBitType_Shared },
+ { 2, 6, 3, 2, 5, 5, 5, 0, BC7CompressionModeSIMD::ePBitType_None },
+ { 3, 6, 2, 2, 7, 7, 7, 0, BC7CompressionModeSIMD::ePBitType_NotShared },
+ { 0 }, // Mode 4 not supported
+ { 0 }, // Mode 5 not supported
+ { 6, 0, 1, 4, 7, 7, 7, 7, BC7CompressionModeSIMD::ePBitType_NotShared },
+ { 7, 6, 2, 2, 5, 5, 5, 5, BC7CompressionModeSIMD::ePBitType_NotShared },
+};
+
+// Snaps endpoints p1/p2 onto this mode's representable (quantized) color grid,
+// trying every pbit combination and keeping the one whose round-tripped
+// endpoints are closest to the originals; the winner is returned through
+// bestPBitCombo. (Search body continues past this chunk.)
+void BC7CompressionModeSIMD::ClampEndpointsToGrid(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPBitCombo) const {
+ const int nPbitCombos = GetNumPbitCombos();
+ const bool hasPbits = nPbitCombos > 1;
+ __m128i qmask;
+ GetQuantizationMask(qmask);
+
+ ClampEndpoints(p1, p2);
+
+ // !SPEED! This can be faster. We're searching through all possible
+ // pBit combos to find the best one. Instead, we should be seeing what
+ // the pBit type is for this compression mode and finding the closest
+ // quantization.
+ float minDist = FLT_MAX; + RGBAVectorSIMD bp1, bp2; + for(int i = 0; i < nPbitCombos; i++) { + + __m128i qp1, qp2; + if(hasPbits) { + qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]); + qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]); + } + else { + qp1 = p1.ToPixel(qmask); + qp2 = p2.ToPixel(qmask); + } + + RGBAVectorSIMD np1 = RGBAVectorSIMD( _mm_cvtepi32_ps( qp1 ) ); + RGBAVectorSIMD np2 = RGBAVectorSIMD( _mm_cvtepi32_ps( qp2 ) ); + + RGBAVectorSIMD d1 = np1 - p1; + RGBAVectorSIMD d2 = np2 - p2; + float dist = (d1 * d1) + (d2 * d2); + if(dist < minDist) { + minDist = dist; + bp1 = np1; bp2 = np2; + bestPBitCombo = i; + } + } + + p1 = bp1; + p2 = bp2; +} + +int BC7CompressionModeSIMD::GetSubsetForIndex(int idx, const int shapeIdx) const { + int subset = 0; + + const int nSubsets = GetNumberOfSubsets(); + switch(nSubsets) { + case 2: + { + subset = !!((1 << idx) & kShapeMask2[shapeIdx]); + } + break; + + case 3: + { + if(1 << idx & kShapeMask3[shapeIdx][0]) + subset = 1 + !!((1 << idx) & kShapeMask3[shapeIdx][1]); + else + subset = 0; + } + break; + + default: + break; + } + + return subset; +} + +int BC7CompressionModeSIMD::GetAnchorIndexForSubset(int subset, const int shapeIdx) const { + + const int nSubsets = GetNumberOfSubsets(); + int anchorIdx = 0; + + switch(subset) { + case 1: + { + if(nSubsets == 2) { + anchorIdx = kAnchorIdx2[shapeIdx]; + } + else { + anchorIdx = kAnchorIdx3[0][shapeIdx]; + } + } + break; + + case 2: + { + assert(nSubsets == 3); + anchorIdx = kAnchorIdx3[1][shapeIdx]; + } + break; + + default: + break; + } + + return anchorIdx; +} + +double BC7CompressionModeSIMD::CompressSingleColor(const RGBAVectorSIMD &p, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, int &bestPbitCombo) const { + + // Our pixel to compress... 
+ const __m128i pixel = p.ToPixel(*((const __m128i *)kByteValMask)); + + uint32 bestDist = 0xFF; + bestPbitCombo = -1; + + for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) { + + const int *pbitCombo = GetPBitCombo(pbi); + + uint32 dist = 0x0; + uint32 bestValI[kNumColorChannels] = { -1, -1, -1, -1 }; + uint32 bestValJ[kNumColorChannels] = { -1, -1, -1, -1 }; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + + const uint8 val = ((uint8 *)(&pixel))[4*ci]; + int nBits = 0; + switch(ci) { + case 0: nBits = GetRedChannelPrecision(); break; + case 1: nBits = GetGreenChannelPrecision(); break; + case 2: nBits = GetBlueChannelPrecision(); break; + case 3: nBits = GetAlphaChannelPrecision(); break; + } + + // If we don't handle this channel, then we don't need to + // worry about how well we interpolate. + if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; continue; } + + const int nPossVals = (1 << nBits); + int possValsH[256]; + int possValsL[256]; + + // Do we have a pbit? + const bool havepbit = GetPBitType() != ePBitType_None; + if(havepbit) + nBits++; + + for(int i = 0; i < nPossVals; i++) { + + int vh = i, vl = i; + if(havepbit) { + vh <<= 1; + vl <<= 1; + + vh |= pbitCombo[1]; + vl |= pbitCombo[0]; + } + + possValsH[i] = (vh << (8 - nBits)); + possValsH[i] |= (possValsH[i] >> nBits); + + possValsL[i] = (vl << (8 - nBits)); + possValsL[i] |= (possValsL[i] >> nBits); + } + + const uint32 interpVal0 = kBC7InterpolationValuesScalar[GetNumberOfBitsPerIndex() - 1][1][0]; + const uint32 interpVal1 = kBC7InterpolationValuesScalar[GetNumberOfBitsPerIndex() - 1][1][1]; + + // Find the closest interpolated val that to the given val... + uint32 bestChannelDist = 0xFF; + for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++) + for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) { + + const uint32 v1 = possValsL[i]; + const uint32 v2 = possValsH[j]; + + const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6; + const uint32 err = (combo > val)? 
combo - val : val - combo; + + if(err < bestChannelDist) { + bestChannelDist = err; + bestValI[ci] = v1; + bestValJ[ci] = v2; + } + } + + dist = max(bestChannelDist, dist); + } + + if(dist < bestDist) { + bestDist = dist; + bestPbitCombo = pbi; + + for(int ci = 0; ci < kNumColorChannels; ci++) { + p1.c[ci] = float(bestValI[ci]); + p2.c[ci] = float(bestValJ[ci]); + } + } + } + + return bestDist; +} + +static const ALIGN_SSE uint32 kOneVec[4] = { 1, 1, 1, 1 }; + +// Fast random number generator. See more information at +// http://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/ +static uint32 g_seed = uint32(time(NULL)); +static inline uint32 fastrand() { + g_seed = (214013 * g_seed + 2531011); + return (g_seed>>16) & RAND_MAX; +} + +static __m128i cur_seed = _mm_set1_epi32( int(time(NULL)) ); +static inline __m128i rand_dir() +{ + // static const __m128i mult = _mm_set_epi32( 214013, 17405, 214013, 69069 ); + // static const __m128i gadd = _mm_set_epi32( 2531011, 10395331, 13737667, 1 ); + static const ALIGN_SSE uint32 mult[4] = { 214013, 17405, 214013, 0 }; + static const ALIGN_SSE uint32 gadd[4] = { 2531011, 10395331, 13737667, 0 }; + static const ALIGN_SSE uint32 masklo[4] = { RAND_MAX, RAND_MAX, RAND_MAX, RAND_MAX }; + + cur_seed = _mm_mullo_epi32( *((const __m128i *)mult), cur_seed ); + cur_seed = _mm_add_epi32( *((const __m128i *)gadd), cur_seed ); + + const __m128i resShift = _mm_srai_epi32( cur_seed, 16 ); + const __m128i result = _mm_and_si128( resShift, *((const __m128i *)kOneVec) ); + + return result; +} + +// Fast generation of floats between 0 and 1. It generates a float +// whose exponent forces the value to be between 1 and 2, then it +// populates the mantissa with a random assortment of bits, and returns +// the bytes interpreted as a float. This prevents two things: 1, a +// division, and 2, a cast from an integer to a float. 
+
+#define COMPILE_ASSERT(x) extern int __compile_assert_[(int)(x)];
+COMPILE_ASSERT(RAND_MAX == 0x7FFF) // frand() below depends on a 15-bit RAND_MAX. NOTE(review): true for MSVC, but glibc uses 0x7FFFFFFF, so this fails to compile there.
+
+// Returns a random float in [0, 1): builds the bits of a float in [1, 2)
+// (fixed exponent 127, random mantissa) and subtracts 1 — avoiding both a
+// divide and an int-to-float conversion (see comment preceding this chunk).
+static inline float frand() {
+ const uint16 r = fastrand();
+
+ // RAND_MAX is 0x7FFF, which offers 15 bits
+ // of precision. Therefore, we move the bits
+ // into the top of the 23 bit mantissa, and
+ // repeat the most significant bits of r in
+ // the least significant of the mantissa
+ const uint32 m = (r << 8) | (r >> 7);
+ const uint32 flt = (127 << 23) | m; // NOTE(review): reinterpret_cast below lost its type argument (angle-bracket content stripped from this text).
+ return *(reinterpret_cast(&flt)) - 1.0f;
+}
+
+static const ALIGN_SSE uint32 kSevenVec[4] = { 7, 7, 7, 7 };
+static const ALIGN_SSE uint32 kNegOneVec[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+static const ALIGN_SSE uint32 kFloatSignBit[4] = { 0x40000000, 0x40000000, 0x40000000, 0x40000000 }; // NOTE(review): despite the name, 0x40000000 is the top exponent bit, not the IEEE-754 sign bit (0x80000000) — confirm intent where OptimizeEndpointsForCluster builds step sizes.
+
+// Nudges each channel of v by +/- stepVec, per-channel direction chosen
+// randomly by rand_dir() (blend picks stepVec or its negation per lane).
+static void ChangePointForDirWithoutPbitChange(RGBAVectorSIMD &v, const __m128 &stepVec) {
+
+ const __m128i dirBool = rand_dir();
+ const __m128i cmp = _mm_cmpeq_epi32( dirBool, *((const __m128i *)kZeroVector) );
+
+ const __m128 negStepVec = _mm_sub_ps( _mm_castsi128_ps( *((const __m128i *)kZeroVector) ), stepVec );
+ const __m128 step = _mm_blendv_ps( negStepVec, stepVec, _mm_castsi128_ps( cmp ) );
+ v.vec = _mm_add_ps( v.vec, step );
+}
+
+// Like the function above, but biases the random step direction using the old
+// pbit: lanes may only step down when the pbit was 0 (shouldDec) or up when it
+// was 1 (shouldInc), so the move cooperates with the simultaneous pbit flip.
+static void ChangePointForDirWithPbitChange(RGBAVectorSIMD &v, int oldPbit, const __m128 &stepVec) {
+
+ const __m128i pBitVec = _mm_set1_epi32( oldPbit );
+ const __m128i cmpPBit = _mm_cmpeq_epi32( pBitVec, *((const __m128i *)kZeroVector) );
+ const __m128i notCmpPBit = _mm_xor_si128( cmpPBit, *((const __m128i *)kNegOneVec) );
+
+ const __m128i dirBool = rand_dir();
+ const __m128i cmpDir = _mm_cmpeq_epi32( dirBool, *((const __m128i *)kOneVec) );
+ const __m128i notCmpDir = _mm_xor_si128( cmpDir, *((const __m128i *)kNegOneVec) );
+
+ const __m128i shouldDec = _mm_and_si128( cmpDir, cmpPBit );
+ const __m128i shouldInc = _mm_and_si128( notCmpDir, notCmpPBit );
+
+ const __m128 decStep = _mm_blendv_ps( _mm_castsi128_ps( *((const __m128i *)kZeroVector)
), stepVec, _mm_castsi128_ps( shouldDec ) ); + v.vec = _mm_sub_ps( v.vec, decStep ); + + const __m128 incStep = _mm_blendv_ps( _mm_castsi128_ps( *((const __m128i *)kZeroVector) ), stepVec, _mm_castsi128_ps( shouldInc ) ); + v.vec = _mm_add_ps( v.vec, incStep ); +} + +void BC7CompressionModeSIMD::PickBestNeighboringEndpoints(const RGBAClusterSIMD &cluster, const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, const int curPbitCombo, RGBAVectorSIMD &np1, RGBAVectorSIMD &np2, int &nPbitCombo, const __m128 &stepVec) const { + + np1 = p1; + np2 = p2; + + // First, let's figure out the new pbit combo... if there's no pbit then we don't need + // to worry about it. + const EPBitType pBitType = GetPBitType(); + if(pBitType != ePBitType_None) { + + // If there is a pbit, then we must change it, because those will provide the closest values + // to the current point. + if(pBitType == ePBitType_Shared) + nPbitCombo = (curPbitCombo + 1) % 2; + else { + // Not shared... p1 needs to change and p2 needs to change... which means that + // combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 and vice + // versa... 
+ nPbitCombo = 3 - curPbitCombo; // flip both pbits: combo 0 <-> 3, combo 1 <-> 2
+ }
+
+ assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1);
+ assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1);
+
+ const int *pBitCombo = GetPBitCombo(curPbitCombo);
+ ChangePointForDirWithPbitChange(np1, pBitCombo[0], stepVec);
+ ChangePointForDirWithPbitChange(np2, pBitCombo[1], stepVec);
+ }
+ else {
+ ChangePointForDirWithoutPbitChange(np1, stepVec);
+ ChangePointForDirWithoutPbitChange(np2, stepVec);
+ }
+
+ ClampEndpoints(np1, np2);
+}
+
+// Metropolis-style acceptance test for simulated annealing: an improvement
+// (newError < oldError) gives p > 1 and is always accepted; a regression is
+// accepted with probability exp(0.15 * (old - new) / temp).
+// NOTE(review): callers pass temp = energy/(maxEnergy-1), which STARTS at 0
+// (division yields +/-Inf; exp then gives Inf or 0, so behavior relies on IEEE
+// semantics) and GROWS over time — the inverse of a conventional cooling
+// schedule. Confirm this is intended.
+bool BC7CompressionModeSIMD::AcceptNewEndpointError(float newError, float oldError, float temp) const {
+
+ const float p = exp((0.15f * (oldError - newError)) / temp);
+ // const double r = (double(rand()) / double(RAND_MAX));
+ const float r = frand();
+
+ return r < p;
+}
+
+// Refines the endpoints p1/p2 (and pbit combo / per-pixel indices) for one
+// cluster by simulated annealing, starting from the caller's initial guess;
+// returns the best quantized error found. Body continues past this chunk.
+double BC7CompressionModeSIMD::OptimizeEndpointsForCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const {
+
+ const int nBuckets = (1 << GetNumberOfBitsPerIndex());
+ const int nPbitCombos = GetNumPbitCombos();
+ __m128i qmask;
+ GetQuantizationMask(qmask);
+
+ // Here we use simulated annealing to traverse the space of clusters to find the best possible endpoints.
+ float curError = cluster.QuantizedError(p1, p2, nBuckets, qmask, GetPBitCombo(bestPbitCombo), bestIndices); + int curPbitCombo = bestPbitCombo; + float bestError = curError; + RGBAVectorSIMD bp1 = p1, bp2 = p2; + + assert(curError == cluster.QuantizedError(p1, p2, nBuckets, qmask, GetPBitCombo(bestPbitCombo))); + + __m128i precVec = _mm_setr_epi32( GetRedChannelPrecision(), GetGreenChannelPrecision(), GetBlueChannelPrecision(), GetAlphaChannelPrecision() ); + const __m128i precMask = _mm_xor_si128( _mm_cmpeq_epi32( precVec, *((const __m128i *)kZeroVector) ), *((const __m128i *)kNegOneVec) ); + precVec = _mm_sub_epi32( *((const __m128i *)kSevenVec), precVec ); + precVec = _mm_slli_epi32( precVec, 23 ); + precVec = _mm_or_si128( precVec, *((const __m128i *)kFloatSignBit) ); + + //__m128 stepSzVec = _mm_set1_ps(1.0f); + //__m128 stepVec = _mm_mul_ps( stepSzVec, _mm_castsi128_ps( _mm_and_si128( precMask, precVec ) ) ); + __m128 stepVec = _mm_castsi128_ps( _mm_and_si128( precMask, precVec ) ); + + const int maxEnergy = MaxAnnealingIterations; + for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) { + + float temp = float(energy) / float(maxEnergy-1); + + __m128i indices[kMaxNumDataPoints/4]; + RGBAVectorSIMD np1, np2; + int nPbitCombo; + + PickBestNeighboringEndpoints(cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, stepVec); + + float error = cluster.QuantizedError(np1, np2, nBuckets, qmask, GetPBitCombo(nPbitCombo), indices); + if(AcceptNewEndpointError(error, curError, temp)) { + curError = error; + p1 = np1; + p2 = np2; + curPbitCombo = nPbitCombo; + } + + if(error < bestError) { + memcpy(bestIndices, indices, sizeof(indices)); + bp1 = np1; + bp2 = np2; + bestPbitCombo = nPbitCombo; + bestError = error; + + // Restart... 
+ energy = 0; + } + } + + p1 = bp1; + p2 = bp2; + + return bestError; +} + +double BC7CompressionModeSIMD::CompressCluster(const RGBAClusterSIMD &cluster, RGBAVectorSIMD &p1, RGBAVectorSIMD &p2, __m128i *bestIndices, int &bestPbitCombo) const { + + // If all the points are the same in the cluster, then we need to figure out what the best + // approximation to this point is.... + if(cluster.AllSamePoint()) { + const RGBAVectorSIMD &p = cluster.GetPoint(0); + double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo); + + // We're assuming all indices will be index 1... + for(int i = 0; i < 4; i++) { + bestIndices[i] = _mm_set1_epi32(1); + } + + return bestErr; + } + + const int nBuckets = (1 << GetNumberOfBitsPerIndex()); + const int nPbitCombos = GetNumPbitCombos(); + + RGBAVectorSIMD avg = cluster.GetTotal() / float(cluster.GetNumPoints()); + RGBADirSIMD axis; + ::GetPrincipalAxis(cluster, axis); + + float mindp = FLT_MAX, maxdp = -FLT_MAX; + for(int i = 0 ; i < cluster.GetNumPoints(); i++) { + float dp = (cluster.GetPoint(i) - avg) * axis; + if(dp < mindp) mindp = dp; + if(dp > maxdp) maxdp = dp; + } + + RGBAVectorSIMD pts[1 << 4]; // At most 4 bits per index. + float numPts[1<<4]; + assert(nBuckets <= 1 << 4); + + p1 = avg + mindp * axis; + p2 = avg + maxdp * axis; + + ClampEndpoints(p1, p2); + + for(int i = 0; i < nBuckets; i++) { + float s = (float(i) / float(nBuckets - 1)); + pts[i] = (1.0f - s) * p1 + s * p2; + } + + assert(pts[0] == p1); + assert(pts[nBuckets - 1] == p2); + + // Do k-means clustering... + int bucketIdx[kMaxNumDataPoints]; + + bool fixed = false; + while(!fixed) { + + RGBAVectorSIMD newPts[1 << 4]; + + // Assign each of the existing points to one of the buckets... 
+ for(int i = 0; i < cluster.GetNumPoints(); i++) { + + int minBucket = -1; + float minDist = FLT_MAX; + for(int j = 0; j < nBuckets; j++) { + RGBAVectorSIMD v = cluster.GetPoint(i) - pts[j]; + float distSq = v * v; + if(distSq < minDist) + { + minDist = distSq; + minBucket = j; + } + } + + assert(minBucket >= 0); + bucketIdx[i] = minBucket; + } + + // Calculate new buckets based on centroids of clusters... + for(int i = 0; i < nBuckets; i++) { + + numPts[i] = 0.0f; + newPts[i] = RGBAVectorSIMD(0.0f); + for(int j = 0; j < cluster.GetNumPoints(); j++) { + if(bucketIdx[j] == i) { + numPts[i] += 1.0f; + newPts[i] += cluster.GetPoint(j); + } + } + + // If there are no points in this cluster, then it should + // remain the same as last time and avoid a divide by zero. + if(0.0f != numPts[i]) + newPts[i] /= numPts[i]; + } + + // If we haven't changed, then we're done. + fixed = true; + for(int i = 0; i < nBuckets; i++) { + if(pts[i] != newPts[i]) + fixed = false; + } + + // Assign the new points to be the old points. + for(int i = 0; i < nBuckets; i++) { + pts[i] = newPts[i]; + } + } + + // If there's only one bucket filled, then just compress for that single color... + int numBucketsFilled = 0, lastFilledBucket = -1; + for(int i = 0; i < nBuckets; i++) { + if(numPts[i] > 0.0f) { + numBucketsFilled++; + lastFilledBucket = i; + } + } + + assert(numBucketsFilled > 0); + if(1 == numBucketsFilled) { + const RGBAVectorSIMD &p = pts[lastFilledBucket]; + double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo); + + // We're assuming all indices will be index 1... + for(int i = 0; i < 4; i++) { + bestIndices[i] = _mm_set1_epi32(1); + } + + return bestErr; + } + + // Now that we know the index of each pixel, we can assign the endpoints based on a least squares fit + // of the clusters. 
For more information, take a look at this article by NVidia: + // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf + float asq = 0.0, bsq = 0.0, ab = 0.0; + RGBAVectorSIMD ax(0.0f), bx(0.0f); + for(int i = 0; i < nBuckets; i++) { + float a = float(nBuckets - 1 - i) / float(nBuckets - 1); + float b = float(i) / float(nBuckets - 1); + + float n = numPts[i]; + RGBAVectorSIMD x = pts[i]; + + asq += n * a * a; + bsq += n * b * b; + ab += n * a * b; + + ax += x * a * n; + bx += x * b * n; + } + + float f = 1.0f / (asq * bsq - ab * ab); + p1 = f * (ax * bsq - bx * ab); + p2 = f * (bx * asq - ax * ab); + + ClampEndpointsToGrid(p1, p2, bestPbitCombo); + + #ifdef _DEBUG + int pBitCombo = bestPbitCombo; + RGBAVectorSIMD tp1 = p1, tp2 = p2; + ClampEndpointsToGrid(tp1, tp2, pBitCombo); + + assert(p1 == tp1); + assert(p2 == tp2); + assert(pBitCombo == bestPbitCombo); + #endif + + assert(bestPbitCombo >= 0); + + return OptimizeEndpointsForCluster(cluster, p1, p2, bestIndices, bestPbitCombo); +} + +double BC7CompressionModeSIMD::Compress(BitStream &stream, const int shapeIdx, const RGBAClusterSIMD *clusters) const { + + const int kModeNumber = GetModeNumber(); + const int nPartitionBits = GetNumberOfPartitionBits(); + const int nSubsets = GetNumberOfSubsets(); + + // Mode # + stream.WriteBits(1 << kModeNumber, kModeNumber + 1); + + // Partition # + assert((((1 << nPartitionBits) - 1) & shapeIdx) == shapeIdx); + stream.WriteBits(shapeIdx, nPartitionBits); + + RGBAVectorSIMD p1[kMaxNumSubsets], p2[kMaxNumSubsets]; + int bestIndices[kMaxNumSubsets][kMaxNumDataPoints] = { + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } + }; + int bestPbitCombo[kMaxNumSubsets] = { -1, -1, -1 }; + + double totalErr = 0.0; + for(int cidx = 0; cidx < nSubsets; cidx++) { + ALIGN_SSE int 
indices[kMaxNumDataPoints]; + + // Compress this cluster + totalErr += CompressCluster(clusters[cidx], p1[cidx], p2[cidx], (__m128i *)indices, bestPbitCombo[cidx]); + + // !SPEED! We can precompute the subsets for each index based on the shape. This + // isn't the bottleneck for the compressor, but it could prove to be a little + // faster... + + // Map the indices to their proper position. + int idx = 0; + for(int i = 0; i < 16; i++) { + int subs = GetSubsetForIndex(i, shapeIdx); + if(subs == cidx) { + bestIndices[cidx][i] = indices[idx++]; + } + } + } + +#ifdef _DEBUG + for(int i = 0; i < kMaxNumDataPoints; i++) { + + int nSet = 0; + for(int j = 0; j < nSubsets; j++) { + if(bestIndices[j][i] >= 0) + nSet++; + } + + assert(nSet == 1); + } +#endif + + // Get the quantization mask + __m128i qmask; + GetQuantizationMask(qmask); + + //Quantize the points... + __m128i pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets]; + for(int i = 0; i < nSubsets; i++) { + switch(GetPBitType()) { + default: + case ePBitType_None: + pixel1[i] = p1[i].ToPixel(qmask); + pixel2[i] = p2[i].ToPixel(qmask); + break; + + case ePBitType_Shared: + case ePBitType_NotShared: + pixel1[i] = p1[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[0]); + pixel2[i] = p2[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[1]); + break; + } + } + + // If the anchor index does not have 0 in the leading bit, then + // we need to swap EVERYTHING. 
+ for(int sidx = 0; sidx < nSubsets; sidx++) { + + int anchorIdx = GetAnchorIndexForSubset(sidx, shapeIdx); + assert(bestIndices[sidx][anchorIdx] != -1); + + int nIndexBits = GetNumberOfBitsPerIndex(); + if(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)) { + __m128i t = pixel1[sidx]; pixel1[sidx] = pixel2[sidx]; pixel2[sidx] = t; + + int nIndexVals = 1 << nIndexBits; + for(int i = 0; i < 16; i++) { + bestIndices[sidx][i] = (nIndexVals - 1) - bestIndices[sidx][i]; + } + } + + assert(!(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1))); + } + + // Get the quantized values... + uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets]; + uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets]; + for(int i = 0; i < nSubsets; i++) { + r1[i] = ((uint8 *)(&(pixel1[i])))[0]; + r2[i] = ((uint8 *)(&(pixel2[i])))[0]; + + g1[i] = ((uint8 *)(&(pixel1[i])))[4]; + g2[i] = ((uint8 *)(&(pixel2[i])))[4]; + + b1[i] = ((uint8 *)(&(pixel1[i])))[8]; + b2[i] = ((uint8 *)(&(pixel2[i])))[8]; + + a1[i] = ((uint8 *)(&(pixel1[i])))[12]; + a2[i] = ((uint8 *)(&(pixel2[i])))[12]; + } + + // Write them out... 
+ const int nRedBits = GetRedChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits); + stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits); + } + + const int nGreenBits = GetGreenChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits); + stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits); + } + + const int nBlueBits = GetBlueChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits); + stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits); + } + + const int nAlphaBits = GetAlphaChannelPrecision(); + for(int i = 0; i < nSubsets; i++) { + stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits); + stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits); + } + + // Write out the best pbits.. + if(GetPBitType() != ePBitType_None) { + for(int s = 0; s < nSubsets; s++) { + const int *pbits = GetPBitCombo(bestPbitCombo[s]); + stream.WriteBits(pbits[0], 1); + if(GetPBitType() != ePBitType_Shared) + stream.WriteBits(pbits[1], 1); + } + } + + for(int i = 0; i < 16; i++) { + const int subs = GetSubsetForIndex(i, shapeIdx); + const int idx = bestIndices[subs][i]; + const int anchorIdx = GetAnchorIndexForSubset(subs, shapeIdx); + const int nBitsForIdx = GetNumberOfBitsPerIndex(); + assert(idx >= 0 && idx < (1 << nBitsForIdx)); + assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!"); + stream.WriteBits(idx, (i == anchorIdx)? 
nBitsForIdx - 1 : nBitsForIdx); + } + + assert(stream.GetBitsWritten() == 128); + return totalErr; +} + +namespace BC7C +{ + static ErrorMetric gErrorMetric = eErrorMetric_Uniform; + void SetErrorMetric(ErrorMetric e) { gErrorMetric = e; } + + ALIGN_SSE const float kErrorMetrics[kNumErrorMetrics][kNumColorChannels] = { + { 1.0f, 1.0f, 1.0f, 1.0f }, + { sqrtf(0.3f), sqrtf(0.56f), sqrtf(0.11f), 1.0f } + }; + + const float *GetErrorMetric() { return kErrorMetrics[GetErrorMetricEnum()]; } + ErrorMetric GetErrorMetricEnum() { return gErrorMetric; } + + // Function prototypes + static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock); + static void CompressBC7Block(const uint32 *block, uint8 *outBuf); + + // Returns true if the entire block is a single color. + static bool AllOneColor(const uint32 block[16]) { + const uint32 pixel = block[0]; + for(int i = 1; i < 16; i++) { + if( block[i] != pixel ) + return false; + } + + return true; + } + + // Write out a transparent block. + static void WriteTransparentBlock(BitStream &stream) { + // Use mode 6 + stream.WriteBits(1 << 6, 7); + stream.WriteBits(0, 128-7); + assert(stream.GetBitsWritten() == 128); + } + + // Compresses a single color optimally and outputs the result. + static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) { + + stream.WriteBits(1 << 5, 6); // Mode 5 + stream.WriteBits(0, 2); // No rotation bits. + + uint8 r = pixel & 0xFF; + uint8 g = (pixel >> 8) & 0xFF; + uint8 b = (pixel >> 16) & 0xFF; + uint8 a = (pixel >> 24) & 0xFF; + + // Red endpoints + stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7); + stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7); + + // Green endpoints + stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7); + stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7); + + // Blue endpoints + stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7); + stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7); + + // Alpha endpoints... are just the same. 
+ stream.WriteBits(a, 8); + stream.WriteBits(a, 8); + + // Color indices are 1 for each pixel... + // Anchor index is 0, so 1 bit for the first pixel, then + // 01 for each following pixel giving the sequence of 31 bits: + // ...010101011 + stream.WriteBits(0xaaaaaaab, 31); + + // Alpha indices... + stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31); + } + + // Compress an image using BC7 compression. Use the inBuf parameter to point to an image in + // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels. + // The buffer pointed to by outBuf should be large enough to store the compressed image. This + // implementation has an 4:1 compression ratio. + void CompressImageBC7SIMD(const uint8* inBuf, uint8* outBuf, int width, int height) + { + ALIGN_SSE uint32 block[16]; + + _MM_SET_ROUNDING_MODE( _MM_ROUND_TOWARD_ZERO ); + BC7CompressionModeSIMD::ResetNumUses(); + + BC7CompressionModeSIMD::MaxAnnealingIterations = GetQualityLevel(); + + for(int j = 0; j < height; j += 4, inBuf += width * 4 * 4) + { + for(int i = 0; i < width; i += 4) + { + ExtractBlock(inBuf + i * 4, width, block); + CompressBC7Block(block, outBuf); + outBuf += 16; + } + } + } + + // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter + // specifies the size of the image in pixels. + static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock) + { + // Compute the stride. + const int stride = width * 4; + + // Copy the first row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)colorBlock, _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the second row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 4), _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the third row of pixels from inPtr into colorBlock. 
+ _mm_store_si128((__m128i*)(colorBlock + 8), _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the forth row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 12), _mm_load_si128((__m128i*)inPtr)); + } + + static double CompressTwoClusters(int shapeIdx, const RGBAClusterSIMD *clusters, uint8 *outBuf, double estimatedError) { + + uint8 tempBuf1[16]; + BitStream tmpStream1(tempBuf1, 128, 0); + BC7CompressionModeSIMD compressor1(1, estimatedError); + + double bestError = compressor1.Compress(tmpStream1, shapeIdx, clusters); + memcpy(outBuf, tempBuf1, 16); + if(bestError == 0.0) { + return 0.0; + } + + uint8 tempBuf3[16]; + BitStream tmpStream3(tempBuf3, 128, 0); + BC7CompressionModeSIMD compressor3(3, estimatedError); + + double error; + if((error = compressor3.Compress(tmpStream3, shapeIdx, clusters)) < bestError) { + bestError = error; + memcpy(outBuf, tempBuf3, 16); + if(bestError == 0.0) { + return 0.0; + } + } + + // Mode 3 offers more precision for RGB data. Mode 7 is really only if we have alpha. + //uint8 tempBuf7[16]; + //BitStream tmpStream7(tempBuf7, 128, 0); + //BC7CompressionModeSIMD compressor7(7, estimatedError); + //if((error = compressor7.Compress(tmpStream7, shapeIdx, clusters)) < bestError) { + // memcpy(outBuf, tempBuf7, 16); + // return error; + //} + + return bestError; + } + + static double CompressThreeClusters(int shapeIdx, const RGBAClusterSIMD *clusters, uint8 *outBuf, double estimatedError) { + + uint8 tempBuf0[16]; + BitStream tmpStream0(tempBuf0, 128, 0); + + uint8 tempBuf2[16]; + BitStream tmpStream2(tempBuf2, 128, 0); + + BC7CompressionModeSIMD compressor0(0, estimatedError); + BC7CompressionModeSIMD compressor2(2, estimatedError); + + double error, bestError = (shapeIdx < 16)? 
compressor0.Compress(tmpStream0, shapeIdx, clusters) : DBL_MAX; + memcpy(outBuf, tempBuf0, 16); + if(bestError == 0.0) { + return 0.0; + } + + if((error = compressor2.Compress(tmpStream2, shapeIdx, clusters)) < bestError) { + memcpy(outBuf, tempBuf2, 16); + return error; + } + + return bestError; + } + + static void PopulateTwoClustersForShape(const RGBAClusterSIMD &points, int shapeIdx, RGBAClusterSIMD *clusters) { + const uint16 shape = kShapeMask2[shapeIdx]; + for(int pt = 0; pt < kMaxNumDataPoints; pt++) { + + const RGBAVectorSIMD &p = points.GetPoint(pt); + + if((1 << pt) & shape) + clusters[1].AddPoint(p, pt); + else + clusters[0].AddPoint(p, pt); + } + + assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString())); + assert((clusters[0].GetPointBitString() ^ clusters[1].GetPointBitString()) == 0xFFFF); + assert((shape & clusters[1].GetPointBitString()) == shape); + } + + static void PopulateThreeClustersForShape(const RGBAClusterSIMD &points, int shapeIdx, RGBAClusterSIMD *clusters) { + for(int pt = 0; pt < kMaxNumDataPoints; pt++) { + + const RGBAVectorSIMD &p = points.GetPoint(pt); + + if((1 << pt) & kShapeMask3[shapeIdx][0]) { + if((1 << pt) & kShapeMask3[shapeIdx][1]) + clusters[2].AddPoint(p, pt); + else + clusters[1].AddPoint(p, pt); + } + else + clusters[0].AddPoint(p, pt); + } + + assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString())); + assert(!(clusters[2].GetPointBitString() & clusters[1].GetPointBitString())); + assert(!(clusters[0].GetPointBitString() & clusters[2].GetPointBitString())); + } + + static double EstimateTwoClusterError(RGBAClusterSIMD &c) { + RGBAVectorSIMD Min, Max, v; + c.GetBoundingBox(Min, Max); + v = Max - Min; + if(v * v == 0) { + return 0.0; + } + + return 0.0001 + c.QuantizedError(Min, Max, 8, _mm_set1_epi32(0xFF)); + } + + static double EstimateThreeClusterError(RGBAClusterSIMD &c) { + RGBAVectorSIMD Min, Max, v; + c.GetBoundingBox(Min, Max); + v = Max - Min; + if(v * v == 0) { + 
return 0.0; + } + + return 0.0001 + c.QuantizedError(Min, Max, 4, _mm_set1_epi32(0xFF)); + } + + // Compress a single block. + void CompressBC7Block(const uint32 *block, uint8 *outBuf) { + + // All a single color? + if(AllOneColor(block)) { + BitStream bStrm(outBuf, 128, 0); + CompressOptimalColorBC7(*((const uint32 *)block), bStrm); + return; + } + + RGBAClusterSIMD blockCluster; + bool opaque = true; + bool transparent = true; + + for(int i = 0; i < kMaxNumDataPoints; i++) { + RGBAVectorSIMD p = RGBAVectorSIMD(block[i]); + blockCluster.AddPoint(p, i); + if(fabs(p.a - 255.0f) > 1e-10) + opaque = false; + + if(p.a > 0.0f) + transparent = false; + } + + // The whole block is transparent? + if(transparent) { + BitStream bStrm(outBuf, 128, 0); + WriteTransparentBlock(bStrm); + return; + } + + // First we must figure out which shape to use. To do this, simply + // see which shape has the smallest sum of minimum bounding spheres. + double bestError[2] = { DBL_MAX, DBL_MAX }; + int bestShapeIdx[2] = { -1, -1 }; + RGBAClusterSIMD bestClusters[2][3]; + + for(int i = 0; i < kNumShapes2; i++) + { + RGBAClusterSIMD clusters[2]; + PopulateTwoClustersForShape(blockCluster, i, clusters); + + double err = 0.0; + for(int ci = 0; ci < 2; ci++) { + err += EstimateTwoClusterError(clusters[ci]); + } + + // If it's small, we'll take it! + if(err < 1e-9) { + CompressTwoClusters(i, clusters, outBuf, err); + return; + } + + if(err < bestError[0]) { + bestError[0] = err; + bestShapeIdx[0] = i; + bestClusters[0][0] = clusters[0]; + bestClusters[0][1] = clusters[1]; + } + } + + // There are not 3 subset blocks that support alpha... + if(opaque) { + for(int i = 0; i < kNumShapes3; i++) { + + RGBAClusterSIMD clusters[3]; + PopulateThreeClustersForShape(blockCluster, i, clusters); + + double err = 0.0; + for(int ci = 0; ci < 3; ci++) { + err += EstimateThreeClusterError(clusters[ci]); + } + + // If it's small, we'll take it! 
+ if(err < 1e-9) { + CompressThreeClusters(i, clusters, outBuf, err); + return; + } + + if(err < bestError[1]) { + bestError[1] = err; + bestShapeIdx[1] = i; + bestClusters[1][0] = clusters[0]; + bestClusters[1][1] = clusters[1]; + bestClusters[1][2] = clusters[2]; + } + } + } + + if(opaque) { + + uint8 tempBuf1[16]; + uint8 tempBuf2[16]; + + BitStream tempStream1 (tempBuf1, 128, 0); + BC7CompressionModeSIMD compressor(6, DBL_MAX); + double best = compressor.Compress(tempStream1, 0, &blockCluster); + if(best == 0.0f) { + memcpy(outBuf, tempBuf1, 16); + return; + } + + double error = DBL_MAX; + if((error = CompressTwoClusters(bestShapeIdx[0], bestClusters[0], tempBuf2, bestError[0])) < best) { + best = error; + if(error == 0.0f) { + memcpy(outBuf, tempBuf2, 16); + return; + } + else { + memcpy(tempBuf1, tempBuf2, 16); + } + } + + if(CompressThreeClusters(bestShapeIdx[1], bestClusters[1], tempBuf2, bestError[1]) < best) { + memcpy(outBuf, tempBuf2, 16); + return; + } + + memcpy(outBuf, tempBuf1, 16); + } + else { + assert(!"Don't support alpha yet!"); + } + } +} diff --git a/BPTCEncoder/src/BC7IntTypes.h b/BPTCEncoder/src/BC7IntTypes.h new file mode 100644 index 0000000..3f32c10 --- /dev/null +++ b/BPTCEncoder/src/BC7IntTypes.h @@ -0,0 +1,31 @@ + +// Copyright 2012 (c) Pavel Krajcevski +// BC7IntTypes.h + +// This file contains all of the various platform definitions for fixed width integers +// on various platforms. + +// !FIXME! Still needs to be tested on Windows platforms. 
+ + +#ifdef _MSC_VER +typedef __int16 int16; +typedef __uint16 uint16; +typedef __int32 int32; +typedef __uint32 uint32; +typedef __int8 int8; +typedef __uint8 uint8; + +#else + +#include + +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; + +#endif diff --git a/BPTCEncoder/src/BCLookupTables.h b/BPTCEncoder/src/BCLookupTables.h new file mode 100755 index 0000000..37ec72b --- /dev/null +++ b/BPTCEncoder/src/BCLookupTables.h @@ -0,0 +1,945 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +// Each value from 0 to 255 can be exactly interpolated between two other values +// with 7 bit precision. BC7 Mode 5 gives us this precision, so we can use look-up +// tables to speed up this precision by allowing every value to be 1/3 of the way +// between the two colors specified. 
+/* + UINT nbits = 7; + UINT lastNum = -1; + UINT vals[255]; + UINT valIdx = 0; + for(UINT i = 0; i < 256; i++) { + UINT num = (i >> (8 - nbits)); + num <<= (8-nbits); + num |= i >> nbits; + + if(num != lastNum) { + lastNum = num; + vals[valIdx++] = num; + } + } + + for(UINT i = 0; i < 256; i++) { + + UINT mindist = 0xFFFFFFFF; + UINT minj = 0, mink = 0; + + UINT tableEntry[2] = { 0, 0 }; + + mindist = 0xFFFFFFFF; + minj = 0, mink = 0; + + for(UINT j = 0; j < valIdx; j++) { + for(UINT k = 0; k < valIdx ; k++) { + + UINT combo = (43 * vals[j] + 21 * vals[k] + 32) >> 6; + UINT dist = ((i > combo) ? i - combo : combo - i); + if( dist < mindist ) + { + mindist = dist; + minj = j; + mink = k; + } + } + } + + assert(mindist == 0); + + tableEntry[0] = vals[minj]; + tableEntry[1] = vals[mink]; + + wchar_t tableEntryStr[256]; + swprintf(tableEntryStr, 256, L"{ 0x%02x, 0x%02x },\n", + tableEntry[0] >> (8 - nbits), + tableEntry[1] >> (8 - nbits) + ); + OutputDebugString(tableEntryStr); + } +*/ +static unsigned char Optimal7CompressBC7Mode5[256][2] = { + { 0x00, 0x00 }, + { 0x00, 0x01 }, + { 0x00, 0x03 }, + { 0x00, 0x04 }, + { 0x00, 0x06 }, + { 0x00, 0x07 }, + { 0x00, 0x09 }, + { 0x00, 0x0a }, + { 0x00, 0x0c }, + { 0x00, 0x0d }, + { 0x00, 0x0f }, + { 0x00, 0x10 }, + { 0x00, 0x12 }, + { 0x00, 0x14 }, + { 0x00, 0x15 }, + { 0x00, 0x17 }, + { 0x00, 0x18 }, + { 0x00, 0x1a }, + { 0x00, 0x1b }, + { 0x00, 0x1d }, + { 0x00, 0x1e }, + { 0x00, 0x20 }, + { 0x00, 0x21 }, + { 0x00, 0x23 }, + { 0x00, 0x24 }, + { 0x00, 0x26 }, + { 0x00, 0x27 }, + { 0x00, 0x29 }, + { 0x00, 0x2a }, + { 0x00, 0x2c }, + { 0x00, 0x2d }, + { 0x00, 0x2f }, + { 0x00, 0x30 }, + { 0x00, 0x32 }, + { 0x00, 0x34 }, + { 0x00, 0x35 }, + { 0x00, 0x37 }, + { 0x00, 0x38 }, + { 0x00, 0x3a }, + { 0x00, 0x3b }, + { 0x00, 0x3d }, + { 0x00, 0x3e }, + { 0x00, 0x40 }, + { 0x00, 0x41 }, + { 0x00, 0x42 }, + { 0x00, 0x44 }, + { 0x00, 0x45 }, + { 0x00, 0x47 }, + { 0x00, 0x48 }, + { 0x00, 0x4a }, + { 0x00, 0x4b }, + { 0x00, 0x4d }, + { 
0x00, 0x4e }, + { 0x00, 0x50 }, + { 0x00, 0x52 }, + { 0x00, 0x53 }, + { 0x00, 0x55 }, + { 0x00, 0x56 }, + { 0x00, 0x58 }, + { 0x00, 0x59 }, + { 0x00, 0x5b }, + { 0x00, 0x5c }, + { 0x00, 0x5e }, + { 0x00, 0x5f }, + { 0x00, 0x61 }, + { 0x00, 0x62 }, + { 0x00, 0x64 }, + { 0x00, 0x65 }, + { 0x00, 0x67 }, + { 0x00, 0x68 }, + { 0x00, 0x6a }, + { 0x00, 0x6b }, + { 0x00, 0x6d }, + { 0x00, 0x6e }, + { 0x00, 0x70 }, + { 0x00, 0x72 }, + { 0x00, 0x73 }, + { 0x00, 0x75 }, + { 0x00, 0x76 }, + { 0x00, 0x78 }, + { 0x00, 0x79 }, + { 0x00, 0x7b }, + { 0x00, 0x7c }, + { 0x00, 0x7e }, + { 0x00, 0x7f }, + { 0x01, 0x7f }, + { 0x02, 0x7e }, + { 0x03, 0x7e }, + { 0x03, 0x7f }, + { 0x04, 0x7f }, + { 0x05, 0x7e }, + { 0x06, 0x7e }, + { 0x06, 0x7f }, + { 0x07, 0x7f }, + { 0x08, 0x7e }, + { 0x09, 0x7e }, + { 0x09, 0x7f }, + { 0x0a, 0x7f }, + { 0x0b, 0x7e }, + { 0x0c, 0x7e }, + { 0x0c, 0x7f }, + { 0x0d, 0x7f }, + { 0x0e, 0x7e }, + { 0x0f, 0x7d }, + { 0x0f, 0x7f }, + { 0x10, 0x7e }, + { 0x11, 0x7e }, + { 0x11, 0x7f }, + { 0x12, 0x7f }, + { 0x13, 0x7e }, + { 0x14, 0x7e }, + { 0x14, 0x7f }, + { 0x15, 0x7f }, + { 0x16, 0x7e }, + { 0x17, 0x7e }, + { 0x17, 0x7f }, + { 0x18, 0x7f }, + { 0x19, 0x7e }, + { 0x1a, 0x7e }, + { 0x1a, 0x7f }, + { 0x1b, 0x7f }, + { 0x1c, 0x7e }, + { 0x1d, 0x7e }, + { 0x1d, 0x7f }, + { 0x1e, 0x7f }, + { 0x1f, 0x7e }, + { 0x20, 0x7e }, + { 0x20, 0x7f }, + { 0x21, 0x7f }, + { 0x22, 0x7e }, + { 0x23, 0x7e }, + { 0x23, 0x7f }, + { 0x24, 0x7f }, + { 0x25, 0x7e }, + { 0x26, 0x7e }, + { 0x26, 0x7f }, + { 0x27, 0x7f }, + { 0x28, 0x7e }, + { 0x29, 0x7e }, + { 0x29, 0x7f }, + { 0x2a, 0x7f }, + { 0x2b, 0x7e }, + { 0x2c, 0x7e }, + { 0x2c, 0x7f }, + { 0x2d, 0x7f }, + { 0x2e, 0x7e }, + { 0x2f, 0x7d }, + { 0x2f, 0x7f }, + { 0x30, 0x7e }, + { 0x31, 0x7e }, + { 0x31, 0x7f }, + { 0x32, 0x7f }, + { 0x33, 0x7e }, + { 0x34, 0x7e }, + { 0x34, 0x7f }, + { 0x35, 0x7f }, + { 0x36, 0x7e }, + { 0x37, 0x7e }, + { 0x37, 0x7f }, + { 0x38, 0x7f }, + { 0x39, 0x7e }, + { 0x3a, 0x7e }, + { 0x3a, 0x7f }, + { 
0x3b, 0x7f }, + { 0x3c, 0x7e }, + { 0x3d, 0x7e }, + { 0x3d, 0x7f }, + { 0x3e, 0x7f }, + { 0x3f, 0x7e }, + { 0x40, 0x7d }, + { 0x40, 0x7e }, + { 0x41, 0x7e }, + { 0x41, 0x7f }, + { 0x42, 0x7f }, + { 0x43, 0x7e }, + { 0x44, 0x7e }, + { 0x44, 0x7f }, + { 0x45, 0x7f }, + { 0x46, 0x7e }, + { 0x47, 0x7e }, + { 0x47, 0x7f }, + { 0x48, 0x7f }, + { 0x49, 0x7e }, + { 0x4a, 0x7e }, + { 0x4a, 0x7f }, + { 0x4b, 0x7f }, + { 0x4c, 0x7e }, + { 0x4d, 0x7d }, + { 0x4d, 0x7f }, + { 0x4e, 0x7e }, + { 0x4f, 0x7e }, + { 0x4f, 0x7f }, + { 0x50, 0x7f }, + { 0x51, 0x7e }, + { 0x52, 0x7e }, + { 0x52, 0x7f }, + { 0x53, 0x7f }, + { 0x54, 0x7e }, + { 0x55, 0x7e }, + { 0x55, 0x7f }, + { 0x56, 0x7f }, + { 0x57, 0x7e }, + { 0x58, 0x7e }, + { 0x58, 0x7f }, + { 0x59, 0x7f }, + { 0x5a, 0x7e }, + { 0x5b, 0x7e }, + { 0x5b, 0x7f }, + { 0x5c, 0x7f }, + { 0x5d, 0x7e }, + { 0x5e, 0x7e }, + { 0x5e, 0x7f }, + { 0x5f, 0x7f }, + { 0x60, 0x7e }, + { 0x61, 0x7e }, + { 0x61, 0x7f }, + { 0x62, 0x7f }, + { 0x63, 0x7e }, + { 0x64, 0x7e }, + { 0x64, 0x7f }, + { 0x65, 0x7f }, + { 0x66, 0x7e }, + { 0x67, 0x7e }, + { 0x67, 0x7f }, + { 0x68, 0x7f }, + { 0x69, 0x7e }, + { 0x6a, 0x7e }, + { 0x6a, 0x7f }, + { 0x6b, 0x7f }, + { 0x6c, 0x7e }, + { 0x6d, 0x7d }, + { 0x6d, 0x7f }, + { 0x6e, 0x7e }, + { 0x6f, 0x7e }, + { 0x6f, 0x7f }, + { 0x70, 0x7f }, + { 0x71, 0x7e }, + { 0x72, 0x7e }, + { 0x72, 0x7f }, + { 0x73, 0x7f }, + { 0x74, 0x7e }, + { 0x75, 0x7e }, + { 0x75, 0x7f }, + { 0x76, 0x7f }, + { 0x77, 0x7e }, + { 0x78, 0x7e }, + { 0x78, 0x7f }, + { 0x79, 0x7f }, + { 0x7a, 0x7e }, + { 0x7b, 0x7e }, + { 0x7b, 0x7f }, + { 0x7c, 0x7f }, + { 0x7d, 0x7e }, + { 0x7e, 0x7e }, + { 0x7e, 0x7f }, + { 0x7f, 0x7f } +}; + +// For each value, we give the best possible compression range for that value with 5 bits. +// The first value says whether or not it's +// 1 - the midpoint of two other values, or +// 0 - 1/3 of the way in between two other values. 
+// If the first value is 0 or 1 then the last two values are the range between which the
+// value should be interpolated. If the first value is 0, then it should be interpolated
+// one third of the way from the second to third value...
+//
+// The following tables were generated with the following program:
+/*
+	UINT nbits = 5;
+	UINT lastNum = -1;
+	UINT vals[255];
+	UINT valIdx = 0;
+	for(UINT i = 0; i < 256; i++) {
+		UINT num = (i >> (8 - nbits));
+		num <<= (8-nbits);
+		num |= i >> nbits;
+
+		if(num != lastNum) {
+			lastNum = num;
+			vals[valIdx++] = num;
+		}
+	}
+
+	for(UINT i = 0; i < 256; i++) {
+
+		UINT mindist = 0xFFFFFFFF;
+		UINT minj = 0, mink = 0;
+
+		UINT tableEntry[2][4] = { {1, 0, 0, 0xFFFFFFFF}, {0, 0, 0, 0xFFFFFFFF} };
+
+		for(UINT j = 0; j < valIdx; j++) {
+			for(UINT k = j; k < valIdx ; k++) {
+
+				UINT combo = (vals[j] + vals[k]) / 2;
+				UINT dist = ((i > combo) ? i - combo : combo - i);
+				if( dist < mindist )
+				{
+					mindist = dist;
+					minj = j;
+					mink = k;
+				}
+			}
+		}
+
+		tableEntry[0][1] = vals[minj];
+		tableEntry[0][2] = vals[mink];
+		tableEntry[0][3] = mindist;
+
+		mindist = 0xFFFFFFFF;
+		minj = 0, mink = 0;
+
+		for(UINT j = 0; j < valIdx; j++) {
+			for(UINT k = j; k < valIdx ; k++) {
+
+				UINT combo = (2 * vals[j] + vals[k]) / 3;
+				UINT dist = ((i > combo) ?
i - combo : combo - i); + if( dist < mindist ) + { + mindist = dist; + minj = j; + mink = k; + } + } + } + + tableEntry[1][1] = vals[minj]; + tableEntry[1][2] = vals[mink]; + tableEntry[1][3] = mindist; + + wchar_t tableEntryStr[256]; + if(tableEntry[1][3] > tableEntry[0][3]) { + swprintf(tableEntryStr, 256, L"{ { %d, 0x%02x, 0x%02x }, { %d, 0x%02x, 0x%02x } },\n", + tableEntry[0][0], + tableEntry[0][1] >> (8 - nbits), + tableEntry[0][2] >> (8 - nbits), + tableEntry[1][0], + tableEntry[1][1] >> (8 - nbits), + tableEntry[1][2] >> (8 - nbits) + ); + } + else { + swprintf(tableEntryStr, 256, L"{ { %d, 0x%02x, 0x%02x }, { %d, 0x%02x, 0x%02x } },\n", + tableEntry[1][0], + tableEntry[1][1] >> (8 - nbits), + tableEntry[1][2] >> (8 - nbits), + tableEntry[0][0], + tableEntry[0][1] >> (8 - nbits), + tableEntry[0][2] >> (8 - nbits) + ); + } + OutputDebugString(tableEntryStr); + } +*/ +static unsigned char Optimal5CompressDXT1[256][2][3] = { + { { 0, 0x00, 0x00 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x00 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x01 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x01 }, { 1, 0x00, 0x01 } }, + { { 1, 0x00, 0x01 }, { 0, 0x00, 0x02 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x01, 0x02 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x04 }, { 1, 0x00, 0x03 } }, + { { 1, 0x00, 0x03 }, { 0, 0x00, 0x04 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x02, 0x03 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x07 }, { 1, 0x00, 0x05 } }, + { { 1, 0x00, 0x05 }, { 0, 0x00, 0x07 } }, + { { 0, 0x01, 0x06 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x06 } }, 
+ { { 0, 0x00, 0x09 }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x09 }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0a }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0a }, { 1, 0x00, 0x07 } }, + { { 1, 0x00, 0x07 }, { 0, 0x00, 0x0a } }, + { { 0, 0x02, 0x07 }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0b }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0b }, { 1, 0x01, 0x07 } }, + { { 0, 0x01, 0x0a }, { 1, 0x01, 0x07 } }, + { { 0, 0x00, 0x0c }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0c }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0d }, { 1, 0x02, 0x07 } }, + { { 1, 0x02, 0x07 }, { 0, 0x00, 0x0d } }, + { { 1, 0x00, 0x09 }, { 0, 0x00, 0x0e } }, + { { 0, 0x00, 0x0e }, { 1, 0x00, 0x09 } }, + { { 0, 0x00, 0x0e }, { 1, 0x03, 0x07 } }, + { { 0, 0x02, 0x0b }, { 1, 0x03, 0x07 } }, + { { 0, 0x00, 0x0f }, { 1, 0x00, 0x0a } }, + { { 0, 0x00, 0x0f }, { 1, 0x00, 0x0a } }, + { { 0, 0x01, 0x0e }, { 1, 0x00, 0x0a } }, + { { 0, 0x00, 0x10 }, { 1, 0x00, 0x0b } }, + { { 1, 0x00, 0x0b }, { 0, 0x00, 0x10 } }, + { { 0, 0x00, 0x11 }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x11 }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x02, 0x0f }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x13 }, { 1, 0x00, 0x0d } }, + { { 1, 0x00, 0x0d }, { 0, 0x00, 0x13 } }, + { { 0, 0x01, 0x12 }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x14 }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x14 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x15 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x15 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x16 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x16 }, { 1, 0x00, 0x0f } }, + { { 1, 0x00, 0x0f }, { 0, 0x00, 0x16 } }, + { { 0, 0x02, 0x13 }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x17 }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x17 }, { 1, 0x01, 0x0f } }, + { { 0, 0x01, 0x16 }, { 1, 0x01, 0x0f } }, + { { 0, 0x00, 0x18 }, { 1, 0x00, 0x10 } }, + { { 0, 0x00, 0x18 }, { 1, 0x00, 0x10 } }, + { { 0, 0x00, 0x19 }, { 1, 0x02, 0x0f } }, + { { 1, 0x02, 0x0f 
}, { 0, 0x00, 0x19 } }, + { { 1, 0x00, 0x11 }, { 0, 0x00, 0x1a } }, + { { 0, 0x00, 0x1a }, { 1, 0x00, 0x11 } }, + { { 0, 0x00, 0x1a }, { 1, 0x03, 0x0f } }, + { { 0, 0x02, 0x17 }, { 1, 0x03, 0x0f } }, + { { 0, 0x00, 0x1b }, { 1, 0x00, 0x12 } }, + { { 0, 0x00, 0x1b }, { 1, 0x00, 0x12 } }, + { { 0, 0x01, 0x1a }, { 1, 0x00, 0x12 } }, + { { 0, 0x00, 0x1c }, { 1, 0x00, 0x13 } }, + { { 1, 0x00, 0x13 }, { 0, 0x00, 0x1c } }, + { { 0, 0x00, 0x1d }, { 1, 0x00, 0x13 } }, + { { 0, 0x00, 0x1d }, { 1, 0x00, 0x13 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x02, 0x1b }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1f }, { 1, 0x00, 0x15 } }, + { { 1, 0x00, 0x15 }, { 0, 0x00, 0x1f } }, + { { 0, 0x01, 0x1e }, { 1, 0x00, 0x15 } }, + { { 0, 0x04, 0x18 }, { 1, 0x00, 0x15 } }, + { { 0, 0x01, 0x1f }, { 1, 0x00, 0x16 } }, + { { 0, 0x01, 0x1f }, { 1, 0x00, 0x16 } }, + { { 0, 0x01, 0x1f }, { 1, 0x00, 0x16 } }, + { { 0, 0x02, 0x1e }, { 1, 0x00, 0x16 } }, + { { 0, 0x02, 0x1e }, { 1, 0x00, 0x17 } }, + { { 1, 0x00, 0x17 }, { 0, 0x02, 0x1e } }, + { { 0, 0x02, 0x1f }, { 1, 0x00, 0x17 } }, + { { 0, 0x04, 0x1b }, { 1, 0x00, 0x17 } }, + { { 0, 0x03, 0x1e }, { 1, 0x01, 0x17 } }, + { { 0, 0x03, 0x1e }, { 1, 0x01, 0x17 } }, + { { 0, 0x04, 0x1c }, { 1, 0x00, 0x18 } }, + { { 0, 0x03, 0x1f }, { 1, 0x00, 0x18 } }, + { { 0, 0x03, 0x1f }, { 1, 0x02, 0x17 } }, + { { 1, 0x02, 0x17 }, { 0, 0x03, 0x1f } }, + { { 1, 0x00, 0x19 }, { 0, 0x04, 0x1e } }, + { { 0, 0x04, 0x1e }, { 1, 0x00, 0x19 } }, + { { 0, 0x04, 0x1e }, { 1, 0x03, 0x17 } }, + { { 0, 0x06, 0x1b }, { 1, 0x03, 0x17 } }, + { { 0, 0x04, 0x1f }, { 1, 0x00, 0x1a } }, + { { 0, 0x04, 0x1f }, { 1, 0x00, 0x1a } }, + { { 0, 0x05, 0x1e }, { 1, 0x00, 0x1a } }, + { { 0, 0x08, 0x18 }, { 1, 0x00, 0x1b } }, + { { 1, 0x00, 0x1b }, { 0, 0x05, 0x1f } }, + { { 0, 0x05, 0x1f }, { 1, 0x00, 0x1b } }, + { { 0, 0x05, 0x1f }, { 1, 0x00, 0x1b } }, + { { 0, 0x06, 0x1e }, { 1, 0x00, 0x1c 
} }, + { { 0, 0x06, 0x1e }, { 1, 0x00, 0x1c } }, + { { 0, 0x06, 0x1e }, { 1, 0x00, 0x1c } }, + { { 0, 0x06, 0x1f }, { 1, 0x00, 0x1c } }, + { { 0, 0x08, 0x1b }, { 1, 0x00, 0x1d } }, + { { 1, 0x00, 0x1d }, { 0, 0x07, 0x1e } }, + { { 0, 0x07, 0x1e }, { 1, 0x00, 0x1d } }, + { { 0, 0x08, 0x1c }, { 1, 0x00, 0x1d } }, + { { 0, 0x07, 0x1f }, { 1, 0x00, 0x1e } }, + { { 0, 0x07, 0x1f }, { 1, 0x00, 0x1e } }, + { { 0, 0x07, 0x1f }, { 1, 0x00, 0x1e } }, + { { 0, 0x08, 0x1e }, { 1, 0x00, 0x1e } }, + { { 0, 0x08, 0x1e }, { 1, 0x00, 0x1f } }, + { { 1, 0x00, 0x1f }, { 0, 0x08, 0x1e } }, + { { 0, 0x0a, 0x1b }, { 1, 0x00, 0x1f } }, + { { 0, 0x08, 0x1f }, { 1, 0x00, 0x1f } }, + { { 0, 0x08, 0x1f }, { 1, 0x01, 0x1f } }, + { { 0, 0x09, 0x1e }, { 1, 0x01, 0x1f } }, + { { 0, 0x0c, 0x18 }, { 1, 0x04, 0x1c } }, + { { 0, 0x09, 0x1f }, { 1, 0x04, 0x1c } }, + { { 0, 0x09, 0x1f }, { 1, 0x02, 0x1f } }, + { { 1, 0x02, 0x1f }, { 0, 0x09, 0x1f } }, + { { 1, 0x04, 0x1d }, { 0, 0x0a, 0x1e } }, + { { 0, 0x0a, 0x1e }, { 1, 0x04, 0x1d } }, + { { 0, 0x0a, 0x1e }, { 1, 0x03, 0x1f } }, + { { 0, 0x0a, 0x1f }, { 1, 0x03, 0x1f } }, + { { 0, 0x0c, 0x1b }, { 1, 0x04, 0x1e } }, + { { 0, 0x0b, 0x1e }, { 1, 0x04, 0x1e } }, + { { 0, 0x0b, 0x1e }, { 1, 0x04, 0x1e } }, + { { 0, 0x0c, 0x1c }, { 1, 0x04, 0x1f } }, + { { 1, 0x04, 0x1f }, { 0, 0x0b, 0x1f } }, + { { 0, 0x0b, 0x1f }, { 1, 0x04, 0x1f } }, + { { 0, 0x0b, 0x1f }, { 1, 0x04, 0x1f } }, + { { 0, 0x0c, 0x1e }, { 1, 0x05, 0x1f } }, + { { 0, 0x0c, 0x1e }, { 1, 0x05, 0x1f } }, + { { 0, 0x0c, 0x1e }, { 1, 0x05, 0x1f } }, + { { 0, 0x0e, 0x1b }, { 1, 0x05, 0x1f } }, + { { 0, 0x0c, 0x1f }, { 1, 0x06, 0x1f } }, + { { 1, 0x06, 0x1f }, { 0, 0x0c, 0x1f } }, + { { 0, 0x0d, 0x1e }, { 1, 0x06, 0x1f } }, + { { 0, 0x10, 0x18 }, { 1, 0x06, 0x1f } }, + { { 0, 0x0d, 0x1f }, { 1, 0x07, 0x1f } }, + { { 0, 0x0d, 0x1f }, { 1, 0x07, 0x1f } }, + { { 0, 0x0d, 0x1f }, { 1, 0x07, 0x1f } }, + { { 0, 0x0e, 0x1e }, { 1, 0x07, 0x1f } }, + { { 0, 0x0e, 0x1e }, { 1, 0x08, 0x1f } }, + { { 1, 0x08, 
0x1f }, { 0, 0x0e, 0x1e } }, + { { 0, 0x0e, 0x1f }, { 1, 0x08, 0x1f } }, + { { 0, 0x10, 0x1b }, { 1, 0x08, 0x1f } }, + { { 0, 0x0f, 0x1e }, { 1, 0x09, 0x1f } }, + { { 0, 0x0f, 0x1e }, { 1, 0x09, 0x1f } }, + { { 0, 0x10, 0x1c }, { 1, 0x0c, 0x1c } }, + { { 0, 0x0f, 0x1f }, { 1, 0x0c, 0x1c } }, + { { 0, 0x0f, 0x1f }, { 1, 0x0a, 0x1f } }, + { { 1, 0x0a, 0x1f }, { 0, 0x0f, 0x1f } }, + { { 1, 0x0c, 0x1d }, { 0, 0x10, 0x1e } }, + { { 0, 0x10, 0x1e }, { 1, 0x0c, 0x1d } }, + { { 0, 0x10, 0x1e }, { 1, 0x0b, 0x1f } }, + { { 0, 0x12, 0x1b }, { 1, 0x0b, 0x1f } }, + { { 0, 0x10, 0x1f }, { 1, 0x0c, 0x1e } }, + { { 0, 0x10, 0x1f }, { 1, 0x0c, 0x1e } }, + { { 0, 0x11, 0x1e }, { 1, 0x0c, 0x1e } }, + { { 0, 0x14, 0x18 }, { 1, 0x0c, 0x1f } }, + { { 1, 0x0c, 0x1f }, { 0, 0x11, 0x1f } }, + { { 0, 0x11, 0x1f }, { 1, 0x0c, 0x1f } }, + { { 0, 0x11, 0x1f }, { 1, 0x0c, 0x1f } }, + { { 0, 0x12, 0x1e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x12, 0x1e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x12, 0x1e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x12, 0x1f }, { 1, 0x0d, 0x1f } }, + { { 0, 0x14, 0x1b }, { 1, 0x0e, 0x1f } }, + { { 1, 0x0e, 0x1f }, { 0, 0x13, 0x1e } }, + { { 0, 0x13, 0x1e }, { 1, 0x0e, 0x1f } }, + { { 0, 0x14, 0x1c }, { 1, 0x0e, 0x1f } }, + { { 0, 0x13, 0x1f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x13, 0x1f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x13, 0x1f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x14, 0x1e }, { 1, 0x0f, 0x1f } }, + { { 0, 0x14, 0x1e }, { 1, 0x10, 0x1f } }, + { { 1, 0x10, 0x1f }, { 0, 0x14, 0x1e } }, + { { 0, 0x16, 0x1b }, { 1, 0x10, 0x1f } }, + { { 0, 0x14, 0x1f }, { 1, 0x10, 0x1f } }, + { { 0, 0x14, 0x1f }, { 1, 0x11, 0x1f } }, + { { 0, 0x15, 0x1e }, { 1, 0x11, 0x1f } }, + { { 0, 0x18, 0x18 }, { 1, 0x14, 0x1c } }, + { { 0, 0x15, 0x1f }, { 1, 0x14, 0x1c } }, + { { 0, 0x15, 0x1f }, { 1, 0x12, 0x1f } }, + { { 1, 0x12, 0x1f }, { 0, 0x15, 0x1f } }, + { { 1, 0x14, 0x1d }, { 0, 0x16, 0x1e } }, + { { 0, 0x16, 0x1e }, { 1, 0x14, 0x1d } }, + { { 0, 0x16, 0x1e }, { 1, 0x13, 0x1f } }, + { { 0, 0x16, 0x1f }, { 1, 0x13, 
0x1f } }, + { { 0, 0x18, 0x1b }, { 1, 0x14, 0x1e } }, + { { 0, 0x17, 0x1e }, { 1, 0x14, 0x1e } }, + { { 0, 0x17, 0x1e }, { 1, 0x14, 0x1e } }, + { { 0, 0x18, 0x1c }, { 1, 0x14, 0x1f } }, + { { 1, 0x14, 0x1f }, { 0, 0x17, 0x1f } }, + { { 0, 0x17, 0x1f }, { 1, 0x14, 0x1f } }, + { { 0, 0x17, 0x1f }, { 1, 0x14, 0x1f } }, + { { 0, 0x18, 0x1e }, { 1, 0x15, 0x1f } }, + { { 0, 0x18, 0x1e }, { 1, 0x15, 0x1f } }, + { { 0, 0x18, 0x1e }, { 1, 0x15, 0x1f } }, + { { 0, 0x1a, 0x1b }, { 1, 0x15, 0x1f } }, + { { 0, 0x18, 0x1f }, { 1, 0x16, 0x1f } }, + { { 1, 0x16, 0x1f }, { 0, 0x18, 0x1f } }, + { { 0, 0x19, 0x1e }, { 1, 0x16, 0x1f } }, + { { 0, 0x19, 0x1e }, { 1, 0x16, 0x1f } }, + { { 0, 0x19, 0x1f }, { 1, 0x17, 0x1f } }, + { { 0, 0x19, 0x1f }, { 1, 0x17, 0x1f } }, + { { 0, 0x19, 0x1f }, { 1, 0x17, 0x1f } }, + { { 0, 0x1a, 0x1e }, { 1, 0x17, 0x1f } }, + { { 0, 0x1a, 0x1e }, { 1, 0x18, 0x1f } }, + { { 1, 0x18, 0x1f }, { 0, 0x1a, 0x1e } }, + { { 0, 0x1a, 0x1f }, { 1, 0x18, 0x1f } }, + { { 0, 0x1a, 0x1f }, { 1, 0x18, 0x1f } }, + { { 0, 0x1b, 0x1e }, { 1, 0x19, 0x1f } }, + { { 0, 0x1b, 0x1e }, { 1, 0x19, 0x1f } }, + { { 0, 0x1c, 0x1c }, { 1, 0x1c, 0x1c } }, + { { 0, 0x1b, 0x1f }, { 1, 0x1c, 0x1c } }, + { { 0, 0x1b, 0x1f }, { 1, 0x1a, 0x1f } }, + { { 1, 0x1a, 0x1f }, { 0, 0x1b, 0x1f } }, + { { 1, 0x1c, 0x1d }, { 0, 0x1c, 0x1e } }, + { { 0, 0x1c, 0x1e }, { 1, 0x1c, 0x1d } }, + { { 0, 0x1c, 0x1e }, { 1, 0x1b, 0x1f } }, + { { 1, 0x1b, 0x1f }, { 0, 0x1c, 0x1f } }, + { { 0, 0x1c, 0x1f }, { 1, 0x1c, 0x1e } }, + { { 0, 0x1c, 0x1f }, { 1, 0x1c, 0x1e } }, + { { 0, 0x1d, 0x1e }, { 1, 0x1c, 0x1e } }, + { { 0, 0x1d, 0x1e }, { 1, 0x1c, 0x1f } }, + { { 1, 0x1c, 0x1f }, { 0, 0x1d, 0x1f } }, + { { 0, 0x1d, 0x1f }, { 1, 0x1c, 0x1f } }, + { { 0, 0x1d, 0x1f }, { 1, 0x1c, 0x1f } }, + { { 0, 0x1e, 0x1e }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1e }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1e }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1f }, { 1, 0x1d, 0x1f } }, + { { 0, 0x1e, 0x1f }, { 1, 0x1e, 0x1f } }, + { { 1, 
0x1e, 0x1f }, { 0, 0x1e, 0x1f } }, + { { 1, 0x1e, 0x1f }, { 0, 0x1e, 0x1f } }, + { { 0, 0x1f, 0x1f }, { 1, 0x1e, 0x1f } }, + { { 0, 0x1f, 0x1f }, { 1, 0x1f, 0x1f } }, + { { 0, 0x1f, 0x1f }, { 1, 0x1f, 0x1f } } +}; + +static unsigned char Optimal6CompressDXT1[256][2][3] = { + { { 0, 0x00, 0x00 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x01 }, { 1, 0x00, 0x00 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x02 }, { 1, 0x00, 0x01 } }, + { { 0, 0x00, 0x03 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x04 }, { 1, 0x00, 0x02 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x05 }, { 1, 0x00, 0x03 } }, + { { 0, 0x00, 0x06 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x07 }, { 1, 0x00, 0x04 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x08 }, { 1, 0x00, 0x05 } }, + { { 0, 0x00, 0x09 }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0a }, { 1, 0x00, 0x06 } }, + { { 0, 0x00, 0x0b }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0b }, { 1, 0x00, 0x07 } }, + { { 0, 0x00, 0x0c }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0d }, { 1, 0x00, 0x08 } }, + { { 0, 0x00, 0x0e }, { 1, 0x00, 0x09 } }, + { { 0, 0x00, 0x0e }, { 1, 0x00, 0x09 } }, + { { 0, 0x00, 0x0f }, { 1, 0x00, 0x0a } }, + { { 0, 0x00, 0x10 }, { 1, 0x00, 0x0a } }, + { { 0, 0x01, 0x0f }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x11 }, { 1, 0x00, 0x0b } }, + { { 0, 0x00, 0x12 }, { 1, 0x00, 0x0c } }, + { { 0, 0x00, 0x13 }, { 1, 0x00, 0x0c } }, + { { 0, 0x03, 0x0e }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x14 }, { 1, 0x00, 0x0d } }, + { { 0, 0x00, 0x15 }, { 1, 0x00, 0x0e } }, + { { 0, 0x00, 0x16 }, { 1, 0x00, 0x0e } }, + { { 0, 0x04, 0x0f }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x17 }, { 1, 0x00, 0x0f } }, + { { 0, 0x00, 0x18 }, { 1, 0x00, 0x10 } }, + { { 0, 0x00, 0x19 }, { 1, 0x00, 0x10 } }, + { { 0, 0x06, 0x0e }, { 1, 0x00, 0x11 } }, + { { 0, 0x00, 0x1a }, { 1, 0x00, 0x11 } }, + { { 0, 0x00, 0x1b }, { 1, 0x00, 0x12 } }, + { { 0, 0x00, 0x1c }, { 1, 0x00, 0x12 } }, + { { 0, 0x07, 0x0f }, { 1, 0x00, 0x13 } }, + { { 0, 
0x00, 0x1d }, { 1, 0x00, 0x13 } }, + { { 0, 0x00, 0x1e }, { 1, 0x00, 0x14 } }, + { { 0, 0x00, 0x1f }, { 1, 0x00, 0x14 } }, + { { 0, 0x09, 0x0e }, { 1, 0x00, 0x15 } }, + { { 0, 0x00, 0x20 }, { 1, 0x00, 0x15 } }, + { { 0, 0x00, 0x21 }, { 1, 0x00, 0x16 } }, + { { 0, 0x02, 0x1e }, { 1, 0x00, 0x16 } }, + { { 0, 0x00, 0x22 }, { 1, 0x00, 0x17 } }, + { { 0, 0x00, 0x23 }, { 1, 0x00, 0x17 } }, + { { 0, 0x00, 0x24 }, { 1, 0x00, 0x18 } }, + { { 0, 0x03, 0x1f }, { 1, 0x00, 0x18 } }, + { { 0, 0x00, 0x25 }, { 1, 0x00, 0x19 } }, + { { 0, 0x00, 0x26 }, { 1, 0x00, 0x19 } }, + { { 0, 0x00, 0x27 }, { 1, 0x00, 0x1a } }, + { { 0, 0x05, 0x1e }, { 1, 0x00, 0x1a } }, + { { 0, 0x00, 0x28 }, { 1, 0x00, 0x1b } }, + { { 0, 0x00, 0x29 }, { 1, 0x00, 0x1b } }, + { { 0, 0x00, 0x2a }, { 1, 0x00, 0x1c } }, + { { 0, 0x06, 0x1f }, { 1, 0x00, 0x1c } }, + { { 0, 0x00, 0x2b }, { 1, 0x00, 0x1d } }, + { { 0, 0x00, 0x2c }, { 1, 0x00, 0x1d } }, + { { 0, 0x00, 0x2d }, { 1, 0x00, 0x1e } }, + { { 0, 0x08, 0x1e }, { 1, 0x00, 0x1e } }, + { { 0, 0x00, 0x2e }, { 1, 0x00, 0x1f } }, + { { 0, 0x00, 0x2f }, { 1, 0x00, 0x1f } }, + { { 0, 0x01, 0x2e }, { 1, 0x01, 0x1f } }, + { { 0, 0x00, 0x30 }, { 1, 0x00, 0x20 } }, + { { 0, 0x00, 0x31 }, { 1, 0x02, 0x1f } }, + { { 0, 0x00, 0x32 }, { 1, 0x00, 0x21 } }, + { { 0, 0x02, 0x2f }, { 1, 0x03, 0x1f } }, + { { 0, 0x00, 0x33 }, { 1, 0x00, 0x22 } }, + { { 0, 0x00, 0x34 }, { 1, 0x04, 0x1f } }, + { { 0, 0x00, 0x35 }, { 1, 0x00, 0x23 } }, + { { 0, 0x04, 0x2e }, { 1, 0x05, 0x1f } }, + { { 0, 0x00, 0x36 }, { 1, 0x00, 0x24 } }, + { { 0, 0x00, 0x37 }, { 1, 0x06, 0x1f } }, + { { 0, 0x00, 0x38 }, { 1, 0x00, 0x25 } }, + { { 0, 0x05, 0x2f }, { 1, 0x07, 0x1f } }, + { { 0, 0x00, 0x39 }, { 1, 0x00, 0x26 } }, + { { 0, 0x00, 0x3a }, { 1, 0x08, 0x1f } }, + { { 0, 0x00, 0x3b }, { 1, 0x00, 0x27 } }, + { { 0, 0x07, 0x2e }, { 1, 0x09, 0x1f } }, + { { 0, 0x00, 0x3c }, { 1, 0x00, 0x28 } }, + { { 0, 0x00, 0x3d }, { 1, 0x0a, 0x1f } }, + { { 0, 0x00, 0x3e }, { 1, 0x00, 0x29 } }, + { { 0, 0x08, 0x2f }, { 1, 
0x0b, 0x1f } }, + { { 0, 0x00, 0x3f }, { 1, 0x00, 0x2a } }, + { { 0, 0x01, 0x3e }, { 1, 0x0c, 0x1f } }, + { { 0, 0x01, 0x3f }, { 1, 0x00, 0x2b } }, + { { 0, 0x0a, 0x2e }, { 1, 0x0d, 0x1f } }, + { { 0, 0x02, 0x3e }, { 1, 0x00, 0x2c } }, + { { 0, 0x02, 0x3f }, { 1, 0x0e, 0x1f } }, + { { 0, 0x03, 0x3e }, { 1, 0x00, 0x2d } }, + { { 0, 0x0b, 0x2f }, { 1, 0x0f, 0x1f } }, + { { 0, 0x03, 0x3f }, { 1, 0x00, 0x2e } }, + { { 0, 0x04, 0x3e }, { 1, 0x00, 0x2e } }, + { { 0, 0x04, 0x3f }, { 1, 0x00, 0x2f } }, + { { 0, 0x0d, 0x2e }, { 1, 0x00, 0x2f } }, + { { 0, 0x05, 0x3e }, { 1, 0x00, 0x30 } }, + { { 0, 0x05, 0x3f }, { 1, 0x00, 0x30 } }, + { { 0, 0x06, 0x3e }, { 1, 0x00, 0x31 } }, + { { 0, 0x0e, 0x2f }, { 1, 0x00, 0x31 } }, + { { 0, 0x06, 0x3f }, { 1, 0x00, 0x32 } }, + { { 0, 0x07, 0x3e }, { 1, 0x00, 0x32 } }, + { { 0, 0x07, 0x3f }, { 1, 0x00, 0x33 } }, + { { 0, 0x10, 0x2d }, { 1, 0x00, 0x33 } }, + { { 0, 0x08, 0x3e }, { 1, 0x00, 0x34 } }, + { { 0, 0x08, 0x3f }, { 1, 0x00, 0x34 } }, + { { 0, 0x09, 0x3e }, { 1, 0x00, 0x35 } }, + { { 0, 0x10, 0x30 }, { 1, 0x00, 0x35 } }, + { { 0, 0x09, 0x3f }, { 1, 0x00, 0x36 } }, + { { 0, 0x0a, 0x3e }, { 1, 0x00, 0x36 } }, + { { 0, 0x0a, 0x3f }, { 1, 0x00, 0x37 } }, + { { 0, 0x10, 0x33 }, { 1, 0x00, 0x37 } }, + { { 0, 0x0b, 0x3e }, { 1, 0x00, 0x38 } }, + { { 0, 0x0b, 0x3f }, { 1, 0x00, 0x38 } }, + { { 0, 0x0c, 0x3e }, { 1, 0x00, 0x39 } }, + { { 0, 0x10, 0x36 }, { 1, 0x00, 0x39 } }, + { { 0, 0x0c, 0x3f }, { 1, 0x00, 0x3a } }, + { { 0, 0x0d, 0x3e }, { 1, 0x00, 0x3a } }, + { { 0, 0x0d, 0x3f }, { 1, 0x00, 0x3b } }, + { { 0, 0x10, 0x39 }, { 1, 0x00, 0x3b } }, + { { 0, 0x0e, 0x3e }, { 1, 0x00, 0x3c } }, + { { 0, 0x0e, 0x3f }, { 1, 0x00, 0x3c } }, + { { 0, 0x0f, 0x3e }, { 1, 0x00, 0x3d } }, + { { 0, 0x10, 0x3c }, { 1, 0x00, 0x3d } }, + { { 0, 0x0f, 0x3f }, { 1, 0x00, 0x3e } }, + { { 0, 0x18, 0x2e }, { 1, 0x00, 0x3e } }, + { { 0, 0x10, 0x3e }, { 1, 0x00, 0x3f } }, + { { 0, 0x10, 0x3f }, { 1, 0x00, 0x3f } }, + { { 0, 0x11, 0x3e }, { 1, 0x01, 0x3f } }, + { 
{ 0, 0x19, 0x2f }, { 1, 0x10, 0x30 } }, + { { 0, 0x11, 0x3f }, { 1, 0x02, 0x3f } }, + { { 0, 0x12, 0x3e }, { 1, 0x10, 0x31 } }, + { { 0, 0x12, 0x3f }, { 1, 0x03, 0x3f } }, + { { 0, 0x1b, 0x2e }, { 1, 0x10, 0x32 } }, + { { 0, 0x13, 0x3e }, { 1, 0x04, 0x3f } }, + { { 0, 0x13, 0x3f }, { 1, 0x10, 0x33 } }, + { { 0, 0x14, 0x3e }, { 1, 0x05, 0x3f } }, + { { 0, 0x1c, 0x2f }, { 1, 0x10, 0x34 } }, + { { 0, 0x14, 0x3f }, { 1, 0x06, 0x3f } }, + { { 0, 0x15, 0x3e }, { 1, 0x10, 0x35 } }, + { { 0, 0x15, 0x3f }, { 1, 0x07, 0x3f } }, + { { 0, 0x1e, 0x2e }, { 1, 0x10, 0x36 } }, + { { 0, 0x16, 0x3e }, { 1, 0x08, 0x3f } }, + { { 0, 0x16, 0x3f }, { 1, 0x10, 0x37 } }, + { { 0, 0x17, 0x3e }, { 1, 0x09, 0x3f } }, + { { 0, 0x1f, 0x2f }, { 1, 0x10, 0x38 } }, + { { 0, 0x17, 0x3f }, { 1, 0x0a, 0x3f } }, + { { 0, 0x18, 0x3e }, { 1, 0x10, 0x39 } }, + { { 0, 0x18, 0x3f }, { 1, 0x0b, 0x3f } }, + { { 0, 0x20, 0x2f }, { 1, 0x10, 0x3a } }, + { { 0, 0x19, 0x3e }, { 1, 0x0c, 0x3f } }, + { { 0, 0x19, 0x3f }, { 1, 0x10, 0x3b } }, + { { 0, 0x1a, 0x3e }, { 1, 0x0d, 0x3f } }, + { { 0, 0x20, 0x32 }, { 1, 0x10, 0x3c } }, + { { 0, 0x1a, 0x3f }, { 1, 0x0e, 0x3f } }, + { { 0, 0x1b, 0x3e }, { 1, 0x10, 0x3d } }, + { { 0, 0x1b, 0x3f }, { 1, 0x0f, 0x3f } }, + { { 0, 0x20, 0x35 }, { 1, 0x10, 0x3e } }, + { { 0, 0x1c, 0x3e }, { 1, 0x10, 0x3e } }, + { { 0, 0x1c, 0x3f }, { 1, 0x10, 0x3f } }, + { { 0, 0x1d, 0x3e }, { 1, 0x10, 0x3f } }, + { { 0, 0x20, 0x38 }, { 1, 0x11, 0x3f } }, + { { 0, 0x1d, 0x3f }, { 1, 0x11, 0x3f } }, + { { 0, 0x1e, 0x3e }, { 1, 0x12, 0x3f } }, + { { 0, 0x1e, 0x3f }, { 1, 0x12, 0x3f } }, + { { 0, 0x20, 0x3b }, { 1, 0x13, 0x3f } }, + { { 0, 0x1f, 0x3e }, { 1, 0x13, 0x3f } }, + { { 0, 0x1f, 0x3f }, { 1, 0x14, 0x3f } }, + { { 0, 0x20, 0x3d }, { 1, 0x14, 0x3f } }, + { { 0, 0x20, 0x3e }, { 1, 0x15, 0x3f } }, + { { 0, 0x20, 0x3f }, { 1, 0x15, 0x3f } }, + { { 0, 0x29, 0x2e }, { 1, 0x16, 0x3f } }, + { { 0, 0x21, 0x3e }, { 1, 0x16, 0x3f } }, + { { 0, 0x21, 0x3f }, { 1, 0x17, 0x3f } }, + { { 0, 0x22, 0x3e }, 
{ 1, 0x17, 0x3f } }, + { { 0, 0x2a, 0x2f }, { 1, 0x18, 0x3f } }, + { { 0, 0x22, 0x3f }, { 1, 0x18, 0x3f } }, + { { 0, 0x23, 0x3e }, { 1, 0x19, 0x3f } }, + { { 0, 0x23, 0x3f }, { 1, 0x19, 0x3f } }, + { { 0, 0x2c, 0x2e }, { 1, 0x1a, 0x3f } }, + { { 0, 0x24, 0x3e }, { 1, 0x1a, 0x3f } }, + { { 0, 0x24, 0x3f }, { 1, 0x1b, 0x3f } }, + { { 0, 0x25, 0x3e }, { 1, 0x1b, 0x3f } }, + { { 0, 0x2d, 0x2f }, { 1, 0x1c, 0x3f } }, + { { 0, 0x25, 0x3f }, { 1, 0x1c, 0x3f } }, + { { 0, 0x26, 0x3e }, { 1, 0x1d, 0x3f } }, + { { 0, 0x26, 0x3f }, { 1, 0x1d, 0x3f } }, + { { 1, 0x1e, 0x3f }, { 0, 0x26, 0x3f } }, + { { 0, 0x27, 0x3e }, { 1, 0x1e, 0x3f } }, + { { 0, 0x27, 0x3f }, { 1, 0x1f, 0x3f } }, + { { 0, 0x28, 0x3e }, { 1, 0x1f, 0x3f } }, + { { 1, 0x20, 0x3f }, { 0, 0x28, 0x3e } }, + { { 0, 0x28, 0x3f }, { 1, 0x20, 0x3f } }, + { { 0, 0x29, 0x3e }, { 1, 0x21, 0x3f } }, + { { 0, 0x29, 0x3f }, { 1, 0x30, 0x30 } }, + { { 0, 0x30, 0x31 }, { 1, 0x22, 0x3f } }, + { { 0, 0x2a, 0x3e }, { 1, 0x30, 0x31 } }, + { { 0, 0x2a, 0x3f }, { 1, 0x23, 0x3f } }, + { { 0, 0x2b, 0x3e }, { 1, 0x30, 0x32 } }, + { { 0, 0x30, 0x34 }, { 1, 0x24, 0x3f } }, + { { 0, 0x2b, 0x3f }, { 1, 0x30, 0x33 } }, + { { 0, 0x2c, 0x3e }, { 1, 0x25, 0x3f } }, + { { 0, 0x2c, 0x3f }, { 1, 0x30, 0x34 } }, + { { 0, 0x30, 0x37 }, { 1, 0x26, 0x3f } }, + { { 0, 0x2d, 0x3e }, { 1, 0x30, 0x35 } }, + { { 0, 0x2d, 0x3f }, { 1, 0x27, 0x3f } }, + { { 0, 0x2e, 0x3e }, { 1, 0x30, 0x36 } }, + { { 0, 0x30, 0x3a }, { 1, 0x28, 0x3f } }, + { { 0, 0x2e, 0x3f }, { 1, 0x30, 0x37 } }, + { { 0, 0x2f, 0x3e }, { 1, 0x29, 0x3f } }, + { { 0, 0x2f, 0x3f }, { 1, 0x30, 0x38 } }, + { { 0, 0x30, 0x3d }, { 1, 0x2a, 0x3f } }, + { { 0, 0x30, 0x3e }, { 1, 0x30, 0x39 } }, + { { 1, 0x2b, 0x3f }, { 0, 0x30, 0x3e } }, + { { 0, 0x30, 0x3f }, { 1, 0x30, 0x3a } }, + { { 0, 0x31, 0x3e }, { 1, 0x2c, 0x3f } }, + { { 0, 0x31, 0x3f }, { 1, 0x30, 0x3b } }, + { { 1, 0x2d, 0x3f }, { 0, 0x31, 0x3f } }, + { { 0, 0x32, 0x3e }, { 1, 0x30, 0x3c } }, + { { 0, 0x32, 0x3f }, { 1, 0x2e, 0x3f } 
}, + { { 0, 0x33, 0x3e }, { 1, 0x30, 0x3d } }, + { { 1, 0x2f, 0x3f }, { 0, 0x33, 0x3e } }, + { { 0, 0x33, 0x3f }, { 1, 0x30, 0x3e } }, + { { 0, 0x34, 0x3e }, { 1, 0x30, 0x3e } }, + { { 0, 0x34, 0x3f }, { 1, 0x30, 0x3f } }, + { { 0, 0x34, 0x3f }, { 1, 0x30, 0x3f } }, + { { 0, 0x35, 0x3e }, { 1, 0x31, 0x3f } }, + { { 0, 0x35, 0x3f }, { 1, 0x31, 0x3f } }, + { { 0, 0x36, 0x3e }, { 1, 0x32, 0x3f } }, + { { 0, 0x36, 0x3e }, { 1, 0x32, 0x3f } }, + { { 0, 0x36, 0x3f }, { 1, 0x33, 0x3f } }, + { { 0, 0x37, 0x3e }, { 1, 0x33, 0x3f } }, + { { 0, 0x37, 0x3f }, { 1, 0x34, 0x3f } }, + { { 0, 0x37, 0x3f }, { 1, 0x34, 0x3f } }, + { { 0, 0x38, 0x3e }, { 1, 0x35, 0x3f } }, + { { 0, 0x38, 0x3f }, { 1, 0x35, 0x3f } }, + { { 0, 0x39, 0x3e }, { 1, 0x36, 0x3f } }, + { { 0, 0x39, 0x3e }, { 1, 0x36, 0x3f } }, + { { 0, 0x39, 0x3f }, { 1, 0x37, 0x3f } }, + { { 0, 0x3a, 0x3e }, { 1, 0x37, 0x3f } }, + { { 0, 0x3a, 0x3f }, { 1, 0x38, 0x3f } }, + { { 0, 0x3a, 0x3f }, { 1, 0x38, 0x3f } }, + { { 0, 0x3b, 0x3e }, { 1, 0x39, 0x3f } }, + { { 0, 0x3b, 0x3f }, { 1, 0x39, 0x3f } }, + { { 0, 0x3c, 0x3e }, { 1, 0x3a, 0x3f } }, + { { 0, 0x3c, 0x3e }, { 1, 0x3a, 0x3f } }, + { { 0, 0x3c, 0x3f }, { 1, 0x3b, 0x3f } }, + { { 0, 0x3d, 0x3e }, { 1, 0x3b, 0x3f } }, + { { 0, 0x3d, 0x3f }, { 1, 0x3c, 0x3f } }, + { { 0, 0x3d, 0x3f }, { 1, 0x3c, 0x3f } }, + { { 0, 0x3e, 0x3e }, { 1, 0x3d, 0x3f } }, + { { 0, 0x3e, 0x3f }, { 1, 0x3d, 0x3f } }, + { { 1, 0x3e, 0x3f }, { 0, 0x3e, 0x3f } }, + { { 0, 0x3f, 0x3f }, { 1, 0x3e, 0x3f } }, + { { 0, 0x3f, 0x3f }, { 1, 0x3f, 0x3f } } +}; \ No newline at end of file diff --git a/BPTCEncoder/src/BitStream.h b/BPTCEncoder/src/BitStream.h new file mode 100755 index 0000000..d3975ae --- /dev/null +++ b/BPTCEncoder/src/BitStream.h @@ -0,0 +1,115 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this 
+// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __BITSTREAM_H__ +#define __BITSTREAM_H__ + +class BitStream { +public: + BitStream(unsigned char *ptr, int nBits, int start_offset) : + m_BitsWritten(0), + m_NumBits(nBits), + m_NumBytes((nBits + start_offset + 7) >> 3), + m_CurByte(ptr), + m_NextBit(start_offset % 8), + done(false) + { } + + int GetBitsWritten() const { return m_BitsWritten; } + + ~BitStream() { } + void WriteBitsR(unsigned int val, unsigned int nBits) { + for(unsigned int i = 0; i < nBits; i++) { + WriteBit((val >> (nBits - i - 1)) & 1); + } + } + + void WriteBits(unsigned int val, unsigned int nBits) { + for(unsigned int i = 0; i < nBits; i++) { + WriteBit((val >> i) & 1); + } + } + +private: + void WriteBit(int b) { + + if(done) return; + + const unsigned int mask = 1 << m_NextBit++; + + // clear the bit + *m_CurByte &= ~mask; + + // Write the bit, if necessary + if(b) *m_CurByte |= mask; + + // Next byte? 
+ if(m_NextBit >= 8) { + m_CurByte += 1; + m_NextBit = 0; + } + + done = done || ++m_BitsWritten >= m_NumBits; + } + + int m_BitsWritten; + int m_NextBit; + const int m_NumBytes; + const int m_NumBits; + unsigned char *m_CurByte; + + bool done; +}; + +class BitStreamReadOnly { +public: + BitStreamReadOnly(const unsigned char *ptr) : + m_BitsRead(0), + m_CurByte(ptr), + m_NextBit(0) + { } + + int GetBitsRead() const { return m_BitsRead; } + + ~BitStreamReadOnly() { } + + int ReadBit() { + + int bit = *m_CurByte >> m_NextBit++; + while(m_NextBit >= 8) { + m_NextBit -= 8; + m_CurByte++; + } + + m_BitsRead++; + return bit & 1; + } + + unsigned int ReadBits(unsigned int nBits) { + unsigned int ret = 0; + for(unsigned int i = 0; i < nBits; i++) { + ret |= (ReadBit() & 1) << i; + } + return ret; + } + +private: + int m_BitsRead; + int m_NextBit; + const unsigned char *m_CurByte; +}; +#endif //__BITSTREAM_H__ \ No newline at end of file diff --git a/BPTCEncoder/src/RGBAEndpoints.cpp b/BPTCEncoder/src/RGBAEndpoints.cpp new file mode 100755 index 0000000..0b7c541 --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpoints.cpp @@ -0,0 +1,509 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "BC7IntTypes.h" +#include "RGBAEndpoints.h" +#include "BC7Compressor.h" +#include "BC7CompressionMode.h" + +#include +#include +#include +#include + +#ifndef min +template +static T min(const T &a, const T &b) { + return (a > b)? b : a; +} +#endif + +#ifndef max +template +static T max(const T &a, const T &b) { + return (a > b)? a : b; +} +#endif + +static const double kPi = 3.141592653589793238462643383279502884197; +static const float kFloatConversion[256] = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, + 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, + 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, + 48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f, + 64.0f, 65.0f, 66.0f, 67.0f, 68.0f, 69.0f, 70.0f, 71.0f, 72.0f, 73.0f, 74.0f, 75.0f, 76.0f, 77.0f, 78.0f, 79.0f, + 80.0f, 81.0f, 82.0f, 83.0f, 84.0f, 85.0f, 86.0f, 87.0f, 88.0f, 89.0f, 90.0f, 91.0f, 92.0f, 93.0f, 94.0f, 95.0f, + 96.0f, 97.0f, 98.0f, 99.0f, 100.0f, 101.0f, 102.0f, 103.0f, 104.0f, 105.0f, 106.0f, 107.0f, 108.0f, 109.0f, 110.0f, 111.0f, + 112.0f, 113.0f, 114.0f, 115.0f, 116.0f, 117.0f, 118.0f, 119.0f, 120.0f, 121.0f, 122.0f, 123.0f, 124.0f, 125.0f, 126.0f, 127.0f, + 128.0f, 129.0f, 130.0f, 131.0f, 132.0f, 133.0f, 134.0f, 135.0f, 136.0f, 137.0f, 138.0f, 139.0f, 140.0f, 141.0f, 142.0f, 143.0f, + 144.0f, 145.0f, 146.0f, 147.0f, 148.0f, 149.0f, 150.0f, 151.0f, 152.0f, 153.0f, 154.0f, 155.0f, 156.0f, 157.0f, 158.0f, 159.0f, + 160.0f, 161.0f, 162.0f, 163.0f, 164.0f, 165.0f, 166.0f, 167.0f, 168.0f, 169.0f, 170.0f, 171.0f, 
172.0f, 173.0f, 174.0f, 175.0f, + 176.0f, 177.0f, 178.0f, 179.0f, 180.0f, 181.0f, 182.0f, 183.0f, 184.0f, 185.0f, 186.0f, 187.0f, 188.0f, 189.0f, 190.0f, 191.0f, + 192.0f, 193.0f, 194.0f, 195.0f, 196.0f, 197.0f, 198.0f, 199.0f, 200.0f, 201.0f, 202.0f, 203.0f, 204.0f, 205.0f, 206.0f, 207.0f, + 208.0f, 209.0f, 210.0f, 211.0f, 212.0f, 213.0f, 214.0f, 215.0f, 216.0f, 217.0f, 218.0f, 219.0f, 220.0f, 221.0f, 222.0f, 223.0f, + 224.0f, 225.0f, 226.0f, 227.0f, 228.0f, 229.0f, 230.0f, 231.0f, 232.0f, 233.0f, 234.0f, 235.0f, 236.0f, 237.0f, 238.0f, 239.0f, + 240.0f, 241.0f, 242.0f, 243.0f, 244.0f, 245.0f, 246.0f, 247.0f, 248.0f, 249.0f, 250.0f, 251.0f, 252.0f, 253.0f, 254.0f, 255.0f +}; + +/////////////////////////////////////////////////////////////////////////////// +// +// Static helper functions +// +/////////////////////////////////////////////////////////////////////////////// +static inline uint32 CountBitsInMask(uint8 n) { + +#if _WIN64 + if(!n) return 0; // no bits set + if(!(n & (n-1))) return 1; // power of two + + uint32 c; + for(c = 0; n; c++) { + n &= n - 1; + } + return c; +#else + + __asm + { + mov eax, 8 + movzx ecx, n + bsf ecx, ecx + sub eax, ecx + } + +#endif +} + +template +static inline void clamp(ty &x, const ty &min, const ty &max) { + x = (x < min)? min : ((x > max)? max : x); +} + +// absolute distance. It turns out the compiler does a much +// better job of optimizing this than we can, since we can't +// translate the values to/from registers +static uint8 sad(uint8 a, uint8 b) { +#if 0 + __asm + { + movzx eax, a + movzx ecx, b + sub eax, ecx + jns done + neg eax +done: + } +#else + //const INT d = a - b; + //const INT mask = d >> 31; + //return (d ^ mask) - mask; + + // return abs(a - b); + + return (a > b)? 
a - b : b - a; + +#endif +} + +/////////////////////////////////////////////////////////////////////////////// +// +// RGBAVector implementation +// +/////////////////////////////////////////////////////////////////////////////// + +uint8 QuantizeChannel(const uint8 val, const uint8 mask, const int pBit) { + + // If the mask is all the bits, then we can just return the value. + if(mask == 0xFF) { + return val; + } + + uint32 prec = CountBitsInMask(mask); + const uint32 step = 1 << (8 - prec); + + assert(step-1 == uint8(~mask)); + + uint32 lval = val & mask; + uint32 hval = lval + step; + + if(pBit >= 0) { + prec++; + lval |= !!(pBit) << (8 - prec); + hval |= !!(pBit) << (8 - prec); + } + + if(lval > val) { + lval -= step; + hval -= step; + } + + lval |= lval >> prec; + hval |= hval >> prec; + + if(sad(val, lval) < sad(val, hval)) + return lval; + else + return hval; +} + +uint32 RGBAVector::ToPixel(const uint32 channelMask, const int pBit) const { + uint32 ret = 0; + uint8 *pRet = (uint8 *)&ret; + + const uint8 *channelMaskBytes = (const uint8 *)&channelMask; + + pRet[0] = QuantizeChannel(uint32(r + 0.5) & 0xFF, channelMaskBytes[0], pBit); + pRet[1] = QuantizeChannel(uint32(g + 0.5) & 0xFF, channelMaskBytes[1], pBit); + pRet[2] = QuantizeChannel(uint32(b + 0.5) & 0xFF, channelMaskBytes[2], pBit); + pRet[3] = QuantizeChannel(uint32(a + 0.5) & 0xFF, channelMaskBytes[3], pBit); + + return ret; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// RGBAMatrix implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBAMatrix &RGBAMatrix::operator *=(const RGBAMatrix &mat) { + *this = ((*this) * mat); + return (*this); +} + +RGBAMatrix RGBAMatrix::operator *(const RGBAMatrix &mat) const { + + RGBAMatrix result; + + for(int i = 0; i < 4; i++) { + for(int j = 0; j < 4; j++) { + + result(i, j) = 0.0f; + for(int k = 0; k < 4; k++) { + result(i, j) += m[i*4 + k] * mat.m[k*4 + j]; + } + } + 
} + + return result; +} + +RGBAVector RGBAMatrix::operator *(const RGBAVector &p) const { + return RGBAVector ( + p.x * m1 + p.y * m2 + p.z * m3 + p.w * m4, + p.x * m5 + p.y * m6 + p.z * m7 + p.w * m8, + p.x * m9 + p.y * m10 + p.z * m11 + p.w * m12, + p.x * m13 + p.y * m14 + p.z * m15 + p.w * m16 + ); +} + +RGBAMatrix RGBAMatrix::RotateX(float rad) { + RGBAMatrix result; + result.m6 = result.m11 = cos(rad); + result.m10 = sin(rad); + result.m7 = -result.m10; + return result; +} + +RGBAMatrix RGBAMatrix::RotateY(float rad) { + RGBAMatrix result; + result.m1 = result.m11 = cos(rad); + result.m3 = sin(rad); + result.m9 = -result.m3; + return result; +} + +RGBAMatrix RGBAMatrix::RotateZ(float rad) { + RGBAMatrix result; + result.m1 = result.m6 = cos(rad); + result.m5 = sin(rad); + result.m2 = -result.m5; + return result; +} + +RGBAMatrix RGBAMatrix::Translate(const RGBAVector &t) { + RGBAMatrix result; + result.m4 = t.x; + result.m8 = t.y; + result.m12 = t.z; + result.m16 = t.w; + return result; +} + +bool RGBAMatrix::Identity() { + for(int i = 0; i < 4; i++) { + for(int j = 0; j < 4; j++) { + + if(i == j) { + if(fabs(m[i*4 + j] - 1.0f) > 1e-5) + return false; + } + else { + if(fabs(m[i*4 + j]) > 1e-5) + return false; + } + } + } + + return true; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Cluster implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBACluster::RGBACluster(const RGBACluster &left, const RGBACluster &right) { + *this = left; + for(int i = 0; i < right.m_NumPoints; i++) { + const RGBAVector &p = right.m_DataPoints[i]; + AddPoint(p); + } + + m_PrincipalAxisCached = false; +} + +void RGBACluster::AddPoint(const RGBAVector &p) { + assert(m_NumPoints < kMaxNumDataPoints); + m_Total += p; + m_DataPoints[m_NumPoints++] = p; + m_PointBitString |= 1 << p.GetIdx(); + + for(int i = 0; i < kNumColorChannels; i++) { + m_Min.c[i] = min(p.c[i], m_Min.c[i]); + 
m_Max.c[i] = max(p.c[i], m_Max.c[i]); + } +} + +void RGBACluster::GetPrincipalAxis(RGBADir &axis) { + + if(m_PrincipalAxisCached) { + axis = m_PrincipalAxis; + return; + } + + RGBAVector avg = m_Total / float(m_NumPoints); + ::GetPrincipalAxis(m_NumPoints, m_DataPoints, m_PrincipalAxis); + m_PrincipalAxisCached = true; + + GetPrincipalAxis(axis); +} + +double RGBACluster::QuantizedError(const RGBAVector &p1, const RGBAVector &p2, uint8 nBuckets, uint32 bitMask, const RGBAVector &errorMetricVec, const int pbits[2], int *indices) const { + + // nBuckets should be a power of two. + assert(nBuckets == 3 || !(nBuckets & (nBuckets - 1))); + + const uint8 indexPrec = (nBuckets == 3)? 3 : 8-CountBitsInMask(~(nBuckets - 1)); + + typedef uint32 tInterpPair[2]; + typedef tInterpPair tInterpLevel[16]; + const tInterpLevel *interpVals = (nBuckets == 3)? kBC7InterpolationValues : kBC7InterpolationValues + (indexPrec - 1); + + assert(indexPrec >= 2 && indexPrec <= 4); + + uint32 qp1, qp2; + if(pbits) { + qp1 = p1.ToPixel(bitMask, pbits[0]); + qp2 = p2.ToPixel(bitMask, pbits[1]); + } + else { + qp1 = p1.ToPixel(bitMask); + qp2 = p2.ToPixel(bitMask); + } + + uint8 *pqp1 = (uint8 *)&qp1; + uint8 *pqp2 = (uint8 *)&qp2; + + float totalError = 0.0; + for(int i = 0; i < m_NumPoints; i++) { + + const uint32 pixel = m_DataPoints[i].ToPixel(); + const uint8 *pb = (const uint8 *)(&pixel); + + float minError = FLT_MAX; + int bestBucket = -1; + for(int j = 0; j < nBuckets; j++) { + + uint32 interp0 = (*interpVals)[j][0]; + uint32 interp1 = (*interpVals)[j][1]; + + RGBAVector errorVec (0.0f); + for(int k = 0; k < kNumColorChannels; k++) { + const uint8 ip = (((uint32(pqp1[k]) * interp0) + (uint32(pqp2[k]) * interp1) + 32) >> 6) & 0xFF; + const uint8 dist = sad(pb[k], ip); + errorVec.c[k] = kFloatConversion[dist]; + } + + errorVec *= errorMetricVec; + float error = errorVec * errorVec; + if(error < minError) { + minError = error; + bestBucket = j; + } + + // Conceptually, once the error starts 
growing, it doesn't stop growing (we're moving + // farther away from the reference point along the line). Hence we can early out here. + // However, quanitzation artifacts mean that this is not ALWAYS the case, so we do suffer + // about 0.01 RMS error. + else if(error > minError) { + break; + } + } + + totalError += minError; + + assert(bestBucket >= 0); + if(indices) indices[i] = bestBucket; + } + + return totalError; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Utility function implementation +// +/////////////////////////////////////////////////////////////////////////////// + +void ClampEndpoints(RGBAVector &p1, RGBAVector &p2) { + clamp(p1.r, 0.0f, 255.0f); + clamp(p1.g, 0.0f, 255.0f); + clamp(p1.b, 0.0f, 255.0f); + clamp(p1.a, 0.0f, 255.0f); + + clamp(p2.r, 0.0f, 255.0f); + clamp(p2.g, 0.0f, 255.0f); + clamp(p2.b, 0.0f, 255.0f); + clamp(p2.a, 0.0f, 255.0f); +} + +void GetPrincipalAxis(int nPts, const RGBAVector *pts, RGBADir &axis) { + + assert(nPts > 0); + assert(nPts <= kMaxNumDataPoints); + + RGBAVector avg (0.0f); + for(int i = 0; i < nPts; i++) { + avg += pts[i]; + } + avg /= float(nPts); + + // We use these vectors for calculating the covariance matrix... + RGBAVector toPts[kMaxNumDataPoints]; + RGBAVector toPtsMax(-FLT_MAX); + for(int i = 0; i < nPts; i++) { + toPts[i] = pts[i] - avg; + + for(int j = 0; j < kNumColorChannels; j++) { + toPtsMax.c[j] = max(toPtsMax.c[j], toPts[i].c[j]); + } + } + + // Generate a list of unique points... + RGBAVector upts[kMaxNumDataPoints]; + int uptsIdx = 0; + for(int i = 0; i < nPts; i++) { + + bool hasPt = false; + for(int j = 0; j < uptsIdx; j++) { + if(upts[j] == pts[i]) + hasPt = true; + } + + if(!hasPt) { + upts[uptsIdx++] = pts[i]; + } + } + + assert(uptsIdx > 0); + + if(uptsIdx == 1) { + axis.r = axis.g = axis.b = axis.a = 0.0f; + return; + } + // Collinear? 
+ else { + + RGBADir dir (upts[1] - upts[0]); + bool collinear = true; + for(int i = 2; i < nPts; i++) { + RGBAVector v = (upts[i] - upts[0]); + if(fabs(fabs(v*dir) - v.Length()) > 1e-7) { + collinear = false; + break; + } + } + + if(collinear) { + axis = dir; + return; + } + } + + RGBAMatrix covMatrix; + + // Compute covariance. + for(int i = 0; i < kNumColorChannels; i++) { + for(int j = 0; j <= i; j++) { + + float sum = 0.0; + for(int k = 0; k < nPts; k++) { + sum += toPts[k].c[i] * toPts[k].c[j]; + } + + covMatrix(i, j) = sum / kFloatConversion[kNumColorChannels - 1]; + covMatrix(j, i) = covMatrix(i, j); + } + } + + // !SPEED! Find eigenvectors by using the power method. This is good because the + // matrix is only 4x4, which allows us to use SIMD... + RGBAVector b = toPtsMax; + assert(b.Length() > 0); + b /= b.Length(); + + bool fixed = false; + int infLoopPrevention = 0; + const int kMaxNumIterations = 200; + while(!fixed && ++infLoopPrevention < kMaxNumIterations) { + + RGBAVector newB = covMatrix * b; + + // !HACK! If the principal eigenvector of the covariance matrix + // converges to zero, that means that the points lie equally + // spaced on a sphere in this space. In this (extremely rare) + // situation, just choose a point and use it as the principal + // direction. 
+ const float newBlen = newB.Length(); + if(newBlen < 1e-10) { + axis = toPts[0]; + return; + } + + newB /= newB.Length(); + + if(fabs(1.0f - (b * newB)) < 1e-5) + fixed = true; + + b = newB; + } + + assert(infLoopPrevention < kMaxNumIterations); + axis = b; +} diff --git a/BPTCEncoder/src/RGBAEndpoints.h b/BPTCEncoder/src/RGBAEndpoints.h new file mode 100755 index 0000000..f84700d --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpoints.h @@ -0,0 +1,354 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. 
+// +//-------------------------------------------------------------------------------------- + +#ifndef __RGBA_ENDPOINTS_H__ +#define __RGBA_ENDPOINTS_H__ + +#include "BC7IntTypes.h" +#include +#include +#include + +static const int kNumColorChannels = 4; +static const int kMaxNumDataPoints = 16; + +class RGBAVector { + +public: + union { + struct { float r, g, b, a; }; + struct { float x, y, z, w; }; + float c[4]; + }; + + uint32 GetIdx() const { return idx; } + + RGBAVector() : r(-1.0), g(-1.0), b(-1.0), a(-1.0) { } + RGBAVector(uint32 _idx, uint32 pixel) : + r(float(pixel & 0xFF)), + g(float((pixel >> 8) & 0xFF)), + b(float((pixel >> 16) & 0xFF)), + a(float((pixel >> 24) & 0xFF)), + idx(_idx) + { } + + RGBAVector(float _r, float _g, float _b, float _a) : + r(_r), g(_g), b(_b), a(_a) { } + + explicit RGBAVector(float cc) : r(cc), g(cc), b(cc), a(cc) { } + + RGBAVector &operator =(const RGBAVector &other) { + this->idx = other.idx; + memcpy(c, other.c, sizeof(c)); + return (*this); + } + + RGBAVector operator +(const RGBAVector &p) const { + return RGBAVector(r + p.r, g + p.g, b + p.b, a + p.a); + } + + RGBAVector &operator +=(const RGBAVector &p) { + r += p.r; g += p.g; b += p.b; a += p.a; + return *this; + } + + RGBAVector operator -(const RGBAVector &p) const { + return RGBAVector(r - p.r, g - p.g, b - p.b, a - p.a); + } + + RGBAVector &operator -=(const RGBAVector &p) { + r -= p.r; g -= p.g; b -= p.b; a -= p.a; + return *this; + } + + RGBAVector operator /(const float s) const { + return RGBAVector(r / s, g / s, b / s, a / s); + } + + RGBAVector &operator /=(const float s) { + r /= s; g /= s; b /= s; a /= s; + return *this; + } + + float operator *(const RGBAVector &p) const { + return r * p.r + g * p.g + b * p.b + a * p.a; + } + + float Length() const { + return sqrt((*this) * (*this)); + } + + RGBAVector &operator *=(const RGBAVector &v) { + r *= v.r; g *= v.g; b *= v.b; a *= v.a; + return *this; + } + + RGBAVector operator *(const float s) const { + return 
RGBAVector(r * s, g * s, b * s, a * s); + } + + friend RGBAVector operator *(const float s, const RGBAVector &p) { + return RGBAVector(p.r * s, p.g * s, p.b * s, p.a * s); + } + + RGBAVector &operator *=(const float s) { + r *= s; g *= s; b *= s; a *= s; + return *this; + } + + float &operator [](const int i) { + return c[i]; + } + + friend bool operator ==(const RGBAVector &rhs, const RGBAVector &lhs) { + const RGBAVector d = rhs - lhs; + return fabs(d.r) < 1e-7 && fabs(d.g) < 1e-7 && fabs(d.b) < 1e-7 && fabs(d.a) < 1e-7; + } + + friend bool operator !=(const RGBAVector &rhs, const RGBAVector &lhs) { + return !(rhs == lhs); + } + + operator float *() { + return c; + } + + RGBAVector Cross(const RGBAVector &rhs) { + return RGBAVector( + rhs.y * z - y * rhs.z, + rhs.z * x - z * rhs.x, + rhs.x * y - x * rhs.y, + 1.0f + ); + } + + // Quantize this point. + uint32 ToPixel(const uint32 channelMask = 0xFFFFFFFF, const int pBit = -1) const; + +private: + uint32 idx; +}; + +class RGBAMatrix { +private: + union { + float m[kNumColorChannels*kNumColorChannels]; + struct { + float m1, m2, m3, m4; + float m5, m6, m7, m8; + float m9, m10, m11, m12; + float m13, m14, m15, m16; + }; + }; + + RGBAMatrix(const float *arr) { + memcpy(m, arr, sizeof(m)); + } + +public: + + RGBAMatrix() : + m1(1.0f), m2(0.0f), m3(0.0f), m4(0.0f), + m5(0.0f), m6(1.0f), m7(0.0f), m8(0.0f), + m9(0.0f), m10(0.0f), m11(1.0f), m12(0.0f), + m13(0.0f), m14(0.0f), m15(0.0f), m16(1.0f) + { } + + RGBAMatrix &operator =(const RGBAMatrix &other) { + memcpy(m, other.m, sizeof(m)); + return (*this); + } + + RGBAMatrix operator +(const RGBAMatrix &p) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] + p.m[i]; + return RGBAMatrix(newm); + } + + RGBAMatrix &operator +=(const RGBAMatrix &p) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] += p.m[i]; + return *this; + } + + RGBAMatrix operator -(const RGBAMatrix 
&p) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] - p.m[i]; + return RGBAMatrix(newm); + } + + RGBAMatrix &operator -=(const RGBAMatrix &p) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] -= p.m[i]; + return *this; + } + + RGBAMatrix operator /(const float s) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] / s; + return RGBAMatrix(newm); + } + + RGBAMatrix &operator /=(const float s) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] /= s; + return *this; + } + + RGBAMatrix operator *(const float s) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = m[i] * s; + return RGBAMatrix(newm); + } + + RGBAMatrix operator *(const double s) const { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = float(double(m[i]) * s); + return RGBAMatrix(newm); + } + + friend RGBAMatrix operator *(const float s, const RGBAMatrix &p) { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = p.m[i] * s; + return RGBAMatrix(newm); + } + + friend RGBAMatrix operator *(const double s, const RGBAMatrix &p) { + float newm[kNumColorChannels*kNumColorChannels]; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) newm[i] = float(double(p.m[i]) * s); + return RGBAMatrix(newm); + } + + RGBAMatrix &operator *=(const float s) { + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) m[i] *= s; + return *this; + } + + float &operator ()(const int i, const int j) { + return (*this)[i*4 + j]; + } + + float &operator [](const int i) { + return m[i]; + } + + friend bool operator ==(const RGBAMatrix &rhs, const RGBAMatrix &lhs) { + const RGBAMatrix d = 
rhs - lhs; + for(int i = 0; i < kNumColorChannels*kNumColorChannels; i++) + if(d.m[i] > 1e-10) + return false; + return true; + } + + operator float *() { + return m; + } + + RGBAVector operator *(const RGBAVector &p) const; + RGBAMatrix operator *(const RGBAMatrix &mat) const; + RGBAMatrix &operator *=(const RGBAMatrix &mat); + static RGBAMatrix RotateX(float rad); + static RGBAMatrix RotateY(float rad); + static RGBAMatrix RotateZ(float rad); + static RGBAMatrix Translate(const RGBAVector &t); + bool Identity(); +}; + +class RGBADir : public RGBAVector { +public: + RGBADir() : RGBAVector() { } + RGBADir(const RGBAVector &p) : RGBAVector(p) { + *this /= Length(); + } +}; + +// Makes sure that the values of the endpoints lie between 0 and 1. +extern void ClampEndpoints(RGBAVector &p1, RGBAVector &p2); + +class RGBACluster { +public: + + RGBACluster() : + m_NumPoints(0), m_Total(0), + m_PointBitString(0), + m_Min(FLT_MAX), + m_Max(-FLT_MAX), + m_PrincipalAxisCached(false) + { } + + RGBACluster(const RGBACluster &c) : + m_NumPoints(c.m_NumPoints), + m_Total(c.m_Total), + m_PointBitString(c.m_PointBitString), + m_Min(c.m_Min), + m_Max(c.m_Max), + m_PrincipalAxisCached(false) + { + memcpy(this->m_DataPoints, c.m_DataPoints, m_NumPoints * sizeof(RGBAVector)); + } + + RGBACluster(const RGBACluster &left, const RGBACluster &right); + RGBACluster(const RGBAVector &p) : + m_NumPoints(1), + m_Total(p), + m_PointBitString(0), + m_Min(p), m_Max(p), + m_PrincipalAxisCached(false) + { + m_DataPoints[0] = p; + m_PointBitString |= (1 << p.GetIdx()); + } + + RGBAVector GetTotal() const { return m_Total; } + const RGBAVector &GetPoint(int idx) const { return m_DataPoints[idx]; } + int GetNumPoints() const { return m_NumPoints; } + RGBAVector GetAvg() const { return m_Total / float(m_NumPoints); } + const RGBAVector *GetPoints() const { return m_DataPoints; } + + void AddPoint(const RGBAVector &p); + + void GetBoundingBox(RGBAVector &Min, RGBAVector &Max) const { + Min = m_Min, Max = 
m_Max; + } + + // Returns the error if we were to quantize the colors right now with the given number of buckets and bit mask. + double QuantizedError(const RGBAVector &p1, const RGBAVector &p2, uint8 nBuckets, uint32 bitMask, const RGBAVector &errorMetricVec, const int pbits[2] = NULL, int *indices = NULL) const; + + // Returns the principal axis for this point cluster. + void GetPrincipalAxis(RGBADir &axis); + + bool AllSamePoint() const { return m_Max == m_Min; } + int GetPointBitString() const { return m_PointBitString; } + +private: + + // The number of points in the cluster. + int m_NumPoints; + + RGBAVector m_Total; + + // The points in the cluster. + RGBAVector m_DataPoints[kMaxNumDataPoints]; + + RGBAVector m_Min, m_Max; + int m_PointBitString; + + RGBADir m_PrincipalAxis; + bool m_PrincipalAxisCached; +}; + +extern uint8 QuantizeChannel(const uint8 val, const uint8 mask, const int pBit = -1); +extern void GetPrincipalAxis(int nPts, const RGBAVector *pts, RGBADir &axis); + +#endif //__RGBA_ENDPOINTS_H__ diff --git a/BPTCEncoder/src/RGBAEndpointsSIMD.cpp b/BPTCEncoder/src/RGBAEndpointsSIMD.cpp new file mode 100755 index 0000000..7625bee --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpointsSIMD.cpp @@ -0,0 +1,420 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." 
+// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
+// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
+// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not
+// assume any responsibility for any errors which may appear in this software nor any
+// responsibility to update it.
+//
+//--------------------------------------------------------------------------------------
+
+#include "BC7Config.h"
+#include "RGBAEndpointsSIMD.h"
+#include "BC7Compressor.h"
+#include "BC7CompressionModeSIMD.h"
+
+#include
+#include
+
+#ifndef HAS_SSE_POPCNT
+static inline uint32 popcnt32(uint32 x) {
+ uint32 m1 = 0x55555555;
+ uint32 m2 = 0x33333333;
+ uint32 m3 = 0x0f0f0f0f;
+ x -= (x>>1) & m1;
+ x = (x&m2) + ((x>>2)&m2);
+ x = (x+(x>>4))&m3;
+ x += x>>8;
+ return (x+(x>>16)) & 0x3f;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// RGBAVectorSIMD implementation
+//
+///////////////////////////////////////////////////////////////////////////////
+
+/* Original scalar implementation:
+
+ // If the mask is all the bits, then we can just return the value.
+ if(mask == 0xFF) {
+ return val;
+ }
+
+ uint32 prec = CountBitsInMask(mask);
+ const uint32 step = 1 << (8 - prec);
+
+ assert(step-1 == uint8(~mask));
+
+ uint32 lval = val & mask;
+ uint32 hval = lval + step;
+
+ if(pBit >= 0) {
+ prec++;
+ lval |= !!(pBit) << (8 - prec);
+ hval |= !!(pBit) << (8 - prec);
+ }
+
+ if(lval > val) {
+ lval -= step;
+ hval -= step;
+ }
+
+ lval |= lval >> prec;
+ hval |= hval >> prec;
+
+ if(sad(val, lval) < sad(val, hval))
+ return lval;
+ else
+ return hval;
+*/
+
+// !TODO! AVX2 supports an instruction known as vsllv, which shifts a vector
+// by the values stored in another vector. I.e. 
you can do something like this:
+//
+// __m128i shiftVals = _mm_set_epi32(1, 2, 3, 4);
+// __m128i someVector = _mm_set1_epi32(1) ;
+// __m128i shifted = _mm_sllv_epi32 (someVector, shiftVals);
+//
+// and the result will be the same as _mm_set_epi32(2, 4, 8, 16);
+//
+// This is useful because our color channels may have different precisions
+// when we're quantizing them, such as for BC7 modes 4 and 5. Hence, we would
+// want to do our quantization as accurately as possible, but currently it would
+// be very hard to vectorize.
+
+#ifdef _MSC_VER
+#define ALIGN_SSE __declspec ( align(16) )
+#else
+#define ALIGN_SSE __attribute__((aligned(16)))
+#endif
+
+// Constants. There are two ways to specify them: either by using the _mm_set*
+// intrinsics, or by defining them as aligned arrays. You want to do the former
+// when you use them infrequently, and the latter when you use them multiple times
+// in a short time frame (like in an inner loop)
+static const __m128 kZero = _mm_set1_ps(0.0f);
+static const __m128 kByteMax = _mm_set1_ps(255.0f);
+static const __m128 kHalfVector = _mm_set1_ps(0.5f);
+static const __m128i kOneVector = _mm_set1_epi32(1);
+static const __m128i kZeroVector = _mm_set1_epi32(0);
+static const ALIGN_SSE uint32 kThirtyTwoVector[4] = { 32, 32, 32, 32 };
+static const __m128i kByteValMask = _mm_set_epi32(0xFF, 0xFF, 0xFF, 0xFF);
+
+static inline __m128i sad(const __m128i &a, const __m128i &b) {
+ const __m128i maxab = _mm_max_epu8(a, b);
+ const __m128i minab = _mm_min_epu8(a, b);
+ return _mm_and_si128( kByteValMask, _mm_subs_epu8( maxab, minab ) );
+}
+
+__m128i RGBAVectorSIMD::ToPixel(const __m128i &qmask) const {
+
+ // !SPEED! We should figure out a way to get rid of these scalar operations. 
+#ifdef HAS_SSE_POPCNT
+ const uint32 prec = _mm_popcnt_u32(((uint32 *)(&qmask))[0]);
+#else
+ const uint32 prec = popcnt32(((uint32 *)(&qmask))[0]);
+#endif
+
+ assert(r >= 0.0f && r <= 255.0f);
+ assert(g >= 0.0f && g <= 255.0f);
+ assert(b >= 0.0f && b <= 255.0f);
+ assert(a >= 0.0f && a <= 255.0f);
+ assert(((uint32 *)(&qmask))[3] == 0xFF || ((uint32 *)(&qmask))[3] == ((uint32 *)(&qmask))[0]);
+ assert(((uint32 *)(&qmask))[2] == ((uint32 *)(&qmask))[1] && ((uint32 *)(&qmask))[0] == ((uint32 *)(&qmask))[1]);
+
+ const __m128i val = _mm_cvtps_epi32( _mm_add_ps(kHalfVector, vec) );
+
+ const __m128i step = _mm_slli_epi32( kOneVector, 8 - prec );
+ const __m128i &mask = qmask;
+
+ __m128i lval = _mm_and_si128(val, mask);
+ __m128i hval = _mm_add_epi32(lval, step);
+
+ const __m128i lvalShift = _mm_srli_epi32(lval, prec);
+ const __m128i hvalShift = _mm_srli_epi32(hval, prec);
+
+ lval = _mm_or_si128(lval, lvalShift);
+ hval = _mm_or_si128(hval, hvalShift);
+
+ const __m128i lvald = _mm_sub_epi32( val, lval );
+ const __m128i hvald = _mm_sub_epi32( hval, val );
+
+ const __m128i vd = _mm_cmplt_epi32(lvald, hvald);
+ __m128i ans = _mm_blendv_epi8(hval, lval, vd);
+
+ const __m128i chanExact = _mm_cmpeq_epi32(mask, kByteValMask);
+ ans = _mm_blendv_epi8( ans, val, chanExact );
+ return ans;
+}
+
+__m128i RGBAVectorSIMD::ToPixel(const __m128i &qmask, const int pBit) const {
+
+ // !SPEED! We should figure out a way to get rid of these scalar operations. 
+#ifdef HAS_SSE_POPCNT
+ const uint32 prec = _mm_popcnt_u32(((uint32 *)(&qmask))[0]);
+#else
+ const uint32 prec = popcnt32(((uint32 *)(&qmask))[0]);
+#endif
+
+ assert(r >= 0.0f && r <= 255.0f);
+ assert(g >= 0.0f && g <= 255.0f);
+ assert(b >= 0.0f && b <= 255.0f);
+ assert(a >= 0.0f && a <= 255.0f);
+ assert(((uint32 *)(&qmask))[3] == 0xFF || ((uint32 *)(&qmask))[3] == ((uint32 *)(&qmask))[0]);
+ assert(((uint32 *)(&qmask))[2] == ((uint32 *)(&qmask))[1] && ((uint32 *)(&qmask))[0] == ((uint32 *)(&qmask))[1]);
+
+ const __m128i val = _mm_cvtps_epi32( _mm_add_ps(kHalfVector, vec) );
+ const __m128i pbit = _mm_set1_epi32(!!pBit);
+
+ const __m128i &mask = qmask; // _mm_set_epi32(alphaMask, channelMask, channelMask, channelMask);
+ const __m128i step = _mm_slli_epi32( kOneVector, 8 - prec );
+
+ __m128i lval = _mm_and_si128( val, mask );
+ __m128i hval = _mm_add_epi32( lval, step );
+
+ const __m128i pBitShifted = _mm_slli_epi32(pbit, 7 - prec);
+ lval = _mm_or_si128(lval, pBitShifted );
+ hval = _mm_or_si128(hval, pBitShifted);
+
+ // In the next few lines we make sure that after adding the pbit that val is
+ // still in between lval and hval. If it isn't, then we subtract a
+ // step from both. Now, val should be larger than lval and less than
+ // hval, but certain situations make this not always the case (e.g. val
+ // is 0, precision is 4 bits, and pbit is 1). Hence, we add back the
+ // step if it goes below zero, making it equivalent to hval and so it
+ // doesn't matter which we choose. 
+ { + __m128i cmp = _mm_cmpgt_epi32(lval, val); + cmp = _mm_mullo_epi32(cmp, step); + lval = _mm_add_epi32(lval, cmp); + hval = _mm_add_epi32(hval, cmp); + + cmp = _mm_cmplt_epi32(lval, kZeroVector); + cmp = _mm_mullo_epi32(cmp, step); + lval = _mm_sub_epi32(lval, cmp); + } + + const __m128i lvalShift = _mm_srli_epi32(lval, prec + 1); + const __m128i hvalShift = _mm_srli_epi32(hval, prec + 1); + + lval = _mm_or_si128(lval, lvalShift); + hval = _mm_or_si128(hval, hvalShift); + + const __m128i lvald = _mm_sub_epi32( val, lval ); + const __m128i hvald = _mm_sub_epi32( hval, val ); + + const __m128i vd = _mm_cmplt_epi32(lvald, hvald); + __m128i ans = _mm_blendv_epi8(hval, lval, vd); + + const __m128i chanExact = _mm_cmpeq_epi32(mask, kByteValMask); + ans = _mm_blendv_epi8( ans, val, chanExact ); + return ans; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// RGBAMatrixSIMD implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBAVectorSIMD RGBAMatrixSIMD::operator *(const RGBAVectorSIMD &p) const { + + __m128 xVec = _mm_set1_ps( p.x ); + __m128 yVec = _mm_set1_ps( p.y ); + __m128 zVec = _mm_set1_ps( p.z ); + __m128 wVec = _mm_set1_ps( p.w ); + + __m128 vec1 = _mm_mul_ps( xVec, col[0] ); + __m128 vec2 = _mm_mul_ps( yVec, col[1] ); + __m128 vec3 = _mm_mul_ps( zVec, col[2] ); + __m128 vec4 = _mm_mul_ps( wVec, col[3] ); + + return RGBAVectorSIMD( _mm_add_ps( _mm_add_ps( vec1, vec2 ), _mm_add_ps( vec3, vec4 ) ) ); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Cluster implementation +// +/////////////////////////////////////////////////////////////////////////////// + +RGBAClusterSIMD::RGBAClusterSIMD(const RGBAClusterSIMD &left, const RGBAClusterSIMD &right) { + + assert(!(left.m_PointBitString & right.m_PointBitString)); + + *this = left; + for(int i = 0; i < right.m_NumPoints; i++) { + + const RGBAVectorSIMD &p = 
right.m_DataPoints[i]; + + assert(m_NumPoints < kMaxNumDataPoints); + m_Total += p; + m_DataPoints[m_NumPoints++] = p; + + m_Min.vec = _mm_min_ps(m_Min.vec, p.vec); + m_Max.vec = _mm_max_ps(m_Max.vec, p.vec); + } + + m_PointBitString = left.m_PointBitString | right.m_PointBitString; + m_PrincipalAxisCached = false; +} + +void RGBAClusterSIMD::AddPoint(const RGBAVectorSIMD &p, int idx) { + assert(m_NumPoints < kMaxNumDataPoints); + m_Total += p; + m_DataPoints[m_NumPoints++] = p; + m_PointBitString |= 1 << idx; + + m_Min.vec = _mm_min_ps(m_Min.vec, p.vec); + m_Max.vec = _mm_max_ps(m_Max.vec, p.vec); +} + +float RGBAClusterSIMD::QuantizedError(const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, const uint8 nBuckets, const __m128i &bitMask, const int pbits[2], __m128i *indices) const { + + // nBuckets should be a power of two. + assert(!(nBuckets & (nBuckets - 1))); + + const uint8 indexPrec = 8-_mm_popcnt_u32(~(nBuckets - 1) & 0xFF); + assert(indexPrec >= 2 && indexPrec <= 4); + + typedef __m128i tInterpPair[2]; + typedef tInterpPair tInterpLevel[16]; + const tInterpLevel *interpVals = kBC7InterpolationValuesSIMD + (indexPrec - 1); + + __m128i qp1, qp2; + if(pbits) { + qp1 = p1.ToPixel(bitMask, pbits[0]); + qp2 = p2.ToPixel(bitMask, pbits[1]); + } + else { + qp1 = p1.ToPixel(bitMask); + qp2 = p2.ToPixel(bitMask); + } + + __m128 errorMetricVec = _mm_load_ps( BC7C::GetErrorMetric() ); + + __m128 totalError = kZero; + for(int i = 0; i < m_NumPoints; i++) { + + const __m128i pixel = m_DataPoints[i].ToPixel( kByteValMask ); + + __m128 minError = _mm_set1_ps(FLT_MAX); + __m128i bestBucket = _mm_set1_epi32(-1); + for(int j = 0; j < nBuckets; j++) { + + const __m128i jVec = _mm_set1_epi32(j); + const __m128i interp0 = (*interpVals)[j][0]; + const __m128i interp1 = (*interpVals)[j][1]; + + const __m128i ip0 = _mm_mullo_epi32( qp1, interp0 ); + const __m128i ip1 = _mm_mullo_epi32( qp2, interp1 ); + const __m128i ip = _mm_add_epi32( *((const __m128i *)kThirtyTwoVector), 
_mm_add_epi32( ip0, ip1 ) );
+ const __m128i dist = sad( _mm_and_si128( _mm_srli_epi32( ip, 6 ), kByteValMask ), pixel );
+ __m128 errorVec = _mm_cvtepi32_ps( dist );
+
+ errorVec = _mm_mul_ps( errorVec, errorMetricVec );
+ errorVec = _mm_mul_ps( errorVec, errorVec );
+ errorVec = _mm_hadd_ps( errorVec, errorVec );
+ errorVec = _mm_hadd_ps( errorVec, errorVec );
+
+ const __m128 cmp = _mm_cmple_ps( errorVec, minError );
+ minError = _mm_blendv_ps( minError, errorVec, cmp );
+ bestBucket = _mm_blendv_epi8( bestBucket, jVec, _mm_castps_si128( cmp ) );
+
+ // Conceptually, once the error starts growing, it doesn't stop growing (we're moving
+ // farther away from the reference point along the line). Hence we can early out here.
+ // However, quantization artifacts mean that this is not ALWAYS the case, so we do suffer
+ // about 0.01 RMS error.
+ if(!((uint8 *)(&cmp))[0])
+ break;
+ }
+
+ totalError = _mm_add_ps(totalError, minError);
+ if(indices) ((uint32 *)indices)[i] = ((uint32 *)(&bestBucket))[0];
+ }
+
+ return ((float *)(&totalError))[0];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Utility function implementation
+//
+///////////////////////////////////////////////////////////////////////////////
+
+void ClampEndpoints(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2) {
+ p1.vec = _mm_min_ps( kByteMax, _mm_max_ps( p1.vec, kZero ) );
+ p2.vec = _mm_min_ps( kByteMax, _mm_max_ps( p2.vec, kZero ) );
+}
+
+void GetPrincipalAxis(const RGBAClusterSIMD &c, RGBADirSIMD &axis) {
+
+ if(c.GetNumPoints() == 2) {
+ axis = c.GetPoint(1) - c.GetPoint(0);
+ return;
+ }
+
+ RGBAVectorSIMD avg = c.GetTotal();
+ avg /= float(c.GetNumPoints());
+
+ // We use these vectors for calculating the covariance matrix... 
+ RGBAVectorSIMD toPts[kMaxNumDataPoints]; + RGBAVectorSIMD toPtsMax(-FLT_MAX); + for(int i = 0; i < c.GetNumPoints(); i++) { + toPts[i] = c.GetPoint(i) - avg; + toPtsMax.vec = _mm_max_ps(toPtsMax.vec, toPts[i].vec); + } + + RGBAMatrixSIMD covMatrix; + + // Compute covariance. + const float fNumPoints = float(c.GetNumPoints()); + for(int i = 0; i < kNumColorChannels; i++) { + for(int j = 0; j <= i; j++) { + + float sum = 0.0; + for(int k = 0; k < c.GetNumPoints(); k++) { + sum += toPts[k].c[i] * toPts[k].c[j]; + } + + covMatrix(i, j) = sum / fNumPoints; + covMatrix(j, i) = covMatrix(i, j); + } + } + + // !SPEED! Find eigenvectors by using the power method. This is good because the + // matrix is only 4x4, which allows us to use SIMD... + RGBAVectorSIMD b = toPtsMax; + assert(b.Length() > 0); + b /= b.Length(); + + RGBAVectorSIMD newB = covMatrix * b; + + // !HACK! If the principal eigenvector of the covariance matrix + // converges to zero, that means that the points lie equally + // spaced on a sphere in this space. In this (extremely rare) + // situation, just choose a point and use it as the principal + // direction. + const float newBlen = newB.Length(); + if(newBlen < 1e-10) { + axis = toPts[0]; + return; + } + + for(int i = 0; i < 8; i++) { + newB = covMatrix * b; + newB.Normalize(); + b = newB; + } + + axis = b; +} diff --git a/BPTCEncoder/src/RGBAEndpointsSIMD.h b/BPTCEncoder/src/RGBAEndpointsSIMD.h new file mode 100755 index 0000000..b93f8a5 --- /dev/null +++ b/BPTCEncoder/src/RGBAEndpointsSIMD.h @@ -0,0 +1,374 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. 
Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#ifndef __RGBA_SIMD_ENDPOINTS_H__ +#define __RGBA_SIMD_ENDPOINTS_H__ + +#include "BC7IntTypes.h" +#include +#include +#include + +#include + +static const int kNumColorChannels = 4; +static const int kMaxNumDataPoints = 16; +static const __m128 kEpsilonSIMD = _mm_set1_ps(1e-8f); + +class RGBAVectorSIMD { + +public: + union { + struct { float r, g, b, a; }; + struct { float x, y, z, w; }; + float c[4]; + __m128 vec; + }; + + RGBAVectorSIMD() : r(-1.0), g(-1.0), b(-1.0), a(-1.0) { } + RGBAVectorSIMD(uint32 pixel) : + r(float(pixel & 0xFF)), + g(float((pixel >> 8) & 0xFF)), + b(float((pixel >> 16) & 0xFF)), + a(float((pixel >> 24) & 0xFF)) + { } + + explicit RGBAVectorSIMD(float _r, float _g, float _b, float _a) : + r(_r), g(_g), b(_b), a(_a) { } + + explicit RGBAVectorSIMD(float cc) : r(cc), g(cc), b(cc), a(cc) { } + + RGBAVectorSIMD (const __m128 &newVec) : vec(newVec) { } + RGBAVectorSIMD (const RGBAVectorSIMD &other) : vec(other.vec) { } + + RGBAVectorSIMD operator +(const RGBAVectorSIMD &p) const { + return RGBAVectorSIMD( _mm_add_ps(this->vec, p.vec) ); + } + + RGBAVectorSIMD &operator +=(const RGBAVectorSIMD &p) { + this->vec = _mm_add_ps(this->vec, p.vec); + return *this; + } + + RGBAVectorSIMD operator -(const RGBAVectorSIMD &p) const { + return RGBAVectorSIMD( _mm_sub_ps(this->vec, 
p.vec) ); + } + + RGBAVectorSIMD &operator -=(const RGBAVectorSIMD &p) { + this->vec = _mm_sub_ps(this->vec, p.vec); + return *this; + } + + RGBAVectorSIMD operator /(const float s) const { + return RGBAVectorSIMD( _mm_div_ps(this->vec, _mm_set1_ps(s) ) ); + } + + RGBAVectorSIMD &operator /=(const float s) { + this->vec = _mm_div_ps(this->vec, _mm_set1_ps(s) ); + return *this; + } + + float operator *(const RGBAVectorSIMD &p) const { + __m128 mul = _mm_mul_ps(this->vec, p.vec); + mul = _mm_hadd_ps(mul, mul); + mul = _mm_hadd_ps(mul, mul); + return ((float *)(&mul))[0]; + } + + void Normalize() { + __m128 rsqrt = _mm_rsqrt_ps( _mm_set1_ps( (*this) * (*this) ) ); + vec = _mm_mul_ps( vec, rsqrt ); + } + + float Length() const { + return sqrt((*this) * (*this)); + } + + RGBAVectorSIMD &operator *=(const RGBAVectorSIMD &v) { + this->vec = _mm_mul_ps(this->vec, v.vec); + return *this; + } + + RGBAVectorSIMD operator *(const float s) const { + return RGBAVectorSIMD( _mm_mul_ps( this->vec, _mm_set1_ps(s) ) ); + } + + friend RGBAVectorSIMD operator *(const float s, const RGBAVectorSIMD &p) { + return RGBAVectorSIMD( _mm_mul_ps( p.vec, _mm_set1_ps(s) ) ); + } + + RGBAVectorSIMD &operator *=(const float s) { + this->vec = _mm_mul_ps( this->vec, _mm_set1_ps(s) ); + return *this; + } + + float &operator [](const int i) { + return c[i]; + } + + friend bool operator ==(const RGBAVectorSIMD &rhs, const RGBAVectorSIMD &lhs) { + __m128 d = _mm_sub_ps(rhs.vec, lhs.vec); + d = _mm_mul_ps(d, d); + __m128 cmp = _mm_cmpgt_ps(d, kEpsilonSIMD); + cmp = _mm_hadd_ps(cmp, cmp); + cmp = _mm_hadd_ps(cmp, cmp); + return ((float *)(&cmp))[0] == 0.0f; + } + + friend bool operator !=(const RGBAVectorSIMD &rhs, const RGBAVectorSIMD &lhs) { + return !(rhs == lhs); + } + + operator float *() { + return c; + } + + // Quantize this point. 
+ __m128i ToPixel(const __m128i &channelMask, const int pBit) const; + __m128i ToPixel(const __m128i &channelMask) const; +}; + +class RGBAMatrixSIMD { +private: + union { + float m[kNumColorChannels*kNumColorChannels]; + struct { + float m1, m5, m9, m13; + float m2, m6, m10, m14; + float m3, m7, m11, m15; + float m4, m8, m12, m16; + }; + __m128 col[kNumColorChannels]; + }; + + RGBAMatrixSIMD(const float *arr) { + memcpy(m, arr, sizeof(m)); + } + + RGBAMatrixSIMD(const __m128 newcol[kNumColorChannels]) { + for(int i = 0; i < kNumColorChannels; i++) + col[i] = newcol[i]; + } + +public: + + RGBAMatrixSIMD() : + m1(1.0f), m2(0.0f), m3(0.0f), m4(0.0f), + m5(0.0f), m6(1.0f), m7(0.0f), m8(0.0f), + m9(0.0f), m10(0.0f), m11(1.0f), m12(0.0f), + m13(0.0f), m14(0.0f), m15(0.0f), m16(1.0f) + { } + + RGBAMatrixSIMD &operator =(const RGBAMatrixSIMD &other) { + memcpy(m, other.m, sizeof(m)); + return (*this); + } + + RGBAMatrixSIMD operator +(const RGBAMatrixSIMD &p) const { + RGBAMatrixSIMD newm; + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_add_ps(col[i], p.col[i]); + } + return newm; + } + + RGBAMatrixSIMD &operator +=(const RGBAMatrixSIMD &p) { + for(int i = 0; i < kNumColorChannels; i++) { + col[i] = _mm_add_ps( col[i], p.col[i] ); + } + return *this; + } + + RGBAMatrixSIMD operator -(const RGBAMatrixSIMD &p) const { + RGBAMatrixSIMD newm; + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_sub_ps( col[i], p.col[i] ); + } + return newm; + } + + RGBAMatrixSIMD &operator -=(const RGBAMatrixSIMD &p) { + for(int i = 0; i < kNumColorChannels; i++) { + col[i] = _mm_sub_ps( col[i], p.col[i] ); + } + return *this; + } + + RGBAMatrixSIMD operator /(const float s) const { + __m128 f = _mm_set1_ps(s); + RGBAMatrixSIMD newm; + + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_div_ps( col[i], f ); + } + + return newm; + } + + RGBAMatrixSIMD &operator /=(const float s) { + + __m128 f = _mm_set1_ps(s); + + for(int i = 0; i < 
kNumColorChannels; i++) { + col[i] = _mm_div_ps(col[i], f); + } + + return *this; + } + + RGBAMatrixSIMD operator *(const float s) const { + __m128 f = _mm_set1_ps(s); + + RGBAMatrixSIMD newm; + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_mul_ps( col[i], f ); + } + return newm; + } + + friend RGBAMatrixSIMD operator *(const float s, const RGBAMatrixSIMD &p) { + __m128 f = _mm_set1_ps(s); + RGBAMatrixSIMD newm; + + for(int i = 0; i < kNumColorChannels; i++) { + newm.col[i] = _mm_mul_ps( p.col[i], f ); + } + return newm; + } + + RGBAMatrixSIMD &operator *=(const float s) { + __m128 f = _mm_set1_ps(s); + for(int i = 0; i < kNumColorChannels; i++) + col[i] = _mm_mul_ps(col[i], f); + return *this; + } + + float &operator ()(const int i, const int j) { + return (*this)[j*4 + i]; + } + + float &operator [](const int i) { + return m[i]; + } + + friend bool operator ==(const RGBAMatrixSIMD &rhs, const RGBAMatrixSIMD &lhs) { + + __m128 sum = _mm_set1_ps(0.0f); + for(int i = 0; i < kNumColorChannels; i++) { + __m128 d = _mm_sub_ps(rhs.col[i], lhs.col[i]); + d = _mm_mul_ps(d, d); + __m128 cmp = _mm_cmpgt_ps(d, kEpsilonSIMD); + cmp = _mm_hadd_ps(cmp, cmp); + cmp = _mm_hadd_ps(cmp, cmp); + sum = _mm_add_ps(sum, cmp); + } + + if(((float *)(&sum))[0] != 0) + return false; + else + return true; + } + + operator float *() { + return m; + } + + RGBAVectorSIMD operator *(const RGBAVectorSIMD &p) const; +}; + +class RGBADirSIMD : public RGBAVectorSIMD { +public: + RGBADirSIMD() : RGBAVectorSIMD() { } + RGBADirSIMD(const RGBAVectorSIMD &p) : RGBAVectorSIMD(p) { + this->Normalize(); + } +}; + +// Makes sure that the values of the endpoints lie between 0 and 1. 
+extern void ClampEndpoints(RGBAVectorSIMD &p1, RGBAVectorSIMD &p2); + +class RGBAClusterSIMD { +public: + + RGBAClusterSIMD() : + m_NumPoints(0), m_Total(0.0f), + m_PointBitString(0), + m_Min(FLT_MAX), + m_Max(-FLT_MAX), + m_PrincipalAxisCached(false) + { } + + RGBAClusterSIMD(const RGBAClusterSIMD &c) : + m_NumPoints(c.m_NumPoints), + m_Total(c.m_Total), + m_PointBitString(c.m_PointBitString), + m_Min(c.m_Min), + m_Max(c.m_Max), + m_PrincipalAxisCached(false) + { + memcpy(this->m_DataPoints, c.m_DataPoints, m_NumPoints * sizeof(RGBAVectorSIMD)); + } + + RGBAClusterSIMD(const RGBAClusterSIMD &left, const RGBAClusterSIMD &right); + RGBAClusterSIMD(const RGBAVectorSIMD &p, int idx) : + m_NumPoints(1), + m_Total(p), + m_PointBitString(0), + m_Min(p), m_Max(p), + m_PrincipalAxisCached(false) + { + m_DataPoints[0] = p; + m_PointBitString |= (1 << idx); + } + + RGBAVectorSIMD GetTotal() const { return m_Total; } + const RGBAVectorSIMD &GetPoint(int idx) const { return m_DataPoints[idx]; } + int GetNumPoints() const { return m_NumPoints; } + RGBAVectorSIMD GetAvg() const { return m_Total / float(m_NumPoints); } + + void AddPoint(const RGBAVectorSIMD &p, int idx); + + void GetBoundingBox(RGBAVectorSIMD &Min, RGBAVectorSIMD &Max) const { + Min = m_Min, Max = m_Max; + } + + // Returns the error if we were to quantize the colors right now with the given number of buckets and bit mask. + float QuantizedError(const RGBAVectorSIMD &p1, const RGBAVectorSIMD &p2, const uint8 nBuckets, const __m128i &bitMask, const int pbits[2] = NULL, __m128i *indices = NULL) const; + + bool AllSamePoint() const { return m_Max == m_Min; } + int GetPointBitString() const { return m_PointBitString; } + +private: + + // The number of points in the cluster. + int m_NumPoints; + + RGBAVectorSIMD m_Total; + + // The points in the cluster. 
+ RGBAVectorSIMD m_DataPoints[kMaxNumDataPoints]; + + RGBAVectorSIMD m_Min, m_Max; + int m_PointBitString; + + RGBADirSIMD m_PrincipalAxis; + bool m_PrincipalAxisCached; +}; + +extern void GetPrincipalAxis(const RGBAClusterSIMD &c, RGBADirSIMD &axis); + +#endif //__RGBA_SIMD_ENDPOINTS_H__ diff --git a/CLTool/CMakeLists.txt b/CLTool/CMakeLists.txt new file mode 100644 index 0000000..e69de29 diff --git a/CLTool/StopWatch.cpp b/CLTool/StopWatch.cpp new file mode 100755 index 0000000..b8e2510 --- /dev/null +++ b/CLTool/StopWatch.cpp @@ -0,0 +1,106 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "StopWatch.h" +#include + +// Initialize member variables. +StopWatch::StopWatch() : + frequency(0), + start(0), + stop(0), + affinityMask(0) +{ + // Initialize the performance counter frequency. 
+ LARGE_INTEGER perfQuery; + BOOL supported = QueryPerformanceFrequency(&perfQuery); + assert(supported == TRUE); + this->frequency = perfQuery.QuadPart; +} + +// Start the stopwatch. +void StopWatch::Start() +{ + // MSDN recommends setting the thread affinity to avoid bugs in the BIOS and HAL. + // Create an affinity mask for the current processor. + affinityMask = (DWORD_PTR)1 << GetCurrentProcessorNumber(); + HANDLE currThread = GetCurrentThread(); + DWORD_PTR prevAffinityMask = SetThreadAffinityMask(currThread, affinityMask); + assert(prevAffinityMask != 0); + + // Query the performance counter. + LARGE_INTEGER perfQuery; + BOOL result = QueryPerformanceCounter(&perfQuery); + assert(result); + start = perfQuery.QuadPart; + + // Restore the thread's affinity mask. + prevAffinityMask = SetThreadAffinityMask(currThread, prevAffinityMask); + assert(prevAffinityMask != 0); +} + +// Stop the stopwatch. +void StopWatch::Stop() +{ + // MSDN recommends setting the thread affinity to avoid bugs in the BIOS and HAL. + // Use the affinity mask that was created in the Start function. + HANDLE currThread = GetCurrentThread(); + DWORD_PTR prevAffinityMask = SetThreadAffinityMask(currThread, affinityMask); + assert(prevAffinityMask != 0); + + // Query the performance counter. + LARGE_INTEGER perfQuery; + BOOL result = QueryPerformanceCounter(&perfQuery); + assert(result); + stop = perfQuery.QuadPart; + + // Restore the thread's affinity mask. + prevAffinityMask = SetThreadAffinityMask(currThread, prevAffinityMask); + assert(prevAffinityMask != 0); +} + +// Reset the stopwatch. +void StopWatch::Reset() +{ + start = 0; + stop = 0; + affinityMask = 0; +} + +// Get the elapsed time in seconds. +double StopWatch::TimeInSeconds() const +{ + // Return the elapsed time in seconds. + assert((stop - start) > 0); + return double(stop - start) / double(frequency); +} + +// Get the elapsed time in milliseconds. 
+double StopWatch::TimeInMilliseconds() const +{ + // Return the elapsed time in milliseconds. + assert((stop - start) > 0); + return double(stop - start) / double(frequency) * 1000.0; +} + +// Get the elapsed time in microseconds. +double StopWatch::TimeInMicroseconds() const +{ + // Return the elapsed time in microseconds. + assert((stop - start) > 0); + return double(stop - start) / double(frequency) * 1000000.0; +} diff --git a/CLTool/StopWatch.h b/CLTool/StopWatch.h new file mode 100755 index 0000000..76573de --- /dev/null +++ b/CLTool/StopWatch.h @@ -0,0 +1,41 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#pragma once + +#include "Windows.h" + +// A simple stopwatch class using Windows' high-resolution performance counters. 
+class StopWatch +{ +public: + StopWatch(); + + void Start(); + void Stop(); + void Reset(); + + double TimeInSeconds() const; + double TimeInMilliseconds() const; + double TimeInMicroseconds() const; + +private: + LONGLONG frequency; + LONGLONG start; + LONGLONG stop; + DWORD_PTR affinityMask; +}; diff --git a/CLTool/main.cpp b/CLTool/main.cpp new file mode 100755 index 0000000..2dab017 --- /dev/null +++ b/CLTool/main.cpp @@ -0,0 +1,2126 @@ +//-------------------------------------------------------------------------------------- +// Copyright 2011 Intel Corporation +// All Rights Reserved +// +// Permission is granted to use, copy, distribute and prepare derivative works of this +// software for any purpose and without fee, provided, that the above copyright notice +// and this statement appear in all copies. Intel makes no representations about the +// suitability of this software for any purpose. THIS SOFTWARE IS PROVIDED "AS IS." +// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY, +// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, +// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not +// assume any responsibility for any errors which may appear in this software nor any +// responsibility to update it. +// +//-------------------------------------------------------------------------------------- + +#include "DXUT.h" +#include "DXUTcamera.h" +#include "DXUTgui.h" +#include "DXUTsettingsDlg.h" +#include "SDKmisc.h" +#include "SDKMesh.h" + +#include "DXTCompressorDLL.h" // DXT compressor DLL. +#include "BC7CompressorDLL.h" // BC7 compressor DLL. + +#include "StopWatch.h" // Timer. +#include "TaskMgrTBB.h" // TBB task manager. + +#include +#include + +#define ALIGN16(x) __declspec(align(16)) x +#define ALIGN32(x) __declspec(align(32)) x + +// DXT compressor type. 
+enum ECompressorType +{ + eCompType_DXT1, + eCompType_DXT5, + eCompType_BC7, + + kNumCompressorTypes +}; + +const TCHAR *kCompressorTypeStr[kNumCompressorTypes] = { + _T("DXT1/BC1"), + _T("DXT5/BC3"), + _T("BC7"), +}; + +enum EInstructionSet +{ + eInstrSet_Scalar + , eInstrSet_SSE + , eInstrSet_AVX2 + + , kNumInstructionSets +}; + +const TCHAR *kInstructionSetStr[kNumInstructionSets] = { + _T("Scalar"), + _T("SSE"), + _T("AVX2"), +}; + +enum EThreadMode +{ + eThreadMode_None, + eThreadMode_TBB, + eThreadMode_Win32, + + kNumThreadModes +}; + +const TCHAR *kThreadModeStr[kNumThreadModes] = { + _T("None"), + _T("TBB"), + _T("Win32") +}; + +static BOOL g_DXT1Available = TRUE; +static BOOL g_AVX2Available = FALSE; +static BOOL g_DX11Available = FALSE; + +const struct ECompressionScheme { + const ECompressorType type; + const EInstructionSet instrSet; + const EThreadMode threadMode; + const BOOL &availabilityOverride; +} kCompressionSchemes[] = { + { eCompType_DXT1, eInstrSet_Scalar, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_Scalar, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_Scalar, eThreadMode_Win32, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_SSE, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_SSE, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT1, eInstrSet_SSE, eThreadMode_Win32, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_Scalar, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_Scalar, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_Scalar, eThreadMode_Win32, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_SSE, eThreadMode_None, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_SSE, eThreadMode_TBB, g_DXT1Available }, + { eCompType_DXT5, eInstrSet_SSE, eThreadMode_Win32, g_DXT1Available }, + { eCompType_BC7, eInstrSet_Scalar, eThreadMode_None, g_DX11Available }, + { eCompType_BC7, eInstrSet_Scalar, eThreadMode_Win32, g_DX11Available }, + { 
eCompType_BC7, eInstrSet_SSE, eThreadMode_None, g_DX11Available }, + { eCompType_BC7, eInstrSet_SSE, eThreadMode_Win32, g_DX11Available }, + { eCompType_DXT1, eInstrSet_AVX2, eThreadMode_None, g_AVX2Available }, + { eCompType_DXT1, eInstrSet_AVX2, eThreadMode_TBB, g_AVX2Available }, + { eCompType_DXT1, eInstrSet_AVX2, eThreadMode_Win32, g_AVX2Available }, + { eCompType_DXT5, eInstrSet_AVX2, eThreadMode_None, g_AVX2Available }, + { eCompType_DXT5, eInstrSet_AVX2, eThreadMode_TBB, g_AVX2Available }, + { eCompType_DXT5, eInstrSet_AVX2, eThreadMode_Win32, g_AVX2Available }, +}; +const int kNumCompressionSchemes = sizeof(kCompressionSchemes) / sizeof(kCompressionSchemes[0]); +const ECompressionScheme *gCompressionScheme = kCompressionSchemes; + +// Textured vertex. +struct Vertex +{ + D3DXVECTOR3 position; + D3DXVECTOR2 texCoord; +}; + +// Global variables +CDXUTDialogResourceManager gDialogResourceManager; // manager for shared resources of dialogs +CD3DSettingsDlg gD3DSettingsDlg; // Device settings dialog +CDXUTDialog gHUD; // manages the 3D +CDXUTDialog gSampleUI; // dialog for sample specific controls +bool gShowHelp = false; // If true, it renders the UI control text +CDXUTTextHelper* gTxtHelper = NULL; +double gCompTime = 0.0; +double gCompRate = 0.0; +int gBlocksPerTask = 256; +int gFrameNum = 0; +int gFrameDelay = 100; +int gTexWidth = 0; +int gTexHeight = 0; +double gError = 0.0; + +#ifdef REPORT_RMSE +static const WCHAR *kErrorStr = L"Root Mean Squared Error"; +#else +static const WCHAR *kErrorStr = L"Peak Signal/Noise Ratio"; +#endif + +ID3D11DepthStencilState* gDepthStencilState = NULL; +UINT gStencilReference = 0; +ID3D11InputLayout* gVertexLayout = NULL; +ID3D11Buffer* gVertexBuffer = NULL; +ID3D11Buffer* gQuadVB = NULL; +ID3D11Buffer* gIndexBuffer = NULL; +ID3D11VertexShader* gVertexShader = NULL; +ID3D11PixelShader* gRenderFramePS = NULL; +ID3D11PixelShader* gRenderTexturePS = NULL; +ID3D11SamplerState* gSamPoint = NULL; +ID3D11ShaderResourceView* 
gUncompressedSRV = NULL; // Shader resource view for the uncompressed texture resource. +ID3D11ShaderResourceView* gCompressedSRV = NULL; // Shader resource view for the compressed texture resource. +ID3D11ShaderResourceView* gErrorSRV = NULL; // Shader resource view for the error texture. + +// Win32 thread API +const int kMaxWinThreads = 16; + +enum EThreadState { + eThreadState_WaitForData, + eThreadState_DataLoaded, + eThreadState_Running, + eThreadState_Done +}; + +typedef void (* CompressionFunc)(const BYTE* inBuf, BYTE* outBuf, int width, int height); + +struct WinThreadData { + EThreadState state; + int threadIdx; + const BYTE *inBuf; + BYTE *outBuf; + int width; + int height; + void (*cmpFunc)(const BYTE* inBuf, BYTE* outBuf, int width, int height); + + // Defaults.. + WinThreadData() : + state(eThreadState_Done), + threadIdx(-1), + inBuf(NULL), + outBuf(NULL), + width(-1), + height(-1), + cmpFunc(NULL) + { } + +} gWinThreadData[kMaxWinThreads]; + +HANDLE gWinThreadWorkEvent[kMaxWinThreads]; +HANDLE gWinThreadStartEvent = NULL; +HANDLE gWinThreadDoneEvent = NULL; +int gNumWinThreads = 0; +DWORD gNumProcessors = 1; // We have at least one processor. 
+DWORD dwThreadIdArray[kMaxWinThreads]; +HANDLE hThreadArray[kMaxWinThreads]; + +// UI control IDs +#define IDC_TOGGLEFULLSCREEN 1 +#define IDC_TOGGLEREF 2 +#define IDC_CHANGEDEVICE 3 +#define IDC_UNCOMPRESSEDTEXT 4 +#define IDC_COMPRESSEDTEXT 5 +#define IDC_ERRORTEXT 6 +#define IDC_SIZETEXT 7 +#define IDC_TIMETEXT 8 +#define IDC_RATETEXT 9 +#define IDC_TBB 10 +#define IDC_SIMD 11 +#define IDC_COMPRESSOR 12 +#define IDC_BLOCKSPERTASKTEXT 13 +#define IDC_BLOCKSPERTASK 14 +#define IDC_LOADTEXTURE 15 +#define IDC_RECOMPRESS 16 +#define IDC_RMSETEXT 17 + +// Forward declarations +bool CALLBACK ModifyDeviceSettings( DXUTDeviceSettings* pDeviceSettings, void* pUserContext ); +void CALLBACK OnFrameMove( double fTime, float fElapsedTime, void* pUserContext ); +LRESULT CALLBACK MsgProc( HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lParam, bool* pbNoFurtherProcessing, + void* pUserContext ); +void CALLBACK OnKeyboard( UINT nChar, bool bKeyDown, bool bAltDown, void* pUserContext ); +void CALLBACK OnGUIEvent( UINT nEvent, int nControlID, CDXUTControl* pControl, void* pUserContext ); + +bool CALLBACK IsD3D11DeviceAcceptable(const CD3D11EnumAdapterInfo *AdapterInfo, UINT Output, const CD3D11EnumDeviceInfo *DeviceInfo, + DXGI_FORMAT BackBufferFormat, bool bWindowed, void* pUserContext ); +HRESULT CALLBACK OnD3D11CreateDevice( ID3D11Device* pd3dDevice, const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, + void* pUserContext ); +HRESULT CALLBACK OnD3D11ResizedSwapChain( ID3D11Device* pd3dDevice, IDXGISwapChain* pSwapChain, + const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, void* pUserContext ); +void CALLBACK OnD3D11ReleasingSwapChain( void* pUserContext ); +void CALLBACK OnD3D11DestroyDevice( void* pUserContext ); +void CALLBACK OnD3D11FrameRender( ID3D11Device* pd3dDevice, ID3D11DeviceContext* pd3dImmediateContext, double fTime, + float fElapsedTime, void* pUserContext ); + +void InitApp(); +void RenderText(); + +void UpdateBlockSlider(); +void UpdateCompressionAlgorithms(); +void 
UpdateThreadingMode(); +void UpdateCompressionModes(); +void UpdateAllowedSettings(); + +void SetCompressionScheme(EInstructionSet instrSet, ECompressorType compType, EThreadMode threadMode); + +HRESULT CreateTextures(LPTSTR file); +void DestroyTextures(); +HRESULT LoadTexture(LPTSTR file); +HRESULT PadTexture(ID3D11ShaderResourceView** textureSRV); +HRESULT SaveTexture(ID3D11ShaderResourceView* textureSRV, LPTSTR file); +HRESULT CompressTexture(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView** compressedSRV); +HRESULT ComputeError(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView* compressedSRV, ID3D11ShaderResourceView** errorSRV); +HRESULT RecompressTexture(); + +void ComputeRMSE(const BYTE *errorData, const INT width, const INT height); + +void InitWin32Threads(); +void DestroyThreads(); + +void StoreDepthStencilState(); +void RestoreDepthStencilState(); +HRESULT DisableDepthTest(); + +namespace DXTC +{ + VOID CompressImageDXT(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + + VOID CompressImageDXTNoThread(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + VOID CompressImageDXTTBB(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + VOID CompressImageDXTWIN(const BYTE* inBuf, BYTE* outBuf, INT width, INT height); + + DWORD WINAPI CompressImageDXTWinThread( LPVOID lpParam ); +} + +#ifdef ENABLE_AVX2 +#ifdef _M_X64 +/* On x64, we can't have inline assembly in C files, see avxtest.asm */ +extern "C" int __stdcall supports_AVX2(); + +#else ifdef WIN32 +/* AVX2 instructions require 64 bit mode. */ +extern "C" int __stdcall supports_AVX2() { + return 0; +} +#endif // _M_X64 +#endif // ENABLE_AVX2 + +int WINAPI wWinMain( HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdLine, int nCmdShow ) +{ + // Enable run-time memory check for debug builds. 
+#if defined(DEBUG) | defined(_DEBUG) + _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); +#endif + +#ifdef ENABLE_AVX2 + g_AVX2Available = supports_AVX2(); +#endif + + // Make sure that the event array is set to null... + memset(gWinThreadWorkEvent, 0, sizeof(gWinThreadWorkEvent)); + + // Figure out how many cores there are on this machine + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + gNumProcessors = sysinfo.dwNumberOfProcessors; + + // Make sure all of our threads are empty. + for(int i = 0; i < kMaxWinThreads; i++) { + hThreadArray[i] = NULL; + } + + // Set DXUT callbacks + DXUTSetCallbackDeviceChanging( ModifyDeviceSettings ); + DXUTSetCallbackMsgProc( MsgProc ); + DXUTSetCallbackKeyboard( OnKeyboard ); + DXUTSetCallbackFrameMove( OnFrameMove ); + DXUTSetCallbackD3D11DeviceAcceptable( IsD3D11DeviceAcceptable ); + DXUTSetCallbackD3D11DeviceCreated( OnD3D11CreateDevice ); + DXUTSetCallbackD3D11SwapChainResized( OnD3D11ResizedSwapChain ); + DXUTSetCallbackD3D11FrameRender( OnD3D11FrameRender ); + DXUTSetCallbackD3D11SwapChainReleasing( OnD3D11ReleasingSwapChain ); + DXUTSetCallbackD3D11DeviceDestroyed( OnD3D11DestroyDevice ); + + InitApp(); + + DXUTInit( true, true, NULL ); + DXUTSetCursorSettings( true, true ); + DXUTCreateWindow( L"Fast Texture Compressor" ); + + // Try to create a device with DX11 feature set + DXUTCreateDevice (D3D_FEATURE_LEVEL_11_0, true, 1280, 1024 ); + + // If we don't have an adequate driver, then we revert to DX10 feature set... + DXUTDeviceSettings settings = DXUTGetDeviceSettings(); + if(settings.d3d11.DriverType == D3D_DRIVER_TYPE_UNKNOWN || settings.d3d11.DriverType == D3D_DRIVER_TYPE_NULL) { + DXUTCreateDevice(D3D_FEATURE_LEVEL_10_1, true, 1280, 1024); + + // !HACK! Force enumeration here in order to relocate hardware with new feature level + DXUTGetD3D11Enumeration(true); + DXUTCreateDevice(D3D_FEATURE_LEVEL_10_1, true, 1280, 1024); + + const TCHAR *noDx11msg = _T("Your hardware does not seem to support DX11. 
BC7 Compression is disabled."); + MessageBox(NULL, noDx11msg, _T("Error"), MB_OK); + } + else { + g_DX11Available = TRUE; + } + + // Now that we know what things are allowed, update the available options. + UpdateAllowedSettings(); + + DXUTMainLoop(); + + // Destroy all of the threads... + DestroyThreads(); + + return DXUTGetExitCode(); +} + +// Initialize the app +void InitApp() +{ + // Initialize dialogs + gD3DSettingsDlg.Init(&gDialogResourceManager); + gHUD.Init(&gDialogResourceManager); + gSampleUI.Init(&gDialogResourceManager); + + gHUD.SetCallback(OnGUIEvent); + int x = 0; + int y = 10; + gHUD.AddButton(IDC_TOGGLEFULLSCREEN, L"Toggle full screen", x, y, 170, 23); + gHUD.AddButton(IDC_TOGGLEREF, L"Toggle REF (F3)", x, y += 26, 170, 23, VK_F3); + gHUD.AddButton(IDC_CHANGEDEVICE, L"Change device (F2)", x, y += 26, 170, 23, VK_F2); + + gSampleUI.SetCallback(OnGUIEvent); + x = 0; + y = 0; + gSampleUI.AddStatic(IDC_UNCOMPRESSEDTEXT, L"Uncompressed", x, y, 125, 22); + gSampleUI.AddStatic(IDC_COMPRESSEDTEXT, L"Compressed", x, y, 125, 22); + gSampleUI.AddStatic(IDC_ERRORTEXT, L"Error", x, y, 125, 22); + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Texture Size: %d x %d", gTexWidth, gTexHeight); + gSampleUI.AddStatic(IDC_SIZETEXT, wstr, x, y, 125, 22); + swprintf_s(wstr, MAX_PATH, L"%s: %.2f", kErrorStr, gError); + gSampleUI.AddStatic(IDC_RMSETEXT, wstr, x, y, 125, 22); + swprintf_s(wstr, MAX_PATH, L"Compression Time: %0.2f ms", gCompTime); + gSampleUI.AddStatic(IDC_TIMETEXT, wstr, x, y, 125, 22); + swprintf_s(wstr, MAX_PATH, L"Compression Rate: %0.2f Mp/s", gCompRate); + gSampleUI.AddStatic(IDC_RATETEXT, wstr, x, y, 125, 22); + gSampleUI.AddComboBox(IDC_TBB, x, y, 95, 22); + gSampleUI.AddComboBox(IDC_SIMD, x, y, 140, 22); + gSampleUI.AddComboBox(IDC_COMPRESSOR, x, y, 105, 22); + swprintf_s(wstr, MAX_PATH, L"Blocks Per Task: %d", gBlocksPerTask); + gSampleUI.AddStatic(IDC_BLOCKSPERTASKTEXT, wstr, x, y, 125, 22); + gSampleUI.AddSlider(IDC_BLOCKSPERTASK, x, y, 
256, 22, 1, 512, gBlocksPerTask); + gSampleUI.AddButton(IDC_LOADTEXTURE, L"Load Texture", x, y, 125, 22); + gSampleUI.AddButton(IDC_RECOMPRESS, L"Recompress", x, y, 125, 22); +} + +// Called right before creating a D3D11 device, allowing the app to modify the device settings as needed +bool CALLBACK ModifyDeviceSettings( DXUTDeviceSettings* pDeviceSettings, void* pUserContext ) +{ + // Uncomment this to get debug information from D3D11 + //pDeviceSettings->d3d11.CreateFlags |= D3D11_CREATE_DEVICE_DEBUG; + + // For the first device created if its a REF device, optionally display a warning dialog box + static bool s_bFirstTime = true; + if( s_bFirstTime ) + { + s_bFirstTime = false; + if( ( DXUT_D3D11_DEVICE == pDeviceSettings->ver && + pDeviceSettings->d3d11.DriverType == D3D_DRIVER_TYPE_REFERENCE ) ) + { + DXUTDisplaySwitchingToREFWarning( pDeviceSettings->ver ); + } + } + + return true; +} + +// Handle updates to the scene. +void CALLBACK OnFrameMove( double fTime, float fElapsedTime, void* pUserContext ) +{ + +} + +// Render the help and statistics text +void RenderText() +{ + UINT nBackBufferHeight = ( DXUTIsAppRenderingWithD3D9() ) ? 
DXUTGetD3D9BackBufferSurfaceDesc()->Height : + DXUTGetDXGIBackBufferSurfaceDesc()->Height; + + gTxtHelper->Begin(); + gTxtHelper->SetInsertionPos( 2, 0 ); + gTxtHelper->SetForegroundColor( D3DXCOLOR( 1.0f, 1.0f, 0.0f, 1.0f ) ); + gTxtHelper->DrawTextLine( DXUTGetFrameStats( false ) ); + gTxtHelper->DrawTextLine( DXUTGetDeviceStats() ); + + // Draw help + if( gShowHelp ) + { + gTxtHelper->SetInsertionPos( 2, nBackBufferHeight - 20 * 6 ); + gTxtHelper->SetForegroundColor( D3DXCOLOR( 1.0f, 0.75f, 0.0f, 1.0f ) ); + gTxtHelper->DrawTextLine( L"Controls:" ); + + gTxtHelper->SetInsertionPos( 20, nBackBufferHeight - 20 * 5 ); + gTxtHelper->DrawTextLine( L"Hide help: F1\n" + L"Quit: ESC\n" ); + } + else + { + gTxtHelper->SetForegroundColor( D3DXCOLOR( 1.0f, 1.0f, 1.0f, 1.0f ) ); + gTxtHelper->DrawTextLine( L"Press F1 for help" ); + } + + gTxtHelper->End(); +} + +// Handle messages to the application +LRESULT CALLBACK MsgProc( HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lParam, bool* pbNoFurtherProcessing, + void* pUserContext ) +{ + // Pass messages to dialog resource manager calls so GUI state is updated correctly + *pbNoFurtherProcessing = gDialogResourceManager.MsgProc( hWnd, uMsg, wParam, lParam ); + if( *pbNoFurtherProcessing ) + return 0; + + // Pass messages to settings dialog if its active + if( gD3DSettingsDlg.IsActive() ) + { + gD3DSettingsDlg.MsgProc( hWnd, uMsg, wParam, lParam ); + return 0; + } + + // Give the dialogs a chance to handle the message first + *pbNoFurtherProcessing = gHUD.MsgProc( hWnd, uMsg, wParam, lParam ); + if( *pbNoFurtherProcessing ) + return 0; + *pbNoFurtherProcessing = gSampleUI.MsgProc( hWnd, uMsg, wParam, lParam ); + if( *pbNoFurtherProcessing ) + return 0; + + return 0; +} + +// Handle key presses +void CALLBACK OnKeyboard( UINT nChar, bool bKeyDown, bool bAltDown, void* pUserContext ) +{ + if( bKeyDown ) + { + switch( nChar ) + { + case VK_F1: + gShowHelp = !gShowHelp; break; + } + } +} + +// Handles the GUI events +void CALLBACK 
OnGUIEvent( UINT nEvent, int nControlID, CDXUTControl* pControl, void* pUserContext ) +{ + switch( nControlID ) + { + case IDC_TOGGLEFULLSCREEN: + { + DXUTToggleFullScreen(); + break; + } + case IDC_TOGGLEREF: + { + DXUTToggleREF(); + break; + } + case IDC_CHANGEDEVICE: + { + gD3DSettingsDlg.SetActive( !gD3DSettingsDlg.IsActive() ); + break; + } + + case IDC_TIMETEXT: + { + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Compression Time: %0.2f ms", gCompTime); + gSampleUI.GetStatic(IDC_TIMETEXT)->SetText(wstr); + break; + } + case IDC_RATETEXT: + { + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Compression Rate: %0.2f Mp/s", gCompRate); + gSampleUI.GetStatic(IDC_RATETEXT)->SetText(wstr); + break; + } + + case IDC_RMSETEXT: + { + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"%s: %.2f", kErrorStr, gError); + gSampleUI.GetStatic(IDC_RMSETEXT)->SetText(wstr); + break; + } + + case IDC_TBB: + { + // Shut down all previous threading abilities. + DestroyThreads(); + + EInstructionSet instrSet = gCompressionScheme->instrSet; + ECompressorType compType = gCompressionScheme->type; + + EThreadMode newMode = (EThreadMode)(INT_PTR)gSampleUI.GetComboBox(IDC_TBB)->GetSelectedData(); + + switch(newMode) { + case eThreadMode_TBB: + + // Initialize the TBB task manager. + gTaskMgr.Init(); + + break; + + case eThreadMode_Win32: + + InitWin32Threads(); + + break; + + case eThreadMode_None: + // Do nothing, our threads are fine. + break; + } + + SetCompressionScheme(instrSet, compType, newMode); + UpdateAllowedSettings(); + + // Recompress the texture. + RecompressTexture(); + + break; + } + + case IDC_SIMD: + { + EThreadMode threadMode = gCompressionScheme->threadMode; + ECompressorType compType = gCompressionScheme->type; + + EInstructionSet newInstrSet = (EInstructionSet)(INT_PTR)gSampleUI.GetComboBox(IDC_SIMD)->GetSelectedData(); + + // If we selected AVX2, then the total number of blocks when using AVX2 changes, so we need + // to reflect that in the slider. 
+ UpdateBlockSlider(); + + SetCompressionScheme(newInstrSet, compType, threadMode); + UpdateAllowedSettings(); + + // Recompress the texture. + RecompressTexture(); + + break; + } + case IDC_COMPRESSOR: + { + EThreadMode threadMode = gCompressionScheme->threadMode; + EInstructionSet instrSet = gCompressionScheme->instrSet; + ECompressorType newCompType = (ECompressorType)(INT_PTR)gSampleUI.GetComboBox(IDC_COMPRESSOR)->GetSelectedData(); + + SetCompressionScheme(instrSet, newCompType, threadMode); + UpdateAllowedSettings(); + + // Recompress the texture. + RecompressTexture(); + + break; + } + case IDC_BLOCKSPERTASK: + { + gBlocksPerTask = gSampleUI.GetSlider(IDC_BLOCKSPERTASK)->GetValue(); + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Blocks Per Task: %d", gBlocksPerTask); + gSampleUI.GetStatic(IDC_BLOCKSPERTASKTEXT)->SetText(wstr); + + // Recompress the texture. + RecompressTexture(); + + break; + } + case IDC_LOADTEXTURE: + { + // Store the current working directory. + TCHAR workingDirectory[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, workingDirectory); + + // Open a file dialog. + OPENFILENAME openFileName; + WCHAR file[MAX_PATH]; + file[0] = 0; + ZeroMemory(&openFileName, sizeof(OPENFILENAME)); + openFileName.lStructSize = sizeof(OPENFILENAME); + openFileName.lpstrFile = file; + openFileName.nMaxFile = MAX_PATH; + openFileName.lpstrFilter = L"DDS\0*.dds\0\0"; + openFileName.nFilterIndex = 1; + openFileName.lpstrInitialDir = NULL; + openFileName.Flags = OFN_PATHMUSTEXIST | OFN_FILEMUSTEXIST; + if(GetOpenFileName(&openFileName)) + { + CreateTextures(openFileName.lpstrFile); + } + + // Restore the working directory. GetOpenFileName changes the current working directory which causes problems with relative paths to assets. + SetCurrentDirectory(workingDirectory); + + break; + } + case IDC_RECOMPRESS: + { + // Recompress the texture. 
+ RecompressTexture(); + + break; + } + } +} + +// Reject any D3D11 devices that aren't acceptable by returning false +bool CALLBACK IsD3D11DeviceAcceptable( const CD3D11EnumAdapterInfo *AdapterInfo, UINT Output, const CD3D11EnumDeviceInfo *DeviceInfo, + DXGI_FORMAT BackBufferFormat, bool bWindowed, void* pUserContext ) +{ + return true; +} + +// Find and compile the specified shader +HRESULT CompileShaderFromFile( WCHAR* szFileName, LPCSTR szEntryPoint, LPCSTR szShaderModel, ID3DBlob** ppBlobOut ) +{ + HRESULT hr = S_OK; + + // find the file + WCHAR str[MAX_PATH]; + V_RETURN( DXUTFindDXSDKMediaFileCch( str, MAX_PATH, szFileName ) ); + + DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS; +#if defined( DEBUG ) || defined( _DEBUG ) + // Set the D3DCOMPILE_DEBUG flag to embed debug information in the shaders. + // Setting this flag improves the shader debugging experience, but still allows + // the shaders to be optimized and to run exactly the way they will run in + // the release configuration of this program. 
+ dwShaderFlags |= D3DCOMPILE_DEBUG; +#endif + + ID3DBlob* pErrorBlob; + hr = D3DX11CompileFromFile( str, NULL, NULL, szEntryPoint, szShaderModel, + dwShaderFlags, 0, NULL, ppBlobOut, &pErrorBlob, NULL ); + if( FAILED(hr) ) + { + if( pErrorBlob != NULL ) + OutputDebugStringA( (char*)pErrorBlob->GetBufferPointer() ); + SAFE_RELEASE( pErrorBlob ); + return hr; + } + SAFE_RELEASE( pErrorBlob ); + + return S_OK; +} + +// Create any D3D11 resources that aren't dependent on the back buffer +HRESULT CALLBACK OnD3D11CreateDevice( ID3D11Device* pd3dDevice, const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, + void* pUserContext ) +{ + HRESULT hr; + + ID3D11DeviceContext* pd3dImmediateContext = DXUTGetD3D11DeviceContext(); + V_RETURN(gDialogResourceManager.OnD3D11CreateDevice(pd3dDevice, pd3dImmediateContext)); + V_RETURN(gD3DSettingsDlg.OnD3D11CreateDevice(pd3dDevice)); + gTxtHelper = new CDXUTTextHelper(pd3dDevice, pd3dImmediateContext, &gDialogResourceManager, 15); + + // Create a vertex shader. + ID3DBlob* vertexShaderBuffer = NULL; + V_RETURN(CompileShaderFromFile(L"FastTextureCompressor\\FastTextureCompressor.hlsl", "PassThroughVS", "vs_4_0", &vertexShaderBuffer)); + V_RETURN(pd3dDevice->CreateVertexShader(vertexShaderBuffer->GetBufferPointer(), vertexShaderBuffer->GetBufferSize(), NULL, &gVertexShader)); + + // Create a pixel shader that renders the composite frame. + ID3DBlob* pixelShaderBuffer = NULL; + V_RETURN(CompileShaderFromFile(L"FastTextureCompressor\\FastTextureCompressor.hlsl", "RenderFramePS", "ps_4_0", &pixelShaderBuffer)); + V_RETURN(pd3dDevice->CreatePixelShader(pixelShaderBuffer->GetBufferPointer(), pixelShaderBuffer->GetBufferSize(), NULL, &gRenderFramePS)); + + // Create a pixel shader that renders the error texture. 
+ V_RETURN(CompileShaderFromFile(L"FastTextureCompressor\\FastTextureCompressor.hlsl", "RenderTexturePS", "ps_4_0", &pixelShaderBuffer)); + V_RETURN(pd3dDevice->CreatePixelShader(pixelShaderBuffer->GetBufferPointer(), pixelShaderBuffer->GetBufferSize(), NULL, &gRenderTexturePS)); + + // Create our vertex input layout + const D3D11_INPUT_ELEMENT_DESC layout[] = + { + { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0 } + }; + + V_RETURN(pd3dDevice->CreateInputLayout(layout, ARRAYSIZE(layout), vertexShaderBuffer->GetBufferPointer(), vertexShaderBuffer->GetBufferSize(), &gVertexLayout)); + + SAFE_RELEASE(vertexShaderBuffer); + SAFE_RELEASE(pixelShaderBuffer); + + // Create a vertex buffer for three textured quads. + D3DXVECTOR2 quadSize(0.32f, 0.32f); + D3DXVECTOR2 quadOrigin(-0.66f, -0.0f); + Vertex tripleQuadVertices[18]; + ZeroMemory(tripleQuadVertices, sizeof(tripleQuadVertices)); + for(int i = 0; i < 18; i += 6) + { + tripleQuadVertices[i].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + tripleQuadVertices[i].texCoord = D3DXVECTOR2(0.0f, 0.0f); + + tripleQuadVertices[i + 1].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + tripleQuadVertices[i + 1].texCoord = D3DXVECTOR2(1.0f, 0.0f); + + tripleQuadVertices[i + 2].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + tripleQuadVertices[i + 2].texCoord = D3DXVECTOR2(1.0f, 1.0f); + + tripleQuadVertices[i + 3].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + tripleQuadVertices[i + 3].texCoord = D3DXVECTOR2(1.0f, 1.0f); + + tripleQuadVertices[i + 4].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + tripleQuadVertices[i + 4].texCoord = D3DXVECTOR2(0.0f, 1.0f); + + tripleQuadVertices[i + 5].position = D3DXVECTOR3(quadOrigin.x - 
quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + tripleQuadVertices[i + 5].texCoord = D3DXVECTOR2(0.0f, 0.0f); + + quadOrigin.x += 0.66f; + } + + D3D11_BUFFER_DESC bufDesc; + ZeroMemory(&bufDesc, sizeof(bufDesc)); + bufDesc.Usage = D3D11_USAGE_DEFAULT; + bufDesc.ByteWidth = sizeof(tripleQuadVertices); + bufDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + bufDesc.CPUAccessFlags = 0; + D3D11_SUBRESOURCE_DATA data; + ZeroMemory(&data, sizeof(data)); + data.pSysMem = tripleQuadVertices; + V_RETURN(pd3dDevice->CreateBuffer(&bufDesc, &data, &gVertexBuffer)); + + // Create a vertex buffer for a single textured quad. + quadSize = D3DXVECTOR2(1.0f, 1.0f); + quadOrigin = D3DXVECTOR2(0.0f, 0.0f); + Vertex singleQuadVertices[6]; + singleQuadVertices[0].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + singleQuadVertices[0].texCoord = D3DXVECTOR2(0.0f, 0.0f); + singleQuadVertices[1].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + singleQuadVertices[1].texCoord = D3DXVECTOR2(1.0f, 0.0f); + singleQuadVertices[2].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + singleQuadVertices[2].texCoord = D3DXVECTOR2(1.0f, 1.0f); + singleQuadVertices[3].position = D3DXVECTOR3(quadOrigin.x + quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + singleQuadVertices[3].texCoord = D3DXVECTOR2(1.0f, 1.0f); + singleQuadVertices[4].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y - quadSize.y, 0.0f); + singleQuadVertices[4].texCoord = D3DXVECTOR2(0.0f, 1.0f); + singleQuadVertices[5].position = D3DXVECTOR3(quadOrigin.x - quadSize.x, quadOrigin.y + quadSize.y, 0.0f); + singleQuadVertices[5].texCoord = D3DXVECTOR2(0.0f, 0.0f); + + ZeroMemory(&bufDesc, sizeof(bufDesc)); + bufDesc.Usage = D3D11_USAGE_DEFAULT; + bufDesc.ByteWidth = sizeof(singleQuadVertices); + bufDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + bufDesc.CPUAccessFlags = 0; + ZeroMemory(&data, sizeof(data)); + data.pSysMem = 
singleQuadVertices; + V_RETURN(pd3dDevice->CreateBuffer(&bufDesc, &data, &gQuadVB)); + + // Create a sampler state + D3D11_SAMPLER_DESC SamDesc; + SamDesc.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; + SamDesc.AddressU = D3D11_TEXTURE_ADDRESS_WRAP; + SamDesc.AddressV = D3D11_TEXTURE_ADDRESS_WRAP; + SamDesc.AddressW = D3D11_TEXTURE_ADDRESS_WRAP; + SamDesc.MipLODBias = 0.0f; + SamDesc.MaxAnisotropy = 1; + SamDesc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; + SamDesc.BorderColor[0] = SamDesc.BorderColor[1] = SamDesc.BorderColor[2] = SamDesc.BorderColor[3] = 0; + SamDesc.MinLOD = 0; + SamDesc.MaxLOD = D3D11_FLOAT32_MAX; + V_RETURN(pd3dDevice->CreateSamplerState(&SamDesc, &gSamPoint)); + + // Load and initialize the textures. + WCHAR path[MAX_PATH]; + V_RETURN(DXUTFindDXSDKMediaFileCch(path, MAX_PATH, L"Images\\texture.dds")); + V_RETURN(CreateTextures(path)); + + return S_OK; +} + +// Create any D3D11 resources that depend on the back buffer +HRESULT CALLBACK OnD3D11ResizedSwapChain( ID3D11Device* pd3dDevice, IDXGISwapChain* pSwapChain, + const DXGI_SURFACE_DESC* pBackBufferSurfaceDesc, void* pUserContext ) +{ + HRESULT hr; + V_RETURN( gDialogResourceManager.OnD3D11ResizedSwapChain( pd3dDevice, pBackBufferSurfaceDesc ) ); + V_RETURN( gD3DSettingsDlg.OnD3D11ResizedSwapChain( pd3dDevice, pBackBufferSurfaceDesc ) ); + + gHUD.SetLocation( pBackBufferSurfaceDesc->Width - 170, 0 ); + gHUD.SetSize( 170, 170 ); + + gSampleUI.SetLocation( 0, 0 ); + gSampleUI.SetSize( pBackBufferSurfaceDesc->Width, pBackBufferSurfaceDesc->Height ); + + int oneThirdWidth = int(gSampleUI.GetWidth() / 3.0f); + int oneThirdHeight = int(gSampleUI.GetHeight() / 3.0f); + int x = 20; + int y = oneThirdHeight - 20; + gSampleUI.GetStatic(IDC_UNCOMPRESSEDTEXT)->SetLocation(x, y); + gSampleUI.GetStatic(IDC_COMPRESSEDTEXT)->SetLocation(x += oneThirdWidth, y); + gSampleUI.GetStatic(IDC_ERRORTEXT)->SetLocation(x += oneThirdWidth, y); + x = gSampleUI.GetWidth() - 276; + y = gSampleUI.GetHeight() - 216; + 
gSampleUI.GetStatic(IDC_SIZETEXT)->SetLocation(x, y); + gSampleUI.GetStatic(IDC_RMSETEXT)->SetLocation(x, y += 26); + gSampleUI.GetStatic(IDC_TIMETEXT)->SetLocation(x, y += 26); + gSampleUI.GetStatic(IDC_RATETEXT)->SetLocation(x, y += 26); + gSampleUI.GetComboBox(IDC_SIMD)->SetLocation(x, y += 26); + gSampleUI.GetComboBox(IDC_COMPRESSOR)->SetLocation(x + 150, y); + gSampleUI.GetStatic(IDC_BLOCKSPERTASKTEXT)->SetLocation(x, y += 26); + gSampleUI.GetComboBox(IDC_TBB)->SetLocation(x + 160, y); + gSampleUI.GetSlider(IDC_BLOCKSPERTASK)->SetLocation(x, y += 26); + gSampleUI.GetButton(IDC_LOADTEXTURE)->SetLocation(x, y += 26); + gSampleUI.GetButton(IDC_RECOMPRESS)->SetLocation(x + 131, y); + + return S_OK; +} + +// Render the scene using the D3D11 device +void CALLBACK OnD3D11FrameRender( ID3D11Device* pd3dDevice, ID3D11DeviceContext* pd3dImmediateContext, double fTime, + float fElapsedTime, void* pUserContext ) +{ + // Recompress the texture gFrameDelay frames after the app has started. This produces more accurate timing of the + // compression algorithm. + if(gFrameNum == gFrameDelay) + { + RecompressTexture(); + gFrameNum++; + } + else if(gFrameNum < gFrameDelay) + { + gFrameNum++; + } + + // If the settings dialog is being shown, then render it instead of rendering the app's scene + if( gD3DSettingsDlg.IsActive() ) + { + gD3DSettingsDlg.OnRender( fElapsedTime ); + return; + } + + // Clear the render target and depth stencil + float ClearColor[4] = { 0.02f, 0.02f, 0.02f, 1.0f }; + ID3D11RenderTargetView* pRTV = DXUTGetD3D11RenderTargetView(); + pd3dImmediateContext->ClearRenderTargetView( pRTV, ClearColor ); + ID3D11DepthStencilView* pDSV = DXUTGetD3D11DepthStencilView(); + pd3dImmediateContext->ClearDepthStencilView( pDSV, D3D11_CLEAR_DEPTH, 1.0, 0 ); + + // Set the input layout. + pd3dImmediateContext->IASetInputLayout( gVertexLayout ); + + // Set the vertex buffer. 
+ UINT stride = sizeof( Vertex ); + UINT offset = 0; + pd3dImmediateContext->IASetVertexBuffers( 0, 1, &gVertexBuffer, &stride, &offset ); + + // Set the primitive topology + pd3dImmediateContext->IASetPrimitiveTopology( D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST ); + + // Set the shaders + pd3dImmediateContext->VSSetShader( gVertexShader, NULL, 0 ); + pd3dImmediateContext->PSSetShader( gRenderFramePS, NULL, 0 ); + + // Set the texture sampler. + pd3dImmediateContext->PSSetSamplers( 0, 1, &gSamPoint ); + + // Render the uncompressed texture. + pd3dImmediateContext->PSSetShaderResources( 0, 1, &gUncompressedSRV ); + pd3dImmediateContext->Draw( 6, 0 ); + + // Render the compressed texture. + pd3dImmediateContext->PSSetShaderResources( 0, 1, &gCompressedSRV ); + pd3dImmediateContext->Draw( 6, 6 ); + + // Render the error texture. + pd3dImmediateContext->PSSetShaderResources( 0, 1, &gErrorSRV ); + pd3dImmediateContext->Draw( 6, 12 ); + + DXUT_BeginPerfEvent( DXUT_PERFEVENTCOLOR, L"HUD / Stats" ); + HRESULT hr; + V(gHUD.OnRender( fElapsedTime )); + V(gSampleUI.OnRender( fElapsedTime )); + RenderText(); + DXUT_EndPerfEvent(); +} + +// Release D3D11 resources created in OnD3D11ResizedSwapChain +void CALLBACK OnD3D11ReleasingSwapChain( void* pUserContext ) +{ + gDialogResourceManager.OnD3D11ReleasingSwapChain(); +} + +// Release D3D11 resources created in OnD3D11CreateDevice +void CALLBACK OnD3D11DestroyDevice( void* pUserContext ) +{ + gDialogResourceManager.OnD3D11DestroyDevice(); + gD3DSettingsDlg.OnD3D11DestroyDevice(); + //CDXUTDirectionWidget::StaticOnD3D11DestroyDevice(); + DXUTGetGlobalResourceCache().OnDestroyDevice(); + SAFE_DELETE( gTxtHelper ); + + SAFE_RELEASE( gVertexLayout ); + SAFE_RELEASE( gVertexBuffer ); + SAFE_RELEASE( gQuadVB ); + SAFE_RELEASE( gIndexBuffer ); + SAFE_RELEASE( gVertexShader ); + SAFE_RELEASE( gRenderFramePS ); + SAFE_RELEASE( gRenderTexturePS ); + SAFE_RELEASE( gSamPoint ); + + DestroyTextures(); +} + +// Free previously allocated texture 
resources and create new texture resources. +HRESULT CreateTextures(LPTSTR file) +{ + // Destroy any previously created textures. + DestroyTextures(); + + // Load the uncompressed texture. + HRESULT hr; + V_RETURN(LoadTexture(file)); + + // Compress the texture. + V_RETURN(CompressTexture(gUncompressedSRV, &gCompressedSRV)); + + // Compute the error in the compressed texture. + V_RETURN(ComputeError(gUncompressedSRV, gCompressedSRV, &gErrorSRV)); + + return S_OK; +} + +// Destroy texture resources. +void DestroyTextures() +{ + SAFE_RELEASE(gErrorSRV); + SAFE_RELEASE(gCompressedSRV); + SAFE_RELEASE(gUncompressedSRV); +} + +// This functions loads a texture and prepares it for compression. The compressor only works on texture +// dimensions that are divisible by 4. Textures that are not divisible by 4 are resized and padded with the edge values. +HRESULT LoadTexture(LPTSTR file) +{ + // Load the uncrompressed texture. + // The loadInfo structure disables mipmapping by setting MipLevels to 1. + D3DX11_IMAGE_LOAD_INFO loadInfo; + ZeroMemory(&loadInfo, sizeof(D3DX11_IMAGE_LOAD_INFO)); + loadInfo.Width = D3DX11_DEFAULT; + loadInfo.Height = D3DX11_DEFAULT; + loadInfo.Depth = D3DX11_DEFAULT; + loadInfo.FirstMipLevel = D3DX11_DEFAULT; + loadInfo.MipLevels = 1; + loadInfo.Usage = (D3D11_USAGE) D3DX11_DEFAULT; + loadInfo.BindFlags = D3D11_BIND_SHADER_RESOURCE; + loadInfo.CpuAccessFlags = D3DX11_DEFAULT; + loadInfo.MiscFlags = D3DX11_DEFAULT; + loadInfo.Format = DXGI_FORMAT_R8G8B8A8_UNORM_SRGB; + loadInfo.Filter = D3DX11_FILTER_POINT | D3DX11_FILTER_SRGB; + loadInfo.MipFilter = D3DX11_DEFAULT; + loadInfo.pSrcInfo = NULL; + HRESULT hr; + V_RETURN(D3DX11CreateShaderResourceViewFromFile(DXUTGetD3D11Device(), file, &loadInfo, NULL, &gUncompressedSRV, NULL)); + + // Pad the texture. + V_RETURN(PadTexture(&gUncompressedSRV)); + + // Query the texture description. 
+ ID3D11Texture2D* tex; + gUncompressedSRV->GetResource((ID3D11Resource**)&tex); + D3D11_TEXTURE2D_DESC texDesc; + tex->GetDesc(&texDesc); + SAFE_RELEASE(tex); + + // Update the UI's texture width and height. + gTexWidth = texDesc.Width; + gTexHeight = texDesc.Height; + + WCHAR wstr[MAX_PATH]; + swprintf_s(wstr, MAX_PATH, L"Texture Size: %d x %d", gTexWidth, gTexHeight); + gSampleUI.GetStatic(IDC_SIZETEXT)->SetText(wstr); + // gSampleUI.SendEvent(IDC_SIZETEXT, true, gSampleUI.GetStatic(IDC_SIZETEXT)); + + UpdateBlockSlider(); + + return S_OK; +} + +void SetCompressionScheme(EInstructionSet instrSet, ECompressorType compType, EThreadMode threadMode) { + + bool foundMatch = false; + for(int i = 0; i < kNumCompressionSchemes; i++) { + bool match = true; + match = match && kCompressionSchemes[i].instrSet == instrSet; + match = match && kCompressionSchemes[i].type == compType; + match = match && kCompressionSchemes[i].threadMode == threadMode; + if(match) { + gCompressionScheme = &(kCompressionSchemes[i]); + foundMatch = true; + break; + } + } + + if(!foundMatch) { + OutputDebugString(L"ERROR: Did not find match for compression scheme, not changing.\n"); + } +} + +void UpdateCompressionModes() { + + CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_COMPRESSOR); + comboBox->RemoveAllItems(); + + // If we're updating the compression modes, then see + // what we currently have selected and keep everything else constant. 
+ EThreadMode currThreadMode = gCompressionScheme->threadMode; + EInstructionSet currInstrSet = gCompressionScheme->instrSet; + + bool added[kNumCompressorTypes]; + memset(added, 0, sizeof(added)); + + for(int i = 0; i < kNumCompressionSchemes; i++) { + + bool match = kCompressionSchemes[i].instrSet == currInstrSet; + match = match && kCompressionSchemes[i].threadMode == currThreadMode; + match = match && kCompressionSchemes[i].availabilityOverride; + + if(match) { + ECompressorType compType = kCompressionSchemes[i].type; + if(!added[compType]) { + comboBox->AddItem(kCompressorTypeStr[compType], (void*)(INT_PTR)compType); + added[compType] = true; + } + } + } + + comboBox->SetSelectedByData((void *)(INT_PTR)(gCompressionScheme->type)); +} + +void UpdateCompressionAlgorithms() { + + CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_SIMD); + comboBox->RemoveAllItems(); + + // If we're updating the compression algorithms, then see + // what we currently have selected and keep everything else constant. 
+ EThreadMode currThreadMode = gCompressionScheme->threadMode; + ECompressorType currType = gCompressionScheme->type; + + bool added[kNumInstructionSets]; + memset(added, 0, sizeof(added)); + + for(int i = 0; i < kNumCompressionSchemes; i++) { + + bool match = kCompressionSchemes[i].type == currType; + match = match && kCompressionSchemes[i].threadMode == currThreadMode; + match = match && kCompressionSchemes[i].availabilityOverride; + + if(match) { + EInstructionSet instrSet = kCompressionSchemes[i].instrSet; + if(!added[instrSet]) { + comboBox->AddItem(kInstructionSetStr[instrSet], (void*)(INT_PTR)instrSet); + added[instrSet] = true; + } + } + } + + comboBox->SetSelectedByData((void *)(INT_PTR)(gCompressionScheme->instrSet)); +} + +void UpdateThreadingMode() { + + CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_TBB); + comboBox->RemoveAllItems(); + + // If we're updating the compression algorithms, then see + // what we currently have selected and keep everything else constant. + EInstructionSet currInstrSet = gCompressionScheme->instrSet; + ECompressorType currType = gCompressionScheme->type; + + bool added[kNumThreadModes]; + memset(added, 0, sizeof(added)); + + for(int i = 0; i < kNumCompressionSchemes; i++) { + + bool match = kCompressionSchemes[i].type == currType; + match = match && kCompressionSchemes[i].instrSet == currInstrSet; + match = match && kCompressionSchemes[i].availabilityOverride; + + if(match) { + EThreadMode threadMode = kCompressionSchemes[i].threadMode; + if(!added[threadMode]) { + comboBox->AddItem(kThreadModeStr[threadMode], (void*)(INT_PTR)threadMode); + added[threadMode] = true; + } + } + } + + comboBox->SetSelectedByData((void *)(INT_PTR)(gCompressionScheme->threadMode)); +} + +void UpdateAllowedSettings() { + UpdateCompressionModes(); + UpdateCompressionAlgorithms(); + UpdateThreadingMode(); +} + +void UpdateBlockSlider() { + + int blockRows = gTexHeight / 4; + int blockCols = gTexWidth / 4; + if(gCompressionScheme->instrSet == 
eInstrSet_AVX2) { + blockCols /= 2; + } + + int numBlocks = blockRows * blockCols; + int blksPerProc = numBlocks / gNumProcessors; + + gSampleUI.GetSlider(IDC_BLOCKSPERTASK)->SetRange(1, blksPerProc); +} + +// Pad the texture to dimensions that are divisible by 4. +HRESULT PadTexture(ID3D11ShaderResourceView** textureSRV) +{ + // Query the texture description. + ID3D11Texture2D* tex; + (*textureSRV)->GetResource((ID3D11Resource**)&tex); + D3D11_TEXTURE2D_DESC texDesc; + tex->GetDesc(&texDesc); + + // Exit if the texture dimensions are divisible by 4. + if((texDesc.Width % 4 == 0) && (texDesc.Height % 4 == 0)) + { + SAFE_RELEASE(tex); + return S_OK; + } + + // Compute the size of the padded texture. + UINT padWidth = texDesc.Width / 4 * 4 + 4; + UINT padHeight = texDesc.Height / 4 * 4 + 4; + + // Create a buffer for the padded texels. + BYTE* padTexels = new BYTE[padWidth * padHeight * 4]; + + // Create a staging resource for the texture. + HRESULT hr; + ID3D11Device* device = DXUTGetD3D11Device(); + D3D11_TEXTURE2D_DESC stgTexDesc; + memcpy(&stgTexDesc, &texDesc, sizeof(D3D11_TEXTURE2D_DESC)); + stgTexDesc.Usage = D3D11_USAGE_STAGING; + stgTexDesc.BindFlags = 0; + stgTexDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + ID3D11Texture2D* stgTex; + V_RETURN(device->CreateTexture2D(&stgTexDesc, NULL, &stgTex)); + + // Copy the texture into the staging resource. + ID3D11DeviceContext* deviceContext = DXUTGetD3D11DeviceContext(); + deviceContext->CopyResource(stgTex, tex); + + // Map the staging resource. + D3D11_MAPPED_SUBRESOURCE texData; + V_RETURN(deviceContext->Map(stgTex, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ_WRITE, 0, &texData)); + + // Copy the beginning of each row. 
+ BYTE* texels = (BYTE*)texData.pData; + for(UINT row = 0; row < stgTexDesc.Height; row++) + { + UINT rowStart = row * texData.RowPitch; + UINT padRowStart = row * padWidth * 4; + memcpy(padTexels + padRowStart, texels + rowStart, stgTexDesc.Width * 4); + + // Pad the end of each row. + if(padWidth > stgTexDesc.Width) + { + BYTE* padVal = texels + rowStart + (stgTexDesc.Width - 1) * 4; + for(UINT padCol = stgTexDesc.Width; padCol < padWidth; padCol++) + { + UINT padColStart = padCol * 4; + memcpy(padTexels + padRowStart + padColStart, padVal, 4); + } + } + } + + // Pad the end of each column. + if(padHeight > stgTexDesc.Height) + { + UINT lastRow = (stgTexDesc.Height - 1); + UINT lastRowStart = lastRow * padWidth * 4; + BYTE* padVal = padTexels + lastRowStart; + for(UINT padRow = stgTexDesc.Height; padRow < padHeight; padRow++) + { + UINT padRowStart = padRow * padWidth * 4; + memcpy(padTexels + padRowStart, padVal, padWidth * 4); + } + } + + // Unmap the staging resources. + deviceContext->Unmap(stgTex, D3D11CalcSubresource(0, 0, 1)); + + // Create a padded texture. + D3D11_TEXTURE2D_DESC padTexDesc; + memcpy(&padTexDesc, &texDesc, sizeof(D3D11_TEXTURE2D_DESC)); + padTexDesc.Width = padWidth; + padTexDesc.Height = padHeight; + D3D11_SUBRESOURCE_DATA padTexData; + ZeroMemory(&padTexData, sizeof(D3D11_SUBRESOURCE_DATA)); + padTexData.pSysMem = padTexels; + padTexData.SysMemPitch = padWidth * sizeof(BYTE) * 4; + ID3D11Texture2D* padTex; + V_RETURN(device->CreateTexture2D(&padTexDesc, &padTexData, &padTex)); + + // Delete the padded texel buffer. + delete [] padTexels; + + // Release the shader resource view for the texture. + SAFE_RELEASE(*textureSRV); + + // Create a shader resource view for the padded texture. 
+ D3D11_SHADER_RESOURCE_VIEW_DESC padTexSRVDesc; + padTexSRVDesc.Format = padTexDesc.Format; + padTexSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + padTexSRVDesc.Texture2D.MipLevels = padTexDesc.MipLevels; + padTexSRVDesc.Texture2D.MostDetailedMip = padTexDesc.MipLevels - 1; + V_RETURN(device->CreateShaderResourceView(padTex, &padTexSRVDesc, textureSRV)); + + // Release resources. + SAFE_RELEASE(padTex); + SAFE_RELEASE(stgTex); + SAFE_RELEASE(tex); + + return S_OK; +} + +// Save a texture to a file. +HRESULT SaveTexture(ID3D11ShaderResourceView* textureSRV, LPTSTR file) +{ + // Get the texture resource. + ID3D11Resource* texRes; + textureSRV->GetResource(&texRes); + if(texRes == NULL) + { + return E_POINTER; + } + + // Save the texture to a file. + HRESULT hr; + V_RETURN(D3DX11SaveTextureToFile(DXUTGetD3D11DeviceContext(), texRes, D3DX11_IFF_DDS, file)); + + // Release the texture resources. + SAFE_RELEASE(texRes); + + return S_OK; +} + +// Compress a texture. +HRESULT CompressTexture(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView** compressedSRV) +{ + // Query the texture description of the uncompressed texture. + ID3D11Resource* uncompRes; + gUncompressedSRV->GetResource(&uncompRes); + D3D11_TEXTURE2D_DESC uncompTexDesc; + ((ID3D11Texture2D*)uncompRes)->GetDesc(&uncompTexDesc); + + // Create a 2D texture for the compressed texture. 
+ HRESULT hr; + ID3D11Texture2D* compTex; + D3D11_TEXTURE2D_DESC compTexDesc; + memcpy(&compTexDesc, &uncompTexDesc, sizeof(D3D11_TEXTURE2D_DESC)); + + switch(gCompressionScheme->type) { + default: + case eCompType_DXT1: + compTexDesc.Format = DXGI_FORMAT_BC1_UNORM_SRGB; + break; + case eCompType_DXT5: + compTexDesc.Format = DXGI_FORMAT_BC3_UNORM_SRGB; + break; + case eCompType_BC7: + compTexDesc.Format = DXGI_FORMAT_BC7_UNORM_SRGB; + break; + } + + ID3D11Device* device = DXUTGetD3D11Device(); + V_RETURN(device->CreateTexture2D(&compTexDesc, NULL, &compTex)); + + // Create a shader resource view for the compressed texture. + SAFE_RELEASE(*compressedSRV); + D3D11_SHADER_RESOURCE_VIEW_DESC compSRVDesc; + compSRVDesc.Format = compTexDesc.Format; + compSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + compSRVDesc.Texture2D.MipLevels = compTexDesc.MipLevels; + compSRVDesc.Texture2D.MostDetailedMip = compTexDesc.MipLevels - 1; + V_RETURN(device->CreateShaderResourceView(compTex, &compSRVDesc, compressedSRV)); + + // Create a staging resource for the compressed texture. + compTexDesc.Usage = D3D11_USAGE_STAGING; + compTexDesc.BindFlags = 0; + compTexDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + ID3D11Texture2D* compStgTex; + V_RETURN(device->CreateTexture2D(&compTexDesc, NULL, &compStgTex)); + + // Create a staging resource for the uncompressed texture. + uncompTexDesc.Usage = D3D11_USAGE_STAGING; + uncompTexDesc.BindFlags = 0; + uncompTexDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + ID3D11Texture2D* uncompStgTex; + V_RETURN(device->CreateTexture2D(&uncompTexDesc, NULL, &uncompStgTex)); + + // Copy the uncompressed texture into the staging resource. + ID3D11DeviceContext* deviceContext = DXUTGetD3D11DeviceContext(); + deviceContext->CopyResource(uncompStgTex, uncompRes); + + // Map the staging resources. 
+ D3D11_MAPPED_SUBRESOURCE uncompData; + V_RETURN(deviceContext->Map(uncompStgTex, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ_WRITE, 0, &uncompData)); + D3D11_MAPPED_SUBRESOURCE compData; + V_RETURN(deviceContext->Map(compStgTex, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ_WRITE, 0, &compData)); + + // Time the compression. + StopWatch stopWatch; + stopWatch.Start(); + + const int kNumCompressions = 1; + for(int cmpNum = 0; cmpNum < kNumCompressions; cmpNum++) { + + // Compress the uncompressed texels. + DXTC::CompressImageDXT((BYTE*)uncompData.pData, (BYTE*)compData.pData, uncompTexDesc.Width, uncompTexDesc.Height); + } + + // Update the compression time. + stopWatch.Stop(); + gCompTime = stopWatch.TimeInMilliseconds(); + gSampleUI.SendEvent(IDC_TIMETEXT, true, gSampleUI.GetStatic(IDC_TIMETEXT)); + + // Compute the compression rate. + INT numPixels = compTexDesc.Width * compTexDesc.Height * kNumCompressions; + gCompRate = (double)numPixels / stopWatch.TimeInSeconds() / 1000000.0; + gSampleUI.SendEvent(IDC_RATETEXT, true, gSampleUI.GetStatic(IDC_RATETEXT)); + stopWatch.Reset(); + + // Unmap the staging resources. + deviceContext->Unmap(compStgTex, D3D11CalcSubresource(0, 0, 1)); + deviceContext->Unmap(uncompStgTex, D3D11CalcSubresource(0, 0, 1)); + + // Copy the staging resourse into the compressed texture. + deviceContext->CopyResource(compTex, compStgTex); + + // Release resources. + SAFE_RELEASE(uncompStgTex); + SAFE_RELEASE(compStgTex); + SAFE_RELEASE(compTex); + SAFE_RELEASE(uncompRes); + + return S_OK; +} + +#define CHECK_WIN_THREAD_FUNC(x) \ + do { \ + if(NULL == (x)) { \ + wchar_t wstr[256]; \ + swprintf_s(wstr, L"Error detected from call %s at line %d of main.cpp", _T(#x), __LINE__); \ + ReportWinThreadError(wstr); \ + } \ + } \ + while(0) + +void ReportWinThreadError(const wchar_t *str) { + + // Retrieve the system error message for the last-error code. 
+ LPVOID lpMsgBuf; + LPVOID lpDisplayBuf; + DWORD dw = GetLastError(); + + FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + dw, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR) &lpMsgBuf, + 0, NULL ); + + // Display the error message. + + lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT, + (lstrlen((LPCTSTR) lpMsgBuf) + lstrlen((LPCTSTR)str) + 40) * sizeof(TCHAR)); + StringCchPrintf((LPTSTR)lpDisplayBuf, + LocalSize(lpDisplayBuf) / sizeof(TCHAR), + TEXT("%s failed with error %d: %s"), + str, dw, lpMsgBuf); + MessageBox(NULL, (LPCTSTR) lpDisplayBuf, TEXT("Error"), MB_OK); + + // Free error-handling buffer allocations. + + LocalFree(lpMsgBuf); + LocalFree(lpDisplayBuf); +} + +void InitWin32Threads() { + + // Already initialized? + if(gNumWinThreads > 0) { + return; + } + + SetLastError(0); + + gNumWinThreads = gNumProcessors; + if(gNumWinThreads >= MAXIMUM_WAIT_OBJECTS) + gNumWinThreads = MAXIMUM_WAIT_OBJECTS; + + // Create the synchronization events. + for(int i = 0; i < gNumWinThreads; i++) { + CHECK_WIN_THREAD_FUNC(gWinThreadWorkEvent[i] = CreateEvent(NULL, FALSE, FALSE, NULL)); + } + + CHECK_WIN_THREAD_FUNC(gWinThreadStartEvent = CreateEvent(NULL, TRUE, FALSE, NULL)); + CHECK_WIN_THREAD_FUNC(gWinThreadDoneEvent = CreateEvent(NULL, TRUE, FALSE, NULL)); + + // Create threads + for(int threadIdx = 0; threadIdx < gNumWinThreads; threadIdx++) { + gWinThreadData[threadIdx].state = eThreadState_WaitForData; + CHECK_WIN_THREAD_FUNC(hThreadArray[threadIdx] = CreateThread(NULL, 0, DXTC::CompressImageDXTWinThread, &gWinThreadData[threadIdx], 0, &dwThreadIdArray[threadIdx])); + } +} + +void DestroyThreads() { + + switch(gCompressionScheme->threadMode) { + case eThreadMode_TBB: + { + // Shutdown the TBB task manager. + gTaskMgr.Shutdown(); + } + break; + + case eThreadMode_Win32: + { + // Release all windows threads that may be active... 
+ for(int i=0; i < gNumWinThreads; i++) { + gWinThreadData[i].state = eThreadState_Done; + } + + // Send the event for the threads to start. + CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadDoneEvent)); + CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadStartEvent)); + + // Wait for all the threads to finish.... + DWORD dwWaitRet = WaitForMultipleObjects(gNumWinThreads, hThreadArray, TRUE, INFINITE); + if(WAIT_FAILED == dwWaitRet) + ReportWinThreadError(L"DestroyThreads() -- WaitForMultipleObjects"); + + // !HACK! This doesn't actually do anything. There is either a bug in the + // Intel compiler or the windows run-time that causes the threads to not + // be cleaned up properly if the following two lines of code are not present. + // Since we're passing INFINITE to WaitForMultipleObjects, that function will + // never time out and per-microsoft spec, should never give this return value... + // Even with these lines, the bug does not consistently disappear unless you + // clean and rebuild. Heigenbug? + // + // If we compile with MSVC, then the following two lines are not necessary. + else if(WAIT_TIMEOUT == dwWaitRet) + OutputDebugString(L"DestroyThreads() -- WaitForMultipleObjects -- TIMEOUT"); + + // Reset the start event + CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadStartEvent)); + CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadDoneEvent)); + + // Close all thread handles. + for(int i=0; i < gNumWinThreads; i++) { + CHECK_WIN_THREAD_FUNC(CloseHandle(hThreadArray[i])); + } + + for(int i =0; i < kMaxWinThreads; i++ ){ + hThreadArray[i] = NULL; + } + + // Close all event handles... 
+ CHECK_WIN_THREAD_FUNC(CloseHandle(gWinThreadDoneEvent)); + gWinThreadDoneEvent = NULL; + + CHECK_WIN_THREAD_FUNC(CloseHandle(gWinThreadStartEvent)); + gWinThreadStartEvent = NULL; + + for(int i = 0; i < gNumWinThreads; i++) { + CHECK_WIN_THREAD_FUNC(CloseHandle(gWinThreadWorkEvent[i])); + } + + for(int i = 0; i < kMaxWinThreads; i++) { + gWinThreadWorkEvent[i] = NULL; + } + + gNumWinThreads = 0; + } + break; + + case eThreadMode_None: + // Do nothing. + break; + } +} + +static inline DXGI_FORMAT GetNonSRGBFormat(DXGI_FORMAT f) { + switch(f) { + case DXGI_FORMAT_BC1_UNORM_SRGB: return DXGI_FORMAT_BC1_UNORM; + case DXGI_FORMAT_BC3_UNORM_SRGB: return DXGI_FORMAT_BC3_UNORM; + case DXGI_FORMAT_BC7_UNORM_SRGB: return DXGI_FORMAT_BC7_UNORM; + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: return DXGI_FORMAT_R8G8B8A8_UNORM; + default: assert(!"Unknown format!"); + } + return DXGI_FORMAT_R8G8B8A8_UNORM; +} + +// Compute an "error" texture that represents the absolute difference in color between an +// uncompressed texture and a compressed texture. +HRESULT ComputeError(ID3D11ShaderResourceView* uncompressedSRV, ID3D11ShaderResourceView* compressedSRV, ID3D11ShaderResourceView** errorSRV) +{ + HRESULT hr; + + // Query the texture description of the uncompressed texture. + ID3D11Resource* uncompRes; + gUncompressedSRV->GetResource(&uncompRes); + D3D11_TEXTURE2D_DESC uncompTexDesc; + ((ID3D11Texture2D*)uncompRes)->GetDesc(&uncompTexDesc); + + // Query the texture description of the uncompressed texture. + ID3D11Resource* compRes; + gCompressedSRV->GetResource(&compRes); + D3D11_TEXTURE2D_DESC compTexDesc; + ((ID3D11Texture2D*)compRes)->GetDesc(&compTexDesc); + + // Create a 2D resource without gamma correction for the two textures. 
+ compTexDesc.Format = GetNonSRGBFormat(compTexDesc.Format); + uncompTexDesc.Format = GetNonSRGBFormat(uncompTexDesc.Format); + + ID3D11Device* device = DXUTGetD3D11Device(); + + ID3D11Texture2D* uncompTex; + device->CreateTexture2D(&uncompTexDesc, NULL, &uncompTex); + + ID3D11Texture2D* compTex; + device->CreateTexture2D(&compTexDesc, NULL, &compTex); + + // Create a shader resource view for the two textures. + D3D11_SHADER_RESOURCE_VIEW_DESC compSRVDesc; + compSRVDesc.Format = compTexDesc.Format; + compSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + compSRVDesc.Texture2D.MipLevels = compTexDesc.MipLevels; + compSRVDesc.Texture2D.MostDetailedMip = compTexDesc.MipLevels - 1; + ID3D11ShaderResourceView *compSRV; + V_RETURN(device->CreateShaderResourceView(compTex, &compSRVDesc, &compSRV)); + + D3D11_SHADER_RESOURCE_VIEW_DESC uncompSRVDesc; + uncompSRVDesc.Format = uncompTexDesc.Format; + uncompSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + uncompSRVDesc.Texture2D.MipLevels = uncompTexDesc.MipLevels; + uncompSRVDesc.Texture2D.MostDetailedMip = uncompTexDesc.MipLevels - 1; + ID3D11ShaderResourceView *uncompSRV; + V_RETURN(device->CreateShaderResourceView(uncompTex, &uncompSRVDesc, &uncompSRV)); + + // Create a 2D texture for the error texture. + ID3D11Texture2D* errorTex; + D3D11_TEXTURE2D_DESC errorTexDesc; + memcpy(&errorTexDesc, &uncompTexDesc, sizeof(D3D11_TEXTURE2D_DESC)); + errorTexDesc.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE; + V_RETURN(device->CreateTexture2D(&errorTexDesc, NULL, &errorTex)); + + // Create a render target view for the error texture. + D3D11_RENDER_TARGET_VIEW_DESC errorRTVDesc; + errorRTVDesc.Format = errorTexDesc.Format; + errorRTVDesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; + errorRTVDesc.Texture2D.MipSlice = 0; + ID3D11RenderTargetView* errorRTV; + V_RETURN(device->CreateRenderTargetView(errorTex, &errorRTVDesc, &errorRTV)); + + // Create a shader resource view for the error texture. 
+ D3D11_SHADER_RESOURCE_VIEW_DESC errorSRVDesc; + errorSRVDesc.Format = errorTexDesc.Format; + errorSRVDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + errorSRVDesc.Texture2D.MipLevels = errorTexDesc.MipLevels; + errorSRVDesc.Texture2D.MostDetailedMip = errorTexDesc.MipLevels - 1; + V_RETURN(device->CreateShaderResourceView(errorTex, &errorSRVDesc, errorSRV)); + + // Create a query for the GPU operations... + D3D11_QUERY_DESC GPUQueryDesc; + GPUQueryDesc.Query = D3D11_QUERY_EVENT; + GPUQueryDesc.MiscFlags = 0; + +#ifdef _DEBUG + D3D11_QUERY_DESC OcclusionQueryDesc; + OcclusionQueryDesc.Query = D3D11_QUERY_OCCLUSION; + OcclusionQueryDesc.MiscFlags = 0; + + D3D11_QUERY_DESC StatsQueryDesc; + StatsQueryDesc.Query = D3D11_QUERY_PIPELINE_STATISTICS; + StatsQueryDesc.MiscFlags = 0; +#endif + + ID3D11Query *GPUQuery; + V_RETURN(device->CreateQuery(&GPUQueryDesc, &GPUQuery)); + + ID3D11DeviceContext* deviceContext = DXUTGetD3D11DeviceContext(); + + deviceContext->CopyResource(compTex, compRes); + deviceContext->CopyResource(uncompTex, uncompRes); + +#ifdef _DEBUG + ID3D11Query *OcclusionQuery, *StatsQuery; + V_RETURN(device->CreateQuery(&OcclusionQueryDesc, &OcclusionQuery)); + V_RETURN(device->CreateQuery(&StatsQueryDesc, &StatsQuery)); + + deviceContext->Begin(OcclusionQuery); + deviceContext->Begin(StatsQuery); +#endif + + // Set the viewport to a 1:1 mapping of pixels to texels. + D3D11_VIEWPORT viewport; + viewport.Width = (FLOAT)errorTexDesc.Width; + viewport.Height = (FLOAT)errorTexDesc.Height; + viewport.MinDepth = 0; + viewport.MaxDepth = 1; + viewport.TopLeftX = 0; + viewport.TopLeftY = 0; + deviceContext->RSSetViewports(1, &viewport); + + // Bind the render target view of the error texture. + ID3D11RenderTargetView* RTV[1] = { errorRTV }; + deviceContext->OMSetRenderTargets(1, RTV, NULL); + + // Clear the render target. + FLOAT color[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; + deviceContext->ClearRenderTargetView(errorRTV, color); + + // Set the input layout. 
+ deviceContext->IASetInputLayout(gVertexLayout); + + // Set vertex buffer + UINT stride = sizeof(Vertex); + UINT offset = 0; + deviceContext->IASetVertexBuffers(0, 1, &gQuadVB, &stride, &offset); + + // Set the primitive topology + deviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + + // Set the shaders + deviceContext->VSSetShader(gVertexShader, NULL, 0); + deviceContext->PSSetShader(gRenderTexturePS, NULL, 0); + + // Set the texture sampler. + deviceContext->PSSetSamplers(0, 1, &gSamPoint); + + // Bind the textures. + ID3D11ShaderResourceView* SRV[2] = { compSRV, uncompSRV}; + deviceContext->PSSetShaderResources(0, 2, SRV); + + // Store the depth/stencil state. + StoreDepthStencilState(); + + // Disable depth testing. + V_RETURN(DisableDepthTest()); + + // Render a quad. + deviceContext->Draw(6, 0); + + // Restore the depth/stencil state. + RestoreDepthStencilState(); + + // Reset the render target. + RTV[0] = DXUTGetD3D11RenderTargetView(); + deviceContext->OMSetRenderTargets(1, RTV, DXUTGetD3D11DepthStencilView()); + + // Reset the viewport. + viewport.Width = (FLOAT)DXUTGetDXGIBackBufferSurfaceDesc()->Width; + viewport.Height = (FLOAT)DXUTGetDXGIBackBufferSurfaceDesc()->Height; + deviceContext->RSSetViewports(1, &viewport); + + deviceContext->End(GPUQuery); +#ifdef _DEBUG + deviceContext->End(OcclusionQuery); + deviceContext->End(StatsQuery); +#endif + + BOOL finishedGPU = false; + + // If we do not have a d3d 11 context, we will still hit this line and try to + // finish using the GPU. If this happens this enters an infinite loop. 
+ int infLoopPrevention = 0; + while(!finishedGPU && ++infLoopPrevention < 10000) { + HRESULT ret; + V_RETURN(ret = deviceContext->GetData(GPUQuery, &finishedGPU, sizeof(BOOL), 0)); + if(ret != S_OK) + Sleep(1); + } + +#ifdef _DEBUG + UINT64 nPixelsWritten = 0; + deviceContext->GetData(OcclusionQuery, (void *)&nPixelsWritten, sizeof(UINT64), 0); + + D3D11_QUERY_DATA_PIPELINE_STATISTICS stats; + deviceContext->GetData(StatsQuery, (void *)&stats, sizeof(D3D11_QUERY_DATA_PIPELINE_STATISTICS), 0); + + TCHAR nPixelsWrittenMsg[256]; + _stprintf(nPixelsWrittenMsg, _T("Pixels rendered during error computation: %d\n"), nPixelsWritten); + OutputDebugString(nPixelsWrittenMsg); +#endif + + // Create a copy of the error texture that is accessible by the CPU + ID3D11Texture2D* errorTexCopy; + D3D11_TEXTURE2D_DESC errorTexCopyDesc; + memcpy(&errorTexCopyDesc, &uncompTexDesc, sizeof(D3D11_TEXTURE2D_DESC)); + errorTexCopyDesc.Usage = D3D11_USAGE_STAGING; + errorTexCopyDesc.BindFlags = 0; + errorTexCopyDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + V_RETURN(device->CreateTexture2D(&errorTexCopyDesc, NULL, &errorTexCopy)); + + // Copy the error texture into the copy.... + deviceContext->CopyResource(errorTexCopy, errorTex); + + // Map the staging resource. + D3D11_MAPPED_SUBRESOURCE errorData; + V_RETURN(deviceContext->Map(errorTexCopy, D3D11CalcSubresource(0, 0, 1), D3D11_MAP_READ, 0, &errorData)); + + // Calculate PSNR + ComputeRMSE((const BYTE *)(errorData.pData), errorTexCopyDesc.Width, errorTexCopyDesc.Height); + gSampleUI.SendEvent(IDC_RMSETEXT, true, gSampleUI.GetStatic(IDC_RMSETEXT)); + + // Unmap the staging resources. + deviceContext->Unmap(errorTexCopy, D3D11CalcSubresource(0, 0, 1)); + + // Release resources. 
+ SAFE_RELEASE(errorRTV); + SAFE_RELEASE(errorTex); + SAFE_RELEASE(errorTexCopy); + SAFE_RELEASE(uncompRes); + SAFE_RELEASE(compRes); + SAFE_RELEASE(GPUQuery); + +#ifdef _DEBUG + SAFE_RELEASE(OcclusionQuery); + SAFE_RELEASE(StatsQuery); +#endif + + SAFE_RELEASE(compSRV); + SAFE_RELEASE(uncompSRV); + SAFE_RELEASE(compTex); + SAFE_RELEASE(uncompTex); + + return S_OK; +} + +// Recompresses the already loaded texture and recomputes the error. +HRESULT RecompressTexture() +{ + // Destroy any previously created textures. + SAFE_RELEASE(gErrorSRV); + SAFE_RELEASE(gCompressedSRV); + + // Compress the texture. + HRESULT hr; + V_RETURN(CompressTexture(gUncompressedSRV, &gCompressedSRV)); + + // Compute the error in the compressed texture. + V_RETURN(ComputeError(gUncompressedSRV, gCompressedSRV, &gErrorSRV)); + + return S_OK; +} + +// Store the depth-stencil state. +void StoreDepthStencilState() +{ + DXUTGetD3D11DeviceContext()->OMGetDepthStencilState(&gDepthStencilState, &gStencilReference); +} + +// Restore the depth-stencil state. +void RestoreDepthStencilState() +{ + DXUTGetD3D11DeviceContext()->OMSetDepthStencilState(gDepthStencilState, gStencilReference); +} + +// Disable depth testing. 
+HRESULT DisableDepthTest() +{ + D3D11_DEPTH_STENCIL_DESC depStenDesc; + ZeroMemory(&depStenDesc, sizeof(D3D11_DEPTH_STENCIL_DESC)); + depStenDesc.DepthEnable = FALSE; + depStenDesc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL; + depStenDesc.DepthFunc = D3D11_COMPARISON_LESS; + depStenDesc.StencilEnable = FALSE; + depStenDesc.StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK; + depStenDesc.StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK; + depStenDesc.FrontFace.StencilFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.FrontFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.FrontFace.StencilPassOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.FrontFace.StencilFunc = D3D11_COMPARISON_ALWAYS; + depStenDesc.BackFace.StencilFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.BackFace.StencilDepthFailOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.BackFace.StencilPassOp = D3D11_STENCIL_OP_KEEP; + depStenDesc.BackFace.StencilFunc = D3D11_COMPARISON_ALWAYS; + ID3D11DepthStencilState* depStenState; + HRESULT hr; + V_RETURN(DXUTGetD3D11Device()->CreateDepthStencilState(&depStenDesc, &depStenState)); + + DXUTGetD3D11DeviceContext()->OMSetDepthStencilState(depStenState, 0); + + SAFE_RELEASE(depStenState); + + return S_OK; +} + +void ComputeRMSE(const BYTE *errorData, const INT width, const INT height) { + + const float *w = BC7C::GetErrorMetric(); + + const double wr = w[0]; + const double wg = w[1]; + const double wb = w[2]; + + double MSE = 0.0; + for(int i = 0; i < width; i++) { + for(int j = 0; j < height; j++) { + const INT pixel = ((const INT *)errorData)[j * width + i]; + + double dr = double(pixel & 0xFF) * wr; + double dg = double((pixel >> 8) & 0xFF) * wg; + double db = double((pixel >> 16) & 0xFF) * wb; + + const double pixelMSE = (double(dr) * double(dr)) + (double(dg) * double(dg)) + (double(db) * double(db)); + MSE += pixelMSE; + } + } + + MSE /= (double(width) * double(height)); +#ifdef REPORT_RMSE + gError = sqrt(MSE); +#else + double MAXI = (255.0 * wr) * (255.0 * wr) 
+ (255.0 * wg) * (255.0 * wg) + (255.0 * wb) * (255.0 * wb); + gError= 10 * log10(MAXI/MSE); +#endif +} + +namespace DXTC +{ + VOID CompressImageDXT(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) { + + // If we aren't multi-cored, then just run everything serially. + if(gNumProcessors <= 1) { + CompressImageDXTNoThread(inBuf, outBuf, width, height); + return; + } + + switch(gCompressionScheme->threadMode) { + case eThreadMode_None: + CompressImageDXTNoThread(inBuf, outBuf, width, height); + break; + case eThreadMode_TBB: + CompressImageDXTTBB(inBuf, outBuf, width, height); + break; + case eThreadMode_Win32: + CompressImageDXTWIN(inBuf, outBuf, width, height); + break; + } + } + + CompressionFunc GetCompressionFunc() { + switch(gCompressionScheme->instrSet) + { + case eInstrSet_SSE: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: return DXTC::CompressImageDXT1SSE2; + case eCompType_DXT5: return DXTC::CompressImageDXT5SSE2; + case eCompType_BC7: return BC7C::CompressImageBC7SIMD; + } + } + break; + + case eInstrSet_Scalar: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: return DXTC::CompressImageDXT1; + case eCompType_DXT5: return DXTC::CompressImageDXT5; + case eCompType_BC7: return BC7C::CompressImageBC7; + } + } + break; + +#ifdef ENABLE_AVX2 + case eInstrSet_AVX2: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: return DXTC::CompressImageDXT1AVX2; + case eCompType_DXT5: return DXTC::CompressImageDXT5AVX2; + } + } +#endif + } + return NULL; + } + + void CompressImageDXTNoThread(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) { + + CompressionFunc cmpFunc = GetCompressionFunc(); + + if(cmpFunc == NULL) { + OutputDebugString(L"DXTC::CompressImageDXTNoThread -- Compression Scheme not implemented!\n"); + return; + } + + // Do the compression. + (*cmpFunc)(inBuf, outBuf, width, height); + } + + // Use the TBB task manager to compress an image with DXT compression. 
+ VOID CompressImageDXTTBB(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) + { + // Initialize the data. + DXTTaskData data; + data.inBuf = inBuf; + data.outBuf = outBuf; + data.width = width; + data.height = height; + data.numBlocks = width * height / 16; + if(gCompressionScheme->instrSet == eInstrSet_AVX2) { + data.numBlocks = width * height / 32; + } + data.kBlocksPerTask = gBlocksPerTask; + + // Compute the task count. + UINT taskCount = (UINT)ceil((float)data.numBlocks / gBlocksPerTask); + + // Create the task set. + TASKSETFUNC taskFunc = NULL; + switch(gCompressionScheme->instrSet) + { + case eInstrSet_SSE: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: taskFunc = DXTC::CompressImageDXT1SSE2Task; break; + case eCompType_DXT5: taskFunc = DXTC::CompressImageDXT5SSE2Task; break; + } + } + break; + + case eInstrSet_Scalar: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: taskFunc = DXTC::CompressImageDXT1Task; break; + case eCompType_DXT5: taskFunc = DXTC::CompressImageDXT5Task; break; + } + } + break; + +#ifdef ENABLE_AVX2 + case eInstrSet_AVX2: + { + switch(gCompressionScheme->type) { + case eCompType_DXT1: taskFunc = DXTC::CompressImageDXT1AVX2Task; break; + case eCompType_DXT5: taskFunc = DXTC::CompressImageDXT5AVX2Task; break; + } + } + break; +#endif + } + + TASKSETHANDLE taskSet; + gTaskMgr.CreateTaskSet(taskFunc, &data, taskCount, NULL, 0, "Fast Texture Compression", &taskSet); + if(taskSet == TASKSETHANDLE_INVALID) + { + return; + } + + // Wait for the task set. + gTaskMgr.WaitForSet(taskSet); + + // Release the task set. 
+ gTaskMgr.ReleaseHandle(taskSet); + taskSet = TASKSETHANDLE_INVALID; + } + + int GetBlocksPerLoop() { + if(gCompressionScheme->instrSet == eInstrSet_AVX2) + return 2; + return 1; + } + + int GetBytesPerBlock() { + switch(gCompressionScheme->type) { + default: + case eCompType_DXT1: + return 8; + + case eCompType_DXT5: + case eCompType_BC7: + return 16; + } + } + + VOID CompressImageDXTWIN(const BYTE* inBuf, BYTE* outBuf, INT width, INT height) { + + const int numThreads = gNumWinThreads; + const int blocksPerLoop = GetBlocksPerLoop(); + const int bytesPerBlock = GetBytesPerBlock(); + + // We want to split the data evenly among all threads. + const int kNumPixels = width * height; + const int kNumBlocks = kNumPixels >> (3 + blocksPerLoop); + const int kBlocksPerRow = width >> (1 + blocksPerLoop); + + const int kBlocksPerThread = kNumBlocks / numThreads; + const int kBlocksPerColumn = height >> 2; + const int kBlockRowsPerThread = kBlocksPerThread / kBlocksPerRow; + const int kBlockColsPerThread = kBlocksPerThread % kBlocksPerRow; + const int kOffsetPerThread = kBlockRowsPerThread * width * 4 * 4 + kBlockColsPerThread * 4 * 4 * (blocksPerLoop); + const int kHeightPerThread = (blocksPerLoop * 16 * kBlocksPerThread) / width; + + CompressionFunc cmpFunc = GetCompressionFunc(); + if(cmpFunc == NULL) { + OutputDebugString(L"DXTC::CompressImageDXTNoThread -- Compression Scheme not implemented!\n"); + return; + } + + // Load the threads. + for(int threadIdx = 0; threadIdx < numThreads; threadIdx++) { + + WinThreadData *data = &gWinThreadData[threadIdx]; + data->inBuf = inBuf + (threadIdx * kOffsetPerThread); + data->outBuf = outBuf + (threadIdx * kBlocksPerThread * blocksPerLoop * bytesPerBlock); + data->width = width; + data->height = kHeightPerThread; + data->cmpFunc = cmpFunc; + data->state = eThreadState_DataLoaded; + data->threadIdx = threadIdx; + } + + // Send the event for the threads to start. 
        // Kick the workers: clear the "done" gate, then raise the start
        // event that every worker thread is blocked on.
        CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadDoneEvent));
        CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadStartEvent));

        // Wait for all the threads to finish
        if(WAIT_FAILED == WaitForMultipleObjects(numThreads, gWinThreadWorkEvent, TRUE, INFINITE))
            ReportWinThreadError(TEXT("CompressImageDXTWIN -- WaitForMultipleObjects"));

        // Reset the start event
        CHECK_WIN_THREAD_FUNC(ResetEvent(gWinThreadStartEvent));
        CHECK_WIN_THREAD_FUNC(SetEvent(gWinThreadDoneEvent));
    }

    // Worker-thread entry point for the Win32 threading path. Each thread
    // waits on the shared start event, compresses the slice described by its
    // WinThreadData, signals its per-thread work event, then blocks on the
    // done gate until the main thread has reset the start event.
    // NOTE(review): data->state is read and written by both the main thread
    // and this worker without explicit synchronization -- confirm the
    // event handshake makes those accesses safe.
    DWORD WINAPI CompressImageDXTWinThread( LPVOID lpParam ) {
        WinThreadData *data = (WinThreadData *)lpParam;

        // Loop until the main thread flips our state to eThreadState_Done.
        while(data->state != eThreadState_Done) {

            if(WAIT_FAILED == WaitForSingleObject(gWinThreadStartEvent, INFINITE))
                ReportWinThreadError(TEXT("CompressImageDXTWinThread -- WaitForSingleObject"));

            // Re-check after waking: the start event may have been raised to
            // shut the pool down rather than to submit work.
            if(data->state == eThreadState_Done)
                break;

            data->state = eThreadState_Running;
            // Compress this thread's slice of the image.
            (*(data->cmpFunc))(data->inBuf, data->outBuf, data->width, data->height);

            data->state = eThreadState_WaitForData;

            // Signal our work-done event and atomically wait on the done
            // gate so we cannot loop back around before the main thread has
            // reset the start event.
            HANDLE workEvent = gWinThreadWorkEvent[data->threadIdx];
            if(WAIT_FAILED == SignalObjectAndWait(workEvent, gWinThreadDoneEvent, INFINITE, FALSE))
                ReportWinThreadError(TEXT("CompressImageDXTWinThread -- SignalObjectAndWait"));
        }

        return 0;
    }
}
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c296081
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,4 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+PROJECT(TexC)
+
+ADD_SUBDIRECTORY(BPTCEncoder)
diff --git a/QtGUI/CMakeLists.txt b/QtGUI/CMakeLists.txt
new file mode 100644
index 0000000..e69de29