decoders: Optimize swizzle copy performance (#6790)

This makes UnswizzleTexture up to two times faster. UnswizzleTexture is the main bottleneck in NVDEC video decoding.
This commit is contained in:
yzct12345 2021-08-02 15:18:58 +00:00 committed by GitHub
parent 381aacdbb1
commit f56d0db5bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -18,9 +18,9 @@
namespace Tegra::Texture { namespace Tegra::Texture {
namespace { namespace {
template <bool TO_LINEAR> template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth,
u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { u32 block_height, u32 block_depth, u32 stride_alignment) {
// The origin of the transformation can be configured here, leave it as zero as the current API // The origin of the transformation can be configured here, leave it as zero as the current API
// doesn't expose it. // doesn't expose it.
static constexpr u32 origin_x = 0; static constexpr u32 origin_x = 0;
@ -28,9 +28,9 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
static constexpr u32 origin_z = 0; static constexpr u32 origin_z = 0;
// We can configure here a custom pitch // We can configure here a custom pitch
// As it's not exposed 'width * bpp' will be the expected pitch. // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
const u32 pitch = width * bytes_per_pixel; const u32 pitch = width * BYTES_PER_PIXEL;
const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel; const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL;
const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
@ -54,14 +54,14 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
((block_y & block_height_mask) << GOB_SIZE_SHIFT); ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
for (u32 column = 0; column < width; ++column) { for (u32 column = 0; column < width; ++column) {
const u32 x = (column + origin_x) * bytes_per_pixel; const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
const u32 base_swizzled_offset = offset_z + offset_y + offset_x; const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];
const u32 unswizzled_offset = const u32 unswizzled_offset =
slice * pitch * height + line * pitch + column * bytes_per_pixel; slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset);
offset >= input.size()) { offset >= input.size()) {
@ -73,11 +73,45 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
std::memcpy(dst, src, bytes_per_pixel);
std::memcpy(dst, src, BYTES_PER_PIXEL);
} }
} }
} }
} }
// Runtime entry point that selects a SwizzleImpl instantiation.
//
// The runtime bytes_per_pixel value is mapped onto SwizzleImpl's BYTES_PER_PIXEL template
// argument, so the implementation receives the pixel size as a compile-time constant. TO_LINEAR
// is passed through as SwizzleImpl's first template argument and every remaining argument is
// forwarded unchanged.
//
// Handled pixel sizes are 1, 2, 3, 4, 6, 8, 12 and 16 bytes. Any other value is reported through
// UNREACHABLE_MSG and no SwizzleImpl instantiation is called.
template <bool TO_LINEAR>
void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
    switch (bytes_per_pixel) {
    case 1:
        SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height, block_depth,
                                  stride_alignment);
        break;
    case 2:
        SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height, block_depth,
                                  stride_alignment);
        break;
    case 3:
        SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height, block_depth,
                                  stride_alignment);
        break;
    case 4:
        SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height, block_depth,
                                  stride_alignment);
        break;
    case 6:
        SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height, block_depth,
                                  stride_alignment);
        break;
    case 8:
        SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height, block_depth,
                                  stride_alignment);
        break;
    case 12:
        SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height, block_depth,
                                   stride_alignment);
        break;
    case 16:
        SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height, block_depth,
                                   stride_alignment);
        break;
    default:
        // No instantiation exists for this pixel size.
        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
        break;
    }
}
} // Anonymous namespace } // Anonymous namespace
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,