// The MIT License(MIT) // // Copyright(c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy of // this software and associated documentation files(the "Software"), to deal in // the Software without restriction, including without limitation the rights to // use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies of // the Software, and to permit persons to whom the Software is furnished to do so, // subject to the following conditions : // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. //--------------------------------------------------------------------------------- // NVIDIA Image Scaling SDK - v1.0 //--------------------------------------------------------------------------------- // The NVIDIA Image Scaling SDK provides a single spatial scaling and sharpening algorithm // for cross-platform support. The scaling algorithm uses a 6-tap scaling filter combined // with 4 directional scaling and adaptive sharpening filters, which creates nice smooth images // and sharp edges. In addition, the SDK provides a state-of-the-art adaptive directional sharpening algorithm // for use in applications where no scaling is required. // // The directional scaling and sharpening algorithm is named NVScaler while the adaptive-directional-sharpening-only // algorithm is named NVSharpen. Both algorithms are provided as compute shaders and // developers are free to integrate them in their applications. Note that if you integrate NVScaler, you // should NOT integrate NVSharpen, as NVScaler already includes a sharpening pass // // Pipeline Placement // ------------------ // The call into the NVIDIA Image Scaling shaders must occur during the post-processing phase after tone-mapping. // Applying the scaling in linear HDR in-game color-space may result in a sharpening effect that is // either not visible or too strong. Since sharpening algorithms can enhance noisy or grainy regions, it is recommended // that certain effects such as film grain should occur after NVScaler or NVSharpen. Low-pass filters such as motion blur or // light bloom are recommended to be applied before NVScaler or NVSharpen to avoid sharpening attenuation. // // Color Space and Ranges // ---------------------- // NVIDIA Image Scaling shaders can process color textures stored as either LDR or HDR with the following // restrictions: // 1) LDR // - The range of color values must be in the [0, 1] range // - The input color texture must be in display-referred color-space after tone mapping and OETF (gamma-correction) // has been applied // 2) HDR PQ // - The range of color values must be in the [0, 1] range // - The input color texture must be in display-referred color-space after tone mapping with Rec.2020 PQ OETF applied // 3) HDR Linear // - The recommended range of color values is [0, 12.5], where luminance value (as per BT. 709) of // 1.0 maps to brightness value of 80nits (sRGB peak) and 12.5 maps to 1000nits // - The input color texture may have luminance values that are either linear and scene-referred or // linear and display-referred (after tone mapping) // // If the input color texture sent to NVScaler/NVSharpen is in HDR format set NIS_HDR_MODE define to either // NIS_HDR_MODE_LINEAR (1) or NIS_HDR_MODE_PQ (2). // // Supported Texture Formats // ------------------------- // Input and output formats: // Input and output formats are expected to be in the rages defined in previous section and should be // specified using non-integer data types such as DXGI_FORMAT_R8G8B8A8_UNORM. // // Coefficients formats: // The scaler coefficients and USM coefficients format should be specified using float4 type such as // DXGI_FORMAT_R32G32B32A32_FLOAT or DXGI_FORMAT_R16G16B16A16_FLOAT. // // Resource States, Buffers, and Sampler: // The game or application calling NVIDIA Image Scaling SDK shaders must ensure that the textures are in // the correct state. // - Input color textures must be in pixel shader read state. Shader Resource View (SRV) in DirectX // - The output texture must be in read/write state. Unordered Access View (UAV) in DirectX // - The coefficients texture for NVScaler must be in read state. Shader Resource View (SRV) in DirectX // - The configuration variables must be passed as constant buffer. Constant Buffer View (CBV) in DirectX // - The sampler for texture pixel sampling. Linear clamp SamplerState in Direct // // Adding NVIDIA Image Scaling SDK to a Project // -------------------------------------------- // Include NIS_Scaler.h directly in your application or alternative use the provided NIS_Main.hlsl shader file. // Use NIS_Config.h to get the ideal shader dispatch values for your platform, to configure the algorithm constant // values (NVScalerUpdateConfig, and NVSharpenUpdateConfig), and to access the algorithm coefficients (coef_scale and coef_USM). // // Defines: // NIS_SCALER: default (1) NVScaler, (0) fast NVSharpen only, no upscaling // NIS_HDR_MODE: default (0) disabled, (1) Linear, (2) PQ // NIS_BLOCK_WIDTH: pixels per block width. Use GetOptimalBlockWidth query for your platform // NIS_BLOCK_HEIGHT: pixels per block height. Use GetOptimalBlockHeight query for your platform // NIS_THREAD_GROUP_SIZE: number of threads per group. Use GetOptimalThreadGroupSize query for your platform // NIS_USE_HALF_PRECISION: default (0) disabled, (1) enable half pression computation // NIS_HLSL: (1) enabled, (0) disabled // NIS_HLSL_6_2: default (0) HLSL v5, (1) HLSL v6.2 forces NIS_HLSL=1 // NIS_GLSL: (1) enabled, (0) disabled // NIS_VIEWPORT_SUPPORT: default(0) disabled, (1) enable input/output viewport support // // Default NVScaler shader constants: // [NIS_BLOCK_WIDTH, NIS_BLOCK_HEIGHT, NIS_THREAD_GROUP_SIZE] = [32, 24, 256] // // Default NVSharpen shader constants: // [NIS_BLOCK_WIDTH, NIS_BLOCK_HEIGHT, NIS_THREAD_GROUP_SIZE] = [32, 32, 256] //--------------------------------------------------------------------------------- // NVScaler enable by default. Set to 0 for NVSharpen only #ifndef NIS_SCALER #define NIS_SCALER 1 #endif // HDR Modes #define NIS_HDR_MODE_NONE 0 #define NIS_HDR_MODE_LINEAR 1 #define NIS_HDR_MODE_PQ 2 #ifndef NIS_HDR_MODE #define NIS_HDR_MODE NIS_HDR_MODE_NONE #endif #define kHDRCompressionFactor 0.282842712f // Viewport support #ifndef NIS_VIEWPORT_SUPPORT #define NIS_VIEWPORT_SUPPORT 0 #endif // HLSL, GLSL #if NIS_HLSL==0 && !defined(NIS_GLSL) #define NIS_GLSL 1 #endif #if NIS_HLSL_6_2 || (!NIS_GLSL && !NIS_HLSL) #if defined(NIS_HLSL) #undef NIS_HLSL #endif #define NIS_HLSL 1 #endif #if NIS_HLSL && NIS_GLSL #undef NIS_GLSL #define NIS_GLSL 0 #endif // Half precision #ifndef NIS_USE_HALF_PRECISION #define NIS_USE_HALF_PRECISION 0 #endif #if NIS_HLSL // Generic type and function aliases for HLSL #define NVF float #define NVF2 float2 #define NVF3 float3 #define NVF4 float4 #define NVI int #define NVI2 int2 #define NVU uint #define NVU2 uint2 #if NIS_USE_HALF_PRECISION #if NIS_HLSL_6_2 #define NVH float16_t #define NVH2 float16_t2 #define NVH3 float16_t3 #define NVH4 float16_t4 #else #define NVH min16float #define NVH2 min16float2 #define NVH3 min16float3 #define NVH4 min16float4 #endif // NIS_HLSL_6_2 #else // FP32 types #define NVH NVF #define NVH2 NVF2 #define NVH3 NVF3 #define NVH4 NVF4 #endif // NIS_USE_HALF_PRECISION #define NVSHARED groupshared #define NVTEX_LOAD(x, pos) x[pos] #define NVTEX_SAMPLE(x, sampler, pos) x.SampleLevel(sampler, pos, 0) #define NVTEX_SAMPLE_RED(x, sampler, pos) x.GatherRed(sampler, pos) #define NVTEX_SAMPLE_GREEN(x, sampler, pos) x.GatherGreen(sampler, pos) #define NVTEX_SAMPLE_BLUE(x, sampler, pos) x.GatherBlue(sampler, pos) #define NVTEX_STORE(x, pos, v) x[pos] = v #ifndef NIS_UNROLL #define NIS_UNROLL [unroll] #endif #endif // NIS_HLSL // Generic type and function aliases for GLSL #if NIS_GLSL #define NVF float #define NVF2 vec2 #define NVF3 vec3 #define NVF4 vec4 #define NVI int #define NVI2 ivec2 #define NVU uint #define NVU2 uvec2 #if NIS_USE_HALF_PRECISION #define NVH float16_t #define NVH2 f16vec2 #define NVH3 f16vec3 #define NVH4 f16vec4 #else // FP32 types #define NVH NVF #define NVH2 NVF2 #define NVH3 NVF3 #define NVH4 NVF4 #endif // NIS_USE_HALF_PRECISION #define NVSHARED shared #define NVTEX_LOAD(x, pos) texelFetch(sampler2D(x, samplerLinearClamp), pos, 0) #define NVTEX_SAMPLE(x, sampler, pos) textureLod(sampler2D(x, sampler), pos, 0) #define NVTEX_SAMPLE_RED(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 0) #define NVTEX_SAMPLE_GREEN(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 1) #define NVTEX_SAMPLE_BLUE(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 2) #define NVTEX_STORE(x, pos, v) imageStore(x, NVI2(pos), v) #define saturate(x) clamp(x, 0, 1) #define lerp(a, b, x) mix(a, b, x) #define GroupMemoryBarrierWithGroupSync() groupMemoryBarrier(); barrier() #ifndef NIS_UNROLL #define NIS_UNROLL #endif #endif // NIS_GLSL // Texture gather #ifndef NIS_TEXTURE_GATHER #define NIS_TEXTURE_GATHER 0 #endif // NIS Scaling #define NIS_SCALE_INT 1 #define NIS_SCALE_FLOAT NVF(1.f) NVF getY(NVF3 rgba) { #if NIS_HDR_MODE == NIS_HDR_MODE_PQ return NVF(0.262f) * rgba.x + NVF(0.678f) * rgba.y + NVF(0.0593f) * rgba.z; #elif NIS_HDR_MODE == NIS_HDR_MODE_LINEAR return sqrt(NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z) * kHDRCompressionFactor; #else return NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z; #endif } NVF getYLinear(NVF3 rgba) { return NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z; } #if NIS_SCALER NVF4 GetEdgeMap(NVF p[4][4], NVI i, NVI j) #else NVF4 GetEdgeMap(NVF p[5][5], NVI i, NVI j) #endif { const NVF g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]); const NVF g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]); const NVF g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]); const NVF g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]); const NVF g_0_90_max = max(g_0, g_90); const NVF g_0_90_min = min(g_0, g_90); const NVF g_45_135_max = max(g_45, g_135); const NVF g_45_135_min = min(g_45, g_135); NVF e_0_90 = 0; NVF e_45_135 = 0; if ((g_0_90_max + g_45_135_max) != 0) { e_0_90 = g_0_90_max / (g_0_90_max + g_45_135_max); e_0_90 = min(e_0_90, 1.0f); e_45_135 = 1.0f - e_0_90; } NVF e = ((g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min)) ? 1.f : 0.f; NVF edge_0 = (g_0_90_max == g_0) ? e : 0.f; NVF edge_90 = (g_0_90_max == g_0) ? 0.f : e; e = ((g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min)) ? 1.f : 0.f; NVF edge_45 = (g_45_135_max == g_45) ? e : 0.f; NVF edge_135 = (g_45_135_max == g_45) ? 0.f : e; NVF weight_0 = 0.f; NVF weight_90 = 0.f; NVF weight_45 = 0.f; NVF weight_135 = 0.f; if ((edge_0 + edge_90 + edge_45 + edge_135) >= 2.0f) { weight_0 = (edge_0 == 1.0f) ? e_0_90 : 0.f; weight_90 = (edge_0 == 1.0f) ? 0.f : e_0_90; weight_45 = (edge_45 == 1.0f) ? e_45_135 : 0.f; weight_135 = (edge_45 == 1.0f) ? 0.f : e_45_135; } else if ((edge_0 + edge_90 + edge_45 + edge_135) >= 1.0f) { weight_0 = edge_0; weight_90 = edge_90; weight_45 = edge_45; weight_135 = edge_135; } return NVF4(weight_0, weight_90, weight_45, weight_135); } #if NIS_SCALER #ifndef NIS_BLOCK_WIDTH #define NIS_BLOCK_WIDTH 32 #endif #ifndef NIS_BLOCK_HEIGHT #define NIS_BLOCK_HEIGHT 24 #endif #ifndef NIS_THREAD_GROUP_SIZE #define NIS_THREAD_GROUP_SIZE 256 #endif #define kPhaseCount 64 #define kFilterSize 6 #define kSupportSize 6 #define kPadSize kSupportSize // 'Tile' is the region of source luminance values that we load into shPixelsY. // It is the area of source pixels covered by the destination 'Block' plus a // 3 pixel border of support pixels. #define kTilePitch (NIS_BLOCK_WIDTH + kPadSize) #define kTileSize (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize)) // 'EdgeMap' is the region of source pixels for which edge map vectors are derived. // It is the area of source pixels covered by the destination 'Block' plus a // 1 pixel border. #define kEdgeMapPitch (NIS_BLOCK_WIDTH + 2) #define kEdgeMapSize (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2)) NVSHARED NVF shPixelsY[kTileSize]; NVSHARED NVH shCoefScaler[kPhaseCount][kFilterSize]; NVSHARED NVH shCoefUSM[kPhaseCount][kFilterSize]; NVSHARED NVH4 shEdgeMap[kEdgeMapSize]; void LoadFilterBanksSh(NVI i0, NVI di) { // Load up filter banks to shared memory // The work is spread over (kPhaseCount * 2) threads for (NVI i = i0; i < kPhaseCount * 2; i += di) { NVI phase = i / 2; NVI vIdx = i & 1; NVH4 v = NVH4(NVTEX_LOAD(coef_scaler, NVI2(vIdx, phase))); NVI filterOffset = vIdx * 4; shCoefScaler[phase][filterOffset + 0] = v.x; shCoefScaler[phase][filterOffset + 1] = v.y; if (vIdx == 0) { shCoefScaler[phase][2] = v.z; shCoefScaler[phase][3] = v.w; } v = NVH4(NVTEX_LOAD(coef_usm, NVI2(vIdx, phase))); shCoefUSM[phase][filterOffset + 0] = v.x; shCoefUSM[phase][filterOffset + 1] = v.y; if (vIdx == 0) { shCoefUSM[phase][2] = v.z; shCoefUSM[phase][3] = v.w; } } } NVF CalcLTI(NVF p0, NVF p1, NVF p2, NVF p3, NVF p4, NVF p5, NVI phase_index) { const bool selector = (phase_index <= kPhaseCount / 2); NVF sel = selector ? p0 : p3; const NVF a_min = min(min(p1, p2), sel); const NVF a_max = max(max(p1, p2), sel); sel = selector ? p2 : p5; const NVF b_min = min(min(p3, p4), sel); const NVF b_max = max(max(p3, p4), sel); const NVF a_cont = a_max - a_min; const NVF b_cont = b_max - b_min; const NVF cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps); return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost; } NVF4 GetInterpEdgeMap(const NVF4 edge[2][2], NVF phase_frac_x, NVF phase_frac_y) { NVF4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x); NVF4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x); return lerp(h0, h1, phase_frac_y); } NVF EvalPoly6(const NVF pxl[6], NVI phase_int) { NVF y = 0.f; { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { y += shCoefScaler[phase_int][i] * pxl[i]; } } NVF y_usm = 0.f; { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { y_usm += shCoefUSM[phase_int][i] * pxl[i]; } } // let's compute a piece-wise ramp based on luma const NVF y_scale = 1.0f - saturate((y * (1.0f / NIS_SCALE_FLOAT) - kSharpStartY) * kSharpScaleY); // scale the ramp to sharpen as a function of luma const NVF y_sharpness = y_scale * kSharpStrengthScale + kSharpStrengthMin; y_usm *= y_sharpness; // scale the ramp to limit USM as a function of luma const NVF y_sharpness_limit = (y_scale * kSharpLimitScale + kSharpLimitMin) * y; y_usm = min(y_sharpness_limit, max(-y_sharpness_limit, y_usm)); // reduce ringing y_usm *= CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int); return y + y_usm; } NVF FilterNormal(const NVF p[6][6], NVI phase_x_frac_int, NVI phase_y_frac_int) { NVF h_acc = 0.0f; NIS_UNROLL for (NVI j = 0; j < 6; ++j) { NVF v_acc = 0.0f; NIS_UNROLL for (NVI i = 0; i < 6; ++i) { v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i]; } h_acc += v_acc * shCoefScaler[phase_x_frac_int][j]; } // let's return the sum unpacked -> we can accumulate it later return h_acc; } NVF4 GetDirFilters(NVF p[6][6], NVF phase_x_frac, NVF phase_y_frac, NVI phase_x_frac_int, NVI phase_y_frac_int) { NVF4 f; // 0 deg filter NVF interp0Deg[6]; { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac); } } f.x = EvalPoly6(interp0Deg, phase_y_frac_int); // 90 deg filter NVF interp90Deg[6]; { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac); } } f.y = EvalPoly6(interp90Deg, phase_x_frac_int); //45 deg filter NVF pphase_b45; pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac); NVF temp_interp45Deg[7]; temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45); temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45); temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45); { pphase_b45 = pphase_b45 - 0.5f; NVF a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0]; NVF b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1]; NVF c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2]; NVF d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3]; temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45)); temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45)); temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45)); temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45)); } NVF interp45Deg[6]; NVF pphase_p45 = phase_x_frac + phase_y_frac; if (pphase_p45 >= 1) { NIS_UNROLL for (NVI i = 0; i < 6; i++) { interp45Deg[i] = temp_interp45Deg[i + 1]; } pphase_p45 = pphase_p45 - 1; } else { NIS_UNROLL for (NVI i = 0; i < 6; i++) { interp45Deg[i] = temp_interp45Deg[i]; } } f.z = EvalPoly6(interp45Deg, NVI(pphase_p45 * 64)); //135 deg filter NVF pphase_b135; pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac); NVF temp_interp135Deg[7]; temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135); temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135); temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135); { pphase_b135 = pphase_b135 - 0.5f; NVF a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0]; NVF b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1]; NVF c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2]; NVF d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3]; temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135)); temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135)); temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135)); temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135)); } NVF interp135Deg[6]; NVF pphase_p135 = 1 + (phase_x_frac - phase_y_frac); if (pphase_p135 >= 1) { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { interp135Deg[i] = temp_interp135Deg[i + 1]; } pphase_p135 = pphase_p135 - 1; } else { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { interp135Deg[i] = temp_interp135Deg[i]; } } f.w = EvalPoly6(interp135Deg, NVI(pphase_p135 * 64)); return f; } //----------------------------------------------------------------------------------------------- // NVScaler //----------------------------------------------------------------------------------------------- void NVScaler(NVU2 blockIdx, NVU threadIdx) { // Figure out the range of pixels from input image that would be needed to be loaded for this thread-block NVI dstBlockX = NVI(NIS_BLOCK_WIDTH * blockIdx.x); NVI dstBlockY = NVI(NIS_BLOCK_HEIGHT * blockIdx.y); const NVI srcBlockStartX = NVI(floor((dstBlockX + 0.5f) * kScaleX - 0.5f)); const NVI srcBlockStartY = NVI(floor((dstBlockY + 0.5f) * kScaleY - 0.5f)); const NVI srcBlockEndX = NVI(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f)); const NVI srcBlockEndY = NVI(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f)); NVI numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1; NVI numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1; // round-up load region to even size since we're loading in 2x2 batches numTilePixelsX += numTilePixelsX & 0x1; numTilePixelsY += numTilePixelsY & 0x1; const NVI numTilePixels = numTilePixelsX * numTilePixelsY; // calculate the equivalent values for the edge map const NVI numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2; const NVI numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2; const NVI numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY; // fill in input luma tile (shPixelsY) in batches of 2x2 pixels // we use texture gather to get extra support necessary // to compute 2x2 edge map outputs too { for (NVI i = NVI(threadIdx) * 2; i < numTilePixels / 2; i += NIS_THREAD_GROUP_SIZE * 2) { NVI py = (i / numTilePixelsX) * 2; NVI px = i % numTilePixelsX; // 0.5 to be in the center of texel // -1.0 to sample top-left corner of 3x3 halo necessary // -kSupportSize/2 to shift by the kernel support size NVF kShift = 0.5f - 1.0f - (kSupportSize - 1) / 2; #if NIS_VIEWPORT_SUPPORT const NVF tx = (srcBlockStartX + px + kInputViewportOriginX + kShift) * kSrcNormX; const NVF ty = (srcBlockStartY + py + kInputViewportOriginY + kShift) * kSrcNormY; #else const NVF tx = (srcBlockStartX + px + kShift) * kSrcNormX; const NVF ty = (srcBlockStartY + py + kShift) * kSrcNormY; #endif NVF p[2][2]; #if NIS_TEXTURE_GATHER { const NVF4 sr = NVTEX_SAMPLE_RED(in_texture, samplerLinearClamp, NVF2(tx, ty)); const NVF4 sg = NVTEX_SAMPLE_GREEN(in_texture, samplerLinearClamp, NVF2(tx, ty)); const NVF4 sb = NVTEX_SAMPLE_BLUE(in_texture, samplerLinearClamp, NVF2(tx, ty)); p[0][0] = getY(NVF3(sr.w, sg.w, sb.w)); p[0][1] = getY(NVF3(sr.z, sg.z, sb.z)); p[1][0] = getY(NVF3(sr.x, sg.x, sb.x)); p[1][1] = getY(NVF3(sr.y, sg.y, sb.y)); } #else NIS_UNROLL for (NVI j = 0; j < 2; j++) { NIS_UNROLL for (NVI k = 0; k < 2; k++) { const NVF4 px = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2(tx + k * kSrcNormX, ty + j * kSrcNormY)); p[j][k] = getY(px.xyz); } } #endif const NVI idx = py * kTilePitch + px; shPixelsY[idx] = NVH(p[0][0]); shPixelsY[idx + 1] = NVH(p[0][1]); shPixelsY[idx + kTilePitch] = NVH(p[1][0]); shPixelsY[idx + kTilePitch + 1] = NVH(p[1][1]); } } GroupMemoryBarrierWithGroupSync(); { // fill in the edge map of 2x2 pixels for (NVI i = NVI(threadIdx) * 2; i < numEdgeMapPixels / 2; i += NIS_THREAD_GROUP_SIZE * 2) { NVI py = (i / numEdgeMapPixelsX) * 2; NVI px = i % numEdgeMapPixelsX; const NVI edgeMapIdx = py * kEdgeMapPitch + px; NVI tileCornerIdx = (py+1) * kTilePitch + px + 1; NVF p[4][4]; NIS_UNROLL for (NVI j = 0; j < 4; j++) { NIS_UNROLL for (NVI k = 0; k < 4; k++) { p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k]; } } shEdgeMap[edgeMapIdx] = NVH4(GetEdgeMap(p, 0, 0)); shEdgeMap[edgeMapIdx + 1] = NVH4(GetEdgeMap(p, 0, 1)); shEdgeMap[edgeMapIdx + kEdgeMapPitch] = NVH4(GetEdgeMap(p, 1, 0)); shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = NVH4(GetEdgeMap(p, 1, 1)); } } LoadFilterBanksSh(NVI(threadIdx), NIS_THREAD_GROUP_SIZE); GroupMemoryBarrierWithGroupSync(); for (NVI k = NVI(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE) { const NVI2 pos = NVI2(k % NIS_BLOCK_WIDTH, k / NIS_BLOCK_WIDTH); const NVI dstX = dstBlockX + pos.x; const NVI dstY = dstBlockY + pos.y; const NVF srcX = (0.5f + dstX) * kScaleX - 0.5f; const NVF srcY = (0.5f + dstY) * kScaleY - 0.5f; #if NIS_VIEWPORT_SUPPORT if (srcX > kInputViewportWidth || srcY > kInputViewportHeight || dstX > kOutputViewportWidth || dstY > kOutputViewportHeight) { return; } #endif const NVI px = NVI(floor(srcX) - srcBlockStartX); const NVI py = NVI(floor(srcY) - srcBlockStartY); const NVI startTileIdx = py * kTilePitch + px; // load 6x6 support to regs NVF p[6][6]; { NIS_UNROLL for (NVI i = 0; i < 6; ++i) { NIS_UNROLL for (NVI j = 0; j < 6; ++j) { p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j]; } } } // compute discretized filter phase const NVF fx = srcX - floor(srcX); const NVF fy = srcY - floor(srcY); const NVI fx_int = NVI(fx * kPhaseCount); const NVI fy_int = NVI(fy * kPhaseCount); // get traditional scaler filter output const NVF pixel_n = FilterNormal(p, fx_int, fy_int); // get directional filter bank output NVF4 opDirYU = GetDirFilters(p, fx, fy, fx_int, fy_int); // final luma is a weighted product of directional & normal filters // generate weights for directional filters const NVI startEdgeMapIdx = py * kEdgeMapPitch + px; NVF4 edge[2][2]; NIS_UNROLL for (NVI i = 0; i < 2; i++) { NIS_UNROLL for (NVI j = 0; j < 2; j++) { // need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j]; } } const NVF4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT; // final pixel is a weighted sum filter outputs const NVF opY = (opDirYU.x * w.x + opDirYU.y * w.y + opDirYU.z * w.z + opDirYU.w * w.w + pixel_n * (NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w)) * (1.0f / NIS_SCALE_FLOAT); // do bilinear tap for chroma upscaling #if NIS_VIEWPORT_SUPPORT NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((srcX + kInputViewportOriginX) * kSrcNormX, (srcY + kInputViewportOriginY) * kSrcNormY)); #else NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY)); #endif #if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR const NVF kEps = 1e-4f; const NVF kNorm = 1.0f / (NIS_SCALE_FLOAT * kHDRCompressionFactor); const NVF opYN = max(opY, 0.0f) * kNorm; const NVF corr = (opYN * opYN + kEps) / (max(getYLinear(NVF3(op.x, op.y, op.z)), 0.0f) + kEps); op.x *= corr; op.y *= corr; op.z *= corr; #else const NVF corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(NVF3(op.x, op.y, op.z)); op.x += corr; op.y += corr; op.z += corr; #endif #if NIS_VIEWPORT_SUPPORT NVTEX_STORE(out_texture, NVU2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY), op); #else NVTEX_STORE(out_texture, NVU2(dstX, dstY), op); #endif } } #else #ifndef NIS_BLOCK_WIDTH #define NIS_BLOCK_WIDTH 32 #endif #ifndef NIS_BLOCK_HEIGHT #define NIS_BLOCK_HEIGHT 32 #endif #ifndef NIS_THREAD_GROUP_SIZE #define NIS_THREAD_GROUP_SIZE 256 #endif #define kSupportSize 5 #define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1) #define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1) NVSHARED NVF shPixelsY[kNumPixelsY][kNumPixelsX]; NVF CalcLTIFast(const NVF y[5]) { const NVF a_min = min(min(y[0], y[1]), y[2]); const NVF a_max = max(max(y[0], y[1]), y[2]); const NVF b_min = min(min(y[2], y[3]), y[4]); const NVF b_max = max(max(y[2], y[3]), y[4]); const NVF a_cont = a_max - a_min; const NVF b_cont = b_max - b_min; const NVF cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps * (1.0f / NIS_SCALE_FLOAT)); return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost; } NVF EvalUSM(const NVF pxl[5], const NVF sharpnessStrength, const NVF sharpnessLimit) { // USM profile NVF y_usm = -0.6001f * pxl[1] + 1.2002f * pxl[2] - 0.6001f * pxl[3]; // boost USM profile y_usm *= sharpnessStrength; // clamp to the limit y_usm = min(sharpnessLimit, max(-sharpnessLimit, y_usm)); // reduce ringing y_usm *= CalcLTIFast(pxl); return y_usm; } NVF4 GetDirUSM(const NVF p[5][5]) { // sharpness boost & limit are the same for all directions const NVF scaleY = 1.0f - saturate((p[2][2] - kSharpStartY) * kSharpScaleY); // scale the ramp to sharpen as a function of luma const NVF sharpnessStrength = scaleY * kSharpStrengthScale + kSharpStrengthMin; // scale the ramp to limit USM as a function of luma const NVF sharpnessLimit = (scaleY * kSharpLimitScale + kSharpLimitMin) * p[2][2]; NVF4 rval; // 0 deg filter NVF interp0Deg[5]; { for (NVI i = 0; i < 5; ++i) { interp0Deg[i] = p[i][2]; } } rval.x = EvalUSM(interp0Deg, sharpnessStrength, sharpnessLimit); // 90 deg filter NVF interp90Deg[5]; { for (NVI i = 0; i < 5; ++i) { interp90Deg[i] = p[2][i]; } } rval.y = EvalUSM(interp90Deg, sharpnessStrength, sharpnessLimit); //45 deg filter NVF interp45Deg[5]; interp45Deg[0] = p[1][1]; interp45Deg[1] = lerp(p[2][1], p[1][2], 0.5f); interp45Deg[2] = p[2][2]; interp45Deg[3] = lerp(p[3][2], p[2][3], 0.5f); interp45Deg[4] = p[3][3]; rval.z = EvalUSM(interp45Deg, sharpnessStrength, sharpnessLimit); //135 deg filter NVF interp135Deg[5]; interp135Deg[0] = p[3][1]; interp135Deg[1] = lerp(p[3][2], p[2][1], 0.5f); interp135Deg[2] = p[2][2]; interp135Deg[3] = lerp(p[2][3], p[1][2], 0.5f); interp135Deg[4] = p[1][3]; rval.w = EvalUSM(interp135Deg, sharpnessStrength, sharpnessLimit); return rval; } //----------------------------------------------------------------------------------------------- // NVSharpen //----------------------------------------------------------------------------------------------- void NVSharpen(NVU2 blockIdx, NVU threadIdx) { const NVI dstBlockX = NVI(NIS_BLOCK_WIDTH * blockIdx.x); const NVI dstBlockY = NVI(NIS_BLOCK_HEIGHT * blockIdx.y); // fill in input luma tile in batches of 2x2 pixels // we use texture gather to get extra support necessary // to compute 2x2 edge map outputs too const NVF kShift = 0.5f - kSupportSize / 2; for (NVI i = NVI(threadIdx) * 2; i < kNumPixelsX * kNumPixelsY / 2; i += NIS_THREAD_GROUP_SIZE * 2) { NVU2 pos = NVU2(i % kNumPixelsX, i / kNumPixelsX * 2); NIS_UNROLL for (NVI dy = 0; dy < 2; dy++) { NIS_UNROLL for (NVI dx = 0; dx < 2; dx++) { #if NIS_VIEWPORT_SUPPORT const NVF tx = (dstBlockX + pos.x + kInputViewportOriginX + dx + kShift) * kSrcNormX; const NVF ty = (dstBlockY + pos.y + kInputViewportOriginY + dy + kShift) * kSrcNormY; #else const NVF tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX; const NVF ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY; #endif const NVF4 px = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2(tx, ty)); shPixelsY[pos.y + dy][pos.x + dx] = getY(px.xyz); } } } GroupMemoryBarrierWithGroupSync(); for (NVI k = NVI(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE) { const NVI2 pos = NVI2(k % NIS_BLOCK_WIDTH, k / NIS_BLOCK_WIDTH); // load 5x5 support to regs NVF p[5][5]; NIS_UNROLL for (NVI i = 0; i < 5; ++i) { NIS_UNROLL for (NVI j = 0; j < 5; ++j) { p[i][j] = shPixelsY[pos.y + i][pos.x + j]; } } // get directional filter bank output NVF4 dirUSM = GetDirUSM(p); // generate weights for directional filters NVF4 w = GetEdgeMap(p, kSupportSize / 2 - 1, kSupportSize / 2 - 1); // final USM is a weighted sum filter outputs const NVF usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w); // do bilinear tap and correct rgb texel so it produces new sharpened luma const NVI dstX = dstBlockX + pos.x; const NVI dstY = dstBlockY + pos.y; #if NIS_VIEWPORT_SUPPORT if (dstX > kOutputViewportWidth || dstY > kOutputViewportHeight) { return; } #endif #if NIS_VIEWPORT_SUPPORT NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + kInputViewportOriginX) * kSrcNormX, (dstY + kInputViewportOriginY) * kSrcNormY)); #else NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY)); #endif #if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR const NVF kEps = 1e-4f * kHDRCompressionFactor * kHDRCompressionFactor; NVF newY = p[2][2] + usmY; newY = max(newY, 0.0f); const NVF oldY = p[2][2]; const NVF corr = (newY * newY + kEps) / (oldY * oldY + kEps); op.x *= corr; op.y *= corr; op.z *= corr; #else op.x += usmY; op.y += usmY; op.z += usmY; #endif #if NIS_VIEWPORT_SUPPORT NVTEX_STORE(out_texture, NVU2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY), op); #else NVTEX_STORE(out_texture, NVU2(dstX, dstY), op); #endif } } #endif