NIS_Scaler.h
// The MIT License (MIT)
//
// Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//---------------------------------------------------------------------------------
// NVIDIA Image Scaling SDK - v1.0.2
//---------------------------------------------------------------------------------
// The NVIDIA Image Scaling SDK provides a single spatial scaling and sharpening algorithm
// for cross-platform support. The scaling algorithm uses a 6-tap scaling filter combined
// with 4 directional scaling and adaptive sharpening filters, which together produce
// smooth images with sharp edges. In addition, the SDK provides a state-of-the-art
// adaptive directional sharpening algorithm for use in applications where no scaling is required.
//
// The directional scaling and sharpening algorithm is named NVScaler, while the adaptive-directional-sharpening-only
// algorithm is named NVSharpen. Both algorithms are provided as compute shaders and
// developers are free to integrate them in their applications. Note that if you integrate NVScaler, you
// should NOT integrate NVSharpen, as NVScaler already includes a sharpening pass.
//
// Pipeline Placement
// ------------------
// The call into the NVIDIA Image Scaling shaders must occur during the post-processing phase, after tone mapping.
// Applying the scaling in linear HDR in-game color space may result in a sharpening effect that is
// either not visible or too strong. Since sharpening algorithms can enhance noisy or grainy regions, it is
// recommended that effects such as film grain be applied after NVScaler or NVSharpen. Low-pass effects such as
// motion blur or light bloom should be applied before NVScaler or NVSharpen to avoid attenuating the sharpening.
//
// Color Space and Ranges
// ----------------------
// NVIDIA Image Scaling shaders can process color textures stored as either LDR or HDR with the following
// restrictions:
// 1) LDR
// - The range of color values must be in the [0, 1] range
// - The input color texture must be in display-referred color space, after tone mapping and OETF (gamma correction)
// have been applied
// 2) HDR PQ
// - The range of color values must be in the [0, 1] range
// - The input color texture must be in display-referred color space, after tone mapping with the Rec.2020 PQ OETF applied
// 3) HDR Linear
// - The recommended range of color values is [0, 12.5], where a luminance value (as per BT.709) of
// 1.0 maps to a brightness of 80 nits (sRGB peak) and 12.5 maps to 1000 nits
// - The input color texture may have luminance values that are either linear and scene-referred or
// linear and display-referred (after tone mapping)
//
// If the input color texture sent to NVScaler/NVSharpen is in an HDR format, set the NIS_HDR_MODE define to either
// NIS_HDR_MODE_LINEAR (1) or NIS_HDR_MODE_PQ (2).
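//
// For example (a sketch; the define must be set before this header is included):
//
//   #define NIS_HDR_MODE NIS_HDR_MODE_LINEAR   // scene-referred linear HDR input
//   #include "NIS_Scaler.h"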
//
// Supported Texture Formats
// -------------------------
// Input and output formats:
// Input and output formats are expected to be in the ranges defined in the previous section and should be
// specified using non-integer data types such as DXGI_FORMAT_R8G8B8A8_UNORM.
//
// Coefficients formats:
// The scaler coefficients and USM coefficients formats should be specified using a float4 type such as
// DXGI_FORMAT_R32G32B32A32_FLOAT or DXGI_FORMAT_R16G16B16A16_FLOAT.
//
// Resource States, Buffers, and Sampler:
// The game or application calling NVIDIA Image Scaling SDK shaders must ensure that the textures are in
// the correct state.
// - Input color textures must be in pixel shader read state. Shader Resource View (SRV) in DirectX
// - The output texture must be in read/write state. Unordered Access View (UAV) in DirectX
// - The coefficients texture for NVScaler must be in read state. Shader Resource View (SRV) in DirectX
// - The configuration variables must be passed as a constant buffer. Constant Buffer View (CBV) in DirectX
// - The sampler for texture pixel sampling. Linear clamp SamplerState in DirectX (a creation sketch follows below)
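//
// A minimal D3D11 sampler-creation sketch (illustrative; the device pointer and
// the samplerLinearClamp binding are assumed to exist in the calling application):
//
//   D3D11_SAMPLER_DESC desc = {};
//   desc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR;   // linear filtering
//   desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;     // clamp addressing on all axes
//   desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
//   desc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
//   device->CreateSamplerState(&desc, &samplerLinearClamp);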
//
// Adding NVIDIA Image Scaling SDK to a Project
// --------------------------------------------
// Include NIS_Scaler.h directly in your application or alternatively use the provided NIS_Main.hlsl shader file.
// Use NIS_Config.h to get the ideal shader dispatch values for your platform, to configure the algorithm constant
// values (NVScalerUpdateConfig and NVSharpenUpdateConfig), and to access the algorithm coefficients (coef_scale and coef_USM).
//
// Defines (an example setup follows this list):
// NIS_SCALER: default (1) NVScaler, (0) fast NVSharpen only, no upscaling
// NIS_HDR_MODE: default (0) disabled, (1) Linear, (2) PQ
// NIS_BLOCK_WIDTH: pixels per block width. Use the GetOptimalBlockWidth query for your platform
// NIS_BLOCK_HEIGHT: pixels per block height. Use the GetOptimalBlockHeight query for your platform
// NIS_THREAD_GROUP_SIZE: number of threads per group. Use the GetOptimalThreadGroupSize query for your platform
// NIS_USE_HALF_PRECISION: default (0) disabled, (1) enable half-precision computation
// NIS_HLSL: (1) enabled, (0) disabled
// NIS_HLSL_6_2: default (0) HLSL v5, (1) HLSL v6.2; forces NIS_HLSL=1
// NIS_GLSL: (1) enabled, (0) disabled
// NIS_VIEWPORT_SUPPORT: default (0) disabled, (1) enable input/output viewport support
//
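// A typical NVScaler compilation setup might look like the following (a sketch;
// the optimal block and thread-group values should come from the NIS_Config.h
// queries for the target platform):
//
//   #define NIS_SCALER 1
//   #define NIS_BLOCK_WIDTH 32
//   #define NIS_BLOCK_HEIGHT 24
//   #define NIS_THREAD_GROUP_SIZE 256
//   #include "NIS_Scaler.h"
//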
// Default NVScaler shader constants:
// [NIS_BLOCK_WIDTH, NIS_BLOCK_HEIGHT, NIS_THREAD_GROUP_SIZE] = [32, 24, 256]
//
// Default NVSharpen shader constants:
// [NIS_BLOCK_WIDTH, NIS_BLOCK_HEIGHT, NIS_THREAD_GROUP_SIZE] = [32, 32, 256]
//---------------------------------------------------------------------------------
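// A dispatch-size sketch for the C++ side (illustrative names; one thread group
// shades one NIS_BLOCK_WIDTH x NIS_BLOCK_HEIGHT block of output pixels):
//
//   uint gridX = (outputWidth + NIS_BLOCK_WIDTH - 1) / NIS_BLOCK_WIDTH;
//   uint gridY = (outputHeight + NIS_BLOCK_HEIGHT - 1) / NIS_BLOCK_HEIGHT;
//   context->Dispatch(gridX, gridY, 1);
//---------------------------------------------------------------------------------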
// NVScaler is enabled by default. Set to 0 for NVSharpen only
#ifndef NIS_SCALER
#define NIS_SCALER 1
#endif
// HDR Modes
#define NIS_HDR_MODE_NONE 0
#define NIS_HDR_MODE_LINEAR 1
#define NIS_HDR_MODE_PQ 2
#ifndef NIS_HDR_MODE
#define NIS_HDR_MODE NIS_HDR_MODE_NONE
#endif
#define kHDRCompressionFactor 0.282842712f
// Viewport support
#ifndef NIS_VIEWPORT_SUPPORT
#define NIS_VIEWPORT_SUPPORT 0
#endif
// HLSL, GLSL
#if NIS_HLSL==0 && !defined(NIS_GLSL)
#define NIS_GLSL 1
#endif
#if NIS_HLSL_6_2 || (!NIS_GLSL && !NIS_HLSL)
#if defined(NIS_HLSL)
#undef NIS_HLSL
#endif
#define NIS_HLSL 1
#endif
#if NIS_HLSL && NIS_GLSL
#undef NIS_GLSL
#define NIS_GLSL 0
#endif
// Half precision
#ifndef NIS_USE_HALF_PRECISION
#define NIS_USE_HALF_PRECISION 0
#endif
#if NIS_HLSL
// Generic type and function aliases for HLSL
#define NVF float
#define NVF2 float2
#define NVF3 float3
#define NVF4 float4
#define NVI int
#define NVI2 int2
#define NVU uint
#define NVU2 uint2
#define NVB bool
#if NIS_USE_HALF_PRECISION
#if NIS_HLSL_6_2
#define NVH float16_t
#define NVH2 float16_t2
#define NVH3 float16_t3
#define NVH4 float16_t4
#else
#define NVH min16float
#define NVH2 min16float2
#define NVH3 min16float3
#define NVH4 min16float4
#endif // NIS_HLSL_6_2
#else // FP32 types
#define NVH NVF
#define NVH2 NVF2
#define NVH3 NVF3
#define NVH4 NVF4
#endif // NIS_USE_HALF_PRECISION
#define NVSHARED groupshared
#define NVTEX_LOAD(x, pos) x[pos]
#define NVTEX_SAMPLE(x, sampler, pos) x.SampleLevel(sampler, pos, 0)
#define NVTEX_SAMPLE_RED(x, sampler, pos) x.GatherRed(sampler, pos)
#define NVTEX_SAMPLE_GREEN(x, sampler, pos) x.GatherGreen(sampler, pos)
#define NVTEX_SAMPLE_BLUE(x, sampler, pos) x.GatherBlue(sampler, pos)
#define NVTEX_STORE(x, pos, v) x[pos] = v
#ifndef NIS_UNROLL
#define NIS_UNROLL [unroll]
#endif
#endif // NIS_HLSL
// Generic type and function aliases for GLSL
#if NIS_GLSL
#define NVF float
#define NVF2 vec2
#define NVF3 vec3
#define NVF4 vec4
#define NVI int
#define NVI2 ivec2
#define NVU uint
#define NVU2 uvec2
#define NVB bool
#if NIS_USE_HALF_PRECISION
#define NVH float16_t
#define NVH2 f16vec2
#define NVH3 f16vec3
#define NVH4 f16vec4
#else // FP32 types
#define NVH NVF
#define NVH2 NVF2
#define NVH3 NVF3
#define NVH4 NVF4
#endif // NIS_USE_HALF_PRECISION
#define NVSHARED shared
#define NVTEX_LOAD(x, pos) texelFetch(sampler2D(x, samplerLinearClamp), pos, 0)
#define NVTEX_SAMPLE(x, sampler, pos) textureLod(sampler2D(x, sampler), pos, 0)
#define NVTEX_SAMPLE_RED(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 0)
#define NVTEX_SAMPLE_GREEN(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 1)
#define NVTEX_SAMPLE_BLUE(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 2)
#define NVTEX_STORE(x, pos, v) imageStore(x, NVI2(pos), v)
#define saturate(x) clamp(x, 0, 1)
#define lerp(a, b, x) mix(a, b, x)
#define GroupMemoryBarrierWithGroupSync() groupMemoryBarrier(); barrier()
#ifndef NIS_UNROLL
#define NIS_UNROLL
#endif
#endif // NIS_GLSL
// Texture gather
#ifndef NIS_TEXTURE_GATHER
#define NIS_TEXTURE_GATHER 0
#endif
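// When NIS_TEXTURE_GATHER is enabled, the NVScaler luma-tile load fetches each
// 2x2 quad with one gather per color channel instead of four bilinear samples.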
// NIS Scaling
#define NIS_SCALE_INT 1
#define NIS_SCALE_FLOAT NVF(1.f)
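// Luma used for filtering: Rec.2020 luma coefficients in PQ mode, Rec.709 (BT.709)
// coefficients otherwise. In linear HDR mode the luminance is compressed by sqrt()
// and kHDRCompressionFactor (1/sqrt(12.5)), mapping the recommended [0, 12.5]
// range into [0, 1].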
NVF getY(NVF3 rgba)
{
#if NIS_HDR_MODE == NIS_HDR_MODE_PQ
    return NVF(0.262f) * rgba.x + NVF(0.678f) * rgba.y + NVF(0.0593f) * rgba.z;
#elif NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
    return sqrt(NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z) * kHDRCompressionFactor;
#else
    return NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z;
#endif
}
NVF getYLinear(NVF3 rgba)
{
    return NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z;
}
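// Estimates local edge orientation from a 3x3 neighborhood anchored at (i, j):
// g_0/g_45/g_90/g_135 are gradient magnitudes along the four directions, and the
// result packs the blend weights for the 0/90/45/135-degree directional filters.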
#if NIS_SCALER
NVF4 GetEdgeMap(NVF p[4][4], NVI i, NVI j)
#else
NVF4 GetEdgeMap(NVF p[5][5], NVI i, NVI j)
#endif
{
    const NVF g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
    const NVF g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
    const NVF g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
    const NVF g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);
    const NVF g_0_90_max = max(g_0, g_90);
    const NVF g_0_90_min = min(g_0, g_90);
    const NVF g_45_135_max = max(g_45, g_135);
    const NVF g_45_135_min = min(g_45, g_135);
    NVF e_0_90 = 0;
    NVF e_45_135 = 0;
    if (g_0_90_max + g_45_135_max == 0)
    {
        return NVF4(0, 0, 0, 0);
    }
    e_0_90 = min(g_0_90_max / (g_0_90_max + g_45_135_max), 1.0f);
    e_45_135 = 1.0f - e_0_90;
    NVB c_0_90 = (g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min);
    NVB c_45_135 = (g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min);
    NVB c_g_0_90 = g_0_90_max == g_0;
    NVB c_g_45_135 = g_45_135_max == g_45;
    NVF f_e_0_90 = (c_0_90 && c_45_135) ? e_0_90 : 1.0f;
    NVF f_e_45_135 = (c_0_90 && c_45_135) ? e_45_135 : 1.0f;
    NVF weight_0 = (c_0_90 && c_g_0_90) ? f_e_0_90 : 0.0f;
    NVF weight_90 = (c_0_90 && !c_g_0_90) ? f_e_0_90 : 0.0f;
    NVF weight_45 = (c_45_135 && c_g_45_135) ? f_e_45_135 : 0.0f;
    NVF weight_135 = (c_45_135 && !c_g_45_135) ? f_e_45_135 : 0.0f;
    return NVF4(weight_0, weight_90, weight_45, weight_135);
}
#if NIS_SCALER
#ifndef NIS_BLOCK_WIDTH
#define NIS_BLOCK_WIDTH 32
#endif
#ifndef NIS_BLOCK_HEIGHT
#define NIS_BLOCK_HEIGHT 24
#endif
#ifndef NIS_THREAD_GROUP_SIZE
#define NIS_THREAD_GROUP_SIZE 256
#endif
#define kPhaseCount 64
#define kFilterSize 6
#define kSupportSize 6
#define kPadSize kSupportSize
// 'Tile' is the region of source luminance values that we load into shPixelsY.
// It is the area of source pixels covered by the destination 'Block' plus a
// 3 pixel border of support pixels.
#define kTilePitch (NIS_BLOCK_WIDTH + kPadSize)
#define kTileSize (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
// 'EdgeMap' is the region of source pixels for which edge map vectors are derived.
// It is the area of source pixels covered by the destination 'Block' plus a
// 1 pixel border.
#define kEdgeMapPitch (NIS_BLOCK_WIDTH + 2)
#define kEdgeMapSize (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))
NVSHARED NVF shPixelsY[kTileSize];
NVSHARED NVH shCoefScaler[kPhaseCount][kFilterSize];
NVSHARED NVH shCoefUSM[kPhaseCount][kFilterSize];
NVSHARED NVH4 shEdgeMap[kEdgeMapSize];
void LoadFilterBanksSh(NVI i0, NVI di) {
    // Load up filter banks to shared memory
    // The work is spread over (kPhaseCount * 2) threads
    for (NVI i = i0; i < kPhaseCount * 2; i += di)
    {
        NVI phase = i >> 1;
        NVI vIdx = i & 1;
        NVH4 v = NVH4(NVTEX_LOAD(coef_scaler, NVI2(vIdx, phase)));
        NVI filterOffset = vIdx * 4;
        shCoefScaler[phase][filterOffset + 0] = v.x;
        shCoefScaler[phase][filterOffset + 1] = v.y;
        if (vIdx == 0)
        {
            shCoefScaler[phase][2] = v.z;
            shCoefScaler[phase][3] = v.w;
        }
        v = NVH4(NVTEX_LOAD(coef_usm, NVI2(vIdx, phase)));
        shCoefUSM[phase][filterOffset + 0] = v.x;
        shCoefUSM[phase][filterOffset + 1] = v.y;
        if (vIdx == 0)
        {
            shCoefUSM[phase][2] = v.z;
            shCoefUSM[phase][3] = v.w;
        }
    }
}
NVF CalcLTI(NVF p0, NVF p1, NVF p2, NVF p3, NVF p4, NVF p5, NVI phase_index)
{
    const NVB selector = (phase_index <= kPhaseCount / 2);
    NVF sel = selector ? p0 : p3;
    const NVF a_min = min(min(p1, p2), sel);
    const NVF a_max = max(max(p1, p2), sel);
    sel = selector ? p2 : p5;
    const NVF b_min = min(min(p3, p4), sel);
    const NVF b_max = max(max(p3, p4), sel);
    const NVF a_cont = a_max - a_min;
    const NVF b_cont = b_max - b_min;
    const NVF cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
    return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
}
NVF4 GetInterpEdgeMap(const NVF4 edge[2][2], NVF phase_frac_x, NVF phase_frac_y)
{
    NVF4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
    NVF4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
    return lerp(h0, h1, phase_frac_y);
}
NVF EvalPoly6(const NVF pxl[6], NVI phase_int)
{
    NVF y = 0.f;
    {
        NIS_UNROLL
        for (NVI i = 0; i < 6; ++i)
        {
            y += shCoefScaler[phase_int][i] * pxl[i];
        }
    }
    NVF y_usm = 0.f;
    {
        NIS_UNROLL
        for (NVI i = 0; i < 6; ++i)
        {
            y_usm += shCoefUSM[phase_int][i] * pxl[i];
        }
    }
    // let's compute a piece-wise ramp based on luma
    const NVF y_scale = 1.0f - saturate((y * (1.0f / NIS_SCALE_FLOAT) - kSharpStartY) * kSharpScaleY);
    // scale the ramp to sharpen as a function of luma
    const NVF y_sharpness = y_scale * kSharpStrengthScale + kSharpStrengthMin;
    y_usm *= y_sharpness;
    // scale the ramp to limit USM as a function of luma
    const NVF y_sharpness_limit = (y_scale * kSharpLimitScale + kSharpLimitMin) * y;
    y_usm = min(y_sharpness_limit, max(-y_sharpness_limit, y_usm));
    // reduce ringing
    y_usm *= CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int);
    return y + y_usm;
}
NVF FilterNormal(const NVF p[6][6], NVI phase_x_frac_int, NVI phase_y_frac_int)
{
    NVF h_acc = 0.0f;
    NIS_UNROLL
    for (NVI j = 0; j < 6; ++j)
    {
        NVF v_acc = 0.0f;
        NIS_UNROLL
        for (NVI i = 0; i < 6; ++i)
        {
            v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
        }
        h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
    }
    // let's return the sum unpacked -> we can accumulate it later
    return h_acc;
}
NVF AddDirFilters(NVF p[6][6], NVF phase_x_frac, NVF phase_y_frac, NVI phase_x_frac_int, NVI phase_y_frac_int, NVF4 w)
{
    NVF f = 0;
    if (w.x > 0.0f)
    {
        // 0 deg filter
        NVF interp0Deg[6];
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; ++i)
            {
                interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
            }
        }
        f += EvalPoly6(interp0Deg, phase_y_frac_int) * w.x;
    }
    if (w.y > 0.0f)
    {
        // 90 deg filter
        NVF interp90Deg[6];
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; ++i)
            {
                interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
            }
        }
        f += EvalPoly6(interp90Deg, phase_x_frac_int) * w.y;
    }
    if (w.z > 0.0f)
    {
        // 45 deg filter
        NVF pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
        NVF temp_interp45Deg[7];
        temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
        temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
        temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
        {
            pphase_b45 = pphase_b45 - 0.5f;
            NVF a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
            NVF b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
            NVF c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
            NVF d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
            temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
            temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
            temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
            temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
        }
        NVF interp45Deg[6];
        NVF pphase_p45 = phase_x_frac + phase_y_frac;
        if (pphase_p45 >= 1)
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; i++)
            {
                interp45Deg[i] = temp_interp45Deg[i + 1];
            }
            pphase_p45 = pphase_p45 - 1;
        }
        else
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; i++)
            {
                interp45Deg[i] = temp_interp45Deg[i];
            }
        }
        f += EvalPoly6(interp45Deg, NVI(pphase_p45 * 64)) * w.z;
    }
    if (w.w > 0.0f)
    {
        // 135 deg filter
        NVF pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
        NVF temp_interp135Deg[7];
        temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
        temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
        temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
        {
            pphase_b135 = pphase_b135 - 0.5f;
            NVF a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
            NVF b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
            NVF c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
            NVF d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
            temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
            temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
            temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
            temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
        }
        NVF interp135Deg[6];
        NVF pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
        if (pphase_p135 >= 1)
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; ++i)
            {
                interp135Deg[i] = temp_interp135Deg[i + 1];
            }
            pphase_p135 = pphase_p135 - 1;
        }
        else
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; ++i)
            {
                interp135Deg[i] = temp_interp135Deg[i];
            }
        }
        f += EvalPoly6(interp135Deg, NVI(pphase_p135 * 64)) * w.w;
    }
    return f;
}
//-----------------------------------------------------------------------------------------------
// NVScaler
//-----------------------------------------------------------------------------------------------
void NVScaler(NVU2 blockIdx, NVU threadIdx)
{
    // Figure out the range of pixels from the input image that need to be loaded for this thread-block
    NVI dstBlockX = NVI(NIS_BLOCK_WIDTH * blockIdx.x);
    NVI dstBlockY = NVI(NIS_BLOCK_HEIGHT * blockIdx.y);
    const NVI srcBlockStartX = NVI(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
    const NVI srcBlockStartY = NVI(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
    const NVI srcBlockEndX = NVI(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
    const NVI srcBlockEndY = NVI(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));
    NVI numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
    NVI numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;
    // round up the load region to an even size since we're loading in 2x2 batches
    numTilePixelsX += numTilePixelsX & 0x1;
    numTilePixelsY += numTilePixelsY & 0x1;
    const NVI numTilePixels = numTilePixelsX * numTilePixelsY;
    // calculate the equivalent values for the edge map
    const NVI numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
    const NVI numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
    const NVI numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;
    // fill in the input luma tile (shPixelsY) in batches of 2x2 pixels
    // we use texture gather to get the extra support necessary
    // to compute 2x2 edge map outputs too
    {
        for (NVU i = threadIdx * 2; i < NVU(numTilePixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2)
        {
            NVU py = (i / numTilePixelsX) * 2;
            NVU px = i % numTilePixelsX;
            // 0.5 to be in the center of the texel
            // - (kSupportSize - 1) / 2 to shift by the kernel support size
            NVF kShift = 0.5f - (kSupportSize - 1) / 2;
#if NIS_VIEWPORT_SUPPORT
            const NVF tx = (srcBlockStartX + px + kInputViewportOriginX + kShift) * kSrcNormX;
            const NVF ty = (srcBlockStartY + py + kInputViewportOriginY + kShift) * kSrcNormY;
#else
            const NVF tx = (srcBlockStartX + px + kShift) * kSrcNormX;
            const NVF ty = (srcBlockStartY + py + kShift) * kSrcNormY;
#endif
            NVF p[2][2];
#if NIS_TEXTURE_GATHER
            {
                const NVF4 sr = NVTEX_SAMPLE_RED(in_texture, samplerLinearClamp, NVF2(tx, ty));
                const NVF4 sg = NVTEX_SAMPLE_GREEN(in_texture, samplerLinearClamp, NVF2(tx, ty));
                const NVF4 sb = NVTEX_SAMPLE_BLUE(in_texture, samplerLinearClamp, NVF2(tx, ty));
                p[0][0] = getY(NVF3(sr.w, sg.w, sb.w));
                p[0][1] = getY(NVF3(sr.z, sg.z, sb.z));
                p[1][0] = getY(NVF3(sr.x, sg.x, sb.x));
                p[1][1] = getY(NVF3(sr.y, sg.y, sb.y));
            }
#else
            NIS_UNROLL
            for (NVI j = 0; j < 2; j++)
            {
                NIS_UNROLL
                for (NVI k = 0; k < 2; k++)
                {
                    const NVF4 px = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2(tx + k * kSrcNormX, ty + j * kSrcNormY));
                    p[j][k] = getY(px.xyz);
                }
            }
#endif
            const NVU idx = py * kTilePitch + px;
            shPixelsY[idx] = NVH(p[0][0]);
            shPixelsY[idx + 1] = NVH(p[0][1]);
            shPixelsY[idx + kTilePitch] = NVH(p[1][0]);
            shPixelsY[idx + kTilePitch + 1] = NVH(p[1][1]);
        }
    }
    GroupMemoryBarrierWithGroupSync();
    {
        // fill in the edge map in batches of 2x2 pixels
        for (NVU i = threadIdx * 2; i < NVU(numEdgeMapPixels) >> 1; i += NIS_THREAD_GROUP_SIZE * 2)
        {
            NVU py = (i / numEdgeMapPixelsX) * 2;
            NVU px = i % numEdgeMapPixelsX;
            const NVU edgeMapIdx = py * kEdgeMapPitch + px;
            NVU tileCornerIdx = (py + 1) * kTilePitch + px + 1;
            NVF p[4][4];
            NIS_UNROLL
            for (NVI j = 0; j < 4; j++)
            {
                NIS_UNROLL
                for (NVI k = 0; k < 4; k++)
                {
                    p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k];
                }
            }
            shEdgeMap[edgeMapIdx] = NVH4(GetEdgeMap(p, 0, 0));
            shEdgeMap[edgeMapIdx + 1] = NVH4(GetEdgeMap(p, 0, 1));
            shEdgeMap[edgeMapIdx + kEdgeMapPitch] = NVH4(GetEdgeMap(p, 1, 0));
            shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = NVH4(GetEdgeMap(p, 1, 1));
        }
    }
    LoadFilterBanksSh(NVI(threadIdx), NIS_THREAD_GROUP_SIZE);
    GroupMemoryBarrierWithGroupSync();
    // output coord within a tile
    const NVI2 pos = NVI2(NVU(threadIdx) % NVU(NIS_BLOCK_WIDTH), NVU(threadIdx) / NVU(NIS_BLOCK_WIDTH));
    // x coord inside the output image
    const NVI dstX = dstBlockX + pos.x;
    // x coord inside the input image
    const NVF srcX = (0.5f + dstX) * kScaleX - 0.5f;
    // nearest integer part
    const NVI px = NVI(floor(srcX) - srcBlockStartX);
    // fractional part
    const NVF fx = srcX - floor(srcX);
    // discretized phase
    const NVI fx_int = NVI(fx * kPhaseCount);
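    // e.g. with kPhaseCount = 64, the fractional position fx in [0, 1) selects
    // one of 64 precomputed rows of the coefficient tables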
    for (NVI k = 0; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT / NIS_THREAD_GROUP_SIZE; ++k)
    {
        // y coord inside the output image
        const NVI dstY = dstBlockY + pos.y + k * (NIS_THREAD_GROUP_SIZE / NIS_BLOCK_WIDTH);
        // y coord inside the input image
        const NVF srcY = (0.5f + dstY) * kScaleY - 0.5f;
#if NIS_VIEWPORT_SUPPORT
        if (srcX > kInputViewportWidth || srcY > kInputViewportHeight ||
            dstX > kOutputViewportWidth || dstY > kOutputViewportHeight)
        {
            return;
        }
#endif
        // nearest integer part
        const NVI py = NVI(floor(srcY) - srcBlockStartY);
        // fractional part
        const NVF fy = srcY - floor(srcY);
        // discretized phase
        const NVI fy_int = NVI(fy * kPhaseCount);
        // generate weights for directional filters
        const NVI startEdgeMapIdx = py * kEdgeMapPitch + px;
        NVF4 edge[2][2];
        NIS_UNROLL
        for (NVI i = 0; i < 2; i++)
        {
            NIS_UNROLL
            for (NVI j = 0; j < 2; j++)
            {
                // need to shift edge map sampling since it's a 2x2 centered inside a 6x6 grid
                edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j];
            }
        }
        const NVF4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;
        // load the 6x6 support to registers
        const NVI startTileIdx = py * kTilePitch + px;
        NVF p[6][6];
        {
            NIS_UNROLL
            for (NVI i = 0; i < 6; ++i)
            {
                NIS_UNROLL
                for (NVI j = 0; j < 6; ++j)
                {
                    p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j];
                }
            }
        }
        // weight for luma
        const NVF baseWeight = NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w;
        // final luma is a weighted product of directional & normal filters
        NVF opY = 0;
        // get the traditional scaler filter output
        opY += FilterNormal(p, fx_int, fy_int) * baseWeight;
        // get the directional filter bank output
        opY += AddDirFilters(p, fx, fy, fx_int, fy_int, w);
        // do a bilinear tap for chroma upscaling
#if NIS_VIEWPORT_SUPPORT
        NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((srcX + kInputViewportOriginX + 0.5f) * kSrcNormX, (srcY + kInputViewportOriginY + 0.5f) * kSrcNormY));
#else
        NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((srcX + 0.5f) * kSrcNormX, (srcY + 0.5f) * kSrcNormY));
#endif
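        // in linear HDR mode, undo getY()'s sqrt-compression: squaring opYN recovers
        // linear luminance, and the ratio against the bilinear tap's linear luma
        // rescales rgb to the new sharpened luma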
#if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
        const NVF kEps = 1e-4f;
        const NVF kNorm = 1.0f / (NIS_SCALE_FLOAT * kHDRCompressionFactor);
        const NVF opYN = max(opY, 0.0f) * kNorm;
        const NVF corr = (opYN * opYN + kEps) / (max(getYLinear(NVF3(op.x, op.y, op.z)), 0.0f) + kEps);
        op.x *= corr;
        op.y *= corr;
        op.z *= corr;
#else
        const NVF corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(NVF3(op.x, op.y, op.z));
        op.x += corr;
        op.y += corr;
        op.z += corr;
#endif
#if NIS_VIEWPORT_SUPPORT
        NVTEX_STORE(out_texture, NVU2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY), op);
#else
        NVTEX_STORE(out_texture, NVU2(dstX, dstY), op);
#endif
    }
}
#else
#ifndef NIS_BLOCK_WIDTH
#define NIS_BLOCK_WIDTH 32
#endif
#ifndef NIS_BLOCK_HEIGHT
#define NIS_BLOCK_HEIGHT 32
#endif
#ifndef NIS_THREAD_GROUP_SIZE
#define NIS_THREAD_GROUP_SIZE 256
#endif
#define kSupportSize 5
#define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1)
#define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1)
NVSHARED NVF shPixelsY[kNumPixelsY][kNumPixelsX];
NVF CalcLTIFast(const NVF y[5])
{
    const NVF a_min = min(min(y[0], y[1]), y[2]);
    const NVF a_max = max(max(y[0], y[1]), y[2]);
    const NVF b_min = min(min(y[2], y[3]), y[4]);
    const NVF b_max = max(max(y[2], y[3]), y[4]);
    const NVF a_cont = a_max - a_min;
    const NVF b_cont = b_max - b_min;
    const NVF cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
    return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
}
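// Directional unsharp mask: a 1-D high-pass tap set (-0.6001, 1.2002, -0.6001)
// whose weights sum to zero, scaled by the luma-dependent strength, clamped to the
// limit, and attenuated by the local-contrast term to reduce ringing.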
NVF EvalUSM(const NVF pxl[5], const NVF sharpnessStrength, const NVF sharpnessLimit)
{
    // USM profile
    NVF y_usm = -0.6001f * pxl[1] + 1.2002f * pxl[2] - 0.6001f * pxl[3];
    // boost USM profile
    y_usm *= sharpnessStrength;
    // clamp to the limit
    y_usm = min(sharpnessLimit, max(-sharpnessLimit, y_usm));
    // reduce ringing
    y_usm *= CalcLTIFast(pxl);
    return y_usm;
}
NVF4 GetDirUSM(const NVF p[5][5])
{
    // sharpness boost & limit are the same for all directions
    const NVF scaleY = 1.0f - saturate((p[2][2] - kSharpStartY) * kSharpScaleY);
    // scale the ramp to sharpen as a function of luma
    const NVF sharpnessStrength = scaleY * kSharpStrengthScale + kSharpStrengthMin;
    // scale the ramp to limit USM as a function of luma
    const NVF sharpnessLimit = (scaleY * kSharpLimitScale + kSharpLimitMin) * p[2][2];
    NVF4 rval;
    // 0 deg filter
    NVF interp0Deg[5];
    {
        for (NVI i = 0; i < 5; ++i)
        {
            interp0Deg[i] = p[i][2];
        }
    }
    rval.x = EvalUSM(interp0Deg, sharpnessStrength, sharpnessLimit);
    // 90 deg filter
    NVF interp90Deg[5];
    {
        for (NVI i = 0; i < 5; ++i)
        {
            interp90Deg[i] = p[2][i];
        }
    }
    rval.y = EvalUSM(interp90Deg, sharpnessStrength, sharpnessLimit);
    // 45 deg filter
    NVF interp45Deg[5];
    interp45Deg[0] = p[1][1];
    interp45Deg[1] = lerp(p[2][1], p[1][2], 0.5f);
    interp45Deg[2] = p[2][2];
    interp45Deg[3] = lerp(p[3][2], p[2][3], 0.5f);
    interp45Deg[4] = p[3][3];
    rval.z = EvalUSM(interp45Deg, sharpnessStrength, sharpnessLimit);
    // 135 deg filter
    NVF interp135Deg[5];
    interp135Deg[0] = p[3][1];
    interp135Deg[1] = lerp(p[3][2], p[2][1], 0.5f);
    interp135Deg[2] = p[2][2];
    interp135Deg[3] = lerp(p[2][3], p[1][2], 0.5f);
    interp135Deg[4] = p[1][3];
    rval.w = EvalUSM(interp135Deg, sharpnessStrength, sharpnessLimit);
    return rval;
}
//-----------------------------------------------------------------------------------------------
// NVSharpen
//-----------------------------------------------------------------------------------------------
void NVSharpen(NVU2 blockIdx, NVU threadIdx)
{
    const NVI dstBlockX = NVI(NIS_BLOCK_WIDTH * blockIdx.x);
    const NVI dstBlockY = NVI(NIS_BLOCK_HEIGHT * blockIdx.y);
    // fill in the input luma tile in batches of 2x2 pixels
    const NVF kShift = 0.5f - kSupportSize / 2;
    for (NVI i = NVI(threadIdx) * 2; i < kNumPixelsX * kNumPixelsY / 2; i += NIS_THREAD_GROUP_SIZE * 2)
    {
        NVU2 pos = NVU2(NVU(i) % NVU(kNumPixelsX), NVU(i) / NVU(kNumPixelsX) * 2);
        NIS_UNROLL
        for (NVI dy = 0; dy < 2; dy++)
        {
            NIS_UNROLL
            for (NVI dx = 0; dx < 2; dx++)
            {
#if NIS_VIEWPORT_SUPPORT
                const NVF tx = (dstBlockX + pos.x + kInputViewportOriginX + dx + kShift) * kSrcNormX;
                const NVF ty = (dstBlockY + pos.y + kInputViewportOriginY + dy + kShift) * kSrcNormY;
#else
                const NVF tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX;
                const NVF ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY;
#endif
                const NVF4 px = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2(tx, ty));
                shPixelsY[pos.y + dy][pos.x + dx] = getY(px.xyz);
            }
        }
    }
    GroupMemoryBarrierWithGroupSync();
    for (NVI k = NVI(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE)
    {
        const NVI2 pos = NVI2(NVU(k) % NVU(NIS_BLOCK_WIDTH), NVU(k) / NVU(NIS_BLOCK_WIDTH));
        // load the 5x5 support to registers
        NVF p[5][5];
        NIS_UNROLL
        for (NVI i = 0; i < 5; ++i)
        {
            NIS_UNROLL
            for (NVI j = 0; j < 5; ++j)
            {
                p[i][j] = shPixelsY[pos.y + i][pos.x + j];
            }
        }
        // get the directional filter bank output
        NVF4 dirUSM = GetDirUSM(p);
        // generate weights for directional filters
        NVF4 w = GetEdgeMap(p, kSupportSize / 2 - 1, kSupportSize / 2 - 1);
        // final USM is a weighted sum of the filter outputs
        const NVF usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w);
        // do a bilinear tap and correct the rgb texel so it produces the new sharpened luma
        const NVI dstX = dstBlockX + pos.x;
        const NVI dstY = dstBlockY + pos.y;
#if NIS_VIEWPORT_SUPPORT
        if (dstX > kOutputViewportWidth || dstY > kOutputViewportHeight)
        {
            return;
        }
#endif
#if NIS_VIEWPORT_SUPPORT
        NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + kInputViewportOriginX + 0.5f) * kSrcNormX, (dstY + kInputViewportOriginY + 0.5f) * kSrcNormY));
#else
        NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + 0.5f) * kSrcNormX, (dstY + 0.5f) * kSrcNormY));
#endif
#if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
        const NVF kEps = 1e-4f * kHDRCompressionFactor * kHDRCompressionFactor;
        NVF newY = p[2][2] + usmY;
        newY = max(newY, 0.0f);
        const NVF oldY = p[2][2];
        const NVF corr = (newY * newY + kEps) / (oldY * oldY + kEps);
        op.x *= corr;
        op.y *= corr;
        op.z *= corr;
#else
        op.x += usmY;
        op.y += usmY;
        op.z += usmY;
#endif
#if NIS_VIEWPORT_SUPPORT
        NVTEX_STORE(out_texture, NVU2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY), op);
#else
        NVTEX_STORE(out_texture, NVU2(dstX, dstY), op);
#endif
    }
}
#endif