NIS_Scaler.h 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973
  1. // The MIT License(MIT)
  2. //
  3. // Copyright(c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  4. //
  5. // Permission is hereby granted, free of charge, to any person obtaining a copy of
  6. // this software and associated documentation files(the "Software"), to deal in
  7. // the Software without restriction, including without limitation the rights to
  8. // use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies of
  9. // the Software, and to permit persons to whom the Software is furnished to do so,
  10. // subject to the following conditions :
  11. //
  12. // The above copyright notice and this permission notice shall be included in all
  13. // copies or substantial portions of the Software.
  14. //
  15. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
  17. // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR
  18. // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  19. // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21. //---------------------------------------------------------------------------------
  22. // NVIDIA Image Scaling SDK - v1.0
  23. //---------------------------------------------------------------------------------
  24. // The NVIDIA Image Scaling SDK provides a single spatial scaling and sharpening algorithm
  25. // for cross-platform support. The scaling algorithm uses a 6-tap scaling filter combined
  26. // with 4 directional scaling and adaptive sharpening filters, which creates nice smooth images
  27. // and sharp edges. In addition, the SDK provides a state-of-the-art adaptive directional sharpening algorithm
  28. // for use in applications where no scaling is required.
  29. //
  30. // The directional scaling and sharpening algorithm is named NVScaler while the adaptive-directional-sharpening-only
  31. // algorithm is named NVSharpen. Both algorithms are provided as compute shaders and
  32. // developers are free to integrate them in their applications. Note that if you integrate NVScaler, you
  33. // should NOT integrate NVSharpen, as NVScaler already includes a sharpening pass
  34. //
  35. // Pipeline Placement
  36. // ------------------
  37. // The call into the NVIDIA Image Scaling shaders must occur during the post-processing phase after tone-mapping.
  38. // Applying the scaling in linear HDR in-game color-space may result in a sharpening effect that is
  39. // either not visible or too strong. Since sharpening algorithms can enhance noisy or grainy regions, it is recommended
  40. // that certain effects such as film grain should occur after NVScaler or NVSharpen. Low-pass filters such as motion blur or
  41. // light bloom are recommended to be applied before NVScaler or NVSharpen to avoid sharpening attenuation.
  42. //
  43. // Color Space and Ranges
  44. // ----------------------
  45. // NVIDIA Image Scaling shaders can process color textures stored as either LDR or HDR with the following
  46. // restrictions:
  47. // 1) LDR
  48. // - The range of color values must be in the [0, 1] range
  49. // - The input color texture must be in display-referred color-space after tone mapping and OETF (gamma-correction)
  50. // has been applied
  51. // 2) HDR PQ
  52. // - The range of color values must be in the [0, 1] range
  53. // - The input color texture must be in display-referred color-space after tone mapping with Rec.2020 PQ OETF applied
  54. // 3) HDR Linear
  55. // - The recommended range of color values is [0, 12.5], where luminance value (as per BT. 709) of
  56. // 1.0 maps to brightness value of 80nits (sRGB peak) and 12.5 maps to 1000nits
  57. // - The input color texture may have luminance values that are either linear and scene-referred or
  58. // linear and display-referred (after tone mapping)
  59. //
  60. // If the input color texture sent to NVScaler/NVSharpen is in HDR format set NIS_HDR_MODE define to either
  61. // NIS_HDR_MODE_LINEAR (1) or NIS_HDR_MODE_PQ (2).
  62. //
  63. // Supported Texture Formats
  64. // -------------------------
  65. // Input and output formats:
  66. // Input and output formats are expected to be in the ranges defined in the previous section and should be
  67. // specified using non-integer data types such as DXGI_FORMAT_R8G8B8A8_UNORM.
  68. //
  69. // Coefficients formats:
  70. // The scaler coefficients and USM coefficients format should be specified using float4 type such as
  71. // DXGI_FORMAT_R32G32B32A32_FLOAT or DXGI_FORMAT_R16G16B16A16_FLOAT.
  72. //
  73. // Resource States, Buffers, and Sampler:
  74. // The game or application calling NVIDIA Image Scaling SDK shaders must ensure that the textures are in
  75. // the correct state.
  76. // - Input color textures must be in pixel shader read state. Shader Resource View (SRV) in DirectX
  77. // - The output texture must be in read/write state. Unordered Access View (UAV) in DirectX
  78. // - The coefficients texture for NVScaler must be in read state. Shader Resource View (SRV) in DirectX
  79. // - The configuration variables must be passed as constant buffer. Constant Buffer View (CBV) in DirectX
  80. // - The sampler for texture pixel sampling. Linear clamp SamplerState in DirectX
  81. //
  82. // Adding NVIDIA Image Scaling SDK to a Project
  83. // --------------------------------------------
  84. // Include NIS_Scaler.h directly in your application or alternatively use the provided NIS_Main.hlsl shader file.
  85. // Use NIS_Config.h to get the ideal shader dispatch values for your platform, to configure the algorithm constant
  86. // values (NVScalerUpdateConfig, and NVSharpenUpdateConfig), and to access the algorithm coefficients (coef_scale and coef_USM).
  87. //
  88. // Defines:
  89. // NIS_SCALER: default (1) NVScaler, (0) fast NVSharpen only, no upscaling
  90. // NIS_HDR_MODE: default(0) disabled, (1) Linear, (2) PQ
  91. // NIS_BLOCK_WIDTH: pixels per block width. Use GetOptimalBlockWidth query for your platform
  92. // NIS_BLOCK_HEIGHT: pixels per block height. Use GetOptimalBlockHeight query for your platform
  93. // NIS_THREAD_GROUP_SIZE: number of threads per group. Use GetOptimalThreadGroupSize query for your platform
  94. // NIS_USE_HALF_PRECISION: default(0) disabled, (1) enable half precision computation
  95. // NIS_HLSL_6_2: default (0) HLSL v5, (1) HLSL v6.2
  96. // NIS_VIEWPORT_SUPPORT: default(0) disabled, (1) enable input/output viewport support
  97. //
  98. // Default NVScaler shader constants:
  99. // [NIS_BLOCK_WIDTH, NIS_BLOCK_HEIGHT, NIS_THREAD_GROUP_SIZE] = [32, 24, 256]
  100. //
  101. // Default NVSharpen shader constants:
  102. // [NIS_BLOCK_WIDTH, NIS_BLOCK_HEIGHT, NIS_THREAD_GROUP_SIZE] = [32, 32, 256]
  103. //---------------------------------------------------------------------------------
// NVScaler enable by default. Set to 0 for NVSharpen only
#ifndef NIS_SCALER
#define NIS_SCALER 1
#endif
// HDR Modes
#define NIS_HDR_MODE_NONE 0
#define NIS_HDR_MODE_LINEAR 1
#define NIS_HDR_MODE_PQ 2
#ifndef NIS_HDR_MODE
#define NIS_HDR_MODE NIS_HDR_MODE_NONE
#endif
// Used by getY in linear-HDR mode to compress sqrt(luminance) into [0,1];
// value is ~1/sqrt(12.5), matching the recommended [0,12.5] HDR range in the
// header comment -- TODO confirm derivation
#define kHDRCompressionFactor 0.282842712f
// Viewport support
#ifndef NIS_VIEWPORT_SUPPORT
#define NIS_VIEWPORT_SUPPORT 0
#endif
// Half precision
#ifndef NIS_USE_HALF_PRECISION
#define NIS_USE_HALF_PRECISION 0
#endif
#ifndef NIS_HLSL_6_2
#define NIS_HLSL_6_2 0
#endif
#if NIS_USE_HALF_PRECISION
#if NIS_HLSL_6_2
// Shader model 6.2+: true 16-bit types
typedef float16_t4 NVF4;
typedef float16_t NVF;
#else
// Pre-6.2: minimum-precision types
typedef min16float4 NVF4;
typedef min16float NVF;
#endif // NIS_HLSL_6_2
// Half precision keeps shared-memory luma normalized to 1.0 (to stay within
// half range); full precision scales it to 255.
#define NIS_SCALE_INT 1
#define NIS_SCALE_FLOAT 1.0
#else
typedef float4 NVF4;
typedef float NVF;
#define NIS_SCALE_INT 255
#define NIS_SCALE_FLOAT 255.0
#endif // NIS_USE_HALF_PRECISION
// Loop unrolling
#ifndef NIS_UNROLL
#define NIS_UNROLL [unroll]
#endif
// Texture gather
#ifndef NIS_TEXTURE_GATHER
#define NIS_TEXTURE_GATHER 0
#endif
  151. float getY(float3 rgba)
  152. {
  153. #if NIS_HDR_MODE == NIS_HDR_MODE_PQ
  154. return 0.262f * rgba.x + 0.678f * rgba.y + 0.0593f * rgba.z;
  155. #elif NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
  156. return sqrt(0.2126f * rgba.x + 0.7152f * rgba.y + 0.0722f * rgba.z) * kHDRCompressionFactor;
  157. #else
  158. return 0.2126f * rgba.x + 0.7152f * rgba.y + 0.0722f * rgba.z;
  159. #endif
  160. }
  161. float getYLinear(float3 rgba)
  162. {
  163. return 0.2126f * rgba.x + 0.7152f * rgba.y + 0.0722f * rgba.z;
  164. };
// Computes directional edge weights for one pixel from the 3x3 luma
// neighborhood whose top-left corner is at (i, j) inside p. The result
// float4(weight_0, weight_90, weight_45, weight_135) drives the blend of the
// four directional filters. NVScaler evaluates it on a 4x4 window (2x2 edge
// outputs per loaded batch) while NVSharpen uses its 5x5 support, hence the
// two signatures.
#if NIS_SCALER
float4 GetEdgeMap(float p[4][4], int i, int j)
#else
float4 GetEdgeMap(float p[5][5], int i, int j)
#endif
{
// Gradient magnitude along each direction: |sum of a 3-pixel border strip
// minus the opposing strip| of the 3x3 neighborhood.
const float g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
const float g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
const float g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
const float g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);
// Strongest/weakest gradient within the axis-aligned pair and the diagonal pair.
const float g_0_90_max = max(g_0, g_90);
const float g_0_90_min = min(g_0, g_90);
const float g_45_135_max = max(g_45, g_135);
const float g_45_135_min = min(g_45, g_135);
float e_0_90 = 0;
float e_45_135 = 0;
float edge_0 = 0;
float edge_45 = 0;
float edge_90 = 0;
float edge_135 = 0;
// Relative strength of the axis-aligned vs diagonal edges; e_0_90 and
// e_45_135 sum to 1 (both zero when the region is flat).
if ((g_0_90_max + g_45_135_max) == 0)
{
e_0_90 = 0;
e_45_135 = 0;
}
else
{
e_0_90 = g_0_90_max / (g_0_90_max + g_45_135_max);
e_0_90 = min(e_0_90, 1.0f);
e_45_135 = 1.0f - e_0_90;
}
// An axis-aligned edge counts only if it dominates its perpendicular
// gradient by kDetectRatio, exceeds the absolute threshold kDetectThres,
// and is stronger than the weaker diagonal gradient.
if ((g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min))
{
// pick the dominant direction within the pair
if (g_0_90_max == g_0)
{
edge_0 = 1.0f;
edge_90 = 0;
}
else
{
edge_0 = 0;
edge_90 = 1.0f;
}
}
else
{
edge_0 = 0;
edge_90 = 0;
}
// Same detection rule for the diagonal pair.
if ((g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) &&
(g_45_135_max > g_0_90_min))
{
if (g_45_135_max == g_45)
{
edge_45 = 1.0f;
edge_135 = 0;
}
else
{
edge_45 = 0;
edge_135 = 1.0f;
}
}
else
{
edge_45 = 0;
edge_135 = 0;
}
float weight_0, weight_90, weight_45, weight_135;
// Two detected edges (one per pair): split the weight using the relative
// strengths e_0_90 / e_45_135.
if ((edge_0 + edge_90 + edge_45 + edge_135) >= 2.0f)
{
if (edge_0 == 1.0f)
{
weight_0 = e_0_90;
weight_90 = 0;
}
else
{
weight_0 = 0;
weight_90 = e_0_90;
}
if (edge_45 == 1.0f)
{
weight_45 = e_45_135;
weight_135 = 0;
}
else
{
weight_45 = 0;
weight_135 = e_45_135;
}
}
// Exactly one detected edge: it receives full weight.
else if ((edge_0 + edge_90 + edge_45 + edge_135) >= 1.0f)
{
weight_0 = edge_0;
weight_90 = edge_90;
weight_45 = edge_45;
weight_135 = edge_135;
}
// No edge detected: all directional weights zero (normal filter dominates).
else
{
weight_0 = 0;
weight_90 = 0;
weight_45 = 0;
weight_135 = 0;
}
return float4(weight_0, weight_90, weight_45, weight_135);
}
#if NIS_SCALER
// NVScaler block/tile configuration (defaults match the header comment).
#ifndef NIS_BLOCK_WIDTH
#define NIS_BLOCK_WIDTH 32
#endif
#ifndef NIS_BLOCK_HEIGHT
#define NIS_BLOCK_HEIGHT 24
#endif
#ifndef NIS_THREAD_GROUP_SIZE
#define NIS_THREAD_GROUP_SIZE 256
#endif
#define kPhaseCount 64  // number of discretized sub-pixel filter phases
#define kFilterSize 8   // coefficients stored per phase (only 6 are used)
#define kSupportSize 6  // scaling filter support in pixels
#define kPadSize kSupportSize
// shared-memory tile: output block plus padding for the filter support
#define kTileSize (NIS_BLOCK_WIDTH + kPadSize) * (NIS_BLOCK_HEIGHT + kPadSize)
#define blockDim NIS_THREAD_GROUP_SIZE
groupshared NVF shPixelsY[kTileSize];                    // luma tile, normalized to NIS_SCALE_FLOAT
groupshared NVF shCoefScaler[kPhaseCount][kFilterSize];  // scaling filter bank
groupshared NVF shCoefUSM[kPhaseCount][kFilterSize];     // sharpening (USM) filter bank
groupshared NVF4 shEdgeMap[kTileSize];                   // per-pixel edge-direction weights
  293. void LoadFilterBanksSh(int i0, int di)
  294. {
  295. // load up filter banks to shared memory
  296. for (int i = i0; i < kFilterSize * kPhaseCount / 4 / 2; i += di)
  297. {
  298. NVF4 v0 = coef_scaler[int2(0, i)];
  299. NVF4 v1 = coef_scaler[int2(1, i)];
  300. shCoefScaler[i][0] = (NVF)v0.x;
  301. shCoefScaler[i][1] = (NVF)v0.y;
  302. shCoefScaler[i][2] = (NVF)v0.z;
  303. shCoefScaler[i][3] = (NVF)v0.w;
  304. shCoefScaler[i][4] = (NVF)v1.x;
  305. shCoefScaler[i][5] = (NVF)v1.y;
  306. v0 = coef_usm[int2(0, i)];
  307. v1 = coef_usm[int2(1, i)];
  308. shCoefUSM[i][0] = (NVF)v0.x;
  309. shCoefUSM[i][1] = (NVF)v0.y;
  310. shCoefUSM[i][2] = (NVF)v0.z;
  311. shCoefUSM[i][3] = (NVF)v0.w;
  312. shCoefUSM[i][4] = (NVF)v1.x;
  313. shCoefUSM[i][5] = (NVF)v1.y;
  314. }
  315. }
  316. float CalcLTI(float p0, float p1, float p2, float p3, float p4, float p5, int phase_index)
  317. {
  318. float y0, y1, y2, y3, y4;
  319. if (phase_index <= kPhaseCount / 2)
  320. {
  321. y0 = p0;
  322. y1 = p1;
  323. y2 = p2;
  324. y3 = p3;
  325. y4 = p4;
  326. }
  327. else
  328. {
  329. y0 = p1;
  330. y1 = p2;
  331. y2 = p3;
  332. y3 = p4;
  333. y4 = p5;
  334. }
  335. const float a_min = min(min(y0, y1), y2);
  336. const float a_max = max(max(y0, y1), y2);
  337. const float b_min = min(min(y2, y3), y4);
  338. const float b_max = max(max(y2, y3), y4);
  339. const float a_cont = a_max - a_min;
  340. const float b_cont = b_max - b_min;
  341. const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
  342. return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
  343. }
  344. float4 GetInterpEdgeMap(const float4 edge[2][2], float phase_frac_x, float phase_frac_y)
  345. {
  346. float4 h0, h1, f;
  347. h0.x = lerp(edge[0][0].x, edge[0][1].x, phase_frac_x);
  348. h0.y = lerp(edge[0][0].y, edge[0][1].y, phase_frac_x);
  349. h0.z = lerp(edge[0][0].z, edge[0][1].z, phase_frac_x);
  350. h0.w = lerp(edge[0][0].w, edge[0][1].w, phase_frac_x);
  351. h1.x = lerp(edge[1][0].x, edge[1][1].x, phase_frac_x);
  352. h1.y = lerp(edge[1][0].y, edge[1][1].y, phase_frac_x);
  353. h1.z = lerp(edge[1][0].z, edge[1][1].z, phase_frac_x);
  354. h1.w = lerp(edge[1][0].w, edge[1][1].w, phase_frac_x);
  355. f.x = lerp(h0.x, h1.x, phase_frac_y);
  356. f.y = lerp(h0.y, h1.y, phase_frac_y);
  357. f.z = lerp(h0.z, h1.z, phase_frac_y);
  358. f.w = lerp(h0.w, h1.w, phase_frac_y);
  359. return f;
  360. }
  361. float EvalPoly6(const float pxl[6], int phase_int)
  362. {
  363. float y = 0.f;
  364. {
  365. NIS_UNROLL
  366. for (int i = 0; i < 6; ++i)
  367. {
  368. y += shCoefScaler[phase_int][i] * pxl[i];
  369. }
  370. }
  371. float y_usm = 0.f;
  372. {
  373. NIS_UNROLL
  374. for (int i = 0; i < 6; ++i)
  375. {
  376. y_usm += shCoefUSM[phase_int][i] * pxl[i];
  377. }
  378. }
  379. // let's compute a piece-wise ramp based on luma
  380. const float y_scale = 1.0f - saturate((y * (1.0f / 255) - kSharpStartY) * kSharpScaleY);
  381. // scale the ramp to sharpen as a function of luma
  382. const float y_sharpness = y_scale * kSharpStrengthScale + kSharpStrengthMin;
  383. y_usm *= y_sharpness;
  384. // scale the ramp to limit USM as a function of luma
  385. const float y_sharpness_limit = (y_scale * kSharpLimitScale + kSharpLimitMin) * y;
  386. y_usm = min(y_sharpness_limit, max(-y_sharpness_limit, y_usm));
  387. // reduce ringing
  388. y_usm *= CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int);
  389. return y + y_usm;
  390. }
  391. float FilterNormal(const float p[6][6], int phase_x_frac_int, int phase_y_frac_int)
  392. {
  393. float h_acc = 0.0f;
  394. NIS_UNROLL
  395. for (int j = 0; j < 6; ++j)
  396. {
  397. float v_acc = 0.0f;
  398. NIS_UNROLL
  399. for (int i = 0; i < 6; ++i)
  400. {
  401. v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
  402. }
  403. h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
  404. }
  405. // let's return the sum unpacked -> we can accumulate it later
  406. return h_acc;
  407. }
  408. float4 GetDirFilters(float p[6][6], float phase_x_frac, float phase_y_frac, int phase_x_frac_int, int phase_y_frac_int)
  409. {
  410. float4 f;
  411. // 0 deg filter
  412. float interp0Deg[6];
  413. {
  414. NIS_UNROLL
  415. for (int i = 0; i < 6; ++i)
  416. {
  417. interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
  418. }
  419. }
  420. f.x = EvalPoly6(interp0Deg, phase_y_frac_int);
  421. // 90 deg filter
  422. float interp90Deg[6];
  423. {
  424. NIS_UNROLL
  425. for (int i = 0; i < 6; ++i)
  426. {
  427. interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
  428. }
  429. }
  430. f.y = EvalPoly6(interp90Deg, phase_x_frac_int);
  431. //45 deg filter
  432. float pphase_b45;
  433. pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
  434. float temp_interp45Deg[7];
  435. temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
  436. temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
  437. temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
  438. if (pphase_b45 >= 0.5f)
  439. {
  440. pphase_b45 = pphase_b45 - 0.5f;
  441. temp_interp45Deg[0] = lerp(p[1][1], p[0][2], pphase_b45);
  442. temp_interp45Deg[2] = lerp(p[2][2], p[1][3], pphase_b45);
  443. temp_interp45Deg[4] = lerp(p[3][3], p[2][4], pphase_b45);
  444. temp_interp45Deg[6] = lerp(p[4][4], p[3][5], pphase_b45);
  445. }
  446. else
  447. {
  448. pphase_b45 = 0.5f - pphase_b45;
  449. temp_interp45Deg[0] = lerp(p[1][1], p[2][0], pphase_b45);
  450. temp_interp45Deg[2] = lerp(p[2][2], p[3][1], pphase_b45);
  451. temp_interp45Deg[4] = lerp(p[3][3], p[4][2], pphase_b45);
  452. temp_interp45Deg[6] = lerp(p[4][4], p[5][3], pphase_b45);
  453. }
  454. float interp45Deg[6];
  455. float pphase_p45 = phase_x_frac + phase_y_frac;
  456. if (pphase_p45 >= 1)
  457. {
  458. NIS_UNROLL
  459. for (int i = 0; i < 6; i++)
  460. {
  461. interp45Deg[i] = temp_interp45Deg[i + 1];
  462. }
  463. pphase_p45 = pphase_p45 - 1;
  464. }
  465. else
  466. {
  467. NIS_UNROLL
  468. for (int i = 0; i < 6; i++)
  469. {
  470. interp45Deg[i] = temp_interp45Deg[i];
  471. }
  472. }
  473. f.z = EvalPoly6(interp45Deg, (int)(pphase_p45 * 64));
  474. //135 deg filter
  475. float pphase_b135;
  476. pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
  477. float temp_interp135Deg[7];
  478. temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
  479. temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
  480. temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
  481. if (pphase_b135 >= 0.5f)
  482. {
  483. pphase_b135 = pphase_b135 - 0.5f;
  484. temp_interp135Deg[0] = lerp(p[4][1], p[5][2], pphase_b135);
  485. temp_interp135Deg[2] = lerp(p[3][2], p[4][3], pphase_b135);
  486. temp_interp135Deg[4] = lerp(p[2][3], p[3][4], pphase_b135);
  487. temp_interp135Deg[6] = lerp(p[1][4], p[2][5], pphase_b135);
  488. }
  489. else
  490. {
  491. pphase_b135 = 0.5f - pphase_b135;
  492. temp_interp135Deg[0] = lerp(p[4][1], p[3][0], pphase_b135);
  493. temp_interp135Deg[2] = lerp(p[3][2], p[2][1], pphase_b135);
  494. temp_interp135Deg[4] = lerp(p[2][3], p[1][2], pphase_b135);
  495. temp_interp135Deg[6] = lerp(p[1][4], p[0][3], pphase_b135);
  496. }
  497. float interp135Deg[6];
  498. float pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
  499. if (pphase_p135 >= 1)
  500. {
  501. NIS_UNROLL
  502. for (int i = 0; i < 6; ++i)
  503. {
  504. interp135Deg[i] = temp_interp135Deg[i + 1];
  505. }
  506. pphase_p135 = pphase_p135 - 1;
  507. }
  508. else
  509. {
  510. NIS_UNROLL
  511. for (int i = 0; i < 6; ++i)
  512. {
  513. interp135Deg[i] = temp_interp135Deg[i];
  514. }
  515. }
  516. f.w = EvalPoly6(interp135Deg, (int)(pphase_p135 * 64));
  517. return f;
  518. }
//-----------------------------------------------------------------------------------------------
// NVScaler
//-----------------------------------------------------------------------------------------------
// Directional upscaling + sharpening kernel. Each thread group produces one
// NIS_BLOCK_WIDTH x NIS_BLOCK_HEIGHT tile of the destination image:
//   1) cooperatively loads source luma + per-pixel edge maps into shmem,
//   2) loads the scaler/USM filter banks into shmem,
//   3) per output pixel, blends the normal and four directional filter
//      outputs by the interpolated edge weights, then corrects the
//      bilinearly-upscaled RGB by the filtered luma.
void NVScaler(uint2 blockIdx, uint threadIdx)
{
// Figure out the range of pixels from input image that would be needed to be loaded for this thread-block
const int dstBlockX = NIS_BLOCK_WIDTH * blockIdx.x;
const int dstBlockY = NIS_BLOCK_HEIGHT * blockIdx.y;
const int srcBlockStartX = floor((dstBlockX + 0.5f) * kScaleX - 0.5f);
const int srcBlockStartY = floor((dstBlockY + 0.5f) * kScaleY - 0.5f);
const int srcBlockEndX = ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f);
const int srcBlockEndY = ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f);
int numPixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
int numPixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;
// round-up load region to even size since we're loading in 2x2 batches
numPixelsX += numPixelsX & 0x1;
numPixelsY += numPixelsY & 0x1;
const float invNumPixelX = 1.0f / numPixelsX;
const uint numPixels = numPixelsX * numPixelsY;
// fill in input luma tile in batches of 2x2 pixels
// we use texture gather to get extra support necessary
// to compute 2x2 edge map outputs too
for (uint i = threadIdx * 2; i < numPixels / 2; i += blockDim * 2)
{
// map flat index i to the top-left texel of this thread's 2x2 batch
float py = floor(i * invNumPixelX);
const float px = i - py * numPixelsX;
py *= 2.0f;
// 0.5 to be in the center of texel
// -1.0 to sample top-left corner of 3x3 halo necessary
// -kSupportSize/2 to shift by the kernel support size
float kShift = 0.5f - 1.0f - (kSupportSize - 1) / 2;
#if NIS_VIEWPORT_SUPPORT
const float tx = (srcBlockStartX + px + kInputViewportOriginX + kShift) * kSrcNormX;
const float ty = (srcBlockStartY + py + kInputViewportOriginY + kShift) * kSrcNormY;
#else
const float tx = (srcBlockStartX + px + kShift) * kSrcNormX;
const float ty = (srcBlockStartY + py + kShift) * kSrcNormY;
#endif
// 4x4 luma window: 2x2 output batch plus the 1-pixel halo for edge maps
float p[4][4];
#if NIS_TEXTURE_GATHER
NIS_UNROLL for (int j = 0; j < 4; j += 2)
{
NIS_UNROLL for (int k = 0; k < 4; k += 2)
{
const float4 sr = in_texture.GatherRed(samplerLinearClamp, float2(tx + k * kSrcNormX, ty + j * kSrcNormY), int2(0, 0));
const float4 sg = in_texture.GatherGreen(samplerLinearClamp, float2(tx + k * kSrcNormX, ty + j * kSrcNormY), int2(0, 0));
const float4 sb = in_texture.GatherBlue(samplerLinearClamp, float2(tx + k * kSrcNormX, ty + j * kSrcNormY), int2(0, 0));
// remap the gather component order into row-major p[row][col]
p[j + 0][k + 0] = getY(float3(sr.w, sg.w, sb.w));
p[j + 0][k + 1] = getY(float3(sr.z, sg.z, sb.z));
p[j + 1][k + 0] = getY(float3(sr.x, sg.x, sb.x));
p[j + 1][k + 1] = getY(float3(sr.y, sg.y, sb.y));
}
}
#else
NIS_UNROLL
for (int j = 0; j < 4; j++)
{
NIS_UNROLL
for (int k = 0; k < 4; k++)
{
// NOTE(review): this float3 'px' shadows the outer float 'px' above
const float3 px = in_texture.SampleLevel(samplerLinearClamp, float2(tx + k * kSrcNormX, ty + j * kSrcNormY), 0).xyz;
p[j][k] = getY(px);
}
}
#endif
const int idx = py * numPixelsX + px;
// write the 2x2 edge-direction outputs for this batch
shEdgeMap[idx] = (NVF4)GetEdgeMap(p, 0, 0);
shEdgeMap[idx + 1] = (NVF4)GetEdgeMap(p, 0, 1);
shEdgeMap[idx + numPixelsX] = (NVF4)GetEdgeMap(p, 1, 0);
shEdgeMap[idx + numPixelsX + 1] = (NVF4)GetEdgeMap(p, 1, 1);
// normalize luma to 255.0f and write out to shmem
shPixelsY[idx] = (NVF)(p[1][1] * NIS_SCALE_FLOAT);
shPixelsY[idx + 1] = (NVF)(p[1][2] * NIS_SCALE_FLOAT);
shPixelsY[idx + numPixelsX] = (NVF)(p[2][1] * NIS_SCALE_FLOAT);
shPixelsY[idx + numPixelsX + 1] = (NVF)(p[2][2] * NIS_SCALE_FLOAT);
}
LoadFilterBanksSh(threadIdx, blockDim);
GroupMemoryBarrierWithGroupSync();
// each thread now produces output pixels strided across the tile
for (uint k = threadIdx; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += blockDim)
{
const int2 pos = int2(k % NIS_BLOCK_WIDTH, k / NIS_BLOCK_WIDTH);
const int dstX = dstBlockX + pos.x;
const int dstY = dstBlockY + pos.y;
// back-project destination pixel center into source space
const float srcX = (0.5f + dstX) * kScaleX - 0.5f;
const float srcY = (0.5f + dstY) * kScaleY - 0.5f;
#if NIS_VIEWPORT_SUPPORT
if (srcX > kInputViewportWidth || srcY > kInputViewportHeight ||
dstX > kOutputViewportWidth || dstY > kOutputViewportHeight)
{
return;
}
#endif
const int px = floor(srcX) - srcBlockStartX;
const int py = floor(srcY) - srcBlockStartY;
const int start_idx = py * numPixelsX + px;
// load 6x6 support to regs
float p[6][6];
{
NIS_UNROLL
for (int i = 0; i < 6; ++i)
{
NIS_UNROLL
for (int j = 0; j < 6; ++j)
{
p[i][j] = shPixelsY[start_idx + i * numPixelsX + j];
}
}
}
// compute discretized filter phase
const float fx = srcX - floor(srcX);
const float fy = srcY - floor(srcY);
const int fx_int = (int)(fx * kPhaseCount);
const int fy_int = (int)(fy * kPhaseCount);
// get traditional scaler filter output
const float pixel_n = FilterNormal(p, fx_int, fy_int);
// get directional filter bank output
float4 opDirYU = GetDirFilters(p, fx, fy, fx_int, fy_int);
// final luma is a weighted product of directional & normal filters
// generate weights for directional filters
const int kShift = (kSupportSize - 2) / 2;
float4 edge[2][2];
NIS_UNROLL
for (int i = 0; i < 2; i++)
{
NIS_UNROLL
for (int j = 0; j < 2; j++)
{
// need to shift edge map sampling since it's a 2x2 centered inside 6x6 grid
edge[i][j] = shEdgeMap[start_idx + (i + kShift) * numPixelsX + (j + kShift)];
}
}
const float4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;
// final pixel is a weighted sum filter outputs
const float opY = (opDirYU.x * w.x + opDirYU.y * w.y + opDirYU.z * w.z + opDirYU.w * w.w +
pixel_n * (NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w)) * (1.0f / NIS_SCALE_FLOAT);
// do bilinear tap for chroma upscaling
#if NIS_VIEWPORT_SUPPORT
float4 op = in_texture.SampleLevel(samplerLinearClamp, float2((srcX + kInputViewportOriginX) * kSrcNormX, (srcY + kInputViewportOriginY) * kSrcNormY), 0);
#else
float4 op = in_texture.SampleLevel(samplerLinearClamp, float2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY), 0);
#endif
#if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
// linear HDR: getY stored sqrt-compressed luma, so square the filtered
// value and scale RGB by the ratio to the original linear luma
const float kEps = 1e-4f;
const float kNorm = 1.0f / (NIS_SCALE_FLOAT * kHDRCompressionFactor);
const float opYN = max(opY, 0.0f) * kNorm;
const float corr = (opYN * opYN + kEps) / (max(getYLinear(float3(op.x, op.y, op.z)), 0.0f) + kEps);
op.x *= corr;
op.y *= corr;
op.z *= corr;
#else
// LDR/PQ: add the filtered-minus-original luma delta to each channel
const float corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(float3(op.x, op.y, op.z));
op.x += corr;
op.y += corr;
op.z += corr;
#endif
#if NIS_VIEWPORT_SUPPORT
out_texture[uint2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY)] = op;
#else
out_texture[uint2(dstX, dstY)] = op;
#endif
}
}
#else
// NVSharpen (sharpen-only) block/tile configuration.
#ifndef NIS_BLOCK_WIDTH
#define NIS_BLOCK_WIDTH 32
#endif
#ifndef NIS_BLOCK_HEIGHT
#define NIS_BLOCK_HEIGHT 32
#endif
#ifndef NIS_THREAD_GROUP_SIZE
#define NIS_THREAD_GROUP_SIZE 256
#endif
#define kSupportSize 5  // USM filter support in pixels
// loaded tile extent: block plus support padding (+1 presumably to round for
// 2x2 batch loads -- the loader is outside this chunk; confirm)
#define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1)
#define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1)
#define blockDim NIS_THREAD_GROUP_SIZE
// luma tile; appears to stay in [0, 1] here (see kEps scaling in CalcLTIFast)
groupshared float shPixelsY[kNumPixelsY][kNumPixelsX];
  696. float CalcLTIFast(const float y[5])
  697. {
  698. const float a_min = min(min(y[0], y[1]), y[2]);
  699. const float a_max = max(max(y[0], y[1]), y[2]);
  700. const float b_min = min(min(y[2], y[3]), y[4]);
  701. const float b_max = max(max(y[2], y[3]), y[4]);
  702. const float a_cont = a_max - a_min;
  703. const float b_cont = b_max - b_min;
  704. const float cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps * (1.0f / 255.0f));
  705. return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
  706. }
  707. float EvalUSM(const float pxl[5], const float sharpnessStrength, const float sharpnessLimit)
  708. {
  709. // USM profile
  710. float y_usm = -0.6001f * pxl[1] + 1.2002f * pxl[2] - 0.6001f * pxl[3];
  711. // boost USM profile
  712. y_usm *= sharpnessStrength;
  713. // clamp to the limit
  714. y_usm = min(sharpnessLimit, max(-sharpnessLimit, y_usm));
  715. // reduce ringing
  716. y_usm *= CalcLTIFast(pxl);
  717. return y_usm;
  718. }
  719. float4 GetDirUSM(const float p[5][5])
  720. {
  721. // sharpness boost & limit are the same for all directions
  722. const float scaleY = 1.0f - saturate((p[2][2] - kSharpStartY) * kSharpScaleY);
  723. // scale the ramp to sharpen as a function of luma
  724. const float sharpnessStrength = scaleY * kSharpStrengthScale + kSharpStrengthMin;
  725. // scale the ramp to limit USM as a function of luma
  726. const float sharpnessLimit = (scaleY * kSharpLimitScale + kSharpLimitMin) * p[2][2];
  727. float4 rval;
  728. // 0 deg filter
  729. float interp0Deg[5];
  730. {
  731. for (int i = 0; i < 5; ++i)
  732. {
  733. interp0Deg[i] = p[i][2];
  734. }
  735. }
  736. rval.x = EvalUSM(interp0Deg, sharpnessStrength, sharpnessLimit);
  737. // 90 deg filter
  738. float interp90Deg[5];
  739. {
  740. for (int i = 0; i < 5; ++i)
  741. {
  742. interp90Deg[i] = p[2][i];
  743. }
  744. }
  745. rval.y = EvalUSM(interp90Deg, sharpnessStrength, sharpnessLimit);
  746. //45 deg filter
  747. float interp45Deg[5];
  748. interp45Deg[0] = p[1][1];
  749. interp45Deg[1] = lerp(p[2][1], p[1][2], 0.5f);
  750. interp45Deg[2] = p[2][2];
  751. interp45Deg[3] = lerp(p[3][2], p[2][3], 0.5f);
  752. interp45Deg[4] = p[3][3];
  753. rval.z = EvalUSM(interp45Deg, sharpnessStrength, sharpnessLimit);
  754. //135 deg filter
  755. float interp135Deg[5];
  756. interp135Deg[0] = p[3][1];
  757. interp135Deg[1] = lerp(p[3][2], p[2][1], 0.5f);
  758. interp135Deg[2] = p[2][2];
  759. interp135Deg[3] = lerp(p[2][3], p[1][2], 0.5f);
  760. interp135Deg[4] = p[1][3];
  761. rval.w = EvalUSM(interp135Deg, sharpnessStrength, sharpnessLimit);
  762. return rval;
  763. }
  764. //-----------------------------------------------------------------------------------------------
  765. // NVSharpen
  766. //-----------------------------------------------------------------------------------------------
  767. void NVSharpen(uint2 blockIdx, uint threadIdx)
  768. {
  769. const int dstBlockX = NIS_BLOCK_WIDTH * blockIdx.x;
  770. const int dstBlockY = NIS_BLOCK_HEIGHT * blockIdx.y;
  771. // fill in input luma tile in batches of 2x2 pixels
  772. // we use texture gather to get extra support necessary
  773. // to compute 2x2 edge map outputs too
  774. const float kShift = 0.5f - kSupportSize / 2;
  775. for (uint i = threadIdx * 2; i < kNumPixelsX * kNumPixelsY / 2; i += blockDim * 2)
  776. {
  777. uint2 pos = uint2(i % kNumPixelsX, i / kNumPixelsX * 2);
  778. NIS_UNROLL
  779. for (int dy = 0; dy < 2; dy++)
  780. {
  781. NIS_UNROLL
  782. for (int dx = 0; dx < 2; dx++)
  783. {
  784. #if NIS_VIEWPORT_SUPPORT
  785. const float tx = (dstBlockX + pos.x + kInputViewportOriginX + dx + kShift) * kSrcNormX;
  786. const float ty = (dstBlockY + pos.y + kInputViewportOriginY + dy + kShift) * kSrcNormY;
  787. #else
  788. const float tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX;
  789. const float ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY;
  790. #endif
  791. const float3 px = in_texture.SampleLevel(samplerLinearClamp, float2(tx, ty), 0).xyz;
  792. shPixelsY[pos.y + dy][pos.x + dx] = getY(px);
  793. }
  794. }
  795. }
  796. GroupMemoryBarrierWithGroupSync();
  797. for (int k = threadIdx; k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += blockDim)
  798. {
  799. const int2 pos = int2(k % NIS_BLOCK_WIDTH, k / NIS_BLOCK_WIDTH);
  800. // load 5x5 support to regs
  801. float p[5][5];
  802. NIS_UNROLL
  803. for (int i = 0; i < 5; ++i)
  804. {
  805. NIS_UNROLL
  806. for (int j = 0; j < 5; ++j)
  807. {
  808. p[i][j] = shPixelsY[pos.y + i][pos.x + j];
  809. }
  810. }
  811. // get directional filter bank output
  812. const float4 dirUSM = GetDirUSM(p);
  813. // generate weights for directional filters
  814. float4 w = GetEdgeMap(p, kSupportSize / 2 - 1, kSupportSize / 2 - 1);
  815. // final USM is a weighted sum filter outputs
  816. const float usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w);
  817. // do bilinear tap and correct rgb texel so it produces new sharpened luma
  818. const int dstX = dstBlockX + pos.x;
  819. const int dstY = dstBlockY + pos.y;
  820. #if NIS_VIEWPORT_SUPPORT
  821. if (dstX > kOutputViewportWidth || dstY > kOutputViewportHeight)
  822. {
  823. return;
  824. }
  825. #endif
  826. #if NIS_VIEWPORT_SUPPORT
  827. float4 op = in_texture.SampleLevel(samplerLinearClamp, float2((dstX + kInputViewportOriginX) * kSrcNormX, (dstY + kInputViewportOriginY) * kSrcNormY), 0);
  828. #else
  829. float4 op = in_texture.SampleLevel(samplerLinearClamp, float2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY), 0);
  830. #endif
  831. #if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
  832. const float kEps = 1e-4f * kHDRCompressionFactor * kHDRCompressionFactor;
  833. float newY = p[2][2] + usmY;
  834. newY = max(newY, 0.0f);
  835. const float oldY = p[2][2];
  836. const float corr = (newY * newY + kEps) / (oldY * oldY + kEps);
  837. op.x *= corr;
  838. op.y *= corr;
  839. op.z *= corr;
  840. #else
  841. op.x += usmY;
  842. op.y += usmY;
  843. op.z += usmY;
  844. #endif
  845. #if NIS_VIEWPORT_SUPPORT
  846. out_texture[uint2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY)] = op;
  847. #else
  848. out_texture[uint2(dstX, dstY)] = op;
  849. #endif
  850. }
  851. }
  852. #endif