123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971 |
- #ifndef NIS_SCALER
- #define NIS_SCALER 1
- #endif
- #define NIS_HDR_MODE_NONE 0
- #define NIS_HDR_MODE_LINEAR 1
- #define NIS_HDR_MODE_PQ 2
- #ifndef NIS_HDR_MODE
- #define NIS_HDR_MODE NIS_HDR_MODE_NONE
- #endif
- #define kHDRCompressionFactor 0.282842712f
- #ifndef NIS_VIEWPORT_SUPPORT
- #define NIS_VIEWPORT_SUPPORT 0
- #endif
- #if NIS_HLSL==0 && !defined(NIS_GLSL)
- #define NIS_GLSL 1
- #endif
- #if NIS_HLSL_6_2 || (!NIS_GLSL && !NIS_HLSL)
- #if defined(NIS_HLSL)
- #undef NIS_HLSL
- #endif
- #define NIS_HLSL 1
- #endif
- #if NIS_HLSL && NIS_GLSL
- #undef NIS_GLSL
- #define NIS_GLSL 0
- #endif
- #ifndef NIS_USE_HALF_PRECISION
- #define NIS_USE_HALF_PRECISION 0
- #endif
- #if NIS_HLSL
- #define NVF float
- #define NVF2 float2
- #define NVF3 float3
- #define NVF4 float4
- #define NVI int
- #define NVI2 int2
- #define NVU uint
- #define NVU2 uint2
- #if NIS_USE_HALF_PRECISION
- #if NIS_HLSL_6_2
- #define NVH float16_t
- #define NVH2 float16_t2
- #define NVH3 float16_t3
- #define NVH4 float16_t4
- #else
- #define NVH min16float
- #define NVH2 min16float2
- #define NVH3 min16float3
- #define NVH4 min16float4
- #endif
- #else
- #define NVH NVF
- #define NVH2 NVF2
- #define NVH3 NVF3
- #define NVH4 NVF4
- #endif
- #define NVSHARED groupshared
- #define NVTEX_LOAD(x, pos) x[pos]
- #define NVTEX_SAMPLE(x, sampler, pos) x.SampleLevel(sampler, pos, 0)
- #define NVTEX_SAMPLE_RED(x, sampler, pos) x.GatherRed(sampler, pos)
- #define NVTEX_SAMPLE_GREEN(x, sampler, pos) x.GatherGreen(sampler, pos)
- #define NVTEX_SAMPLE_BLUE(x, sampler, pos) x.GatherBlue(sampler, pos)
- #define NVTEX_STORE(x, pos, v) x[pos] = v
- #ifndef NIS_UNROLL
- #define NIS_UNROLL [unroll]
- #endif
- #endif
- #if NIS_GLSL
- #define NVF float
- #define NVF2 vec2
- #define NVF3 vec3
- #define NVF4 vec4
- #define NVI int
- #define NVI2 ivec2
- #define NVU uint
- #define NVU2 uvec2
- #if NIS_USE_HALF_PRECISION
- #define NVH float16_t
- #define NVH2 f16vec2
- #define NVH3 f16vec3
- #define NVH4 f16vec4
- #else
- #define NVH NVF
- #define NVH2 NVF2
- #define NVH3 NVF3
- #define NVH4 NVF4
- #endif
- #define NVSHARED shared
- #define NVTEX_LOAD(x, pos) texelFetch(sampler2D(x, samplerLinearClamp), pos, 0)
- #define NVTEX_SAMPLE(x, sampler, pos) textureLod(sampler2D(x, sampler), pos, 0)
- #define NVTEX_SAMPLE_RED(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 0)
- #define NVTEX_SAMPLE_GREEN(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 1)
- #define NVTEX_SAMPLE_BLUE(x, sampler, pos) textureGather(sampler2D(x, sampler), pos, 2)
- #define NVTEX_STORE(x, pos, v) imageStore(x, NVI2(pos), v)
- #define saturate(x) clamp(x, 0, 1)
- #define lerp(a, b, x) mix(a, b, x)
- #define GroupMemoryBarrierWithGroupSync() groupMemoryBarrier(); barrier()
- #ifndef NIS_UNROLL
- #define NIS_UNROLL
- #endif
- #endif
- #ifndef NIS_TEXTURE_GATHER
- #define NIS_TEXTURE_GATHER 0
- #endif
- #define NIS_SCALE_INT 1
- #define NIS_SCALE_FLOAT NVF(1.f)
- NVF getY(NVF3 rgba)
- {
- #if NIS_HDR_MODE == NIS_HDR_MODE_PQ
- return NVF(0.262f) * rgba.x + NVF(0.678f) * rgba.y + NVF(0.0593f) * rgba.z;
- #elif NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
- return sqrt(NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z) * kHDRCompressionFactor;
- #else
- return NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z;
- #endif
- }
- NVF getYLinear(NVF3 rgba)
- {
- return NVF(0.2126f) * rgba.x + NVF(0.7152f) * rgba.y + NVF(0.0722f) * rgba.z;
- }
- #if NIS_SCALER
- NVF4 GetEdgeMap(NVF p[4][4], NVI i, NVI j)
- #else
- NVF4 GetEdgeMap(NVF p[5][5], NVI i, NVI j)
- #endif
- {
- const NVF g_0 = abs(p[0 + i][0 + j] + p[0 + i][1 + j] + p[0 + i][2 + j] - p[2 + i][0 + j] - p[2 + i][1 + j] - p[2 + i][2 + j]);
- const NVF g_45 = abs(p[1 + i][0 + j] + p[0 + i][0 + j] + p[0 + i][1 + j] - p[2 + i][1 + j] - p[2 + i][2 + j] - p[1 + i][2 + j]);
- const NVF g_90 = abs(p[0 + i][0 + j] + p[1 + i][0 + j] + p[2 + i][0 + j] - p[0 + i][2 + j] - p[1 + i][2 + j] - p[2 + i][2 + j]);
- const NVF g_135 = abs(p[1 + i][0 + j] + p[2 + i][0 + j] + p[2 + i][1 + j] - p[0 + i][1 + j] - p[0 + i][2 + j] - p[1 + i][2 + j]);
- const NVF g_0_90_max = max(g_0, g_90);
- const NVF g_0_90_min = min(g_0, g_90);
- const NVF g_45_135_max = max(g_45, g_135);
- const NVF g_45_135_min = min(g_45, g_135);
- NVF e_0_90 = 0;
- NVF e_45_135 = 0;
- if ((g_0_90_max + g_45_135_max) != 0)
- {
- e_0_90 = g_0_90_max / (g_0_90_max + g_45_135_max);
- e_0_90 = min(e_0_90, 1.0f);
- e_45_135 = 1.0f - e_0_90;
- }
- NVF e = ((g_0_90_max > (g_0_90_min * kDetectRatio)) && (g_0_90_max > kDetectThres) && (g_0_90_max > g_45_135_min)) ? 1.f : 0.f;
- NVF edge_0 = (g_0_90_max == g_0) ? e : 0.f;
- NVF edge_90 = (g_0_90_max == g_0) ? 0.f : e;
- e = ((g_45_135_max > (g_45_135_min * kDetectRatio)) && (g_45_135_max > kDetectThres) && (g_45_135_max > g_0_90_min)) ? 1.f : 0.f;
- NVF edge_45 = (g_45_135_max == g_45) ? e : 0.f;
- NVF edge_135 = (g_45_135_max == g_45) ? 0.f : e;
- NVF weight_0 = 0.f;
- NVF weight_90 = 0.f;
- NVF weight_45 = 0.f;
- NVF weight_135 = 0.f;
- if ((edge_0 + edge_90 + edge_45 + edge_135) >= 2.0f)
- {
- weight_0 = (edge_0 == 1.0f) ? e_0_90 : 0.f;
- weight_90 = (edge_0 == 1.0f) ? 0.f : e_0_90;
- weight_45 = (edge_45 == 1.0f) ? e_45_135 : 0.f;
- weight_135 = (edge_45 == 1.0f) ? 0.f : e_45_135;
- }
- else if ((edge_0 + edge_90 + edge_45 + edge_135) >= 1.0f)
- {
- weight_0 = edge_0;
- weight_90 = edge_90;
- weight_45 = edge_45;
- weight_135 = edge_135;
- }
- return NVF4(weight_0, weight_90, weight_45, weight_135);
- }
- #if NIS_SCALER
- #ifndef NIS_BLOCK_WIDTH
- #define NIS_BLOCK_WIDTH 32
- #endif
- #ifndef NIS_BLOCK_HEIGHT
- #define NIS_BLOCK_HEIGHT 24
- #endif
- #ifndef NIS_THREAD_GROUP_SIZE
- #define NIS_THREAD_GROUP_SIZE 256
- #endif
- #define kPhaseCount 64
- #define kFilterSize 6
- #define kSupportSize 6
- #define kPadSize kSupportSize
- #define kTilePitch (NIS_BLOCK_WIDTH + kPadSize)
- #define kTileSize (kTilePitch * (NIS_BLOCK_HEIGHT + kPadSize))
- #define kEdgeMapPitch (NIS_BLOCK_WIDTH + 2)
- #define kEdgeMapSize (kEdgeMapPitch * (NIS_BLOCK_HEIGHT + 2))
- NVSHARED NVF shPixelsY[kTileSize];
- NVSHARED NVH shCoefScaler[kPhaseCount][kFilterSize];
- NVSHARED NVH shCoefUSM[kPhaseCount][kFilterSize];
- NVSHARED NVH4 shEdgeMap[kEdgeMapSize];
- void LoadFilterBanksSh(NVI i0, NVI di) {
-
-
- for (NVI i = i0; i < kPhaseCount * 2; i += di)
- {
- NVI phase = i / 2;
- NVI vIdx = i & 1;
- NVH4 v = NVH4(NVTEX_LOAD(coef_scaler, NVI2(vIdx, phase)));
- NVI filterOffset = vIdx * 4;
- shCoefScaler[phase][filterOffset + 0] = v.x;
- shCoefScaler[phase][filterOffset + 1] = v.y;
- if (vIdx == 0)
- {
- shCoefScaler[phase][2] = v.z;
- shCoefScaler[phase][3] = v.w;
- }
- v = NVH4(NVTEX_LOAD(coef_usm, NVI2(vIdx, phase)));
- shCoefUSM[phase][filterOffset + 0] = v.x;
- shCoefUSM[phase][filterOffset + 1] = v.y;
- if (vIdx == 0)
- {
- shCoefUSM[phase][2] = v.z;
- shCoefUSM[phase][3] = v.w;
- }
- }
- }
- NVF CalcLTI(NVF p0, NVF p1, NVF p2, NVF p3, NVF p4, NVF p5, NVI phase_index)
- {
- const bool selector = (phase_index <= kPhaseCount / 2);
- NVF sel = selector ? p0 : p3;
- const NVF a_min = min(min(p1, p2), sel);
- const NVF a_max = max(max(p1, p2), sel);
- sel = selector ? p2 : p5;
- const NVF b_min = min(min(p3, p4), sel);
- const NVF b_max = max(max(p3, p4), sel);
- const NVF a_cont = a_max - a_min;
- const NVF b_cont = b_max - b_min;
- const NVF cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps);
- return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
- }
- NVF4 GetInterpEdgeMap(const NVF4 edge[2][2], NVF phase_frac_x, NVF phase_frac_y)
- {
- NVF4 h0 = lerp(edge[0][0], edge[0][1], phase_frac_x);
- NVF4 h1 = lerp(edge[1][0], edge[1][1], phase_frac_x);
- return lerp(h0, h1, phase_frac_y);
- }
- NVF EvalPoly6(const NVF pxl[6], NVI phase_int)
- {
- NVF y = 0.f;
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- y += shCoefScaler[phase_int][i] * pxl[i];
- }
- }
- NVF y_usm = 0.f;
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- y_usm += shCoefUSM[phase_int][i] * pxl[i];
- }
- }
-
- const NVF y_scale = 1.0f - saturate((y * (1.0f / NIS_SCALE_FLOAT) - kSharpStartY) * kSharpScaleY);
-
- const NVF y_sharpness = y_scale * kSharpStrengthScale + kSharpStrengthMin;
- y_usm *= y_sharpness;
-
- const NVF y_sharpness_limit = (y_scale * kSharpLimitScale + kSharpLimitMin) * y;
- y_usm = min(y_sharpness_limit, max(-y_sharpness_limit, y_usm));
-
- y_usm *= CalcLTI(pxl[0], pxl[1], pxl[2], pxl[3], pxl[4], pxl[5], phase_int);
- return y + y_usm;
- }
- NVF FilterNormal(const NVF p[6][6], NVI phase_x_frac_int, NVI phase_y_frac_int)
- {
- NVF h_acc = 0.0f;
- NIS_UNROLL
- for (NVI j = 0; j < 6; ++j)
- {
- NVF v_acc = 0.0f;
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- v_acc += p[i][j] * shCoefScaler[phase_y_frac_int][i];
- }
- h_acc += v_acc * shCoefScaler[phase_x_frac_int][j];
- }
-
- return h_acc;
- }
- NVF4 GetDirFilters(NVF p[6][6], NVF phase_x_frac, NVF phase_y_frac, NVI phase_x_frac_int, NVI phase_y_frac_int)
- {
- NVF4 f;
-
- NVF interp0Deg[6];
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- interp0Deg[i] = lerp(p[i][2], p[i][3], phase_x_frac);
- }
- }
- f.x = EvalPoly6(interp0Deg, phase_y_frac_int);
-
- NVF interp90Deg[6];
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- interp90Deg[i] = lerp(p[2][i], p[3][i], phase_y_frac);
- }
- }
- f.y = EvalPoly6(interp90Deg, phase_x_frac_int);
-
- NVF pphase_b45;
- pphase_b45 = 0.5f + 0.5f * (phase_x_frac - phase_y_frac);
- NVF temp_interp45Deg[7];
- temp_interp45Deg[1] = lerp(p[2][1], p[1][2], pphase_b45);
- temp_interp45Deg[3] = lerp(p[3][2], p[2][3], pphase_b45);
- temp_interp45Deg[5] = lerp(p[4][3], p[3][4], pphase_b45);
- {
- pphase_b45 = pphase_b45 - 0.5f;
- NVF a = (pphase_b45 >= 0.f) ? p[0][2] : p[2][0];
- NVF b = (pphase_b45 >= 0.f) ? p[1][3] : p[3][1];
- NVF c = (pphase_b45 >= 0.f) ? p[2][4] : p[4][2];
- NVF d = (pphase_b45 >= 0.f) ? p[3][5] : p[5][3];
- temp_interp45Deg[0] = lerp(p[1][1], a, abs(pphase_b45));
- temp_interp45Deg[2] = lerp(p[2][2], b, abs(pphase_b45));
- temp_interp45Deg[4] = lerp(p[3][3], c, abs(pphase_b45));
- temp_interp45Deg[6] = lerp(p[4][4], d, abs(pphase_b45));
- }
- NVF interp45Deg[6];
- NVF pphase_p45 = phase_x_frac + phase_y_frac;
- if (pphase_p45 >= 1)
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; i++)
- {
- interp45Deg[i] = temp_interp45Deg[i + 1];
- }
- pphase_p45 = pphase_p45 - 1;
- }
- else
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; i++)
- {
- interp45Deg[i] = temp_interp45Deg[i];
- }
- }
- f.z = EvalPoly6(interp45Deg, NVI(pphase_p45 * 64));
-
- NVF pphase_b135;
- pphase_b135 = 0.5f * (phase_x_frac + phase_y_frac);
- NVF temp_interp135Deg[7];
- temp_interp135Deg[1] = lerp(p[3][1], p[4][2], pphase_b135);
- temp_interp135Deg[3] = lerp(p[2][2], p[3][3], pphase_b135);
- temp_interp135Deg[5] = lerp(p[1][3], p[2][4], pphase_b135);
- {
- pphase_b135 = pphase_b135 - 0.5f;
- NVF a = (pphase_b135 >= 0.f) ? p[5][2] : p[3][0];
- NVF b = (pphase_b135 >= 0.f) ? p[4][3] : p[2][1];
- NVF c = (pphase_b135 >= 0.f) ? p[3][4] : p[1][2];
- NVF d = (pphase_b135 >= 0.f) ? p[2][5] : p[0][3];
- temp_interp135Deg[0] = lerp(p[4][1], a, abs(pphase_b135));
- temp_interp135Deg[2] = lerp(p[3][2], b, abs(pphase_b135));
- temp_interp135Deg[4] = lerp(p[2][3], c, abs(pphase_b135));
- temp_interp135Deg[6] = lerp(p[1][4], d, abs(pphase_b135));
- }
- NVF interp135Deg[6];
- NVF pphase_p135 = 1 + (phase_x_frac - phase_y_frac);
- if (pphase_p135 >= 1)
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- interp135Deg[i] = temp_interp135Deg[i + 1];
- }
- pphase_p135 = pphase_p135 - 1;
- }
- else
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- interp135Deg[i] = temp_interp135Deg[i];
- }
- }
- f.w = EvalPoly6(interp135Deg, NVI(pphase_p135 * 64));
- return f;
- }
- void NVScaler(NVU2 blockIdx, NVU threadIdx)
- {
-
- NVI dstBlockX = NVI(NIS_BLOCK_WIDTH * blockIdx.x);
- NVI dstBlockY = NVI(NIS_BLOCK_HEIGHT * blockIdx.y);
- const NVI srcBlockStartX = NVI(floor((dstBlockX + 0.5f) * kScaleX - 0.5f));
- const NVI srcBlockStartY = NVI(floor((dstBlockY + 0.5f) * kScaleY - 0.5f));
- const NVI srcBlockEndX = NVI(ceil((dstBlockX + NIS_BLOCK_WIDTH + 0.5f) * kScaleX - 0.5f));
- const NVI srcBlockEndY = NVI(ceil((dstBlockY + NIS_BLOCK_HEIGHT + 0.5f) * kScaleY - 0.5f));
- NVI numTilePixelsX = srcBlockEndX - srcBlockStartX + kSupportSize - 1;
- NVI numTilePixelsY = srcBlockEndY - srcBlockStartY + kSupportSize - 1;
-
- numTilePixelsX += numTilePixelsX & 0x1;
- numTilePixelsY += numTilePixelsY & 0x1;
- const NVI numTilePixels = numTilePixelsX * numTilePixelsY;
-
- const NVI numEdgeMapPixelsX = numTilePixelsX - kSupportSize + 2;
- const NVI numEdgeMapPixelsY = numTilePixelsY - kSupportSize + 2;
- const NVI numEdgeMapPixels = numEdgeMapPixelsX * numEdgeMapPixelsY;
-
-
-
- {
- for (NVI i = NVI(threadIdx) * 2; i < numTilePixels / 2; i += NIS_THREAD_GROUP_SIZE * 2)
- {
- NVI py = (i / numTilePixelsX) * 2;
- NVI px = i % numTilePixelsX;
-
-
-
- NVF kShift = 0.5f - 1.0f - (kSupportSize - 1) / 2;
- #if NIS_VIEWPORT_SUPPORT
- const NVF tx = (srcBlockStartX + px + kInputViewportOriginX + kShift) * kSrcNormX;
- const NVF ty = (srcBlockStartY + py + kInputViewportOriginY + kShift) * kSrcNormY;
- #else
- const NVF tx = (srcBlockStartX + px + kShift) * kSrcNormX;
- const NVF ty = (srcBlockStartY + py + kShift) * kSrcNormY;
- #endif
- NVF p[2][2];
- #if NIS_TEXTURE_GATHER
- {
- const NVF4 sr = NVTEX_SAMPLE_RED(in_texture, samplerLinearClamp, NVF2(tx, ty));
- const NVF4 sg = NVTEX_SAMPLE_GREEN(in_texture, samplerLinearClamp, NVF2(tx, ty));
- const NVF4 sb = NVTEX_SAMPLE_BLUE(in_texture, samplerLinearClamp, NVF2(tx, ty));
- p[0][0] = getY(NVF3(sr.w, sg.w, sb.w));
- p[0][1] = getY(NVF3(sr.z, sg.z, sb.z));
- p[1][0] = getY(NVF3(sr.x, sg.x, sb.x));
- p[1][1] = getY(NVF3(sr.y, sg.y, sb.y));
- }
- #else
- NIS_UNROLL
- for (NVI j = 0; j < 2; j++)
- {
- NIS_UNROLL
- for (NVI k = 0; k < 2; k++)
- {
- const NVF4 px = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2(tx + k * kSrcNormX, ty + j * kSrcNormY));
- p[j][k] = getY(px.xyz);
- }
- }
- #endif
- const NVI idx = py * kTilePitch + px;
- shPixelsY[idx] = NVH(p[0][0]);
- shPixelsY[idx + 1] = NVH(p[0][1]);
- shPixelsY[idx + kTilePitch] = NVH(p[1][0]);
- shPixelsY[idx + kTilePitch + 1] = NVH(p[1][1]);
- }
- }
- GroupMemoryBarrierWithGroupSync();
- {
-
- for (NVI i = NVI(threadIdx) * 2; i < numEdgeMapPixels / 2; i += NIS_THREAD_GROUP_SIZE * 2)
- {
- NVI py = (i / numEdgeMapPixelsX) * 2;
- NVI px = i % numEdgeMapPixelsX;
- const NVI edgeMapIdx = py * kEdgeMapPitch + px;
- NVI tileCornerIdx = (py+1) * kTilePitch + px + 1;
- NVF p[4][4];
- NIS_UNROLL
- for (NVI j = 0; j < 4; j++)
- {
- NIS_UNROLL
- for (NVI k = 0; k < 4; k++)
- {
- p[j][k] = shPixelsY[tileCornerIdx + j * kTilePitch + k];
- }
- }
- shEdgeMap[edgeMapIdx] = NVH4(GetEdgeMap(p, 0, 0));
- shEdgeMap[edgeMapIdx + 1] = NVH4(GetEdgeMap(p, 0, 1));
- shEdgeMap[edgeMapIdx + kEdgeMapPitch] = NVH4(GetEdgeMap(p, 1, 0));
- shEdgeMap[edgeMapIdx + kEdgeMapPitch + 1] = NVH4(GetEdgeMap(p, 1, 1));
- }
- }
- LoadFilterBanksSh(NVI(threadIdx), NIS_THREAD_GROUP_SIZE);
- GroupMemoryBarrierWithGroupSync();
- for (NVI k = NVI(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE)
- {
- const NVI2 pos = NVI2(k % NIS_BLOCK_WIDTH, k / NIS_BLOCK_WIDTH);
- const NVI dstX = dstBlockX + pos.x;
- const NVI dstY = dstBlockY + pos.y;
- const NVF srcX = (0.5f + dstX) * kScaleX - 0.5f;
- const NVF srcY = (0.5f + dstY) * kScaleY - 0.5f;
- #if NIS_VIEWPORT_SUPPORT
- if (srcX > kInputViewportWidth || srcY > kInputViewportHeight ||
- dstX > kOutputViewportWidth || dstY > kOutputViewportHeight)
- {
- return;
- }
- #endif
- const NVI px = NVI(floor(srcX) - srcBlockStartX);
- const NVI py = NVI(floor(srcY) - srcBlockStartY);
- const NVI startTileIdx = py * kTilePitch + px;
-
- NVF p[6][6];
- {
- NIS_UNROLL
- for (NVI i = 0; i < 6; ++i)
- {
- NIS_UNROLL
- for (NVI j = 0; j < 6; ++j)
- {
- p[i][j] = shPixelsY[startTileIdx + i * kTilePitch + j];
- }
- }
- }
-
- const NVF fx = srcX - floor(srcX);
- const NVF fy = srcY - floor(srcY);
- const NVI fx_int = NVI(fx * kPhaseCount);
- const NVI fy_int = NVI(fy * kPhaseCount);
-
- const NVF pixel_n = FilterNormal(p, fx_int, fy_int);
-
- NVF4 opDirYU = GetDirFilters(p, fx, fy, fx_int, fy_int);
-
-
- const NVI startEdgeMapIdx = py * kEdgeMapPitch + px;
- NVF4 edge[2][2];
- NIS_UNROLL
- for (NVI i = 0; i < 2; i++)
- {
- NIS_UNROLL
- for (NVI j = 0; j < 2; j++)
- {
-
- edge[i][j] = shEdgeMap[startEdgeMapIdx + (i * kEdgeMapPitch) + j];
- }
- }
- const NVF4 w = GetInterpEdgeMap(edge, fx, fy) * NIS_SCALE_INT;
-
- const NVF opY = (opDirYU.x * w.x + opDirYU.y * w.y + opDirYU.z * w.z + opDirYU.w * w.w +
- pixel_n * (NIS_SCALE_FLOAT - w.x - w.y - w.z - w.w)) * (1.0f / NIS_SCALE_FLOAT);
-
- #if NIS_VIEWPORT_SUPPORT
- NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((srcX + kInputViewportOriginX) * kSrcNormX, (srcY + kInputViewportOriginY) * kSrcNormY));
- #else
- NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY));
- #endif
- #if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
- const NVF kEps = 1e-4f;
- const NVF kNorm = 1.0f / (NIS_SCALE_FLOAT * kHDRCompressionFactor);
- const NVF opYN = max(opY, 0.0f) * kNorm;
- const NVF corr = (opYN * opYN + kEps) / (max(getYLinear(NVF3(op.x, op.y, op.z)), 0.0f) + kEps);
- op.x *= corr;
- op.y *= corr;
- op.z *= corr;
- #else
- const NVF corr = opY * (1.0f / NIS_SCALE_FLOAT) - getY(NVF3(op.x, op.y, op.z));
- op.x += corr;
- op.y += corr;
- op.z += corr;
- #endif
- #if NIS_VIEWPORT_SUPPORT
- NVTEX_STORE(out_texture, NVU2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY), op);
- #else
- NVTEX_STORE(out_texture, NVU2(dstX, dstY), op);
- #endif
- }
- }
- #else
- #ifndef NIS_BLOCK_WIDTH
- #define NIS_BLOCK_WIDTH 32
- #endif
- #ifndef NIS_BLOCK_HEIGHT
- #define NIS_BLOCK_HEIGHT 32
- #endif
- #ifndef NIS_THREAD_GROUP_SIZE
- #define NIS_THREAD_GROUP_SIZE 256
- #endif
- #define kSupportSize 5
- #define kNumPixelsX (NIS_BLOCK_WIDTH + kSupportSize + 1)
- #define kNumPixelsY (NIS_BLOCK_HEIGHT + kSupportSize + 1)
- NVSHARED NVF shPixelsY[kNumPixelsY][kNumPixelsX];
- NVF CalcLTIFast(const NVF y[5])
- {
- const NVF a_min = min(min(y[0], y[1]), y[2]);
- const NVF a_max = max(max(y[0], y[1]), y[2]);
- const NVF b_min = min(min(y[2], y[3]), y[4]);
- const NVF b_max = max(max(y[2], y[3]), y[4]);
- const NVF a_cont = a_max - a_min;
- const NVF b_cont = b_max - b_min;
- const NVF cont_ratio = max(a_cont, b_cont) / (min(a_cont, b_cont) + kEps * (1.0f / NIS_SCALE_FLOAT));
- return (1.0f - saturate((cont_ratio - kMinContrastRatio) * kRatioNorm)) * kContrastBoost;
- }
- NVF EvalUSM(const NVF pxl[5], const NVF sharpnessStrength, const NVF sharpnessLimit)
- {
-
- NVF y_usm = -0.6001f * pxl[1] + 1.2002f * pxl[2] - 0.6001f * pxl[3];
-
- y_usm *= sharpnessStrength;
-
- y_usm = min(sharpnessLimit, max(-sharpnessLimit, y_usm));
-
- y_usm *= CalcLTIFast(pxl);
- return y_usm;
- }
- NVF4 GetDirUSM(const NVF p[5][5])
- {
-
- const NVF scaleY = 1.0f - saturate((p[2][2] - kSharpStartY) * kSharpScaleY);
-
- const NVF sharpnessStrength = scaleY * kSharpStrengthScale + kSharpStrengthMin;
-
- const NVF sharpnessLimit = (scaleY * kSharpLimitScale + kSharpLimitMin) * p[2][2];
- NVF4 rval;
-
- NVF interp0Deg[5];
- {
- for (NVI i = 0; i < 5; ++i)
- {
- interp0Deg[i] = p[i][2];
- }
- }
- rval.x = EvalUSM(interp0Deg, sharpnessStrength, sharpnessLimit);
-
- NVF interp90Deg[5];
- {
- for (NVI i = 0; i < 5; ++i)
- {
- interp90Deg[i] = p[2][i];
- }
- }
- rval.y = EvalUSM(interp90Deg, sharpnessStrength, sharpnessLimit);
-
- NVF interp45Deg[5];
- interp45Deg[0] = p[1][1];
- interp45Deg[1] = lerp(p[2][1], p[1][2], 0.5f);
- interp45Deg[2] = p[2][2];
- interp45Deg[3] = lerp(p[3][2], p[2][3], 0.5f);
- interp45Deg[4] = p[3][3];
- rval.z = EvalUSM(interp45Deg, sharpnessStrength, sharpnessLimit);
-
- NVF interp135Deg[5];
- interp135Deg[0] = p[3][1];
- interp135Deg[1] = lerp(p[3][2], p[2][1], 0.5f);
- interp135Deg[2] = p[2][2];
- interp135Deg[3] = lerp(p[2][3], p[1][2], 0.5f);
- interp135Deg[4] = p[1][3];
- rval.w = EvalUSM(interp135Deg, sharpnessStrength, sharpnessLimit);
- return rval;
- }
- void NVSharpen(NVU2 blockIdx, NVU threadIdx)
- {
- const NVI dstBlockX = NVI(NIS_BLOCK_WIDTH * blockIdx.x);
- const NVI dstBlockY = NVI(NIS_BLOCK_HEIGHT * blockIdx.y);
-
-
-
- const NVF kShift = 0.5f - kSupportSize / 2;
- for (NVI i = NVI(threadIdx) * 2; i < kNumPixelsX * kNumPixelsY / 2; i += NIS_THREAD_GROUP_SIZE * 2)
- {
- NVU2 pos = NVU2(i % kNumPixelsX, i / kNumPixelsX * 2);
- NIS_UNROLL
- for (NVI dy = 0; dy < 2; dy++)
- {
- NIS_UNROLL
- for (NVI dx = 0; dx < 2; dx++)
- {
- #if NIS_VIEWPORT_SUPPORT
- const NVF tx = (dstBlockX + pos.x + kInputViewportOriginX + dx + kShift) * kSrcNormX;
- const NVF ty = (dstBlockY + pos.y + kInputViewportOriginY + dy + kShift) * kSrcNormY;
- #else
- const NVF tx = (dstBlockX + pos.x + dx + kShift) * kSrcNormX;
- const NVF ty = (dstBlockY + pos.y + dy + kShift) * kSrcNormY;
- #endif
- const NVF4 px = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2(tx, ty));
- shPixelsY[pos.y + dy][pos.x + dx] = getY(px.xyz);
- }
- }
- }
- GroupMemoryBarrierWithGroupSync();
- for (NVI k = NVI(threadIdx); k < NIS_BLOCK_WIDTH * NIS_BLOCK_HEIGHT; k += NIS_THREAD_GROUP_SIZE)
- {
- const NVI2 pos = NVI2(k % NIS_BLOCK_WIDTH, k / NIS_BLOCK_WIDTH);
-
- NVF p[5][5];
- NIS_UNROLL
- for (NVI i = 0; i < 5; ++i)
- {
- NIS_UNROLL
- for (NVI j = 0; j < 5; ++j)
- {
- p[i][j] = shPixelsY[pos.y + i][pos.x + j];
- }
- }
-
- NVF4 dirUSM = GetDirUSM(p);
-
- NVF4 w = GetEdgeMap(p, kSupportSize / 2 - 1, kSupportSize / 2 - 1);
-
- const NVF usmY = (dirUSM.x * w.x + dirUSM.y * w.y + dirUSM.z * w.z + dirUSM.w * w.w);
-
- const NVI dstX = dstBlockX + pos.x;
- const NVI dstY = dstBlockY + pos.y;
- #if NIS_VIEWPORT_SUPPORT
- if (dstX > kOutputViewportWidth || dstY > kOutputViewportHeight)
- {
- return;
- }
- #endif
- #if NIS_VIEWPORT_SUPPORT
- NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + kInputViewportOriginX) * kSrcNormX, (dstY + kInputViewportOriginY) * kSrcNormY));
- #else
- NVF4 op = NVTEX_SAMPLE(in_texture, samplerLinearClamp, NVF2((dstX + 0.5f) * kDstNormX, (dstY + 0.5f) * kDstNormY));
- #endif
- #if NIS_HDR_MODE == NIS_HDR_MODE_LINEAR
- const NVF kEps = 1e-4f * kHDRCompressionFactor * kHDRCompressionFactor;
- NVF newY = p[2][2] + usmY;
- newY = max(newY, 0.0f);
- const NVF oldY = p[2][2];
- const NVF corr = (newY * newY + kEps) / (oldY * oldY + kEps);
- op.x *= corr;
- op.y *= corr;
- op.z *= corr;
- #else
- op.x += usmY;
- op.y += usmY;
- op.z += usmY;
- #endif
- #if NIS_VIEWPORT_SUPPORT
- NVTEX_STORE(out_texture, NVU2(dstX + kOutputViewportOriginX, dstY + kOutputViewportOriginY), op);
- #else
- NVTEX_STORE(out_texture, NVU2(dstX, dstY), op);
- #endif
- }
- }
- #endif
|