diff --git a/Minecraft.Client/GameRenderer.cpp b/Minecraft.Client/GameRenderer.cpp index 9e38d93b..15a4362f 100644 --- a/Minecraft.Client/GameRenderer.cpp +++ b/Minecraft.Client/GameRenderer.cpp @@ -52,6 +52,10 @@ #include "TextureAtlas.h" #include "Common/PostProcesser.h" +#ifdef __AVX__ +#include +#endif + bool GameRenderer::anaglyph3d = false; int GameRenderer::anaglyphPass = 0; @@ -852,6 +856,18 @@ void GameRenderer::updateLightTexture(float a) { CachePlayerGammas(); +#ifdef __AVX__ + const float amount = darkenWorldAmountO + (darkenWorldAmount - darkenWorldAmountO) * a; + const __m256 vAmount = _mm256_set1_ps(amount); + const __m256 vOneMinusAmount = _mm256_set1_ps(1.0f - amount); + + const __m256 v096 = _mm256_set1_ps(0.96f); + const __m256 v003 = _mm256_set1_ps(0.03f); + const __m256 vZero = _mm256_set1_ps(0.0f); + const __m256 vOne = _mm256_set1_ps(1.0f); + const __m256 v255 = _mm256_set1_ps(255.0f); +#endif + for (int j = 0; j < XUSER_MAX_COUNT; j++) { shared_ptr player = Minecraft::GetInstance()->localplayers[j]; @@ -861,6 +877,97 @@ void GameRenderer::updateLightTexture(float a) Level *level = player->level; const float skyDarken1 = level->getSkyDarken(1.0f); +#ifdef __AVX__ + const float darken = skyDarken1 * 0.95f + 0.05f; + const float blockMult = (blr * 0.1f + 1.5f); + const float skyWeight = skyDarken1 * 0.65f + 0.35f; + + const __m256 vDarken = _mm256_set1_ps(darken); + const __m256 vSkyWeight = _mm256_set1_ps(skyWeight); + const __m256 vBlockMult = _mm256_set1_ps(blockMult); + const bool skyFlash = level->skyFlashTime < 0; + const bool isEnd = level->dimension->id == 1; + const bool hasNV = player->hasEffect(MobEffect::nightVision); + const float nvScale = hasNV ? getNightVisionScale(player, a) : 0.0f; + const __m256 vNVScale = _mm256_set1_ps(nvScale); + + for (int i = 0; i < 256; i += 8) + { + alignas(32) float sRamp[8], bRamp[8]; + for(int k=0; k<8; ++k) { + sRamp[k] = level->dimension->brightnessRamp[(i + k) / 16]; + bRamp[k] = level->dimension->brightnessRamp[(i + k) % 16]; + } + + __m256 vSkyRamp = _mm256_load_ps(sRamp); + __m256 vBlockRamp = _mm256_load_ps(bRamp); + + __m256 vSky = skyFlash ? vSkyRamp : _mm256_mul_ps(vSkyRamp, vDarken); + __m256 vBlock = _mm256_mul_ps(vBlockRamp, vBlockMult); + + __m256 vRs = _mm256_mul_ps(vSky, vSkyWeight); + __m256 vGs = vRs; + __m256 vBs = vSky; + + __m256 vRb = vBlock; + __m256 vGb = _mm256_mul_ps(vBlock, _mm256_add_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(vBlock, _mm256_set1_ps(0.6f)), _mm256_set1_ps(0.4f)), _mm256_set1_ps(0.6f)), _mm256_set1_ps(0.4f))); + __m256 vBb = _mm256_mul_ps(vBlock, _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(vBlock, vBlock), _mm256_set1_ps(0.6f)), _mm256_set1_ps(0.4f))); + + __m256 vR = _mm256_add_ps(vRs, vRb); + __m256 vG = _mm256_add_ps(vGs, vGb); + __m256 vB = _mm256_add_ps(vBs, vBb); + + vR = _mm256_add_ps(_mm256_mul_ps(vR, v096), v003); + vG = _mm256_add_ps(_mm256_mul_ps(vG, v096), v003); + vB = _mm256_add_ps(_mm256_mul_ps(vB, v096), v003); + + if (darkenWorldAmount > 0) { + vR = _mm256_add_ps(_mm256_mul_ps(vR, vOneMinusAmount), _mm256_mul_ps(_mm256_mul_ps(vR, _mm256_set1_ps(0.7f)), vAmount)); + vG = _mm256_add_ps(_mm256_mul_ps(vG, vOneMinusAmount), _mm256_mul_ps(_mm256_mul_ps(vG, _mm256_set1_ps(0.6f)), vAmount)); + vB = _mm256_add_ps(_mm256_mul_ps(vB, vOneMinusAmount), _mm256_mul_ps(_mm256_mul_ps(vB, _mm256_set1_ps(0.6f)), vAmount)); + } + + if (isEnd) { + vR = _mm256_add_ps(_mm256_set1_ps(0.22f), _mm256_mul_ps(vRb, _mm256_set1_ps(0.75f))); + vG = _mm256_add_ps(_mm256_set1_ps(0.28f), _mm256_mul_ps(vGb, _mm256_set1_ps(0.75f))); + vB = _mm256_add_ps(_mm256_set1_ps(0.25f), _mm256_mul_ps(vBb, _mm256_set1_ps(0.75f))); + } + + if (hasNV) { + __m256 vInvR = _mm256_rcp_ps(vR); + __m256 vInvG = _mm256_rcp_ps(vG); + __m256 vInvB = _mm256_rcp_ps(vB); + __m256 vDist = _mm256_min_ps(vInvR, _mm256_min_ps(vInvG, vInvB)); + + vR = _mm256_add_ps(_mm256_mul_ps(vR, _mm256_sub_ps(vOne, vNVScale)), _mm256_mul_ps(_mm256_mul_ps(vR, vDist), vNVScale)); + vG = _mm256_add_ps(_mm256_mul_ps(vG, _mm256_sub_ps(vOne, vNVScale)), _mm256_mul_ps(_mm256_mul_ps(vG, vDist), vNVScale)); + vB = _mm256_add_ps(_mm256_mul_ps(vB, _mm256_sub_ps(vOne, vNVScale)), _mm256_mul_ps(_mm256_mul_ps(vB, vDist), vNVScale)); + } + + vR = _mm256_max_ps(vZero, _mm256_min_ps(vOne, vR)); + vG = _mm256_max_ps(vZero, _mm256_min_ps(vOne, vG)); + vB = _mm256_max_ps(vZero, _mm256_min_ps(vOne, vB)); + + vR = _mm256_mul_ps(vR, v255); + vG = _mm256_mul_ps(vG, v255); + vB = _mm256_mul_ps(vB, v255); + + alignas(32) float finalR[8], finalG[8], finalB[8]; + _mm256_store_ps(finalR, vR); + _mm256_store_ps(finalG, vG); + _mm256_store_ps(finalB, vB); + + for(int k=0; k<8; ++k) + { + int r = (int)finalR[k]; + int g = (int)finalG[k]; + int b = (int)finalB[k]; + constexpr int alpha = 255; + + lightPixels[j][i + k] = alpha << 24 | b << 16 | g << 8 | r; + } + } +#else for (int i = 0; i < 256; i++) { const float darken = skyDarken1 * 0.95f + 0.05f; @@ -934,7 +1041,7 @@ void GameRenderer::updateLightTexture(float a) const int g = static_cast(_g * 255); const int b = static_cast(_b * 255); -#if ( defined _DURANGO || defined _WIN64 || __PSVITA__ ) +#if ( defined _DURANGO || defined _WINDOWS64 || __PSVITA__ ) lightPixels[j][i] = alpha << 24 | b << 16 | g << 8 | r; #elif ( defined _XBOX || defined __ORBIS__ ) lightPixels[j][i] = alpha << 24 | r << 16 | g << 8 | b; @@ -942,6 +1049,7 @@ void GameRenderer::updateLightTexture(float a) lightPixels[j][i] = r << 24 | g << 16 | b << 8 | alpha; #endif } +#endif mc->textures->replaceTextureDirect(lightPixels[j], 16, 16, getLightTexture(j, level)); } diff --git a/Minecraft.Client/LevelRenderer.cpp b/Minecraft.Client/LevelRenderer.cpp index 0d814d30..d618c5ac 100644 --- a/Minecraft.Client/LevelRenderer.cpp +++ b/Minecraft.Client/LevelRenderer.cpp @@ -1965,7 +1965,121 @@ bool LevelRenderer::updateDirtyChunks() int numClipChunks = static_cast(chunks[p].length); ClipChunk *pClipChunk = chunks[p].data; + +#ifdef __AVX__ + int i = 0; + __m256 vPx = _mm256_set1_ps((float)px); + __m256 vPy = _mm256_set1_ps((float)py); + __m256 vPz = _mm256_set1_ps((float)pz); + + for (; i <= numClipChunks - 8; i += 8) + { + bool anyDirty = false; + for(int j=0; j<8; ++j) { + int gIdx = pClipChunk[i+j].globalIdx; + if (gIdx >= 0 && (globalChunkFlags[gIdx] & CHUNK_FLAG_DIRTY)) { + anyDirty = true; break; + } + } + + if (!anyDirty) continue; + + alignas(32) float cx[8], cy[8], cz[8]; + for (int j = 0; j < 8; ++j) + { + cx[j] = (float)pClipChunk[i+j].xm; + cy[j] = (float)pClipChunk[i+j].ym; + cz[j] = (float)pClipChunk[i+j].zm; + } + + __m256 vCx = _mm256_load_ps(cx); + __m256 vCy = _mm256_load_ps(cy); + __m256 vCz = _mm256_load_ps(cz); + + __m256 vXd = _mm256_sub_ps(vCx, vPx); + __m256 vYd = _mm256_sub_ps(vCy, vPy); + __m256 vZd = _mm256_sub_ps(vCz, vPz); + + __m256 vDistSq = _mm256_add_ps(_mm256_mul_ps(vXd, vXd), + _mm256_add_ps(_mm256_mul_ps(vYd, vYd), + _mm256_mul_ps(vZd, vZd))); + + __m256 vYd2_3 = _mm256_mul_ps(_mm256_set1_ps(3.0f), _mm256_mul_ps(vYd, vYd)); + __m256 vDistSqWeighted = _mm256_add_ps(vDistSq, vYd2_3); + + alignas(32) float distOut[8], distWOut[8]; + _mm256_store_ps(distOut, vDistSq); + _mm256_store_ps(distWOut, vDistSqWeighted); + + for (int j = 0; j < 8; ++j) + { + ClipChunk* currentChunk = &pClipChunk[i+j]; + const int gIdx = currentChunk->globalIdx; + + if (gIdx < 0 || !(globalChunkFlags[gIdx] & CHUNK_FLAG_DIRTY)) continue; + + Chunk *chunk = currentChunk->chunk; + if (chunk == nullptr) continue; + + const int ySlice = (currentChunk->ym - (CHUNK_SIZE / 2)) / CHUNK_SIZE; + LevelChunk *lc = level[p]->getChunkAt(chunk->x, chunk->z); + if (lc == nullptr || lc->isRenderChunkEmpty(ySlice * 16)) + { + chunk->clearDirty(); + globalChunkFlags[gIdx] |= CHUNK_FLAG_EMPTYBOTH; + continue; + } + + int dSq = (int)distOut[j]; + int dSqW = (int)distWOut[j]; + + if ( (!onlyRebuild) || (globalChunkFlags[gIdx] & CHUNK_FLAG_COMPILED) || (dSq < 96 * 96) ) + { +#ifdef _LARGE_WORLDS + bool isNearer = nearestClipChunks.empty(); + auto itNearest = nearestClipChunks.begin(); + for(; itNearest != nearestClipChunks.end(); ++itNearest) + { + isNearer = dSqW < itNearest->second; + if(isNearer) break; + } + isNearer = isNearer || (nearestClipChunks.size() < maxNearestChunks); +#else + bool isNearer = dSqW < minDistSq; +#endif + +#ifdef _CRITICAL_CHUNKS + if( (!veryNearCount && isNearer) || (dSq < 20 * 20 && (globalChunkFlags[gIdx] & CHUNK_FLAG_CRITICAL)) ) +#else + if( isNearer ) +#endif + { + nearChunk = currentChunk; + minDistSq = dSqW; +#ifdef _LARGE_WORLDS + nearestClipChunks.insert(itNearest, std::make_pair(nearChunk, minDistSq)); + if(nearestClipChunks.size() > maxNearestChunks) + nearestClipChunks.pop_back(); +#endif + } + +#ifdef _CRITICAL_CHUNKS + if( dSq < 20 * 20 && (globalChunkFlags[gIdx] & CHUNK_FLAG_CRITICAL) ) +#else + if( dSq < 20 * 20 ) +#endif + { + veryNearCount++; + } + } + } + } + pClipChunk = &chunks[p].data[i]; + for( ; i < numClipChunks; i++, pClipChunk++ ) +#else + // OG for( int i = 0; i < numClipChunks; i++, pClipChunk++ ) +#endif { // Fast reject: skip non-dirty chunks immediately before any distance work. // globalIdx can be -1 for unassigned chunks.