whoimi

A geek blog

View on GitHub

Unity光源配置:Tiled And Cluster

unity最新的HDRP中采用了Cluster和Tiled的光源配置,这里主要记录光源是如何从Unity当中的Light配置设置到最终Shader的对应Pass当中的。

Cluster是把视锥体看做三维体素,每个体素当中计算影响这个体素的光线,就和Volumetric Fog一样的原理。

Tiled是把屏幕分成格子,对每个格子计算影响这个格子的光源。

Tiled光照列表的计算总共分为三步:

  1. 计算光照列表的所有凸包
  2. 使用凸包,计算bigtile的光照列表的大致计算。
  3. 使用fplt进行更精细的光照列表计算。

##Tile光照列表计算

下面是光源计算,为每一个Tile生成光照列表:

public void BuildGPULightListsCommon(HDCamera hdCamera, CommandBuffer cmd, RenderTargetIdentifier cameraDepthBufferRT, RenderTargetIdentifier stencilTextureRT, bool skyEnabled)
{
  var camera = hdCamera.camera;
  // 计算光照列表开始
  cmd.BeginSample("Build Light List");

  var w = (int)hdCamera.screenSize.x;
  var h = (int)hdCamera.screenSize.y;
  s_TempScreenDimArray[0] = w;
  s_TempScreenDimArray[1] = h;
  //首先设置Tile的大小  64,64
  var numBigTilesX = (w + 63) / 64;
  var numBigTilesY = (h + 63) / 64;

  
  // 计算各种矩阵:
  m_LightListProjMatrices[0] = 
  m_LightListProjscrMatrices[0] = 
  m_LightListInvProjscrMatrices[0] =
  m_LightListProjHMatrices[0] = ;
  m_LightListInvProjHMatrices[0] = 
     
  // 如果有光源 ,计算所有光源的AABB包围盒 :见shader:lightlistbuild-bigtile.compute
   if (m_lightCount != 0)
   {
	//第一步计算AABB
     var genAABBKernel = isProjectionOblique ? s_GenAABBKernel_Oblique : s_GenAABBKernel;

     // 是否正交
     cmd.SetComputeIntParam(buildScreenAABBShader, HDShaderIDs.g_isOrthographic, isOrthographic ? 1 : 0);
	 // 光源数量
     cmd.SetComputeIntParam(buildScreenAABBShader, HDShaderIDs.g_iNrVisibLights, m_lightCount);
     // 光源凸包数据
     cmd.SetComputeBufferParam(buildScreenAABBShader, genAABBKernel, HDShaderIDs.g_data, s_ConvexBoundsBuffer);
	// 矩阵数组
     cmd.SetComputeMatrixArrayParam(buildScreenAABBShader, HDShaderIDs.g_mProjectionArr, m_LightListProjHMatrices);
     cmd.SetComputeMatrixArrayParam(buildScreenAABBShader, HDShaderIDs.g_mInvProjectionArr, m_LightListInvProjHMatrices);

     // AABB结果
     cmd.SetComputeBufferParam(buildScreenAABBShader, genAABBKernel, HDShaderIDs.g_vBoundsBuffer, s_AABBBoundsBuffer);

     int tgY = (int)hdCamera.numEyes;
     
     // 提交计算
     cmd.DispatchCompute(buildScreenAABBShader, genAABBKernel, (m_lightCount + 7) / 8, tgY, 1);
   }
  
  
  // 粗糙的Tile计算。
  // enable coarse 2D pass on 64x64 tiles (used for both fptl and clustered).
  if (m_FrameSettings.lightLoopSettings.enableBigTilePrepass)
  {
    cmd.SetComputeIntParam(buildPerBigTileLightListShader, HDShaderIDs.g_iNrVisibLights, m_lightCount);
    cmd.SetComputeIntParam(buildPerBigTileLightListShader, HDShaderIDs.g_isOrthographic, isOrthographic ? 1 : 0);
    cmd.SetComputeIntParams(buildPerBigTileLightListShader, HDShaderIDs.g_viDimensions, s_TempScreenDimArray);

    // TODO: These two aren't actually used...
    cmd.SetComputeIntParam(buildPerBigTileLightListShader, HDShaderIDs._EnvLightIndexShift, m_lightList.lights.Count);
    cmd.SetComputeIntParam(buildPerBigTileLightListShader, HDShaderIDs._DecalIndexShift, m_lightList.lights.Count + m_lightList.envLights.Count);

    cmd.SetComputeMatrixArrayParam(buildPerBigTileLightListShader, HDShaderIDs.g_mScrProjectionArr, m_LightListProjscrMatrices);
    cmd.SetComputeMatrixArrayParam(buildPerBigTileLightListShader, HDShaderIDs.g_mInvScrProjectionArr, m_LightListInvProjscrMatrices);

    cmd.SetComputeFloatParam(buildPerBigTileLightListShader, HDShaderIDs.g_fNearPlane, camera.nearClipPlane);
    cmd.SetComputeFloatParam(buildPerBigTileLightListShader, HDShaderIDs.g_fFarPlane, camera.farClipPlane);
    cmd.SetComputeBufferParam(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, HDShaderIDs.g_vLightList, s_BigTileLightList);
    cmd.SetComputeBufferParam(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, HDShaderIDs.g_vBoundsBuffer, s_AABBBoundsBuffer);
    cmd.SetComputeBufferParam(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, HDShaderIDs._LightVolumeData, s_LightVolumeDataBuffer);
    cmd.SetComputeBufferParam(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, HDShaderIDs.g_data, s_ConvexBoundsBuffer);

    int tgZ = (int)hdCamera.numEyes;
    cmd.DispatchCompute(buildPerBigTileLightListShader, s_GenListPerBigTileKernel, numBigTilesX, numBigTilesY, tgZ);
  }
  
  
  // fplt计算
  // optimized for opaques only
    if (m_FrameSettings.lightLoopSettings.isFptlEnabled)
    {
      var genListPerTileKernel = isProjectionOblique ? s_GenListPerTileKernel_Oblique : s_GenListPerTileKernel;

      cmd.SetComputeIntParam(buildPerTileLightListShader, HDShaderIDs.g_isOrthographic, isOrthographic ? 1 : 0);
      cmd.SetComputeIntParams(buildPerTileLightListShader, HDShaderIDs.g_viDimensions, s_TempScreenDimArray);
      cmd.SetComputeIntParam(buildPerTileLightListShader, HDShaderIDs._EnvLightIndexShift, m_lightList.lights.Count);
      cmd.SetComputeIntParam(buildPerTileLightListShader, HDShaderIDs._DecalIndexShift, m_lightList.lights.Count + m_lightList.envLights.Count);
      cmd.SetComputeIntParam(buildPerTileLightListShader, HDShaderIDs.g_iNrVisibLights, m_lightCount);

      cmd.SetComputeBufferParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs.g_vBoundsBuffer, s_AABBBoundsBuffer);
      cmd.SetComputeBufferParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs._LightVolumeData, s_LightVolumeDataBuffer);
      cmd.SetComputeBufferParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs.g_data, s_ConvexBoundsBuffer);

      cmd.SetComputeMatrixParam(buildPerTileLightListShader, HDShaderIDs.g_mScrProjection, m_LightListProjscrMatrices[0]);
      cmd.SetComputeMatrixParam(buildPerTileLightListShader, HDShaderIDs.g_mInvScrProjection, m_LightListInvProjscrMatrices[0]);

      cmd.SetComputeTextureParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs.g_depth_tex, cameraDepthBufferRT);
      cmd.SetComputeBufferParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs.g_vLightList, s_LightList);
      if (m_FrameSettings.lightLoopSettings.enableBigTilePrepass)
        cmd.SetComputeBufferParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs.g_vBigTileLightList, s_BigTileLightList);

      if (enableFeatureVariants)
      {
        uint baseFeatureFlags = 0;
        if (m_lightList.directionalLights.Count > 0)
        {
          baseFeatureFlags |= (uint)LightFeatureFlags.Directional;
        }
        if (skyEnabled)
        {
          baseFeatureFlags |= (uint)LightFeatureFlags.Sky;
        }
        if (!m_FrameSettings.lightLoopSettings.enableComputeMaterialVariants)
        {
          baseFeatureFlags |= LightDefinitions.s_MaterialFeatureMaskFlags;
        }
        cmd.SetComputeIntParam(buildPerTileLightListShader, HDShaderIDs.g_BaseFeatureFlags, (int)baseFeatureFlags);
        cmd.SetComputeBufferParam(buildPerTileLightListShader, genListPerTileKernel, HDShaderIDs.g_TileFeatureFlags, s_TileFeatureFlags);
      }

      cmd.DispatchCompute(buildPerTileLightListShader, genListPerTileKernel, numTilesX, numTilesY, 1);
    }
  
  // Cluster 计算
  VoxelLightListGeneration(cmd, hdCamera, m_LightListProjscrMatrices, m_LightListInvProjscrMatrices, cameraDepthBufferRT);

  // 计算光照列表结束
  cmd.EndSample("Build Light List");
}

光源包围盒计算

计算屏幕空间光源包围盒的shader:scrbound.compute

原始代码中记录的包围盒的类型,中心和几个扩展方向

    public struct SFiniteLightBound
    {
        public Vector3 boxAxisX;
        public Vector3 boxAxisY;
        public Vector3 boxAxisZ;
        public Vector3 center;        // a center in camera space inside the bounding volume of the light source.
        public Vector2 scaleXY;
        public float radius;
    };

每个光源都有一个上面的结构体,从一个统一的结构体转换到屏幕对齐的AABB包围盒。

###BigTile光照列表计算

计算BigTile光照列表的compute shader:

使用了三种方法:

  1. 包围盒相交。
  2. 球形相交。
  3. CullByExactEdgeTests:这个还没有看。
// 根据NR_THREADS作为不同线程的采样间隔,计算互斥的光源信息
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
  /*
  这里的计算是每个Tile一次提交,每个提交,多个Thread并行。,每个Thread处理不想交的一组灯光。
  */
  	// 这个是Tile的ID
    uint eyeIndex = u3GroupID.z;
    uint2 tileIDX = u3GroupID.xy;
  	//  这个是每个Tile当中的threadID
    uint t=threadID;
  
	// 计算每个Tile的实际大小。
    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    uint nrBigTilesX = (iWidth+63)/64;
    uint nrBigTilesY = (iHeight+63)/64;
	
  // 线程共享,只让第一个线程进行初始化操作。
    if(t==0) lightOffs = 0;

#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
    GroupMemoryBarrierWithGroupSync();
#endif

    // Raw pixel coordinates of tile
    uint2 viTilLL = 64*tileIDX;
    uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) );            // not width and height minus 1 since viTilUR represents the end of the tile corner.

    // 每个tile的包围盒
    float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight);
    float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight);

    // build coarse list using AABB
    for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
    {
      	// 获取灯光包围盒信息
        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
      	// 灯光的包围盒信息
        const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
        const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;
		
  	    // 灯光的包围盒和tile的包围盒是否相交
        if( all(vMa>vTileLL) && all(vMi<vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
          	// 原子求和操作,让lightOffs加1.
            InterlockedAdd(lightOffs, uInc, uIndex);
			// 通过加到当前Tile的灯光列表,uIndex是组间不同的。
            if(uIndex<MAX_NR_BIGTILE_LIGHTS) lightsListLDS[uIndex] = l;    
        }
    }

#if /*!defined(SHADER_API_XBOXONE) && */!defined(SHADER_API_PSSL)
    GroupMemoryBarrierWithGroupSync();
#endif

  	// 计算实际的光照数。
    int iNrCoarseLights = min(lightOffs,MAX_NR_BIGTILE_LIGHTS);

  // 球形相交计算,如果球形没有相交, 则把lightlist中记录原来相交的id变成uint_max
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))), eyeIndex );
#endif

#ifdef EXACT_EDGE_TESTS
    CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, eyeIndex);
#endif

    // 灯光按类型排序
    SORTLIST(lightsListLDS, iNrCoarseLights, MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE, t, NR_THREADS);

    if(t==0) lightOffs = 0;
    GroupMemoryBarrierWithGroupSync();
  // 重新计算光照数量
    int i;
    for(i=t; i<iNrCoarseLights; i+=NR_THREADS) 
      if(lightsListLDS[i]<(uint)g_iNrVisibLights) 
        InterlockedAdd(lightOffs, 1);
    GroupMemoryBarrierWithGroupSync();
    iNrCoarseLights = lightOffs;
// 计算当前tile的索引
    int offs = tileIDX.y*nrBigTilesX + tileIDX.x + (eyeIndex * nrBigTilesX * nrBigTilesY);
	//最终的光照列表,放到记录光照的列表当中,所有的tile的光照列表存在一个大的tile当中。
    for(i=t; i<(iNrCoarseLights+1); i+=NR_THREADS)
        g_vLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*offs + i] = i==0 ? iNrCoarseLights : lightsListLDS[max(i-1, 0)];
}

FPTL

精准的判断每个tile当中的光照列表,这里使用了更加精细的tile划分(16*16),相对于bigtile还会对Z值进行比较。原本的Bigtile 只是用了屏幕方向的xy。

同时加入了光源类型相关的操作。

shader使用:lightlistbuild.compute

compute Shader使用的kernel确定:

if (GetFeatureVariantsEnabled())
{
  s_GenListPerTileKernel = buildPerTileLightListShader.FindKernel(m_FrameSettings.lightLoopSettings.enableBigTilePrepass ? "TileLightListGen_SrcBigTile_FeatureFlags" : "TileLightListGen_FeatureFlags");
  s_GenListPerTileKernel_Oblique = buildPerTileLightListShader.FindKernel(m_FrameSettings.lightLoopSettings.enableBigTilePrepass ? "TileLightListGen_SrcBigTile_FeatureFlags_Oblique" : "TileLightListGen_FeatureFlags_Oblique");

}
else
{
  s_GenListPerTileKernel = buildPerTileLightListShader.FindKernel(m_FrameSettings.lightLoopSettings.enableBigTilePrepass ? "TileLightListGen_SrcBigTile" : "TileLightListGen");
  s_GenListPerTileKernel_Oblique = buildPerTileLightListShader.FindKernel(m_FrameSettings.lightLoopSettings.enableBigTilePrepass ? "TileLightListGen_SrcBigTile_Oblique" : "TileLightListGen_Oblique");
}

FPTLshader

[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    if(t<MAX_NR_COARSE_ENTRIES)
        prunedList[t]=0;

    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    uint nrTilesX = (iWidth+15)/16;
    uint nrTilesY = (iHeight+15)/16;
    uint nrTiles = nrTilesX * nrTilesY; // Precompute?

    // build tile scr boundary
    const uint uFltMax = 0x7f7fffff;  // FLT_MAX as a uint
    if(t==0)
    {
        ldsZMin = uFltMax;
        ldsZMax = 0;
        lightOffs = 0;
    }

#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
    GroupMemoryBarrierWithGroupSync();
#endif
    uint2 viTilLL = 16*tileIDX;

    // establish min and max depth first
    float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
	// 计算包含深度的包围盒。
    float4 vLinDepths;
    {
        // Fetch depths and calculate min/max
        UNITY_UNROLL
        for(int i = 0; i < 4; i++)
        {
            int idx = i * NR_THREADS + t;
            uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
            const float fDepth = FetchDepth(g_depth_tex, uCrd);
            vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
            if(fDepth<VIEWPORT_SCALE_Z)     // if not skydome
            {
                dpt_mi = min(fDepth, dpt_mi);
                dpt_ma = max(fDepth, dpt_ma);
            }
        }

        InterlockedMax(ldsZMax, asuint(dpt_ma));
        InterlockedMin(ldsZMin, asuint(dpt_mi));

#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
        GroupMemoryBarrierWithGroupSync();
#endif
    }


    float3 vTileLL = float3(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight, asfloat(ldsZMin));
    float3 vTileUR = float3((viTilLL.x+16)/(float) iWidth, (viTilLL.y+16)/(float) iHeight, asfloat(ldsZMax));
    vTileUR.xy = min(vTileUR.xy,float2(1.0,1.0)).xy;

	// 使用aabb计算粗糙的光照列表。
    // build coarse list using AABB
#ifdef USE_TWO_PASS_TILED_LIGHTING
    const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);

    int NrBigTilesX = (nrTilesX+((1<<log2BigTileToTileRatio)-1))>>log2BigTileToTileRatio;
    const int bigTileIdx = (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio);       // map the idx to 64x64 tiles
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
    for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
#else
    for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
    {
#endif
        // Skip density volumes (lights are sorted by category). TODO: improve data locality
        if (_LightVolumeData[l].lightCategory == LIGHTCATEGORY_DENSITY_VOLUME) { break; }

        const float3 vMi = g_vBoundsBuffer[l].xyz;
        const float3 vMa = g_vBoundsBuffer[l+g_iNrVisibLights].xyz;
		// 包围盒在xyz方向上同时判断相交。
        if( all(vMa>vTileLL) && all(vMi<vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
            InterlockedAdd(lightOffs, uInc, uIndex);
            if(uIndex<MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l;        // add to light list
        }
    }
      // 粗糙的光照列表计算结束

      // FPTL进行精细调整。
#ifdef FINE_PRUNING_ENABLED
    if(t<2) ldsDoesLightIntersect[t] = 0;
#endif

#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
    GroupMemoryBarrierWithGroupSync();
#endif
	
    int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);

      // 球形相交和 bigtile类似
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    iNrCoarseLights = SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(16/2,16/2), uint2(iWidth-1, iHeight-1))) );
#endif

#ifndef FINE_PRUNING_ENABLED
    {
      // 修改保存位置
        if((int)t<iNrCoarseLights) prunedList[t] = coarseList[t];
        if(t==0) ldsNrLightsFinal=iNrCoarseLights;
    }
#else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
#endif
	// 光照类型相关,t是线程号。每个线程对应一个类型。
    if(t<CATEGORY_LIST_SIZE) ldsCategoryListCount[t]=0;
#ifdef USE_FEATURE_FLAGS
    if(t==0) ldsFeatureFlags=0;
#endif

#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
    GroupMemoryBarrierWithGroupSync();
#endif


     // 统计每种类型光源的数量
    int nrLightsCombinedList = min(ldsNrLightsFinal,MAX_NR_COARSE_ENTRIES);
    for(int i=t; i<nrLightsCombinedList; i+=NR_THREADS)
    {
        InterlockedAdd(ldsCategoryListCount[_LightVolumeData[prunedList[i]].lightCategory], 1);
#ifdef USE_FEATURE_FLAGS
        InterlockedOr(ldsFeatureFlags, _LightVolumeData[prunedList[i]].featureFlags);
#endif
    }

      // 光照排序,减少在计算光照过程中分支的情况
    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if !defined(SHADER_API_XBOXONE) && !defined(SHADER_API_PSSL)
    SORTLIST(prunedList, nrLightsCombinedList, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
    //MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif

#ifdef USE_FEATURE_FLAGS
    if(t == 0)
    {
        uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
        // In case of back
        if(ldsZMax < ldsZMin)   // is background pixel
        {
            // There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. It will still execute but will do nothing
            featureFlags = 0;
        }

        g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x] = featureFlags;
    }
#endif

    // write lights to global buffers
    int localOffs=0;
    int offs = tileIDX.y*nrTilesX + tileIDX.x;

    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    int shiftIndex[CATEGORY_LIST_SIZE];
    ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
    shiftIndex[CATEGORY_LIST_SIZE - 2] = _EnvLightIndexShift;
    shiftIndex[CATEGORY_LIST_SIZE - 1] = _DecalIndexShift;
	
      //将光源按照类型顺序写入到全局结果中。
    for(int category=0; category<CATEGORY_LIST_SIZE; category++)
    {
        int nrLightsFinal = ldsCategoryListCount[category];
        int nrLightsFinalClamped = nrLightsFinal<MAX_NR_PRUNED_ENTRIES ? nrLightsFinal : MAX_NR_PRUNED_ENTRIES;

        const int nrDWords = ((nrLightsFinalClamped+1)+1)>>1;
        for(int l=(int) t; l<(int) nrDWords; l += NR_THREADS)
        {
            // We remap the prunedList index to the original LightData / EnvLightData indices
            uint uLow = l==0 ? nrLightsFinalClamped : prunedList[max(0,2 * l - 1 + localOffs)] - shiftIndex[category];
            uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];

            g_vLightList[16*offs + l] = (uLow&0xffff) | (uHigh<<16);
        }

        localOffs += nrLightsFinal;
        offs += (nrTilesX*nrTilesY);
    }
}