/*---------------------------------------------------------------------------*

  Copyright 2010-2014 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

#include <stdio.h>
#include <string.h>
#include <math.h>

#if defined(WIN32) || defined(WIN64)
#include <pc/gx2.h>
#include <pc/demo.h>
#endif
#include <cafe/os.h>
#include <cafe/mem.h>
#include <cafe/gx2.h>
#include <cafe/demo.h>

#include <cafe/gfd.h>
#include <cafe/gx2ut.h>

//Include generated shaders
#include "shaders/headers/gx2utClearSurfaceRect.h"

/*
 * Shader tables, indexed by shader variant. VS_SHADERS[i] and PS_SHADERS[i]
 * form a pair; both arrays must contain exactly NUM_SHADERS entries.
 *
 * 0 - Simple clear shader
 */
// Vertex shaders, taken from the generated gx2utClearSurfaceRect.h header.
static const GX2VertexShader* const VS_SHADERS[] = { &gx2utClearSurfaceRect_VS};

// Pixel shaders, taken from the same generated header.
static const GX2PixelShader* const PS_SHADERS[] = { &gx2utClearSurfaceRect_PS};

// Number of entries in each of the two tables above; also used as the size of
// g_clearSurfaceShader.
static const u32 NUM_SHADERS = 1;

// Per-variant shader state for the clear path: the vertex/pixel shader pair
// plus the uniform locations looked up from them at init time.
typedef struct _ClearSurfaceShader {
    // The vertex and pixel shaders, both loaded from the generated header.
    // The fetch shader is not stored here: it is generated at run-time and
    // kept in a single file-level object shared by all variants.
    const GX2VertexShader *pVertexShader;
    const GX2PixelShader *pPixelShader;

    // Uniform locations: "u_positions" in the vertex shader and
    // "u_clearColor" in the pixel shader (filled in by
    // GX2UTClearSurfaceRectInit).
    u32 u_positionLocation;
    u32 u_clearColorLocation;
} ClearSurfaceShader;

//For now, share one fetch shader buffer for all shaders, since it should be identical
// The size is hard-coded; GX2UTClearSurfaceRectInit asserts that
// GX2CalcFetchShaderSize(0) fits within it.
#define FETCH_SHADER_SIZE 32  //hard code this value for now
// Backing storage for the fetch shader program, aligned to GX2_SHADER_ALIGNMENT.
ALIGNVAR(GX2_SHADER_ALIGNMENT) static u8 g_GX2UTFetchShader[FETCH_SHADER_SIZE];


// One entry per shader variant, populated by GX2UTClearSurfaceRectInit.
static ClearSurfaceShader g_clearSurfaceShader[NUM_SHADERS];
// Fetch shader object initialized over g_GX2UTFetchShader with zero attributes.
static GX2FetchShader fetchShader;
// A single 2D vertex position stored as two 32-bit floats.
typedef struct _VtxFmtF32x2 {
    f32 position[2];  // position[0] and position[1] are used as x and y by the clear code
} VtxFmtF32x2;

// Corners of the unit square. The clear functions scale and offset these to
// produce the four corner positions uploaded as vertex uniforms.
// This data works for both RECT and TRISTRIP
static const VtxFmtF32x2 CLEAR_SURFACE_RECT_POSITION_DATA[] =
{
    {0.0f,  0.0f},
    {1.0f,  0.0f},
    {0.0f,  1.0f},
    {1.0f,  1.0f}
};

// Number of corners in the table above (4); passed to GX2Draw as the vertex count.
static const u32 VERTEX_COUNT = sizeof(CLEAR_SURFACE_RECT_POSITION_DATA)
                         / sizeof(CLEAR_SURFACE_RECT_POSITION_DATA[0]);

// Initializes how surfaces will be copied
void GX2UTClearSurfaceRectInit(void)
{
    static GX2Boolean initDone = GX2_FALSE;

    if (initDone == GX2_TRUE)
    {
        //OSReport("Skipping init in GX2UTClearSurfaceRectInit\n");
        return;
    }

    // Setup shaders
    u32 i;

    GX2NotifyMemAlloc(g_GX2UTFetchShader, 
                      FETCH_SHADER_SIZE, 
                      GX2_SHADER_ALIGNMENT);

    for (i = 0; i < NUM_SHADERS; ++i)
    {
        g_clearSurfaceShader[i].pVertexShader = VS_SHADERS[i];
        g_clearSurfaceShader[i].pPixelShader = PS_SHADERS[i];

        GX2Invalidate(GX2_INVALIDATE_CPU_SHADER,
            g_clearSurfaceShader[i].pVertexShader->shaderPtr,
            g_clearSurfaceShader[i].pVertexShader->shaderSize);

        GX2NotifyMemAlloc(g_clearSurfaceShader[i].pVertexShader->shaderPtr,
            g_clearSurfaceShader[i].pVertexShader->shaderSize,
            GX2_SHADER_ALIGNMENT);

        GX2Invalidate(GX2_INVALIDATE_CPU_SHADER,
            g_clearSurfaceShader[i].pPixelShader->shaderPtr,
            g_clearSurfaceShader[i].pPixelShader->shaderSize);

        GX2NotifyMemAlloc(g_clearSurfaceShader[i].pPixelShader->shaderPtr,
            g_clearSurfaceShader[i].pPixelShader->shaderSize,
            GX2_SHADER_ALIGNMENT);

        // Lookup the uniform locations in the vertex shader and pixel shader.
        // The shader author chose the names "u_positions", "u_depth", and "u_clearColor"
        g_clearSurfaceShader[i].u_positionLocation =
            (u32)GX2GetVertexUniformVarOffset(g_clearSurfaceShader[i].pVertexShader, "u_positions");
        g_clearSurfaceShader[i].u_clearColorLocation =
            (u32)GX2GetPixelUniformVarOffset(g_clearSurfaceShader[i].pPixelShader, "u_clearColor");
        ASSERT((g_clearSurfaceShader[i].u_positionLocation != GX2_UNIFORM_VAR_INVALID_OFFSET)
            && (g_clearSurfaceShader[i].u_clearColorLocation != GX2_UNIFORM_VAR_INVALID_OFFSET)
            && "Couldn't find the correct vertex and pixel shader uniforms.");

    }

    ASSERT(GX2CalcFetchShaderSize(0) <= sizeof(g_GX2UTFetchShader) && "g_GX2UTFetchShader too small!\n");
    GX2InitFetchShader(&fetchShader, g_GX2UTFetchShader, 0, NULL);
    
    initDone = GX2_TRUE;
}

/// Returns the bits of a floating point value as an unsigned integer.
///
/// The bytes are copied with memcpy rather than read back through a union:
/// this file is compiled as C++, where reading a union member other than the
/// one most recently written is undefined behavior.
static u32 FloatToBits(f32 f)
{
    u32 bits;
    memcpy(&bits, &f, sizeof(bits));
    return bits;
}

/// Returns the floating point value whose bit pattern is the given unsigned
/// integer (the inverse of FloatToBits).
///
/// The bytes are copied with memcpy rather than read back through a union:
/// this file is compiled as C++, where reading a union member other than the
/// one most recently written is undefined behavior.
static f32 BitsToFloat(u32 u)
{
    f32 value;
    memcpy(&value, &u, sizeof(value));
    return value;
}

/*******************************************************************************
*   ConvertFP32ToSmallFP
*
*   @brief
*      Converts 32 bit floating point value to one with the supplied 
*      representation.  Code assume the dest format follows representation 
*      similar to the fp32 IEEE format.
*
*      Mantissa bits that do not fit in the destination are dropped by a right
*      shift (truncation, no rounding).  Values too large for the destination
*      saturate to its largest finite value; values too small become
*      denormals or zero.  Negative inputs become 0 when the destination has
*      no sign bit.  NaNs stay NaNs and Infs stay Infs, except that a negative
*      NaN/Inf becomes 0 when the destination has no sign bit.
*
*   @return
*      Converted value as an unsigned integer.
*******************************************************************************/
u32 ConvertFP32ToSmallFP(
     f32 fp32,      ///< Original fp32 value
     BOOL signBit,  ///< Sign bit in destination format
     u32 expBits,   ///< Exponent bits in destination format
     u32 mantBits)  ///< Mantissa bits in destination format
{
    u32 uiFp32, fp32Sign, fp32Exp, fp32Mant, fp32MantBits, fp32Bias;
    u32 maxExp, bias;
    u32 outFp;

    // Extract relevant values from input value
    uiFp32 = FloatToBits(fp32);
    fp32Sign = (uiFp32 & 0x80000000) >> 31;
    fp32Exp = (uiFp32 & 0x7F800000) >> 23;
    fp32Mant = uiFp32 & 0x007FFFFF;
    fp32MantBits = 23;
    fp32Bias = 127;

    // Compute exponent bias for destination format.  This is also the max positive (and negative)
    // unbiased exponents in the format.
    bias = (1 << (expBits - 1)) - 1;

    // Compute max exponent reserved for NaN and Infs
    maxExp = (1 << expBits) - 1;

    if (fp32Exp == 0xff)
    {
        // Handle NaNs and Infs first.  According to the DX10 spec these get converted to NaNs
        // and Infs in the lower precision format when available, otherwise they go to 0.  We
        // assume the destination format has representations for NaNs and Infs except for when
        // there is no sign bit to represent signed NaN and Inf.
        if ((signBit == TRUE) || (fp32Sign == 0))
        {
            u32 outMant = fp32Mant >> (fp32MantBits - mantBits);

            // A NaN whose payload lives entirely in the discarded low mantissa bits would
            // truncate to a zero mantissa, which encodes Inf.  Force a non-zero mantissa so
            // the result is still a NaN.  (With an all-ones exponent already in place,
            // setting bit 0 cannot disturb the exponent field even if mantBits is 0.)
            if ((fp32Mant != 0) && (outMant == 0))
            {
                outMant = 1;
            }

            outFp = (fp32Sign << (expBits + mantBits)) | (maxExp << mantBits) | outMant;
        }
        else
        {
            outFp = 0;
        }
    }
    else if ((signBit == FALSE) && (fp32Sign == 1))
    {
        // Negative numbers go to zero if they can't be represented
        outFp = 0;
    }
    else if (fp32Exp > (fp32Bias + bias))
    {
        // Too large to be represented in the destination format are made into signed MAX_FLOAT.
        outFp = (fp32Sign << (expBits + mantBits)) | ((maxExp - 1) << mantBits) |
            ((1 << mantBits) - 1);
    }
    else if (fp32Exp < (fp32Bias - (bias - 1)))
    {
        // Too small to be represented as a normalized number or it's zero
        u32 shift;

        // Shift amount is the difference between the fp32 exponent and the minimum
        // exponent in the dest format.
        shift = fp32Bias - (bias - 1) - fp32Exp;

        // Large enough shifts will generate 0
        if (shift > (fp32MantBits + 1))
        {
            fp32Mant = 0;
        }
        else
        {
            // Add in hidden bit and right shift to align to new format
            fp32Mant = (fp32Mant | 0x00800000) >> (fp32MantBits - mantBits);
            fp32Mant = fp32Mant >> shift;
        }

        outFp = (fp32Sign << (expBits + mantBits)) | fp32Mant;
    }
    else
    {
        // Can be represented as a normalized number in the new format
        outFp = (fp32Sign << (expBits + mantBits)) |
            ((fp32Exp + bias - fp32Bias) << mantBits) |
            (fp32Mant >> (fp32MantBits - mantBits));
    }

    // Sanity check: no bits may be set above the destination format's width
    ASSERT((outFp & ~((1 << (signBit + expBits + mantBits)) - 1)) == 0x0);

    return outFp;
}


// Converts an fp32 value to an unsigned normalized integer of numBits bits.
//
// NaN, -Inf and negative values map to 0; +Inf and values above 1.0 map to the
// largest representable value; everything else is scaled by that maximum and
// rounded by adding 0.5 before the integer conversion.
u32 ConvertFP32ToUnorm(
    f32 fp32,      ///< fp32 value to convert
    u32 numBits)   ///< number of bits in destination unorm
{
    // Split the input into its sign, exponent and mantissa fields
    const u32 rawBits    = FloatToBits(fp32);
    const u32 isNegative = (rawBits & 0x80000000) >> 31;
    const u32 exponent   = (rawBits & 0x7F800000) >> 23;
    const u32 mantissa   = rawBits & 0x007FFFFF;

    // Maximum representable unorm
    const u32 maxVal = (1 << numBits) - 1;
    u32 out;

    if (exponent == 0xff)
    {
        // All-ones exponent: only +Inf (zero mantissa, positive sign)
        // saturates; NaNs and -Inf collapse to 0.
        const bool isPositiveInf = (mantissa == 0x0) && (isNegative != 1);
        out = isPositiveInf ? maxVal : 0;
    }
    else if (fp32 > 1.0f)
    {
        out = maxVal;
    }
    else if (fp32 < 0.0f)
    {
        out = 0;
    }
    else
    {
        // In range [0, 1]: scale and round to nearest
        out = static_cast<u32>((fp32 * maxVal) + 0.5f);
    }

    ASSERT(out <= maxVal);

    return out;
}

/*******************************************************************************
*   PackClearColor
*
*   @brief
*       Pack the clear color for the given format into a 32 bit quantity.
*       Supported formats are R8_G8_B8_A8_UNORM, R10_G10_B10_A2_UNORM,
*       A2_B10_G10_R10_UNORM, R11_G11_B10_FLOAT and R16_G16_FLOAT; any other
*       format asserts and yields 0.
*
*   @return
*       Packed 32-bit clear value.
*******************************************************************************/
u32 PackClearColor(f32 r, f32 g, f32 b, f32 a, ///< Clear color
                   GX2SurfaceFormat format)    ///< Color format
{
    u32 packed = 0;

    switch (format)
    {
        case GX2_SURFACE_FORMAT_TCS_R8_G8_B8_A8_UNORM:
        {
            // Four 8-bit unorm channels, red in the low byte
            const u32 r8 = ConvertFP32ToUnorm(r, 8);
            const u32 g8 = ConvertFP32ToUnorm(g, 8);
            const u32 b8 = ConvertFP32ToUnorm(b, 8);
            const u32 a8 = ConvertFP32ToUnorm(a, 8);

            packed = (a8 << 24) | (b8 << 16) | (g8 << 8) | r8;
            break;
        }

        case GX2_SURFACE_FORMAT_TCS_R10_G10_B10_A2_UNORM:
        case GX2_SURFACE_FORMAT_TCS_A2_B10_G10_R10_UNORM:
        {
            // Three 10-bit channels plus 2-bit alpha; the two formats differ
            // only in which end of the word holds alpha.
            const u32 r10 = ConvertFP32ToUnorm(r, 10);
            const u32 g10 = ConvertFP32ToUnorm(g, 10);
            const u32 b10 = ConvertFP32ToUnorm(b, 10);
            const u32 a2  = ConvertFP32ToUnorm(a, 2);

            if (format == GX2_SURFACE_FORMAT_TCS_R10_G10_B10_A2_UNORM)
            {
                packed = (a2 << 30) | (b10 << 20) | (g10 << 10) | r10;
            }
            else
            {
                packed = (b10 << 22) | (g10 << 12) | (r10 << 2) | a2;
            }
            break;
        }

        case GX2_SURFACE_FORMAT_TC_R11_G11_B10_FLOAT:
        {
            // Unsigned small floats: 5-bit exponent, 6/6/5-bit mantissas
            const u32 rFP11 = ConvertFP32ToSmallFP(r, FALSE, 5, 6);
            const u32 gFP11 = ConvertFP32ToSmallFP(g, FALSE, 5, 6);
            const u32 bFP10 = ConvertFP32ToSmallFP(b, FALSE, 5, 5);

            packed = (bFP10 << 22) | (gFP11 << 11) | rFP11;
            break;
        }

        case GX2_SURFACE_FORMAT_TC_R16_G16_FLOAT:
        {
            // Two signed half floats, red in the low 16 bits
            const u32 rFP16 = ConvertFP32ToSmallFP(r, GX2_TRUE, 5, 10);
            const u32 gFP16 = ConvertFP32ToSmallFP(g, GX2_TRUE, 5, 10);

            packed = (gFP16 << 16) | (rFP16);
            break;
        }

        default:
            // Not implemented yet.
            ASSERT(FALSE);
            break;
    }

    return packed;
}

// Decides whether a color buffer clear can be performed by the depth unit.
//
// Returns GX2_TRUE only when all of the following hold:
//   - dstRect starts at (0,0) and ends at (width >> viewMip, height >> viewMip)
//   - the surface format is one of the 32-bit formats listed below
//   - the tile mode is one of the THIN tiled modes listed below
//   - the surface is not multisampled (GX2_AA_MODE_1X)
// Otherwise returns GX2_FALSE.
GX2Boolean ClearUsingDB(GX2ColorBuffer *colorBuffer, GX2UTRect *dstRect)
{
    GX2Surface *surf = &colorBuffer->surface;
    u32 mip = colorBuffer->viewMip;

    // The optimization only applies when the whole mip level is cleared
    if (!((dstRect->left == 0) &&
          (dstRect->top == 0) &&
          (dstRect->right == surf->width >> mip) &&
          (dstRect->bottom == surf->height >> mip)))
    {
        return GX2_FALSE;
    }

    // Format must be on the supported list
    switch (surf->format)
    {
        case GX2_SURFACE_FORMAT_TCS_R8_G8_B8_A8_UNORM:
        case GX2_SURFACE_FORMAT_TCS_R10_G10_B10_A2_UNORM:
        case GX2_SURFACE_FORMAT_TCS_A2_B10_G10_R10_UNORM:
        case GX2_SURFACE_FORMAT_TC_R11_G11_B10_FLOAT:
        case GX2_SURFACE_FORMAT_TC_R16_G16_FLOAT:
        case GX2_SURFACE_FORMAT_TCD_R32_FLOAT:
            break;

        default:
            return GX2_FALSE;
    }

    // Tile mode must be on the supported list
    switch (surf->tileMode)
    {
        case GX2_TILE_MODE_1D_TILED_THIN1:
        case GX2_TILE_MODE_2D_TILED_THIN1:
        case GX2_TILE_MODE_2D_TILED_THIN2:
        case GX2_TILE_MODE_2D_TILED_THIN4:
        case GX2_TILE_MODE_2B_TILED_THIN1:
        case GX2_TILE_MODE_2B_TILED_THIN2:
        case GX2_TILE_MODE_2B_TILED_THIN4:
            break;

        default:
            return GX2_FALSE;
    }

    // Finally, multisampled surfaces are excluded
    return (surf->aa == GX2_AA_MODE_1X) ? GX2_TRUE : GX2_FALSE;
}

// Reports whether a 32-bit pattern, read as an fp32 value, can be used as a
// depth clear value.
//
// Per the original author's note, the DB does not preserve these values when
// depth comes from vertex Z: denormals are flushed to 0, and NaNs/Infs are
// converted to 0. Any non-zero pattern with a zero exponent field is rejected,
// which includes negative zero (0x80000000).
GX2Boolean IsValidDBFloat(u32 val)
{
    const u32 exponentField = val & 0x7F800000;

    // All-ones exponent: NaN or Inf
    if (exponentField == 0x7F800000)
    {
        return GX2_FALSE;
    }

    // Zero exponent but not the all-zero pattern: denormal
    if ((exponentField == 0x0) && (val != 0x0))
    {
        return GX2_FALSE;
    }

    return GX2_TRUE;
}

// Clears a rectangle of a color buffer and/or a depth-stencil buffer by drawing
// a rectangle with the clear shader, once per slice in the buffer's view.
//
//   colorBuffer  - color target to clear, or NULL for a depth/stencil-only clear
//   depthBuffer  - depth target to clear, or NULL for a color-only clear
//                  (at least one of the two must be non-NULL)
//   r, g, b, a   - clear color
//   depthValue   - depth clear value; asserted to lie in [0, 1]
//   stencilValue - stencil clear value
//   clearFlags   - GX2_CLEAR_DEPTH / GX2_CLEAR_STENCIL select what is written to
//                  the depth buffer; GX2_CLEAR_D_REG / GX2_CLEAR_S_REG also
//                  record the clear value via GX2SetClearDepth/GX2SetClearStencil
//   hiStencil    - optional HiStencil state to set before drawing (may be NULL)
//   dstRect      - rectangle to clear; right/bottom must exceed left/top
//
// Color-only clears that satisfy ClearUsingDB() are redirected through the
// depth unit: the packed color bits are written as a depth value into a
// temporary depth-buffer view over the color surface.
//
// The function overwrites color control, depth-stencil control, viewport,
// scissor, shaders, uniforms and render targets, and does not restore them.
void GX2UTClearRectOp(GX2ColorBuffer *colorBuffer, GX2DepthBuffer *depthBuffer,
                      f32 r, f32 g, f32 b, f32 a,
                      f32 depthValue, u8 stencilValue,
                      GX2ClearMode clearFlags, GX2HiStencilInfo *hiStencil,
                      GX2UTRect *dstRect)
{
    u32 cbFirstSlice = 0;
    u32 dbFirstSlice = 0;
    u32 numSlices = 0;
    u32 dstWidth, dstHeight, uDepthValue;
    u32 dstMip, dstSlice;
    GX2CompareFunction stencilFunc = GX2_COMPARE_NEVER;
    GX2Boolean bColorAsDepth = GX2_FALSE;
    GX2Boolean depthTestEnable = GX2_FALSE;
    GX2Boolean stencilTestEnable = GX2_DISABLE;
    // Local single-slice views that are re-pointed at each slice in the draw loop
    GX2ColorBuffer cb;
    GX2DepthBuffer db;
    
    GX2UTDebugTagIndent(__func__);

    // An empty or inverted rectangle is treated as a caller error (asserted),
    // as is passing neither buffer.
    ASSERT((colorBuffer != NULL || depthBuffer != NULL) && (dstRect != NULL));
    ASSERT((dstRect->bottom > dstRect->top) && (dstRect->right > dstRect->left) && "Invalid destination region!");
    ASSERT(depthValue >= 0.0f && depthValue <= 1.0f);

    // Verify the parameters: when both buffers are given, their mip-level
    // dimensions, AA mode and slice counts must agree.
    if (colorBuffer && depthBuffer)
    {
        ASSERT((colorBuffer->surface.width >> colorBuffer->viewMip) == (depthBuffer->surface.width >> depthBuffer->viewMip));
        ASSERT((colorBuffer->surface.height >> colorBuffer->viewMip) == (depthBuffer->surface.height >> depthBuffer->viewMip));
        ASSERT(colorBuffer->surface.aa == depthBuffer->surface.aa);
        ASSERT(colorBuffer->viewNumSlices == depthBuffer->viewNumSlices);
    }


    // Optionally record the clear values on the caller's depth buffer. This
    // happens before any redirection below, so it always targets the buffer
    // the caller passed in.
    if (clearFlags & GX2_CLEAR_D_REG) {
        ASSERT(depthBuffer != NULL);
        GX2SetClearDepth(depthBuffer, depthValue);
    }
    if (clearFlags & GX2_CLEAR_S_REG) {
        ASSERT(depthBuffer != NULL);
        GX2SetClearStencil(depthBuffer, stencilValue);
    }

    // Initialize the resources needed to clear surfaces.
    // This function only does work the first time it's called.
    GX2UTClearSurfaceRectInit();

    // By default the vertex Z carries the bit pattern of depthValue; the
    // color-as-depth path below replaces it with the packed color.
    uDepthValue = FloatToBits(depthValue);

    // Some color-only clears can be accelerated by using the depth unit instead
    if (colorBuffer != NULL && depthBuffer == NULL &&
        ClearUsingDB(colorBuffer, dstRect))
    {
        u32 dv = 0;

        if (colorBuffer->surface.format == GX2_SURFACE_FORMAT_TCD_R32_FLOAT)
        {
            // only use red component of clearcolor
            dv = FloatToBits(r);
        }
        else
        {
            // Pack the color into the 32-bit pattern the surface format stores
            dv = PackClearColor(r, g, b, a, colorBuffer->surface.format);
        }

        // If the packed pattern is not a float the DB preserves, skip the
        // redirection and fall through to the regular color path.
        if (IsValidDBFloat(dv))
        {
            // DB can only handle valid floats
            uDepthValue = dv;
            dstSlice = colorBuffer->viewFirstSlice;
            dstMip = colorBuffer->viewMip;

            // Populate db and associated surface accordingly: same memory as
            // the color surface, but described as an R32_FLOAT depth buffer
            // without HiZ. Fields not assigned here are left uninitialized.
            db.surface = colorBuffer->surface;
            db.surface.format = GX2_SURFACE_FORMAT_TCD_R32_FLOAT;
            db.surface.use = GX2_SURFACE_USE_DEPTH_BUFFER;
            db.viewMip = dstMip;
            db.viewFirstSlice = dstSlice;
            db.viewNumSlices = colorBuffer->viewNumSlices;
            db.hiZPtr = NULL;
            db.hiZSize = 0;

            // Instruct logic below that we are clearing depth only.
            // From here on the colorBuffer/depthBuffer parameters no longer
            // refer to the caller's objects: depthBuffer aliases the local db.
            clearFlags = GX2_CLEAR_DEPTH;
            colorBuffer = NULL;
            depthBuffer = &db;

            GX2SetSpecialState(GX2_SPECIAL_STATE_CLEAR_COLOR_AS_DEPTH, GX2_ENABLE);
            bColorAsDepth = GX2_TRUE;
        }
    }

    // Only set if Color Writes are on
    if (colorBuffer != NULL)
    {
        // If AA, must have auxPtr + auxSize
        ASSERT(colorBuffer->surface.aa == GX2_AA_MODE_1X ||
               (colorBuffer->auxPtr != NULL && colorBuffer->auxSize != 0));

        //Currently we do not support clearing BC formats.  Use GX2ClearColor() for this.
        ASSERT(!GX2SurfaceIsCompressed(colorBuffer->surface.format));
        ASSERT((colorBuffer->viewFirstSlice + colorBuffer->viewNumSlices <= colorBuffer->surface.depth));

        // Dimensions of the viewed mip level, never smaller than 1x1
        dstMip = colorBuffer->viewMip;
        dstWidth  = GX2Max(1, colorBuffer->surface.width  >> dstMip);
        dstHeight = GX2Max(1, colorBuffer->surface.height >> dstMip);

        // Create shallow copy of dest surface to be used as render target.
        // The view is narrowed to one slice; the draw loop advances it.
        cb = *colorBuffer;
        cb.surface.use = GX2_SURFACE_USE_COLOR_BUFFER_TEXTURE;
        cb.viewNumSlices = 1;
        // Will reinit regs later

        if (cb.surface.format == GX2_SURFACE_FORMAT_TC_R16_G16_FLOAT)
        {
            // Render to the surface as R8_G8_B8_A8_UNORM and express the packed
            // FP16 pair as four 8-bit channel values instead.
            cb.surface.format = GX2_SURFACE_FORMAT_TCS_R8_G8_B8_A8_UNORM;

            // Adjust clear color 
            u32 packedColor;

            // Pack 2 channel FP16 clear color into a 32 bit quantity
            packedColor = PackClearColor(r, g, b, a,
                                         GX2_SURFACE_FORMAT_TC_R16_G16_FLOAT);

            // Extract RGBA8 values from packed color and convert to clear values
            // (each byte is divided by 255 to form a unorm channel value)
            a = ((packedColor >> 24) & 0xff) * (1.0f / 255.0f);
            b = ((packedColor >> 16) & 0xff) * (1.0f / 255.0f);
            g = ((packedColor >> 8) & 0xff) * (1.0f / 255.0f);
            r = (packedColor & 0xff) * (1.0f / 255.0f);
        }   

        // Enable Color Writes
        GX2SetColorControl(GX2_LOGIC_OP_COPY,
                           0, //disable blending
                           GX2_DISABLE,
                           GX2_ENABLE);

        numSlices = colorBuffer->viewNumSlices;
        cbFirstSlice = cb.viewFirstSlice;
    }
    else
    {
        // Use Depth Buffer dimensions
        // NOTE(review): depthBuffer is dereferenced here without a runtime
        // check; only the ASSERT at the top guarantees it is non-NULL.
        dstMip = depthBuffer->viewMip;
        dstWidth  = GX2Max(1, depthBuffer->surface.width >> dstMip);
        dstHeight = GX2Max(1, depthBuffer->surface.height >> dstMip);

        // Disable Color Writes
        GX2SetColorControl(GX2_LOGIC_OP_COPY,
                           0, //disable blending
                           GX2_DISABLE,
                           GX2_DISABLE);

        // With no color target bound, the AA mode is set explicitly from the
        // depth surface.
        GX2SetAAMode(depthBuffer->surface.aa);
    }

    // Render to destination surface dimensions
    GX2SetViewport(0, 0, (f32)dstWidth, (f32)dstHeight, 0.0f, 1.0f);
    GX2SetScissor(0, 0, dstWidth, dstHeight);

    // Only set if Depth or Stencil Writes are on
    if (depthBuffer != NULL)
    {
        ASSERT((depthBuffer->viewFirstSlice + depthBuffer->viewNumSlices <= depthBuffer->surface.depth));

        // Shallow copy narrowed to one slice. On the color-as-depth path
        // depthBuffer already points at db, so this is a self-assignment.
        // When both buffers are present, numSlices is overwritten here with
        // the depth buffer's count (asserted equal to the color buffer's).
        db = *depthBuffer;
        numSlices = depthBuffer->viewNumSlices;
        db.viewNumSlices = 1;
        // Will reinit regs later

        if (clearFlags & GX2_CLEAR_DEPTH)
            depthTestEnable = GX2_TRUE;
        else
            depthTestEnable = GX2_FALSE;

        if (clearFlags & GX2_CLEAR_STENCIL)
        {
            // Stencil always passes and every stencil op is REPLACE (set
            // below), with stencilValue as the reference.
            GX2SetStencilMask(0xff,         //preMaskFront
                              0xff,         //writeMaskFront
                              stencilValue, //refFront
                              0xff,         //preMaskBack
                              0xff,         //writeMaskBack
                              stencilValue);//refBack
            stencilFunc = GX2_COMPARE_ALWAYS;
            stencilTestEnable = GX2_ENABLE;
        } 
        else
        {
            stencilFunc = GX2_COMPARE_NEVER;           
            stencilTestEnable = GX2_DISABLE;
        }

        // fast clears require HiZ and all edges on micro-tile boundaries
        // (every rectangle coordinate a multiple of 8)
        if ((depthBuffer->hiZPtr) &&
            !((dstRect->bottom | dstRect->top | dstRect->left | dstRect->right) & 0x7))

        {
            GX2SetSpecialState(GX2_SPECIAL_STATE_CLEAR_HIZ, GX2_ENABLE);
        }

        dbFirstSlice = db.viewFirstSlice;
    }

    // Depth Writes
    // depthWriteEnable is always passed as enabled; depthTestEnable and the
    // stencil enables are only turned on by the clear flags above.
    GX2SetDepthStencilControl(depthTestEnable,    //depthTestEnable
                              GX2_ENABLE,         //depthWriteEnable
                              GX2_COMPARE_ALWAYS, //depthFunc
                              stencilTestEnable,  //stencilTestEnable
                              stencilTestEnable,  //backStencilEnable

                              stencilFunc,        //frontStencilFunc
                              GX2_STENCIL_REPLACE,//frontStencilZPass
                              GX2_STENCIL_REPLACE,//frontStencilZFail
                              GX2_STENCIL_REPLACE,//frontStencilFail

                              stencilFunc,        //backStencilFunc
                              GX2_STENCIL_REPLACE,//backStencilZPass
                              GX2_STENCIL_REPLACE,//backStencilZFail
                              GX2_STENCIL_REPLACE //backStencilFail
                            );

    if (hiStencil != NULL)
        GX2SetHiStencilInfo(hiStencil);

    // Only one clear shader
    u32 shaderIdx = 0;

    // Set shaders
    GX2SetFetchShader(&fetchShader);
    GX2SetVertexShader(g_clearSurfaceShader[shaderIdx].pVertexShader);
    GX2SetPixelShader(g_clearSurfaceShader[shaderIdx].pPixelShader);

    // Set the uniforms to be used by the vertex shader and pixel shader.
    // Layout: { left, top, width, height } of the destination rectangle.
    f32 position_base_scale[] = 
    {
        (f32)dstRect->left,
        (f32)dstRect->top,
        (f32)dstRect->right - (f32)dstRect->left,
        (f32)dstRect->bottom - (f32)dstRect->top,
    };
    
    // Upload one vec4 per rectangle corner: x/y are the unit-square corner
    // scaled into the destination rectangle, z carries the depth bit pattern
    // (or the packed color on the color-as-depth path), w is 1.
    for (int i = 0; i < 4; i++)
    {
        f32 pos[] = {
            position_base_scale[0] + position_base_scale[2] * CLEAR_SURFACE_RECT_POSITION_DATA[i].position[0],
            position_base_scale[1] + position_base_scale[3] * CLEAR_SURFACE_RECT_POSITION_DATA[i].position[1],
            BitsToFloat(uDepthValue),
            1.0f
        };

        GX2SetVertexUniformReg(g_clearSurfaceShader[shaderIdx].u_positionLocation + i*4, 1*4, pos);
    }

    f32 clearColor[] = {r, g, b, a};
    
    GX2SetPixelUniformReg(g_clearSurfaceShader[shaderIdx].u_clearColorLocation, 1*4, clearColor);

    // One draw per slice in the view.
    // NOTE(review): `slice` is a signed int compared against the unsigned
    // numSlices; harmless for realistic slice counts but may warn.
    for (int slice = 0; slice < numSlices; slice++)
    {
        // Reset the color buffer to the next slice
        if (colorBuffer)
        {
            cb.viewFirstSlice = cbFirstSlice + slice;
            GX2InitColorBufferRegs(&cb);
            GX2SetColorBuffer(&cb, GX2_RENDER_TARGET_0);
        }

        // Reset the depth buffer to the next slice
        if (depthBuffer)
        {
            db.viewFirstSlice = dbFirstSlice + slice;
            GX2InitDepthBufferRegs(&db);
            GX2SetDepthBuffer(&db);
        }

        // Draw the clear rectangle into the currently bound slice
        GX2Draw(GX2_PRIMITIVE_RECTS, VERTEX_COUNT);
    }

    // Undo the special state enabled for the color-as-depth path
    if (bColorAsDepth)
    {
        GX2SetSpecialState(GX2_SPECIAL_STATE_CLEAR_COLOR_AS_DEPTH, GX2_DISABLE);
    }

    // Invalidate the memory range that was written: the mip chain when a
    // non-zero mip level was viewed, otherwise the base image.
    if (colorBuffer != NULL)
    {
        if ( colorBuffer->viewMip )
            GX2Invalidate(GX2_INVALIDATE_COLOR_BUFFER, colorBuffer->surface.mipPtr, colorBuffer->surface.mipSize);
        else
            GX2Invalidate(GX2_INVALIDATE_COLOR_BUFFER, colorBuffer->surface.imagePtr, colorBuffer->surface.imageSize);
    }

    // On the color-as-depth path depthBuffer is the local db, so this
    // invalidates the color surface's memory as a depth buffer.
    if (depthBuffer != NULL)
    {
        if ( depthBuffer->viewMip )
            GX2Invalidate(GX2_INVALIDATE_DEPTH_BUFFER, depthBuffer->surface.mipPtr, depthBuffer->surface.mipSize);
        else
            GX2Invalidate(GX2_INVALIDATE_DEPTH_BUFFER, depthBuffer->surface.imagePtr, depthBuffer->surface.imageSize);

        // Disable HiZ optimization (same condition under which it was enabled)
        if ((depthBuffer->hiZPtr) &&
            !((dstRect->bottom | dstRect->top | dstRect->left | dstRect->right) & 0x7))

        {
            GX2SetSpecialState(GX2_SPECIAL_STATE_CLEAR_HIZ, GX2_DISABLE);
        }
    }
    GX2UTDebugTagUndent();
}

// Invalidates (clears) the HiStencil pretest results for a rectangle of a
// depth buffer by drawing that rectangle with color, depth and stencil writes
// all disabled and both HiStencil states disabled.
//
// dstRect's right and bottom edges are exclusive: a rectangle from (0,0) to
// (width, height) covers the whole surface. depthBuffer must be non-NULL and
// have a hiZPtr. Per the original author's note, this must be invoked to
// invalidate HiStencil when changing pretest state during a frame.
//
// State shadowing is switched off (GX2SetContextState(NULL)); callers that
// use it need to restore their context afterwards.
void GX2UTInvalidateHiStencilRect(GX2UTRect *dstRect, GX2DepthBuffer *depthBuffer)
{
    GX2UTDebugTagIndent(__func__);

    // Turn off state shadowing for the duration of this operation.
    GX2SetContextState(NULL);

    // Lazily set up the shared clear shaders (no-op after the first call).
    GX2UTClearSurfaceRectInit();

    // A HiZ-backed depth buffer and a non-empty rectangle are required.
    ASSERT((depthBuffer != NULL) && (depthBuffer->hiZPtr != NULL) && (dstRect != NULL));
    ASSERT((dstRect->bottom > dstRect->top) && (dstRect->right > dstRect->left) && "Invalid destination region!");

    // No color writes, blending off.
    GX2SetColorControl(GX2_LOGIC_OP_COPY, 0, GX2_DISABLE, GX2_DISABLE);

    // No depth or stencil writes: tests disabled, every stencil op KEEP.
    GX2SetDepthStencilControl(GX2_DISABLE,          // depthEnable
                              GX2_DISABLE,          // depthWriteEnable
                              GX2_COMPARE_ALWAYS,   // depthFunc
                              GX2_DISABLE,          // stencilTestEnable
                              GX2_DISABLE,          // backStencilEnable
                              GX2_COMPARE_ALWAYS,   // frontStencilFunc
                              GX2_STENCIL_KEEP,     // frontStencilZPass
                              GX2_STENCIL_KEEP,     // frontStencilZFail
                              GX2_STENCIL_KEEP,     // frontStencilFail
                              GX2_COMPARE_ALWAYS,   // backStencilFunc
                              GX2_STENCIL_KEEP,     // backStencilZPass
                              GX2_STENCIL_KEEP,     // backStencilZFail
                              GX2_STENCIL_KEEP);    // backStencilFail

    // All stencil masks and references zeroed (front, then back).
    GX2SetStencilMask(0x00, 0x00, 0x00,
                      0x00, 0x00, 0x00);

    const u32 surfaceW = depthBuffer->surface.width;
    const u32 surfaceH = depthBuffer->surface.height;

    // Viewport and scissor use the surface dimensions, offset by the
    // rectangle's left and bottom.
    GX2SetViewport(dstRect->left, dstRect->bottom, (f32)surfaceW, (f32)surfaceH, 0.0f, 1.0f);
    GX2SetScissor(dstRect->left, dstRect->bottom, surfaceW, surfaceH);

    GX2SetDepthBuffer(depthBuffer);

    // HiStencil configuration that clears the pretest results: both states
    // are set up identically and disabled.
    GX2HiStencilInfo pretestReset;
    for (u32 stateIdx = 0; stateIdx < 2; ++stateIdx)
    {
        pretestReset.state[stateIdx].function = GX2_COMPARE_ALWAYS;
        pretestReset.state[stateIdx].reference = 0;
        pretestReset.state[stateIdx].mask = 0xFF;
        pretestReset.state[stateIdx].enable = GX2_FALSE;
    }
    GX2InitHiStencilInfoRegs(&pretestReset);
    GX2SetHiStencilInfo(&pretestReset);

    // There is only one clear shader variant.
    // NOTE: No fetch shader is needed for our shaders, see GLSL for details.
    const ClearSurfaceShader *shader = &g_clearSurfaceShader[0];
    GX2SetVertexShader(shader->pVertexShader);
    GX2SetPixelShader(shader->pPixelShader);

    // Map the rectangle into [-1, 1] coordinates; Y is flipped so that the
    // rectangle's top maps to +1.
    const f32 originX = -1.0f + 2.0f * (f32)dstRect->left / (f32)surfaceW;
    const f32 originY =  1.0f - 2.0f * (f32)dstRect->top / (f32)surfaceH;
    const f32 extentX =  2.0f * (f32)(dstRect->right - dstRect->left) / (f32)surfaceW;
    const f32 extentY = -2.0f * (f32)(dstRect->bottom - dstRect->top) / (f32)surfaceH;

    // Upload one vec4 per corner of the rectangle.
    for (u32 corner = 0; corner < VERTEX_COUNT; ++corner)
    {
        const VtxFmtF32x2 *unit = &CLEAR_SURFACE_RECT_POSITION_DATA[corner];
        f32 pos[] = {
            originX + extentX * unit->position[0],
            originY + extentY * unit->position[1],
            0.0f,
            1.0f
        };

        GX2SetVertexUniformReg(shader->u_positionLocation + corner*4, 1*4, pos);
    }

    // The clear color uniform is left untouched: nothing is written to the
    // color or depth buffers.

    GX2Draw(GX2_PRIMITIVE_TRIANGLE_STRIP, VERTEX_COUNT);

    GX2UTDebugTagUndent();
}

// Enters or leaves the constant render state used by the clear operations.
//
// enable != 0: applies the GX2UT common state and turns on the CLEAR special
//              state.
// enable == 0: turns the CLEAR special state off and resets
//              DepthStencilControl and ColorControl, which the clear
//              operation overwrites.
void GX2UTSetClearState(GX2Boolean enable)
{
    if (!enable)
    {
        // Disable any special GX2 state
        GX2SetSpecialState(GX2_SPECIAL_STATE_CLEAR, GX2_DISABLE);

        // Return the context to what the original author describes as the
        // GX2 default state. Applications with their own state management
        // may want to integrate this differently.
        GX2SetDepthStencilControl(GX2_TRUE,            //depthTestEnable
                                  GX2_TRUE,            //depthWriteEnable
                                  GX2_COMPARE_LESS,    //depthFunc
                                  GX2_FALSE,           //stencilTestEnable
                                  GX2_FALSE,           //backStencilEnable
                                  GX2_COMPARE_ALWAYS,  //frontStencilFunc
                                  GX2_STENCIL_REPLACE, //frontStencilZPass
                                  GX2_STENCIL_REPLACE, //frontStencilZFail
                                  GX2_STENCIL_REPLACE, //frontStencilFail
                                  GX2_COMPARE_ALWAYS,  //backStencilFunc
                                  GX2_STENCIL_REPLACE, //backStencilZPass
                                  GX2_STENCIL_REPLACE, //backStencilZFail
                                  GX2_STENCIL_REPLACE);//backStencilFail

        GX2SetColorControl(GX2_LOGIC_OP_COPY, GX2_DISABLE, GX2_DISABLE, GX2_ENABLE);
        return;
    }

    // If the application's steady state can be reached from the GX2UT common
    // state with a few discrete GX2 calls, customize here instead of calling
    // GX2UTSetCommonState().
    GX2UTSetCommonState();

    // Enable the special clear state. Per the original author's note, this
    // call clobbers RasterizerClipControl.
    GX2SetSpecialState(GX2_SPECIAL_STATE_CLEAR, GX2_ENABLE);
}

// Initializes the cmask portion of a color buffer's aux buffer by clearing it
// to GX2_AUX_BUFFER_CLEAR_VALUE.
//
// The region of the aux buffer starting at the cmask offset (read from
// colorBuffer->_regs[4]) is wrapped in a temporary 16-pixel-wide
// R8_G8_B8_A8_UNORM, 1D-tiled color buffer and cleared with GX2UTClearOp.
// Does nothing when the color buffer has no aux buffer.
void GX2UTSetupColorAuxBufferOp(GX2ColorBuffer *colorBuffer)
{
    // No aux buffer, nothing to initialize.
    if (!colorBuffer->auxPtr)
    {
        return;
    }

    const u32 cmaskOffset = colorBuffer->_regs[4]; // cmask_offset
    u32 ctileSize = colorBuffer->auxSize - cmaskOffset;
    u8 *cmaskBase = (u8*)colorBuffer->auxPtr + cmaskOffset;
    ASSERT((ctileSize & 0x1FF) == 0 && "Invalid MSAA Color Buffer auxSize!");

    // Describe the region as a 4-bytes-per-pixel surface that is 16 pixels
    // wide, with a height chosen so that it spans ctileSize bytes.
    const u32 viewWidth = 16;
    const u32 viewHeight = ctileSize / 4 / viewWidth;

    GX2ColorBuffer tmpBuf;
    GX2InitColorBuffer(&tmpBuf, viewWidth, viewHeight, GX2_SURFACE_FORMAT_TCS_R8_G8_B8_A8_UNORM, GX2_AA_MODE_1X);
    GX2InitColorBufferPtr(&tmpBuf, cmaskBase);

    // Switch to 1D tiling, then recompute size/alignment and the registers.
    tmpBuf.surface.tileMode = GX2_TILE_MODE_1D_TILED_THIN1;
    GX2CalcSurfaceSizeAndAlignment(&tmpBuf.surface);
    ASSERT(tmpBuf.surface.imageSize == ctileSize && "CMask Tile Size must match calculated image size!");
    GX2InitColorBufferRegs(&tmpBuf);

    // Color-only clear with every channel set to the aux clear value.
    GX2UTClearOp(&tmpBuf, NULL,
                 GX2_AUX_BUFFER_CLEAR_VALUE/255.0f,
                 GX2_AUX_BUFFER_CLEAR_VALUE/255.0f,
                 GX2_AUX_BUFFER_CLEAR_VALUE/255.0f,
                 GX2_AUX_BUFFER_CLEAR_VALUE/255.0f,
                 0.0f, 0u, GX2_CLEAR_NONE, NULL);
}