/*---------------------------------------------------------------------------*
  Project:  Cafe
  File:     lcstream.h - example locked cache (LC) streaming API

  Copyright (C) 2011 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

// Streaming has three advantages:
//    1. highest bandwidth CPU <-> MEMx
//    2. can parallelize computation and data transfer
//    3. operating on large aggregate data sizes does not thrash the
//       L1d/L2 caches
//
// This API allows a function to be called on a set of input and output
// streams.  Each input/output stream is broken down into "strip" sized
// data.  Espresso's DMA engine is used to transfer strip-sized data
// in/out of the locked cache (LC).  The function operates on "strip" size
// portions while they are in the LC.
//
// The LCSTREAMProcess() function takes care of pipelining the data
// transfer and calling the user's compute function to overlap computation
// with data transfer.
//
// This API supports exactly 1 output and 1-4 input streams.  All the
// streams must be the same size.  It is also assumed that the streams
// begin on a 64-byte (PPC_IO_BUFFER_ALIGN in OSCore.h) boundary and
// are a multiple of the strip size.
//
//
// The model of this API is:
//    LCSTREAMAlloc()    - allocate some space in LC
//
//    LCSTREAMAssign()   - assign the stream obj to streams in memory
//    LCSTREAMProcess()  - start stream processing
//    ...
//    LCSTREAMAssign()   - can reassign stream object if same stream type
//    LCSTREAMProcess()
//    ...
//    LCSTREAMFree()     - free LC space
//
//
// Restrictions
//    1.
Each input and output stream must be the same size (for example, animation // blending, adding vectors/arrays). This streaming model does not work // for cases where the input/output streams are different lengths (for example, // compression or decompression). // // 2. The input and output streams must be aligned to PPC_IO_BUFFER_ALIGN. // // 3. The input and output streams should be a multiple of the 64 bytes // (PPC_IO_BUFFER_ALIGN or LL_CACHE_FETCH_SIZE) to eliminate // cache coherency issues with DMA data. #ifndef __LCSTREAM_H__ #define __LCSTREAM_H__ #ifdef __cplusplus extern "C" { #endif #define LCSTREAM_MAX_NUM_INPUTS 4 // maximum number of input streams // Type for a function that can be used with LCSTREAMProcess. The function // is called with strips of the input and output streams. The locked // cache (LC) pointers of the input strips are passed after the outStrip // argument. funArg is a caller argument that is passed along from the // LCSTREAMProcess call. typedef void (*LCStreamFunction) (u32 stripSize, void* funArg, void* outStrip, ...); typedef struct LCStream { u32 size; // in bytes. Size of input(s) and output are same. u8* in[LCSTREAM_MAX_NUM_INPUTS]; // memory input stream(s) u8* out; // memory output stream u32 num_outputs; // for now, only 1 supported u32 num_inputs; // 1 to LCSTREAM_MAX_NUM_INPUTS u32 total_strips; // =size/strip_size u32 strip_size; // strip size in locked cache u32 strip_size_blocks; // strip_size in cache blocks; /32B // LC buffers to double buffer the sets of LC strips u8* lc_buffer_A; // LC pointer to the start of "A" buffers u8* lc_buffer_B; // LC pointer to the start of "B" buffers } LCStream; // Returns the locked cache size (in bytes) required for the number of // inputs and outputs. // stripSize - LC buffer size to use. 1KB recommended. // numOutputs - number of output streams. Only 1 supported currently. // numInputs - number of input streams. Only 1-4 supported currently. 
u32 LCSTREAMLCSizeRequired(u32 stripSize, u32 numOutputs, u32 numInputs); // Returns number of bytes free/unallocated in the locked cache (LC). u32 LCSTREAMLCSizeAvailable(void); // Initializes a stream object and allocates space in the LC. The number // of bytes allocated is returned (matches LCSTREAMLCSizeRequired()). // s - stream to initialize // stripSize - LC buffer size to use. 1KB recommended. // numOutputs - number of output streams. Only 1 supported currently. // numInputs - number of input streams. Only 1-4 supported currently. u32 LCSTREAMAlloc(LCStream* s, u32 stripSize, u32 numOutputs, u32 numInputs); // Frees the LC allocated memory associated with the stream object. // s - stream whose LC allocated memory to free void LCSTREAMFree(LCStream* s); // Assigns a stream object to memory input(s) and output. // input streams are specified after the numInputs arguments. The number // of input stream pointers should equal numInputs. // s - stream object to assign memory pointers to // streamLength - total stream data length. Output and inputs are same size. // output - pointer to output stream. Stream PPC_IO_BUFFER_ALIGN aligned. // numInputs - number of input streams // input1 - pointer to input1 stream. Stream PPC_IO_BUFFER_ALIGN aligned. // [input2, input3, input4] - input streams 2-4. Streams PPC_IO_BUFFER_ALIGN aligned. void LCSTREAMAssign(LCStream* s, u32 streamLength, void* output, u32 numInputs, ...); // Call stream function f on the input + output stream. The function is // called on strip sized portions. // s - stream which to apply function f to // do_flush - if TRUE, flushes the input streams from the cache hierarchy and // cache invalidates the output stream. If it known that none of // the input or output stream exists any core cache hierarchy, // specify FALSE to avoid unnecessary flushes/invalidates. About // 20% faster. // // f - function to call on input and output streams // funArg - 32b value that is passed to f. 
Can also be used as a pointer to // a struct that can provide more caller data to f. void LCSTREAMProcess(LCStream* s, BOOL do_flush, LCStreamFunction f, void* funArg); #ifdef __cplusplus } #endif #endif