/*---------------------------------------------------------------------------*
  Project:  Cafe
  File:     lcstream.h - example locked cache (LC) streaming API

  Copyright (C) Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

// Streaming has three advantages:
//   1. highest bandwidth CPU <-> MEMx
//   2. can parallelize computation and data transfer
//   3. operating on large aggregate data sizes does not thrash the
//      L1d/L2 caches
//
// This API allows a function to be called on a set of input and output
// streams.  Each input/output stream is broken down into "strip" sized
// data.  Espresso's DMA engine is used to transfer strip-sized data
// in/out of the locked cache (LC).  The function operates on "strip" size
// portions while they are in the LC.
//
// The LCSTREAMProcess() function takes care of pipelining the data
// transfer and calling the user's compute function to overlap computation
// with data transfer.
//
// This API supports exactly 1 output and 1-4 input streams.  All the
// streams must be the same size.  It is also assumed that the streams
// begin on a 64-byte (PPC_IO_BUFFER_ALIGN in OSCore.h) boundary and
// are a multiple of the strip size.
//
//
// The model of this API is:
//   LCSTREAMAlloc()         - allocate some space in LC
//
//     LCSTREAMAssign()      - assign the stream obj to streams in memory
//     LCSTREAMProcess()     - start stream processing
//     ...
//     LCSTREAMAssign()      - can reassign stream object if same stream type
//     LCSTREAMProcess()
//     ...
//   LCSTREAMFree()          - free LC space
//
//
// Restrictions
//   1. Each input and output stream must be the same size (for example, animation
//      blending, adding vectors/arrays).  This streaming model does not work
//      for cases where the input/output streams are different lengths (for example,
//      compression or decompression).
//
//   2. The input and output streams must be aligned to PPC_IO_BUFFER_ALIGN.
//
//   3. The input and output streams should be a multiple of 64 bytes
//      (PPC_IO_BUFFER_ALIGN or LL_CACHE_FETCH_SIZE) to eliminate
//      cache coherency issues with DMA data.
//
// NOTE(review): no #include is visible in this header, so u32, u8 and BOOL
// presumably must already be declared by the including file -- confirm.


#ifndef __LCSTREAM_H__
#define __LCSTREAM_H__

#ifdef __cplusplus
extern "C" {
#endif


#define LCSTREAM_MAX_NUM_INPUTS 4   // maximum number of input streams

// Type for a function that can be used with LCSTREAMProcess.  The function
// is called with strips of the input and output streams.  The locked
// cache (LC) pointers of the input strips are passed, as variadic
// arguments, after the outStrip argument.  funArg is a caller argument
// that is passed along from the LCSTREAMProcess call.
//   stripSize - size of each strip passed to the function
//   funArg    - caller argument forwarded from LCSTREAMProcess
//   outStrip  - LC pointer of the output strip
//   ...       - LC pointers of the input strips
typedef void (*LCStreamFunction) (u32   stripSize,
                                  void* funArg,
                                  void* outStrip,
                                  ...);


// Stream object: describes the memory streams being processed and the
// locked cache (LC) buffers used to process them strip by strip.
typedef struct LCStream {
    u32 size;                         // in bytes.  Size of input(s) and output are same.
    u8* in[LCSTREAM_MAX_NUM_INPUTS];  // memory input stream(s)
    u8* out;                          // memory output stream

    u32 num_outputs;                  // for now, only 1 supported
    u32 num_inputs;                   // 1 to LCSTREAM_MAX_NUM_INPUTS

    u32 total_strips;                 // = size / strip_size
    u32 strip_size;                   // strip size in locked cache
    u32 strip_size_blocks;            // strip_size in 32-byte cache blocks (strip_size / 32)

    // LC buffers used to double buffer the sets of LC strips
    u8* lc_buffer_A;                  // LC pointer to the start of "A" buffers
    u8* lc_buffer_B;                  // LC pointer to the start of "B" buffers
} LCStream;


// Returns the locked cache size (in bytes) required for the number of
// inputs and outputs.
//   stripSize  - LC buffer size to use.  1KB recommended.
//   numOutputs - number of output streams.  Only 1 supported currently.
//   numInputs  - number of input streams.  Only 1-4 supported currently.
u32 LCSTREAMLCSizeRequired(u32 stripSize, u32 numOutputs, u32 numInputs);


// Returns the number of bytes free/unallocated in the locked cache (LC).
u32 LCSTREAMLCSizeAvailable(void);


// Initializes a stream object and allocates space in the LC.  The number
// of bytes allocated is returned (matches LCSTREAMLCSizeRequired()).
//   s          - stream to initialize
//   stripSize  - LC buffer size to use.  1KB recommended.
//   numOutputs - number of output streams.  Only 1 supported currently.
//   numInputs  - number of input streams.  Only 1-4 supported currently.
u32 LCSTREAMAlloc(LCStream* s, u32 stripSize, u32 numOutputs, u32 numInputs);


// Frees the LC allocated memory associated with the stream object.
//   s - stream whose LC allocated memory is to be freed
void LCSTREAMFree(LCStream* s);


// Assigns a stream object to memory input(s) and output.
// Input streams are specified after the numInputs argument.  The number
// of input stream pointers should equal numInputs.
//   s            - stream object to assign memory pointers to
//   streamLength - total stream data length.  Output and inputs are same size.
//   output       - pointer to output stream.  Stream PPC_IO_BUFFER_ALIGN aligned.
//   numInputs    - number of input streams
//   input1       - pointer to input1 stream.  Stream PPC_IO_BUFFER_ALIGN aligned.
//   [input2, input3, input4] - input streams 2-4.  Streams PPC_IO_BUFFER_ALIGN aligned.
void LCSTREAMAssign(LCStream* s, u32 streamLength, void* output, u32 numInputs, ...);


// Calls stream function f on the input + output streams.  The function is
// called on strip sized portions.
//   s        - stream to which function f is applied
//   do_flush - if TRUE, flushes the input streams from the cache hierarchy and
//              cache invalidates the output stream.  If it is known that none
//              of the input or output streams exist in any core's cache
//              hierarchy, specify FALSE to avoid unnecessary
//              flushes/invalidates.  About 20% faster.
//
//   f        - function to call on input and output streams
//   funArg   - 32b value that is passed to f.  Can also be used as a pointer to
//              a struct that can provide more caller data to f.
void LCSTREAMProcess(LCStream* s, BOOL do_flush, LCStreamFunction f, void* funArg);

#ifdef __cplusplus
}   // extern "C"
#endif
#endif  // __LCSTREAM_H__
