1  /*---------------------------------------------------------------------------*
2   Project:  Cafe
3   File:     lcstream.h - example locked cache (LC) streaming API
4 
5   Copyright (C) 2011 Nintendo.  All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law.  They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13  *---------------------------------------------------------------------------*/
14 
// Streaming has three advantages:
16 //   1. highest bandwidth CPU <-> MEMx
17 //   2. can parallelize computation and data transfer
18 //   3. operating on large aggregate data sizes does not thrash the
19 //      L1d/L2 caches
20 //
21 // This API allows a function to be called on a set of input and output
22 // streams.  Each input/output stream is broken down into "strip" sized
// data. Espresso's DMA engine is used to transfer strip-sized data
24 // in/out of the locked cache (LC). The function operates on "strip" size
25 // portions while it is in the LC.
26 //
27 // The LCSTREAMProcess() function takes care of pipelining the data
// transfer and calling the user's compute function to overlap computation
29 // with data transfer.
30 //
31 // This API supports exactly 1 output and 1-4 input streams.  All the
32 // streams must be the same size.  It is also assumed that the streams
33 // begin on a 64-byte (PPC_IO_BUFFER_ALIGN in OSCore.h) boundary and
34 // are a multiple of the strip size.
35 //
36 //
37 // The model of this API is:
38 //   LCSTREAMAlloc()  - allocate some space in LC
39 //
40 //   LCSTREAMAssign() - assign the stream obj to streams in memory
41 //   LCSTREAMProcess() - start stream processing
42 //    ...
43 //   LCSTREAMAssign() - can reassign stream object if same stream type
44 //   LCSTREAMProcess()
45 //    ...
46 //   LCSTREAMFree()   - free LC space
47 //
48 //
49 //  Restrictions
50 //  1. Each input and output stream must be the same size (for example, animation
51 //     blending, adding vectors/arrays).  This streaming model does not work
52 //     for cases where the input/output streams are different lengths (for example,
53 //     compression or decompression).
54 //
55 //  2. The input and output streams must be aligned to PPC_IO_BUFFER_ALIGN.
56 //
//  3. The input and output streams should be a multiple of 64 bytes
58 //     (PPC_IO_BUFFER_ALIGN or LL_CACHE_FETCH_SIZE) to eliminate
59 //     cache coherency issues with DMA data.
60 
61 
62 #ifndef __LCSTREAM_H__
63 #define __LCSTREAM_H__
64 
65 #ifdef __cplusplus
66 extern "C" {
67 #endif
68 
69 
// Maximum number of input streams a single LCStream can reference.
#define LCSTREAM_MAX_NUM_INPUTS 4
71 
72 // Type for a function that can be used with LCSTREAMProcess.  The function
73 // is called with strips of the input and output streams.  The locked
74 // cache (LC) pointers of the input strips are passed after the outStrip
75 // argument. funArg is a caller argument that is passed along from the
76 // LCSTREAMProcess call.
77 typedef void (*LCStreamFunction) (u32   stripSize,
78                                 void* funArg,
79                                 void* outStrip,
80                                 ...);
81 
82 
83 typedef struct LCStream {
84     u32 size;                // in bytes. Size of input(s) and output are same.
85     u8* in[LCSTREAM_MAX_NUM_INPUTS];    // memory input stream(s)
86     u8* out;                          // memory output stream
87 
88     u32 num_outputs;                  // for now, only 1 supported
89     u32 num_inputs;                   // 1 to LCSTREAM_MAX_NUM_INPUTS
90 
91     u32 total_strips;                 // =size/strip_size
92     u32 strip_size;                   // strip size in locked cache
93     u32 strip_size_blocks;            // strip_size in cache blocks; /32B
94 
95     // LC buffers to double buffer the sets of LC strips
96     u8* lc_buffer_A;                  // LC pointer to the start of "A" buffers
97     u8* lc_buffer_B;                  // LC pointer to the start of "B" buffers
98 } LCStream;
99 
100 
101 // Returns the locked cache size (in bytes) required for the number of
102 // inputs and outputs.
103 //    stripSize  - LC buffer size to use. 1KB recommended.
104 //    numOutputs - number of output streams.  Only 1 supported currently.
105 //    numInputs  - number of input streams.  Only 1-4 supported currently.
106 u32  LCSTREAMLCSizeRequired(u32 stripSize, u32 numOutputs, u32 numInputs);
107 
108 
109 // Returns number of bytes free/unallocated in the locked cache (LC).
110 u32  LCSTREAMLCSizeAvailable(void);
111 
112 
113 // Initializes a stream object and allocates space in the LC.  The number
114 // of bytes allocated is returned (matches LCSTREAMLCSizeRequired()).
115 //    s          - stream to initialize
116 //    stripSize  - LC buffer size to use. 1KB recommended.
117 //    numOutputs - number of output streams.  Only 1 supported currently.
118 //    numInputs  - number of input streams.  Only 1-4 supported currently.
119 u32  LCSTREAMAlloc(LCStream* s, u32 stripSize, u32 numOutputs, u32 numInputs);
120 
121 
122 // Frees the LC allocated memory associated with the stream object.
123 //    s - stream whose LC allocated memory to free
124 void LCSTREAMFree(LCStream* s);
125 
126 
127 // Assigns a stream object to memory input(s) and output.
128 // input streams are specified after the numInputs arguments.  The number
129 // of input stream pointers should equal numInputs.
130 //    s            - stream object to assign memory pointers to
131 //    streamLength - total stream data length.  Output and inputs are same size.
132 //    output       - pointer to output stream. Stream PPC_IO_BUFFER_ALIGN aligned.
133 //    numInputs    - number of input streams
134 //    input1       - pointer to input1 stream. Stream PPC_IO_BUFFER_ALIGN aligned.
135 //   [input2, input3, input4] - input streams 2-4. Streams PPC_IO_BUFFER_ALIGN aligned.
136 void LCSTREAMAssign(LCStream* s, u32 streamLength, void* output, u32 numInputs, ...);
137 
138 
139 // Call stream function f on the input + output stream.  The function is
140 // called on strip sized portions.
141 //    s        - stream which to apply function f to
142 //    do_flush - if TRUE, flushes the input streams from the cache hierarchy and
143 //               cache invalidates the output stream. If it known that none of
144 //               the input or output stream exists any core cache hierarchy,
145 //               specify FALSE to avoid unnecessary flushes/invalidates.  About
146 //               20% faster.
147 //
148 //    f        - function to call on input and output streams
149 //    funArg   - 32b value that is passed to f. Can also be used as a pointer to
150 //               a struct that can provide more caller data to f.
151 void LCSTREAMProcess(LCStream* s, BOOL do_flush, LCStreamFunction f, void* funArg);
152 
153 #ifdef __cplusplus
154 }
155 #endif
156 #endif
157 
158