1 /*---------------------------------------------------------------------------*
2   Project:  Cafe
3   File:     stream.c
4 
5   Copyright 2011 Nintendo.  All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law.  They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13  *---------------------------------------------------------------------------*/
14 
15 #include <stdio.h>
16 #include <cafe.h>
17 #include <cafe/lcstream.h>
18 
19 
20 #define PPC_SYNC()                  __SYNC()
21 
22 
// Computes how many bytes of locked cache a stream of the given geometry
// occupies: one strip for every input and every output, times two because
// a stream is split into two buffer sets (A and B).
//
//   stripSize  - size of one strip in bytes
//   numOutputs - number of output strips per buffer set
//   numInputs  - number of input strips per buffer set
//
// Returns the byte count; the arithmetic is plain u32 math with no
// overflow check.
u32 LCSTREAMLCSizeRequired(u32 stripSize, u32 numOutputs, u32 numInputs) {
    const u32 strips_per_set = numOutputs + numInputs;

    return 2 * stripSize * strips_per_set;
}
26 
27 
LCSTREAMLCSizeAvailable(void)28 u32  LCSTREAMLCSizeAvailable(void) {
29     return LCGetUnallocated();
30 }
31 
32 
// Initializes stream descriptor |s| and reserves its locked-cache (LC)
// working area, split into two equally sized buffer sets (A and B).
//
//   s          - stream descriptor to initialize
//   stripSize  - size of one strip in bytes
//   numOutputs - number of output strips per buffer set
//   numInputs  - number of input strips per buffer set; rejected when
//                larger than LCSTREAM_MAX_NUM_INPUTS
//
// Returns the number of LC bytes reserved, or 0 on failure.  On every
// failure path lc_buffer_A and lc_buffer_B are left NULL so that the
// descriptor never carries stale buffer pointers.
u32 LCSTREAMAlloc(LCStream* s, u32 stripSize, u32 numOutputs, u32 numInputs) {
    const u32 set_bytes = stripSize * (numOutputs + numInputs);  // one buffer set
    const u32 lc_bytes  = 2 * set_bytes;                         // sets A and B
    u8* lc_area;

    // Start from "nothing allocated".  Previously the early return below
    // left these two fields untouched, unlike the LCAlloc failure path.
    s->lc_buffer_A = NULL;
    s->lc_buffer_B = NULL;

    if (numInputs > LCSTREAM_MAX_NUM_INPUTS) return 0;

    s->size         = 0;
    s->strip_size   = stripSize;
    // todo: system level macro for max dma size?
    // A strip of LC_MAX_DMA_BYTES or more is recorded as a block count of 0.
    s->strip_size_blocks = (stripSize >= LC_MAX_DMA_BYTES) ? 0 : stripSize / CACHE_BLOCK_SIZE;
    s->total_strips = 0;     // stream not yet associated with any data

    s->num_outputs  = numOutputs;
    s->num_inputs   = numInputs;

    // reserve some area within the LC.  We should later change the memory
    // allocation model of LC to that of a heap with fixed sized allocations.
    // Either 512 or 1KB blocks.
    lc_area = (u8*) LCAlloc(lc_bytes);
    if (!lc_area) {
        return 0;                       // failed to allocate
    }

    // divide up area as the input/output buffers: A first, B right after it
    s->lc_buffer_A = lc_area;
    s->lc_buffer_B = lc_area + set_bytes;

    return lc_bytes;
}
61 
62 
// Releases the locked-cache area owned by stream |s|.
//
// Buffers A and B come from a single LCAlloc() call whose base address is
// lc_buffer_A, so one LCDealloc() on lc_buffer_A returns both.  Both
// pointers are cleared afterwards; previously lc_buffer_B was left pointing
// into the released area.
void LCSTREAMFree(LCStream* s) {
    ASSERT(s->lc_buffer_A);             // make sure we have memory
    LCDealloc(s->lc_buffer_A);
    s->lc_buffer_A = NULL;
    s->lc_buffer_B = NULL;
}
68 
69 
// Associates stream |s| with the memory buffers it will process.
//
//   s            - stream previously set up with LCSTREAMAlloc
//   streamLength - length in bytes of each input/output buffer
//   output       - destination buffer
//   numInputs    - number of input pointers that follow
//   ...          - the input buffer pointers, passed as void*
//
// The number of pointers actually consumed is s->num_inputs (fixed when
// the stream was allocated), so the caller must supply at least that many.
// total_strips is streamLength divided by the strip size using integer
// division; any remainder is not counted.
void LCSTREAMAssign(LCStream* s, u32 streamLength, void* output, u32 numInputs, ...) {
    va_list ap;
    u32 i;

    // Reading more variadic arguments than were passed is undefined, so
    // catch a caller that announces fewer inputs than the stream expects.
    ASSERT(numInputs >= s->num_inputs);

    s->size         = streamLength;
    // Guard the division: a stream with a zero strip size has no strips.
    s->total_strips = (s->strip_size != 0) ? streamLength / s->strip_size : 0;

    s->out = output;

    va_start(ap, numInputs);
    for (i = 0; i < s->num_inputs; i++) {
        s->in[i] = (u8*) va_arg(ap, void*);
    }
    va_end(ap);
}
84 
85 
// Streams one input buffer through callback |f| into one output buffer,
// one strip at a time, using the two LC buffer sets of |s| so that DMA
// transfers for neighbouring strips overlap with the computation.
//
// Layout of each LC buffer set: [input | output], strip_size bytes each.
//
//   s      - stream with in[0], out, total_strips and LC buffers set up
//   f      - called as f(strip_size, funArg, lc_output, lc_input)
//   funArg - passed through to |f| unchanged
void Process1In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    // pointers to memory currently being loaded or stored to.  These will
    // be used to traverse n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // initial start of "A" buffers
    u8* next_lc_buffers = s->lc_buffer_B;   // initial start of "B" buffers
    u8* swap_tmp;
    u32 i;

    // Nothing to do for an empty stream.  Without this check the loop
    // bound total_strips-1 wraps around against the unsigned counter.
    if (s->total_strips == 0) return;

    // Enqueue one DMA load stage ahead to setup pipelining.
    // A load stage means one DMA load request for input1.
    LCLoadDMABlocks(curr_lc_buffers,                    // LC addr
                    mem_in1,                            // mem addr
                    s->strip_size_blocks);
    mem_in1 += s->strip_size;       // advance input pointer

    LCWaitDMAQueue(0);              // wait until this first load stage is done

    for (i = 0; i < s->total_strips - 1; i++) {
        // enqueue DMA for next load stage
        LCLoadDMABlocks(next_lc_buffers,                // LC addr
                        mem_in1,                        // mem addr
                        s->strip_size_blocks);
        mem_in1 += s->strip_size;  // advance input memory pointer

        // wait until one store and load stages left in queue. Overlap 1 load + 1 store DMA requests with compute
        LCWaitDMAQueue(2);

        // compute.  Call callback function f on strips in LC.
        f(s->strip_size,                                // strip size in LC
          funArg,                                       // user's own arg
          curr_lc_buffers + s->strip_size,              // output
          curr_lc_buffers);                             // input

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                           // mem addr
                         curr_lc_buffers + s->strip_size,   // LC addr
                         s->strip_size_blocks);
        mem_out += s->strip_size;           // advance output memory pointer

        // switch buffers curr<->next.  A plain swap replaces the former
        // "toggle = ++(toggle) & 0x1", which modified toggle twice in one
        // expression (undefined behavior in C).
        swap_tmp        = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap_tmp;
    }

    // wait until last load done. Only second to last store pending.
    LCWaitDMAQueue(1);

    // last compute. Call callback function f on strips in LC.
    f(s->strip_size,                                // strip size in LC
      funArg,                                       // user's own arg
      curr_lc_buffers + s->strip_size,              // output
      curr_lc_buffers);                             // input

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + s->strip_size,   // LC addr
                     s->strip_size_blocks);

    // wait until last store done
    LCWaitDMAQueue(0);
}
158 
159 
// Streams two input buffers through callback |f| into one output buffer,
// one strip at a time, using the two LC buffer sets of |s| so that DMA
// transfers for neighbouring strips overlap with the computation.
//
// Layout of each LC buffer set: [input 1 | input 2 | output], strip_size
// bytes each.
//
//   s      - stream with in[0..1], out, total_strips and LC buffers set up
//   f      - called as f(strip_size, funArg, lc_output, lc_in1, lc_in2)
//   funArg - passed through to |f| unchanged
void Process2In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    // pointers to memory currently being loaded or stored to.  These will
    // be used to traverse n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_in2 = s->in[1];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // initial start of "A" buffers
    u8* next_lc_buffers = s->lc_buffer_B;   // initial start of "B" buffers
    u8* swap_tmp;
    u32 i;

    // Nothing to do for an empty stream.  Without this check the loop
    // bound total_strips-1 wraps around against the unsigned counter.
    if (s->total_strips == 0) return;

    // Enqueue one DMA load stage ahead to setup pipelining.
    // A load stage means two DMA load requests for input1 and input2.
    LCLoadDMABlocks(curr_lc_buffers,                    // LC addr
                    mem_in1,                            // mem addr
                    s->strip_size_blocks);
    LCLoadDMABlocks(curr_lc_buffers + s->strip_size,    // LC addr
                    mem_in2,                            // mem addr
                    s->strip_size_blocks);
    // advance input pointers
    mem_in1 += s->strip_size;
    mem_in2 += s->strip_size;

    LCWaitDMAQueue(0);              // wait until this first load stage is done

    for (i = 0; i < s->total_strips - 1; i++) {
        // enqueue DMAs for next load stage
        LCLoadDMABlocks(next_lc_buffers,                 // LC addr
                        mem_in1,                         // mem addr
                        s->strip_size_blocks);
        LCLoadDMABlocks(next_lc_buffers + s->strip_size, // LC addr
                        mem_in2,                         // mem addr
                        s->strip_size_blocks);
        // advance input memory pointers
        mem_in1 += s->strip_size;
        mem_in2 += s->strip_size;

        // wait until one store and load stages left in queue. Overlap 2 loads + 1 store DMA requests with compute
        LCWaitDMAQueue(3);

        // compute.  Call callback function f on strips in LC.
        f(s->strip_size,                                // strip size in LC
          funArg,                                       // user's own arg
          curr_lc_buffers + 2*s->strip_size,            // output
          curr_lc_buffers,                              // input 1
          curr_lc_buffers +   s->strip_size);           // input 2

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                           // mem addr
                         curr_lc_buffers + 2*s->strip_size, // LC addr
                         s->strip_size_blocks);
        mem_out += s->strip_size;          // advance output memory pointer

        // switch buffers curr<->next.  A plain swap replaces the former
        // "toggle = ++(toggle) & 0x1", which modified toggle twice in one
        // expression (undefined behavior in C).
        swap_tmp        = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap_tmp;
    }

    // wait until previous loads done. Only second to last store pending.
    LCWaitDMAQueue(1);

    // last compute.  Call callback function f on strips in LC.
    f(s->strip_size,                                // strip size in LC
      funArg,                                       // user's own arg
      curr_lc_buffers + 2*s->strip_size,            // output
      curr_lc_buffers,                              // input 1
      curr_lc_buffers +   s->strip_size);           // input 2

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + 2*s->strip_size, // LC addr
                     s->strip_size_blocks);

    // Wait until last store done.
    LCWaitDMAQueue(0);
}
246 
247 
// Streams three input buffers through callback |f| into one output buffer,
// one strip at a time, using the two LC buffer sets of |s| so that DMA
// transfers for neighbouring strips overlap with the computation.
//
// Layout of each LC buffer set: [input 1 | input 2 | input 3 | output],
// strip_size bytes each.
//
//   s      - stream with in[0..2], out, total_strips and LC buffers set up
//   f      - called as f(strip_size, funArg, lc_output, lc_in1, lc_in2, lc_in3)
//   funArg - passed through to |f| unchanged
void Process3In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    // pointers to memory currently being loaded or stored to.  These will
    // be used to traverse n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_in2 = s->in[1];
    u8* mem_in3 = s->in[2];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // initial start of "A" buffers
    u8* next_lc_buffers = s->lc_buffer_B;   // initial start of "B" buffers
    u8* swap_tmp;
    u32 i;

    // Nothing to do for an empty stream.  Without this check the loop
    // bound total_strips-1 wraps around against the unsigned counter.
    if (s->total_strips == 0) return;

    // Enqueue one DMA load stage ahead to setup pipelining.
    // A load stage means three DMA load requests for inputs 1-3.
    LCLoadDMABlocks(curr_lc_buffers,                    // LC addr
                    mem_in1,                            // mem addr
                    s->strip_size_blocks);
    LCLoadDMABlocks(curr_lc_buffers + s->strip_size,    // LC addr
                    mem_in2,                            // mem addr
                    s->strip_size_blocks);
    LCLoadDMABlocks(curr_lc_buffers + 2*s->strip_size,  // LC addr
                    mem_in3,                            // mem addr
                    s->strip_size_blocks);
    // advance input pointers
    mem_in1 += s->strip_size;
    mem_in2 += s->strip_size;
    mem_in3 += s->strip_size;

    LCWaitDMAQueue(0);              // wait until this first load stage is done

    for (i = 0; i < s->total_strips - 1; i++) {
        // enqueue DMAs for next load stage
        LCLoadDMABlocks(next_lc_buffers,                   // LC addr
                        mem_in1,                           // mem addr
                        s->strip_size_blocks);
        LCLoadDMABlocks(next_lc_buffers + s->strip_size,   // LC addr
                        mem_in2,                           // mem addr
                        s->strip_size_blocks);
        LCLoadDMABlocks(next_lc_buffers + 2*s->strip_size, // LC addr
                        mem_in3,                           // mem addr
                        s->strip_size_blocks);
        // advance input memory pointers
        mem_in1 += s->strip_size;
        mem_in2 += s->strip_size;
        mem_in3 += s->strip_size;

        // wait until one store and load stages left in queue. Overlap 3 loads + 1 store DMA requests with compute
        LCWaitDMAQueue(4);

        // compute.  Call callback function f on strips in LC.
        f(s->strip_size,                                // strip size in LC
          funArg,                                       // user's own arg
          curr_lc_buffers + 3*s->strip_size,            // output
          curr_lc_buffers,                              // input 1
          curr_lc_buffers +   s->strip_size,            // input 2
          curr_lc_buffers + 2*s->strip_size);           // input 3

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                           // mem addr
                         curr_lc_buffers + 3*s->strip_size, // LC addr
                         s->strip_size_blocks);
        mem_out += s->strip_size;          // advance output memory pointer

        // switch buffers curr<->next.  A plain swap replaces the former
        // "toggle = ++(toggle) & 0x1", which modified toggle twice in one
        // expression (undefined behavior in C).
        swap_tmp        = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap_tmp;
    }

    // wait until previous loads done. Only second to last store pending.
    LCWaitDMAQueue(1);

    // last compute.  Call callback function f on strips in LC.
    f(s->strip_size,                                // strip size in LC
      funArg,                                       // user's own arg
      curr_lc_buffers + 3*s->strip_size,            // output
      curr_lc_buffers,                              // input 1
      curr_lc_buffers +   s->strip_size,            // input 2
      curr_lc_buffers + 2*s->strip_size);           // input 3

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + 3*s->strip_size, // LC addr
                     s->strip_size_blocks);

    // Wait until last store done.
    LCWaitDMAQueue(0);
}
344 
345 
// Streams four input buffers through callback |f| into one output buffer,
// one strip at a time, using the two LC buffer sets of |s| so that DMA
// transfers for neighbouring strips overlap with the computation.
//
// Layout of each LC buffer set:
// [input 1 | input 2 | input 3 | input 4 | output], strip_size bytes each.
//
//   s      - stream with in[0..3], out, total_strips and LC buffers set up
//   f      - called as f(strip_size, funArg, lc_output, lc_in1, lc_in2,
//            lc_in3, lc_in4)
//   funArg - passed through to |f| unchanged
void Process4In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    // pointers to memory currently being loaded or stored to.  These will
    // be used to traverse n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_in2 = s->in[1];
    u8* mem_in3 = s->in[2];
    u8* mem_in4 = s->in[3];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // initial start of "A" buffers
    u8* next_lc_buffers = s->lc_buffer_B;   // initial start of "B" buffers
    u8* swap_tmp;
    u32 i;

    // Nothing to do for an empty stream.  Without this check the loop
    // bound total_strips-1 wraps around against the unsigned counter.
    if (s->total_strips == 0) return;

    // Enqueue one DMA load stage ahead to setup pipelining.
    // A load stage means four DMA load requests for inputs 1-4.
    LCLoadDMABlocks(curr_lc_buffers,                    // LC addr
                    mem_in1,                            // mem addr
                    s->strip_size_blocks);
    LCLoadDMABlocks(curr_lc_buffers + s->strip_size,    // LC addr
                    mem_in2,                            // mem addr
                    s->strip_size_blocks);
    LCLoadDMABlocks(curr_lc_buffers + 2*s->strip_size,  // LC addr
                    mem_in3,                            // mem addr
                    s->strip_size_blocks);
    LCLoadDMABlocks(curr_lc_buffers + 3*s->strip_size,  // LC addr
                    mem_in4,                            // mem addr
                    s->strip_size_blocks);
    // advance input pointers
    mem_in1 += s->strip_size;
    mem_in2 += s->strip_size;
    mem_in3 += s->strip_size;
    mem_in4 += s->strip_size;

    LCWaitDMAQueue(0);              // wait until this first load stage is done

    for (i = 0; i < s->total_strips - 1; i++) {
        // enqueue DMAs for next load stage
        LCLoadDMABlocks(next_lc_buffers,                   // LC addr
                        mem_in1,                           // mem addr
                        s->strip_size_blocks);
        LCLoadDMABlocks(next_lc_buffers + s->strip_size,   // LC addr
                        mem_in2,                           // mem addr
                        s->strip_size_blocks);
        LCLoadDMABlocks(next_lc_buffers + 2*s->strip_size, // LC addr
                        mem_in3,                           // mem addr
                        s->strip_size_blocks);
        LCLoadDMABlocks(next_lc_buffers + 3*s->strip_size, // LC addr
                        mem_in4,                           // mem addr
                        s->strip_size_blocks);
        // advance input memory pointers
        mem_in1 += s->strip_size;
        mem_in2 += s->strip_size;
        mem_in3 += s->strip_size;
        mem_in4 += s->strip_size;

        // Wait until at most 4 requests are left in the DMA queue before
        // touching the current buffers.
        // NOTE(review): the 1-, 2- and 3-input variants wait on
        // (number of inputs + 1); by that pattern this would be 5.  The
        // value 4 is kept unchanged here -- confirm whether it is intended.
        LCWaitDMAQueue(4);

        // compute.  Call callback function f on strips in LC.
        f(s->strip_size,                                // strip size in LC
          funArg,                                       // user's own arg
          curr_lc_buffers + 4*s->strip_size,            // output
          curr_lc_buffers,                              // input 1
          curr_lc_buffers +   s->strip_size,            // input 2
          curr_lc_buffers + 2*s->strip_size,            // input 3
          curr_lc_buffers + 3*s->strip_size);           // input 4

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                           // mem addr
                         curr_lc_buffers + 4*s->strip_size, // LC addr
                         s->strip_size_blocks);
        mem_out += s->strip_size;          // advance output memory pointer

        // switch buffers curr<->next.  A plain swap replaces the former
        // "toggle = ++(toggle) % 2", which modified toggle twice in one
        // expression (undefined behavior in C).
        swap_tmp        = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap_tmp;
    }

    // wait until previous loads done. Only second to last store pending.
    LCWaitDMAQueue(1);

    // last compute.  Call callback function f on strips in LC.
    f(s->strip_size,                                // strip size in LC
      funArg,                                       // user's own arg
      curr_lc_buffers + 4*s->strip_size,            // output
      curr_lc_buffers,                              // input 1
      curr_lc_buffers +   s->strip_size,            // input 2
      curr_lc_buffers + 2*s->strip_size,            // input 3
      curr_lc_buffers + 3*s->strip_size);           // input 4

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + 4*s->strip_size, // LC addr
                     s->strip_size_blocks);

    // Wait until last store done.
    LCWaitDMAQueue(0);
}
454 
455 
// Runs callback |f| over the whole stream |s|, dispatching to the
// ProcessNIn1Out routine that matches the stream's input count.
//
//   s        - stream that has been allocated and assigned its buffers
//   do_flush - when set, every input range is flushed from the data cache
//              and the output range is invalidated before any DMA starts
//   f        - per-strip callback handed to the processing routine
//   funArg   - passed through to |f| unchanged
//
// Input counts other than 1 through 4 perform no processing.
void LCSTREAMProcess(LCStream* s, BOOL do_flush, LCStreamFunction f, void* funArg) {
    // make sure input and output data are not in the cache hierarchy before
    // we DMA memory <-> LC
    if (do_flush) {
        u32 input_idx = 0;

        while (input_idx < s->num_inputs) {     // flush all the input streams
            DCFlushRangeNoSync(s->in[input_idx], s->size);
            input_idx++;
        }
        DCInvalidateRange(s->out, s->size);     // invalidate the output stream
        PPC_SYNC();
    }

    if (s->num_inputs == 1) {
        Process1In1Out(s, f, funArg);
    } else if (s->num_inputs == 2) {
        Process2In1Out(s, f, funArg);
    } else if (s->num_inputs == 3) {
        Process3In1Out(s, f, funArg);
    } else if (s->num_inputs == 4) {
        Process4In1Out(s, f, funArg);
    }
}
484