1 /*---------------------------------------------------------------------------*
2 Project: Cafe
3 File: stream.c
4
5 Copyright 2011 Nintendo. All rights reserved.
6
7 These coded instructions, statements, and computer programs contain
8 proprietary information of Nintendo of America Inc. and/or Nintendo
9 Company Ltd., and are protected by Federal copyright law. They may
10 not be disclosed to third parties or copied or duplicated in any form,
11 in whole or in part, without the prior written consent of Nintendo.
12
13 *---------------------------------------------------------------------------*/
14
15 #include <stdio.h>
16 #include <cafe.h>
17 #include <cafe/lcstream.h>
18
19
/* Local shorthand for the __SYNC() intrinsic. Within this file it is issued
 * once in LCSTREAMProcess, after the DCFlushRangeNoSync / DCInvalidateRange
 * calls and before any DMA is started.
 * NOTE(review): presumably emits the PowerPC `sync` barrier - confirm against
 * the toolchain documentation. */
20 #define PPC_SYNC() __SYNC()
21
22
/*
 * Computes how many bytes of locked cache a stream with the given geometry
 * needs: two buffer sets, each with room for one strip per input and one
 * strip per output.
 *
 *   stripSize  - size of one strip in bytes
 *   numOutputs - output strips per buffer set
 *   numInputs  - input strips per buffer set
 *
 * Returns 2 * stripSize * (numOutputs + numInputs), evaluated in u32
 * arithmetic (wraps on overflow).
 */
u32 LCSTREAMLCSizeRequired(u32 stripSize, u32 numOutputs, u32 numInputs) {
    const u32 strips_per_set = numInputs + numOutputs;
    const u32 bytes_per_set  = strips_per_set * stripSize;

    // Two sets: one being computed on while the other is being transferred.
    return bytes_per_set + bytes_per_set;
}
26
27
LCSTREAMLCSizeAvailable(void)28 u32 LCSTREAMLCSizeAvailable(void) {
29 return LCGetUnallocated();
30 }
31
32
/*
 * Initializes stream descriptor `s` and reserves its working area in the
 * locked cache (LC).
 *
 * The area is split into two equally sized buffer sets, A and B. Each set has
 * room for one strip per input and one strip per output, so the total request
 * is 2 * stripSize * (numOutputs + numInputs) bytes.
 *
 *   s          - stream descriptor to initialize; must not be NULL
 *   stripSize  - size of one strip in bytes
 *   numOutputs - output strips per buffer set
 *   numInputs  - input strips per buffer set; must not exceed
 *                LCSTREAM_MAX_NUM_INPUTS
 *
 * Returns the number of LC bytes reserved, or 0 on failure (too many inputs,
 * or LCAlloc returned NULL). After any failure both s->lc_buffer_A and
 * s->lc_buffer_B are NULL, so the descriptor never carries stale pointers.
 */
u32 LCSTREAMAlloc(LCStream* s, u32 stripSize, u32 numOutputs, u32 numInputs) {
    // Put the stream into a defined "owns no LC memory, no data assigned"
    // state before any validation, so every early return leaves it sane.
    s->lc_buffer_A = NULL;
    s->lc_buffer_B = NULL;
    s->size = 0;
    s->total_strips = 0;    // stream not yet associated with any data

    if (numInputs > LCSTREAM_MAX_NUM_INPUTS) return 0;

    s->strip_size = stripSize;
    // todo: system level macro for max dma size?
    // A block count of 0 is stored once the strip reaches LC_MAX_DMA_BYTES.
    // NOTE(review): presumably 0 encodes a maximum-length DMA transfer; strips
    // larger than LC_MAX_DMA_BYTES, or not a multiple of CACHE_BLOCK_SIZE,
    // would then not be transferred completely - confirm with the LC DMA docs.
    s->strip_size_blocks = (stripSize >= LC_MAX_DMA_BYTES) ? 0 : stripSize / CACHE_BLOCK_SIZE;

    s->num_outputs = numOutputs;
    s->num_inputs = numInputs;

    // Reserve some area within the LC. We should later change the memory
    // allocation model of LC to that of a heap with fixed sized allocations.
    // Either 512 or 1KB blocks.
    const u32 strips_per_set = numOutputs + numInputs;
    const u32 lc_bytes = 2 * stripSize * strips_per_set;

    u8* lc_area = (u8*) LCAlloc(lc_bytes);
    if (!lc_area) {
        return 0;   // failed to allocate; buffer pointers are already NULL
    }

    // Divide the area into the two buffer sets: A first, B right behind it.
    s->lc_buffer_A = lc_area;
    s->lc_buffer_B = lc_area + s->strip_size * strips_per_set;

    return lc_bytes;
}
61
62
/*
 * Returns the stream's locked-cache area to the LC allocator.
 *
 * Buffer set B lives inside the same allocation as buffer set A (LCSTREAMAlloc
 * carves both out of one LCAlloc result), so a single LCDealloc on
 * lc_buffer_A releases both. Both pointers are cleared afterwards so neither
 * is left pointing into released memory.
 *
 *   s - stream previously set up with LCSTREAMAlloc; must not be NULL
 */
void LCSTREAMFree(LCStream* s) {
    ASSERT(s->lc_buffer_A); // make sure we have memory

    // If the ASSERT above is compiled out, do not hand a NULL pointer to
    // the allocator.
    if (s->lc_buffer_A) {
        LCDealloc(s->lc_buffer_A);
    }

    s->lc_buffer_A = NULL;
    s->lc_buffer_B = NULL;
}
68
69
/*
 * Binds main-memory data to a stream that was set up with LCSTREAMAlloc.
 *
 *   s            - stream descriptor; must not be NULL
 *   streamLength - length in bytes of each input array and of the output array
 *   output       - main-memory destination for the results
 *   numInputs    - number of input pointers passed in the variable arguments
 *   ...          - numInputs pointers (void*), one per input array
 *
 * total_strips is streamLength / strip_size rounded down, so a trailing
 * partial strip is not processed. If strip_size is 0 the stream is given
 * zero strips instead of dividing by zero.
 *
 * At most s->num_inputs pointers are consumed. If fewer than s->num_inputs
 * are supplied, the remaining input slots are set to NULL rather than being
 * read from arguments that were never passed.
 */
void LCSTREAMAssign(LCStream* s, u32 streamLength, void* output, u32 numInputs, ...) {
    s->size = streamLength;
    s->total_strips = (s->strip_size != 0) ? streamLength / s->strip_size : 0;

    s->out = output;

    // Never pull more arguments than the caller says it passed, and never
    // store more than the stream was allocated for.
    const u32 supplied = (numInputs < s->num_inputs) ? numInputs : s->num_inputs;

    va_list ap;
    va_start(ap, numInputs);
    u32 i;
    for (i = 0; i < supplied; i++) {
        s->in[i] = (u8*) va_arg(ap, void*);
    }
    va_end(ap);

    // Inputs the caller did not provide get a well-defined value.
    for (; i < s->num_inputs; i++) {
        s->in[i] = NULL;
    }
}
84
85
/*
 * Runs callback `f` over every strip of a one-input, one-output stream,
 * double-buffering through the locked cache (LC).
 *
 * Layout of each LC buffer set: [ input | output ], one strip each.
 * While `f` works on the current set, the DMA load for the following strip
 * (into the other set) and the store of the preceding result are in flight.
 *
 *   s      - stream set up with LCSTREAMAlloc and LCSTREAMAssign
 *   f      - called as f(strip_size, funArg, lc_output, lc_input)
 *   funArg - passed through to `f` untouched
 *
 * Does nothing when the stream has no strips. The stream geometry
 * (strip_size, strip_size_blocks, total_strips) is read once on entry.
 */
void Process1In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    const u32 strip  = s->strip_size;
    const u32 blocks = s->strip_size_blocks;
    const u32 strips = s->total_strips;

    // With zero strips the "strips - 1" pipeline below would wrap around.
    if (strips == 0) return;

    // Pointers to memory currently being loaded or stored to. These are
    // used to traverse the n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // set being computed on
    u8* next_lc_buffers = s->lc_buffer_B;   // set being filled by DMA

    // Enqueue one DMA load stage ahead to set up pipelining.
    // A load stage means one DMA load request for input 1.
    LCLoadDMABlocks(curr_lc_buffers,        // LC addr
                    mem_in1,                // mem addr
                    blocks);
    mem_in1 += strip;                       // advance input pointer

    LCWaitDMAQueue(0);  // wait until this first load stage is done

    u32 i;
    for (i = 1; i < strips; i++) {
        // enqueue DMA for next load stage
        LCLoadDMABlocks(next_lc_buffers,    // LC addr
                        mem_in1,            // mem addr
                        blocks);
        mem_in1 += strip;                   // advance input memory pointer

        // Wait until one store and one load stage are left in the queue.
        // Overlaps 1 load + 1 store DMA request with compute.
        LCWaitDMAQueue(2);

        // compute. Call callback function f on strips in LC.
        f(strip,                            // strip size in LC
          funArg,                           // user's own arg
          curr_lc_buffers + strip,          // output
          curr_lc_buffers);                 // input

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                   // mem addr
                         curr_lc_buffers + strip,   // LC addr
                         blocks);
        mem_out += strip;                   // advance output memory pointer

        // switch buffers curr<->next
        u8* swap = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap;
    }

    // Wait until the last load is done. Only the second-to-last store pending.
    LCWaitDMAQueue(1);

    // last compute. Call callback function f on strips in LC.
    f(strip,                                // strip size in LC
      funArg,                               // user's own arg
      curr_lc_buffers + strip,              // output
      curr_lc_buffers);                     // input

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                       // mem addr
                     curr_lc_buffers + strip,       // LC addr
                     blocks);

    // wait until last store done
    LCWaitDMAQueue(0);
}
158
159
/*
 * Runs callback `f` over every strip of a two-input, one-output stream,
 * double-buffering through the locked cache (LC).
 *
 * Layout of each LC buffer set: [ input 1 | input 2 | output ], one strip
 * each. While `f` works on the current set, the DMA loads for the following
 * strip (into the other set) and the store of the preceding result are in
 * flight.
 *
 *   s      - stream set up with LCSTREAMAlloc and LCSTREAMAssign
 *   f      - called as f(strip_size, funArg, lc_output, lc_input1, lc_input2)
 *   funArg - passed through to `f` untouched
 *
 * Does nothing when the stream has no strips. The stream geometry
 * (strip_size, strip_size_blocks, total_strips) is read once on entry.
 */
void Process2In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    const u32 strip  = s->strip_size;
    const u32 blocks = s->strip_size_blocks;
    const u32 strips = s->total_strips;

    // With zero strips the "strips - 1" pipeline below would wrap around.
    if (strips == 0) return;

    // Pointers to memory currently being loaded or stored to. These are
    // used to traverse the n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_in2 = s->in[1];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // set being computed on
    u8* next_lc_buffers = s->lc_buffer_B;   // set being filled by DMA

    // Enqueue one DMA load stage ahead to set up pipelining.
    // A load stage means two DMA load requests, for input 1 and input 2.
    LCLoadDMABlocks(curr_lc_buffers,            // LC addr
                    mem_in1,                    // mem addr
                    blocks);
    LCLoadDMABlocks(curr_lc_buffers + strip,    // LC addr
                    mem_in2,                    // mem addr
                    blocks);
    // advance input pointers
    mem_in1 += strip;
    mem_in2 += strip;

    LCWaitDMAQueue(0);  // wait until this first load stage is done

    u32 i;
    for (i = 1; i < strips; i++) {
        // enqueue DMAs for next load stage
        LCLoadDMABlocks(next_lc_buffers,            // LC addr
                        mem_in1,                    // mem addr
                        blocks);
        LCLoadDMABlocks(next_lc_buffers + strip,    // LC addr
                        mem_in2,                    // mem addr
                        blocks);
        // advance input memory pointers
        mem_in1 += strip;
        mem_in2 += strip;

        // Wait until one store and one load stage are left in the queue.
        // Overlaps 2 loads + 1 store DMA request with compute.
        LCWaitDMAQueue(3);

        // compute. Call callback function f on strips in LC.
        f(strip,                            // strip size in LC
          funArg,                           // user's own arg
          curr_lc_buffers + 2*strip,        // output
          curr_lc_buffers,                  // input 1
          curr_lc_buffers + strip);         // input 2

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                       // mem addr
                         curr_lc_buffers + 2*strip,     // LC addr
                         blocks);
        mem_out += strip;                   // advance output memory pointer

        // switch buffers curr<->next
        u8* swap = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap;
    }

    // Wait until previous loads are done. Only the second-to-last store pending.
    LCWaitDMAQueue(1);

    // last compute. Call callback function f on strips in LC.
    f(strip,                                // strip size in LC
      funArg,                               // user's own arg
      curr_lc_buffers + 2*strip,            // output
      curr_lc_buffers,                      // input 1
      curr_lc_buffers + strip);             // input 2

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + 2*strip,         // LC addr
                     blocks);

    // Wait until last store done.
    LCWaitDMAQueue(0);
}
246
247
/*
 * Runs callback `f` over every strip of a three-input, one-output stream,
 * double-buffering through the locked cache (LC).
 *
 * Layout of each LC buffer set: [ input 1 | input 2 | input 3 | output ],
 * one strip each. While `f` works on the current set, the DMA loads for the
 * following strip (into the other set) and the store of the preceding result
 * are in flight.
 *
 *   s      - stream set up with LCSTREAMAlloc and LCSTREAMAssign
 *   f      - called as f(strip_size, funArg, lc_output,
 *                        lc_input1, lc_input2, lc_input3)
 *   funArg - passed through to `f` untouched
 *
 * Does nothing when the stream has no strips. The stream geometry
 * (strip_size, strip_size_blocks, total_strips) is read once on entry.
 */
void Process3In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    const u32 strip  = s->strip_size;
    const u32 blocks = s->strip_size_blocks;
    const u32 strips = s->total_strips;

    // With zero strips the "strips - 1" pipeline below would wrap around.
    if (strips == 0) return;

    // Pointers to memory currently being loaded or stored to. These are
    // used to traverse the n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_in2 = s->in[1];
    u8* mem_in3 = s->in[2];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // set being computed on
    u8* next_lc_buffers = s->lc_buffer_B;   // set being filled by DMA

    // Enqueue one DMA load stage ahead to set up pipelining.
    // A load stage means three DMA load requests, for inputs 1-3.
    LCLoadDMABlocks(curr_lc_buffers,            // LC addr
                    mem_in1,                    // mem addr
                    blocks);
    LCLoadDMABlocks(curr_lc_buffers + strip,    // LC addr
                    mem_in2,                    // mem addr
                    blocks);
    LCLoadDMABlocks(curr_lc_buffers + 2*strip,  // LC addr
                    mem_in3,                    // mem addr
                    blocks);
    // advance input pointers
    mem_in1 += strip;
    mem_in2 += strip;
    mem_in3 += strip;

    LCWaitDMAQueue(0);  // wait until this first load stage is done

    u32 i;
    for (i = 1; i < strips; i++) {
        // enqueue DMAs for next load stage
        LCLoadDMABlocks(next_lc_buffers,            // LC addr
                        mem_in1,                    // mem addr
                        blocks);
        LCLoadDMABlocks(next_lc_buffers + strip,    // LC addr
                        mem_in2,                    // mem addr
                        blocks);
        LCLoadDMABlocks(next_lc_buffers + 2*strip,  // LC addr
                        mem_in3,                    // mem addr
                        blocks);
        // advance input memory pointers
        mem_in1 += strip;
        mem_in2 += strip;
        mem_in3 += strip;

        // Wait until one store and one load stage are left in the queue.
        // Overlaps 3 loads + 1 store DMA request with compute.
        LCWaitDMAQueue(4);

        // compute. Call callback function f on strips in LC.
        f(strip,                            // strip size in LC
          funArg,                           // user's own arg
          curr_lc_buffers + 3*strip,        // output
          curr_lc_buffers,                  // input 1
          curr_lc_buffers + strip,          // input 2
          curr_lc_buffers + 2*strip);       // input 3

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                       // mem addr
                         curr_lc_buffers + 3*strip,     // LC addr
                         blocks);
        mem_out += strip;                   // advance output memory pointer

        // switch buffers curr<->next
        u8* swap = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap;
    }

    // Wait until previous loads are done. Only the second-to-last store pending.
    LCWaitDMAQueue(1);

    // last compute. Call callback function f on strips in LC.
    f(strip,                                // strip size in LC
      funArg,                               // user's own arg
      curr_lc_buffers + 3*strip,            // output
      curr_lc_buffers,                      // input 1
      curr_lc_buffers + strip,              // input 2
      curr_lc_buffers + 2*strip);           // input 3

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + 3*strip,         // LC addr
                     blocks);

    // Wait until last store done.
    LCWaitDMAQueue(0);
}
344
345
/*
 * Runs callback `f` over every strip of a four-input, one-output stream,
 * double-buffering through the locked cache (LC).
 *
 * Layout of each LC buffer set:
 * [ input 1 | input 2 | input 3 | input 4 | output ], one strip each.
 * While `f` works on the current set, the DMA loads for the following strip
 * (into the other set) are in flight.
 *
 *   s      - stream set up with LCSTREAMAlloc and LCSTREAMAssign
 *   f      - called as f(strip_size, funArg, lc_output,
 *                        lc_input1, lc_input2, lc_input3, lc_input4)
 *   funArg - passed through to `f` untouched
 *
 * Does nothing when the stream has no strips. The stream geometry
 * (strip_size, strip_size_blocks, total_strips) is read once on entry.
 */
void Process4In1Out(LCStream* s, LCStreamFunction f, void* funArg) {
    const u32 strip  = s->strip_size;
    const u32 blocks = s->strip_size_blocks;
    const u32 strips = s->total_strips;

    // With zero strips the "strips - 1" pipeline below would wrap around.
    if (strips == 0) return;

    // Pointers to memory currently being loaded or stored to. These are
    // used to traverse the n strips of total data.
    u8* mem_in1 = s->in[0];
    u8* mem_in2 = s->in[1];
    u8* mem_in3 = s->in[2];
    u8* mem_in4 = s->in[3];
    u8* mem_out = s->out;

    u8* curr_lc_buffers = s->lc_buffer_A;   // set being computed on
    u8* next_lc_buffers = s->lc_buffer_B;   // set being filled by DMA

    // Enqueue one DMA load stage ahead to set up pipelining.
    // A load stage means four DMA load requests, for inputs 1-4.
    LCLoadDMABlocks(curr_lc_buffers,            // LC addr
                    mem_in1,                    // mem addr
                    blocks);
    LCLoadDMABlocks(curr_lc_buffers + strip,    // LC addr
                    mem_in2,                    // mem addr
                    blocks);
    LCLoadDMABlocks(curr_lc_buffers + 2*strip,  // LC addr
                    mem_in3,                    // mem addr
                    blocks);
    LCLoadDMABlocks(curr_lc_buffers + 3*strip,  // LC addr
                    mem_in4,                    // mem addr
                    blocks);
    // advance input pointers
    mem_in1 += strip;
    mem_in2 += strip;
    mem_in3 += strip;
    mem_in4 += strip;

    LCWaitDMAQueue(0);  // wait until this first load stage is done

    u32 i;
    for (i = 1; i < strips; i++) {
        // enqueue DMAs for next load stage
        LCLoadDMABlocks(next_lc_buffers,            // LC addr
                        mem_in1,                    // mem addr
                        blocks);
        LCLoadDMABlocks(next_lc_buffers + strip,    // LC addr
                        mem_in2,                    // mem addr
                        blocks);
        LCLoadDMABlocks(next_lc_buffers + 2*strip,  // LC addr
                        mem_in3,                    // mem addr
                        blocks);
        LCLoadDMABlocks(next_lc_buffers + 3*strip,  // LC addr
                        mem_in4,                    // mem addr
                        blocks);
        // advance input memory pointers
        mem_in1 += strip;
        mem_in2 += strip;
        mem_in3 += strip;
        mem_in4 += strip;

        // Wait until at most four requests are left in the queue, i.e. the
        // loads for the current set have completed.
        // NOTE(review): the 1/2/3-input variants wait for (inputs + 1)
        // entries, which would be 5 here. The threshold of 4 is kept as-is;
        // it looks merely stricter (it also drains the previous store) -
        // confirm whether 5 was intended.
        LCWaitDMAQueue(4);

        // compute. Call callback function f on strips in LC.
        f(strip,                            // strip size in LC
          funArg,                           // user's own arg
          curr_lc_buffers + 4*strip,        // output
          curr_lc_buffers,                  // input 1
          curr_lc_buffers + strip,          // input 2
          curr_lc_buffers + 2*strip,        // input 3
          curr_lc_buffers + 3*strip);       // input 4

        // enqueue store stage
        LCStoreDMABlocks(mem_out,                       // mem addr
                         curr_lc_buffers + 4*strip,     // LC addr
                         blocks);
        mem_out += strip;                   // advance output memory pointer

        // switch buffers curr<->next
        u8* swap = curr_lc_buffers;
        curr_lc_buffers = next_lc_buffers;
        next_lc_buffers = swap;
    }

    // Wait until previous loads are done. Only the second-to-last store pending.
    LCWaitDMAQueue(1);

    // last compute. Call callback function f on strips in LC.
    f(strip,                                // strip size in LC
      funArg,                               // user's own arg
      curr_lc_buffers + 4*strip,            // output
      curr_lc_buffers,                      // input 1
      curr_lc_buffers + strip,              // input 2
      curr_lc_buffers + 2*strip,            // input 3
      curr_lc_buffers + 3*strip);           // input 4

    // enqueue last store stage
    LCStoreDMABlocks(mem_out,                           // mem addr
                     curr_lc_buffers + 4*strip,         // LC addr
                     blocks);

    // Wait until last store done.
    LCWaitDMAQueue(0);
}
454
455
/*
 * Entry point for processing a stream: optionally pushes the stream's data
 * out of the data cache, then hands over to the ProcessNIn1Out routine that
 * matches the stream's input count.
 *
 *   s        - stream set up with LCSTREAMAlloc and LCSTREAMAssign
 *   do_flush - when non-zero, every input range is flushed and the output
 *              range is invalidated (s->size bytes each), followed by one
 *              PPC_SYNC(), before any DMA between memory and LC is started
 *   f        - per-strip callback, forwarded unchanged
 *   funArg   - user argument for `f`, forwarded unchanged
 *
 * Input counts other than 1 to 4 are not dispatched anywhere; the function
 * then returns after the optional cache maintenance.
 */
void LCSTREAMProcess(LCStream* s, BOOL do_flush, LCStreamFunction f, void* funArg) {
    // Make sure input and output data are not in the cache hierarchy
    // before memory <-> LC DMA touches them.
    if (do_flush) {
        u32 input = 0;
        while (input < s->num_inputs) {
            DCFlushRangeNoSync(s->in[input], s->size);  // write back each input stream
            ++input;
        }
        DCInvalidateRange(s->out, s->size);             // drop cached output lines
        PPC_SYNC();
    }

    const u32 input_count = s->num_inputs;

    if (input_count == 1) {
        Process1In1Out(s, f, funArg);
    } else if (input_count == 2) {
        Process2In1Out(s, f, funArg);
    } else if (input_count == 3) {
        Process3In1Out(s, f, funArg);
    } else if (input_count == 4) {
        Process4In1Out(s, f, funArg);
    }
    // any other count: nothing to run
}
484