/*---------------------------------------------------------------------------* Project: Cafe File: stream.c Copyright 2011 Nintendo. All rights reserved. These coded instructions, statements, and computer programs contain proprietary information of Nintendo of America Inc. and/or Nintendo Company Ltd., and are protected by Federal copyright law. They may not be disclosed to third parties or copied or duplicated in any form, in whole or in part, without the prior written consent of Nintendo. *---------------------------------------------------------------------------*/ #include #include #include #define PPC_SYNC() __SYNC() u32 LCSTREAMLCSizeRequired(u32 stripSize, u32 numOutputs, u32 numInputs) { return 2*stripSize*(numOutputs+numInputs); } u32 LCSTREAMLCSizeAvailable(void) { return LCGetUnallocated(); } u32 LCSTREAMAlloc(LCStream* s, u32 stripSize, u32 numOutputs, u32 numInputs) { if (numInputs > LCSTREAM_MAX_NUM_INPUTS) return 0; s->size = 0; s->strip_size = stripSize; // todo: system level macro for max dma size? s->strip_size_blocks = (stripSize>=LC_MAX_DMA_BYTES)? 0 : stripSize/CACHE_BLOCK_SIZE; s->total_strips = 0; // stream not yet associated with any data s->num_outputs = numOutputs; s->num_inputs = numInputs; // reserve some area within the LC. We should later change the memory // allocation model of LC to that of a heap with fixed sized allocations. // Either 512 or 1KB blocks. u8* lc_area = (u8*) LCAlloc(2*stripSize*(numOutputs + numInputs)); if (!lc_area) { s->lc_buffer_A = NULL; s->lc_buffer_B = NULL; return 0; // failed to allocate } // divide up area as the input/output buffers s->lc_buffer_A = lc_area; s->lc_buffer_B = lc_area + s->strip_size*(numOutputs + numInputs); return 2*stripSize*(numOutputs + numInputs); } void LCSTREAMFree(LCStream* s) { ASSERT(s->lc_buffer_A); // make sure we have memory LCDealloc(s->lc_buffer_A); s->lc_buffer_A = NULL; } void LCSTREAMAssign(LCStream* s, u32 streamLength, void* output, u32 numInputs, ...) { s->size = streamLength; s->total_strips = streamLength/s->strip_size; s->out = output; va_list ap; va_start(ap, numInputs); u32 i; for (i=0; inum_inputs; i++) { s->in[i] = (u8*) va_arg(ap, void*); } va_end(ap); } void Process1In1Out(LCStream* s, LCStreamFunction f, void* funArg) { // pointers to memory currently being loaded or stored do. These will // be used to traverse n strips of total data. u8* mem_in1 = s->in[0]; u8* mem_out = s->out; u8* curr_lc_buffers = s->lc_buffer_A; // initial start of "A" buffers u8* next_lc_buffers = s->lc_buffer_B; // initial start of "B" buffers u32 toggle = 0; // used to toggle between buffers // Enqueue one DMA load stage ahead to setup pipelining. // A load stage mean one DMA load requests for input1. LCLoadDMABlocks(curr_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); mem_in1 += s->strip_size; // advance input pointer LCWaitDMAQueue(0); // wait until this first load stage is done u32 i; for (i=0; itotal_strips-1; i++) { // enqueue DMA for next load stage LCLoadDMABlocks(next_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); mem_in1 += s->strip_size; // advance input memory pointer // wait until one store and load stages left in queue. Overlap 1 load + 1 store DMA requests with compute LCWaitDMAQueue(2); // compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + s->strip_size, // output curr_lc_buffers); // input // enqueue store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + s->strip_size, // LC addr s->strip_size_blocks); mem_out += s->strip_size; // advance output memory pointer // toggle buffers toggle = ++(toggle) & 0x1; // switch buffers curr<->next if (toggle) { curr_lc_buffers = s->lc_buffer_B; next_lc_buffers = s->lc_buffer_A; } else { curr_lc_buffers = s->lc_buffer_A; next_lc_buffers = s->lc_buffer_B; } } // wait until last load done. Only second to last store pending. LCWaitDMAQueue(1); // last compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + s->strip_size, // output curr_lc_buffers); // input // enqueue last store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + s->strip_size, // LC addr s->strip_size_blocks); // wait until last store done LCWaitDMAQueue(0); } void Process2In1Out(LCStream* s, LCStreamFunction f, void* funArg) { // pointers to memory currently being loaded or stored do. These will // be used to traverse n strips of total data. u8* mem_in1 = s->in[0]; u8* mem_in2 = s->in[1]; u8* mem_out = s->out; u8* curr_lc_buffers = s->lc_buffer_A; // initial start of "A" buffers u8* next_lc_buffers = s->lc_buffer_B; // initial start of "B" buffers u32 toggle = 0; // used to toggle between buffers // Enqueue one DMA load stage ahead to setup pipelining. // A load stage mean two DMA load requests for input1 and input2. LCLoadDMABlocks(curr_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); LCLoadDMABlocks(curr_lc_buffers + s->strip_size, // LC addr mem_in2, // mem addr s->strip_size_blocks); // advance input pointers mem_in1 += s->strip_size; mem_in2 += s->strip_size; LCWaitDMAQueue(0); // wait until this first load stage is done u32 i; for (i=0; itotal_strips-1; i++) { // enqueue DMAs for next load stage LCLoadDMABlocks(next_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); LCLoadDMABlocks(next_lc_buffers + s->strip_size, // LC addr mem_in2, // mem addr s->strip_size_blocks); // advance input memory pointers mem_in1 += s->strip_size; mem_in2 += s->strip_size; // wait until one store and load stages left in queue. Overlap 2 loads + 1 store DMA requests with compute LCWaitDMAQueue(3); // compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + 2*s->strip_size, // output curr_lc_buffers, // input 1 curr_lc_buffers + s->strip_size); // input 2 // enqueue store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + 2*s->strip_size, // LC addr s->strip_size_blocks); mem_out += s->strip_size; // advance output memory pointer // toggle buffers toggle = ++(toggle) & 0x1; // switch buffers curr<->next if (toggle) { curr_lc_buffers = s->lc_buffer_B; next_lc_buffers = s->lc_buffer_A; } else { curr_lc_buffers = s->lc_buffer_A; next_lc_buffers = s->lc_buffer_B; } } // wait until previous loads done. Only second to last store pending. LCWaitDMAQueue(1); // last compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + 2*s->strip_size, // output curr_lc_buffers, // input 1 curr_lc_buffers + s->strip_size); // input 2 // enqueue last store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + 2*s->strip_size, // LC addr s->strip_size_blocks); // Wait until last store done. LCWaitDMAQueue(0); } void Process3In1Out(LCStream* s, LCStreamFunction f, void* funArg) { // pointers to memory currently being loaded or stored do. These will // be used to traverse n strips of total data. u8* mem_in1 = s->in[0]; u8* mem_in2 = s->in[1]; u8* mem_in3 = s->in[2]; u8* mem_out = s->out; u8* curr_lc_buffers = s->lc_buffer_A; // initial start of "A" buffers u8* next_lc_buffers = s->lc_buffer_B; // initial start of "B" buffers u32 toggle = 0; // used to toggle between buffers // Enqueue one DMA load stage ahead to setup pipelining. // A load stage mean three DMA load requests for inputs 1-3. LCLoadDMABlocks(curr_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); LCLoadDMABlocks(curr_lc_buffers + s->strip_size, // LC addr mem_in2, // mem addr s->strip_size_blocks); LCLoadDMABlocks(curr_lc_buffers + 2*s->strip_size, // LC addr mem_in3, // mem addr s->strip_size_blocks); // advance input pointers mem_in1 += s->strip_size; mem_in2 += s->strip_size; mem_in3 += s->strip_size; LCWaitDMAQueue(0); // wait until this first load stage is done u32 i; for (i=0; itotal_strips-1; i++) { // enqueue DMAs for next load stage LCLoadDMABlocks(next_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); LCLoadDMABlocks(next_lc_buffers + s->strip_size, // LC addr mem_in2, // mem addr s->strip_size_blocks); LCLoadDMABlocks(next_lc_buffers + 2*s->strip_size,// LC addr mem_in3, // mem addr s->strip_size_blocks); // advance input memory pointers mem_in1 += s->strip_size; mem_in2 += s->strip_size; mem_in3 += s->strip_size; // wait until one store and load stages left in queue. Overlap 3 loads + 1 store DMA requests with compute LCWaitDMAQueue(4); // compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + 3*s->strip_size, // output curr_lc_buffers, // input 1 curr_lc_buffers + s->strip_size, // input 2 curr_lc_buffers + 2*s->strip_size); // input 3 // enqueue store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + 3*s->strip_size, // LC addr s->strip_size_blocks); mem_out += s->strip_size; // advance output memory pointer // toggle buffers toggle = ++(toggle) & 0x1; // switch buffers curr<->next if (toggle) { curr_lc_buffers = s->lc_buffer_B; next_lc_buffers = s->lc_buffer_A; } else { curr_lc_buffers = s->lc_buffer_A; next_lc_buffers = s->lc_buffer_B; } } // wait until previous loads done. Only second to last store pending. LCWaitDMAQueue(1); // last compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + 3*s->strip_size, // output curr_lc_buffers, // input 1 curr_lc_buffers + s->strip_size, // input 2 curr_lc_buffers + 2*s->strip_size); // input 3 // enqueue last store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + 3*s->strip_size, // LC addr s->strip_size_blocks); // Wait until last store done. LCWaitDMAQueue(0); } void Process4In1Out(LCStream* s, LCStreamFunction f, void* funArg) { // pointers to memory currently being loaded or stored do. These will // be used to traverse n strips of total data. u8* mem_in1 = s->in[0]; u8* mem_in2 = s->in[1]; u8* mem_in3 = s->in[2]; u8* mem_in4 = s->in[3]; u8* mem_out = s->out; u8* curr_lc_buffers = s->lc_buffer_A; // initial start of "A" buffers u8* next_lc_buffers = s->lc_buffer_B; // initial start of "B" buffers u32 toggle = 0; // used to toggle between buffers // Enqueue one DMA load stage ahead to setup pipelining. // A load stage mean three DMA load requests for inputs 1-3. LCLoadDMABlocks(curr_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); LCLoadDMABlocks(curr_lc_buffers + s->strip_size, // LC addr mem_in2, // mem addr s->strip_size_blocks); LCLoadDMABlocks(curr_lc_buffers + 2*s->strip_size, // LC addr mem_in3, // mem addr s->strip_size_blocks); LCLoadDMABlocks(curr_lc_buffers + 3*s->strip_size, // LC addr mem_in4, // mem addr s->strip_size_blocks); // advance input pointers mem_in1 += s->strip_size; mem_in2 += s->strip_size; mem_in3 += s->strip_size; mem_in4 += s->strip_size; LCWaitDMAQueue(0); // wait until this first load stage is done u32 i; for (i=0; itotal_strips-1; i++) { // enqueue DMAs for next load stage LCLoadDMABlocks(next_lc_buffers, // LC addr mem_in1, // mem addr s->strip_size_blocks); LCLoadDMABlocks(next_lc_buffers + s->strip_size, // LC addr mem_in2, // mem addr s->strip_size_blocks); LCLoadDMABlocks(next_lc_buffers + 2*s->strip_size,// LC addr mem_in3, // mem addr s->strip_size_blocks); LCLoadDMABlocks(next_lc_buffers + 3*s->strip_size,// LC addr mem_in4, // mem addr s->strip_size_blocks); // advance input memory pointers mem_in1 += s->strip_size; mem_in2 += s->strip_size; mem_in3 += s->strip_size; mem_in4 += s->strip_size; // wait until one store and load stages left in queue. Overlap 3 loads + 1 store DMA requests with compute LCWaitDMAQueue(4); // compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + 4*s->strip_size, // output curr_lc_buffers, // input 1 curr_lc_buffers + s->strip_size, // input 2 curr_lc_buffers + 2*s->strip_size, // input 3 curr_lc_buffers + 3*s->strip_size); // input 4 // enqueue store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + 4*s->strip_size, // LC addr s->strip_size_blocks); mem_out += s->strip_size; // advance output memory pointer // toggle buffers toggle = ++(toggle) % 2; // switch buffers curr<->next if (toggle) { curr_lc_buffers = s->lc_buffer_B; next_lc_buffers = s->lc_buffer_A; } else { curr_lc_buffers = s->lc_buffer_A; next_lc_buffers = s->lc_buffer_B; } } // wait until previous loads done. Only second to last store pending. LCWaitDMAQueue(1); // last compute. Call callback function f on strips in LC. f(s->strip_size, // strip size in LC funArg, // user's own arg curr_lc_buffers + 4*s->strip_size, // output curr_lc_buffers, // input 1 curr_lc_buffers + s->strip_size, // input 2 curr_lc_buffers + 2*s->strip_size, // input 3 curr_lc_buffers + 3*s->strip_size); // input 4 // enqueue last store stage LCStoreDMABlocks(mem_out, // mem addr curr_lc_buffers + 4*s->strip_size, // LC addr s->strip_size_blocks); // Wait until last store done. LCWaitDMAQueue(0); } void LCSTREAMProcess(LCStream* s, BOOL do_flush, LCStreamFunction f, void* funArg) { // make sure input and output data are not in the cache hierarchy before // we DMA memory <-> LC? if (do_flush) { u32 i; for (i=0; inum_inputs; i++) { // flush all the input streams DCFlushRangeNoSync(s->in[i], s->size); } DCInvalidateRange(s->out, s->size); // invalidate the output stream // printf("dcflush %p:%p)\n", s->out, s->out + s->size); PPC_SYNC(); } switch (s->num_inputs) { case 1: Process1In1Out(s, f, funArg); break; case 2: Process2In1Out(s, f, funArg); break; case 3: Process3In1Out(s, f, funArg); break; case 4: Process4In1Out(s, f, funArg); break; } }