#include <revolution/os.h>
/* Enqueues DMA transactions that copy nBytes from the locked cache at srcAddr
 * to main memory at destAddr. destAddr, srcAddr and nBytes must each be
 * 32-byte aligned. Returns the number of transactions added to the DMA queue. */
u32 LCStoreData(void* destAddr, void* srcAddr, u32 nBytes);
destAddr |
Start address of the destination in memory. Must be 32-byte aligned. |
srcAddr |
Start address, within the locked cache, of the data to be copied to destAddr. Must be 32-byte aligned. |
nBytes |
Transfer size. Must be 32-byte aligned. |
Returns the number of transactions added to the DMA queue.
Enqueues DMA transactions to send data from the locked cache at srcAddr to main memory at destAddr. The range of valid source addresses is the 16KB region starting from the value returned by LCGetBase. The number of transactions issued is returned.
The only way to determine if the transactions are complete is to poll the length of the DMA queue with LCQueueLength, or to wait until the queue length reaches a certain value with LCQueueWait. See the example below.
The largest allowed single DMA transfer is 128 cache blocks (4 KB). Thus, the number of transfers added to the DMA queue is always nBytes / 4 KB, rounded up. LCStoreBlocks is a more efficient function that creates a single DMA transfer. However, it does not perform error checking or enforce argument restrictions.
No more than 15 outstanding DMA transfers are allowed in the DMA queue. If the queue overflows, a machine check exception will occur.
The locked cache must be enabled throughout the transfer; otherwise, a machine check exception will occur during the DMA.
If the DMA finds the source address in the normal cache, a machine check exception occurs.
The following code example splits the locked cache into 2 buffers, and ping pongs between them to process a large array in main memory. The code is structured so that each buffer has no more than one outstanding load and store at any given time. When a buffer finishes storing its old (freshly processed) data, this information can be used to ensure the next block of data to process loads correctly.
// define two 8 KB buffers in the locked cache region
// note that NUM_BUFFERS * BUFFER_SIZE <= 16 KB (the size of the locked cache region)
#define BUFFER_SIZE (8*1024)   // bytes per locked cache buffer
#define NUM_BUFFERS (2)        // number of buffers the locked cache is split into
#define DATA_ELEMENTS (10*1024*1024)   // number of u8 elements in the main-memory array
:
// Buffers[i] : address of buffer i inside the locked cache region
// BufAddr[i] : main-memory address of the data block currently assigned to Buffers[i]
u8* Buffers[NUM_BUFFERS];
u8* BufAddr[NUM_BUFFERS];
:
// Example entry point: streams a large main-memory array through the locked
// cache, cycling over NUM_BUFFERS buffers so that DMA loads and stores of one
// buffer are queued while another buffer is being processed.
// Lines consisting of a single ":" mark code omitted from this example.
void main ()
{
u8* data;
u8* currDataPtr; // offset into data
u32 i;
void* arenaLo;
void* arenaHi;
u32 numTransactions;
OSInit();
// the locked cache must be enabled before (and throughout) any locked cache DMA
LCEnable();
arenaLo = OSGetArenaLo();
arenaHi = OSGetArenaHi();
:
OSReport("Splitting locked cache into %d buffers\n", NUM_BUFFERS);
// lay the buffers out back to back, starting at the locked cache base address
for (i = 0; i < NUM_BUFFERS; i++)
{
Buffers[i] = (u8*) ((u32)LCGetBase() + BUFFER_SIZE*i);
OSReport("Locked Cache : Allocated %d bytes at 0x%x\n",
BUFFER_SIZE,
Buffers[i]);
}
// Initialize source data
data = (u8*)OSAlloc(DATA_ELEMENTS * sizeof(u8));
:
// presumably pushes the initialized data out of the normal data cache to main
// memory so the DMA engine sees it -- confirm against the DCFlushRange reference
DCFlushRange(data, DATA_ELEMENTS);
OSReport(" Test 1 : using high level interface for DMA load/store \n");
// prime every buffer with one block; numTransactions keeps the count returned
// by the last LCLoadData call (all calls here transfer BUFFER_SIZE bytes)
for (i = 0; i < NUM_BUFFERS; i++)
{
BufAddr[i] = data + BUFFER_SIZE*i;
numTransactions = LCLoadData(Buffers[i], BufAddr[i], BUFFER_SIZE);
}
// first block of data that has not yet been queued for loading
currDataPtr = data + BUFFER_SIZE * NUM_BUFFERS;
// NOTE(review): this wait uses a literal 4, while the wait inside the loop uses
// numTransactions -- confirm which queue length is intended here
LCQueueWait((NUM_BUFFERS-1) * 4);
// NOTE(review): the condition uses <= and is only re-checked after NUM_BUFFERS
// blocks, so the final LCLoadData calls start at or past data+DATA_ELEMENTS;
// the comment after the loop treats those last DMAs as don't-care -- confirm
while (currDataPtr <= data+DATA_ELEMENTS)
{
for (i = 0; i < NUM_BUFFERS; i++)
{
LCQueueWait((NUM_BUFFERS-1)*numTransactions); // prevstore + prevload, each takes 2
ProcessBuf(Buffers[i]);
// write the processed block back to the main-memory location it was loaded from
LCStoreData(BufAddr[i], Buffers[i], BUFFER_SIZE);
BufAddr[i] = currDataPtr; // move to next unprocessed buffer
// queue the refill behind the store that was just queued for this buffer
LCLoadData(Buffers[i], BufAddr[i], BUFFER_SIZE);
// advance the next block to be read
currDataPtr += BUFFER_SIZE;
}
}
LCQueueWait(numTransactions); // don't care about last dma's
:
OSHalt("Test complete");
}
Cache Functions,
LCEnable, LCLoadData, LCQueueLength, LCQueueWait, LCStoreBlocks
03/01/2006 Initial version.