#include <revolution/os.h>

u32 LCStoreData(void* destAddr, void* srcAddr, u32 nBytes);
| Argument | Description |
|---|---|
| destAddr | Start address of the transfer destination in main memory. Must be 32-byte aligned. |
| srcAddr | Start address, within the locked cache, of the data to be copied to destAddr. Must be 32-byte aligned. |
| nBytes | Transfer size in bytes. Must be a multiple of 32 bytes. |
Returns the number of transactions added to the DMA queue.
Enqueues DMA transactions that transfer data from the locked cache at srcAddr to main memory at destAddr. The range of valid source addresses is the 16KB region starting at the value returned by the LCGetBase function. The number of transactions issued is returned as the return value.
The only ways to determine whether a transaction has completed are to poll the length of the DMA queue with the LCQueueLength function or to wait until the queue length reaches a given value with the LCQueueWait function. See the code example below.
The largest transfer allowed in a single DMA transaction is 128 cache blocks (4 KB). Thus, the number of transactions added (to the DMA queue) is always (nBytes / 4 KB), rounded up. The LCStoreBlocks function is a more efficient function that creates a single DMA transaction. However, its arguments are restricted and it does not provide error checking.
Note that a maximum of 15 DMA requests can be issued to the DMA queue. If the queue overflows, a machine check exception will occur.
The locked cache must be enabled throughout the transfer; otherwise, a machine check exception will occur during the DMA.
If the DMA finds a source address in normal cache, a machine check exception occurs.
The following code example splits the locked cache into two buffers and ping-pongs between them to process a large array in main memory. The code is structured so that each buffer has no more than one outstanding load and one outstanding store at any given time. This fact is used to ensure that a buffer has finished storing its old (just-processed) data and has then finished loading the next block of data to process.
// Define two 8 KB buffers in the locked cache region.
// Note that NUM_BUFFERS * BUFFER_SIZE must be <= 16 KB, the size of the
// locked cache region that starts at the address returned by LCGetBase.
// Size of each locked cache buffer, in bytes.
#define BUFFER_SIZE (8*1024)
// Number of buffers the locked cache is split into.
#define NUM_BUFFERS (2)
// Size, in bytes, of the main-memory array that the example processes
// (it is allocated as DATA_ELEMENTS elements of type u8).
#define DATA_ELEMENTS (10*1024*1024)
:
// Buffers[i] is the address of buffer i inside the locked cache.
// BufAddr[i] is the main-memory address of the data block that is
// currently held in (or being loaded into) Buffers[i].
u8* Buffers[NUM_BUFFERS];
u8* BufAddr[NUM_BUFFERS];
:
// Example entry point: streams a large main-memory array through the locked
// cache, using NUM_BUFFERS buffers in turn together with the LCLoadData and
// LCStoreData DMA calls.
// Lines consisting of a single ':' appear to mark code omitted from this
// example; they are not C syntax.
void main ()
{
u8* data; // main-memory array to process
u8* currDataPtr; // address within data of the next block to load
u32 i;
void* arenaLo;
void* arenaHi;
u32 numTransactions; // value returned by the most recent priming LCLoadData call
OSInit();
// The locked cache must be enabled before, and stay enabled during, any DMA
// transfer that uses it.
LCEnable();
// The arena bounds are fetched here; the code that uses them is not shown.
arenaLo = OSGetArenaLo();
arenaHi = OSGetArenaHi();
:
OSReport("Splitting locked cache into %d buffers\n", NUM_BUFFERS);
// Carve the locked cache region into NUM_BUFFERS consecutive buffers of
// BUFFER_SIZE bytes each, starting at the address returned by LCGetBase.
for (i = 0; i < NUM_BUFFERS; i++)
{
Buffers[i] = (u8*) ((u32)LCGetBase() + BUFFER_SIZE*i);
OSReport("Locked Cache : Allocated %d bytes at 0x%x\n",
BUFFER_SIZE,
Buffers[i]);
}
// Allocate the source data; its initialization is in the omitted code below.
data = (u8*)OSAlloc(DATA_ELEMENTS * sizeof(u8));
:
// NOTE(review): presumably flushes the initialized data out of the normal
// data cache so that the DMA reads current contents from main memory -- confirm.
DCFlushRange(data, DATA_ELEMENTS);
OSReport(" Test 1 : using high level interface for DMA load/store\n");
// Prime the pipeline: queue a DMA load of the first NUM_BUFFERS blocks of
// data, one block per locked cache buffer. numTransactions keeps the count
// returned by the last of these calls.
for (i = 0; i < NUM_BUFFERS; i++)
{
BufAddr[i] = data + BUFFER_SIZE*i;
numTransactions = LCLoadData(Buffers[i], BufAddr[i], BUFFER_SIZE);
}
// First block that has not yet been queued for loading.
currDataPtr = data + BUFFER_SIZE * NUM_BUFFERS;
// NOTE(review): this wait uses a literal 4, whereas the waits inside the loop
// use numTransactions; confirm the intended queue length.
LCQueueWait((NUM_BUFFERS-1) * 4);
// NOTE(review): because the bound is "<=", the body still runs when
// currDataPtr equals data+DATA_ELEMENTS, so the LCLoadData calls in that pass
// read starting at or beyond the end of the allocation -- confirm this is intended.
while (currDataPtr <= data+DATA_ELEMENTS)
{
for (i = 0; i < NUM_BUFFERS; i++)
{
// Wait on the DMA queue length before touching Buffers[i].
// NOTE(review): intended to guarantee that this buffer's previous store and
// load have completed -- confirm the threshold.
LCQueueWait((NUM_BUFFERS-1)*numTransactions); // prevstore + prevload, each takes 2
// ProcessBuf is not shown in this example.
ProcessBuf(Buffers[i]);
// Write the processed block back to the main-memory address it was loaded from.
LCStoreData(BufAddr[i], Buffers[i], BUFFER_SIZE);
BufAddr[i] = currDataPtr; // move to next unprocessed buffer
// Queue the load of the next block into the same locked cache buffer.
LCLoadData(Buffers[i], BufAddr[i], BUFFER_SIZE);
// advance the next block to be read
currDataPtr += BUFFER_SIZE;
}
}
// Final wait on the queue length before finishing.
LCQueueWait(numTransactions); // don't care about last dma's
:
OSHalt("Test complete");
}
Cache Functions, LCEnable, LCLoadData, LCQueueLength, LCQueueWait, LCStoreBlocks
2006/03/01 Initial version.
CONFIDENTIAL