/*---------------------------------------------------------------------------* Project: TwlSDK - MI - File: mi_uncompress.c Copyright 2003-2008 Nintendo. All rights reserved. These coded instructions, statements, and computer programs contain proprietary information of Nintendo of America Inc. and/or Nintendo Company Ltd., and are protected by Federal copyright law. They may not be disclosed to third parties or copied or duplicated in any form, in whole or in part, without the prior written consent of Nintendo. $Date:: 2008-09-17#$ $Rev: 8556 $ $Author: okubata_ryoma $ *---------------------------------------------------------------------------*/ #include #include //****Bug fix**** // Because halfword access instructions such as ldrh and strh are not passed through by the inline assembler due to a CW bug, instruction values are written directly using dcd in order to avoid this bug. // // When the bug is fixed, the 'define' below will be removed. //#define CW_BUG_FOR_LDRH_AND_STRH #define UNCOMPRESS_RL16_CODE32 //---- This code will be compiled in ARM-Mode #include //====================================================================== // Expanding compressed data //====================================================================== //---------------------------------------------------------------------- // Expanding bit compressed data // //- Unpacks data padded with bits fixed to 0. //- Align the destination address to a 4-byte boundary. // //Arguments: // void *srcp : source address // void *destp : destination address // MIUnpackBitsParam *paramp : Address of MIUnpackBitsParam structure // //MIUnpackBitsParam Structure // u16 srcNum: Number of bytes of source data // u8 srcBitNum: Number of bits per source data // u8 destBitNum: Number of bits per destination data // u32 destOffset:31 : Offset number to add to source data. // destOffset0_On:1 : Flag for whether to add an offset to 0 data. // //- Return value: None //---------------------------------------------------------------------- asm void MI_UnpackBits( register const void *srcp, register void *destp, register MIUnpackBitsParam *paramp ) { stmfd sp!, {r4-r11, lr} sub sp, sp, #4 ldrh r7, [r2, #0] // r7: srcNum = unPackBitsParamp->srcNum @00: ldrb r6, [r2, #2] // r6: srcBitNum = unPackBitsParamp->srcBitNum rsb r10, r6, #8 // r10: srcBitNumInv = 8 - srcBitNum mov r14, #0 // r14: destBak = 0 ldr r11, [r2, #4] // r8: destOffset0_On mov r8, r11, lsr #31 // = unPackBitsParamp->destOffset0_On ldr r11,[r2, #4] // destOffset = unPackBitsParamp->destOffset mov r11,r11, lsl #1 mov r11,r11, lsr #1 str r11,[sp, #0] ldrb r2, [r2, #3] // r2: destBitNum = unPackBitsParamp->destBitNum mov r3, #0 // r3: destBitCount = 0 @01: subs r7, r7, #1 // while (--srcNum >= 0) { blt @06 mov r11, #0xff // r5: srcMask = 0xff >> srcBitNumInv; mov r5, r11, asr r10 ldrb r9, [r0], #1 // r9: srcTmp = *srcp++; mov r4, #0 // r4: srcBitCount = 0; @02: cmp r4, #8 // while (srcBitCount < 8) { bge @01 and r11, r9, r5 // r12: destTmp = ((srcTmp & srcMask) >>srcBitCount); movs r12, r11, lsr r4 cmpeq r8, #0 beq @04 @03: ldr r11, [sp, #0] // destTmp += destOffset; add r12, r12, r11 @04: orr r14, r14, r12, lsl r3 // destBak |= destTmp << destBitCount; add r3, r3, r2 // destBitCount += destBitNum; cmp r3, #0x20 // if (destBitCount >= 32) { blt @05 str r14, [r1], #4 // *destp++ = destBak; mov r14, #0 // destBak = 0; mov r3, #0 // destBitCount = 0; // } @05: mov r5, r5, lsl r6 // srcMask <<= srcBitNum; add r4, r4, r6 // srcBitCount += srcBitNum; b @02 // } // } @06: add sp, sp, #4 ldmfd sp!, {r4-r11, lr} bx lr } //---------------------------------------------------------------------- // 8-bit decompression of LZ77 compressed data // //* Expands LZ77-compressed data and writes it in 8-bit units. //- Cannot decompress directly into VRAM //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Reserved // compType:4 Compression type( = 1) // destSize:24 Data size after decompression // //- Flag data format // u8 flags Compression/no compression flag // (0, 1) = (not compressed, compressed) //- Code data format (Big Endian) // u16 length:4 Decompressed data length - 3 (only compress when the match length is 3 bytes or greater) // offset:12 Match data offset - 1 // //- Return value: None //---------------------------------------------------------------------- asm void MI_UncompressLZ8( register const void *srcp, register void *destp ) { stmfd sp!, {r4-r7, lr} ldr r5, [r0], #4 // r2: destCount = *(u32 *)srcp >> 8 mov r2, r5, lsr #8 // r0: srcp += 4 mov r7, #0 tst r5, #0x0F // r7: isExFormat = (*header & 0x0F)? 1 : 0 movne r7, #1 @21: cmp r2, #0 // while (destCount > 0) { ble @26 ldrb r14, [r0], #1 // r14: flags = *srcp++ mov r4, #8 // @22: subs r4, r4, #1 // for ( i = 8; --i >= 0; ) { blt @21 tst r14, #0x80 // if (!(flags & 0x80)) { bne @23 ldrb r6, [r0], #1 // *srcp++; swpb r6, r6, [r1] // r1: *destp++; (Byte-writing countermeasure) add r1, r1, #1 sub r2, r2, #1 // destCount--; b @25 // } else { @23: ldrb r5, [r0, #0] // r3: length = (*srcp >> 4); cmp r7, #0 // if ( ! isExFormat ) { length += 3; } moveq r6, #3 beq @23_2 // else { tst r5, #0xE0 // if ( length > 1 ) { movne r6, #1 // length += 1; bne @23_2 // } else { add r0, r0, #1 // isWide = (length == 1)? 1 : 0; and r6, r5, #0xf // length = (*srcp++ & 0x0F) << 4 mov r6, r6, lsl #4 tst r5, #0x10 beq @23_1 // if ( isWide ) { mov r6, r6, lsl #8 // length <<= 8; ldrb r5, [r0], #1 // length += (*srcp++) << 4; add r6, r6, r5, lsl #4 // length += 0xFF + 1; add r6, r6, #0x100 // } @23_1: add r6, r6, #0x11 // length += 0xF + 2; ldrb r5, [r0, #0] // length += (*srcp >> 4); @23_2: // } add r3, r6, r5, asr #4 // } add r0, r0, #1 // r12: offset = (*srcp++ & 0x0f) << 8; and r5, r5, #0xf mov r12,r5, lsl #8 ldrb r6, [r0], #1 // offset = (offset | *srcp++) + 1; orr r5, r6, r12 add r12,r5, #1 sub r2, r2, r3 // destCount -= length; // do { @24: ldrb r5, [r1, -r12] // *destp++ = destp[-offset] swpb r5, r5, [r1] // (Byte-writing countermeasure) add r1, r1, #1 subs r3, r3, #1 // } while (--length > 0); bgt @24 // } @25: cmp r2, #0 // if (destCount <= 0) break; movgt r14, r14, lsl #1 // flags <<= 1 bgt @22 // } b @21 // } @26: ldmfd sp!, {r4-r7, lr} bx lr } //---------------------------------------------------------------------- // 16-bit decompression of LZ77 compressed data // //* Expands LZ77-compressed data and writes it in 16-bit units. //* Although it can also expand in data TCM and main memory, it is slower than MI_UncompressLZ77(). // //* For compressed data, search for a matching character string from a minimum of 2 bytes previous. //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Reserved // compType:4 Compression type( = 1) // destSize:24 Data size after decompression // //- Flag data format // u8 flags Compression/no compression flag // (0, 1) = (not compressed, compressed) //- Code data format (Big Endian) // u16 length:4 Decompressed data length - 3 (only compress when the match length is 3 bytes or greater) // offset:12 : Match data offset ( >= 2) - 1 // //- Return value: None //---------------------------------------------------------------------- asm void MI_UncompressLZ16( register const void *srcp, register void *destp ) { stmfd sp!, {r4-r11, lr} mov r3, #0 // r3: destTmp = 0 ldr r8, [r0], #4 // r10: destCount = *(u32 *)srcp >> 8 mov r10, r8, lsr #8 // r0: srcp += 4 mov r2, #0 // r2: shift = 0 mov r11, #0 tst r8, #0x0F // r11: isExFormat = (*header & 0x0F)? 1 : 0; movne r11, #1 @31: cmp r10, #0 // while (destCount > 0) { ble @36 ldrb r6, [r0], #1 // r6: flags = *srcp++; mov r7, #8 // for ( i = 8; --i >= 0; ) { @32: subs r7, r7, #1 blt @31 tst r6, #0x80 // if (!(flags & 0x80)) { bne @33 ldrb r9, [r0], #1 // destTmp |= *srcp++ << shift; orr r3, r3, r9, lsl r2 sub r10, r10, #1 // destCount--; eors r2, r2, #8 // if (!(shift ^= 8)) { #ifndef CW_BUG_FOR_LDRH_AND_STRH streqh r3, [r1], #2 // *destp++ = destTmp; #else dcd 0x00c130b2 #endif moveq r3, #0 // destTmp = 0; b @35 // } else { @33: ldrb r9, [r0, #0] // r5: length = (*srcp >> 4) + 3; cmp r11, #0 // if ( ! isExFormat ) { length += 3; } moveq r8, #3 beq @33_2 // else { tst r9, #0xE0 // if ( length > 1 ) { movne r8, #1 // length += 1 bne @33_2 // } else { add r0, r0, #1 // isWide = (length == 1)? 1 : 0; and r8, r9, #0xf // length = (*srcp++ & 0x0F) << 4 mov r8, r8, lsl #4 tst r9, #0x10 beq @33_1 // if ( isWide ) { mov r8, r8, lsl #8 // length <<= 8; ldrb r9, [r0], #1 // length += (*srcp++) << 4 add r8, r8, r9, lsl #4 // length += 0xFF + 1 add r8, r8, #0x100 // } @33_1: add r8, r8, #0x11 // length += 0xF + 2; ldrb r9, [r0, #0] // length += (*srcp >> 4); @33_2: // } add r5, r8, r9, asr #4 // } ldrb r9, [r0], #1 // r4: offset = (*srcp++ & 0x0f) << 8; and r8, r9, #0xf mov r4, r8, lsl #8 ldrb r9, [r0], #1 // offset = (offset | *srcp++) + 1; orr r8, r9, r4 add r4, r8, #1 rsb r8, r2, #8 // r14: offset0_8 = (8 - shift) and r9, r4, #1 // ^ ((offset & 1) << 3); eor r14, r8, r9, lsl #3 sub r10, r10, r5 // destCount -= length; // do { @34: eor r14, r14, #8 // offset0_8 ^= 8; rsb r8, r2, #8 // destTmp |= (destp[-((offset add r8, r4, r8, lsr #3 // + ((8 - shift) >> 3)) >> 1)]; mov r8, r8, lsr #1 mov r8, r8, lsl #1 #ifndef CW_BUG_FOR_LDRH_AND_STRH ldrh r9, [r1, -r8] #else dcd 0xe11190b8 #endif mov r8, #0xff and r8, r9, r8, lsl r14 mov r8, r8, asr r14 orr r3, r3, r8, lsl r2 eors r2, r2, #8 // if (!(shift ^= 8)) { #ifndef CW_BUG_FOR_LDRH_AND_STRH streqh r3, [r1], #2 // *destp++ = destTmp; #else dcd 0x00c130b2 #endif moveq r3, #0 // destTmp = 0; // } subs r5, r5, #1 // } while (--length > 0); bgt @34 // } @35: cmp r10, #0 // if (destCount <= 0) break; movgt r6, r6, lsl #1 // flags <<= 1 bgt @32 // } b @31 // } @36: ldmfd sp!, {r4-r11, lr} bx lr } //---------------------------------------------------------------------- // Decompression of Huffman compressed data // //- Decompresses Huffman compressed data, writing in 32 bit units. //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 bitSize:4 1 data bit size (Normally 4|8) // compType:4 Compression type( = 2) // destSize:24 Data size after decompression // //- Tree table // u8 treeSize Tree table size/2 - 1 // TreeNodeData nodeRoot Root node // // TreeNodeData nodeLeft Root left node // TreeNodeData nodeRight Root right node // // TreeNodeData nodeLeftLeft Left left node // TreeNodeData nodeLeftRight Left right node // // TreeNodeData nodeRightLeft Right left node // TreeNodeData nodeRightRight Right right node // // E // E // // The compressed data itself follows // //- TreeNodeData structure // u8 nodeNextOffset:6 : Offset to the next node data - 1 (2 byte units) // rightEndFlag:1 Right node end flag // leftEndzflag:1 Left node end flag // When end flag is set, there is data in next node. // // //- Return value: None //---------------------------------------------------------------------- asm void MI_UncompressHuffman( register const void *srcp, register void *destp ) { stmfd sp!, {r4-r11, lr} sub sp, sp, #4 add r2, r0, #4 // r2: *treep = (u8 *)srcp + 4 add r7, r2, #1 // r7: *treeStartp = treep + 1 ldrb r10,[r0, #0] // r4: DataBit = *(u8 *)srcp & 0x0f and r4, r10, #0xf mov r3, #0 // r3: destTmp = 0 mov r14,#0 // r14: destTmpCount = 0 and r10,r4, #7 // destTmpDataNum = 4 + (DataBit & 0x7) add r11,r10, #4 str r11,[sp, #0] ldr r10,[r0, #0] // r12: destCount = *srcp >> 8 mov r12,r10, lsr #8 ldrb r10,[r2, #0] // r0: srcp = (u32 *)(treep + ((*treep + 1) << 1)) add r10,r10, #1 add r0, r2, r10, lsl #1 mov r2, r7 // treep = treeStartp @11: cmp r12, #0 // while (destCount > 0) { ble @14 mov r8, #32 // r8: srcCount = 32; ldr r5, [r0], #4 // r5: srcTmp = *srcp++; @12: subs r8, r8, #1 // while (--srcCount >= 0) { blt @11 mov r10,#1 // r9: treeShift = (srcTmp >> 31) & 0x1 and r9, r10, r5, lsr #31 ldrb r6, [r2, #0] // r6: treeCheck = *treep mov r6, r6, lsl r9 // treeCheck <<= treeShift mov r10,r2, lsr #1 // treep = (u8 *)((((u32 )treep>>1) <<1) mov r10,r10,lsl #1 // + (((*treep & 0x3f)+1) <<1)+treeShift) ldrb r11,[r2,#0] and r11,r11,#0x3f add r11,r11,#1 add r10,r10,r11,lsl #1 add r2, r10,r9 tst r6, #0x80 // if (treeCheck & TREE_END) { beq @13 mov r3, r3, lsr r4 // destTmp >>= DataBit; ldrb r10,[r2, #0] // destTmp |= *treep << (32 - DataBit); rsb r11, r4, #32 orr r3, r3, r10, lsl r11 mov r2, r7 // treep = treeStartp; add r14,r14,#1 // if (++destTmpCount == destTmpDataNum) { ldr r11,[sp, #0] cmp r14,r11 streq r3, [r1], #4 // *destp++ = destTmp; subeq r12,r12, #4 // destCount -= 4; moveq r14,#0 // destTmpCount = 0; // } // } @13: cmp r12,#0 // if (destCount <= 0) break; movgt r5, r5, lsl #1 // srcTmp <<= 1; bgt @12 // } b @11 // } @14: add sp, sp, #4 ldmfd sp!, {r4-r11, lr} bx lr } //---------------------------------------------------------------------- // 8-bit decompression of run-length compressed data // //- Decompresses run-length compressed data, writing in 8 bit units. //With NITRO, cannot decompress directly into VRAM //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Reserved // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Flag data format // u8 length:7 Decompressed data length - 1 (When not compressed) // Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater) // flag:1 (0, 1) = (not compressed, compressed) // //- Return value: None //---------------------------------------------------------------------- asm void MI_UncompressRL8( register const void *srcp, register void *destp ) { stmfd sp!, {r4, r5, r7} ldmia r0!, {r3} // r7: destCount = *(u32 *)srcp >> 8; mov r7, r3, lsr #8 // r0: srcp += 4; @41: cmp r7, #0 // while (destCount > 0) { ble @45 ldrb r4, [r0], #1 // r4: flags = *srcp++; ands r2, r4, #0x7f // r2: length = flags & 0x7f; tst r4, #0x80 // if (!(flags & 0x80)) { bne @43 add r2, r2, #1 // length++; sub r7, r7, r2 // destCount -= length; @42: ldrb r3, [r0], #1 // do { swpb r3, r3, [r1] // *destp++ = *srcp++; (Byte-writing countermeasure) add r1, r1, #1 subs r2, r2, #1 // } while (--length > 0); bgt @42 // } else { b @41 @43: add r2, r2, #3 // length += 3; sub r7, r7, r2 // destCount -= length; ldrb r5, [r0], #1 // srcTmp = *srcp++; @44: swpb r4, r5, [r1] // do { (Byte writing countermeasure) add r1, r1, #1 // *destp++ = srcTmp; subs r2, r2, #1 // } while (--length > 0); bgt @44 // } b @41 // } @45: ldmfd sp!, {r4, r5, r7} bx lr } //---------------------------------------------------------------------- // 16-bit decompression of run-length compressed data // //- Decompresses run-length compressed data, writing in 16 bit units. //- Can be decompressed to the data TCM and VRAM //- When decompressing to the main RAM, it is slower than MI_Uncomressed RL80 //- When decompressing to data TCM or VRAM, it is faster than UncompresedRL90. //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Reserved // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Flag data format // u8 length:7 Decompressed data length - 1 (When not compressed) // Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater) // flag:1 (0, 1) = (not compressed, compressed) // //- Return value: None //---------------------------------------------------------------------- #if !defined(UNCOMPRESS_RL16_CODE32) //---- This code will be compiled in Thumb-Mode #include asm void MI_UncompressRL16( register const void *srcp, register void *destp ) { push {r4-r7} sub sp, #0xc mov r7, #0 // r7: destTmp = 0 ldmia r0!, {r3} // r5: destCount = *(u32 *)srcp >> 8 lsr r5, r3, #8 // r0: srcp += 4 mov r4, #0 // r4: shift = 0 @51: cmp r5, #0 // while (destCount > 0) { ble @57 ldrb r3, [r0, #0] // flags = *srcp++; str r3, [sp, #4] add r0, #1 ldr r3, [sp, #4] // r2: length = flags & 0x7f; lsl r2, r3, #25 lsr r2, r2, #25 ldr r6, [sp, #4] // if (!(flags & 0x80)) { lsr r3, r6, #8 bcs @54 add r2, #1 // length++; sub r5, r5, r2 // destCount -= length; // do { @52: ldrb r6, [r0, #0] // destTmp |= *srcp++ << shift; lsl r6, r4 orr r7, r6 add r0, #1 mov r3, #8 // if (!(shift ^= 8)) { eor r4, r3 bne @53 strh r7, [r1, #0] // *destp++ = destTmp; add r1, #2 mov r7, #0 // destTmp = 0; // } @53: sub r2, r2, #1 // } while (--length > 0); bgt @52 b @51 // } else { @54: add r2, #3 // length += 3; sub r5, r5, r2 // destCount -= length; ldrb r6, [r0, #0] // srcTmp = *srcp++; str r6, [sp, #8] add r0, #1 // do { @55: ldr r6, [sp, #8] // destTmp |= srcTmp << shift; lsl r6, r4 orr r7, r6 mov r3, #8 // if (!(shift ^= 8)) { eor r4, r3 bne @56 strh r7, [r1, #0] // *destp++ = destTmp; add r1, #2 mov r7, #0 // destTmp = 0; // } @56: sub r2, r2, #1 // } while (--length > 0); bgt @55 // } b @51 // } @57: add sp, #0xc pop {r4-r7} bx lr } //---- This code will be compiled in Thumb-Mode #include #else // defined(UNCOMPRESS_RL16_CODE32) asm void MI_UncompressRL16( register const void *srcp, register void *destp ) { stmdb sp!, {r4-r6} ldr r12, [r0], #4 // load header mov r12, r12, lsr #8 // dest. count mov r4, #0 // write back accumulator mov r5, #8 // low-high byte switch and shifter @L1 cmp r12, #0 // finished ? ble @L4 ldrb r2, [r0], #1 // load marker byte ldrb r3, [r0], #1 // load char ie. 1st char // <- still 1 cycle interlock from the 1st LDRB (LDRB/LDRH impose 2 cycles pipeline interlock!) tst r2, #0x80 // check if marker has MSB set bne @L3 // path decoding stríngs of different characters add r2, r2, #1 // adjust counter sub r12, r12, r2 // decrement dest. count @L2 eors r5, r5, #8 // toggle low-high byte switch and shifter orr r4, r4, r3, lsl r5 // OR in the next byte as either low or high byte strneh r4, [r1], #2 // if halfword is full store it away movne r4, #0 // clear accu subs r2, r2, #1 // decrement counter ldrneb r3, [r0], #1 // preload char for the next loop to have the 2 cycle interlock of LDRB compensated by the pipeline flush imposed by the branch bne @L2 b @L1 // path decoding stríngs of same character @L3 sub r2, r2, #0x7d // adjust counter sub r12, r12, r2 // decrement dest. count eors r5, r5, #8 // toggle low-high byte switch and shifter orr r4, r4, r3, lsl r5 // OR in the next byte as either low or high byte strneh r4, [r1], #2 // if halfword is full store it away movne r4, #0 // and clear accu orr r6, r3, r3, lsl #8 // the char extended to halfword @L5 subs r2, r2, #2 // sub 2, to handle the repeatitive char loop halfword-wise ble @L6 // done or only one more to go, step out to keep the flow for multiple chars as fast as possible strh r6, [r1], #2 // more than one to go -> just store the double char b @L5 @L6 blt @L1 // done eors r5, r5, #8 // one more to go, so toggle the switch orr r4, r4, r3, lsl r5 // OR in the next byte as either low or high byte strneh r4, [r1], #2 // if halfword is full store it away movne r4, #0 // and clear accu b @L1 @L4 ldmia sp!, {r4-r6} bx lr } #endif // defined(UNCOMPRESS_RL16_CODE32) //---------------------------------------------------------------------- // 32-bit decompression of run-length compressed data // //- Decompresses run-length compressed data, writing in 32 bit units. //- Higher speed than MI_UncompressRL8(), MI_UncompressRL16() //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Reserved // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Flag data format // u8 length:7 Decompressed data length - 1 (When not compressed) // Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater) // flag:1 (0, 1) = (not compressed, compressed) // //- Return value: None //---------------------------------------------------------------------- asm void MI_UncompressRL32(register const void *srcp,register void *destp) { stmdb sp!, {r4-r5} ldr r12, [r0], #4 // load header mov r12, r12, lsr #8 // dest. count mov r4, #0 // write back accumulator mov r5, #32 // shifter reg. for write back accumulator @L1 cmp r12, #0 // finished ? ble @L6 ldrb r2, [r0], #1 // load marker byte ldrb r3, [r0], #1 // load char ie. 1st char // <- still 1 cycle interlock from the 1st LDRB (LDRB/LDRH impose 2 cycles pipeline interlock!) tst r2, #0x80 // check if marker has MSB set bne @L3 // path decoding stríngs of different characters add r2, r2, #1 // adjust counter sub r12, r12,r2 // decrement dest. count @L2 orr r4, r4, r3, ror r5 // accumulate (LSL R5 would be more obvious, but then the shifter reg. must be counted up and an add. CMP would be needed) subs r5, r5, #8 // decrement shifter beq @L5 // write back branch is only taken every 4th loop, so it's best to jump out (non-taken branch is only 1 cycle compared to 3 for a taken one) subs r2, r2, #1 // decrement counter ldrneb r3, [r0], #1 // preload char for the next loop to have the 2 cycle interlock of LDRB compensated by the pipeline flush imposed by the branch bne @L2 b @L1 @L5 str r4, [r1], #4 // write back mov r5, #32 // restore shifter mov r4, #0 // reset write back accumulator subs r2, r2, #1 // not to jump back saves 2 cycles ldrneb r3, [r0], #1 // preload char bne @L2 b @L1 // path decoding stríngs of same character @L3 sub r2, r2, #0x7d // adjust counter sub r12, r12,r2 // decrement dest. count @L4 orr r4, r4, r3, ror r5 // accumulate (the first word) subs r5, r5, #8 // decrement shifter beq @L7 // if word is full step out subs r2, r2, #1 // if not, check if there are still bytes left bne @L4 // and continue b @L1 // or be done @L7 str r4, [r1], #4 // write back (the one word that may have chars from the other path in it) orr r4, r3, r3, lsl #8 orr r4, r4, r4, lsl #16 // the char extended to word @L8 subs r2, r2, #4 // full words are processed now, so decrement counter by 4 ble @L9 // if no full words are left jump out str r4, [r1], #4 // otherwise write back b @L8 @L9 mov r4, #0 // clear accu mov r5, #32 // reset shifter reg. adds r2, r2, #3 // re-adjust counter (0-3) beq @L1 // 0 then done @L0 orr r4, r4, r3, ror r5 // accumulate (the max. 3 remaining bytes) sub r5, r5, #8 // decrement shifter subs r2, r2, #1 // decrement counter bne @L0 b @L1 @L6 ldmia sp!, {r4-r5} bx lr } //---------------------------------------------------------------------- // 8-bit decompression to restore differential filter conversion. // //- Restores a differential filter, writing in 8 bit units. //With NITRO, cannot decompress directly into VRAM //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Bit size of unit // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Return value: None //---------------------------------------------------------------------- asm void MI_UnfilterDiff8( register const void *srcp, register void *destp ) { stmfd sp!, {r4} ldmia r0, {r2} // r2: destCount = (u32 *)srcp; mov r3, #0 // r3: sum = 0; and r4, r2, #0xF // r4: bitSize = (u32 *)srcp & 0xF; mov r2, r2, lsr #8 // destCount = (u32 *)srcp >> 8; cmp r4, #1 // if (bitSize != 1) { bne @63 @61 // Difference calculation in units of 8 bits add r0, r0, #3 // srcp += 4; sub r1, r1, #1 @62 // do { ldrb r4, [r0, #1]! // tmp = *(srcp++); subs r2, r2, #1 // destCount--; Executes at this position in order to eliminate pipeline stall add r3, r3, r4 // sum += tmp strb r3, [r1, #1]! // *(destp++) = sum; bgt @62 // } while ( destCount > 0 ); b @65 // } else { @63 // Difference calculation in units of 16 bits add r0, r0, #2 // sub r1, r1, #2 // @64 // do { ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2; subs r2, r2, #2 // destCount -= 2; Executes at this position in order to eliminate pipeline stall add r3, r3, r4 // sum += tmp; strh r3, [r1, #2]! // *(u16*)destp = sum; destp += 2; bgt @64 // } while ( destCount > 0 ); // } @65 ldmfd sp!, {r4} bx lr } //---------------------------------------------------------------------- // 16-bit decompression to restore differential filter conversion. // //- Restores a differential filter, writing in 16 bit units. //- Can be decompressed to the data TCM and VRAM //- Higher speed than MI_UnfilterDiff16() //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Bit size of unit // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Return value: None //---------------------------------------------------------------------- asm void MI_UnfilterDiff16( register const void *srcp, register void *destp ) { stmfd sp!, {r4, r5} ldmia r0, {r2} // r2: destCount = (u32 *)srcp; mov r3, #0 // r3: sum = 0; and r4, r2, #0xF // r4: bitSize = (u32 *)srcp & 0xF; mov r2, r2, lsr #8 // destCount = (u32 *)srcp >> 8; cmp r4, #1 // if (bitSize != 1) { bne @63 @61 // Difference calculation in units of 8 bits add r0, r0, #2 // srcp += 4; sub r1, r1, #2 @62 // do { ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2; sub r2, r2, #2 // destCount -= 2; Executes at this position in order to eliminate pipeline stall add r3, r3, r4 // sum += tmp and r5, r3, #0xFF // r5: tmp2 = sum & 0xFF; add r3, r3, r4, lsr #8 // sum += (tmp >> 8); add r5, r5, r3, lsl #8 // tmp2 += (sum << 8); strh r5, [r1, #2]! // *(u16*)destp = tmp2; destp += 2; cmp r2, #1 // bgt @62 // } while ( destCount > 1 ); bne @65 // if ( destCount < 1 ) return; // else // if (destCount == 1) { ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2; add r3, r3, r4 // sum += tmp; and r5, r3, #0xFF // tmp2 = sum & 0xFF strh r5, [r1, #2]! // *(u16*)destp = tmp2; destp += 2; b @65 // } // } else { @63 // Difference calculation in units of 16 bits add r0, r0, #2 sub r1, r1, #2 @64 // do { ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2; subs r2, r2, #2 // destCount -= 2; Executes at this position in order to eliminate pipeline stall add r3, r3, r4 // sum += tmp; strh r3, [r1, #2]! // *(u16*)destp = sum; destp += 2; bgt @64 // } while ( destCount > 0 ); // } @65 ldmfd sp!, {r4,r5} bx lr } //---------------------------------------------------------------------- // 32-bit decompression to restore differential filter conversion. // //- Restores a differential filter, writing in 32 bit units. //- Can be decompressed to the data TCM and VRAM // Faster than the MI_Uncompress8 function//---- //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // //- Data header // u32 :4 : Bit size of unit // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Return value: None //---------------------------------------------------------------------- asm void MI_UnfilterDiff32( register const void *srcp,register void *destp ) { stmdb sp!, {r4-r6} ldr r4, [r0], #4 // load header mov r5, #0xff // byte mask mov r6, r4, lsr #8 // Dest. count tst r4, #1 // Bitsize 8 or 16? mov r4, #0 // R4 - accumulator beq @bs16 @L1 ldr r2, [r0], #4 // load data subs r6, r6, #4 // decrement dest. count (at this stage to prevent pipeline stall) add r4, r2, r4, lsr #24 // 1st sum and r4, r4, r5 // mask add r3, r4, r2, lsr #8 // 2nd sum and r3, r3, r5 // mask orr r4, r4, r3, lsl #8 // accumulate add r3, r3, r2, lsr #16 // 3rd sum and r3, r3, r5 // mask orr r4, r4, r3, lsl #16 // accumulate add r3, r3, r2, lsr #24 // 4th sum and r3, r3, r5 // mask orr r4, r4, r3, lsl #24 // accumulate str r4, [r1], #4 // write back bgt @L1 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr @bs16 orr r5, r5, r5, lsl #8 // extend mask to 16 bit @L2 ldr r2, [r0], #4 // load data subs r6, r6, #4 // decrement Dest. count (at this stage to prevent pipeline stall) add r4, r2, r4, lsr #16 // 1st sum and r4, r4, r5 // mask add r3, r4, r2, lsr #16 // 2nd sum and r3, r3, r5 // mask orr r4, r4, r3, lsl #16 // accumulate str r4, [r1], #4 // write back bgt @L2 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr } //---------------------------------------------------------------------- // 8-bit expansion of the differential filter conversion // //- Converts the differential filter, writing in 8 bit units. //With NITRO, cannot decompress directly into VRAM //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // u32 size Source size // BOOL bitsize Differential size (TRUE: 16bit, FALSE: 8bit) // //- Data header // u32 :4 : Bit size of unit // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Return value: None //---------------------------------------------------------------------- asm void MI_FilterDiff8(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize) { stmdb sp!, {r4-r6} add r4, r3, #1 // u32 :4 : Bit size of unit orr r4, r4, #0x80 // compType:4 : Compression type( = 3) orr r4, r4, r2, lsl #8 // destSize:24 : Data size before compression str r4, [r1], #4 // write header mov r4, #0 // 1st subtrahend mov r5, #0xff // byte mask tst r3, #1 // Bitsize 8 or 16? bne @bs16 @L1 ldrb r3,[r0],#1 // load data subs r2, r2, #1 // decrement Dest. count (at this stage to prevent pipeline stall) sub r4, r3, r4 // diff and r6, r4, r5 // mask mov r4, r3 // set as previous value strb r6, [r1], #1 // write back bgt @L1 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr @bs16 orr r5, r5, r5, lsl #8 // extend mask to 16 bit @L2 ldrh r3,[r0],#2 // load data subs r2, r2, #2 // decrement Dest. count (at this stage to prevent pipeline stall) sub r4, r3, r4 // 1st diff and r6, r4, r5 // mask mov r4, r3 // set as previous value strh r6, [r1], #2 // write back bgt @L2 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr } //---------------------------------------------------------------------- // 16-bit expansion of the differential filter conversion // //- Converts the differential filter, writing in 16 bit units. //- Can be decompressed to the data TCM and VRAM //- Higher speed than MI_FilterDiff8() //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // u32 size Source size // BOOL bitsize Differential size (TRUE: 16bit, FALSE: 8bit) // //- Data header // u32 :4 : Bit size of unit // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Return value: None //---------------------------------------------------------------------- asm void MI_FilterDiff16(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize) { stmdb sp!, {r4-r6} add r4, r3, #1 // u32 :4 : Bit size of unit orr r4, r4, #0x80 // compType:4 : Compression type( = 3) orr r4, r4, r2, lsl #8 // destSize:24 : Data size before compression str r4, [r1], #4 // write header mov r4, #0 // 1st subtrahend mov r5, #0xff // byte mask tst r3, #1 // Bitsize 8 or 16? bne @bs16 @L1 ldrh r3,[r0],#2 // load data subs r2, r2, #2 // decrement Dest. count (at this stage to prevent pipeline stall) sub r4, r3, r4, lsr #8 // 1st diff and r6, r4, r5 // mask rsb r4, r3, r3, lsr #8 // 2nd diff and r4, r4, r5 // mask orr r6, r6, r4, lsl #8 // accumulate mov r4, r3 // set as previous value strh r6, [r1], #2 // write back bgt @L1 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr @bs16 orr r5, r5, r5, lsl #8 // extend mask to 16 bit @L2 ldrh r3,[r0],#2 // load data subs r2, r2, #2 // decrement Dest. count (at this stage to prevent pipeline stall) sub r4, r3, r4 // 1st diff and r6, r4, r5 // mask mov r4, r3 // set as previous value strh r6, [r1], #2 // write back bgt @L2 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr } //---------------------------------------------------------------------- // 32-bit expansion of the differential filter conversion // //- Converts the differential filter, writing in 32 bit units. //- Higher speed than MI_FilterDiff16() //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible. // //- Use 4 byte alignment for the source address. // //Arguments: // void *srcp : source address // void *destp : destination address // u32 size Source size // BOOL bitsize Differential size (TRUE: 16bit, FALSE: 8bit) // //- Data header // u32 :4 : Bit size of unit // compType:4 Compression type( = 3) // destSize:24 Data size after decompression // //- Return value: None //---------------------------------------------------------------------- asm void MI_FilterDiff32(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize) { stmdb sp!, {r4-r6} add r4, r3, #1 // u32 :4 : Bit size of unit orr r4, r4, #0x80 // compType:4 : Compression type( = 3) orr r4, r4, r2, lsl #8 // destSize:24 : Data size before compression str r4, [r1], #4 // write header mov r4, #0 // 1st subtrahend mov r5, #0xff // byte mask tst r3, #1 // Bitsize 8 or 16? bne @bs16 @L1 ldr r3,[r0],#4 // load data subs r2, r2, #4 // decrement Dest. count (at this stage to prevent pipeline stall) sub r4, r3, r4, lsr #8 // 1st diff and r6, r4, r5 // mask rsb r4, r3, r3, lsr #8 // 2nd diff and r4, r4, r5 // mask orr r6, r6, r4, lsl #8 // accumulate mov r4, r3, lsr #16 // shift higher halfword down sub r3, r4, r3, lsr #8 // 3rd diff and r3, r3, r5 // mask orr r6, r6, r3, lsl #16 // accumulate rsb r3, r4, r4, lsr #8 // 4th diff and r3, r3, r5 // mask orr r6, r6, r3, lsl #24 // accumulate str r6, [r1], #4 // write back bgt @L1 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr @bs16 orr r5, r5, r5, lsl #8 // extend mask to 16 bit @L2 ldr r3,[r0],#4 // load data subs r2, r2, #4 // decrement Dest. count (at this stage to prevent pipeline stall) sub r4, r3, r4 // 1st diff and r6, r4, r5 // mask mov r4, r3, lsr #16 // shift higher halfword down rsb r3, r3, r3, lsr #16 // 2nd diff and r3, r3, r5 // mask orr r6, r6, r3, lsl #16 // accumulate str r6, [r1], #4 // write back bgt @L2 // loop if dest. count not reached ldmia sp!, {r4-r6} bx lr } //---- end limitation of processer mode #include