1 /*---------------------------------------------------------------------------*
2   Project:  TwlSDK - MI -
3   File:     mi_uncompress.c
4 
5   Copyright 2003-2008 Nintendo. All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law. They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13   $Date:: 2008-09-17#$
14   $Rev: 8556 $
15   $Author: okubata_ryoma $
16 
17  *---------------------------------------------------------------------------*/
18 
19 #include <nitro/types.h>
20 #include <nitro/mi/uncompress.h>
21 
22 //****Bug fix****
23 //  Because halfword access instructions such as ldrh and strh are not passed through by the inline assembler due to a CW bug, instruction values are written directly using dcd in order to avoid this bug.
24 //
25 //   When the bug is fixed, the 'define' below will be removed.
26 //#define CW_BUG_FOR_LDRH_AND_STRH
27 
28 #define UNCOMPRESS_RL16_CODE32
29 
30 //---- This code will be compiled in ARM-Mode
31 #include <nitro/code32.h>
32 
33 //======================================================================
34 //          Expanding compressed data
35 //======================================================================
36 //----------------------------------------------------------------------
37 //          Expanding bit compressed data
38 //
39 //- Unpacks data padded with bits fixed to 0.
40 //- Align the destination address to a 4-byte boundary.
41 //
42 //Arguments:
43 //             void *srcp :              source address
44 //             void *destp :            destination address
45 //  MIUnpackBitsParam *paramp :  Address of MIUnpackBitsParam structure
46 //
47 //MIUnpackBitsParam Structure
48 //    u16 srcNum:              Number of bytes of source data
49 //    u8  srcBitNum:           Number of bits per source data
50 //    u8  destBitNum:          Number of bits per destination data
51 //    u32 destOffset:31 :       Offset number to add to source data.
52 //        destOffset0_On:1 :    Flag for whether to add an offset to 0 data.
53 //
54 //- Return value: None
55 //----------------------------------------------------------------------
56 
MI_UnpackBits(register const void * srcp,register void * destp,register MIUnpackBitsParam * paramp)57 asm void MI_UnpackBits( register const void *srcp, register void *destp, register MIUnpackBitsParam *paramp )
58 {
                // Reads srcNum bytes from srcp, splits each byte into srcBitNum-bit fields
                // (lowest bits first), optionally adds destOffset to each field, and packs the
                // results destBitNum bits apart into 32-bit words stored through destp.
                //
                // Register usage:
                //   r0  srcp (post-incremented byte pointer)
                //   r1  destp (post-incremented word pointer)
                //   r2  paramp on entry; reused as destBitNum after the parameters are loaded
                //   r3  destBitCount     r4  srcBitCount      r5  srcMask
                //   r6  srcBitNum        r7  srcNum (bytes remaining)
                //   r8  destOffset0_On   r9  srcTmp           r10 8 - srcBitNum
                //   r11 scratch          r12 destTmp          r14 destBak (output accumulator)
                //   [sp, #0] destOffset
59                 stmfd   sp!, {r4-r11, lr}
60                 sub     sp, sp, #4              // reserve one stack word for destOffset
61 
62                 ldrh    r7, [r2, #0]            // r7:  srcNum    = unPackBitsParamp->srcNum
63 
64 @00:            ldrb    r6, [r2, #2]            // r6:  srcBitNum = unPackBitsParamp->srcBitNum  (no branch in this function targets @00)
65                 rsb     r10, r6, #8             // r10: srcBitNumInv = 8 - srcBitNum
66                 mov     r14, #0                 // r14: destBak = 0
67                 ldr     r11, [r2, #4]           // r8:  destOffset0_On
68                 mov     r8, r11, lsr #31        //          = unPackBitsParamp->destOffset0_On  (bit 31 of the word at offset 4)
69                 ldr     r11,[r2, #4]            //      destOffset = unPackBitsParamp->destOffset  (same word, loaded again)
70                 mov     r11,r11, lsl #1         //      shift left then right by 1: clears bit 31,
71                 mov     r11,r11, lsr #1         //      leaving the low 31 bits
72                 str     r11,[sp, #0]            //      keep destOffset on the stack; r11 is reused as scratch below
73                 ldrb    r2, [r2, #3]            // r2:  destBitNum = unPackBitsParamp->destBitNum  (overwrites the paramp pointer)
74                 mov     r3, #0                  // r3:  destBitCount = 0
75 
76 @01:            subs    r7, r7, #1              //  while (--srcNum >= 0) {
77                 blt     @06
78 
79                 mov     r11, #0xff              // r5:  srcMask = 0xff >> srcBitNumInv;
80                 mov     r5, r11, asr r10        //      (mask covering the lowest srcBitNum bits)
81                 ldrb    r9, [r0], #1            // r9:  srcTmp  = *srcp++;
82                 mov     r4, #0                  // r4:  srcBitCount = 0;
83 
84 @02:            cmp     r4, #8                  //      while (srcBitCount < 8) {
85                 bge     @01                     //      (all 8 bits of this byte consumed: next source byte)
86 
87                 and     r11, r9, r5             // r12:     destTmp = ((srcTmp & srcMask) >>srcBitCount);
88                 movs    r12, r11, lsr r4        //          (movs sets Z when destTmp == 0)
89                 cmpeq   r8,  #0                 //          if (destTmp == 0 && destOffset0_On == 0)
90                 beq     @04                     //              skip adding the offset
91 
92 @03:            ldr     r11, [sp, #0]           //          destTmp += destOffset;
93                 add     r12, r12, r11
94 @04:            orr     r14, r14, r12, lsl r3   //          destBak |= destTmp << destBitCount;
95                 add     r3, r3, r2              //          destBitCount += destBitNum;
96 
97                 cmp     r3, #0x20               //          if (destBitCount >= 32) {
98                 blt     @05
99 
100                 str     r14, [r1], #4           //              *destp++ = destBak;
101                 mov     r14, #0                 //              destBak = 0;
102                 mov     r3,  #0                 //              destBitCount = 0;
103                                                 //          }
104 @05:            mov     r5, r5, lsl r6          //          srcMask    <<= srcBitNum;
105                 add     r4, r4, r6              //          srcBitCount += srcBitNum;
106                 b       @02                     //      }
107                                                 //  }
108 @06:            add     sp, sp, #4              // release the destOffset slot; bits still pending in destBak (destBitCount < 32) are not stored
109                 ldmfd   sp!, {r4-r11, lr}
110                 bx      lr
111 }
112 
113 //----------------------------------------------------------------------
114 //          8-bit decompression of LZ77 compressed data
115 //
116 //* Expands LZ77-compressed data and writes it in 8-bit units.
117 //- Cannot decompress directly into VRAM
118 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
119 //
120 //- Use 4 byte alignment for the source address.
121 //
122 //Arguments:
123 //    void *srcp :              source address
124 //    void *destp :            destination address
125 //
126 //- Data header
127 //    u32 :4  :                Reserved
128 //        compType:4          Compression type( = 1)
129 //        destSize:24         Data size after decompression
130 //
131 //- Flag data format
132 //    u8  flags               Compression/no compression flag
133 //                            (0, 1) = (not compressed, compressed)
134 //- Code data format (Big Endian)
135 //    u16 length:4            Decompressed data length - 3 (only compress when the match length is 3 bytes or greater)
136 //        offset:12           Match data offset - 1
137 //
138 //- Return value: None
139 //----------------------------------------------------------------------
140 
MI_UncompressLZ8(register const void * srcp,register void * destp)141 asm void MI_UncompressLZ8( register const void *srcp, register void *destp )
142 {
                // Expands LZ77-compressed data from srcp to destp one byte at a time.
                // Every output byte is written with swpb rather than strb.
                //
                // Register usage:
                //   r0  srcp (byte pointer into the compressed stream)
                //   r1  destp (advanced by 1 per output byte)
                //   r2  destCount (bytes still to produce)
                //   r3  length of the current match
                //   r4  i, remaining bits of the current flags byte
                //   r5  scratch / current code byte
                //   r6  scratch / length bias
                //   r7  isExFormat (1 when the low 4 header bits are non-zero)
                //   r12 offset of the match source behind destp
                //   r14 flags (tested at bit 7, shifted left after each item)
143                 stmfd   sp!, {r4-r7, lr}
144 
145                 ldr     r5, [r0], #4            // r2:  destCount = *(u32 *)srcp >> 8
146                 mov     r2, r5, lsr #8          // r0:  srcp += 4
147                 mov     r7, #0
148                 tst     r5, #0x0F               // r7:  isExFormat = (*header & 0x0F)? 1 : 0
149                 movne   r7, #1
150 
151 @21:            cmp     r2, #0                  //  while (destCount > 0) {
152                 ble     @26
153 
154                 ldrb    r14, [r0], #1           // r14: flags = *srcp++
155                 mov     r4, #8                  // r4:  i = 8 (flag bits in this byte)
156 @22:            subs    r4, r4, #1              //      for ( i = 8; --i >= 0; ) {
157                 blt     @21                     //      (all 8 flag bits used: fetch the next flags byte)
158 
159                 tst     r14, #0x80              //          if (!(flags & 0x80)) {
160                 bne     @23
161 
162                 ldrb    r6, [r0], #1            //              *srcp++;
163                 swpb    r6, r6, [r1]            // r1:          *destp++; (Byte-writing countermeasure)
164                 add     r1, r1, #1
165                 sub     r2, r2, #1              //              destCount--;
166                 b       @25
167                                                 //          } else {
168 @23:            ldrb    r5, [r0, #0]            // r3:          length = (*srcp >> 4);
169                 cmp     r7, #0                  //              if ( ! isExFormat ) { length += 3; }
170                 moveq   r6, #3                  //              (r6 holds the bias that is added to the nibble at @23_2)
171                 beq     @23_2
172                                                 //              else {
173                 tst     r5, #0xE0               //                  if ( length > 1 ) {
174                 movne   r6, #1                  //                      length += 1;
175                 bne     @23_2                   //                  } else {
176 
177                 add     r0, r0, #1              //                      isWide = (length == 1)? 1 : 0;
178                 and     r6, r5, #0xf            //                      length = (*srcp++ & 0x0F) << 4
179                 mov     r6, r6, lsl #4
180                 tst     r5, #0x10               //                      (bit 4 of the first code byte selects the wide form)
181                 beq     @23_1                   //                      if ( isWide ) {
182 
183                 mov     r6, r6, lsl #8          //                          length <<= 8;
184                 ldrb    r5, [r0], #1            //                          length += (*srcp++) << 4;
185                 add     r6, r6, r5, lsl #4      //                          length += 0xFF + 1;
186                 add     r6, r6, #0x100          //                      }
187 @23_1:
188                 add     r6, r6, #0x11           //                      length += 0xF + 2;
189                 ldrb    r5, [r0, #0]            //                      length += (*srcp >> 4);
190 @23_2:                                          //                  }
191                 add     r3, r6, r5, asr #4      //              }   r3: length = bias (r6) + (code byte >> 4)
192                 add     r0, r0, #1              // r12:         offset = (*srcp++ & 0x0f) << 8;
193                 and     r5, r5, #0xf
194                 mov     r12,r5, lsl #8
195                 ldrb    r6, [r0], #1            //              offset = (offset | *srcp++) + 1;
196                 orr     r5, r6, r12
197                 add     r12,r5, #1
198                 sub     r2, r2, r3              //              destCount -= length;
199                                                 //              do {
200 @24:            ldrb    r5, [r1, -r12]          //                  *destp++ = destp[-offset]
201                 swpb    r5, r5, [r1]            //    (Byte-writing countermeasure)
202                 add     r1, r1, #1
203                 subs    r3, r3, #1              //              } while (--length > 0);
204                 bgt     @24
205                                                 //          }
206 @25:            cmp     r2, #0                  //          if (destCount <= 0)   break;
207                 movgt   r14, r14, lsl #1        //          flags <<= 1  (moves the next flag into bit 7)
208                 bgt     @22                     //      }
209                 b       @21                     //  }
210 
211 @26:            ldmfd   sp!, {r4-r7, lr}
212                 bx      lr
213 }
214 
215 
216 //----------------------------------------------------------------------
217 //          16-bit decompression of LZ77 compressed data
218 //
219 //* Expands LZ77-compressed data and writes it in 16-bit units.
220 //* Although it can also expand in data TCM and main memory, it is slower than MI_UncompressLZ77().
221 //
222 //* For compressed data, search for a matching character string from a minimum of 2 bytes previous.
223 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
224 //
225 //- Use 4 byte alignment for the source address.
226 //
227 //Arguments:
228 //    void *srcp :              source address
229 //    void *destp :            destination address
230 //
231 //- Data header
232 //    u32 :4  :                Reserved
233 //        compType:4          Compression type( = 1)
234 //        destSize:24         Data size after decompression
235 //
236 //- Flag data format
237 //    u8  flags               Compression/no compression flag
238 //                            (0, 1) = (not compressed, compressed)
239 //- Code data format (Big Endian)
240 //    u16 length:4            Decompressed data length - 3 (only compress when the match length is 3 bytes or greater)
241 //        offset:12 :          Match data offset ( >= 2) - 1
242 //
243 //- Return value: None
244 //----------------------------------------------------------------------
245 
MI_UncompressLZ16(register const void * srcp,register void * destp)246 asm void MI_UncompressLZ16( register const void *srcp, register void *destp )
247 {
                // Expands LZ77-compressed data from srcp to destp, assembling pairs of output
                // bytes in destTmp and storing them with halfword writes.
                //
                // Register usage:
                //   r0  srcp (byte pointer into the compressed stream)
                //   r1  destp (advanced by 2 per stored halfword)
                //   r2  shift, toggles between 0 and 8 (low / high byte of the pending halfword)
                //   r3  destTmp (pending halfword)
                //   r4  offset of the match source      r5  length of the current match
                //   r6  flags                           r7  i, remaining bits of the flags byte
                //   r8, r9 scratch                      r10 destCount
                //   r11 isExFormat                      r14 offset0_8 (0 or 8: byte select within the loaded halfword)
                //
                // A halfword is only stored when shift returns to 0, so a single byte still
                // pending in destTmp when destCount runs out is not written.
248                 stmfd   sp!, {r4-r11, lr}
249 
250                 mov     r3,  #0                 // r3:  destTmp = 0
251                 ldr     r8,  [r0], #4           // r10: destCount = *(u32 *)srcp >> 8
252                 mov     r10, r8, lsr #8         // r0:  srcp += 4
253                 mov     r2,  #0                 // r2:  shift = 0
254                 mov     r11, #0
255                 tst     r8,  #0x0F              // r11: isExFormat = (*header & 0x0F)? 1 : 0;
256                 movne   r11, #1
257 
258 @31:            cmp     r10, #0                 //  while (destCount > 0) {
259                 ble     @36
260 
261                 ldrb    r6, [r0], #1            // r6:  flags = *srcp++;
262                 mov     r7, #8                  //      for ( i = 8; --i >= 0; ) {
263 @32:            subs    r7, r7, #1
264                 blt     @31                     //      (all 8 flag bits used: fetch the next flags byte)
265 
266                 tst     r6, #0x80               //          if (!(flags & 0x80)) {
267                 bne     @33
268 
269                 ldrb    r9, [r0], #1            //              destTmp |= *srcp++ << shift;
270                 orr     r3, r3, r9, lsl r2
271                 sub     r10, r10, #1            //              destCount--;
272 
273                 eors    r2, r2, #8              //              if (!(shift ^= 8)) {
274 #ifndef CW_BUG_FOR_LDRH_AND_STRH
275                 streqh  r3, [r1], #2            //              *destp++ = destTmp;
276 #else
277                 dcd     0x00c130b2              //              hand-encoded: streqh r3, [r1], #2
278 #endif
279                 moveq   r3, #0                  //              destTmp = 0;
280                 b       @35                     //          } else {
281 
282 @33:            ldrb    r9, [r0, #0]            // r5:          length = (*srcp >> 4);
283                 cmp     r11, #0                 //              if ( ! isExFormat ) { length += 3; }
284                 moveq   r8,  #3                 //              (r8 holds the bias that is added to the nibble at @33_2)
285                 beq     @33_2
286                                                 //              else {
287                 tst     r9, #0xE0               //                  if ( length > 1 ) {
288                 movne   r8, #1                  //                      length += 1
289                 bne     @33_2                   //                  } else {
290 
291                 add     r0, r0, #1              //                      isWide = (length == 1)? 1 : 0;
292                 and     r8, r9, #0xf            //                      length = (*srcp++ & 0x0F) << 4
293                 mov     r8, r8, lsl #4
294                 tst     r9, #0x10               //                      (bit 4 of the first code byte selects the wide form)
295                 beq     @33_1                   //                      if ( isWide ) {
296 
297                 mov     r8, r8, lsl #8          //                          length <<= 8;
298                 ldrb    r9, [r0], #1            //                          length += (*srcp++) << 4
299                 add     r8, r8, r9, lsl #4      //                          length += 0xFF + 1
300                 add     r8, r8, #0x100          //                      }
301 @33_1:
302                 add     r8, r8, #0x11           //                      length += 0xF + 2;
303                 ldrb    r9, [r0, #0]            //                      length += (*srcp >> 4);
304 @33_2:                                          //                  }
305                 add     r5, r8, r9, asr #4      //              }   r5: length = bias (r8) + (code byte >> 4)
306                 ldrb    r9, [r0], #1            // r4:          offset = (*srcp++ & 0x0f) << 8;
307                 and     r8, r9, #0xf
308                 mov     r4, r8, lsl #8
309                 ldrb    r9, [r0], #1            //              offset = (offset | *srcp++) + 1;
310                 orr     r8, r9, r4
311                 add     r4, r8, #1
312                 rsb     r8, r2, #8              // r14:         offset0_8 = (8 - shift)
313                 and     r9, r4, #1              //                          ^ ((offset & 1) << 3);
314                 eor     r14, r8, r9, lsl #3
315                 sub     r10, r10, r5            //              destCount -= length;
316                                                 //              do {
317 @34:            eor     r14, r14, #8            //                  offset0_8 ^= 8;
318                 rsb     r8, r2, #8              //                  destTmp |= (destp[-((offset
319                 add     r8, r4, r8, lsr #3      //                              + ((8 - shift) >> 3)) >> 1)];
320                 mov     r8, r8, lsr #1          //                  (shift right then left by 1: round the
321                 mov     r8, r8, lsl #1          //                   byte distance down to an even value)
322 #ifndef CW_BUG_FOR_LDRH_AND_STRH
323                 ldrh    r9, [r1, -r8]           //                  r9 = halfword at destp - r8
324 #else
325                 dcd     0xe11190b8              //                  hand-encoded: ldrh r9, [r1, -r8]
326 #endif
327                 mov     r8, #0xff               //                  pick the byte selected by offset0_8:
328                 and     r8, r9, r8, lsl r14     //                      r8 = r9 & (0xff << offset0_8)
329                 mov     r8, r8, asr r14         //                      r8 >>= offset0_8
330                 orr     r3, r3, r8, lsl r2      //                  destTmp |= r8 << shift
331                 eors    r2, r2, #8              //                  if (!(shift ^= 8)) {
332 #ifndef CW_BUG_FOR_LDRH_AND_STRH
333                 streqh  r3, [r1], #2            //                      *destp++ = destTmp;
334 #else
335                 dcd     0x00c130b2              //                      hand-encoded: streqh r3, [r1], #2
336 #endif
337                 moveq   r3, #0                  //                      destTmp = 0;
338                                                 //                  }
339                 subs    r5, r5, #1              //              } while (--length > 0);
340                 bgt     @34                     //          }
341 
342 @35:            cmp     r10, #0                 //          if (destCount <= 0)   break;
343                 movgt   r6, r6, lsl #1          //          flags <<= 1  (moves the next flag into bit 7)
344                 bgt     @32                     //      }
345                 b       @31                     //  }
346 
347 @36:            ldmfd   sp!, {r4-r11, lr}
348                 bx      lr
349 }
350 
351 //----------------------------------------------------------------------
352 //          Decompression of Huffman compressed data
353 //
354 //- Decompresses Huffman compressed data, writing in 32 bit units.
355 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
356 //
357 //- Use 4 byte alignment for the source address.
358 //
359 //Arguments:
360 //    void *srcp :              source address
361 //    void *destp :            destination address
362 //
363 //- Data header
364 //    u32 bitSize:4           1 data bit size (Normally 4|8)
365 //        compType:4          Compression type( = 2)
366 //        destSize:24         Data size after decompression
367 //
368 //- Tree table
369 //    u8           treeSize        Tree table size/2 - 1
370 //    TreeNodeData nodeRoot        Root node
371 //
372 //    TreeNodeData nodeLeft        Root left node
373 //    TreeNodeData nodeRight       Root right node
374 //
375 //    TreeNodeData nodeLeftLeft    Left left node
376 //    TreeNodeData nodeLeftRight   Left right node
377 //
378 //    TreeNodeData nodeRightLeft   Right left node
379 //    TreeNodeData nodeRightRight  Right right node
380 //
381 //            .
382 //            .
383 //
384 //  The compressed data itself follows
385 //
386 //- TreeNodeData structure
387 //    u8  nodeNextOffset:6 :   Offset to the next node data - 1 (2 byte units)
388 //        rightEndFlag:1      Right node end flag
389 //        leftEndFlag:1       Left node end flag
390 //                            When end flag is set, there is data in next node.
391 //
392 //
393 //- Return value: None
394 //----------------------------------------------------------------------
395 
MI_UncompressHuffman(register const void * srcp,register void * destp)396 asm void MI_UncompressHuffman( register const void *srcp, register void *destp )
397 {
                // Decodes Huffman-compressed data: walks the tree table one source bit at a
                // time, collects DataBit-wide symbols into destTmp, and stores destTmp as a
                // 32-bit word once destTmpDataNum symbols have been gathered.
                //
                // Register usage:
                //   r0  srcp (after setup: word pointer to the bit stream following the tree)
                //   r1  destp (advanced by 4 per stored word)
                //   r2  treep (current tree node)      r3  destTmp
                //   r4  DataBit                        r5  srcTmp (current 32 source bits, consumed from bit 31)
                //   r6  treeCheck                      r7  treeStartp (root node)
                //   r8  srcCount                       r9  treeShift (current source bit, 0 or 1)
                //   r10, r11 scratch                   r12 destCount
                //   r14 destTmpCount
                //   [sp, #0] destTmpDataNum
398                 stmfd   sp!, {r4-r11, lr}
399                 sub     sp, sp, #4              // reserve one stack word for destTmpDataNum
400 
401                 add     r2, r0, #4              // r2:  *treep = (u8 *)srcp + 4
402                 add     r7, r2, #1              // r7:  *treeStartp = treep + 1
403                 ldrb    r10,[r0, #0]            // r4:  DataBit = *(u8 *)srcp & 0x0f
404                 and     r4, r10, #0xf
405                 mov     r3, #0                  // r3:  destTmp = 0
406                 mov     r14,#0                  // r14: destTmpCount = 0
407                 and     r10,r4,  #7             //      destTmpDataNum = 4 + (DataBit & 0x7)
408                 add     r11,r10, #4
409                 str     r11,[sp, #0]            //      kept on the stack; reloaded at each symbol
410                 ldr     r10,[r0, #0]            // r12: destCount = *srcp >> 8
411                 mov     r12,r10, lsr #8
412                 ldrb    r10,[r2, #0]            // r0:  srcp  = (u32 *)(treep + ((*treep + 1) << 1))
413                 add     r10,r10, #1
414                 add     r0, r2, r10, lsl #1
415                 mov     r2, r7                  //      treep = treeStartp
416 
417 @11:            cmp     r12, #0                 //  while (destCount > 0) {
418                 ble     @14
419 
420                 mov     r8,  #32                // r8:  srcCount = 32;
421                 ldr     r5, [r0], #4            // r5:  srcTmp  = *srcp++;
422 
423 @12:            subs    r8, r8,  #1             //      while (--srcCount >= 0) {
424                 blt     @11                     //      (all 32 bits used: load the next source word)
425 
426                 mov     r10,#1                  // r9:      treeShift = (srcTmp >> 31) & 0x1
427                 and     r9, r10, r5, lsr #31
428                 ldrb    r6, [r2, #0]            // r6:      treeCheck = *treep
429                 mov     r6, r6, lsl r9          //          treeCheck <<= treeShift  (so bit 7 below is node bit 7 when treeShift == 0, node bit 6 when treeShift == 1)
430                 mov     r10,r2, lsr #1          //          treep = (u8 *)((((u32 )treep>>1) <<1)
431                 mov     r10,r10,lsl #1          //                          + (((*treep & 0x3f)+1) <<1)+treeShift)
432                 ldrb    r11,[r2,#0]
433                 and     r11,r11,#0x3f
434                 add     r11,r11,#1
435                 add     r10,r10,r11,lsl #1
436                 add     r2, r10,r9
437 
438                 tst     r6, #0x80               //          if (treeCheck & TREE_END) {
439                 beq     @13
440 
441                 mov     r3, r3, lsr r4          //              destTmp >>= DataBit;
442                 ldrb    r10,[r2, #0]            //              destTmp |= *treep << (32 - DataBit);
443                 rsb     r11, r4,  #32
444                 orr     r3, r3, r10, lsl r11
445                 mov     r2, r7                  //              treep = treeStartp;
446                 add     r14,r14,#1              //              if (++destTmpCount == destTmpDataNum) {
447                 ldr     r11,[sp, #0]
448                 cmp     r14,r11                 //              (the three "eq" instructions below depend on this compare)
449 
450                 streq   r3, [r1], #4            //                  *destp++ = destTmp;
451                 subeq   r12,r12,  #4            //                  destCount -= 4;
452                 moveq   r14,#0                  //                  destTmpCount = 0;
453                                                 //              }
454                                                 //          }
455 @13:            cmp     r12,#0                  //          if (destCount <= 0)   break;
456                 movgt   r5, r5, lsl #1          //          srcTmp <<= 1;
457                 bgt     @12                     //      }
458                 b       @11                     //  }
459 
460 @14:            add     sp, sp, #4              // release the destTmpDataNum slot
461                 ldmfd   sp!, {r4-r11, lr}
462                 bx      lr
463 }
464 
465 //----------------------------------------------------------------------
466 //          8-bit decompression of run-length compressed data
467 //
468 //- Decompresses run-length compressed data, writing in 8 bit units.
469 //With NITRO, cannot decompress directly into VRAM
470 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
471 //
472 //- Use 4 byte alignment for the source address.
473 //
474 //Arguments:
475 //    void *srcp :              source address
476 //    void *destp :            destination address
477 //
478 //- Data header
479 //    u32 :4  :                Reserved
480 //        compType:4          Compression type( = 3)
481 //        destSize:24         Data size after decompression
482 //
483 //- Flag data format
484 //    u8  length:7            Decompressed data length - 1 (When not compressed)
485 //                            Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater)
486 //        flag:1              (0, 1) = (not compressed, compressed)
487 //
488 //- Return value: None
489 //----------------------------------------------------------------------
490 
MI_UncompressRL8(register const void * srcp,register void * destp)491 asm void MI_UncompressRL8( register const void *srcp, register void *destp )
492 {
                // Expands run-length compressed data from srcp to destp one byte at a time.
                // Every output byte is written with swpb rather than strb.
                //
                // Register usage:
                //   r0  srcp (byte pointer into the compressed stream)
                //   r1  destp (advanced by 1 per output byte)
                //   r2  length of the current run
                //   r3  header word, then scratch for copied bytes
                //   r4  flags byte (also receives the discarded swpb read-back in the fill loop)
                //   r5  srcTmp, the byte repeated by a compressed run
                //   r7  destCount (bytes still to produce)
493                 stmfd   sp!, {r4, r5, r7}
494 
495                 ldmia   r0!, {r3}               // r7:  destCount = *(u32 *)srcp >> 8;
496                 mov     r7, r3, lsr #8          // r0:  srcp += 4;
497 
498 @41:            cmp     r7, #0                  //  while (destCount > 0) {
499                 ble     @45
500 
501                 ldrb    r4, [r0], #1            // r4:  flags = *srcp++;
502                 ands    r2, r4, #0x7f           // r2:  length = flags & 0x7f;  (flags set here are overwritten by the tst below)
503                 tst     r4, #0x80               //      if (!(flags & 0x80)) {
504                 bne     @43
505 
506                 add     r2, r2, #1              //          length++;
507                 sub     r7, r7, r2              //          destCount -= length;
508 @42:            ldrb    r3, [r0], #1            //          do {
509                 swpb    r3, r3, [r1]            //              *destp++ = *srcp++; (Byte-writing countermeasure)
510                 add     r1, r1, #1
511                 subs    r2, r2, #1              //          } while (--length > 0);
512                 bgt     @42                     //      } else {
513                 b       @41
514 
515 @43:            add     r2, r2, #3              //          length += 3;
516                 sub     r7, r7, r2              //          destCount -= length;
517                 ldrb    r5, [r0], #1            //          srcTmp  = *srcp++;
518 @44:            swpb    r4, r5, [r1]            //          do { (Byte writing countermeasure)  r5 is stored; the old byte lands in r4
519                 add     r1, r1, #1              //              *destp++ =  srcTmp;
520                 subs    r2, r2, #1              //          } while (--length > 0);
521                 bgt     @44                     //      }
522                 b       @41                     //  }
523 
524 @45:            ldmfd   sp!, {r4, r5, r7}
525                 bx      lr
526 }
527 
528 //----------------------------------------------------------------------
529 //          16-bit decompression of run-length compressed data
530 //
531 //- Decompresses run-length compressed data, writing in 16 bit units.
532 //- Can be decompressed to the data TCM and VRAM
533 //- When decompressing to the main RAM, it is slower than MI_UncompressRL8().
534 //- When decompressing to data TCM or VRAM, it is faster than MI_UncompressRL8().
535 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
536 //
537 //- Use 4 byte alignment for the source address.
538 //
539 //Arguments:
540 //    void *srcp :              source address
541 //    void *destp :            destination address
542 //
543 //- Data header
544 //    u32 :4  :                Reserved
545 //        compType:4          Compression type( = 3)
546 //        destSize:24         Data size after decompression
547 //
548 //- Flag data format
549 //    u8  length:7            Decompressed data length - 1 (When not compressed)
550 //                            Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater)
551 //        flag:1              (0, 1) = (not compressed, compressed)
552 //
553 //- Return value: None
554 //----------------------------------------------------------------------
555 
556 #if !defined(UNCOMPRESS_RL16_CODE32)
557 //---- This code will be compiled in Thumb-Mode
558 #include <nitro/code16.h>
559 
MI_UncompressRL16(register const void * srcp,register void * destp)560 asm void MI_UncompressRL16( register const void *srcp, register void *destp )
561 {
                // Thumb variant of the 16-bit run-length expander. It is only compiled when
                // UNCOMPRESS_RL16_CODE32 is not defined.
                // Output bytes are paired in destTmp and stored with halfword writes.
                //
                // Register usage:
                //   r0  srcp (byte pointer into the compressed stream)
                //   r1  destp (advanced by 2 per stored halfword)
                //   r2  length of the current run      r3  scratch
                //   r4  shift, toggles between 0 and 8 r5  destCount
                //   r6  scratch                        r7  destTmp (pending halfword)
                //   [sp, #4] flags                     [sp, #8] srcTmp (byte repeated by a compressed run)
562                 push    {r4-r7}
563                 sub     sp, #0xc                // stack locals; offsets 4 and 8 are used below
564 
565                 mov     r7, #0                  // r7:  destTmp = 0
566                 ldmia   r0!, {r3}               // r5:  destCount = *(u32 *)srcp >> 8
567                 lsr     r5, r3, #8              // r0:  srcp += 4
568 
569                 mov     r4, #0                  // r4:  shift = 0
570 @51:            cmp     r5, #0                  //  while (destCount > 0) {
571                 ble     @57
572 
573                 ldrb    r3, [r0,  #0]           //      flags = *srcp++;
574                 str     r3, [sp, #4]
575                 add     r0, #1
576                 ldr     r3, [sp, #4]            // r2:  length = flags & 0x7f;
577                 lsl     r2, r3, #25             //      (shift left then right by 25 keeps bits 6..0)
578                 lsr     r2, r2, #25
579 
580                 ldr     r6, [sp, #4]            //      if (!(flags & 0x80)) {
581                 lsr     r3, r6, #8              //      (shifts bit 7 of flags out into the carry flag)
582                 bcs     @54
583 
584                 add     r2, #1                  //          length++;
585                 sub     r5, r5, r2              //          destCount -= length;
586                                                 //          do {
587 @52:            ldrb    r6, [r0, #0]            //              destTmp |= *srcp++ << shift;
588                 lsl     r6, r4
589                 orr     r7, r6
590                 add     r0, #1
591                 mov     r3, #8                  //              if (!(shift ^= 8)) {
592                 eor     r4, r3
593                 bne     @53
594                 strh    r7, [r1, #0]            //                  *destp++ = destTmp;
595                 add     r1, #2
596                 mov     r7, #0                  //                  destTmp = 0;
597                                                 //              }
598 @53:            sub     r2, r2, #1              //          } while (--length > 0);
599                 bgt     @52
600                 b       @51                     //      } else {
601 
602 @54:            add     r2, #3                  //          length += 3;
603                 sub     r5, r5, r2              //          destCount -= length;
604                 ldrb    r6, [r0, #0]            //          srcTmp  = *srcp++;
605                 str     r6, [sp, #8]
606                 add     r0, #1
607                                                 //          do {
608 @55:            ldr     r6, [sp, #8]            //              destTmp |= srcTmp  << shift;
609                 lsl     r6, r4
610                 orr     r7, r6
611                 mov     r3, #8                  //              if (!(shift ^= 8)) {
612                 eor     r4, r3
613                 bne     @56
614                 strh    r7, [r1, #0]            //                  *destp++ = destTmp;
615                 add     r1, #2
616                 mov     r7, #0                  //                  destTmp = 0;
617                                                 //              }
618 @56:            sub     r2, r2, #1              //          } while (--length > 0);
619                 bgt     @55                     //      }
620                 b       @51                     //  }
621 
622 @57:            add     sp, #0xc                // release the stack locals
623                 pop     {r4-r7}
624                 bx      lr
625 }
626 
627 //---- This code will be compiled in ARM-Mode
628 #include <nitro/code32.h>
629 #else  // defined(UNCOMPRESS_RL16_CODE32)
// ARM-mode variant of MI_UncompressRL16: expands run-length compressed data,
// writing the output with halfword stores (strh).
//
// Register usage:
//   r0  : srcp, post-incremented as input bytes are consumed
//   r1  : destp, advanced by 2 on every halfword store
//   r2  : marker byte, then the remaining length of the current run
//   r3  : current data byte
//   r4  : accumulator collecting bytes until a full halfword can be stored
//   r5  : shift amount toggled between 0 (low byte) and 8 (high byte)
//   r6  : run byte duplicated into both bytes of a halfword
//   r12 : remaining output byte count (header word >> 8)
//
// A byte still held in r4 when the count runs out is not written (see @L4).
MI_UncompressRL16(register const void * srcp,register void * destp)630 asm void MI_UncompressRL16( register const void *srcp, register void *destp )
631 {
632                 stmdb   sp!, {r4-r6}            // save the callee-saved registers used below
633 
634                 ldr     r12, [r0], #4           // load header, srcp += 4
635                 mov     r12, r12, lsr #8        // dest. count = header >> 8
636                 mov     r4,  #0                 // write back accumulator
637                 mov     r5,  #8                 // low-high byte switch and shifter (the first toggle yields 0 = low byte)
638 
639 @L1             cmp     r12, #0                 // finished ?
640                 ble     @L4
641 
642                 ldrb    r2,  [r0], #1           // load marker byte
643                 ldrb    r3,  [r0], #1           // load char ie. 1st char
644                                                 // <- still 1 cycle interlock from the 1st LDRB (LDRB/LDRH impose 2 cycles pipeline interlock!)
645                 tst     r2,  #0x80              // check if marker has MSB set
646                 bne     @L3                     // MSB set: run of one repeated character, handled at @L3
                                                // ---- path decoding strings of different characters ----
647                 add     r2,  r2,   #1           // length = marker + 1
648                 sub     r12, r12,  r2           // decrement dest. count
649 
650 @L2             eors    r5,  r5, #8             // toggle low-high byte switch and shifter (Z set when r5 becomes 0)
651                 orr     r4,  r4, r3, lsl r5     // OR in the next byte as either low or high byte
652                 strneh  r4,  [r1], #2           // if halfword is full (r5 == 8, high byte just added) store it away
653                 movne   r4,  #0                 // clear accu (still conditional on the eors result)
654                 subs    r2,  r2,   #1           // decrement counter
655                 ldrneb  r3,  [r0], #1           // preload char for the next loop to have the 2 cycle interlock of LDRB compensated by the pipeline flush imposed by the branch
656                 bne     @L2
657                 b       @L1                     // literal string done, fetch the next marker
658 
                                                // ---- path decoding strings of same character ----
659 @L3             sub     r2,  r2, #0x7d          // length = (marker & 0x7f) + 3, computed as marker - 0x80 + 3
660                 sub     r12, r12, r2            // decrement dest. count
661 
                                                // first byte of the run goes through the accumulator (r2 is not decremented for it)
662                 eors    r5,  r5, #8             // toggle low-high byte switch and shifter
663                 orr     r4,  r4, r3, lsl r5     // OR in the next byte as either low or high byte
664                 strneh  r4,  [r1], #2           // if halfword is full store it away
665                 movne   r4,  #0                 // and clear accu
666 
667                 orr     r6,  r3, r3, lsl #8     // the char extended to halfword
668 
669 @L5             subs    r2,  r2, #2             // sub 2, to handle the repetitive char loop halfword-wise
670                 ble     @L6                     // done or only one more to go, step out to keep the flow for multiple chars as fast as possible
671                 strh    r6,  [r1], #2           // more than one to go -> just store the double char
672                 b       @L5
673 
674 @L6             blt     @L1                     // done (flags still come from the subs at @L5: negative = nothing left)
675                 eors    r5,  r5, #8             // one more to go, so toggle the switch
676                 orr     r4,  r4, r3, lsl r5     // OR in the next byte as either low or high byte
677                 strneh  r4,  [r1], #2           // if halfword is full store it away
678                 movne   r4,  #0                 // and clear accu
679                 b       @L1
680 
681 @L4             ldmia   sp!, {r4-r6}            // restore saved registers; a pending low byte in r4 is discarded
682                 bx      lr
683 }
684 #endif  // defined(UNCOMPRESS_RL16_CODE32)
685 
686 //----------------------------------------------------------------------
687 //          32-bit decompression of run-length compressed data
688 //
689 //- Decompresses run-length compressed data, writing in 32 bit units.
690 //- Higher speed than MI_UncompressRL8(), MI_UncompressRL16()
691 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
692 //
693 //- Use 4 byte alignment for the source address.
694 //
695 //Arguments:
696 //    void *srcp :              source address
697 //    void *destp :            destination address
698 //
699 //- Data header
700 //    u32 :4  :                Reserved
701 //        compType:4          Compression type( = 3)
702 //        destSize:24         Data size after decompression
703 //
704 //- Flag data format
705 //    u8  length:7            Decompressed data length - 1 (When not compressed)
706 //                            Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater)
707 //        flag:1              (0, 1) = (not compressed, compressed)
708 //
709 //- Return value: None
710 //----------------------------------------------------------------------
711 
// Expands run-length compressed data, writing the output with word stores (str).
//
// Register usage:
//   r0  : srcp, post-incremented as input bytes are consumed
//   r1  : destp, advanced by 4 on every word store
//   r2  : marker byte, then the remaining length of the current run
//   r3  : current data byte
//   r4  : accumulator collecting bytes until a full word can be stored
//   r5  : rotate amount 32, 24, 16, 8; "ror r5" places the byte at bit 0, 8, 16, 24
//   r12 : remaining output byte count (header word >> 8)
//
// Bytes still held in r4 when the count runs out are not written (see @L6).
MI_UncompressRL32(register const void * srcp,register void * destp)712 asm void MI_UncompressRL32(register const void *srcp,register void *destp)
713 {
714                 stmdb   sp!, {r4-r5}            // save the callee-saved registers used below
715 
716                 ldr     r12, [r0], #4           // load header, srcp += 4
717                 mov     r12, r12, lsr #8        // dest. count = header >> 8
718                 mov     r4,  #0                 // write back accumulator
719                 mov     r5,  #32                // shifter reg. for write back accumulator
720 
721 @L1             cmp     r12, #0                 // finished ?
722                 ble     @L6
723 
724                 ldrb    r2,  [r0], #1           // load marker byte
725                 ldrb    r3,  [r0], #1           // load char ie. 1st char
726                                                 // <- still 1 cycle interlock from the 1st LDRB (LDRB/LDRH impose 2 cycles pipeline interlock!)
727                 tst     r2,  #0x80              // check if marker has MSB set
728                 bne     @L3                     // MSB set: run of one repeated character, handled at @L3
                                                // ---- path decoding strings of different characters ----
729                 add     r2,  r2, #1             // length = marker + 1
730                 sub     r12, r12,r2             // decrement dest. count
731 
732 @L2             orr     r4,  r4, r3, ror r5     // accumulate (LSL R5 would be more obvious, but then the shifter reg. must be counted up and an add. CMP would be needed)
733                 subs    r5,  r5, #8             // decrement shifter
734                 beq     @L5                     // write back branch is only taken every 4th loop, so it's best to jump out (non-taken branch is only 1 cycle compared to 3 for a taken one)
735                 subs    r2,  r2, #1             // decrement counter
736                 ldrneb  r3,  [r0], #1           // preload char for the next loop to have the 2 cycle interlock of LDRB compensated by the pipeline flush imposed by the branch
737                 bne     @L2
738                 b       @L1
739 
740 @L5             str     r4,  [r1], #4           // write back
741                 mov     r5,  #32                // restore shifter
742                 mov     r4,  #0                 // reset write back accumulator
743                 subs    r2,  r2, #1             // not to jump back saves 2 cycles
744                 ldrneb  r3,  [r0], #1           // preload char
745                 bne     @L2
746                 b       @L1                     // literal string done, fetch the next marker
747 
                                                // ---- path decoding strings of same character ----
748 @L3             sub     r2,  r2, #0x7d          // length = (marker & 0x7f) + 3, computed as marker - 0x80 + 3
749                 sub     r12, r12,r2             // decrement dest. count
750 
751 @L4             orr     r4,  r4, r3, ror r5     // accumulate (the first word)
752                 subs    r5,  r5, #8             // decrement shifter
753                 beq     @L7                     // if word is full step out (r2 has not yet been decremented for this byte)
754                 subs    r2,  r2, #1             // if not, check if there are still bytes left
755                 bne     @L4                     // and continue
756                 b       @L1                     // or be done
757 
758 @L7             str     r4,  [r1], #4           // write back (the one word that may have chars from the other path in it)
759 
760                 orr     r4,  r3, r3, lsl #8
761                 orr     r4,  r4, r4, lsl #16    // the char extended to word
762 
763 @L8             subs    r2,  r2, #4             // full words are processed now, so decrement counter by 4
764                 ble     @L9                     // if no full words are left jump out
765                 str     r4,  [r1], #4           // otherwise write back
766                 b       @L8
767 
768 @L9             mov     r4,  #0                 // clear accu
769                 mov     r5,  #32                // reset shifter reg.
770                 adds    r2,  r2, #3             // re-adjust counter (0-3)
771                 beq     @L1                     // 0 then done
772 
773 @L0             orr     r4,  r4, r3, ror r5     // accumulate (the max. 3 remaining bytes)
774                 sub     r5,  r5, #8             // decrement shifter
775                 subs    r2,  r2, #1             // decrement counter
776                 bne     @L0
777                 b       @L1
778 
779 @L6             ldmia   sp!, {r4-r5}            // restore saved registers; bytes pending in r4 are discarded
780                 bx      lr
781 }
782 
783 //----------------------------------------------------------------------
784 //          8-bit decompression to restore differential filter conversion.
785 //
786 //- Restores a differential filter, writing in 8 bit units.
787 //- On NITRO, data cannot be decompressed directly into VRAM
788 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
789 //
790 //- Use 4 byte alignment for the source address.
791 //
792 //Arguments:
793 //    void *srcp :              source address
794 //    void *destp :            destination address
795 //
796 //- Data header
797 //    u32 :4 :                 Bit size of unit
798 //        compType:4          Compression type( = 8)
799 //        destSize:24         Data size after decompression
800 //
801 //- Return value: None
802 //----------------------------------------------------------------------
803 
// Reverses the differential filter: every output unit is the running sum of the
// input units read so far. The unit size comes from the low 4 bits of the header
// (1 selects the byte loop, anything else the halfword loop).
//
// Register usage:
//   r0 : srcp (pre-indexed, so it is biased before each loop)
//   r1 : destp (pre-indexed, so it is biased before each loop)
//   r2 : remaining output byte count (header word >> 8)
//   r3 : running sum
//   r4 : bit size field, then the unit just loaded
// Both loops are do-while loops, so at least one unit is always processed.
MI_UnfilterDiff8(register const void * srcp,register void * destp)804 asm void MI_UnfilterDiff8( register const void *srcp, register void *destp )
805 {
806                 stmfd   sp!, {r4}
807 
808                 ldmia   r0, {r2}                // r2:  destCount = *(u32 *)srcp;   (header word, srcp not advanced)
809                 mov     r3, #0                  // r3:  sum = 0;
810                 and     r4, r2, #0xF            // r4:  bitSize   = *(u32 *)srcp & 0xF;
811                 mov     r2, r2, lsr #8          //      destCount = *(u32 *)srcp >> 8;
812                 cmp     r4, #1                  //      if (bitSize == 1) {
813                 bne     @63
814 
815 @61             // Difference calculation in units of 8 bits
816                 add     r0, r0, #3              //          srcp += 3;  (the pre-indexed load below adds 1, skipping the 4-byte header)
817                 sub     r1, r1, #1              //          destp -= 1; (compensates the pre-indexed store below)
818 @62                                             //          do {
819                 ldrb    r4, [r0, #1]!           //              tmp = *(++srcp);
820                 subs    r2, r2, #1              //              destCount--; Executes at this position in order to eliminate pipeline stall
821                 add     r3, r3, r4              //              sum += tmp
822                 strb    r3, [r1, #1]!           //              *(++destp) = sum;  (low 8 bits of sum)
823                 bgt     @62                     //          } while ( destCount > 0 );  (flags from the subs above)
824                 b       @65                     //      } else {
825 
826 @63             // Difference calculation in units of 16 bits
827                 add     r0, r0, #2              //          srcp += 2;  (the pre-indexed load below adds 2, skipping the 4-byte header)
828                 sub     r1, r1, #2              //          destp -= 2; (compensates the pre-indexed store below)
829 @64                                             //          do {
830                 ldrh    r4, [r0, #2]!           //              srcp += 2; tmp = *(u16*)srcp;
831                 subs    r2, r2, #2              //              destCount -= 2; Executes at this position in order to eliminate pipeline stall
832                 add     r3, r3, r4              //              sum += tmp;
833                 strh    r3, [r1, #2]!           //              destp += 2; *(u16*)destp = sum;  (low 16 bits of sum)
834                 bgt     @64                     //          } while ( destCount > 0 );
835                                                 //      }
836 @65
837                 ldmfd   sp!, {r4}
838                 bx      lr
839 }
840 
841 //----------------------------------------------------------------------
842 //          16-bit decompression to restore differential filter conversion.
843 //
844 //- Restores a differential filter, writing in 16 bit units.
845 //- Can be decompressed to the data TCM and VRAM
846 //- Higher speed than MI_UnfilterDiff8()
847 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
848 //
849 //- Use 4 byte alignment for the source address.
850 //
851 //Arguments:
852 //    void *srcp :              source address
853 //    void *destp :            destination address
854 //
855 //- Data header
856 //    u32 :4 :                 Bit size of unit
857 //        compType:4          Compression type( = 8)
858 //        destSize:24         Data size after decompression
859 //
860 //- Return value: None
861 //----------------------------------------------------------------------
862 
// Reverses the differential filter using only halfword loads and stores.
// The unit size comes from the low 4 bits of the header: 1 selects the path that
// sums byte units two at a time, anything else the halfword-unit path.
//
// Register usage:
//   r0 : srcp (pre-indexed, so it is biased before each loop)
//   r1 : destp (pre-indexed, so it is biased before each loop)
//   r2 : remaining output byte count (header word >> 8)
//   r3 : running sum (only its low 8 / 16 bits are meaningful)
//   r4 : bit size field, then the halfword just loaded
//   r5 : halfword being assembled for the store (8-bit path)
MI_UnfilterDiff16(register const void * srcp,register void * destp)863 asm void MI_UnfilterDiff16( register const void *srcp, register void *destp )
864 {
865                 stmfd   sp!, {r4, r5}
866 
867                 ldmia   r0, {r2}                // r2:  destCount = *(u32 *)srcp;   (header word, srcp not advanced)
868                 mov     r3, #0                  // r3:  sum = 0;
869                 and     r4, r2, #0xF            // r4:  bitSize   = *(u32 *)srcp & 0xF;
870                 mov     r2, r2, lsr #8          //      destCount = *(u32 *)srcp >> 8;
871                 cmp     r4, #1                  //      if (bitSize == 1) {
872                 bne     @63
873 
874 @61             // Difference calculation in units of 8 bits
875                 add     r0, r0, #2              //          srcp += 2;  (the pre-indexed load below adds 2, skipping the 4-byte header)
876                 sub     r1, r1, #2              //          destp -= 2; (compensates the pre-indexed store below)
877 @62                                             //          do {
878                 ldrh    r4, [r0, #2]!           //          srcp += 2; tmp = *(u16*)srcp;   (two byte units at once)
879                 sub     r2, r2, #2              //              destCount -= 2; Executes at this position in order to eliminate pipeline stall
880                 add     r3, r3, r4              //              sum += tmp       (low 8 bits = previous sum + low byte)
881                 and     r5, r3, #0xFF           // r5:          tmp2 = sum & 0xFF;   (first output byte)
882                 add     r3, r3, r4, lsr #8      //              sum += (tmp >> 8);   (low 8 bits = second output byte)
883                 add     r5, r5, r3, lsl #8      //              tmp2 += (sum << 8);  (bits above 15 are dropped by strh)
884                 strh    r5, [r1, #2]!           //              destp += 2; *(u16*)destp = tmp2;
885                 cmp     r2, #1                  //
886                 bgt     @62                     //          } while ( destCount > 1 );
887                 bne     @65                     //          if ( destCount < 1 ) return;
888                                                 //          else // if (destCount == 1) {
889                 ldrh    r4, [r0, #2]!           //              srcp += 2; tmp = *(u16*)srcp;
890                 add     r3, r3, r4              //              sum += tmp;
891                 and     r5, r3, #0xFF           //              tmp2 = sum & 0xFF   (last byte; the high byte is stored as 0)
892                 strh    r5, [r1, #2]!           //              destp += 2; *(u16*)destp = tmp2;
893                 b       @65                     //          }
894                                                 //      } else {
895 @63             // Difference calculation in units of 16 bits
896                 add     r0, r0, #2              //          srcp += 2;  (the pre-indexed load below adds 2, skipping the 4-byte header)
897                 sub     r1, r1, #2              //          destp -= 2; (compensates the pre-indexed store below)
898 @64                                             //          do {
899                 ldrh    r4, [r0, #2]!           //              srcp += 2; tmp = *(u16*)srcp;
900                 subs    r2, r2, #2              //              destCount -= 2; Executes at this position in order to eliminate pipeline stall
901                 add     r3, r3, r4              //              sum += tmp;
902                 strh    r3, [r1, #2]!           //              destp += 2; *(u16*)destp = sum;  (low 16 bits of sum)
903                 bgt     @64                     //          } while ( destCount > 0 );
904                                                 //      }
905 @65
906                 ldmfd   sp!, {r4,r5}
907                 bx      lr
908 }
909 
910 //----------------------------------------------------------------------
911 //          32-bit decompression to restore differential filter conversion.
912 //
913 //- Restores a differential filter, writing in 32 bit units.
914 //- Can be decompressed to the data TCM and VRAM
915 //- Higher speed than MI_UnfilterDiff8()
916 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
917 //
918 //- Use 4 byte alignment for the source address.
919 //
920 //Arguments:
921 //    void *srcp :              source address
922 //    void *destp :            destination address
923 //
924 //- Data header
925 //    u32 :4 :                 Bit size of unit
926 //        compType:4          Compression type( = 8)
927 //        destSize:24         Data size after decompression
928 //
929 //- Return value: None
930 //----------------------------------------------------------------------
931 
// Reverses the differential filter one 32-bit word per iteration, using word
// loads and stores only. Bit 0 of the header selects the unit size: set = 8-bit
// units (@L1), clear = 16-bit units (@bs16).
//
// Register usage:
//   r0 : srcp, post-incremented by 4
//   r1 : destp, post-incremented by 4
//   r2 : input word just loaded
//   r3 : intermediate sum
//   r4 : header, then the output word (its top unit carries the sum into the next iteration)
//   r5 : unit mask (0xff or 0xffff)
//   r6 : remaining output byte count (header word >> 8)
// Both loops are do-while loops and always handle whole words.
MI_UnfilterDiff32(register const void * srcp,register void * destp)932 asm void MI_UnfilterDiff32( register const void *srcp,register void *destp )
933 {
934                 stmdb   sp!, {r4-r6}
935 
936                 ldr     r4, [r0], #4            // load header
937                 mov     r5, #0xff               // byte mask
938                 mov     r6, r4, lsr #8          // Dest. count
939                 tst     r4, #1                  // Bitsize 8 or 16? (bit 0 set = 8-bit units)
940                 mov     r4, #0                  // R4 - accumulator (mov without S keeps the tst flags)
941                 beq     @bs16
942 
943 @L1             ldr     r2, [r0], #4            // load data
944 
945                 subs    r6, r6, #4              // decrement dest. count (at this stage to prevent pipeline stall)
946 
947                 add     r4, r2, r4, lsr #24     // 1st sum (last byte of the previous output word + byte 0)
948                 and     r4, r4, r5              // mask
949 
950                 add     r3, r4, r2, lsr #8      // 2nd sum
951                 and     r3, r3, r5              // mask
952                 orr     r4, r4, r3, lsl #8      // accumulate
953 
954                 add     r3, r3, r2, lsr #16     // 3rd sum
955                 and     r3, r3, r5              // mask
956                 orr     r4, r4, r3, lsl #16     // accumulate
957 
958                 add     r3, r3, r2, lsr #24     // 4th sum
959                 and     r3, r3, r5              // mask
960                 orr     r4, r4, r3, lsl #24     // accumulate
961 
962                 str     r4, [r1], #4            // write back
963 
964                 bgt     @L1                     // loop if dest. count not reached (flags from the subs above)
965 
966                 ldmia   sp!, {r4-r6}
967                 bx      lr
968 
969 @bs16
970                 orr     r5, r5, r5, lsl #8      // extend mask to 16 bit
971 
972 @L2             ldr     r2, [r0], #4            // load data
973 
974                 subs    r6, r6, #4              // decrement Dest. count (at this stage to prevent pipeline stall)
975 
976                 add     r4, r2, r4, lsr #16     // 1st sum (upper halfword of the previous output word + halfword 0)
977                 and     r4, r4, r5              // mask
978 
979                 add     r3, r4, r2, lsr #16     // 2nd sum
980                 and     r3, r3, r5              // mask
981                 orr     r4, r4, r3, lsl #16     // accumulate
982 
983                 str     r4, [r1], #4            // write back
984 
985                 bgt     @L2                     // loop if dest. count not reached (flags from the subs above)
986 
987                 ldmia   sp!, {r4-r6}
988                 bx      lr
989 }
990 
991 //----------------------------------------------------------------------
992 //          8-bit expansion of the differential filter conversion
993 //
994 //- Converts the differential filter, writing in 8 bit units.
995 //- On NITRO, data cannot be written directly into VRAM
996 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
997 //
998 //- Use 4 byte alignment for the source address.
999 //
1000 //Arguments:
1001 //    void *srcp :              source address
1002 //    void *destp :            destination address
1003 //    u32  size               Source size
1004 //    BOOL bitsize            Differential size (TRUE: 16bit, FALSE: 8bit)
1005 //
1006 //- Data header
1007 //    u32 :4 :                 Bit size of unit
1008 //        compType:4          Compression type( = 8)
1009 //        destSize:24         Data size after decompression
1010 //
1011 //- Return value: None
1012 //----------------------------------------------------------------------
1013 
// Applies the differential filter: writes a 4-byte header to destp, then stores
// for every input unit the difference to the preceding unit (the first unit is
// differenced against 0). bitsize bit 0 selects 8-bit (clear) or 16-bit (set) units.
//
// Register usage:
//   r0 : srcp, post-incremented per unit
//   r1 : destp, post-incremented per unit
//   r2 : size, counted down to 0
//   r3 : bitsize, then the unit just loaded
//   r4 : header word, then the previous unit
//   r5 : unit mask (0xff or 0xffff)
//   r6 : masked difference to store
// Both loops are do-while loops, so at least one unit is always processed.
MI_FilterDiff8(register const void * srcp,register void * destp,register u32 size,register BOOL bitsize)1014 asm void MI_FilterDiff8(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize)
1015 {
1016                 stmdb   sp!, {r4-r6}
1017 
1018                 add     r4, r3, #1              // u32 :4 : Bit size of unit = bitsize + 1
1019                 orr     r4, r4, #0x80           // compType:4 : Compression type nibble = 8 (0x80)
1020                 orr     r4, r4, r2, lsl #8      // destSize:24 :  Data size before compression
1021                 str     r4, [r1], #4            // write header
1022                 mov     r4, #0                  // 1st subtrahend
1023                 mov     r5, #0xff               // byte mask
1024                 tst     r3, #1                  // Bitsize 8 or 16? (bit 0 set = 16-bit units)
1025                 bne     @bs16
1026 
1027 @L1             ldrb    r3,[r0],#1              // load data
1028 
1029                 subs    r2, r2, #1              // decrement remaining size (at this stage to prevent pipeline stall)
1030 
1031                 sub     r4, r3, r4              // diff = current - previous
1032                 and     r6, r4, r5              // mask
1033 
1034                 mov     r4, r3                  // set as previous value
1035 
1036                 strb    r6, [r1], #1            // write back
1037 
1038                 bgt     @L1                     // loop if size not reached (flags from the subs above)
1039 
1040                 ldmia   sp!, {r4-r6}
1041                 bx      lr
1042 
1043 @bs16
1044                 orr     r5, r5, r5, lsl #8      // extend mask to 16 bit
1045 
1046 @L2             ldrh    r3,[r0],#2              // load data
1047 
1048                 subs    r2, r2, #2              // decrement remaining size (at this stage to prevent pipeline stall)
1049 
1050                 sub     r4, r3, r4              // 1st diff = current - previous
1051                 and     r6, r4, r5              // mask
1052 
1053                 mov     r4, r3                  // set as previous value
1054 
1055                 strh    r6, [r1], #2            // write back
1056 
1057                 bgt     @L2                     // loop if size not reached (flags from the subs above)
1058 
1059                 ldmia   sp!, {r4-r6}
1060                 bx      lr
1061 }
1062 
1063 //----------------------------------------------------------------------
1064 //          16-bit expansion of the differential filter conversion
1065 //
1066 //- Converts the differential filter, writing in 16 bit units.
1067 //- Can be decompressed to the data TCM and VRAM
1068 //- Higher speed than MI_FilterDiff8()
1069 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
1070 //
1071 //- Use 4 byte alignment for the source address.
1072 //
1073 //Arguments:
1074 //    void *srcp :              source address
1075 //    void *destp :            destination address
1076 //    u32  size               Source size
1077 //    BOOL bitsize            Differential size (TRUE: 16bit, FALSE: 8bit)
1078 //
1079 //- Data header
1080 //    u32 :4 :                 Bit size of unit
1081 //        compType:4          Compression type( = 8)
1082 //        destSize:24         Data size after decompression
1083 //
1084 //- Return value: None
1085 //----------------------------------------------------------------------
1086 
// Applies the differential filter using only halfword loads and stores: writes a
// 4-byte header to destp, then stores the difference of every unit to the
// preceding unit. bitsize bit 0 selects 8-bit (clear) or 16-bit (set) units; in
// the 8-bit case two byte units are handled per halfword.
//
// Register usage:
//   r0 : srcp, post-incremented by 2
//   r1 : destp, post-incremented by 2
//   r2 : size, counted down to 0
//   r3 : bitsize, then the halfword just loaded
//   r4 : header word, then the previous halfword / scratch
//   r5 : unit mask (0xff or 0xffff)
//   r6 : halfword of differences to store
MI_FilterDiff16(register const void * srcp,register void * destp,register u32 size,register BOOL bitsize)1087 asm void MI_FilterDiff16(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize)
1088 {
1089                 stmdb   sp!, {r4-r6}
1090 
1091                 add     r4, r3, #1              // u32 :4 : Bit size of unit = bitsize + 1
1092                 orr     r4, r4, #0x80           // compType:4 : Compression type nibble = 8 (0x80)
1093                 orr     r4, r4, r2, lsl #8      // destSize:24 :  Data size before compression
1094                 str     r4, [r1], #4            // write header
1095                 mov     r4, #0                  // 1st subtrahend
1096                 mov     r5, #0xff               // byte mask
1097                 tst     r3, #1                  // Bitsize 8 or 16? (bit 0 set = 16-bit units)
1098                 bne     @bs16
1099 
1100 @L1             ldrh    r3,[r0],#2              // load data
1101 
1102                 subs    r2, r2, #2              // decrement remaining size (at this stage to prevent pipeline stall)
1103 
1104                 sub     r4, r3, r4, lsr #8              // 1st diff (low byte - high byte of the previous halfword)
1105                 and     r6, r4, r5              // mask
1106 
1107                 rsb     r4, r3, r3, lsr #8      // 2nd diff (high byte - low byte)
1108                 and     r4, r4, r5              // mask
1109                 orr     r6, r6, r4, lsl #8      // accumulate
1110 
1111                 mov     r4, r3                  // set as previous value
1112 
1113                 strh    r6, [r1], #2            // write back
1114 
1115                 bgt     @L1                     // loop if size not reached (flags from the subs above)
1116 
1117                 ldmia   sp!, {r4-r6}
1118                 bx      lr
1119 
1120 @bs16
1121                 orr     r5, r5, r5, lsl #8      // extend mask to 16 bit
1122 
1123 @L2             ldrh    r3,[r0],#2              // load data
1124 
1125                 subs    r2, r2, #2              // decrement remaining size (at this stage to prevent pipeline stall)
1126 
1127                 sub     r4, r3, r4              // 1st diff = current - previous
1128                 and     r6, r4, r5              // mask
1129 
1130                 mov     r4, r3                  // set as previous value
1131 
1132                 strh    r6, [r1], #2            // write back
1133 
1134                 bgt     @L2                     // loop if size not reached (flags from the subs above)
1135 
1136                 ldmia   sp!, {r4-r6}
1137                 bx      lr
1138 }
1139 
1140 //----------------------------------------------------------------------
1141 //          32-bit expansion of the differential filter conversion
1142 //
1143 //- Converts the differential filter, writing in 32 bit units.
1144 //- Higher speed than MI_FilterDiff16()
1145 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
1146 //
1147 //- Use 4 byte alignment for the source address.
1148 //
1149 //Arguments:
1150 //    void *srcp :              source address
1151 //    void *destp :            destination address
1152 //    u32  size               Source size
1153 //    BOOL bitsize            Differential size (TRUE: 16bit, FALSE: 8bit)
1154 //
1155 //- Data header
1156 //    u32 :4 :                 Bit size of unit
1157 //        compType:4          Compression type( = 8)
1158 //        destSize:24         Data size after decompression
1159 //
1160 //- Return value: None
1161 //----------------------------------------------------------------------
1162 
// Applies the differential filter one 32-bit word per iteration, using word loads
// and stores only: writes a 4-byte header to destp, then stores the difference
// of every unit to the preceding unit. bitsize bit 0 selects 8-bit (clear) or
// 16-bit (set) units.
//
// Register usage:
//   r0 : srcp, post-incremented by 4
//   r1 : destp, post-incremented by 4
//   r2 : size, counted down to 0
//   r3 : bitsize, then the word just loaded / scratch
//   r4 : header word, then the upper halfword of the previous input word
//   r5 : unit mask (0xff or 0xffff)
//   r6 : word of differences to store
// Both loops are do-while loops and always handle whole words.
MI_FilterDiff32(register const void * srcp,register void * destp,register u32 size,register BOOL bitsize)1163 asm void MI_FilterDiff32(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize)
1164 {
1165                 stmdb   sp!, {r4-r6}
1166 
1167                 add     r4, r3, #1              // u32 :4 : Bit size of unit = bitsize + 1
1168                 orr     r4, r4, #0x80           // compType:4 : Compression type nibble = 8 (0x80)
1169                 orr     r4, r4, r2, lsl #8      // destSize:24 :  Data size before compression
1170                 str     r4, [r1], #4            // write header
1171                 mov     r4, #0                  // 1st subtrahend
1172                 mov     r5, #0xff               // byte mask
1173                 tst     r3, #1                  // Bitsize 8 or 16? (bit 0 set = 16-bit units)
1174                 bne     @bs16
1175 
1176 @L1             ldr    r3,[r0],#4               // load data
1177 
1178                 subs    r2, r2, #4              // decrement remaining size (at this stage to prevent pipeline stall)
1179 
1180                 sub     r4, r3, r4, lsr #8      // 1st diff (byte 0 - last byte of the previous word)
1181                 and     r6, r4, r5              // mask
1182 
1183                 rsb     r4, r3, r3, lsr #8      // 2nd diff (byte 1 - byte 0)
1184                 and     r4, r4, r5              // mask
1185                 orr     r6, r6, r4, lsl #8      // accumulate
1186 
1187                 mov     r4, r3, lsr #16         // shift higher halfword down (kept in r4 for the next iteration)
1188 
1189                 sub     r3, r4, r3, lsr #8      // 3rd diff (byte 2 - byte 1)
1190                 and     r3, r3, r5              // mask
1191                 orr     r6, r6, r3, lsl #16     // accumulate
1192 
1193                 rsb     r3, r4, r4, lsr #8      // 4th diff (byte 3 - byte 2)
1194                 and     r3, r3, r5              // mask
1195                 orr     r6, r6, r3, lsl #24     // accumulate
1196 
1197                 str     r6, [r1], #4            // write back
1198 
1199                 bgt     @L1                     // loop if size not reached (flags from the subs above)
1200 
1201                 ldmia   sp!, {r4-r6}
1202                 bx      lr
1203 
1204 @bs16
1205                 orr     r5, r5, r5, lsl #8      // extend mask to 16 bit
1206 
1207 @L2             ldr     r3,[r0],#4              // load data
1208 
1209                 subs    r2, r2, #4              // decrement remaining size (at this stage to prevent pipeline stall)
1210 
1211                 sub     r4, r3, r4              // 1st diff (lower halfword - upper halfword of the previous word)
1212                 and     r6, r4, r5              // mask
1213 
1214                 mov     r4, r3, lsr #16         // shift higher halfword down (kept in r4 for the next iteration)
1215 
1216                 rsb     r3, r3, r3, lsr #16     // 2nd diff (upper halfword - lower halfword)
1217                 and     r3, r3, r5              // mask
1218                 orr     r6, r6, r3, lsl #16     // accumulate
1219 
1220                 str     r6, [r1], #4            // write back
1221 
1222                 bgt     @L2                     // loop if size not reached (flags from the subs above)
1223 
1224                 ldmia   sp!, {r4-r6}
1225                 bx      lr
1226 }
1227 
1228 //---- end limitation of processer mode
1229 #include <nitro/codereset.h>
1230