1 /*---------------------------------------------------------------------------*
2 Project: TwlSDK - MI -
3 File: mi_uncompress.c
4
5 Copyright 2003-2008 Nintendo. All rights reserved.
6
7 These coded instructions, statements, and computer programs contain
8 proprietary information of Nintendo of America Inc. and/or Nintendo
9 Company Ltd., and are protected by Federal copyright law. They may
10 not be disclosed to third parties or copied or duplicated in any form,
11 in whole or in part, without the prior written consent of Nintendo.
12
13 $Date:: 2008-09-17#$
14 $Rev: 8556 $
15 $Author: okubata_ryoma $
16
17 *---------------------------------------------------------------------------*/
18
19 #include <nitro/types.h>
20 #include <nitro/mi/uncompress.h>
21
22 //****Bug fix****
23 // Because halfword access instructions such as ldrh and strh are not passed through by the inline assembler due to a CW bug, instruction values are written directly using dcd in order to avoid this bug.
24 //
25 // When the bug is fixed, the 'define' below will be removed.
26 //#define CW_BUG_FOR_LDRH_AND_STRH
27
28 #define UNCOMPRESS_RL16_CODE32
29
30 //---- This code will be compiled in ARM-Mode
31 #include <nitro/code32.h>
32
33 //======================================================================
34 // Expanding compressed data
35 //======================================================================
36 //----------------------------------------------------------------------
37 // Expanding bit compressed data
38 //
39 //- Unpacks data padded with bits fixed to 0.
40 //- Align the destination address to a 4-byte boundary.
41 //
42 //Arguments:
43 // void *srcp : source address
44 // void *destp : destination address
45 // MIUnpackBitsParam *paramp : Address of MIUnpackBitsParam structure
46 //
47 //MIUnpackBitsParam Structure
48 // u16 srcNum: Number of bytes of source data
49 // u8 srcBitNum: Number of bits per source data
50 // u8 destBitNum: Number of bits per destination data
51 // u32 destOffset:31 : Offset number to add to source data.
52 // destOffset0_On:1 : Flag for whether to add an offset to 0 data.
53 //
54 //- Return value: None
55 //----------------------------------------------------------------------
56
// MI_UnpackBits (ARM mode): for every source byte, takes each srcBitNum-bit
// field in turn (lowest bits first), optionally adds destOffset, and packs the
// results destBitNum bits apart into 32-bit words written to destp.
//   In : r0 = srcp, r1 = destp, r2 = paramp
//   Register roles set up below:
//     r2  destBitNum (once the parameter block has been read)
//     r3  destBitCount      r4  srcBitCount       r5  srcMask
//     r6  srcBitNum         r7  srcNum            r8  destOffset0_On
//     r9  srcTmp            r10 8 - srcBitNum     r11 scratch
//     r12 destTmp           r14 destBak           [sp, #0] destOffset
//   A word is stored only when destBitCount reaches 32; bits still held in
//   destBak when srcNum runs out are not written.
MI_UnpackBits(register const void * srcp,register void * destp,register MIUnpackBitsParam * paramp)57 asm void MI_UnpackBits( register const void *srcp, register void *destp, register MIUnpackBitsParam *paramp )
58 {
59 stmfd sp!, {r4-r11, lr}
60 sub sp, sp, #4 // reserve one stack word; it holds destOffset
61
62 ldrh r7, [r2, #0] // r7: srcNum = unPackBitsParamp->srcNum
63
64 @00: ldrb r6, [r2, #2] // r6: srcBitNum = unPackBitsParamp->srcBitNum (nothing in this function branches to @00)
65 rsb r10, r6, #8 // r10: srcBitNumInv = 8 - srcBitNum
66 mov r14, #0 // r14: destBak = 0
67 ldr r11, [r2, #4] // r8: destOffset0_On
68 mov r8, r11, lsr #31 // = unPackBitsParamp->destOffset0_On (bit 31 of the word at offset 4)
69 ldr r11,[r2, #4] // destOffset = unPackBitsParamp->destOffset
70 mov r11,r11, lsl #1 // shifting left then right by 1 clears bit 31,
71 mov r11,r11, lsr #1 // leaving the low 31 bits (destOffset)
72 str r11,[sp, #0] // keep destOffset in the reserved stack word
73 ldrb r2, [r2, #3] // r2: destBitNum = unPackBitsParamp->destBitNum (r2 no longer points at the parameters)
74 mov r3, #0 // r3: destBitCount = 0
75
76 @01: subs r7, r7, #1 // while (--srcNum >= 0) {
77 blt @06
78
79 mov r11, #0xff // r5: srcMask = 0xff >> srcBitNumInv;
80 mov r5, r11, asr r10 // (selects the lowest srcBitNum bits)
81 ldrb r9, [r0], #1 // r9: srcTmp = *srcp++;
82 mov r4, #0 // r4: srcBitCount = 0;
83
84 @02: cmp r4, #8 // while (srcBitCount < 8) {
85 bge @01
86
87 and r11, r9, r5 // r12: destTmp = ((srcTmp & srcMask) >>srcBitCount);
88 movs r12, r11, lsr r4 // (movs: Z is set when destTmp == 0)
89 cmpeq r8, #0 // if (destTmp != 0 || destOffset0_On) {
90 beq @04 // (branch skips the offset only when destTmp == 0 and the flag is clear)
91
92 @03: ldr r11, [sp, #0] // destTmp += destOffset;
93 add r12, r12, r11 // }
94 @04: orr r14, r14, r12, lsl r3 // destBak |= destTmp << destBitCount;
95 add r3, r3, r2 // destBitCount += destBitNum;
96
97 cmp r3, #0x20 // if (destBitCount >= 32) {
98 blt @05
99
100 str r14, [r1], #4 // *destp++ = destBak;
101 mov r14, #0 // destBak = 0;
102 mov r3, #0 // destBitCount = 0;
103 // }
104 @05: mov r5, r5, lsl r6 // srcMask <<= srcBitNum;
105 add r4, r4, r6 // srcBitCount += srcBitNum;
106 b @02 // }
107 // }
108 @06: add sp, sp, #4 // release the destOffset stack word
109 ldmfd sp!, {r4-r11, lr}
110 bx lr
111 }
112
113 //----------------------------------------------------------------------
114 // 8-bit decompression of LZ77 compressed data
115 //
116 //* Expands LZ77-compressed data and writes it in 8-bit units.
117 //- Cannot decompress directly into VRAM
118 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
119 //
120 //- Use 4 byte alignment for the source address.
121 //
122 //Arguments:
123 // void *srcp : source address
124 // void *destp : destination address
125 //
126 //- Data header
127 // u32 :4 : Reserved
128 // compType:4 Compression type( = 1)
129 // destSize:24 Data size after decompression
130 //
131 //- Flag data format
132 // u8 flags Compression/no compression flag
133 // (0, 1) = (not compressed, compressed)
134 //- Code data format (Big Endian)
135 // u16 length:4 Decompressed data length - 3 (only compress when the match length is 3 bytes or greater)
136 // offset:12 Match data offset - 1
137 //
138 //- Return value: None
139 //----------------------------------------------------------------------
140
// MI_UncompressLZ8 (ARM mode): LZ77 decoder that emits one byte at a time.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r2  destCount (bytes still to produce)   r3  length of the current match
//     r4  loop counter over the 8 flag bits    r5, r6  scratch
//     r7  isExFormat                           r12 offset (distance back from destp)
//     r14 flags (tested at bit 7, shifted left after each item)
//   isExFormat is 1 when the low 4 bits of the header word are non-zero; it
//   selects the variable-width length encoding decoded between @23 and @23_2.
//   Every output byte is written with swpb instead of strb (the
//   "byte-writing countermeasure" noted on those lines).
MI_UncompressLZ8(register const void * srcp,register void * destp)141 asm void MI_UncompressLZ8( register const void *srcp, register void *destp )
142 {
143 stmfd sp!, {r4-r7, lr}
144
145 ldr r5, [r0], #4 // r5: header = *(u32 *)srcp; r0: srcp += 4
146 mov r2, r5, lsr #8 // r2: destCount = header >> 8
147 mov r7, #0
148 tst r5, #0x0F // r7: isExFormat = (header & 0x0F)? 1 : 0
149 movne r7, #1
150
151 @21: cmp r2, #0 // while (destCount > 0) {
152 ble @26
153
154 ldrb r14, [r0], #1 // r14: flags = *srcp++
155 mov r4, #8 // r4: i = 8
156 @22: subs r4, r4, #1 // for ( i = 8; --i >= 0; ) {
157 blt @21
158
159 tst r14, #0x80 // if (!(flags & 0x80)) {
160 bne @23
161
162 ldrb r6, [r0], #1 // r6 = *srcp++;
163 swpb r6, r6, [r1] // *destp = r6; (Byte-writing countermeasure)
164 add r1, r1, #1 // destp++;
165 sub r2, r2, #1 // destCount--;
166 b @25
167 // } else {
168 @23: ldrb r5, [r0, #0] // r5: first code byte (srcp not advanced yet); high nibble is the length field
169 cmp r7, #0 // if ( ! isExFormat ) { length = (*srcp >> 4) + 3; }
170 moveq r6, #3 // r6: constant part of length
171 beq @23_2
172 // else {
173 tst r5, #0xE0 // if ( (*srcp >> 4) > 1 ) {
174 movne r6, #1 // length = (*srcp >> 4) + 1;
175 bne @23_2 // } else {
176
177 add r0, r0, #1 // isWide = ((*srcp >> 4) == 1)? 1 : 0;
178 and r6, r5, #0xf // length = (*srcp++ & 0x0F) << 4
179 mov r6, r6, lsl #4
180 tst r5, #0x10 // bit 4 set <=> the high nibble was 1 (it is 0 or 1 on this path)
181 beq @23_1 // if ( isWide ) {
182
183 mov r6, r6, lsl #8 // length <<= 8;
184 ldrb r5, [r0], #1 // length += (*srcp++) << 4;
185 add r6, r6, r5, lsl #4
186 add r6, r6, #0x100 // length += 0xFF + 1;
187 @23_1: // }
188 add r6, r6, #0x11 // length += 0xF + 2;
189 ldrb r5, [r0, #0] // r5: byte whose high nibble completes length and whose low nibble starts offset
190 @23_2: // }
191 add r3, r6, r5, asr #4 // r3: length = r6 + (*srcp >> 4); }
192 add r0, r0, #1 // r12: offset = (*srcp++ & 0x0f) << 8;
193 and r5, r5, #0xf
194 mov r12,r5, lsl #8
195 ldrb r6, [r0], #1 // offset = (offset | *srcp++) + 1;
196 orr r5, r6, r12
197 add r12,r5, #1
198 sub r2, r2, r3 // destCount -= length;
199 // do {
200 @24: ldrb r5, [r1, -r12] // *destp++ = destp[-offset]
201 swpb r5, r5, [r1] // (Byte-writing countermeasure)
202 add r1, r1, #1
203 subs r3, r3, #1 // } while (--length > 0);
204 bgt @24
205 // }
206 @25: cmp r2, #0 // if (destCount <= 0) break;
207 movgt r14, r14, lsl #1 // flags <<= 1
208 bgt @22 // }
209 b @21 // } (the test at @21 then exits the outer loop)
210
211 @26: ldmfd sp!, {r4-r7, lr}
212 bx lr
213 }
214
215
216 //----------------------------------------------------------------------
217 // 16-bit decompression of LZ77 compressed data
218 //
219 //* Expands LZ77-compressed data and writes it in 16-bit units.
220 //* Although it can also expand in data TCM and main memory, it is slower than MI_UncompressLZ77().
221 //
222 //* For compressed data, search for a matching character string from a minimum of 2 bytes previous.
223 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
224 //
225 //- Use 4 byte alignment for the source address.
226 //
227 //Arguments:
228 // void *srcp : source address
229 // void *destp : destination address
230 //
231 //- Data header
232 // u32 :4 : Reserved
233 // compType:4 Compression type( = 1)
234 // destSize:24 Data size after decompression
235 //
236 //- Flag data format
237 // u8 flags Compression/no compression flag
238 // (0, 1) = (not compressed, compressed)
239 //- Code data format (Big Endian)
240 // u16 length:4 Decompressed data length - 3 (only compress when the match length is 3 bytes or greater)
241 // offset:12 : Match data offset ( >= 2) - 1
242 //
243 //- Return value: None
244 //----------------------------------------------------------------------
245
// MI_UncompressLZ16 (ARM mode): LZ77 decoder that pairs output bytes and
// stores them with halfword writes only.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r2  shift (0 or 8: lane of the next output byte inside destTmp)
//     r3  destTmp (halfword being assembled)      r4  offset
//     r5  length      r6  flags      r7  loop counter over the 8 flag bits
//     r8, r9 scratch  r10 destCount  r11 isExFormat
//     r14 offset0_8 (0 or 8: lane of the wanted byte inside the halfword read back)
//   r3 is stored each time shift toggles back to 0; a byte still pending in r3
//   when the function exits is not stored.
//   The dcd words in the #else branches are the hand-encoded forms of the
//   instruction in the matching #ifndef branch (see the CW bug note at the top
//   of the file).
MI_UncompressLZ16(register const void * srcp,register void * destp)246 asm void MI_UncompressLZ16( register const void *srcp, register void *destp )
247 {
248 stmfd sp!, {r4-r11, lr}
249
250 mov r3, #0 // r3: destTmp = 0
251 ldr r8, [r0], #4 // r8: header = *(u32 *)srcp; r0: srcp += 4
252 mov r10, r8, lsr #8 // r10: destCount = header >> 8
253 mov r2, #0 // r2: shift = 0
254 mov r11, #0
255 tst r8, #0x0F // r11: isExFormat = (header & 0x0F)? 1 : 0;
256 movne r11, #1
257
258 @31: cmp r10, #0 // while (destCount > 0) {
259 ble @36
260
261 ldrb r6, [r0], #1 // r6: flags = *srcp++;
262 mov r7, #8 // for ( i = 8; --i >= 0; ) {
263 @32: subs r7, r7, #1
264 blt @31
265
266 tst r6, #0x80 // if (!(flags & 0x80)) {
267 bne @33
268
269 ldrb r9, [r0], #1 // destTmp |= *srcp++ << shift;
270 orr r3, r3, r9, lsl r2
271 sub r10, r10, #1 // destCount--;
272
273 eors r2, r2, #8 // if (!(shift ^= 8)) {
274 #ifndef CW_BUG_FOR_LDRH_AND_STRH
275 streqh r3, [r1], #2 // *destp++ = destTmp;
276 #else
277 dcd 0x00c130b2 // streqh r3, [r1], #2
278 #endif
279 moveq r3, #0 // destTmp = 0;
280 b @35 // } else {
281
282 @33: ldrb r9, [r0, #0] // r9: first code byte (srcp not advanced yet); high nibble is the length field
283 cmp r11, #0 // if ( ! isExFormat ) { length = (*srcp >> 4) + 3; }
284 moveq r8, #3 // r8: constant part of length
285 beq @33_2
286 // else {
287 tst r9, #0xE0 // if ( (*srcp >> 4) > 1 ) {
288 movne r8, #1 // length = (*srcp >> 4) + 1
289 bne @33_2 // } else {
290
291 add r0, r0, #1 // isWide = ((*srcp >> 4) == 1)? 1 : 0;
292 and r8, r9, #0xf // length = (*srcp++ & 0x0F) << 4
293 mov r8, r8, lsl #4
294 tst r9, #0x10 // bit 4 set <=> the high nibble was 1 (it is 0 or 1 on this path)
295 beq @33_1 // if ( isWide ) {
296
297 mov r8, r8, lsl #8 // length <<= 8;
298 ldrb r9, [r0], #1 // length += (*srcp++) << 4
299 add r8, r8, r9, lsl #4
300 add r8, r8, #0x100 // length += 0xFF + 1
301 @33_1: // }
302 add r8, r8, #0x11 // length += 0xF + 2;
303 ldrb r9, [r0, #0] // r9: byte whose high nibble completes length
304 @33_2: // }
305 add r5, r8, r9, asr #4 // r5: length = r8 + (*srcp >> 4); }
306 ldrb r9, [r0], #1 // r4: offset = (*srcp++ & 0x0f) << 8;
307 and r8, r9, #0xf
308 mov r4, r8, lsl #8
309 ldrb r9, [r0], #1 // offset = (offset | *srcp++) + 1;
310 orr r8, r9, r4
311 add r4, r8, #1
312 rsb r8, r2, #8 // r14: offset0_8 = (8 - shift)
313 and r9, r4, #1 // ^ ((offset & 1) << 3);
314 eor r14, r8, r9, lsl #3
315 sub r10, r10, r5 // destCount -= length;
316 // do {
317 @34: eor r14, r14, #8 // offset0_8 ^= 8; (alternate between the two byte lanes)
318 rsb r8, r2, #8 // r8 = (offset + ((8 - shift) >> 3)) & ~1:
319 add r8, r4, r8, lsr #3 // even byte distance back from destp to the
320 mov r8, r8, lsr #1 // halfword that contains the source byte
321 mov r8, r8, lsl #1
322 #ifndef CW_BUG_FOR_LDRH_AND_STRH
323 ldrh r9, [r1, -r8] // r9 = *(u16 *)((u8 *)destp - r8)
324 #else
325 dcd 0xe11190b8 // ldrh r9, [r1, -r8]
326 #endif
327 mov r8, #0xff // r8 = (r9 & (0xff << offset0_8)) >> offset0_8
328 and r8, r9, r8, lsl r14 // (extract the selected byte of the halfword)
329 mov r8, r8, asr r14
330 orr r3, r3, r8, lsl r2 // destTmp |= byte << shift;
331 eors r2, r2, #8 // if (!(shift ^= 8)) {
332 #ifndef CW_BUG_FOR_LDRH_AND_STRH
333 streqh r3, [r1], #2 // *destp++ = destTmp;
334 #else
335 dcd 0x00c130b2 // streqh r3, [r1], #2
336 #endif
337 moveq r3, #0 // destTmp = 0;
338 // }
339 subs r5, r5, #1 // } while (--length > 0);
340 bgt @34 // }
341
342 @35: cmp r10, #0 // if (destCount <= 0) break;
343 movgt r6, r6, lsl #1 // flags <<= 1
344 bgt @32 // }
345 b @31 // } (the test at @31 then exits the outer loop)
346
347 @36: ldmfd sp!, {r4-r11, lr}
348 bx lr
349 }
350
351 //----------------------------------------------------------------------
352 // Decompression of Huffman compressed data
353 //
354 //- Decompresses Huffman compressed data, writing in 32 bit units.
355 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
356 //
357 //- Use 4 byte alignment for the source address.
358 //
359 //Arguments:
360 // void *srcp : source address
361 // void *destp : destination address
362 //
363 //- Data header
364 // u32 bitSize:4 1 data bit size (Normally 4|8)
365 // compType:4 Compression type( = 2)
366 // destSize:24 Data size after decompression
367 //
368 //- Tree table
369 // u8 treeSize Tree table size/2 - 1
370 // TreeNodeData nodeRoot Root node
371 //
372 // TreeNodeData nodeLeft Root left node
373 // TreeNodeData nodeRight Root right node
374 //
375 // TreeNodeData nodeLeftLeft Left left node
376 // TreeNodeData nodeLeftRight Left right node
377 //
378 // TreeNodeData nodeRightLeft Right left node
379 // TreeNodeData nodeRightRight Right right node
380 //
381 // .
382 // .
383 //
384 // The compressed data itself follows
385 //
386 //- TreeNodeData structure
387 // u8 nodeNextOffset:6 : Offset to the next node data - 1 (2 byte units)
388 // rightEndFlag:1 Right node end flag
389 // leftEndFlag:1 Left node end flag
390 // When end flag is set, there is data in next node.
391 //
392 //
393 //- Return value: None
394 //----------------------------------------------------------------------
395
// MI_UncompressHuffman (ARM mode): walks the tree table one source bit at a
// time and stores decoded symbols packed into 32-bit words.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r0  srcp (after setup: points at the bit stream, read 32 bits at a time)
//     r2  treep (current node)      r3  destTmp        r4  DataBit
//     r5  srcTmp (next bit is bit 31)                   r6  treeCheck
//     r7  treeStartp (root node)    r8  srcCount       r9  treeShift
//     r10, r11 scratch              r12 destCount      r14 destTmpCount
//     [sp, #0] destTmpDataNum = 4 + (DataBit & 7), i.e. 4 when DataBit is 8
//              and 8 when DataBit is 4
MI_UncompressHuffman(register const void * srcp,register void * destp)396 asm void MI_UncompressHuffman( register const void *srcp, register void *destp )
397 {
398 stmfd sp!, {r4-r11, lr}
399 sub sp, sp, #4 // reserve one stack word; it holds destTmpDataNum
400
401 add r2, r0, #4 // r2: *treep = (u8 *)srcp + 4
402 add r7, r2, #1 // r7: *treeStartp = treep + 1
403 ldrb r10,[r0, #0] // r4: DataBit = *(u8 *)srcp & 0x0f
404 and r4, r10, #0xf
405 mov r3, #0 // r3: destTmp = 0
406 mov r14,#0 // r14: destTmpCount = 0
407 and r10,r4, #7 // destTmpDataNum = 4 + (DataBit & 0x7)
408 add r11,r10, #4
409 str r11,[sp, #0]
410 ldr r10,[r0, #0] // r12: destCount = *srcp >> 8
411 mov r12,r10, lsr #8
412 ldrb r10,[r2, #0] // r0: srcp = (u32 *)(treep + ((*treep + 1) << 1))
413 add r10,r10, #1 // (skips the tree table, whose size byte is *treep)
414 add r0, r2, r10, lsl #1
415 mov r2, r7 // treep = treeStartp
416
417 @11: cmp r12, #0 // while (destCount > 0) {
418 ble @14
419
420 mov r8, #32 // r8: srcCount = 32;
421 ldr r5, [r0], #4 // r5: srcTmp = *srcp++;
422
423 @12: subs r8, r8, #1 // while (--srcCount >= 0) {
424 blt @11
425
426 mov r10,#1 // r9: treeShift = (srcTmp >> 31) & 0x1
427 and r9, r10, r5, lsr #31
428 ldrb r6, [r2, #0] // r6: treeCheck = *treep
429 mov r6, r6, lsl r9 // treeCheck <<= treeShift (moves bit 6 into bit 7 when the source bit is 1)
430 mov r10,r2, lsr #1 // treep = (u8 *)((((u32 )treep>>1) <<1)
431 mov r10,r10,lsl #1 // + (((*treep & 0x3f)+1) <<1)+treeShift)
432 ldrb r11,[r2,#0]
433 and r11,r11,#0x3f
434 add r11,r11,#1
435 add r10,r10,r11,lsl #1
436 add r2, r10,r9
437
438 tst r6, #0x80 // if (treeCheck & TREE_END) { (node bit 7 for a 0 bit, bit 6 for a 1 bit)
439 beq @13
440
441 mov r3, r3, lsr r4 // destTmp >>= DataBit;
442 ldrb r10,[r2, #0] // destTmp |= *treep << (32 - DataBit);
443 rsb r11, r4, #32
444 orr r3, r3, r10, lsl r11
445 mov r2, r7 // treep = treeStartp;
446 add r14,r14,#1 // if (++destTmpCount == destTmpDataNum) {
447 ldr r11,[sp, #0]
448 cmp r14,r11
449
450 streq r3, [r1], #4 // *destp++ = destTmp;
451 subeq r12,r12, #4 // destCount -= 4;
452 moveq r14,#0 // destTmpCount = 0;
453 // }
454 // }
455 @13: cmp r12,#0 // if (destCount <= 0) break;
456 movgt r5, r5, lsl #1 // srcTmp <<= 1;
457 bgt @12 // }
458 b @11 // } (the test at @11 then exits the outer loop)
459
460 @14: add sp, sp, #4 // release the destTmpDataNum stack word
461 ldmfd sp!, {r4-r11, lr}
462 bx lr
463 }
464
465 //----------------------------------------------------------------------
466 // 8-bit decompression of run-length compressed data
467 //
468 //- Decompresses run-length compressed data, writing in 8 bit units.
469 //With NITRO, cannot decompress directly into VRAM
470 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
471 //
472 //- Use 4 byte alignment for the source address.
473 //
474 //Arguments:
475 // void *srcp : source address
476 // void *destp : destination address
477 //
478 //- Data header
479 // u32 :4 : Reserved
480 // compType:4 Compression type( = 3)
481 // destSize:24 Data size after decompression
482 //
483 //- Flag data format
484 // u8 length:7 Decompressed data length - 1 (When not compressed)
485 // Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater)
486 // flag:1 (0, 1) = (not compressed, compressed)
487 //
488 //- Return value: None
489 //----------------------------------------------------------------------
490
// MI_UncompressRL8 (ARM mode): run-length decoder that emits one byte at a
// time, writing each byte with swpb.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r2  length of the current block      r3  scratch / literal byte
//     r4  flags (overwritten by swpb in the @44 loop; reloaded at @41)
//     r5  srcTmp (the repeated byte)       r7  destCount
MI_UncompressRL8(register const void * srcp,register void * destp)491 asm void MI_UncompressRL8( register const void *srcp, register void *destp )
492 {
493 stmfd sp!, {r4, r5, r7}
494
495 ldmia r0!, {r3} // r3: header = *(u32 *)srcp; r0: srcp += 4;
496 mov r7, r3, lsr #8 // r7: destCount = header >> 8;
497
498 @41: cmp r7, #0 // while (destCount > 0) {
499 ble @45
500
501 ldrb r4, [r0], #1 // r4: flags = *srcp++;
502 ands r2, r4, #0x7f // r2: length = flags & 0x7f; (flags set here are replaced by the tst below)
503 tst r4, #0x80 // if (!(flags & 0x80)) {
504 bne @43
505
506 add r2, r2, #1 // length++;
507 sub r7, r7, r2 // destCount -= length;
508 @42: ldrb r3, [r0], #1 // do {
509 swpb r3, r3, [r1] // *destp++ = *srcp++; (Byte-writing countermeasure)
510 add r1, r1, #1
511 subs r2, r2, #1 // } while (--length > 0);
512 bgt @42
513 b @41 // } else {
514
515 @43: add r2, r2, #3 // length += 3;
516 sub r7, r7, r2 // destCount -= length;
517 ldrb r5, [r0], #1 // srcTmp = *srcp++;
518 @44: swpb r4, r5, [r1] // do { (Byte writing countermeasure; r4 receives the old memory byte and is not used)
519 add r1, r1, #1 // *destp++ = srcTmp;
520 subs r2, r2, #1 // } while (--length > 0);
521 bgt @44 // }
522 b @41 // }
523
524 @45: ldmfd sp!, {r4, r5, r7}
525 bx lr
526 }
527
528 //----------------------------------------------------------------------
529 // 16-bit decompression of run-length compressed data
530 //
531 //- Decompresses run-length compressed data, writing in 16 bit units.
532 //- Can be decompressed to the data TCM and VRAM
533 //- When decompressing to the main RAM, it is slower than MI_UncompressRL8()
534 //- When decompressing to data TCM or VRAM, it is faster than MI_UncompressRL8().
535 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
536 //
537 //- Use 4 byte alignment for the source address.
538 //
539 //Arguments:
540 // void *srcp : source address
541 // void *destp : destination address
542 //
543 //- Data header
544 // u32 :4 : Reserved
545 // compType:4 Compression type( = 3)
546 // destSize:24 Data size after decompression
547 //
548 //- Flag data format
549 // u8 length:7 Decompressed data length - 1 (When not compressed)
550 // Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater)
551 // flag:1 (0, 1) = (not compressed, compressed)
552 //
553 //- Return value: None
554 //----------------------------------------------------------------------
555
556 #if !defined(UNCOMPRESS_RL16_CODE32)
557 //---- This code will be compiled in Thumb-Mode
558 #include <nitro/code16.h>
559
// MI_UncompressRL16, Thumb variant (selected when UNCOMPRESS_RL16_CODE32 is
// not defined): run-length decoder that pairs output bytes and stores them
// with halfword writes.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r2  length      r3, r6 scratch      r4  shift (0 or 8)
//     r5  destCount   r7  destTmp (halfword being assembled)
//     [sp, #4] flags  [sp, #8] srcTmp (the repeated byte); [sp, #0] is not used
//   r7 is stored each time shift toggles back to 0; a byte still pending in r7
//   when the function exits is not stored.
MI_UncompressRL16(register const void * srcp,register void * destp)560 asm void MI_UncompressRL16( register const void *srcp, register void *destp )
561 {
562 push {r4-r7}
563 sub sp, #0xc // three stack words of local storage
564
565 mov r7, #0 // r7: destTmp = 0
566 ldmia r0!, {r3} // r3: header = *(u32 *)srcp; r0: srcp += 4
567 lsr r5, r3, #8 // r5: destCount = header >> 8
568
569 mov r4, #0 // r4: shift = 0
570 @51: cmp r5, #0 // while (destCount > 0) {
571 ble @57
572
573 ldrb r3, [r0, #0] // flags = *srcp++;
574 str r3, [sp, #4]
575 add r0, #1
576 ldr r3, [sp, #4] // r2: length = flags & 0x7f;
577 lsl r2, r3, #25 // (shift left then right by 25 keeps the low 7 bits)
578 lsr r2, r2, #25
579
580 ldr r6, [sp, #4] // if (!(flags & 0x80)) {
581 lsr r3, r6, #8 // (shifting right by 8 moves bit 7 of flags into the carry flag)
582 bcs @54
583
584 add r2, #1 // length++;
585 sub r5, r5, r2 // destCount -= length;
586 // do {
587 @52: ldrb r6, [r0, #0] // destTmp |= *srcp++ << shift;
588 lsl r6, r4
589 orr r7, r6
590 add r0, #1
591 mov r3, #8 // if (!(shift ^= 8)) {
592 eor r4, r3
593 bne @53
594 strh r7, [r1, #0] // *destp++ = destTmp;
595 add r1, #2
596 mov r7, #0 // destTmp = 0;
597 // }
598 @53: sub r2, r2, #1 // } while (--length > 0); (bgt uses the flags of this sub)
599 bgt @52
600 b @51 // } else {
601
602 @54: add r2, #3 // length += 3;
603 sub r5, r5, r2 // destCount -= length;
604 ldrb r6, [r0, #0] // srcTmp = *srcp++;
605 str r6, [sp, #8]
606 add r0, #1
607 // do {
608 @55: ldr r6, [sp, #8] // destTmp |= srcTmp << shift;
609 lsl r6, r4
610 orr r7, r6
611 mov r3, #8 // if (!(shift ^= 8)) {
612 eor r4, r3
613 bne @56
614 strh r7, [r1, #0] // *destp++ = destTmp;
615 add r1, #2
616 mov r7, #0 // destTmp = 0;
617 // }
618 @56: sub r2, r2, #1 // } while (--length > 0); (bgt uses the flags of this sub)
619 bgt @55 // }
620 b @51 // }
621
622 @57: add sp, #0xc // release the local storage
623 pop {r4-r7}
624 bx lr
625 }
626
627 //---- This code will be compiled in ARM-Mode
628 #include <nitro/code32.h>
629 #else // defined(UNCOMPRESS_RL16_CODE32)
// MI_UncompressRL16, ARM variant (selected when UNCOMPRESS_RL16_CODE32 is
// defined): run-length decoder that pairs output bytes and stores them with
// halfword writes.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r2  marker byte, then remaining length of the current block
//     r3  current character
//     r4  write-back accumulator (halfword being assembled)
//     r5  lane shift; toggled between 8 and 0 BEFORE each byte is merged, so
//         0 = byte goes to the low lane, 8 = byte completes the halfword
//     r6  repeated character duplicated into both byte lanes
//     r12 destination count
//   A low-lane byte still pending in r4 when the function exits is not stored.
MI_UncompressRL16(register const void * srcp,register void * destp)630 asm void MI_UncompressRL16( register const void *srcp, register void *destp )
631 {
632 stmdb sp!, {r4-r6}
633
634 ldr r12, [r0], #4 // load header
635 mov r12, r12, lsr #8 // dest. count
636 mov r4, #0 // write back accumulator
637 mov r5, #8 // low-high byte switch and shifter
638
639 @L1 cmp r12, #0 // finished ?
640 ble @L4
641
642 ldrb r2, [r0], #1 // load marker byte
643 ldrb r3, [r0], #1 // load char ie. 1st char
644 // <- still 1 cycle interlock from the 1st LDRB (LDRB/LDRH impose 2 cycles pipeline interlock!)
645 tst r2, #0x80 // check if marker has MSB set
646 bne @L3 // MSB set: run of one repeated character
647 add r2, r2, #1 // path decoding strings of different characters: length = marker + 1
648 sub r12, r12, r2 // decrement dest. count
649
650 @L2 eors r5, r5, #8 // toggle low-high byte switch and shifter (Z set when the new shift is 0)
651 orr r4, r4, r3, lsl r5 // OR in the next byte as either low or high byte
652 strneh r4, [r1], #2 // if halfword is full store it away
653 movne r4, #0 // clear accu
654 subs r2, r2, #1 // decrement counter
655 ldrneb r3, [r0], #1 // preload char for the next loop to have the 2 cycle interlock of LDRB compensated by the pipeline flush imposed by the branch
656 bne @L2
657 b @L1
658
659 @L3 sub r2, r2, #0x7d // path decoding strings of same character: length = (marker - 0x80) + 3
660 sub r12, r12, r2 // decrement dest. count
661
662 eors r5, r5, #8 // toggle low-high byte switch and shifter
663 orr r4, r4, r3, lsl r5 // OR in the next byte as either low or high byte
664 strneh r4, [r1], #2 // if halfword is full store it away
665 movne r4, #0 // and clear accu
666 // (r2 is not decremented for this first byte; the subs #2 / ble / blt sequence below accounts for it)
667 orr r6, r3, r3, lsl #8 // the char extended to halfword
668
669 @L5 subs r2, r2, #2 // sub 2, to handle the repetitive char loop halfword-wise
670 ble @L6 // done or only one more to go, step out to keep the flow for multiple chars as fast as possible
671 strh r6, [r1], #2 // more than one to go -> just store the double char
672 b @L5
673
674 @L6 blt @L1 // done (still using the flags of the subs at @L5)
675 eors r5, r5, #8 // one more to go, so toggle the switch
676 orr r4, r4, r3, lsl r5 // OR in the next byte as either low or high byte
677 strneh r4, [r1], #2 // if halfword is full store it away
678 movne r4, #0 // and clear accu
679 b @L1
680
681 @L4 ldmia sp!, {r4-r6}
682 bx lr
683 }
684 #endif // defined(UNCOMPRESS_RL16_CODE32)
685
686 //----------------------------------------------------------------------
687 // 32-bit decompression of run-length compressed data
688 //
689 //- Decompresses run-length compressed data, writing in 32 bit units.
690 //- Higher speed than MI_UncompressRL8(), MI_UncompressRL16()
691 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
692 //
693 //- Use 4 byte alignment for the source address.
694 //
695 //Arguments:
696 // void *srcp : source address
697 // void *destp : destination address
698 //
699 //- Data header
700 // u32 :4 : Reserved
701 // compType:4 Compression type( = 3)
702 // destSize:24 Data size after decompression
703 //
704 //- Flag data format
705 // u8 length:7 Decompressed data length - 1 (When not compressed)
706 // Decompressed data length - 3 (only compress when the contiguous length is 3 bytes or greater)
707 // flag:1 (0, 1) = (not compressed, compressed)
708 //
709 //- Return value: None
710 //----------------------------------------------------------------------
711
// MI_UncompressRL32 (ARM mode): run-length decoder that collects four output
// bytes in a register and stores them with word writes.
//   In : r0 = srcp, r1 = destp
//   Register roles:
//     r2  marker byte, then remaining length of the current block
//     r3  current character
//     r4  write-back accumulator (word being assembled)
//     r5  shifter: 32, 24, 16, 8 for the 1st..4th byte of a word; rotating a
//         byte right by that amount places it at bit 0, 8, 16, 24
//     r12 destination count
//   Bytes still pending in r4 when the function exits are not stored.
MI_UncompressRL32(register const void * srcp,register void * destp)712 asm void MI_UncompressRL32(register const void *srcp,register void *destp)
713 {
714 stmdb sp!, {r4-r5}
715
716 ldr r12, [r0], #4 // load header
717 mov r12, r12, lsr #8 // dest. count
718 mov r4, #0 // write back accumulator
719 mov r5, #32 // shifter reg. for write back accumulator
720
721 @L1 cmp r12, #0 // finished ?
722 ble @L6
723
724 ldrb r2, [r0], #1 // load marker byte
725 ldrb r3, [r0], #1 // load char ie. 1st char
726 // <- still 1 cycle interlock from the 1st LDRB (LDRB/LDRH impose 2 cycles pipeline interlock!)
727 tst r2, #0x80 // check if marker has MSB set
728 bne @L3 // MSB set: run of one repeated character
729 add r2, r2, #1 // path decoding strings of different characters: length = marker + 1
730 sub r12, r12,r2 // decrement dest. count
731
732 @L2 orr r4, r4, r3, ror r5 // accumulate (LSL R5 would be more obvious, but then the shifter reg. must be counted up and an add. CMP would be needed)
733 subs r5, r5, #8 // decrement shifter
734 beq @L5 // write back branch is only taken every 4th loop, so it's best to jump out (non-taken branch is only 1 cycle compared to 3 for a taken one)
735 subs r2, r2, #1 // decrement counter
736 ldrneb r3, [r0], #1 // preload char for the next loop to have the 2 cycle interlock of LDRB compensated by the pipeline flush imposed by the branch
737 bne @L2
738 b @L1
739
740 @L5 str r4, [r1], #4 // write back
741 mov r5, #32 // restore shifter
742 mov r4, #0 // reset write back accumulator
743 subs r2, r2, #1 // not to jump back saves 2 cycles
744 ldrneb r3, [r0], #1 // preload char
745 bne @L2
746 b @L1
747
748 @L3 sub r2, r2, #0x7d // path decoding strings of same character: length = (marker - 0x80) + 3
749 sub r12, r12,r2 // decrement dest. count
750
751 @L4 orr r4, r4, r3, ror r5 // accumulate (the first word)
752 subs r5, r5, #8 // decrement shifter
753 beq @L7 // if word is full step out
754 subs r2, r2, #1 // if not, check if there are still bytes left
755 bne @L4 // and continue
756 b @L1 // or be done
757
758 @L7 str r4, [r1], #4 // write back (the one word that may have chars from the other path in it)
759 // (r2 still counts the byte that completed this word; the subs #4 / adds #3 pair below accounts for it)
760 orr r4, r3, r3, lsl #8
761 orr r4, r4, r4, lsl #16 // the char extended to word
762
763 @L8 subs r2, r2, #4 // full words are processed now, so decrement counter by 4
764 ble @L9 // if no full words are left jump out
765 str r4, [r1], #4 // otherwise write back
766 b @L8
767
768 @L9 mov r4, #0 // clear accu
769 mov r5, #32 // reset shifter reg.
770 adds r2, r2, #3 // re-adjust counter (0-3)
771 beq @L1 // 0 then done
772
773 @L0 orr r4, r4, r3, ror r5 // accumulate (the max. 3 remaining bytes)
774 sub r5, r5, #8 // decrement shifter
775 subs r2, r2, #1 // decrement counter
776 bne @L0
777 b @L1
778
779 @L6 ldmia sp!, {r4-r5}
780 bx lr
781 }
782
783 //----------------------------------------------------------------------
784 // 8-bit decompression to restore differential filter conversion.
785 //
786 //- Restores a differential filter, writing in 8 bit units.
787 //With NITRO, cannot decompress directly into VRAM
788 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
789 //
790 //- Use 4 byte alignment for the source address.
791 //
792 //Arguments:
793 // void *srcp : source address
794 // void *destp : destination address
795 //
796 //- Data header
797 // u32 :4 : Bit size of unit
798 // compType:4 Compression type( = 3)
799 // destSize:24 Data size after decompression
800 //
801 //- Return value: None
802 //----------------------------------------------------------------------
803
MI_UnfilterDiff8(register const void * srcp,register void * destp)804 asm void MI_UnfilterDiff8( register const void *srcp, register void *destp )
805 {
806 stmfd sp!, {r4}
807
808 ldmia r0, {r2} // r2: destCount = (u32 *)srcp;
809 mov r3, #0 // r3: sum = 0;
810 and r4, r2, #0xF // r4: bitSize = (u32 *)srcp & 0xF;
811 mov r2, r2, lsr #8 // destCount = (u32 *)srcp >> 8;
812 cmp r4, #1 // if (bitSize != 1) {
813 bne @63
814
815 @61 // Difference calculation in units of 8 bits
816 add r0, r0, #3 // srcp += 4;
817 sub r1, r1, #1
818 @62 // do {
819 ldrb r4, [r0, #1]! // tmp = *(srcp++);
820 subs r2, r2, #1 // destCount--; Executes at this position in order to eliminate pipeline stall
821 add r3, r3, r4 // sum += tmp
822 strb r3, [r1, #1]! // *(destp++) = sum;
823 bgt @62 // } while ( destCount > 0 );
824 b @65 // } else {
825
826 @63 // Difference calculation in units of 16 bits
827 add r0, r0, #2 //
828 sub r1, r1, #2 //
829 @64 // do {
830 ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2;
831 subs r2, r2, #2 // destCount -= 2; Executes at this position in order to eliminate pipeline stall
832 add r3, r3, r4 // sum += tmp;
833 strh r3, [r1, #2]! // *(u16*)destp = sum; destp += 2;
834 bgt @64 // } while ( destCount > 0 );
835 // }
836 @65
837 ldmfd sp!, {r4}
838 bx lr
839 }
840
841 //----------------------------------------------------------------------
842 // 16-bit decompression to restore differential filter conversion.
843 //
844 //- Restores a differential filter, writing in 16 bit units.
845 //- Can be decompressed to the data TCM and VRAM
846 //- Higher speed than MI_UnfilterDiff8()
847 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
848 //
849 //- Use 4 byte alignment for the source address.
850 //
851 //Arguments:
852 // void *srcp : source address
853 // void *destp : destination address
854 //
855 //- Data header
856 // u32 :4 : Bit size of unit
857 // compType:4 Compression type( = 3)
858 // destSize:24 Data size after decompression
859 //
860 //- Return value: None
861 //----------------------------------------------------------------------
862
MI_UnfilterDiff16(register const void * srcp,register void * destp)863 asm void MI_UnfilterDiff16( register const void *srcp, register void *destp )
864 {
865 stmfd sp!, {r4, r5}
866
867 ldmia r0, {r2} // r2: destCount = (u32 *)srcp;
868 mov r3, #0 // r3: sum = 0;
869 and r4, r2, #0xF // r4: bitSize = (u32 *)srcp & 0xF;
870 mov r2, r2, lsr #8 // destCount = (u32 *)srcp >> 8;
871 cmp r4, #1 // if (bitSize != 1) {
872 bne @63
873
874 @61 // Difference calculation in units of 8 bits
875 add r0, r0, #2 // srcp += 4;
876 sub r1, r1, #2
877 @62 // do {
878 ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2;
879 sub r2, r2, #2 // destCount -= 2; Executes at this position in order to eliminate pipeline stall
880 add r3, r3, r4 // sum += tmp
881 and r5, r3, #0xFF // r5: tmp2 = sum & 0xFF;
882 add r3, r3, r4, lsr #8 // sum += (tmp >> 8);
883 add r5, r5, r3, lsl #8 // tmp2 += (sum << 8);
884 strh r5, [r1, #2]! // *(u16*)destp = tmp2; destp += 2;
885 cmp r2, #1 //
886 bgt @62 // } while ( destCount > 1 );
887 bne @65 // if ( destCount < 1 ) return;
888 // else // if (destCount == 1) {
889 ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2;
890 add r3, r3, r4 // sum += tmp;
891 and r5, r3, #0xFF // tmp2 = sum & 0xFF
892 strh r5, [r1, #2]! // *(u16*)destp = tmp2; destp += 2;
893 b @65 // }
894 // } else {
895 @63 // Difference calculation in units of 16 bits
896 add r0, r0, #2
897 sub r1, r1, #2
898 @64 // do {
899 ldrh r4, [r0, #2]! // tmp = *(u16*)srcp; srcp += 2;
900 subs r2, r2, #2 // destCount -= 2; Executes at this position in order to eliminate pipeline stall
901 add r3, r3, r4 // sum += tmp;
902 strh r3, [r1, #2]! // *(u16*)destp = sum; destp += 2;
903 bgt @64 // } while ( destCount > 0 );
904 // }
905 @65
906 ldmfd sp!, {r4,r5}
907 bx lr
908 }
909
910 //----------------------------------------------------------------------
911 // 32-bit decompression to restore differential filter conversion.
912 //
913 //- Restores a differential filter, writing in 32 bit units.
914 //- Can be decompressed to the data TCM and VRAM
915 //- Higher speed than MI_UnfilterDiff8()
916 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
917 //
918 //- Use 4 byte alignment for the source address.
919 //
920 //Arguments:
921 // void *srcp : source address
922 // void *destp : destination address
923 //
924 //- Data header
925 // u32 :4 : Bit size of unit
//       compType:4          Compression type( = 8; the MI_FilterDiff* functions write 0x80 into the low header byte)
927 // destSize:24 Data size after decompression
928 //
929 //- Return value: None
930 //----------------------------------------------------------------------
931
MI_UnfilterDiff32(register const void * srcp,register void * destp)932 asm void MI_UnfilterDiff32( register const void *srcp,register void *destp )
933 {
934 stmdb sp!, {r4-r6}
935
936 ldr r4, [r0], #4 // load header
937 mov r5, #0xff // byte mask
938 mov r6, r4, lsr #8 // Dest. count
939 tst r4, #1 // Bitsize 8 or 16?
940 mov r4, #0 // R4 - accumulator
941 beq @bs16
942
943 @L1 ldr r2, [r0], #4 // load data
944
945 subs r6, r6, #4 // decrement dest. count (at this stage to prevent pipeline stall)
946
947 add r4, r2, r4, lsr #24 // 1st sum
948 and r4, r4, r5 // mask
949
950 add r3, r4, r2, lsr #8 // 2nd sum
951 and r3, r3, r5 // mask
952 orr r4, r4, r3, lsl #8 // accumulate
953
954 add r3, r3, r2, lsr #16 // 3rd sum
955 and r3, r3, r5 // mask
956 orr r4, r4, r3, lsl #16 // accumulate
957
958 add r3, r3, r2, lsr #24 // 4th sum
959 and r3, r3, r5 // mask
960 orr r4, r4, r3, lsl #24 // accumulate
961
962 str r4, [r1], #4 // write back
963
964 bgt @L1 // loop if dest. count not reached
965
966 ldmia sp!, {r4-r6}
967 bx lr
968
969 @bs16
970 orr r5, r5, r5, lsl #8 // extend mask to 16 bit
971
972 @L2 ldr r2, [r0], #4 // load data
973
974 subs r6, r6, #4 // decrement Dest. count (at this stage to prevent pipeline stall)
975
976 add r4, r2, r4, lsr #16 // 1st sum
977 and r4, r4, r5 // mask
978
979 add r3, r4, r2, lsr #16 // 2nd sum
980 and r3, r3, r5 // mask
981 orr r4, r4, r3, lsl #16 // accumulate
982
983 str r4, [r1], #4 // write back
984
985 bgt @L2 // loop if dest. count not reached
986
987 ldmia sp!, {r4-r6}
988 bx lr
989 }
990
991 //----------------------------------------------------------------------
// Differential filter conversion (8-bit version)
993 //
994 //- Converts the differential filter, writing in 8 bit units.
995 //With NITRO, cannot decompress directly into VRAM
996 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
997 //
998 //- Use 4 byte alignment for the source address.
999 //
1000 //Arguments:
1001 // void *srcp : source address
1002 // void *destp : destination address
1003 // u32 size Source size
1004 // BOOL bitsize Differential size (TRUE: 16bit, FALSE: 8bit)
1005 //
1006 //- Data header
1007 // u32 :4 : Bit size of unit
//       compType:4          Compression type( = 8; the code writes 0x80 into the low header byte)
1009 // destSize:24 Data size after decompression
1010 //
1011 //- Return value: None
1012 //----------------------------------------------------------------------
1013
MI_FilterDiff8(register const void * srcp,register void * destp,register u32 size,register BOOL bitsize)1014 asm void MI_FilterDiff8(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize)
1015 {
1016 stmdb sp!, {r4-r6}
1017
1018 add r4, r3, #1 // u32 :4 : Bit size of unit
1019 orr r4, r4, #0x80 // compType:4 : Compression type( = 3)
1020 orr r4, r4, r2, lsl #8 // destSize:24 : Data size before compression
1021 str r4, [r1], #4 // write header
1022 mov r4, #0 // 1st subtrahend
1023 mov r5, #0xff // byte mask
1024 tst r3, #1 // Bitsize 8 or 16?
1025 bne @bs16
1026
1027 @L1 ldrb r3,[r0],#1 // load data
1028
1029 subs r2, r2, #1 // decrement Dest. count (at this stage to prevent pipeline stall)
1030
1031 sub r4, r3, r4 // diff
1032 and r6, r4, r5 // mask
1033
1034 mov r4, r3 // set as previous value
1035
1036 strb r6, [r1], #1 // write back
1037
1038 bgt @L1 // loop if dest. count not reached
1039
1040 ldmia sp!, {r4-r6}
1041 bx lr
1042
1043 @bs16
1044 orr r5, r5, r5, lsl #8 // extend mask to 16 bit
1045
1046 @L2 ldrh r3,[r0],#2 // load data
1047
1048 subs r2, r2, #2 // decrement Dest. count (at this stage to prevent pipeline stall)
1049
1050 sub r4, r3, r4 // 1st diff
1051 and r6, r4, r5 // mask
1052
1053 mov r4, r3 // set as previous value
1054
1055 strh r6, [r1], #2 // write back
1056
1057 bgt @L2 // loop if dest. count not reached
1058
1059 ldmia sp!, {r4-r6}
1060 bx lr
1061 }
1062
1063 //----------------------------------------------------------------------
// Differential filter conversion (16-bit version)
1065 //
1066 //- Converts the differential filter, writing in 16 bit units.
1067 //- Can be decompressed to the data TCM and VRAM
1068 //- Higher speed than MI_FilterDiff8()
1069 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
1070 //
1071 //- Use 4 byte alignment for the source address.
1072 //
1073 //Arguments:
1074 // void *srcp : source address
1075 // void *destp : destination address
1076 // u32 size Source size
1077 // BOOL bitsize Differential size (TRUE: 16bit, FALSE: 8bit)
1078 //
1079 //- Data header
1080 // u32 :4 : Bit size of unit
//       compType:4          Compression type( = 8; the code writes 0x80 into the low header byte)
1082 // destSize:24 Data size after decompression
1083 //
1084 //- Return value: None
1085 //----------------------------------------------------------------------
1086
MI_FilterDiff16(register const void * srcp,register void * destp,register u32 size,register BOOL bitsize)1087 asm void MI_FilterDiff16(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize)
1088 {
1089 stmdb sp!, {r4-r6}
1090
1091 add r4, r3, #1 // u32 :4 : Bit size of unit
1092 orr r4, r4, #0x80 // compType:4 : Compression type( = 3)
1093 orr r4, r4, r2, lsl #8 // destSize:24 : Data size before compression
1094 str r4, [r1], #4 // write header
1095 mov r4, #0 // 1st subtrahend
1096 mov r5, #0xff // byte mask
1097 tst r3, #1 // Bitsize 8 or 16?
1098 bne @bs16
1099
1100 @L1 ldrh r3,[r0],#2 // load data
1101
1102 subs r2, r2, #2 // decrement Dest. count (at this stage to prevent pipeline stall)
1103
1104 sub r4, r3, r4, lsr #8 // 1st diff
1105 and r6, r4, r5 // mask
1106
1107 rsb r4, r3, r3, lsr #8 // 2nd diff
1108 and r4, r4, r5 // mask
1109 orr r6, r6, r4, lsl #8 // accumulate
1110
1111 mov r4, r3 // set as previous value
1112
1113 strh r6, [r1], #2 // write back
1114
1115 bgt @L1 // loop if dest. count not reached
1116
1117 ldmia sp!, {r4-r6}
1118 bx lr
1119
1120 @bs16
1121 orr r5, r5, r5, lsl #8 // extend mask to 16 bit
1122
1123 @L2 ldrh r3,[r0],#2 // load data
1124
1125 subs r2, r2, #2 // decrement Dest. count (at this stage to prevent pipeline stall)
1126
1127 sub r4, r3, r4 // 1st diff
1128 and r6, r4, r5 // mask
1129
1130 mov r4, r3 // set as previous value
1131
1132 strh r6, [r1], #2 // write back
1133
1134 bgt @L2 // loop if dest. count not reached
1135
1136 ldmia sp!, {r4-r6}
1137 bx lr
1138 }
1139
1140 //----------------------------------------------------------------------
// Differential filter conversion (32-bit version)
1142 //
1143 //- Converts the differential filter, writing in 32 bit units.
1144 //- Higher speed than MI_FilterDiff16()
1145 //- If the compressed data size was not a multiple of four, adjust by padding with 0s as much as possible.
1146 //
1147 //- Use 4 byte alignment for the source address.
1148 //
1149 //Arguments:
1150 // void *srcp : source address
1151 // void *destp : destination address
1152 // u32 size Source size
1153 // BOOL bitsize Differential size (TRUE: 16bit, FALSE: 8bit)
1154 //
1155 //- Data header
1156 // u32 :4 : Bit size of unit
//       compType:4          Compression type( = 8; the code writes 0x80 into the low header byte)
1158 // destSize:24 Data size after decompression
1159 //
1160 //- Return value: None
1161 //----------------------------------------------------------------------
1162
MI_FilterDiff32(register const void * srcp,register void * destp,register u32 size,register BOOL bitsize)1163 asm void MI_FilterDiff32(register const void *srcp, register void *destp, register u32 size, register BOOL bitsize)
1164 {
1165 stmdb sp!, {r4-r6}
1166
1167 add r4, r3, #1 // u32 :4 : Bit size of unit
1168 orr r4, r4, #0x80 // compType:4 : Compression type( = 3)
1169 orr r4, r4, r2, lsl #8 // destSize:24 : Data size before compression
1170 str r4, [r1], #4 // write header
1171 mov r4, #0 // 1st subtrahend
1172 mov r5, #0xff // byte mask
1173 tst r3, #1 // Bitsize 8 or 16?
1174 bne @bs16
1175
1176 @L1 ldr r3,[r0],#4 // load data
1177
1178 subs r2, r2, #4 // decrement Dest. count (at this stage to prevent pipeline stall)
1179
1180 sub r4, r3, r4, lsr #8 // 1st diff
1181 and r6, r4, r5 // mask
1182
1183 rsb r4, r3, r3, lsr #8 // 2nd diff
1184 and r4, r4, r5 // mask
1185 orr r6, r6, r4, lsl #8 // accumulate
1186
1187 mov r4, r3, lsr #16 // shift higher halfword down
1188
1189 sub r3, r4, r3, lsr #8 // 3rd diff
1190 and r3, r3, r5 // mask
1191 orr r6, r6, r3, lsl #16 // accumulate
1192
1193 rsb r3, r4, r4, lsr #8 // 4th diff
1194 and r3, r3, r5 // mask
1195 orr r6, r6, r3, lsl #24 // accumulate
1196
1197 str r6, [r1], #4 // write back
1198
1199 bgt @L1 // loop if dest. count not reached
1200
1201 ldmia sp!, {r4-r6}
1202 bx lr
1203
1204 @bs16
1205 orr r5, r5, r5, lsl #8 // extend mask to 16 bit
1206
1207 @L2 ldr r3,[r0],#4 // load data
1208
1209 subs r2, r2, #4 // decrement Dest. count (at this stage to prevent pipeline stall)
1210
1211 sub r4, r3, r4 // 1st diff
1212 and r6, r4, r5 // mask
1213
1214 mov r4, r3, lsr #16 // shift higher halfword down
1215
1216 rsb r3, r3, r3, lsr #16 // 2nd diff
1217 and r3, r3, r5 // mask
1218 orr r6, r6, r3, lsl #16 // accumulate
1219
1220 str r6, [r1], #4 // write back
1221
1222 bgt @L2 // loop if dest. count not reached
1223
1224 ldmia sp!, {r4-r6}
1225 bx lr
1226 }
1227
1228 //---- end limitation of processer mode
1229 #include <nitro/codereset.h>
1230