/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtx44Vec_asm.s

  Copyright (C) Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.     They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

15        .data
16/***
17Unit01: .float        0.0
18        .float        1.0
19
20***/
21
22        .text
23
24
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44MultVec(const Mtx44 m, const Vec *src, Vec *dst)
//
// Transforms one point (x, y, z, implicit 1.0) by the 4x4 matrix m and divides
// the result by w:
//      w     = m30*x + m31*y + m32*z + m33
//      dst.x = (m00*x + m01*y + m02*z + m03) / w      (dst.y, dst.z likewise)
//
// In:    r3 = m (16 floats, row by row), r4 = src, r5 = dst
// Uses:  fp0-fp9, fp12, fp13
// Notes: - Every load from m and src is issued before the first store to dst.
//        - w is not tested against zero before the divide.
//        - The quantized loads/stores all name GQR 0 and treat the data as
//          plain floats; NOTE(review): assumes GQR0 is configured that way.
//        - Loads are interleaved with arithmetic on purpose; keep the order.
#define m   r3
#define src r4
#define dst r5
        .global ASM_MTX44MultVec
ASM_MTX44MultVec:
        .type ASM_MTX44MultVec, @function

        psq_l       fp0, 0(src),    0, 0       // fp0  <- src.x, src.y
        psq_l       fp2, 48(m),     0, 0       // fp2  <- m30, m31
        psq_l       fp1, 8(src),    1, 0       // fp1  <- src.z, 1.0
        ps_mul      fp4, fp0, fp2              // fp4  <- m30*x, m31*y
        psq_l       fp3, 56(m),     0, 0       // fp3  <- m32, m33
        ps_madd     fp5, fp1, fp3, fp4         // fp5  <- m32*z + m30*x, m33 + m31*y
        ps_merge11  fp12, fp1, fp1             // fp12 <- 1.0, 1.0
        ps_sum0     fp13, fp5, fp5, fp5        // fp13 <- w, --
        psq_l       fp4, 0(m),      0, 0       // fp4  <- m00, m01
        ps_merge00  fp13, fp13, fp13           // fp13 <- w, w
        psq_l       fp5, 8(m),      0, 0       // fp5  <- m02, m03
        ps_div      fp13, fp12, fp13           // fp13 <- 1/w, 1/w
        psq_l       fp6, 16(m),     0, 0       // fp6  <- m10, m11
        psq_l       fp7, 24(m),     0, 0       // fp7  <- m12, m13
        psq_l       fp8, 32(m),     0, 0       // fp8  <- m20, m21
        psq_l       fp9, 40(m),     0, 0       // fp9  <- m22, m23
        ps_mul      fp4, fp0, fp4              // fp4  <- m00*x, m01*y
        ps_madd     fp2, fp1, fp5, fp4         // fp2  <- m02*z + m00*x, m03 + m01*y
        ps_mul      fp6, fp0, fp6              // fp6  <- m10*x, m11*y
        ps_madd     fp3, fp1, fp7, fp6         // fp3  <- m12*z + m10*x, m13 + m11*y
        ps_mul      fp8, fp0, fp8              // fp8  <- m20*x, m21*y
        ps_sum0     fp2, fp2, fp2, fp2         // fp2  <- row0 sum, --
        ps_madd     fp9, fp1, fp9, fp8         // fp9  <- m22*z + m20*x, m23 + m21*y
        ps_sum1     fp2, fp3, fp2, fp3         // fp2  <- row0 sum, row1 sum
        ps_sum0     fp3, fp9, fp9, fp9         // fp3  <- row2 sum, --
        ps_mul      fp2, fp2, fp13             // fp2  <- dst.x, dst.y
        psq_st      fp2, 0(dst),    0, 0       // write dst.x and dst.y as a pair
        ps_mul      fp3, fp3, fp13             // fp3  <- dst.z, --
        psq_st      fp3, 8(dst),    1, 0       // write dst.z only
        blr
        .size ASM_MTX44MultVec,$-ASM_MTX44MultVec

#undef m
#undef src
#undef dst



////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44MultVecArray (const Mtx44 m, const Vec *srcBase, Vec *dstBase,  u32 count)
//
// Transforms 'count' consecutive points by the 4x4 matrix m and divides each
// result by its own w:
//      w     = m30*x + m31*y + m32*z + m33
//      dst.x = (m00*x + m01*y + m02*z + m03) / w      (dst.y, dst.z likewise)
//
// In:    r3 = m, r4 = srcBase, r5 = dstBase, r6 = count
// Uses:  fp0-fp13, ctr, cr0.  fp14 holds (1.0, 1.0) during the routine; its
//        incoming value is saved in a 24-byte stack frame and restored on exit.
// Notes: - count == 0 returns immediately.  count == 1 bypasses the loop and
//          runs only the tail.  Without this guard ctr (= count - 1) would wrap
//          on the first bdnz and the loop would run far past both arrays.
//        - The loop is software-pipelined: each pass finishes point i, loads
//          point i+1, stores point i, then starts the products for point i+1.
//          The tail after the loop finishes and stores the last point.
//        - w is not tested against zero before the divide.
//        - The quantized loads/stores all name GQR 0 and treat the data as
//          plain floats; NOTE(review): assumes GQR0 is configured that way.
#define m       r3
#define srcBase r4
#define dstBase r5
#define count   r6
        .global ASM_MTX44MultVecArray
#define rsp     sp
ASM_MTX44MultVecArray:
        .type ASM_MTX44MultVecArray, @function

        cmplwi      count, 1                   // cr0 <- count vs 1; nothing below alters cr0 before the beq
        bltlr                                  // count == 0: return, no memory touched
        stwu        rsp, -24(rsp)              // open a 24-byte frame for the fp14 save slots
        addi        count, count, -1
        psq_l       fp6, 48(m),         0, 0   // fp6 <- m30, m31
        mtctr       count                      // ctr <- count - 1 (number of loop passes)
        psq_l       fp8, 0(srcBase),    0, 0   // fp8 <- src.x, src.y
        addi        dstBase, dstBase, -4       // bias so the 4(dstBase) update-form stores start at dst[0].x
        stfd        fp14, 8(rsp)               // save fp14 as a double
        psq_l       fp7, 56(m),         0, 0   // fp7 <- m32, m33
        psq_lu      fp9, 8(srcBase),    1, 0   // fp9 <- src.z, 1.0; srcBase += 8
        ps_mul      fp13, fp6, fp8             // fp13 <- m30*x, m31*y
        psq_l       fp0, 0(m),          0, 0   // fp0 <- m00, m01
        psq_st      fp14, 16(rsp),      0, 0   // save fp14 as a pair of singles
        ps_madd     fp13, fp7, fp9, fp13       // fp13 <- m32*z + m30*x, m33 + m31*y
        psq_l       fp2, 16(m),         0, 0   // fp2 <- m10, m11
        ps_merge11  fp14, fp9, fp9             // fp14 <- 1.0, 1.0
        ps_mul      fp10, fp0, fp8             // fp10 <- m00*x, m01*y
        psq_l       fp4, 32(m),         0, 0   // fp4 <- m20, m21
        ps_mul      fp11, fp2, fp8             // fp11 <- m10*x, m11*y
        psq_l       fp1, 8(m),          0, 0   // fp1 <- m02, m03
        ps_mul      fp12, fp4, fp8             // fp12 <- m20*x, m21*y
        psq_l       fp3, 24(m),         0, 0   // fp3 <- m12, m13
        ps_sum0     fp13, fp13, fp13, fp13     // fp13 <- w
        psq_l       fp5, 40(m),         0, 0   // fp5 <- m22, m23
        beq         _ASM_MTX44MultVecArray_last // count == 1: only the tail is needed

_ASM_MTX44MultVecArray_loop:
        ps_madd     fp10, fp1, fp9, fp10
        ps_madd     fp11, fp3, fp9, fp11
        ps_madd     fp12, fp5, fp9, fp12
        ps_sum0     fp10, fp10, fp10, fp10     // fp10 <- x
        ps_sum0     fp11, fp11, fp11, fp11     // fp11 <- y
        ps_sum0     fp12, fp12, fp12, fp12     // fp12 <- z
        ps_div      fp13, fp14, fp13           // fp13 <- 1/w

        psq_lu      fp8, 4(srcBase), 0, 0      // fp8 <- next src.x, src.y; srcBase += 4
        psq_lu      fp9, 8(srcBase), 1, 0      // fp9 <- next src.z, 1.0;   srcBase += 8

        ps_mul      fp10, fp10, fp13
        psq_stu     fp10, 4(dstBase), 1, 0     // store dst.x; dstBase += 4
        ps_mul      fp11, fp11, fp13
        psq_stu     fp11, 4(dstBase), 1, 0     // store dst.y; dstBase += 4
        ps_mul      fp12, fp12, fp13
        psq_stu     fp12, 4(dstBase), 1, 0     // store dst.z; dstBase += 4

        ps_mul      fp13, fp6, fp8             // start w for the next point

        ps_mul      fp10, fp0, fp8
        ps_mul      fp11, fp2, fp8
        ps_madd     fp13, fp7, fp9, fp13
        ps_mul      fp12, fp4, fp8
        ps_sum0     fp13, fp13, fp13, fp13     // fp13 <- next w

        bdnz+       _ASM_MTX44MultVecArray_loop

_ASM_MTX44MultVecArray_last:
        // Tail: finish and store the final point (no further source reads).
        ps_madd     fp10, fp1, fp9, fp10
        ps_madd     fp11, fp3, fp9, fp11
        ps_madd     fp12, fp5, fp9, fp12
        ps_sum0     fp10, fp10, fp10, fp10     // fp10 <- x
        ps_sum0     fp11, fp11, fp11, fp11     // fp11 <- y
        ps_sum0     fp12, fp12, fp12, fp12     // fp12 <- z
        ps_div      fp13, fp14, fp13           // fp13 <- 1/w

        ps_mul      fp10, fp10, fp13
        psq_st      fp10, 4(dstBase), 1, 0     // store dst.x
        ps_mul      fp11, fp11, fp13
        psq_st      fp11, 8(dstBase), 1, 0     // store dst.y
        ps_mul      fp12, fp12, fp13
        psq_st      fp12, 12(dstBase), 1, 0    // store dst.z

        psq_l       fp14, 16(rsp), 0, 0        // restore fp14 from the pair-of-singles slot
        lfd         fp14,  8(rsp)              // then reload from the double slot
        addi        rsp, rsp, 24               // release the frame
        blr
        .size ASM_MTX44MultVecArray,$-ASM_MTX44MultVecArray
#undef m
#undef srcBase
#undef dstBase
#undef count
#undef rsp



////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44MultVecSR(const Mtx44 m, const Vec *src, Vec *dst)
//
// Multiplies src by the upper-left 3x3 part of m; column 3 and row 3 of m do
// not contribute to the stored result and there is no divide by w:
//      dst.x = m00*x + m01*y + m02*z                  (dst.y, dst.z likewise)
//
// In:    r3 = m, r4 = src, r5 = dst
// Uses:  fp0-fp13
// Notes: - Each ps_madd also produces a second lane that mixes in m03/m13/m23,
//          but only lane 0 is stored (the stores use W=1).
//        - Every load from m and src is issued before the first store to dst.
//        - Loads are interleaved with arithmetic on purpose; keep the order.
#define m   r3
#define src r4
#define dst r5
        .global ASM_MTX44MultVecSR
ASM_MTX44MultVecSR:
        .type ASM_MTX44MultVecSR, @function

        psq_l   fp0, 0(m), 0, 0    // m[0][0], m[0][1] GQR0 = 0

        // fp6 - x y
        psq_l   fp6, 0(src), 0, 0

        psq_l   fp2, 16(m), 0, 0   // m[1][0], m[1][1]


        // fp8 = m00x m01y // next X
        ps_mul  fp8, fp0, fp6
        psq_l   fp4, 32(m), 0, 0   // m[2][0], m[2][1]

        // fp10 = m10x m11y // next Y
        ps_mul  fp10, fp2, fp6
        psq_l   fp7, 8(src), 1, 0   // fp7 - z,1.0

        // fp12 = m20x m21y // next Z
        ps_mul  fp12, fp4, fp6  // last use of fp6
        psq_l   fp3, 24(m), 0, 0   // m[1][2], m[1][3]

        ps_sum0 fp8, fp8, fp8, fp8          // fp8 lane 0 = m00x + m01y
        psq_l   fp5, 40(m), 0, 0   // m[2][2], m[2][3]

        ps_sum0 fp10, fp10, fp10, fp10      // fp10 lane 0 = m10x + m11y
        psq_l   fp1,  8(m), 0, 0    // m[0][2], m[0][3]

        ps_sum0 fp12, fp12, fp12, fp12      // fp12 lane 0 = m20x + m21y
        ps_madd fp9, fp1, fp7, fp8          // lane 0 = m02z + m00x + m01y
        psq_st  fp9,  0(dst), 1, 0      // store X

        ps_madd fp11, fp3, fp7, fp10        // lane 0 = m12z + m10x + m11y
        psq_st  fp11, 4(dst), 1, 0      // store Y

        ps_madd fp13, fp5, fp7, fp12        // lane 0 = m22z + m20x + m21y
        psq_st  fp13, 8(dst), 1, 0      // store Z

        blr
        .size ASM_MTX44MultVecSR,$-ASM_MTX44MultVecSR

#undef m
#undef src
#undef dst



////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44MultVecArraySR(const Mtx44 m, const Vec *srcBase, Vec *dstBase,  u32 count)
//
// Multiplies 'count' consecutive vectors by the upper-left 3x3 part of m, with
// no translation column and no divide by w:
//      dst.x = m00*x + m01*y + m02*z                  (dst.y, dst.z likewise)
//
// In:    r3 = m, r4 = srcBase, r5 = dstBase, r6 = count
// Uses:  fp0-fp13, ctr, cr0
// Notes: - count == 0 returns immediately.  count == 1 bypasses the loop and
//          runs only the tail.  Without this guard ctr (= count - 1) would wrap
//          on the first bdnz and the loop would run far past both arrays.
//        - fp1/fp3/fp5 are loaded with W=1, so their second lane is 1.0 and the
//          second lane of each ps_madd result carries a stray +1.0.  That is
//          why every ps_sum0 takes its second addend (m01*y etc.) from
//          fp8/fp9/fp10 rather than from the ps_madd result.
//        - The loop is software-pipelined: each pass finishes and stores vector
//          i while loading and starting vector i+1; the tail finishes the last.
//        - The quantized loads/stores all name GQR 0 and treat the data as
//          plain floats; NOTE(review): assumes GQR0 is configured that way.
#define m       r3
#define srcBase r4
#define dstBase r5
#define count   r6
        .global ASM_MTX44MultVecArraySR
ASM_MTX44MultVecArraySR:
        .type ASM_MTX44MultVecArraySR, @function

        cmplwi      count, 1                           // cr0 <- count vs 1; nothing below alters cr0 before the beq
        bltlr                                          // count == 0: return, no memory touched
        psq_l       fp0,  0(m),         0, 0           // fp0 <- m00, m01
        addi        count, count, -1
        psq_l       fp6,  0(srcBase),   0, 0           // fp6 <- src.x, src.y
        ps_mul      fp8,  fp0, fp6                     // fp8 <- m00*x, m01*y
        psq_l       fp2,  16(m),        0, 0           // fp2 <- m10, m11
        ps_mul      fp9,  fp2, fp6                     // fp9 <- m10*x, m11*y
        psq_l       fp4,  32(m),        0, 0           // fp4 <- m20, m21
        psq_lu      fp7,  8(srcBase),   1, 0           // fp7 <- src.z, 1.0; srcBase += 8
        ps_mul      fp10, fp4, fp6                     // fp10 <- m20*x, m21*y
        psq_l       fp1,  8(m),         1, 0           // fp1 <- m02, 1.0
        mtctr       count                              // ctr <- count - 1 (number of loop passes)
        psq_l       fp3,  24(m),        1, 0           // fp3 <- m12, 1.0
        addi        dstBase, dstBase, -4               // bias so the 4(dstBase) update-form stores start at dst[0].x
        psq_l       fp5,  40(m),        1, 0           // fp5 <- m22, 1.0
        beq         _ASM_MTX44MultVecArraySR_last      // count == 1: only the tail is needed

_ASM_MTX44MultVecArraySR_loop:
        ps_madd     fp11, fp1, fp7, fp8                // lane 0 = m02*z + m00*x
        psq_lu      fp6,  4(srcBase),   0, 0           // fp6 <- next src.x, src.y; srcBase += 4
        ps_madd     fp12, fp3, fp7, fp9                // lane 0 = m12*z + m10*x
        ps_madd     fp13, fp5, fp7, fp10               // lane 0 = m22*z + m20*x (last use of current z)
        psq_lu      fp7,  8(srcBase),   1, 0           // fp7 <- next src.z, 1.0;   srcBase += 8
        ps_sum0     fp11, fp11, fp8, fp8               // lane 0 += m01*y
        psq_stu     fp11, 4(dstBase),   1, 0           // store dst.x; dstBase += 4
        ps_sum0     fp12, fp12, fp9, fp9               // lane 0 += m11*y
        psq_stu     fp12, 4(dstBase),   1, 0           // store dst.y; dstBase += 4
        ps_sum0     fp13, fp13, fp10, fp10             // lane 0 += m21*y
        psq_stu     fp13, 4(dstBase),   1, 0           // store dst.z; dstBase += 4
        ps_mul      fp8,  fp0, fp6                     // start the next vector
        ps_mul      fp9,  fp2, fp6
        ps_mul      fp10,  fp4, fp6
        bdnz+       _ASM_MTX44MultVecArraySR_loop

_ASM_MTX44MultVecArraySR_last:
        // Tail: finish and store the final vector (no further source reads).
        ps_madd     fp11, fp1, fp7, fp8
        ps_madd     fp12, fp3, fp7, fp9
        ps_madd     fp13, fp5, fp7, fp10
        ps_sum0     fp11, fp11, fp8, fp8
        psq_stu     fp11, 4(dstBase),   1, 0           // store dst.x
        ps_sum0     fp12, fp12, fp9, fp9
        psq_stu     fp12, 4(dstBase),   1, 0           // store dst.y
        ps_sum0     fp13, fp13, fp10, fp10
        psq_stu     fp13, 4(dstBase),   1, 0           // store dst.z
        blr
        .size ASM_MTX44MultVecArraySR,$-ASM_MTX44MultVecArraySR
#undef m
#undef srcBase
#undef dstBase
#undef count


