1 /*---------------------------------------------------------------------------*
2 
3   Copyright (C) Nintendo.  All rights reserved.
4 
5   These coded instructions, statements, and computer programs contain
6   proprietary information of Nintendo of America Inc. and/or Nintendo
7   Company Ltd., and are protected by Federal copyright law.  They may
8   not be disclosed to third parties or copied or duplicated in any form,
9   in whole or in part, without the prior written consent of Nintendo.
10 
11  *---------------------------------------------------------------------------*/
12 
13 // gx2Perf.h
14 //
15 // Declares GPU performance APIs for gx2 library.
16 
17 
18 #ifndef _CAFE_GX2_PERF_H_
19 #define _CAFE_GX2_PERF_H_
20 
21 #ifdef __cplusplus
22 extern "C" {
23 #endif
24 
25 /// @addtogroup GX2DeprecatedGroup
26 /// @{
27 
/// Index into _GX2CounterInfo::results of the first pipeline-stat *start* sample.
#define GX2_PIPELINE_DATA_START_OFFSET  (GX2_COUNTER_PIPELINE)

/// Index into _GX2CounterInfo::results of the first pipeline-stat *end* sample.
/// The end samples directly follow the GX2_NUM_COUNTER_PIPELINE start samples.
#define GX2_PIPELINE_DATA_END_OFFSET    ((GX2_PIPELINE_DATA_START_OFFSET) + (GX2_NUM_COUNTER_PIPELINE))

/// Total number of u64 entries in _GX2CounterInfo::results.
///
/// All three replacement lists are fully parenthesized so the macros keep
/// their value when used inside a larger expression (for example when
/// multiplied by sizeof(u64)).
#define GX2_RESULT_SIZE                 ((GX2_PIPELINE_DATA_END_OFFSET) + (GX2_NUM_COUNTER_PIPELINE))
35 
36 /// \brief Data container for all low-level performance counter settings.
37 ///
38 /// \deprecated Please use the Perf APIs described in
39 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
40 ///
41 /// \note This structure must be allocated in GPU memory.
42 ///
43 typedef struct __GX2CounterInfo
44 {
45     /// Counter results are written to this buffer
46     /// From 0 to GX2_COUNTER_LAST-1 is "normal" 64-bit counter data.
47     /// From GX2_PIPELINE_DATA_START_OFFSET (aka GX2_COUNTER_LAST) to GX2_PIPELINE_DATA_START_OFFSET+GX2_NUM_COUNTER_PIPELINE-1 are the pipeline stat start values.
48     /// From GX2_PIPELINE_DATA_END_OFFSET to GX2_PIPELINE_DATA_END_OFFSET+GX2_NUM_COUNTER_PIPELINE-1 are the pipeline stat end values.
49     u64 results[GX2_RESULT_SIZE];
50 
51     /// The CPU updated data below needs to be 64 bytes away from the GPU written data above.  This is so
52     /// any CPU updates and cache line flushes don't overwrite data updated by the GPU.  A cache line is 64 bytes.
53     u8  padding[64];
54 
55     /// Flag if each counter is enabled or not
56     GX2Boolean enabled[GX2_COUNTER_LAST+1];
57     /// What statistic is being counted by each hardware counter?
58     _GX2StatId stats[GX2_COUNTER_LAST+1];
59     /// In unified shader architecture, each SIMD can change to
60     /// each type of shader (vertex, pixel, geometry, etc.)
61     /// This controls what shader type to count for SQ counters.
62     _GX2SqType sqType[GX2_NUM_COUNTER_SQ];
63 } _GX2CounterInfo;
64 
65 /// \brief Data structure to store high-level performance counter data.
66 ///
67 /// \deprecated Please use the Perf APIs described in
68 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
69 ///
70 /// \note This structure must be allocated in GPU memory.  Using the high-level counters will change the low-level
71 /// counter registers and possibly corrupt anything you are trying to count simultaneously with the low-level counters,
72 /// with the exception of GX2_COUNTER_PIPELINE.
73 ///
74 typedef struct _GX2PerfInfo
75 {
76     /// Container for counterInfo (low-level counter data)
77     _GX2CounterInfo counterInfo;
78 
79     /// Number of CP (command processor) counters used
80     u8 usedCpCount;
81     /// Number of GRBM (gfx register bus mgr) counters used
82     u8 usedGrbmCount;
83     /// Number of PA-SU (prim assembler/setup) counters used
84     u8 usedPaSuCount;
85     /// Number of VGT (vtx grouper/tessellator) counters used
86     u8 usedVgtCount;
87     /// Number of SQ (sequencer) counters used
88     u8 usedSqCount;
89     /// Number of SPI (shader parameter interpolator) counters used
90     u8 usedSpiCount;
91     /// Number of SX (shader exporter) counters used
92     u8 usedSxCount;
93     /// Number of TA (texture addresser) counters used
94     u8 usedTaCount;
95     /// Number of TCP (texture cache per-pipe/L1) counters used
96     u8 usedTcpCount;
97     /// Number of TCC (texture cache per-channel/L2) counters used
98     u8 usedTccCount;
99     /// Number of DB (depth buffer) counters used
100     u8 usedDbCount;
101     /// Number of CB (color buffer) counters used
102     u8 usedCbCount;
103 
104     /// Indicate which low-level counter index is used by the given high-level statistic
105     u8 idxGrbmCount;
106     u8 idxGrbmGuiActive;
107     u8 idxGrbmShBusy;
108 
109     u8 idxPaSuClipBusy;
110     u8 idxPaSuClprCullPrim;
111     u8 idxPaSuSuZeroAreaCullPrim;
112     u8 idxPaSuSuBackFaceCullPrim;
113     u8 idxPaSuSuFrontFaceCullPrim;
114     u8 idxPaSuSuPolyModeFaceCull;
115     u8 idxPaSuPaInputPrim;
116     u8 idxPaSuSuStalledSc;
117 
118     u8 idxVgtVgtPaClippSend;
119     u8 idxVgtVgtPaClippIsEvent;
120     u8 idxVgtReusedVsIndices;
121     u8 idxVgtPaClippSend;
122     u8 idxVgtPaClippIsEvent;
123 
124     u8 idxSqEsVsItemsPerType;
125     u8 idxSqPsItemsPerType;
126     u8 idxSqEsVsGsPsTaTexInstrsPerType;
127     u8 idxSqEsVsTaTexInstrsPerType;
128     u8 idxSqGsTaTexInstrsPerType;
129     u8 idxSqPsTaTexInstrsPerType;
130     u8 idxSqEsVsAluClauseInstrsPerType;
131     u8 idxSqGsAluClauseInstrsPerType;
132     u8 idxSqPsAluClauseInstrsPerType;
133     u8 idxSqEsVsGsPsAluClauseInstrGroupsPerType;
134     u8 idxSqEsVsAluClauseInstrGroupsPerType;
135     u8 idxSqGsAluClauseInstrGroupsPerType;
136     u8 idxSqPsAluClauseInstrGroupsPerType;
137     u8 idxSqEsVsThreadLevelPerType;
138     u8 idxSqGsThreadLevelPerType;
139     u8 idxSqPsThreadLevelPerType;
140     u8 idxSqEsVsGsPsThreadLevelPerType;
141     u8 idxSqEsVsThreadsPerType;
142     u8 idxSqGsThreadsPerType;
143     u8 idxSqPsThreadsPerType;
144 
145     u8 idxSpiPctL0PiBusy;
146     u8 idxSpiPctL1PiBusy;
147 
148     u8 idxSxDb0Pixels;
149     u8 idxSxDb1Pixels;
150     u8 idxSxDb0StallCycles;
151     u8 idxSxDb1StallCycles;
152 
153     u8 idxTaAlignerCycles;
154 
155     u8 idxTcpTcpTaStallCycles;
156     u8 idxTcpTcpTagconflictStallCycles;
157     u8 idxTcpFmtV8Pixels;
158     u8 idxTcpFmtV16Pixels;
159     u8 idxTcpFmtV32Pixels;
160     u8 idxTcpFmtV642Pixels;
161     u8 idxTcpFmtV641Pixels;
162     u8 idxTcpFmtV1284CyclePixels;
163     u8 idxTcpFmtV1282CyclePixels;
164     u8 idxTcpFmtV1281CyclePixels;
165     u8 idxTcpTotalPixels;
166 
167     u8 idxTccReqsTcTfMiss;
168 
169     u8 idxDbOpPipeBusy;
170     u8 idxDbDbScTileNoOps;
171     u8 idxDbDbScTilePixelRate;
172     u8 idxDbDbScTileFastOps;
173     u8 idxDbDbScTileHierKill;
174     u8 idxDbDbScQuadTiles;
175     u8 idxDbPreZSamplesPassingZ;
176     u8 idxDbPreZSamplesFailingS;
177     u8 idxDbPreZSamplesFailingZ;
178     u8 idxDbPostZSamplesPassingZ;
179     u8 idxDbPostZSamplesFailingS;
180     u8 idxDbPostZSamplesFailingZ;
181     u8 idxDbScDbTileTiles;
182     u8 idxDbDbCbLqiadStalls;
183 
184     u8 idxCbDrawnPixel;
185     u8 idxCbCcMcWriteRequest;
186 } GX2PerfInfo;
187 
188 /// \brief Low-Level: Reset all low-level performance counter settings
189 ///
190 /// \deprecated Please use the Perf APIs described in
191 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
192 ///
193 /// \param info Info structure to reset
194 ///
195 /// \donotcall \threadsafe \devonly \enddonotcall
196 ///
_GX2ResetCounterInfo(_GX2CounterInfo * info)197 GX2_INLINE void _GX2ResetCounterInfo(_GX2CounterInfo* info)
198 {
199     u32 rsize = sizeof(u64)*(GX2_RESULT_SIZE);
200     ASSERT(NULL != info);
201     memset(info->results, 0xff, rsize); // write GX2_INVALID_COUNTER_VALUE_U64
202     memset(info->enabled, 0, sizeof(_GX2CounterInfo)-rsize);
203 }
204 
205 /// \brief Low-Level: Enables a specific low-level counter
206 ///
207 /// \deprecated Please use the Perf APIs described in
208 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
209 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
210 ///
211 /// Call this multiple times to enable multiple low-level counters
212 ///
213 /// \note Using the high-level counters will change the low-level counter registers
214 /// and possibly corrupt anything you are trying to count simultaneously with the low-level counters,
215 /// with the exception of GX2_COUNTER_PIPELINE.
216 ///
217 /// \param info Structure to contain all low-level counter info
218 /// \param id Which low-level hardware counter to enable
219 /// \param parm What low-level parameter it should count
220 ///
221 /// \donotcall \threadsafe \devonly \enddonotcall
222 ///
223 void GX2API _GX2InitCounterInfo(_GX2CounterInfo *info, _GX2CounterId id,
224                                 _GX2StatId parm);
225 
226 /// \brief Low-Level: Send completed low-level counter configuration to GPU.
227 ///
228 /// \deprecated Please use the Perf APIs described in
229 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
230 ///
231 /// \param info Structure containing all low-level counter info
232 ///
233 /// \note Make sure to call _GX2SampleCounters() to collect any previously counted values you care about before
234 /// calling this function.  Currently this function will reset counted results for the CB and SQ.
235 ///
236 /// \donotcall \gx2_typical \enddonotcall
237 ///
238 /// \writesgpu
239 /// \alwayswritesgpu
240 ///
241 void GX2API _GX2SetCounterInfo(const _GX2CounterInfo *info);
242 
243 /// \brief Low-Level: Resets to zero all active low-level counters
244 ///
245 /// \deprecated Please use the Perf APIs described in
246 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
247 ///
248 /// \donotcall \gx2_typical \enddonotcall
249 ///
250 /// \writesgpu
251 /// \alwayswritesgpu
252 ///
253 void GX2API _GX2ResetCounters(void);
254 
255 /// \brief Low-Level: Start (or restart) all active low-level counters counting
256 ///
257 /// \deprecated Please use the Perf APIs described in
258 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
259 ///
260 /// \donotcall \gx2_typical \enddonotcall
261 ///
262 /// \writesgpu
263 /// \alwayswritesgpu
264 ///
265 void GX2API _GX2StartCounters(const _GX2CounterInfo *cinfo);
266 
267 /// \brief Low-Level: Stop (pause) all active low-level counters from counting
268 ///
269 /// \deprecated Please use the Perf APIs described in
270 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
271 ///
272 /// \donotcall \gx2_typical \enddonotcall
273 ///
274 /// \writesgpu
275 /// \alwayswritesgpu
276 ///
277 void GX2API _GX2StopCounters(const _GX2CounterInfo *cinfo);
278 
279 /// \brief Low-Level: Tell GPU to write all active low-level counter values to memory
280 ///
281 /// \deprecated Please use the Perf APIs described in
282 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
283 ///
284 /// \param info Structure to contain all low-level counter info
285 ///
286 /// \donotcall \gx2_typical \enddonotcall
287 ///
288 /// \writesgpu
289 /// \alwayswritesgpu
290 ///
291 void GX2API _GX2SampleCounters(_GX2CounterInfo *info);
292 
293 /// \brief Low-Level: Check if low-level counter data has been written by GPU
294 ///
295 /// \deprecated Please use the Perf APIs described in
296 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
297 ///
298 /// \param info Structure to contain all low-level counter info
299 ///
300 /// \note This method only works when the counters have been reset using
301 /// \ref _GX2ResetCounterInfo.  However, that also resets the counter selection.
302 /// It is perhaps better to use a timestamp-based synchronization method.
303 ///
304 /// \donotcall \notthreadsafe \devonly \enddonotcall
305 ///
_GX2GetCountersReady(const _GX2CounterInfo * info)306 GX2_INLINE GX2Boolean _GX2GetCountersReady(const _GX2CounterInfo *info)
307 {
308     u32 i;
309     GX2Invalidate(GX2_INVALIDATE_CPU, (void *)info, sizeof(_GX2CounterInfo));
310     for(i=0; i<GX2_COUNTER_LAST+1; i++)
311         if(GX2_TRUE == info->enabled[i])
312             if(GX2_INVALID_COUNTER_VALUE_U64 == info->results[i])
313                 return GX2_FALSE;
314     return GX2_TRUE;
315 }
316 
317 /// \brief Reset all high-level GPU performance metrics.
318 ///
319 /// \deprecated Please use the Perf APIs described in
320 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
321 ///
322 /// \param perfinfo User-allocated structure for performance metric data.
323 ///
324 /// \donotcall \threadsafe \devonly \enddonotcall
325 ///
GX2ResetPerfMetrics(GX2PerfInfo * perfinfo)326 GX2_INLINE void GX2ResetPerfMetrics(GX2PerfInfo* perfinfo)
327 {
328     _GX2ResetCounterInfo(&perfinfo->counterInfo);
329     memset(&perfinfo->usedCpCount, 0, sizeof(u8)*12);
330     memset(&perfinfo->idxGrbmCount, 0xff, sizeof(GX2PerfInfo) - sizeof(u8)*12 - sizeof(_GX2CounterInfo));
331 }
332 
333 /// \brief Enable a specific high-level GPU performance metric.
334 ///
335 /// \deprecated Please use the Perf APIs described in
336 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
337 ///
338 /// \param perfinfo User-allocated structure for performance metric data.
339 /// \param metric Desired high-level GPU performance metric to count
340 ///
341 /// \return GX2_TRUE if metric can be counted, GX2_FALSE if otherwise.
342 ///
343 /// You may enable multiple metrics at once.  However,
344 /// there are hardware restrictions on how many metrics can be sampled per pass.
345 /// If this function returns false, it means that the desired combination
346 /// of metrics cannot be gathered in a single pass; you must use multiple
347 /// passes in order to sample that combination.
348 ///
349 /// \note Using the high-level counters will change the low-level counter registers
350 /// and possibly corrupt anything you are trying to count simultaneously with the low-level counters,
351 /// with the exception of GX2_COUNTER_PIPELINE.
352 ///
353 /// \donotcall \threadsafe \devonly \enddonotcall
354 ///
355 GX2Boolean GX2API GX2InitPerfMetric(GX2PerfInfo* perfinfo,
356                                     GX2PerfMetric metric);
357 
358 /// \brief Starts all enabled high-level GPU performance counters.
359 ///
360 /// \deprecated Please use the Perf APIs described in
361 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
362 ///
363 /// \param perfinfo User-allocated structure for performance metric data.
364 ///
365 /// \donotcall \gx2_typical \enddonotcall
366 ///
367 /// \writesgpu
368 /// \alwayswritesgpu
369 ///
GX2BeginPerf(const GX2PerfInfo * perfinfo)370 GX2_INLINE void GX2BeginPerf(const GX2PerfInfo* perfinfo)
371 {
372     _GX2SetCounterInfo(&perfinfo->counterInfo);
373     _GX2ResetCounters();
374     _GX2StartCounters(&perfinfo->counterInfo);
375 }
376 
377 /// \brief Pause all enabled high-level GPU performance counters.
378 ///
379 /// \deprecated Please use the Perf APIs described in
380 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
381 ///
382 /// \donotcall \gx2_typical \enddonotcall
383 ///
384 /// \writesgpu
385 /// \alwayswritesgpu
386 ///
GX2PerfPause(const GX2PerfInfo * perfinfo)387 GX2_INLINE void GX2PerfPause(const GX2PerfInfo* perfinfo)
388 {
389     _GX2StopCounters(&perfinfo->counterInfo);
390 }
391 
392 /// \brief Resume all enabled high-level GPU performance counters.
393 ///
394 /// \deprecated Please use the Perf APIs described in
395 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
396 ///
397 /// \donotcall \gx2_typical \enddonotcall
398 ///
399 /// \writesgpu
400 /// \alwayswritesgpu
401 ///
GX2PerfPlay(const GX2PerfInfo * perfinfo)402 GX2_INLINE void GX2PerfPlay(const GX2PerfInfo* perfinfo)
403 {
404     _GX2StartCounters(&perfinfo->counterInfo);
405 }
406 
407 /// \brief Stop and sample all enabled high-level GPU performance counters.
408 ///
409 /// \deprecated Please use the Perf APIs described in
410 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
411 ///
412 /// \param perfinfo User-allocated structure for performance metric data.
413 ///
414 /// \donotcall \gx2_typical \enddonotcall
415 ///
416 /// \writesgpu
417 /// \alwayswritesgpu
418 ///
GX2EndPerf(GX2PerfInfo * perfinfo)419 GX2_INLINE void GX2EndPerf(GX2PerfInfo* perfinfo)
420 {
421     _GX2StopCounters(&perfinfo->counterInfo);
422     _GX2SampleCounters(&perfinfo->counterInfo);
423 }
424 
425 /// \brief Check if performance counter data has been written by GPU yet.
426 ///
427 /// \deprecated Please use the Perf APIs described in
428 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
429 ///
430 /// \param perfinfo User-allocated structure for performance metric data.
431 ///
432 /// \note This method only works when the counters have been reset using
433 /// \ref GX2ResetPerfMetrics.  However, that also resets the counter selection.
434 /// It is perhaps better to use a timestamp-based synchronization method.
435 ///
436 /// \donotcall \threadsafe \devonly \enddonotcall
437 ///
GX2GetPerfMetricReady(const GX2PerfInfo * perfinfo)438 GX2_INLINE GX2Boolean GX2GetPerfMetricReady(const GX2PerfInfo *perfinfo)
439 {
440     return _GX2GetCountersReady(&perfinfo->counterInfo);
441 }
442 
443 /// \brief Get the value of a high-level u64-type perf metric.
444 ///
445 /// \deprecated Please use the Perf APIs described in
446 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
447 ///
448 /// \param perfinfo User-allocated structure for performance metric data.
449 /// \param metric Metric to be read.
450 /// \param result u64 counter value.
451 ///
452 /// \donotcall \threadsafe \devonly \enddonotcall
453 ///
454 void GX2API GX2GetPerfMetricU64(const GX2PerfInfo *perfinfo,
455                                 GX2PerfMetric metric, u64* result);
456 
457 /// \brief Get the value of a high-level f32-type perf metric.
458 ///
459 /// \deprecated Please use the Perf APIs described in
460 ///             \ref GX2PerfCounterPage "GX2 Perf Counter APIs"
461 ///
462 /// \param perfinfo User-allocated structure for performance metric data.
463 /// \param metric Metric to be read.
464 /// \param result 32-bit floating point metric data, given as percentage.
465 ///
466 /// \donotcall \threadsafe \devonly \enddonotcall
467 ///
468 void GX2API GX2GetPerfMetricF32(const GX2PerfInfo *perfinfo,
469                                 GX2PerfMetric metric, f32* result);
470 
471 /// @}
472 
473 #ifdef __cplusplus
474 }
475 #endif
476 
#endif // _CAFE_GX2_PERF_H_
478