diff --git a/cliloader/cliloader.cpp b/cliloader/cliloader.cpp index f8a4ec6f..e3fe165e 100644 --- a/cliloader/cliloader.cpp +++ b/cliloader/cliloader.cpp @@ -441,6 +441,10 @@ static bool parseArguments(int argc, char *argv[]) { checkSetEnv("CLI_DumpKernelISABinaries", "1"); } + else if( !strcmp(argv[i], "-ct") || !strcmp(argv[i], "--conditional-timing") ) + { + checkSetEnv("CLI_PerformanceTimingConditional", "1"); + } else if( !strcmp(argv[i], "-d") || !strcmp(argv[i], "--device-timing") ) { checkSetEnv("CLI_DevicePerformanceTiming", "1"); @@ -622,6 +626,7 @@ static bool parseArguments(int argc, char *argv[]) " --dump-spirv [-dspv] Dump Input Program IL (SPIR-V)\n" " --dump-output-binaries Dump Output Program Binaries\n" " --dump-kernel-isa-binaries Dump Kernel ISA Binaries (Intel GPU Only)\n" + " --conditional-timing [-ct] Enable Conditional Timing Based on Environment Variables\n" " --device-timing [-d] Report Device Execution Time\n" " --device-timing-verbose [-dv] Report More Detailed Device Execution Time\n" " --chrome-call-logging [-ccl] Record Host API Calls to a JSON Trace File\n" diff --git a/docs/controls.md b/docs/controls.md index a833fa2f..5b22f31e 100644 --- a/docs/controls.md +++ b/docs/controls.md @@ -423,6 +423,10 @@ If set to a nonzero value, the Intercept Layer for OpenCL Applications will orga If set to a nonzero value, the Intercept Layer for OpenCL Applications will unconditionally estimate the queued time for Chrome Tracing rather than computing it using device and host timers and event profiling data. The estimated time is less accurate than the computed time, but may be more reliable if the device and host timers or event profiling data is incorrect or imprecise. 
+##### `PerformanceTimingConditional` (bool) + +If set to a nonzero value, the Intercept Layer for OpenCL Applications will only collect host performance timing, device performance timing, and chrome performance timing conditionally, when the "CLI\_ENABLE\_PERFORMANCE\_TIMING" environment variable is set to a non-zero value. + ### Controls for Dumping and Injecting Programs and Build Options ##### `OmitProgramNumber` (bool) diff --git a/intercept/OS/OS_linux.h b/intercept/OS/OS_linux.h index 913b3439..e9c0948b 100644 --- a/intercept/OS/OS_linux.h +++ b/intercept/OS/OS_linux.h @@ -118,6 +118,9 @@ class Services bool CheckMDAPIPermissions( std::string& str ) const; + bool CheckConditionalEnable( + const char* name) const; + private: bool GetControlFromFile( const std::string& fileName, @@ -472,4 +475,12 @@ inline bool Services::CheckMDAPIPermissions( return str.empty(); } +inline bool Services::CheckConditionalEnable( + const char* name) const +{ + const char* envVal = getenv(name); + bool enabled = envVal && strcmp(envVal, "0") != 0; + return enabled; +} + } diff --git a/intercept/OS/OS_mac.h b/intercept/OS/OS_mac.h index ec4768c4..593217b1 100644 --- a/intercept/OS/OS_mac.h +++ b/intercept/OS/OS_mac.h @@ -95,6 +95,9 @@ class Services bool CheckMDAPIPermissions( std::string& str ) const; + bool CheckConditionalEnable( + const char* name) const; + private: bool GetControlFromFile( const std::string& fileName, @@ -291,4 +294,12 @@ inline bool Services::CheckMDAPIPermissions( return true; } +inline bool Services::CheckConditionalEnable( + const char* name) const +{ + const char* envVal = getenv(name); + bool enabled = envVal && strcmp(envVal, "0") != 0; + return enabled; +} + } diff --git a/intercept/OS/OS_windows.h b/intercept/OS/OS_windows.h index 10357d68..5de02aa7 100644 --- a/intercept/OS/OS_windows.h +++ b/intercept/OS/OS_windows.h @@ -89,6 +89,9 @@ class Services bool CheckMDAPIPermissions( std::string& str ) const; + bool CheckConditionalEnable( + const char* 
name) const; + private: HINSTANCE m_hInstance; }; @@ -585,4 +588,22 @@ inline bool Services::CheckMDAPIPermissions( return true; } +inline bool Services::CheckConditionalEnable( + const char* name) const +{ + bool enabled = false; + char* envVal = NULL; + size_t len = 0; + errno_t err = _dupenv_s( &envVal, &len, name ); + if( !err && envVal ) + { + if( strcmp(envVal, "0") != 0 ) + { + enabled = true; + } + free( envVal ); + } + return enabled; +} + } diff --git a/intercept/src/controls.h b/intercept/src/controls.h index 6faf9ad4..aac27244 100644 --- a/intercept/src/controls.h +++ b/intercept/src/controls.h @@ -89,6 +89,7 @@ CLI_CONTROL( bool, ChromePerformanceTiming, false, "If s CLI_CONTROL( bool, ChromePerformanceTimingInStages, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will separate the performance information placed in the JSON file into Queued, Submitted, and Execution stages. It will also reorder the threads/queues by starting runtime. This flag is only functional when ChromePerformanceTiming is also set." ) CLI_CONTROL( bool, ChromePerformanceTimingPerKernel, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will organize the performance information placed in the JSON file on a per kernel name basis. It is only functional when ChromePerformanceTiming is also set. When ChromePerformanceTimingInStages is also set, information about event stages will be retained." ) CLI_CONTROL( bool, ChromePerformanceTimingEstimateQueuedTime, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will unconditionally estimate the queued time for Chrome Tracing rather than computing it using device and host timers and event profiling data. The estimated time is less accurate than the computed time, but may be more reliable if the device and host timers or event profiling data is incorrect or imprecise." 
) +CLI_CONTROL( bool, PerformanceTimingConditional, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will only collect host performance timing, device performance timing, and chrome performance timing conditionally, when the \"CLI_ENABLE_PERFORMANCE_TIMING\" environment variable is set to a non-zero value." ) CLI_CONTROL_SEPARATOR( Controls for Dumping and Injecting Programs and Build Options: ) CLI_CONTROL( bool, OmitProgramNumber, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will omit the program number from dumped file names and hash tracking. This can produce deterministic results even if programs are built in a non-deterministic order (say, by multiple threads)." ) diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index e0ea175c..1b77c239 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -445,13 +445,17 @@ class CLIntercept cl_device_id device, const cl_queue_properties* properties, cl_queue_properties*& pLocalQueueProperties ) const; + void dummyCommandQueue( + cl_context context, + cl_device_id device ); + + bool checkConditionalTiming() const; + bool checkGetTimingTags( + uint64_t enqueueCounter ) const; bool checkHostPerformanceTimingEnqueueLimits( uint64_t enqueueCounter ) const; bool checkDevicePerformanceTimingEnqueueLimits( uint64_t enqueueCounter ) const; - void dummyCommandQueue( - cl_context context, - cl_device_id device ); void addTimingEvent( const char* functionName, @@ -3058,16 +3062,41 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( /////////////////////////////////////////////////////////////////////////////// // + +inline bool CLIntercept::checkConditionalTiming() const +{ + return ( !config().PerformanceTimingConditional || + OS().CheckConditionalEnable("CLI_ENABLE_PERFORMANCE_TIMING") ); +} + +inline bool CLIntercept::checkGetTimingTags( + uint64_t enqueueCounter ) const +{ + if( config().ChromeCallLogging ) + { + return true; 
+ } + + if( ( config().HostPerformanceTiming && + checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) || + ( ( config().DevicePerformanceTiming || + config().ITTPerformanceTiming || + config().ChromePerformanceTiming || + config().DevicePerfCounterEventBasedSampling ) && + checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) ) + { + if( checkConditionalTiming() ) + { + return true; + } + } + + return false; +} + #define GET_TIMING_TAGS_BLOCKING( _blocking, _sz ) \ std::string hostTag, deviceTag; \ - if( pIntercept->config().ChromeCallLogging || \ - ( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) ||\ - ( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) )\ + if( pIntercept->checkGetTimingTags( enqueueCounter ) ) \ { \ pIntercept->getTimingTagBlocking( \ __FUNCTION__, \ @@ -3079,14 +3108,7 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( #define GET_TIMING_TAGS_MAP( _blocking_map, _map_flags, _sz ) \ std::string hostTag, deviceTag; \ - if( pIntercept->config().ChromeCallLogging || \ - ( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) ||\ - ( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) )\ + if( pIntercept->checkGetTimingTags( enqueueCounter ) ) \ { \ pIntercept->getTimingTagsMap( \ __FUNCTION__, \ @@ -3099,14 +3121,7 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( #define GET_TIMING_TAGS_UNMAP( _ptr 
) \ std::string hostTag, deviceTag; \ - if( pIntercept->config().ChromeCallLogging || \ - ( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) ||\ - ( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) )\ + if( pIntercept->checkGetTimingTags( enqueueCounter ) ) \ { \ pIntercept->getTimingTagsUnmap( \ __FUNCTION__, \ @@ -3117,14 +3132,7 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( #define GET_TIMING_TAGS_MEMFILL( _queue, _dst_ptr, _sz ) \ std::string hostTag, deviceTag; \ - if( pIntercept->config().ChromeCallLogging || \ - ( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) ||\ - ( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) )\ + if( pIntercept->checkGetTimingTags( enqueueCounter ) ) \ { \ pIntercept->getTimingTagsMemfill( \ __FUNCTION__, \ @@ -3137,14 +3145,7 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( #define GET_TIMING_TAGS_MEMCPY( _queue, _blocking, _dst_ptr, _src_ptr, _sz )\ std::string hostTag, deviceTag; \ - if( pIntercept->config().ChromeCallLogging || \ - ( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) ||\ - ( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - 
pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) )\ + if( pIntercept->checkGetTimingTags( enqueueCounter ) ) \ { \ pIntercept->getTimingTagsMemcpy( \ __FUNCTION__, \ @@ -3159,14 +3160,7 @@ inline bool CLIntercept::checkAubCaptureEnqueueLimits( #define GET_TIMING_TAGS_KERNEL( _queue, _kernel, _dim, _gwo, _gws, _lws ) \ std::string hostTag, deviceTag; \ - if( pIntercept->config().ChromeCallLogging || \ - ( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) ) ||\ - ( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) ) )\ + if( pIntercept->checkGetTimingTags( enqueueCounter ) ) \ { \ pIntercept->getTimingTagsKernel( \ _queue, \ @@ -3205,19 +3199,21 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( #define HOST_PERFORMANCE_TIMING_START() \ CLIntercept::clock::time_point cpuStart, cpuEnd; \ - if( pIntercept->config().HostPerformanceTiming || \ - pIntercept->config().ChromeCallLogging ) \ + bool doHostPerformanceTimingStats = /* gates updateHostTimingStats: enqueue limits and conditional timing apply even when ChromeCallLogging is set */ \ + pIntercept->config().HostPerformanceTiming && \ + pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) &&\ + pIntercept->checkConditionalTiming(); \ + bool doHostPerformanceTiming = /* timestamps are still taken whenever ChromeCallLogging is set */ pIntercept->config().ChromeCallLogging || doHostPerformanceTimingStats; \ + if( doHostPerformanceTiming ) \ { \ cpuStart = CLIntercept::clock::now(); \ } #define HOST_PERFORMANCE_TIMING_END() \ - if( pIntercept->config().HostPerformanceTiming || \ - pIntercept->config().ChromeCallLogging ) \ + if( doHostPerformanceTiming ) \ { \ cpuEnd = CLIntercept::clock::now(); \ - if( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) )\ + if( doHostPerformanceTimingStats ) \ { \
pIntercept->updateHostTimingStats( \ __FUNCTION__, \ @@ -3228,12 +3224,10 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( } #define HOST_PERFORMANCE_TIMING_END_WITH_TAG() \ - if( pIntercept->config().HostPerformanceTiming || \ - pIntercept->config().ChromeCallLogging ) \ + if( doHostPerformanceTiming ) \ { \ cpuEnd = CLIntercept::clock::now(); \ - if( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) )\ + if( doHostPerformanceTimingStats ) \ { \ pIntercept->updateHostTimingStats( \ __FUNCTION__, \ @@ -3245,21 +3239,22 @@ inline bool CLIntercept::checkHostPerformanceTimingEnqueueLimits( #define TOOL_OVERHEAD_TIMING_START() \ CLIntercept::clock::time_point toolStart, toolEnd; \ - if( pIntercept->config().ToolOverheadTiming && \ - ( pIntercept->config().HostPerformanceTiming || \ - pIntercept->config().ChromeCallLogging ) ) \ + bool doToolOverheadTimingStats = /* gates updateHostTimingStats: enqueue limits and conditional timing apply even when ChromeCallLogging is set */ \ + pIntercept->config().ToolOverheadTiming && \ + pIntercept->config().HostPerformanceTiming && \ + pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) &&\ + pIntercept->checkConditionalTiming(); \ + bool doToolOverheadTiming = doToolOverheadTimingStats || ( pIntercept->config().ToolOverheadTiming && pIntercept->config().ChromeCallLogging ); \ + if( doToolOverheadTiming ) \ { \ toolStart = CLIntercept::clock::now(); \ } #define TOOL_OVERHEAD_TIMING_END( _tag ) \ - if( pIntercept->config().ToolOverheadTiming && \ - ( pIntercept->config().HostPerformanceTiming || \ - pIntercept->config().ChromeCallLogging ) ) \ + if( doToolOverheadTiming ) \ { \ toolEnd = CLIntercept::clock::now(); \ - if( pIntercept->config().HostPerformanceTiming && \ - pIntercept->checkHostPerformanceTimingEnqueueLimits( enqueueCounter ) )\ + if( doToolOverheadTimingStats ) \ { \ pIntercept->updateHostTimingStats( \ _tag, \ @@ -3336,11 +3331,14 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( CLIntercept::clock::time_point queuedTime; \ cl_event local_event = NULL; \ bool
isLocalEvent = false; \ - if( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) \ - /* TODO: checkDevicePerformanceTimingEnqueueLimits? */ \ + bool doDevicePerformanceTiming = \ + ( pIntercept->config().DevicePerformanceTiming || \ + pIntercept->config().ITTPerformanceTiming || \ + pIntercept->config().ChromePerformanceTiming || \ + pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ + pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) &&\ + pIntercept->checkConditionalTiming(); \ + if( doDevicePerformanceTiming ) \ { \ queuedTime = CLIntercept::clock::now(); \ if( pEvent == NULL ) \ @@ -3351,14 +3349,9 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } #define DEVICE_PERFORMANCE_TIMING_END( queue, pEvent ) \ - if( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - ( pEvent != NULL ) ) \ + if( doDevicePerformanceTiming && ( pEvent != NULL ) ) \ { \ - if( pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) &&\ - !pIntercept->config().DevicePerformanceTimingKernelsOnly && \ + if( !pIntercept->config().DevicePerformanceTimingKernelsOnly && \ ( !pIntercept->config().DevicePerformanceTimingSkipUnmap || \ std::string(__FUNCTION__) != "clEnqueueUnmapMemObject" ) ) \ { \ @@ -3380,14 +3373,9 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } #define DEVICE_PERFORMANCE_TIMING_END_WITH_TAG( queue, pEvent ) \ - if( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - ( pEvent != NULL ) ) \ + 
if( doDevicePerformanceTiming && ( pEvent != NULL ) ) \ { \ - if( pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) &&\ - !pIntercept->config().DevicePerformanceTimingKernelsOnly && \ + if( !pIntercept->config().DevicePerformanceTimingKernelsOnly && \ ( !pIntercept->config().DevicePerformanceTimingSkipUnmap || \ std::string(__FUNCTION__) != "clEnqueueUnmapMemObject" ) ) \ { \ @@ -3409,24 +3397,17 @@ inline bool CLIntercept::checkDevicePerformanceTimingEnqueueLimits( } #define DEVICE_PERFORMANCE_TIMING_END_KERNEL( queue, pEvent ) \ - if( ( pIntercept->config().DevicePerformanceTiming || \ - pIntercept->config().ITTPerformanceTiming || \ - pIntercept->config().ChromePerformanceTiming || \ - pIntercept->config().DevicePerfCounterEventBasedSampling ) && \ - ( pEvent != NULL ) ) \ + if( doDevicePerformanceTiming && ( pEvent != NULL ) ) \ { \ - if( pIntercept->checkDevicePerformanceTimingEnqueueLimits( enqueueCounter ) )\ - { \ - /*TOOL_OVERHEAD_TIMING_START();*/ \ - pIntercept->addTimingEvent( \ - __FUNCTION__, \ - enqueueCounter, \ - queuedTime, \ - deviceTag, \ - queue, \ - pEvent[0] ); \ - /*TOOL_OVERHEAD_TIMING_END( "(timing event overhead)" );*/ \ - } \ + /*TOOL_OVERHEAD_TIMING_START();*/ \ + pIntercept->addTimingEvent( \ + __FUNCTION__, \ + enqueueCounter, \ + queuedTime, \ + deviceTag, \ + queue, \ + pEvent[0] ); \ + /*TOOL_OVERHEAD_TIMING_END( "(timing event overhead)" );*/ \ if( isLocalEvent ) \ { \ pIntercept->dispatch().clReleaseEvent( pEvent[0] ); \