@@ -34,7 +34,7 @@ FLAGS_DEFINE_uint64(npu_profiling_dtypes,
3434 ACL_PROF_HCCL_TRACE | ACL_PROF_RUNTIME_API,
3535 " ACL datatypes to profile" );
3636FLAGS_DEFINE_uint64 (npu_profiling_metrics,
37- static_cast <uint64_t >(ACL_AICORE_ARITHMETIC_UTILIZATION ),
37+ static_cast <uint64_t >(ACL_AICORE_PIPE_UTILIZATION ),
3838 "AI Core metric to profile");
3939
4040FLAGS_DEFINE_bool (set_to_1d, true , " set_to_1d" );
@@ -199,7 +199,10 @@ aclrtStream SecondaryStream::Get(aclrtStream aicore_stream) {
199199void SecondaryStream::Create (aclrtStream aicore_stream) {
200200 RUN_CHECK (aicpu_streams.find (aicore_stream) == aicpu_streams.cend ());
201201 aclrtStream aicpu_stream;
202- ACL_CHECK (aclrtCreateStream (&aicpu_stream));
202+ ACL_CHECK (aclrtCreateStreamWithConfig (
203+ reinterpret_cast <aclrtStream *>(&aicpu_stream),
204+ 0 ,
205+ (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC)));
203206 aicpu_streams[aicore_stream] = aicpu_stream;
204207}
205208
@@ -597,7 +600,10 @@ C_Status HostDeallocate(const C_Device device, void *ptr, size_t size) {
597600}
598601
599602C_Status CreateStream (const C_Device device, C_Stream *stream) {
600- ACL_CHECK (aclrtCreateStream (reinterpret_cast <aclrtStream *>(stream)));
603+ ACL_CHECK (aclrtCreateStreamWithConfig (
604+ reinterpret_cast <aclrtStream *>(stream),
605+ 0 ,
606+ (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC)));
601607 LOG_IF (INFO, FLAGS_npu_runtime_debug)
602608 << " [RUNTIME] CreateStream: device=" << device->id
603609 << " , stream=" << *stream;
0 commit comments