From 0e2748f5c9319fb357c6c2b9b6b3994a0b1d8e60 Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Wed, 19 Nov 2025 14:17:04 -0500 Subject: [PATCH 1/7] Replace cpp macros with fypp macros --- CMakeLists.txt | 24 +- src/common/include/omp_macros.fpp | 66 +--- src/common/include/parallel_macros.fpp | 211 +++++------ src/common/include/shared_parallel_macros.fpp | 10 + src/common/m_nvtx.f90 | 12 +- src/simulation/m_body_forces.fpp | 2 - src/simulation/m_fftw.fpp | 330 +++++++++--------- src/simulation/m_global_parameters.fpp | 2 - src/simulation/m_igr.fpp | 16 +- src/simulation/m_muscl.fpp | 4 +- src/simulation/m_rhs.fpp | 16 +- src/simulation/m_time_steppers.fpp | 18 +- src/simulation/m_weno.fpp | 2 - src/syscheck/syscheck.fpp | 36 +- 14 files changed, 354 insertions(+), 395 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d74602f7f4..cb86d881dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -310,7 +310,20 @@ endif() # and generate documentation. Instead, we can simply include the list of .f90 # files that will eventually be used to compile . -macro(HANDLE_SOURCES target useCommon) +macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP) + + if (${useOpenACC} AND ${useOpenMP}) + message(FATAL_ERROR "OpenACC and OpenMP at same time is unsupported.") + elseif (${useOpenACC}) + message(STATUS "OpenACC set: ${useOpenACC}") + set(MFC_GPU_MODE "OpenACC") + elseif (${useOpenMP}) + message(STATUS "OpenMP set: ${useOpenMP}") + set(MFC_GPU_MODE "OpenMP") + else() + set(MFC_GPU_MODE "") + endif() + set(${target}_DIR "${CMAKE_SOURCE_DIR}/src/${target}") set(common_DIR "${CMAKE_SOURCE_DIR}/src/common") @@ -372,6 +385,7 @@ macro(HANDLE_SOURCES target useCommon) -D MFC_${${target}_UPPER} -D MFC_COMPILER="${CMAKE_Fortran_COMPILER_ID}" -D MFC_CASE_OPTIMIZATION=False + -D MFC_GPU_MODE="${MFC_GPU_MODE}" -D chemistry=False --line-numbering --no-folding @@ -388,10 +402,10 @@ macro(HANDLE_SOURCES target useCommon) endmacro() -HANDLE_SOURCES(pre_process ON) -HANDLE_SOURCES(simulation ON) -HANDLE_SOURCES(post_process ON) -HANDLE_SOURCES(syscheck OFF) +HANDLE_SOURCES(pre_process ON OFF OFF) +HANDLE_SOURCES(simulation ON ${MFC_OpenACC} ${MFC_OpenMP}) +HANDLE_SOURCES(post_process ON OFF OFF) +HANDLE_SOURCES(syscheck OFF ${MFC_OpenACC} ${MFC_OpenMP}) # MFC_SETUP_TARGET: Given a target (herein ), this macro creates a new diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp index 2b7606d03f..a5a2fea718 100644 --- a/src/common/include/omp_macros.fpp +++ b/src/common/include/omp_macros.fpp @@ -1,11 +1,5 @@ #:include 'shared_parallel_macros.fpp' -#:set NVIDIA_COMPILER_ID="NVHPC" -#:set PGI_COMPILER_ID="PGI" -#:set INTEL_COMPILER_ID="Intel" -#:set CCE_COMPILER_ID="Cray" -#:set AMD_COMPILER_ID="LLVMFlang" - #:def OMP_MAP_STR(map_type, var_list) #:assert map_type is not None #:assert isinstance(map_type, str) @@ -23,11 +17,11 @@ #:assert isinstance(default, str) #:assert (default == 'present' or default == 'none') #:if default == 'present' - #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:if USING_NVHPC #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) ' - #:elif MFC_COMPILER == CCE_COMPILER_ID + #:elif USING_CCE #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(present:allocatable) defaultmap(present:pointer) ' - #:elif MFC_COMPILER == AMD_COMPILER_ID + #:elif USING_AMD #:set default_val = '' #:else #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) ' @@ -176,13 +170,13 @@ & deviceptr_val.strip('\n') + attach_val.strip('\n') #! Hardcoding the parallelism for now - #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:if USING_NVHPC #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' #:set omp_end_directive = '!$omp end target teams loop' - #:elif MFC_COMPILER == CCE_COMPILER_ID + #:elif USING_CCE #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) ' #:set omp_end_directive = '!$omp end target teams distribute parallel do simd' - #:elif MFC_COMPILER == AMD_COMPILER_ID + #:elif USING_AMD #:set omp_start_directive = '!$omp target teams distribute parallel do ' #:set omp_end_directive = '!$omp end target teams distribute parallel do' #:else @@ -223,11 +217,11 @@ & deviceptr_val.strip('\n') + attach_val.strip('\n') #! Hardcoding the parallelism for now - #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:if USING_NVHPC #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' - #:elif MFC_COMPILER == CCE_COMPILER_ID + #:elif USING_CCE #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) ' - #:elif MFC_COMPILER == AMD_COMPILER_ID + #:elif USING_AMD #:set omp_start_directive = '!$omp target teams distribute parallel do ' #:else #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' @@ -239,11 +233,11 @@ #:def END_OMP_PARALLEL_LOOP() - #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:if USING_NVHPC #:set omp_end_directive = '!$omp end target teams loop' - #:elif MFC_COMPILER == CCE_COMPILER_ID + #:elif USING_CCE #:set omp_end_directive = '!$omp end target teams distribute parallel do simd' - #:elif MFC_COMPILER == AMD_COMPILER_ID + #:elif USING_AMD #:set omp_end_directive = '!$omp end target teams distribute parallel do' #:else #:set omp_end_directive = '!$omp end target teams loop' @@ -266,7 +260,7 @@ #:set function_name_val = '' #:endif - #:if MFC_COMPILER == AMD_COMPILER_ID + #:if USING_AMD #:set clause_val = '' #:else #:set clause_val = nohost_val.strip('\n') @@ -290,9 +284,9 @@ #! Not fully implemented yet (ignores most args right now) #:def OMP_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraOmpArgs=None) - #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:if USING_NVHPC #:set omp_directive = '!$omp loop bind(thread)' - #:elif MFC_COMPILER == CCE_COMPILER_ID or MFC_COMPILER == AMD_COMPILER_ID + #:elif USING_CCE or USING_AMD #:set omp_directive = '' #:else #:set omp_directive = '' @@ -390,34 +384,4 @@ #:set omp_directive = '!$omp barrier ' + clause_val + extraOmpArgs_val.strip('\n') $:omp_directive #:enddef - -#:def UNDEF_AMD(code) - #:if MFC_COMPILER != AMD_COMPILER_ID - $:code - #:endif -#:enddef - -#:def DEF_AMD(code) - #:if MFC_COMPILER == AMD_COMPILER_ID - $:code - #:endif -#:enddef - -#:def UNDEF_CCE(code) - #:if MFC_COMPILER != CCE_COMPILER_ID - $:code - #:endif -#:enddef - -#:def DEF_CCE(code) - #:if MFC_COMPILER == CCE_COMPILER_ID - $:code - #:endif -#:enddef - -#:def UNDEF_NVIDIA(code) - #:if MFC_COMPILER != NVIDIA_COMPILER_ID and MFC_COMPILER != PGI_COMPILER_ID - $:code - #:endif -#:enddef ! New line at end of file is required for FYPP diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index e5dc5605e9..1bd68c1f89 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -2,6 +2,12 @@ #:include 'omp_macros.fpp' #:include 'acc_macros.fpp' +#:set OpenMP_MODE = "OpenMP" +#:set OpenACC_MODE = "OpenACC" +#:set MFC_OpenMP = (MFC_GPU_MODE == OpenMP_MODE) +#:set MFC_OpenACC = (MFC_GPU_MODE == OpenACC_MODE) +#:set MFC_GPU = (MFC_OpenMP or MFC_OpenACC) + #:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None, extraOmpArgs=None) @@ -9,13 +15,13 @@ #:set acc_code = ACC_PARALLEL(code, private, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraAccArgs) #:set omp_code = OMP_PARALLEL(code, private, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#else - $:code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:else + $:code + #:endif #:enddef @@ -27,13 +33,13 @@ #:set acc_code = ACC_PARALLEL_LOOP_OLD(code, collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraAccArgs) #:set omp_code = OMP_PARALLEL_LOOP_OLD(code, collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#else - $:code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:else + $:code + #:endif #:enddef #:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', & @@ -44,11 +50,11 @@ #:set acc_directive = ACC_PARALLEL_LOOP(collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraAccArgs) #:set omp_directive = OMP_PARALLEL_LOOP(collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_directive -#elif defined(MFC_OpenMP) - $:omp_directive -#endif + #:if MFC_OpenACC + $:acc_directive + #:elif MFC_OpenMP + $:omp_directive + #:endif #:enddef @@ -57,11 +63,11 @@ #:set acc_end_directive = '!$acc end parallel loop' #:set omp_end_directive = END_OMP_PARALLEL_LOOP() -#if defined(MFC_OpenACC) - $:acc_end_directive -#elif defined(MFC_OpenMP) - $:omp_end_directive -#endif + #:if MFC_OpenACC + $:acc_end_directive + #:elif MFC_OpenMP + $:omp_end_directive + #:endif #:enddef @@ -75,19 +81,19 @@ #:stop "When inlining for Cray Compiler, function name must be given and given as a string" #:endif #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n') -#ifdef _CRAYFTN - $:cray_directive -#elif MFC_OpenACC - $:acc_directive -#elif MFC_OpenMP - $:omp_directive -#endif + #:if USING_CCE + $:cray_directive + #:elif MFC_OpenACC + $:acc_directive + #:elif MFC_OpenMP + $:omp_directive + #:endif #:else -#if MFC_OpenACC - $:acc_directive -#elif MFC_OpenMP - $:omp_directive -#endif + #:if MFC_OpenACC + $:acc_directive + #:elif MFC_OpenMP + $:omp_directive + #:endif #:endif #:enddef @@ -99,35 +105,35 @@ #:assert copy is None #:set omp_code = OMP_DECLARE(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, link=link, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:endif #:enddef #:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_LOOP(collapse=collapse, parallelism=parallelism, data_dependency=data_dependency, reduction=reduction, reductionOp=reductionOp, private=private, extraAccArgs=extraAccArgs) #:set omp_code = OMP_LOOP(collapse=collapse, parallelism=parallelism, data_dependency=data_dependency, reduction=reduction, reductionOp=reductionOp, private=private, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:endif #:enddef #:def GPU_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_DATA(code=code, copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, no_create=no_create, present=present, deviceptr=deviceptr, attach=attach, default=default, extraAccArgs=extraAccArgs) #:set omp_code = OMP_DATA(code=code, copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, no_create=no_create, present=present, deviceptr=deviceptr, attach=attach, default=default, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#else - $:code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:else + $:code + #:endif #:enddef #:def GPU_HOST_DATA(code, use_device_addr=None, use_device_ptr=None, extraAccArgs=None, extraOmpArgs=None) @@ -149,104 +155,77 @@ #:set acc_code = ACC_HOST_DATA(code=code, use_device=use_device, extraAccArgs=extraAccArgs) #:set omp_code = OMP_HOST_DATA(code=code, use_device_addr=use_device_addr, use_device_ptr=use_device_ptr, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#else - $:code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:else + $:code + #:endif #:enddef #:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_ENTER_DATA(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, attach=attach, extraAccArgs=extraAccArgs) #:set omp_code = OMP_ENTER_DATA(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, attach=attach, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:endif #:enddef #:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_EXIT_DATA(copyout=copyout, delete=delete, detach=detach, extraAccArgs=extraAccArgs) #:set omp_code = OMP_EXIT_DATA(copyout=copyout, delete=delete, detach=detach, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:endif #:enddef #:def GPU_ATOMIC(atomic, extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_ATOMIC(atomic=atomic, extraAccArgs=extraAccArgs) #:set omp_code = OMP_ATOMIC(atomic=atomic, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:endif #:enddef #:def GPU_UPDATE(host=None, device=None, extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_UPDATE(host=host, device=device, extraAccArgs=extraAccArgs) #:set omp_code = OMP_UPDATE(host=host, device=device, extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code + #:endif #:enddef #:def GPU_WAIT(extraAccArgs=None, extraOmpArgs=None) #:set acc_code = ACC_WAIT(extraAccArgs=extraAccArgs) #:set omp_code = OMP_WAIT(extraOmpArgs=extraOmpArgs) -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#endif -#:enddef - -#:def USE_GPU_MODULE() - -#if defined(MFC_OpenACC) - use openacc -#elif defined(MFC_OpenMP) - use omp_lib -#endif - -#:enddef - -#:def DEF_AMD(code) - #:if MFC_COMPILER == AMD_COMPILER_ID - $:code + #:if MFC_OpenACC + $:acc_code + #:elif MFC_OpenMP + $:omp_code #:endif #:enddef -#:def UNDEF_CCE(code) - #:if MFC_COMPILER != CCE_COMPILER_ID - $:code - #:endif -#:enddef +#:def USE_GPU_MODULE() -#:def DEF_CCE(code) - #:if MFC_COMPILER == CCE_COMPILER_ID - $:code + #:if MFC_OpenACC + use openacc + #:elif MFC_OpenMP + use omp_lib #:endif -#:enddef -#:def UNDEF_NVIDIA(code) - #:if MFC_COMPILER != NVIDIA_COMPILER_ID and MFC_COMPILER != PGI_COMPILER_ID - $:code - #:endif #:enddef - -#:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID) -#:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID) ! New line at end of file is required for FYPP diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp index a3a0b6f753..36bee0a23a 100644 --- a/src/common/include/shared_parallel_macros.fpp +++ b/src/common/include/shared_parallel_macros.fpp @@ -1,3 +1,13 @@ +#:set NVIDIA_COMPILER_ID="NVHPC" +#:set PGI_COMPILER_ID="PGI" +#:set INTEL_COMPILER_ID="Intel" +#:set CCE_COMPILER_ID="Cray" +#:set AMD_COMPILER_ID="LLVMFlang" + +#:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID) +#:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID) +#:set USING_AMD = (MFC_COMPILER == AMD_COMPILER_ID) + #:def ASSERT_LIST(data, datatype) #:assert data is not None #:assert isinstance(data, list) diff --git a/src/common/m_nvtx.f90 b/src/common/m_nvtx.f90 index 524bf77781..ee8827f736 100644 --- a/src/common/m_nvtx.f90 +++ b/src/common/m_nvtx.f90 @@ -25,7 +25,7 @@ module m_nvtx type(c_ptr) :: message ! ascii char end type nvtxEventAttributes -#if defined(MFC_GPU) && defined(__PGI) +#:if MFC_GPU and USING_NVHPC interface nvtxRangePush ! push range with custom label and standard color @@ -49,7 +49,7 @@ subroutine nvtxRangePop() bind(C, name='nvtxRangePop') end subroutine nvtxRangePop end interface nvtxRangePop -#endif +#:endif contains @@ -58,7 +58,7 @@ subroutine nvtxStartRange(name, id) integer, intent(IN), optional :: id type(nvtxEventAttributes) :: event -#if defined(MFC_GPU) && defined(__PGI) +#:if MFC_GPU and USING_NVHPC tempName = trim(name)//c_null_char @@ -70,13 +70,13 @@ subroutine nvtxStartRange(name, id) call nvtxRangePushEx(event) end if -#endif +#:endif end subroutine nvtxStartRange subroutine nvtxEndRange -#if defined(MFC_GPU) && defined(__PGI) +#:if MFC_GPU and USING_NVHPC call nvtxRangePop -#endif +#:endif end subroutine nvtxEndRange end module m_nvtx diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp index cdf35fd366..61c79edfe0 100644 --- a/src/simulation/m_body_forces.fpp +++ b/src/simulation/m_body_forces.fpp @@ -10,8 +10,6 @@ module m_body_forces use m_nvtx -! $:USE_GPU_MODULE() - implicit none private; diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index 411485af10..b7f979d500 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -14,13 +14,13 @@ module m_fftw use m_mpi_proxy !< Message passing interface (MPI) module proxy -#if defined(MFC_GPU) && defined(__PGI) - use cufft -#elif defined(MFC_GPU) - use hipfort - use hipfort_check - use hipfort_hipfft -#endif + #:if MFC_GPU and USING_NVHPC + use cufft + #:elif MFC_GPU + use hipfort + use hipfort_check + use hipfort_hipfft + #:endif implicit none @@ -28,9 +28,9 @@ module m_fftw s_apply_fourier_filter, & s_finalize_fftw_module -#if !defined(MFC_GPU) - include 'fftw3.f03' -#endif + #:if not MFC_GPU + include 'fftw3.f03' + #:endif type(c_ptr) :: fwd_plan, bwd_plan type(c_ptr) :: fftw_real_data, fftw_cmplx_data, fftw_fltr_cmplx_data @@ -44,24 +44,24 @@ module m_fftw complex(c_double_complex), pointer :: data_fltr_cmplx(:) !< !! Filtered complex data in Fourier space -#if defined(MFC_GPU) - $:GPU_DECLARE(create='[real_size,cmplx_size,x_size,batch_size,Nfq]') + #:if MFC_GPU + $:GPU_DECLARE(create='[real_size,cmplx_size,x_size,batch_size,Nfq]') - real(dp), allocatable, target :: data_real_gpu(:) - complex(dp), allocatable, target :: data_cmplx_gpu(:) - complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:) - $:GPU_DECLARE(create='[data_real_gpu,data_cmplx_gpu,data_fltr_cmplx_gpu]') + real(dp), allocatable, target :: data_real_gpu(:) + complex(dp), allocatable, target :: data_cmplx_gpu(:) + complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:) + $:GPU_DECLARE(create='[data_real_gpu,data_cmplx_gpu,data_fltr_cmplx_gpu]') -#if defined(__PGI) - integer :: fwd_plan_gpu, bwd_plan_gpu -#else - type(c_ptr) :: fwd_plan_gpu, bwd_plan_gpu -#endif + #:if USING_NVHPC + integer :: fwd_plan_gpu, bwd_plan_gpu + #:else + type(c_ptr) :: fwd_plan_gpu, bwd_plan_gpu + #:endif - integer, allocatable :: gpu_fft_size(:), iembed(:), oembed(:) + integer, allocatable :: gpu_fft_size(:), iembed(:), oembed(:) - integer :: istride, ostride, idist, odist, rank -#endif + integer :: istride, ostride, idist, odist, rank + #:endif contains @@ -80,45 +80,45 @@ contains x_size = m + 1 batch_size = x_size*sys_size -#if defined(MFC_GPU) - rank = 1; istride = 1; ostride = 1 - - allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank)) - - gpu_fft_size(1) = real_size; - iembed(1) = 0 - oembed(1) = 0 - $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]') - $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]') -#else - ! Allocate input and output DFT data sizes - fftw_real_data = fftw_alloc_real(int(real_size, c_size_t)) - fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) - fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) - ! Associate input and output data pointers with allocated memory - call c_f_pointer(fftw_real_data, data_real, [real_size]) - call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size]) - call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size]) - - ! Generate plans for forward and backward DFTs - fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE) - bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE) -#endif - -#if defined(MFC_GPU) - @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size)) - @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) - @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) - -#if defined(__PGI) - ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size) - ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size) -#else - ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size) - ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size) -#endif - -#endif + #:if MFC_GPU + rank = 1; istride = 1; ostride = 1 + + allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank)) + + gpu_fft_size(1) = real_size; + iembed(1) = 0 + oembed(1) = 0 + $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]') + $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]') + #:else + ! Allocate input and output DFT data sizes + fftw_real_data = fftw_alloc_real(int(real_size, c_size_t)) + fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) + fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t)) + ! Associate input and output data pointers with allocated memory + call c_f_pointer(fftw_real_data, data_real, [real_size]) + call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size]) + call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size]) + + ! Generate plans for forward and backward DFTs + fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE) + bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE) + #:endif + + #:if MFC_GPU + @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size)) + @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) + @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) + + #:if USING_NVHPC + ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size) + ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size) + #:else + ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size) + ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size) + #:endif + + #:endif end subroutine s_initialize_fftw_module @@ -135,70 +135,7 @@ contains ! Restrict filter to processors that have cells adjacent to axis if (bc_y%beg >= 0) return -#if defined(MFC_GPU) - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 1, cmplx_size - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l) - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - - #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') -#if defined(__PGI) - ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) -#else - ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) - call hipCheck(hipDeviceSynchronize()) -#endif - #:endcall GPU_HOST_DATA - Nfq = 3 - $:GPU_UPDATE(device='[Nfq]') - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 1, Nfq - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - - #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') -#if defined(__PGI) - ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) -#else - ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) - call hipCheck(hipDeviceSynchronize()) -#endif - #:endcall GPU_HOST_DATA - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) - q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) - end do - end do - end do - $:END_GPU_PARALLEL_LOOP() - - do i = 1, fourier_rings + #:if MFC_GPU $:GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size @@ -210,26 +147,25 @@ contains end do $:END_GPU_PARALLEL_LOOP() - $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + $:GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l) + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l) end do end do end do $:END_GPU_PARALLEL_LOOP() #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') -#if defined(__PGI) - ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) -#else - ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) - call hipCheck(hipDeviceSynchronize()) -#endif + #:if USING_NVHPC + ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) + #:else + ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) + call hipCheck(hipDeviceSynchronize()) + #:endif #:endcall GPU_HOST_DATA - - Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + Nfq = 3 $:GPU_UPDATE(device='[Nfq]') $:GPU_PARALLEL_LOOP(collapse=3) @@ -243,56 +179,120 @@ contains $:END_GPU_PARALLEL_LOOP() #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') -#if defined(__PGI) - ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) -#else - ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) - call hipCheck(hipDeviceSynchronize()) -#endif + #:if USING_NVHPC + ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) + #:else + ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) + call hipCheck(hipDeviceSynchronize()) + #:endif #:endcall GPU_HOST_DATA - $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + $:GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m do l = 0, p data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) - q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) end do end do end do $:END_GPU_PARALLEL_LOOP() - end do -#else - Nfq = 3 - do j = 0, m - do k = 1, sys_size - data_fltr_cmplx(:) = (0_dp, 0_dp) - data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p) - call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx) - data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq) - call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real) - data_real(:) = data_real(:)/real(real_size, dp) - q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1) + do i = 1, fourier_rings + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, cmplx_size + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') + #:if USING_NVHPC + ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) + #:else + ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) + call hipCheck(hipDeviceSynchronize()) + #:endif + #:endcall GPU_HOST_DATA + + Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + $:GPU_UPDATE(device='[Nfq]') + + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]') + #:if USING_NVHPC + ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) + #:else + ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) + call hipCheck(hipDeviceSynchronize()) + #:endif + #:endcall GPU_HOST_DATA + + $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() end do - end do - ! Apply Fourier filter to additional rings - do i = 1, fourier_rings - Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + #:else + Nfq = 3 do j = 0, m do k = 1, sys_size data_fltr_cmplx(:) = (0_dp, 0_dp) - data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p) + data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p) call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx) data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq) call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real) data_real(:) = data_real(:)/real(real_size, dp) - q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1) + q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1) end do end do - end do -#endif + + ! Apply Fourier filter to additional rings + do i = 1, fourier_rings + Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + do j = 0, m + do k = 1, sys_size + data_fltr_cmplx(:) = (0_dp, 0_dp) + data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p) + call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx) + data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq) + call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real) + data_real(:) = data_real(:)/real(real_size, dp) + q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1) + end do + end do + end do + #:endif end subroutine s_apply_fourier_filter diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 31230ad40b..908846dc8d 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -21,8 +21,6 @@ module m_global_parameters use m_helper_basic !< Functions to compare floating point numbers - ! $:USE_GPU_MODULE() - implicit none real(wp) :: wall_time = 0 diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 60555b554b..2039afb8e3 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -60,7 +60,7 @@ module m_igr integer, parameter :: offxL = 2 integer, parameter :: offxR = 3 -#if defined(MFC_OpenMP) +#:if MFC_OpenMP real(wp) :: coeff_L(1:5) = [ & -3._wp/60._wp, & ! Index -1 27._wp/60._wp, & ! Index 0 @@ -76,7 +76,7 @@ module m_igr 27._wp/60._wp, & ! Index 1 -3._wp/60._wp & ! Index 2 ] -#else +#:else real(wp), parameter :: coeff_L(1:5) = [ & -3._wp/60._wp, & ! Index -1 27._wp/60._wp, & ! Index 0 @@ -92,14 +92,14 @@ module m_igr 27._wp/60._wp, & ! Index 1 -3._wp/60._wp & ! Index 2 ] -#endif +#:endif #:elif igr_order == 3 integer, parameter :: vidxb = -1 integer, parameter :: vidxe = 2 integer, parameter :: offxL = 1 integer, parameter :: offxR = 2 -#if defined(MFC_OpenMP) +#:if MFC_OpenMP real(wp) :: coeff_L(1:3) = [ & 2._wp/6._wp, & ! Index 0 5._wp/6._wp, & ! Index 1 @@ -110,7 +110,7 @@ module m_igr 5._wp/6._wp, & ! Index 0 2._wp/6._wp & ! Index 1 ] -#else +#:else real(wp), parameter :: coeff_L(1:3) = [ & 2._wp/6._wp, & ! Index 0 5._wp/6._wp, & ! Index 1 @@ -121,12 +121,12 @@ module m_igr 5._wp/6._wp, & ! Index 0 2._wp/6._wp & ! Index 1 ] -#endif +#:endif #:endif -#if defined(MFC_OpenMP) +#:if MFC_OpenMP $:GPU_DECLARE(create='[coeff_L, coeff_R]') -#endif +#:endif #:endif integer(kind=8) :: i, j, k, l, q, r diff --git a/src/simulation/m_muscl.fpp b/src/simulation/m_muscl.fpp index c9d5b4bff0..17fca4cf0f 100644 --- a/src/simulation/m_muscl.fpp +++ b/src/simulation/m_muscl.fpp @@ -7,9 +7,7 @@ module m_muscl use m_variables_conversion !< State variables type conversion procedures -#ifdef MFC_OpenACC - use openacc -#endif + $:USE_GPU_MODULE() use m_mpi_proxy diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index 47fa433ca9..4da30a3790 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -106,10 +106,10 @@ module m_rhs !> @{ type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n -#if defined(MFC_OpenACC) +#:if MFC_OpenACC $:GPU_DECLARE(create='[dqL_prim_dx_n,dqL_prim_dy_n,dqL_prim_dz_n]') $:GPU_DECLARE(create='[dqR_prim_dx_n,dqR_prim_dy_n,dqR_prim_dz_n]') -#endif +#:endif !> @} type(scalar_field), allocatable, dimension(:) :: tau_Re_vf @@ -127,9 +127,9 @@ module m_rhs !> @{ type(vector_field), allocatable, dimension(:) :: gm_alphaL_n type(vector_field), allocatable, dimension(:) :: gm_alphaR_n -#if defined(MFC_OpenACC) +#:if MFC_OpenACC $:GPU_DECLARE(create='[gm_alphaL_n,gm_alphaR_n]') -#endif +#:endif !> @} !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical @@ -140,16 +140,16 @@ module m_rhs type(vector_field), allocatable, dimension(:) :: flux_src_n type(vector_field), allocatable, dimension(:) :: flux_gsrc_n -#if defined(MFC_OpenACC) +#:if MFC_OpenACC $:GPU_DECLARE(create='[flux_n,flux_src_n,flux_gsrc_n]') -#endif +#:endif !> @} type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim -#if defined(MFC_OpenACC) +#:if MFC_OpenACC $:GPU_DECLARE(create='[qL_prim,qR_prim]') -#endif +#:endif type(int_bounds_info) :: iv !< Vector field indical bounds $:GPU_DECLARE(create='[iv]') diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 8a852bb40b..b98e944916 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -100,9 +100,9 @@ contains use hipfort use hipfort_hipmalloc use hipfort_check -#if defined(MFC_OpenACC) - use openacc -#endif + #:if MFC_OpenACC + use openacc + #:endif #endif integer :: i, j !< Generic loop iterators @@ -181,17 +181,17 @@ contains ! Doing hipMalloc then mapping should be most performant call hipCheck(hipMalloc(q_cons_ts_pool_device, dims8=pool_dims, lbounds8=pool_starts)) ! Without this map CCE will still create a device copy, because it's silly like that -#if defined(MFC_OpenACC) - call acc_map_data(q_cons_ts_pool_device, c_loc(q_cons_ts_pool_device), c_sizeof(q_cons_ts_pool_device)) -#endif + #:if MFC_OpenACC + call acc_map_data(q_cons_ts_pool_device, c_loc(q_cons_ts_pool_device), c_sizeof(q_cons_ts_pool_device)) + #:endif ! CCE see it can access this and will leave it on the host. It will stay on the host so long as HSA_XNACK=1 ! NOTE: WE CANNOT DO ATOMICS INTO THIS MEMORY. We have to change a property to use atomics here ! Otherwise leaving this as fine-grained will actually help performance since it can't be cached in GPU L2 if (num_ts == 2) then call hipCheck(hipMallocManaged(q_cons_ts_pool_host, dims8=pool_dims, lbounds8=pool_starts, flags=hipMemAttachGlobal)) -#if defined(MFC_OpenMP) - call hipCheck(hipMemAdvise(c_loc(q_cons_ts_pool_host), c_sizeof(q_cons_ts_pool_host), hipMemAdviseSetPreferredLocation, -1)) -#endif + #:if MFC_OpenMP + call hipCheck(hipMemAdvise(c_loc(q_cons_ts_pool_host), c_sizeof(q_cons_ts_pool_host), hipMemAdviseSetPreferredLocation, -1)) + #:endif end if #endif diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index 3422dab8ec..6ef8eeaf6c 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -25,8 +25,6 @@ module m_weno use m_variables_conversion !< State variables type conversion procedures - ! $:USE_GPU_MODULE() - use m_mpi_proxy use m_muscl !< For Interface Compression diff --git a/src/syscheck/syscheck.fpp b/src/syscheck/syscheck.fpp index 75e18efc33..3052c7aec3 100644 --- a/src/syscheck/syscheck.fpp +++ b/src/syscheck/syscheck.fpp @@ -22,21 +22,21 @@ #:enddef MPIC #:def ACCC(*args) -#ifdef MFC_OpenACC - @:LOG("[TEST] ACC: ${','.join([ x.replace("'", '') for x in args ])}$") - ${','.join([ x.replace("'", '') for x in args ])}$ -#else - @:LOG("[SKIP] ACC: ${','.join([ x.replace("'", '') for x in args ])}$") -#endif + #:if MFC_OpenACC + @:LOG("[TEST] ACC: ${','.join([ x.replace("'", '') for x in args ])}$") + ${','.join([ x.replace("'", '') for x in args ])}$ + #:else + @:LOG("[SKIP] ACC: ${','.join([ x.replace("'", '') for x in args ])}$") + #:endif #:enddef ACCC #:def OMPC(*args) -#ifdef MFC_OpenMP - @:LOG("[TEST] OMP: ${','.join([ x.replace("'", '') for x in args ])}$") - ${','.join([ x.replace("'", '') for x in args ])}$ -#else - @:LOG("[SKIP] OMP: ${','.join([ x.replace("'", '') for x in args ])}$") -#endif + #:if MFC_OpenMP + @:LOG("[TEST] OMP: ${','.join([ x.replace("'", '') for x in args ])}$") + ${','.join([ x.replace("'", '') for x in args ])}$ + #:else + @:LOG("[SKIP] OMP: ${','.join([ x.replace("'", '') for x in args ])}$") + #:endif #:enddef OMPC #:def MPI(*args) @@ -46,15 +46,15 @@ #:enddef MPI #:def ACC(*args) -#ifdef MFC_OpenACC - ${','.join([ x.replace("'", '') for x in args ])}$ -#endif + #:if MFC_OpenACC + ${','.join([ x.replace("'", '') for x in args ])}$ + #:endif #:enddef ACC #:def OMP(*args) -#ifdef MFC_OpenMP - ${','.join([ x.replace("'", '') for x in args ])}$ -#endif + #:if MFC_OpenMP + ${','.join([ x.replace("'", '') for x in args ])}$ + #:endif #:enddef OMP program syscheck From d03437dbc953a9ac7cbd1f8d3c4e0323545d48aa Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Wed, 19 Nov 2025 14:59:15 -0500 Subject: [PATCH 2/7] Removed DEF_AMD and fix compile issues --- src/common/m_chemistry.fpp | 17 +++++++-------- src/simulation/m_cbc.fpp | 16 ++++++-------- src/simulation/m_checker.fpp | 4 ++-- src/simulation/m_global_parameters.fpp | 6 +++--- src/simulation/m_riemann_solvers.fpp | 4 ++-- src/simulation/m_start_up.fpp | 30 +++++++++++++------------- 6 files changed, 37 insertions(+), 40 deletions(-) diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index b43905dc7f..6605e65f32 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -19,12 +19,12 @@ module m_chemistry implicit none - #:block DEF_AMD + #:if USING_AMD real(wp) :: molecular_weights_nonparameter(10) = & (/2.016, 1.008, 15.999, 31.998, 17.007, 18.015, 33.006, & 34.014, 39.95, 28.014/) $:GPU_DECLARE(create='[molecular_weights_nonparameter]') - #:endblock DEF_AMD + #:endif type(int_bounds_info) :: isc1, isc2, isc3 $:GPU_DECLARE(create='[isc1, isc2, isc3]') @@ -146,12 +146,11 @@ contains $:GPU_LOOP(parallelism='[seq]') do eqn = chemxb, chemxe - #:block UNDEF_AMD - omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1) - #:endblock UNDEF_AMD - #:block DEF_AMD + #:if USING_AMD omega_m = molecular_weights_nonparameter(eqn - chemxb + 1)*omega(eqn - chemxb + 1) - #:endblock DEF_AMD + #:else + omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1) + #:endif rhs_vf(eqn)%sf(x, y, z) = rhs_vf(eqn)%sf(x, y, z) + omega_m end do @@ -190,7 +189,7 @@ contains ! Set offsets based on direction using array indexing offsets = 0 offsets(idir) = 1 - #:block UNDEF_AMD + #:if not USING_AMD $:GPU_PARALLEL_LOOP(collapse=3, private='[x,y,z,Ys_L, Ys_R, Ys_cell, Xs_L, Xs_R, mass_diffusivities_mixavg1, mass_diffusivities_mixavg2, mass_diffusivities_mixavg_Cell, h_l, h_r, Xs_cell, h_k, dXk_dxi,Mass_Diffu_Flux, Mass_Diffu_Energy, MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, T_L, T_R, P_L, P_R, rho_L, rho_R, rho_cell, rho_Vic, lambda_L, lambda_R, lambda_Cell, dT_dxi, grid_spacing]', copyin='[offsets]') do z = isc3%beg, isc3%end do y = isc2%beg, isc2%end @@ -299,7 +298,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - #:endblock UNDEF_AMD + #:endif end if end subroutine s_compute_chemistry_diffusion_flux diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp index 94c47f75b4..b7ebcd5736 100644 --- a/src/simulation/m_cbc.fpp +++ b/src/simulation/m_cbc.fpp @@ -37,9 +37,9 @@ module m_cbc molecular_weights, get_species_specific_heats_r, & get_mole_fractions, get_species_specific_heats_r - #:block DEF_AMD + #:if USING_AMD use m_chemistry, only: molecular_weights_nonparameter - #:endblock DEF_AMD + #:endif implicit none private; public :: s_initialize_cbc_module, s_cbc, s_finalize_cbc_module @@ -1063,15 +1063,13 @@ contains sum_Enthalpies = 0._wp $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species - #:block UNDEF_AMD + #:if USING_AMD + h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T + sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights_nonparameter(i)*Cp/R_gas)*dYs_dt(i) + #:else h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i) - #:endblock UNDEF_AMD - - #:block DEF_AMD - h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T - sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights_nonparameter(i)*Cp/R_gas)*dYs_dt(i) - #:endblock DEF_AMD + #:endif end do flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) & + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies) diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index da82e0f37b..bea6b33f7f 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -60,9 +60,9 @@ contains !> Checks constraints on compiler options impure subroutine s_check_inputs_compilers -#if !defined(MFC_OpenACC) && !(defined(__PGI) || defined(_CRAYFTN)) +#:if not MFC_OpenACC and (not (USING_NVHPC or USING_CCE)) @:PROHIBIT(rdma_mpi, "Unsupported value of rdma_mpi for the current compiler") -#endif +#:endif end subroutine s_check_inputs_compilers impure subroutine s_check_inputs_igr diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 908846dc8d..8bedb886aa 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -230,13 +230,13 @@ module m_global_parameters !> @{ type(int_bounds_info) :: bc_x, bc_y, bc_z !> @} -#if defined(MFC_OpenACC) +#:if MFC_OpenACC $:GPU_DECLARE(create='[bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3]') $:GPU_DECLARE(create='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]') $:GPU_DECLARE(create='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]') -#elif defined(MFC_OpenMP) +#:elif MFC_OpenMP $:GPU_DECLARE(create='[bc_x, bc_y, bc_z]') -#endif +#:endif type(bounds_info) :: x_domain, y_domain, z_domain real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp index 77c8d4e495..2b25eada22 100644 --- a/src/simulation/m_riemann_solvers.fpp +++ b/src/simulation/m_riemann_solvers.fpp @@ -3682,7 +3682,7 @@ contains #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (norm_dir == ${NORM_DIR}$) then - #:block UNDEF_AMD + #:if not USING_AMD $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s_L, s_R, s_M, s_starL, s_starR, pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR, sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx, vL_star, vR_star, wL_star, wR_star, v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double]', copyin='[norm_dir]') do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3856,7 +3856,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - #:endblock UNDEF_AMD + #:endif end if #:endfor diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 5e56dd1562..8bb39da1c6 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -1414,24 +1414,24 @@ contains impure subroutine s_initialize_mpi_domain integer :: ierr -#ifdef MFC_GPU +#:if MFC_GPU real(wp) :: starttime, endtime integer :: num_devices, local_size, num_nodes, ppn, my_device_num integer :: dev, devNum, local_rank #ifdef MFC_MPI integer :: local_comm #endif -#if defined(MFC_OpenACC) +#:if MFC_OpenACC integer(acc_device_kind) :: devtype -#endif -#endif +#:endif +#:endif ! Initializing MPI execution environment call s_mpi_initialize() ! Bind GPUs if OpenACC is enabled -#ifdef MFC_GPU +#:if MFC_GPU #ifndef MFC_MPI local_size = 1 local_rank = 0 @@ -1441,18 +1441,18 @@ contains call MPI_Comm_size(local_comm, local_size, ierr) call MPI_Comm_rank(local_comm, local_rank, ierr) #endif -#if defined(MFC_OpenACC) +#:if MFC_OpenACC devtype = acc_get_device_type() devNum = acc_get_num_devices(devtype) dev = mod(local_rank, devNum) call acc_set_device_num(dev, devtype) -#elif defined(MFC_OpenMP) +#:elif MFC_OpenMP devNum = omp_get_num_devices() dev = mod(local_rank, devNum) call omp_set_default_device(dev) -#endif -#endif +#:endif +#:endif ! The rank 0 processor assigns default values to the user inputs prior to ! reading them in from the input file. Next, the user inputs are read and @@ -1470,13 +1470,13 @@ contains "case-optimized", & #:endif m, n, p, num_procs, & -#if defined(MFC_OpenACC) +#:if MFC_OpenACC "with OpenACC offloading" -#elif defined(MFC_OpenMP) +#:elif MFC_OpenMP "with OpenMP offloading" -#else +#:else "on CPUs" -#endif +#:endif end if ! Broadcasting the user inputs to all of the processors and performing the @@ -1552,14 +1552,14 @@ contains #:if not MFC_CASE_OPTIMIZATION $:GPU_UPDATE(device='[igr,nb,igr_order]') #:endif - #:block DEF_AMD + #:if USING_AMD block use m_thermochem, only: molecular_weights use m_chemistry, only: molecular_weights_nonparameter molecular_weights_nonparameter(:) = molecular_weights(:) $:GPU_UPDATE(device='[molecular_weights_nonparameter]') end block - #:endblock + #:endif end subroutine s_initialize_gpu_vars From a16450fff3f47d5aed7a7b264a1bdba7b672c568 Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Wed, 19 Nov 2025 15:03:09 -0500 Subject: [PATCH 3/7] Fixed syscheck to compile --- src/syscheck/syscheck.fpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/syscheck/syscheck.fpp b/src/syscheck/syscheck.fpp index 3052c7aec3..f11bf2e6fa 100644 --- a/src/syscheck/syscheck.fpp +++ b/src/syscheck/syscheck.fpp @@ -1,3 +1,5 @@ +#:include 'macros.fpp' + #:def LOG(*args) #ifdef MFC_MPI if (rank == 0) then From 677e1831bdeb46bffa9b0c261f7a05c2a6f85480 Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Wed, 19 Nov 2025 15:17:27 -0500 Subject: [PATCH 4/7] Ran formatter --- src/common/m_nvtx.f90 | 64 +++++++------- src/simulation/m_cbc.fpp | 2 +- src/simulation/m_checker.fpp | 6 +- src/simulation/m_global_parameters.fpp | 14 +-- src/simulation/m_igr.fpp | 118 ++++++++++++------------- src/simulation/m_rhs.fpp | 26 +++--- src/simulation/m_start_up.fpp | 62 ++++++------- 7 files changed, 146 insertions(+), 146 deletions(-) diff --git a/src/common/m_nvtx.f90 b/src/common/m_nvtx.f90 index ee8827f736..e4320f26dd 100644 --- a/src/common/m_nvtx.f90 +++ b/src/common/m_nvtx.f90 @@ -25,31 +25,31 @@ module m_nvtx type(c_ptr) :: message ! ascii char end type nvtxEventAttributes -#:if MFC_GPU and USING_NVHPC + #:if MFC_GPU and USING_NVHPC - interface nvtxRangePush - ! push range with custom label and standard color - subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') - use iso_c_binding + interface nvtxRangePush + ! push range with custom label and standard color + subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') + use iso_c_binding - character(kind=c_char, len=*), intent(IN) :: name - end subroutine nvtxRangePushA + character(kind=c_char, len=*), intent(IN) :: name + end subroutine nvtxRangePushA - ! push range with custom label and custom color - subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') - use iso_c_binding + ! push range with custom label and custom color + subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') + use iso_c_binding - import :: nvtxEventAttributes - type(nvtxEventAttributes), intent(IN) :: event - end subroutine nvtxRangePushEx - end interface nvtxRangePush + import :: nvtxEventAttributes + type(nvtxEventAttributes), intent(IN) :: event + end subroutine nvtxRangePushEx + end interface nvtxRangePush - interface nvtxRangePop - subroutine nvtxRangePop() bind(C, name='nvtxRangePop') - end subroutine nvtxRangePop - end interface nvtxRangePop + interface nvtxRangePop + subroutine nvtxRangePop() bind(C, name='nvtxRangePop') + end subroutine nvtxRangePop + end interface nvtxRangePop -#:endif + #:endif contains @@ -58,25 +58,25 @@ subroutine nvtxStartRange(name, id) integer, intent(IN), optional :: id type(nvtxEventAttributes) :: event -#:if MFC_GPU and USING_NVHPC + #:if MFC_GPU and USING_NVHPC - tempName = trim(name)//c_null_char + tempName = trim(name)//c_null_char - if (.not. present(id)) then - call nvtxRangePush(tempName) - else - event%color = col(mod(id, 7) + 1) - event%message = c_loc(tempName) - call nvtxRangePushEx(event) - end if + if (.not. present(id)) then + call nvtxRangePush(tempName) + else + event%color = col(mod(id, 7) + 1) + event%message = c_loc(tempName) + call nvtxRangePushEx(event) + end if -#:endif + #:endif end subroutine nvtxStartRange subroutine nvtxEndRange -#:if MFC_GPU and USING_NVHPC - call nvtxRangePop -#:endif + #:if MFC_GPU and USING_NVHPC + call nvtxRangePop + #:endif end subroutine nvtxEndRange end module m_nvtx diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp index b7ebcd5736..7a7593d135 100644 --- a/src/simulation/m_cbc.fpp +++ b/src/simulation/m_cbc.fpp @@ -1064,7 +1064,7 @@ contains $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species #:if USING_AMD - h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T + h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights_nonparameter(i)*Cp/R_gas)*dYs_dt(i) #:else h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index bea6b33f7f..4317fa2000 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -60,9 +60,9 @@ contains !> Checks constraints on compiler options impure subroutine s_check_inputs_compilers -#:if not MFC_OpenACC and (not (USING_NVHPC or USING_CCE)) - @:PROHIBIT(rdma_mpi, "Unsupported value of rdma_mpi for the current compiler") -#:endif + #:if not MFC_OpenACC and (not (USING_NVHPC or USING_CCE)) + @:PROHIBIT(rdma_mpi, "Unsupported value of rdma_mpi for the current compiler") + #:endif end subroutine s_check_inputs_compilers impure subroutine s_check_inputs_igr diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 8bedb886aa..b6dbd5f5bd 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -230,13 +230,13 @@ module m_global_parameters !> @{ type(int_bounds_info) :: bc_x, bc_y, bc_z !> @} -#:if MFC_OpenACC - $:GPU_DECLARE(create='[bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3]') - $:GPU_DECLARE(create='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]') - $:GPU_DECLARE(create='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]') -#:elif MFC_OpenMP - $:GPU_DECLARE(create='[bc_x, bc_y, bc_z]') -#:endif + #:if MFC_OpenACC + $:GPU_DECLARE(create='[bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3]') + $:GPU_DECLARE(create='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]') + $:GPU_DECLARE(create='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]') + #:elif MFC_OpenMP + $:GPU_DECLARE(create='[bc_x, bc_y, bc_z]') + #:endif type(bounds_info) :: x_domain, y_domain, z_domain real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 2039afb8e3..6630df79c8 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -60,73 +60,73 @@ module m_igr integer, parameter :: offxL = 2 integer, parameter :: offxR = 3 -#:if MFC_OpenMP - real(wp) :: coeff_L(1:5) = [ & - -3._wp/60._wp, & ! Index -1 - 27._wp/60._wp, & ! Index 0 - 47._wp/60._wp, & ! Index 1 - -13._wp/60._wp, & ! Index 2 - 2._wp/60._wp & ! Index 3 - ] - - real(wp) :: coeff_R(1:5) = [ & - 2._wp/60._wp, & ! Index -2 - -13._wp/60._wp, & ! Index -1 - 47._wp/60._wp, & ! Index 0 - 27._wp/60._wp, & ! Index 1 - -3._wp/60._wp & ! Index 2 - ] -#:else - real(wp), parameter :: coeff_L(1:5) = [ & - -3._wp/60._wp, & ! Index -1 - 27._wp/60._wp, & ! Index 0 - 47._wp/60._wp, & ! Index 1 - -13._wp/60._wp, & ! Index 2 - 2._wp/60._wp & ! Index 3 - ] - - real(wp), parameter :: coeff_R(1:5) = [ & - 2._wp/60._wp, & ! Index -2 - -13._wp/60._wp, & ! Index -1 - 47._wp/60._wp, & ! Index 0 - 27._wp/60._wp, & ! Index 1 - -3._wp/60._wp & ! Index 2 - ] -#:endif + #:if MFC_OpenMP + real(wp) :: coeff_L(1:5) = [ & + -3._wp/60._wp, & ! Index -1 + 27._wp/60._wp, & ! Index 0 + 47._wp/60._wp, & ! Index 1 + -13._wp/60._wp, & ! Index 2 + 2._wp/60._wp & ! Index 3 + ] + + real(wp) :: coeff_R(1:5) = [ & + 2._wp/60._wp, & ! Index -2 + -13._wp/60._wp, & ! Index -1 + 47._wp/60._wp, & ! Index 0 + 27._wp/60._wp, & ! Index 1 + -3._wp/60._wp & ! Index 2 + ] + #:else + real(wp), parameter :: coeff_L(1:5) = [ & + -3._wp/60._wp, & ! Index -1 + 27._wp/60._wp, & ! Index 0 + 47._wp/60._wp, & ! Index 1 + -13._wp/60._wp, & ! Index 2 + 2._wp/60._wp & ! Index 3 + ] + + real(wp), parameter :: coeff_R(1:5) = [ & + 2._wp/60._wp, & ! Index -2 + -13._wp/60._wp, & ! Index -1 + 47._wp/60._wp, & ! Index 0 + 27._wp/60._wp, & ! Index 1 + -3._wp/60._wp & ! Index 2 + ] + #:endif #:elif igr_order == 3 integer, parameter :: vidxb = -1 integer, parameter :: vidxe = 2 integer, parameter :: offxL = 1 integer, parameter :: offxR = 2 -#:if MFC_OpenMP - real(wp) :: coeff_L(1:3) = [ & - 2._wp/6._wp, & ! Index 0 - 5._wp/6._wp, & ! Index 1 - -1._wp/6._wp & ! Index 2 - ] - real(wp) :: coeff_R(1:3) = [ & - -1._wp/6._wp, & ! Index -1 - 5._wp/6._wp, & ! Index 0 - 2._wp/6._wp & ! Index 1 - ] -#:else - real(wp), parameter :: coeff_L(1:3) = [ & - 2._wp/6._wp, & ! Index 0 - 5._wp/6._wp, & ! Index 1 - -1._wp/6._wp & ! Index 2 - ] - real(wp), parameter :: coeff_R(1:3) = [ & - -1._wp/6._wp, & ! Index -1 - 5._wp/6._wp, & ! Index 0 - 2._wp/6._wp & ! Index 1 - ] -#:endif + #:if MFC_OpenMP + real(wp) :: coeff_L(1:3) = [ & + 2._wp/6._wp, & ! Index 0 + 5._wp/6._wp, & ! Index 1 + -1._wp/6._wp & ! Index 2 + ] + real(wp) :: coeff_R(1:3) = [ & + -1._wp/6._wp, & ! Index -1 + 5._wp/6._wp, & ! Index 0 + 2._wp/6._wp & ! Index 1 + ] + #:else + real(wp), parameter :: coeff_L(1:3) = [ & + 2._wp/6._wp, & ! Index 0 + 5._wp/6._wp, & ! Index 1 + -1._wp/6._wp & ! Index 2 + ] + real(wp), parameter :: coeff_R(1:3) = [ & + -1._wp/6._wp, & ! Index -1 + 5._wp/6._wp, & ! Index 0 + 2._wp/6._wp & ! Index 1 + ] + #:endif #:endif -#:if MFC_OpenMP - $:GPU_DECLARE(create='[coeff_L, coeff_R]') -#:endif + #:if MFC_OpenMP + $:GPU_DECLARE(create='[coeff_L, coeff_R]') + #:endif #:endif integer(kind=8) :: i, j, k, l, q, r diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index 4da30a3790..ecdf8459d6 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -106,10 +106,10 @@ module m_rhs !> @{ type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n -#:if MFC_OpenACC - $:GPU_DECLARE(create='[dqL_prim_dx_n,dqL_prim_dy_n,dqL_prim_dz_n]') - $:GPU_DECLARE(create='[dqR_prim_dx_n,dqR_prim_dy_n,dqR_prim_dz_n]') -#:endif + #:if MFC_OpenACC + $:GPU_DECLARE(create='[dqL_prim_dx_n,dqL_prim_dy_n,dqL_prim_dz_n]') + $:GPU_DECLARE(create='[dqR_prim_dx_n,dqR_prim_dy_n,dqR_prim_dz_n]') + #:endif !> @} type(scalar_field), allocatable, dimension(:) :: tau_Re_vf @@ -127,9 +127,9 @@ module m_rhs !> @{ type(vector_field), allocatable, dimension(:) :: gm_alphaL_n type(vector_field), allocatable, dimension(:) :: gm_alphaR_n -#:if MFC_OpenACC - $:GPU_DECLARE(create='[gm_alphaL_n,gm_alphaR_n]') -#:endif + #:if MFC_OpenACC + $:GPU_DECLARE(create='[gm_alphaL_n,gm_alphaR_n]') + #:endif !> @} !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical @@ -140,16 +140,16 @@ module m_rhs type(vector_field), allocatable, dimension(:) :: flux_src_n type(vector_field), allocatable, dimension(:) :: flux_gsrc_n -#:if MFC_OpenACC - $:GPU_DECLARE(create='[flux_n,flux_src_n,flux_gsrc_n]') -#:endif + #:if MFC_OpenACC + $:GPU_DECLARE(create='[flux_n,flux_src_n,flux_gsrc_n]') + #:endif !> @} type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim -#:if MFC_OpenACC - $:GPU_DECLARE(create='[qL_prim,qR_prim]') -#:endif + #:if MFC_OpenACC + $:GPU_DECLARE(create='[qL_prim,qR_prim]') + #:endif type(int_bounds_info) :: iv !< Vector field indical bounds $:GPU_DECLARE(create='[iv]') diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 8bb39da1c6..de86182428 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -1414,45 +1414,45 @@ contains impure subroutine s_initialize_mpi_domain integer :: ierr -#:if MFC_GPU - real(wp) :: starttime, endtime - integer :: num_devices, local_size, num_nodes, ppn, my_device_num - integer :: dev, devNum, local_rank + #:if MFC_GPU + real(wp) :: starttime, endtime + integer :: num_devices, local_size, num_nodes, ppn, my_device_num + integer :: dev, devNum, local_rank #ifdef MFC_MPI - integer :: local_comm + integer :: local_comm #endif -#:if MFC_OpenACC - integer(acc_device_kind) :: devtype -#:endif -#:endif + #:if MFC_OpenACC + integer(acc_device_kind) :: devtype + #:endif + #:endif ! Initializing MPI execution environment call s_mpi_initialize() ! Bind GPUs if OpenACC is enabled -#:if MFC_GPU + #:if MFC_GPU #ifndef MFC_MPI - local_size = 1 - local_rank = 0 + local_size = 1 + local_rank = 0 #else - call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, & - MPI_INFO_NULL, local_comm, ierr) - call MPI_Comm_size(local_comm, local_size, ierr) - call MPI_Comm_rank(local_comm, local_rank, ierr) + call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, & + MPI_INFO_NULL, local_comm, ierr) + call MPI_Comm_size(local_comm, local_size, ierr) + call MPI_Comm_rank(local_comm, local_rank, ierr) #endif -#:if MFC_OpenACC - devtype = acc_get_device_type() - devNum = acc_get_num_devices(devtype) - dev = mod(local_rank, devNum) - - call acc_set_device_num(dev, devtype) -#:elif MFC_OpenMP - devNum = omp_get_num_devices() - dev = mod(local_rank, devNum) - call omp_set_default_device(dev) -#:endif -#:endif + #:if MFC_OpenACC + devtype = acc_get_device_type() + devNum = acc_get_num_devices(devtype) + dev = mod(local_rank, devNum) + + call acc_set_device_num(dev, devtype) + #:elif MFC_OpenMP + devNum = omp_get_num_devices() + dev = mod(local_rank, devNum) + call omp_set_default_device(dev) + #:endif + #:endif ! The rank 0 processor assigns default values to the user inputs prior to ! reading them in from the input file. Next, the user inputs are read and @@ -1472,11 +1472,11 @@ contains m, n, p, num_procs, & #:if MFC_OpenACC "with OpenACC offloading" -#:elif MFC_OpenMP + #:elif MFC_OpenMP "with OpenMP offloading" -#:else + #:else "on CPUs" -#:endif + #:endif end if ! Broadcasting the user inputs to all of the processors and performing the From 18d8f6c8ae24e55dbffa64088811acb26ea8ac68 Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Wed, 19 Nov 2025 16:00:32 -0500 Subject: [PATCH 5/7] Update nvtx to run through fypp --- src/common/{m_nvtx.f90 => m_nvtx.fpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/common/{m_nvtx.f90 => m_nvtx.fpp} (100%) diff --git a/src/common/m_nvtx.f90 b/src/common/m_nvtx.fpp similarity index 100% rename from src/common/m_nvtx.f90 rename to src/common/m_nvtx.fpp From 5df3da21f14d0267b85b2077f99ab68eecf2e1ae Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Wed, 19 Nov 2025 16:27:40 -0500 Subject: [PATCH 6/7] Few fixes --- CMakeLists.txt | 2 -- src/common/m_nvtx.fpp | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb86d881dd..906d837878 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -315,10 +315,8 @@ macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP) if (${useOpenACC} AND ${useOpenMP}) message(FATAL_ERROR "OpenACC and OpenMP at same time is unsupported.") elseif (${useOpenACC}) - message(STATUS "OpenACC set: ${useOpenACC}") set(MFC_GPU_MODE "OpenACC") elseif (${useOpenMP}) - message(STATUS "OpenMP set: ${useOpenMP}") set(MFC_GPU_MODE "OpenMP") else() set(MFC_GPU_MODE "") diff --git a/src/common/m_nvtx.fpp b/src/common/m_nvtx.fpp index e4320f26dd..4473b200bf 100644 --- a/src/common/m_nvtx.fpp +++ b/src/common/m_nvtx.fpp @@ -1,3 +1,5 @@ +#:include 'macros.fpp' + module m_nvtx use iso_c_binding From 0828c7c32dfe1492ba7eec1e604b7a046765ea0a Mon Sep 17 00:00:00 2001 From: Tanush Prathi Date: Mon, 1 Dec 2025 13:02:36 -0500 Subject: [PATCH 7/7] Remove duplicate private variables --- src/common/m_chemistry.fpp | 2 +- src/simulation/m_hyperelastic.fpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index 6605e65f32..46160ef126 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -129,7 +129,7 @@ contains real(wp), dimension(num_species) :: Ys real(wp), dimension(num_species) :: omega - $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega, omega_m]', copyin='[bounds]') + $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', copyin='[bounds]') do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp index 9e21ef8820..5702a59f3a 100644 --- a/src/simulation/m_hyperelastic.fpp +++ b/src/simulation/m_hyperelastic.fpp @@ -106,7 +106,7 @@ contains real(wp) :: G_local integer :: j, k, l, i, r - $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,alpha_K, alpha_rho_K, rho, gamma, pi_inf, qv, G_local, Re, tensora, tensorb, i]') + $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,alpha_K, alpha_rho_K, rho, gamma, pi_inf, qv, G_local, Re, tensora, tensorb]') do l = 0, p do k = 0, n do j = 0, m