From 6d5760bc31a14996c419f7832034016581a7ba24 Mon Sep 17 00:00:00 2001 From: "Daniel J. Vickers" Date: Wed, 10 Dec 2025 16:43:51 -0500 Subject: [PATCH] Fixed compilation with post_processing and removed the last of the old macros --- src/common/include/acc_macros.fpp | 32 ---- src/common/include/omp_macros.fpp | 47 ----- src/common/include/parallel_macros.fpp | 17 -- src/common/m_mpi_common.fpp | 238 ++++++++++++------------- src/post_process/m_start_up.fpp | 24 +++ 5 files changed, 143 insertions(+), 215 deletions(-) diff --git a/src/common/include/acc_macros.fpp b/src/common/include/acc_macros.fpp index 3c048e8510..771ee976db 100644 --- a/src/common/include/acc_macros.fpp +++ b/src/common/include/acc_macros.fpp @@ -129,38 +129,6 @@ $:end_acc_directive #:enddef -#:def ACC_PARALLEL_LOOP_OLD(code, collapse=None, private=None, parallelism='[gang, vector]', & - & default='present', firstprivate=None, reduction=None, reductionOp=None, & - & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & - & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) - #:set collapse_val = GEN_COLLAPSE_STR(collapse) - #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) - #:set default_val = GEN_DEFAULT_STR(default) - #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') - #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) - #:set copy_val = GEN_COPY_STR(copy) - #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') - #:set copyout_val = GEN_COPYOUT_STR(copyout) - #:set create_val = GEN_CREATE_STR(create) - #:set no_create_val = GEN_NOCREATE_STR(no_create) - #:set present_val = GEN_PRESENT_STR(present) - #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) - #:set attach_val = GEN_ATTACH_STR(attach) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & - & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & - & copy_val.strip('\n') + copyin_val.strip('\n') + & - & copyout_val.strip('\n') + create_val.strip('\n') + & - & no_create_val.strip('\n') + present_val.strip('\n') + & - & deviceptr_val.strip('\n') + attach_val.strip('\n') - #:set acc_directive = '!$acc parallel loop ' + & - & clause_val + extraAccArgs_val.strip('\n') - #:set acc_end_directive = '!$acc end parallel loop' - $:acc_directive - $:code - $:acc_end_directive -#:enddef - #:def ACC_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', & & default='present', firstprivate=None, reduction=None, reductionOp=None, & & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp index 2b7606d03f..2e1df1dd8a 100644 --- a/src/common/include/omp_macros.fpp +++ b/src/common/include/omp_macros.fpp @@ -149,53 +149,6 @@ $:omp_end_directive #:enddef -#:def OMP_PARALLEL_LOOP_OLD(code, collapse=None, private=None, parallelism='[gang, vector]', & - & default='present', firstprivate=None, reduction=None, reductionOp=None, & - & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & - & no_create=None, present=None, deviceptr=None, attach=None, extraOmpArgs=None) - - #:set collapse_val = GEN_COLLAPSE_STR(collapse) - #:set parallelism_val = OMP_PARALLELISM_STR(parallelism) - #:set default_val = OMP_DEFAULT_STR(default) - #:set private_val = 
GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') - #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) - #:set copy_val = OMP_COPY_STR(copy) - #:set copyin_val = OMP_COPYIN_STR(copyin).strip('\n') + OMP_COPYIN_STR(copyinReadOnly).strip('\n') - #:set copyout_val = OMP_COPYOUT_STR(copyout) - #:set create_val = OMP_CREATE_STR(create) - #:set no_create_val = OMP_NOCREATE_STR(no_create) - #:set present_val = OMP_PRESENT_STR(present) - #:set deviceptr_val = OMP_DEVICEPTR_STR(deviceptr) - #:set attach_val = OMP_MAP_STR('always,tofrom', attach) - #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) - #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & - & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & - & copy_val.strip('\n') + copyin_val.strip('\n') + & - & copyout_val.strip('\n') + create_val.strip('\n') + & - & no_create_val.strip('\n') + present_val.strip('\n') + & - & deviceptr_val.strip('\n') + attach_val.strip('\n') - #! Hardcoding the parallelism for now - - #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID - #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' - #:set omp_end_directive = '!$omp end target teams loop' - #:elif MFC_COMPILER == CCE_COMPILER_ID - #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) ' - #:set omp_end_directive = '!$omp end target teams distribute parallel do simd' - #:elif MFC_COMPILER == AMD_COMPILER_ID - #:set omp_start_directive = '!$omp target teams distribute parallel do ' - #:set omp_end_directive = '!$omp end target teams distribute parallel do' - #:else - #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' - #:set omp_end_directive = '!$omp end target teams loop' - #:endif - - #:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n') - $:omp_directive - $:code - $:omp_end_directive -#:enddef - #:def OMP_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', & & default='present', firstprivate=None, reduction=None, reductionOp=None, & & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index e5dc5605e9..bfe4b3beaf 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -19,23 +19,6 @@ #:enddef -#:def GPU_PARALLEL_LOOP_OLD(code, collapse=None, private=None, parallelism='[gang, vector]', & - & default='present', firstprivate=None, reduction=None, reductionOp=None, & - & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & - & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None, extraOmpArgs=None) - - #:set acc_code = ACC_PARALLEL_LOOP_OLD(code, collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraAccArgs) - #:set omp_code = OMP_PARALLEL_LOOP_OLD(code, collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraOmpArgs) - -#if defined(MFC_OpenACC) - $:acc_code -#elif defined(MFC_OpenMP) - $:omp_code -#else - $:code -#endif -#:enddef - #:def GPU_PARALLEL_LOOP(collapse=None, 
private=None, parallelism='[gang, vector]', & & default='present', firstprivate=None, reduction=None, reductionOp=None, & & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 905b21570a..3ff753caca 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -958,176 +958,176 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - #:call GPU_PARALLEL_LOOP_OLD(collapse=4,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = 1, nVar - r = (i - 1) + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - q_comm(i)%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = 1, nVar + r = (i - 1) + v_size* & + (j + buff_size*((k + 1) + (n + 1)*l)) + q_comm(i)%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) #if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif - end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() if (qbmm_comm) then - #:call GPU_PARALLEL_LOOP_OLD(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = nVar + 1, nVar + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = nVar + 1, nVar + 4 + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + (j + buff_size*((k + 1) + (n + 1)*l)) + pb_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() - #:call GPU_PARALLEL_LOOP_OLD(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = nVar + 1, nVar + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = nVar + 1, nVar + 4 + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + (j + buff_size*((k + 1) + (n + 1)*l)) + mv_in(j + unpack_offset, k, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() end if #:elif mpi_dir == 2 - #:call GPU_PARALLEL_LOOP_OLD(collapse=4,private='[r]') - do i = 1, nVar - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + q_comm(i)%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) #if defined(__INTEL_COMPILER) - if 
(ieee_is_nan(q_comm(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif - end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() if (qbmm_comm) then - #:call GPU_PARALLEL_LOOP_OLD(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + pb_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD - - #:call GPU_PARALLEL_LOOP_OLD(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) - end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + mv_in(j, k + unpack_offset, l, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() end if #:else ! 
Unpacking buffer from bc_z%beg - #:call GPU_PARALLEL_LOOP_OLD(collapse=4,private='[r]') - do i = 1, nVar - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + q_comm(i)%sf(j, k, l + unpack_offset) = real(buff_recv(r), kind=stp) #if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif - end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() if (qbmm_comm) then - #:call GPU_PARALLEL_LOOP_OLD(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) - end do + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + pb_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD - - #:call GPU_PARALLEL_LOOP_OLD(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) - end do + end do + $:END_GPU_PARALLEL_LOOP() + + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + mv_in(j, k, l + unpack_offset, i - nVar, q) = real(buff_recv(r), kind=stp) end do end do end do end do - #:endcall GPU_PARALLEL_LOOP_OLD + end do + $:END_GPU_PARALLEL_LOOP() end if #:endif end if diff --git a/src/post_process/m_start_up.fpp b/src/post_process/m_start_up.fpp index 41516305e2..904c4207cc 100644 --- a/src/post_process/m_start_up.fpp +++ b/src/post_process/m_start_up.fpp @@ -510,7 +510,9 @@ contains end do end do +#ifdef MFC_MPI call MPI_ALLREDUCE(MPI_IN_PLACE, En, Nf, mpi_p, MPI_SUM, MPI_COMM_WORLD, ierr) +#endif if (proc_rank == 0) then call s_create_directory('En_FFT_DATA') @@ -888,6 +890,8 @@ contains integer :: dest_rank, src_rank 
integer :: i, j, k, l +#ifdef MFC_MPI + allocate (sendbuf(Nx*Nyloc*Nzloc)) allocate (recvbuf(Nx*Nyloc*Nzloc)) @@ -917,6 +921,8 @@ contains deallocate (sendbuf) deallocate (recvbuf) +#endif + end subroutine s_mpi_transpose_x2y subroutine s_mpi_transpose_y2z @@ -924,6 +930,8 @@ contains integer :: dest_rank, src_rank integer :: j, k, l +#ifdef MFC_MPI + allocate (sendbuf(Ny*Nxloc*Nzloc)) allocate (recvbuf(Ny*Nxloc*Nzloc)) @@ -953,6 +961,8 @@ contains deallocate (sendbuf) deallocate (recvbuf) +#endif + end subroutine s_mpi_transpose_y2z impure subroutine s_initialize_modules @@ -984,6 +994,7 @@ contains s_read_data_files => s_read_parallel_data_files end if +#ifdef MFC_MPI if (fft_wrt) then num_procs_x = (m_glb + 1)/(m + 1) @@ -1054,12 +1065,15 @@ contains call MPI_CART_COORDS(MPI_COMM_CART13, proc_rank13, 2, cart2d13_coords, ierr) end if +#endif end subroutine s_initialize_modules subroutine s_mpi_FFT_fwd integer :: j, k, l +#ifdef MFC_MPI + do l = 1, Nzloc do k = 1, Nyloc do j = 1, Nx @@ -1118,9 +1132,15 @@ contains end do end do +#endif + end subroutine s_mpi_FFT_fwd impure subroutine s_initialize_mpi_domain + + num_dims = 1 + min(1, n) + min(1, p) + +#ifdef MFC_MPI ! Initialization of the MPI environment call s_mpi_initialize() @@ -1144,6 +1164,8 @@ contains call s_mpi_decompose_computational_domain() call s_check_inputs_fft() +#endif + end subroutine s_initialize_mpi_domain impure subroutine s_finalize_modules @@ -1169,11 +1191,13 @@ contains call fftw_cleanup() end if +#ifdef MFC_MPI if (fft_wrt) then if (MPI_COMM_CART12 /= MPI_COMM_NULL) call MPI_Comm_free(MPI_COMM_CART12, ierr) if (MPI_COMM_CART13 /= MPI_COMM_NULL) call MPI_Comm_free(MPI_COMM_CART13, ierr) if (MPI_COMM_CART /= MPI_COMM_NULL) call MPI_Comm_free(MPI_COMM_CART, ierr) end if +#endif ! Deallocation procedures for the modules call s_finalize_data_output_module()
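
For readers reviewing the patch, here is a minimal sketch (not part of the change itself) of the usage pattern the hunks above convert to: the removed #:call GPU_PARALLEL_LOOP_OLD(...) / #:endcall block form, which passed the loop body in as a macro argument, is replaced by a $:GPU_PARALLEL_LOOP(...) / $:END_GPU_PARALLEL_LOOP() pair wrapped around ordinary Fortran loops, and MPI-only code is fenced with #ifdef MFC_MPI so serial builds still compile. The subroutine name, array names, and the #:include line below are illustrative assumptions, not code from this patch.

    #! Assumed include; the GPU loop macros are defined in src/common/include/parallel_macros.fpp
    #:include 'parallel_macros.fpp'

    subroutine s_example_unpack(dst, src, nx, ny)   ! hypothetical routine, for illustration only

        integer, intent(in) :: nx, ny
        real, intent(in) :: src((nx + 1)*(ny + 1))
        real, intent(inout) :: dst(0:nx, 0:ny)
        integer :: j, k, r

        ! New style: the macro pair emits the OpenACC/OpenMP directives around plain loops
        $:GPU_PARALLEL_LOOP(collapse=2, private='[r]')
        do k = 0, ny
            do j = 0, nx
                r = j + (nx + 1)*k        ! flattened receive-buffer index
                dst(j, k) = src(r + 1)
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

    #ifdef MFC_MPI
        ! MPI-only work goes inside this guard, as in the m_start_up.fpp hunks above
    #endif

    end subroutine s_example_unpack

Because the loop body is no longer a macro argument, the code between the two macro lines keeps its natural line structure, which is why the m_mpi_common.fpp hunks above re-indent every unpacking loop rather than change its logic.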