bem_performance_profile.f90 Source File


This file depends on

sourcefile~~bem_performance_profile.f90~~EfferentGraph sourcefile~bem_performance_profile.f90 bem_performance_profile.f90 sourcefile~bem_kinds.f90 bem_kinds.f90 sourcefile~bem_performance_profile.f90->sourcefile~bem_kinds.f90 sourcefile~bem_mpi.f90 bem_mpi.F90 sourcefile~bem_performance_profile.f90->sourcefile~bem_mpi.f90 sourcefile~bem_string_utils.f90 bem_string_utils.f90 sourcefile~bem_performance_profile.f90->sourcefile~bem_string_utils.f90 sourcefile~bem_mpi.f90->sourcefile~bem_kinds.f90

Files dependent on this one

sourcefile~~bem_performance_profile.f90~~AfferentGraph sourcefile~bem_performance_profile.f90 bem_performance_profile.f90 sourcefile~bem_simulator_loop.f90 bem_simulator_loop.f90 sourcefile~bem_simulator_loop.f90->sourcefile~bem_performance_profile.f90 sourcefile~main.f90 main.f90 sourcefile~main.f90->sourcefile~bem_performance_profile.f90

Source Code

!> 実行フェーズごとの壁時計計測と MPI 集約出力を担う軽量プロファイラ。
module bem_performance_profile
!$ use omp_lib
  use, intrinsic :: iso_fortran_env, only: int64, output_unit
  use bem_kinds, only: dp, i32
  use bem_mpi, only: mpi_context, mpi_get_rank_size, mpi_is_root, &
                     mpi_allreduce_sum_real_dp_array, mpi_allreduce_sum_i32_array, &
                     mpi_allreduce_min_real_dp_array, mpi_allreduce_max_real_dp_array
  use bem_string_utils, only: lower_ascii
  implicit none
  private

  integer(i32), parameter, public :: perf_region_program_total = 1_i32
  integer(i32), parameter, public :: perf_region_load_or_init = 2_i32
  integer(i32), parameter, public :: perf_region_history_open = 3_i32
  integer(i32), parameter, public :: perf_region_simulation_total = 4_i32
  integer(i32), parameter, public :: perf_region_field_solver_init = 5_i32
  integer(i32), parameter, public :: perf_region_batch_total = 6_i32
  integer(i32), parameter, public :: perf_region_prepare_batch = 7_i32
  integer(i32), parameter, public :: perf_region_field_refresh = 8_i32
  integer(i32), parameter, public :: perf_region_particle_batch = 9_i32
  integer(i32), parameter, public :: perf_region_commit_charge = 10_i32
  integer(i32), parameter, public :: perf_region_count_outcomes = 11_i32
  integer(i32), parameter, public :: perf_region_mpi_reduce = 12_i32
  integer(i32), parameter, public :: perf_region_stats_update = 13_i32
  integer(i32), parameter, public :: perf_region_history_write = 14_i32
  integer(i32), parameter, public :: perf_region_write_results = 15_i32
  integer(i32), parameter, public :: perf_region_write_checkpoint = 16_i32
  integer(i32), parameter, public :: perf_region_count = 16_i32

  type :: perf_region_stats
    real(dp) :: total_s = 0.0d0
    integer(i32) :: call_count = 0_i32
  end type perf_region_stats

  type :: perf_state_type
    logical :: enabled = .false.
    logical :: write_files = .false.
    integer(i32) :: omp_max_threads = 1_i32
    character(len=1024) :: output_dir = ''
    type(perf_region_stats) :: regions(perf_region_count)
  end type perf_state_type

  type(perf_state_type), save :: perf_state = perf_state_type()

  character(len=32), parameter :: perf_region_names(perf_region_count) = [character(len=32) :: &
                                                                          'program_total', &
                                                                          'load_or_init', &
                                                                          'history_open', &
                                                                          'simulation_total', &
                                                                          'field_solver_init', &
                                                                          'batch_total', &
                                                                          'prepare_batch', &
                                                                          'field_refresh', &
                                                                          'particle_batch', &
                                                                          'commit_charge', &
                                                                          'count_outcomes', &
                                                                          'mpi_reduce', &
                                                                          'stats_update', &
                                                                          'history_write', &
                                                                          'write_results', &
                                                                          'write_checkpoint' &
                                                                          ]

  public :: perf_reset
  public :: perf_configure
  public :: perf_configure_from_env
  public :: perf_set_output_context
  public :: perf_is_enabled
  public :: perf_wall_time_seconds
  public :: perf_region_begin
  public :: perf_region_end
  public :: perf_add_elapsed
  public :: perf_write_outputs

contains

  !> プロファイラ状態を既定値へ戻す。
  subroutine perf_reset()
    perf_state = perf_state_type()
  end subroutine perf_reset

  !> 明示フラグで計測状態を設定する。
  subroutine perf_configure(enabled)
    logical, intent(in) :: enabled

    call perf_reset()
    perf_state%enabled = enabled
    perf_state%omp_max_threads = 1_i32
!$  perf_state%omp_max_threads = int(max(1, omp_get_max_threads()), i32)
  end subroutine perf_configure

  !> 環境変数 `BEACH_PROFILE` から計測状態を初期化する。
  subroutine perf_configure_from_env()
    logical :: enabled

    enabled = env_flag_enabled('BEACH_PROFILE')
    call perf_configure(enabled)
  end subroutine perf_configure_from_env

  !> 出力先ディレクトリとファイル書き出し可否を登録する。
  subroutine perf_set_output_context(output_dir, write_files)
    character(len=*), intent(in) :: output_dir
    logical, intent(in) :: write_files

    perf_state%output_dir = ''
    perf_state%output_dir = trim(output_dir)
    perf_state%write_files = write_files
  end subroutine perf_set_output_context

  !> 粗粒度プロファイルが有効かを返す。
  logical function perf_is_enabled()
    perf_is_enabled = perf_state%enabled
  end function perf_is_enabled

  !> OpenMP有効時は `omp_get_wtime`、それ以外は `system_clock` を使う壁時計。
  real(dp) function perf_wall_time_seconds()
    integer(int64) :: count, rate

!$  perf_wall_time_seconds = omp_get_wtime()
!$  return

    call system_clock(count=count, count_rate=rate)
    if (rate > 0_int64) then
      perf_wall_time_seconds = real(count, dp)/real(rate, dp)
    else
      call cpu_time(perf_wall_time_seconds)
    end if
  end function perf_wall_time_seconds

  !> フェーズ開始時刻を取得する。
  subroutine perf_region_begin(region_id, t0)
    integer(i32), intent(in) :: region_id
    real(dp), intent(out) :: t0

    if (.not. perf_state%enabled) then
      t0 = 0.0d0
      return
    end if
    if (.not. valid_region(region_id)) then
      t0 = 0.0d0
      return
    end if
    t0 = perf_wall_time_seconds()
  end subroutine perf_region_begin

  !> フェーズ終了時刻との差分を累積する。
  subroutine perf_region_end(region_id, t0)
    integer(i32), intent(in) :: region_id
    real(dp), intent(in) :: t0

    if (.not. perf_state%enabled) return
    if (.not. valid_region(region_id)) return
    call perf_add_elapsed(region_id, perf_wall_time_seconds() - t0)
  end subroutine perf_region_end

  !> 経過時間と呼び出し回数を累積する。
  subroutine perf_add_elapsed(region_id, elapsed_s, call_count)
    integer(i32), intent(in) :: region_id
    real(dp), intent(in) :: elapsed_s
    integer(i32), intent(in), optional :: call_count
    integer(i32) :: calls

    if (.not. perf_state%enabled) return
    if (.not. valid_region(region_id)) return

    calls = 1_i32
    if (present(call_count)) calls = max(0_i32, call_count)
    perf_state%regions(region_id)%total_s = perf_state%regions(region_id)%total_s + max(0.0d0, elapsed_s)
    perf_state%regions(region_id)%call_count = perf_state%regions(region_id)%call_count + calls
  end subroutine perf_add_elapsed

  !> 集計済みプロファイルを標準出力および CSV へ書き出す。
  subroutine perf_write_outputs(mpi)
    type(mpi_context), intent(in) :: mpi

    integer(i32) :: rank, world_size, region_idx
    integer(i32) :: call_sum(perf_region_count)
    real(dp) :: total_sum(perf_region_count), total_min(perf_region_count), total_max(perf_region_count)
    real(dp) :: total_mean, imbalance
    integer :: u, ios
    character(len=1024) :: path

    if (.not. perf_state%enabled) return

    call mpi_get_rank_size(rank, world_size, mpi)

    do region_idx = 1, perf_region_count
      total_sum(region_idx) = perf_state%regions(region_idx)%total_s
      total_min(region_idx) = perf_state%regions(region_idx)%total_s
      total_max(region_idx) = perf_state%regions(region_idx)%total_s
      call_sum(region_idx) = perf_state%regions(region_idx)%call_count
    end do

    call mpi_allreduce_sum_real_dp_array(mpi, total_sum)
    call mpi_allreduce_min_real_dp_array(mpi, total_min)
    call mpi_allreduce_max_real_dp_array(mpi, total_max)
    call mpi_allreduce_sum_i32_array(mpi, call_sum)

    if (.not. mpi_is_root(mpi)) return

    write (output_unit, '(a,es12.4,a,es12.4,a,i0,a,i0)') &
      'performance: program_total(rank_max)=', total_max(perf_region_program_total), &
      ' simulation_total(rank_max)=', total_max(perf_region_simulation_total), &
      ' mpi=', world_size, ' omp=', perf_state%omp_max_threads

    if (.not. perf_state%write_files .or. len_trim(perf_state%output_dir) == 0) then
      flush (output_unit)
      return
    end if

    path = trim(perf_state%output_dir)//'/performance_profile.csv'
    open (newunit=u, file=trim(path), status='replace', action='write', iostat=ios)
    if (ios /= 0) error stop 'Failed to open performance_profile.csv.'

    write (u, '(a)') '# BEACH performance profile'
    write (u, '(a,i0)') '# mpi_world_size=', world_size
    write (u, '(a,i0)') '# omp_max_threads=', perf_state%omp_max_threads
    write (u, '(a)') '# use rank_max_s of simulation_total for scaling comparisons'
    write (u, '(a)') 'region,calls_sum,calls_mean,rank_min_s,rank_mean_s,rank_max_s,imbalance_ratio'

    do region_idx = 1, perf_region_count
      total_mean = total_sum(region_idx)/real(max(1_i32, world_size), dp)
      if (total_mean > tiny(1.0d0)) then
        imbalance = total_max(region_idx)/total_mean
      else
        imbalance = 0.0d0
      end if
      write (u, '(a,a,i0,a,es24.16,a,es24.16,a,es24.16,a,es24.16,a,es24.16)') &
        trim(perf_region_names(region_idx)), ',', call_sum(region_idx), ',', &
        real(call_sum(region_idx), dp)/real(max(1_i32, world_size), dp), ',', &
        total_min(region_idx), ',', total_mean, ',', total_max(region_idx), ',', imbalance
    end do

    close (u)
    write (output_unit, '(a,a)') 'performance profile written to ', trim(path)
    flush (output_unit)
  end subroutine perf_write_outputs

  !> 計測対象 region 番号かどうかを返す。
  logical function valid_region(region_id)
    integer(i32), intent(in) :: region_id

    valid_region = (region_id >= 1_i32 .and. region_id <= perf_region_count)
  end function valid_region

  !> 真偽値環境変数を読み取る。
  logical function env_flag_enabled(name)
    character(len=*), intent(in) :: name
    character(len=32) :: value
    integer :: status, value_len

    env_flag_enabled = .false.
    value = ''
    call get_environment_variable(name, value, length=value_len, status=status)
    if (status /= 0 .or. value_len <= 0) return

    select case (trim(lower_ascii(value(1:value_len))))
    case ('1', 'true', 'yes', 'on')
      env_flag_enabled = .true.
    end select
  end function env_flag_enabled

end module bem_performance_profile