Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Errors on v2.3.0rc5 (E3SM Unified 1.9.0rc10) #474

Closed
forsyth2 opened this issue Aug 8, 2023 · 7 comments
Closed

Errors on v2.3.0rc5 (E3SM Unified 1.9.0rc10) #474

forsyth2 opened this issue Aug 8, 2023 · 7 comments
Labels
semver: bug Bug fix (will increment patch version)

Comments

@forsyth2
Copy link
Collaborator

forsyth2 commented Aug 8, 2023

Chrysalis

Contents of tests/integration/generated/test_complete_run_chrysalis.cfg:

[default]
case = v2.LR.historical_0201
constraint = ""
dry_run = "False"
environment_commands = "source /lcrc/soft/climate/e3sm-unified/test_e3sm_unified_1.9.0rc10_chrysalis.sh"
input = "/lcrc/group/e3sm/ac.forsyth2//E3SMv2/v2.LR.historical_0201"
input_subdir = archive/atm/hist
mapping_file = "map_ne30pg2_to_cmip6_180x360_aave.20200201.nc"
# To run this test, edit `output` and `www` in this file, along with `actual_images_dir` in test_complete_run.py
output = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/v2.LR.historical_0201"
partition = "debug"
qos = "regular"
www = "/lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2/zppy_test_complete_run_www"

[climo]
active = True
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  frequency = "diurnal_8xdaily"
  input_files = "eam.h4"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

[ts]
active = True
walltime = "00:30:00"
years = "1850:1854:2",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  frequency = "daily"
  input_files = "eam.h1"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ atm_monthly_glb ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  mapping_file = "glb"
  years = "1850:1860:5",

  [[ land_monthly ]]
  extra_vars = "landfrac"
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = "FSH,LAISHA,LAISUN,RH2M"
  ts_fmt = "cmip"

  [[ rof_monthly ]]
  extra_vars = 'areatotal2'
  frequency = "monthly"
  input_files = "mosart.h0"
  input_subdir = "archive/rof/hist"
  mapping_file = ""
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"

[tc_analysis]
active = True
scratch = "/lcrc/globalscratch/ac.forsyth2/"
walltime = "00:30:00"
years = "1850:1854:2",

[e3sm_diags]
active = True
grid = '180x360_aave'
ref_final_yr = 2014
ref_start_yr = 1985
# TODO: this directory is missing OMI-MLS
sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere", "tc_analysis",
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  partition = "compute"
  qos = "regular"
  sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere",
  walltime = "2:00:00"

  [[ atm_monthly_180x360_aave_environment_commands ]]
  environment_commands = "source /home/ac.forsyth2/miniconda3/etc/profile.d/conda.sh; conda activate e3sm_diags_20230807"
  sets = "qbo",
  ts_subsection = "atm_monthly_180x360_aave"

  [[ atm_monthly_180x360_aave_tc_analysis ]]
  # Running as its own subtask because tc_analysis requires jobs to run sequentially, which slows down testing
  sets = "tc_analysis",
  years = "1850:1852:2",

  [[ atm_monthly_180x360_aave_mvm ]]
  # Test model-vs-model using the same files as the reference
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = "Difference"
  partition = "compute"
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/v2.LR.historical_0201/post/atm/180x360_aave/clim"
  run_type = "model_vs_model"
  short_ref_name = "v2.LR.historical_0201"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2
  ts_subsection = "atm_monthly_180x360_aave"
  walltime = "2:00:00"
  years = "1852-1853",

[mpas_analysis]
active = True
anomalyRefYear = 1850
climo_years ="1850-1854", "1855-1860",
enso_years = "1850-1854", "1855-1860",
mesh = "EC30to60E2r2"
parallelTaskCount = 6
partition = "compute"
qos = "regular"
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"

[global_time_series]
active = True
climo_years ="1850-1854", "1855-1860",
experiment_name = "v2.LR.historical_0201"
figstr = "v2_historical_0201"
moc_file=mocTimeSeries_1850-1860.nc
ts_num_years = 5
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"
years = "1850-1860",

[ilamb]
active = True
grid = '180x360_aave'
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
years = "1850:1854:2",

I ran:

$ conda activate zppy_dev_pre_rc6
$ pip install .
$ zppy -c tests/integration/generated/test_complete_run_chrysalis.cfg

This generates files that I have since moved:

  • /lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/v2.LR.historical_0201/post -> /lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/v2.LR.historical_0201/post_20230808
  • /lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2/zppy_test_complete_run_www/v2.LR.historical_0201 -> /lcrc/group/e3sm/public_html/diagnostic_output/ac.forsyth2/zppy_test_complete_run_www/v2.LR.historical_0201_20230808
$ cd /lcrc/group/e3sm/ac.forsyth2/zppy_test_complete_run_output/v2.LR.historical_0201/post_20230808/scripts
$ grep -v "OK" *status
# No failures
$ grep -n "Segmentation" *
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1851.o370766:327:[chr-0080:487898:0:488057] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8ec69ef4)
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1851.o370766:365:[chr-0080:487898:0:488074] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x3233028df6)
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1853.o370768:368:[chr-0330:563571:0:563730] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x250a5032e9)
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1853.o370768:418:[chr-0330:563571:0:563747] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x2522eada34)
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1852-1853.o370767:326:[chr-0329:565762:0:565938] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x2467d44998)
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1852-1853.o370767:374:[chr-0329:565762:0:565955] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:550:[chr-0248:593729:0:593903] Caught signal 11 (Segmentation fault: address not mapped to object at address 0xe0e79f)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:551:[chr-0248:593729:0:593906] Caught signal 11 (Segmentation fault: address not mapped to object at address (nil))
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:552:[chr-0248:593729:0:593905] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x354a3da608)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:553:[chr-0248:593729:0:593915] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x250b57e708)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:554:[chr-0248:593729:0:593908] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x24d485b018)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:555:[chr-0248:593729:0:593914] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x248e09b368)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:556:[chr-0248:593729:0:593902] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x1578280e4dc8)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:557:[chr-0248:593729:0:593907] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x25f9b49208)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:558:[chr-0248:593729:0:593913] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x24e1ffe9d8)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:559:[chr-0248:593729:0:593904] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x1b08)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:579:[chr-0248:593729:0:593911] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x154fa03c0)
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.o370773:675:[chr-0248:593729:0:593912] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8c2298)
e3sm_diags_atm_monthly_180x360_aave_tc_analysis_model_vs_obs_1850-1851.o370772:27:[chr-0494:1113804:0:1113958] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x1573aa916078)
grep: global_time_series_1850-1860_dir: Is a directory
grep: global_time_series_1850-1860_results: Is a directory

Selection of output from e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1851.o370766:

Traceback (most recent call last):
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/e3sm_diags_driver.py", line 293, in run_diag
    single_result = module.run_diag(parameter)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/driver/zonal_mean_xy_driver.py", line 121, in run_diag
    mv2 = ref_data.get_climo_variable(var, season)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/driver/utils/dataset.py", line 155, in get_climo_variable
    filename = self.get_ref_filename_climo(season)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/driver/utils/dataset.py", line 293, in get_ref_filename_climo
    return self._get_climo_filename(path, data_name, season)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/driver/utils/dataset.py", line 308, in _get_climo_filename
    raise IOError(
OSError: No file found for HadISST and ANN in /lcrc/group/e3sm/diagnostics/observations/Atm/climatology/

@chengzhuzhang says the above error is expected.

[chr-0080:487898:0:488057] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8ec69ef4)

2023-08-08 10:59:31,787 [ERROR]: run.py(run_diags:36) >> Error traceback:
Traceback (most recent call last):
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/run.py", line 34, in run_diags
    main(final_params)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/e3sm_diags_driver.py", line 419, in main
    parameters_results = _run_with_dask(parameters)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/e3sm_diags/e3sm_diags_driver.py", line 365, in _run_with_dask
    results = bag.map(run_diag).compute(num_workers=num_workers)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/dask/base.py", line 310, in compute
    (result,) = compute(self, traverse=False, **kwargs)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/dask/base.py", line 595, in compute
    results = schedule(dsk, keys, **kwargs)
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/dask/multiprocessing.py", line 233, in get
    result = get_async(
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/site-packages/dask/local.py", line 500, in get_async
    for key, res_info, failed in queue_get(queue).result():
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/concurrent/futures/_base.py", line 451, in result
    return self.__get_result()
  File "/lcrc/soft/climate/e3sm-unified/base/envs/e3sm_unified_1.9.0rc10_chrysalis/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
    raise self._exception
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
2023-08-08 10:59:31,853 [INFO]: logger.py(move_log_to_prov_dir:106) >> Log file saved in model_vs_obs_1850-1851/prov/e3sm_diags_run.log
Compy

Contents of tests/integration/generated/test_complete_run_compy.cfg:

[default]
case = v2.LR.historical_0201
constraint = ""
dry_run = "False"
environment_commands = "source /share/apps/E3SM/conda_envs/test_e3sm_unified_1.9.0rc10_compy.sh"
input = "/compyfs/fors729//E3SMv2/v2.LR.historical_0201"
input_subdir = archive/atm/hist
mapping_file = "map_ne30pg2_to_cmip6_180x360_aave.20200201.nc"
# To run this test, edit `output` and `www` in this file, along with `actual_images_dir` in test_complete_run.py
output = "/compyfs/fors729/zppy_test_complete_run_output/v2.LR.historical_0201"
partition = "short"
qos = "regular"
www = "/compyfs/www/fors729/zppy_test_complete_run_www"

[climo]
active = True
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  frequency = "diurnal_8xdaily"
  input_files = "eam.h4"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

[ts]
active = True
walltime = "00:30:00"
years = "1850:1854:2",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  frequency = "daily"
  input_files = "eam.h1"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ atm_monthly_glb ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  mapping_file = "glb"
  years = "1850:1860:5",

  [[ land_monthly ]]
  extra_vars = "landfrac"
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = "FSH,LAISHA,LAISUN,RH2M"
  ts_fmt = "cmip"

  [[ rof_monthly ]]
  extra_vars = 'areatotal2'
  frequency = "monthly"
  input_files = "mosart.h0"
  input_subdir = "archive/rof/hist"
  mapping_file = ""
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"

[tc_analysis]
active = True
scratch = "/qfs/people/fors729/"
walltime = "00:30:00"
years = "1850:1854:2",

[e3sm_diags]
active = True
grid = '180x360_aave'
ref_final_yr = 2014
ref_start_yr = 1985
# TODO: this directory is missing OMI-MLS
sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere", "tc_analysis",
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  partition = "slurm"
  qos = "regular"
  sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere",
  walltime = "03:00:00"

  [[ atm_monthly_180x360_aave_environment_commands ]]
  environment_commands = "source /qfs/people/fors729/miniconda3/etc/profile.d/conda.sh; conda activate e3sm_diags_20230807"
  sets = "qbo",
  ts_subsection = "atm_monthly_180x360_aave"

  [[ atm_monthly_180x360_aave_tc_analysis ]]
  # Running as its own subtask because tc_analysis requires jobs to run sequentially, which slows down testing
  sets = "tc_analysis",
  years = "1850:1852:2",

  [[ atm_monthly_180x360_aave_mvm ]]
  # Test model-vs-model using the same files as the reference
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = "Difference"
  partition = "slurm"
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/compyfs/fors729/zppy_test_complete_run_output/v2.LR.historical_0201/post/atm/180x360_aave/clim"
  run_type = "model_vs_model"
  short_ref_name = "v2.LR.historical_0201"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2
  ts_subsection = "atm_monthly_180x360_aave"
  walltime = "03:00:00"
  years = "1852-1853",

[mpas_analysis]
active = True
anomalyRefYear = 1850
climo_years ="1850-1854", "1855-1860",
enso_years = "1850-1854", "1855-1860",
mesh = "EC30to60E2r2"
parallelTaskCount = 6
partition = "slurm"
qos = "regular"
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"

[global_time_series]
active = True
climo_years ="1850-1854", "1855-1860",
experiment_name = "v2.LR.historical_0201"
figstr = "v2_historical_0201"
moc_file=mocTimeSeries_1850-1860.nc
ts_num_years = 5
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"
years = "1850-1860",

[ilamb]
active = True
grid = '180x360_aave'
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
years = "1850:1854:2",

I ran:

$ conda activate zppy_dev_pre_rc6
$ pip install .
$ zppy -c tests/integration/generated/test_complete_run_compy.cfg

This generates files that I have since moved:

  • /compyfs/fors729/zppy_test_complete_run_output/v2.LR.historical_0201/post -> /compyfs/fors729/zppy_test_complete_run_output/v2.LR.historical_0201/post_20230808
  • /compyfs/www/fors729/zppy_test_complete_run_www/v2.LR.historical_0201 -> /compyfs/www/fors729/zppy_test_complete_run_www/v2.LR.historical_0201_20230808
$ cd /compyfs/fors729/zppy_test_complete_run_output/v2.LR.historical_0201/post_20230808/scripts
$ grep -v "OK" *status
climo_atm_monthly_180x360_aave_1850-1851.status:ERROR (3)
climo_atm_monthly_180x360_aave_1850-1853.status:ERROR (3)
climo_atm_monthly_180x360_aave_1852-1853.status:ERROR (3)
climo_atm_monthly_diurnal_8xdaily_180x360_aave_1850-1851.status:ERROR (3)
climo_atm_monthly_diurnal_8xdaily_180x360_aave_1850-1853.status:ERROR (3)
climo_atm_monthly_diurnal_8xdaily_180x360_aave_1852-1853.status:ERROR (3)
e3sm_diags_atm_monthly_180x360_aave_environment_commands_model_vs_obs_1850-1851.status:WAITING 551047
e3sm_diags_atm_monthly_180x360_aave_environment_commands_model_vs_obs_1850-1853.status:WAITING 551049
e3sm_diags_atm_monthly_180x360_aave_environment_commands_model_vs_obs_1852-1853.status:WAITING 551048
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1851.status:WAITING 551044
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1853.status:WAITING 551046
e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1852-1853.status:WAITING 551045
e3sm_diags_atm_monthly_180x360_aave_mvm_model_vs_model_1852-1853_vs_1850-1851.status:WAITING 551051
e3sm_diags_atm_monthly_180x360_aave_tc_analysis_model_vs_obs_1850-1851.status:WAITING 551050

Compy thus fails earlier than Chrysalis does -- in the climo tasks rather than in the e3sm_diags tasks.

Selection of output from climo_atm_monthly_180x360_aave_1850-1851.o551026 :

[mpiexec@n0001] [mpiexec@n0001] [mpiexec@n0001] [mpiexec@n0001] match_arg (../../../../../src/pm/i_hydra/libhydra/arg/hydra_arg.c:91): match_arg (../../../../../src/pm/i_hydra/libhydra/arg/hydra_arg.c:91): match_arg (../../../../../src/pm/i_hydra/libhydra/arg/hydra_arg.c:91): match_arg (../../../../../src/pm/i_hydra/libhydra/arg/hydra_arg.c:91): unrecognized argument H
ncclimo: ERROR monthly climo cmd_clm[1] failed. Debug this:
mpirun -H n0001 -n 1 ncra --clm_bnd=1850,1851,1,1,0 -O --no_tmp_fl --hdr_pad=10000 --gaa climo_script=ncclimo --gaa climo_command="'/share/apps/E3SM/conda_envs/base/envs/e3sm_unified_1.9.0rc10_compy/bin/ncclimo --case=v2.LR.historical_0201 --jobs=4 --thr=1 --parallel=mpi --yr_srt=1850 --yr_end=1851 --input=/compyfs/fors729//E3SMv2/v2.LR.historical_0201/archive/atm/hist --map=/compyfs/diagnostics/maps/map_ne30pg2_to_cmip6_180x360_aave.20200201.nc --output=trash --regrid=output --prc_typ=eam'" --gaa climo_hostname=n0001 --gaa climo_version=5.1.7 --gaa yrs_averaged=1850-1851 -p /compyfs/fors729//E3SMv2/v2.LR.historical_0201/archive/atm/hist  v2.LR.historical_0201.eam.h0.1850-01.nc v2.LR.historical_0201.eam.h0.1851-01.nc trash/v2.LR.historical_0201_01_185001_185101_climo.nc
Perlmutter

Contents of tests/integration/generated/test_complete_run_pm-cpu.cfg:

[default]
case = v2.LR.historical_0201
constraint = "cpu"
dry_run = "False"
environment_commands = "source /global/common/software/e3sm/anaconda_envs/test_e3sm_unified_1.9.0rc10_pm-cpu.sh"
input = "/global/cfs/cdirs/e3sm/forsyth//E3SMv2/v2.LR.historical_0201"
input_subdir = archive/atm/hist
mapping_file = "map_ne30pg2_to_cmip6_180x360_aave.20200201.nc"
# To run this test, edit `output` and `www` in this file, along with `actual_images_dir` in test_complete_run.py
output = "/global/cfs/cdirs/e3sm/forsyth/zppy_test_complete_run_output/v2.LR.historical_0201"
partition = ""
qos = "regular"
www = "/global/cfs/cdirs/e3sm/www/forsyth/zppy_test_complete_run_www"

[climo]
active = True
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"

  [[ atm_monthly_diurnal_8xdaily_180x360_aave ]]
  frequency = "diurnal_8xdaily"
  input_files = "eam.h4"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

[ts]
active = True
walltime = "00:30:00"
years = "1850:1854:2",

  [[ atm_monthly_180x360_aave ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  ts_fmt = "cmip"

  [[ atm_daily_180x360_aave ]]
  frequency = "daily"
  input_files = "eam.h1"
  input_subdir = "archive/atm/hist"
  vars = "PRECT"

  [[ atm_monthly_glb ]]
  frequency = "monthly"
  input_files = "eam.h0"
  input_subdir = "archive/atm/hist"
  mapping_file = "glb"
  years = "1850:1860:5",

  [[ land_monthly ]]
  extra_vars = "landfrac"
  frequency = "monthly"
  input_files = "elm.h0"
  input_subdir = "archive/lnd/hist"
  vars = "FSH,LAISHA,LAISUN,RH2M"
  ts_fmt = "cmip"

  [[ rof_monthly ]]
  extra_vars = 'areatotal2'
  frequency = "monthly"
  input_files = "mosart.h0"
  input_subdir = "archive/rof/hist"
  mapping_file = ""
  vars = "RIVER_DISCHARGE_OVER_LAND_LIQ"

[tc_analysis]
active = True
scratch = "/pscratch/sd/f/forsyth/"
walltime = "00:30:00"
years = "1850:1854:2",

[e3sm_diags]
active = True
grid = '180x360_aave'
ref_final_yr = 2014
ref_start_yr = 1985
# TODO: this directory is missing OMI-MLS
sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere", "tc_analysis",
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
walltime = "00:30:00"
years = "1850:1854:2", "1850:1854:4",

  [[ atm_monthly_180x360_aave ]]
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  partition = ""
  qos = "regular"
  sets = "lat_lon","zonal_mean_xy","zonal_mean_2d","polar","cosp_histogram","meridional_mean_2d","enso_diags","qbo","diurnal_cycle","annual_cycle_zonal_mean","streamflow", "zonal_mean_2d_stratosphere",
  walltime = "6:00:00"

  [[ atm_monthly_180x360_aave_environment_commands ]]
  environment_commands = "source /global/homes/f/forsyth/miniconda3/etc/profile.d/conda.sh; conda activate e3sm_diags_20230807"
  sets = "qbo",
  ts_subsection = "atm_monthly_180x360_aave"

  [[ atm_monthly_180x360_aave_tc_analysis ]]
  # Running as its own subtask because tc_analysis requires jobs to run sequentially, which slows down testing
  sets = "tc_analysis",
  years = "1850:1852:2",

  [[ atm_monthly_180x360_aave_mvm ]]
  # Test model-vs-model using the same files as the reference
  climo_diurnal_frequency = "diurnal_8xdaily"
  climo_diurnal_subsection = "atm_monthly_diurnal_8xdaily_180x360_aave"
  climo_subsection = "atm_monthly_180x360_aave"
  diff_title = "Difference"
  partition = ""
  qos = "regular"
  ref_final_yr = 1851
  ref_name = "v2.LR.historical_0201"
  ref_start_yr = 1850
  ref_years = "1850-1851",
  reference_data_path = "/global/cfs/cdirs/e3sm/forsyth/zppy_test_complete_run_output/v2.LR.historical_0201/post/atm/180x360_aave/clim"
  run_type = "model_vs_model"
  short_ref_name = "v2.LR.historical_0201"
  swap_test_ref = False
  tag = "model_vs_model"
  ts_num_years_ref = 2
  ts_subsection = "atm_monthly_180x360_aave"
  walltime = "6:00:00"
  years = "1852-1853",

[mpas_analysis]
active = True
anomalyRefYear = 1850
climo_years ="1850-1854", "1855-1860",
enso_years = "1850-1854", "1855-1860",
mesh = "EC30to60E2r2"
parallelTaskCount = 6
partition = ""
qos = "regular"
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"

[global_time_series]
active = True
climo_years ="1850-1854", "1855-1860",
experiment_name = "v2.LR.historical_0201"
figstr = "v2_historical_0201"
moc_file=mocTimeSeries_1850-1860.nc
ts_num_years = 5
ts_years = "1850-1854", "1850-1860",
walltime = "00:30:00"
years = "1850-1860",

[ilamb]
active = True
grid = '180x360_aave'
short_name = 'v2.LR.historical_0201'
ts_num_years = 2
years = "1850:1854:2",

I ran:

$ conda activate zppy_dev_pre_rc6
$ pip install .
$ zppy -c tests/integration/generated/test_complete_run_pm-cpu.cfg

This generates files that I have since moved:

  • /global/cfs/cdirs/e3sm/forsyth/zppy_test_complete_run_output/v2.LR.historical_0201/post -> /global/cfs/cdirs/e3sm/forsyth/zppy_test_complete_run_output/v2.LR.historical_0201/post_20230808
  • /global/cfs/cdirs/e3sm/www/forsyth/zppy_test_complete_run_www/v2.LR.historical_0201/ -> /global/cfs/cdirs/e3sm/www/forsyth/zppy_test_complete_run_www/v2.LR.historical_0201_20230808
$ cd /global/cfs/cdirs/e3sm/forsyth/zppy_test_complete_run_output/v2.LR.historical_0201/post_20230808/scripts
$ grep -v "OK" *status
# No failures
$ grep -n "Segmentation" *
grep: global_time_series_1850-1860_dir: Is a directory
grep: global_time_series_1850-1860_results: Is a directory
# No segmentation faults

Yet I ran python -u -m unittest tests/integration/test_complete_run.py (before moving the files to different directories) and ran into the following error:

======================================================================
ERROR: test_complete_run (tests.integration.test_complete_run.TestCompleteRun)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/global/u1/f/forsyth/zppy/tests/integration/test_complete_run.py", line 23, in test_complete_run
    check_mismatched_images(
  File "/global/u1/f/forsyth/zppy/tests/integration/utils.py", line 98, in check_mismatched_images
    compare_images(
  File "/global/u1/f/forsyth/zppy/tests/integration/utils.py", line 25, in compare_images
    actual_png = Image.open(path_to_actual_png).convert("RGB")
  File "/global/homes/f/forsyth/miniconda3/envs/zppy_dev_pre_rc6/lib/python3.9/site-packages/PIL/Image.py", line 3147, in open
    raise UnidentifiedImageError(
PIL.UnidentifiedImageError: cannot identify image file '/global/cfs/cdirs/e3sm/www/forsyth/zppy_test_complete_run_www/v2.LR.historical_0201/e3sm_diags/atm_monthly_180x360_aave/model_vs_obs_1850-1851/zonal_mean_xy/SST_CL_HadISST/HadISST_CL-SST-ANN-global.png'

(Chrysalis actually had an UnidentifiedImageError, which is what alerted me to the problem over there in the first place).

Selection of output from e3sm_diags_atm_monthly_180x360_aave_model_vs_obs_1850-1851.o1349980:

Traceback (most recent call last):
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/e3sm_diags/run.py", line 34, in run_diags
    main(final_params)
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/e3sm_diags/e3sm_diags_driver.py", line 419, in main
    parameters_results = _run_with_dask(parameters)
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/e3sm_diags/e3sm_diags_driver.py", line 365, in _run_with_dask
    results = bag.map(run_diag).compute(num_workers=num_workers)
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/dask/base.py", line 310, in compute
    (result,) = compute(self, traverse=False, **kwargs)
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/dask/base.py", line 595, in compute
    results = schedule(dsk, keys, **kwargs)
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/dask/multiprocessing.py", line 233, in get
    result = get_async(
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/site-packages/dask/local.py", line 500, in get_async
    for key, res_info, failed in queue_get(queue).result():
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/concurrent/futures/_base.py", line 451, in result
    return self.__get_result()
  File "/global/common/software/e3sm/anaconda_envs/base/envs/e3sm_unified_1.9.0rc10_pm-cpu/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
    raise self._exception
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
2023-08-07 16:48:52,335 [INFO]: logger.py(move_log_to_prov_dir:106) >> Log file saved in model_vs_obs_1850-1851/prov/e3sm_diags_run.log

This is the same error that ultimately ended up stopping the Chrysalis run of E3SM Diags -- just without the segmentation faults.

@forsyth2 forsyth2 added the semver: bug Bug fix (will increment patch version) label Aug 8, 2023
@xylar
Copy link
Contributor

xylar commented Aug 8, 2023

I'm afraid I don't have anything useful to suggest beyond what I mentioned on Slack. So I hope further debugging on your part or experience from other zppy users can help narrow down the problem.

@forsyth2
Copy link
Collaborator Author

forsyth2 commented Aug 8, 2023

I will see if I can discover anything with rc9.

@chengzhuzhang @golaz @tomvothecoder @mahf708 On the off-chance any of you have some thoughts on the above errors, please let me know! Thanks

@chengzhuzhang
Copy link
Collaborator

@forsyth2. One thing I can think of is to redo the zppy test with rc9, but with the cmip time-series generation and the ilamb tasks disabled in the configuration file, and then run the same test with rc10. That would show the difference in behavior between the two rcs.

@forsyth2
Copy link
Collaborator Author

forsyth2 commented Aug 9, 2023

@xylar I confirmed all errors are introduced with the rc9 -> rc10 change. See #475.

@mahf708
Copy link

mahf708 commented Aug 9, 2023

If we can get a simpler reproducer, we could potentially isolate it to a single package. I checked a few of the changes. For example, imagecodecs was updated to remove cfitsio, which apparently wasn't needed and was being vendored anyhow. As another example, librsvg skipped a version. Overall, nothing jumps out to me. However, there may be something off with the new mpich, though it is hard to tell. Dask's involvement in the chrysalis logs can make things hard to debug... :/

@forsyth2
Copy link
Collaborator Author

forsyth2 commented Aug 9, 2023

If we can get a simpler reproducer,

@mahf708 Yeah, it's also not clear to me how we could do that. At least on Chrysalis, @chengzhuzhang seems to have narrowed it down to an E3SM Diags problem. As for Compy, #475 (comment) seems to be a possible path forward.

@forsyth2 forsyth2 linked a pull request Aug 11, 2023 that will close this issue
@forsyth2
Copy link
Collaborator Author

Closing this issue, resolved by Unified rc12.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
semver: bug Bug fix (will increment patch version)
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants