From 32b86dba094b3ed8d6377e7bf5eeea5cf701c907 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 2 Dec 2023 01:12:43 +0000 Subject: [PATCH 1/9] Split out argo integration from #2350 --- .../{ => FV3GFS}/v0.2/fv3config.yml | 0 .../{ => FV3GFS}/v0.3/fv3config.yml | 0 .../{ => FV3GFS}/v0.4/fv3config.yml | 0 .../{ => FV3GFS}/v0.5/fv3config.yml | 0 .../{ => FV3GFS}/v0.6/fv3config.yml | 0 .../{ => FV3GFS}/v0.7/fv3config.yml | 0 .../base_yamls/SHiELD/v0.1/fv3config.yml | 289 ++++++++++++++++++ external/fv3kube/fv3kube/config.py | 13 +- workflows/argo/kustomization.yaml | 1 + workflows/argo/prognostic-run.yaml | 120 +++++++- workflows/argo/run-shield.yaml | 195 ++++++++++++ 11 files changed, 598 insertions(+), 20 deletions(-) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.2/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.3/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.4/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.5/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.6/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.7/fv3config.yml (100%) create mode 100644 external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml create mode 100644 workflows/argo/run-shield.yaml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.2/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.2/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.2/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.2/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.3/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.3/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.3/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.3/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.4/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.4/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.4/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.4/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.5/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.5/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.5/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.5/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.6/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.6/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.6/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.6/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.7/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.7/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.7/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.7/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml new file mode 100644 index 0000000000..ce48b327fc --- /dev/null +++ b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml @@ -0,0 +1,289 @@ +data_table: default +diag_table: no_output +experiment_name: default +field_table: gs://vcm-fv3config/config/field_table/TKE-EDMF/v1.1/field_table 
+forcing: gs://vcm-ml-experiments/spencerc/2023-09-13-SHiELD-forcing-data +initial_conditions: '' # no default provided +namelist: + fms_affinity_nml: + affinity: false + amip_interp_nml: + data_set: reynolds_oi + date_out_of_range: climo + interp_oi_sst: true + no_anom_sst: false + use_ncep_ice: false + use_ncep_sst: true + atmos_model_nml: + blocksize: 24 + chksum_debug: false + dycore_only: false + fdiag: 0.25 + first_time_step: false + coupler_nml: + atmos_nthreads: 1 + calendar: julian + current_date: + - 2016 + - 8 + - 1 + - 0 + - 0 + - 0 + days: 0 + dt_atmos: 900 + dt_ocean: 900 + hours: 0 + minutes: 30 + months: 0 + seconds: 0 + use_hyper_thread: false + diag_manager_nml: + prepend_date: false + external_ic_nml: + checker_tr: false + filtered_terrain: true + gfs_dwinds: true + levp: 64 + nt_checker: 0 + fms_io_nml: + checksum_required: false + max_files_r: 100 + max_files_w: 100 + fms_nml: + clock_grain: ROUTINE + domains_stack_size: 3000000 + print_memory_usage: false + fv_core_nml: + a_imp: 1.0 + adjust_dry_mass: false + beta: 0.0 + consv_am: false + consv_te: 0.0 + d2_bg: 0.0 + d2_bg_k1: 0.2 + d2_bg_k2: 0.1 + d4_bg: 0.15 + d_con: 1.0 + d_ext: 0.0 + dddmp: 0.5 + delt_max: 0.002 + dnats: 1 + do_vort_damp: true + dwind_2d: false + external_ic: false + fill: true + fill_dp: true + fv_debug: false + fv_sg_adj: 300 + gfs_phil: false + grid_type: 0 + hord_dp: -5 + hord_mt: 5 + hord_tm: 5 + hord_tr: -5 + hord_vt: 5 + hydrostatic: false + io_layout: + - 1 + - 1 + k_split: 1 + ke_bg: 0.0 + kord_mt: 9 + kord_tm: -9 + kord_tr: 9 + kord_wz: -9 + layout: + - 1 + - 1 + make_nh: false + mountain: true + n_split: 6 + n_sponge: 30 + na_init: 0 + ncep_ic: false + nggps_ic: false + no_dycore: false + nord: 3 + npx: 49 + npy: 49 + npz: 79 + npz_type: 'gcrm' + ntiles: 6 + nudge_qv: true + nwat: 6 + p_fac: 0.05 + phys_hydrostatic: false + print_freq: 1 + range_warn: true + reset_eta: false + rf_cutoff: 3000.0 + rf_fast: true + sg_cutoff: 20000.0 + tau: 5.0 + tau_h2o: 0.0 + use_hydro_pressure: false + vtdm4: 0.06 + warm_start: true + z_tracer: true + gfdl_mp_nml: + c_paut: 0.5 + c_pgacs: 0.01 + c_psaci: 0.01 + ccn_l: 300.0 + ccn_o: 100.0 + const_vg: false + const_vi: false + const_vr: false + const_vs: false + do_qa: true + do_cond_timescale: true + do_sedi_heat: true + do_sedi_w: true + dw_land: 0.15 + dw_ocean: 0.1 + fix_negative: true + icloud_f: 0 + irain_f: 0 + prog_ccn: false + qi0_crt: 8.0e-05 + qi_lim: 1.0 + ql_gen: 0.001 + ql_mlt: 0.002 + qs_mlt: 1.0e-6 + qs0_crt: 0.003 + rad_graupel: true + rad_rain: true + rad_snow: true + rh_inc: 0.2 + rh_inr: 0.3 + rh_ins: 0.3 + rthresh: 8.0e-6 + tau_i2s: 1000.0 + tau_l2v: 300.0 + tau_v2l: 90.0 + vg_fac: 1.0 + vg_max: 16.0 + vi_fac: 0.85 + vi_max: 1.0 + vr_fac: 1.0 + vr_max: 16.0 + vs_fac: 1.0 + vs_max: 2.0 + z_slope_ice: true + z_slope_liq: true + gfs_physics_nml: + cal_pre: false + cap_k0_land: false + cdmbgwd: + - 3.5 + - 0.25 + cloud_gfdl: true + cnvcld: false + cnvgwd: true + debug: false + do_deep: true + do_ocean: false # Turn off the mixed layer ocean for now. 
+ dspheat: true + fhcyc: 24.0 + fhlwr: 1800.0 + fhswr: 1800.0 + fhzero: 0.25 + gwd_p_crit: 2000.0 + hybedmf: false + iaer: 111 + ialb: 1 + ico2: 2 + iems: 1 + imfdeepcnv: 2 + imfshalcnv: 2 + isatmedmf: 1 + isol: 2 + isot: 1 + isubc_lw: 2 + isubc_sw: 2 + ivegsrc: 1 + ldiag3d: true + lwhtr: true + ncld: 5 + nst_anl: true + pdfcld: true + pre_rad: false + prslrd0: 0.0 + random_clds: false + redrag: true + satmedmf: true + shal_cnv: true + swhtr: true + trans_trac: true + use_ufo: true + xkzm_ml: 2.0 + xkzm_hl: 2.0 + xkzm_mi: 1.5 + xkzm_hi: 1.5 + xkzminv: 0.0 + ysupbl: false + zhao_mic: false + integ_phys_nml: + do_inline_mp: true + do_sat_adj: false + interpolator_nml: + interp_method: conserve_great_circle + namsfc: + fabsl: 99999 + faisl: 99999 + faiss: 99999 + fnabsc: grb/global_mxsnoalb.uariz.t1534.3072.1536.rg.grb + fnacna: '' + fnaisc: grb/CFSR.SEAICE.1982.2012.monthly.clim.grb + fnalbc: grb/global_snowfree_albedo.bosu.t1534.3072.1536.rg.grb + fnalbc2: grb/global_albedo4.1x1.grb + fnglac: grb/global_glacier.2x2.grb + fnmskh: grb/seaice_newland.grb + fnmxic: grb/global_maxice.2x2.grb + fnslpc: grb/global_slope.1x1.grb + fnsmcc: grb/global_soilmgldas.t1534.3072.1536.grb + fnsnoa: '' + fnsnoc: grb/global_snoclim.1.875.grb + fnsotc: grb/global_soiltype.statsgo.t1534.3072.1536.rg.grb + fntg3c: grb/global_tg3clim.2.6x1.5.grb + fntsfa: '' + fntsfc: grb/RTGSST.1982.2012.monthly.clim.grb + fnvegc: grb/global_vegfrac.0.144.decpercent.grb + fnvetc: grb/global_vegtype.igbp.t1534.3072.1536.rg.grb + fnvmnc: grb/global_shdmin.0.144x0.144.grb + fnvmxc: grb/global_shdmax.0.144x0.144.grb + fnzorc: igbp + fsicl: 99999 + fsics: 99999 + fslpl: 99999 + fsmcl: + - 99999 + - 99999 + - 99999 + fsnol: 99999 + fsnos: 99999 + fsotl: 99999 + ftsfl: 99999 + ftsfs: 90 + fvetl: 99999 + fvmnl: 99999 + fvmxl: 99999 + ldebug: false + ocean_nml: + do_mld_restore: true + end_lat: 30.0 + eps_day: 10.0 + gam: 0.12 + mld_obs_ratio: 1.0 + mld_option: obs + mld_restore_tscale: 15.0 + ocean_option: MLM + restore_method: 2 + sst_restore_tscale: 15.0 + start_lat: -45.0 + end_lat: 45.0 + stress_ratio: 0.75 + use_old_mlm: true + use_rain_flux: true +orographic_forcing: gs://vcm-fv3config/data/orographic_data/v1.0 diff --git a/external/fv3kube/fv3kube/config.py b/external/fv3kube/fv3kube/config.py index 70aac8ea0c..61e6f7c420 100644 --- a/external/fv3kube/fv3kube/config.py +++ b/external/fv3kube/fv3kube/config.py @@ -11,12 +11,13 @@ # Map for different base fv3config dictionaries PWD = Path(os.path.abspath(__file__)).parent BASE_FV3CONFIG_BY_VERSION = { - "v0.2": os.path.join(PWD, "base_yamls/v0.2/fv3config.yml"), - "v0.3": os.path.join(PWD, "base_yamls/v0.3/fv3config.yml"), - "v0.4": os.path.join(PWD, "base_yamls/v0.4/fv3config.yml"), - "v0.5": os.path.join(PWD, "base_yamls/v0.5/fv3config.yml"), - "v0.6": os.path.join(PWD, "base_yamls/v0.6/fv3config.yml"), - "v0.7": os.path.join(PWD, "base_yamls/v0.7/fv3config.yml"), + "v0.2": os.path.join(PWD, "base_yamls/FV3GFS/v0.2/fv3config.yml"), + "v0.3": os.path.join(PWD, "base_yamls/FV3GFS/v0.3/fv3config.yml"), + "v0.4": os.path.join(PWD, "base_yamls/FV3GFS/v0.4/fv3config.yml"), + "v0.5": os.path.join(PWD, "base_yamls/FV3GFS/v0.5/fv3config.yml"), + "v0.6": os.path.join(PWD, "base_yamls/FV3GFS/v0.6/fv3config.yml"), + "v0.7": os.path.join(PWD, "base_yamls/FV3GFS/v0.7/fv3config.yml"), + "SHiELD/v0.1": os.path.join(PWD, "base_yamls/SHiELD/v0.1/fv3config.yml"), } TILE_COORDS_FILENAMES = range(1, 7) # tile numbering in model output filenames diff --git a/workflows/argo/kustomization.yaml 
b/workflows/argo/kustomization.yaml index 154644b44f..6438c06934 100644 --- a/workflows/argo/kustomization.yaml +++ b/workflows/argo/kustomization.yaml @@ -6,6 +6,7 @@ resources: - training-gpu.yaml - training-torch.yaml - run-fv3gfs.yaml +- run-shield.yaml - offline-diags.yaml - train-diags-prog.yaml - cubed-to-latlon.yaml diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml index 32849d8539..47de3484d9 100644 --- a/workflows/argo/prognostic-run.yaml +++ b/workflows/argo/prognostic-run.yaml @@ -38,20 +38,58 @@ spec: - {name: bucket, value: "{{inputs.parameters.bucket}}"} - {name: project, value: "{{inputs.parameters.project}}"} - {name: tag, value: "{{inputs.parameters.tag}}"} - - - name: prepare-config - template: prepare-config + - - name: prepare-input-config + template: prepare-input-config arguments: parameters: - {name: config, value: "{{inputs.parameters.config}}"} - - - name: run-model + - - template: get-wrapper + name: get-wrapper + arguments: + artifacts: + - name: config + from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + parameters: + - {name: config, value: "{{inputs.parameters.config}}"} + - - name: prepare-config-fv3gfs + template: prepare-config-fv3gfs + when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" + arguments: + artifacts: + - name: config + from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + - - name: prepare-config-shield + template: prepare-config-shield + when: "'{{steps.get-wrapper.outputs.result}}' == 'shield.wrapper'" + arguments: + artifacts: + - name: config + from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + - - name: run-model-fv3gfs continueOn: failed: true + when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: name: run-fv3gfs template: run-fv3gfs arguments: artifacts: - - {name: fv3config, from: "{{steps.prepare-config.outputs.artifacts.fv3config}}"} + - {name: fv3config, from: "{{steps.prepare-config-fv3gfs.outputs.artifacts.fv3config}}"} + parameters: + - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} + - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - - name: run-model-shield + continueOn: + failed: true + when: "'{{steps.get-wrapper.outputs.result}}' == 'shield.wrapper'" + templateRef: + name: run-shield + template: run-shield + arguments: + artifacts: + - {name: fv3config, from: "{{steps.prepare-config-shield.outputs.artifacts.fv3config}}"} parameters: - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} @@ -81,15 +119,34 @@ spec: value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run_diagnostics" - - name: exit template: exit - when: "{{steps.run-model.status}} == Failed" + when: "{{steps.run-model-fv3gfs.status}} == Failed || {{steps.run-model-shield.status}} == Failed}" - name: exit container: image: ubuntu:20.04 command: ["exit", "1"] - - name: prepare-config + - name: prepare-input-config inputs: parameters: - name: config + outputs: + artifacts: + - {name: config, path: /tmp/config.yaml} + container: + image: bash + command: ["bash", "-c", "-x", "-e"] + volumeMounts: + - name: workdir + mountPath: /tmp + args: + - | + cat << EOF > /tmp/config.yaml + {{inputs.parameters.config}} + EOF + - name: prepare-config-fv3gfs + inputs: + artifacts: + - 
name: config + path: /tmp/config.yaml outputs: artifacts: - {name: fv3config, path: /tmp/fv3config.yaml} @@ -108,11 +165,46 @@ spec: - name: workdir mountPath: /tmp args: - - | - cat << EOF > config.yaml - {{inputs.parameters.config}} - EOF - - prepare-config \ - config.yaml \ - > /tmp/fv3config.yaml + - prepare-config /tmp/config.yaml > /tmp/fv3config.yaml + - name: prepare-config-shield + inputs: + artifacts: + - name: config + path: /tmp/config.yaml + outputs: + artifacts: + - {name: fv3config, path: /tmp/fv3config.yaml} + container: + image: us.gcr.io/vcm-ml/prognostic_run_shield + resources: + requests: + memory: "500Mi" + cpu: "700m" + limits: + memory: "500Mi" + cpu: "700m" + command: ["bash", "-c", "-x", "-e"] + workingDir: /fv3net/workflows/prognostic_c48_run + volumeMounts: + - name: workdir + mountPath: /tmp + args: + # Without setting the DGLBACKEND some additional text gets added to + # the prepared config (it's a warning that the default backend was set + # to pytorch). I do not fully understand why this happens in this + # image but not the FV3GFS prognostic run image. Should I just set + # this in the Dockerfile itself? + - DGLBACKEND=pytorch prepare-config /tmp/config.yaml > /tmp/fv3config.yaml + - name: get-wrapper + inputs: + artifacts: + - name: config + path: /mnt/data/config.yaml + script: + image: us.gcr.io/vcm-ml/fv3net + command: [python] + source: | + import yaml + with open("/mnt/data/config.yaml", "r") as file: + config = yaml.safe_load(file) + print(config.get("wrapper", "fv3gfs.wrapper")) diff --git a/workflows/argo/run-shield.yaml b/workflows/argo/run-shield.yaml new file mode 100644 index 0000000000..3de3ffa588 --- /dev/null +++ b/workflows/argo/run-shield.yaml @@ -0,0 +1,195 @@ +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: run-shield +spec: + entrypoint: run-shield + templates: + - name: run-shield + inputs: + artifacts: + - name: fv3config + parameters: + - name: output-url + - {name: cpu, value: "6"} + - {name: memory, value: 8Gi} + - {name: segment-count, value: "1"} + steps: + - - template: create-run + name: create-run + arguments: + parameters: + - {name: runURL, value: "{{inputs.parameters.output-url}}"} + artifacts: + - name: fv3config + from: "{{inputs.artifacts.fv3config}}" + - name: choose-node-pool + template: choose-node-pool + arguments: + parameters: + - {name: cpu-request, value: "{{inputs.parameters.cpu}}"} + - {name: cpu-cutoff, value: "24"} + - {name: memory-request, value: "{{inputs.parameters.memory}}"} + - {name: memory-cutoff, value: "30"} + # loop over segments implemented through recursion so that a failed segment will + # terminate the workflow. Argo loops by default run in parallel and do not fail fast. 
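The fail-fast recursion described in the comment above can be read as the following Python sketch; the function and variable names are illustrative stand-ins for the Argo template and step names, not real fv3net code:

    def append_segment() -> None:
        """Stand-in for the `runfv3 append` container step."""

    def run_all_segments(segment: int, segment_count: int) -> None:
        # "append-segment" step: if it fails, the exception propagates and no
        # further segments are attempted, matching the fail-fast behavior of
        # sequential Argo steps.
        append_segment()
        next_segment = segment + 1  # "increment-segment" step
        # "run-next-segment" step, guarded by the template's `when:` clause.
        if next_segment < segment_count:
            run_all_segments(next_segment, segment_count)

Calling run_all_segments(0, segment_count) therefore runs segments strictly in order, whereas Argo's built-in withItems/withParam loops fan out in parallel and do not stop at the first failure.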
+ - - name: run-first-segment + template: run-all-segments + arguments: + parameters: + - {name: output-url, value: "{{inputs.parameters.output-url}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} + - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} + - {name: segment, value: 0} + - name: run-all-segments + inputs: + parameters: + - name: output-url + - name: cpu + - name: memory + - name: node-pool + - name: segment-count + - name: segment + steps: + - - template: append-segment + name: append-segment + arguments: + parameters: + - {name: runURL, value: "{{inputs.parameters.output-url}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} + - - name: increment-segment + template: increment-count + arguments: + parameters: + - {name: count, value: "{{inputs.parameters.segment}}"} + - - name: run-next-segment + template: run-all-segments + when: "{{steps.increment-segment.outputs.result}} < {{inputs.parameters.segment-count}}" + arguments: + parameters: + - {name: output-url, value: "{{inputs.parameters.output-url}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} + - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} + - {name: segment, value: "{{steps.increment-segment.outputs.result}}"} + - name: create-run + inputs: + artifacts: + - name: fv3config + path: /mnt/data/fv3config.yaml + parameters: + - name: runURL + container: + image: us.gcr.io/vcm-ml/prognostic_run_shield + command: ["/bin/bash", "-c", "-x", "-e"] + resources: + limits: + memory: "500Mi" + cpu: "500m" + args: + - | + find /mnt/data + echo "Using fv3config:" + cat /mnt/data/fv3config.yaml + runfv3 create {{inputs.parameters.runURL}} /mnt/data/fv3config.yaml + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-credentials/key.json + - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE + value: /secret/gcp-credentials/key.json + - name: FSSPEC_GS_REQUESTER_PAYS + value: vcm-ml + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: gcp-key-secret + mountPath: /secret/gcp-credentials + readOnly: true + - name: choose-node-pool + inputs: + parameters: + - name: cpu-request + - name: cpu-cutoff + - name: memory-request + - name: memory-cutoff + + script: + image: python:alpine3.6 + command: [python] + source: | + cpu_request = "{{inputs.parameters.cpu-request}}" + if cpu_request.endswith('m'): + cpus = float(cpu_request[:-1])/1000.0 + else: + cpus = float(cpu_request) + memory_request = "{{inputs.parameters.memory-request}}".lower() + if memory_request.endswith('gi') or memory_request.endswith('gb'): + memory = float(memory_request[:-2]) + else: + raise ValueError("memory request must be in Gi or Gb") + if cpus <= {{inputs.parameters.cpu-cutoff}}: + if memory <= {{inputs.parameters.memory-cutoff}}: + node_pool = 'climate-sim-pool' + else: + node_pool = 'highmem-sim-pool' + else: + node_pool = 'ultra-sim-pool' + print(node_pool) + - name: append-segment + inputs: + parameters: + - name: cpu + - name: memory + - name: runURL + - name: node-pool + tolerations: + - key: "dedicated" + operator: "Equal" + value: "{{inputs.parameters.node-pool}}" + effect: 
"NoSchedule" + metadata: + labels: + app: fv3run + podSpecPatch: | + containers: + - name: main + resources: + limits: + cpu: "{{inputs.parameters.cpu}}" + memory: "{{inputs.parameters.memory}}" + requests: + cpu: "{{inputs.parameters.cpu}}" + memory: "{{inputs.parameters.memory}}" + container: + image: us.gcr.io/vcm-ml/prognostic_run_shield + command: [runfv3] + args: ["append", "{{inputs.parameters.runURL}}"] + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-credentials/key.json + - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE + value: /secret/gcp-credentials/key.json + - name: FSSPEC_GS_REQUESTER_PAYS + value: vcm-ml + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: gcp-key-secret + mountPath: /secret/gcp-credentials + readOnly: true + - name: dshm + mountPath: /dev/shm + - name: increment-count + inputs: + parameters: + - name: count + script: + image: python:alpine3.6 + command: [python] + source: | + print({{inputs.parameters.count}} + 1) From 163add7ed3546f6c2013028d7cc442db78d251a0 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 4 Dec 2023 14:49:57 +0000 Subject: [PATCH 2/9] Reference identical workflow steps where possible --- workflows/argo/run-shield.yaml | 47 +++++----------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/workflows/argo/run-shield.yaml b/workflows/argo/run-shield.yaml index 3de3ffa588..856db59d65 100644 --- a/workflows/argo/run-shield.yaml +++ b/workflows/argo/run-shield.yaml @@ -24,7 +24,9 @@ spec: - name: fv3config from: "{{inputs.artifacts.fv3config}}" - name: choose-node-pool - template: choose-node-pool + templateRef: + name: run-fv3gfs + template: choose-node-pool arguments: parameters: - {name: cpu-request, value: "{{inputs.parameters.cpu}}"} @@ -62,7 +64,9 @@ spec: - {name: memory, value: "{{inputs.parameters.memory}}"} - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - name: increment-segment - template: increment-count + templateRef: + name: run-fv3gfs + template: increment-count arguments: parameters: - {name: count, value: "{{inputs.parameters.segment}}"} @@ -110,36 +114,6 @@ spec: - name: gcp-key-secret mountPath: /secret/gcp-credentials readOnly: true - - name: choose-node-pool - inputs: - parameters: - - name: cpu-request - - name: cpu-cutoff - - name: memory-request - - name: memory-cutoff - - script: - image: python:alpine3.6 - command: [python] - source: | - cpu_request = "{{inputs.parameters.cpu-request}}" - if cpu_request.endswith('m'): - cpus = float(cpu_request[:-1])/1000.0 - else: - cpus = float(cpu_request) - memory_request = "{{inputs.parameters.memory-request}}".lower() - if memory_request.endswith('gi') or memory_request.endswith('gb'): - memory = float(memory_request[:-2]) - else: - raise ValueError("memory request must be in Gi or Gb") - if cpus <= {{inputs.parameters.cpu-cutoff}}: - if memory <= {{inputs.parameters.memory-cutoff}}: - node_pool = 'climate-sim-pool' - else: - node_pool = 'highmem-sim-pool' - else: - node_pool = 'ultra-sim-pool' - print(node_pool) - name: append-segment inputs: parameters: @@ -184,12 +158,3 @@ spec: readOnly: true - name: dshm mountPath: /dev/shm - - name: increment-count - inputs: - parameters: - - name: count - script: - image: python:alpine3.6 - command: [python] - source: | - print({{inputs.parameters.count}} + 1) From e504e9e03fad79a7330da5f6e2d49aeecb7d090b Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 4 Dec 2023 21:00:24 +0000 Subject: [PATCH 3/9] Leverage 
YAML anchors and aliases to reduce repetition --- workflows/argo/kustomization.yaml | 3 +- workflows/argo/prognostic-run.yaml | 93 ++++------ workflows/argo/restart-prognostic-run.yaml | 38 ++++- workflows/argo/run-shield.yaml | 160 ------------------ .../{run-fv3gfs.yaml => run-simulation.yaml} | 69 ++++++-- 5 files changed, 127 insertions(+), 236 deletions(-) delete mode 100644 workflows/argo/run-shield.yaml rename workflows/argo/{run-fv3gfs.yaml => run-simulation.yaml} (76%) diff --git a/workflows/argo/kustomization.yaml b/workflows/argo/kustomization.yaml index 6438c06934..40c51e0083 100644 --- a/workflows/argo/kustomization.yaml +++ b/workflows/argo/kustomization.yaml @@ -5,8 +5,7 @@ resources: - training.yaml - training-gpu.yaml - training-torch.yaml -- run-fv3gfs.yaml -- run-shield.yaml +- run-simulation.yaml - offline-diags.yaml - train-diags-prog.yaml - cubed-to-latlon.yaml diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml index 47de3484d9..8fe0dadc41 100644 --- a/workflows/argo/prognostic-run.yaml +++ b/workflows/argo/prognostic-run.yaml @@ -38,63 +38,57 @@ spec: - {name: bucket, value: "{{inputs.parameters.bucket}}"} - {name: project, value: "{{inputs.parameters.project}}"} - {name: tag, value: "{{inputs.parameters.tag}}"} - - - name: prepare-input-config - template: prepare-input-config + - - name: convert-input-config-to-artifact + template: convert-input-config-to-artifact arguments: parameters: - {name: config, value: "{{inputs.parameters.config}}"} - - - template: get-wrapper - name: get-wrapper + - - template: infer-wrapper + name: infer-wrapper arguments: artifacts: - name: config - from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + from: "{{steps.convert-input-config-to-artifact.outputs.artifacts.config}}" parameters: - {name: config, value: "{{inputs.parameters.config}}"} - - - name: prepare-config-fv3gfs + - - &prepare-config-step-fv3gfs + name: prepare-config-fv3gfs template: prepare-config-fv3gfs - when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" + when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" arguments: artifacts: - name: config - from: "{{steps.prepare-input-config.outputs.artifacts.config}}" - - - name: prepare-config-shield + from: "{{steps.convert-input-config-to-artifact.outputs.artifacts.config}}" + - - <<: *prepare-config-step-fv3gfs + name: prepare-config-shield template: prepare-config-shield - when: "'{{steps.get-wrapper.outputs.result}}' == 'shield.wrapper'" - arguments: - artifacts: - - name: config - from: "{{steps.prepare-input-config.outputs.artifacts.config}}" - - - name: run-model-fv3gfs + when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" + - - &run-model-step-fv3gfs + name: run-model-fv3gfs continueOn: failed: true - when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" + when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: name: run-fv3gfs template: run-fv3gfs arguments: artifacts: - {name: fv3config, from: "{{steps.prepare-config-fv3gfs.outputs.artifacts.fv3config}}"} - parameters: + parameters: &run-model-step-parameters-fv3gfs - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - {name: cpu, value: "{{inputs.parameters.cpu}}"} - {name: memory, value: "{{inputs.parameters.memory}}"} - - - name: run-model-shield - continueOn: - failed: true - when: "'{{steps.get-wrapper.outputs.result}}' == 
'shield.wrapper'" + - - <<: *run-model-step-fv3gfs + name: run-model-shield + when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" templateRef: name: run-shield template: run-shield arguments: artifacts: - {name: fv3config, from: "{{steps.prepare-config-shield.outputs.artifacts.fv3config}}"} - parameters: - - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} - - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} + parameters: *run-model-step-parameters-fv3gfs - - name: online-diags when: "{{inputs.parameters.online-diags}} == true" templateRef: @@ -119,12 +113,12 @@ spec: value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run_diagnostics" - - name: exit template: exit - when: "{{steps.run-model-fv3gfs.status}} == Failed || {{steps.run-model-shield.status}} == Failed}" + when: "{{steps.run-model-fv3gfs.status}} == Failed || {{steps.run-model-shield.status}} == Failed" - name: exit container: image: ubuntu:20.04 command: ["exit", "1"] - - name: prepare-input-config + - name: convert-input-config-to-artifact inputs: parameters: - name: config @@ -142,7 +136,8 @@ spec: cat << EOF > /tmp/config.yaml {{inputs.parameters.config}} EOF - - name: prepare-config-fv3gfs + - &prepare-config-fv3gfs + name: prepare-config-fv3gfs inputs: artifacts: - name: config @@ -150,7 +145,7 @@ spec: outputs: artifacts: - {name: fv3config, path: /tmp/fv3config.yaml} - container: + container: &prepare-config-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run resources: requests: @@ -166,36 +161,20 @@ spec: mountPath: /tmp args: - prepare-config /tmp/config.yaml > /tmp/fv3config.yaml - - name: prepare-config-shield - inputs: - artifacts: - - name: config - path: /tmp/config.yaml - outputs: - artifacts: - - {name: fv3config, path: /tmp/fv3config.yaml} + - <<: *prepare-config-fv3gfs + name: prepare-config-shield container: + <<: *prepare-config-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run_shield - resources: - requests: - memory: "500Mi" - cpu: "700m" - limits: - memory: "500Mi" - cpu: "700m" - command: ["bash", "-c", "-x", "-e"] - workingDir: /fv3net/workflows/prognostic_c48_run - volumeMounts: - - name: workdir - mountPath: /tmp - args: + env: # Without setting the DGLBACKEND some additional text gets added to - # the prepared config (it's a warning that the default backend was set - # to pytorch). I do not fully understand why this happens in this - # image but not the FV3GFS prognostic run image. Should I just set - # this in the Dockerfile itself? - - DGLBACKEND=pytorch prepare-config /tmp/config.yaml > /tmp/fv3config.yaml - - name: get-wrapper + # the prepared config (a warning that the default backend was set to + # pytorch). I do not fully understand why this happens in this image + # but not the FV3GFS prognostic run image. Should I just set this in + # the Dockerfile itself? 
+ - name: DGLBACKEND + value: pytorch + - name: infer-wrapper inputs: artifacts: - name: config diff --git a/workflows/argo/restart-prognostic-run.yaml b/workflows/argo/restart-prognostic-run.yaml index b9fcd9a140..1c8587b44a 100644 --- a/workflows/argo/restart-prognostic-run.yaml +++ b/workflows/argo/restart-prognostic-run.yaml @@ -26,7 +26,7 @@ spec: steps: - - name: choose-node-pool templateRef: - name: run-fv3gfs + name: run-simulation template: choose-node-pool arguments: parameters: @@ -34,9 +34,16 @@ spec: - {name: cpu-cutoff, value: "24"} - {name: memory-request, value: "{{inputs.parameters.memory}}"} - {name: memory-cutoff, value: "30"} - - - name: restart-run + - - name: infer-wrapper + template: infer-wrapper + arguments: + parameters: + - {name: url, value: "{{inputs.parameters.url}}"} + - - &restart-run-fv3gfs + name: restart-run-fv3gfs + when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: - name: run-fv3gfs + name: run-simulation template: run-all-segments arguments: parameters: @@ -46,3 +53,28 @@ spec: - {name: memory, value: "{{inputs.parameters.memory}}"} - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} - {name: segment, value: 0} + - - <<: *restart-run-fv3gfs + name: restart-run-shield + when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" + templateRef: + name: run-simulation + template: run-all-segments-shield + - name: infer-wrapper + inputs: + parameters: + - name: url + script: + image: + us.gcr.io/vcm-ml/fv3net + command: [python] + source: | + import os + import fsspec + import yaml + + config_path = os.path.join("{{inputs.parameters.url}}", "fv3config.yml") + fs, *_ = fsspec.get_fs_token_paths(config_path) + with fs.open(config_path, "r") as file: + config = yaml.safe_load(file) + + print(config.get("wrapper", "fv3gfs.wrapper")) diff --git a/workflows/argo/run-shield.yaml b/workflows/argo/run-shield.yaml deleted file mode 100644 index 856db59d65..0000000000 --- a/workflows/argo/run-shield.yaml +++ /dev/null @@ -1,160 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: WorkflowTemplate -metadata: - name: run-shield -spec: - entrypoint: run-shield - templates: - - name: run-shield - inputs: - artifacts: - - name: fv3config - parameters: - - name: output-url - - {name: cpu, value: "6"} - - {name: memory, value: 8Gi} - - {name: segment-count, value: "1"} - steps: - - - template: create-run - name: create-run - arguments: - parameters: - - {name: runURL, value: "{{inputs.parameters.output-url}}"} - artifacts: - - name: fv3config - from: "{{inputs.artifacts.fv3config}}" - - name: choose-node-pool - templateRef: - name: run-fv3gfs - template: choose-node-pool - arguments: - parameters: - - {name: cpu-request, value: "{{inputs.parameters.cpu}}"} - - {name: cpu-cutoff, value: "24"} - - {name: memory-request, value: "{{inputs.parameters.memory}}"} - - {name: memory-cutoff, value: "30"} - # loop over segments implemented through recursion so that a failed segment will - # terminate the workflow. Argo loops by default run in parallel and do not fail fast. 
- - - name: run-first-segment - template: run-all-segments - arguments: - parameters: - - {name: output-url, value: "{{inputs.parameters.output-url}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} - - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} - - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - - {name: segment, value: 0} - - name: run-all-segments - inputs: - parameters: - - name: output-url - - name: cpu - - name: memory - - name: node-pool - - name: segment-count - - name: segment - steps: - - - template: append-segment - name: append-segment - arguments: - parameters: - - {name: runURL, value: "{{inputs.parameters.output-url}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} - - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - - name: increment-segment - templateRef: - name: run-fv3gfs - template: increment-count - arguments: - parameters: - - {name: count, value: "{{inputs.parameters.segment}}"} - - - name: run-next-segment - template: run-all-segments - when: "{{steps.increment-segment.outputs.result}} < {{inputs.parameters.segment-count}}" - arguments: - parameters: - - {name: output-url, value: "{{inputs.parameters.output-url}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} - - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - - {name: segment, value: "{{steps.increment-segment.outputs.result}}"} - - name: create-run - inputs: - artifacts: - - name: fv3config - path: /mnt/data/fv3config.yaml - parameters: - - name: runURL - container: - image: us.gcr.io/vcm-ml/prognostic_run_shield - command: ["/bin/bash", "-c", "-x", "-e"] - resources: - limits: - memory: "500Mi" - cpu: "500m" - args: - - | - find /mnt/data - echo "Using fv3config:" - cat /mnt/data/fv3config.yaml - runfv3 create {{inputs.parameters.runURL}} /mnt/data/fv3config.yaml - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /secret/gcp-credentials/key.json - - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE - value: /secret/gcp-credentials/key.json - - name: FSSPEC_GS_REQUESTER_PAYS - value: vcm-ml - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - name: gcp-key-secret - mountPath: /secret/gcp-credentials - readOnly: true - - name: append-segment - inputs: - parameters: - - name: cpu - - name: memory - - name: runURL - - name: node-pool - tolerations: - - key: "dedicated" - operator: "Equal" - value: "{{inputs.parameters.node-pool}}" - effect: "NoSchedule" - metadata: - labels: - app: fv3run - podSpecPatch: | - containers: - - name: main - resources: - limits: - cpu: "{{inputs.parameters.cpu}}" - memory: "{{inputs.parameters.memory}}" - requests: - cpu: "{{inputs.parameters.cpu}}" - memory: "{{inputs.parameters.memory}}" - container: - image: us.gcr.io/vcm-ml/prognostic_run_shield - command: [runfv3] - args: ["append", "{{inputs.parameters.runURL}}"] - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /secret/gcp-credentials/key.json - - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE - value: /secret/gcp-credentials/key.json - - name: FSSPEC_GS_REQUESTER_PAYS - value: vcm-ml - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - name: gcp-key-secret - mountPath: /secret/gcp-credentials - 
readOnly: true - - name: dshm - mountPath: /dev/shm diff --git a/workflows/argo/run-fv3gfs.yaml b/workflows/argo/run-simulation.yaml similarity index 76% rename from workflows/argo/run-fv3gfs.yaml rename to workflows/argo/run-simulation.yaml index 5a8032cf95..b56298ca45 100644 --- a/workflows/argo/run-fv3gfs.yaml +++ b/workflows/argo/run-simulation.yaml @@ -1,11 +1,11 @@ apiVersion: argoproj.io/v1alpha1 kind: WorkflowTemplate metadata: - name: run-fv3gfs + name: run-simulation spec: - entrypoint: run-fv3gfs templates: - - name: run-fv3gfs + - &run-fv3gfs + name: run-fv3gfs inputs: artifacts: - name: fv3config @@ -15,7 +15,8 @@ spec: - {name: memory, value: 8Gi} - {name: segment-count, value: "1"} steps: - - - template: create-run + - - &create-run-step-fv3gfs + template: create-run name: create-run arguments: parameters: @@ -23,7 +24,8 @@ spec: artifacts: - name: fv3config from: "{{inputs.artifacts.fv3config}}" - - name: choose-node-pool + - &choose-node-pool-step-fv3gfs + name: choose-node-pool template: choose-node-pool arguments: parameters: @@ -33,7 +35,8 @@ spec: - {name: memory-cutoff, value: "30"} # loop over segments implemented through recursion so that a failed segment will # terminate the workflow. Argo loops by default run in parallel and do not fail fast. - - - name: run-first-segment + - - &run-first-segment-step-fv3gfs + name: run-first-segment template: run-all-segments arguments: parameters: @@ -43,7 +46,8 @@ spec: - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - {name: segment, value: 0} - - name: run-all-segments + - &run-all-segments-fv3gfs + name: run-all-segments inputs: parameters: - name: output-url @@ -53,7 +57,8 @@ spec: - name: segment-count - name: segment steps: - - - template: append-segment + - - &append-segment-step-fv3gfs + template: append-segment name: append-segment arguments: parameters: @@ -61,12 +66,14 @@ spec: - {name: cpu, value: "{{inputs.parameters.cpu}}"} - {name: memory, value: "{{inputs.parameters.memory}}"} - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - - name: increment-segment + - - &increment-segment-step-fv3gfs + name: increment-segment template: increment-count arguments: parameters: - {name: count, value: "{{inputs.parameters.segment}}"} - - - name: run-next-segment + - - &run-next-segment-step-fv3gfs + name: run-next-segment template: run-all-segments when: "{{steps.increment-segment.outputs.result}} < {{inputs.parameters.segment-count}}" arguments: @@ -77,14 +84,15 @@ spec: - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - {name: segment, value: "{{steps.increment-segment.outputs.result}}"} - - name: create-run + - &create-run-fv3gfs + name: create-run inputs: artifacts: - name: fv3config path: /mnt/data/fv3config.yaml parameters: - name: runURL - container: + container: &create-run-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run command: ["/bin/bash", "-c", "-x", "-e"] resources: @@ -142,7 +150,8 @@ spec: else: node_pool = 'ultra-sim-pool' print(node_pool) - - name: append-segment + - &append-segment-fv3gfs + name: append-segment inputs: parameters: - name: cpu @@ -167,7 +176,7 @@ spec: requests: cpu: "{{inputs.parameters.cpu}}" memory: "{{inputs.parameters.memory}}" - container: + container: &append-segment-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run command: [runfv3] args: ["append", "{{inputs.parameters.runURL}}"] @@ 
-195,3 +204,35 @@ spec: command: [python] source: | print({{inputs.parameters.count}} + 1) + + # Template for running SHiELD instead of FV3GFS; note we have been careful to + # rename all the SHiELD-wrapper-dependent templates such that there is no risk + # of using a template meant for running FV3GFS with SHiELD or vice-versa. The + # choose-node-pool and increment-segment steps are identical between the two + # workflows, so we do not bother to do any renaming of those. + - <<: *run-fv3gfs + name: run-shield + steps: + - - <<: *create-run-step-fv3gfs + template: create-run-shield + - *choose-node-pool-step-fv3gfs + - - <<: *run-first-segment-step-fv3gfs + template: run-all-segments-shield + - <<: *run-all-segments-fv3gfs + name: run-all-segments-shield + steps: + - - <<: *append-segment-step-fv3gfs + template: append-segment-shield + - - *increment-segment-step-fv3gfs + - - <<: *run-next-segment-step-fv3gfs + template: run-all-segments-shield + - <<: *create-run-fv3gfs + name: create-run-shield + container: + <<: *create-run-container-fv3gfs + image: us.gcr.io/vcm-ml/prognostic_run_shield + - <<: *append-segment-fv3gfs + name: append-segment-shield + container: + <<: *append-segment-container-fv3gfs + image: us.gcr.io/vcm-ml/prognostic_run_shield From fac244f13dbf9a7c781b68e051fb70cc9b9d588b Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Fri, 8 Dec 2023 13:41:48 +0000 Subject: [PATCH 4/9] Update forcing in SHiELD base config This references forcing data in the vcm-fv3config bucket, and makes parameters controlling whether we use data from initial conditions or the climatology consistent with our v0.7 FV3GFS base config. --- .../base_yamls/SHiELD/v0.1/fv3config.yml | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml index ce48b327fc..c49d5af9f3 100644 --- a/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml +++ b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml @@ -2,7 +2,7 @@ data_table: default diag_table: no_output experiment_name: default field_table: gs://vcm-fv3config/config/field_table/TKE-EDMF/v1.1/field_table -forcing: gs://vcm-ml-experiments/spencerc/2023-09-13-SHiELD-forcing-data +forcing: gs://vcm-fv3config/data/base_forcing/SHiELD/v1.0/C48 initial_conditions: '' # no default provided namelist: fms_affinity_nml: @@ -230,14 +230,15 @@ namelist: interpolator_nml: interp_method: conserve_great_circle namsfc: - fabsl: 99999 - faisl: 99999 - faiss: 99999 - fnabsc: grb/global_mxsnoalb.uariz.t1534.3072.1536.rg.grb + fabsl: 0 # Use maximum snow albedo from forcing files instead of initial conditions + fabss: 0 # Use maximum snow albedo from forcing files instead of initial conditions + faisl: 0 # Use land / sea / sea-ice mask from forcing files instead of initial conditions + faiss: 0 # Use land / sea / sea-ice mask from forcing files instead of initial conditions + fnabsc: INPUT/fix_sfc/maximum_snow_albedo.tileX.nc fnacna: '' fnaisc: grb/CFSR.SEAICE.1982.2012.monthly.clim.grb - fnalbc: grb/global_snowfree_albedo.bosu.t1534.3072.1536.rg.grb - fnalbc2: grb/global_albedo4.1x1.grb + fnalbc: INPUT/fix_sfc/snowfree_albedo.tileX.nc + fnalbc2: INPUT/fix_sfc/facsf.tileX.nc fnglac: grb/global_glacier.2x2.grb fnmskh: grb/seaice_newland.grb fnmxic: grb/global_maxice.2x2.grb @@ -245,30 +246,32 @@ namelist: fnsmcc: grb/global_soilmgldas.t1534.3072.1536.grb fnsnoa: '' fnsnoc: grb/global_snoclim.1.875.grb - 
fnsotc: grb/global_soiltype.statsgo.t1534.3072.1536.rg.grb - fntg3c: grb/global_tg3clim.2.6x1.5.grb + fnsotc: INPUT/fix_sfc/soil_type.tileX.nc + fntg3c: INPUT/fix_sfc/substrate_temperature.tileX.nc fntsfa: '' fntsfc: grb/RTGSST.1982.2012.monthly.clim.grb - fnvegc: grb/global_vegfrac.0.144.decpercent.grb - fnvetc: grb/global_vegtype.igbp.t1534.3072.1536.rg.grb - fnvmnc: grb/global_shdmin.0.144x0.144.grb - fnvmxc: grb/global_shdmax.0.144x0.144.grb + fnvegc: INPUT/fix_sfc/vegetation_greenness.tileX.nc + fnvetc: INPUT/fix_sfc/vegetation_type.tileX.nc + fnvmnc: INPUT/fix_sfc/vegetation_greenness.tileX.nc + fnvmxc: INPUT/fix_sfc/vegetation_greenness.tileX.nc fnzorc: igbp - fsicl: 99999 - fsics: 99999 - fslpl: 99999 + fsicl: 0 # Use sea ice fraction from forcing files instead of persisting the fraction in the initial condition + fsics: 0 # Use sea ice fraction from forcing files instead of persisting the fraction in the initial condition + fslpl: 99999 # Use slope type from initial condition fsmcl: + - 99999 # Use soil moisture from initial condition - 99999 - 99999 - - 99999 - fsnol: 99999 - fsnos: 99999 + fsnol: 99999 # Use snow cover fraction from initial condition + fsnos: 99999 # Use snow cover fraction from initial condition fsotl: 99999 ftsfl: 99999 - ftsfs: 90 + ftsfs: 0 # Use only climatological SSTs, no relaxation from initial conditions fvetl: 99999 - fvmnl: 99999 - fvmxl: 99999 + fvmnl: 0 # Use minimum green vegetation fraction from forcing files instead of initial condition + fvmns: 0 # Use minimum green vegetation fraction from forcing files instead of initial condition + fvmxl: 0 # Use maximum green vegetation fraction from forcing files instead of initial condition + fvmxs: 0 # Use maximum green vegetation fraction from forcing files instead of initial condition ldebug: false ocean_nml: do_mld_restore: true From 942ab21633d976fd35325bdccc726b6118f5978a Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Wed, 13 Dec 2023 17:42:39 +0000 Subject: [PATCH 5/9] Push fix to name field in templateRef we made earlier --- workflows/argo/prognostic-run.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml index 8fe0dadc41..e7d32820b3 100644 --- a/workflows/argo/prognostic-run.yaml +++ b/workflows/argo/prognostic-run.yaml @@ -69,7 +69,7 @@ spec: failed: true when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: - name: run-fv3gfs + name: run-simulation template: run-fv3gfs arguments: artifacts: @@ -83,7 +83,7 @@ spec: name: run-model-shield when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" templateRef: - name: run-shield + name: run-simulation template: run-shield arguments: artifacts: From 65ca5c839b28d8b85337d43b6da6274eb46dcfa3 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Wed, 13 Dec 2023 18:51:13 +0000 Subject: [PATCH 6/9] Bump SHiELD-wrapper to include Q-flux fix This bumps SHiELD-wrapper to include a Q-flux bug fix, and a couple other user experience improvements with the SOM. Note this fix has not been merged to SHiELD yet, so I will update this PR later once we can point to main branches of SHiELD-wrapper and SHiELD_physics. 
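An aside on the anchor-and-alias technique introduced in PATCH 3 (and touched up by the templateRef fix in PATCH 5 above): YAML aliases and `<<` merge keys are expanded by the YAML loader itself, so the Argo server still receives fully duplicated template definitions; only the source file gets shorter. A minimal PyYAML round-trip sketch, with image names copied from the diffs above, illustrates the expansion (PyYAML's safe_load honors YAML 1.1 merge keys):

    import yaml

    document = """
    templates:
    - &run-fv3gfs
      name: run-fv3gfs
      container: {image: us.gcr.io/vcm-ml/prognostic_run}
    - <<: *run-fv3gfs
      name: run-shield
      container: {image: us.gcr.io/vcm-ml/prognostic_run_shield}
    """

    templates = yaml.safe_load(document)["templates"]
    # The alias plus merge key expands into two independent mappings; keys
    # written alongside "<<" override the merged-in values.
    assert templates[1]["name"] == "run-shield"
    assert templates[1]["container"]["image"] == "us.gcr.io/vcm-ml/prognostic_run_shield"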
---
 external/SHiELD-wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SHiELD-wrapper b/external/SHiELD-wrapper
index 48c83c96c2..6a4e165059 160000
--- a/external/SHiELD-wrapper
+++ b/external/SHiELD-wrapper
@@ -1 +1 @@
-Subproject commit 48c83c96c274ad3631aaa9d54a17770c4d540dc4
+Subproject commit 6a4e165059d3067f39c45e002f65cacbde0f761d

From d6fd2753d3d2839e2a95e3607e0a865bbd4d41fb Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Wed, 20 Dec 2023 21:35:42 +0000
Subject: [PATCH 7/9] Point SHiELD-wrapper to a commit on main

---
 external/SHiELD-wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SHiELD-wrapper b/external/SHiELD-wrapper
index 6a4e165059..f82a7130f7 160000
--- a/external/SHiELD-wrapper
+++ b/external/SHiELD-wrapper
@@ -1 +1 @@
-Subproject commit 6a4e165059d3067f39c45e002f65cacbde0f761d
+Subproject commit f82a7130f7f2bbc3a7bc26860517694f13e9ef1b

From 88f91a074154d12a4cdd3289ec9391ccf0e83a4e Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Thu, 21 Dec 2023 19:45:40 +0000
Subject: [PATCH 8/9] Set DGLBACKEND=pytorch in prognostic_run_shield image

---
 docker/prognostic_run_shield/Dockerfile | 4 ++++
 workflows/argo/prognostic-run.yaml      | 8 --------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/docker/prognostic_run_shield/Dockerfile b/docker/prognostic_run_shield/Dockerfile
index bd0b965d71..fc325ef459 100644
--- a/docker/prognostic_run_shield/Dockerfile
+++ b/docker/prognostic_run_shield/Dockerfile
@@ -127,5 +127,9 @@ ENV OUTPUT_FREQ_SEC=18000
 # Add fv3net packages to the PYTHONPATH
 ENV PYTHONPATH=${FV3NET_DIR}/workflows/prognostic_c48_run:${FV3NET_DIR}/external/fv3fit:${FV3NET_DIR}/external/emulation:${FV3NET_DIR}/external/vcm:${FV3NET_DIR}/external/artifacts:${FV3NET_DIR}/external/loaders:${FV3NET_DIR}/external/fv3kube:${FV3NET_DIR}/workflows/post_process_run:${FV3NET_DIR}/external/radiation:${PYTHONPATH}
 
+# Set DGLBACKEND to pytorch to silence warnings that it is unset; DGL defaults
+# to pytorch when unset, so this just makes the default explicit.
+ENV DGLBACKEND=pytorch
+
 WORKDIR ${FV3NET_DIR}/workflows/prognostic_c48_run
 CMD ["bash"]

diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml
index e7d32820b3..64ca4986d5 100644
--- a/workflows/argo/prognostic-run.yaml
+++ b/workflows/argo/prognostic-run.yaml
@@ -166,14 +166,6 @@ spec:
     container:
       <<: *prepare-config-container-fv3gfs
       image: us.gcr.io/vcm-ml/prognostic_run_shield
-      env:
-        # Without setting the DGLBACKEND some additional text gets added to
-        # the prepared config (a warning that the default backend was set to
-        # pytorch). I do not fully understand why this happens in this image
-        # but not the FV3GFS prognostic run image. Should I just set this in
-        # the Dockerfile itself?
-        - name: DGLBACKEND
-          value: pytorch
   - name: infer-wrapper
     inputs:
       artifacts:

From 2148134e87161f778084a2bfdce517f2ffe57cb9 Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Thu, 21 Dec 2023 21:53:17 +0000
Subject: [PATCH 9/9] Bump SHiELD-wrapper for Q-flux bug fix and diagnostic
 updates

---
 external/SHiELD-wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SHiELD-wrapper b/external/SHiELD-wrapper
index f82a7130f7..1e5939d604 160000
--- a/external/SHiELD-wrapper
+++ b/external/SHiELD-wrapper
@@ -1 +1 @@
-Subproject commit f82a7130f7f2bbc3a7bc26860517694f13e9ef1b
+Subproject commit 1e5939d604c6958f6f1f10452fea1570b359e494
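Taken together, after this series a run opts into SHiELD entirely through its configuration: the `wrapper` key steers the prognostic-run workflow to the SHiELD templates, and `base_version` can select the new SHiELD base fv3config. A sketch of the convention, assuming `base_version` values are keyed exactly as in the BASE_FV3CONFIG_BY_VERSION mapping from PATCH 1 and using a placeholder GCS path for the initial conditions:

    import yaml

    # Hypothetical user config; only `wrapper` and `base_version` matter here.
    user_config = yaml.safe_load("""
    base_version: SHiELD/v0.1
    wrapper: shield.wrapper
    initial_conditions: gs://some-bucket/path/to/initial/conditions
    """)

    # The same convention the infer-wrapper scripts in prognostic-run.yaml and
    # restart-prognostic-run.yaml apply: FV3GFS remains the default whenever
    # the `wrapper` key is absent.
    wrapper = user_config.get("wrapper", "fv3gfs.wrapper")
    assert wrapper == "shield.wrapper"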