From 32b86dba094b3ed8d6377e7bf5eeea5cf701c907 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sat, 2 Dec 2023 01:12:43 +0000 Subject: [PATCH 1/9] Split out argo integration from #2350 --- .../{ => FV3GFS}/v0.2/fv3config.yml | 0 .../{ => FV3GFS}/v0.3/fv3config.yml | 0 .../{ => FV3GFS}/v0.4/fv3config.yml | 0 .../{ => FV3GFS}/v0.5/fv3config.yml | 0 .../{ => FV3GFS}/v0.6/fv3config.yml | 0 .../{ => FV3GFS}/v0.7/fv3config.yml | 0 .../base_yamls/SHiELD/v0.1/fv3config.yml | 289 ++++++++++++++++++ external/fv3kube/fv3kube/config.py | 13 +- workflows/argo/kustomization.yaml | 1 + workflows/argo/prognostic-run.yaml | 120 +++++++- workflows/argo/run-shield.yaml | 195 ++++++++++++ 11 files changed, 598 insertions(+), 20 deletions(-) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.2/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.3/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.4/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.5/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.6/fv3config.yml (100%) rename external/fv3kube/fv3kube/base_yamls/{ => FV3GFS}/v0.7/fv3config.yml (100%) create mode 100644 external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml create mode 100644 workflows/argo/run-shield.yaml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.2/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.2/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.2/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.2/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.3/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.3/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.3/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.3/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.4/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.4/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.4/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.4/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.5/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.5/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.5/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.5/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.6/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.6/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.6/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.6/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/v0.7/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.7/fv3config.yml similarity index 100% rename from external/fv3kube/fv3kube/base_yamls/v0.7/fv3config.yml rename to external/fv3kube/fv3kube/base_yamls/FV3GFS/v0.7/fv3config.yml diff --git a/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml new file mode 100644 index 0000000000..ce48b327fc --- /dev/null +++ b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml @@ -0,0 +1,289 @@ +data_table: default +diag_table: no_output +experiment_name: default +field_table: gs://vcm-fv3config/config/field_table/TKE-EDMF/v1.1/field_table 
+forcing: gs://vcm-ml-experiments/spencerc/2023-09-13-SHiELD-forcing-data +initial_conditions: '' # no default provided +namelist: + fms_affinity_nml: + affinity: false + amip_interp_nml: + data_set: reynolds_oi + date_out_of_range: climo + interp_oi_sst: true + no_anom_sst: false + use_ncep_ice: false + use_ncep_sst: true + atmos_model_nml: + blocksize: 24 + chksum_debug: false + dycore_only: false + fdiag: 0.25 + first_time_step: false + coupler_nml: + atmos_nthreads: 1 + calendar: julian + current_date: + - 2016 + - 8 + - 1 + - 0 + - 0 + - 0 + days: 0 + dt_atmos: 900 + dt_ocean: 900 + hours: 0 + minutes: 30 + months: 0 + seconds: 0 + use_hyper_thread: false + diag_manager_nml: + prepend_date: false + external_ic_nml: + checker_tr: false + filtered_terrain: true + gfs_dwinds: true + levp: 64 + nt_checker: 0 + fms_io_nml: + checksum_required: false + max_files_r: 100 + max_files_w: 100 + fms_nml: + clock_grain: ROUTINE + domains_stack_size: 3000000 + print_memory_usage: false + fv_core_nml: + a_imp: 1.0 + adjust_dry_mass: false + beta: 0.0 + consv_am: false + consv_te: 0.0 + d2_bg: 0.0 + d2_bg_k1: 0.2 + d2_bg_k2: 0.1 + d4_bg: 0.15 + d_con: 1.0 + d_ext: 0.0 + dddmp: 0.5 + delt_max: 0.002 + dnats: 1 + do_vort_damp: true + dwind_2d: false + external_ic: false + fill: true + fill_dp: true + fv_debug: false + fv_sg_adj: 300 + gfs_phil: false + grid_type: 0 + hord_dp: -5 + hord_mt: 5 + hord_tm: 5 + hord_tr: -5 + hord_vt: 5 + hydrostatic: false + io_layout: + - 1 + - 1 + k_split: 1 + ke_bg: 0.0 + kord_mt: 9 + kord_tm: -9 + kord_tr: 9 + kord_wz: -9 + layout: + - 1 + - 1 + make_nh: false + mountain: true + n_split: 6 + n_sponge: 30 + na_init: 0 + ncep_ic: false + nggps_ic: false + no_dycore: false + nord: 3 + npx: 49 + npy: 49 + npz: 79 + npz_type: 'gcrm' + ntiles: 6 + nudge_qv: true + nwat: 6 + p_fac: 0.05 + phys_hydrostatic: false + print_freq: 1 + range_warn: true + reset_eta: false + rf_cutoff: 3000.0 + rf_fast: true + sg_cutoff: 20000.0 + tau: 5.0 + tau_h2o: 0.0 + use_hydro_pressure: false + vtdm4: 0.06 + warm_start: true + z_tracer: true + gfdl_mp_nml: + c_paut: 0.5 + c_pgacs: 0.01 + c_psaci: 0.01 + ccn_l: 300.0 + ccn_o: 100.0 + const_vg: false + const_vi: false + const_vr: false + const_vs: false + do_qa: true + do_cond_timescale: true + do_sedi_heat: true + do_sedi_w: true + dw_land: 0.15 + dw_ocean: 0.1 + fix_negative: true + icloud_f: 0 + irain_f: 0 + prog_ccn: false + qi0_crt: 8.0e-05 + qi_lim: 1.0 + ql_gen: 0.001 + ql_mlt: 0.002 + qs_mlt: 1.0e-6 + qs0_crt: 0.003 + rad_graupel: true + rad_rain: true + rad_snow: true + rh_inc: 0.2 + rh_inr: 0.3 + rh_ins: 0.3 + rthresh: 8.0e-6 + tau_i2s: 1000.0 + tau_l2v: 300.0 + tau_v2l: 90.0 + vg_fac: 1.0 + vg_max: 16.0 + vi_fac: 0.85 + vi_max: 1.0 + vr_fac: 1.0 + vr_max: 16.0 + vs_fac: 1.0 + vs_max: 2.0 + z_slope_ice: true + z_slope_liq: true + gfs_physics_nml: + cal_pre: false + cap_k0_land: false + cdmbgwd: + - 3.5 + - 0.25 + cloud_gfdl: true + cnvcld: false + cnvgwd: true + debug: false + do_deep: true + do_ocean: false # Turn off the mixed layer ocean for now. 
+ dspheat: true + fhcyc: 24.0 + fhlwr: 1800.0 + fhswr: 1800.0 + fhzero: 0.25 + gwd_p_crit: 2000.0 + hybedmf: false + iaer: 111 + ialb: 1 + ico2: 2 + iems: 1 + imfdeepcnv: 2 + imfshalcnv: 2 + isatmedmf: 1 + isol: 2 + isot: 1 + isubc_lw: 2 + isubc_sw: 2 + ivegsrc: 1 + ldiag3d: true + lwhtr: true + ncld: 5 + nst_anl: true + pdfcld: true + pre_rad: false + prslrd0: 0.0 + random_clds: false + redrag: true + satmedmf: true + shal_cnv: true + swhtr: true + trans_trac: true + use_ufo: true + xkzm_ml: 2.0 + xkzm_hl: 2.0 + xkzm_mi: 1.5 + xkzm_hi: 1.5 + xkzminv: 0.0 + ysupbl: false + zhao_mic: false + integ_phys_nml: + do_inline_mp: true + do_sat_adj: false + interpolator_nml: + interp_method: conserve_great_circle + namsfc: + fabsl: 99999 + faisl: 99999 + faiss: 99999 + fnabsc: grb/global_mxsnoalb.uariz.t1534.3072.1536.rg.grb + fnacna: '' + fnaisc: grb/CFSR.SEAICE.1982.2012.monthly.clim.grb + fnalbc: grb/global_snowfree_albedo.bosu.t1534.3072.1536.rg.grb + fnalbc2: grb/global_albedo4.1x1.grb + fnglac: grb/global_glacier.2x2.grb + fnmskh: grb/seaice_newland.grb + fnmxic: grb/global_maxice.2x2.grb + fnslpc: grb/global_slope.1x1.grb + fnsmcc: grb/global_soilmgldas.t1534.3072.1536.grb + fnsnoa: '' + fnsnoc: grb/global_snoclim.1.875.grb + fnsotc: grb/global_soiltype.statsgo.t1534.3072.1536.rg.grb + fntg3c: grb/global_tg3clim.2.6x1.5.grb + fntsfa: '' + fntsfc: grb/RTGSST.1982.2012.monthly.clim.grb + fnvegc: grb/global_vegfrac.0.144.decpercent.grb + fnvetc: grb/global_vegtype.igbp.t1534.3072.1536.rg.grb + fnvmnc: grb/global_shdmin.0.144x0.144.grb + fnvmxc: grb/global_shdmax.0.144x0.144.grb + fnzorc: igbp + fsicl: 99999 + fsics: 99999 + fslpl: 99999 + fsmcl: + - 99999 + - 99999 + - 99999 + fsnol: 99999 + fsnos: 99999 + fsotl: 99999 + ftsfl: 99999 + ftsfs: 90 + fvetl: 99999 + fvmnl: 99999 + fvmxl: 99999 + ldebug: false + ocean_nml: + do_mld_restore: true + end_lat: 30.0 + eps_day: 10.0 + gam: 0.12 + mld_obs_ratio: 1.0 + mld_option: obs + mld_restore_tscale: 15.0 + ocean_option: MLM + restore_method: 2 + sst_restore_tscale: 15.0 + start_lat: -45.0 + end_lat: 45.0 + stress_ratio: 0.75 + use_old_mlm: true + use_rain_flux: true +orographic_forcing: gs://vcm-fv3config/data/orographic_data/v1.0 diff --git a/external/fv3kube/fv3kube/config.py b/external/fv3kube/fv3kube/config.py index 70aac8ea0c..61e6f7c420 100644 --- a/external/fv3kube/fv3kube/config.py +++ b/external/fv3kube/fv3kube/config.py @@ -11,12 +11,13 @@ # Map for different base fv3config dictionaries PWD = Path(os.path.abspath(__file__)).parent BASE_FV3CONFIG_BY_VERSION = { - "v0.2": os.path.join(PWD, "base_yamls/v0.2/fv3config.yml"), - "v0.3": os.path.join(PWD, "base_yamls/v0.3/fv3config.yml"), - "v0.4": os.path.join(PWD, "base_yamls/v0.4/fv3config.yml"), - "v0.5": os.path.join(PWD, "base_yamls/v0.5/fv3config.yml"), - "v0.6": os.path.join(PWD, "base_yamls/v0.6/fv3config.yml"), - "v0.7": os.path.join(PWD, "base_yamls/v0.7/fv3config.yml"), + "v0.2": os.path.join(PWD, "base_yamls/FV3GFS/v0.2/fv3config.yml"), + "v0.3": os.path.join(PWD, "base_yamls/FV3GFS/v0.3/fv3config.yml"), + "v0.4": os.path.join(PWD, "base_yamls/FV3GFS/v0.4/fv3config.yml"), + "v0.5": os.path.join(PWD, "base_yamls/FV3GFS/v0.5/fv3config.yml"), + "v0.6": os.path.join(PWD, "base_yamls/FV3GFS/v0.6/fv3config.yml"), + "v0.7": os.path.join(PWD, "base_yamls/FV3GFS/v0.7/fv3config.yml"), + "SHiELD/v0.1": os.path.join(PWD, "base_yamls/SHiELD/v0.1/fv3config.yml"), } TILE_COORDS_FILENAMES = range(1, 7) # tile numbering in model output filenames diff --git a/workflows/argo/kustomization.yaml 
b/workflows/argo/kustomization.yaml index 154644b44f..6438c06934 100644 --- a/workflows/argo/kustomization.yaml +++ b/workflows/argo/kustomization.yaml @@ -6,6 +6,7 @@ resources: - training-gpu.yaml - training-torch.yaml - run-fv3gfs.yaml +- run-shield.yaml - offline-diags.yaml - train-diags-prog.yaml - cubed-to-latlon.yaml diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml index 32849d8539..47de3484d9 100644 --- a/workflows/argo/prognostic-run.yaml +++ b/workflows/argo/prognostic-run.yaml @@ -38,20 +38,58 @@ spec: - {name: bucket, value: "{{inputs.parameters.bucket}}"} - {name: project, value: "{{inputs.parameters.project}}"} - {name: tag, value: "{{inputs.parameters.tag}}"} - - - name: prepare-config - template: prepare-config + - - name: prepare-input-config + template: prepare-input-config arguments: parameters: - {name: config, value: "{{inputs.parameters.config}}"} - - - name: run-model + - - template: get-wrapper + name: get-wrapper + arguments: + artifacts: + - name: config + from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + parameters: + - {name: config, value: "{{inputs.parameters.config}}"} + - - name: prepare-config-fv3gfs + template: prepare-config-fv3gfs + when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" + arguments: + artifacts: + - name: config + from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + - - name: prepare-config-shield + template: prepare-config-shield + when: "'{{steps.get-wrapper.outputs.result}}' == 'shield.wrapper'" + arguments: + artifacts: + - name: config + from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + - - name: run-model-fv3gfs continueOn: failed: true + when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: name: run-fv3gfs template: run-fv3gfs arguments: artifacts: - - {name: fv3config, from: "{{steps.prepare-config.outputs.artifacts.fv3config}}"} + - {name: fv3config, from: "{{steps.prepare-config-fv3gfs.outputs.artifacts.fv3config}}"} + parameters: + - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} + - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - - name: run-model-shield + continueOn: + failed: true + when: "'{{steps.get-wrapper.outputs.result}}' == 'shield.wrapper'" + templateRef: + name: run-shield + template: run-shield + arguments: + artifacts: + - {name: fv3config, from: "{{steps.prepare-config-shield.outputs.artifacts.fv3config}}"} parameters: - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} @@ -81,15 +119,34 @@ spec: value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run_diagnostics" - - name: exit template: exit - when: "{{steps.run-model.status}} == Failed" + when: "{{steps.run-model-fv3gfs.status}} == Failed || {{steps.run-model-shield.status}} == Failed}" - name: exit container: image: ubuntu:20.04 command: ["exit", "1"] - - name: prepare-config + - name: prepare-input-config inputs: parameters: - name: config + outputs: + artifacts: + - {name: config, path: /tmp/config.yaml} + container: + image: bash + command: ["bash", "-c", "-x", "-e"] + volumeMounts: + - name: workdir + mountPath: /tmp + args: + - | + cat << EOF > /tmp/config.yaml + {{inputs.parameters.config}} + EOF + - name: prepare-config-fv3gfs + inputs: + artifacts: + - 
name: config + path: /tmp/config.yaml outputs: artifacts: - {name: fv3config, path: /tmp/fv3config.yaml} @@ -108,11 +165,46 @@ spec: - name: workdir mountPath: /tmp args: - - | - cat << EOF > config.yaml - {{inputs.parameters.config}} - EOF - - prepare-config \ - config.yaml \ - > /tmp/fv3config.yaml + - prepare-config /tmp/config.yaml > /tmp/fv3config.yaml + - name: prepare-config-shield + inputs: + artifacts: + - name: config + path: /tmp/config.yaml + outputs: + artifacts: + - {name: fv3config, path: /tmp/fv3config.yaml} + container: + image: us.gcr.io/vcm-ml/prognostic_run_shield + resources: + requests: + memory: "500Mi" + cpu: "700m" + limits: + memory: "500Mi" + cpu: "700m" + command: ["bash", "-c", "-x", "-e"] + workingDir: /fv3net/workflows/prognostic_c48_run + volumeMounts: + - name: workdir + mountPath: /tmp + args: + # Without setting the DGLBACKEND some additional text gets added to + # the prepared config (it's a warning that the default backend was set + # to pytorch). I do not fully understand why this happens in this + # image but not the FV3GFS prognostic run image. Should I just set + # this in the Dockerfile itself? + - DGLBACKEND=pytorch prepare-config /tmp/config.yaml > /tmp/fv3config.yaml + - name: get-wrapper + inputs: + artifacts: + - name: config + path: /mnt/data/config.yaml + script: + image: us.gcr.io/vcm-ml/fv3net + command: [python] + source: | + import yaml + with open("/mnt/data/config.yaml", "r") as file: + config = yaml.safe_load(file) + print(config.get("wrapper", "fv3gfs.wrapper")) diff --git a/workflows/argo/run-shield.yaml b/workflows/argo/run-shield.yaml new file mode 100644 index 0000000000..3de3ffa588 --- /dev/null +++ b/workflows/argo/run-shield.yaml @@ -0,0 +1,195 @@ +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: run-shield +spec: + entrypoint: run-shield + templates: + - name: run-shield + inputs: + artifacts: + - name: fv3config + parameters: + - name: output-url + - {name: cpu, value: "6"} + - {name: memory, value: 8Gi} + - {name: segment-count, value: "1"} + steps: + - - template: create-run + name: create-run + arguments: + parameters: + - {name: runURL, value: "{{inputs.parameters.output-url}}"} + artifacts: + - name: fv3config + from: "{{inputs.artifacts.fv3config}}" + - name: choose-node-pool + template: choose-node-pool + arguments: + parameters: + - {name: cpu-request, value: "{{inputs.parameters.cpu}}"} + - {name: cpu-cutoff, value: "24"} + - {name: memory-request, value: "{{inputs.parameters.memory}}"} + - {name: memory-cutoff, value: "30"} + # loop over segments implemented through recursion so that a failed segment will + # terminate the workflow. Argo loops by default run in parallel and do not fail fast. 
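The fail-fast recursion described in the comment above can be read as the following Python sketch; the function and variable names are illustrative stand-ins for the Argo template and step names, not real fv3net code:

    def append_segment() -> None:
        """Stand-in for the `runfv3 append` container step."""

    def run_all_segments(segment: int, segment_count: int) -> None:
        # "append-segment" step: if it fails, the exception propagates and no
        # further segments are attempted, matching the fail-fast behavior of
        # sequential Argo steps.
        append_segment()
        next_segment = segment + 1  # "increment-segment" step
        # "run-next-segment" step, guarded by the template's `when:` clause.
        if next_segment < segment_count:
            run_all_segments(next_segment, segment_count)

Calling run_all_segments(0, segment_count) therefore runs segments strictly in order, whereas Argo's built-in withItems/withParam loops fan out in parallel and do not stop at the first failure.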
+ - - name: run-first-segment + template: run-all-segments + arguments: + parameters: + - {name: output-url, value: "{{inputs.parameters.output-url}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} + - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} + - {name: segment, value: 0} + - name: run-all-segments + inputs: + parameters: + - name: output-url + - name: cpu + - name: memory + - name: node-pool + - name: segment-count + - name: segment + steps: + - - template: append-segment + name: append-segment + arguments: + parameters: + - {name: runURL, value: "{{inputs.parameters.output-url}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} + - - name: increment-segment + template: increment-count + arguments: + parameters: + - {name: count, value: "{{inputs.parameters.segment}}"} + - - name: run-next-segment + template: run-all-segments + when: "{{steps.increment-segment.outputs.result}} < {{inputs.parameters.segment-count}}" + arguments: + parameters: + - {name: output-url, value: "{{inputs.parameters.output-url}}"} + - {name: cpu, value: "{{inputs.parameters.cpu}}"} + - {name: memory, value: "{{inputs.parameters.memory}}"} + - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} + - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} + - {name: segment, value: "{{steps.increment-segment.outputs.result}}"} + - name: create-run + inputs: + artifacts: + - name: fv3config + path: /mnt/data/fv3config.yaml + parameters: + - name: runURL + container: + image: us.gcr.io/vcm-ml/prognostic_run_shield + command: ["/bin/bash", "-c", "-x", "-e"] + resources: + limits: + memory: "500Mi" + cpu: "500m" + args: + - | + find /mnt/data + echo "Using fv3config:" + cat /mnt/data/fv3config.yaml + runfv3 create {{inputs.parameters.runURL}} /mnt/data/fv3config.yaml + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-credentials/key.json + - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE + value: /secret/gcp-credentials/key.json + - name: FSSPEC_GS_REQUESTER_PAYS + value: vcm-ml + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: gcp-key-secret + mountPath: /secret/gcp-credentials + readOnly: true + - name: choose-node-pool + inputs: + parameters: + - name: cpu-request + - name: cpu-cutoff + - name: memory-request + - name: memory-cutoff + + script: + image: python:alpine3.6 + command: [python] + source: | + cpu_request = "{{inputs.parameters.cpu-request}}" + if cpu_request.endswith('m'): + cpus = float(cpu_request[:-1])/1000.0 + else: + cpus = float(cpu_request) + memory_request = "{{inputs.parameters.memory-request}}".lower() + if memory_request.endswith('gi') or memory_request.endswith('gb'): + memory = float(memory_request[:-2]) + else: + raise ValueError("memory request must be in Gi or Gb") + if cpus <= {{inputs.parameters.cpu-cutoff}}: + if memory <= {{inputs.parameters.memory-cutoff}}: + node_pool = 'climate-sim-pool' + else: + node_pool = 'highmem-sim-pool' + else: + node_pool = 'ultra-sim-pool' + print(node_pool) + - name: append-segment + inputs: + parameters: + - name: cpu + - name: memory + - name: runURL + - name: node-pool + tolerations: + - key: "dedicated" + operator: "Equal" + value: "{{inputs.parameters.node-pool}}" + effect: 
"NoSchedule" + metadata: + labels: + app: fv3run + podSpecPatch: | + containers: + - name: main + resources: + limits: + cpu: "{{inputs.parameters.cpu}}" + memory: "{{inputs.parameters.memory}}" + requests: + cpu: "{{inputs.parameters.cpu}}" + memory: "{{inputs.parameters.memory}}" + container: + image: us.gcr.io/vcm-ml/prognostic_run_shield + command: [runfv3] + args: ["append", "{{inputs.parameters.runURL}}"] + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-credentials/key.json + - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE + value: /secret/gcp-credentials/key.json + - name: FSSPEC_GS_REQUESTER_PAYS + value: vcm-ml + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: gcp-key-secret + mountPath: /secret/gcp-credentials + readOnly: true + - name: dshm + mountPath: /dev/shm + - name: increment-count + inputs: + parameters: + - name: count + script: + image: python:alpine3.6 + command: [python] + source: | + print({{inputs.parameters.count}} + 1) From 163add7ed3546f6c2013028d7cc442db78d251a0 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 4 Dec 2023 14:49:57 +0000 Subject: [PATCH 2/9] Reference identical workflow steps where possible --- workflows/argo/run-shield.yaml | 47 +++++----------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/workflows/argo/run-shield.yaml b/workflows/argo/run-shield.yaml index 3de3ffa588..856db59d65 100644 --- a/workflows/argo/run-shield.yaml +++ b/workflows/argo/run-shield.yaml @@ -24,7 +24,9 @@ spec: - name: fv3config from: "{{inputs.artifacts.fv3config}}" - name: choose-node-pool - template: choose-node-pool + templateRef: + name: run-fv3gfs + template: choose-node-pool arguments: parameters: - {name: cpu-request, value: "{{inputs.parameters.cpu}}"} @@ -62,7 +64,9 @@ spec: - {name: memory, value: "{{inputs.parameters.memory}}"} - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - name: increment-segment - template: increment-count + templateRef: + name: run-fv3gfs + template: increment-count arguments: parameters: - {name: count, value: "{{inputs.parameters.segment}}"} @@ -110,36 +114,6 @@ spec: - name: gcp-key-secret mountPath: /secret/gcp-credentials readOnly: true - - name: choose-node-pool - inputs: - parameters: - - name: cpu-request - - name: cpu-cutoff - - name: memory-request - - name: memory-cutoff - - script: - image: python:alpine3.6 - command: [python] - source: | - cpu_request = "{{inputs.parameters.cpu-request}}" - if cpu_request.endswith('m'): - cpus = float(cpu_request[:-1])/1000.0 - else: - cpus = float(cpu_request) - memory_request = "{{inputs.parameters.memory-request}}".lower() - if memory_request.endswith('gi') or memory_request.endswith('gb'): - memory = float(memory_request[:-2]) - else: - raise ValueError("memory request must be in Gi or Gb") - if cpus <= {{inputs.parameters.cpu-cutoff}}: - if memory <= {{inputs.parameters.memory-cutoff}}: - node_pool = 'climate-sim-pool' - else: - node_pool = 'highmem-sim-pool' - else: - node_pool = 'ultra-sim-pool' - print(node_pool) - name: append-segment inputs: parameters: @@ -184,12 +158,3 @@ spec: readOnly: true - name: dshm mountPath: /dev/shm - - name: increment-count - inputs: - parameters: - - name: count - script: - image: python:alpine3.6 - command: [python] - source: | - print({{inputs.parameters.count}} + 1) From e504e9e03fad79a7330da5f6e2d49aeecb7d090b Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 4 Dec 2023 21:00:24 +0000 Subject: [PATCH 3/9] Leverage 
YAML anchors and aliases to reduce repetition --- workflows/argo/kustomization.yaml | 3 +- workflows/argo/prognostic-run.yaml | 93 ++++------ workflows/argo/restart-prognostic-run.yaml | 38 ++++- workflows/argo/run-shield.yaml | 160 ------------------ .../{run-fv3gfs.yaml => run-simulation.yaml} | 69 ++++++-- 5 files changed, 127 insertions(+), 236 deletions(-) delete mode 100644 workflows/argo/run-shield.yaml rename workflows/argo/{run-fv3gfs.yaml => run-simulation.yaml} (76%) diff --git a/workflows/argo/kustomization.yaml b/workflows/argo/kustomization.yaml index 6438c06934..40c51e0083 100644 --- a/workflows/argo/kustomization.yaml +++ b/workflows/argo/kustomization.yaml @@ -5,8 +5,7 @@ resources: - training.yaml - training-gpu.yaml - training-torch.yaml -- run-fv3gfs.yaml -- run-shield.yaml +- run-simulation.yaml - offline-diags.yaml - train-diags-prog.yaml - cubed-to-latlon.yaml diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml index 47de3484d9..8fe0dadc41 100644 --- a/workflows/argo/prognostic-run.yaml +++ b/workflows/argo/prognostic-run.yaml @@ -38,63 +38,57 @@ spec: - {name: bucket, value: "{{inputs.parameters.bucket}}"} - {name: project, value: "{{inputs.parameters.project}}"} - {name: tag, value: "{{inputs.parameters.tag}}"} - - - name: prepare-input-config - template: prepare-input-config + - - name: convert-input-config-to-artifact + template: convert-input-config-to-artifact arguments: parameters: - {name: config, value: "{{inputs.parameters.config}}"} - - - template: get-wrapper - name: get-wrapper + - - template: infer-wrapper + name: infer-wrapper arguments: artifacts: - name: config - from: "{{steps.prepare-input-config.outputs.artifacts.config}}" + from: "{{steps.convert-input-config-to-artifact.outputs.artifacts.config}}" parameters: - {name: config, value: "{{inputs.parameters.config}}"} - - - name: prepare-config-fv3gfs + - - &prepare-config-step-fv3gfs + name: prepare-config-fv3gfs template: prepare-config-fv3gfs - when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" + when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" arguments: artifacts: - name: config - from: "{{steps.prepare-input-config.outputs.artifacts.config}}" - - - name: prepare-config-shield + from: "{{steps.convert-input-config-to-artifact.outputs.artifacts.config}}" + - - <<: *prepare-config-step-fv3gfs + name: prepare-config-shield template: prepare-config-shield - when: "'{{steps.get-wrapper.outputs.result}}' == 'shield.wrapper'" - arguments: - artifacts: - - name: config - from: "{{steps.prepare-input-config.outputs.artifacts.config}}" - - - name: run-model-fv3gfs + when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" + - - &run-model-step-fv3gfs + name: run-model-fv3gfs continueOn: failed: true - when: "'{{steps.get-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" + when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: name: run-fv3gfs template: run-fv3gfs arguments: artifacts: - {name: fv3config, from: "{{steps.prepare-config-fv3gfs.outputs.artifacts.fv3config}}"} - parameters: + parameters: &run-model-step-parameters-fv3gfs - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - {name: cpu, value: "{{inputs.parameters.cpu}}"} - {name: memory, value: "{{inputs.parameters.memory}}"} - - - name: run-model-shield - continueOn: - failed: true - when: "'{{steps.get-wrapper.outputs.result}}' == 
'shield.wrapper'" + - - <<: *run-model-step-fv3gfs + name: run-model-shield + when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" templateRef: name: run-shield template: run-shield arguments: artifacts: - {name: fv3config, from: "{{steps.prepare-config-shield.outputs.artifacts.fv3config}}"} - parameters: - - {name: output-url, value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run"} - - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} + parameters: *run-model-step-parameters-fv3gfs - - name: online-diags when: "{{inputs.parameters.online-diags}} == true" templateRef: @@ -119,12 +113,12 @@ spec: value: "{{steps.resolve-output-url.outputs.result}}/fv3gfs_run_diagnostics" - - name: exit template: exit - when: "{{steps.run-model-fv3gfs.status}} == Failed || {{steps.run-model-shield.status}} == Failed}" + when: "{{steps.run-model-fv3gfs.status}} == Failed || {{steps.run-model-shield.status}} == Failed" - name: exit container: image: ubuntu:20.04 command: ["exit", "1"] - - name: prepare-input-config + - name: convert-input-config-to-artifact inputs: parameters: - name: config @@ -142,7 +136,8 @@ spec: cat << EOF > /tmp/config.yaml {{inputs.parameters.config}} EOF - - name: prepare-config-fv3gfs + - &prepare-config-fv3gfs + name: prepare-config-fv3gfs inputs: artifacts: - name: config @@ -150,7 +145,7 @@ spec: outputs: artifacts: - {name: fv3config, path: /tmp/fv3config.yaml} - container: + container: &prepare-config-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run resources: requests: @@ -166,36 +161,20 @@ spec: mountPath: /tmp args: - prepare-config /tmp/config.yaml > /tmp/fv3config.yaml - - name: prepare-config-shield - inputs: - artifacts: - - name: config - path: /tmp/config.yaml - outputs: - artifacts: - - {name: fv3config, path: /tmp/fv3config.yaml} + - <<: *prepare-config-fv3gfs + name: prepare-config-shield container: + <<: *prepare-config-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run_shield - resources: - requests: - memory: "500Mi" - cpu: "700m" - limits: - memory: "500Mi" - cpu: "700m" - command: ["bash", "-c", "-x", "-e"] - workingDir: /fv3net/workflows/prognostic_c48_run - volumeMounts: - - name: workdir - mountPath: /tmp - args: + env: # Without setting the DGLBACKEND some additional text gets added to - # the prepared config (it's a warning that the default backend was set - # to pytorch). I do not fully understand why this happens in this - # image but not the FV3GFS prognostic run image. Should I just set - # this in the Dockerfile itself? - - DGLBACKEND=pytorch prepare-config /tmp/config.yaml > /tmp/fv3config.yaml - - name: get-wrapper + # the prepared config (a warning that the default backend was set to + # pytorch). I do not fully understand why this happens in this image + # but not the FV3GFS prognostic run image. Should I just set this in + # the Dockerfile itself? 
+ - name: DGLBACKEND + value: pytorch + - name: infer-wrapper inputs: artifacts: - name: config diff --git a/workflows/argo/restart-prognostic-run.yaml b/workflows/argo/restart-prognostic-run.yaml index b9fcd9a140..1c8587b44a 100644 --- a/workflows/argo/restart-prognostic-run.yaml +++ b/workflows/argo/restart-prognostic-run.yaml @@ -26,7 +26,7 @@ spec: steps: - - name: choose-node-pool templateRef: - name: run-fv3gfs + name: run-simulation template: choose-node-pool arguments: parameters: @@ -34,9 +34,16 @@ spec: - {name: cpu-cutoff, value: "24"} - {name: memory-request, value: "{{inputs.parameters.memory}}"} - {name: memory-cutoff, value: "30"} - - - name: restart-run + - - name: infer-wrapper + template: infer-wrapper + arguments: + parameters: + - {name: url, value: "{{inputs.parameters.url}}"} + - - &restart-run-fv3gfs + name: restart-run-fv3gfs + when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: - name: run-fv3gfs + name: run-simulation template: run-all-segments arguments: parameters: @@ -46,3 +53,28 @@ spec: - {name: memory, value: "{{inputs.parameters.memory}}"} - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} - {name: segment, value: 0} + - - <<: *restart-run-fv3gfs + name: restart-run-shield + when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" + templateRef: + name: run-simulation + template: run-all-segments-shield + - name: infer-wrapper + inputs: + parameters: + - name: url + script: + image: + us.gcr.io/vcm-ml/fv3net + command: [python] + source: | + import os + import fsspec + import yaml + + config_path = os.path.join("{{inputs.parameters.url}}", "fv3config.yml") + fs, *_ = fsspec.get_fs_token_paths(config_path) + with fs.open(config_path, "r") as file: + config = yaml.safe_load(file) + + print(config.get("wrapper", "fv3gfs.wrapper")) diff --git a/workflows/argo/run-shield.yaml b/workflows/argo/run-shield.yaml deleted file mode 100644 index 856db59d65..0000000000 --- a/workflows/argo/run-shield.yaml +++ /dev/null @@ -1,160 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: WorkflowTemplate -metadata: - name: run-shield -spec: - entrypoint: run-shield - templates: - - name: run-shield - inputs: - artifacts: - - name: fv3config - parameters: - - name: output-url - - {name: cpu, value: "6"} - - {name: memory, value: 8Gi} - - {name: segment-count, value: "1"} - steps: - - - template: create-run - name: create-run - arguments: - parameters: - - {name: runURL, value: "{{inputs.parameters.output-url}}"} - artifacts: - - name: fv3config - from: "{{inputs.artifacts.fv3config}}" - - name: choose-node-pool - templateRef: - name: run-fv3gfs - template: choose-node-pool - arguments: - parameters: - - {name: cpu-request, value: "{{inputs.parameters.cpu}}"} - - {name: cpu-cutoff, value: "24"} - - {name: memory-request, value: "{{inputs.parameters.memory}}"} - - {name: memory-cutoff, value: "30"} - # loop over segments implemented through recursion so that a failed segment will - # terminate the workflow. Argo loops by default run in parallel and do not fail fast. 
- - - name: run-first-segment - template: run-all-segments - arguments: - parameters: - - {name: output-url, value: "{{inputs.parameters.output-url}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} - - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} - - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - - {name: segment, value: 0} - - name: run-all-segments - inputs: - parameters: - - name: output-url - - name: cpu - - name: memory - - name: node-pool - - name: segment-count - - name: segment - steps: - - - template: append-segment - name: append-segment - arguments: - parameters: - - {name: runURL, value: "{{inputs.parameters.output-url}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} - - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - - name: increment-segment - templateRef: - name: run-fv3gfs - template: increment-count - arguments: - parameters: - - {name: count, value: "{{inputs.parameters.segment}}"} - - - name: run-next-segment - template: run-all-segments - when: "{{steps.increment-segment.outputs.result}} < {{inputs.parameters.segment-count}}" - arguments: - parameters: - - {name: output-url, value: "{{inputs.parameters.output-url}}"} - - {name: cpu, value: "{{inputs.parameters.cpu}}"} - - {name: memory, value: "{{inputs.parameters.memory}}"} - - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - - {name: segment, value: "{{steps.increment-segment.outputs.result}}"} - - name: create-run - inputs: - artifacts: - - name: fv3config - path: /mnt/data/fv3config.yaml - parameters: - - name: runURL - container: - image: us.gcr.io/vcm-ml/prognostic_run_shield - command: ["/bin/bash", "-c", "-x", "-e"] - resources: - limits: - memory: "500Mi" - cpu: "500m" - args: - - | - find /mnt/data - echo "Using fv3config:" - cat /mnt/data/fv3config.yaml - runfv3 create {{inputs.parameters.runURL}} /mnt/data/fv3config.yaml - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /secret/gcp-credentials/key.json - - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE - value: /secret/gcp-credentials/key.json - - name: FSSPEC_GS_REQUESTER_PAYS - value: vcm-ml - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - name: gcp-key-secret - mountPath: /secret/gcp-credentials - readOnly: true - - name: append-segment - inputs: - parameters: - - name: cpu - - name: memory - - name: runURL - - name: node-pool - tolerations: - - key: "dedicated" - operator: "Equal" - value: "{{inputs.parameters.node-pool}}" - effect: "NoSchedule" - metadata: - labels: - app: fv3run - podSpecPatch: | - containers: - - name: main - resources: - limits: - cpu: "{{inputs.parameters.cpu}}" - memory: "{{inputs.parameters.memory}}" - requests: - cpu: "{{inputs.parameters.cpu}}" - memory: "{{inputs.parameters.memory}}" - container: - image: us.gcr.io/vcm-ml/prognostic_run_shield - command: [runfv3] - args: ["append", "{{inputs.parameters.runURL}}"] - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /secret/gcp-credentials/key.json - - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE - value: /secret/gcp-credentials/key.json - - name: FSSPEC_GS_REQUESTER_PAYS - value: vcm-ml - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - name: gcp-key-secret - mountPath: /secret/gcp-credentials - 
readOnly: true - - name: dshm - mountPath: /dev/shm diff --git a/workflows/argo/run-fv3gfs.yaml b/workflows/argo/run-simulation.yaml similarity index 76% rename from workflows/argo/run-fv3gfs.yaml rename to workflows/argo/run-simulation.yaml index 5a8032cf95..b56298ca45 100644 --- a/workflows/argo/run-fv3gfs.yaml +++ b/workflows/argo/run-simulation.yaml @@ -1,11 +1,11 @@ apiVersion: argoproj.io/v1alpha1 kind: WorkflowTemplate metadata: - name: run-fv3gfs + name: run-simulation spec: - entrypoint: run-fv3gfs templates: - - name: run-fv3gfs + - &run-fv3gfs + name: run-fv3gfs inputs: artifacts: - name: fv3config @@ -15,7 +15,8 @@ spec: - {name: memory, value: 8Gi} - {name: segment-count, value: "1"} steps: - - - template: create-run + - - &create-run-step-fv3gfs + template: create-run name: create-run arguments: parameters: @@ -23,7 +24,8 @@ spec: artifacts: - name: fv3config from: "{{inputs.artifacts.fv3config}}" - - name: choose-node-pool + - &choose-node-pool-step-fv3gfs + name: choose-node-pool template: choose-node-pool arguments: parameters: @@ -33,7 +35,8 @@ spec: - {name: memory-cutoff, value: "30"} # loop over segments implemented through recursion so that a failed segment will # terminate the workflow. Argo loops by default run in parallel and do not fail fast. - - - name: run-first-segment + - - &run-first-segment-step-fv3gfs + name: run-first-segment template: run-all-segments arguments: parameters: @@ -43,7 +46,8 @@ spec: - {name: node-pool, value: "{{steps.choose-node-pool.outputs.result}}"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - {name: segment, value: 0} - - name: run-all-segments + - &run-all-segments-fv3gfs + name: run-all-segments inputs: parameters: - name: output-url @@ -53,7 +57,8 @@ spec: - name: segment-count - name: segment steps: - - - template: append-segment + - - &append-segment-step-fv3gfs + template: append-segment name: append-segment arguments: parameters: @@ -61,12 +66,14 @@ spec: - {name: cpu, value: "{{inputs.parameters.cpu}}"} - {name: memory, value: "{{inputs.parameters.memory}}"} - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - - - name: increment-segment + - - &increment-segment-step-fv3gfs + name: increment-segment template: increment-count arguments: parameters: - {name: count, value: "{{inputs.parameters.segment}}"} - - - name: run-next-segment + - - &run-next-segment-step-fv3gfs + name: run-next-segment template: run-all-segments when: "{{steps.increment-segment.outputs.result}} < {{inputs.parameters.segment-count}}" arguments: @@ -77,14 +84,15 @@ spec: - {name: node-pool, value: "{{inputs.parameters.node-pool}}"} - {name: segment-count, value: "{{inputs.parameters.segment-count}}"} - {name: segment, value: "{{steps.increment-segment.outputs.result}}"} - - name: create-run + - &create-run-fv3gfs + name: create-run inputs: artifacts: - name: fv3config path: /mnt/data/fv3config.yaml parameters: - name: runURL - container: + container: &create-run-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run command: ["/bin/bash", "-c", "-x", "-e"] resources: @@ -142,7 +150,8 @@ spec: else: node_pool = 'ultra-sim-pool' print(node_pool) - - name: append-segment + - &append-segment-fv3gfs + name: append-segment inputs: parameters: - name: cpu @@ -167,7 +176,7 @@ spec: requests: cpu: "{{inputs.parameters.cpu}}" memory: "{{inputs.parameters.memory}}" - container: + container: &append-segment-container-fv3gfs image: us.gcr.io/vcm-ml/prognostic_run command: [runfv3] args: ["append", "{{inputs.parameters.runURL}}"] @@ 
-195,3 +204,35 @@ spec: command: [python] source: | print({{inputs.parameters.count}} + 1) + + # Template for running SHiELD instead of FV3GFS; note we have been careful to + # rename all the SHiELD-wrapper-dependent templates such that there is no risk + # of using a template meant for running FV3GFS with SHiELD or vice-versa. The + # choose-node-pool and increment-segment steps are identical between the two + # workflows, so we do not bother to do any renaming of those. + - <<: *run-fv3gfs + name: run-shield + steps: + - - <<: *create-run-step-fv3gfs + template: create-run-shield + - *choose-node-pool-step-fv3gfs + - - <<: *run-first-segment-step-fv3gfs + template: run-all-segments-shield + - <<: *run-all-segments-fv3gfs + name: run-all-segments-shield + steps: + - - <<: *append-segment-step-fv3gfs + template: append-segment-shield + - - *increment-segment-step-fv3gfs + - - <<: *run-next-segment-step-fv3gfs + template: run-all-segments-shield + - <<: *create-run-fv3gfs + name: create-run-shield + container: + <<: *create-run-container-fv3gfs + image: us.gcr.io/vcm-ml/prognostic_run_shield + - <<: *append-segment-fv3gfs + name: append-segment-shield + container: + <<: *append-segment-container-fv3gfs + image: us.gcr.io/vcm-ml/prognostic_run_shield From fac244f13dbf9a7c781b68e051fb70cc9b9d588b Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Fri, 8 Dec 2023 13:41:48 +0000 Subject: [PATCH 4/9] Update forcing in SHiELD base config This references forcing data in the vcm-fv3config bucket, and makes parameters controlling whether we use data from initial conditions or the climatology consistent with our v0.7 FV3GFS base config. --- .../base_yamls/SHiELD/v0.1/fv3config.yml | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml index ce48b327fc..c49d5af9f3 100644 --- a/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml +++ b/external/fv3kube/fv3kube/base_yamls/SHiELD/v0.1/fv3config.yml @@ -2,7 +2,7 @@ data_table: default diag_table: no_output experiment_name: default field_table: gs://vcm-fv3config/config/field_table/TKE-EDMF/v1.1/field_table -forcing: gs://vcm-ml-experiments/spencerc/2023-09-13-SHiELD-forcing-data +forcing: gs://vcm-fv3config/data/base_forcing/SHiELD/v1.0/C48 initial_conditions: '' # no default provided namelist: fms_affinity_nml: @@ -230,14 +230,15 @@ namelist: interpolator_nml: interp_method: conserve_great_circle namsfc: - fabsl: 99999 - faisl: 99999 - faiss: 99999 - fnabsc: grb/global_mxsnoalb.uariz.t1534.3072.1536.rg.grb + fabsl: 0 # Use maximum snow albedo from forcing files instead of initial conditions + fabss: 0 # Use maximum snow albedo from forcing files instead of initial conditions + faisl: 0 # Use land / sea / sea-ice mask from forcing files instead of initial conditions + faiss: 0 # Use land / sea / sea-ice mask from forcing files instead of initial conditions + fnabsc: INPUT/fix_sfc/maximum_snow_albedo.tileX.nc fnacna: '' fnaisc: grb/CFSR.SEAICE.1982.2012.monthly.clim.grb - fnalbc: grb/global_snowfree_albedo.bosu.t1534.3072.1536.rg.grb - fnalbc2: grb/global_albedo4.1x1.grb + fnalbc: INPUT/fix_sfc/snowfree_albedo.tileX.nc + fnalbc2: INPUT/fix_sfc/facsf.tileX.nc fnglac: grb/global_glacier.2x2.grb fnmskh: grb/seaice_newland.grb fnmxic: grb/global_maxice.2x2.grb @@ -245,30 +246,32 @@ namelist: fnsmcc: grb/global_soilmgldas.t1534.3072.1536.grb fnsnoa: '' fnsnoc: grb/global_snoclim.1.875.grb - 
fnsotc: grb/global_soiltype.statsgo.t1534.3072.1536.rg.grb - fntg3c: grb/global_tg3clim.2.6x1.5.grb + fnsotc: INPUT/fix_sfc/soil_type.tileX.nc + fntg3c: INPUT/fix_sfc/substrate_temperature.tileX.nc fntsfa: '' fntsfc: grb/RTGSST.1982.2012.monthly.clim.grb - fnvegc: grb/global_vegfrac.0.144.decpercent.grb - fnvetc: grb/global_vegtype.igbp.t1534.3072.1536.rg.grb - fnvmnc: grb/global_shdmin.0.144x0.144.grb - fnvmxc: grb/global_shdmax.0.144x0.144.grb + fnvegc: INPUT/fix_sfc/vegetation_greenness.tileX.nc + fnvetc: INPUT/fix_sfc/vegetation_type.tileX.nc + fnvmnc: INPUT/fix_sfc/vegetation_greenness.tileX.nc + fnvmxc: INPUT/fix_sfc/vegetation_greenness.tileX.nc fnzorc: igbp - fsicl: 99999 - fsics: 99999 - fslpl: 99999 + fsicl: 0 # Use sea ice fraction from forcing files instead of persisting the fraction in the initial condition + fsics: 0 # Use sea ice fraction from forcing files instead of persisting the fraction in the initial condition + fslpl: 99999 # Use slope type from initial condition fsmcl: + - 99999 # Use soil moisture from initial condition - 99999 - 99999 - - 99999 - fsnol: 99999 - fsnos: 99999 + fsnol: 99999 # Use snow cover fraction from initial condition + fsnos: 99999 # Use snow cover fraction from initial condition fsotl: 99999 ftsfl: 99999 - ftsfs: 90 + ftsfs: 0 # Use only climatological SSTs, no relaxation from initial conditions fvetl: 99999 - fvmnl: 99999 - fvmxl: 99999 + fvmnl: 0 # Use minimum green vegetation fraction from forcing files instead of initial condition + fvmns: 0 # Use minimum green vegetation fraction from forcing files instead of initial condition + fvmxl: 0 # Use maximum green vegetation fraction from forcing files instead of initial condition + fvmxs: 0 # Use maximum green vegetation fraction from forcing files instead of initial condition ldebug: false ocean_nml: do_mld_restore: true From 942ab21633d976fd35325bdccc726b6118f5978a Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Wed, 13 Dec 2023 17:42:39 +0000 Subject: [PATCH 5/9] Push fix to name field in templateRef we made earlier --- workflows/argo/prognostic-run.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml index 8fe0dadc41..e7d32820b3 100644 --- a/workflows/argo/prognostic-run.yaml +++ b/workflows/argo/prognostic-run.yaml @@ -69,7 +69,7 @@ spec: failed: true when: "'{{steps.infer-wrapper.outputs.result}}' == 'fv3gfs.wrapper'" templateRef: - name: run-fv3gfs + name: run-simulation template: run-fv3gfs arguments: artifacts: @@ -83,7 +83,7 @@ spec: name: run-model-shield when: "'{{steps.infer-wrapper.outputs.result}}' == 'shield.wrapper'" templateRef: - name: run-shield + name: run-simulation template: run-shield arguments: artifacts: From 65ca5c839b28d8b85337d43b6da6274eb46dcfa3 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Wed, 13 Dec 2023 18:51:13 +0000 Subject: [PATCH 6/9] Bump SHiELD-wrapper to include Q-flux fix This bumps SHiELD-wrapper to include a Q-flux bug fix, and a couple other user experience improvements with the SOM. Note this fix has not been merged to SHiELD yet, so I will update this PR later once we can point to main branches of SHiELD-wrapper and SHiELD_physics. 
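An aside on the anchor-and-alias technique introduced in PATCH 3 (and touched up by the templateRef fix in PATCH 5 above): YAML aliases and `<<` merge keys are expanded by the YAML loader itself, so the Argo server still receives fully duplicated template definitions; only the source file gets shorter. A minimal PyYAML round-trip sketch, with image names copied from the diffs above, illustrates the expansion (PyYAML's safe_load honors YAML 1.1 merge keys):

    import yaml

    document = """
    templates:
    - &run-fv3gfs
      name: run-fv3gfs
      container: {image: us.gcr.io/vcm-ml/prognostic_run}
    - <<: *run-fv3gfs
      name: run-shield
      container: {image: us.gcr.io/vcm-ml/prognostic_run_shield}
    """

    templates = yaml.safe_load(document)["templates"]
    # The alias plus merge key expands into two independent mappings; keys
    # written alongside "<<" override the merged-in values.
    assert templates[1]["name"] == "run-shield"
    assert templates[1]["container"]["image"] == "us.gcr.io/vcm-ml/prognostic_run_shield"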
---
 external/SHiELD-wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SHiELD-wrapper b/external/SHiELD-wrapper
index 48c83c96c2..6a4e165059 160000
--- a/external/SHiELD-wrapper
+++ b/external/SHiELD-wrapper
@@ -1 +1 @@
-Subproject commit 48c83c96c274ad3631aaa9d54a17770c4d540dc4
+Subproject commit 6a4e165059d3067f39c45e002f65cacbde0f761d

From d6fd2753d3d2839e2a95e3607e0a865bbd4d41fb Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Wed, 20 Dec 2023 21:35:42 +0000
Subject: [PATCH 7/9] Point SHiELD-wrapper to a commit on main

---
 external/SHiELD-wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SHiELD-wrapper b/external/SHiELD-wrapper
index 6a4e165059..f82a7130f7 160000
--- a/external/SHiELD-wrapper
+++ b/external/SHiELD-wrapper
@@ -1 +1 @@
-Subproject commit 6a4e165059d3067f39c45e002f65cacbde0f761d
+Subproject commit f82a7130f7f2bbc3a7bc26860517694f13e9ef1b

From 88f91a074154d12a4cdd3289ec9391ccf0e83a4e Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Thu, 21 Dec 2023 19:45:40 +0000
Subject: [PATCH 8/9] Set DGLBACKEND=pytorch in prognostic_run_shield image

---
 docker/prognostic_run_shield/Dockerfile | 4 ++++
 workflows/argo/prognostic-run.yaml      | 8 --------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/docker/prognostic_run_shield/Dockerfile b/docker/prognostic_run_shield/Dockerfile
index bd0b965d71..fc325ef459 100644
--- a/docker/prognostic_run_shield/Dockerfile
+++ b/docker/prognostic_run_shield/Dockerfile
@@ -127,5 +127,9 @@ ENV OUTPUT_FREQ_SEC=18000
 # Add fv3net packages to the PYTHONPATH
 ENV PYTHONPATH=${FV3NET_DIR}/workflows/prognostic_c48_run:${FV3NET_DIR}/external/fv3fit:${FV3NET_DIR}/external/emulation:${FV3NET_DIR}/external/vcm:${FV3NET_DIR}/external/artifacts:${FV3NET_DIR}/external/loaders:${FV3NET_DIR}/external/fv3kube:${FV3NET_DIR}/workflows/post_process_run:${FV3NET_DIR}/external/radiation:${PYTHONPATH}
 
+# Set DGLBACKEND to pytorch to silence warnings that it is unset; DGL defaults
+# to pytorch when unset, so this just makes the default explicit.
+ENV DGLBACKEND=pytorch
+
 WORKDIR ${FV3NET_DIR}/workflows/prognostic_c48_run
 CMD ["bash"]

diff --git a/workflows/argo/prognostic-run.yaml b/workflows/argo/prognostic-run.yaml
index e7d32820b3..64ca4986d5 100644
--- a/workflows/argo/prognostic-run.yaml
+++ b/workflows/argo/prognostic-run.yaml
@@ -166,14 +166,6 @@ spec:
     container:
       <<: *prepare-config-container-fv3gfs
       image: us.gcr.io/vcm-ml/prognostic_run_shield
-      env:
-        # Without setting the DGLBACKEND some additional text gets added to
-        # the prepared config (a warning that the default backend was set to
-        # pytorch). I do not fully understand why this happens in this image
-        # but not the FV3GFS prognostic run image. Should I just set this in
-        # the Dockerfile itself?
-        - name: DGLBACKEND
-          value: pytorch
   - name: infer-wrapper
     inputs:
       artifacts:

From 2148134e87161f778084a2bfdce517f2ffe57cb9 Mon Sep 17 00:00:00 2001
From: Spencer Clark
Date: Thu, 21 Dec 2023 21:53:17 +0000
Subject: [PATCH 9/9] Bump SHiELD-wrapper for Q-flux bug fix and diagnostic
 updates

---
 external/SHiELD-wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/SHiELD-wrapper b/external/SHiELD-wrapper
index f82a7130f7..1e5939d604 160000
--- a/external/SHiELD-wrapper
+++ b/external/SHiELD-wrapper
@@ -1 +1 @@
-Subproject commit f82a7130f7f2bbc3a7bc26860517694f13e9ef1b
+Subproject commit 1e5939d604c6958f6f1f10452fea1570b359e494
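Taken together, after this series a run opts into SHiELD entirely through its configuration: the `wrapper` key steers the prognostic-run workflow to the SHiELD templates, and `base_version` can select the new SHiELD base fv3config. A sketch of the convention, assuming `base_version` values are keyed exactly as in the BASE_FV3CONFIG_BY_VERSION mapping from PATCH 1 and using a placeholder GCS path for the initial conditions:

    import yaml

    # Hypothetical user config; only `wrapper` and `base_version` matter here.
    user_config = yaml.safe_load("""
    base_version: SHiELD/v0.1
    wrapper: shield.wrapper
    initial_conditions: gs://some-bucket/path/to/initial/conditions
    """)

    # The same convention the infer-wrapper scripts in prognostic-run.yaml and
    # restart-prognostic-run.yaml apply: FV3GFS remains the default whenever
    # the `wrapper` key is absent.
    wrapper = user_config.get("wrapper", "fv3gfs.wrapper")
    assert wrapper == "shield.wrapper"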