apply precommit

EleutherAI · Quentin-Anthony · Mar 2, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
commit 11a2e9b14cafc66d79db72c66bfdef2947d84699
@@ -11,7 +11,7 @@
  "num_kv_heads": 8,
  # per Mistral, Mistral-7B-v0.1 was pretrained with 8192 seqlen
  # and instruction tuned to 16384 seqlen, all with 4096 sliding window
- "seq_length": 8192, 
+ "seq_length": 8192,
  "sliding_window_width": 4096,
  "max_position_embeddings": 131072,
  "pos_emb": "rotary",

@@ -1058,7 +1058,7 @@ Text Generation arguments
 
 - **prompt_end**: str
 
- Default = 
+ Default =
 
 
  a single prompt's end. Defaults to newline
@@ -1100,7 +1100,7 @@ Text Generation arguments
 
 - **eval_results_prefix**: str
 
- Default = 
+ Default =
 
  prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json
 
@@ -1844,7 +1844,7 @@ Args for deepspeed config
 
  Default = None
 
- 
+
 
 
 
@@ -2144,4 +2144,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
  Default = None
 
  Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard.
-
@@ -595,12 +595,10 @@ def flash_attention(self, query_layer, key_layer, value_layer):
 
  # only pass in window_size kwarg to flash-attn
  # if we use Sliding Window Attention.
- # Flash attn defaults to (-1,-1), or 
+ # Flash attn defaults to (-1,-1), or
  # does not have this kwarg prior to v2.3.0
  extra_kwargs = (
- {
- "window_size": (self.sliding_window_width, -1)
- } 
+ {"window_size": (self.sliding_window_width, -1)}
  if self.sliding_window_width is not None
  else {}
  )