Don't pass window_size to flash-attn unless sliding window attention is enabled

EleutherAI · Quentin-Anthony · Mar 2, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
commit c094c8cca025b5cd8d570541a0e3531050b452c4
@@ -593,6 +593,17 @@ def flash_attention(self, query_layer, key_layer, value_layer):
  output_size[0], output_size[2], output_size[1], -1
  )
 
+ # Only pass the window_size kwarg to flash-attn when
+ # Sliding Window Attention is in use. Otherwise omit it:
+ # flash-attn defaults to (-1, -1), and versions prior to
+ # v2.3.0 do not accept this kwarg at all.
+ extra_kwargs = (
+ {
+ "window_size": (self.sliding_window_width, -1)
+ } 
+ if self.sliding_window_width is not None
+ else {}
+ )
  if not self.training:
  q_shape = query_layer.shape
  k_shape = key_layer.shape
@@ -613,9 +624,7 @@ def flash_attention(self, query_layer, key_layer, value_layer):
  max_seqlen_k,
  softmax_scale=None,
  causal=True,
- window_size=(self.sliding_window_width, -1)
- if self.sliding_window_width is not None
- else (-1, -1),
+ **extra_kwargs,
  )
  output = output.reshape(q_shape)
  else:
@@ -626,9 +635,7 @@ def flash_attention(self, query_layer, key_layer, value_layer):
  self.dropout_p if self.training else 0.0,
  softmax_scale=None,
  causal=True,
- window_size=(self.sliding_window_width, -1)
- if self.sliding_window_width is not None
- else (-1, -1),
+ **extra_kwargs,
  )
 
  matmul_result = output