Add reduce min/max along with tests. Also optimize i16 abs for sse2 (#138)
Lokathor · Sep 26, 2023 · 6270f8d · 6270f8d
1 parent 8a915b0
commit 6270f8d
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 8 deletions.
diff --git a/src/i16x16_.rs b/src/i16x16_.rs
@@ -390,8 +390,27 @@ impl i16x16 {
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i16 {
- let arr: [i16; 16] = cast(self);
- arr.iter().sum()
+ let arr: [i16x8; 2] = cast(self); // reinterpret the vector as two i16x8 halves
+
+ (arr[0] + arr[1]).reduce_add() // add the halves, then let i16x8::reduce_add finish the sum
+ }
+
+ /// Horizontal minimum: returns the smallest of the vector's sixteen `i16` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_min(self) -> i16 {
+ let arr: [i16x8; 2] = cast(self); // reinterpret the vector as two i16x8 halves
+
+ arr[0].min(arr[1]).reduce_min() // min of the two halves, then reduce the 8-lane result
+ }
+
+ /// Horizontal maximum: returns the largest of the vector's sixteen `i16` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_max(self) -> i16 {
+ let arr: [i16x8; 2] = cast(self); // reinterpret the vector as two i16x8 halves
+
+ arr[0].max(arr[1]).reduce_max() // max of the two halves, then reduce the 8-lane result
  }
 
  #[inline]

diff --git a/src/i16x8_.rs b/src/i16x8_.rs
@@ -655,14 +655,41 @@ impl i16x8 {
  #[must_use]
  pub fn reduce_add(self) -> i16 {
  let arr: [i16; 8] = cast(self);
- arr.iter().sum()
+
+ (arr[0].wrapping_add(arr[1]).wrapping_add(arr[2].wrapping_add(arr[3]))) // lanes 0..=3, wrapping on overflow
+ .wrapping_add(
+ arr[4].wrapping_add(arr[5]).wrapping_add(arr[6].wrapping_add(arr[7])), // lanes 4..=7, wrapping on overflow
+ )
+ }
+
+ /// Horizontal minimum: returns the smallest of the vector's eight `i16` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_min(self) -> i16 {
+ let arr: [i16; 8] = cast(self); // view the vector as eight scalar lanes
+
+ (arr[0].min(arr[1]).min(arr[2].min(arr[3]))) // smallest of lanes 0..=3
+ .min(arr[4].min(arr[5]).min(arr[6].min(arr[7]))) // combined with the smallest of lanes 4..=7
+ }
+
+ /// Horizontal maximum: returns the largest of the vector's eight `i16` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_max(self) -> i16 {
+ let arr: [i16; 8] = cast(self); // view the vector as eight scalar lanes
+
+ (arr[0].max(arr[1]).max(arr[2].max(arr[3]))) // largest of lanes 0..=3
+ .max(arr[4].max(arr[5]).max(arr[6].max(arr[7]))) // combined with the largest of lanes 4..=7
  }
 
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
  pick! {
- if #[cfg(target_feature="ssse3")] {
+ if #[cfg(target_feature="sse2")] {
+ let mask = shr_imm_i16_m128i::<15>(self.sse); // presumably an arithmetic shift: all-ones for negative lanes, zero otherwise — confirm in safe_arch
+ Self { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) } // (x + mask) ^ mask negates exactly the lanes whose mask is all-ones
+ } else if #[cfg(target_feature="ssse3")] { // NOTE(review): ssse3 builds normally enable sse2 too, so if pick! takes the first matching arm this branch is never selected — consider testing ssse3 first
  Self { sse: abs_i16_m128i(self.sse) }
  } else if #[cfg(target_feature="simd128")] {
  Self { simd: i16x8_abs(self.simd) }
@@ -692,7 +719,7 @@ impl i16x8 {
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
  pick! {
- if #[cfg(target_feature="sse4.1")] {
+ if #[cfg(target_feature="sse2")] {
  Self { sse: min_i16_m128i(self.sse, rhs.sse) }
  } else if #[cfg(target_feature="simd128")] {
  Self { simd: i16x8_min(self.simd, rhs.simd) }
@@ -703,6 +730,7 @@ impl i16x8 {
  }
  }
  }
+
  #[inline]
  #[must_use]
  pub fn saturating_add(self, rhs: Self) -> Self {

diff --git a/src/i32x4_.rs b/src/i32x4_.rs
@@ -443,8 +443,35 @@ impl i32x4 {
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i32 {
+ pick! {
+ if #[cfg(target_feature="sse2")] {
+ let hi64 = unpack_high_i64_m128i(self.sse, self.sse); // presumably moves the upper 64 bits (lanes 2 and 3) into the low half — confirm in safe_arch
+ let sum64 = add_i32_m128i(hi64, self.sse); // under that assumption the low two lanes hold lane0+lane2 and lane1+lane3
+ let hi32 = shuffle_ai_f32_all_m128i::<0b10_11_00_01>(sum64); // Swap the low two elements
+ let sum32 = add_i32_m128i(sum64, hi32); // lowest lane should now carry the sum of all four lanes
+ get_i32_from_m128i_s(sum32) // return the scalar taken from the register (presumably its lowest lane)
+ } else {
+ let arr: [i32; 4] = cast(self); // scalar fallback: view the vector as four lanes
+ arr[0].wrapping_add(arr[1]).wrapping_add( // pairwise sums, wrapping on overflow
+ arr[2].wrapping_add(arr[3]))
+ }
+ }
+ }
+
+ /// Horizontal maximum: returns the largest of the vector's four `i32` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_max(self) -> i32 {
+ let arr: [i32; 4] = cast(self); // view the vector as four scalar lanes
+ arr[0].max(arr[1]).max(arr[2].max(arr[3])) // max of each pair, then max of the two results
+ }
+
+ /// Horizontal minimum: returns the smallest of the vector's four `i32` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_min(self) -> i32 {
  let arr: [i32; 4] = cast(self);
- arr.iter().sum()
+ arr[0].min(arr[1]).min(arr[2].min(arr[3])) // min of each pair, then min of the two results
  }
 
  #[inline]

diff --git a/src/i32x8_.rs b/src/i32x8_.rs
@@ -335,8 +335,24 @@ impl i32x8 {
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i32 {
- let arr: [i32; 8] = cast(self);
- arr.iter().sum()
+ let arr: [i32x4; 2] = cast(self); // reinterpret the vector as two i32x4 halves
+ (arr[0] + arr[1]).reduce_add() // add the halves, then let i32x4::reduce_add finish the sum
+ }
+
+ /// Horizontal maximum: returns the largest of the vector's eight `i32` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_max(self) -> i32 {
+ let arr: [i32x4; 2] = cast(self); // reinterpret the vector as two i32x4 halves
+ arr[0].max(arr[1]).reduce_max() // max of the two halves, then reduce the 4-lane result
+ }
+
+ /// Horizontal minimum: returns the smallest of the vector's eight `i32` lanes.
+ #[inline]
+ #[must_use]
+ pub fn reduce_min(self) -> i32 {
+ let arr: [i32x4; 2] = cast(self); // reinterpret the vector as two i32x4 halves
+ arr[0].min(arr[1]).reduce_min() // min of the two halves, then reduce the 4-lane result
  }
 
  #[inline]

diff --git a/tests/all_tests/t_i16x16.rs b/tests/all_tests/t_i16x16.rs
@@ -647,3 +647,23 @@ fn impl_i16x16_reduce_add() {
  i16x16::from([1, 2, 3, 4, 5, 6, 7, 9, 10, 20, 30, 40, 50, 60, 70, 90]);
  assert_eq!(p.reduce_add(), 407);
 }
+
+#[test]
+fn impl_i16x16_reduce_min() {
+ for i in 0..16 { // probe every one of the 16 lanes, so both i16x8 halves are covered
+ let mut v = [i16::MAX; 16];
+ v[i] = i16::MIN; // only lane i holds the minimum
+ let p = i16x16::from(v);
+ assert_eq!(p.reduce_min(), i16::MIN);
+ }
+}
+
+#[test]
+fn impl_i16x16_reduce_max() {
+ for i in 0..16 { // probe every one of the 16 lanes, so both i16x8 halves are covered
+ let mut v = [i16::MIN; 16];
+ v[i] = i16::MAX; // only lane i holds the maximum
+ let p = i16x16::from(v);
+ assert_eq!(p.reduce_max(), i16::MAX); // the reduction must find the single MAX lane
+ }
+}
diff --git a/tests/all_tests/t_i16x8.rs b/tests/all_tests/t_i16x8.rs
@@ -316,3 +316,23 @@ fn impl_i16x8_reduce_add() {
  let p = i16x8::from([1, 2, 3, 4, 5, 6, 7, 9]);
  assert_eq!(p.reduce_add(), 37);
 }
+
+#[test]
+fn impl_i16x8_reduce_min() {
+ for i in 0..8 { // probe each of the 8 lanes in turn
+ let mut v = [i16::MAX; 8];
+ v[i] = i16::MIN; // only lane i holds the minimum
+ let p = i16x8::from(v);
+ assert_eq!(p.reduce_min(), i16::MIN);
+ }
+}
+
+#[test]
+fn impl_i16x8_reduce_max() {
+ for i in 0..8 { // probe each of the 8 lanes in turn
+ let mut v = [i16::MIN; 8];
+ v[i] = i16::MAX; // only lane i holds the maximum
+ let p = i16x8::from(v);
+ assert_eq!(p.reduce_max(), i16::MAX); // the reduction must find the single MAX lane
+ }
+}
diff --git a/tests/all_tests/t_i32x4.rs b/tests/all_tests/t_i32x4.rs
@@ -201,3 +201,23 @@ fn impl_i32x4_reduce_add() {
  let p = i32x4::from([10000000, 20000000, 30000000, -40000000]);
  assert_eq!(p.reduce_add(), 20000000);
 }
+
+#[test]
+fn impl_i32x4_reduce_min() {
+ for i in 0..4 { // probe each of the 4 lanes in turn
+ let mut v = [i32::MAX; 4];
+ v[i] = i32::MIN; // only lane i holds the minimum
+ let p = i32x4::from(v);
+ assert_eq!(p.reduce_min(), i32::MIN);
+ }
+}
+
+#[test]
+fn impl_i32x4_reduce_max() {
+ for i in 0..4 { // probe each of the 4 lanes in turn
+ let mut v = [i32::MIN; 4];
+ v[i] = i32::MAX; // only lane i holds the maximum
+ let p = i32x4::from(v);
+ assert_eq!(p.reduce_max(), i32::MAX);
+ }
+}
diff --git a/tests/all_tests/t_i32x8.rs b/tests/all_tests/t_i32x8.rs
@@ -271,3 +271,23 @@ fn impl_i32x8_reduce_add() {
  ]);
  assert_eq!(p.reduce_add(), 370000000);
 }
+
+#[test]
+fn impl_i32x8_reduce_min() {
+ for i in 0..8 { // probe each of the 8 lanes in turn
+ let mut v = [i32::MAX; 8];
+ v[i] = i32::MIN; // only lane i holds the minimum
+ let p = i32x8::from(v);
+ assert_eq!(p.reduce_min(), i32::MIN);
+ }
+}
+
+#[test]
+fn impl_i32x8_reduce_max() {
+ for i in 0..8 { // probe each of the 8 lanes in turn
+ let mut v = [i32::MIN; 8];
+ v[i] = i32::MAX; // only lane i holds the maximum
+ let p = i32x8::from(v);
+ assert_eq!(p.reduce_max(), i32::MAX);
+ }
+}