convolution: implement support for 8 bit input

marph91 · Mar 23, 2021 · 04d10e6 · 04d10e6
1 parent 601f5b5
commit 04d10e6
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 61 deletions.
diff --git a/playground/04_custom_toplevel.py b/playground/04_custom_toplevel.py
@@ -80,11 +80,10 @@ def update(self, previous_layer_info):
 
  # thresholds
  input_channel_bitwidth = previous_layer_info["bitwidth"]
- bitwidth = (
- math.ceil(
- math.log2(input_channel * input_channel_bitwidth * kernel_size ** 2 + 1)
- )
- * output_channel
+ bitwidth = output_channel * (
+ input_channel_bitwidth
+ + math.ceil(math.log2(input_channel * kernel_size ** 2 + 1))
+ + 1
  )
  thresholds = "".join([str(randint(0, 1)) for _ in range(bitwidth)])
  self.constants["C_THRESHOLDS"] = Parameter(
@@ -319,7 +318,6 @@ def __init__(
  "height": image_height,
  }
 
- # TODO: input_channel
  self.input_data_signal = Parameter(
  f"slv_data_{self.previous_layer_info['name']}",
  f"std_logic_vector(8 - 1 downto 0)",
@@ -335,7 +333,6 @@ def __init__(
 library cnn_lib;
 library util;"""
 
- # TODO: input_channel
  self.entity = f"""
 entity bnn is
  generic (

diff --git a/sim/test_convolution.py b/sim/test_convolution.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from math import log2
 import pathlib
 from random import randint
 from typing import List
@@ -20,29 +21,45 @@
 
 @cocotb.test()
 async def run_test(dut):
+ input_channel = dut.C_INPUT_CHANNEL.value.integer
+ window_size = dut.C_KERNEL_SIZE.value.integer ** 2
+ input_length = input_channel * window_size
+
+ # input is unsigned 1 or 8 bit, output is signed
+ input_channel_bitwidth = dut.C_INPUT_CHANNEL_BITWIDTH.value.integer
+ output_bitwidth = (
+ input_channel_bitwidth + int(log2(window_size * input_channel)) + 1
+ )
+
  @dataclass
  class Testcase:
  input_activations: List[int]
  input_weights: List[int]
 
  @property
  def input_activations_int(self) -> int:
- return concatenate_integers(self.input_activations)
+ return concatenate_integers(
+ self.input_activations, bitwidth=input_channel_bitwidth
+ )
 
  @property
  def input_weights_int(self) -> int:
  return concatenate_integers(self.input_weights)
 
  @property
  def output_data(self) -> int:
- ones_count = 0
+ # 1 bit activations -> xnor, popcount
+ if input_channel_bitwidth == 1:
+ ones_count = 0
+ for act, weight in zip(self.input_activations, self.input_weights):
+ ones_count = ones_count + (not (act ^ weight))
+ return ones_count
+ # >1 bit activations -> multiplication
+ product = 0
  for act, weight in zip(self.input_activations, self.input_weights):
- ones_count = ones_count + (not (act ^ weight))
- return ones_count
+ product = product + act * (-1 if weight == 0 else 1)
+ return product
 
- input_channel = dut.C_INPUT_CHANNEL.value.integer
- window_size = dut.C_KERNEL_SIZE.value.integer ** 2
- input_length = input_channel * window_size
  cases = (
  Testcase([0] * input_length, [0] * input_length),
  Testcase([0] * input_length, [1] * input_length),
@@ -70,17 +87,21 @@ def output_data(self) -> int:
  while dut.osl_valid.value.integer == 0:
  dut.isl_valid <= 0
  await tick.wait()
- assert (
- dut.oslv_data.value.integer == case.output_data
- ), f"{dut.oslv_data.value.integer} /= {case.output_data}"
+
+ output_int = from_fixedint(
+ dut.oslv_data.value.integer, output_bitwidth, is_unsigned=False
+ )
+ assert output_int == case.output_data, f"{output_int} /= {case.output_data}"
 
 
-@pytest.mark.parametrize("kernel_size", range(2, 7))
+@pytest.mark.parametrize("kernel_size", (1, 2, 3, 5, 7))
 @pytest.mark.parametrize("input_channel", (1, 4, 9))
-def test_convolution(kernel_size, input_channel):
+@pytest.mark.parametrize("input_channel_bitwidth", (1, 8))
+def test_convolution(kernel_size, input_channel, input_channel_bitwidth):
  generics = {
  "C_KERNEL_SIZE": kernel_size,
  "C_INPUT_CHANNEL": input_channel,
+ "C_INPUT_CHANNEL_BITWIDTH": input_channel_bitwidth,
  }
  run(
  vhdl_sources=get_files(

diff --git a/sim/test_window_convolution_activation.py b/sim/test_window_convolution_activation.py
@@ -127,7 +127,9 @@ def get_threshold(self):
 
  return concatenate_integers(
  self.replace_minus(threshold),
- bitwidth=math.ceil(math.log2(kernel_size[0] ** 2 * image_shape[2] + 1)),
+ bitwidth=bitwidth
+ + math.ceil(math.log2(kernel_size[0] ** 2 * image_shape[2] + 1))
+ + 1,
  )
 
  cases = (
@@ -214,6 +216,7 @@ def get_threshold(self):
 
 
 # Don't run the full test matrix. Only the most common configs.
+# TODO: Add test for 8 bit input.
 @pytest.mark.parametrize(
  "kernel_size,stride,input_channel,output_channel",
  [

diff --git a/src/convolution.vhd b/src/convolution.vhd
@@ -5,6 +5,7 @@ library ieee;
  use ieee.math_real.all;
 
 library util;
+ use util.array_pkg.all;
  use util.math_pkg.all;
 
 entity convolution is
@@ -16,52 +17,42 @@ entity convolution is
  port (
  isl_clk : in std_logic;
  isl_valid : in std_logic;
- islv_data : in std_logic_vector(C_KERNEL_SIZE * C_KERNEL_SIZE * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH - 1 downto 0);
- islv_weights : in std_logic_vector(C_KERNEL_SIZE * C_KERNEL_SIZE * C_INPUT_CHANNEL - 1 downto 0);
- oslv_data : out std_logic_vector(log2(C_KERNEL_SIZE * C_KERNEL_SIZE * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH + 1) - 1 downto 0);
+ islv_data : in std_logic_vector(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH - 1 downto 0);
+ islv_weights : in std_logic_vector(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL - 1 downto 0);
+ oslv_data : out std_logic_vector(C_INPUT_CHANNEL_BITWIDTH + log2(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL + 1) downto 0);
  osl_valid : out std_logic
  );
 end entity convolution;
 
 architecture behavioral of convolution is
 
- constant C_PARALLEL_POPCOUNT : integer := 4;
- constant C_SPLIT : integer := integer(ceil(real(islv_data'length) / real(C_PARALLEL_POPCOUNT)));
- constant C_PADDED_BITWIDTH : integer := C_PARALLEL_POPCOUNT * C_SPLIT;
-
- signal sl_add : std_logic := '0';
- signal sl_popcount : std_logic := '0';
- signal slv_product : std_logic_vector(C_PADDED_BITWIDTH - 1 downto 0) := (others => '0');
- signal slv_popcount : std_logic_vector(C_SPLIT * 3 - 1 downto 0);
-
  signal sl_valid_out : std_logic := '0';
  signal slv_data_out : std_logic_vector(oslv_data'range);
 
 begin
 
- i_adder_tree : entity util.adder_tree
- generic map (
- C_INPUT_COUNT => C_SPLIT,
- C_INPUT_BITWIDTH => 3,
- C_OUTPUT_BITWIDTH => oslv_data'length
- )
- port map (
- isl_clk => isl_clk,
- isl_valid => sl_add,
- islv_data => slv_popcount,
- oslv_data => slv_data_out,
- osl_valid => sl_valid_out
- );
-
  gen_matrix_multiplication : if C_INPUT_CHANNEL_BITWIDTH = 1 generate
+ constant C_PARALLEL_POPCOUNT : integer := 4;
+ constant C_SPLIT : integer := integer(ceil(real(islv_data'length) / real(C_PARALLEL_POPCOUNT)));
+ constant C_INPUT_BITWIDTH_ADDER : integer := log2(C_PARALLEL_POPCOUNT + 1);
+ constant C_PADDED_BITWIDTH_PRODUCT : integer := C_PARALLEL_POPCOUNT * C_SPLIT;
+
+ signal sl_add : std_logic := '0';
+ signal sl_popcount : std_logic := '0';
+ signal slv_product : std_logic_vector(C_PADDED_BITWIDTH_PRODUCT - 1 downto 0) := (others => '0');
+ signal slv_popcount : std_logic_vector(C_SPLIT * C_INPUT_BITWIDTH_ADDER - 1 downto 0);
+
+ signal slv_data_adder : std_logic_vector(C_INPUT_BITWIDTH_ADDER + log2(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL + 1) - 1 downto 0);
+ begin
 
  proc_xnor_popcount : process (isl_clk) is
 
- variable v_usig_popcount : unsigned(2 downto 0);
- variable v_usig_popcount_total : unsigned(oslv_data'range);
+ variable v_usig_popcount : unsigned(2 downto 0);
 
  begin
 
+ report to_string(slv_data_out'length) & " " & to_string(slv_data_adder'length);
+
  if (rising_edge(isl_clk)) then
  sl_popcount <= '0';
  sl_add <= '0';
@@ -82,7 +73,7 @@ begin
  v_usig_popcount := v_usig_popcount + 1;
  end if;
  end loop;
- slv_popcount((slice + 1) * 3 - 1 downto slice * 3) <= std_logic_vector(v_usig_popcount);
+ slv_popcount((slice + 1) * C_INPUT_BITWIDTH_ADDER - 1 downto slice * C_INPUT_BITWIDTH_ADDER) <= std_logic_vector(v_usig_popcount);
  end loop;
 
  sl_add <= '1';
@@ -91,31 +82,80 @@ begin
 
  end process proc_xnor_popcount;
 
+ i_adder_tree : entity util.adder_tree
+ generic map (
+ C_INPUT_COUNT => C_SPLIT,
+ C_INPUT_BITWIDTH => C_INPUT_BITWIDTH_ADDER,
+ C_OUTPUT_BITWIDTH => slv_data_adder'length
+ )
+ port map (
+ isl_clk => isl_clk,
+ isl_valid => sl_add,
+ islv_data => slv_popcount,
+ oslv_data => slv_data_adder,
+ osl_valid => sl_valid_out
+ );
+
+ -- Adder output is unsigned, output is signed.
+ -- Adder output has too big a bitwidth, because of the grouping. It can be resized.
+ slv_data_out <= std_logic_vector(resize(signed('0' & slv_data_adder), slv_data_out'length));
+
  else generate
+ -- + 1, because of multiplication by +-1
+ constant C_PRODUCT_BITWIDTH : integer := C_INPUT_CHANNEL_BITWIDTH + 1;
+ signal sl_add : std_logic := '0';
+ signal slv_product : std_logic_vector(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL * C_PRODUCT_BITWIDTH - 1 downto 0);
+
+ begin
 
  gen_input : for input_channel in 0 to C_INPUT_CHANNEL - 1 generate
 
  proc_add_sign : process (isl_clk) is
 
- variable v_usig_popcount : unsigned(2 downto 0);
- variable v_usig_popcount_total : unsigned(oslv_data'range);
+ variable v_int_index : integer;
+ variable v_slv_input_datum : std_logic_vector(C_PRODUCT_BITWIDTH - 1 downto 0);
+ variable v_slv_output_datum : std_logic_vector(C_PRODUCT_BITWIDTH - 1 downto 0);
 
  begin
 
  if (rising_edge(isl_clk)) then
- sl_popcount <= '0';
- sl_add <= '0';
+ sl_add <= '0';
 
  if (isl_valid = '1') then
- -- islv_data * +-1
- -- assign slices to adder tree
- -- extend adder tree by signed addition
- sl_add <= '1';
+ for ch in 0 to C_INPUT_CHANNEL - 1 loop
+ for k in 0 to C_KERNEL_SIZE ** 2 - 1 loop
+ v_int_index := k + ch * C_KERNEL_SIZE ** 2;
+ v_slv_input_datum := '0' & get_slice(islv_data, v_int_index, C_PRODUCT_BITWIDTH - 1);
+ -- Calculate product, i.e. input data * +-1
+ if (islv_weights(v_int_index) = '1') then
+ v_slv_output_datum := v_slv_input_datum;
+ else
+ v_slv_output_datum := std_logic_vector(-signed(v_slv_input_datum));
+ end if;
+ slv_product((v_int_index + 1) * C_PRODUCT_BITWIDTH - 1 downto v_int_index * C_PRODUCT_BITWIDTH) <= v_slv_output_datum;
+ end loop;
+ end loop;
+ sl_add <= '1';
  end if;
  end if;
 
  end process proc_add_sign;
 
+ i_adder_tree : entity util.adder_tree
+ generic map (
+ C_INPUT_COUNT => C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL,
+ C_INPUT_BITWIDTH => C_PRODUCT_BITWIDTH,
+ C_UNSIGNED => 0,
+ C_OUTPUT_BITWIDTH => oslv_data'length
+ )
+ port map (
+ isl_clk => isl_clk,
+ isl_valid => sl_add,
+ islv_data => slv_product,
+ oslv_data => slv_data_out,
+ osl_valid => sl_valid_out
+ );
+
  end generate gen_input;
 
  end generate gen_matrix_multiplication;

diff --git a/src/util/adder_tree.vhd b/src/util/adder_tree.vhd
@@ -48,7 +48,9 @@ architecture rtl of adder_tree is
  v_sum_init := (others => (others => '0'));
 
  -- Pad with zeros to widen from input bitwidth to output bitwidth.
- assert C_OUTPUT_BITWIDTH >= C_INPUT_BITWIDTH + C_STAGES; -- Input gets extended by 1 bit at each stage.
+ -- Input gets extended by 1 bit at each stage.
+ assert C_OUTPUT_BITWIDTH >= C_INPUT_BITWIDTH + C_STAGES
+ report "required bitwidth: " & to_string(C_INPUT_BITWIDTH + C_STAGES) & ", actual bitwidth: " & to_string(C_OUTPUT_BITWIDTH);
  v_input_datum := (others => '0');
 
  for i in 0 to C_INPUT_COUNT - 1 loop

diff --git a/src/window_convolution_activation.vhd b/src/window_convolution_activation.vhd
@@ -31,7 +31,7 @@ entity window_convolution_activation is
  islv_data : in std_logic_vector(C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH - 1 downto 0);
  -- islv_weights and islv_threshold are constants
  islv_weights : in std_logic_vector(C_KERNEL_SIZE * C_KERNEL_SIZE * C_INPUT_CHANNEL * C_OUTPUT_CHANNEL - 1 downto 0);
- islv_threshold : in std_logic_vector(C_OUTPUT_CHANNEL * log2(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH + 1) - 1 downto 0);
+ islv_threshold : in std_logic_vector(C_OUTPUT_CHANNEL * (C_INPUT_CHANNEL_BITWIDTH + log2(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL + 1) + 1) - 1 downto 0);
  oslv_data : out std_logic_vector(C_OUTPUT_CHANNEL * C_OUTPUT_CHANNEL_BITWIDTH - 1 downto 0);
  osl_valid : out std_logic
  );
@@ -40,13 +40,13 @@ end entity window_convolution_activation;
 architecture behavioral of window_convolution_activation is
 
  signal sl_valid_window_ctrl : std_logic := '0';
- signal slv_data_window_ctrl : std_logic_vector(C_KERNEL_SIZE * C_KERNEL_SIZE * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH - 1 downto 0);
+ signal slv_data_window_ctrl : std_logic_vector(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH - 1 downto 0);
 
  signal slv_valid_convolution : std_logic_vector(C_OUTPUT_CHANNEL - 1 downto 0);
 
  type t_slv_array_1d is array(natural range <>) of std_logic_vector;
 
- constant C_POST_CONVOLUTION_BITWIDTH : integer := log2(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL * C_INPUT_CHANNEL_BITWIDTH + 1);
+ constant C_POST_CONVOLUTION_BITWIDTH : integer := C_INPUT_CHANNEL_BITWIDTH + log2(C_KERNEL_SIZE ** 2 * C_INPUT_CHANNEL + 1) + 1;
  signal a_data_convolution : t_slv_array_1d(0 to C_OUTPUT_CHANNEL - 1)(C_POST_CONVOLUTION_BITWIDTH - 1 downto 0);
 
  signal slv_valid_batch_normalization : std_logic_vector(C_OUTPUT_CHANNEL - 1 downto 0);