import numpy as np


def div_and_ceil(a, b):
    """Integer division of a by b, rounded up."""
    return ((a - 1) // b) + 1


class Ne16:
    # Architectural parameters of the NE16 accelerator
    TP_IN = 16   # input-channel throughput per subtile
    TP_OUT = 32  # output-channel throughput
    KS = 3       # supported filter size
    INPUT_BUFFER_H = 5
    INPUT_BUFFER_W = 5
    OUTPUT_BUFFER_SHAPE = (3, 3, 32)

    @property
    def name(self):
        return 'ne16'

    def weights_ko_len(self, ko, dw):
        """Number of output-channel iterations in the packed weight layout."""
        return div_and_ceil(ko, self.TP_IN) if dw else ko

    def weights_ki_size(self, ki, ks, qw, dw):
        """Size in bytes of the packed weights for one output-channel iteration."""
        if dw:
            return qw * ks[0] * ks[1] * (self.TP_IN // 8)
        else:
            return div_and_ceil(ki, self.TP_IN) * qw * ks[0] * ks[1] * (self.TP_IN // 8)

    def weights_size(self, ko, ki, ks, qw, dw):
        """Total size in bytes of the packed weight tensor."""
        return self.weights_ko_len(ko, dw) * self.weights_ki_size(ki, ks, qw, dw)

    # Assumes torch-style shapes; w must already be in uint format!
    # format --> [Ko, KiMajor, Qw, KiMinor] (binary tensor)
    #                          +++++++++++ --> these are *contiguous and packed*
    def conv1x1_unroll(self, w, qw, tp_in=16):
        """Unroll 1x1 convolution weights into the NE16 bit-serial packed layout."""
        Ko, Ki, H, W = w.shape
        nb_ki = div_and_ceil(Ki, tp_in)
        wbytes = np.zeros((Ko * nb_ki * qw, 2), dtype=np.uint8)
        for ko in range(Ko):
            for ki in range(Ki):
                kimaj = ki // tp_in
                kimin = ki % tp_in
                byte = kimin // 8
                shift = kimin % 8
                for q in range(qw):
                    index = ko * nb_ki * qw + kimaj * qw + q
                    # Set bit `shift` of the target byte if bit `q` of the weight is set
                    if w[ko, ki, 0, 0] & (1 << q):
                        wbytes[index, byte] |= 1 << shift
        wbytes = wbytes.reshape(-1)
        return wbytes

    def conv1x1_roll(self, wbytes, qw, shape, layout='CoutCinK'):
        """Inverse of conv1x1_unroll: rebuild the weight tensor from packed bytes."""
        if layout == 'CoutCinK':
            Ko, Ki, H, W = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w
        elif layout == 'CoutKCin':
            Ko, H, W, Ki = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w.transpose((0, 3, 1, 2))
        else:
            raise ValueError(f'Layout {layout} not implemented.')
        nb_ki = div_and_ceil(Ki, self.TP_IN)
        for ko in range(Ko):
            for kimaj in range(nb_ki):
                for q in range(qw):
                    for kimin in range(self.TP_IN):
                        byte = kimin // 8
                        shift = kimin % 8
                        index = ko * nb_ki * qw * 2 + kimaj * qw * 2 + q * 2 + byte
                        if kimaj * self.TP_IN + kimin < Ki:
                            wv[ko, kimaj * self.TP_IN + kimin, 0, 0] += (1 & (wbytes[index] >> shift)) << q
        return w

    def subtile_bit_extract(self, subtile, bit_idx):
        """Gather bit `bit_idx` of every element of a subtile into one integer."""
        retval = 0
        for i, el in enumerate(subtile):
            if el.item() & (1 << bit_idx):
                retval |= 1 << i
        return retval

    def conv3x3_unroll(self, w, qw):
        """Unroll 3x3 convolution weights into the NE16 bit-serial packed layout."""
        Ko, Ki, H, W = w.shape
        nb_ki = div_and_ceil(Ki, self.TP_IN)
        nb_tp_in = self.TP_IN // 8
        wbytes = np.zeros((Ko, nb_ki, qw, H * W, nb_tp_in), dtype=np.uint8)
        for i in range(Ko):
            for j in range(nb_ki):
                # Subtile of up to TP_IN input channels, flattened to (H*W, channels)
                tile = w[i, j * self.TP_IN:(j + 1) * self.TP_IN].transpose(1, 2, 0).reshape(H * W, -1)
                for k, subtile in enumerate(tile):
                    for bit in range(qw):
                        subtile_bit = self.subtile_bit_extract(subtile, bit)
                        for l in range(nb_tp_in):
                            wbytes[i, j, bit, k, l] = (subtile_bit >> (l * 8)) & 0xff
        wbytes = wbytes.reshape(-1)
        return wbytes

    def subtile_bit_roll(self, w_subtile, subtile, bit):
        """Scatter the packed bytes of one bit plane back into a weight subtile."""
        s = 0
        for i, byte in enumerate(subtile):
            s += byte.item() << (i * 8)
        for i in range(w_subtile.size):
            w_subtile[i] += ((s & (1 << i)) >> i) << bit

    def conv3x3_roll(self, wbytes, qw, shape, layout='CoutCinK'):
        """Inverse of conv3x3_unroll: rebuild the weight tensor from packed bytes."""
        if layout == 'CoutCinK':
            Ko, Ki, H, W = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w
        elif layout == 'CoutKCin':
            Ko, H, W, Ki = shape
            w = np.zeros(shape, dtype=np.uint8)
            wv = w.transpose((0, 3, 1, 2))
        else:
            raise ValueError(f'Layout {layout} not implemented.')
        nb_ki = div_and_ceil(Ki, self.TP_IN)
        wbytes = wbytes.reshape(Ko, nb_ki, qw, H, W, 2)
        for i in range(Ko):
            for j in range(nb_ki):
                for bit in range(qw):
                    for k in range(H):
                        for l in range(W):
                            self.subtile_bit_roll(wv[i, j * self.TP_IN:(j + 1) * self.TP_IN, k, l].reshape(-1),
                                                  wbytes[i, j, bit, k, l], bit)
        return w

    def conv_unroll(self, w, qw, layout='CoutCinK', dw=False):
        """Normalize the weight layout to (Cout, Cin, H, W) and dispatch to the
        filter-size-specific unroll."""
        if layout == 'CoutCinK':
            if dw:
                w = w.transpose(1, 0, 2, 3)  # Swap Cout and Cin
        elif layout == 'CoutKCin':
            if dw:
                w = w.transpose(3, 0, 1, 2)
            else:
                w = w.transpose(0, 3, 1, 2)
        elif layout == 'CoutCin':
            w = w[:, :, np.newaxis, np.newaxis]
        elif layout == 'CinCout':
            w = w.T
            w = w[:, :, np.newaxis, np.newaxis]
        else:
            raise ValueError(f'Layout {layout} not implemented.')

        fs = w.shape[2]
        if dw:
            assert fs == 3, 'Only a filter size of 3 is supported with depthwise convolution'
            assert w.shape[0] == 1, 'Depthwise convolution assumes a single channel per group (shape[0] == 1 after the swap)'

        if fs == 1:
            return self.conv1x1_unroll(w, qw)
        elif fs == 3:
            return self.conv3x3_unroll(w, qw)


if __name__ == '__main__':
    import random

    ne16 = Ne16()

    def test(name, Ko, Ki, fs, qw):
        print(f'Test {name} shape=({Ko:3}, {Ki:3}, {fs}, {fs}) qw={qw}: ', end='', flush=True)
        shape = (Ko, Ki, fs, fs)
        test_in = np.random.randint(low=0, high=1 << qw, size=shape, dtype=np.uint8)
        # Round-trip: unroll into the packed layout, then roll back.
        # (These are methods on Ne16, so look them up on the instance.)
        roll = getattr(ne16, f'conv{fs}x{fs}_roll')
        test_out = roll(ne16.conv_unroll(test_in, qw), qw, shape)
        if not np.array_equal(test_in, test_out):
            print('Fail!')
            print('Test in:')
            print(test_in)
            print('Test out:')
            print(test_out)
            print('Mismatching elements of test_in:')
            print(test_in[np.not_equal(test_in, test_out)])
        else:
            print('Success!')

    def test_generator(fs, test_count):
        print(f'Testing {fs}x{fs} convolution:')
        for i in range(test_count):
            Ko = random.randint(1, 128)
            Ki = random.randint(1, 128)
            qw = random.randint(2, 8)
            test(f'[{i}]', Ko, Ki, fs, qw)

    TEST_COUNT = 10
    test_generator(1, TEST_COUNT)
    test_generator(3, TEST_COUNT)
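
    # --- Usage sketch for the depthwise path (illustrative, assumption-labeled) ---
    # The round-trip tests above only exercise the non-depthwise path; there is no
    # depthwise inverse here, so this sketch only checks that the packed byte count
    # matches what weights_size predicts. The shape (Ko, 1, 3, 3) and the values of
    # Ko and qw are illustrative assumptions, not taken from a real model.
    Ko, qw = 64, 4
    w_dw = np.random.randint(low=0, high=1 << qw, size=(Ko, 1, 3, 3), dtype=np.uint8)
    packed = ne16.conv_unroll(w_dw, qw, layout='CoutCinK', dw=True)
    # The ki argument is unused in the depthwise branch of weights_ki_size;
    # pass Ko, since depthwise channels equal the output channels.
    expected = ne16.weights_size(Ko, Ko, (3, 3), qw, dw=True)
    print(f'Depthwise pack: {packed.size} bytes, weights_size predicts {expected}')
    assert packed.size == expected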