[DOCS] Added matrix multiplication tutorial
ptillet committed Mar 15, 2021
1 parent d1c0bf2 commit 2f8f004
Showing 9 changed files with 395 additions and 18 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -40,7 +40,7 @@ if(BUILD_PYTHON_MODULE)
     if(NOT("${CUTLASS_INCLUDE_DIR}" STREQUAL "") AND NOT("${CUTLASS_LIBRARY_DIR}" STREQUAL ""))
         set(TORCH_SRC ${TORCH_SRC} cutlass.cc)
         add_definitions(-DWITH_CUTLASS_BINDINGS)
-        set(CUTLASS_LIBRARIES "cutlass")
+        set(CUTLASS_LIBRARIES "cutlass.a")
     endif()
     message(STATUS ${CUTLASS_INCLUDE_PATH})
     set(PYTHON_SRC main.cc triton.cc ${TORCH_SRC})
2 changes: 1 addition & 1 deletion lib/ir/module.cc
@@ -30,7 +30,7 @@ void module::set_value(const std::string& name, ir::basic_block *block, ir::value
   if(it != metadatas_.end()){
     x->set_metadata(it->second.first, it->second.second);
   }
-  value->set_name(name);
+  // value->set_name(name);
 }

 void module::set_value(const std::string& name, ir::value *value){
3 changes: 3 additions & 0 deletions lib/runtime/function.cc
@@ -288,6 +288,9 @@ function::function(const std::string& src, const options_t &opt, driver::device
   // find indices of autotune keys
   for(const std::string& name: tune_key){
     auto pred = [&](ir::argument* arg) { return arg->get_name() == name; };
+    // std::cout << "----" << std::endl;
+    // for(ir::argument* arg: args)
+    // std::cout << arg->get_name() << std::endl;
     auto it = std::find_if(args.begin(), args.end(), pred);
     if(it == args.end())
       throw std::runtime_error(name + " is not a valid argument name");
8 changes: 2 additions & 6 deletions python/setup.py
@@ -43,8 +43,7 @@ def run(self):
             out = subprocess.check_output(["cmake", "--version"])
         except OSError:
             raise RuntimeError(
-                "CMake must be installed to build the following extensions: " +
-                ", ".join(e.name for e in self.extensions)
+                "CMake must be installed to build the following extensions: " + ", ".join(e.name for e in self.extensions)
             )

         if platform.system() == "Windows":
@@ -107,10 +106,7 @@ def build_extension(self, ext):
     long_description="",
     packages=["triton", "triton/_C", "triton/ops", "triton/ops/blocksparse"],
     install_requires=["numpy", "torch"],
-    package_data={
-        "triton/ops": ["*.c"],
-        "triton/ops/blocksparse": ["*.c"]
-    },
+    package_data={"triton/ops": ["*.c"], "triton/ops/blocksparse": ["*.c"]},
     include_package_data=True,
     ext_modules=[CMakeExtension("triton", "triton/_C/")],
     cmdclass={"build_ext": CMakeBuild},
4 changes: 2 additions & 2 deletions python/triton/ops/matmul.c
@@ -1,5 +1,5 @@
-#define STM 8
-#define STN 8
+#define STM 1
+#define STN 1

 __global__ void matmul(TYPE *A __noalias __readonly,
                        TYPE *B __noalias __readonly,
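Note on the two macros touched above: in the Triton-C matmul kernel, STM and STN appear to control how program instances are "super-grouped", i.e. re-ordered so that programs computing nearby tiles of C run close together in time and reuse tiles of A and B from the L2 cache. The snippet below is only a rough Python sketch of that re-ordering idea; the names (pid, grid_m, grid_n, group_m) are illustrative assumptions and are not taken from the kernel itself.

# Hypothetical sketch of grouped ("super-tiled") program ordering, not the kernel's code.
# A flat program id is remapped so that group_m consecutive tile-rows are visited
# before moving on to the next tile-column, which improves L2 reuse of A and B.
def grouped_pid(pid, grid_m, grid_n, group_m=8):
    width = group_m * grid_n                           # programs per super-group
    group_id = pid // width                            # which super-group this program falls in
    group_size = min(grid_m - group_id * group_m, group_m)
    pid_m = group_id * group_m + (pid % group_size)    # row index of the C tile
    pid_n = (pid % width) // group_size                # column index of the C tile
    return pid_m, pid_n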
2 changes: 1 addition & 1 deletion python/triton/testing.py
@@ -40,7 +40,7 @@ def allclose(x, y):
     return err < tol


-def do_bench(fn, warmup=50, rep=50, grad_to_none=None, percentiles=[0.2, 0.8]):
+def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.2, 0.8]):
    # Estimate the runtime of the function
    fn()
    torch.cuda.synchronize()
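For reference, the helper changed above can be called directly. A minimal sketch, assuming a CUDA device and assuming that, with the default percentiles, it returns a typical runtime plus the 20th/80th percentile runtimes in milliseconds:

import torch
import triton

x = torch.randn(4096, 4096, device='cuda')
# Assumed return convention: (typical ms, low-percentile ms, high-percentile ms).
ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, dim=1))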
8 changes: 4 additions & 4 deletions python/tutorials/01-vector-add.py
@@ -50,7 +50,7 @@
 # The existence of arrays as a primitive data-type for Triton comes with a number of advantages that are highlighted in the `MAPL'2019 Triton paper <http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf>`_.

 # %%
-# Torch bindings
+# Torch Bindings
 # --------------------------
 # The only thing that matters when it comes to Triton and Torch is the :code:`triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify :code:`torch.tensor` objects. To create a :code:`triton.kernel`, you only need three things:
 #
@@ -127,7 +127,7 @@ def forward(ctx, x, y):

 # %%
 # Unit Test
-# --------------------------
+# -----------
 #
 # Of course, the first thing that we should check is that whether kernel is correct. This is pretty easy to test, as shown below:

@@ -144,8 +144,8 @@ def forward(ctx, x, y):
 # Seems like we're good to go!

 # %%
-# Benchmarking
-# --------------------------
+# Benchmark
+# -----------
 # We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.
 # To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom op.
 # for different problem sizes.
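A rough illustration of the benchmark described in the comments above, as a minimal sketch that calls do_bench directly rather than the tutorial's plotting utilities. Here `add` stands for the custom op defined earlier in the tutorial, and the do_bench return convention is assumed as above.

import torch
import triton

# Time the custom op for growing vector sizes and convert runtime to effective bandwidth.
for size in [2**i for i in range(12, 28, 2)]:
    x = torch.rand(size, device='cuda')
    y = torch.rand(size, device='cuda')
    ms, _, _ = triton.testing.do_bench(lambda: add(x, y))
    # Three tensors are touched per call: x and y are read, the output is written.
    gbps = 3 * x.numel() * x.element_size() / (ms * 1e-3) / 1e9
    print(f'{size:>12} elements: {gbps:.2f} GB/s')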
15 changes: 12 additions & 3 deletions python/tutorials/02-fused-softmax.py
@@ -126,10 +126,19 @@ def make_kernel(N, device):
     # Now are kernels are indexed not only by the provided device but also
     # by the rounded number of columns in the input matrix
     BLOCK = next_power_of_2(N)
-    key = (BLOCK, device)
+    # Another trick we can use is to ask the compiler to parallelize each
+    # row-normalization more aggressively -- i.e., with more warps -- vectors
+    # that are longer
+    # You will see in the next tutorial how to auto-tune this value in a more natural
+    # way so you don't have to come up with manual heuristics yourself
+    num_warps = 4
+    if BLOCK >= 2048: num_warps = 8
+    if BLOCK >= 4096: num_warps = 16
+    # Each (BLOCK, num_warps, device) results in a different kernel
+    key = (BLOCK, num_warps, device)
     if key not in cache:
         defines = {'BLOCK': BLOCK}
-        cache[key] = triton.kernel(_src, device=device, defines=defines)
+        cache[key] = triton.kernel(_src, device=device, defines=defines, num_warps=num_warps)
     return cache[key]


@@ -174,7 +183,7 @@ def forward(ctx, x):
 # As expected, the results are identical.

 # %%
-# Benchmarking
+# Benchmark
 # -------------
 # Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.
 # We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.
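A comparable sketch for the benchmark described above, again calling do_bench directly rather than the tutorial's plotting utilities. Here `softmax` stands for the fused op defined earlier in this tutorial, and the do_bench return convention is assumed as before.

import torch
import triton

M = 4096  # fixed number of rows, as in the tutorial
for N in [256, 1024, 4096]:
    x = torch.randn(M, N, device='cuda')
    ms_triton, _, _ = triton.testing.do_bench(lambda: softmax(x))
    ms_torch, _, _ = triton.testing.do_bench(lambda: torch.softmax(x, dim=1))
    # The fused kernel reads and writes each element roughly once, hence 2 * nbytes.
    nbytes = 2 * x.numel() * x.element_size()
    print(f'N={N:>5}  triton: {nbytes / (ms_triton * 1e-3) / 1e9:6.1f} GB/s'
          f'  torch: {nbytes / (ms_torch * 1e-3) / 1e9:6.1f} GB/s')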