Fix doxygen intro page naming scheme. Adding tutorial: baseline int32…

… on ibex done.
mandulaj · Jan 20, 2022 · 186bbaa · 186bbaa
1 parent 689f682
commit 186bbaa
Show file tree

Hide file tree

Showing 23 changed files with 299 additions and 718 deletions.
diff --git a/docs/tutorial-index.md b/docs/tutorial-index.md
@@ -0,0 +1 @@
+../tutorial/README.md
diff --git a/include/plp_math.h b/include/plp_math.h
@@ -23,81 +23,69 @@
  */
 
 /**
- \mainpage PULP DSP Software Library
- *
- * Introduction
- * ------------
- *
- * This user manual describes the PULP DSP software library,
- * a suite of common signal processing functions for use on PULP-based processors.
- *
- * The library is divided into a number of functions each covering a specific category:
- * - Basic math functions
- * - Fast math functions
- * - Complex math functions
- * - Filters
- * - Matrix functions
- * - Transform functions
- * - Motor control functions
- * - Statistical functions
- * - Support functions
- * - Interpolation functions
- *
- * The library has separate functions for operating on 8-bit integers, 16-bit integers,
- * 32-bit integer and 32-bit floating-point values.
- *
- * The library is released under Apache v2.0 license.
- *
- */
+ * \mainpage PULP DSP Software Library
+ * 
+ * Introduction
+ * ------------
+ *
+ * This user manual describes the PULP DSP software library,
+ * a suite of common signal processing functions for use on PULP-based processors.
+ *
+ * The library is divided into a number of functions each covering a specific category:
+ * - Basic math functions
+ * - Fast math functions
+ * - Complex math functions
+ * - Filters
+ * - Matrix functions
+ * - Transform functions
+ * - Motor control functions
+ * - Statistical functions
+ * - Support functions
+ * - Interpolation functions
+ *
+ * ..
+ *
+ *
+ * The library has separate functions for operating on 8-bit integers, 16-bit integers,
+ * 32-bit integers and 32-bit floating-point values.
+ *
+ * Function names follow this pattern (for example plp_dot_prod_i32s_rv32im):
+ *
+ * <pre>
+ * < plp > _ < function name > _ < data type > < precision > < method > _ < isa extension >,
+ * </pre>
+ *
+ * with
+ *
+ * - data type = {f, i, q} respectively for floats, integers, fixed points
+ * - precision = {32, 16, 8} bits
+ * - method = {s, p} respectively meaning single core or parallel multicore implementation.
+ * - isa extension = rv32im, xpulpv2, etc. of which rv32im is the most general one.
+ *
+ * ..
+ *
+ *
+ * The library is released under Apache v2.0 license.
+ *
+ */
 
 /**
  * @defgroup groupMath Basic Math Functions
- * The naming scheme of the functions follows the following pattern (for example
- plp_dot_prod_i32s_rv32im): <pre>
- \<pulp\> _ \<function name\> _ \<data type\> \<precision\> \<method\> _ \<isa extension\>, with
-
- data type = {f, i, q} respectively for floats, integers, fixed points
-
- precision = {32, 16, 8} bits
-
- method = {s, p} respectively meaning single core or parallel multicore implementation.
-
- isa extension = rv32im, xpulpv2, etc. of which rv32im is the most general one.
-
- </pre>
-
 */
 
 /**
  * @defgroup groupCmplxMath Complex Math Functions
  */
-/**
-
 
 /**
  * @defgroup groupFilters Filtering Functions
  */
 
 /**
  * @defgroup groupMatrix Matrix Functions
- * The naming scheme of the functions follows the following pattern (for example
- plp_mat_mult_i32s_rv32im): <pre>
- \<pulp\> _ \<function name\> _ \<data type\> \<precision\> \<method\> _ \<isa extension\>, with
-
- data type = {f, i, q} respectively for floats, integers, fixed points
-
- precision = {32, 16, 8} bits
-
- method = {s, p} respectively meaning single core or parallel multicore implementation.
-
- isa extension = rv32im, xpulpv2, etc. of which rv32im is the most general one.
-
- </pre>
-
  *
  * This set of functions provides basic matrix math operations.
  *
-
  */
 
 /**

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -11,6 +11,7 @@ repo_name: pulp-platform/pulp-dsp
 nav:
 - Home: index.md
 - Test Framework: test-framework-index.md
+- Tutorial: tutorial-index.md
 - Reference Manual:
  - Pages: refmanual/Pages/index.md
  - Files: refmanual/Files/index.md

diff --git a/tutorial/Makefile b/tutorial/Makefile
diff --git a/tutorial/README.md b/tutorial/README.md
@@ -1,40 +1,55 @@
-## Tutorial for beginners
-
 # Introduction
 
-In this tutorial you will learn how to contribute to the pulp-dsp library and write optimized code for PULP platforms.
+In this tutorial you will learn how to use the pulp-dsp library and how to improve and write optimized code for PULP platforms.
 
-We will use the gvsoc virtual platform and Mr. Wolf processor.
+We will use the gvsoc virtual platform and the Mr. Wolf processor as an example.
 
-Mr. Wolf has two processing domains: SoC with a Fabric Controller (FC) and Cluster which is activated for compute-intensive tasks. The FC contains an Ibex core with RV32IMC ISA, while the Cluster contains 8 RI5CY cores with RV32IMCXPulpV2 extensions.
+Mr. Wolf has two processing domains: the SoC with a Fabric Controller (FC), and the Cluster, which is activated for compute-intensive tasks. The FC contains an Ibex core with the basic RV32IMC ISA, while the Cluster contains 8 CV32E40P (formerly called RI5CY) cores with RV32IMCXPulpV2 extensions.
 
 The library contains glue codes where it is checked in which domain the function is called. If it's in FC, then the rv32im kernel function is called, otherwise the xpulpv2 kernel function is called.
 
-If the user wants to use a parallel implementation, he/she can call the glue code functions with '_parallel' in the function name.
+If the user wants to use a parallel implementation, they can call the glue code functions with '_parallel' in the function name.
+
+# Requirements
+
+Follow the instructions under `Installation and usage` on the main page to install pulp-sdk and pulp-dsp.
+
+For Mr. Wolf, we will use the [v1 branch](https://github.com/pulp-platform/pulp-sdk/tree/v1) of the pulp-sdk. The 'new' sdk on the [main branch](https://github.com/pulp-platform/pulp-sdk) works similarly. For details, please refer to the documentation of the respective pulp-sdk branch.
 
-The naming scheme of the functions follows the following pattern (for example plp\_dot\_prod\_i32s):
+# Configurations
 
-\<pulp\> _ \<function name\> _ \<data type\> \<precision\> \<method\> _ \<isa extension\>, with
+After you have installed the pulp-sdk and installed pulp-dsp into the sdk, you can start developing your DSP application.
 
-- data type = {f, i, q} respectively for floats, integers, fixed points
+Here we will compute the dot product of two vectors as an example.
 
-- precision = {32, 16, 8} bits
+Every time you open a new terminal, go to the `pulp-sdk` folder and configure gvsoc for Mr. Wolf:
 
-- method = {s, p} respectively meaning single core or parallel multicore implementation.
+~~~~~shell
+source configs/platform-gvsoc.sh
+source configs/wolfe.sh
+~~~~~
 
-- isa extension = rv32im, xpulpv2, etc. of which rv32im is the most general one.
+and activate the configurations:
 
-In this tutorial, an `example_dot_prod` function is used.
+~~~~~shell
+source sourceme.sh
+~~~~~
 
-# Compile the library and install it in the pulp-sdk
+# Baseline
+
+Go to the `baseline` folder. It contains the code for computing the dot product on the FC of Mr. Wolf, which has an Ibex core featuring the basic RV32IMC ISA.
 
 Run the command
 
-`make clean header all install`
+~~~~~shell
+make clean all run
+~~~~~
+
+Each vector has 80 elements, and the values are 32-bit integers. The computation takes 817 cycles and 439 instructions. Refer to `main.c` to learn how to use the performance counters. Note that on gvsoc you can use as many counters as you want, while on the board only one HW counter exists.
+
+# Single cluster core
 
-to compile and install the library.
 
-You can check out the Makefile to see which functions are compiled.
 
 # Run an application
 

diff --git a/tutorial/baseline/Makefile b/tutorial/baseline/Makefile
@@ -0,0 +1,8 @@
+PULP_APP = dotprod
+PULP_APP_FC_SRCS = main.c
+# Link against libplpdsp (-lplpdsp) and the math library (-lm).
+PULP_LDFLAGS += -lplpdsp -lm
+# Build with -O3 optimization and debug information (-g).
+PULP_CFLAGS += -O3 -g
+# Pull in the pulp-sdk build rules; the leading '-' tells make not to fail if the file is missing.
+-include $(PULP_SDK_HOME)/install/rules/pulp.mk
diff --git a/tutorial/baseline/data.h b/tutorial/baseline/data.h
@@ -0,0 +1,20 @@
+#include "rtos_hal.h"
+
+#define VLEN 80 /* number of elements in each input vector */
+/* Operands of the dot-product example: two vectors of VLEN 32-bit integers. */
+int32_t a[VLEN] = {9, 8, 9, -5, 6, -8, -1, -8, 8, -3, 2, 5, -2,
+ 3, -7, 2, -10, -10, 1, 7, 1, -10, -1, 8, -7, -4,
+ 4, -8, 2, -9, -9, 9, -3, 6, -6, 6, 4, 8, -9,
+ -9, -9, 4, -4, 3, -9, 1, -1, -1, 2, 7, 9, 4,
+ -9, 2, -7, 3, 9, -7, 1, 5, -8, 2, -6, -3, -8,
+ -2, -1, -4, 8, -8, -4, 3, 3, -10, 9, 1, -3, 6,
+ -2, -9};
+int32_t b[VLEN] = {-5, -3, 6, 8, -9, -6, 9, -8, -2, 0, -9, 5, 1,
+ 1, 3, 7, 9, -6, -2, 1, -7, 0, 1, -3, -6, 2,
+ -10, 0, -7, -8, -8, -4, -6, -6, -7, 4, -10, 2, -5,
+ -1, -9, 6, 5, 5, 3, 2, -2, -7, 5, -5, 0, 4,
+ -4, 0, 6, -6, 1, -9, -2, 2, 7, 7, -9, 9, -10,
+ 9, 0, -2, 1, 6, -5, -3, 2, 3, -3, 0, -3, -5,
+ 9, -7};
+/* Expected dot product of a and b; main.c prints it as the "true result". */
+int32_t res = 218;
diff --git a/tutorial/baseline/main.c b/tutorial/baseline/main.c
@@ -0,0 +1,55 @@
+#include "rtos_hal.h"
+#include "stdio.h"
+#include "plp_math.h"
+#include "data.h"
+
+int main(){
+ // Computes the dot product of a and b twice (glue function, then rv32im kernel) and prints cycle/instruction counts for each.
+ int32_t result=0;
+
+ printf("\nComputing dot prod of i32 numbers\n\n"); // it's better to always end the output with \n
+
+ // We also count the number of cycles taken to compute it.
+ // This structure will hold the configuration and also the results in the
+ // cumulative mode
+ rt_perf_t perf;
+
+ // It must be initialized at least once; this will set all values in the
+ // structure to zero.
+ rt_perf_init(&perf);
+
+ // Activate specified events
+ rt_perf_conf(&perf, (1<<RT_PERF_CYCLES) | (1<<RT_PERF_INSTR)); // Note: on gvsoc you can activate as many counters as you want, while when you run on board, there is only one HW counter.
+
+ // Reset the HW counters now, then start and stop them so that we benchmark
+ // only the dot product call made through the glue function
+ rt_perf_reset(&perf);
+ rt_perf_start(&perf);
+
+ plp_dot_prod_i32(a, b, VLEN, &result);
+
+ rt_perf_stop(&perf);
+
+ printf("The true result is %d, the calculated result is %d.\n", res, result);
+ printf("Total cycles: %d\n", rt_perf_read(RT_PERF_CYCLES));
+ printf("Instructions: %d\n", rt_perf_read(RT_PERF_INSTR));
+
+ printf("\nThe glue code also took few cycles. If we call directly the kernel function to compute the dot product we have:\n");
+
+ // Reset the HW counters again, then start and stop them so that we benchmark
+ // only the direct call to the rv32im kernel function
+ rt_perf_reset(&perf);
+ rt_perf_start(&perf);
+
+ plp_dot_prod_i32s_rv32im(a, b, VLEN, &result);
+
+ rt_perf_stop(&perf);
+
+ printf("The true result is %d, the calculated result is %d.\n", res, result);
+ printf("Total cycles: %d\n", rt_perf_read(RT_PERF_CYCLES));
+ printf("Instructions: %d\n", rt_perf_read(RT_PERF_INSTR));
+ printf("(The effect might be more evident with SIMD and parallel computation.)\n");
+
+ return 0;
+
+}
diff --git a/tutorial/cluster_parallel/data.h b/tutorial/cluster_parallel/data.h
@@ -0,0 +1,18 @@
+#include "rtos_hal.h"
+
+int32_t a[] = {9, 8, 9, -5, 6, -8, -1, -8, 8, -3, 2, 5, -2, /* first operand; [] lets the 80 initializers size the array */
+ 3, -7, 2, -10, -10, 1, 7, 1, -10, -1, 8, -7, -4,
+ 4, -8, 2, -9, -9, 9, -3, 6, -6, 6, 4, 8, -9,
+ -9, -9, 4, -4, 3, -9, 1, -1, -1, 2, 7, 9, 4,
+ -9, 2, -7, 3, 9, -7, 1, 5, -8, 2, -6, -3, -8,
+ -2, -1, -4, 8, -8, -4, 3, 3, -10, 9, 1, -3, 6,
+ -2, -9};
+int32_t b[] = {-5, -3, 6, 8, -9, -6, 9, -8, -2, 0, -9, 5, 1, /* second operand, also 80 elements */
+ 1, 3, 7, 9, -6, -2, 1, -7, 0, 1, -3, -6, 2,
+ -10, 0, -7, -8, -8, -4, -6, -6, -7, 4, -10, 2, -5,
+ -1, -9, 6, 5, 5, 3, 2, -2, -7, 5, -5, 0, 4,
+ -4, 0, 6, -6, 1, -9, -2, 2, 7, 7, -9, 9, -10,
+ 9, 0, -2, 1, 6, -5, -3, 2, 3, -3, 0, -3, -5,
+ 9, -7};
+/* Expected dot product of a and b. */
+int32_t res = 218;
diff --git a/tutorial/cluster_single/Makefile b/tutorial/cluster_single/Makefile
@@ -0,0 +1,8 @@
+PULP_APP = dotprod
+PULP_APP_FC_SRCS = main.c
+# Link against libplpdsp (-lplpdsp) and the math library (-lm).
+PULP_LDFLAGS += -lplpdsp -lm
+# Build with -O3 optimization and debug information (-g).
+PULP_CFLAGS += -O3 -g
+# Pull in the pulp-sdk build rules; the leading '-' tells make not to fail if the file is missing.
+-include $(PULP_SDK_HOME)/install/rules/pulp.mk
diff --git a/tutorial/cluster_single/data.h b/tutorial/cluster_single/data.h
@@ -0,0 +1,20 @@
+#include "rtos_hal.h"
+
+#define VLEN 80 /* number of elements in each input vector */
+/* Operands of the dot-product example: two vectors of VLEN 32-bit integers. */
+int32_t a[VLEN] = {9, 8, 9, -5, 6, -8, -1, -8, 8, -3, 2, 5, -2,
+ 3, -7, 2, -10, -10, 1, 7, 1, -10, -1, 8, -7, -4,
+ 4, -8, 2, -9, -9, 9, -3, 6, -6, 6, 4, 8, -9,
+ -9, -9, 4, -4, 3, -9, 1, -1, -1, 2, 7, 9, 4,
+ -9, 2, -7, 3, 9, -7, 1, 5, -8, 2, -6, -3, -8,
+ -2, -1, -4, 8, -8, -4, 3, 3, -10, 9, 1, -3, 6,
+ -2, -9};
+int32_t b[VLEN] = {-5, -3, 6, 8, -9, -6, 9, -8, -2, 0, -9, 5, 1,
+ 1, 3, 7, 9, -6, -2, 1, -7, 0, 1, -3, -6, 2,
+ -10, 0, -7, -8, -8, -4, -6, -6, -7, 4, -10, 2, -5,
+ -1, -9, 6, 5, 5, 3, 2, -2, -7, 5, -5, 0, 4,
+ -4, 0, 6, -6, 1, -9, -2, 2, 7, 7, -9, 9, -10,
+ 9, 0, -2, 1, 6, -5, -3, 2, 3, -3, 0, -3, -5,
+ 9, -7};
+/* Expected dot product of a and b. */
+int32_t res = 218;