Apply compile time optimisations

2024-04-07 13:06:49 +02:00 · 2024-04-07 13:06:49 +02:00 · 33c18dbefa
commit 33c18dbefa
parent 29b5365918
4 changed files with 284 additions and 13 deletions
--- a/Design_and_analysis.md
+++ b/Design_and_analysis.md
@ -137,9 +137,80 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) {
    }
 }
 ```
+This version serves as the baseline for further optimisations; in simulation it took 62977442 cycles.

 ### Software optimisations
 ## Compile time constants
+Looking at the naive implementation, we can see some low-hanging fruit that can easily be optimised by evaluating constants at compile time.
+Firstly, we can precompute the values of 1/sqrt(DCT_SIZE) and sqrt(2)/sqrt(DCT_SIZE) to avoid executing sqrt(), which is a costly operation, at runtime.
+After doing this we get the following constants:
+    
+```c
+#define INV_SQRTDCT_SIZE (real_t) 0.3535533906 
+#define SQRT2_INV_SQRTDCT (real_t) 0.5 
+```
+
+Because PI / (2 * DCT_SIZE) is a constant, we can precompute cos() for every multiple of this angle from 0 to 31; beyond that the values repeat, because 32 * PI / (2 * DCT_SIZE) = 2 * PI for DCT_SIZE = 8.
+These values can then be stored in an array to eliminate runtime calculations. This is also done at compile time, in the following way:
+
+```c
+#define DCT_COS_TABLE_SIZE 32
+
+#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
+    1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
+    0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
+    -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
+    -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
+    0.707107, 0.83147, 0.92388, 0.980785 }
+```
+This changes the way the sum is calculated to the following:
+```c
+sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
+```
+
+After eliminating unnecessary calculations, we can move some calculations to outer loops to reduce redundant work.
+These are found in the inner loops of the algorithm: they only need to be recalculated once per outer iteration,
+but are instead recalculated in every inner iteration, leading to redundant operations.
+```c
+for (u = 0; u < DCT_SIZE; u++) {
+    for (v = 0; v < DCT_SIZE; v++) {
+        cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
+        cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); 
+
+        sum = 0;
+        for (i = 0; i < DCT_SIZE; i++) {
+            for (j = 0; j < DCT_SIZE; j++) {
+                sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
+            }
+        }
+        matrix_out[u][v] = cu * cv * sum;
+    }
+}
+```
+The first step is to move the cu assignment to the outer loop; this eliminates 7 redundant calculations of cu per value of u.
+Secondly, the sum calculation can be refactored to look up the cosine for the u term in the outer (i) loop and the cosine for the v term in the inner (j) loop.
+By applying these changes we get the following code:
+
+```c
+for (u = 0; u < DCT_SIZE; u++) {
+    cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
+    for (v = 0; v < DCT_SIZE; v++) {
+        cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; 
+        sum = 0;
+        for (i = 0; i < DCT_SIZE; i++) {
+            cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
+            for (j = 0; j < DCT_SIZE; j++) { 
+                cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
+                sum += matrix_in[i][j] * cos_u * cos_v; 
+            }
+        }
+        matrix_out[u][v] = cu * cv * sum;
+    }
+}
+```
+After running the changes in the simulation, the performance improved to 23697904 cycles.
+
+## Remove conditionals
 ## Flattening arrays and loops
 ## Vectorisation
 ## Changing data types