From af34c63a492a997d0880dd6294e3f38529810942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hugo=20M=C3=A5rdbrink?= <hugo@mardbrink.se>
Date: Mon, 8 Apr 2024 12:33:21 +0200
Subject: [PATCH] Flatten arrays

---
 Design_and_analysis.md     | 65 ++++++++++++++++++++++++++--
 main.c                     | 43 +++++++------------
 versions/flattened_array.c | 88 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 31 deletions(-)
 create mode 100644 versions/flattened_array.c

diff --git a/Design_and_analysis.md b/Design_and_analysis.md
index 9d979be..75f3277 100644
--- a/Design_and_analysis.md
+++ b/Design_and_analysis.md
@@ -208,10 +208,69 @@ for (u = 0; u < DCT_SIZE; u++) {
     }
 }
 ```
-After running the changes in the simulation, the performance improved to 23697904 cycles.
+After running the changes in the simulation, the performance improved to 26965608 cycles.
 
-## Remove conditionals
-## Flattening arrays and loops
+## Flattening arrays 
+Flattening arrays is the process of storing a multidimensional array in a single dimension. 
+This stores each block in one contiguous allocation instead of many separately allocated rows, leading to better cache performance and predictability.
+It is also necessary for the future implementation of vectorisation and compiler optimisations.
+
+The first step is to slightly change our data generation so that it produces a one-dimensional array for each block.
+The memory allocation, memory deallocation and data generation now look like this:
+
+```c
+element_t** generate_mock_matrices() {
+    element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
+    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
+    }
+
+    populate_mock_matrices(mock_matrices);
+    return mock_matrices;
+}
+
+void free_mock_matrices(element_t** mock_matrices) { 
+    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        free(mock_matrices[i]);
+    }
+    free(mock_matrices);
+}
+
+void populate_mock_matrices(element_t** mock_matrices) {
+    for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        for (int j = 0; j < DCT_SIZE; j++) {
+            for (int k = 0; k < DCT_SIZE; k++) {
+                mock_matrices[i][j * DCT_SIZE + k] = j + k;
+            }
+        }
+    }
+}
+```
+
+The next step is to change the signature of the kernel function and the way the arrays are indexed.
+```c
+ void dct_2d(element_t* matrix_in, element_t* matrix_out) {
+    real_t cu, cv, sum, cos_u, cos_v;
+    int u, v, i, j;
+
+    for (u = 0; u < DCT_SIZE; u++) {
+        cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
+        for (v = 0; v < DCT_SIZE; v++) {
+            cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; 
+            sum = 0;
+            for (i = 0; i < DCT_SIZE; i++) {
+                cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
+                for (j = 0; j < DCT_SIZE; j++) { 
+                    cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
+                    sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v;
+                }
+            }
+            matrix_out[u * DCT_SIZE + v] = cu * cv * sum;
+        }
+    }
+}
+```
+Not only does this enable further optimisations, but the performance also improved to 23667310 cycles.
 ## Vectorisation
 ## Changing data types
 ## Compiler optimisations
diff --git a/main.c b/main.c
index f432a5d..25bf982 100644
--- a/main.c
+++ b/main.c
@@ -19,7 +19,7 @@
 #define INV_SQRTDCT_SIZE (real_t) 0.3535533906 
 #define SQRT2_INV_SQRTDCT (real_t) 0.5 
 
-void dct_2d(element_t** matrix_in, element_t** matrix_out) {
+void dct_2d(element_t* matrix_in, element_t* matrix_out) {
     real_t cu, cv, sum, cos_u, cos_v;
     int u, v, i, j;
 
@@ -32,68 +32,55 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) {
                 cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
                 for (j = 0; j < DCT_SIZE; j++) { 
                     cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
-                    sum += matrix_in[i][j] * cos_u * cos_v; 
+                    sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v;
                 }
             }
-            matrix_out[u][v] = cu * cv * sum;
+            matrix_out[u * DCT_SIZE + v] = cu * cv * sum;
         }
     }
 }
 
 
-void populate_mock_matrices(element_t*** mock_matrices) {
+void populate_mock_matrices(element_t** mock_matrices) {
     for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
         for (int j = 0; j < DCT_SIZE; j++) {
             for (int k = 0; k < DCT_SIZE; k++) {
-                mock_matrices[i][j][k] = j + k;
+                mock_matrices[i][j * DCT_SIZE + k] = j + k;
             }
         }
     }
 }
 
 
-element_t*** generate_mock_matrices() {
-    element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**));
+element_t** generate_mock_matrices() {
+    element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
     for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
-        mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
-        for (int j = 0; j < DCT_SIZE; j++) {
-            mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));              
-        }
+        mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
     }
 
     populate_mock_matrices(mock_matrices);
     return mock_matrices;
 }
 
-void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
+void free_mock_matrices(element_t** mock_matrices) { 
     for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
-        for (int j = 0; j < DCT_SIZE; j++) {
-            free(mock_matrices[i][j]);
-        }
         free(mock_matrices[i]);
     }
     free(mock_matrices);
-
 }
 
 int main() {
-    element_t ***mock_matrices = generate_mock_matrices();
+    element_t **mock_matrices = generate_mock_matrices();
     int i;
-
-    element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
-    for (i = 0; i < DCT_SIZE; i++) {
-        matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
-    }
-
+    
+    element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
+    
     for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
         dct_2d(mock_matrices[i], matrix_out);
     }
+    
+    free_mock_matrices(mock_matrices);
 
-    free_mock_matrices(mock_matrices, matrix_out);
-
-    for (i = 0; i < DCT_SIZE; i++) {
-        free(matrix_out[i]);
-    }
     free(matrix_out);
 
     return 0;
diff --git a/versions/flattened_array.c b/versions/flattened_array.c
new file mode 100644
index 0000000..560122e
--- /dev/null
+++ b/versions/flattened_array.c
@@ -0,0 +1,88 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define DCT_SIZE 8 // side length of one square block; a block holds DCT_SIZE * DCT_SIZE elements
+#define TOTAL_DCT_BLOCKS 100 // number of mock blocks allocated and transformed by main()
+
+#define element_t int16_t // type of the stored input and output samples
+#define real_t double // type used for the intermediate arithmetic in dct_2d()
+
+#define DCT_COS_TABLE_SIZE 32 // number of table entries; also the modulus applied to table indices
+// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE)), rounded to six decimals; the macro expands to a compound literal at every use
+#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
+    1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
+    0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
+    -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
+    -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
+    0.707107, 0.83147, 0.92388, 0.980785 }
+
+#define INV_SQRTDCT_SIZE (real_t) 0.3535533906 // approximately 1 / sqrt(DCT_SIZE); used as cu or cv when u or v is 0
+#define SQRT2_INV_SQRTDCT (real_t) 0.5 // sqrt(2) / sqrt(DCT_SIZE) for DCT_SIZE = 8; used as cu or cv for non-zero u or v
+
+void dct_2d(element_t* matrix_in, element_t* matrix_out) { // transforms one flattened DCT_SIZE x DCT_SIZE block: reads matrix_in, writes every element of matrix_out
+    real_t cu, cv, sum, cos_u, cos_v;
+    int u, v, i, j; // (u, v) index the output coefficient, (i, j) the input sample
+
+    for (u = 0; u < DCT_SIZE; u++) {
+        cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; // scale factor chosen by whether u is 0
+        for (v = 0; v < DCT_SIZE; v++) {
+            cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; // same choice for v
+            sum = 0;
+            for (i = 0; i < DCT_SIZE; i++) {
+                cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; // lookup of cos((2i + 1) * u * PI / (2 * DCT_SIZE)); the modulo wraps the index into the table
+                for (j = 0; j < DCT_SIZE; j++) { // cos_u does not depend on j, so it is looked up once per i
+                    cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
+                    sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v; // flattened access: element (i, j) lives at i * DCT_SIZE + j
+                }
+            }
+            matrix_out[u * DCT_SIZE + v] = cu * cv * sum; // the real_t product is converted to element_t on assignment
+        }
+    }
+}
+
+
+void populate_mock_matrices(element_t** mock_matrices) { // writes the same test pattern into each of the TOTAL_DCT_BLOCKS blocks; every block must hold DCT_SIZE * DCT_SIZE elements
+    for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        for (int j = 0; j < DCT_SIZE; j++) {
+            for (int k = 0; k < DCT_SIZE; k++) {
+                mock_matrices[i][j * DCT_SIZE + k] = j + k; // element at row j, column k is set to j + k
+            }
+        }
+    }
+}
+
+
+element_t** generate_mock_matrices() { // allocates TOTAL_DCT_BLOCKS flattened blocks, fills them, and returns the array of block pointers
+    element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*)); // NOTE(review): the malloc result is not checked for NULL
+    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); // one allocation per block; NOTE(review): also unchecked
+    }
+
+    populate_mock_matrices(mock_matrices);
+    return mock_matrices;
+}
+
+void free_mock_matrices(element_t** mock_matrices) { // frees each of the TOTAL_DCT_BLOCKS blocks, then the pointer array itself
+    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        free(mock_matrices[i]);
+    }
+    free(mock_matrices);
+}
+
+int main() { // runs dct_2d over every mock block, reusing a single output buffer, then frees everything
+    element_t **mock_matrices = generate_mock_matrices();
+    int i;
+    
+    element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); // NOTE(review): the allocation result is not checked for NULL
+    
+    for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
+        dct_2d(mock_matrices[i], matrix_out); // each call overwrites the previous block's output
+    }
+    
+    free_mock_matrices(mock_matrices);
+
+    free(matrix_out);
+
+    return 0;
+}