From af34c63a492a997d0880dd6294e3f38529810942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20M=C3=A5rdbrink?= Date: Mon, 8 Apr 2024 12:33:21 +0200 Subject: [PATCH] Flatten arrays --- Design_and_analysis.md | 65 ++++++++++++++++++++++++++-- main.c | 43 +++++++------------ versions/flattened_array.c | 88 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 31 deletions(-) create mode 100644 versions/flattened_array.c diff --git a/Design_and_analysis.md b/Design_and_analysis.md index 9d979be..75f3277 100644 --- a/Design_and_analysis.md +++ b/Design_and_analysis.md @@ -208,10 +208,69 @@ for (u = 0; u < DCT_SIZE; u++) { } } ``` -After running the changes in the simulation, the performance improved to 23697904 cycles. +After running the changes in the simulation, the performance improved to 26965608 cycles. -## Remove conditionals -## Flattening arrays and loops +## Flattening arrays +Flattening arrays is the process of storing a multidimensional array in a single dimension. +This creates a memory layout that is less jagged, leading to better cache performance and predictability. +It is also necessary for future implementation of vectorisation and compiler optimisations. + +The first step is to slightly change our data generation to generate a one-dimensional array. 
+The memory allocation, memory deallocation and data generation now look like this: + +```c +element_t** generate_mock_matrices() { + element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*)); + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); + } + + populate_mock_matrices(mock_matrices); + return mock_matrices; +} + +void free_mock_matrices(element_t** mock_matrices) { + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + free(mock_matrices[i]); + } + free(mock_matrices); +} + +void populate_mock_matrices(element_t** mock_matrices) { + for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for (int j = 0; j < DCT_SIZE; j++) { + for (int k = 0; k < DCT_SIZE; k++) { + mock_matrices[i][j * DCT_SIZE + k] = j + k; + } + } + } +} +``` + +The next step is to change the signature of the kernel function and update how the arrays are indexed. +```c + void dct_2d(element_t* matrix_in, element_t* matrix_out) { + real_t cu, cv, sum, cos_u, cos_v; + int u, v, i, j; + + for (u = 0; u < DCT_SIZE; u++) { + cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + for (v = 0; v < DCT_SIZE; v++) { + cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + sum = 0; + for (i = 0; i < DCT_SIZE; i++) { + cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; + for (j = 0; j < DCT_SIZE; j++) { + cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; + sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v; + } + } + matrix_out[u * DCT_SIZE + v] = cu * cv * sum; + } + } +} +``` +Not only does this enable further optimisations, but the performance also improved to 23667310 cycles. 
## Vectorisation ## Changing data types ## Compiler optimisations diff --git a/main.c b/main.c index f432a5d..25bf982 100644 --- a/main.c +++ b/main.c @@ -19,7 +19,7 @@ #define INV_SQRTDCT_SIZE (real_t) 0.3535533906 #define SQRT2_INV_SQRTDCT (real_t) 0.5 -void dct_2d(element_t** matrix_in, element_t** matrix_out) { +void dct_2d(element_t* matrix_in, element_t* matrix_out) { real_t cu, cv, sum, cos_u, cos_v; int u, v, i, j; @@ -32,68 +32,55 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) { cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; for (j = 0; j < DCT_SIZE; j++) { cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; - sum += matrix_in[i][j] * cos_u * cos_v; + sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v; } } - matrix_out[u][v] = cu * cv * sum; + matrix_out[u * DCT_SIZE + v] = cu * cv * sum; } } } -void populate_mock_matrices(element_t*** mock_matrices) { +void populate_mock_matrices(element_t** mock_matrices) { for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) { for (int j = 0; j < DCT_SIZE; j++) { for (int k = 0; k < DCT_SIZE; k++) { - mock_matrices[i][j][k] = j + k; + mock_matrices[i][j * DCT_SIZE + k] = j + k; } } } } -element_t*** generate_mock_matrices() { - element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**)); +element_t** generate_mock_matrices() { + element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*)); for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { - mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); - for (int j = 0; j < DCT_SIZE; j++) { - mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); - } + mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); } populate_mock_matrices(mock_matrices); return mock_matrices; } -void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) { +void free_mock_matrices(element_t** mock_matrices) { for (int i = 0; i < 
TOTAL_DCT_BLOCKS; i++) { - for (int j = 0; j < DCT_SIZE; j++) { - free(mock_matrices[i][j]); - } free(mock_matrices[i]); } free(mock_matrices); - } int main() { - element_t ***mock_matrices = generate_mock_matrices(); + element_t **mock_matrices = generate_mock_matrices(); int i; - - element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); - for (i = 0; i < DCT_SIZE; i++) { - matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); - } - + + element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); + for(i = 0; i < TOTAL_DCT_BLOCKS; i++) { dct_2d(mock_matrices[i], matrix_out); } + + free_mock_matrices(mock_matrices); - free_mock_matrices(mock_matrices, matrix_out); - - for (i = 0; i < DCT_SIZE; i++) { - free(matrix_out[i]); - } free(matrix_out); return 0; diff --git a/versions/flattened_array.c b/versions/flattened_array.c new file mode 100644 index 0000000..560122e --- /dev/null +++ b/versions/flattened_array.c @@ -0,0 +1,88 @@ +#include +#include +#include + +#define DCT_SIZE 8 +#define TOTAL_DCT_BLOCKS 100 + +#define element_t int16_t +#define real_t double + +#define DCT_COS_TABLE_SIZE 32 +// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE)) +#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \ + 1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \ + 0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \ + -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \ + -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \ + 0.707107, 0.83147, 0.92388, 0.980785 } + +#define INV_SQRTDCT_SIZE (real_t) 0.3535533906 +#define SQRT2_INV_SQRTDCT (real_t) 0.5 + +void dct_2d(element_t* matrix_in, element_t* matrix_out) { + real_t cu, cv, sum, cos_u, cos_v; + int u, v, i, j; + + for (u = 0; u < DCT_SIZE; u++) { + cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + for (v = 0; v < DCT_SIZE; v++) { + cv = v == 0 ? 
INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + sum = 0; + for (i = 0; i < DCT_SIZE; i++) { + cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; + for (j = 0; j < DCT_SIZE; j++) { + cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; + sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v; + } + } + matrix_out[u * DCT_SIZE + v] = cu * cv * sum; + } + } +} + + +void populate_mock_matrices(element_t** mock_matrices) { + for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for (int j = 0; j < DCT_SIZE; j++) { + for (int k = 0; k < DCT_SIZE; k++) { + mock_matrices[i][j * DCT_SIZE + k] = j + k; + } + } + } +} + + +element_t** generate_mock_matrices() { + element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*)); + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); + } + + populate_mock_matrices(mock_matrices); + return mock_matrices; +} + +void free_mock_matrices(element_t** mock_matrices) { + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + free(mock_matrices[i]); + } + free(mock_matrices); +} + +int main() { + element_t **mock_matrices = generate_mock_matrices(); + int i; + + element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t)); + + for(i = 0; i < TOTAL_DCT_BLOCKS; i++) { + dct_2d(mock_matrices[i], matrix_out); + } + + free_mock_matrices(mock_matrices); + + free(matrix_out); + + return 0; +}