diff --git a/Design_and_analysis.md b/Design_and_analysis.md index 2107804..9d979be 100644 --- a/Design_and_analysis.md +++ b/Design_and_analysis.md @@ -137,9 +137,80 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) { } } ``` +This version will serve as a baseline for further optimisations; when simulated, it took 62977442 cycles. ### Software optimisations ## Compile time constants +Looking at the naive implementation we can see some low-hanging fruit that can be easily optimised by evaluating constants at compile time. +Firstly we can precalculate the values of 1/sqrt(DCT_SIZE) and sqrt(2)/sqrt(DCT_SIZE) to avoid executing sqrt() at runtime, which is a costly operation. +After doing this we get the following constants: + +```c +#define INV_SQRTDCT_SIZE (real_t) 0.3535533906 +#define SQRT2_INV_SQRTDCT (real_t) 0.5 +``` + +Because PI / (2 * DCT_SIZE) is a constant, we can precalculate cos() for every integer multiple of it from 0 to 31; the values repeat after 32 multiples (a full period of 2 * PI), so any larger multiple can be reduced modulo 32. +These values can then be stored in an array to eliminate runtime calculations. This is also done at compile time in the following way: + +```c +#define DCT_COS_TABLE_SIZE 32 + +#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \ + 1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \ + 0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \ + -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \ + -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \ + 0.707107, 0.83147, 0.92388, 0.980785 } +``` +This changes the way the sum is calculated to the following: +```c +sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; +``` + +After eliminating unnecessary calculations we can move some calculations to outer loops to avoid repeating them. 
+These are found in the inner loops of the algorithm: they only need to be recalculated on each iteration of an outer loop, +but are instead recalculated in the inner loops, leading to redundant operations. +```c +for (u = 0; u < DCT_SIZE; u++) { + for (v = 0; v < DCT_SIZE; v++) { + cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); + cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); + + sum = 0; + for (i = 0; i < DCT_SIZE; i++) { + for (j = 0; j < DCT_SIZE; j++) { + sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; + } + } + matrix_out[u][v] = cu * cv * sum; + } +} +``` +The first step is to move the cu assignment to the outer loop; this eliminates 7 redundant calculations of cu for each value of u. +Secondly the sum calculation can be refactored to look up the cosine for the u term once per iteration of the i loop, leaving only the lookup for the v term in the innermost j loop. +By applying these changes we get the following code: + +```c +for (u = 0; u < DCT_SIZE; u++) { + cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + for (v = 0; v < DCT_SIZE; v++) { + cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + sum = 0; + for (i = 0; i < DCT_SIZE; i++) { + cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; + for (j = 0; j < DCT_SIZE; j++) { + cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; + sum += matrix_in[i][j] * cos_u * cos_v; + } + } + matrix_out[u][v] = cu * cv * sum; + } +} +``` +After running the changes in the simulation, the performance improved to 23697904 cycles. 
+ +## Remove conditionals ## Flattening arrays and loops ## Vectorisation ## Changing data types diff --git a/main.c b/main.c index 36362a1..f432a5d 100644 --- a/main.c +++ b/main.c @@ -1,28 +1,38 @@ -#include #include #include #define DCT_SIZE 8 #define TOTAL_DCT_BLOCKS 100 -#define PI 3.14159265358979323846 - #define element_t int16_t #define real_t double +#define DCT_COS_TABLE_SIZE 32 +// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE)) +#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \ + 1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \ + 0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \ + -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \ + -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \ + 0.707107, 0.83147, 0.92388, 0.980785 } + +#define INV_SQRTDCT_SIZE (real_t) 0.3535533906 +#define SQRT2_INV_SQRTDCT (real_t) 0.5 + void dct_2d(element_t** matrix_in, element_t** matrix_out) { - real_t cu, cv, sum; + real_t cu, cv, sum, cos_u, cos_v; int u, v, i, j; for (u = 0; u < DCT_SIZE; u++) { + cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; for (v = 0; v < DCT_SIZE; v++) { - cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); - cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); - + cv = v == 0 ? 
INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; sum = 0; for (i = 0; i < DCT_SIZE; i++) { - for (j = 0; j < DCT_SIZE; j++) { - sum += matrix_in[i][j] * cos((2 * i + 1) * u * PI / (2 * DCT_SIZE)) * cos((2 * j + 1) * v * PI / (2 * DCT_SIZE)); + cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; + for (j = 0; j < DCT_SIZE; j++) { + cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; + sum += matrix_in[i][j] * cos_u * cos_v; } } matrix_out[u][v] = cu * cv * sum; @@ -52,7 +62,6 @@ element_t*** generate_mock_matrices() { } populate_mock_matrices(mock_matrices); - return mock_matrices; } @@ -69,19 +78,20 @@ void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) { int main() { element_t ***mock_matrices = generate_mock_matrices(); + int i; element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); - for (int i = 0; i < DCT_SIZE; i++) { + for (i = 0; i < DCT_SIZE; i++) { matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); } - for(long i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for(i = 0; i < TOTAL_DCT_BLOCKS; i++) { dct_2d(mock_matrices[i], matrix_out); } free_mock_matrices(mock_matrices, matrix_out); - for (int i = 0; i < DCT_SIZE; i++) { + for (i = 0; i < DCT_SIZE; i++) { free(matrix_out[i]); } free(matrix_out); diff --git a/versions/comptime_constants.c b/versions/comptime_constants.c new file mode 100644 index 0000000..f432a5d --- /dev/null +++ b/versions/comptime_constants.c @@ -0,0 +1,100 @@ +#include +#include + +#define DCT_SIZE 8 +#define TOTAL_DCT_BLOCKS 100 + +#define element_t int16_t +#define real_t double + +#define DCT_COS_TABLE_SIZE 32 +// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE)) +#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \ + 1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \ + 0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \ + -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \ + -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 
0.55557, \ + 0.707107, 0.83147, 0.92388, 0.980785 } + +#define INV_SQRTDCT_SIZE (real_t) 0.3535533906 +#define SQRT2_INV_SQRTDCT (real_t) 0.5 + +void dct_2d(element_t** matrix_in, element_t** matrix_out) { + real_t cu, cv, sum, cos_u, cos_v; + int u, v, i, j; + + for (u = 0; u < DCT_SIZE; u++) { + cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + for (v = 0; v < DCT_SIZE; v++) { + cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT; + sum = 0; + for (i = 0; i < DCT_SIZE; i++) { + cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE]; + for (j = 0; j < DCT_SIZE; j++) { + cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE]; + sum += matrix_in[i][j] * cos_u * cos_v; + } + } + matrix_out[u][v] = cu * cv * sum; + } + } +} + + +void populate_mock_matrices(element_t*** mock_matrices) { + for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for (int j = 0; j < DCT_SIZE; j++) { + for (int k = 0; k < DCT_SIZE; k++) { + mock_matrices[i][j][k] = j + k; + } + } + } +} + + +element_t*** generate_mock_matrices() { + element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**)); + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); + for (int j = 0; j < DCT_SIZE; j++) { + mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); + } + } + + populate_mock_matrices(mock_matrices); + return mock_matrices; +} + +void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) { + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for (int j = 0; j < DCT_SIZE; j++) { + free(mock_matrices[i][j]); + } + free(mock_matrices[i]); + } + free(mock_matrices); + +} + +int main() { + element_t ***mock_matrices = generate_mock_matrices(); + int i; + + element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); + for (i = 0; i < DCT_SIZE; i++) { + matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); + } + + for(i = 0; i < 
TOTAL_DCT_BLOCKS; i++) { + dct_2d(mock_matrices[i], matrix_out); + } + + free_mock_matrices(mock_matrices, matrix_out); + + for (i = 0; i < DCT_SIZE; i++) { + free(matrix_out[i]); + } + free(matrix_out); + + return 0; +} diff --git a/versions/naive.c b/versions/naive.c new file mode 100644 index 0000000..36362a1 --- /dev/null +++ b/versions/naive.c @@ -0,0 +1,90 @@ +#include +#include +#include + +#define DCT_SIZE 8 +#define TOTAL_DCT_BLOCKS 100 + +#define PI 3.14159265358979323846 + +#define element_t int16_t +#define real_t double + +void dct_2d(element_t** matrix_in, element_t** matrix_out) { + real_t cu, cv, sum; + int u, v, i, j; + + for (u = 0; u < DCT_SIZE; u++) { + for (v = 0; v < DCT_SIZE; v++) { + cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); + cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE); + + sum = 0; + for (i = 0; i < DCT_SIZE; i++) { + for (j = 0; j < DCT_SIZE; j++) { + sum += matrix_in[i][j] * cos((2 * i + 1) * u * PI / (2 * DCT_SIZE)) * cos((2 * j + 1) * v * PI / (2 * DCT_SIZE)); + } + } + matrix_out[u][v] = cu * cv * sum; + } + } +} + + +void populate_mock_matrices(element_t*** mock_matrices) { + for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for (int j = 0; j < DCT_SIZE; j++) { + for (int k = 0; k < DCT_SIZE; k++) { + mock_matrices[i][j][k] = j + k; + } + } + } +} + + +element_t*** generate_mock_matrices() { + element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**)); + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); + for (int j = 0; j < DCT_SIZE; j++) { + mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); + } + } + + populate_mock_matrices(mock_matrices); + + return mock_matrices; +} + +void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) { + for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { + for (int j = 0; j < DCT_SIZE; j++) { + free(mock_matrices[i][j]); + } + 
free(mock_matrices[i]); + } + free(mock_matrices); + +} + +int main() { + element_t ***mock_matrices = generate_mock_matrices(); + + element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*)); + for (int i = 0; i < DCT_SIZE; i++) { + matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t)); + } + + for(long i = 0; i < TOTAL_DCT_BLOCKS; i++) { + dct_2d(mock_matrices[i], matrix_out); + } + + free_mock_matrices(mock_matrices, matrix_out); + + for (int i = 0; i < DCT_SIZE; i++) { + free(matrix_out[i]); + } + free(matrix_out); + + return 0; +}