Apply compile time optimisations

This commit is contained in:
Hugo Mårdbrink 2024-04-07 13:06:49 +02:00
parent 29b5365918
commit 33c18dbefa
4 changed files with 284 additions and 13 deletions

View file

@ -137,9 +137,80 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) {
}
}
```
This version serves as the baseline for further optimisations. When simulated, it took 62977442 cycles.
### Software optimisations
## Compile time constants
Looking at the naive implementation, we can see some low-hanging fruit that can easily be optimised by evaluating constants at compile time.
Firstly, we can precalculate the values of 1/sqrt(DCT_SIZE) and sqrt(2)/sqrt(DCT_SIZE) to avoid executing sqrt() at runtime, which is a costly operation.
After doing this we get the following constants:
```c
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
#define SQRT2_INV_SQRTDCT (real_t) 0.5
```
Because PI / (2 * DCT_SIZE) is a constant, every cos() argument is an integer multiple of it, and since cos() is periodic only the multiples 0 to 31 (one full period, DCT_COS_TABLE_SIZE entries) give distinct values.
These values can then be stored in an array to eliminate runtime calculations. This is also done at compile time, in the following way:
```c
#define DCT_COS_TABLE_SIZE 32
#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
-0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
-0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
0.707107, 0.83147, 0.92388, 0.980785 }
```
This changes the way the sum is calculated to the following:
```c
sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
```
After eliminating unnecessary calculations, we can move some of the remaining calculations to outer loops to reduce redundant work.
These are values computed in the inner loops of the algorithm that only change when an outer loop index changes,
but that are nevertheless recalculated on every inner iteration, leading to redundant operations.
```c
for (u = 0; u < DCT_SIZE; u++) {
for (v = 0; v < DCT_SIZE; v++) {
cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
sum = 0;
for (i = 0; i < DCT_SIZE; i++) {
for (j = 0; j < DCT_SIZE; j++) {
sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
}
}
matrix_out[u][v] = cu * cv * sum;
}
}
```
The first step is to move the cu assignment to the outer loop; this eliminates 7 redundant calculations of cu for every value of u.
Secondly, the sum calculation can be refactored to look up the cosine factor that depends on u once per iteration of the i loop, and only the factor that depends on v inside the innermost j loop.
By applying these changes we get the following code:
```c
for (u = 0; u < DCT_SIZE; u++) {
cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
for (v = 0; v < DCT_SIZE; v++) {
cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
sum = 0;
for (i = 0; i < DCT_SIZE; i++) {
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
for (j = 0; j < DCT_SIZE; j++) {
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
sum += matrix_in[i][j] * cos_u * cos_v;
}
}
matrix_out[u][v] = cu * cv * sum;
}
}
```
After running the changes in the simulation, the performance improved to 23697904 cycles.
## Remove conditionals
## Flattening arrays and loops
## Vectorisation
## Changing data types

34
main.c
View file

@ -1,28 +1,38 @@
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#define DCT_SIZE 8
#define TOTAL_DCT_BLOCKS 100
#define PI 3.14159265358979323846
#define element_t int16_t
#define real_t double
#define DCT_COS_TABLE_SIZE 32
// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE))
#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
-0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
-0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
0.707107, 0.83147, 0.92388, 0.980785 }
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
#define SQRT2_INV_SQRTDCT (real_t) 0.5
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
real_t cu, cv, sum;
real_t cu, cv, sum, cos_u, cos_v;
int u, v, i, j;
for (u = 0; u < DCT_SIZE; u++) {
cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
for (v = 0; v < DCT_SIZE; v++) {
cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
sum = 0;
for (i = 0; i < DCT_SIZE; i++) {
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
for (j = 0; j < DCT_SIZE; j++) {
sum += matrix_in[i][j] * cos((2 * i + 1) * u * PI / (2 * DCT_SIZE)) * cos((2 * j + 1) * v * PI / (2 * DCT_SIZE));
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
sum += matrix_in[i][j] * cos_u * cos_v;
}
}
matrix_out[u][v] = cu * cv * sum;
@ -52,7 +62,6 @@ element_t*** generate_mock_matrices() {
}
populate_mock_matrices(mock_matrices);
return mock_matrices;
}
@ -69,19 +78,20 @@ void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
int main() {
element_t ***mock_matrices = generate_mock_matrices();
int i;
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
for (int i = 0; i < DCT_SIZE; i++) {
for (i = 0; i < DCT_SIZE; i++) {
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
}
for(long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
dct_2d(mock_matrices[i], matrix_out);
}
free_mock_matrices(mock_matrices, matrix_out);
for (int i = 0; i < DCT_SIZE; i++) {
for (i = 0; i < DCT_SIZE; i++) {
free(matrix_out[i]);
}
free(matrix_out);

View file

@ -0,0 +1,100 @@
#include <stdint.h>
#include <stdlib.h>
/* Side length of one square DCT block (blocks are DCT_SIZE x DCT_SIZE). */
#define DCT_SIZE 8
/* Number of mock blocks allocated and transformed by main(). */
#define TOTAL_DCT_BLOCKS 100
/* Storage type of input samples and output coefficients. */
#define element_t int16_t
/* Floating-point type used for intermediate DCT arithmetic. */
#define real_t double

/*
 * Number of entries in the cosine table: one full period of
 * cos(k * PI / (2 * DCT_SIZE)), which for DCT_SIZE == 8 is k = 0 .. 31.
 * Must be kept equal to 4 * DCT_SIZE if DCT_SIZE is ever changed.
 */
#define DCT_COS_TABLE_SIZE 32

/*
 * dct_cos_table[k] = cos(k * PI / (2 * DCT_SIZE)) for DCT_SIZE == 8.
 *
 * The table is a single static const object rather than a compound literal
 * expanded at every use: a compound literal inside a function body is an
 * automatic object that is (semantically) re-initialised each time the
 * expression is evaluated, which would happen on every lookup in the
 * innermost loops. The entries are given to full double precision.
 */
static const real_t dct_cos_table[DCT_COS_TABLE_SIZE] = {
     1.0,                  0.98078528040323045,  0.92387953251128676,  0.83146961230254524,
     0.70710678118654752,  0.55557023301960222,  0.38268343236508977,  0.19509032201612827,
     0.0,                 -0.19509032201612827, -0.38268343236508977, -0.55557023301960222,
    -0.70710678118654752, -0.83146961230254524, -0.92387953251128676, -0.98078528040323045,
    -1.0,                 -0.98078528040323045, -0.92387953251128676, -0.83146961230254524,
    -0.70710678118654752, -0.55557023301960222, -0.38268343236508977, -0.19509032201612827,
     0.0,                  0.19509032201612827,  0.38268343236508977,  0.55557023301960222,
     0.70710678118654752,  0.83146961230254524,  0.92387953251128676,  0.98078528040323045
};
/* Alias so that code written against the DCT_COS_TABLE name keeps working. */
#define DCT_COS_TABLE dct_cos_table

/* 1 / sqrt(DCT_SIZE) and sqrt(2) / sqrt(DCT_SIZE) for DCT_SIZE == 8. */
#define INV_SQRTDCT_SIZE ((real_t) 0.3535533906)
#define SQRT2_INV_SQRTDCT ((real_t) 0.5)

/*
 * 2-D DCT of one DCT_SIZE x DCT_SIZE block using precomputed constants.
 *
 * matrix_in  - read as matrix_in[i][j] for 0 <= i, j < DCT_SIZE.
 * matrix_out - written as matrix_out[u][v] for 0 <= u, v < DCT_SIZE.
 *
 * For every output position (u, v) the function accumulates
 *   matrix_in[i][j] * cos((2i+1) * u * PI/16) * cos((2j+1) * v * PI/16)
 * over all (i, j), with the cosines taken from dct_cos_table, and scales the
 * sum by cu * cv, where each factor is INV_SQRTDCT_SIZE for index 0 and
 * SQRT2_INV_SQRTDCT otherwise.
 */
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
    for (int u = 0; u < DCT_SIZE; u++) {
        /* cu depends on u only, so it is selected once per output row. */
        const real_t cu = (u == 0) ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;

        for (int v = 0; v < DCT_SIZE; v++) {
            const real_t cv = (v == 0) ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
            real_t sum = 0;

            for (int i = 0; i < DCT_SIZE; i++) {
                /* The u-dependent cosine is constant across the j loop. */
                const real_t cos_u = dct_cos_table[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];

                for (int j = 0; j < DCT_SIZE; j++) {
                    const real_t cos_v = dct_cos_table[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
                    sum += matrix_in[i][j] * cos_u * cos_v;
                }
            }

            /* Converting the real_t result to element_t truncates toward zero. */
            matrix_out[u][v] = (element_t) (cu * cv * sum);
        }
    }
}
/*
 * Fill every mock block with the same ramp pattern: the entry at
 * (row, col) of each block is set to row + col.
 *
 * mock_matrices - TOTAL_DCT_BLOCKS blocks, each indexed [row][col] with
 *                 row, col < DCT_SIZE; all entries are overwritten.
 */
void populate_mock_matrices(element_t*** mock_matrices) {
    long block = 0;

    while (block < TOTAL_DCT_BLOCKS) {
        element_t **rows = mock_matrices[block];

        for (int row = 0; row < DCT_SIZE; row++) {
            element_t *cells = rows[row];

            for (int col = 0; col < DCT_SIZE; col++) {
                cells[col] = row + col;
            }
        }
        block++;
    }
}
/* Free the first `rows` rows of one block, then the block's row-pointer array. */
static void discard_mock_block(element_t **block, int rows) {
    for (int r = 0; r < rows; r++) {
        free(block[r]);
    }
    free(block);
}

/* Allocate one DCT_SIZE x DCT_SIZE block; returns NULL, leaking nothing, on failure. */
static element_t **allocate_mock_block(void) {
    element_t **block = malloc(DCT_SIZE * sizeof *block);

    if (block == NULL) {
        return NULL;
    }
    for (int r = 0; r < DCT_SIZE; r++) {
        block[r] = malloc(DCT_SIZE * sizeof *block[r]);
        if (block[r] == NULL) {
            /* Rows 0 .. r-1 were allocated successfully; release only those. */
            discard_mock_block(block, r);
            return NULL;
        }
    }
    return block;
}

/*
 * Allocate TOTAL_DCT_BLOCKS blocks of DCT_SIZE x DCT_SIZE elements and fill
 * them via populate_mock_matrices().
 *
 * Returns the array of blocks, or NULL if any allocation fails; in the
 * failure case everything allocated so far is released before returning.
 * On success the result is released with free_mock_matrices().
 */
element_t*** generate_mock_matrices(void) {
    element_t ***mock_matrices = malloc(TOTAL_DCT_BLOCKS * sizeof *mock_matrices);

    if (mock_matrices == NULL) {
        return NULL;
    }

    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        mock_matrices[i] = allocate_mock_block();
        if (mock_matrices[i] == NULL) {
            /* Unwind the blocks that were fully allocated before this one. */
            while (i-- > 0) {
                discard_mock_block(mock_matrices[i], DCT_SIZE);
            }
            free(mock_matrices);
            return NULL;
        }
    }

    populate_mock_matrices(mock_matrices);
    return mock_matrices;
}
/*
 * Release every row of every mock block, each block's row-pointer array,
 * and finally the array of blocks itself.
 *
 * mock_matrices - TOTAL_DCT_BLOCKS blocks of DCT_SIZE rows each.
 * matrix_out    - not used by this function; it is neither read nor freed here.
 */
void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
    int block = 0;

    (void) matrix_out;

    while (block < TOTAL_DCT_BLOCKS) {
        element_t **rows = mock_matrices[block];
        int row = 0;

        while (row < DCT_SIZE) {
            free(rows[row]);
            row++;
        }
        free(rows);
        block++;
    }
    free(mock_matrices);
}
/*
 * Benchmark driver: builds the mock input blocks, runs dct_2d() on each of
 * the TOTAL_DCT_BLOCKS blocks into one shared output block, then releases
 * all memory.
 *
 * Returns EXIT_SUCCESS (0) after a complete run, or EXIT_FAILURE if any
 * allocation fails; all memory obtained up to that point is released on
 * both paths.
 */
int main(void) {
    int status = EXIT_FAILURE;
    int rows_allocated = 0;
    int i;
    element_t **matrix_out = NULL;
    element_t ***mock_matrices = generate_mock_matrices();

    if (mock_matrices == NULL) {
        return EXIT_FAILURE;
    }

    matrix_out = malloc(DCT_SIZE * sizeof *matrix_out);
    if (matrix_out == NULL) {
        goto cleanup;
    }
    for (i = 0; i < DCT_SIZE; i++) {
        matrix_out[i] = malloc(DCT_SIZE * sizeof *matrix_out[i]);
        if (matrix_out[i] == NULL) {
            goto cleanup;
        }
        rows_allocated++;
    }

    for (i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        dct_2d(mock_matrices[i], matrix_out);
    }
    status = EXIT_SUCCESS;

cleanup:
    free_mock_matrices(mock_matrices, matrix_out);
    /* Only the rows that were actually allocated are released. */
    for (i = 0; i < rows_allocated; i++) {
        free(matrix_out[i]);
    }
    free(matrix_out);
    return status;
}

90
versions/naive.c Normal file
View file

@ -0,0 +1,90 @@
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
/* Side length of one square DCT block (blocks are DCT_SIZE x DCT_SIZE). */
#define DCT_SIZE 8
/* Number of mock blocks allocated and transformed by main(). */
#define TOTAL_DCT_BLOCKS 100
/* Value of pi used for the cosine arguments in dct_2d(). */
#define PI 3.14159265358979323846
/* Storage type of input samples and output coefficients. */
#define element_t int16_t
/* Floating-point type used for intermediate DCT arithmetic. */
#define real_t double
/*
 * Naive 2-D DCT of one DCT_SIZE x DCT_SIZE block.
 *
 * matrix_in  - read as matrix_in[y][x] for 0 <= y, x < DCT_SIZE.
 * matrix_out - written at every [row_freq][col_freq] position.
 *
 * Both scale factors and both cosine terms are evaluated with sqrt() and
 * cos() at the point of use, i.e. the scale factors once per output
 * coefficient and the cosines once per input sample.
 */
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
    for (int row_freq = 0; row_freq < DCT_SIZE; row_freq++) {
        for (int col_freq = 0; col_freq < DCT_SIZE; col_freq++) {
            real_t scale_row;
            real_t scale_col;
            real_t acc;

            if (row_freq == 0) {
                scale_row = 1 / sqrt(DCT_SIZE);
            } else {
                scale_row = sqrt(2) / sqrt(DCT_SIZE);
            }
            if (col_freq == 0) {
                scale_col = 1 / sqrt(DCT_SIZE);
            } else {
                scale_col = sqrt(2) / sqrt(DCT_SIZE);
            }

            acc = 0;
            for (int y = 0; y < DCT_SIZE; y++) {
                for (int x = 0; x < DCT_SIZE; x++) {
                    const real_t basis_row = cos((2 * y + 1) * row_freq * PI / (2 * DCT_SIZE));
                    const real_t basis_col = cos((2 * x + 1) * col_freq * PI / (2 * DCT_SIZE));

                    acc += matrix_in[y][x] * basis_row * basis_col;
                }
            }

            /* Assigning the real_t product to an element_t truncates toward zero. */
            matrix_out[row_freq][col_freq] = scale_row * scale_col * acc;
        }
    }
}
/*
 * Fill every mock block with the same ramp pattern: the entry at
 * (row, col) of each block is set to row + col.
 *
 * mock_matrices - TOTAL_DCT_BLOCKS blocks, each indexed [row][col] with
 *                 row, col < DCT_SIZE; all entries are overwritten.
 */
void populate_mock_matrices(element_t*** mock_matrices) {
    long block = 0;

    while (block < TOTAL_DCT_BLOCKS) {
        element_t **rows = mock_matrices[block];

        for (int row = 0; row < DCT_SIZE; row++) {
            element_t *cells = rows[row];

            for (int col = 0; col < DCT_SIZE; col++) {
                cells[col] = row + col;
            }
        }
        block++;
    }
}
/* Free the first `rows` rows of one block, then the block's row-pointer array. */
static void discard_mock_block(element_t **block, int rows) {
    for (int r = 0; r < rows; r++) {
        free(block[r]);
    }
    free(block);
}

/* Allocate one DCT_SIZE x DCT_SIZE block; returns NULL, leaking nothing, on failure. */
static element_t **allocate_mock_block(void) {
    element_t **block = malloc(DCT_SIZE * sizeof *block);

    if (block == NULL) {
        return NULL;
    }
    for (int r = 0; r < DCT_SIZE; r++) {
        block[r] = malloc(DCT_SIZE * sizeof *block[r]);
        if (block[r] == NULL) {
            /* Rows 0 .. r-1 were allocated successfully; release only those. */
            discard_mock_block(block, r);
            return NULL;
        }
    }
    return block;
}

/*
 * Allocate TOTAL_DCT_BLOCKS blocks of DCT_SIZE x DCT_SIZE elements and fill
 * them via populate_mock_matrices().
 *
 * Returns the array of blocks, or NULL if any allocation fails; in the
 * failure case everything allocated so far is released before returning.
 * On success the result is released with free_mock_matrices().
 */
element_t*** generate_mock_matrices(void) {
    element_t ***mock_matrices = malloc(TOTAL_DCT_BLOCKS * sizeof *mock_matrices);

    if (mock_matrices == NULL) {
        return NULL;
    }

    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        mock_matrices[i] = allocate_mock_block();
        if (mock_matrices[i] == NULL) {
            /* Unwind the blocks that were fully allocated before this one. */
            while (i-- > 0) {
                discard_mock_block(mock_matrices[i], DCT_SIZE);
            }
            free(mock_matrices);
            return NULL;
        }
    }

    populate_mock_matrices(mock_matrices);
    return mock_matrices;
}
/*
 * Release every row of every mock block, each block's row-pointer array,
 * and finally the array of blocks itself.
 *
 * mock_matrices - TOTAL_DCT_BLOCKS blocks of DCT_SIZE rows each.
 * matrix_out    - not used by this function; it is neither read nor freed here.
 */
void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
    int block = 0;

    (void) matrix_out;

    while (block < TOTAL_DCT_BLOCKS) {
        element_t **rows = mock_matrices[block];
        int row = 0;

        while (row < DCT_SIZE) {
            free(rows[row]);
            row++;
        }
        free(rows);
        block++;
    }
    free(mock_matrices);
}
/*
 * Benchmark driver: builds the mock input blocks, runs dct_2d() on each of
 * the TOTAL_DCT_BLOCKS blocks into one shared output block, then releases
 * all memory.
 *
 * Returns EXIT_SUCCESS (0) after a complete run, or EXIT_FAILURE if any
 * allocation fails; all memory obtained up to that point is released on
 * both paths.
 */
int main(void) {
    int status = EXIT_FAILURE;
    int rows_allocated = 0;
    element_t **matrix_out = NULL;
    element_t ***mock_matrices = generate_mock_matrices();

    if (mock_matrices == NULL) {
        return EXIT_FAILURE;
    }

    matrix_out = malloc(DCT_SIZE * sizeof *matrix_out);
    if (matrix_out == NULL) {
        goto cleanup;
    }
    for (int i = 0; i < DCT_SIZE; i++) {
        matrix_out[i] = malloc(DCT_SIZE * sizeof *matrix_out[i]);
        if (matrix_out[i] == NULL) {
            goto cleanup;
        }
        rows_allocated++;
    }

    for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        dct_2d(mock_matrices[i], matrix_out);
    }
    status = EXIT_SUCCESS;

cleanup:
    free_mock_matrices(mock_matrices, matrix_out);
    /* Only the rows that were actually allocated are released. */
    for (int i = 0; i < rows_allocated; i++) {
        free(matrix_out[i]);
    }
    free(matrix_out);
    return status;
}