Apply compile time optimisations
This commit is contained in:
parent
29b5365918
commit
33c18dbefa
4 changed files with 284 additions and 13 deletions
|
|
@ -137,9 +137,80 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
This version serves as the baseline for further optimisations; when simulated, it took 62977442 cycles.
|
||||||
|
|
||||||
### Software optimisations
|
### Software optimisations
|
||||||
## Compile time constants
|
## Compile time constants
|
||||||
|
Looking at the naive implementation, we can see some low-hanging fruit that can easily be optimised by evaluating constants at compile time.
|
||||||
|
Firstly, we can precalculate the values of 1/sqrt(DCT_SIZE) and sqrt(2)/sqrt(DCT_SIZE) to avoid executing sqrt() at runtime, which is a costly operation.
|
||||||
|
After doing this we get the following constants:
|
||||||
|
|
||||||
|
```c
|
||||||
|
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
|
||||||
|
#define SQRT2_INV_SQRTDCT (real_t) 0.5
|
||||||
|
```
|
||||||
|
|
||||||
|
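Because PI / (2 * DCT_SIZE) is a constant, every cos() argument is an integer multiple of it, and since cos() is periodic only the multiples 0 to 31 yield distinct table entries (for DCT_SIZE = 8 the values repeat every 32 multiples).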
|
||||||
|
These values can then be stored in an array to eliminate the runtime calculations. This is also done at compile time, in the following way:
|
||||||
|
|
||||||
|
```c
|
||||||
|
#define DCT_COS_TABLE_SIZE 32
|
||||||
|
|
||||||
|
#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
|
||||||
|
1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
|
||||||
|
0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
|
||||||
|
-0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
|
||||||
|
-0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
|
||||||
|
0.707107, 0.83147, 0.92388, 0.980785 }
|
||||||
|
```
|
||||||
|
This changes the way the sum is calculated to the following:
|
||||||
|
```c
|
||||||
|
sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
|
||||||
|
```
|
||||||
|
|
||||||
|
After eliminating unnecessary calculations we can move some calculation to other loops to reduce redundant calculations.
|
||||||
|
These are found in the inner loops of the algorithm: they only need to be recalculated once per outer-loop iteration,
|
||||||
|
but are instead recalculated in the inner loop, leading to redundant operations.
|
||||||
|
```c
|
||||||
|
for (u = 0; u < DCT_SIZE; u++) {
|
||||||
|
for (v = 0; v < DCT_SIZE; v++) {
|
||||||
|
cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
|
||||||
|
cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
|
||||||
|
|
||||||
|
sum = 0;
|
||||||
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
|
for (j = 0; j < DCT_SIZE; j++) {
|
||||||
|
sum += matrix_in[i][j] * DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE] * DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
matrix_out[u][v] = cu * cv * sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
The first step is to move the cu assignment to the outer loop; this eliminates 7 redundant calculations of cu for every value of u.
|
||||||
|
Secondly, the sum calculation can be refactored to look up the cosine that depends on u once in the outer summation loop (over i) and the cosine that depends on v in the inner loop (over j).
|
||||||
|
By applying these changes we get the following code:
|
||||||
|
|
||||||
|
```c
|
||||||
|
for (u = 0; u < DCT_SIZE; u++) {
|
||||||
|
cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
|
||||||
|
for (v = 0; v < DCT_SIZE; v++) {
|
||||||
|
cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
|
||||||
|
sum = 0;
|
||||||
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
|
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
|
||||||
|
for (j = 0; j < DCT_SIZE; j++) {
|
||||||
|
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
|
||||||
|
sum += matrix_in[i][j] * cos_u * cos_v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
matrix_out[u][v] = cu * cv * sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
After running the changes in the simulation, the performance improved to 23697904 cycles.
|
||||||
|
|
||||||
|
## Remove conditionals
|
||||||
## Flattening arrays and loops
|
## Flattening arrays and loops
|
||||||
## Vectorisation
|
## Vectorisation
|
||||||
## Changing data types
|
## Changing data types
|
||||||
|
|
|
||||||
36
main.c
36
main.c
|
|
@ -1,28 +1,38 @@
|
||||||
#include <math.h>
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#define DCT_SIZE 8
|
#define DCT_SIZE 8
|
||||||
#define TOTAL_DCT_BLOCKS 100
|
#define TOTAL_DCT_BLOCKS 100
|
||||||
|
|
||||||
#define PI 3.14159265358979323846
|
|
||||||
|
|
||||||
#define element_t int16_t
|
#define element_t int16_t
|
||||||
#define real_t double
|
#define real_t double
|
||||||
|
|
||||||
|
#define DCT_COS_TABLE_SIZE 32
|
||||||
|
// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE))
|
||||||
|
#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
|
||||||
|
1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
|
||||||
|
0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
|
||||||
|
-0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
|
||||||
|
-0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
|
||||||
|
0.707107, 0.83147, 0.92388, 0.980785 }
|
||||||
|
|
||||||
|
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
|
||||||
|
#define SQRT2_INV_SQRTDCT (real_t) 0.5
|
||||||
|
|
||||||
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
|
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
|
||||||
real_t cu, cv, sum;
|
real_t cu, cv, sum, cos_u, cos_v;
|
||||||
int u, v, i, j;
|
int u, v, i, j;
|
||||||
|
|
||||||
for (u = 0; u < DCT_SIZE; u++) {
|
for (u = 0; u < DCT_SIZE; u++) {
|
||||||
|
cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
|
||||||
for (v = 0; v < DCT_SIZE; v++) {
|
for (v = 0; v < DCT_SIZE; v++) {
|
||||||
cu = u == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
|
cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
|
||||||
cv = v == 0 ? 1 / sqrt(DCT_SIZE) : sqrt(2) / sqrt(DCT_SIZE);
|
|
||||||
|
|
||||||
sum = 0;
|
sum = 0;
|
||||||
for (i = 0; i < DCT_SIZE; i++) {
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
for (j = 0; j < DCT_SIZE; j++) {
|
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
|
||||||
sum += matrix_in[i][j] * cos((2 * i + 1) * u * PI / (2 * DCT_SIZE)) * cos((2 * j + 1) * v * PI / (2 * DCT_SIZE));
|
for (j = 0; j < DCT_SIZE; j++) {
|
||||||
|
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
|
||||||
|
sum += matrix_in[i][j] * cos_u * cos_v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
matrix_out[u][v] = cu * cv * sum;
|
matrix_out[u][v] = cu * cv * sum;
|
||||||
|
|
@ -52,7 +62,6 @@ element_t*** generate_mock_matrices() {
|
||||||
}
|
}
|
||||||
|
|
||||||
populate_mock_matrices(mock_matrices);
|
populate_mock_matrices(mock_matrices);
|
||||||
|
|
||||||
return mock_matrices;
|
return mock_matrices;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -69,19 +78,20 @@ void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
element_t ***mock_matrices = generate_mock_matrices();
|
element_t ***mock_matrices = generate_mock_matrices();
|
||||||
|
int i;
|
||||||
|
|
||||||
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||||
for (int i = 0; i < DCT_SIZE; i++) {
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
for(long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||||
dct_2d(mock_matrices[i], matrix_out);
|
dct_2d(mock_matrices[i], matrix_out);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_mock_matrices(mock_matrices, matrix_out);
|
free_mock_matrices(mock_matrices, matrix_out);
|
||||||
|
|
||||||
for (int i = 0; i < DCT_SIZE; i++) {
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
free(matrix_out[i]);
|
free(matrix_out[i]);
|
||||||
}
|
}
|
||||||
free(matrix_out);
|
free(matrix_out);
|
||||||
|
|
|
||||||
100
versions/comptime_constants.c
Normal file
100
versions/comptime_constants.c
Normal file
|
|
@ -0,0 +1,100 @@
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#define DCT_SIZE 8
|
||||||
|
#define TOTAL_DCT_BLOCKS 100
|
||||||
|
|
||||||
|
#define element_t int16_t
|
||||||
|
#define real_t double
|
||||||
|
|
||||||
|
/* Number of table entries: one full period of cos(k * PI / (2 * DCT_SIZE)) for DCT_SIZE = 8. */
#define DCT_COS_TABLE_SIZE 32

/*
 * dct_cos_table[k] = cos(k * PI / (2 * DCT_SIZE)), k = 0 .. DCT_COS_TABLE_SIZE - 1.
 *
 * Stored as a single read-only object with full double precision. The values
 * were previously truncated to six digits and wrapped in a compound literal,
 * which is a fresh automatic object at every place the macro is expanded.
 */
static const double dct_cos_table[DCT_COS_TABLE_SIZE] = {
     1.0,
     0.98078528040323044,  0.92387953251128676,  0.83146961230254524,
     0.70710678118654752,  0.55557023301960222,  0.38268343236508977,
     0.19509032201612827,
     0.0,
    -0.19509032201612827, -0.38268343236508977, -0.55557023301960222,
    -0.70710678118654752, -0.83146961230254524, -0.92387953251128676,
    -0.98078528040323044,
    -1.0,
    -0.98078528040323044, -0.92387953251128676, -0.83146961230254524,
    -0.70710678118654752, -0.55557023301960222, -0.38268343236508977,
    -0.19509032201612827,
     0.0,
     0.19509032201612827,  0.38268343236508977,  0.55557023301960222,
     0.70710678118654752,  0.83146961230254524,  0.92387953251128676,
     0.98078528040323044
};

/* Kept as a macro so existing `DCT_COS_TABLE[index]` expressions keep working. */
#define DCT_COS_TABLE dct_cos_table

/* 1 / sqrt(DCT_SIZE) for DCT_SIZE = 8; parenthesised so the cast binds to the literal only. */
#define INV_SQRTDCT_SIZE ((real_t) 0.35355339059327376)
/* sqrt(2) / sqrt(DCT_SIZE) for DCT_SIZE = 8. */
#define SQRT2_INV_SQRTDCT ((real_t) 0.5)
|
||||||
|
|
||||||
|
/*
 * Compute the 2-D DCT of one DCT_SIZE x DCT_SIZE block.
 *
 * matrix_in  - input block, indexed as matrix_in[i][j]; only read.
 * matrix_out - output block, indexed as matrix_out[u][v]; every element is
 *              overwritten.
 *
 * Cosines come from DCT_COS_TABLE (entry k holds cos(k * PI / (2 * DCT_SIZE)));
 * the index is reduced modulo DCT_COS_TABLE_SIZE because the table covers
 * exactly one period.
 */
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
    /*
     * Bind the table once per call. When DCT_COS_TABLE expands to a compound
     * literal, that literal is an automatic object that may be re-initialised
     * every time the expression is evaluated; referencing it inside the loops
     * can therefore rebuild the whole table on each lookup.
     */
    const double *cos_table = DCT_COS_TABLE;
    int u, v, i, j;

    for (u = 0; u < DCT_SIZE; u++) {
        /* Scale factor for row frequency u; constant across the whole v loop. */
        const real_t cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;

        for (v = 0; v < DCT_SIZE; v++) {
            const real_t cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
            real_t sum = 0;

            for (i = 0; i < DCT_SIZE; i++) {
                /* Depends only on (i, u), so it is looked up once per row. */
                const real_t cos_u = cos_table[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];

                for (j = 0; j < DCT_SIZE; j++) {
                    const real_t cos_v = cos_table[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];

                    sum += matrix_in[i][j] * cos_u * cos_v;
                }
            }

            /* Narrowing store made explicit: the real-valued result is converted to element_t. */
            matrix_out[u][v] = (element_t) (cu * cv * sum);
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Fill every mock block with the same deterministic pattern: the element at
 * (row, column) is set to row + column.
 *
 * mock_matrices - TOTAL_DCT_BLOCKS blocks of DCT_SIZE x DCT_SIZE elements,
 *                 all of which must already be allocated.
 */
void populate_mock_matrices(element_t*** mock_matrices) {
    for (long block = 0; block < TOTAL_DCT_BLOCKS; block++) {
        element_t **rows = mock_matrices[block];

        for (int r = 0; r < DCT_SIZE; r++) {
            element_t *row = rows[r];

            for (int c = 0; c < DCT_SIZE; c++) {
                row[c] = r + c;
            }
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
element_t*** generate_mock_matrices() {
|
||||||
|
element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**));
|
||||||
|
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||||
|
mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||||
|
for (int j = 0; j < DCT_SIZE; j++) {
|
||||||
|
mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
populate_mock_matrices(mock_matrices);
|
||||||
|
return mock_matrices;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Release the blocks created by generate_mock_matrices().
 *
 * mock_matrices - array of TOTAL_DCT_BLOCKS blocks; NULL is accepted and
 *                 ignored, as are NULL block entries.
 * matrix_out    - accepted for interface compatibility only; it is neither
 *                 read nor freed here.
 */
void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
    (void) matrix_out; /* intentionally unused */

    if (mock_matrices == NULL) {
        return;
    }

    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        if (mock_matrices[i] == NULL) {
            continue;
        }
        for (int j = 0; j < DCT_SIZE; j++) {
            free(mock_matrices[i][j]);
        }
        free(mock_matrices[i]);
    }
    free(mock_matrices);
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
element_t ***mock_matrices = generate_mock_matrices();
|
||||||
|
int i;
|
||||||
|
|
||||||
|
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||||
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
|
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||||
|
}
|
||||||
|
|
||||||
|
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||||
|
dct_2d(mock_matrices[i], matrix_out);
|
||||||
|
}
|
||||||
|
|
||||||
|
free_mock_matrices(mock_matrices, matrix_out);
|
||||||
|
|
||||||
|
for (i = 0; i < DCT_SIZE; i++) {
|
||||||
|
free(matrix_out[i]);
|
||||||
|
}
|
||||||
|
free(matrix_out);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
90
versions/naive.c
Normal file
90
versions/naive.c
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#define DCT_SIZE 8
|
||||||
|
#define TOTAL_DCT_BLOCKS 100
|
||||||
|
|
||||||
|
#define PI 3.14159265358979323846
|
||||||
|
|
||||||
|
#define element_t int16_t
|
||||||
|
#define real_t double
|
||||||
|
|
||||||
|
/*
 * Naive 2-D DCT of one DCT_SIZE x DCT_SIZE block (baseline version).
 *
 * matrix_in  - input block, indexed as matrix_in[i][j]; only read.
 * matrix_out - output block, indexed as matrix_out[u][v]; every element is
 *              overwritten.
 *
 * Scale factors and both cosine terms are deliberately evaluated at the point
 * of use, with sqrt() and cos() called inside the loops.
 */
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
    int u, v, i, j;

    for (u = 0; u < DCT_SIZE; u++) {
        for (v = 0; v < DCT_SIZE; v++) {
            real_t cu, cv;
            real_t sum = 0;

            if (u == 0) {
                cu = 1 / sqrt(DCT_SIZE);
            } else {
                cu = sqrt(2) / sqrt(DCT_SIZE);
            }
            if (v == 0) {
                cv = 1 / sqrt(DCT_SIZE);
            } else {
                cv = sqrt(2) / sqrt(DCT_SIZE);
            }

            for (i = 0; i < DCT_SIZE; i++) {
                for (j = 0; j < DCT_SIZE; j++) {
                    /* Both terms are evaluated per element, as in the baseline. */
                    const double row_term = cos((2 * i + 1) * u * PI / (2 * DCT_SIZE));
                    const double col_term = cos((2 * j + 1) * v * PI / (2 * DCT_SIZE));

                    sum += matrix_in[i][j] * row_term * col_term;
                }
            }

            matrix_out[u][v] = cu * cv * sum;
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Write the deterministic test pattern into every mock block: the element at
 * (row, column) becomes row + column.
 *
 * mock_matrices - TOTAL_DCT_BLOCKS already-allocated blocks of
 *                 DCT_SIZE x DCT_SIZE elements.
 */
void populate_mock_matrices(element_t*** mock_matrices) {
    long block = 0;

    while (block < TOTAL_DCT_BLOCKS) {
        for (int r = 0; r < DCT_SIZE; r++) {
            element_t *row = mock_matrices[block][r];

            for (int c = 0; c < DCT_SIZE; c++) {
                row[c] = r + c;
            }
        }
        block++;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
element_t*** generate_mock_matrices() {
|
||||||
|
element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**));
|
||||||
|
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||||
|
mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||||
|
for (int j = 0; j < DCT_SIZE; j++) {
|
||||||
|
mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
populate_mock_matrices(mock_matrices);
|
||||||
|
|
||||||
|
return mock_matrices;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Release the blocks created by generate_mock_matrices().
 *
 * mock_matrices - array of TOTAL_DCT_BLOCKS blocks; NULL is accepted and
 *                 ignored, as are NULL block entries.
 * matrix_out    - accepted for interface compatibility only; it is neither
 *                 read nor freed here.
 */
void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
    (void) matrix_out; /* intentionally unused */

    if (mock_matrices == NULL) {
        return;
    }

    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        if (mock_matrices[i] == NULL) {
            continue;
        }
        for (int j = 0; j < DCT_SIZE; j++) {
            free(mock_matrices[i][j]);
        }
        free(mock_matrices[i]);
    }
    free(mock_matrices);
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
element_t ***mock_matrices = generate_mock_matrices();
|
||||||
|
|
||||||
|
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||||
|
for (int i = 0; i < DCT_SIZE; i++) {
|
||||||
|
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||||
|
}
|
||||||
|
|
||||||
|
for(long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||||
|
dct_2d(mock_matrices[i], matrix_out);
|
||||||
|
}
|
||||||
|
|
||||||
|
free_mock_matrices(mock_matrices, matrix_out);
|
||||||
|
|
||||||
|
for (int i = 0; i < DCT_SIZE; i++) {
|
||||||
|
free(matrix_out[i]);
|
||||||
|
}
|
||||||
|
free(matrix_out);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue