Flatten arrays

This commit is contained in:
Hugo Mårdbrink 2024-04-08 12:33:21 +02:00
parent 33c18dbefa
commit af34c63a49
3 changed files with 165 additions and 31 deletions

View file

@ -208,10 +208,69 @@ for (u = 0; u < DCT_SIZE; u++) {
}
}
```
After running the changes in the simulation, the performance improved to 23697904 cycles.
After running the changes in the simulation, the performance improved to 26965608 cycles.
## Remove conditionals
## Flattening arrays and loops
## Flattening arrays
Flattening arrays is the process of storing a multidimensional array in a single dimension.
This creates a memory layout that is less jagged, leading to better cache performance and predictability.
It is also a prerequisite for the vectorisation and compiler optimisations implemented in later steps.
The first step is to slightly change our data generation so that each block is produced as a one-dimensional array.
The memory allocation, memory deallocation and data generation now look like this:
```c
// Allocates TOTAL_DCT_BLOCKS blocks and fills them with test data.
// Each block is ONE contiguous buffer of DCT_SIZE * DCT_SIZE elements
// (previously: DCT_SIZE separately allocated rows per block).
// Returns the array of block pointers; release it with free_mock_matrices().
// Note: the malloc results are not checked here.
element_t** generate_mock_matrices() {
// Outer array: one pointer per block.
element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
// A single allocation holds the whole flattened block.
mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
}
populate_mock_matrices(mock_matrices);
return mock_matrices;
}
// Releases everything allocated by generate_mock_matrices():
// first each flattened block, then the array of block pointers.
void free_mock_matrices(element_t** mock_matrices) {
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
// One free per block is enough now that a block is a single buffer.
free(mock_matrices[i]);
}
free(mock_matrices);
}
// Fills every block with the same pattern: the element at row j, column k
// gets the value j + k.
void populate_mock_matrices(element_t** mock_matrices) {
for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
for (int j = 0; j < DCT_SIZE; j++) {
for (int k = 0; k < DCT_SIZE; k++) {
// Row-major flattening: (row j, column k) lives at index j * DCT_SIZE + k.
mock_matrices[i][j * DCT_SIZE + k] = j + k;
}
}
}
}
```
The next step is to change the signature of the kernel function so that it takes flat arrays, and to update the array indexing accordingly.
```c
// 2D DCT of one block.
// matrix_in:  flattened DCT_SIZE x DCT_SIZE input block (row-major).
// matrix_out: flattened DCT_SIZE x DCT_SIZE output block (row-major),
//             every element is overwritten.
void dct_2d(element_t* matrix_in, element_t* matrix_out) {
real_t cu, cv, sum, cos_u, cos_v;
int u, v, i, j;
for (u = 0; u < DCT_SIZE; u++) {
// Scale factor for output row u (row 0 uses a different factor).
cu = u == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
for (v = 0; v < DCT_SIZE; v++) {
// Scale factor for output column v.
cv = v == 0 ? INV_SQRTDCT_SIZE : SQRT2_INV_SQRTDCT;
sum = 0;
for (i = 0; i < DCT_SIZE; i++) {
// The modulo wraps the index into the precomputed cosine table.
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
for (j = 0; j < DCT_SIZE; j++) {
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
// Flat index i * DCT_SIZE + j replaces the former matrix_in[i][j].
sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v;
}
}
// Flat index u * DCT_SIZE + v replaces the former matrix_out[u][v].
matrix_out[u * DCT_SIZE + v] = cu * cv * sum;
}
}
}
```
Not only does this enable further optimisations, but the performance also improved to 23667310 cycles.
## Vectorisation
## Changing data types
## Compiler optimisations

43
main.c
View file

@ -19,7 +19,7 @@
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
#define SQRT2_INV_SQRTDCT (real_t) 0.5
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
void dct_2d(element_t* matrix_in, element_t* matrix_out) {
real_t cu, cv, sum, cos_u, cos_v;
int u, v, i, j;
@ -32,68 +32,55 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) {
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
for (j = 0; j < DCT_SIZE; j++) {
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
sum += matrix_in[i][j] * cos_u * cos_v;
sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v;
}
}
matrix_out[u][v] = cu * cv * sum;
matrix_out[u * DCT_SIZE + v] = cu * cv * sum;
}
}
}
void populate_mock_matrices(element_t*** mock_matrices) {
void populate_mock_matrices(element_t** mock_matrices) {
for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
for (int j = 0; j < DCT_SIZE; j++) {
for (int k = 0; k < DCT_SIZE; k++) {
mock_matrices[i][j][k] = j + k;
mock_matrices[i][j * DCT_SIZE + k] = j + k;
}
}
}
}
element_t*** generate_mock_matrices() {
element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**));
element_t** generate_mock_matrices() {
element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
for (int j = 0; j < DCT_SIZE; j++) {
mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
}
mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
}
populate_mock_matrices(mock_matrices);
return mock_matrices;
}
void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
void free_mock_matrices(element_t** mock_matrices) {
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
for (int j = 0; j < DCT_SIZE; j++) {
free(mock_matrices[i][j]);
}
free(mock_matrices[i]);
}
free(mock_matrices);
}
int main() {
element_t ***mock_matrices = generate_mock_matrices();
element_t **mock_matrices = generate_mock_matrices();
int i;
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
for (i = 0; i < DCT_SIZE; i++) {
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
}
element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
dct_2d(mock_matrices[i], matrix_out);
}
free_mock_matrices(mock_matrices);
free_mock_matrices(mock_matrices, matrix_out);
for (i = 0; i < DCT_SIZE; i++) {
free(matrix_out[i]);
}
free(matrix_out);
return 0;

View file

@ -0,0 +1,88 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
// Side length of one square DCT block; a flattened block holds
// DCT_SIZE * DCT_SIZE elements.
#define DCT_SIZE 8
// Number of mock blocks that are generated and transformed.
#define TOTAL_DCT_BLOCKS 100
// Storage type of the block elements (input and output).
#define element_t int16_t
// Floating-point type used for the intermediate DCT arithmetic.
#define real_t double
// Number of entries in the cosine table; with a step of PI / (2 * DCT_SIZE)
// these 32 entries cover one full period of the cosine.
#define DCT_COS_TABLE_SIZE 32
// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE))
// Note: the macro expands to a compound literal at every use site.
#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
-0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
-0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
0.707107, 0.83147, 0.92388, 0.980785 }
// 1 / sqrt(DCT_SIZE) for DCT_SIZE == 8; scale factor used for index 0.
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
// sqrt(2) / sqrt(DCT_SIZE) for DCT_SIZE == 8; scale factor for all other indices.
#define SQRT2_INV_SQRTDCT (real_t) 0.5
/*
 * Computes the 2D DCT of a single block.
 *
 * Both blocks are flat, row-major arrays of DCT_SIZE * DCT_SIZE elements:
 * the element at (row, col) is stored at index row * DCT_SIZE + col.
 *
 * matrix_in   input block, only read.
 * matrix_out  output block, every element is overwritten. The real_t result
 *             is converted to element_t by the assignment.
 */
void dct_2d(element_t* matrix_in, element_t* matrix_out) {
    for (int freq_row = 0; freq_row < DCT_SIZE; freq_row++) {
        /* The first output row uses its own scale factor. */
        real_t scale_row;
        if (freq_row == 0) {
            scale_row = INV_SQRTDCT_SIZE;
        } else {
            scale_row = SQRT2_INV_SQRTDCT;
        }

        for (int freq_col = 0; freq_col < DCT_SIZE; freq_col++) {
            /* Same rule for the first output column. */
            real_t scale_col;
            if (freq_col == 0) {
                scale_col = INV_SQRTDCT_SIZE;
            } else {
                scale_col = SQRT2_INV_SQRTDCT;
            }

            real_t acc = 0;
            for (int row = 0; row < DCT_SIZE; row++) {
                /* The modulo wraps the angle index into the cosine table. */
                const int angle_row = ((2 * row + 1) * freq_row) % DCT_COS_TABLE_SIZE;
                const real_t cos_row = DCT_COS_TABLE[angle_row];
                const element_t *in_row = matrix_in + row * DCT_SIZE;

                for (int col = 0; col < DCT_SIZE; col++) {
                    const int angle_col = ((2 * col + 1) * freq_col) % DCT_COS_TABLE_SIZE;
                    const real_t cos_col = DCT_COS_TABLE[angle_col];
                    acc += in_row[col] * cos_row * cos_col;
                }
            }

            matrix_out[freq_row * DCT_SIZE + freq_col] = scale_row * scale_col * acc;
        }
    }
}
/*
 * Fills all TOTAL_DCT_BLOCKS blocks with the same deterministic pattern:
 * the element at (row, col) of each flattened block is set to row + col.
 *
 * mock_matrices  array of TOTAL_DCT_BLOCKS pointers, each to a buffer of
 *                DCT_SIZE * DCT_SIZE elements.
 */
void populate_mock_matrices(element_t** mock_matrices) {
    for (long block = 0; block < TOTAL_DCT_BLOCKS; block++) {
        element_t *cells = mock_matrices[block];
        /* Walk the flat buffer once; row and column are recovered from the index. */
        for (int idx = 0; idx < DCT_SIZE * DCT_SIZE; idx++) {
            cells[idx] = idx / DCT_SIZE + idx % DCT_SIZE;
        }
    }
}
element_t** generate_mock_matrices() {
element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
}
populate_mock_matrices(mock_matrices);
return mock_matrices;
}
/*
 * Releases the data created by generate_mock_matrices(): each of the
 * TOTAL_DCT_BLOCKS block buffers first, then the array of block pointers.
 *
 * mock_matrices  array of TOTAL_DCT_BLOCKS block pointers; unusable afterwards.
 */
void free_mock_matrices(element_t** mock_matrices) {
    element_t **cursor = mock_matrices;
    element_t **const end = mock_matrices + TOTAL_DCT_BLOCKS;

    while (cursor != end) {
        free(*cursor);
        cursor++;
    }

    free(mock_matrices);
}
int main() {
element_t **mock_matrices = generate_mock_matrices();
int i;
element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
dct_2d(mock_matrices[i], matrix_out);
}
free_mock_matrices(mock_matrices);
free(matrix_out);
return 0;
}