Flatten arrays
This commit is contained in:
parent
33c18dbefa
commit
af34c63a49
3 changed files with 165 additions and 31 deletions
|
|
@ -208,10 +208,69 @@ for (u = 0; u < DCT_SIZE; u++) {
|
|||
}
|
||||
}
|
||||
```
|
||||
After running the changes in the simulation, the performance improved to 23697904 cycles.
|
||||
After running the changes in the simulation, the performance improved to 26965608 cycles.
|
||||
|
||||
## Remove conditionals
|
||||
## Flattening arrays and loops
|
||||
## Flattening arrays
|
||||
Flattening arrays is the process of storing a multidimensional array in a single dimension.
|
||||
This creates a memory layout that is less jagged, leading to better cache performance and predictability.
|
||||
It is also necessary for the future implementation of vectorisation and compiler optimisations.
|
||||
|
||||
The first step is to slightly change our data generation so that each matrix is generated as a one-dimensional array.
|
||||
The memory allocation, memory deallocation and data generation now look like this:
|
||||
|
||||
```c
|
||||
element_t** generate_mock_matrices() {
|
||||
element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
|
||||
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
|
||||
}
|
||||
|
||||
populate_mock_matrices(mock_matrices);
|
||||
return mock_matrices;
|
||||
}
|
||||
|
||||
// Releases every flattened block in mock_matrices and then the pointer array
// itself. Passing NULL is a no-op, mirroring free().
void free_mock_matrices(element_t** mock_matrices) {
    if (mock_matrices == NULL) {
        return;
    }

    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        free(mock_matrices[i]);
    }
    free(mock_matrices);
}
|
||||
|
||||
// Fills each of the TOTAL_DCT_BLOCKS flattened blocks with the same pattern:
// the element at (row, col) is set to row + col.
void populate_mock_matrices(element_t** mock_matrices) {
    for (long block = 0; block < TOTAL_DCT_BLOCKS; block++) {
        element_t *cells = mock_matrices[block];

        for (int row = 0; row < DCT_SIZE; row++) {
            // Start of this row inside the flat, row-major block.
            element_t *row_cells = cells + row * DCT_SIZE;

            for (int col = 0; col < DCT_SIZE; col++) {
                row_cells[col] = row + col;
            }
        }
    }
}
|
||||
```
|
||||
|
||||
The next step is to change the signature of the kernel function and the way the arrays are indexed.
|
||||
```c
|
||||
// 2D DCT of one DCT_SIZE x DCT_SIZE block.
//
// matrix_in and matrix_out are flat, row-major arrays of DCT_SIZE * DCT_SIZE
// elements; element (r, c) lives at index r * DCT_SIZE + c.
// Each output coefficient is computed in real_t and implicitly converted to
// element_t when stored.
void dct_2d(element_t* matrix_in, element_t* matrix_out) {
    for (int freq_row = 0; freq_row < DCT_SIZE; freq_row++) {
        // The first row/column of coefficients uses a different scale factor.
        real_t scale_u;
        if (freq_row == 0) {
            scale_u = INV_SQRTDCT_SIZE;
        } else {
            scale_u = SQRT2_INV_SQRTDCT;
        }

        element_t *out_row = matrix_out + freq_row * DCT_SIZE;

        for (int freq_col = 0; freq_col < DCT_SIZE; freq_col++) {
            real_t scale_v;
            if (freq_col == 0) {
                scale_v = INV_SQRTDCT_SIZE;
            } else {
                scale_v = SQRT2_INV_SQRTDCT;
            }

            real_t acc = 0;
            for (int row = 0; row < DCT_SIZE; row++) {
                // Table lookups wrap modulo DCT_COS_TABLE_SIZE.
                real_t basis_u = DCT_COS_TABLE[((2 * row + 1) * freq_row) % DCT_COS_TABLE_SIZE];
                element_t *in_row = matrix_in + row * DCT_SIZE;

                for (int col = 0; col < DCT_SIZE; col++) {
                    real_t basis_v = DCT_COS_TABLE[((2 * col + 1) * freq_col) % DCT_COS_TABLE_SIZE];
                    acc += in_row[col] * basis_u * basis_v;
                }
            }

            out_row[freq_col] = scale_u * scale_v * acc;
        }
    }
}
|
||||
```
|
||||
Not only does this enable further optimisations, but the performance also improved to 23667310 cycles.
|
||||
## Vectorisation
|
||||
## Changing data types
|
||||
## Compiler optimisations
|
||||
|
|
|
|||
43
main.c
43
main.c
|
|
@ -19,7 +19,7 @@
|
|||
#define INV_SQRTDCT_SIZE (real_t) 0.3535533906
|
||||
#define SQRT2_INV_SQRTDCT (real_t) 0.5
|
||||
|
||||
void dct_2d(element_t** matrix_in, element_t** matrix_out) {
|
||||
void dct_2d(element_t* matrix_in, element_t* matrix_out) {
|
||||
real_t cu, cv, sum, cos_u, cos_v;
|
||||
int u, v, i, j;
|
||||
|
||||
|
|
@ -32,68 +32,55 @@ void dct_2d(element_t** matrix_in, element_t** matrix_out) {
|
|||
cos_u = DCT_COS_TABLE[((2 * i + 1) * u) % DCT_COS_TABLE_SIZE];
|
||||
for (j = 0; j < DCT_SIZE; j++) {
|
||||
cos_v = DCT_COS_TABLE[((2 * j + 1) * v) % DCT_COS_TABLE_SIZE];
|
||||
sum += matrix_in[i][j] * cos_u * cos_v;
|
||||
sum += matrix_in[i * DCT_SIZE + j] * cos_u * cos_v;
|
||||
}
|
||||
}
|
||||
matrix_out[u][v] = cu * cv * sum;
|
||||
matrix_out[u * DCT_SIZE + v] = cu * cv * sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void populate_mock_matrices(element_t*** mock_matrices) {
|
||||
void populate_mock_matrices(element_t** mock_matrices) {
|
||||
for (long i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
for (int j = 0; j < DCT_SIZE; j++) {
|
||||
for (int k = 0; k < DCT_SIZE; k++) {
|
||||
mock_matrices[i][j][k] = j + k;
|
||||
mock_matrices[i][j * DCT_SIZE + k] = j + k;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
element_t*** generate_mock_matrices() {
|
||||
element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**));
|
||||
element_t** generate_mock_matrices() {
|
||||
element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
|
||||
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
mock_matrices[i] = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||
for (int j = 0; j < DCT_SIZE; j++) {
|
||||
mock_matrices[i][j] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||
}
|
||||
mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
|
||||
}
|
||||
|
||||
populate_mock_matrices(mock_matrices);
|
||||
return mock_matrices;
|
||||
}
|
||||
|
||||
void free_mock_matrices(element_t*** mock_matrices, element_t** matrix_out) {
|
||||
void free_mock_matrices(element_t** mock_matrices) {
|
||||
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
for (int j = 0; j < DCT_SIZE; j++) {
|
||||
free(mock_matrices[i][j]);
|
||||
}
|
||||
free(mock_matrices[i]);
|
||||
}
|
||||
free(mock_matrices);
|
||||
|
||||
}
|
||||
|
||||
int main() {
|
||||
element_t ***mock_matrices = generate_mock_matrices();
|
||||
element_t **mock_matrices = generate_mock_matrices();
|
||||
int i;
|
||||
|
||||
element_t** matrix_out = (element_t **) malloc(DCT_SIZE * sizeof(element_t*));
|
||||
for (i = 0; i < DCT_SIZE; i++) {
|
||||
matrix_out[i] = (element_t *) malloc(DCT_SIZE * sizeof(element_t));
|
||||
}
|
||||
|
||||
|
||||
element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
|
||||
|
||||
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
dct_2d(mock_matrices[i], matrix_out);
|
||||
}
|
||||
|
||||
free_mock_matrices(mock_matrices);
|
||||
|
||||
free_mock_matrices(mock_matrices, matrix_out);
|
||||
|
||||
for (i = 0; i < DCT_SIZE; i++) {
|
||||
free(matrix_out[i]);
|
||||
}
|
||||
free(matrix_out);
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
88
versions/flattened_array.c
Normal file
88
versions/flattened_array.c
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Side length of one square DCT block.
#define DCT_SIZE 8
// Number of mock blocks generated and transformed by the benchmark.
#define TOTAL_DCT_BLOCKS 100

// Storage type of the matrix elements and arithmetic type used by the kernel.
#define element_t int16_t
#define real_t double

#define DCT_COS_TABLE_SIZE 32
// DCT_COS_TABLE[i] = cos(i * PI / (2 * DCT_SIZE))
#define DCT_COS_TABLE (double[DCT_COS_TABLE_SIZE]) { \
    1, 0.980785, 0.92388, 0.83147, 0.707107, 0.55557, 0.382683, \
    0.19509, 0, -0.19509, -0.382683, -0.55557, -0.707107, -0.83147, \
    -0.92388, -0.980785, -1, -0.980785, -0.92388, -0.83147, -0.707107, \
    -0.55557, -0.382683, -0.19509, 0, 0.19509, 0.382683, 0.55557, \
    0.707107, 0.83147, 0.92388, 0.980785 }

// Scale factors: 1 / sqrt(DCT_SIZE) and sqrt(2) / sqrt(DCT_SIZE) for DCT_SIZE = 8.
// Fully parenthesised so the cast expressions expand safely in any context
// (e.g. as the operand of sizeof).
#define INV_SQRTDCT_SIZE ((real_t) 0.3535533906)
#define SQRT2_INV_SQRTDCT ((real_t) 0.5)
|
||||
|
||||
// 2D DCT of one DCT_SIZE x DCT_SIZE block.
//
// matrix_in and matrix_out are flat, row-major arrays of DCT_SIZE * DCT_SIZE
// elements; element (r, c) lives at index r * DCT_SIZE + c.
// Each output coefficient is computed in real_t and implicitly converted to
// element_t when stored.
void dct_2d(element_t* matrix_in, element_t* matrix_out) {
    for (int freq_row = 0; freq_row < DCT_SIZE; freq_row++) {
        // The first row/column of coefficients uses a different scale factor.
        real_t scale_u;
        if (freq_row == 0) {
            scale_u = INV_SQRTDCT_SIZE;
        } else {
            scale_u = SQRT2_INV_SQRTDCT;
        }

        element_t *out_row = matrix_out + freq_row * DCT_SIZE;

        for (int freq_col = 0; freq_col < DCT_SIZE; freq_col++) {
            real_t scale_v;
            if (freq_col == 0) {
                scale_v = INV_SQRTDCT_SIZE;
            } else {
                scale_v = SQRT2_INV_SQRTDCT;
            }

            real_t acc = 0;
            for (int row = 0; row < DCT_SIZE; row++) {
                // The cosine is periodic, so table lookups wrap modulo
                // DCT_COS_TABLE_SIZE.
                real_t basis_u = DCT_COS_TABLE[((2 * row + 1) * freq_row) % DCT_COS_TABLE_SIZE];
                element_t *in_row = matrix_in + row * DCT_SIZE;

                for (int col = 0; col < DCT_SIZE; col++) {
                    real_t basis_v = DCT_COS_TABLE[((2 * col + 1) * freq_col) % DCT_COS_TABLE_SIZE];
                    acc += in_row[col] * basis_u * basis_v;
                }
            }

            out_row[freq_col] = scale_u * scale_v * acc;
        }
    }
}
|
||||
|
||||
|
||||
// Fills each of the TOTAL_DCT_BLOCKS flattened blocks with the same pattern:
// the element at (row, col) is set to row + col.
void populate_mock_matrices(element_t** mock_matrices) {
    for (long block = 0; block < TOTAL_DCT_BLOCKS; block++) {
        element_t *cells = mock_matrices[block];

        for (int row = 0; row < DCT_SIZE; row++) {
            // Start of this row inside the flat, row-major block.
            element_t *row_cells = cells + row * DCT_SIZE;

            for (int col = 0; col < DCT_SIZE; col++) {
                row_cells[col] = row + col;
            }
        }
    }
}
|
||||
|
||||
|
||||
element_t** generate_mock_matrices() {
|
||||
element_t **mock_matrices = (element_t **) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t*));
|
||||
for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
mock_matrices[i] = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
|
||||
}
|
||||
|
||||
populate_mock_matrices(mock_matrices);
|
||||
return mock_matrices;
|
||||
}
|
||||
|
||||
// Releases every flattened block in mock_matrices and then the pointer array
// itself. Passing NULL is a no-op, mirroring free().
void free_mock_matrices(element_t** mock_matrices) {
    if (mock_matrices == NULL) {
        return;
    }

    for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) {
        free(mock_matrices[i]);
    }
    free(mock_matrices);
}
|
||||
|
||||
int main() {
|
||||
element_t **mock_matrices = generate_mock_matrices();
|
||||
int i;
|
||||
|
||||
element_t* matrix_out = (element_t *) malloc(DCT_SIZE * DCT_SIZE * sizeof(element_t));
|
||||
|
||||
for(i = 0; i < TOTAL_DCT_BLOCKS; i++) {
|
||||
dct_2d(mock_matrices[i], matrix_out);
|
||||
}
|
||||
|
||||
free_mock_matrices(mock_matrices);
|
||||
|
||||
free(matrix_out);
|
||||
|
||||
return 0;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue