diff --git a/src/heat/common/heat.h b/src/heat/common/heat.h index 1a2f5ac..696b28c 100644 --- a/src/heat/common/heat.h +++ b/src/heat/common/heat.h @@ -53,6 +53,6 @@ double getTime(void); const char *summary(void); double solve(HeatConfiguration *conf, int64_t rows, int64_t cols, int timesteps, void *extraData); void computeBlock(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, double M[rows][cols]); -double computeBlockResidual(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, double M[rows][cols]); +void computeBlockResidual(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, double M[rows][cols], double relax, double *residual, double *max_elem); #endif // HEAT_H diff --git a/src/heat/common/kernel.c b/src/heat/common/kernel.c index da6582f..aadc896 100644 --- a/src/heat/common/kernel.c +++ b/src/heat/common/kernel.c @@ -1,4 +1,5 @@ #include +#include #include "common/heat.h" #ifndef SIMD @@ -19,6 +20,7 @@ void computeBlock(const int64_t rows, const int64_t cols, const int cstart, const int cend, double M[rows][cols]) { + (void) cend; // Assuming square blocks const int bs = rend-rstart+1; for (int k = 0; k < bs; ++k) { @@ -40,19 +42,74 @@ void computeBlock(const int64_t rows, const int64_t cols, } #endif -double computeBlockResidual(const int64_t rows, const int64_t cols, + +#if 1 + +void computeBlockResidual(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, - double M[rows][cols]) + double M[rows][cols], double relax, + double *residual, double *max_elem) { - double sum = 0.0; + //double relax = 1.95; for (int r = rstart; r <= rend; ++r) { for (int c = cstart; c <= cend; ++c) { - const double value = 0.25*(M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]); - const double diff = value - M[r][c]; - sum += diff*diff; - M[r][c] = value; + double old = M[r][c]; + double fdiff = 0.25*(M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]); + double new = (1 - relax) * old + relax * fdiff; + double diff = new - old; + /* Use the largest absolute error as residual */ + *residual = fmax(*residual, fabs(diff)); + *max_elem = fmax(*max_elem, fabs(new)); + //fprintf(stderr, "residual = %e in (%4d, %4d)\n", residual, r, c); + M[r][c] = new; } } - return sum; } + +#else + +/* Red-black parallelization */ +void computeBlockResidual(const int64_t rows, const int64_t cols, + const int rstart, const int rend, + const int cstart, const int cend, + double M[rows][cols], double relax, + double * restrict residual, double * restrict max_elem) +{ + (void)(residual); + (void)(max_elem); + + const double A = 1 - relax; + const double B = 0.25 * relax; + + //double relax = 1.95; + for (int r = rstart; r <= rend; r += 2) { + #pragma clang loop vectorize(enable) + for (int c = cstart; c <= cend; c += 2) { + double old = M[r][c]; + double new = A * old + B * (M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]); + //double diff = new - old; + /* Use the largest absolute error as residual */ + //*residual = fmax(*residual, fabs(diff)); + //*max_elem = fmax(*max_elem, fabs(new)); + //fprintf(stderr, "residual = %e in (%4d, %4d)\n", residual, r, c); + M[r][c] = new; + } + } + + for (int r = rstart+1; r <= rend; r += 2) { + #pragma clang loop vectorize(enable) + for (int c = cstart+1; c <= cend; c += 2) { + double old = M[r][c]; + double new = A * old + B * (M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]); + //double diff = new - old; + /* Use the largest absolute error as residual */ + //*residual = fmax(*residual, fabs(diff)); + //*max_elem = fmax(*max_elem, fabs(new)); + //fprintf(stderr, "residual = %e in (%4d, %4d)\n", residual, r, c); + M[r][c] = new; + } + } +} + +#endif