diff --git a/src/heat/common/heat.h b/src/heat/common/heat.h
index 1a2f5ac..696b28c 100644
--- a/src/heat/common/heat.h
+++ b/src/heat/common/heat.h
@@ -53,6 +53,6 @@ double getTime(void);
 const char *summary(void);
 double solve(HeatConfiguration *conf, int64_t rows, int64_t cols, int timesteps, void *extraData);
 void computeBlock(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, double M[rows][cols]);
-double computeBlockResidual(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, double M[rows][cols]);
+void computeBlockResidual(const int64_t rows, const int64_t cols, const int rstart, const int rend, const int cstart, const int cend, double M[rows][cols], double relax, double *residual, double *max_elem);
 
 #endif // HEAT_H
diff --git a/src/heat/common/kernel.c b/src/heat/common/kernel.c
index da6582f..aadc896 100644
--- a/src/heat/common/kernel.c
+++ b/src/heat/common/kernel.c
@@ -1,4 +1,5 @@
 #include <stdint.h>
+#include <stdio.h>
 #include "common/heat.h"
 
 #ifndef SIMD
@@ -19,6 +20,7 @@ void computeBlock(const int64_t rows, const int64_t cols,
 		const int cstart, const int cend,
 		double M[rows][cols])
 {
+	(void) cend;
 	// Assuming square blocks
 	const int bs = rend-rstart+1;
 	for (int k = 0; k < bs; ++k) {
@@ -40,19 +42,74 @@ void computeBlock(const int64_t rows, const int64_t cols,
 }
 #endif
 
-double computeBlockResidual(const int64_t rows, const int64_t cols,
+
+#if 1
+
+void computeBlockResidual(const int64_t rows, const int64_t cols,
 		const int rstart, const int rend,
 		const int cstart, const int cend,
-		double M[rows][cols])
+		double M[rows][cols], double relax,
+		double *residual, double *max_elem)
 {
-	double sum = 0.0;
+	//double relax = 1.95;
 	for (int r = rstart; r <= rend; ++r) {
 		for (int c = cstart; c <= cend; ++c) {
-			const double value = 0.25*(M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]);
-			const double diff = value - M[r][c];
-			sum += diff*diff;
-			M[r][c] = value;
+			double old = M[r][c];
+			double fdiff = 0.25*(M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]);
+			double new = (1 - relax) * old + relax * fdiff;
+			double diff = new - old;
+			/* Use the largest absolute error as residual */
+			*residual = fmax(*residual, fabs(diff));
+			*max_elem = fmax(*max_elem, fabs(new));
+			//fprintf(stderr, "residual = %e in (%4d, %4d)\n", residual, r, c);
+			M[r][c] = new;
 		}
 	}
-	return sum;
 }
+
+#else
+
+/* Red-black parallelization */
+void computeBlockResidual(const int64_t rows, const int64_t cols,
+		const int rstart, const int rend,
+		const int cstart, const int cend,
+		double M[rows][cols], double relax,
+		double * restrict residual, double * restrict max_elem)
+{
+	(void)(residual);
+	(void)(max_elem);
+
+	const double A = 1 - relax;
+	const double B = 0.25 * relax;
+
+	//double relax = 1.95;
+	for (int r = rstart; r <= rend; r += 2) {
+		#pragma clang loop vectorize(enable)
+		for (int c = cstart; c <= cend; c += 2) {
+			double old = M[r][c];
+			double new = A * old + B * (M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]);
+			//double diff = new - old;
+			/* Use the largest absolute error as residual */
+			//*residual = fmax(*residual, fabs(diff));
+			//*max_elem = fmax(*max_elem, fabs(new));
+			//fprintf(stderr, "residual = %e in (%4d, %4d)\n", residual, r, c);
+			M[r][c] = new;
+		}
+	}
+
+	for (int r = rstart+1; r <= rend; r += 2) {
+		#pragma clang loop vectorize(enable)
+		for (int c = cstart+1; c <= cend; c += 2) {
+			double old = M[r][c];
+			double new = A * old + B * (M[r-1][c] + M[r+1][c] + M[r][c-1] + M[r][c+1]);
+			//double diff = new - old;
+			/* Use the largest absolute error as residual */
+			//*residual = fmax(*residual, fabs(diff));
+			//*max_elem = fmax(*max_elem, fabs(new));
+			//fprintf(stderr, "residual = %e in (%4d, %4d)\n", residual, r, c);
+			M[r][c] = new;
+		}
+	}
+}
+
+#endif