Hello, I have this loop written in C++, that compiled with MSVC2010 takes a long time to run. (300ms)
for (int i=0; i<h; i++) {
for (int j=0; j<w; j++) {
if (buf[i*w+j] > 0) {
const int sy = max(0, i - hr);
const int ey = min(h, i + hr + 1);
const int sx = max(0, j - hr);
const int ex = min(w, j + hr + 1);
float val = 0;
for (int k=sy; k < ey; k++) {
for (int m=sx; m < ex; m++) {
val += original[k*w + m] * ds[k - i + hr][m - j + hr];
}
}
heat_map[i*w + j] = val;
}
}
}
It seemed a bit strange to me, so I did some tests then changed a few bits to inline assembly: (specifically, the code that sums "val")
for (int i=0; i<h; i++) {
for (int j=0; j<w; j++) {
if (buf[i*w+j] > 0) {
const int sy = max(0, i - hr);
const int ey = min(h, i + hr + 1);
const int sx = max(0, j - hr);
const int ex = min(w, j + hr + 1);
__asm {
fldz
}
for (int k=sy; k < ey; k++) {
for (int m=sx; m < ex; m++) {
float val = original[k*w + m] * ds[k - i + hr][m - j + hr];
__asm {
fld val
fadd
}
}
}
float val1;
__asm {
fstp val1
}
heat_map[i*w + j] = val1;
}
}
}
Now it runs in half the time, 150ms. It does exactly the same thing, but why is it twice as quick? In both cases it was run in Release mode with optimizations on. Am I doing anything wrong in my original C++ code?