Hi,
I wrote a simple program to implement SSE intrinsics for computing the inner product of two large (100000 or more elements) vectors. The program compares the execution time for both, inner product computed the conventional way and using intrinsics. Everything works out fine, until I insert (just for the fun of it) an inner loop before the statement that computes the inner product. Before I go further, here is the code:
//this is a sample Intrinsics program to compute inner product of two vectors and compare Intrinsics with traditional method of doing things.
#include <iostream>
#include <iomanip>
#include <xmmintrin.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
using namespace std;
typedef float v4sf __attribute__ ((vector_size(16)));
double innerProduct(float* arr1, int len1, float* arr2, int len2) { //assume len1 = len2.
float result = 0.0;
for(int i = 0; i < len1; i++) {
for(int j = 0; j < len1; j++) {
result += (arr1[i] * arr2[i]);
}
}
//float y = 1.23e+09;
//cout << "y = " << y << endl;
return result;
}
double sse_v4sf_innerProduct(float* arr1, int len1, float* arr2, int len2) { //assume that len1 = len2.
if(len1 != len2) {
cout << "Lengths not equal." << endl;
exit(1);
}
/*steps:
* 1. load a long-type (4 float) into a v4sf type data from both arrays.
* 2. multiply the two.
* 3. multiply the same and store result.
* 4. add this to previous results.
*/
v4sf arr1Data, arr2Data, prevSums, multVal, xyz;
//__builtin_ia32_xorps(prevSums, prevSums); //making it equal zero.
//can explicitly load 0 into prevSums using loadps or storeps (Check).
float temp[4] = {0.0, 0.0, 0.0, 0.0};
prevSums = __builtin_ia32_loadups(temp);
float result = 0.0;
for(int i = 0; i < (len1 - 3); i += 4) {
for(int j = 0; j < len1; j++) {
arr1Data = __builtin_ia32_loadups(&arr1[i]);
arr2Data = __builtin_ia32_loadups(&arr2[i]); //store the contents of two arrays.
multVal = __builtin_ia32_mulps(arr1Data, arr2Data); //multiply.
xyz = __builtin_ia32_addps(multVal, prevSums);
prevSums = xyz;
}
}
//prevSums will hold the sums of 4 32-bit floating point values taken at a time. Individual entries in prevSums also need to be added.
__builtin_ia32_storeups(temp, prevSums); //store prevSums into temp.
cout << "Values of temp:" << endl;
for(int i = 0; i < 4; i++)
cout << temp[i] << endl;
result += temp[0] + temp[1] + temp[2] + temp[3];
return result;
}
int main() {
clock_t begin, end;
int length = 100000;
float *arr1, *arr2;
double result_Conventional, result_Intrinsic;
// printStats("Allocating memory.");
arr1 = new float[length];
arr2 = new float[length];
// printStats("End allocation.");
srand(time(NULL)); //init random seed.
// printStats("Initializing array1 and array2");
begin = clock();
for(int i = 0; i < length; i++) {
// for(int j = 0; j < length; j++) {
// arr1[i] = rand() % 10 + 1;
arr1[i] = 2.5;
// arr2[i] = rand() % 10 - 1;
arr2[i] = 2.5;
// }
}
end = clock();
cout << "Time to initialize array1 and array2 = " << ((double) (end - begin)) / CLOCKS_PER_SEC << endl;
// printStats("Finished initialization.");
// printStats("Begin inner product conventionally.");
begin = clock();
result_Conventional = innerProduct(arr1, length, arr2, length);
end = clock();
cout << "Time to compute inner product conventionally = " << ((double) (end - begin)) / CLOCKS_PER_SEC << endl;
// printStats("End inner product conventionally.");
// printStats("Begin inner product using Intrinsics.");
begin = clock();
result_Intrinsic = sse_v4sf_innerProduct(arr1, length, arr2, length);
end = clock();
cout << "Time to compute inner product with intrinsics = " << ((double) (end - begin)) / CLOCKS_PER_SEC << endl;
//printStats("End inner product using Intrinsics.");
cout << "Results: " << endl;
cout << " result_Conventional = " << result_Conventional << endl;
cout << " result_Intrinsics = " << result_Intrinsic << endl;
return 0;
}
I use the following g++ invocation to build this:
g++ -W -Wall -O2 -pedantic -march=i386 -msse intrinsics_SSE_innerProduct.C -o innerProduct
Each of the loops above, in both the functions, runs a total of N^2 times. However, given that arr1 and arr2 (the two floating point vectors) are loaded with a value 2.5, the length of the array is 100,000, the result in both cases should be 6.25e+10. The results I get are:
Results:
result_Conventional = 6.25e+10
result_Intrinsics = 5.36871e+08
This is not all. It seems that the value returned from the function that uses intrinsics "saturates" at the value above. I tried putting other values for the elements of the array and different sizes too. But it seems that any value above 1.0 for the array contents and any size above 1000 meets with the same value we see above.
Initially, I thought it might be because all operations within SSE are in floating point, but floating point should be able to store a number that is of the order of e+08.
I am trying to see where I could be going wrong but cannot seem to figure it out. I am using g++ version: g++ (GCC) 4.4.1 20090725 (Red Hat 4.4.1-2).
Any help on this is most welcome.
Thanks,
Sriram.