Click here to Skip to main content
15,920,801 members
Please Sign up or sign in to vote.
4.00/5 (2 votes)
See more:
I am investigating how to optimise XORing two data buffers together on a 32 bit Intel processor. To get something working I am writing a simple function on the lines of
void XOR(char* inBuf1, char* inBuf2, char*outBuf, int size)
{
    while(size--)
    {
        *outBuf++ = *inBuf1++ ^ *inBuf2++;
    }
}

I need really the function to be more efficient. The options I've thought of so far are:-
XORing __int64 data instead
Looking at using BitBlt
Use SSE2.

Can anyone suggest other alternatives or provide any clues as to which of the above is likely to be better?
Posted

Have a look at this article:
Introduction to SSE Programming[^]

You would simply use __m128i _mm_xor_si128:
http://msdn.microsoft.com/en-us/library/fzt08www(v=vs.80).aspx[^]

Other good resource:
http://bmagic.sourceforge.net/bmsse2opt.html[^]

Good luck!
 
Share this answer
 
A little example to evaluate:
double FreqTime(const LARGE_INTEGER& t,const LARGE_INTEGER& f)
{
  typedef struct { static double tod(const LARGE_INTEGER& li){ return (4294967296.0*li.HighPart+li.LowPart); } }_;
  return 1e3*_::tod(t)/_::tod(f);
}

void XOR(char* inBuf1, char* inBuf2, char*outBuf, unsigned int size)
{
    while(size--)
    {
        *outBuf++ = *inBuf1++ ^ *inBuf2++;
    }
}

template <class TI>
void tXOR(char* i1, char* i2, char* o, unsigned int cb)
{
  unsigned int  i;
  TI*            ii1 = (TI*)i1;
  TI*            ii2 = (TI*)i2;
  TI*            oo  = (TI*)o;
  unsigned int  nn = cb/sizeof(TI);
  unsigned int  dd = sizeof(TI)*nn;
  for(i=0;i<nn;i++) oo[i]=ii1[i]^ii2[i];
  if(dd<cb) tXOR<char>(i1+dd,i2+dd,o+dd,cb-dd);
}

template <>
void tXOR<__m128>(char* i1, char* i2, char* o, unsigned int cb)
{
  unsigned int  i;
  __m128*        ii1 = (__m128*)i1;
  __m128*        ii2 = (__m128*)i2;
  __m128*        oo  = (__m128*)o;
  unsigned int  nn = cb/sizeof(__m128);
  unsigned int  dd = sizeof(__m128)*nn;
  for(i=0;i<nn;i++) oo[i]=_mm_xor_ps(ii1[i],ii2[i]);
  if(dd<cb) tXOR<char>(i1+dd,i2+dd,o+dd,cb-dd);
}

int _tmain(int argc, _TCHAR* argv[])
{

  LARGE_INTEGER freq = {0,0};
  LARGE_INTEGER t0 = {0,0};
  LARGE_INTEGER t1 = {0,0};

  QueryPerformanceFrequency(&freq);

  enum{ BUFFSIZE=1<<20, LOOPS=1000, };
  unsigned int  i;
  char*          in1 = (char*)malloc(BUFFSIZE);
  char*          in2 = (char*)malloc(BUFFSIZE);
  char*          out = (char*)malloc(BUFFSIZE);
  
  QueryPerformanceCounter(&t0);
  srand(t0.LowPart);
  for(i=0;i<BUFFSIZE;i++)
  {
    in1[i] = MulDiv(rand(),1,RAND_MAX);
    in2[i] = MulDiv(rand(),1,RAND_MAX);
  }

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) XOR(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("XOR               "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<char>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<char>        "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<short>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<short>       "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<unsigned int>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<unsigned int>"),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<__int64>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__int64>     "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<__m128>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__m128>      "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  free(in1);
  free(in2);
  free(out);

  _getch();
  return 0;
}

output:
XOR               (1000,1048576) = 1447.370089ms
tXOR<char>        (1000,1048576) = 993.742449ms
tXOR<short>       (1000,1048576) = 511.465385ms
tXOR<unsigned int>(1000,1048576) = 334.088394ms
tXOR<__int64>     (1000,1048576) = 232.502586ms
tXOR<__m128>      (1000,1048576) = 201.321703ms

Tested on i3 CPU.
Regards.
 
Share this answer
 
Comments
Member 2088 19-Mar-11 16:23pm    
Fantastic. Thank you very much. I'm surprised that tXOR<char> is so much faster that XOR, but I'm not arguing with the results.
Ozer Karaagac 19-Mar-11 17:10pm    
I'd like to draw attention to the first two results. Both operate on char. First one decrements a counter, increments 3 pointers and possibly uses direct register addressing. Second increments a counter but possibly uses indexed register addressing. There is a huge difference on i3. Pretty different on my system (2.4GHz Montevina NB). I think, indexed addressing matters. There may be some other factors such as blending.

XOR (1000,1048576) = 933.972893ms
tXOR<char> (1000,1048576) = 1211.190808ms
tXOR<short> (1000,1048576) = 683.663426ms
tXOR<unsigned int> (1000,1048576) = 373.330130ms
tXOR<__int64> (1000,1048576) = 361.271335ms
tXOR<__m128> (1000,1048576) = 170.242587ms
[no name] 19-May-16 5:57am    
TI* ii1 = (TI*)i1;

Doesn't this break the strict-aliasing rule?
I think, If you xor more data in one turn, you will get more efficient result. It will decrease both xor operation count (in clock cycles) and the memory access count. You can use 64 bit (8 byte) data by using MMX registers in an "32 bit Intel processor", if MMX supported. If you use SSE2 instructions, data size will be 128 Bit. But, these cases should be profiled before deciding that they are most efficient. SIMD functionality provides useful instructions operates on vectors however your case buffer manipulation, as far as I understand. I can help further, if you need help while implementing.

Such a code below will be more efficient than above on a very large buffer.

void XOR(char* inBuf1, char* inBuf2, char*outBuf, int size)
{
	int size8 = size / sizeof(unsigned __int64); // 8 actually
	if(0 < size8)
	{
		unsigned __int64 * p1 = (unsigned __int64 *)inBuf1;
		unsigned __int64 * p2 = (unsigned __int64 *)inBuf2;
		unsigned __int64 * po = (unsigned __int64 *)outBuf;

		while(size8--)
		{
			*po++ = *p1++ ^ *p2++;
		}
	}

	int size1 = size % sizeof(unsigned __int64); // 8 actually
	if(0 < size1)
	{
		const int done = size - size1;
		inBuf1 += done;
		inBuf2 += done;
		outBuf += done;

		while(size1--)
		{
			*outBuf++ = *inBuf1++ ^ *inBuf2++;
		}
	}
}
 
Share this answer
 
v2

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900