How to optimise XOR function for large buffer manipulation

Question

4.00/5 (2 votes)

See more:

I am investigating how to optimise XORing two data buffers together on a 32-bit Intel processor. To get something working, I am writing a simple function along the lines of:

// XORs the first `size` bytes of inBuf1 with the first `size` bytes of
// inBuf2 and writes the result, byte by byte, to outBuf.
//
// inBuf1, inBuf2 : input buffers, each at least `size` bytes long.
// outBuf         : output buffer, at least `size` bytes long.
// size           : number of bytes to process; zero or a negative value
//                  leaves outBuf untouched.
void XOR(char* inBuf1, char* inBuf2, char*outBuf, int size)
{
    // Count upwards against `size` rather than using `while(size--)`:
    // with a negative size the decrementing form never reaches zero and
    // walks off the end of all three buffers.
    for(int i = 0; i < size; ++i)
    {
        outBuf[i] = inBuf1[i] ^ inBuf2[i];
    }
}

I really need the function to be more efficient. The options I've thought of so far are:
XORing __int64 data instead
Looking at using BitBlt
Use SSE2.

Can anyone suggest other alternatives or provide any clues as to which of the above is likely to be better?

Posted 18-Mar-11 3:50am

Member 2088

Add a Solution

3 solutions

Solution 1

Have a look at this article:
Introduction to SSE Programming[^]

You would simply use __m128i _mm_xor_si128:
http://msdn.microsoft.com/en-us/library/fzt08www(v=vs.80).aspx[^]

Other good resource:
http://bmagic.sourceforge.net/bmsse2opt.html[^]

Good luck!

Posted 18-Mar-11 5:46am

E.F. Nijboer

Solution 2

I think that if you XOR more data per iteration, you will get a more efficient result. It will decrease both the XOR operation count (in clock cycles) and the memory access count. You can process 64 bits (8 bytes) at a time by using MMX registers on a "32 bit Intel processor", if MMX is supported. If you use SSE2 instructions, the data size will be 128 bits. However, these cases should be profiled before deciding which one is the most efficient. SIMD functionality provides useful instructions that operate on vectors; your case, however, is buffer manipulation, as far as I understand. I can help further if you need help while implementing.

Code such as the following will be more efficient than the version above on a very large buffer.

// XORs `size` bytes of inBuf1 with inBuf2 into outBuf, processing the bulk
// of the data eight bytes at a time and the remaining (size % 8) bytes one
// at a time.
//
// inBuf1, inBuf2 : input buffers, each at least `size` bytes long.
// outBuf         : output buffer, at least `size` bytes long.
// size           : number of bytes to process; zero or a negative value
//                  leaves outBuf untouched.
//
// NOTE(review): the buffers are reinterpreted as 64-bit words, so this
// assumes the target tolerates 64-bit accesses at whatever alignment the
// callers' buffers have (x86 does) - confirm before porting elsewhere.
void XOR(char* inBuf1, char* inBuf2, char*outBuf, int size)
{
	// Reject non-positive sizes up front: dividing a negative int by
	// sizeof() would first convert it to a huge unsigned value.
	if(size <= 0)
	{
		return;
	}

	// unsigned long long is the standard 64-bit-capable type; the
	// __int64 spelling is specific to the Microsoft compiler.
	typedef unsigned long long word_t;
	const int wordBytes = (int)sizeof(word_t);
	const int wordCount = size / wordBytes;

	word_t * p1 = (word_t *)inBuf1;
	word_t * p2 = (word_t *)inBuf2;
	word_t * po = (word_t *)outBuf;

	// Bulk pass: one XOR per 64-bit word.
	for(int i = 0; i < wordCount; ++i)
	{
		po[i] = p1[i] ^ p2[i];
	}

	// Tail pass: the bytes that did not fill a whole word.
	for(int i = wordCount * wordBytes; i < size; ++i)
	{
		outBuf[i] = inBuf1[i] ^ inBuf2[i];
	}
}

Posted 18-Mar-11 7:58am

Ozer Karaagac

Updated 18-Mar-11 8:03am

v2

Add a Solution

Add your solution here

Treat my content as plain text, not as HTML

Preview 0

…

Existing Members

Sign in to your account

...or Join us

Download, Vote, Comment, Publish.

Your Email
Password
Forgot your password?

Your Email
This email is in use. Do you need your password?
Optional Password

I have read and agree to the Terms of Service and Privacy Policy
Please subscribe me to the CodeProject newsletters

When answering a question please:

Read the question carefully.
Understand that English isn't everyone's first language so be lenient of bad spelling and grammar.
If a question is poorly phrased then either ask for clarification, ignore it, or edit the question and fix the problem. Insults are not welcome.
Don't tell someone to read the manual. Chances are they have and don't get it. Provide an answer or move on to the next question.

Let's work to help developers, not make them feel stupid.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

mbue · Accepted Answer · 2011-03-19T01:55:00

A little example to evaluate:

double FreqTime(const LARGE_INTEGER& t,const LARGE_INTEGER& f)
{
  typedef struct { static double tod(const LARGE_INTEGER& li){ return (4294967296.0*li.HighPart+li.LowPart); } }_;
  return 1e3*_::tod(t)/_::tod(f);
}

// Reference byte-wise implementation: writes inBuf1[k] ^ inBuf2[k] to
// outBuf[k] for every k in [0, size), in ascending order.
void XOR(char* inBuf1, char* inBuf2, char*outBuf, unsigned int size)
{
    for(unsigned int pos = 0; pos != size; ++pos)
    {
        outBuf[pos] = inBuf1[pos] ^ inBuf2[pos];
    }
}

// XORs `cb` bytes of i1 with i2 into o, using TI as the unit of work.
// The buffers are viewed as arrays of TI for the largest whole number of
// TI elements that fits in `cb`; any leftover bytes are XORed one by one.
//
// TI : integer type supporting operator^ (char, short, unsigned int, ...).
template <class TI>
void tXOR(char* i1, char* i2, char* o, unsigned int cb)
{
  const unsigned int  wordCount = cb/sizeof(TI);
  const unsigned int  wordBytes = sizeof(TI)*wordCount;

  // Word-sized pass over the part of the buffers that divides evenly.
  const TI*     lhs = (const TI*)i1;
  const TI*     rhs = (const TI*)i2;
  TI*           dst = (TI*)o;
  for(const TI* const stop = lhs+wordCount; lhs != stop; ++lhs, ++rhs, ++dst)
  {
    *dst = *lhs ^ *rhs;
  }

  // Byte-sized pass over whatever did not fill a whole TI.
  for(unsigned int k = wordBytes; k < cb; ++k)
  {
    o[k] = i1[k] ^ i2[k];
  }
}

// SSE specialization: XORs `cb` bytes of i1 with i2 into o, 16 bytes per
// step via _mm_xor_ps, then hands the remaining (cb % 16) bytes to
// tXOR<char>.
//
// The data is moved with unaligned loads/stores. Dereferencing an __m128*
// directly requires the address to be 16-byte aligned, which is not
// guaranteed for arbitrary char buffers (for example ones obtained from
// malloc on a 32-bit target) and faults when violated.
template <>
void tXOR<__m128>(char* i1, char* i2, char* o, unsigned int cb)
{
  const unsigned int  nn = cb/sizeof(__m128);
  const unsigned int  dd = sizeof(__m128)*nn;   // bytes covered by whole vectors
  unsigned int        off;
  for(off=0;off<dd;off+=sizeof(__m128))
  {
    // The float* casts only satisfy the intrinsic signatures; the bits
    // are XORed as-is and never interpreted as floating-point values.
    const __m128 a = _mm_loadu_ps((const float*)(i1+off));
    const __m128 b = _mm_loadu_ps((const float*)(i2+off));
    _mm_storeu_ps((float*)(o+off),_mm_xor_ps(a,b));
  }
  if(dd<cb) tXOR<char>(i1+dd,i2+dd,o+dd,cb-dd);
}

// Benchmark driver: fills two 1 MiB input buffers, then times LOOPS
// passes of each XOR variant over them with the performance counter and
// prints the elapsed milliseconds per variant. Waits for a key press
// before exiting. Returns 0 on success, 1 if the buffers cannot be
// allocated.
int _tmain(int argc, _TCHAR* argv[])
{

  LARGE_INTEGER freq = {0,0};
  LARGE_INTEGER t0 = {0,0};
  LARGE_INTEGER t1 = {0,0};

  QueryPerformanceFrequency(&freq);

  enum{ BUFFSIZE=1<<20, LOOPS=1000, };
  unsigned int  i;
  char*          in1 = (char*)malloc(BUFFSIZE);
  char*          in2 = (char*)malloc(BUFFSIZE);
  char*          out = (char*)malloc(BUFFSIZE);

  // Bail out instead of dereferencing a null buffer when an allocation
  // fails; free(NULL) is a no-op, so releasing all three is safe.
  if(!in1 || !in2 || !out)
  {
    _tprintf(__T("Failed to allocate %i byte buffers\r\n"),BUFFSIZE);
    free(in1);
    free(in2);
    free(out);
    return 1;
  }
  
  // Seed the generator from the current counter value and fill the inputs.
  // NOTE(review): MulDiv(rand(),1,RAND_MAX) looks like it can only yield
  // 0 or 1 - confirm whether full-range random bytes were intended.
  QueryPerformanceCounter(&t0);
  srand(t0.LowPart);
  for(i=0;i<BUFFSIZE;i++)
  {
    in1[i] = MulDiv(rand(),1,RAND_MAX);
    in2[i] = MulDiv(rand(),1,RAND_MAX);
  }

  // Each section below reads the counter, runs LOOPS passes of one variant,
  // reads the counter again and prints the difference in milliseconds.

  // Plain byte-wise loop.
  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) XOR(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("XOR               "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  // Template, 1 byte per step.
  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<char>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<char>        "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  // Template, sizeof(short) bytes per step.
  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<short>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<short>       "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  // Template, sizeof(unsigned int) bytes per step.
  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<unsigned int>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<unsigned int>"),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  // Template, sizeof(__int64) bytes per step.
  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<__int64>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__int64>     "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  // SSE specialization, sizeof(__m128) bytes per step.
  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<__m128>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__m128>      "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  free(in1);
  free(in2);
  free(out);

  // Keep the console window open until a key is pressed.
  _getch();
  return 0;
}

output:

XOR               (1000,1048576) = 1447.370089ms
tXOR<char>        (1000,1048576) = 993.742449ms
tXOR<short>       (1000,1048576) = 511.465385ms
tXOR<unsigned int>(1000,1048576) = 334.088394ms
tXOR<__int64>     (1000,1048576) = 232.502586ms
tXOR<__m128>      (1000,1048576) = 201.321703ms

Tested on i3 CPU.
Regards.