33 #include <m4ri/m4ri_config.h>
36 #include <emmintrin.h>
/* NOTE(review): gapped extraction — this is a fragment of an 8-source row
 * XOR (c ^= t1 ^ ... ^ t8) with an SSE2 fast path.  The enclosing function
 * header and the vector loop around the XOR chain are in elided lines, and
 * the stray leading integers are fused original line numbers from a broken
 * extraction.  Code is left byte-identical; comments only. */
54 __m128i *__c = (__m128i*)c;
55 __m128i *__t1 = (__m128i*)t1;
56 __m128i *__t2 = (__m128i*)t2;
57 __m128i *__t3 = (__m128i*)t3;
58 __m128i *__t4 = (__m128i*)t4;
59 __m128i *__t5 = (__m128i*)t5;
60 __m128i *__t6 = (__m128i*)t6;
61 __m128i *__t7 = (__m128i*)t7;
62 __m128i *__t8 = (__m128i*)t8;
/* End pointer rounded DOWN to a 16-byte boundary, so the vector loop stops
 * before any unaligned tail.  NOTE(review): the (unsigned long) cast
 * truncates pointers on LLP64 targets (e.g. Win64); uintptr_t would be the
 * portable spelling — cannot fix here without the full function in view. */
63 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
/* One 128-bit step: fold all eight table rows into xmm1.  The store back
 * (presumably "*__c++ = xmm1;") sits on an elided line — TODO confirm. */
67 xmm1 = _mm_xor_si128(*__c, *__t1++);
68 xmm1 = _mm_xor_si128(xmm1, *__t2++);
69 xmm1 = _mm_xor_si128(xmm1, *__t3++);
70 xmm1 = _mm_xor_si128(xmm1, *__t4++);
71 xmm1 = _mm_xor_si128(xmm1, *__t5++);
72 xmm1 = _mm_xor_si128(xmm1, *__t6++);
73 xmm1 = _mm_xor_si128(xmm1, *__t7++);
74 xmm1 = _mm_xor_si128(xmm1, *__t8++);
/* Leftover word count after the 16-byte-aligned region: (bytes % 16) /
 * sizeof(word) — 0 or 1 when word is 8 bytes. */
86 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
/* Scalar cleanup over the remaining words. */
89 for(
wi_t i = 0; i < wide; ++i) {
90 c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
/* Debug-consistency hook (m4ri DD macro); presumably a no-op in release
 * builds — verify against m4ri_config.h. */
93 __M4RI_DD_RAWROW(c, wide_in);
/* NOTE(review): gapped extraction — fragments of a 6-source row XOR
 * (c ^= t1 ^ ... ^ t6): a scalar pre-alignment step, the SSE2 setup and
 * vector XOR chain, the scalar tail, and a Duff's-device fallback.  The
 * function header, the loops, the dispatching switch header and the
 * "} while (--n > 0);" closer are all on elided lines.  Code kept
 * byte-identical; comments only. */
108 *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
112 __m128i *__c = (__m128i*)c;
113 __m128i *__t1 = (__m128i*)t1;
114 __m128i *__t2 = (__m128i*)t2;
115 __m128i *__t3 = (__m128i*)t3;
116 __m128i *__t4 = (__m128i*)t4;
117 __m128i *__t5 = (__m128i*)t5;
118 __m128i *__t6 = (__m128i*)t6;
/* 16-byte-aligned end pointer; the (unsigned long) cast is not LLP64-safe
 * (uintptr_t would be) — same issue as the 8-source variant. */
119 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
/* Vector step: fold all six rows into xmm1 (store elided from view). */
123 xmm1 = _mm_xor_si128(*__c, *__t1++);
124 xmm1 = _mm_xor_si128(xmm1, *__t2++);
125 xmm1 = _mm_xor_si128(xmm1, *__t3++);
126 xmm1 = _mm_xor_si128(xmm1, *__t4++);
127 xmm1 = _mm_xor_si128(xmm1, *__t5++);
128 xmm1 = _mm_xor_si128(xmm1, *__t6++);
/* Remaining words after the aligned region (0 or 1 for 8-byte words). */
138 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
141 *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
143 __M4RI_DD_RAWROW(c, wide_in);
/* Non-SSE2 fallback: classic Duff's device unrolled by 8; n is the pass
 * count, and the switch(wide % 8) entry plus loop closer are elided. */
146 wi_t n = (wide + 7) / 8;
148 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
149 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
150 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
151 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
152 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
153 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
154 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
155 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
158 __M4RI_DD_RAWROW(c, wide_in);
160 #endif // __M4RI_HAVE_SSE2
/* NOTE(review): gapped extraction — 5-source row XOR (c ^= t1 ^ ... ^ t5),
 * same structure as the 6-source fragment above: scalar alignment step,
 * SSE2 fast path, scalar tail, Duff's-device fallback.  Surrounding loops
 * and switch header are on elided lines; code kept byte-identical. */
177 *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
181 __m128i *__c = (__m128i*)c;
182 __m128i *__t1 = (__m128i*)t1;
183 __m128i *__t2 = (__m128i*)t2;
184 __m128i *__t3 = (__m128i*)t3;
185 __m128i *__t4 = (__m128i*)t4;
186 __m128i *__t5 = (__m128i*)t5;
/* 16-byte-aligned end pointer; (unsigned long) cast is not LLP64-safe. */
187 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
/* Vector step: fold five rows into xmm1 (store elided from view). */
191 xmm1 = _mm_xor_si128(*__c, *__t1++);
192 xmm1 = _mm_xor_si128(xmm1, *__t2++);
193 xmm1 = _mm_xor_si128(xmm1, *__t3++);
194 xmm1 = _mm_xor_si128(xmm1, *__t4++);
195 xmm1 = _mm_xor_si128(xmm1, *__t5++);
/* Remaining words after the aligned region. */
204 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
207 *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
209 __M4RI_DD_RAWROW(c, wide_in);
/* Duff's-device fallback, unrolled by 8 (entry switch and closer elided). */
212 wi_t n = (wide + 7) / 8;
214 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
215 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
216 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
217 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
218 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
219 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
220 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
221 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
224 __M4RI_DD_RAWROW(c, wide_in);
226 #endif // __M4RI_HAVE_SSE2
/* NOTE(review): gapped extraction — 4-source row XOR (c ^= t1 ^ ... ^ t4),
 * same pattern as the wider variants: scalar alignment step, SSE2 fast
 * path, scalar tail, Duff's-device fallback.  Loops/switch header elided;
 * code kept byte-identical. */
243 *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
247 __m128i *__c = (__m128i*)c;
248 __m128i *__t1 = (__m128i*)t1;
249 __m128i *__t2 = (__m128i*)t2;
250 __m128i *__t3 = (__m128i*)t3;
251 __m128i *__t4 = (__m128i*)t4;
/* 16-byte-aligned end pointer; (unsigned long) cast is not LLP64-safe. */
252 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
/* Vector step: fold four rows into xmm1 (store elided from view). */
256 xmm1 = _mm_xor_si128(*__c, *__t1++);
257 xmm1 = _mm_xor_si128(xmm1, *__t2++);
258 xmm1 = _mm_xor_si128(xmm1, *__t3++);
259 xmm1 = _mm_xor_si128(xmm1, *__t4++);
/* Remaining words after the aligned region. */
267 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
270 *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
272 __M4RI_DD_RAWROW(c, wide_in);
/* Duff's-device fallback, unrolled by 8 (entry switch and closer elided). */
275 wi_t n = (wide + 7) / 8;
277 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
278 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
279 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
280 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
281 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
282 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
283 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
284 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
287 __M4RI_DD_RAWROW(c, wide_in);
289 #endif // __M4RI_HAVE_SSE2
/* NOTE(review): gapped extraction — 3-source row XOR (c ^= t1 ^ t2 ^ t3):
 * SSE2 fast path followed by a Duff's-device scalar path.  The function
 * header, vector loop, scalar-tail loop, switch header and loop closer are
 * on elided lines; code kept byte-identical. */
302 __m128i *__c = (__m128i*)c;
303 __m128i *__t1 = (__m128i*)t1;
304 __m128i *__t2 = (__m128i*)t2;
305 __m128i *__t3 = (__m128i*)t3;
/* 16-byte-aligned end pointer; (unsigned long) cast is not LLP64-safe. */
306 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
/* Vector step: fold three rows into xmm1 (store elided from view). */
310 xmm1 = _mm_xor_si128(*__c, *__t1++);
311 xmm1 = _mm_xor_si128(xmm1, *__t2++);
312 xmm1 = _mm_xor_si128(xmm1, *__t3++);
/* Remaining words after the aligned region. */
319 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
322 __M4RI_DD_RAWROW(c, wide_in);
325 #endif // __M4RI_HAVE_SSE2
/* Duff's-device scalar path, unrolled by 8 (entry switch and closer
 * elided). */
326 wi_t n = (wide + 7) / 8;
328 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
329 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
330 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
331 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
332 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
333 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
334 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
335 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
338 __M4RI_DD_RAWROW(c, wide_in);
/* NOTE(review): gapped extraction — 2-source row XOR (c ^= t1 ^ t2): SSE2
 * fast path plus Duff's-device scalar path.  Enclosing header, loops and
 * switch scaffolding are on elided lines; code kept byte-identical. */
352 __m128i *__c = (__m128i*)c;
353 __m128i *__t1 = (__m128i*)t1;
354 __m128i *__t2 = (__m128i*)t2;
/* 16-byte-aligned end pointer; (unsigned long) cast is not LLP64-safe. */
355 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
/* Vector step: fold both rows into xmm1 (store elided from view). */
359 xmm1 = _mm_xor_si128(*__c, *__t1++);
360 xmm1 = _mm_xor_si128(xmm1, *__t2++);
/* Remaining words after the aligned region. */
366 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
369 __M4RI_DD_RAWROW(c, wide_in);
372 #endif // __M4RI_HAVE_SSE2
/* Duff's-device scalar path, unrolled by 8 (entry switch and closer
 * elided). */
373 wi_t n = (wide + 7) / 8;
375 case 0:
do { *c++ ^= *t1++ ^ *t2++;
376 case 7: *c++ ^= *t1++ ^ *t2++;
377 case 6: *c++ ^= *t1++ ^ *t2++;
378 case 5: *c++ ^= *t1++ ^ *t2++;
379 case 4: *c++ ^= *t1++ ^ *t2++;
380 case 3: *c++ ^= *t1++ ^ *t2++;
381 case 2: *c++ ^= *t1++ ^ *t2++;
382 case 1: *c++ ^= *t1++ ^ *t2++;
385 __M4RI_DD_RAWROW(c, wide_in);
/* NOTE(review): gapped extraction — single-source row XOR (c ^= t1): SSE2
 * fast path plus Duff's-device scalar path.  The three identical
 * _mm_xor_si128 lines below come from an unrolled-by-two vector loop plus
 * a one-step remainder; the "*__c++ = xmm1;" stores between them (original
 * lines 411/413/418) are elided — TODO confirm against the full file.
 * Code kept byte-identical. */
403 __m128i *__c = (__m128i*)c;
404 __m128i *__t1 = (__m128i*)t1;
/* 16-byte-aligned end pointer; (unsigned long) cast is not LLP64-safe. */
405 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
410 xmm1 = _mm_xor_si128(*__c, *__t1++);
412 xmm1 = _mm_xor_si128(*__c, *__t1++);
417 xmm1 = _mm_xor_si128(*__c, *__t1++);
/* Remaining words after the aligned region. */
423 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
426 __M4RI_DD_RAWROW(c, wide_in);
429 #endif // __M4RI_HAVE_SSE2
/* Duff's-device scalar path, unrolled by 8 (entry switch and closer
 * elided). */
431 wi_t n = (wide + 7) / 8;
433 case 0:
do { *c++ ^= *t1++;
434 case 7: *c++ ^= *t1++;
435 case 6: *c++ ^= *t1++;
436 case 5: *c++ ^= *t1++;
437 case 4: *c++ ^= *t1++;
438 case 3: *c++ ^= *t1++;
439 case 2: *c++ ^= *t1++;
440 case 1: *c++ ^= *t1++;
443 __M4RI_DD_RAWROW(c, wide_in);