M4RI 1.0.1: xor.h
#ifndef M4RI_XOR_H
#define M4RI_XOR_H

/*******************************************************************
 *
 * M4RI: Linear Algebra over GF(2)
 *
 * Copyright (C) 2008-2010 Martin Albrecht <martinralbrecht@googlemail.com>
 *
 * Distributed under the terms of the GNU General Public License (GPL)
 * version 2 or higher.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * The full text of the GPL is available at:
 *
 *   http://www.gnu.org/licenses/
 *
 ********************************************************************/

#include "m4ri_config.h"

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include "misc.h"

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ ... ^ t8[i] for 0 <= i < wide_in words.
 */
static inline void _mzd_combine8(word *c, word const *t1, word const *t2, word const *t3, word const *t4,
                                 word const *t5, word const *t6, word const *t7, word const *t8, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t8 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    __m128i *__t5 = (__m128i*)t5;
    __m128i *__t6 = (__m128i*)t6;
    __m128i *__t7 = (__m128i*)t7;
    __m128i *__t8 = (__m128i*)t8;
    /* round the end pointer down to a 16-byte boundary */
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      xmm1 = _mm_xor_si128(xmm1, *__t5++);
      xmm1 = _mm_xor_si128(xmm1, *__t6++);
      xmm1 = _mm_xor_si128(xmm1, *__t7++);
      xmm1 = _mm_xor_si128(xmm1, *__t8++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    t5 = (word*)__t5;
    t6 = (word*)__t6;
    t7 = (word*)__t7;
    t8 = (word*)__t8;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
#endif
  for(wi_t i = 0; i < wide; ++i) {
    c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
  }

  __M4RI_DD_RAWROW(c, wide_in);
}
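/*
 * A minimal usage sketch for _mzd_combine8, not part of the original
 * header; the buffer shape and fill values are made up for
 * illustration.  The source rows t1 ... t8 must be 16-byte aligned for
 * the SSE2 path, while the destination c may be unaligned:
 *
 *   word t[8][4] __attribute__((aligned(16)));
 *   word c[4] = {0, 0, 0, 0};
 *   for (int j = 0; j < 8; ++j)
 *     for (int i = 0; i < 4; ++i)
 *       t[j][i] = ((word)j + 1) << i;
 *   _mzd_combine8(c, t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], 4);
 *   // afterwards c[i] == t[0][i] ^ t[1][i] ^ ... ^ t[7][i] for i = 0..3
 */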
/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] for 0 <= i < wide_in words.
 */
static inline void _mzd_combine4(word *c, word const *t1, word const *t2, word const *t3, word const *t4, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t4 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] for 0 <= i < wide_in words.
 */
static inline void _mzd_combine3(word *c, word const *t1, word const *t2, word const *t3, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t3 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}
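/*
 * A note on the scalar tails (an explanatory aside, not part of the
 * original header): the switch/do-while construct in _mzd_combine4 and
 * _mzd_combine3 above, and in _mzd_combine2 and _mzd_combine below, is
 * Duff's device: an eightfold-unrolled loop that jumps into the middle
 * of its first iteration to absorb the wide % 8 leftover words, then
 * continues in full blocks of eight.  For wide >= 1 it is equivalent to
 * the plain loop (shown with _mzd_combine4's operands):
 *
 *   for (wi_t i = 0; i < wide; ++i)
 *     c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i];
 *
 * The device assumes wide >= 1; with wide == 0 it would still execute
 * one full block of eight, which is why the SSE2 paths return early
 * when no leftover words remain.
 */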
/**
 * Compute c[i] ^= t1[i] ^ t2[i] for 0 <= i < wide_in words.
 */
static inline void _mzd_combine2(word *c, word const *t1, word const *t2, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t2 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++;
  case 7:      *c++ ^= *t1++ ^ *t2++;
  case 6:      *c++ ^= *t1++ ^ *t2++;
  case 5:      *c++ ^= *t1++ ^ *t2++;
  case 4:      *c++ ^= *t1++ ^ *t2++;
  case 3:      *c++ ^= *t1++ ^ *t2++;
  case 2:      *c++ ^= *t1++ ^ *t2++;
  case 1:      *c++ ^= *t1++ ^ *t2++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c[i] ^= t1[i] for 0 <= i < wide_in words.
 */
static inline void _mzd_combine(word *c, word const *t1, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming c, t1 are aligned the same way */

  if (__M4RI_ALIGNMENT(c,16)==8 && wide) {
    *c++ ^= *t1++;
    wide--;
  }

  __m128i *__c  = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  while(__c < eof-1) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  if(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  c  = (word*)__c;
  t1 = (word*)__t1;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2

  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++;
  case 7:      *c++ ^= *t1++;
  case 6:      *c++ ^= *t1++;
  case 5:      *c++ ^= *t1++;
  case 4:      *c++ ^= *t1++;
  case 3:      *c++ ^= *t1++;
  case 2:      *c++ ^= *t1++;
  case 1:      *c++ ^= *t1++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

#ifdef __M4RI_M4RM_GRAY8
#define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide)
#else  // __M4RI_M4RM_GRAY8
#define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide)
#endif // __M4RI_M4RM_GRAY8

#endif // M4RI_XOR_H
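/*
 * A usage sketch for _MZD_COMBINE, not part of the original header.
 * The macro expands to a call of _mzd_combine8 or _mzd_combine4,
 * depending on whether __M4RI_M4RM_GRAY8 is defined (eight rather than
 * four precomputed table rows), so the variables c, t1, ..., t8 (or
 * t1, ..., t4) and wide must be in scope at the point of expansion:
 *
 *   word *c = ...;                 // destination row, wide words long
 *   word const *t1 = ..., *t2 = ..., *t3 = ..., *t4 = ...;
 * #ifdef __M4RI_M4RM_GRAY8
 *   word const *t5 = ..., *t6 = ..., *t7 = ..., *t8 = ...;
 * #endif
 *   wi_t wide = ...;
 *   _MZD_COMBINE;
 */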