__m128i v_zero = _mm_setzero_si128(); for ( ; j <= width - 8; j += 8) { __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j)); __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j)); v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx)); v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy)); __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero)); _mm_storeu_si128((__m128i *)(_norm + j), v_norm); v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero)); _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm); }
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)