// Deinterleave (32-bit lanes): split interleaved samples into the
// even-indexed (low) and odd-indexed (high) sequences.
__m512i idx1 = _mm512_set_epi32(
  0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10,
  0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
);
__m512i idx2 = _mm512_set_epi32(
  0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11,
  0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
);
// 32 samples per iteration: idx1 gathers the even-indexed lanes of the
// a:b register pair, idx2 the odd-indexed ones
for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16)
{
  __m512 a = _mm512_load_ps(sp);
  __m512 b = _mm512_load_ps(sp + 16);
  __m512 c = _mm512_permutex2var_ps(a, idx1, b);
  __m512 d = _mm512_permutex2var_ps(a, idx2, b);
  _mm512_store_ps(dpl, c);
  _mm512_store_ps(dph, d);
}
// 256-bit tail: split 16 samples using lane permutes and shuffles
for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
{
  __m256 a = _mm256_load_ps(sp);
  __m256 b = _mm256_load_ps(sp + 8);
  __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
  __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
  __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
  __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
  _mm256_store_ps(dpl, e);
  _mm256_store_ps(dph, f);
}
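
// Interleave (32-bit lanes): the converse of the deinterleave above.
// idx1 alternates the lower eight lanes of the two inputs
// (a0, b0, a1, b1, ...); idx2 does the same for the upper eight.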
__m512i idx1 = _mm512_set_epi32(
  0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4,
  0x13, 0x3, 0x12, 0x2, 0x11, 0x1, 0x10, 0x0
);
__m512i idx2 = _mm512_set_epi32(
  0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC,
  0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8
);
for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16)
{
  __m512 a = _mm512_load_ps(spl);
  __m512 b = _mm512_load_ps(sph);
  __m512 c = _mm512_permutex2var_ps(a, idx1, b);
  __m512 d = _mm512_permutex2var_ps(a, idx2, b);
  _mm512_store_ps(dp, c);
  _mm512_store_ps(dp + 16, d);
}
// 256-bit tail: unpack then permute to restore sample order
for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
{
  __m256 a = _mm256_load_ps(spl);
  __m256 b = _mm256_load_ps(sph);
  __m256 c = _mm256_unpacklo_ps(a, b);
  __m256 d = _mm256_unpackhi_ps(a, b);
  __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
  __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
  _mm256_store_ps(dp, e);
  _mm256_store_ps(dp + 8, f);
}
__m512i idx1 = _mm512_set_epi64(
  0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
);
__m512i idx2 = _mm512_set_epi64(
  0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
);
for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
{
  __m512d a = _mm512_load_pd(sp);
  __m512d b = _mm512_load_pd(sp + 8);  // second register starts 8 doubles in
  __m512d c = _mm512_permutex2var_pd(a, idx1, b);
  __m512d d = _mm512_permutex2var_pd(a, idx2, b);
  _mm512_store_pd(dpl, c);
  _mm512_store_pd(dph, d);
}
// 256-bit tail
for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
{
  __m256d a = _mm256_load_pd(sp);
  __m256d b = _mm256_load_pd(sp + 4);
  __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
  __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
  __m256d e = _mm256_shuffle_pd(c, d, 0x0);
  __m256d f = _mm256_shuffle_pd(c, d, 0xF);
  _mm256_store_pd(dpl, e);
  _mm256_store_pd(dph, f);
}
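
// Interleave (64-bit lanes): converse of the 64-bit deinterleave; idx1
// produces a0, b0, ..., a3, b3 and idx2 produces a4, b4, ..., a7, b7.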
__m512i idx1 = _mm512_set_epi64(
  0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
);
__m512i idx2 = _mm512_set_epi64(
  0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
);
for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
{
  __m512d a = _mm512_load_pd(spl);
  __m512d b = _mm512_load_pd(sph);
  __m512d c = _mm512_permutex2var_pd(a, idx1, b);
  __m512d d = _mm512_permutex2var_pd(a, idx2, b);
  _mm512_store_pd(dp, c);
  _mm512_store_pd(dp + 8, d);  // second half immediately follows the first
}
// 256-bit tail
for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
{
  __m256d a = _mm256_load_pd(spl);
  __m256d b = _mm256_load_pd(sph);
  __m256d c = _mm256_unpacklo_pd(a, b);
  __m256d d = _mm256_unpackhi_pd(a, b);
  __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
  __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
  _mm256_store_pd(dp, e);
  _mm256_store_pd(dp + 4, f);
}
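
// Irreversible (floating-point) horizontal analysis. Each lifting step
// updates one buffer from its neighbor's adjacent samples; a scalar
// sketch of one step (illustrative only, names as in the code below):
//   for (ui32 i = 0; i < h_width; ++i)
//     hp[i] += a * (lp[i] + lp[i + 1]);   // or lp[i - 1] + lp[i]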
                         ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even (low) and odd (high) sequences
    float* dpl = even ? ldst->f32 : hdst->f32;
    float* dph = even ? hdst->f32 : ldst->f32;
    float* sp = src->f32;
    // ... (deinterleave call and num_steps setup elided)

    float* hp = hdst->f32, * lp = ldst->f32;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    for (ui32 j = num_steps; j > 0; --j)
    {
      // ... (per-step coefficient a and left-boundary extension elided)
      lp[l_width] = lp[l_width - 1];  // right-boundary extension
      // lifting step
      const float* sp = lp;
      float* dp = hp;
      int i = (int)h_width;
      __m512 f = _mm512_set1_ps(a);
      if (even)
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp + 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }
      else
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp - 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }

      // swap buffers and parity for the next step
      float* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }

    float K = atk->get_K();
    float K_inv = 1.0f / K;
    // ... (scale the two subbands by K and K_inv; elided)
  }
  else
  { // width == 1: the lone sample lands in one band
    if (even)
      ldst->f32[0] = src->f32[0];
    else
      hdst->f32[0] = src->f32[0] * 2.0f;
  }
}
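
// The matching horizontal synthesis: undo the subband scaling first,
// then run the lifting steps in forward order with subtraction, and
// finally re-interleave the two sequences.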
                         ui32 width, bool even)
{
  if (width > 1)
  {
    float* oth = hsrc->f32, * aug = lsrc->f32;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width

    float K = atk->get_K();
    float K_inv = 1.0f / K;
    // ... (undo the K / K_inv subband scaling; elided)

    for (ui32 j = 0; j < num_steps; ++j)
    {
      // ... (per-step coefficient a and left-boundary extension elided)
      oth[oth_width] = oth[oth_width - 1];  // right-boundary extension
      // lifting step (inverse: subtract what analysis added)
      const float* sp = oth;
      float* dp = aug;
      int i = (int)aug_width;
      __m512 f = _mm512_set1_ps(a);
      if (even)
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp - 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }
      else
        for (; i > 0; i -= 16, sp += 16, dp += 16)
        {
          __m512 m = _mm512_load_ps(sp);
          __m512 n = _mm512_loadu_ps(sp + 1);
          __m512 p = _mm512_load_ps(dp);
          p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
          _mm512_store_ps(dp, p);
        }

      // swap buffers and parity for the next step
      float* t = aug; aug = oth; oth = t;
      even = !even;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // merge the two sequences back into a single line
    float* dp = dst->f32;
    float* spl = even ? lsrc->f32 : hsrc->f32;
    float* sph = even ? hsrc->f32 : lsrc->f32;
    // ... (interleave call elided)
  }
  else
  { // width == 1
    if (even)
      dst->f32[0] = lsrc->f32[0];
    else
      dst->f32[0] = hsrc->f32[0] * 0.5f;
  }
}
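
// Reversible (integer) vertical lifting step on 32-bit samples. The
// generic update is *dst ±= (b + a * (*src1 + *src2)) >> e, with the
// sign chosen by analysis/synthesis; the branches below use cheaper
// forms for a == 1 and a == -1 to avoid the multiply.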
                        ui32 repeat, bool synthesis)
{
  // ... (a, b, e come from the lifting step; dst setup elided)
  __m512i va = _mm512_set1_epi32(a);
  __m512i vb = _mm512_set1_epi32(b);

  const si32* src1 = sig->i32, * src2 = other->i32;
  if (a == 1)
  { // a == 1: no multiply needed
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_add_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_add_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1 && b == 1 && e == 1)
  { // 5/3-style predict: (1 - t) >> 1 == -(t >> 1), so only the sign flips
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i w = _mm512_srai_epi32(t, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i w = _mm512_srai_epi32(t, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1)
  { // a == -1 with general b and e: compute (b - t) >> e
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_sub_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i v = _mm512_sub_epi32(vb, t);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else
  { // general a: full 32-bit multiply
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i u = _mm512_mullo_epi32(va, t);
        __m512i v = _mm512_add_epi32(vb, u);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_sub_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi32(s1, s2);
        __m512i u = _mm512_mullo_epi32(va, t);
        __m512i v = _mm512_add_epi32(vb, u);
        __m512i w = _mm512_srai_epi32(v, e);
        d = _mm512_add_epi32(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
}
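
// The same vertical step for 64-bit samples; only the a == 1 and
// a == -1 paths are vectorized, and the general case stays scalar,
// presumably to avoid emulating a full 64-bit lane multiply.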
                        ui32 repeat, bool synthesis)
{
  // ... (a, b, e and dst setup elided)
  __m512i vb = _mm512_set1_epi64(b);

  const si64* src1 = sig->i64, * src2 = other->i64;
  if (a == 1)
  { // a == 1: no multiply needed
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_add_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_sub_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_add_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_add_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1 && b == 1 && e == 1)
  { // 5/3-style predict: (1 - t) >> 1 == -(t >> 1), so only the sign flips
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i w = _mm512_srai_epi64(t, e);
        d = _mm512_add_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i w = _mm512_srai_epi64(t, e);
        d = _mm512_sub_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else if (a == -1)
  { // a == -1 with general b and e: compute (b - t) >> e
    int i = (int)repeat;
    if (synthesis)
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_sub_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_sub_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
    else
      for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
      {
        __m512i s1 = _mm512_load_si512((__m512i*)src1);
        __m512i s2 = _mm512_load_si512((__m512i*)src2);
        __m512i d = _mm512_load_si512((__m512i*)dst);
        __m512i t = _mm512_add_epi64(s1, s2);
        __m512i v = _mm512_sub_epi64(vb, t);
        __m512i w = _mm512_srai_epi64(v, e);
        d = _mm512_add_epi64(d, w);
        _mm512_store_si512((__m512i*)dst, d);
      }
  }
  else
  { // general case: scalar fallback
    if (synthesis)
      for (ui32 i = repeat; i > 0; --i)
        *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
    else
      for (ui32 i = repeat; i > 0; --i)
        *dst++ += (b + a * (*src1++ + *src2++)) >> e;
  }
}
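
// Reversible horizontal analysis on 32-bit samples: deinterleave, then
// apply the integer lifting steps in place, swapping the roles of the
// low and high buffers after each step.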
                        ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even (low) and odd (high) sequences; the float
    // deinterleave is reusable here since it only moves 32-bit lanes
    float* dpl = even ? ldst->f32 : hdst->f32;
    float* dph = even ? hdst->f32 : ldst->f32;
    float* sp = src->f32;
    // ... (deinterleave call and num_steps setup elided)

    si32* hp = hdst->i32, * lp = ldst->i32;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    for (ui32 j = num_steps; j > 0; --j)
    {
      // ... (per-step integer coefficients a, b, e elided)
      __m512i va = _mm512_set1_epi32(a);
      __m512i vb = _mm512_set1_epi32(b);

      // boundary extension (the matching left extension is elided)
      lp[l_width] = lp[l_width - 1];
      // lifting step
      const si32* sp = lp;
      si32* dp = hp;
      if (a == 1)
      { // a == 1: no multiply needed
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3-style predict: (1 - t) >> 1 == -(t >> 1)
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // a == -1 with general b and e: compute (b - t) >> e
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general a: full 32-bit multiply
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }

      // swap buffers and parity for the next step
      si32* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }
  }
  else
  { // width == 1
    if (even)
      ldst->i32[0] = src->i32[0];
    else
      hdst->i32[0] = src->i32[0] << 1;
  }
}
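
// The 64-bit flavor of the reversible horizontal analysis; the
// deinterleave goes through double pointers, which is safe because it
// only moves 64-bit lanes around.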
                        ui32 width, bool even)
{
  if (width > 1)
  {
    // split src into even and odd sequences (64-bit lanes as doubles)
    double* dpl = (double*)(even ? ldst->p : hdst->p);
    double* dph = (double*)(even ? hdst->p : ldst->p);
    double* sp = (double*)(src->p);
    // ... (deinterleave call and num_steps setup elided)

    si64* hp = hdst->i64, * lp = ldst->i64;
    ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    for (ui32 j = num_steps; j > 0; --j)
    {
      // ... (per-step integer coefficients a, b, e elided)
      __m512i vb = _mm512_set1_epi64(b);

      // boundary extension (the matching left extension is elided)
      lp[l_width] = lp[l_width - 1];
      // lifting step
      const si64* sp = lp;
      si64* dp = hp;
      if (a == 1)
      { // a == 1: no multiply needed
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3-style predict: (1 - t) >> 1 == -(t >> 1)
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // a == -1 with general b and e: compute (b - t) >> e
        int i = (int)h_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general case: scalar fallback (no 64-bit vector multiply)
        if (even)
          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
            *dp += (b + a * (sp[0] + sp[1])) >> e;
        else
          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
            *dp += (b + a * (sp[-1] + sp[0])) >> e;
      }
      // ...

      // swap buffers and parity for the next step
      si64* t = lp; lp = hp; hp = t;
      even = !even;
      ui32 w = l_width; l_width = h_width; h_width = w;
    }
  }
  else
  { // width == 1
    if (even)
      ldst->i64[0] = src->i64[0];
    else
      hdst->i64[0] = src->i64[0] << 1;
  }
}
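
// Reversible horizontal synthesis on 32-bit samples: the mirror image
// of the analysis above; the steps run forward and subtract, and the
// two sequences are then interleaved back into the destination line.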
                        ui32 width, bool even)
{
  if (width > 1)
  {
    si32* oth = hsrc->i32, * aug = lsrc->i32;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    for (ui32 j = 0; j < num_steps; ++j)
    {
      // ... (per-step integer coefficients a, b, e elided)
      __m512i va = _mm512_set1_epi32(a);
      __m512i vb = _mm512_set1_epi32(b);

      // boundary extension (the matching left extension is elided)
      oth[oth_width] = oth[oth_width - 1];
      // lifting step (inverse: subtract what analysis added)
      const si32* sp = oth;
      si32* dp = aug;
      if (a == 1)
      { // a == 1: no multiply needed
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_add_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3-style predict: (1 - t) >> 1 == -(t >> 1)
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i w = _mm512_srai_epi32(t, e);
            d = _mm512_add_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // a == -1 with general b and e: compute (b - t) >> e
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i v = _mm512_sub_epi32(vb, t);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general a: full 32-bit multiply
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 16, sp += 16, dp += 16)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi32(s1, s2);
            __m512i u = _mm512_mullo_epi32(va, t);
            __m512i v = _mm512_add_epi32(vb, u);
            __m512i w = _mm512_srai_epi32(v, e);
            d = _mm512_sub_epi32(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }

      // swap buffers and parity for the next step
      si32* t = aug; aug = oth; oth = t;
      even = !even;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // merge the two sequences back into a single line; the float
    // interleave is reusable here since it only moves 32-bit lanes
    float* dp = dst->f32;
    float* spl = even ? lsrc->f32 : hsrc->f32;
    float* sph = even ? hsrc->f32 : lsrc->f32;
    // ... (interleave call elided)
  }
  else
  { // width == 1
    if (even)
      dst->i32[0] = lsrc->i32[0];
    else
      dst->i32[0] = hsrc->i32[0] >> 1;
  }
}
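
// And its 64-bit counterpart; the general lifting case is scalar, and
// the final interleave again works on 64-bit lanes via double pointers.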
                        ui32 width, bool even)
{
  if (width > 1)
  {
    si64* oth = hsrc->i64, * aug = lsrc->i64;
    ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low-pass width
    ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high-pass width
    for (ui32 j = 0; j < num_steps; ++j)
    {
      // ... (per-step integer coefficients a, b, e elided)
      __m512i vb = _mm512_set1_epi64(b);

      // boundary extension (the matching left extension is elided)
      oth[oth_width] = oth[oth_width - 1];
      // lifting step (inverse: subtract what analysis added)
      const si64* sp = oth;
      si64* dp = aug;
      if (a == 1)
      { // a == 1: no multiply needed
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_add_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3-style predict: (1 - t) >> 1 == -(t >> 1)
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i w = _mm512_srai_epi64(t, e);
            d = _mm512_add_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else if (a == -1)
      { // a == -1 with general b and e: compute (b - t) >> e
        int i = (int)aug_width;
        if (even)
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
        else
          for (; i > 0; i -= 8, sp += 8, dp += 8)
          {
            __m512i s1 = _mm512_load_si512((__m512i*)sp);
            __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
            __m512i d = _mm512_load_si512((__m512i*)dp);
            __m512i t = _mm512_add_epi64(s1, s2);
            __m512i v = _mm512_sub_epi64(vb, t);
            __m512i w = _mm512_srai_epi64(v, e);
            d = _mm512_sub_epi64(d, w);
            _mm512_store_si512((__m512i*)dp, d);
          }
      }
      else
      { // general case: scalar fallback (no 64-bit vector multiply)
        if (even)
          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
            *dp -= (b + a * (sp[-1] + sp[0])) >> e;
        else
          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
            *dp -= (b + a * (sp[0] + sp[1])) >> e;
      }
      // ...

      // swap buffers and parity for the next step
      si64* t = aug; aug = oth; oth = t;
      even = !even;
      ui32 w = aug_width; aug_width = oth_width; oth_width = w;
    }

    // merge back into the destination line (64-bit lanes as doubles)
    double* dp = (double*)(dst->p);
    double* spl = (double*)(even ? lsrc->p : hsrc->p);
    double* sph = (double*)(even ? hsrc->p : lsrc->p);
    // ... (interleave call elided)
  }
  else
  { // width == 1
    if (even)
      dst->i64[0] = lsrc->i64[0];
    else
      dst->i64[0] = hsrc->i64[0] >> 1;
  }
}