Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/.
qdrawhelper_avx2.cpp
1// Copyright (C) 2018 The Qt Company Ltd.
2// Copyright (C) 2018 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qdrawhelper_p.h"
6#include "qdrawhelper_x86_p.h"
7#include "qdrawingprimitive_sse2_p.h"
8#include "qpixellayout_p.h"
9#include "qrgba64_p.h"
10
11#if defined(QT_COMPILER_SUPPORTS_AVX2)
12
13QT_BEGIN_NAMESPACE
14
15enum {
16 FixedScale = 1 << 16,
17 HalfPoint = 1 << 15
18};
19
20// Vectorized blend functions:
21
22// See BYTE_MUL_SSE2 for details.
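// BYTE_MUL_AVX2 multiplies every 8-bit channel of 8 ARGB32 pixels by a per-pixel
// alpha (0..255) and divides by 255 using the usual approximation
//     t = c * a;  result = (t + (t >> 8) + 0x80) >> 8
// The red/blue bytes are handled in the low byte and the alpha/green bytes in the
// high byte of each 16-bit lane, so a single 16-bit multiply covers two channels.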
23inline static void Q_DECL_VECTORCALL
24BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
25{
26 __m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, 8);
27 __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
28
29 pixelVectorAG = _mm256_mullo_epi16(pixelVectorAG, alphaChannel);
30 pixelVectorRB = _mm256_mullo_epi16(pixelVectorRB, alphaChannel);
31
32 pixelVectorRB = _mm256_add_epi16(pixelVectorRB, _mm256_srli_epi16(pixelVectorRB, 8));
33 pixelVectorAG = _mm256_add_epi16(pixelVectorAG, _mm256_srli_epi16(pixelVectorAG, 8));
34 pixelVectorRB = _mm256_add_epi16(pixelVectorRB, half);
35 pixelVectorAG = _mm256_add_epi16(pixelVectorAG, half);
36
37 pixelVectorRB = _mm256_srli_epi16(pixelVectorRB, 8);
38 pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
39
40 pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
41}
42
43inline static void Q_DECL_VECTORCALL
44BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
45{
46 __m256i pixelVectorAG = _mm256_srli_epi32(pixelVector, 16);
47 __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
48
49 pixelVectorAG = _mm256_mullo_epi32(pixelVectorAG, alphaChannel);
50 pixelVectorRB = _mm256_mullo_epi32(pixelVectorRB, alphaChannel);
51
52 pixelVectorRB = _mm256_add_epi32(pixelVectorRB, _mm256_srli_epi32(pixelVectorRB, 16));
53 pixelVectorAG = _mm256_add_epi32(pixelVectorAG, _mm256_srli_epi32(pixelVectorAG, 16));
54 pixelVectorRB = _mm256_add_epi32(pixelVectorRB, half);
55 pixelVectorAG = _mm256_add_epi32(pixelVectorAG, half);
56
57 pixelVectorRB = _mm256_srli_epi32(pixelVectorRB, 16);
58 pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
59
60 pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
61}
62
63// See INTERPOLATE_PIXEL_255_SSE2 for details.
64inline static void Q_DECL_VECTORCALL
65INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
66{
67 const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, 8);
68 const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, 8);
69 const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
70 const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
71 const __m256i srcVectorAGalpha = _mm256_mullo_epi16(srcVectorAG, alphaChannel);
72 const __m256i srcVectorRBalpha = _mm256_mullo_epi16(srcVectorRB, alphaChannel);
73 const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(dstVectorAG, oneMinusAlphaChannel);
74 const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(dstVectorRB, oneMinusAlphaChannel);
75 __m256i finalAG = _mm256_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
76 __m256i finalRB = _mm256_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
77 finalAG = _mm256_add_epi16(finalAG, _mm256_srli_epi16(finalAG, 8));
78 finalRB = _mm256_add_epi16(finalRB, _mm256_srli_epi16(finalRB, 8));
79 finalAG = _mm256_add_epi16(finalAG, half);
80 finalRB = _mm256_add_epi16(finalRB, half);
81 finalAG = _mm256_andnot_si256(colorMask, finalAG);
82 finalRB = _mm256_srli_epi16(finalRB, 8);
83
84 dstVector = _mm256_or_si256(finalAG, finalRB);
85}
86
87inline static void Q_DECL_VECTORCALL
88INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
89{
90 const __m256i srcVectorAG = _mm256_srli_epi32(srcVector, 16);
91 const __m256i dstVectorAG = _mm256_srli_epi32(dstVector, 16);
92 const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
93 const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
94 const __m256i srcVectorAGalpha = _mm256_mullo_epi32(srcVectorAG, alphaChannel);
95 const __m256i srcVectorRBalpha = _mm256_mullo_epi32(srcVectorRB, alphaChannel);
96 const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(dstVectorAG, oneMinusAlphaChannel);
97 const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(dstVectorRB, oneMinusAlphaChannel);
98 __m256i finalAG = _mm256_add_epi32(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
99 __m256i finalRB = _mm256_add_epi32(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
100 finalAG = _mm256_add_epi32(finalAG, _mm256_srli_epi32(finalAG, 16));
101 finalRB = _mm256_add_epi32(finalRB, _mm256_srli_epi32(finalRB, 16));
102 finalAG = _mm256_add_epi32(finalAG, half);
103 finalRB = _mm256_add_epi32(finalRB, half);
104 finalAG = _mm256_andnot_si256(colorMask, finalAG);
105 finalRB = _mm256_srli_epi32(finalRB, 16);
106
107 dstVector = _mm256_or_si256(finalAG, finalRB);
108}
109
110// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
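// Source-over blending, 8 ARGB32 pixels per iteration. A masked prologue processes
// pixels until dst reaches 32-byte alignment, the main loop uses aligned loads and
// stores, and a masked epilogue covers the tail. Within each vector,
// _mm256_testz_si256 skips fully transparent sources and _mm256_testc_si256 copies
// fully opaque ones; otherwise dst = src + BYTE_MUL(dst, 255 - src.alpha).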
111inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length)
112{
113 const __m256i half = _mm256_set1_epi16(0x80);
114 const __m256i one = _mm256_set1_epi16(0xff);
115 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
116 const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
117 const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
118 const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
119 char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
120
121 const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;
122
123 int x = 0;
124 // Prologue to handle all pixels until dst is 32-byte aligned in one step.
125 if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
126 const __m256i prologueMask = _mm256_sub_epi32(_mm256_set1_epi32(minusOffsetToAlignDstOn32Bytes - 1), offsetMask);
127 const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
128 const __m256i prologueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, prologueMask);
129 if (!_mm256_testz_si256(srcVector, prologueAlphaMask)) {
130 if (_mm256_testc_si256(srcVector, prologueAlphaMask)) {
131 _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, srcVector);
132 } else {
133 __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
134 alphaChannel = _mm256_sub_epi16(one, alphaChannel);
135 __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
136 BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
137 dstVector = _mm256_add_epi8(dstVector, srcVector);
138 _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, dstVector);
139 }
140 }
141 x += (8 - minusOffsetToAlignDstOn32Bytes);
142 }
143
144 for (; x < (length - 7); x += 8) {
145 const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
146 if (!_mm256_testz_si256(srcVector, alphaMask)) {
147 if (_mm256_testc_si256(srcVector, alphaMask)) {
148 _mm256_store_si256((__m256i *)&dst[x], srcVector);
149 } else {
150 __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
151 alphaChannel = _mm256_sub_epi16(one, alphaChannel);
152 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
153 BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
154 dstVector = _mm256_add_epi8(dstVector, srcVector);
155 _mm256_store_si256((__m256i *)&dst[x], dstVector);
156 }
157 }
158 }
159
160 // Epilogue to handle all remaining pixels in one step.
161 if (x < length) {
162 const __m256i epilogueMask = _mm256_add_epi32(offsetMask, _mm256_set1_epi32(x - length));
163 const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x], epilogueMask);
164 const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
165 if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
166 if (_mm256_testc_si256(srcVector, epilogueAlphaMask)) {
167 _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, srcVector);
168 } else {
169 __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
170 alphaChannel = _mm256_sub_epi16(one, alphaChannel);
171 __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x], epilogueMask);
172 BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
173 dstVector = _mm256_add_epi8(dstVector, srcVector);
174 _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, dstVector);
175 }
176 }
177 }
178}
179
180
181// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
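// As above, but the source is first scaled by const_alpha: src' = src * const_alpha / 255,
// then dst = src' + BYTE_MUL(dst, 255 - src'.alpha). The alignment prologue and the
// epilogue fall back to the scalar blend_pixel().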
182inline static void Q_DECL_VECTORCALL
183BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
184{
185 int x = 0;
186
187 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
188 blend_pixel(dst[x], src[x], const_alpha);
189
190 const __m256i half = _mm256_set1_epi16(0x80);
191 const __m256i one = _mm256_set1_epi16(0xff);
192 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
193 const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
194 const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
195 char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
196 const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
197 for (; x < (length - 7); x += 8) {
198 __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
199 if (!_mm256_testz_si256(srcVector, alphaMask)) {
200 BYTE_MUL_AVX2(srcVector, constAlphaVector, colorMask, half);
201
202 __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
203 alphaChannel = _mm256_sub_epi16(one, alphaChannel);
204 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
205 BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
206 dstVector = _mm256_add_epi8(dstVector, srcVector);
207 _mm256_store_si256((__m256i *)&dst[x], dstVector);
208 }
209 }
210 SIMD_EPILOGUE(x, length, 7)
211 blend_pixel(dst[x], src[x], const_alpha);
212}
213
214void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
215 const uchar *srcPixels, int sbpl,
216 int w, int h,
217 int const_alpha)
218{
219 if (const_alpha == 256) {
220 for (int y = 0; y < h; ++y) {
221 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
222 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
223 BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, w);
224 destPixels += dbpl;
225 srcPixels += sbpl;
226 }
227 } else if (const_alpha != 0) {
228 const_alpha = (const_alpha * 255) >> 8;
229 for (int y = 0; y < h; ++y) {
230 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
231 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
232 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, w, const_alpha);
233 destPixels += dbpl;
234 srcPixels += sbpl;
235 }
236 }
237}
238
239void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
240 const uchar *srcPixels, int sbpl,
241 int w, int h,
242 int const_alpha)
243{
244 if (const_alpha == 256) {
245 for (int y = 0; y < h; ++y) {
246 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
247 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
248 ::memcpy(dst, src, w * sizeof(uint));
249 srcPixels += sbpl;
250 destPixels += dbpl;
251 }
252 return;
253 }
254 if (const_alpha == 0)
255 return;
256
257 const __m256i half = _mm256_set1_epi16(0x80);
258 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
259
260 const_alpha = (const_alpha * 255) >> 8;
261 int one_minus_const_alpha = 255 - const_alpha;
262 const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
263 const __m256i oneMinusConstAlpha = _mm256_set1_epi16(one_minus_const_alpha);
264 for (int y = 0; y < h; ++y) {
265 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
266 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
267 int x = 0;
268
269 // First, align dest to 32 bytes:
270 ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
271 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
272
273 // 2) interpolate pixels with AVX2
274 for (; x < (w - 7); x += 8) {
275 const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
276 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
277 INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
278 _mm256_store_si256((__m256i *)&dst[x], dstVector);
279 }
280
281 // 3) Epilogue
282 SIMD_EPILOGUE(x, w, 7)
283 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
284
285 srcPixels += sbpl;
286 destPixels += dbpl;
287 }
288}
289
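// qt_memfillXX_avx2 fills `bytes` bytes at dest with the repeating 32-byte pattern in
// value256. The main loop issues four unaligned 32-byte stores (128 bytes) per
// iteration; the epilogues then cover the remaining full 32-byte chunks, an optional
// 16-byte chunk, and an optional 8-byte chunk written at end - 8. Both callers below
// pass a byte count that is a multiple of 8, so smaller tails never occur.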
290static Q_NEVER_INLINE
291void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes)
292{
293 __m128i value128 = _mm256_castsi256_si128(value256);
294
295 // main body
296 __m256i *dst256 = reinterpret_cast<__m256i *>(dest);
297 uchar *end = dest + bytes;
298 while (reinterpret_cast<uchar *>(dst256 + 4) <= end) {
299 _mm256_storeu_si256(dst256 + 0, value256);
300 _mm256_storeu_si256(dst256 + 1, value256);
301 _mm256_storeu_si256(dst256 + 2, value256);
302 _mm256_storeu_si256(dst256 + 3, value256);
303 dst256 += 4;
304 }
305
306 // first epilogue: fewer than 128 bytes / 32 entries
307 bytes = end - reinterpret_cast<uchar *>(dst256);
308 switch (bytes / sizeof(value256)) {
309 case 3: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH();
310 case 2: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH();
311 case 1: _mm256_storeu_si256(dst256++, value256);
312 }
313
314 // second epilogue: fewer than 32 bytes
315 __m128i *dst128 = reinterpret_cast<__m128i *>(dst256);
316 if (bytes & sizeof(value128))
317 _mm_storeu_si128(dst128++, value128);
318
319 // third epilogue: fewer than 16 bytes
320 if (bytes & 8)
321 _mm_storel_epi64(reinterpret_cast<__m128i *>(end - 8), value128);
322}
323
324void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
325{
326#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
327 // work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820
328 __m128i value64 = _mm_set_epi64x(0, value); // _mm_cvtsi64_si128(value);
329# ifdef Q_PROCESSOR_X86_64
330 asm ("" : "+x" (value64));
331# endif
332 __m256i value256 = _mm256_broadcastq_epi64(value64);
333#else
334 __m256i value256 = _mm256_set1_epi64x(value);
335#endif
336
337 qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), value256, count * sizeof(quint64));
338}
339
340void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count)
341{
342 if (count % 2) {
343 // odd number of pixels, round to even
344 *dest++ = value;
345 --count;
346 }
347 qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), _mm256_set1_epi32(value), count * sizeof(quint32));
348}
349
350void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
351{
352 Q_ASSERT(const_alpha < 256);
353
354 const quint32 *src = (const quint32 *) srcPixels;
355 quint32 *dst = (quint32 *) destPixels;
356
357 if (const_alpha == 255)
358 BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
359 else
360 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
361}
362
363#if QT_CONFIG(raster_64bit)
364void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 *dst, const QRgba64 *src, int length, uint const_alpha)
365{
366 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
367 const __m256i half = _mm256_set1_epi32(0x8000);
368 const __m256i one = _mm256_set1_epi32(0xffff);
369 const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
370 __m256i alphaMask = _mm256_set1_epi32(0xff000000);
371 alphaMask = _mm256_unpacklo_epi8(alphaMask, alphaMask);
372 const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),char(0xff),15,14,char(0xff),char(0xff),15,14,char(0xff),char(0xff),7,6,char(0xff),char(0xff),7,6,
373 char(0xff),char(0xff),15,14,char(0xff),char(0xff),15,14,char(0xff),char(0xff),7,6,char(0xff),char(0xff),7,6);
374
375 if (const_alpha == 255) {
376 int x = 0;
377 for (; x < length && (quintptr(dst + x) & 31); ++x)
378 blend_pixel(dst[x], src[x]);
379 for (; x < length - 3; x += 4) {
380 const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
381 if (!_mm256_testz_si256(srcVector, alphaMask)) {
382 // Not all transparent
383 if (_mm256_testc_si256(srcVector, alphaMask)) {
384 // All opaque
385 _mm256_store_si256((__m256i *)&dst[x], srcVector);
386 } else {
387 __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
388 alphaChannel = _mm256_sub_epi32(one, alphaChannel);
389 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
390 BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
391 dstVector = _mm256_add_epi16(dstVector, srcVector);
392 _mm256_store_si256((__m256i *)&dst[x], dstVector);
393 }
394 }
395 }
396 SIMD_EPILOGUE(x, length, 3)
397 blend_pixel(dst[x], src[x]);
398 } else {
399 const __m256i constAlphaVector = _mm256_set1_epi32(const_alpha | (const_alpha << 8));
400 int x = 0;
401 for (; x < length && (quintptr(dst + x) & 31); ++x)
402 blend_pixel(dst[x], src[x], const_alpha);
403 for (; x < length - 3; x += 4) {
404 __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
405 if (!_mm256_testz_si256(srcVector, alphaMask)) {
406 // Not all transparent
407 BYTE_MUL_RGB64_AVX2(srcVector, constAlphaVector, colorMask, half);
408
409 __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
410 alphaChannel = _mm256_sub_epi32(one, alphaChannel);
411 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
412 BYTE_MUL_RGB64_AVX2(dstVector, alphaChannel, colorMask, half);
413 dstVector = _mm256_add_epi16(dstVector, srcVector);
414 _mm256_store_si256((__m256i *)&dst[x], dstVector);
415 }
416 }
417 SIMD_EPILOGUE(x, length, 3)
418 blend_pixel(dst[x], src[x], const_alpha);
419 }
420}
421#endif
422
423#if QT_CONFIG(raster_fp)
424void QT_FASTCALL comp_func_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, const QRgbaFloat32 *src, int length, uint const_alpha)
425{
426 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
427
428 const float a = const_alpha / 255.0f;
429 const __m128 one = _mm_set1_ps(1.0f);
430 const __m128 constAlphaVector = _mm_set1_ps(a);
431 const __m256 one256 = _mm256_set1_ps(1.0f);
432 const __m256 constAlphaVector256 = _mm256_set1_ps(a);
433 int x = 0;
434 for (; x < length - 1; x += 2) {
435 __m256 srcVector = _mm256_loadu_ps((const float *)&src[x]);
436 __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
437 srcVector = _mm256_mul_ps(srcVector, constAlphaVector256);
438 __m256 alphaChannel = _mm256_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3));
439 alphaChannel = _mm256_sub_ps(one256, alphaChannel);
440 dstVector = _mm256_mul_ps(dstVector, alphaChannel);
441 dstVector = _mm256_add_ps(dstVector, srcVector);
442 _mm256_storeu_ps((float *)(dst + x), dstVector);
443 }
444 if (x < length) {
445 __m128 srcVector = _mm_loadu_ps((const float *)&src[x]);
446 __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]);
447 srcVector = _mm_mul_ps(srcVector, constAlphaVector);
448 __m128 alphaChannel = _mm_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3));
449 alphaChannel = _mm_sub_ps(one, alphaChannel);
450 dstVector = _mm_mul_ps(dstVector, alphaChannel);
451 dstVector = _mm_add_ps(dstVector, srcVector);
452 _mm_storeu_ps((float *)(dst + x), dstVector);
453 }
454}
455#endif
456
457void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, uint const_alpha)
458{
459 if (const_alpha == 255) {
460 ::memcpy(dst, src, length * sizeof(uint));
461 } else {
462 const int ialpha = 255 - const_alpha;
463
464 int x = 0;
465
466 // 1) prologue, align on 32 bytes
467 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
468 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
469
470 // 2) interpolate pixels with AVX2
471 const __m256i half = _mm256_set1_epi16(0x80);
472 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
473 const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
474 const __m256i oneMinusConstAlpha = _mm256_set1_epi16(ialpha);
475 for (; x < length - 7; x += 8) {
476 const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
477 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
478 INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
479 _mm256_store_si256((__m256i *)&dst[x], dstVector);
480 }
481
482 // 3) Epilogue
483 SIMD_EPILOGUE(x, length, 7)
484 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
485 }
486}
487
488#if QT_CONFIG(raster_64bit)
489void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 *dst, const QRgba64 *src, int length, uint const_alpha)
490{
491 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
492 if (const_alpha == 255) {
493 ::memcpy(dst, src, length * sizeof(QRgba64));
494 } else {
495 const uint ca = const_alpha | (const_alpha << 8); // adjust to [0-65535]
496 const uint cia = 65535 - ca;
497
498 int x = 0;
499
500 // 1) prologue, align on 32 bytes
501 for (; x < length && (quintptr(dst + x) & 31); ++x)
502 dst[x] = interpolate65535(src[x], ca, dst[x], cia);
503
504 // 2) interpolate pixels with AVX2
505 const __m256i half = _mm256_set1_epi32(0x8000);
506 const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
507 const __m256i constAlphaVector = _mm256_set1_epi32(ca);
508 const __m256i oneMinusConstAlpha = _mm256_set1_epi32(cia);
509 for (; x < length - 3; x += 4) {
510 const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
511 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
512 INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
513 _mm256_store_si256((__m256i *)&dst[x], dstVector);
514 }
515
516 // 3) Epilogue
517 SIMD_EPILOGUE(x, length, 3)
518 dst[x] = interpolate65535(src[x], ca, dst[x], cia);
519 }
520}
521#endif
522
523#if QT_CONFIG(raster_fp)
524void QT_FASTCALL comp_func_Source_rgbafp_avx2(QRgbaFloat32 *dst, const QRgbaFloat32 *src, int length, uint const_alpha)
525{
526 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
527 if (const_alpha == 255) {
528 ::memcpy(dst, src, length * sizeof(QRgbaFloat32));
529 } else {
530 const float ca = const_alpha / 255.f;
531 const float cia = 1.0f - ca;
532
533 const __m128 constAlphaVector = _mm_set1_ps(ca);
534 const __m128 oneMinusConstAlpha = _mm_set1_ps(cia);
535 const __m256 constAlphaVector256 = _mm256_set1_ps(ca);
536 const __m256 oneMinusConstAlpha256 = _mm256_set1_ps(cia);
537 int x = 0;
538 for (; x < length - 1; x += 2) {
539 __m256 srcVector = _mm256_loadu_ps((const float *)&src[x]);
540 __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
541 srcVector = _mm256_mul_ps(srcVector, constAlphaVector256);
542 dstVector = _mm256_mul_ps(dstVector, oneMinusConstAlpha256);
543 dstVector = _mm256_add_ps(dstVector, srcVector);
544 _mm256_storeu_ps((float *)&dst[x], dstVector);
545 }
546 if (x < length) {
547 __m128 srcVector = _mm_loadu_ps((const float *)&src[x]);
548 __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]);
549 srcVector = _mm_mul_ps(srcVector, constAlphaVector);
550 dstVector = _mm_mul_ps(dstVector, oneMinusConstAlpha);
551 dstVector = _mm_add_ps(dstVector, srcVector);
552 _mm_storeu_ps((float *)&dst[x], dstVector);
553 }
554 }
555}
556#endif
557
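// Solid source-over: if const_alpha and the color's alpha are both 255 the span is a
// plain fill; otherwise the color is scaled by const_alpha and every destination
// pixel becomes color + BYTE_MUL(dst, 255 - color.alpha), 8 pixels at a time between
// the scalar alignment prologue and epilogue.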
558void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha)
559{
560 if ((const_alpha & qAlpha(color)) == 255) {
561 qt_memfill32(destPixels, color, length);
562 } else {
563 if (const_alpha != 255)
564 color = BYTE_MUL(color, const_alpha);
565
566 const quint32 minusAlphaOfColor = qAlpha(~color);
567 int x = 0;
568
569 quint32 *dst = (quint32 *) destPixels;
570 const __m256i colorVector = _mm256_set1_epi32(color);
571 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
572 const __m256i half = _mm256_set1_epi16(0x80);
573 const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(minusAlphaOfColor);
574
575 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
576 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
577
578 for (; x < length - 7; x += 8) {
579 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
580 BYTE_MUL_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
581 dstVector = _mm256_add_epi8(colorVector, dstVector);
582 _mm256_store_si256((__m256i *)&dst[x], dstVector);
583 }
584 SIMD_EPILOGUE(x, length, 7)
585 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
586 }
587}
588
589#if QT_CONFIG(raster_64bit)
590void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 *destPixels, int length, QRgba64 color, uint const_alpha)
591{
592 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
593 if (const_alpha == 255 && color.isOpaque()) {
594 qt_memfill64((quint64*)destPixels, color, length);
595 } else {
596 if (const_alpha != 255)
597 color = multiplyAlpha255(color, const_alpha);
598
599 const uint minusAlphaOfColor = 65535 - color.alpha();
600 int x = 0;
601 quint64 *dst = (quint64 *) destPixels;
602 const __m256i colorVector = _mm256_set1_epi64x(color);
603 const __m256i colorMask = _mm256_set1_epi32(0x0000ffff);
604 const __m256i half = _mm256_set1_epi32(0x8000);
605 const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(minusAlphaOfColor);
606
607 for (; x < length && (quintptr(dst + x) & 31); ++x)
608 destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
609
610 for (; x < length - 3; x += 4) {
611 __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
612 BYTE_MUL_RGB64_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
613 dstVector = _mm256_add_epi16(colorVector, dstVector);
614 _mm256_store_si256((__m256i *)&dst[x], dstVector);
615 }
616 SIMD_EPILOGUE(x, length, 3)
617 destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
618 }
619}
620#endif
621
622#if QT_CONFIG(raster_fp)
623void QT_FASTCALL comp_func_solid_Source_rgbafp_avx2(QRgbaFloat32 *dst, int length, QRgbaFloat32 color, uint const_alpha)
624{
625 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
626 if (const_alpha == 255) {
627 for (int i = 0; i < length; ++i)
628 dst[i] = color;
629 } else {
630 const float a = const_alpha / 255.0f;
631 const __m128 alphaVector = _mm_set1_ps(a);
632 const __m128 minusAlphaVector = _mm_set1_ps(1.0f - a);
633 __m128 colorVector = _mm_loadu_ps((const float *)&color);
634 colorVector = _mm_mul_ps(colorVector, alphaVector);
635 const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1);
636 const __m256 minusAlphaVector256 = _mm256_set1_ps(1.0f - a);
637 int x = 0;
638 for (; x < length - 1; x += 2) {
639 __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
640 dstVector = _mm256_mul_ps(dstVector, minusAlphaVector256);
641 dstVector = _mm256_add_ps(dstVector, colorVector256);
642 _mm256_storeu_ps((float *)&dst[x], dstVector);
643 }
644 if (x < length) {
645 __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]);
646 dstVector = _mm_mul_ps(dstVector, minusAlphaVector);
647 dstVector = _mm_add_ps(dstVector, colorVector);
648 _mm_storeu_ps((float *)&dst[x], dstVector);
649 }
650 }
651}
652
653void QT_FASTCALL comp_func_solid_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, int length, QRgbaFloat32 color, uint const_alpha)
654{
655 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
656 if (const_alpha == 255 && color.a >= 1.0f) {
657 for (int i = 0; i < length; ++i)
658 dst[i] = color;
659 } else {
660 __m128 colorVector = _mm_loadu_ps((const float *)&color);
661 if (const_alpha != 255)
662 colorVector = _mm_mul_ps(colorVector, _mm_set1_ps(const_alpha / 255.f));
663 __m128 minusAlphaOfColorVector =
664 _mm_sub_ps(_mm_set1_ps(1.0f), _mm_permute_ps(colorVector, _MM_SHUFFLE(3, 3, 3, 3)));
665 const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1);
666 const __m256 minusAlphaVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(minusAlphaOfColorVector),
667 minusAlphaOfColorVector, 1);
668 int x = 0;
669 for (; x < length - 1; x += 2) {
670 __m256 dstVector = _mm256_loadu_ps((const float *)&dst[x]);
671 dstVector = _mm256_mul_ps(dstVector, minusAlphaVector256);
672 dstVector = _mm256_add_ps(dstVector, colorVector256);
673 _mm256_storeu_ps((float *)&dst[x], dstVector);
674 }
675 if (x < length) {
676 __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]);
677 dstVector = _mm_mul_ps(dstVector, minusAlphaOfColorVector);
678 dstVector = _mm_add_ps(dstVector, colorVector);
679 _mm_storeu_ps((float *)&dst[x], dstVector);
680 }
681 }
682}
683#endif
684
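// Bilinear interpolation of 8 pixels at once. distx/disty hold 4-bit fixed-point
// fractions (0..16), so the four corner weights
//     (16 - dx) * (16 - dy), dx * (16 - dy), (16 - dx) * dy, dx * dy
// sum to 256; after the weighted sums, dividing by 256 (a shift for RB, byte
// selection for AG) returns each channel to 8 bits. The permute4x64 shuffles before
// and after compensate for the in-lane behaviour of unpack and hadd.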
685#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b) \
686{ \
687 /* Correct for later unpack */ \
688 const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
689 const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
690 \
691 __m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); \
692 const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); \
693 const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); \
694 __m256i idxidy = _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); \
695 __m256i dxidy = _mm256_sub_epi16(distx_, dxdy); \
696 __m256i idxdy = _mm256_sub_epi16(disty_, dxdy); \
697 \
698 __m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); \
699 __m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); \
700 __m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
701 __m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
702 __m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
703 __m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
704 __m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
705 __m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
706 \
707 __m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); \
708 __m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
709 tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
710 tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
711 tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
712 tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
713 __m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); \
714 __m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
715 blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
716 blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
717 blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
718 blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
719 \
720 /* Add the values, and shift to only keep 8 significant bits per colors */ \
721 __m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
722 __m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
723 __m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
724 __m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
725 __m256i rAG = _mm256_add_epi16(topAG, botAG); \
726 __m256i rRB = _mm256_add_epi16(topRB, botRB); \
727 rRB = _mm256_srli_epi16(rRB, 8); \
728 /* Correct for hadd */ \
729 rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
730 rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
731 _mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); \
732}
733
734inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
735{
736 if (v1 < l1)
737 v2 = v1 = l1;
738 else if (v1 >= l2)
739 v2 = v1 = l2;
740 else
741 v2 = v1 + 1;
742 Q_ASSERT(v1 >= l1 && v1 <= l2);
743 Q_ASSERT(v2 >= l1 && v2 <= l2);
744}
745
746void QT_FASTCALL intermediate_adder_avx2(uint *b, uint *end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx);
747
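// Bilinear fetch for a plain scale (fdy is ignored): the two source scanlines are
// first blended vertically with disty/idisty into the intermediate RB/AG buffers,
// 8 pixels per iteration, and intermediate_adder_avx2() then walks fx/fdx to do the
// horizontal interpolation into the output span.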
748void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_avx2(uint *b, uint *end, const QTextureData &image,
749 int &fx, int &fy, int fdx, int /*fdy*/)
750{
751 int y1 = (fy >> 16);
752 int y2;
753 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
754 const uint *s1 = (const uint *)image.scanLine(y1);
755 const uint *s2 = (const uint *)image.scanLine(y2);
756
757 const int disty = (fy & 0x0000ffff) >> 8;
758 const int idisty = 256 - disty;
759 const int length = end - b;
760
761 // The intermediate buffer is generated in the positive direction
762 const int adjust = (fdx < 0) ? fdx * length : 0;
763 const int offset = (fx + adjust) >> 16;
764 int x = offset;
765
766 IntermediateBuffer intermediate;
767 // count is the size used in the intermediate_buffer.
768 int count = (qint64(length) * qAbs(fdx) + FixedScale - 1) / FixedScale + 2;
769 // length is supposed to be <= BufferSize either because data->m11 < 1 or
770 // data->m11 < 2, and any larger buffers split
771 Q_ASSERT(count <= BufferSize + 2);
772 int f = 0;
773 int lim = qMin(count, image.x2 - x);
774 if (x < image.x1) {
775 Q_ASSERT(x < image.x2);
776 uint t = s1[image.x1];
777 uint b = s2[image.x1];
778 quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
779 quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
780 do {
781 intermediate.buffer_rb[f] = rb;
782 intermediate.buffer_ag[f] = ag;
783 f++;
784 x++;
785 } while (x < image.x1 && f < lim);
786 }
787
788 const __m256i disty_ = _mm256_set1_epi16(disty);
789 const __m256i idisty_ = _mm256_set1_epi16(idisty);
790 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
791
792 lim -= 7;
793 for (; f < lim; x += 8, f += 8) {
794 // Load 8 pixels from s1, and split the alpha-green and red-blue component
795 __m256i top = _mm256_loadu_si256((const __m256i*)((const uint *)(s1)+x));
796 __m256i topAG = _mm256_srli_epi16(top, 8);
797 __m256i topRB = _mm256_and_si256(top, colorMask);
798 // Multiplies each color component by idisty
799 topAG = _mm256_mullo_epi16 (topAG, idisty_);
800 topRB = _mm256_mullo_epi16 (topRB, idisty_);
801
802 // Same for the s2 vector
803 __m256i bottom = _mm256_loadu_si256((const __m256i*)((const uint *)(s2)+x));
804 __m256i bottomAG = _mm256_srli_epi16(bottom, 8);
805 __m256i bottomRB = _mm256_and_si256(bottom, colorMask);
806 bottomAG = _mm256_mullo_epi16 (bottomAG, disty_);
807 bottomRB = _mm256_mullo_epi16 (bottomRB, disty_);
808
809 // Add the values, and shift to only keep 8 significant bits per colors
810 __m256i rAG =_mm256_add_epi16(topAG, bottomAG);
811 rAG = _mm256_srli_epi16(rAG, 8);
812 _mm256_storeu_si256((__m256i*)(&intermediate.buffer_ag[f]), rAG);
813 __m256i rRB =_mm256_add_epi16(topRB, bottomRB);
814 rRB = _mm256_srli_epi16(rRB, 8);
815 _mm256_storeu_si256((__m256i*)(&intermediate.buffer_rb[f]), rRB);
816 }
817
818 for (; f < count; f++) { // Same as above but without simd
819 x = qMin(x, image.x2 - 1);
820
821 uint t = s1[x];
822 uint b = s2[x];
823
824 intermediate.buffer_rb[f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
825 intermediate.buffer_ag[f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
826 x++;
827 }
828
829 // Now interpolate the values from the intermediate_buffer to get the final result.
830 intermediate_adder_avx2(b, end, intermediate, offset, fx, fdx);
831}
832
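// Horizontal pass for the simple-scale fetch above. Each 64-bit gather loads
// buffer[x] and buffer[x + 1] together, so one gather per RB/AG buffer provides the
// left and right neighbours of four output pixels; they are weighted with
// distx / (256 - distx) taken from the fractional bits of fx and recombined into
// ARGB with a byte blend.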
833void QT_FASTCALL intermediate_adder_avx2(uint *b, uint *end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx)
834{
835 fx -= offset * FixedScale;
836
837 const __m128i v_fdx = _mm_set1_epi32(fdx * 4);
838 const __m128i v_blend = _mm_set1_epi32(0x00800080);
839 const __m128i vdx_shuffle = _mm_set_epi8(char(0x80), 13, char(0x80), 13, char(0x80), 9, char(0x80), 9,
840 char(0x80), 5, char(0x80), 5, char(0x80), 1, char(0x80), 1);
841 __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
842
843 while (b < end - 3) {
844 const __m128i offset = _mm_srli_epi32(v_fx, 16);
845 __m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate.buffer_rb, offset, 4);
846 __m256i vag = _mm256_i32gather_epi64((const long long *)intermediate.buffer_ag, offset, 4);
847
848 __m128i vdx = _mm_shuffle_epi8(v_fx, vdx_shuffle);
849 __m128i vidx = _mm_sub_epi16(_mm_set1_epi16(256), vdx);
850 __m256i vmulx = _mm256_castsi128_si256(_mm_unpacklo_epi32(vidx, vdx));
851 vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), 1);
852
853 vrb = _mm256_mullo_epi16(vrb, vmulx);
854 vag = _mm256_mullo_epi16(vag, vmulx);
855
856 __m256i vrbag = _mm256_hadd_epi32(vrb, vag);
857 vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(3, 1, 2, 0));
858
859 __m128i rb = _mm256_castsi256_si128(vrbag);
860 __m128i ag = _mm256_extracti128_si256(vrbag, 1);
861 rb = _mm_srli_epi16(rb, 8);
862
863 _mm_storeu_si128((__m128i*)b, _mm_blendv_epi8(ag, rb, v_blend));
864
865 b += 4;
866 v_fx = _mm_add_epi32(v_fx, v_fdx);
867 }
868 fx = _mm_cvtsi128_si32(v_fx);
869 while (b < end) {
870 const int x = (fx >> 16);
871
872 const uint distx = (fx & 0x0000ffff) >> 8;
873 const uint idistx = 256 - distx;
874 const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + 1] * distx) & 0xff00ff00;
875 const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + 1] * distx) & 0xff00ff00;
876 *b = (rb >> 8) | ag;
877 b++;
878 fx += fdx;
879 }
880 fx += offset * FixedScale;
881}
882
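// Bilinear fetch for downscaling. Scalar loops at both ends clamp against the image
// edges; in between, boundedEnd is chosen so that fx stays inside
// [image.x1, image.x2 - 1], letting the vector loop gather the left/right texel pairs
// of 8 output pixels from both scanlines (64-bit gathers) and hand them, with the
// 4-bit distx/disty weights, to interpolate_4_pixels_16_avx2().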
883void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
884 int &fx, int &fy, int fdx, int /*fdy*/)
885{
886 int y1 = (fy >> 16);
887 int y2;
888 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
889 const uint *s1 = (const uint *)image.scanLine(y1);
890 const uint *s2 = (const uint *)image.scanLine(y2);
891 const int disty8 = (fy & 0x0000ffff) >> 8;
892 const int disty4 = (disty8 + 0x08) >> 4;
893
894 const qint64 min_fx = qint64(image.x1) * FixedScale;
895 const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
896 while (b < end) {
897 int x1 = (fx >> 16);
898 int x2;
899 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
900 if (x1 != x2)
901 break;
902 uint top = s1[x1];
903 uint bot = s2[x1];
904 *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8);
905 fx += fdx;
906 ++b;
907 }
908 uint *boundedEnd = end;
909 if (fdx > 0)
910 boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
911 else if (fdx < 0)
912 boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
913
914 // A fast middle part without boundary checks
915 const __m256i vdistShuffle =
916 _mm256_setr_epi8(0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80),
917 0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80));
918 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
919 const __m256i v_256 = _mm256_set1_epi16(256);
920 const __m256i v_disty = _mm256_set1_epi16(disty4);
921 const __m256i v_fdx = _mm256_set1_epi32(fdx * 8);
922 const __m256i v_fx_r = _mm256_set1_epi32(0x08);
923 const __m256i v_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
924 __m256i v_fx = _mm256_set1_epi32(fx);
925 v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index));
926
927 while (b < boundedEnd - 7) {
928 const __m256i offset = _mm256_srli_epi32(v_fx, 16);
929 const __m128i offsetLo = _mm256_castsi256_si128(offset);
930 const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
931 const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, 4);
932 const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, 4);
933 const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, 4);
934 const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, 4);
935
936 __m256i v_distx = _mm256_srli_epi16(v_fx, 8);
937 v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fx_r), 4);
938 v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle);
939
940 interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
941 b += 8;
942 v_fx = _mm256_add_epi32(v_fx, v_fdx);
943 }
944 fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , 0);
945
946 while (b < boundedEnd) {
947 int x = (fx >> 16);
948 int distx8 = (fx & 0x0000ffff) >> 8;
949 *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
950 fx += fdx;
951 ++b;
952 }
953
954 while (b < end) {
955 int x1 = (fx >> 16);
956 int x2;
957 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
958 uint tl = s1[x1];
959 uint tr = s1[x2];
960 uint bl = s2[x1];
961 uint br = s2[x2];
962 int distx8 = (fx & 0x0000ffff) >> 8;
963 *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
964 fx += fdx;
965 ++b;
966 }
967}
968
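// Like the downscale helper, but with both fx and fy stepping. Inside the bounded
// middle part the per-pixel scanline offset is computed in-register as
// y * (bytesPerLine / 4) + x using a 16x16 -> 32 bit multiply, so the top and bottom
// rows can be gathered straight from image.imageData and imageData + bytesPerLine.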
969void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *b, uint *end, const QTextureData &image,
970 int &fx, int &fy, int fdx, int fdy)
971{
972 const qint64 min_fx = qint64(image.x1) * FixedScale;
973 const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
974 const qint64 min_fy = qint64(image.y1) * FixedScale;
975 const qint64 max_fy = qint64(image.y2 - 1) * FixedScale;
976 // first handle the possibly bounded part in the beginning
977 while (b < end) {
978 int x1 = (fx >> 16);
979 int x2;
980 int y1 = (fy >> 16);
981 int y2;
982 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
983 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
984 if (x1 != x2 && y1 != y2)
985 break;
986 const uint *s1 = (const uint *)image.scanLine(y1);
987 const uint *s2 = (const uint *)image.scanLine(y2);
988 uint tl = s1[x1];
989 uint tr = s1[x2];
990 uint bl = s2[x1];
991 uint br = s2[x2];
992 int distx = (fx & 0x0000ffff) >> 8;
993 int disty = (fy & 0x0000ffff) >> 8;
994 *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
995 fx += fdx;
996 fy += fdy;
997 ++b;
998 }
999 uint *boundedEnd = end;
1000 if (fdx > 0)
1001 boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
1002 else if (fdx < 0)
1003 boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
1004 if (fdy > 0)
1005 boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);
1006 else if (fdy < 0)
1007 boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);
1008
1009 // until boundedEnd we can now have a fast middle part without boundary checks
1010 const __m256i vdistShuffle =
1011 _mm256_setr_epi8(0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80),
1012 0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80));
1013 const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
1014 const __m256i v_256 = _mm256_set1_epi16(256);
1015 const __m256i v_fdx = _mm256_set1_epi32(fdx * 8);
1016 const __m256i v_fdy = _mm256_set1_epi32(fdy * 8);
1017 const __m256i v_fxy_r = _mm256_set1_epi32(0x08);
1018 const __m256i v_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
1019 __m256i v_fx = _mm256_set1_epi32(fx);
1020 __m256i v_fy = _mm256_set1_epi32(fy);
1021 v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index));
1022 v_fy = _mm256_add_epi32(v_fy, _mm256_mullo_epi32(_mm256_set1_epi32(fdy), v_index));
1023
1024 const uchar *textureData = image.imageData;
1025 const qsizetype bytesPerLine = image.bytesPerLine;
1026 const __m256i vbpl = _mm256_set1_epi16(bytesPerLine/4);
1027
1028 while (b < boundedEnd - 7) {
1029 const __m256i vy = _mm256_packs_epi32(_mm256_srli_epi32(v_fy, 16), _mm256_setzero_si256());
1030 // 8x16bit * 8x16bit -> 8x32bit
1031 __m256i offset = _mm256_unpacklo_epi16(_mm256_mullo_epi16(vy, vbpl), _mm256_mulhi_epi16(vy, vbpl));
1032 offset = _mm256_add_epi32(offset, _mm256_srli_epi32(v_fx, 16));
1033 const __m128i offsetLo = _mm256_castsi256_si128(offset);
1034 const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
1035 const uint *topData = (const uint *)(textureData);
1036 const uint *botData = (const uint *)(textureData + bytesPerLine);
1037 const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, 4);
1038 const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, 4);
1039 const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, 4);
1040 const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, 4);
1041
1042 __m256i v_distx = _mm256_srli_epi16(v_fx, 8);
1043 __m256i v_disty = _mm256_srli_epi16(v_fy, 8);
1044 v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fxy_r), 4);
1045 v_disty = _mm256_srli_epi16(_mm256_add_epi32(v_disty, v_fxy_r), 4);
1046 v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle);
1047 v_disty = _mm256_shuffle_epi8(v_disty, vdistShuffle);
1048
1049 interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
1050 b += 8;
1051 v_fx = _mm256_add_epi32(v_fx, v_fdx);
1052 v_fy = _mm256_add_epi32(v_fy, v_fdy);
1053 }
1054 fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , 0);
1055 fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy) , 0);
1056
1057 while (b < boundedEnd) {
1058 int x = (fx >> 16);
1059 int y = (fy >> 16);
1060
1061 const uint *s1 = (const uint *)image.scanLine(y);
1062 const uint *s2 = (const uint *)image.scanLine(y + 1);
1063
1064 int distx = (fx & 0x0000ffff) >> 8;
1065 int disty = (fy & 0x0000ffff) >> 8;
1066 *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
1067
1068 fx += fdx;
1069 fy += fdy;
1070 ++b;
1071 }
1072
1073 while (b < end) {
1074 int x1 = (fx >> 16);
1075 int x2;
1076 int y1 = (fy >> 16);
1077 int y2;
1078
1079 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
1080 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
1081
1082 const uint *s1 = (const uint *)image.scanLine(y1);
1083 const uint *s2 = (const uint *)image.scanLine(y2);
1084
1085 uint tl = s1[x1];
1086 uint tr = s1[x2];
1087 uint bl = s2[x1];
1088 uint br = s2[x2];
1089
1090 int distx = (fx & 0x0000ffff) >> 8;
1091 int disty = (fy & 0x0000ffff) >> 8;
1092 *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
1093
1094 fx += fdx;
1095 fy += fdy;
1096 ++b;
1097 }
1098}
1099
1100static inline __m256i epilogueMaskFromCount(qsizetype count)
1101{
1102 Q_ASSERT(count > 0);
1103 static const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
1104 return _mm256_add_epi32(offsetMask, _mm256_set1_epi32(-count));
1105}
1106
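// Premultiplies 8 ARGB32 (or RGBA8888 when RGBA is true) pixels per iteration: the
// channels are widened to 16 bits, multiplied by their alpha and divided by 255 with
// the (t + (t >> 8) + 0x80) >> 8 approximation, while the 0x88 blend keeps the alpha
// itself unchanged. testz/testc short-circuit all-transparent and all-opaque vectors,
// and the tail uses maskload/maskstore driven by epilogueMaskFromCount().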
1107template<bool RGBA>
1108static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype count)
1109{
1110 qsizetype i = 0;
1111 const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
1112 const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
1113 const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15));
1114 const __m256i half = _mm256_set1_epi16(0x0080);
1115 const __m256i zero = _mm256_setzero_si256();
1116
1117 for (; i < count - 7; i += 8) {
1118 __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
1119 if (!_mm256_testz_si256(srcVector, alphaMask)) {
1120 // keep the two _mm_test[zc]_siXXX next to each other
1121 bool cf = _mm256_testc_si256(srcVector, alphaMask);
1122 if (RGBA)
1123 srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1124 if (!cf) {
1125 __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
1126 __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
1127 __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1128 __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1129 src1 = _mm256_mullo_epi16(src1, alpha1);
1130 src2 = _mm256_mullo_epi16(src2, alpha2);
1131 src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8));
1132 src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8));
1133 src1 = _mm256_add_epi16(src1, half);
1134 src2 = _mm256_add_epi16(src2, half);
1135 src1 = _mm256_srli_epi16(src1, 8);
1136 src2 = _mm256_srli_epi16(src2, 8);
1137 src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
1138 src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
1139 srcVector = _mm256_packus_epi16(src1, src2);
1140 _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
1141 } else {
1142 if (buffer != src || RGBA)
1143 _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
1144 }
1145 } else {
1146 _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), zero);
1147 }
1148 }
1149
1150 if (i < count) {
1151 const __m256i epilogueMask = epilogueMaskFromCount(count - i);
1152 __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
1153 const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
1154
1155 if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
1156 // keep the two _mm_test[zc]_siXXX next to each other
1157 bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
1158 if (RGBA)
1159 srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1160 if (!cf) {
1161 __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
1162 __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
1163 __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1164 __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1165 src1 = _mm256_mullo_epi16(src1, alpha1);
1166 src2 = _mm256_mullo_epi16(src2, alpha2);
1167 src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8));
1168 src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8));
1169 src1 = _mm256_add_epi16(src1, half);
1170 src2 = _mm256_add_epi16(src2, half);
1171 src1 = _mm256_srli_epi16(src1, 8);
1172 src2 = _mm256_srli_epi16(src2, 8);
1173 src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
1174 src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
1175 srcVector = _mm256_packus_epi16(src1, src2);
1176 _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
1177 } else {
1178 if (buffer != src || RGBA)
1179 _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
1180 }
1181 } else {
1182 _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, zero);
1183 }
1184 }
1185}
1186
1187void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, int count, const QList<QRgb> *)
1188{
1189 convertARGBToARGB32PM_avx2<false>(buffer, buffer, count);
1190}
1191
1192void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, int count, const QList<QRgb> *)
1193{
1194 convertARGBToARGB32PM_avx2<true>(buffer, buffer, count);
1195}
1196
1197const uint *QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count,
1198 const QList<QRgb> *, QDitherInfo *)
1199{
1200 convertARGBToARGB32PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1201 return buffer;
1202}
1203
1204const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count,
1205 const QList<QRgb> *, QDitherInfo *)
1206{
1207 convertARGBToARGB32PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1208 return buffer;
1209}
1210
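// Same conversion into 16 bits per channel: bytes are widened by unpacking the vector
// with itself (0xAB becomes 0xABAB, i.e. c * 257), the color channels are multiplied
// by the 16-bit alpha with _mm256_mulhi_epu16 plus a rounding correction (a close
// approximation of dividing by 65535), and the blend on 0x88 preserves the alpha lanes.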
1211template<bool RGBA>
1212static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizetype count)
1213{
1214 qsizetype i = 0;
1215 const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
1216 const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
1217 const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15));
1218 const __m256i zero = _mm256_setzero_si256();
1219
1220 for (; i < count - 7; i += 8) {
1221 __m256i dst1, dst2;
1222 __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
1223 if (!_mm256_testz_si256(srcVector, alphaMask)) {
1224 // keep the two _mm_test[zc]_siXXX next to each other
1225 bool cf = _mm256_testc_si256(srcVector, alphaMask);
1226 if (!RGBA)
1227 srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1228
1229 // The two unpack instructions unpack the low and upper halves of
1230 // each 128-bit half of the 256-bit register. Here's the tracking
1231 // of what's where: (p is 32-bit, P is 64-bit)
1232 // as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
1233 // after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ]
1234 // after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
1235 srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
1236
1237 const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
1238 const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
1239 if (!cf) {
1240 const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1241 const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1242 dst1 = _mm256_mulhi_epu16(src1, alpha1);
1243 dst2 = _mm256_mulhi_epu16(src2, alpha2);
1244 dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15));
1245 dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15));
1246 dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
1247 dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
1248 } else {
1249 dst1 = src1;
1250 dst2 = src2;
1251 }
1252 } else {
1253 dst1 = dst2 = zero;
1254 }
1255 _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), dst1);
1256 _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, dst2);
1257 }
1258
1259 if (i < count) {
1260 __m256i epilogueMask = epilogueMaskFromCount(count - i);
1261 const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
1262 __m256i dst1, dst2;
1263 __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
1264
1265 if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
1266 // keep the two _mm_test[zc]_siXXX next to each other
1267 bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
1268 if (!RGBA)
1269 srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
1270 srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
1271 const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
1272 const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
1273 if (!cf) {
1274 const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
1275 const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
1276 dst1 = _mm256_mulhi_epu16(src1, alpha1);
1277 dst2 = _mm256_mulhi_epu16(src2, alpha2);
1278 dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15));
1279 dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15));
1280 dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
1281 dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
1282 } else {
1283 dst1 = src1;
1284 dst2 = src2;
1285 }
1286 } else {
1287 dst1 = dst2 = zero;
1288 }
1289 epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(3, 1, 2, 0));
1290 _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i),
1291 _mm256_unpacklo_epi32(epilogueMask, epilogueMask),
1292 dst1);
1293 _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i + 4),
1294 _mm256_unpackhi_epi32(epilogueMask, epilogueMask),
1295 dst2);
1296 }
1297}
1298
1299const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,
1300 const QList<QRgb> *, QDitherInfo *)
1301{
1302 convertARGBToRGBA64PM_avx2<false>(buffer, src, count);
1303 return buffer;
1304}
1305
1306const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,
1307 const QList<QRgb> *, QDitherInfo *)
1308{
1309 convertARGBToRGBA64PM_avx2<true>(buffer, src, count);
1310 return buffer;
1311}
1312
1313const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
1314 const QList<QRgb> *, QDitherInfo *)
1315{
1316 convertARGBToRGBA64PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1317 return buffer;
1318}
1319
1320const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
1321 const QList<QRgb> *, QDitherInfo *)
1322{
1323 convertARGBToRGBA64PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1324 return buffer;
1325}
1326
1327const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
1328 const QList<QRgb> *, QDitherInfo *)
1329{
1330 const QRgba64 *s = reinterpret_cast<const QRgba64 *>(src) + index;
1331 int i = 0;
1332 const __m256i vh = _mm256_set1_epi32(0x8000);
1333 for (; i < count - 3; i += 4) {
1334 __m256i vs256 = _mm256_loadu_si256((const __m256i *)(s + i));
1335 __m256i va256 = _mm256_shufflelo_epi16(vs256, _MM_SHUFFLE(3, 3, 3, 3));
1336 va256 = _mm256_shufflehi_epi16(va256, _MM_SHUFFLE(3, 3, 3, 3));
1337 const __m256i vmullo = _mm256_mullo_epi16(vs256, va256);
1338 const __m256i vmulhi = _mm256_mulhi_epu16(vs256, va256);
1339 __m256i vslo = _mm256_unpacklo_epi16(vmullo, vmulhi);
1340 __m256i vshi = _mm256_unpackhi_epi16(vmullo, vmulhi);
1341 vslo = _mm256_add_epi32(vslo, _mm256_srli_epi32(vslo, 16));
1342 vshi = _mm256_add_epi32(vshi, _mm256_srli_epi32(vshi, 16));
1343 vslo = _mm256_add_epi32(vslo, vh);
1344 vshi = _mm256_add_epi32(vshi, vh);
1345 vslo = _mm256_srli_epi32(vslo, 16);
1346 vshi = _mm256_srli_epi32(vshi, 16);
1347 vs256 = _mm256_packus_epi32(vslo, vshi);
1348 vs256 = _mm256_blend_epi16(vs256, va256, 0x88);
1349 _mm256_storeu_si256((__m256i *)(buffer + i), vs256);
1350 }
1351 for (; i < count; ++i) {
1352 const auto a = s[i].alpha();
1353 __m128i vs = _mm_loadl_epi64((const __m128i *)(s + i));
1354 __m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3));
1355 vs = multiplyAlpha65535(vs, va);
1356 _mm_storel_epi64((__m128i *)(buffer + i), vs);
1357 buffer[i].setAlpha(a);
1358 }
1359 return buffer;
1360}
1361
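// Converts RGBx half-float pixels to 32-bit ARGB: the halfs are widened with
// cvtph_ps, scaled by 255 and rounded by adding 0.5 before the truncating
// cvttps_epi32, then shuffled from RGBA word order into ARGB and packed to bytes.
// Two pixels are processed per iteration; the tail uses the 128-bit equivalents.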
1362const uint *QT_FASTCALL fetchRGB16FToRGB32_avx2(uint *buffer, const uchar *src, int index, int count,
1363 const QList<QRgb> *, QDitherInfo *)
1364{
1365 const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
1366 const __m256 vf = _mm256_set1_ps(255.0f);
1367 const __m256 vh = _mm256_set1_ps(0.5f);
1368 int i = 0;
1369 for (; i + 1 < count; i += 2) {
1370 __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
1371 vsf = _mm256_mul_ps(vsf, vf);
1372 vsf = _mm256_add_ps(vsf, vh);
1373 __m256i vsi = _mm256_cvttps_epi32(vsf);
1374 vsi = _mm256_packs_epi32(vsi, vsi);
1375 vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1376 vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
1377 __m128i vsi128 = _mm256_castsi256_si128(vsi);
1378 vsi128 = _mm_packus_epi16(vsi128, vsi128);
1379 _mm_storel_epi64((__m128i *)(buffer + i), vsi128);
1380 }
1381 if (i < count) {
1382 __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
1383 vsf = _mm_mul_ps(vsf, _mm_set1_ps(255.0f));
1384 vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
1385 __m128i vsi = _mm_cvttps_epi32(vsf);
1386 vsi = _mm_packs_epi32(vsi, vsi);
1387 vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1388 vsi = _mm_packus_epi16(vsi, vsi);
1389 buffer[i] = _mm_cvtsi128_si32(vsi);
1390 }
1391 return buffer;
1392}
1393
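// Same conversion as above, but for unpremultiplied RGBA half floats: the alpha of
// each pixel is splatted with a permute, RGB is multiplied by it in float, and the
// blend (mask 0x88) restores the unmultiplied alpha before scaling and packing.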
1394const uint *QT_FASTCALL fetchRGBA16FToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count,
1395 const QList<QRgb> *, QDitherInfo *)
1396{
1397 const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
1398 const __m256 vf = _mm256_set1_ps(255.0f);
1399 const __m256 vh = _mm256_set1_ps(0.5f);
1400 int i = 0;
1401 for (; i + 1 < count; i += 2) {
1402 __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
1403 __m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1404 vsf = _mm256_mul_ps(vsf, vsa);
1405 vsf = _mm256_blend_ps(vsf, vsa, 0x88);
1406 vsf = _mm256_mul_ps(vsf, vf);
1407 vsf = _mm256_add_ps(vsf, vh);
1408 __m256i vsi = _mm256_cvttps_epi32(vsf);
1409 vsi = _mm256_packus_epi32(vsi, vsi);
1410 vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1411 vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
1412 __m128i vsi128 = _mm256_castsi256_si128(vsi);
1413 vsi128 = _mm_packus_epi16(vsi128, vsi128);
1414 _mm_storel_epi64((__m128i *)(buffer + i), vsi128);
1415 }
1416 if (i < count) {
1417 __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
1418 __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1419 vsf = _mm_mul_ps(vsf, vsa);
1420 vsf = _mm_insert_ps(vsf, vsa, 0x30);
1421 vsf = _mm_mul_ps(vsf, _mm_set1_ps(255.0f));
1422 vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
1423 __m128i vsi = _mm_cvttps_epi32(vsf);
1424 vsi = _mm_packus_epi32(vsi, vsi);
1425 vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1426 vsi = _mm_packus_epi16(vsi, vsi);
1427 buffer[i] = _mm_cvtsi128_si32(vsi);
1428 }
1429 return buffer;
1430}
1431
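// The source here is already premultiplied half-float (RGBA16FPM), so every
// component is just scaled by 65535, rounded and packed down to 16 bits per channel.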
1432const QRgba64 *QT_FASTCALL fetchRGBA16FPMToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
1433 const QList<QRgb> *, QDitherInfo *)
1434{
1435 const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
1436 const __m256 vf = _mm256_set1_ps(65535.0f);
1437 const __m256 vh = _mm256_set1_ps(0.5f);
1438 int i = 0;
1439 for (; i + 1 < count; i += 2) {
1440 __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
1441 vsf = _mm256_mul_ps(vsf, vf);
1442 vsf = _mm256_add_ps(vsf, vh);
1443 __m256i vsi = _mm256_cvttps_epi32(vsf);
1444 vsi = _mm256_packus_epi32(vsi, vsi);
1445 vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
1446 _mm_storeu_si128((__m128i *)(buffer + i), _mm256_castsi256_si128(vsi));
1447 }
1448 if (i < count) {
1449 __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
1450 vsf = _mm_mul_ps(vsf, _mm_set1_ps(65535.0f));
1451 vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
1452 __m128i vsi = _mm_cvttps_epi32(vsf);
1453 vsi = _mm_packus_epi32(vsi, vsi);
1454 _mm_storel_epi64((__m128i *)(buffer + i), vsi);
1455 }
1456 return buffer;
1457}
1458
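// Unpremultiplied RGBA half floats: alpha is splatted with shuffle_ps, RGB is
// multiplied by it, and the blend keeps the original alpha before the same
// 65535 scaling and packing as the premultiplied variant above.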
1459const QRgba64 *QT_FASTCALL fetchRGBA16FToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
1460 const QList<QRgb> *, QDitherInfo *)
1461{
1462 const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
1463 const __m256 vf = _mm256_set1_ps(65535.0f);
1464 const __m256 vh = _mm256_set1_ps(0.5f);
1465 int i = 0;
1466 for (; i + 1 < count; i += 2) {
1467 __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
1468 __m256 vsa = _mm256_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
1469 vsf = _mm256_mul_ps(vsf, vsa);
1470 vsf = _mm256_blend_ps(vsf, vsa, 0x88);
1471 vsf = _mm256_mul_ps(vsf, vf);
1472 vsf = _mm256_add_ps(vsf, vh);
1473 __m256i vsi = _mm256_cvttps_epi32(vsf);
1474 vsi = _mm256_packus_epi32(vsi, vsi);
1475 vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
1476 _mm_storeu_si128((__m128i *)(buffer + i), _mm256_castsi256_si128(vsi));
1477 }
1478 if (i < count) {
1479 __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
1480 __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
1481 vsf = _mm_mul_ps(vsf, vsa);
1482 vsf = _mm_insert_ps(vsf, vsa, 0x30);
1483 vsf = _mm_mul_ps(vsf, _mm_set1_ps(65535.0f));
1484 vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
1485 __m128i vsi = _mm_cvttps_epi32(vsf);
1486 vsi = _mm_packus_epi32(vsi, vsi);
1487 _mm_storel_epi64((__m128i *)(buffer + i), vsi);
1488 }
1489 return buffer;
1490}
1491
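// Stores ARGB32 pixels as RGBx half floats: the bytes are widened to 32-bit
// integers, reordered from BGRA to RGBA component order, scaled by 1/255 and
// converted to FP16 with cvtps_ph (rounding control 0 = round to nearest).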
1492void QT_FASTCALL storeRGB16FFromRGB32_avx2(uchar *dest, const uint *src, int index, int count,
1493 const QList<QRgb> *, QDitherInfo *)
1494{
1495 quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
1496 const __m256 vf = _mm256_set1_ps(1.0f / 255.0f);
1497 int i = 0;
1498 for (; i + 1 < count; i += 2) {
1499 __m256i vsi = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(src + i)));
1500 vsi = _mm256_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1501 __m256 vsf = _mm256_cvtepi32_ps(vsi);
1502 vsf = _mm256_mul_ps(vsf, vf);
1503 _mm_storeu_si128((__m128i *)(d + i), _mm256_cvtps_ph(vsf, 0));
1504 }
1505 if (i < count) {
1506 __m128i vsi = _mm_cvtsi32_si128(src[i]);
1507 vsi = _mm_cvtepu8_epi32(vsi);
1508 vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1509 __m128 vsf = _mm_cvtepi32_ps(vsi);
1510 vsf = _mm_mul_ps(vsf, _mm_set1_ps(1.0f / 255.0f));
1511 _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
1512 }
1513}
1514
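// Unpremultiplies ARGB32PM pixels and stores them as RGBA half floats. Opaque
// pixels only need the 1/255 scale and fully transparent pixels store zeros;
// otherwise 1/alpha is approximated with _mm_rcp_ps and refined with one
// Newton-Raphson step (r' = 2*r - r*r*a) before multiplying, while the alpha
// lane is scaled by 1/255 so that the stored alpha stays normalized to [0, 1].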
1515void QT_FASTCALL storeRGBA16FFromARGB32PM_avx2(uchar *dest, const uint *src, int index, int count,
1516 const QList<QRgb> *, QDitherInfo *)
1517{
1518 quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
1519 const __m128 vf = _mm_set1_ps(1.0f / 255.0f);
1520 for (int i = 0; i < count; ++i) {
1521 const uint s = src[i];
1522 __m128i vsi = _mm_cvtsi32_si128(s);
1523 vsi = _mm_cvtepu8_epi32(vsi);
1524 vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
1525 __m128 vsf = _mm_cvtepi32_ps(vsi);
1526 const uint8_t a = (s >> 24);
1527 if (a == 255)
1528 vsf = _mm_mul_ps(vsf, vf);
1529 else if (a == 0)
1530 vsf = _mm_set1_ps(0.0f);
1531 else {
1532 const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1533 __m128 vsr = _mm_rcp_ps(vsa);
1534 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
1535 vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f / 255.0f), 0x30); // keep alpha normalized to [0, 1]
1536 vsf = _mm_mul_ps(vsf, vsr);
1537 }
1538 _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
1539 }
1540}
1541
1542#if QT_CONFIG(raster_fp)
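// Converts RGBA half floats to premultiplied RGBA32F: two pixels at a time are
// widened with _mm256_cvtph_ps, RGB is multiplied by the splatted alpha and the
// blend (mask 0x88) keeps the original alpha in the fourth component.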
1543const QRgbaFloat32 *QT_FASTCALL fetchRGBA16FToRGBA32F_avx2(QRgbaFloat32 *buffer, const uchar *src, int index, int count,
1544 const QList<QRgb> *, QDitherInfo *)
1545{
1546 const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
1547 int i = 0;
1548 for (; i + 1 < count; i += 2) {
1549 __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
1550 __m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1551 vsf = _mm256_mul_ps(vsf, vsa);
1552 vsf = _mm256_blend_ps(vsf, vsa, 0x88);
1553 _mm256_storeu_ps((float *)(buffer + i), vsf);
1554 }
1555 if (i < count) {
1556 __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
1557 __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1558 vsf = _mm_mul_ps(vsf, vsa);
1559 vsf = _mm_insert_ps(vsf, vsa, 0x30);
1560 _mm_storeu_ps((float *)(buffer + i), vsf);
1561 }
1562 return buffer;
1563}
1564
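// Stores RGBA32F as RGBx half floats, discarding alpha: a fully transparent pixel
// becomes opaque black (note the 'zero' constant is really (0, 0, 0, 1)), otherwise
// the color is unpremultiplied with a Newton-Raphson refined reciprocal and the
// alpha lane is forced to 1.0f before the conversion to FP16.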
1565void QT_FASTCALL storeRGBX16FFromRGBA32F_avx2(uchar *dest, const QRgbaFloat32 *src, int index, int count,
1566 const QList<QRgb> *, QDitherInfo *)
1567{
1568 quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
1569 const __m128 *s = reinterpret_cast<const __m128 *>(src);
1570 const __m128 zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
1571 for (int i = 0; i < count; ++i) {
1572 __m128 vsf = _mm_loadu_ps(reinterpret_cast<const float *>(s + i));
1573 const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1574 const float a = _mm_cvtss_f32(vsa);
1575 if (a == 1.0f)
1576 { }
1577 else if (a == 0.0f)
1578 vsf = zero;
1579 else {
1580 __m128 vsr = _mm_rcp_ps(vsa);
1581 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
1582 vsf = _mm_mul_ps(vsf, vsr);
1583 vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);
1584 }
1585 _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
1586 }
1587}
1588
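// Same unpremultiply as the RGBX variant, but alpha is preserved: the alpha lane
// of the refined reciprocal is set to 1.0f so the multiply leaves the pixel's own
// alpha untouched, and a fully transparent pixel stores all zeros.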
1589void QT_FASTCALL storeRGBA16FFromRGBA32F_avx2(uchar *dest, const QRgbaFloat32 *src, int index, int count,
1590 const QList<QRgb> *, QDitherInfo *)
1591{
1592 quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
1593 const __m128 *s = reinterpret_cast<const __m128 *>(src);
1594 const __m128 zero = _mm_set1_ps(0.0f);
1595 for (int i = 0; i < count; ++i) {
1596 __m128 vsf = _mm_loadu_ps(reinterpret_cast<const float *>(s + i));
1597 const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
1598 const float a = _mm_cvtss_f32(vsa);
1599 if (a == 1.0f)
1600 { }
1601 else if (a == 0.0f)
1602 vsf = zero;
1603 else {
1604 __m128 vsr = _mm_rcp_ps(vsa);
1605 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
1606 vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
1607 vsf = _mm_mul_ps(vsf, vsr);
1608 }
1609 _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
1610 }
1611}
1612 #endif // QT_CONFIG(raster_fp)
1613
1614 QT_END_NAMESPACE
1615
1616 #endif // QT_COMPILER_SUPPORTS_AVX2