d7/db1/qdrawingprimitive__sse2__p_8h_source.html

// Copyright (C) 2016 The Qt Company Ltd.

// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only


#ifndef QDRAWINGPRIMITIVE_SSE2_P_H

#define QDRAWINGPRIMITIVE_SSE2_P_H


#include <QtGui/private/qtguiglobal_p.h>

#include <private/qsimd_p.h>

#include "qdrawhelper_x86_p.h"

#include "qrgba64_p.h"


#ifdef __SSE2__


//

//  W A R N I N G

//  -------------

//

// This file is not part of the Qt API.  It exists purely as an

// implementation detail.  This header file may change from version to

// version without notice, or even be removed.

//

// We mean it.

//


QT_BEGIN_NAMESPACE


/*

 * Multiply the components of pixelVector by alphaChannel

 * Each 32bits components of alphaChannel must be in the form 0x00AA00AA

 * colorMask must have 0x00ff00ff on each 32 bits component

 * half must have the value 128 (0x80) for each 32 bits component

 */

#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \

{ \

    /* 1. separate the colors in 2 vectors so each color is on 16 bits \

       (in order to be multiplied by the alpha \

       each 32 bit of dstVectorAG are in the form 0x00AA00GG \

       each 32 bit of dstVectorRB are in the form 0x00RR00BB */\

    __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \

    __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \

 \

    /* 2. multiply the vectors by the alpha channel */\

    pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \

    pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \

 \

    /* 3. divide by 255, that's the tricky part. \

       we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \

\

    pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \

    pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \

    pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \

    pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \

 \

\

    pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \


\

    pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \

 \

    /* 4. combine the 2 pairs of colors */ \

    result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \

}


/*

 * Each 32bits components of alphaChannel must be in the form 0x00AA00AA

 * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component

 * colorMask must have 0x00ff00ff on each 32 bits component

 * half must have the value 128 (0x80) for each 32 bits component

 */

#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \

    /* interpolate AG */\

    __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \

    __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \

    __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \

    __m128i dstVectorAGoneMinusAlphalpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \

    __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); \

    finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \

    finalAG = _mm_add_epi16(finalAG, half); \

    finalAG = _mm_andnot_si128(colorMask, finalAG); \

 \

    /* interpolate RB */\

    __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \

    __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \

    __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \

    __m128i dstVectorRBoneMinusAlphalpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \

    __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); \

    finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \

    finalRB = _mm_add_epi16(finalRB, half); \

    finalRB = _mm_srli_epi16(finalRB, 8); \

 \

    /* combine */\

    result = _mm_or_si128(finalAG, finalRB); \

}


// same as BLEND_SOURCE_OVER_ARGB32_SSE2, but for one vector srcVector

#define BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) { \

        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \

        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \

            /* all opaque */ \

            _mm_store_si128((__m128i *)&dst[x], srcVector); \

        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \

            /* not fully transparent */ \

            /* extract the alpha channel on 2 x 16 bits */ \

            /* so we have room for the multiplication */ \

            /* each 32 bits will be in the form 0x00AA00AA */ \

            /* with A being the 1 - alpha */ \

            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \

            alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \

            alphaChannel = _mm_sub_epi16(one, alphaChannel); \

 \

            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \

            __m128i destMultipliedByOneMinusAlpha; \

            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \

 \

            /* result = s + d * (1-alpha) */\

            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \

            _mm_store_si128((__m128i *)&dst[x], result); \

        } \

    }


// Basically blend src over dst with the const alpha defined as constAlphaVector.

// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:

//const __m128i nullVector = _mm_set1_epi32(0);

//const __m128i half = _mm_set1_epi16(0x80);

//const __m128i one = _mm_set1_epi16(0xff);

//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

//const __m128i alphaMask = _mm_set1_epi32(0xff000000);

//

// The computation being done is:

// result = s + d * (1-alpha)

// with shortcuts if fully opaque or fully transparent.

#define BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \

    int x = 0; \

\

    /* First, get dst aligned. */ \

    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \

        blend_pixel(dst[x], src[x]); \

    } \

\

    for (; x < length-3; x += 4) { \

        const __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]); \

        BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) \

    } \

    SIMD_EPILOGUE(x, length, 3) { \

        blend_pixel(dst[x], src[x]); \

    } \

}


// Basically blend src over dst with the const alpha defined as constAlphaVector.

// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:

//const __m128i nullVector = _mm_set1_epi32(0);

//const __m128i half = _mm_set1_epi16(0x80);

//const __m128i one = _mm_set1_epi16(0xff);

//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

//

// The computation being done is:

// dest = (s + d * sia) * ca + d * cia

//      = s * ca + d * (sia * ca + cia)

//      = s * ca + d * (1 - sa*ca)

#define BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector) \

{ \

    int x = 0; \

\

    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \

        blend_pixel(dst[x], src[x], const_alpha); \

    } \

\

    for (; x < length-3; x += 4) { \

        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]); \

        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { \

            BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half); \

\

            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \

            alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \

            alphaChannel = _mm_sub_epi16(one, alphaChannel); \

 \

            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \

            __m128i destMultipliedByOneMinusAlpha; \

            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \

 \

            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \

            _mm_store_si128((__m128i *)&dst[x], result); \

        } \

    } \

    SIMD_EPILOGUE(x, length, 3) { \

        blend_pixel(dst[x], src[x], const_alpha); \

    } \

}


QT_END_NAMESPACE


#endif // __SSE2__


QT_BEGIN_NAMESPACE

#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)

QT_FUNCTION_TARGET(SSE2)

static inline void Q_DECL_VECTORCALL reciprocal_mul_ss(__m128 &ia, const __m128 a, float mul)

{

    ia = _mm_rcp_ss(a); // Approximate 1/a

    // Improve precision of ia using Newton-Raphson

    ia = _mm_sub_ss(_mm_add_ss(ia, ia), _mm_mul_ss(ia, _mm_mul_ss(ia, a)));

    ia = _mm_mul_ss(ia, _mm_set_ss(mul));

    ia = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0,0,0,0));

}


QT_FUNCTION_TARGET(SSE4_1)

static inline QRgb qUnpremultiply_sse4(QRgb p)

{

    const uint alpha = qAlpha(p);

    if (alpha == 255)

        return p;

    if (alpha == 0)

        return 0;

    const __m128 va = _mm_set1_ps(alpha);

    __m128 via;

    reciprocal_mul_ss(via, va, 255.0f); // Approximate 1/a

    __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));

    vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via));

    vl = _mm_packus_epi32(vl, vl);

    vl = _mm_insert_epi16(vl, alpha, 3);

    vl = _mm_packus_epi16(vl, vl);

    return _mm_cvtsi128_si32(vl);

}


template<enum QtPixelOrder PixelOrder>

QT_FUNCTION_TARGET(SSE4_1)

static inline uint qConvertArgb32ToA2rgb30_sse4(QRgb p)

{

    const uint alpha = qAlpha(p);

    if (alpha == 255)

        return qConvertRgb32ToRgb30<PixelOrder>(p);

    if (alpha == 0)

        return 0;

    constexpr float mult = 1023.0f / (255 >> 6);

    const uint newalpha = (alpha >> 6);

    const __m128 va = _mm_set1_ps(alpha);

    __m128 via;

    reciprocal_mul_ss(via, va, mult * newalpha);

    __m128i vl = _mm_cvtsi32_si128(p);

    vl = _mm_cvtepu8_epi32(vl);

    vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via));

    vl = _mm_packus_epi32(vl, vl);

    uint rgb30 = (newalpha << 30);

    rgb30 |= ((uint)_mm_extract_epi16(vl, 1)) << 10;

    if (PixelOrder == PixelOrderRGB) {

        rgb30 |= ((uint)_mm_extract_epi16(vl, 2)) << 20;

        rgb30 |= ((uint)_mm_extract_epi16(vl, 0));

    } else {

        rgb30 |= ((uint)_mm_extract_epi16(vl, 0)) << 20;

        rgb30 |= ((uint)_mm_extract_epi16(vl, 2));

    }

    return rgb30;

}


template<enum QtPixelOrder PixelOrder>

QT_FUNCTION_TARGET(SSE4_1)

static inline uint qConvertRgba64ToRgb32_sse4(QRgba64 p)

{

    if (p.isTransparent())

        return 0;

    __m128i vl = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&p));

    if (!p.isOpaque()) {

        const __m128 va = _mm_set1_ps(p.alpha());

        __m128 via;

        reciprocal_mul_ss(via, va, 65535.0f);

        vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128());

        vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl) , via));

        vl = _mm_packus_epi32(vl, vl);

        vl = _mm_insert_epi16(vl, p.alpha(), 3);

    }

    if (PixelOrder == PixelOrderBGR)

        vl = _mm_shufflelo_epi16(vl, _MM_SHUFFLE(3, 0, 1, 2));

    return toArgb32(vl);

}

#endif

QT_END_NAMESPACE


#endif // QDRAWINGPRIMITIVE_SSE2_P_H

QRgba64
Definition qrgba64.h:12

QT_BEGIN_NAMESPACE
Combined button and popup list for selecting options.
Definition qstandardpaths_haiku.cpp:21

QT_END_NAMESPACE
Definition qsharedpointer.cpp:1590

Q_DECL_VECTORCALL
#define Q_DECL_VECTORCALL
Definition qdrawhelper_p.h:48

qdrawhelper_x86_p.h

a
GLboolean GLboolean GLboolean GLboolean a
[7]
Definition qopengles2ext.h:337

p
GLfloat GLfloat p
[1]
Definition qopenglext.h:12698

alpha
GLfloat GLfloat GLfloat alpha
Definition qopenglext.h:418

PixelOrderRGB
@ PixelOrderRGB
Definition qpixellayout_p.h:27

PixelOrderBGR
@ PixelOrderBGR
Definition qpixellayout_p.h:28

QRgb
QT_BEGIN_NAMESPACE typedef unsigned int QRgb
Definition qrgb.h:13

qAlpha
constexpr int qAlpha(QRgb rgb)
Definition qrgb.h:27

qrgba64_p.h

toArgb32
static uint toArgb32(QRgba64 rgba64)
Definition qrgba64_p.h:219

QT_FUNCTION_TARGET
#define QT_FUNCTION_TARGET(x)
Definition qsimd_p.h:133

uint
unsigned int uint
Definition qtypes.h:34