Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/.
Loading...
Searching...
No Matches
qstringconverter.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include <qstringconverter.h>
6#include <private/qstringconverter_p.h>
7#include "qendian.h"
8
9#include "private/qsimd_p.h"
10#include "private/qstringiterator_p.h"
11#include "private/qtools_p.h"
12#include "qbytearraymatcher.h"
14#include <QtCore/qbytearraylist.h>
15
16#if QT_CONFIG(icu)
17#include <unicode/ucnv.h>
18#include <unicode/ucnv_cb.h>
19#include <unicode/ucnv_err.h>
20#include <unicode/ustring.h>
21#endif
22
23#ifdef Q_OS_WIN
24#include <qt_windows.h>
25#ifndef QT_BOOTSTRAPPED
26#include <QtCore/qvarlengtharray.h>
27#include <QtCore/q20iterator.h>
28#include <QtCore/private/qnumeric_p.h>
29#endif // !QT_BOOTSTRAPPED
30#endif
31
32#include <array>
33
34#if __has_include(<bit>) && __cplusplus > 201703L
35#include <bit>
36#endif
37
39
40using namespace QtMiscUtils;
41
// Compile-time checks that both converter classes can be move-constructed and
// move-assigned without throwing; the build fails if that ever stops holding.
42static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
43static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
44static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
45static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
46
// Indices into state->state_data[] as used further down in this file:
// [Endian] stores the byte order carried between calls, [Data] stores a
// buffered partial code unit (a lone byte or a pending surrogate).
47enum { Endian = 0, Data = 1 };
48
// The three bytes that make up the UTF-8 byte order mark; written when a BOM
// is requested and compared against the start of input when decoding.
49static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
50
51#if defined(__SSE2__) || defined(__ARM_NEON__)
52static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
53{
54#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
55 return std::bit_width(v) - 1;
56#else
58 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
59 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
60 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
61 result ^= sizeof(unsigned) * 8 - 1;
62 return result;
63#endif
64}
65#endif
66
67#if defined(__SSE2__)
68static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
69{
70 // do sixteen characters at a time
71 for ( ; end - src >= 16; src += 16, dst += 16) {
72# ifdef __AVX2__
73 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
74 __m128i data1 = _mm256_castsi256_si128(data);
75 __m128i data2 = _mm256_extracti128_si256(data, 1);
76# else
77 __m128i data1 = _mm_loadu_si128((const __m128i*)src);
78 __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
79# endif
80
81 // check if everything is ASCII
82 // the highest ASCII value is U+007F
83 // Do the packing directly:
84 // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
85 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
86 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
87 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
88 // "non-ASCII", but it's an acceptable compromise.
89 __m128i packed = _mm_packus_epi16(data1, data2);
90 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
91
92 // store, even if there are non-ASCII characters here
93 _mm_storeu_si128((__m128i*)dst, packed);
94
95 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
96 ushort n = ~_mm_movemask_epi8(nonAscii);
97 if (n) {
98 // find the next probable ASCII character
99 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
100 // characters still coming
101 nextAscii = src + qBitScanReverse(n) + 1;
102
104 dst += n;
105 src += n;
106 return false;
107 }
108 }
109
110 if (end - src >= 8) {
111 // do eight characters at a time
112 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
113 __m128i packed = _mm_packus_epi16(data, data);
114 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
115
116 // store even non-ASCII
117 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
118
119 uchar n = ~_mm_movemask_epi8(nonAscii);
120 if (n) {
121 nextAscii = src + qBitScanReverse(n) + 1;
123 dst += n;
124 src += n;
125 return false;
126 }
127 }
128
129 return src == end;
130}
131
132static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
133{
134 // do sixteen characters at a time
135 for ( ; end - src >= 16; src += 16, dst += 16) {
136 __m128i data = _mm_loadu_si128((const __m128i*)src);
137
138#ifdef __AVX2__
139 const int BitSpacing = 2;
140 // load and zero extend to an YMM register
141 const __m256i extended = _mm256_cvtepu8_epi16(data);
142
143 uint n = _mm256_movemask_epi8(extended);
144 if (!n) {
145 // store
146 _mm256_storeu_si256((__m256i*)dst, extended);
147 continue;
148 }
149#else
150 const int BitSpacing = 1;
151
152 // check if everything is ASCII
153 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
154 uint n = _mm_movemask_epi8(data);
155 if (!n) {
156 // unpack
157 _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
158 _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
159 continue;
160 }
161#endif
162
163 // copy the front part that is still ASCII
164 while (!(n & 1)) {
165 *dst++ = *src++;
166 n >>= BitSpacing;
167 }
168
169 // find the next probable ASCII character
170 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
171 // characters still coming
172 n = qBitScanReverse(n);
173 nextAscii = src + (n / BitSpacing) + 1;
174 return false;
175
176 }
177
178 if (end - src >= 8) {
179 __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
180 uint n = _mm_movemask_epi8(data) & 0xff;
181 if (!n) {
182 // unpack and store
183 _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
184 } else {
185 while (!(n & 1)) {
186 *dst++ = *src++;
187 n >>= 1;
188 }
189
190 n = qBitScanReverse(n);
191 nextAscii = src + n + 1;
192 return false;
193 }
194 }
195
196 return src == end;
197}
198
199static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
200{
201#ifdef __AVX2__
202 // do 32 characters at a time
203 // (this is similar to simdTestMask in qstring.cpp)
204 const __m256i mask = _mm256_set1_epi8(char(0x80));
205 for ( ; end - src >= 32; src += 32) {
206 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
207 if (_mm256_testz_si256(mask, data))
208 continue;
209
210 uint n = _mm256_movemask_epi8(data);
211 Q_ASSERT(n);
212
213 // find the next probable ASCII character
214 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
215 // characters still coming
216 nextAscii = src + qBitScanReverse(n) + 1;
217
218 // return the non-ASCII character
219 return src + qCountTrailingZeroBits(n);
220 }
221#endif
222
223 // do sixteen characters at a time
224 for ( ; end - src >= 16; src += 16) {
225 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
226
227 // check if everything is ASCII
228 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
229 uint n = _mm_movemask_epi8(data);
230 if (!n)
231 continue;
232
233 // find the next probable ASCII character
234 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
235 // characters still coming
236 nextAscii = src + qBitScanReverse(n) + 1;
237
238 // return the non-ASCII character
239 return src + qCountTrailingZeroBits(n);
240 }
241
242 // do four characters at a time
243 for ( ; end - src >= 4; src += 4) {
244 quint32 data = qFromUnaligned<quint32>(src);
245 data &= 0x80808080U;
246 if (!data)
247 continue;
248
249 // We don't try to guess which of the three bytes is ASCII and which
250 // one isn't. The chance that at least two of them are non-ASCII is
251 // better than 75%.
252 nextAscii = src;
253 return src;
254 }
255 nextAscii = end;
256 return src;
257}
258
259// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
260// and advance src8 and src16 to the first character that could not be compared
261static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
262{
263 int bitSpacing = 1;
264 qptrdiff len = qMin(end8 - src8, end16 - src16);
265 qptrdiff offset = 0;
266 uint mask = 0;
267
268 // do sixteen characters at a time
269 for ( ; offset + 16 < len; offset += 16) {
270 __m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
271#ifdef __AVX2__
272 // AVX2 version, use 256-bit registers and VPMOVXZBW
273 __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
274
275 // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
276 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
277 mask = _mm256_movemask_epi8(datax8);
278 if (mask)
279 break;
280
281 // compare Latin1 to UTF-16
282 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
283 mask = ~_mm256_movemask_epi8(latin1cmp);
284 if (mask)
285 break;
286#else
287 // non-AVX2 code
288 __m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
289 __m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);
290
291 // expand US-ASCII as if it were Latin1, we'll confirm later
292 __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
293 __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
294
295 // compare Latin1 to UTF-16
296 __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
297 __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
298 mask = _mm_movemask_epi8(latin1cmphi) << 16;
299 mask |= ushort(_mm_movemask_epi8(latin1cmplo));
300 mask = ~mask;
301 if (mask)
302 break;
303
304 // confirm it was US-ASCII
305 mask = _mm_movemask_epi8(data8);
306 if (mask) {
307 bitSpacing = 0;
308 break;
309 }
310#endif
311 }
312
313 // helper for comparing 4 or 8 characters
314 auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
315 // n = 4 -> sizemask = 0xff
316 // n = 8 -> sizemask = 0xffff
317 unsigned sizemask = (1U << (2 * n)) - 1;
318
319 // expand as if Latin1
320 data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
321
322 // compare and confirm it's US-ASCII
323 __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
324 mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
325 mask |= _mm_movemask_epi8(data8);
326 if (mask == 0)
327 offset += n;
328 };
329
330 // do eight characters at a time
331 if (mask == 0 && offset + 8 < len) {
332 __m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
333 __m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
334 cmp_lt_16(8, data8, data16);
335 }
336
337 // do four characters
338 if (mask == 0 && offset + 4 < len) {
339 __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
340 __m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
341 cmp_lt_16(4, data8, data16);
342 }
343
344 // correct the source pointers to point to the first character we couldn't deal with
345 if (mask)
346 offset += qCountTrailingZeroBits(mask) >> bitSpacing;
347 src8 += offset;
348 src16 += offset;
349}
350#elif defined(__ARM_NEON__)
351static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
352{
353 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
354 uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
355 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
356
357 // do sixteen characters at a time
358 for ( ; end - src >= 16; src += 16, dst += 16) {
359 // load 2 lanes (or: "load interleaved")
360 uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
361
362 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
363 // add those together into a scalar, and merge the scalars.
364 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
365 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
366
367 // merge the two lanes by shifting the values of the second by 8 and inserting them
368 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
369
370 // store, even if there are non-ASCII characters here
371 vst1q_u8(dst, vreinterpretq_u8_u16(out));
372
373 if (nonAscii) {
374 // find the next probable ASCII character
375 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
376 // characters still coming
377 nextAscii = src + qBitScanReverse(nonAscii) + 1;
378
379 nonAscii = qCountTrailingZeroBits(nonAscii);
380 dst += nonAscii;
381 src += nonAscii;
382 return false;
383 }
384 }
385 return src == end;
386}
387
388static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
389{
390 // do eight characters at a time
391 uint8x8_t msb_mask = vdup_n_u8(0x80);
392 uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
393 for ( ; end - src >= 8; src += 8, dst += 8) {
394 uint8x8_t c = vld1_u8(src);
395 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
396 if (!n) {
397 // store
398 vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
399 continue;
400 }
401
402 // copy the front part that is still ASCII
403 while (!(n & 1)) {
404 *dst++ = *src++;
405 n >>= 1;
406 }
407
408 // find the next probable ASCII character
409 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
410 // characters still coming
411 n = qBitScanReverse(n);
412 nextAscii = src + n + 1;
413 return false;
414
415 }
416 return src == end;
417}
418
419static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
420{
421 // The SIMD code below is untested, so just force an early return until
422 // we've had the time to verify it works.
423 nextAscii = end;
424 return src;
425
426 // do eight characters at a time
427 uint8x8_t msb_mask = vdup_n_u8(0x80);
428 uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
429 for ( ; end - src >= 8; src += 8) {
430 uint8x8_t c = vld1_u8(src);
431 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
432 if (!n)
433 continue;
434
435 // find the next probable ASCII character
436 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
437 // characters still coming
438 nextAscii = src + qBitScanReverse(n) + 1;
439
440 // return the non-ASCII character
441 return src + qCountTrailingZeroBits(n);
442 }
443 nextAscii = end;
444 return src;
445}
446
447static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
448{
449}
450#else
451static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
452{
453 return false;
454}
455
456static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
457{
458 return false;
459}
460
461static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
462{
463 nextAscii = end;
464 return src;
465}
466
467static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
468{
469}
470#endif
471
// Bit kept in state->internalState. The converters below test it before
// writing or detecting a BOM and set it afterwards, so that header handling
// happens only once per conversion state.
472enum { HeaderDone = 1 };
473
475{
476 qsizetype len = in.size();
477
478 // create a QByteArray with the worst case scenario size
480 uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
481 const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
482 const char16_t *const end = src + len;
483
484 while (src != end) {
485 const char16_t *nextAscii = end;
486 if (simdEncodeAscii(dst, nextAscii, src, end))
487 break;
488
489 do {
490 char16_t u = *src++;
491 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
492 if (res < 0) {
493 // encoding error - append '?'
494 *dst++ = '?';
495 }
496 } while (src < nextAscii);
497 }
498
499 result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
500 return result;
501}
502
510
512{
514 qsizetype len = in.size();
515 if (!len)
516 return out;
517
518 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
520 *cursor++ = 0;
521 } else {
522 // QChar::replacement encoded in utf8
523 *cursor++ = 0xef;
524 *cursor++ = 0xbf;
525 *cursor++ = 0xbd;
526 }
527 return cursor;
528 };
529
530 uchar *cursor = reinterpret_cast<uchar *>(out);
531 const char16_t *src = in.utf16();
532 const char16_t *const end = src + len;
533
534 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
535 if (state->remainingChars) {
536 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
537 if (res < 0)
538 cursor = appendReplacementChar(cursor);
539 state->state_data[0] = 0;
540 state->remainingChars = 0;
541 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
542 // append UTF-8 BOM
543 *cursor++ = utf8bom[0];
544 *cursor++ = utf8bom[1];
545 *cursor++ = utf8bom[2];
546 state->internalState |= HeaderDone;
547 }
548 }
549
550 while (src != end) {
551 const char16_t *nextAscii = end;
552 if (simdEncodeAscii(cursor, nextAscii, src, end))
553 break;
554
555 do {
556 char16_t uc = *src++;
557 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
558 if (Q_LIKELY(res >= 0))
559 continue;
560
562 // encoding error
563 ++state->invalidChars;
564 cursor = appendReplacementChar(cursor);
565 } else if (res == QUtf8BaseTraits::EndOfString) {
567 ++state->invalidChars;
568 cursor = appendReplacementChar(cursor);
569 } else {
570 state->remainingChars = 1;
571 state->state_data[0] = uc;
572 }
573 return reinterpret_cast<char *>(cursor);
574 }
575 } while (src < nextAscii);
576 }
577
578 return reinterpret_cast<char *>(cursor);
579}
580
582{
583 // ### SIMD-optimize:
584 for (uchar ch : in) {
585 if (ch < 128) {
586 *out++ = ch;
587 } else {
588 // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
589 *out++ = 0b110'0'0000u | (ch >> 6);
590 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
591 }
592 }
593 return out;
594}
595
597{
598 // UTF-8 to UTF-16 always needs the exact same number of words or less:
599 // UTF-8 UTF-16
600 // 1 byte 1 word
601 // 2 bytes 1 word
602 // 3 bytes 1 word
603 // 4 bytes 2 words (one surrogate pair)
604 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
605 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
606 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
607 //
608 // The table holds for invalid sequences too: we'll insert one replacement char
609 // per invalid byte.
611 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
612 const QChar *end = convertToUnicode(data, in);
613 result.truncate(end - data);
614 return result;
615}
616
636char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
637{
638 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
639 const uchar *src = start;
640 const uchar *end = src + in.size();
641
642 // attempt to do a full decoding in SIMD
643 const uchar *nextAscii = end;
644 if (!simdDecodeAscii(dst, nextAscii, src, end)) {
645 // at least one non-ASCII entry
646 // check if we failed to decode the UTF-8 BOM; if so, skip it
647 if (Q_UNLIKELY(src == start)
648 && end - src >= 3
649 && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
650 src += 3;
651 }
652
653 while (src < end) {
654 nextAscii = end;
655 if (simdDecodeAscii(dst, nextAscii, src, end))
656 break;
657
658 do {
659 uchar b = *src++;
660 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
661 if (res < 0) {
662 // decoding error
663 *dst++ = QChar::ReplacementCharacter;
664 }
665 } while (src < nextAscii);
666 }
667 }
668
669 return dst;
670}
671
673{
674 // See above for buffer requirements for stateless decoding. However, that
675 // fails if the state is not empty. The following situations can add to the
676 // requirements:
677 // state contains chars starts with requirement
678 // 1 of 2 bytes valid continuation 0
679 // 2 of 3 bytes same 0
680 // 3 bytes of 4 same +1 (need to insert surrogate pair)
681 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
682 // 2 of 3 bytes same +1 (same)
683 // 3 of 4 bytes same +1 (same)
684 QString result(in.size() + 1, Qt::Uninitialized);
686 result.truncate(end - result.constData());
687 return result;
688}
689
691{
692 qsizetype len = in.size();
693
695 if (!len)
696 return dst;
697
698
699 char16_t replacement = QChar::ReplacementCharacter;
701 replacement = QChar::Null;
702
704 uchar ch = 0;
705
706 const uchar *src = reinterpret_cast<const uchar *>(in.data());
707 const uchar *end = src + len;
708
710 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
711 if (state->remainingChars || !headerdone) {
712 // handle incoming state first
713 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
714 qsizetype remainingCharsCount = state->remainingChars;
715 qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
716
717 memset(remainingCharsData, 0, sizeof(remainingCharsData));
718 memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
719 memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
720
721 const uchar *begin = &remainingCharsData[1];
722 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
723 static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
725 ++state->invalidChars;
726 *dst++ = replacement;
727 ++src;
728 } else if (res == QUtf8BaseTraits::EndOfString) {
729 // if we got EndOfString again, then there were too few bytes in src;
730 // copy to our state and return
731 state->remainingChars = remainingCharsCount + newCharsToCopy;
732 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
733 return dst;
734 } else if (!headerdone) {
735 // eat the UTF-8 BOM
736 if (dst[-1] == 0xfeff)
737 --dst;
738 }
739 state->internalState |= HeaderDone;
740
741 // adjust src now that we have maybe consumed a few chars
742 if (res >= 0) {
743 Q_ASSERT(res > remainingCharsCount);
744 src += res - remainingCharsCount;
745 }
746 }
747 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
748 // stateless, remove initial BOM
749 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
750 // skip BOM
751 src += 3;
752 }
753
754 // main body, stateless decoding
755 res = 0;
756 const uchar *nextAscii = src;
757 while (res >= 0 && src < end) {
758 if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
759 break;
760
761 ch = *src++;
762 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
764 res = 0;
765 ++state->invalidChars;
766 *dst++ = replacement;
767 }
768 }
769
771 // unterminated UTF sequence
773 *dst++ = QChar::ReplacementCharacter;
774 ++state->invalidChars;
775 while (src++ < end) {
776 *dst++ = QChar::ReplacementCharacter;
777 ++state->invalidChars;
778 }
779 state->remainingChars = 0;
780 } else {
781 --src; // unread the byte in ch
782 state->remainingChars = end - src;
783 memcpy(&state->state_data[0], src, end - src);
784 }
785 } else {
786 state->remainingChars = 0;
787 }
788
789 return dst;
790}
791
793{
794 struct NoOutput {};
795 static void appendUtf16(const NoOutput &, char16_t) {}
796 static void appendUcs4(const NoOutput &, char32_t) {}
797};
798
800{
801 const uchar *src = reinterpret_cast<const uchar *>(in.data());
802 const uchar *end = src + in.size();
803 const uchar *nextAscii = src;
804 bool isValidAscii = true;
805
806 while (src < end) {
807 if (src >= nextAscii)
808 src = simdFindNonAscii(src, end, nextAscii);
809 if (src == end)
810 break;
811
812 do {
813 uchar b = *src++;
814 if ((b & 0x80) == 0)
815 continue;
816
817 isValidAscii = false;
819 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
820 if (res < 0) {
821 // decoding error
822 return { false, false };
823 }
824 } while (src < nextAscii);
825 }
826
827 return { true, isValidAscii };
828}
829
831{
832 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
833 auto end1 = src1 + utf8.size();
834 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
835 auto end2 = src2 + utf16.size();
836
837 do {
838 simdCompareAscii(src1, end1, src2, end2);
839
840 if (src1 < end1 && src2 < end2) {
841 char32_t uc1 = *src1++;
842 char32_t uc2 = *src2++;
843
844 if (uc1 >= 0x80) {
845 char32_t *output = &uc1;
846 qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraitsNoAscii>(uc1, output, src1, end1);
847 if (res < 0) {
848 // decoding error
849 uc1 = QChar::ReplacementCharacter;
850 }
851
852 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
853 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
854 if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
855 uc2 = QChar::surrogateToUcs4(uc2, *src2++);
856 }
857 if (cs == Qt::CaseInsensitive) {
858 uc1 = QChar::toCaseFolded(uc1);
859 uc2 = QChar::toCaseFolded(uc2);
860 }
861 if (uc1 != uc2)
862 return int(uc1) - int(uc2);
863 }
864 } while (src1 < end1 && src2 < end2);
865
866 // the shorter string sorts first
867 return (end1 > src1) - int(end2 > src2);
868}
869
871{
872 char32_t uc1 = QChar::Null;
873 auto src1 = reinterpret_cast<const uchar *>(utf8.data());
874 auto end1 = src1 + utf8.size();
875 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
876 auto end2 = src2 + s.size();
877
878 while (src1 < end1 && src2 < end2) {
879 uchar b = *src1++;
880 char32_t *output = &uc1;
881 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
882 if (res < 0) {
883 // decoding error
884 uc1 = QChar::ReplacementCharacter;
885 }
886
887 char32_t uc2 = *src2++;
888 if (cs == Qt::CaseInsensitive) {
889 uc1 = QChar::toCaseFolded(uc1);
890 uc2 = QChar::toCaseFolded(uc2);
891 }
892 if (uc1 != uc2)
893 return int(uc1) - int(uc2);
894 }
895
896 // the shorter string sorts first
897 return (end1 > src1) - (end2 > src2);
898}
899
901{
902 if (lhs.isEmpty())
903 return qt_lencmp(0, rhs.size());
904
905 if (cs == Qt::CaseSensitive) {
906 const auto l = std::min(lhs.size(), rhs.size());
907 int r = memcmp(lhs.data(), rhs.data(), l);
908 return r ? r : qt_lencmp(lhs.size(), rhs.size());
909 }
910
911 char32_t uc1 = QChar::Null;
912 auto src1 = reinterpret_cast<const uchar *>(lhs.data());
913 auto end1 = src1 + lhs.size();
914 char32_t uc2 = QChar::Null;
915 auto src2 = reinterpret_cast<const uchar *>(rhs.data());
916 auto end2 = src2 + rhs.size();
917
918 while (src1 < end1 && src2 < end2) {
919 uchar b = *src1++;
920 char32_t *output = &uc1;
921 qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src1, end1);
922 if (res < 0) {
923 // decoding error
924 uc1 = QChar::ReplacementCharacter;
925 }
926
927 b = *src2++;
928 output = &uc2;
929 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, output, src2, end2);
930 if (res < 0) {
931 // decoding error
932 uc2 = QChar::ReplacementCharacter;
933 }
934
935 uc1 = QChar::toCaseFolded(uc1);
936 uc2 = QChar::toCaseFolded(uc2);
937 if (uc1 != uc2)
938 return int(uc1) - int(uc2);
939 }
940
941 // the shorter string sorts first
942 return (end1 > src1) - (end2 > src2);
943}
944
945#ifndef QT_BOOTSTRAPPED
947{
948 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
949 qsizetype length = 2 * in.size();
950 if (writeBom)
951 length += 2;
952
954 char *end = convertFromUnicode(d.data(), in, state, endian);
955 Q_ASSERT(end - d.constData() == d.size());
956 Q_UNUSED(end);
957 return d;
958}
959
961{
963 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
964
965 if (endian == DetectEndianness)
967
968 if (writeBom) {
969 // set them up the BOM
970 QChar bom(QChar::ByteOrderMark);
971 if (endian == BigEndianness)
972 qToBigEndian(bom.unicode(), out);
973 else
974 qToLittleEndian(bom.unicode(), out);
975 out += 2;
976 }
977 if (endian == BigEndianness)
978 qToBigEndian<char16_t>(in.data(), in.size(), out);
979 else
980 qToLittleEndian<char16_t>(in.data(), in.size(), out);
981
982 state->remainingChars = 0;
983 state->internalState |= HeaderDone;
984 return out + 2*in.size();
985}
986
988{
989 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
990 QChar *qch = convertToUnicode(result.data(), in, state, endian);
991 result.truncate(qch - result.constData());
992 return result;
993}
994
996{
997 qsizetype len = in.size();
998 const char *chars = in.data();
999
1000 Q_ASSERT(state);
1001
1002 if (endian == DetectEndianness)
1003 endian = (DataEndianness)state->state_data[Endian];
1004
1005 const char *end = chars + len;
1006
1007 // make sure we can decode at least one char
1008 if (state->remainingChars + len < 2) {
1009 if (len) {
1010 Q_ASSERT(state->remainingChars == 0 && len == 1);
1011 state->remainingChars = 1;
1012 state->state_data[Data] = *chars;
1013 }
1014 return out;
1015 }
1016
1017 bool headerdone = state && state->internalState & HeaderDone;
1019 headerdone = true;
1020
1021 if (!headerdone || state->remainingChars) {
1022 uchar buf;
1023 if (state->remainingChars)
1024 buf = state->state_data[Data];
1025 else
1026 buf = *chars++;
1027
1028 // detect BOM, set endianness
1029 state->internalState |= HeaderDone;
1030 QChar ch(buf, *chars++);
1031 if (endian == DetectEndianness) {
1032 // someone set us up the BOM
1033 if (ch == QChar::ByteOrderSwapped) {
1034 endian = BigEndianness;
1035 } else if (ch == QChar::ByteOrderMark) {
1036 endian = LittleEndianness;
1037 } else {
1039 endian = BigEndianness;
1040 } else {
1041 endian = LittleEndianness;
1042 }
1043 }
1044 }
1045 if (endian == BigEndianness)
1046 ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
1047 if (headerdone || ch != QChar::ByteOrderMark)
1048 *out++ = ch;
1049 } else if (endian == DetectEndianness) {
1051 }
1052
1053 qsizetype nPairs = (end - chars) >> 1;
1054 if (endian == BigEndianness)
1055 qFromBigEndian<char16_t>(chars, nPairs, out);
1056 else
1057 qFromLittleEndian<char16_t>(chars, nPairs, out);
1058 out += nPairs;
1059
1060 state->state_data[Endian] = endian;
1061 state->remainingChars = 0;
1062 if ((end - chars) & 1) {
1064 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1065 } else {
1066 state->remainingChars = 1;
1067 state->state_data[Data] = *(end - 1);
1068 }
1069 } else {
1070 state->state_data[Data] = 0;
1071 }
1072
1073 return out;
1074}
1075
1077{
1078 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1079 qsizetype length = 4*in.size();
1080 if (writeBom)
1081 length += 4;
1083 char *end = convertFromUnicode(ba.data(), in, state, endian);
1084 ba.truncate(end - ba.constData());
1085 return ba;
1086}
1087
1089{
1090 Q_ASSERT(state);
1091
1092 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1093 if (endian == DetectEndianness)
1095
1096 if (writeBom) {
1097 // set them up the BOM
1098 if (endian == BigEndianness) {
1099 out[0] = 0;
1100 out[1] = 0;
1101 out[2] = (char)0xfe;
1102 out[3] = (char)0xff;
1103 } else {
1104 out[0] = (char)0xff;
1105 out[1] = (char)0xfe;
1106 out[2] = 0;
1107 out[3] = 0;
1108 }
1109 out += 4;
1110 state->internalState |= HeaderDone;
1111 }
1112
1113 const QChar *uc = in.data();
1114 const QChar *end = in.data() + in.size();
1115 QChar ch;
1116 char32_t ucs4;
1117 if (state->remainingChars == 1) {
1118 auto character = state->state_data[Data];
1119 Q_ASSERT(character <= 0xFFFF);
1120 ch = QChar(character);
1121 // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1122 state->remainingChars = 0;
1123 goto decode_surrogate;
1124 }
1125
1126 while (uc < end) {
1127 ch = *uc++;
1128 if (Q_LIKELY(!ch.isSurrogate())) {
1129 ucs4 = ch.unicode();
1130 } else if (Q_LIKELY(ch.isHighSurrogate())) {
1131decode_surrogate:
1132 if (uc == end) {
1134 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1135 } else {
1136 state->remainingChars = 1;
1137 state->state_data[Data] = ch.unicode();
1138 return out;
1139 }
1140 } else if (uc->isLowSurrogate()) {
1141 ucs4 = QChar::surrogateToUcs4(ch, *uc++);
1142 } else {
1143 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1144 }
1145 } else {
1146 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1147 }
1148 if (endian == BigEndianness)
1149 qToBigEndian(ucs4, out);
1150 else
1151 qToLittleEndian(ucs4, out);
1152 out += 4;
1153 }
1154
1155 return out;
1156}
1157
1159{
1161 result.resize((in.size() + 7) >> 1); // worst case
1162 QChar *end = convertToUnicode(result.data(), in, state, endian);
1163 result.truncate(end - result.constData());
1164 return result;
1165}
1166
1168{
1169 qsizetype len = in.size();
1170 const char *chars = in.data();
1171
1172 Q_ASSERT(state);
1173 if (endian == DetectEndianness)
1174 endian = (DataEndianness)state->state_data[Endian];
1175
1176 const char *end = chars + len;
1177
1178 uchar tuple[4];
1179 memcpy(tuple, &state->state_data[Data], 4);
1180
1181 // make sure we can decode at least one char
1182 if (state->remainingChars + len < 4) {
1183 if (len) {
1184 while (chars < end) {
1185 tuple[state->remainingChars] = *chars;
1186 ++state->remainingChars;
1187 ++chars;
1188 }
1189 Q_ASSERT(state->remainingChars < 4);
1190 memcpy(&state->state_data[Data], tuple, 4);
1191 }
1192 return out;
1193 }
1194
1195 bool headerdone = state->internalState & HeaderDone;
1197 headerdone = true;
1198
1199 qsizetype num = state->remainingChars;
1200 state->remainingChars = 0;
1201
1202 if (!headerdone || endian == DetectEndianness || num) {
1203 while (num < 4)
1204 tuple[num++] = *chars++;
1205 if (endian == DetectEndianness) {
1206 // someone set us up the BOM?
1207 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1208 endian = LittleEndianness;
1209 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1210 endian = BigEndianness;
1212 endian = BigEndianness;
1213 } else {
1214 endian = LittleEndianness;
1215 }
1216 }
1217 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1218 if (headerdone || code != QChar::ByteOrderMark) {
1219 if (QChar::requiresSurrogates(code)) {
1220 *out++ = QChar(QChar::highSurrogate(code));
1221 *out++ = QChar(QChar::lowSurrogate(code));
1222 } else {
1223 *out++ = QChar(code);
1224 }
1225 }
1226 num = 0;
1227 } else if (endian == DetectEndianness) {
1229 }
1230 state->state_data[Endian] = endian;
1231 state->internalState |= HeaderDone;
1232
1233 while (chars < end) {
1234 tuple[num++] = *chars++;
1235 if (num == 4) {
1236 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1237 for (char16_t c : QChar::fromUcs4(code))
1238 *out++ = c;
1239 num = 0;
1240 }
1241 }
1242
1243 if (num) {
1245 *out++ = QChar::ReplacementCharacter;
1246 } else {
1247 state->state_data[Endian] = endian;
1248 state->remainingChars = num;
1249 memcpy(&state->state_data[Data], tuple, 4);
1250 }
1251 }
1252
1253 return out;
1254}
1255#endif // !QT_BOOTSTRAPPED
1256
1257#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1258int QLocal8Bit::checkUtf8()
1259{
1260 return GetACP() == CP_UTF8 ? 1 : -1;
1261}
1262
1263QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1264{
1265 return convertToUnicode_sys(in, CP_ACP, state);
1266}
1267
1268QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1270{
1271 const char *mb = in.data();
1272 qsizetype mblen = in.size();
1273
1274 Q_ASSERT(state);
1275 qsizetype &invalidChars = state->invalidChars;
1277 const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1278 const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1279 : QChar::ReplacementCharacter;
1280 if (state->flags & Flag::Stateless) {
1281 Q_ASSERT(state->remainingChars == 0);
1282 state = nullptr;
1283 }
1284
1285 if (!mb || !mblen)
1286 return QString();
1287
1288 // Use a local stack-buffer at first to allow us a decently large container
1289 // to avoid a lot of resizing, without also returning an overallocated
1290 // QString to the user for small strings.
1291 // Then we can be fast for small strings and take the hit of extra resizes
1292 // and measuring how much storage is needed for large strings.
1293 std::array<wchar_t, 4096> buf;
1294 wchar_t *out = buf.data();
1295 qsizetype outlen = buf.size();
1296
1297 QString sp;
1298
1299 // Return a pointer to storage where we have enough space for `size`
1300 const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
1301 if (outlen >= size)
1302 return {out, outlen};
1303 const bool wasStackBuffer = sp.isEmpty();
1304 const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
1305 const qsizetype offset = qsizetype(std::distance(begin, out));
1306 qsizetype newSize = 0;
1307 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1308 Q_CHECK_PTR(false);
1309 return {nullptr, 0};
1310 }
1311 sp.resize(newSize);
1312 auto it = reinterpret_cast<wchar_t *>(sp.data());
1313 if (wasStackBuffer)
1314 it = std::copy_n(buf.data(), offset, it);
1315 else
1316 it += offset;
1317 return {it, size};
1318 };
1319
1320 // Convert the pending characters (if available)
1321 while (state && state->remainingChars && mblen) {
1322 QStringConverter::State localState;
1323 localState.flags = state->flags;
1324 // Use at most 6 characters as a guess for the longest encoded character
1325 // in any multibyte encoding.
1326 // Even with a total of 2 bytes of overhead that would leave around
1327 // 2^(4 * 8) possible characters
1328 std::array<char, 6> prev = {0};
1329 Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
1330 qsizetype index = 0;
1331 for (; index < state->remainingChars; ++index)
1332 prev[index] = state->state_data[index];
1333 const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1334 for (qsizetype i = 0; i < toCopy; ++i, ++index)
1335 prev[index] = mb[i];
1336 mb += toCopy;
1337 mblen -= toCopy;
1338
1339 // Recursing:
1340 // Since we are using a clean local state it will try to decode what was
1341 // stored in our state + some extra octets from input (`prev`). If some
1342 // part fails we will have those characters stored in the local state's
1343 // storage, and we can extract those. It may also output some
1344 // replacement characters, which we'll count in the invalidChars.
1345 // In the best case we only do this once, but we will loop until we have
1346 // resolved all the remaining characters or we have run out of new input
1347 // in which case we may still have remaining characters.
1348 const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1349 &localState);
1350 std::tie(out, outlen) = growOut(tmp.size());
1351 if (!out)
1352 return {};
1353 out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
1354 outlen -= tmp.size();
1355 const qsizetype tail = toCopy - localState.remainingChars;
1356 if (tail >= 0) {
1357 // Everything left to process comes from `in`, so we can stop
1358 // looping. Adjust the window for `in` and unset remainingChars to
1359 // signal that we're done.
1360 mb -= localState.remainingChars;
1361 mblen += localState.remainingChars;
1362 localState.remainingChars = 0;
1363 }
1364 state->remainingChars = localState.remainingChars;
1365 state->invalidChars += localState.invalidChars;
1366 std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1367 }
1368
1369 Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
1370
1371 // Need it in this scope, since we try to decrease our window size if we
1372 // encounter an error
1373 int nextIn = qt_saturate<int>(mblen);
1374 while (mblen > 0) {
1375 std::tie(out, outlen) = growOut(1); // Need space for at least one character
1376 if (!out)
1377 return {};
1378 const int nextOut = qt_saturate<int>(outlen);
1379 int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1380 if (len) {
1381 mb += nextIn;
1382 mblen -= nextIn;
1383 out += len;
1384 outlen -= len;
1385 } else {
1386 int r = GetLastError();
1387 if (r == ERROR_INSUFFICIENT_BUFFER) {
1388 const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
1389 std::tie(out, outlen) = growOut(wclen);
1390 if (!out)
1391 return {};
1392 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1393 // Can't decode the current window, so either store the state,
1394 // reduce window size or output a replacement character.
1395
1396 // Check if we can store all remaining characters in the state
1397 // to be used next time we're called:
1398 if (state && mblen <= q20::ssize(state->state_data)) {
1399 state->remainingChars = mblen;
1400 std::copy_n(mb, mblen, state->state_data);
1401 mb += mblen;
1402 mblen = 0;
1403 break;
1404 }
1405
1406 // .. if not, try to find the last valid character in the window
1407 // and try again with a shrunken window:
1408 if (nextIn > 1) {
1409 // There may be some incomplete data at the end of our current
1410 // window, so decrease the window size and try again.
1411 // In the worst case scenario there is gigs of undecodable
1412 // garbage, but what are we supposed to do about that?
1413 const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
1414 if (it != mb)
1415 nextIn = int(it - mb);
1416 else
1417 --nextIn;
1418 continue;
1419 }
1420
1421 // Finally, we are forced to output a replacement character for
1422 // the first byte in the window:
1423 std::tie(out, outlen) = growOut(1);
1424 if (!out)
1425 return {};
1426 *out = replacementCharacter;
1427 ++invalidChars;
1428 ++out;
1429 --outlen;
1430 ++mb;
1431 --mblen;
1432 } else {
1433 // Fail.
1434 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1435 break;
1436 }
1437 }
1438 nextIn = qt_saturate<int>(mblen);
1439 }
1440
1441 if (sp.isEmpty()) {
1442 // We must have only used the stack buffer
1443 if (out != buf.data()) // else: we return null-string
1444 sp = QStringView(buf.data(), out).toString();
1445 } else{
1446 const auto begin = reinterpret_cast<wchar_t *>(sp.data());
1447 sp.truncate(std::distance(begin, out));
1448 }
1449
1450 if (sp.size() && sp.back().isNull())
1451 sp.chop(1);
1452
1453 if (!state && mblen > 0) {
1454 // We have trailing character(s) that could not be converted, and
1455 // nowhere to cache them
1456 sp.resize(sp.size() + mblen, replacementCharacter);
1457 invalidChars += mblen;
1458 }
1459 return sp;
1460}
1461
1462QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1463{
1464 return convertFromUnicode_sys(in, CP_ACP, state);
1465}
1466
1467QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1469{
1470 const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
1471 qsizetype uclen = in.size();
1472
1473 Q_ASSERT(state);
1474 // The Windows API has a *boolean* out-parameter that says if a replacement
1475 // character was used, but it gives us no way to know _how many_ were used.
1476 // Since we cannot simply scan the string for replacement characters
1477 // (which is potentially a question mark, and thus a valid character),
1478 // we simply do not track the number of invalid characters here.
1479 // auto &invalidChars = state->invalidChars;
1480
1482 if (state->flags & Flag::Stateless) { // temporary
1483 Q_ASSERT(state->remainingChars == 0);
1484 state = nullptr;
1485 }
1486
1487 if (!ch)
1488 return QByteArray();
1489 if (uclen == 0)
1490 return QByteArray("");
1491
1492 // Use a local stack-buffer at first to allow us a decently large container
1493 // to avoid a lot of resizing, without also returning an overallocated
1494 // QByteArray to the user for small strings.
1495 // Then we can be fast for small strings and take the hit of extra resizes
1496 // and measuring how much storage is needed for large strings.
1497 std::array<char, 4096> buf;
1498 char *out = buf.data();
1499 qsizetype outlen = buf.size();
1500 QByteArray mb;
1501
1502 if (state && state->remainingChars > 0) {
1503 Q_ASSERT(state->remainingChars == 1);
1504 // Let's try to decode the pending character
1505 wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
1506 // Check if the second character is a valid low surrogate,
1507 // otherwise we'll just decode the first character, for which windows
1508 // will output a replacement character.
1509 const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
1510 int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
1511 nullptr);
1512 if (!len)
1513 return {}; // Cannot recover, and I refuse to believe it was a size limitation
1514 out += len;
1515 outlen -= len;
1516 if (validCodePoint) {
1517 ++ch;
1518 --uclen;
1519 }
1520 state->remainingChars = 0;
1521 state->state_data[0] = 0;
1522 if (uclen == 0)
1523 return QByteArrayView(buf.data(), len).toByteArray();
1524 }
1525
1526 if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
1527 // We can handle a missing low surrogate at the end of the string,
1528 // so if there is one, exclude it now and store it in the state.
1529 state->remainingChars = 1;
1530 state->state_data[0] = ch[uclen - 1];
1531 --uclen;
1532 if (uclen == 0)
1533 return QByteArray();
1534 }
1535
1536 Q_ASSERT(uclen > 0);
1537
1538 // Return a pointer to storage where we have enough space for `size`
1539 const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
1540 if (outlen >= size)
1541 return {out, outlen};
1542 const bool wasStackBuffer = mb.isEmpty();
1543 const auto begin = wasStackBuffer ? buf.data() : mb.data();
1544 const qsizetype offset = qsizetype(std::distance(begin, out));
1545 qsizetype newSize = 0;
1546 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1547 Q_CHECK_PTR(false);
1548 return {nullptr, 0};
1549 }
1550 mb.resize(newSize);
1551 auto it = mb.data();
1552 if (wasStackBuffer)
1553 it = std::copy_n(buf.data(), offset, it);
1554 else
1555 it += offset;
1556 return {it, size};
1557 };
1558
1559 const auto getNextWindowSize = [&]() {
1560 int nextIn = qt_saturate<int>(uclen);
1561 // The Windows API has some issues if the current window ends in the
1562 // middle of a surrogate pair, so we avoid that:
1563 if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
1564 --nextIn;
1565 return nextIn;
1566 };
1567
1568 int len = 0;
1569 while (uclen > 0) {
1570 const int nextIn = getNextWindowSize();
1571 std::tie(out, outlen) = growOut(1); // We need at least one byte
1572 if (!out)
1573 return {};
1574 const int nextOut = qt_saturate<int>(outlen);
1575 len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
1576 if (len > 0) {
1577 ch += nextIn;
1578 uclen -= nextIn;
1579 out += len;
1580 outlen -= len;
1581 } else {
1582 int r = GetLastError();
1583 if (r == ERROR_INSUFFICIENT_BUFFER) {
1584 int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
1585 nullptr, nullptr);
1586 if (neededLength <= 0) {
1587 // Fail. Observed with UTF8 where the input window was max int and ended in an
1588 // incomplete sequence, probably a Windows bug. We try to avoid that from
1589 // happening by reducing the window size in that case. But let's keep this
1590 // branch just in case of other bugs.
1591#ifndef QT_NO_DEBUG
1592 r = GetLastError();
1593 fprintf(stderr,
1594 "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1595#endif // !QT_NO_DEBUG
1596 break;
1597 }
1598 std::tie(out, outlen) = growOut(neededLength);
1599 if (!out)
1600 return {};
1601 // and try again...
1602 } else {
1603 // Fail. Probably can't happen in fact (dwFlags is 0).
1604#ifndef QT_NO_DEBUG
1605 // Can't use qWarning(), as it'll recurse to handle %ls
1606 fprintf(stderr,
1607 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n", r,
1608 reinterpret_cast<const wchar_t *>(
1609 QStringView(ch, uclen).left(100).toString().utf16()));
1610#endif
1611 break;
1612 }
1613 }
1614 }
1615 if (mb.isEmpty()) {
1616 // We must have only used the stack buffer
1617 if (out != buf.data()) // else: we return null-array
1618 mb = QByteArrayView(buf.data(), out).toByteArray();
1619 } else {
1620 mb.truncate(std::distance(mb.data(), out));
1621 }
1622 return mb;
1623}
1624#endif
1625
1626void QStringConverter::State::clear() noexcept
1627{
1628 if (clearFn)
1629 clearFn(this);
1630 else
1631 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1632 remainingChars = 0;
1633 invalidChars = 0;
1634 internalState = 0;
1635}
1636
1637void QStringConverter::State::reset() noexcept
1638{
1639 if (flags & Flag::UsesIcu) {
1640#if QT_CONFIG(icu)
1641 UConverter *converter = static_cast<UConverter *>(d[0]);
1642 if (converter)
1643 ucnv_reset(converter);
1644#else
1645 Q_UNREACHABLE();
1646#endif
1647 } else {
1648 clear();
1649 }
1650}
1651
1652#ifndef QT_BOOTSTRAPPED
1657
1662
1667
1672
1677
1682
1687
1692
1697
1702
1707
1712#endif // !QT_BOOTSTRAPPED
1713
1715{
1716 Q_ASSERT(state);
1717 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1718 state = nullptr;
1719
1720 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1721 qsizetype invalid = 0;
1722 for (qsizetype i = 0; i < in.size(); ++i) {
1723 if (in[i] > QChar(0xff)) {
1724 *out = replacement;
1725 ++invalid;
1726 } else {
1727 *out = (char)in[i].cell();
1728 }
1729 ++out;
1730 }
1731 if (state)
1732 state->invalidChars += invalid;
1733 return out;
1734}
1735
1737{
1739 memcpy(out, s.constData(), s.size()*sizeof(QChar));
1740 return out + s.size();
1741}
1742
1744{
1746 memcpy(out, s.constData(), s.size());
1747 return out + s.size();
1748}
1749
1750
1751static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1752static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1753
1754#ifndef QT_BOOTSTRAPPED
1755static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1756static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1757
1758static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1759static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1760#endif
1761
1762static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1763static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1764
1765
1766
1897const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
1898{
1900#ifndef QT_BOOTSTRAPPED
1901 { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
1902 { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
1903 { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
1904 { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
1905 { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
1906 { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
1907#endif
1910};
1911
1912// match names case insensitive and skipping '-' and '_'
1913static bool nameMatch(const char *a, const char *b)
1914{
1915 do {
1916 while (*a == '-' || *a == '_')
1917 ++a;
1918 while (*b == '-' || *b == '_')
1919 ++b;
1920 if (!*a && !*b) // end of both strings
1921 return true;
1923
1924 return false;
1925}
1926
1927
1939#if QT_CONFIG(icu)
1940// only derives from QStringConverter to get access to protected types
1941struct QStringConverterICU : QStringConverter
1942{
1943 static void clear_function(QStringConverterBase::State *state) noexcept
1944 {
1945 ucnv_close(static_cast<UConverter *>(state->d[0]));
1946 state->d[0] = nullptr;
1947 }
1948
1949 static void ensureConverter(QStringConverter::State *state)
1950 {
1951 // old code might reset the state via clear instead of reset
1952 // in that case, the converter has been closed, and we have to reopen it
1953 if (state->d[0] == nullptr)
1954 state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
1955 }
1956
1958 {
1959 ensureConverter(state);
1960
1961 auto icu_conv = static_cast<UConverter *>(state->d[0]);
1962 UErrorCode err = U_ZERO_ERROR;
1963 auto source = in.data();
1964 auto sourceLimit = in.data() + in.size();
1965
1966 qsizetype length = toLen(in.size());
1967
1968 UChar *target = reinterpret_cast<UChar *>(out);
1969 auto targetLimit = target + length;
1970 // We explicitly clean up anyway, so no need to set flush to true,
1971 // which would just reset the converter.
1972 UBool flush = false;
1973
1974 // If the QStringConverter was moved, the state that we used as a context is stale now.
1975 UConverterToUCallback action;
1976 const void *context;
1977 ucnv_getToUCallBack(icu_conv, &action, &context);
1978 if (context != state)
1979 ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
1980
1981 ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
1982 // We did reserve enough space:
1983 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
1984 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
1985 if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
1986 ucnv_reset(icu_conv);
1987 state->invalidChars += leftOver;
1988 }
1989 }
1990 return reinterpret_cast<QChar *>(target);
1991 }
1992
1994 {
1995 ensureConverter(state);
1996 auto icu_conv = static_cast<UConverter *>(state->d[0]);
1997 UErrorCode err = U_ZERO_ERROR;
1998 auto source = reinterpret_cast<const UChar *>(in.data());
1999 auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
2000
2001 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2002
2003 char *target = out;
2004 char *targetLimit = out + length;
2005 UBool flush = false;
2006
2007 // If the QStringConverter was moved, the state that we used as a context is stale now.
2008 UConverterFromUCallback action;
2009 const void *context;
2010 ucnv_getFromUCallBack(icu_conv, &action, &context);
2011 if (context != state)
2012 ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
2013
2014 ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
2015 // We did reserve enough space:
2016 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2017 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2018 if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
2019 ucnv_reset(icu_conv);
2020 state->invalidChars += leftOver;
2021 }
2022 }
2023 return target;
2024 }
2025
2026 Q_DISABLE_COPY_MOVE(QStringConverterICU)
2027
2028 template<qsizetype X>
2029 static qsizetype fromLen(qsizetype inLength)
2030 {
2031 return X * inLength * sizeof(UChar);
2032 }
2033
2034 static qsizetype toLen(qsizetype inLength)
2035 {
2036
2037 /* Assumption: each input char might map to a different codepoint
2038 Each codepoint can take up to 4 bytes == 2 QChar
2039 We can ignore reserving space for a BOM, as only UTF encodings use one
2040 and those are not handled by the ICU converter.
2041 */
2042 return 2 * inLength;
2043 }
2044
2045 static constexpr QStringConverter::Interface forLength[] = {
2046 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
2047 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
2048 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
2049 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
2050 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
2051 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
2052 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
2053 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
2054 };
2055
2056 static UConverter *createConverterForName(const char *name, const State *state)
2057 {
2058 Q_ASSERT(name);
2059 Q_ASSERT(state);
2060 UErrorCode status = U_ZERO_ERROR;
2061 UConverter *conv = ucnv_open(name, &status);
2062 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2063 ucnv_close(conv);
2064 return nullptr;
2065 }
2066
2067 if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
2068 UErrorCode error = U_ZERO_ERROR;
2069
2070 auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2071 const char *, int32_t length,
2072 UConverterCallbackReason reason, UErrorCode *err) {
2073 if (reason <= UCNV_IRREGULAR) {
2074 *err = U_ZERO_ERROR;
2075 UChar c = '\0';
2076 ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
2077 // Recover outer scope's state (which isn't const) from context:
2078 auto state = const_cast<State *>(static_cast<const State *>(context));
2079 state->invalidChars += length;
2080 }
2081 };
2082 ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
2083
2084 auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2085 const UChar *, int32_t length,
2086 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2087 if (reason <= UCNV_IRREGULAR) {
2088 *err = U_ZERO_ERROR;
2089 const UChar replacement[] = { 0 };
2090 const UChar *stringBegin = std::begin(replacement);
2091 ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
2092 // Recover outer scope's state (which isn't const) from context:
2093 auto state = const_cast<State *>(static_cast<const State *>(context));
2094 state->invalidChars += length;
2095 }
2096 };
2097 ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
2098 } else {
2099 UErrorCode error = U_ZERO_ERROR;
2100
2101 auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2102 const char *codeUnits,int32_t length,
2103 UConverterCallbackReason reason, UErrorCode *err) {
2104 if (reason <= UCNV_IRREGULAR) {
2105 // Recover outer scope's state (which isn't const) from context:
2106 auto state = const_cast<State *>(static_cast<const State *>(context));
2107 state->invalidChars += length;
2108 }
2109 // use existing ICU callback for logic
2110 UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
2111
2112 };
2113 ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
2114
2115 auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2116 const UChar *codeUnits, int32_t length,
2117 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2118 if (reason <= UCNV_IRREGULAR) {
2119 // Recover outer scope's state (which isn't const) from context:
2120 auto state = const_cast<State *>(static_cast<const State *>(context));
2121 state->invalidChars += length;
2122 }
2123 // use existing ICU callback for logic
2124 UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
2125 codePoint, reason, err);
2126 };
2127 ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
2128 }
2129 return conv;
2130 }
2131
2132 static const QStringConverter::Interface *make_icu_converter(
2134 const char *name)
2135 {
2136 UErrorCode status = U_ZERO_ERROR;
2137 UConverter *conv = createConverterForName(name, state);
2138 if (!conv)
2139 return nullptr;
2140
2141 const char *icuName = ucnv_getName(conv, &status);
2142 // ucnv_getStandardName returns a name which is owned by the library
2143 // we can thus store it in the state without worrying about its lifetime
2144 const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
2145 if (U_FAILURE(status) || !persistentName) {
2146 status = U_ZERO_ERROR;
2147 persistentName = ucnv_getStandardName(icuName, "IANA", &status);
2148 }
2149 state->d[1] = const_cast<char *>(persistentName);
2150 state->d[0] = conv;
2152 qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
2153 state->clearFn = QStringConverterICU::clear_function;
2154 if (maxCharSize > 8 || maxCharSize < 1) {
2155 qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
2156 return nullptr;
2157 } else {
2158 return &forLength[maxCharSize - 1];
2159 }
2160
2161 }
2162
2163};
2164#endif
2165
2170 : iface(nullptr), state(f)
2171{
2172 auto e = encodingForName(name);
2173 if (e)
2174 iface = encodingInterfaces + int(*e);
2175#if QT_CONFIG(icu)
2176 else
2177 iface = QStringConverterICU::make_icu_converter(&state, name);
2178#endif
2179}
2180
2181
2182const char *QStringConverter::name() const noexcept
2183{
2184 if (!iface)
2185 return nullptr;
2187#if QT_CONFIG(icu)
2188 return static_cast<const char*>(state.d[1]);
2189#else
2190 return nullptr;
2191#endif
2192 } else {
2193 return iface->name;
2194 }
2195}
2196
2242std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(const char *name) noexcept
2243{
2244 if (!name)
2245 return std::nullopt;
2246 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2247 if (nameMatch(encodingInterfaces[i].name, name))
2249 }
2250 if (nameMatch(name, "latin1"))
2252 return std::nullopt;
2253}
2254
2255#ifndef QT_BOOTSTRAPPED
2263std::optional<QStringConverter::Encoding>
2264QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2265{
2266 // someone set us up the BOM?
2267 qsizetype arraySize = data.size();
2268 if (arraySize > 3) {
2269 char32_t uc = qFromUnaligned<char32_t>(data.data());
2270 if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
2272 if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
2274 if (expectedFirstCharacter) {
2275 // catch also anything starting with the expected character
2276 if (qToLittleEndian(uc) == expectedFirstCharacter)
2278 else if (qToBigEndian(uc) == expectedFirstCharacter)
2280 }
2281 }
2282
2283 if (arraySize > 2) {
2284 if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
2286 }
2287
2288 if (arraySize > 1) {
2289 char16_t uc = qFromUnaligned<char16_t>(data.data());
2290 if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
2292 if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
2294 if (expectedFirstCharacter) {
2295 // catch also anything starting with the expected character
2296 if (qToLittleEndian(uc) == expectedFirstCharacter)
2298 else if (qToBigEndian(uc) == expectedFirstCharacter)
2300 }
2301 }
2302 return std::nullopt;
2303}
2304
2306{
2307 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
2308 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
2309
2310 QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
2311 qsizetype pos = metaSearcher.indexIn(header);
2312 if (pos != -1) {
2313 pos = charsetSearcher.indexIn(header, pos);
2314 if (pos != -1) {
2315 pos += qstrlen("charset=");
2316 if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
2317 ++pos;
2318
2319 qsizetype pos2 = pos;
2320 // The attribute can be closed with either """, "'", ">" or "/",
2321 // none of which are valid charset characters.
2322 while (++pos2 < header.size()) {
2323 char ch = header.at(pos2);
2324 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
2325 QByteArray name = header.mid(pos, pos2 - pos);
2326 qsizetype colon = name.indexOf(':');
2327 if (colon > 0)
2328 name = name.left(colon);
2329 name = name.simplified();
2330 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2331 name = QByteArrayLiteral("UTF-8");
2332 if (!name.isEmpty())
2333 return name;
2334 }
2335 }
2336 }
2337 }
2338 return QByteArray();
2339}
2340
2349std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2350{
2351 // determine charset
2352 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2353 if (encoding)
2354 // trust the initial BOM
2355 return encoding;
2356
2358 if (!encodingTag.isEmpty())
2359 return encodingForName(encodingTag);
2360
2361 return Utf8;
2362}
2363
2365{
2366#if !QT_CONFIG(icu)
2368#else
2369 /* icu contains also the names of what Qt provides
2370 except for the special Locale one (so add one for it)
2371 */
2372 return 1 + ucnv_countAvailable();
2373#endif
2374}
2375
2390{
2391 auto availableCodec = [](qsizetype index) -> QString
2392 {
2393 #if !QT_CONFIG(icu)
2394 return QString::fromLatin1(encodingInterfaces[index].name);
2395 #else
2396 if (index == 0) // "Locale", not provided by icu
2397 return QString::fromLatin1(
2398 encodingInterfaces[QStringConverter::Encoding::System].name);
2399 // this mirrors the setup we do to set a converters name
2400 UErrorCode status = U_ZERO_ERROR;
2401 auto icuName = ucnv_getAvailableName(int32_t(index - 1));
2402 const char *standardName = ucnv_getStandardName(icuName, "MIME", &status);
2403 if (U_FAILURE(status) || !standardName) {
2404 status = U_ZERO_ERROR;
2405 standardName = ucnv_getStandardName(icuName, "IANA", &status);
2406 }
2407 if (!standardName)
2408 standardName = icuName;
2409 return QString::fromLatin1(standardName);
2410 #endif
2411 };
2412
2413 qsizetype codecCount = availableCodecCount();
2415 result.reserve(codecCount);
2416 for (qsizetype i = 0; i < codecCount; ++i)
2417 result.push_back(availableCodec(i));
2418 return result;
2419}
2420
2431{
2432 // determine charset
2433 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2434 if (encoding)
2435 // trust the initial BOM
2436 return QStringDecoder(encoding.value());
2437
2439 if (!encodingTag.isEmpty())
2440 return QStringDecoder(encodingTag);
2441
2442 return QStringDecoder(Utf8);
2443}
2444#endif // !QT_BOOTSTRAPPED
2445
2450{
2451 return encodingInterfaces[int(e)].name;
2452}
2453
QByteArray toByteArray() const
Definition qbytearray.h:796
\inmodule QtCore
Definition qbytearray.h:57
char * data()
\macro QT_NO_CAST_FROM_BYTEARRAY
Definition qbytearray.h:611
const char * constData() const noexcept
Returns a pointer to the const data stored in the byte array.
Definition qbytearray.h:124
void truncate(qsizetype pos)
Truncates the byte array at index position pos.
\inmodule QtCore
Q_CORE_EXPORT const char * name() const noexcept
Returns the canonical name of the encoding this QStringConverter can encode or decode.
static Q_CORE_EXPORT std::optional< Encoding > encodingForHtml(QByteArrayView data)
Tries to determine the encoding of the HTML in data by looking at leading byte order marks or a chars...
static Q_CORE_EXPORT const char * nameForEncoding(Encoding e)
Returns the canonical name for encoding e.
Encoding
\value Utf8 Create a converter to or from UTF-8 \value Utf16 Create a converter to or from UTF-16.
static Q_CORE_EXPORT QStringList availableCodecs()
Returns a list of names of supported codecs.
const Interface * iface
static Q_CORE_EXPORT std::optional< Encoding > encodingForName(const char *name) noexcept
Convert name to the corresponding \l Encoding member, if there is one.
constexpr QStringConverter() noexcept
static Q_CORE_EXPORT std::optional< Encoding > encodingForData(QByteArrayView data, char16_t expectedFirstCharacter=0) noexcept
Returns the encoding for the content of data if it can be determined.
\inmodule QtCore
static Q_CORE_EXPORT QStringDecoder decoderForHtml(QByteArrayView data)
Tries to determine the encoding of the HTML in data by looking at leading byte order marks or a chars...
constexpr QStringDecoder() noexcept
Default constructs a decoder.
\inmodule QtCore
\inmodule QtCore
Definition qstringview.h:78
QString toString() const
Returns a deep copy of this string view's data as a QString.
Definition qstring.h:1121
\macro QT_RESTRICTED_CAST_FROM_ASCII
Definition qstring.h:129
static QString fromLatin1(QByteArrayView ba)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition qstring.cpp:5871
const QChar * constData() const
Returns a pointer to the data stored in the QString.
Definition qstring.h:1246
qsizetype size() const noexcept
Returns the number of characters in this string.
Definition qstring.h:186
QChar * data()
Returns a pointer to the data stored in the QString.
Definition qstring.h:1240
@ BigEndian
Definition qsysinfo.h:29
@ ByteOrder
Definition qsysinfo.h:34
b clear()
QCursor cursor
QSet< QString >::iterator it
else opt state
[0]
Combined button and popup list for selecting options.
constexpr int qt_lencmp(qsizetype lhs, qsizetype rhs) noexcept
Definition qtools_p.h:109
constexpr char toAsciiLower(char ch) noexcept
Definition qtools_p.h:87
QTextStream & flush(QTextStream &stream)
Calls QTextStream::flush() on stream and returns stream.
CaseSensitivity
@ CaseInsensitive
@ CaseSensitive
constexpr Initialization Uninitialized
constexpr auto ssize(const C &c) -> std::common_type_t< std::ptrdiff_t, std::make_signed_t< decltype(c.size())> >
Definition q20iterator.h:33
QT_POPCOUNT_RELAXED_CONSTEXPR uint qCountLeadingZeroBits(quint32 v) noexcept
constexpr uint qCountTrailingZeroBits(quint32 v) noexcept
static void * context
#define QByteArrayLiteral(str)
Definition qbytearray.h:52
size_t qstrlen(const char *str)
constexpr QStaticByteArrayMatcher< N > qMakeStaticByteArrayMatcher(const char(&pattern)[N]) noexcept
#define Q_UNLIKELY(x)
#define Q_LIKELY(x)
#define Q_ALWAYS_INLINE
DBusConnection const char DBusError * error
static QString header(const QString &name)
typedef QByteArray(EGLAPIENTRYP PFNQGSGETDISPLAYSPROC)()
constexpr T qToBigEndian(T source)
Definition qendian.h:172
constexpr T qToLittleEndian(T source)
Definition qendian.h:176
Flags
#define qWarning
Definition qlogging.h:166
constexpr const T & qMin(const T &a, const T &b)
Definition qminmax.h:40
std::enable_if_t< std::is_unsigned_v< T >, bool > qAddOverflow(T v1, T v2, T *r)
Definition qnumeric.h:113
GLboolean GLboolean GLboolean b
GLsizei const GLfloat * v
[13]
GLboolean GLboolean GLboolean GLboolean a
[7]
GLenum GLuint GLintptr GLsizeiptr size
[1]
GLuint index
[2]
GLboolean r
[2]
GLuint GLuint end
GLenum GLuint GLenum GLsizei length
GLint GLsizei GLsizei GLenum GLenum GLsizei void * data
GLfloat GLfloat f
GLenum src
GLint left
GLenum GLenum dst
GLenum GLuint GLenum GLsizei const GLchar * buf
GLenum target
GLbitfield flags
GLuint start
GLenum GLuint GLintptr offset
GLuint name
GLint GLint GLint GLint GLint GLint GLint GLbitfield mask
GLfloat n
GLsizei GLsizei GLchar * source
GLdouble s
[6]
Definition qopenglext.h:235
GLuint res
const GLubyte * c
GLuint in
GLuint64EXT * result
[6]
GLenum GLsizei len
GLuint num
GLuint GLenum GLsizei GLsizei GLint GLint GLboolean packed
#define Q_ASSERT(cond)
Definition qrandom.cpp:47
QtPrivate::QRegularExpressionMatchIteratorRangeBasedForIterator begin(const QRegularExpressionMatchIterator &iterator)
@ HeaderDone
static const uchar utf8bom[]
static QChar * fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
static QChar * fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf8Len(qsizetype l)
static QChar * fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
static bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
static qsizetype toLatin1Len(qsizetype l)
static const uchar * simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
static bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
static QChar * fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
static char * toUtf32(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf8Len(qsizetype l)
static char * toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
static qsizetype toUtf16Len(qsizetype l)
static qsizetype fromLatin1Len(qsizetype l)
static char * toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf32Len(qsizetype l)
static qsizetype availableCodecCount()
static bool nameMatch(const char *a, const char *b)
static QChar * fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf32Len(qsizetype l)
static qsizetype fromUtf16Len(qsizetype l)
static char * toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
@ LittleEndianness
@ DetectEndianness
@ BigEndianness
#define sp
#define Q_UNUSED(x)
unsigned int quint32
Definition qtypes.h:50
unsigned char uchar
Definition qtypes.h:32
ptrdiff_t qptrdiff
Definition qtypes.h:164
ptrdiff_t qsizetype
Definition qtypes.h:165
unsigned int uint
Definition qtypes.h:34
unsigned short ushort
Definition qtypes.h:33
QT_BEGIN_NAMESPACE typedef uchar * output
Q_CHECK_PTR(a=new int[80])
QByteArray ba
[0]
QTextStream out(stdout)
[7]
QObject::connect nullptr
char * toString(const MyType &t)
[31]
static char16_t * convertToUnicode(char16_t *dst, QLatin1StringView in) noexcept
Definition qstring.cpp:5687
static char * convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
static Q_CORE_EXPORT QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness=DetectEndianness)
static Q_CORE_EXPORT QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness=DetectEndianness)
static Q_CORE_EXPORT QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness=DetectEndianness)
static QChar * convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian)
static const int Error
static const int EndOfString
static void appendUtf16(const NoOutput &, char16_t)
static void appendUcs4(const NoOutput &, char32_t)
static Q_CORE_EXPORT QByteArray convertFromUnicode(QStringView in)
static int compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs=Qt::CaseSensitive) noexcept
static QChar * convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
static ValidUtf8Result isValidUtf8(QByteArrayView in)
static Q_CORE_EXPORT char * convertFromLatin1(char *out, QLatin1StringView in)