/usr/src/gnu/usr.bin/clang/libclangLex/../../../llvm/clang/lib/Lex/LiteralSupport.cpp

Bug Summary

File:	src/gnu/usr.bin/clang/libclangLex/../../../llvm/clang/lib/Lex/LiteralSupport.cpp
Warning:	line 720, column 11 Value stored to 'HasSize' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LiteralSupport.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model static -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libclangLex/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libclangLex/../../../llvm/clang/include -I /usr/src/gnu/usr.bin/clang/libclangLex/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libclangLex/../include -I /usr/src/gnu/usr.bin/clang/libclangLex/obj -I /usr/src/gnu/usr.bin/clang/libclangLex/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libclangLex/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -stack-protector 2 -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc 
-fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libclangLex/../../../llvm/clang/lib/Lex/LiteralSupport.cpp

1	//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements the NumericLiteralParser, CharLiteralParser, and
10	// StringLiteralParser interfaces.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "clang/Lex/LiteralSupport.h"
15	#include "clang/Basic/CharInfo.h"
16	#include "clang/Basic/LangOptions.h"
17	#include "clang/Basic/SourceLocation.h"
18	#include "clang/Basic/TargetInfo.h"
19	#include "clang/Lex/LexDiagnostic.h"
20	#include "clang/Lex/Lexer.h"
21	#include "clang/Lex/Preprocessor.h"
22	#include "clang/Lex/Token.h"
23	#include "llvm/ADT/APInt.h"
24	#include "llvm/ADT/SmallVector.h"
25	#include "llvm/ADT/StringExtras.h"
26	#include "llvm/ADT/StringSwitch.h"
27	#include "llvm/Support/ConvertUTF.h"
28	#include "llvm/Support/Error.h"
29	#include "llvm/Support/ErrorHandling.h"
30	#include <algorithm>
31	#include <cassert>
32	#include <cstddef>
33	#include <cstdint>
34	#include <cstring>
35	#include <string>
36
37	using namespace clang;
38
39	static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
40	switch (kind) {
41	default: llvm_unreachable("Unknown token type!")__builtin_unreachable();
42	case tok::char_constant:
43	case tok::string_literal:
44	case tok::utf8_char_constant:
45	case tok::utf8_string_literal:
46	return Target.getCharWidth();
47	case tok::wide_char_constant:
48	case tok::wide_string_literal:
49	return Target.getWCharWidth();
50	case tok::utf16_char_constant:
51	case tok::utf16_string_literal:
52	return Target.getChar16Width();
53	case tok::utf32_char_constant:
54	case tok::utf32_string_literal:
55	return Target.getChar32Width();
56	}
57	}
58
59	static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
60	FullSourceLoc TokLoc,
61	const char *TokBegin,
62	const char *TokRangeBegin,
63	const char *TokRangeEnd) {
64	SourceLocation Begin =
65	Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
66	TokLoc.getManager(), Features);
67	SourceLocation End =
68	Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
69	TokLoc.getManager(), Features);
70	return CharSourceRange::getCharRange(Begin, End);
71	}
72
73	/// Produce a diagnostic highlighting some portion of a literal.
74	///
75	/// Emits the diagnostic \p DiagID, highlighting the range of characters from
76	/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
77	/// a substring of a spelling buffer for the token beginning at \p TokBegin.
78	static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
79	const LangOptions &Features, FullSourceLoc TokLoc,
80	const char TokBegin, const char TokRangeBegin,
81	const char *TokRangeEnd, unsigned DiagID) {
82	SourceLocation Begin =
83	Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
84	TokLoc.getManager(), Features);
85	return Diags->Report(Begin, DiagID) <<
86	MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
87	}
88
89	/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
90	/// either a character or a string literal.
91	static unsigned ProcessCharEscape(const char *ThisTokBegin,
92	const char *&ThisTokBuf,
93	const char *ThisTokEnd, bool &HadError,
94	FullSourceLoc Loc, unsigned CharWidth,
95	DiagnosticsEngine *Diags,
96	const LangOptions &Features) {
97	const char *EscapeBegin = ThisTokBuf;
98
99	// Skip the '\' char.
100	++ThisTokBuf;
101
102	// We know that this character can't be off the end of the buffer, because
103	// that would have been \", which would not have been the end of string.
104	unsigned ResultChar = *ThisTokBuf++;
105	switch (ResultChar) {
106	// These map to themselves.
107	case '\\': case '\'': case '"': case '?': break;
108
109	// These have fixed mappings.
110	case 'a':
111	// TODO: K&R: the meaning of '\\a' is different in traditional C
112	ResultChar = 7;
113	break;
114	case 'b':
115	ResultChar = 8;
116	break;
117	case 'e':
118	if (Diags)
119	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
120	diag::ext_nonstandard_escape) << "e";
121	ResultChar = 27;
122	break;
123	case 'E':
124	if (Diags)
125	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
126	diag::ext_nonstandard_escape) << "E";
127	ResultChar = 27;
128	break;
129	case 'f':
130	ResultChar = 12;
131	break;
132	case 'n':
133	ResultChar = 10;
134	break;
135	case 'r':
136	ResultChar = 13;
137	break;
138	case 't':
139	ResultChar = 9;
140	break;
141	case 'v':
142	ResultChar = 11;
143	break;
144	case 'x': { // Hex escape.
145	ResultChar = 0;
146	if (ThisTokBuf == ThisTokEnd \|\| !isHexDigit(*ThisTokBuf)) {
147	if (Diags)
148	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
149	diag::err_hex_escape_no_digits) << "x";
150	HadError = true;
151	break;
152	}
153
154	// Hex escapes are a maximal series of hex digits.
155	bool Overflow = false;
156	for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
157	int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
158	if (CharVal == -1) break;
159	// About to shift out a digit?
160	if (ResultChar & 0xF0000000)
161	Overflow = true;
162	ResultChar <<= 4;
163	ResultChar \|= CharVal;
164	}
165
166	// See if any bits will be truncated when evaluated as a character.
167	if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
168	Overflow = true;
169	ResultChar &= ~0U >> (32-CharWidth);
170	}
171
172	// Check for overflow.
173	if (Overflow && Diags) // Too many digits to fit in
174	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
175	diag::err_escape_too_large) << 0;
176	break;
177	}
178	case '0': case '1': case '2': case '3':
179	case '4': case '5': case '6': case '7': {
180	// Octal escapes.
181	--ThisTokBuf;
182	ResultChar = 0;
183
184	// Octal escapes are a series of octal digits with maximum length 3.
185	// "\0123" is a two digit sequence equal to "\012" "3".
186	unsigned NumDigits = 0;
187	do {
188	ResultChar <<= 3;
189	ResultChar \|= *ThisTokBuf++ - '0';
190	++NumDigits;
191	} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
192	ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
193
194	// Check for overflow. Reject '\777', but not L'\777'.
195	if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
196	if (Diags)
197	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
198	diag::err_escape_too_large) << 1;
199	ResultChar &= ~0U >> (32-CharWidth);
200	}
201	break;
202	}
203
204	// Otherwise, these are not valid escapes.
205	case '(': case '{': case '[': case '%':
206	// GCC accepts these as extensions. We warn about them as such though.
207	if (Diags)
208	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
209	diag::ext_nonstandard_escape)
210	<< std::string(1, ResultChar);
211	break;
212	default:
213	if (!Diags)
214	break;
215
216	if (isPrintable(ResultChar))
217	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
218	diag::ext_unknown_escape)
219	<< std::string(1, ResultChar);
220	else
221	Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
222	diag::ext_unknown_escape)
223	<< "x" + llvm::utohexstr(ResultChar);
224	break;
225	}
226
227	return ResultChar;
228	}
229
230	static void appendCodePoint(unsigned Codepoint,
231	llvm::SmallVectorImpl<char> &Str) {
232	char ResultBuf[4];
233	char *ResultPtr = ResultBuf;
234	bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
235	(void)Res;
236	assert(Res && "Unexpected conversion failure")((void)0);
237	Str.append(ResultBuf, ResultPtr);
238	}
239
240	void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
241	for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
242	if (*I != '\\') {
243	Buf.push_back(*I);
244	continue;
245	}
246
247	++I;
248	assert(I == 'u' \|\| I == 'U')((void)0);
249
250	unsigned NumHexDigits;
251	if (*I == 'u')
252	NumHexDigits = 4;
253	else
254	NumHexDigits = 8;
255
256	assert(I + NumHexDigits <= E)((void)0);
257
258	uint32_t CodePoint = 0;
259	for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
260	unsigned Value = llvm::hexDigitValue(*I);
261	assert(Value != -1U)((void)0);
262
263	CodePoint <<= 4;
264	CodePoint += Value;
265	}
266
267	appendCodePoint(CodePoint, Buf);
268	--I;
269	}
270	}
271
272	/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
273	/// return the UTF32.
274	static bool ProcessUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
275	const char *ThisTokEnd,
276	uint32_t &UcnVal, unsigned short &UcnLen,
277	FullSourceLoc Loc, DiagnosticsEngine *Diags,
278	const LangOptions &Features,
279	bool in_char_string_literal = false) {
280	const char *UcnBegin = ThisTokBuf;
281
282	// Skip the '\u' char's.
283	ThisTokBuf += 2;
284
285	if (ThisTokBuf == ThisTokEnd \|\| !isHexDigit(*ThisTokBuf)) {
286	if (Diags)
287	Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
288	diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
289	return false;
290	}
291	UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
292	unsigned short UcnLenSave = UcnLen;
293	for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
294	int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
295	if (CharVal == -1) break;
296	UcnVal <<= 4;
297	UcnVal \|= CharVal;
298	}
299	// If we didn't consume the proper number of digits, there is a problem.
300	if (UcnLenSave) {
301	if (Diags)
302	Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
303	diag::err_ucn_escape_incomplete);
304	return false;
305	}
306
307	// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
308	if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) \|\| // surrogate codepoints
309	UcnVal > 0x10FFFF) { // maximum legal UTF32 value
310	if (Diags)
311	Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
312	diag::err_ucn_escape_invalid);
313	return false;
314	}
315
316	// C++11 allows UCNs that refer to control characters and basic source
317	// characters inside character and string literals
318	if (UcnVal < 0xa0 &&
319	(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
320	bool IsError = (!Features.CPlusPlus11 \|\| !in_char_string_literal);
321	if (Diags) {
322	char BasicSCSChar = UcnVal;
323	if (UcnVal >= 0x20 && UcnVal < 0x7f)
324	Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
325	IsError ? diag::err_ucn_escape_basic_scs :
326	diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
327	<< StringRef(&BasicSCSChar, 1);
328	else
329	Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
330	IsError ? diag::err_ucn_control_character :
331	diag::warn_cxx98_compat_literal_ucn_control_character);
332	}
333	if (IsError)
334	return false;
335	}
336
337	if (!Features.CPlusPlus && !Features.C99 && Diags)
338	Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
339	diag::warn_ucn_not_valid_in_c89_literal);
340
341	return true;
342	}
343
344	/// MeasureUCNEscape - Determine the number of bytes within the resulting string
345	/// which this UCN will occupy.
346	static int MeasureUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
347	const char *ThisTokEnd, unsigned CharByteWidth,
348	const LangOptions &Features, bool &HadError) {
349	// UTF-32: 4 bytes per escape.
350	if (CharByteWidth == 4)
351	return 4;
352
353	uint32_t UcnVal = 0;
354	unsigned short UcnLen = 0;
355	FullSourceLoc Loc;
356
357	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
358	UcnLen, Loc, nullptr, Features, true)) {
359	HadError = true;
360	return 0;
361	}
362
363	// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
364	if (CharByteWidth == 2)
365	return UcnVal <= 0xFFFF ? 2 : 4;
366
367	// UTF-8.
368	if (UcnVal < 0x80)
369	return 1;
370	if (UcnVal < 0x800)
371	return 2;
372	if (UcnVal < 0x10000)
373	return 3;
374	return 4;
375	}
376
377	/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
378	/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
379	/// StringLiteralParser. When we decide to implement UCN's for identifiers,
380	/// we will likely rework our support for UCN's.
381	static void EncodeUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
382	const char *ThisTokEnd,
383	char *&ResultBuf, bool &HadError,
384	FullSourceLoc Loc, unsigned CharByteWidth,
385	DiagnosticsEngine *Diags,
386	const LangOptions &Features) {
387	typedef uint32_t UTF32;
388	UTF32 UcnVal = 0;
389	unsigned short UcnLen = 0;
390	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
391	Loc, Diags, Features, true)) {
392	HadError = true;
393	return;
394	}
395
396	assert((CharByteWidth == 1 \|\| CharByteWidth == 2 \|\| CharByteWidth == 4) &&((void)0)
397	"only character widths of 1, 2, or 4 bytes supported")((void)0);
398
399	(void)UcnLen;
400	assert((UcnLen== 4 \|\| UcnLen== 8) && "only ucn length of 4 or 8 supported")((void)0);
401
402	if (CharByteWidth == 4) {
403	// FIXME: Make the type of the result buffer correct instead of
404	// using reinterpret_cast.
405	llvm::UTF32 ResultPtr = reinterpret_cast<llvm::UTF32>(ResultBuf);
406	*ResultPtr = UcnVal;
407	ResultBuf += 4;
408	return;
409	}
410
411	if (CharByteWidth == 2) {
412	// FIXME: Make the type of the result buffer correct instead of
413	// using reinterpret_cast.
414	llvm::UTF16 ResultPtr = reinterpret_cast<llvm::UTF16>(ResultBuf);
415
416	if (UcnVal <= (UTF32)0xFFFF) {
417	*ResultPtr = UcnVal;
418	ResultBuf += 2;
419	return;
420	}
421
422	// Convert to UTF16.
423	UcnVal -= 0x10000;
424	*ResultPtr = 0xD800 + (UcnVal >> 10);
425	*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
426	ResultBuf += 4;
427	return;
428	}
429
430	assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters")((void)0);
431
432	// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
433	// The conversion below was inspired by:
434	// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
435	// First, we determine how many bytes the result will require.
436	typedef uint8_t UTF8;
437
438	unsigned short bytesToWrite = 0;
439	if (UcnVal < (UTF32)0x80)
440	bytesToWrite = 1;
441	else if (UcnVal < (UTF32)0x800)
442	bytesToWrite = 2;
443	else if (UcnVal < (UTF32)0x10000)
444	bytesToWrite = 3;
445	else
446	bytesToWrite = 4;
447
448	const unsigned byteMask = 0xBF;
449	const unsigned byteMark = 0x80;
450
451	// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
452	// into the first byte, depending on how many bytes follow.
453	static const UTF8 firstByteMark[5] = {
454	0x00, 0x00, 0xC0, 0xE0, 0xF0
455	};
456	// Finally, we write the bytes into ResultBuf.
457	ResultBuf += bytesToWrite;
458	switch (bytesToWrite) { // note: everything falls through.
459	case 4:
460	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
461	LLVM_FALLTHROUGH[[gnu::fallthrough]];
462	case 3:
463	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
464	LLVM_FALLTHROUGH[[gnu::fallthrough]];
465	case 2:
466	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
467	LLVM_FALLTHROUGH[[gnu::fallthrough]];
468	case 1:
469	*--ResultBuf = (UTF8) (UcnVal \| firstByteMark[bytesToWrite]);
470	}
471	// Update the buffer.
472	ResultBuf += bytesToWrite;
473	}
474
475	/// integer-constant: [C99 6.4.4.1]
476	/// decimal-constant integer-suffix
477	/// octal-constant integer-suffix
478	/// hexadecimal-constant integer-suffix
479	/// binary-literal integer-suffix [GNU, C++1y]
480	/// user-defined-integer-literal: [C++11 lex.ext]
481	/// decimal-literal ud-suffix
482	/// octal-literal ud-suffix
483	/// hexadecimal-literal ud-suffix
484	/// binary-literal ud-suffix [GNU, C++1y]
485	/// decimal-constant:
486	/// nonzero-digit
487	/// decimal-constant digit
488	/// octal-constant:
489	/// 0
490	/// octal-constant octal-digit
491	/// hexadecimal-constant:
492	/// hexadecimal-prefix hexadecimal-digit
493	/// hexadecimal-constant hexadecimal-digit
494	/// hexadecimal-prefix: one of
495	/// 0x 0X
496	/// binary-literal:
497	/// 0b binary-digit
498	/// 0B binary-digit
499	/// binary-literal binary-digit
500	/// integer-suffix:
501	/// unsigned-suffix [long-suffix]
502	/// unsigned-suffix [long-long-suffix]
503	/// long-suffix [unsigned-suffix]
504	/// long-long-suffix [unsigned-suffix]
505	/// nonzero-digit:
506	/// 1 2 3 4 5 6 7 8 9
507	/// octal-digit:
508	/// 0 1 2 3 4 5 6 7
509	/// hexadecimal-digit:
510	/// 0 1 2 3 4 5 6 7 8 9
511	/// a b c d e f
512	/// A B C D E F
513	/// binary-digit:
514	/// 0
515	/// 1
516	/// unsigned-suffix: one of
517	/// u U
518	/// long-suffix: one of
519	/// l L
520	/// long-long-suffix: one of
521	/// ll LL
522	///
523	/// floating-constant: [C99 6.4.4.2]
524	/// TODO: add rules...
525	///
526	NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
527	SourceLocation TokLoc,
528	const SourceManager &SM,
529	const LangOptions &LangOpts,
530	const TargetInfo &Target,
531	DiagnosticsEngine &Diags)
532	: SM(SM), LangOpts(LangOpts), Diags(Diags),
533	ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
534
535	// This routine assumes that the range begin/end matches the regex for integer
536	// and FP constants (specifically, the 'pp-number' regex), and assumes that
537	// the byte at "*end" is both valid and not part of the regex. Because of
538	// this, it doesn't have to check for 'overscan' in various places.
539	assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?")((void)0);
540
541	s = DigitsBegin = ThisTokBegin;
542	saw_exponent = false;
543	saw_period = false;
544	saw_ud_suffix = false;
545	saw_fixed_point_suffix = false;
546	isLong = false;
547	isUnsigned = false;
548	isLongLong = false;
549	isSizeT = false;
550	isHalf = false;
551	isFloat = false;
552	isImaginary = false;
553	isFloat16 = false;
554	isFloat128 = false;
555	MicrosoftInteger = 0;
556	isFract = false;
557	isAccum = false;
558	hadError = false;
559
560	if (*s == '0') { // parse radix
561	ParseNumberStartingWithZero(TokLoc);
562	if (hadError)
563	return;
564	} else { // the first digit is non-zero
565	radix = 10;
566	s = SkipDigits(s);
567	if (s == ThisTokEnd) {
568	// Done.
569	} else {
570	ParseDecimalOrOctalCommon(TokLoc);
571	if (hadError)
572	return;
573	}
574	}
575
576	SuffixBegin = s;
577	checkSeparator(TokLoc, s, CSK_AfterDigits);
578
579	// Initial scan to lookahead for fixed point suffix.
580	if (LangOpts.FixedPoint) {
581	for (const char *c = s; c != ThisTokEnd; ++c) {
582	if (c == 'r' \|\| c == 'k' \|\| c == 'R' \|\| c == 'K') {
583	saw_fixed_point_suffix = true;
584	break;
585	}
586	}
587	}
588
589	// Parse the suffix. At this point we can classify whether we have an FP or
590	// integer constant.
591	bool isFixedPointConstant = isFixedPointLiteral();
592	bool isFPConstant = isFloatingLiteral();
593	bool HasSize = false;
594
595	// Loop over all of the characters of the suffix. If we see something bad,
596	// we break out of the loop.
597	for (; s != ThisTokEnd; ++s) {
598	switch (*s) {
599	case 'R':
600	case 'r':
601	if (!LangOpts.FixedPoint)
602	break;
603	if (isFract \|\| isAccum) break;
604	if (!(saw_period \|\| saw_exponent)) break;
605	isFract = true;
606	continue;
607	case 'K':
608	case 'k':
609	if (!LangOpts.FixedPoint)
610	break;
611	if (isFract \|\| isAccum) break;
612	if (!(saw_period \|\| saw_exponent)) break;
613	isAccum = true;
614	continue;
615	case 'h': // FP Suffix for "half".
616	case 'H':
617	// OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
618	if (!(LangOpts.Half \|\| LangOpts.FixedPoint))
619	break;
620	if (isIntegerLiteral()) break; // Error for integer constant.
621	if (HasSize)
622	break;
623	HasSize = true;
624	isHalf = true;
625	continue; // Success.
626	case 'f': // FP Suffix for "float"
627	case 'F':
628	if (!isFPConstant) break; // Error for integer constant.
629	if (HasSize)
630	break;
631	HasSize = true;
632
633	// CUDA host and device may have different _Float16 support, therefore
634	// allows f16 literals to avoid false alarm.
635	// ToDo: more precise check for CUDA.
636	if ((Target.hasFloat16Type() \|\| LangOpts.CUDA) && s + 2 < ThisTokEnd &&
637	s[1] == '1' && s[2] == '6') {
638	s += 2; // success, eat up 2 characters.
639	isFloat16 = true;
640	continue;
641	}
642
643	isFloat = true;
644	continue; // Success.
645	case 'q': // FP Suffix for "__float128"
646	case 'Q':
647	if (!isFPConstant) break; // Error for integer constant.
648	if (HasSize)
649	break;
650	HasSize = true;
651	isFloat128 = true;
652	continue; // Success.
653	case 'u':
654	case 'U':
655	if (isFPConstant) break; // Error for floating constant.
656	if (isUnsigned) break; // Cannot be repeated.
657	isUnsigned = true;
658	continue; // Success.
659	case 'l':
660	case 'L':
661	if (HasSize)
662	break;
663	HasSize = true;
664
665	// Check for long long. The L's need to be adjacent and the same case.
666	if (s[1] == s[0]) {
667	assert(s + 1 < ThisTokEnd && "didn't maximally munch?")((void)0);
668	if (isFPConstant) break; // long long invalid for floats.
669	isLongLong = true;
670	++s; // Eat both of them.
671	} else {
672	isLong = true;
673	}
674	continue; // Success.
675	case 'z':
676	case 'Z':
677	if (isFPConstant)
678	break; // Invalid for floats.
679	if (HasSize)
680	break;
681	HasSize = true;
682	isSizeT = true;
683	continue;
684	case 'i':
685	case 'I':
686	if (LangOpts.MicrosoftExt && !isFPConstant) {
687	// Allow i8, i16, i32, and i64. First, look ahead and check if
688	// suffixes are Microsoft integers and not the imaginary unit.
689	uint8_t Bits = 0;
690	size_t ToSkip = 0;
691	switch (s[1]) {
692	case '8': // i8 suffix
693	Bits = 8;
694	ToSkip = 2;
695	break;
696	case '1':
697	if (s[2] == '6') { // i16 suffix
698	Bits = 16;
699	ToSkip = 3;
700	}
701	break;
702	case '3':
703	if (s[2] == '2') { // i32 suffix
704	Bits = 32;
705	ToSkip = 3;
706	}
707	break;
708	case '6':
709	if (s[2] == '4') { // i64 suffix
710	Bits = 64;
711	ToSkip = 3;
712	}
713	break;
714	default:
715	break;
716	}
717	if (Bits) {
718	if (HasSize)
719	break;
720	HasSize = true;
	Value stored to 'HasSize' is never read
721	MicrosoftInteger = Bits;
722	s += ToSkip;
723	assert(s <= ThisTokEnd && "didn't maximally munch?")((void)0);
724	break;
725	}
726	}
727	LLVM_FALLTHROUGH[[gnu::fallthrough]];
728	case 'j':
729	case 'J':
730	if (isImaginary) break; // Cannot be repeated.
731	isImaginary = true;
732	continue; // Success.
733	}
734	// If we reached here, there was an error or a ud-suffix.
735	break;
736	}
737
738	// "i", "if", and "il" are user-defined suffixes in C++1y.
739	if (s != ThisTokEnd \|\| isImaginary) {
740	// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
741	expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
742	if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
743	if (!isImaginary) {
744	// Any suffix pieces we might have parsed are actually part of the
745	// ud-suffix.
746	isLong = false;
747	isUnsigned = false;
748	isLongLong = false;
749	isSizeT = false;
750	isFloat = false;
751	isFloat16 = false;
752	isHalf = false;
753	isImaginary = false;
754	MicrosoftInteger = 0;
755	saw_fixed_point_suffix = false;
756	isFract = false;
757	isAccum = false;
758	}
759
760	saw_ud_suffix = true;
761	return;
762	}
763
764	if (s != ThisTokEnd) {
765	// Report an error if there are any.
766	Diags.Report(Lexer::AdvanceToTokenCharacter(
767	TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
768	diag::err_invalid_suffix_constant)
769	<< StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
770	<< (isFixedPointConstant ? 2 : isFPConstant);
771	hadError = true;
772	}
773	}
774
775	if (!hadError && saw_fixed_point_suffix) {
776	assert(isFract \|\| isAccum)((void)0);
777	}
778	}
779
780	/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
781	/// numbers. It issues an error for illegal digits, and handles floating point
782	/// parsing. If it detects a floating point number, the radix is set to 10.
783	void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
784	assert((radix == 8 \|\| radix == 10) && "Unexpected radix")((void)0);
785
786	// If we have a hex digit other than 'e' (which denotes a FP exponent) then
787	// the code is using an incorrect base.
788	if (isHexDigit(s) && s != 'e' && *s != 'E' &&
789	!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
790	Diags.Report(
791	Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
792	diag::err_invalid_digit)
793	<< StringRef(s, 1) << (radix == 8 ? 1 : 0);
794	hadError = true;
795	return;
796	}
797
798	if (*s == '.') {
799	checkSeparator(TokLoc, s, CSK_AfterDigits);
800	s++;
801	radix = 10;
802	saw_period = true;
803	checkSeparator(TokLoc, s, CSK_BeforeDigits);
804	s = SkipDigits(s); // Skip suffix.
805	}
806	if (s == 'e' \|\| s == 'E') { // exponent
807	checkSeparator(TokLoc, s, CSK_AfterDigits);
808	const char *Exponent = s;
809	s++;
810	radix = 10;
811	saw_exponent = true;
812	if (s != ThisTokEnd && (s == '+' \|\| s == '-')) s++; // sign
813	const char *first_non_digit = SkipDigits(s);
814	if (containsDigits(s, first_non_digit)) {
815	checkSeparator(TokLoc, s, CSK_BeforeDigits);
816	s = first_non_digit;
817	} else {
818	if (!hadError) {
819	Diags.Report(Lexer::AdvanceToTokenCharacter(
820	TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
821	diag::err_exponent_has_no_digits);
822	hadError = true;
823	}
824	return;
825	}
826	}
827	}
828
829	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
830	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
831	/// treat it as an invalid suffix.
832	bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
833	StringRef Suffix) {
834	if (!LangOpts.CPlusPlus11 \|\| Suffix.empty())
835	return false;
836
837	// By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
838	if (Suffix[0] == '_')
839	return true;
840
841	// In C++11, there are no library suffixes.
842	if (!LangOpts.CPlusPlus14)
843	return false;
844
845	// In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
846	// Per tweaked N3660, "il", "i", and "if" are also used in the library.
847	// In C++2a "d" and "y" are used in the library.
848	return llvm::StringSwitch<bool>(Suffix)
849	.Cases("h", "min", "s", true)
850	.Cases("ms", "us", "ns", true)
851	.Cases("il", "i", "if", true)
852	.Cases("d", "y", LangOpts.CPlusPlus20)
853	.Default(false);
854	}
855
856	void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
857	const char *Pos,
858	CheckSeparatorKind IsAfterDigits) {
859	if (IsAfterDigits == CSK_AfterDigits) {
860	if (Pos == ThisTokBegin)
861	return;
862	--Pos;
863	} else if (Pos == ThisTokEnd)
864	return;
865
866	if (isDigitSeparator(*Pos)) {
867	Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
868	LangOpts),
869	diag::err_digit_separator_not_between_digits)
870	<< IsAfterDigits;
871	hadError = true;
872	}
873	}
874
875	/// ParseNumberStartingWithZero - This method is called when the first character
876	/// of the number is found to be a zero. This means it is either an octal
877	/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
878	/// a floating point number (01239.123e4). Eat the prefix, determining the
879	/// radix etc.
880	void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
881	assert(s[0] == '0' && "Invalid method call")((void)0);
882	s++;
883
884	int c1 = s[0];
885
886	// Handle a hex number like 0x1234.
887	if ((c1 == 'x' \|\| c1 == 'X') && (isHexDigit(s[1]) \|\| s[1] == '.')) {
888	s++;
889	assert(s < ThisTokEnd && "didn't maximally munch?")((void)0);
890	radix = 16;
891	DigitsBegin = s;
892	s = SkipHexDigits(s);
893	bool HasSignificandDigits = containsDigits(DigitsBegin, s);
894	if (s == ThisTokEnd) {
895	// Done.
896	} else if (*s == '.') {
897	s++;
898	saw_period = true;
899	const char *floatDigitsBegin = s;
900	s = SkipHexDigits(s);
901	if (containsDigits(floatDigitsBegin, s))
902	HasSignificandDigits = true;
903	if (HasSignificandDigits)
904	checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
905	}
906
907	if (!HasSignificandDigits) {
908	Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
909	LangOpts),
910	diag::err_hex_constant_requires)
911	<< LangOpts.CPlusPlus << 1;
912	hadError = true;
913	return;
914	}
915
916	// A binary exponent can appear with or without a '.'. If dotted, the
917	// binary exponent is required.
918	if (s == 'p' \|\| s == 'P') {
919	checkSeparator(TokLoc, s, CSK_AfterDigits);
920	const char *Exponent = s;
921	s++;
922	saw_exponent = true;
923	if (s != ThisTokEnd && (s == '+' \|\| s == '-')) s++; // sign
924	const char *first_non_digit = SkipDigits(s);
925	if (!containsDigits(s, first_non_digit)) {
926	if (!hadError) {
927	Diags.Report(Lexer::AdvanceToTokenCharacter(
928	TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
929	diag::err_exponent_has_no_digits);
930	hadError = true;
931	}
932	return;
933	}
934	checkSeparator(TokLoc, s, CSK_BeforeDigits);
935	s = first_non_digit;
936
937	if (!LangOpts.HexFloats)
938	Diags.Report(TokLoc, LangOpts.CPlusPlus
939	? diag::ext_hex_literal_invalid
940	: diag::ext_hex_constant_invalid);
941	else if (LangOpts.CPlusPlus17)
942	Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
943	} else if (saw_period) {
944	Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
945	LangOpts),
946	diag::err_hex_constant_requires)
947	<< LangOpts.CPlusPlus << 0;
948	hadError = true;
949	}
950	return;
951	}
952
953	// Handle simple binary numbers 0b01010
954	if ((c1 == 'b' \|\| c1 == 'B') && (s[1] == '0' \|\| s[1] == '1')) {
955	// 0b101010 is a C++1y / GCC extension.
956	Diags.Report(TokLoc, LangOpts.CPlusPlus14
957	? diag::warn_cxx11_compat_binary_literal
958	: LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
959	: diag::ext_binary_literal);
960	++s;
961	assert(s < ThisTokEnd && "didn't maximally munch?")((void)0);
962	radix = 2;
963	DigitsBegin = s;
964	s = SkipBinaryDigits(s);
965	if (s == ThisTokEnd) {
966	// Done.
967	} else if (isHexDigit(*s) &&
968	!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
969	Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
970	LangOpts),
971	diag::err_invalid_digit)
972	<< StringRef(s, 1) << 2;
973	hadError = true;
974	}
975	// Other suffixes will be diagnosed by the caller.
976	return;
977	}
978
979	// For now, the radix is set to 8. If we discover that we have a
980	// floating point constant, the radix will change to 10. Octal floating
981	// point constants are not permitted (only decimal and hexadecimal).
982	radix = 8;
983	DigitsBegin = s;
984	s = SkipOctalDigits(s);
985	if (s == ThisTokEnd)
986	return; // Done, simple octal number like 01234
987
988	// If we have some other non-octal digit that is a decimal digit, see if
989	// this is part of a floating point number like 094.123 or 09e1.
990	if (isDigit(*s)) {
991	const char *EndDecimal = SkipDigits(s);
992	if (EndDecimal[0] == '.' \|\| EndDecimal[0] == 'e' \|\| EndDecimal[0] == 'E') {
993	s = EndDecimal;
994	radix = 10;
995	}
996	}
997
998	ParseDecimalOrOctalCommon(TokLoc);
999	}
1000
1001	static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1002	switch (Radix) {
1003	case 2:
1004	return NumDigits <= 64;
1005	case 8:
1006	return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1007	case 10:
1008	return NumDigits <= 19; // floor(log10(2^64))
1009	case 16:
1010	return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1011	default:
1012	llvm_unreachable("impossible Radix")__builtin_unreachable();
1013	}
1014	}
1015
1016	/// GetIntegerValue - Convert this numeric literal value to an APInt that
1017	/// matches Val's input width. If there is an overflow, set Val to the low bits
1018	/// of the result and return true. Otherwise, return false.
1019	bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1020	// Fast path: Compute a conservative bound on the maximum number of
1021	// bits per digit in this radix. If we can't possibly overflow a
1022	// uint64 based on that bound then do the simple conversion to
1023	// integer. This avoids the expensive overflow checking below, and
1024	// handles the common cases that matter (small decimal integers and
1025	// hex/octal values which don't overflow).
1026	const unsigned NumDigits = SuffixBegin - DigitsBegin;
1027	if (alwaysFitsInto64Bits(radix, NumDigits)) {
1028	uint64_t N = 0;
1029	for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1030	if (!isDigitSeparator(*Ptr))
1031	N = N * radix + llvm::hexDigitValue(*Ptr);
1032
1033	// This will truncate the value to Val's input width. Simply check
1034	// for overflow by comparing.
1035	Val = N;
1036	return Val.getZExtValue() != N;
1037	}
1038
1039	Val = 0;
1040	const char *Ptr = DigitsBegin;
1041
1042	llvm::APInt RadixVal(Val.getBitWidth(), radix);
1043	llvm::APInt CharVal(Val.getBitWidth(), 0);
1044	llvm::APInt OldVal = Val;
1045
1046	bool OverflowOccurred = false;
1047	while (Ptr < SuffixBegin) {
1048	if (isDigitSeparator(*Ptr)) {
1049	++Ptr;
1050	continue;
1051	}
1052
1053	unsigned C = llvm::hexDigitValue(*Ptr++);
1054
1055	// If this letter is out of bound for this radix, reject it.
1056	assert(C < radix && "NumericLiteralParser ctor should have rejected this")((void)0);
1057
1058	CharVal = C;
1059
1060	// Add the digit to the value in the appropriate radix. If adding in digits
1061	// made the value smaller, then this overflowed.
1062	OldVal = Val;
1063
1064	// Multiply by radix, did overflow occur on the multiply?
1065	Val *= RadixVal;
1066	OverflowOccurred \|= Val.udiv(RadixVal) != OldVal;
1067
1068	// Add value, did overflow occur on the value?
1069	// (a + b) ult b <=> overflow
1070	Val += CharVal;
1071	OverflowOccurred \|= Val.ult(CharVal);
1072	}
1073	return OverflowOccurred;
1074	}
1075
1076	llvm::APFloat::opStatus
1077	NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1078	using llvm::APFloat;
1079
1080	unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1081
1082	llvm::SmallString<16> Buffer;
1083	StringRef Str(ThisTokBegin, n);
1084	if (Str.find('\'') != StringRef::npos) {
1085	Buffer.reserve(n);
1086	std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1087	&isDigitSeparator);
1088	Str = Buffer;
1089	}
1090
1091	auto StatusOrErr =
1092	Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1093	assert(StatusOrErr && "Invalid floating point representation")((void)0);
1094	return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1095	: APFloat::opInvalidOp;
1096	}
1097
/// Return true if \p c is one of the exponent markers 'p', 'P', 'e' or 'E'.
static inline bool IsExponentPart(char c) {
  return c == 'p' || c == 'P' || c == 'e' || c == 'E';
}
1101
1102	bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1103	assert(radix == 16 \|\| radix == 10)((void)0);
1104
1105	// Find how many digits are needed to store the whole literal.
1106	unsigned NumDigits = SuffixBegin - DigitsBegin;
1107	if (saw_period) --NumDigits;
1108
1109	// Initial scan of the exponent if it exists
1110	bool ExpOverflowOccurred = false;
1111	bool NegativeExponent = false;
1112	const char *ExponentBegin;
1113	uint64_t Exponent = 0;
1114	int64_t BaseShift = 0;
1115	if (saw_exponent) {
1116	const char *Ptr = DigitsBegin;
1117
1118	while (!IsExponentPart(*Ptr)) ++Ptr;
1119	ExponentBegin = Ptr;
1120	++Ptr;
1121	NegativeExponent = *Ptr == '-';
1122	if (NegativeExponent) ++Ptr;
1123
1124	unsigned NumExpDigits = SuffixBegin - Ptr;
1125	if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1126	llvm::StringRef ExpStr(Ptr, NumExpDigits);
1127	llvm::APInt ExpInt(/numBits=/64, ExpStr, /radix=/10);
1128	Exponent = ExpInt.getZExtValue();
1129	} else {
1130	ExpOverflowOccurred = true;
1131	}
1132
1133	if (NegativeExponent) BaseShift -= Exponent;
1134	else BaseShift += Exponent;
1135	}
1136
1137	// Number of bits needed for decimal literal is
1138	// ceil(NumDigits * log2(10)) Integral part
1139	// + Scale Fractional part
1140	// + ceil(Exponent * log2(10)) Exponent
1141	// --------------------------------------------------
1142	// ceil((NumDigits + Exponent) * log2(10)) + Scale
1143	//
1144	// But for simplicity in handling integers, we can round up log2(10) to 4,
1145	// making:
1146	// 4 * (NumDigits + Exponent) + Scale
1147	//
1148	// Number of digits needed for hexadecimal literal is
1149	// 4 * NumDigits Integral part
1150	// + Scale Fractional part
1151	// + Exponent Exponent
1152	// --------------------------------------------------
1153	// (4 * NumDigits) + Scale + Exponent
1154	uint64_t NumBitsNeeded;
1155	if (radix == 10)
1156	NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1157	else
1158	NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1159
1160	if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1161	ExpOverflowOccurred = true;
1162	llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /isSigned=/false);
1163
1164	bool FoundDecimal = false;
1165
1166	int64_t FractBaseShift = 0;
1167	const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1168	for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1169	if (*Ptr == '.') {
1170	FoundDecimal = true;
1171	continue;
1172	}
1173
1174	// Normal reading of an integer
1175	unsigned C = llvm::hexDigitValue(*Ptr);
1176	assert(C < radix && "NumericLiteralParser ctor should have rejected this")((void)0);
1177
1178	Val *= radix;
1179	Val += C;
1180
1181	if (FoundDecimal)
1182	// Keep track of how much we will need to adjust this value by from the
1183	// number of digits past the radix point.
1184	--FractBaseShift;
1185	}
1186
1187	// For a radix of 16, we will be multiplying by 2 instead of 16.
1188	if (radix == 16) FractBaseShift *= 4;
1189	BaseShift += FractBaseShift;
1190
1191	Val <<= Scale;
1192
1193	uint64_t Base = (radix == 16) ? 2 : 10;
1194	if (BaseShift > 0) {
1195	for (int64_t i = 0; i < BaseShift; ++i) {
1196	Val *= Base;
1197	}
1198	} else if (BaseShift < 0) {
1199	for (int64_t i = BaseShift; i < 0 && !Val.isNullValue(); ++i)
1200	Val = Val.udiv(Base);
1201	}
1202
1203	bool IntOverflowOccurred = false;
1204	auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1205	if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1206	IntOverflowOccurred \|= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1207	StoreVal = Val.trunc(StoreVal.getBitWidth());
1208	} else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1209	IntOverflowOccurred \|= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1210	StoreVal = Val.zext(StoreVal.getBitWidth());
1211	} else {
1212	StoreVal = Val;
1213	}
1214
1215	return IntOverflowOccurred \|\| ExpOverflowOccurred;
1216	}
1217
1218	/// \verbatim
1219	/// user-defined-character-literal: [C++11 lex.ext]
1220	/// character-literal ud-suffix
1221	/// ud-suffix:
1222	/// identifier
1223	/// character-literal: [C++11 lex.ccon]
1224	/// ' c-char-sequence '
1225	/// u' c-char-sequence '
1226	/// U' c-char-sequence '
1227	/// L' c-char-sequence '
1228	/// u8' c-char-sequence ' [C++1z lex.ccon]
1229	/// c-char-sequence:
1230	/// c-char
1231	/// c-char-sequence c-char
1232	/// c-char:
1233	/// any member of the source character set except the single-quote ',
1234	/// backslash \, or new-line character
1235	/// escape-sequence
1236	/// universal-character-name
1237	/// escape-sequence:
1238	/// simple-escape-sequence
1239	/// octal-escape-sequence
1240	/// hexadecimal-escape-sequence
1241	/// simple-escape-sequence:
1242	/// one of \' \" \? \\ \a \b \f \n \r \t \v
1243	/// octal-escape-sequence:
1244	/// \ octal-digit
1245	/// \ octal-digit octal-digit
1246	/// \ octal-digit octal-digit octal-digit
1247	/// hexadecimal-escape-sequence:
1248	/// \x hexadecimal-digit
1249	/// hexadecimal-escape-sequence hexadecimal-digit
1250	/// universal-character-name: [C++11 lex.charset]
1251	/// \u hex-quad
1252	/// \U hex-quad hex-quad
1253	/// hex-quad:
1254	/// hex-digit hex-digit hex-digit hex-digit
1255	/// \endverbatim
1256	///
1257	CharLiteralParser::CharLiteralParser(const char begin, const char end,
1258	SourceLocation Loc, Preprocessor &PP,
1259	tok::TokenKind kind) {
1260	// At this point we know that the character matches the regex "(L\|u\|U)?'.*'".
1261	HadError = false;
1262
1263	Kind = kind;
1264
1265	const char *TokBegin = begin;
1266
1267	// Skip over wide character determinant.
1268	if (Kind != tok::char_constant)
1269	++begin;
1270	if (Kind == tok::utf8_char_constant)
1271	++begin;
1272
1273	// Skip over the entry quote.
1274	assert(begin[0] == '\'' && "Invalid token lexed")((void)0);
1275	++begin;
1276
1277	// Remove an optional ud-suffix.
1278	if (end[-1] != '\'') {
1279	const char *UDSuffixEnd = end;
1280	do {
1281	--end;
1282	} while (end[-1] != '\'');
1283	// FIXME: Don't bother with this if !tok.hasUCN().
1284	expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1285	UDSuffixOffset = end - TokBegin;
1286	}
1287
1288	// Trim the ending quote.
1289	assert(end != begin && "Invalid token lexed")((void)0);
1290	--end;
1291
1292	// FIXME: The "Value" is an uint64_t so we can handle char literals of
1293	// up to 64-bits.
1294	// FIXME: This extensively assumes that 'char' is 8-bits.
1295	assert(PP.getTargetInfo().getCharWidth() == 8 &&((void)0)
1296	"Assumes char is 8 bits")((void)0);
1297	assert(PP.getTargetInfo().getIntWidth() <= 64 &&((void)0)
1298	(PP.getTargetInfo().getIntWidth() & 7) == 0 &&((void)0)
1299	"Assumes sizeof(int) on target is <= 64 and a multiple of char")((void)0);
1300	assert(PP.getTargetInfo().getWCharWidth() <= 64 &&((void)0)
1301	"Assumes sizeof(wchar) on target is <= 64")((void)0);
1302
1303	SmallVector<uint32_t, 4> codepoint_buffer;
1304	codepoint_buffer.resize(end - begin);
1305	uint32_t *buffer_begin = &codepoint_buffer.front();
1306	uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1307
1308	// Unicode escapes representing characters that cannot be correctly
1309	// represented in a single code unit are disallowed in character literals
1310	// by this implementation.
1311	uint32_t largest_character_for_kind;
1312	if (tok::wide_char_constant == Kind) {
1313	largest_character_for_kind =
1314	0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1315	} else if (tok::utf8_char_constant == Kind) {
1316	largest_character_for_kind = 0x7F;
1317	} else if (tok::utf16_char_constant == Kind) {
1318	largest_character_for_kind = 0xFFFF;
1319	} else if (tok::utf32_char_constant == Kind) {
1320	largest_character_for_kind = 0x10FFFF;
1321	} else {
1322	largest_character_for_kind = 0x7Fu;
1323	}
1324
1325	while (begin != end) {
1326	// Is this a span of non-escape characters?
1327	if (begin[0] != '\\') {
1328	char const *start = begin;
1329	do {
1330	++begin;
1331	} while (begin != end && *begin != '\\');
1332
1333	char const *tmp_in_start = start;
1334	uint32_t *tmp_out_start = buffer_begin;
1335	llvm::ConversionResult res =
1336	llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1337	reinterpret_cast<llvm::UTF8 const *>(begin),
1338	&buffer_begin, buffer_end, llvm::strictConversion);
1339	if (res != llvm::conversionOK) {
1340	// If we see bad encoding for unprefixed character literals, warn and
1341	// simply copy the byte values, for compatibility with gcc and
1342	// older versions of clang.
1343	bool NoErrorOnBadEncoding = isAscii();
1344	unsigned Msg = diag::err_bad_character_encoding;
1345	if (NoErrorOnBadEncoding)
1346	Msg = diag::warn_bad_character_encoding;
1347	PP.Diag(Loc, Msg);
1348	if (NoErrorOnBadEncoding) {
1349	start = tmp_in_start;
1350	buffer_begin = tmp_out_start;
1351	for (; start != begin; ++start, ++buffer_begin)
1352	buffer_begin = static_cast<uint8_t>(start);
1353	} else {
1354	HadError = true;
1355	}
1356	} else {
1357	for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1358	if (*tmp_out_start > largest_character_for_kind) {
1359	HadError = true;
1360	PP.Diag(Loc, diag::err_character_too_large);
1361	}
1362	}
1363	}
1364
1365	continue;
1366	}
1367	// Is this a Universal Character Name escape?
1368	if (begin[1] == 'u' \|\| begin[1] == 'U') {
1369	unsigned short UcnLen = 0;
1370	if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1371	FullSourceLoc(Loc, PP.getSourceManager()),
1372	&PP.getDiagnostics(), PP.getLangOpts(), true)) {
1373	HadError = true;
1374	} else if (*buffer_begin > largest_character_for_kind) {
1375	HadError = true;
1376	PP.Diag(Loc, diag::err_character_too_large);
1377	}
1378
1379	++buffer_begin;
1380	continue;
1381	}
1382	unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1383	uint64_t result =
1384	ProcessCharEscape(TokBegin, begin, end, HadError,
1385	FullSourceLoc(Loc,PP.getSourceManager()),
1386	CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1387	*buffer_begin++ = result;
1388	}
1389
1390	unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1391
1392	if (NumCharsSoFar > 1) {
1393	if (isWide())
1394	PP.Diag(Loc, diag::warn_extraneous_char_constant);
1395	else if (isAscii() && NumCharsSoFar == 4)
1396	PP.Diag(Loc, diag::warn_four_char_character_literal);
1397	else if (isAscii())
1398	PP.Diag(Loc, diag::warn_multichar_character_literal);
1399	else
1400	PP.Diag(Loc, diag::err_multichar_utf_character_literal);
1401	IsMultiChar = true;
1402	} else {
1403	IsMultiChar = false;
1404	}
1405
1406	llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1407
1408	// Narrow character literals act as though their value is concatenated
1409	// in this implementation, but warn on overflow.
1410	bool multi_char_too_long = false;
1411	if (isAscii() && isMultiChar()) {
1412	LitVal = 0;
1413	for (size_t i = 0; i < NumCharsSoFar; ++i) {
1414	// check for enough leading zeros to shift into
1415	multi_char_too_long \|= (LitVal.countLeadingZeros() < 8);
1416	LitVal <<= 8;
1417	LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1418	}
1419	} else if (NumCharsSoFar > 0) {
1420	// otherwise just take the last character
1421	LitVal = buffer_begin[-1];
1422	}
1423
1424	if (!HadError && multi_char_too_long) {
1425	PP.Diag(Loc, diag::warn_char_constant_too_large);
1426	}
1427
1428	// Transfer the value from APInt to uint64_t
1429	Value = LitVal.getZExtValue();
1430
1431	// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1432	// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1433	// character constants are not sign extended in the this implementation:
1434	// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1435	if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
1436	PP.getLangOpts().CharIsSigned)
1437	Value = (signed char)Value;
1438	}
1439
1440	/// \verbatim
1441	/// string-literal: [C++0x lex.string]
1442	/// encoding-prefix " [s-char-sequence] "
1443	/// encoding-prefix R raw-string
1444	/// encoding-prefix:
1445	/// u8
1446	/// u
1447	/// U
1448	/// L
1449	/// s-char-sequence:
1450	/// s-char
1451	/// s-char-sequence s-char
1452	/// s-char:
1453	/// any member of the source character set except the double-quote ",
1454	/// backslash \, or new-line character
1455	/// escape-sequence
1456	/// universal-character-name
1457	/// raw-string:
1458	/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1459	/// r-char-sequence:
1460	/// r-char
1461	/// r-char-sequence r-char
1462	/// r-char:
1463	/// any member of the source character set, except a right parenthesis )
1464	/// followed by the initial d-char-sequence (which may be empty)
1465	/// followed by a double quote ".
1466	/// d-char-sequence:
1467	/// d-char
1468	/// d-char-sequence d-char
1469	/// d-char:
1470	/// any member of the basic source character set except:
1471	/// space, the left parenthesis (, the right parenthesis ),
1472	/// the backslash \, and the control characters representing horizontal
1473	/// tab, vertical tab, form feed, and newline.
1474	/// escape-sequence: [C++0x lex.ccon]
1475	/// simple-escape-sequence
1476	/// octal-escape-sequence
1477	/// hexadecimal-escape-sequence
1478	/// simple-escape-sequence:
1479	/// one of \' \" \? \\ \a \b \f \n \r \t \v
1480	/// octal-escape-sequence:
1481	/// \ octal-digit
1482	/// \ octal-digit octal-digit
1483	/// \ octal-digit octal-digit octal-digit
1484	/// hexadecimal-escape-sequence:
1485	/// \x hexadecimal-digit
1486	/// hexadecimal-escape-sequence hexadecimal-digit
1487	/// universal-character-name:
1488	/// \u hex-quad
1489	/// \U hex-quad hex-quad
1490	/// hex-quad:
1491	/// hex-digit hex-digit hex-digit hex-digit
1492	/// \endverbatim
1493	///
1494	StringLiteralParser::
1495	StringLiteralParser(ArrayRef<Token> StringToks,
1496	Preprocessor &PP, bool Complain)
1497	: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1498	Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
1499	MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1500	ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1501	init(StringToks);
1502	}
1503
1504	void StringLiteralParser::init(ArrayRef<Token> StringToks){
1505	// The literal token may have come from an invalid source location (e.g. due
1506	// to a PCH error), in which case the token length will be 0.
1507	if (StringToks.empty() \|\| StringToks[0].getLength() < 2)
1508	return DiagnoseLexingError(SourceLocation());
1509
1510	// Scan all of the string portions, remember the max individual token length,
1511	// computing a bound on the concatenated string length, and see whether any
1512	// piece is a wide-string. If any of the string portions is a wide-string
1513	// literal, the result is a wide-string literal [C99 6.4.5p4].
1514	assert(!StringToks.empty() && "expected at least one token")((void)0);
1515	MaxTokenLength = StringToks[0].getLength();
1516	assert(StringToks[0].getLength() >= 2 && "literal token is invalid!")((void)0);
1517	SizeBound = StringToks[0].getLength()-2; // -2 for "".
1518	Kind = StringToks[0].getKind();
1519
1520	hadError = false;
1521
1522	// Implement Translation Phase #6: concatenation of string literals
1523	/// (C99 5.1.1.2p1). The common case is only one string fragment.
1524	for (unsigned i = 1; i != StringToks.size(); ++i) {
1525	if (StringToks[i].getLength() < 2)
1526	return DiagnoseLexingError(StringToks[i].getLocation());
1527
1528	// The string could be shorter than this if it needs cleaning, but this is a
1529	// reasonable bound, which is all we need.
1530	assert(StringToks[i].getLength() >= 2 && "literal token is invalid!")((void)0);
1531	SizeBound += StringToks[i].getLength()-2; // -2 for "".
1532
1533	// Remember maximum string piece length.
1534	if (StringToks[i].getLength() > MaxTokenLength)
1535	MaxTokenLength = StringToks[i].getLength();
1536
1537	// Remember if we see any wide or utf-8/16/32 strings.
1538	// Also check for illegal concatenations.
1539	if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1540	if (isAscii()) {
1541	Kind = StringToks[i].getKind();
1542	} else {
1543	if (Diags)
1544	Diags->Report(StringToks[i].getLocation(),
1545	diag::err_unsupported_string_concat);
1546	hadError = true;
1547	}
1548	}
1549	}
1550
1551	// Include space for the null terminator.
1552	++SizeBound;
1553
1554	// TODO: K&R warning: "traditional C rejects string constant concatenation"
1555
1556	// Get the width in bytes of char/wchar_t/char16_t/char32_t
1557	CharByteWidth = getCharWidth(Kind, Target);
1558	assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple")((void)0);
1559	CharByteWidth /= 8;
1560
1561	// The output buffer size needs to be large enough to hold wide characters.
1562	// This is a worst-case assumption which basically corresponds to L"" "long".
1563	SizeBound *= CharByteWidth;
1564
1565	// Size the temporary buffer to hold the result string data.
1566	ResultBuf.resize(SizeBound);
1567
1568	// Likewise, but for each string piece.
1569	SmallString<512> TokenBuf;
1570	TokenBuf.resize(MaxTokenLength);
1571
1572	// Loop over all the strings, getting their spelling, and expanding them to
1573	// wide strings as appropriate.
1574	ResultPtr = &ResultBuf[0]; // Next byte to fill in.
1575
1576	Pascal = false;
1577
1578	SourceLocation UDSuffixTokLoc;
1579
1580	for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1581	const char *ThisTokBuf = &TokenBuf[0];
1582	// Get the spelling of the token, which eliminates trigraphs, etc. We know
1583	// that ThisTokBuf points to a buffer that is big enough for the whole token
1584	// and 'spelled' tokens can only shrink.
1585	bool StringInvalid = false;
1586	unsigned ThisTokLen =
1587	Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1588	&StringInvalid);
1589	if (StringInvalid)
1590	return DiagnoseLexingError(StringToks[i].getLocation());
1591
1592	const char *ThisTokBegin = ThisTokBuf;
1593	const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1594
1595	// Remove an optional ud-suffix.
1596	if (ThisTokEnd[-1] != '"') {
1597	const char *UDSuffixEnd = ThisTokEnd;
1598	do {
1599	--ThisTokEnd;
1600	} while (ThisTokEnd[-1] != '"');
1601
1602	StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1603
1604	if (UDSuffixBuf.empty()) {
1605	if (StringToks[i].hasUCN())
1606	expandUCNs(UDSuffixBuf, UDSuffix);
1607	else
1608	UDSuffixBuf.assign(UDSuffix);
1609	UDSuffixToken = i;
1610	UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1611	UDSuffixTokLoc = StringToks[i].getLocation();
1612	} else {
1613	SmallString<32> ExpandedUDSuffix;
1614	if (StringToks[i].hasUCN()) {
1615	expandUCNs(ExpandedUDSuffix, UDSuffix);
1616	UDSuffix = ExpandedUDSuffix;
1617	}
1618
1619	// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1620	// result of a concatenation involving at least one user-defined-string-
1621	// literal, all the participating user-defined-string-literals shall
1622	// have the same ud-suffix.
1623	if (UDSuffixBuf != UDSuffix) {
1624	if (Diags) {
1625	SourceLocation TokLoc = StringToks[i].getLocation();
1626	Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1627	<< UDSuffixBuf << UDSuffix
1628	<< SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1629	<< SourceRange(TokLoc, TokLoc);
1630	}
1631	hadError = true;
1632	}
1633	}
1634	}
1635
1636	// Strip the end quote.
1637	--ThisTokEnd;
1638
1639	// TODO: Input character set mapping support.
1640
1641	// Skip marker for wide or unicode strings.
1642	if (ThisTokBuf[0] == 'L' \|\| ThisTokBuf[0] == 'u' \|\| ThisTokBuf[0] == 'U') {
1643	++ThisTokBuf;
1644	// Skip 8 of u8 marker for utf8 strings.
1645	if (ThisTokBuf[0] == '8')
1646	++ThisTokBuf;
1647	}
1648
1649	// Check for raw string
1650	if (ThisTokBuf[0] == 'R') {
1651	if (ThisTokBuf[1] != '"') {
1652	// The file may have come from PCH and then changed after loading the
1653	// PCH; Fail gracefully.
1654	return DiagnoseLexingError(StringToks[i].getLocation());
1655	}
1656	ThisTokBuf += 2; // skip R"
1657
1658	// C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
1659	// characters.
1660	constexpr unsigned MaxRawStrDelimLen = 16;
1661
1662	const char *Prefix = ThisTokBuf;
1663	while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
1664	ThisTokBuf[0] != '(')
1665	++ThisTokBuf;
1666	if (ThisTokBuf[0] != '(')
1667	return DiagnoseLexingError(StringToks[i].getLocation());
1668	++ThisTokBuf; // skip '('
1669
1670	// Remove same number of characters from the end
1671	ThisTokEnd -= ThisTokBuf - Prefix;
1672	if (ThisTokEnd < ThisTokBuf)
1673	return DiagnoseLexingError(StringToks[i].getLocation());
1674
1675	// C++14 [lex.string]p4: A source-file new-line in a raw string literal
1676	// results in a new-line in the resulting execution string-literal.
1677	StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
1678	while (!RemainingTokenSpan.empty()) {
1679	// Split the string literal on \r\n boundaries.
1680	size_t CRLFPos = RemainingTokenSpan.find("\r\n");
1681	StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
1682	StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
1683
1684	// Copy everything before the \r\n sequence into the string literal.
1685	if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
1686	hadError = true;
1687
1688	// Point into the \n inside the \r\n sequence and operate on the
1689	// remaining portion of the literal.
1690	RemainingTokenSpan = AfterCRLF.substr(1);
1691	}
1692	} else {
1693	if (ThisTokBuf[0] != '"') {
1694	// The file may have come from PCH and then changed after loading the
1695	// PCH; Fail gracefully.
1696	return DiagnoseLexingError(StringToks[i].getLocation());
1697	}
1698	++ThisTokBuf; // skip "
1699
1700	// Check if this is a pascal string
1701	if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1702	ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1703
1704	// If the \p sequence is found in the first token, we have a pascal string
1705	// Otherwise, if we already have a pascal string, ignore the first \p
1706	if (i == 0) {
1707	++ThisTokBuf;
1708	Pascal = true;
1709	} else if (Pascal)
1710	ThisTokBuf += 2;
1711	}
1712
1713	while (ThisTokBuf != ThisTokEnd) {
1714	// Is this a span of non-escape characters?
1715	if (ThisTokBuf[0] != '\\') {
1716	const char *InStart = ThisTokBuf;
1717	do {
1718	++ThisTokBuf;
1719	} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1720
1721	// Copy the character span over.
1722	if (CopyStringFragment(StringToks[i], ThisTokBegin,
1723	StringRef(InStart, ThisTokBuf - InStart)))
1724	hadError = true;
1725	continue;
1726	}
1727	// Is this a Universal Character Name escape?
1728	if (ThisTokBuf[1] == 'u' \|\| ThisTokBuf[1] == 'U') {
1729	EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1730	ResultPtr, hadError,
1731	FullSourceLoc(StringToks[i].getLocation(), SM),
1732	CharByteWidth, Diags, Features);
1733	continue;
1734	}
1735	// Otherwise, this is a non-UCN escape character. Process it.
1736	unsigned ResultChar =
1737	ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1738	FullSourceLoc(StringToks[i].getLocation(), SM),
1739	CharByteWidth*8, Diags, Features);
1740
1741	if (CharByteWidth == 4) {
1742	// FIXME: Make the type of the result buffer correct instead of
1743	// using reinterpret_cast.
1744	llvm::UTF32 ResultWidePtr = reinterpret_cast<llvm::UTF32>(ResultPtr);
1745	*ResultWidePtr = ResultChar;
1746	ResultPtr += 4;
1747	} else if (CharByteWidth == 2) {
1748	// FIXME: Make the type of the result buffer correct instead of
1749	// using reinterpret_cast.
1750	llvm::UTF16 ResultWidePtr = reinterpret_cast<llvm::UTF16>(ResultPtr);
1751	*ResultWidePtr = ResultChar & 0xFFFF;
1752	ResultPtr += 2;
1753	} else {
1754	assert(CharByteWidth == 1 && "Unexpected char width")((void)0);
1755	*ResultPtr++ = ResultChar & 0xFF;
1756	}
1757	}
1758	}
1759	}
1760
1761	if (Pascal) {
1762	if (CharByteWidth == 4) {
1763	// FIXME: Make the type of the result buffer correct instead of
1764	// using reinterpret_cast.
1765	llvm::UTF32 ResultWidePtr = reinterpret_cast<llvm::UTF32>(ResultBuf.data());
1766	ResultWidePtr[0] = GetNumStringChars() - 1;
1767	} else if (CharByteWidth == 2) {
1768	// FIXME: Make the type of the result buffer correct instead of
1769	// using reinterpret_cast.
1770	llvm::UTF16 ResultWidePtr = reinterpret_cast<llvm::UTF16>(ResultBuf.data());
1771	ResultWidePtr[0] = GetNumStringChars() - 1;
1772	} else {
1773	assert(CharByteWidth == 1 && "Unexpected char width")((void)0);
1774	ResultBuf[0] = GetNumStringChars() - 1;
1775	}
1776
1777	// Verify that pascal strings aren't too large.
1778	if (GetStringLength() > 256) {
1779	if (Diags)
1780	Diags->Report(StringToks.front().getLocation(),
1781	diag::err_pascal_string_too_long)
1782	<< SourceRange(StringToks.front().getLocation(),
1783	StringToks.back().getLocation());
1784	hadError = true;
1785	return;
1786	}
1787	} else if (Diags) {
1788	// Complain if this string literal has too many characters.
1789	unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1790
1791	if (GetNumStringChars() > MaxChars)
1792	Diags->Report(StringToks.front().getLocation(),
1793	diag::ext_string_too_long)
1794	<< GetNumStringChars() << MaxChars
1795	<< (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1796	<< SourceRange(StringToks.front().getLocation(),
1797	StringToks.back().getLocation());
1798	}
1799	}
1800
1801	static const char resyncUTF8(const char Err, const char *End) {
1802	if (Err == End)
1803	return End;
1804	End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
1805	while (++Err != End && (*Err & 0xC0) == 0x80)
1806	;
1807	return Err;
1808	}
1809
1810	/// This function copies from Fragment, which is a sequence of bytes
1811	/// within Tok's contents (which begin at TokBegin) into ResultPtr.
1812	/// Performs widening for multi-byte characters.
1813	bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1814	const char *TokBegin,
1815	StringRef Fragment) {
1816	const llvm::UTF8 *ErrorPtrTmp;
1817	if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1818	return false;
1819
1820	// If we see bad encoding for unprefixed string literals, warn and
1821	// simply copy the byte values, for compatibility with gcc and older
1822	// versions of clang.
1823	bool NoErrorOnBadEncoding = isAscii();
1824	if (NoErrorOnBadEncoding) {
1825	memcpy(ResultPtr, Fragment.data(), Fragment.size());
1826	ResultPtr += Fragment.size();
1827	}
1828
1829	if (Diags) {
1830	const char ErrorPtr = reinterpret_cast<const char >(ErrorPtrTmp);
1831
1832	FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1833	const DiagnosticBuilder &Builder =
1834	Diag(Diags, Features, SourceLoc, TokBegin,
1835	ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1836	NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1837	: diag::err_bad_string_encoding);
1838
1839	const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1840	StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1841
1842	// Decode into a dummy buffer.
1843	SmallString<512> Dummy;
1844	Dummy.reserve(Fragment.size() * CharByteWidth);
1845	char *Ptr = Dummy.data();
1846
1847	while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
1848	const char ErrorPtr = reinterpret_cast<const char >(ErrorPtrTmp);
1849	NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1850	Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1851	ErrorPtr, NextStart);
1852	NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1853	}
1854	}
1855	return !NoErrorOnBadEncoding;
1856	}
1857
1858	void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1859	hadError = true;
1860	if (Diags)
1861	Diags->Report(Loc, diag::err_lexing_string);
1862	}
1863
1864	/// getOffsetOfStringByte - This function returns the offset of the
1865	/// specified byte of the string data represented by Token. This handles
1866	/// advancing over escape sequences in the string.
1867	unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1868	unsigned ByteNo) const {
1869	// Get the spelling of the token.
1870	SmallString<32> SpellingBuffer;
1871	SpellingBuffer.resize(Tok.getLength());
1872
1873	bool StringInvalid = false;
1874	const char *SpellingPtr = &SpellingBuffer[0];
1875	unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1876	&StringInvalid);
1877	if (StringInvalid)
1878	return 0;
1879
1880	const char *SpellingStart = SpellingPtr;
1881	const char *SpellingEnd = SpellingPtr+TokLen;
1882
1883	// Handle UTF-8 strings just like narrow strings.
1884	if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1885	SpellingPtr += 2;
1886
1887	assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&((void)0)
1888	SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet")((void)0);
1889
1890	// For raw string literals, this is easy.
1891	if (SpellingPtr[0] == 'R') {
1892	assert(SpellingPtr[1] == '"' && "Should be a raw string literal!")((void)0);
1893	// Skip 'R"'.
1894	SpellingPtr += 2;
1895	while (*SpellingPtr != '(') {
1896	++SpellingPtr;
1897	assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal")((void)0);
1898	}
1899	// Skip '('.
1900	++SpellingPtr;
1901	return SpellingPtr - SpellingStart + ByteNo;
1902	}
1903
1904	// Skip over the leading quote
1905	assert(SpellingPtr[0] == '"' && "Should be a string literal!")((void)0);
1906	++SpellingPtr;
1907
1908	// Skip over bytes until we find the offset we're looking for.
1909	while (ByteNo) {
1910	assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!")((void)0);
1911
1912	// Step over non-escapes simply.
1913	if (*SpellingPtr != '\\') {
1914	++SpellingPtr;
1915	--ByteNo;
1916	continue;
1917	}
1918
1919	// Otherwise, this is an escape character. Advance over it.
1920	bool HadError = false;
1921	if (SpellingPtr[1] == 'u' \|\| SpellingPtr[1] == 'U') {
1922	const char *EscapePtr = SpellingPtr;
1923	unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1924	1, Features, HadError);
1925	if (Len > ByteNo) {
1926	// ByteNo is somewhere within the escape sequence.
1927	SpellingPtr = EscapePtr;
1928	break;
1929	}
1930	ByteNo -= Len;
1931	} else {
1932	ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1933	FullSourceLoc(Tok.getLocation(), SM),
1934	CharByteWidth*8, Diags, Features);
1935	--ByteNo;
1936	}
1937	assert(!HadError && "This method isn't valid on erroneous strings")((void)0);
1938	}
1939
1940	return SpellingPtr-SpellingStart;
1941	}
1942
1943	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1944	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1945	/// treat it as an invalid suffix.
1946	bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1947	StringRef Suffix) {
1948	return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) \|\|
1949	Suffix == "sv";
1950	}