1*67e74705SXin Li //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2*67e74705SXin Li //
3*67e74705SXin Li // The LLVM Compiler Infrastructure
4*67e74705SXin Li //
5*67e74705SXin Li // This file is distributed under the University of Illinois Open Source
6*67e74705SXin Li // License. See LICENSE.TXT for details.
7*67e74705SXin Li //
8*67e74705SXin Li //===----------------------------------------------------------------------===//
9*67e74705SXin Li //
10*67e74705SXin Li // This file implements the NumericLiteralParser, CharLiteralParser, and
11*67e74705SXin Li // StringLiteralParser interfaces.
12*67e74705SXin Li //
13*67e74705SXin Li //===----------------------------------------------------------------------===//
14*67e74705SXin Li
15*67e74705SXin Li #include "clang/Lex/LiteralSupport.h"
16*67e74705SXin Li #include "clang/Basic/CharInfo.h"
17*67e74705SXin Li #include "clang/Basic/TargetInfo.h"
18*67e74705SXin Li #include "clang/Lex/LexDiagnostic.h"
19*67e74705SXin Li #include "clang/Lex/Preprocessor.h"
20*67e74705SXin Li #include "llvm/ADT/StringExtras.h"
21*67e74705SXin Li #include "llvm/Support/ConvertUTF.h"
22*67e74705SXin Li #include "llvm/Support/ErrorHandling.h"
23*67e74705SXin Li
24*67e74705SXin Li using namespace clang;
25*67e74705SXin Li
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)26*67e74705SXin Li static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
27*67e74705SXin Li switch (kind) {
28*67e74705SXin Li default: llvm_unreachable("Unknown token type!");
29*67e74705SXin Li case tok::char_constant:
30*67e74705SXin Li case tok::string_literal:
31*67e74705SXin Li case tok::utf8_char_constant:
32*67e74705SXin Li case tok::utf8_string_literal:
33*67e74705SXin Li return Target.getCharWidth();
34*67e74705SXin Li case tok::wide_char_constant:
35*67e74705SXin Li case tok::wide_string_literal:
36*67e74705SXin Li return Target.getWCharWidth();
37*67e74705SXin Li case tok::utf16_char_constant:
38*67e74705SXin Li case tok::utf16_string_literal:
39*67e74705SXin Li return Target.getChar16Width();
40*67e74705SXin Li case tok::utf32_char_constant:
41*67e74705SXin Li case tok::utf32_string_literal:
42*67e74705SXin Li return Target.getChar32Width();
43*67e74705SXin Li }
44*67e74705SXin Li }
45*67e74705SXin Li
MakeCharSourceRange(const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd)46*67e74705SXin Li static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
47*67e74705SXin Li FullSourceLoc TokLoc,
48*67e74705SXin Li const char *TokBegin,
49*67e74705SXin Li const char *TokRangeBegin,
50*67e74705SXin Li const char *TokRangeEnd) {
51*67e74705SXin Li SourceLocation Begin =
52*67e74705SXin Li Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
53*67e74705SXin Li TokLoc.getManager(), Features);
54*67e74705SXin Li SourceLocation End =
55*67e74705SXin Li Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
56*67e74705SXin Li TokLoc.getManager(), Features);
57*67e74705SXin Li return CharSourceRange::getCharRange(Begin, End);
58*67e74705SXin Li }
59*67e74705SXin Li
60*67e74705SXin Li /// \brief Produce a diagnostic highlighting some portion of a literal.
61*67e74705SXin Li ///
62*67e74705SXin Li /// Emits the diagnostic \p DiagID, highlighting the range of characters from
63*67e74705SXin Li /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
64*67e74705SXin Li /// a substring of a spelling buffer for the token beginning at \p TokBegin.
Diag(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,unsigned DiagID)65*67e74705SXin Li static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
66*67e74705SXin Li const LangOptions &Features, FullSourceLoc TokLoc,
67*67e74705SXin Li const char *TokBegin, const char *TokRangeBegin,
68*67e74705SXin Li const char *TokRangeEnd, unsigned DiagID) {
69*67e74705SXin Li SourceLocation Begin =
70*67e74705SXin Li Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
71*67e74705SXin Li TokLoc.getManager(), Features);
72*67e74705SXin Li return Diags->Report(Begin, DiagID) <<
73*67e74705SXin Li MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
74*67e74705SXin Li }
75*67e74705SXin Li
76*67e74705SXin Li /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
77*67e74705SXin Li /// either a character or a string literal.
ProcessCharEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags,const LangOptions & Features)78*67e74705SXin Li static unsigned ProcessCharEscape(const char *ThisTokBegin,
79*67e74705SXin Li const char *&ThisTokBuf,
80*67e74705SXin Li const char *ThisTokEnd, bool &HadError,
81*67e74705SXin Li FullSourceLoc Loc, unsigned CharWidth,
82*67e74705SXin Li DiagnosticsEngine *Diags,
83*67e74705SXin Li const LangOptions &Features) {
84*67e74705SXin Li const char *EscapeBegin = ThisTokBuf;
85*67e74705SXin Li
86*67e74705SXin Li // Skip the '\' char.
87*67e74705SXin Li ++ThisTokBuf;
88*67e74705SXin Li
89*67e74705SXin Li // We know that this character can't be off the end of the buffer, because
90*67e74705SXin Li // that would have been \", which would not have been the end of string.
91*67e74705SXin Li unsigned ResultChar = *ThisTokBuf++;
92*67e74705SXin Li switch (ResultChar) {
93*67e74705SXin Li // These map to themselves.
94*67e74705SXin Li case '\\': case '\'': case '"': case '?': break;
95*67e74705SXin Li
96*67e74705SXin Li // These have fixed mappings.
97*67e74705SXin Li case 'a':
98*67e74705SXin Li // TODO: K&R: the meaning of '\\a' is different in traditional C
99*67e74705SXin Li ResultChar = 7;
100*67e74705SXin Li break;
101*67e74705SXin Li case 'b':
102*67e74705SXin Li ResultChar = 8;
103*67e74705SXin Li break;
104*67e74705SXin Li case 'e':
105*67e74705SXin Li if (Diags)
106*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
107*67e74705SXin Li diag::ext_nonstandard_escape) << "e";
108*67e74705SXin Li ResultChar = 27;
109*67e74705SXin Li break;
110*67e74705SXin Li case 'E':
111*67e74705SXin Li if (Diags)
112*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
113*67e74705SXin Li diag::ext_nonstandard_escape) << "E";
114*67e74705SXin Li ResultChar = 27;
115*67e74705SXin Li break;
116*67e74705SXin Li case 'f':
117*67e74705SXin Li ResultChar = 12;
118*67e74705SXin Li break;
119*67e74705SXin Li case 'n':
120*67e74705SXin Li ResultChar = 10;
121*67e74705SXin Li break;
122*67e74705SXin Li case 'r':
123*67e74705SXin Li ResultChar = 13;
124*67e74705SXin Li break;
125*67e74705SXin Li case 't':
126*67e74705SXin Li ResultChar = 9;
127*67e74705SXin Li break;
128*67e74705SXin Li case 'v':
129*67e74705SXin Li ResultChar = 11;
130*67e74705SXin Li break;
131*67e74705SXin Li case 'x': { // Hex escape.
132*67e74705SXin Li ResultChar = 0;
133*67e74705SXin Li if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
134*67e74705SXin Li if (Diags)
135*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
136*67e74705SXin Li diag::err_hex_escape_no_digits) << "x";
137*67e74705SXin Li HadError = 1;
138*67e74705SXin Li break;
139*67e74705SXin Li }
140*67e74705SXin Li
141*67e74705SXin Li // Hex escapes are a maximal series of hex digits.
142*67e74705SXin Li bool Overflow = false;
143*67e74705SXin Li for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
144*67e74705SXin Li int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
145*67e74705SXin Li if (CharVal == -1) break;
146*67e74705SXin Li // About to shift out a digit?
147*67e74705SXin Li if (ResultChar & 0xF0000000)
148*67e74705SXin Li Overflow = true;
149*67e74705SXin Li ResultChar <<= 4;
150*67e74705SXin Li ResultChar |= CharVal;
151*67e74705SXin Li }
152*67e74705SXin Li
153*67e74705SXin Li // See if any bits will be truncated when evaluated as a character.
154*67e74705SXin Li if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
155*67e74705SXin Li Overflow = true;
156*67e74705SXin Li ResultChar &= ~0U >> (32-CharWidth);
157*67e74705SXin Li }
158*67e74705SXin Li
159*67e74705SXin Li // Check for overflow.
160*67e74705SXin Li if (Overflow && Diags) // Too many digits to fit in
161*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
162*67e74705SXin Li diag::err_escape_too_large) << 0;
163*67e74705SXin Li break;
164*67e74705SXin Li }
165*67e74705SXin Li case '0': case '1': case '2': case '3':
166*67e74705SXin Li case '4': case '5': case '6': case '7': {
167*67e74705SXin Li // Octal escapes.
168*67e74705SXin Li --ThisTokBuf;
169*67e74705SXin Li ResultChar = 0;
170*67e74705SXin Li
171*67e74705SXin Li // Octal escapes are a series of octal digits with maximum length 3.
172*67e74705SXin Li // "\0123" is a two digit sequence equal to "\012" "3".
173*67e74705SXin Li unsigned NumDigits = 0;
174*67e74705SXin Li do {
175*67e74705SXin Li ResultChar <<= 3;
176*67e74705SXin Li ResultChar |= *ThisTokBuf++ - '0';
177*67e74705SXin Li ++NumDigits;
178*67e74705SXin Li } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
179*67e74705SXin Li ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
180*67e74705SXin Li
181*67e74705SXin Li // Check for overflow. Reject '\777', but not L'\777'.
182*67e74705SXin Li if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
183*67e74705SXin Li if (Diags)
184*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
185*67e74705SXin Li diag::err_escape_too_large) << 1;
186*67e74705SXin Li ResultChar &= ~0U >> (32-CharWidth);
187*67e74705SXin Li }
188*67e74705SXin Li break;
189*67e74705SXin Li }
190*67e74705SXin Li
191*67e74705SXin Li // Otherwise, these are not valid escapes.
192*67e74705SXin Li case '(': case '{': case '[': case '%':
193*67e74705SXin Li // GCC accepts these as extensions. We warn about them as such though.
194*67e74705SXin Li if (Diags)
195*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
196*67e74705SXin Li diag::ext_nonstandard_escape)
197*67e74705SXin Li << std::string(1, ResultChar);
198*67e74705SXin Li break;
199*67e74705SXin Li default:
200*67e74705SXin Li if (!Diags)
201*67e74705SXin Li break;
202*67e74705SXin Li
203*67e74705SXin Li if (isPrintable(ResultChar))
204*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
205*67e74705SXin Li diag::ext_unknown_escape)
206*67e74705SXin Li << std::string(1, ResultChar);
207*67e74705SXin Li else
208*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
209*67e74705SXin Li diag::ext_unknown_escape)
210*67e74705SXin Li << "x" + llvm::utohexstr(ResultChar);
211*67e74705SXin Li break;
212*67e74705SXin Li }
213*67e74705SXin Li
214*67e74705SXin Li return ResultChar;
215*67e74705SXin Li }
216*67e74705SXin Li
appendCodePoint(unsigned Codepoint,llvm::SmallVectorImpl<char> & Str)217*67e74705SXin Li static void appendCodePoint(unsigned Codepoint,
218*67e74705SXin Li llvm::SmallVectorImpl<char> &Str) {
219*67e74705SXin Li char ResultBuf[4];
220*67e74705SXin Li char *ResultPtr = ResultBuf;
221*67e74705SXin Li bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
222*67e74705SXin Li (void)Res;
223*67e74705SXin Li assert(Res && "Unexpected conversion failure");
224*67e74705SXin Li Str.append(ResultBuf, ResultPtr);
225*67e74705SXin Li }
226*67e74705SXin Li
expandUCNs(SmallVectorImpl<char> & Buf,StringRef Input)227*67e74705SXin Li void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
228*67e74705SXin Li for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
229*67e74705SXin Li if (*I != '\\') {
230*67e74705SXin Li Buf.push_back(*I);
231*67e74705SXin Li continue;
232*67e74705SXin Li }
233*67e74705SXin Li
234*67e74705SXin Li ++I;
235*67e74705SXin Li assert(*I == 'u' || *I == 'U');
236*67e74705SXin Li
237*67e74705SXin Li unsigned NumHexDigits;
238*67e74705SXin Li if (*I == 'u')
239*67e74705SXin Li NumHexDigits = 4;
240*67e74705SXin Li else
241*67e74705SXin Li NumHexDigits = 8;
242*67e74705SXin Li
243*67e74705SXin Li assert(I + NumHexDigits <= E);
244*67e74705SXin Li
245*67e74705SXin Li uint32_t CodePoint = 0;
246*67e74705SXin Li for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
247*67e74705SXin Li unsigned Value = llvm::hexDigitValue(*I);
248*67e74705SXin Li assert(Value != -1U);
249*67e74705SXin Li
250*67e74705SXin Li CodePoint <<= 4;
251*67e74705SXin Li CodePoint += Value;
252*67e74705SXin Li }
253*67e74705SXin Li
254*67e74705SXin Li appendCodePoint(CodePoint, Buf);
255*67e74705SXin Li --I;
256*67e74705SXin Li }
257*67e74705SXin Li }
258*67e74705SXin Li
259*67e74705SXin Li /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
260*67e74705SXin Li /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)261*67e74705SXin Li static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
262*67e74705SXin Li const char *ThisTokEnd,
263*67e74705SXin Li uint32_t &UcnVal, unsigned short &UcnLen,
264*67e74705SXin Li FullSourceLoc Loc, DiagnosticsEngine *Diags,
265*67e74705SXin Li const LangOptions &Features,
266*67e74705SXin Li bool in_char_string_literal = false) {
267*67e74705SXin Li const char *UcnBegin = ThisTokBuf;
268*67e74705SXin Li
269*67e74705SXin Li // Skip the '\u' char's.
270*67e74705SXin Li ThisTokBuf += 2;
271*67e74705SXin Li
272*67e74705SXin Li if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
273*67e74705SXin Li if (Diags)
274*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
275*67e74705SXin Li diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
276*67e74705SXin Li return false;
277*67e74705SXin Li }
278*67e74705SXin Li UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
279*67e74705SXin Li unsigned short UcnLenSave = UcnLen;
280*67e74705SXin Li for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
281*67e74705SXin Li int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
282*67e74705SXin Li if (CharVal == -1) break;
283*67e74705SXin Li UcnVal <<= 4;
284*67e74705SXin Li UcnVal |= CharVal;
285*67e74705SXin Li }
286*67e74705SXin Li // If we didn't consume the proper number of digits, there is a problem.
287*67e74705SXin Li if (UcnLenSave) {
288*67e74705SXin Li if (Diags)
289*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
290*67e74705SXin Li diag::err_ucn_escape_incomplete);
291*67e74705SXin Li return false;
292*67e74705SXin Li }
293*67e74705SXin Li
294*67e74705SXin Li // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
295*67e74705SXin Li if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
296*67e74705SXin Li UcnVal > 0x10FFFF) { // maximum legal UTF32 value
297*67e74705SXin Li if (Diags)
298*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
299*67e74705SXin Li diag::err_ucn_escape_invalid);
300*67e74705SXin Li return false;
301*67e74705SXin Li }
302*67e74705SXin Li
303*67e74705SXin Li // C++11 allows UCNs that refer to control characters and basic source
304*67e74705SXin Li // characters inside character and string literals
305*67e74705SXin Li if (UcnVal < 0xa0 &&
306*67e74705SXin Li (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
307*67e74705SXin Li bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
308*67e74705SXin Li if (Diags) {
309*67e74705SXin Li char BasicSCSChar = UcnVal;
310*67e74705SXin Li if (UcnVal >= 0x20 && UcnVal < 0x7f)
311*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
312*67e74705SXin Li IsError ? diag::err_ucn_escape_basic_scs :
313*67e74705SXin Li diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
314*67e74705SXin Li << StringRef(&BasicSCSChar, 1);
315*67e74705SXin Li else
316*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
317*67e74705SXin Li IsError ? diag::err_ucn_control_character :
318*67e74705SXin Li diag::warn_cxx98_compat_literal_ucn_control_character);
319*67e74705SXin Li }
320*67e74705SXin Li if (IsError)
321*67e74705SXin Li return false;
322*67e74705SXin Li }
323*67e74705SXin Li
324*67e74705SXin Li if (!Features.CPlusPlus && !Features.C99 && Diags)
325*67e74705SXin Li Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
326*67e74705SXin Li diag::warn_ucn_not_valid_in_c89_literal);
327*67e74705SXin Li
328*67e74705SXin Li return true;
329*67e74705SXin Li }
330*67e74705SXin Li
331*67e74705SXin Li /// MeasureUCNEscape - Determine the number of bytes within the resulting string
332*67e74705SXin Li /// which this UCN will occupy.
MeasureUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,unsigned CharByteWidth,const LangOptions & Features,bool & HadError)333*67e74705SXin Li static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
334*67e74705SXin Li const char *ThisTokEnd, unsigned CharByteWidth,
335*67e74705SXin Li const LangOptions &Features, bool &HadError) {
336*67e74705SXin Li // UTF-32: 4 bytes per escape.
337*67e74705SXin Li if (CharByteWidth == 4)
338*67e74705SXin Li return 4;
339*67e74705SXin Li
340*67e74705SXin Li uint32_t UcnVal = 0;
341*67e74705SXin Li unsigned short UcnLen = 0;
342*67e74705SXin Li FullSourceLoc Loc;
343*67e74705SXin Li
344*67e74705SXin Li if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
345*67e74705SXin Li UcnLen, Loc, nullptr, Features, true)) {
346*67e74705SXin Li HadError = true;
347*67e74705SXin Li return 0;
348*67e74705SXin Li }
349*67e74705SXin Li
350*67e74705SXin Li // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
351*67e74705SXin Li if (CharByteWidth == 2)
352*67e74705SXin Li return UcnVal <= 0xFFFF ? 2 : 4;
353*67e74705SXin Li
354*67e74705SXin Li // UTF-8.
355*67e74705SXin Li if (UcnVal < 0x80)
356*67e74705SXin Li return 1;
357*67e74705SXin Li if (UcnVal < 0x800)
358*67e74705SXin Li return 2;
359*67e74705SXin Li if (UcnVal < 0x10000)
360*67e74705SXin Li return 3;
361*67e74705SXin Li return 4;
362*67e74705SXin Li }
363*67e74705SXin Li
364*67e74705SXin Li /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
365*67e74705SXin Li /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
366*67e74705SXin Li /// StringLiteralParser. When we decide to implement UCN's for identifiers,
367*67e74705SXin Li /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)368*67e74705SXin Li static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
369*67e74705SXin Li const char *ThisTokEnd,
370*67e74705SXin Li char *&ResultBuf, bool &HadError,
371*67e74705SXin Li FullSourceLoc Loc, unsigned CharByteWidth,
372*67e74705SXin Li DiagnosticsEngine *Diags,
373*67e74705SXin Li const LangOptions &Features) {
374*67e74705SXin Li typedef uint32_t UTF32;
375*67e74705SXin Li UTF32 UcnVal = 0;
376*67e74705SXin Li unsigned short UcnLen = 0;
377*67e74705SXin Li if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
378*67e74705SXin Li Loc, Diags, Features, true)) {
379*67e74705SXin Li HadError = true;
380*67e74705SXin Li return;
381*67e74705SXin Li }
382*67e74705SXin Li
383*67e74705SXin Li assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
384*67e74705SXin Li "only character widths of 1, 2, or 4 bytes supported");
385*67e74705SXin Li
386*67e74705SXin Li (void)UcnLen;
387*67e74705SXin Li assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
388*67e74705SXin Li
389*67e74705SXin Li if (CharByteWidth == 4) {
390*67e74705SXin Li // FIXME: Make the type of the result buffer correct instead of
391*67e74705SXin Li // using reinterpret_cast.
392*67e74705SXin Li UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
393*67e74705SXin Li *ResultPtr = UcnVal;
394*67e74705SXin Li ResultBuf += 4;
395*67e74705SXin Li return;
396*67e74705SXin Li }
397*67e74705SXin Li
398*67e74705SXin Li if (CharByteWidth == 2) {
399*67e74705SXin Li // FIXME: Make the type of the result buffer correct instead of
400*67e74705SXin Li // using reinterpret_cast.
401*67e74705SXin Li UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
402*67e74705SXin Li
403*67e74705SXin Li if (UcnVal <= (UTF32)0xFFFF) {
404*67e74705SXin Li *ResultPtr = UcnVal;
405*67e74705SXin Li ResultBuf += 2;
406*67e74705SXin Li return;
407*67e74705SXin Li }
408*67e74705SXin Li
409*67e74705SXin Li // Convert to UTF16.
410*67e74705SXin Li UcnVal -= 0x10000;
411*67e74705SXin Li *ResultPtr = 0xD800 + (UcnVal >> 10);
412*67e74705SXin Li *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
413*67e74705SXin Li ResultBuf += 4;
414*67e74705SXin Li return;
415*67e74705SXin Li }
416*67e74705SXin Li
417*67e74705SXin Li assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
418*67e74705SXin Li
419*67e74705SXin Li // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
420*67e74705SXin Li // The conversion below was inspired by:
421*67e74705SXin Li // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
422*67e74705SXin Li // First, we determine how many bytes the result will require.
423*67e74705SXin Li typedef uint8_t UTF8;
424*67e74705SXin Li
425*67e74705SXin Li unsigned short bytesToWrite = 0;
426*67e74705SXin Li if (UcnVal < (UTF32)0x80)
427*67e74705SXin Li bytesToWrite = 1;
428*67e74705SXin Li else if (UcnVal < (UTF32)0x800)
429*67e74705SXin Li bytesToWrite = 2;
430*67e74705SXin Li else if (UcnVal < (UTF32)0x10000)
431*67e74705SXin Li bytesToWrite = 3;
432*67e74705SXin Li else
433*67e74705SXin Li bytesToWrite = 4;
434*67e74705SXin Li
435*67e74705SXin Li const unsigned byteMask = 0xBF;
436*67e74705SXin Li const unsigned byteMark = 0x80;
437*67e74705SXin Li
438*67e74705SXin Li // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
439*67e74705SXin Li // into the first byte, depending on how many bytes follow.
440*67e74705SXin Li static const UTF8 firstByteMark[5] = {
441*67e74705SXin Li 0x00, 0x00, 0xC0, 0xE0, 0xF0
442*67e74705SXin Li };
443*67e74705SXin Li // Finally, we write the bytes into ResultBuf.
444*67e74705SXin Li ResultBuf += bytesToWrite;
445*67e74705SXin Li switch (bytesToWrite) { // note: everything falls through.
446*67e74705SXin Li case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
447*67e74705SXin Li case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
448*67e74705SXin Li case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
449*67e74705SXin Li case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
450*67e74705SXin Li }
451*67e74705SXin Li // Update the buffer.
452*67e74705SXin Li ResultBuf += bytesToWrite;
453*67e74705SXin Li }
454*67e74705SXin Li
455*67e74705SXin Li
456*67e74705SXin Li /// integer-constant: [C99 6.4.4.1]
457*67e74705SXin Li /// decimal-constant integer-suffix
458*67e74705SXin Li /// octal-constant integer-suffix
459*67e74705SXin Li /// hexadecimal-constant integer-suffix
460*67e74705SXin Li /// binary-literal integer-suffix [GNU, C++1y]
461*67e74705SXin Li /// user-defined-integer-literal: [C++11 lex.ext]
462*67e74705SXin Li /// decimal-literal ud-suffix
463*67e74705SXin Li /// octal-literal ud-suffix
464*67e74705SXin Li /// hexadecimal-literal ud-suffix
465*67e74705SXin Li /// binary-literal ud-suffix [GNU, C++1y]
466*67e74705SXin Li /// decimal-constant:
467*67e74705SXin Li /// nonzero-digit
468*67e74705SXin Li /// decimal-constant digit
469*67e74705SXin Li /// octal-constant:
470*67e74705SXin Li /// 0
471*67e74705SXin Li /// octal-constant octal-digit
472*67e74705SXin Li /// hexadecimal-constant:
473*67e74705SXin Li /// hexadecimal-prefix hexadecimal-digit
474*67e74705SXin Li /// hexadecimal-constant hexadecimal-digit
475*67e74705SXin Li /// hexadecimal-prefix: one of
476*67e74705SXin Li /// 0x 0X
477*67e74705SXin Li /// binary-literal:
478*67e74705SXin Li /// 0b binary-digit
479*67e74705SXin Li /// 0B binary-digit
480*67e74705SXin Li /// binary-literal binary-digit
481*67e74705SXin Li /// integer-suffix:
482*67e74705SXin Li /// unsigned-suffix [long-suffix]
483*67e74705SXin Li /// unsigned-suffix [long-long-suffix]
484*67e74705SXin Li /// long-suffix [unsigned-suffix]
485*67e74705SXin Li /// long-long-suffix [unsigned-sufix]
486*67e74705SXin Li /// nonzero-digit:
487*67e74705SXin Li /// 1 2 3 4 5 6 7 8 9
488*67e74705SXin Li /// octal-digit:
489*67e74705SXin Li /// 0 1 2 3 4 5 6 7
490*67e74705SXin Li /// hexadecimal-digit:
491*67e74705SXin Li /// 0 1 2 3 4 5 6 7 8 9
492*67e74705SXin Li /// a b c d e f
493*67e74705SXin Li /// A B C D E F
494*67e74705SXin Li /// binary-digit:
495*67e74705SXin Li /// 0
496*67e74705SXin Li /// 1
497*67e74705SXin Li /// unsigned-suffix: one of
498*67e74705SXin Li /// u U
499*67e74705SXin Li /// long-suffix: one of
500*67e74705SXin Li /// l L
501*67e74705SXin Li /// long-long-suffix: one of
502*67e74705SXin Li /// ll LL
503*67e74705SXin Li ///
504*67e74705SXin Li /// floating-constant: [C99 6.4.4.2]
505*67e74705SXin Li /// TODO: add rules...
506*67e74705SXin Li ///
NumericLiteralParser(StringRef TokSpelling,SourceLocation TokLoc,Preprocessor & PP)507*67e74705SXin Li NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
508*67e74705SXin Li SourceLocation TokLoc,
509*67e74705SXin Li Preprocessor &PP)
510*67e74705SXin Li : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
511*67e74705SXin Li
512*67e74705SXin Li // This routine assumes that the range begin/end matches the regex for integer
513*67e74705SXin Li // and FP constants (specifically, the 'pp-number' regex), and assumes that
514*67e74705SXin Li // the byte at "*end" is both valid and not part of the regex. Because of
515*67e74705SXin Li // this, it doesn't have to check for 'overscan' in various places.
516*67e74705SXin Li assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");
517*67e74705SXin Li
518*67e74705SXin Li s = DigitsBegin = ThisTokBegin;
519*67e74705SXin Li saw_exponent = false;
520*67e74705SXin Li saw_period = false;
521*67e74705SXin Li saw_ud_suffix = false;
522*67e74705SXin Li isLong = false;
523*67e74705SXin Li isUnsigned = false;
524*67e74705SXin Li isLongLong = false;
525*67e74705SXin Li isHalf = false;
526*67e74705SXin Li isFloat = false;
527*67e74705SXin Li isImaginary = false;
528*67e74705SXin Li isFloat128 = false;
529*67e74705SXin Li MicrosoftInteger = 0;
530*67e74705SXin Li hadError = false;
531*67e74705SXin Li
532*67e74705SXin Li if (*s == '0') { // parse radix
533*67e74705SXin Li ParseNumberStartingWithZero(TokLoc);
534*67e74705SXin Li if (hadError)
535*67e74705SXin Li return;
536*67e74705SXin Li } else { // the first digit is non-zero
537*67e74705SXin Li radix = 10;
538*67e74705SXin Li s = SkipDigits(s);
539*67e74705SXin Li if (s == ThisTokEnd) {
540*67e74705SXin Li // Done.
541*67e74705SXin Li } else {
542*67e74705SXin Li ParseDecimalOrOctalCommon(TokLoc);
543*67e74705SXin Li if (hadError)
544*67e74705SXin Li return;
545*67e74705SXin Li }
546*67e74705SXin Li }
547*67e74705SXin Li
548*67e74705SXin Li SuffixBegin = s;
549*67e74705SXin Li checkSeparator(TokLoc, s, CSK_AfterDigits);
550*67e74705SXin Li
551*67e74705SXin Li // Parse the suffix. At this point we can classify whether we have an FP or
552*67e74705SXin Li // integer constant.
553*67e74705SXin Li bool isFPConstant = isFloatingLiteral();
554*67e74705SXin Li const char *ImaginarySuffixLoc = nullptr;
555*67e74705SXin Li
556*67e74705SXin Li // Loop over all of the characters of the suffix. If we see something bad,
557*67e74705SXin Li // we break out of the loop.
558*67e74705SXin Li for (; s != ThisTokEnd; ++s) {
559*67e74705SXin Li switch (*s) {
560*67e74705SXin Li case 'h': // FP Suffix for "half".
561*67e74705SXin Li case 'H':
562*67e74705SXin Li // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
563*67e74705SXin Li if (!PP.getLangOpts().Half) break;
564*67e74705SXin Li if (!isFPConstant) break; // Error for integer constant.
565*67e74705SXin Li if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid.
566*67e74705SXin Li isHalf = true;
567*67e74705SXin Li continue; // Success.
568*67e74705SXin Li case 'f': // FP Suffix for "float"
569*67e74705SXin Li case 'F':
570*67e74705SXin Li if (!isFPConstant) break; // Error for integer constant.
571*67e74705SXin Li if (isHalf || isFloat || isLong || isFloat128)
572*67e74705SXin Li break; // HF, FF, LF, QF invalid.
573*67e74705SXin Li isFloat = true;
574*67e74705SXin Li continue; // Success.
575*67e74705SXin Li case 'q': // FP Suffix for "__float128"
576*67e74705SXin Li case 'Q':
577*67e74705SXin Li if (!isFPConstant) break; // Error for integer constant.
578*67e74705SXin Li if (isHalf || isFloat || isLong || isFloat128)
579*67e74705SXin Li break; // HQ, FQ, LQ, QQ invalid.
580*67e74705SXin Li isFloat128 = true;
581*67e74705SXin Li continue; // Success.
582*67e74705SXin Li case 'u':
583*67e74705SXin Li case 'U':
584*67e74705SXin Li if (isFPConstant) break; // Error for floating constant.
585*67e74705SXin Li if (isUnsigned) break; // Cannot be repeated.
586*67e74705SXin Li isUnsigned = true;
587*67e74705SXin Li continue; // Success.
588*67e74705SXin Li case 'l':
589*67e74705SXin Li case 'L':
590*67e74705SXin Li if (isLong || isLongLong) break; // Cannot be repeated.
591*67e74705SXin Li if (isHalf || isFloat || isFloat128) break; // LH, LF, LQ invalid.
592*67e74705SXin Li
593*67e74705SXin Li // Check for long long. The L's need to be adjacent and the same case.
594*67e74705SXin Li if (s[1] == s[0]) {
595*67e74705SXin Li assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
596*67e74705SXin Li if (isFPConstant) break; // long long invalid for floats.
597*67e74705SXin Li isLongLong = true;
598*67e74705SXin Li ++s; // Eat both of them.
599*67e74705SXin Li } else {
600*67e74705SXin Li isLong = true;
601*67e74705SXin Li }
602*67e74705SXin Li continue; // Success.
603*67e74705SXin Li case 'i':
604*67e74705SXin Li case 'I':
605*67e74705SXin Li if (PP.getLangOpts().MicrosoftExt) {
606*67e74705SXin Li if (isLong || isLongLong || MicrosoftInteger)
607*67e74705SXin Li break;
608*67e74705SXin Li
609*67e74705SXin Li if (!isFPConstant) {
610*67e74705SXin Li // Allow i8, i16, i32, and i64.
611*67e74705SXin Li switch (s[1]) {
612*67e74705SXin Li case '8':
613*67e74705SXin Li s += 2; // i8 suffix
614*67e74705SXin Li MicrosoftInteger = 8;
615*67e74705SXin Li break;
616*67e74705SXin Li case '1':
617*67e74705SXin Li if (s[2] == '6') {
618*67e74705SXin Li s += 3; // i16 suffix
619*67e74705SXin Li MicrosoftInteger = 16;
620*67e74705SXin Li }
621*67e74705SXin Li break;
622*67e74705SXin Li case '3':
623*67e74705SXin Li if (s[2] == '2') {
624*67e74705SXin Li s += 3; // i32 suffix
625*67e74705SXin Li MicrosoftInteger = 32;
626*67e74705SXin Li }
627*67e74705SXin Li break;
628*67e74705SXin Li case '6':
629*67e74705SXin Li if (s[2] == '4') {
630*67e74705SXin Li s += 3; // i64 suffix
631*67e74705SXin Li MicrosoftInteger = 64;
632*67e74705SXin Li }
633*67e74705SXin Li break;
634*67e74705SXin Li default:
635*67e74705SXin Li break;
636*67e74705SXin Li }
637*67e74705SXin Li }
638*67e74705SXin Li if (MicrosoftInteger) {
639*67e74705SXin Li assert(s <= ThisTokEnd && "didn't maximally munch?");
640*67e74705SXin Li break;
641*67e74705SXin Li }
642*67e74705SXin Li }
643*67e74705SXin Li // "i", "if", and "il" are user-defined suffixes in C++1y.
644*67e74705SXin Li if (*s == 'i' && PP.getLangOpts().CPlusPlus14)
645*67e74705SXin Li break;
646*67e74705SXin Li // fall through.
647*67e74705SXin Li case 'j':
648*67e74705SXin Li case 'J':
649*67e74705SXin Li if (isImaginary) break; // Cannot be repeated.
650*67e74705SXin Li isImaginary = true;
651*67e74705SXin Li ImaginarySuffixLoc = s;
652*67e74705SXin Li continue; // Success.
653*67e74705SXin Li }
654*67e74705SXin Li // If we reached here, there was an error or a ud-suffix.
655*67e74705SXin Li break;
656*67e74705SXin Li }
657*67e74705SXin Li
658*67e74705SXin Li if (s != ThisTokEnd) {
659*67e74705SXin Li // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
660*67e74705SXin Li expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
661*67e74705SXin Li if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
662*67e74705SXin Li // Any suffix pieces we might have parsed are actually part of the
663*67e74705SXin Li // ud-suffix.
664*67e74705SXin Li isLong = false;
665*67e74705SXin Li isUnsigned = false;
666*67e74705SXin Li isLongLong = false;
667*67e74705SXin Li isFloat = false;
668*67e74705SXin Li isHalf = false;
669*67e74705SXin Li isImaginary = false;
670*67e74705SXin Li MicrosoftInteger = 0;
671*67e74705SXin Li
672*67e74705SXin Li saw_ud_suffix = true;
673*67e74705SXin Li return;
674*67e74705SXin Li }
675*67e74705SXin Li
676*67e74705SXin Li // Report an error if there are any.
677*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
678*67e74705SXin Li diag::err_invalid_suffix_constant)
679*67e74705SXin Li << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin) << isFPConstant;
680*67e74705SXin Li hadError = true;
681*67e74705SXin Li return;
682*67e74705SXin Li }
683*67e74705SXin Li
684*67e74705SXin Li if (isImaginary) {
685*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc,
686*67e74705SXin Li ImaginarySuffixLoc - ThisTokBegin),
687*67e74705SXin Li diag::ext_imaginary_constant);
688*67e74705SXin Li }
689*67e74705SXin Li }
690*67e74705SXin Li
691*67e74705SXin Li /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
692*67e74705SXin Li /// numbers. It issues an error for illegal digits, and handles floating point
693*67e74705SXin Li /// parsing. If it detects a floating point number, the radix is set to 10.
ParseDecimalOrOctalCommon(SourceLocation TokLoc)694*67e74705SXin Li void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
695*67e74705SXin Li assert((radix == 8 || radix == 10) && "Unexpected radix");
696*67e74705SXin Li
697*67e74705SXin Li // If we have a hex digit other than 'e' (which denotes a FP exponent) then
698*67e74705SXin Li // the code is using an incorrect base.
699*67e74705SXin Li if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
700*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
701*67e74705SXin Li diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
702*67e74705SXin Li hadError = true;
703*67e74705SXin Li return;
704*67e74705SXin Li }
705*67e74705SXin Li
706*67e74705SXin Li if (*s == '.') {
707*67e74705SXin Li checkSeparator(TokLoc, s, CSK_AfterDigits);
708*67e74705SXin Li s++;
709*67e74705SXin Li radix = 10;
710*67e74705SXin Li saw_period = true;
711*67e74705SXin Li checkSeparator(TokLoc, s, CSK_BeforeDigits);
712*67e74705SXin Li s = SkipDigits(s); // Skip suffix.
713*67e74705SXin Li }
714*67e74705SXin Li if (*s == 'e' || *s == 'E') { // exponent
715*67e74705SXin Li checkSeparator(TokLoc, s, CSK_AfterDigits);
716*67e74705SXin Li const char *Exponent = s;
717*67e74705SXin Li s++;
718*67e74705SXin Li radix = 10;
719*67e74705SXin Li saw_exponent = true;
720*67e74705SXin Li if (*s == '+' || *s == '-') s++; // sign
721*67e74705SXin Li const char *first_non_digit = SkipDigits(s);
722*67e74705SXin Li if (containsDigits(s, first_non_digit)) {
723*67e74705SXin Li checkSeparator(TokLoc, s, CSK_BeforeDigits);
724*67e74705SXin Li s = first_non_digit;
725*67e74705SXin Li } else {
726*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
727*67e74705SXin Li diag::err_exponent_has_no_digits);
728*67e74705SXin Li hadError = true;
729*67e74705SXin Li return;
730*67e74705SXin Li }
731*67e74705SXin Li }
732*67e74705SXin Li }
733*67e74705SXin Li
734*67e74705SXin Li /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
735*67e74705SXin Li /// suffixes as ud-suffixes, because the diagnostic experience is better if we
736*67e74705SXin Li /// treat it as an invalid suffix.
isValidUDSuffix(const LangOptions & LangOpts,StringRef Suffix)737*67e74705SXin Li bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
738*67e74705SXin Li StringRef Suffix) {
739*67e74705SXin Li if (!LangOpts.CPlusPlus11 || Suffix.empty())
740*67e74705SXin Li return false;
741*67e74705SXin Li
742*67e74705SXin Li // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
743*67e74705SXin Li if (Suffix[0] == '_')
744*67e74705SXin Li return true;
745*67e74705SXin Li
746*67e74705SXin Li // In C++11, there are no library suffixes.
747*67e74705SXin Li if (!LangOpts.CPlusPlus14)
748*67e74705SXin Li return false;
749*67e74705SXin Li
750*67e74705SXin Li // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
751*67e74705SXin Li // Per tweaked N3660, "il", "i", and "if" are also used in the library.
752*67e74705SXin Li return llvm::StringSwitch<bool>(Suffix)
753*67e74705SXin Li .Cases("h", "min", "s", true)
754*67e74705SXin Li .Cases("ms", "us", "ns", true)
755*67e74705SXin Li .Cases("il", "i", "if", true)
756*67e74705SXin Li .Default(false);
757*67e74705SXin Li }
758*67e74705SXin Li
checkSeparator(SourceLocation TokLoc,const char * Pos,CheckSeparatorKind IsAfterDigits)759*67e74705SXin Li void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
760*67e74705SXin Li const char *Pos,
761*67e74705SXin Li CheckSeparatorKind IsAfterDigits) {
762*67e74705SXin Li if (IsAfterDigits == CSK_AfterDigits) {
763*67e74705SXin Li if (Pos == ThisTokBegin)
764*67e74705SXin Li return;
765*67e74705SXin Li --Pos;
766*67e74705SXin Li } else if (Pos == ThisTokEnd)
767*67e74705SXin Li return;
768*67e74705SXin Li
769*67e74705SXin Li if (isDigitSeparator(*Pos))
770*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
771*67e74705SXin Li diag::err_digit_separator_not_between_digits)
772*67e74705SXin Li << IsAfterDigits;
773*67e74705SXin Li }
774*67e74705SXin Li
775*67e74705SXin Li /// ParseNumberStartingWithZero - This method is called when the first character
776*67e74705SXin Li /// of the number is found to be a zero. This means it is either an octal
777*67e74705SXin Li /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
778*67e74705SXin Li /// a floating point number (01239.123e4). Eat the prefix, determining the
779*67e74705SXin Li /// radix etc.
ParseNumberStartingWithZero(SourceLocation TokLoc)780*67e74705SXin Li void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
781*67e74705SXin Li assert(s[0] == '0' && "Invalid method call");
782*67e74705SXin Li s++;
783*67e74705SXin Li
784*67e74705SXin Li int c1 = s[0];
785*67e74705SXin Li
786*67e74705SXin Li // Handle a hex number like 0x1234.
787*67e74705SXin Li if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
788*67e74705SXin Li s++;
789*67e74705SXin Li assert(s < ThisTokEnd && "didn't maximally munch?");
790*67e74705SXin Li radix = 16;
791*67e74705SXin Li DigitsBegin = s;
792*67e74705SXin Li s = SkipHexDigits(s);
793*67e74705SXin Li bool HasSignificandDigits = containsDigits(DigitsBegin, s);
794*67e74705SXin Li if (s == ThisTokEnd) {
795*67e74705SXin Li // Done.
796*67e74705SXin Li } else if (*s == '.') {
797*67e74705SXin Li s++;
798*67e74705SXin Li saw_period = true;
799*67e74705SXin Li const char *floatDigitsBegin = s;
800*67e74705SXin Li s = SkipHexDigits(s);
801*67e74705SXin Li if (containsDigits(floatDigitsBegin, s))
802*67e74705SXin Li HasSignificandDigits = true;
803*67e74705SXin Li if (HasSignificandDigits)
804*67e74705SXin Li checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
805*67e74705SXin Li }
806*67e74705SXin Li
807*67e74705SXin Li if (!HasSignificandDigits) {
808*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
809*67e74705SXin Li diag::err_hex_constant_requires)
810*67e74705SXin Li << PP.getLangOpts().CPlusPlus << 1;
811*67e74705SXin Li hadError = true;
812*67e74705SXin Li return;
813*67e74705SXin Li }
814*67e74705SXin Li
815*67e74705SXin Li // A binary exponent can appear with or with a '.'. If dotted, the
816*67e74705SXin Li // binary exponent is required.
817*67e74705SXin Li if (*s == 'p' || *s == 'P') {
818*67e74705SXin Li checkSeparator(TokLoc, s, CSK_AfterDigits);
819*67e74705SXin Li const char *Exponent = s;
820*67e74705SXin Li s++;
821*67e74705SXin Li saw_exponent = true;
822*67e74705SXin Li if (*s == '+' || *s == '-') s++; // sign
823*67e74705SXin Li const char *first_non_digit = SkipDigits(s);
824*67e74705SXin Li if (!containsDigits(s, first_non_digit)) {
825*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
826*67e74705SXin Li diag::err_exponent_has_no_digits);
827*67e74705SXin Li hadError = true;
828*67e74705SXin Li return;
829*67e74705SXin Li }
830*67e74705SXin Li checkSeparator(TokLoc, s, CSK_BeforeDigits);
831*67e74705SXin Li s = first_non_digit;
832*67e74705SXin Li
833*67e74705SXin Li if (!PP.getLangOpts().HexFloats)
834*67e74705SXin Li PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus
835*67e74705SXin Li ? diag::ext_hex_literal_invalid
836*67e74705SXin Li : diag::ext_hex_constant_invalid);
837*67e74705SXin Li else if (PP.getLangOpts().CPlusPlus1z)
838*67e74705SXin Li PP.Diag(TokLoc, diag::warn_cxx1z_hex_literal);
839*67e74705SXin Li } else if (saw_period) {
840*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
841*67e74705SXin Li diag::err_hex_constant_requires)
842*67e74705SXin Li << PP.getLangOpts().CPlusPlus << 0;
843*67e74705SXin Li hadError = true;
844*67e74705SXin Li }
845*67e74705SXin Li return;
846*67e74705SXin Li }
847*67e74705SXin Li
848*67e74705SXin Li // Handle simple binary numbers 0b01010
849*67e74705SXin Li if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
850*67e74705SXin Li // 0b101010 is a C++1y / GCC extension.
851*67e74705SXin Li PP.Diag(TokLoc,
852*67e74705SXin Li PP.getLangOpts().CPlusPlus14
853*67e74705SXin Li ? diag::warn_cxx11_compat_binary_literal
854*67e74705SXin Li : PP.getLangOpts().CPlusPlus
855*67e74705SXin Li ? diag::ext_binary_literal_cxx14
856*67e74705SXin Li : diag::ext_binary_literal);
857*67e74705SXin Li ++s;
858*67e74705SXin Li assert(s < ThisTokEnd && "didn't maximally munch?");
859*67e74705SXin Li radix = 2;
860*67e74705SXin Li DigitsBegin = s;
861*67e74705SXin Li s = SkipBinaryDigits(s);
862*67e74705SXin Li if (s == ThisTokEnd) {
863*67e74705SXin Li // Done.
864*67e74705SXin Li } else if (isHexDigit(*s)) {
865*67e74705SXin Li PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
866*67e74705SXin Li diag::err_invalid_digit) << StringRef(s, 1) << 2;
867*67e74705SXin Li hadError = true;
868*67e74705SXin Li }
869*67e74705SXin Li // Other suffixes will be diagnosed by the caller.
870*67e74705SXin Li return;
871*67e74705SXin Li }
872*67e74705SXin Li
873*67e74705SXin Li // For now, the radix is set to 8. If we discover that we have a
874*67e74705SXin Li // floating point constant, the radix will change to 10. Octal floating
875*67e74705SXin Li // point constants are not permitted (only decimal and hexadecimal).
876*67e74705SXin Li radix = 8;
877*67e74705SXin Li DigitsBegin = s;
878*67e74705SXin Li s = SkipOctalDigits(s);
879*67e74705SXin Li if (s == ThisTokEnd)
880*67e74705SXin Li return; // Done, simple octal number like 01234
881*67e74705SXin Li
882*67e74705SXin Li // If we have some other non-octal digit that *is* a decimal digit, see if
883*67e74705SXin Li // this is part of a floating point number like 094.123 or 09e1.
884*67e74705SXin Li if (isDigit(*s)) {
885*67e74705SXin Li const char *EndDecimal = SkipDigits(s);
886*67e74705SXin Li if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
887*67e74705SXin Li s = EndDecimal;
888*67e74705SXin Li radix = 10;
889*67e74705SXin Li }
890*67e74705SXin Li }
891*67e74705SXin Li
892*67e74705SXin Li ParseDecimalOrOctalCommon(TokLoc);
893*67e74705SXin Li }
894*67e74705SXin Li
alwaysFitsInto64Bits(unsigned Radix,unsigned NumDigits)895*67e74705SXin Li static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
896*67e74705SXin Li switch (Radix) {
897*67e74705SXin Li case 2:
898*67e74705SXin Li return NumDigits <= 64;
899*67e74705SXin Li case 8:
900*67e74705SXin Li return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
901*67e74705SXin Li case 10:
902*67e74705SXin Li return NumDigits <= 19; // floor(log10(2^64))
903*67e74705SXin Li case 16:
904*67e74705SXin Li return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
905*67e74705SXin Li default:
906*67e74705SXin Li llvm_unreachable("impossible Radix");
907*67e74705SXin Li }
908*67e74705SXin Li }
909*67e74705SXin Li
910*67e74705SXin Li /// GetIntegerValue - Convert this numeric literal value to an APInt that
911*67e74705SXin Li /// matches Val's input width. If there is an overflow, set Val to the low bits
912*67e74705SXin Li /// of the result and return true. Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)913*67e74705SXin Li bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
914*67e74705SXin Li // Fast path: Compute a conservative bound on the maximum number of
915*67e74705SXin Li // bits per digit in this radix. If we can't possibly overflow a
916*67e74705SXin Li // uint64 based on that bound then do the simple conversion to
917*67e74705SXin Li // integer. This avoids the expensive overflow checking below, and
918*67e74705SXin Li // handles the common cases that matter (small decimal integers and
919*67e74705SXin Li // hex/octal values which don't overflow).
920*67e74705SXin Li const unsigned NumDigits = SuffixBegin - DigitsBegin;
921*67e74705SXin Li if (alwaysFitsInto64Bits(radix, NumDigits)) {
922*67e74705SXin Li uint64_t N = 0;
923*67e74705SXin Li for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
924*67e74705SXin Li if (!isDigitSeparator(*Ptr))
925*67e74705SXin Li N = N * radix + llvm::hexDigitValue(*Ptr);
926*67e74705SXin Li
927*67e74705SXin Li // This will truncate the value to Val's input width. Simply check
928*67e74705SXin Li // for overflow by comparing.
929*67e74705SXin Li Val = N;
930*67e74705SXin Li return Val.getZExtValue() != N;
931*67e74705SXin Li }
932*67e74705SXin Li
933*67e74705SXin Li Val = 0;
934*67e74705SXin Li const char *Ptr = DigitsBegin;
935*67e74705SXin Li
936*67e74705SXin Li llvm::APInt RadixVal(Val.getBitWidth(), radix);
937*67e74705SXin Li llvm::APInt CharVal(Val.getBitWidth(), 0);
938*67e74705SXin Li llvm::APInt OldVal = Val;
939*67e74705SXin Li
940*67e74705SXin Li bool OverflowOccurred = false;
941*67e74705SXin Li while (Ptr < SuffixBegin) {
942*67e74705SXin Li if (isDigitSeparator(*Ptr)) {
943*67e74705SXin Li ++Ptr;
944*67e74705SXin Li continue;
945*67e74705SXin Li }
946*67e74705SXin Li
947*67e74705SXin Li unsigned C = llvm::hexDigitValue(*Ptr++);
948*67e74705SXin Li
949*67e74705SXin Li // If this letter is out of bound for this radix, reject it.
950*67e74705SXin Li assert(C < radix && "NumericLiteralParser ctor should have rejected this");
951*67e74705SXin Li
952*67e74705SXin Li CharVal = C;
953*67e74705SXin Li
954*67e74705SXin Li // Add the digit to the value in the appropriate radix. If adding in digits
955*67e74705SXin Li // made the value smaller, then this overflowed.
956*67e74705SXin Li OldVal = Val;
957*67e74705SXin Li
958*67e74705SXin Li // Multiply by radix, did overflow occur on the multiply?
959*67e74705SXin Li Val *= RadixVal;
960*67e74705SXin Li OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
961*67e74705SXin Li
962*67e74705SXin Li // Add value, did overflow occur on the value?
963*67e74705SXin Li // (a + b) ult b <=> overflow
964*67e74705SXin Li Val += CharVal;
965*67e74705SXin Li OverflowOccurred |= Val.ult(CharVal);
966*67e74705SXin Li }
967*67e74705SXin Li return OverflowOccurred;
968*67e74705SXin Li }
969*67e74705SXin Li
970*67e74705SXin Li llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)971*67e74705SXin Li NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
972*67e74705SXin Li using llvm::APFloat;
973*67e74705SXin Li
974*67e74705SXin Li unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
975*67e74705SXin Li
976*67e74705SXin Li llvm::SmallString<16> Buffer;
977*67e74705SXin Li StringRef Str(ThisTokBegin, n);
978*67e74705SXin Li if (Str.find('\'') != StringRef::npos) {
979*67e74705SXin Li Buffer.reserve(n);
980*67e74705SXin Li std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
981*67e74705SXin Li &isDigitSeparator);
982*67e74705SXin Li Str = Buffer;
983*67e74705SXin Li }
984*67e74705SXin Li
985*67e74705SXin Li return Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
986*67e74705SXin Li }
987*67e74705SXin Li
988*67e74705SXin Li
989*67e74705SXin Li /// \verbatim
990*67e74705SXin Li /// user-defined-character-literal: [C++11 lex.ext]
991*67e74705SXin Li /// character-literal ud-suffix
992*67e74705SXin Li /// ud-suffix:
993*67e74705SXin Li /// identifier
994*67e74705SXin Li /// character-literal: [C++11 lex.ccon]
995*67e74705SXin Li /// ' c-char-sequence '
996*67e74705SXin Li /// u' c-char-sequence '
997*67e74705SXin Li /// U' c-char-sequence '
998*67e74705SXin Li /// L' c-char-sequence '
999*67e74705SXin Li /// u8' c-char-sequence ' [C++1z lex.ccon]
1000*67e74705SXin Li /// c-char-sequence:
1001*67e74705SXin Li /// c-char
1002*67e74705SXin Li /// c-char-sequence c-char
1003*67e74705SXin Li /// c-char:
1004*67e74705SXin Li /// any member of the source character set except the single-quote ',
1005*67e74705SXin Li /// backslash \, or new-line character
1006*67e74705SXin Li /// escape-sequence
1007*67e74705SXin Li /// universal-character-name
1008*67e74705SXin Li /// escape-sequence:
1009*67e74705SXin Li /// simple-escape-sequence
1010*67e74705SXin Li /// octal-escape-sequence
1011*67e74705SXin Li /// hexadecimal-escape-sequence
1012*67e74705SXin Li /// simple-escape-sequence:
1013*67e74705SXin Li /// one of \' \" \? \\ \a \b \f \n \r \t \v
1014*67e74705SXin Li /// octal-escape-sequence:
1015*67e74705SXin Li /// \ octal-digit
1016*67e74705SXin Li /// \ octal-digit octal-digit
1017*67e74705SXin Li /// \ octal-digit octal-digit octal-digit
1018*67e74705SXin Li /// hexadecimal-escape-sequence:
1019*67e74705SXin Li /// \x hexadecimal-digit
1020*67e74705SXin Li /// hexadecimal-escape-sequence hexadecimal-digit
1021*67e74705SXin Li /// universal-character-name: [C++11 lex.charset]
1022*67e74705SXin Li /// \u hex-quad
1023*67e74705SXin Li /// \U hex-quad hex-quad
1024*67e74705SXin Li /// hex-quad:
1025*67e74705SXin Li /// hex-digit hex-digit hex-digit hex-digit
1026*67e74705SXin Li /// \endverbatim
1027*67e74705SXin Li ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)1028*67e74705SXin Li CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1029*67e74705SXin Li SourceLocation Loc, Preprocessor &PP,
1030*67e74705SXin Li tok::TokenKind kind) {
1031*67e74705SXin Li // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1032*67e74705SXin Li HadError = false;
1033*67e74705SXin Li
1034*67e74705SXin Li Kind = kind;
1035*67e74705SXin Li
1036*67e74705SXin Li const char *TokBegin = begin;
1037*67e74705SXin Li
1038*67e74705SXin Li // Skip over wide character determinant.
1039*67e74705SXin Li if (Kind != tok::char_constant)
1040*67e74705SXin Li ++begin;
1041*67e74705SXin Li if (Kind == tok::utf8_char_constant)
1042*67e74705SXin Li ++begin;
1043*67e74705SXin Li
1044*67e74705SXin Li // Skip over the entry quote.
1045*67e74705SXin Li assert(begin[0] == '\'' && "Invalid token lexed");
1046*67e74705SXin Li ++begin;
1047*67e74705SXin Li
1048*67e74705SXin Li // Remove an optional ud-suffix.
1049*67e74705SXin Li if (end[-1] != '\'') {
1050*67e74705SXin Li const char *UDSuffixEnd = end;
1051*67e74705SXin Li do {
1052*67e74705SXin Li --end;
1053*67e74705SXin Li } while (end[-1] != '\'');
1054*67e74705SXin Li // FIXME: Don't bother with this if !tok.hasUCN().
1055*67e74705SXin Li expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1056*67e74705SXin Li UDSuffixOffset = end - TokBegin;
1057*67e74705SXin Li }
1058*67e74705SXin Li
1059*67e74705SXin Li // Trim the ending quote.
1060*67e74705SXin Li assert(end != begin && "Invalid token lexed");
1061*67e74705SXin Li --end;
1062*67e74705SXin Li
1063*67e74705SXin Li // FIXME: The "Value" is an uint64_t so we can handle char literals of
1064*67e74705SXin Li // up to 64-bits.
1065*67e74705SXin Li // FIXME: This extensively assumes that 'char' is 8-bits.
1066*67e74705SXin Li assert(PP.getTargetInfo().getCharWidth() == 8 &&
1067*67e74705SXin Li "Assumes char is 8 bits");
1068*67e74705SXin Li assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1069*67e74705SXin Li (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1070*67e74705SXin Li "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1071*67e74705SXin Li assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1072*67e74705SXin Li "Assumes sizeof(wchar) on target is <= 64");
1073*67e74705SXin Li
1074*67e74705SXin Li SmallVector<uint32_t, 4> codepoint_buffer;
1075*67e74705SXin Li codepoint_buffer.resize(end - begin);
1076*67e74705SXin Li uint32_t *buffer_begin = &codepoint_buffer.front();
1077*67e74705SXin Li uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1078*67e74705SXin Li
1079*67e74705SXin Li // Unicode escapes representing characters that cannot be correctly
1080*67e74705SXin Li // represented in a single code unit are disallowed in character literals
1081*67e74705SXin Li // by this implementation.
1082*67e74705SXin Li uint32_t largest_character_for_kind;
1083*67e74705SXin Li if (tok::wide_char_constant == Kind) {
1084*67e74705SXin Li largest_character_for_kind =
1085*67e74705SXin Li 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1086*67e74705SXin Li } else if (tok::utf8_char_constant == Kind) {
1087*67e74705SXin Li largest_character_for_kind = 0x7F;
1088*67e74705SXin Li } else if (tok::utf16_char_constant == Kind) {
1089*67e74705SXin Li largest_character_for_kind = 0xFFFF;
1090*67e74705SXin Li } else if (tok::utf32_char_constant == Kind) {
1091*67e74705SXin Li largest_character_for_kind = 0x10FFFF;
1092*67e74705SXin Li } else {
1093*67e74705SXin Li largest_character_for_kind = 0x7Fu;
1094*67e74705SXin Li }
1095*67e74705SXin Li
1096*67e74705SXin Li while (begin != end) {
1097*67e74705SXin Li // Is this a span of non-escape characters?
1098*67e74705SXin Li if (begin[0] != '\\') {
1099*67e74705SXin Li char const *start = begin;
1100*67e74705SXin Li do {
1101*67e74705SXin Li ++begin;
1102*67e74705SXin Li } while (begin != end && *begin != '\\');
1103*67e74705SXin Li
1104*67e74705SXin Li char const *tmp_in_start = start;
1105*67e74705SXin Li uint32_t *tmp_out_start = buffer_begin;
1106*67e74705SXin Li ConversionResult res =
1107*67e74705SXin Li ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
1108*67e74705SXin Li reinterpret_cast<UTF8 const *>(begin),
1109*67e74705SXin Li &buffer_begin, buffer_end, strictConversion);
1110*67e74705SXin Li if (res != conversionOK) {
1111*67e74705SXin Li // If we see bad encoding for unprefixed character literals, warn and
1112*67e74705SXin Li // simply copy the byte values, for compatibility with gcc and
1113*67e74705SXin Li // older versions of clang.
1114*67e74705SXin Li bool NoErrorOnBadEncoding = isAscii();
1115*67e74705SXin Li unsigned Msg = diag::err_bad_character_encoding;
1116*67e74705SXin Li if (NoErrorOnBadEncoding)
1117*67e74705SXin Li Msg = diag::warn_bad_character_encoding;
1118*67e74705SXin Li PP.Diag(Loc, Msg);
1119*67e74705SXin Li if (NoErrorOnBadEncoding) {
1120*67e74705SXin Li start = tmp_in_start;
1121*67e74705SXin Li buffer_begin = tmp_out_start;
1122*67e74705SXin Li for (; start != begin; ++start, ++buffer_begin)
1123*67e74705SXin Li *buffer_begin = static_cast<uint8_t>(*start);
1124*67e74705SXin Li } else {
1125*67e74705SXin Li HadError = true;
1126*67e74705SXin Li }
1127*67e74705SXin Li } else {
1128*67e74705SXin Li for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1129*67e74705SXin Li if (*tmp_out_start > largest_character_for_kind) {
1130*67e74705SXin Li HadError = true;
1131*67e74705SXin Li PP.Diag(Loc, diag::err_character_too_large);
1132*67e74705SXin Li }
1133*67e74705SXin Li }
1134*67e74705SXin Li }
1135*67e74705SXin Li
1136*67e74705SXin Li continue;
1137*67e74705SXin Li }
1138*67e74705SXin Li // Is this a Universal Character Name escape?
1139*67e74705SXin Li if (begin[1] == 'u' || begin[1] == 'U') {
1140*67e74705SXin Li unsigned short UcnLen = 0;
1141*67e74705SXin Li if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1142*67e74705SXin Li FullSourceLoc(Loc, PP.getSourceManager()),
1143*67e74705SXin Li &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1144*67e74705SXin Li HadError = true;
1145*67e74705SXin Li } else if (*buffer_begin > largest_character_for_kind) {
1146*67e74705SXin Li HadError = true;
1147*67e74705SXin Li PP.Diag(Loc, diag::err_character_too_large);
1148*67e74705SXin Li }
1149*67e74705SXin Li
1150*67e74705SXin Li ++buffer_begin;
1151*67e74705SXin Li continue;
1152*67e74705SXin Li }
1153*67e74705SXin Li unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1154*67e74705SXin Li uint64_t result =
1155*67e74705SXin Li ProcessCharEscape(TokBegin, begin, end, HadError,
1156*67e74705SXin Li FullSourceLoc(Loc,PP.getSourceManager()),
1157*67e74705SXin Li CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1158*67e74705SXin Li *buffer_begin++ = result;
1159*67e74705SXin Li }
1160*67e74705SXin Li
1161*67e74705SXin Li unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1162*67e74705SXin Li
1163*67e74705SXin Li if (NumCharsSoFar > 1) {
1164*67e74705SXin Li if (isWide())
1165*67e74705SXin Li PP.Diag(Loc, diag::warn_extraneous_char_constant);
1166*67e74705SXin Li else if (isAscii() && NumCharsSoFar == 4)
1167*67e74705SXin Li PP.Diag(Loc, diag::ext_four_char_character_literal);
1168*67e74705SXin Li else if (isAscii())
1169*67e74705SXin Li PP.Diag(Loc, diag::ext_multichar_character_literal);
1170*67e74705SXin Li else
1171*67e74705SXin Li PP.Diag(Loc, diag::err_multichar_utf_character_literal);
1172*67e74705SXin Li IsMultiChar = true;
1173*67e74705SXin Li } else {
1174*67e74705SXin Li IsMultiChar = false;
1175*67e74705SXin Li }
1176*67e74705SXin Li
1177*67e74705SXin Li llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1178*67e74705SXin Li
1179*67e74705SXin Li // Narrow character literals act as though their value is concatenated
1180*67e74705SXin Li // in this implementation, but warn on overflow.
1181*67e74705SXin Li bool multi_char_too_long = false;
1182*67e74705SXin Li if (isAscii() && isMultiChar()) {
1183*67e74705SXin Li LitVal = 0;
1184*67e74705SXin Li for (size_t i = 0; i < NumCharsSoFar; ++i) {
1185*67e74705SXin Li // check for enough leading zeros to shift into
1186*67e74705SXin Li multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1187*67e74705SXin Li LitVal <<= 8;
1188*67e74705SXin Li LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1189*67e74705SXin Li }
1190*67e74705SXin Li } else if (NumCharsSoFar > 0) {
1191*67e74705SXin Li // otherwise just take the last character
1192*67e74705SXin Li LitVal = buffer_begin[-1];
1193*67e74705SXin Li }
1194*67e74705SXin Li
1195*67e74705SXin Li if (!HadError && multi_char_too_long) {
1196*67e74705SXin Li PP.Diag(Loc, diag::warn_char_constant_too_large);
1197*67e74705SXin Li }
1198*67e74705SXin Li
1199*67e74705SXin Li // Transfer the value from APInt to uint64_t
1200*67e74705SXin Li Value = LitVal.getZExtValue();
1201*67e74705SXin Li
1202*67e74705SXin Li // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1203*67e74705SXin Li // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1204*67e74705SXin Li // character constants are not sign extended in the this implementation:
1205*67e74705SXin Li // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1206*67e74705SXin Li if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
1207*67e74705SXin Li PP.getLangOpts().CharIsSigned)
1208*67e74705SXin Li Value = (signed char)Value;
1209*67e74705SXin Li }
1210*67e74705SXin Li
1211*67e74705SXin Li /// \verbatim
1212*67e74705SXin Li /// string-literal: [C++0x lex.string]
1213*67e74705SXin Li /// encoding-prefix " [s-char-sequence] "
1214*67e74705SXin Li /// encoding-prefix R raw-string
1215*67e74705SXin Li /// encoding-prefix:
1216*67e74705SXin Li /// u8
1217*67e74705SXin Li /// u
1218*67e74705SXin Li /// U
1219*67e74705SXin Li /// L
1220*67e74705SXin Li /// s-char-sequence:
1221*67e74705SXin Li /// s-char
1222*67e74705SXin Li /// s-char-sequence s-char
1223*67e74705SXin Li /// s-char:
1224*67e74705SXin Li /// any member of the source character set except the double-quote ",
1225*67e74705SXin Li /// backslash \, or new-line character
1226*67e74705SXin Li /// escape-sequence
1227*67e74705SXin Li /// universal-character-name
1228*67e74705SXin Li /// raw-string:
1229*67e74705SXin Li /// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1230*67e74705SXin Li /// r-char-sequence:
1231*67e74705SXin Li /// r-char
1232*67e74705SXin Li /// r-char-sequence r-char
1233*67e74705SXin Li /// r-char:
1234*67e74705SXin Li /// any member of the source character set, except a right parenthesis )
1235*67e74705SXin Li /// followed by the initial d-char-sequence (which may be empty)
1236*67e74705SXin Li /// followed by a double quote ".
1237*67e74705SXin Li /// d-char-sequence:
1238*67e74705SXin Li /// d-char
1239*67e74705SXin Li /// d-char-sequence d-char
1240*67e74705SXin Li /// d-char:
1241*67e74705SXin Li /// any member of the basic source character set except:
1242*67e74705SXin Li /// space, the left parenthesis (, the right parenthesis ),
1243*67e74705SXin Li /// the backslash \, and the control characters representing horizontal
1244*67e74705SXin Li /// tab, vertical tab, form feed, and newline.
1245*67e74705SXin Li /// escape-sequence: [C++0x lex.ccon]
1246*67e74705SXin Li /// simple-escape-sequence
1247*67e74705SXin Li /// octal-escape-sequence
1248*67e74705SXin Li /// hexadecimal-escape-sequence
1249*67e74705SXin Li /// simple-escape-sequence:
1250*67e74705SXin Li /// one of \' \" \? \\ \a \b \f \n \r \t \v
1251*67e74705SXin Li /// octal-escape-sequence:
1252*67e74705SXin Li /// \ octal-digit
1253*67e74705SXin Li /// \ octal-digit octal-digit
1254*67e74705SXin Li /// \ octal-digit octal-digit octal-digit
1255*67e74705SXin Li /// hexadecimal-escape-sequence:
1256*67e74705SXin Li /// \x hexadecimal-digit
1257*67e74705SXin Li /// hexadecimal-escape-sequence hexadecimal-digit
1258*67e74705SXin Li /// universal-character-name:
1259*67e74705SXin Li /// \u hex-quad
1260*67e74705SXin Li /// \U hex-quad hex-quad
1261*67e74705SXin Li /// hex-quad:
1262*67e74705SXin Li /// hex-digit hex-digit hex-digit hex-digit
1263*67e74705SXin Li /// \endverbatim
1264*67e74705SXin Li ///
1265*67e74705SXin Li StringLiteralParser::
StringLiteralParser(ArrayRef<Token> StringToks,Preprocessor & PP,bool Complain)1266*67e74705SXin Li StringLiteralParser(ArrayRef<Token> StringToks,
1267*67e74705SXin Li Preprocessor &PP, bool Complain)
1268*67e74705SXin Li : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1269*67e74705SXin Li Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
1270*67e74705SXin Li MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1271*67e74705SXin Li ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1272*67e74705SXin Li init(StringToks);
1273*67e74705SXin Li }
1274*67e74705SXin Li
init(ArrayRef<Token> StringToks)1275*67e74705SXin Li void StringLiteralParser::init(ArrayRef<Token> StringToks){
1276*67e74705SXin Li // The literal token may have come from an invalid source location (e.g. due
1277*67e74705SXin Li // to a PCH error), in which case the token length will be 0.
1278*67e74705SXin Li if (StringToks.empty() || StringToks[0].getLength() < 2)
1279*67e74705SXin Li return DiagnoseLexingError(SourceLocation());
1280*67e74705SXin Li
1281*67e74705SXin Li // Scan all of the string portions, remember the max individual token length,
1282*67e74705SXin Li // computing a bound on the concatenated string length, and see whether any
1283*67e74705SXin Li // piece is a wide-string. If any of the string portions is a wide-string
1284*67e74705SXin Li // literal, the result is a wide-string literal [C99 6.4.5p4].
1285*67e74705SXin Li assert(!StringToks.empty() && "expected at least one token");
1286*67e74705SXin Li MaxTokenLength = StringToks[0].getLength();
1287*67e74705SXin Li assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1288*67e74705SXin Li SizeBound = StringToks[0].getLength()-2; // -2 for "".
1289*67e74705SXin Li Kind = StringToks[0].getKind();
1290*67e74705SXin Li
1291*67e74705SXin Li hadError = false;
1292*67e74705SXin Li
1293*67e74705SXin Li // Implement Translation Phase #6: concatenation of string literals
1294*67e74705SXin Li /// (C99 5.1.1.2p1). The common case is only one string fragment.
1295*67e74705SXin Li for (unsigned i = 1; i != StringToks.size(); ++i) {
1296*67e74705SXin Li if (StringToks[i].getLength() < 2)
1297*67e74705SXin Li return DiagnoseLexingError(StringToks[i].getLocation());
1298*67e74705SXin Li
1299*67e74705SXin Li // The string could be shorter than this if it needs cleaning, but this is a
1300*67e74705SXin Li // reasonable bound, which is all we need.
1301*67e74705SXin Li assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1302*67e74705SXin Li SizeBound += StringToks[i].getLength()-2; // -2 for "".
1303*67e74705SXin Li
1304*67e74705SXin Li // Remember maximum string piece length.
1305*67e74705SXin Li if (StringToks[i].getLength() > MaxTokenLength)
1306*67e74705SXin Li MaxTokenLength = StringToks[i].getLength();
1307*67e74705SXin Li
1308*67e74705SXin Li // Remember if we see any wide or utf-8/16/32 strings.
1309*67e74705SXin Li // Also check for illegal concatenations.
1310*67e74705SXin Li if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1311*67e74705SXin Li if (isAscii()) {
1312*67e74705SXin Li Kind = StringToks[i].getKind();
1313*67e74705SXin Li } else {
1314*67e74705SXin Li if (Diags)
1315*67e74705SXin Li Diags->Report(StringToks[i].getLocation(),
1316*67e74705SXin Li diag::err_unsupported_string_concat);
1317*67e74705SXin Li hadError = true;
1318*67e74705SXin Li }
1319*67e74705SXin Li }
1320*67e74705SXin Li }
1321*67e74705SXin Li
1322*67e74705SXin Li // Include space for the null terminator.
1323*67e74705SXin Li ++SizeBound;
1324*67e74705SXin Li
1325*67e74705SXin Li // TODO: K&R warning: "traditional C rejects string constant concatenation"
1326*67e74705SXin Li
1327*67e74705SXin Li // Get the width in bytes of char/wchar_t/char16_t/char32_t
1328*67e74705SXin Li CharByteWidth = getCharWidth(Kind, Target);
1329*67e74705SXin Li assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1330*67e74705SXin Li CharByteWidth /= 8;
1331*67e74705SXin Li
1332*67e74705SXin Li // The output buffer size needs to be large enough to hold wide characters.
1333*67e74705SXin Li // This is a worst-case assumption which basically corresponds to L"" "long".
1334*67e74705SXin Li SizeBound *= CharByteWidth;
1335*67e74705SXin Li
1336*67e74705SXin Li // Size the temporary buffer to hold the result string data.
1337*67e74705SXin Li ResultBuf.resize(SizeBound);
1338*67e74705SXin Li
1339*67e74705SXin Li // Likewise, but for each string piece.
1340*67e74705SXin Li SmallString<512> TokenBuf;
1341*67e74705SXin Li TokenBuf.resize(MaxTokenLength);
1342*67e74705SXin Li
1343*67e74705SXin Li // Loop over all the strings, getting their spelling, and expanding them to
1344*67e74705SXin Li // wide strings as appropriate.
1345*67e74705SXin Li ResultPtr = &ResultBuf[0]; // Next byte to fill in.
1346*67e74705SXin Li
1347*67e74705SXin Li Pascal = false;
1348*67e74705SXin Li
1349*67e74705SXin Li SourceLocation UDSuffixTokLoc;
1350*67e74705SXin Li
1351*67e74705SXin Li for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1352*67e74705SXin Li const char *ThisTokBuf = &TokenBuf[0];
1353*67e74705SXin Li // Get the spelling of the token, which eliminates trigraphs, etc. We know
1354*67e74705SXin Li // that ThisTokBuf points to a buffer that is big enough for the whole token
1355*67e74705SXin Li // and 'spelled' tokens can only shrink.
1356*67e74705SXin Li bool StringInvalid = false;
1357*67e74705SXin Li unsigned ThisTokLen =
1358*67e74705SXin Li Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1359*67e74705SXin Li &StringInvalid);
1360*67e74705SXin Li if (StringInvalid)
1361*67e74705SXin Li return DiagnoseLexingError(StringToks[i].getLocation());
1362*67e74705SXin Li
1363*67e74705SXin Li const char *ThisTokBegin = ThisTokBuf;
1364*67e74705SXin Li const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1365*67e74705SXin Li
1366*67e74705SXin Li // Remove an optional ud-suffix.
1367*67e74705SXin Li if (ThisTokEnd[-1] != '"') {
1368*67e74705SXin Li const char *UDSuffixEnd = ThisTokEnd;
1369*67e74705SXin Li do {
1370*67e74705SXin Li --ThisTokEnd;
1371*67e74705SXin Li } while (ThisTokEnd[-1] != '"');
1372*67e74705SXin Li
1373*67e74705SXin Li StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1374*67e74705SXin Li
1375*67e74705SXin Li if (UDSuffixBuf.empty()) {
1376*67e74705SXin Li if (StringToks[i].hasUCN())
1377*67e74705SXin Li expandUCNs(UDSuffixBuf, UDSuffix);
1378*67e74705SXin Li else
1379*67e74705SXin Li UDSuffixBuf.assign(UDSuffix);
1380*67e74705SXin Li UDSuffixToken = i;
1381*67e74705SXin Li UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1382*67e74705SXin Li UDSuffixTokLoc = StringToks[i].getLocation();
1383*67e74705SXin Li } else {
1384*67e74705SXin Li SmallString<32> ExpandedUDSuffix;
1385*67e74705SXin Li if (StringToks[i].hasUCN()) {
1386*67e74705SXin Li expandUCNs(ExpandedUDSuffix, UDSuffix);
1387*67e74705SXin Li UDSuffix = ExpandedUDSuffix;
1388*67e74705SXin Li }
1389*67e74705SXin Li
1390*67e74705SXin Li // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1391*67e74705SXin Li // result of a concatenation involving at least one user-defined-string-
1392*67e74705SXin Li // literal, all the participating user-defined-string-literals shall
1393*67e74705SXin Li // have the same ud-suffix.
1394*67e74705SXin Li if (UDSuffixBuf != UDSuffix) {
1395*67e74705SXin Li if (Diags) {
1396*67e74705SXin Li SourceLocation TokLoc = StringToks[i].getLocation();
1397*67e74705SXin Li Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1398*67e74705SXin Li << UDSuffixBuf << UDSuffix
1399*67e74705SXin Li << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1400*67e74705SXin Li << SourceRange(TokLoc, TokLoc);
1401*67e74705SXin Li }
1402*67e74705SXin Li hadError = true;
1403*67e74705SXin Li }
1404*67e74705SXin Li }
1405*67e74705SXin Li }
1406*67e74705SXin Li
1407*67e74705SXin Li // Strip the end quote.
1408*67e74705SXin Li --ThisTokEnd;
1409*67e74705SXin Li
1410*67e74705SXin Li // TODO: Input character set mapping support.
1411*67e74705SXin Li
1412*67e74705SXin Li // Skip marker for wide or unicode strings.
1413*67e74705SXin Li if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1414*67e74705SXin Li ++ThisTokBuf;
1415*67e74705SXin Li // Skip 8 of u8 marker for utf8 strings.
1416*67e74705SXin Li if (ThisTokBuf[0] == '8')
1417*67e74705SXin Li ++ThisTokBuf;
1418*67e74705SXin Li }
1419*67e74705SXin Li
1420*67e74705SXin Li // Check for raw string
1421*67e74705SXin Li if (ThisTokBuf[0] == 'R') {
1422*67e74705SXin Li ThisTokBuf += 2; // skip R"
1423*67e74705SXin Li
1424*67e74705SXin Li const char *Prefix = ThisTokBuf;
1425*67e74705SXin Li while (ThisTokBuf[0] != '(')
1426*67e74705SXin Li ++ThisTokBuf;
1427*67e74705SXin Li ++ThisTokBuf; // skip '('
1428*67e74705SXin Li
1429*67e74705SXin Li // Remove same number of characters from the end
1430*67e74705SXin Li ThisTokEnd -= ThisTokBuf - Prefix;
1431*67e74705SXin Li assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1432*67e74705SXin Li
1433*67e74705SXin Li // C++14 [lex.string]p4: A source-file new-line in a raw string literal
1434*67e74705SXin Li // results in a new-line in the resulting execution string-literal.
1435*67e74705SXin Li StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
1436*67e74705SXin Li while (!RemainingTokenSpan.empty()) {
1437*67e74705SXin Li // Split the string literal on \r\n boundaries.
1438*67e74705SXin Li size_t CRLFPos = RemainingTokenSpan.find("\r\n");
1439*67e74705SXin Li StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
1440*67e74705SXin Li StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
1441*67e74705SXin Li
1442*67e74705SXin Li // Copy everything before the \r\n sequence into the string literal.
1443*67e74705SXin Li if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
1444*67e74705SXin Li hadError = true;
1445*67e74705SXin Li
1446*67e74705SXin Li // Point into the \n inside the \r\n sequence and operate on the
1447*67e74705SXin Li // remaining portion of the literal.
1448*67e74705SXin Li RemainingTokenSpan = AfterCRLF.substr(1);
1449*67e74705SXin Li }
1450*67e74705SXin Li } else {
1451*67e74705SXin Li if (ThisTokBuf[0] != '"') {
1452*67e74705SXin Li // The file may have come from PCH and then changed after loading the
1453*67e74705SXin Li // PCH; Fail gracefully.
1454*67e74705SXin Li return DiagnoseLexingError(StringToks[i].getLocation());
1455*67e74705SXin Li }
1456*67e74705SXin Li ++ThisTokBuf; // skip "
1457*67e74705SXin Li
1458*67e74705SXin Li // Check if this is a pascal string
1459*67e74705SXin Li if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1460*67e74705SXin Li ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1461*67e74705SXin Li
1462*67e74705SXin Li // If the \p sequence is found in the first token, we have a pascal string
1463*67e74705SXin Li // Otherwise, if we already have a pascal string, ignore the first \p
1464*67e74705SXin Li if (i == 0) {
1465*67e74705SXin Li ++ThisTokBuf;
1466*67e74705SXin Li Pascal = true;
1467*67e74705SXin Li } else if (Pascal)
1468*67e74705SXin Li ThisTokBuf += 2;
1469*67e74705SXin Li }
1470*67e74705SXin Li
1471*67e74705SXin Li while (ThisTokBuf != ThisTokEnd) {
1472*67e74705SXin Li // Is this a span of non-escape characters?
1473*67e74705SXin Li if (ThisTokBuf[0] != '\\') {
1474*67e74705SXin Li const char *InStart = ThisTokBuf;
1475*67e74705SXin Li do {
1476*67e74705SXin Li ++ThisTokBuf;
1477*67e74705SXin Li } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1478*67e74705SXin Li
1479*67e74705SXin Li // Copy the character span over.
1480*67e74705SXin Li if (CopyStringFragment(StringToks[i], ThisTokBegin,
1481*67e74705SXin Li StringRef(InStart, ThisTokBuf - InStart)))
1482*67e74705SXin Li hadError = true;
1483*67e74705SXin Li continue;
1484*67e74705SXin Li }
1485*67e74705SXin Li // Is this a Universal Character Name escape?
1486*67e74705SXin Li if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1487*67e74705SXin Li EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1488*67e74705SXin Li ResultPtr, hadError,
1489*67e74705SXin Li FullSourceLoc(StringToks[i].getLocation(), SM),
1490*67e74705SXin Li CharByteWidth, Diags, Features);
1491*67e74705SXin Li continue;
1492*67e74705SXin Li }
1493*67e74705SXin Li // Otherwise, this is a non-UCN escape character. Process it.
1494*67e74705SXin Li unsigned ResultChar =
1495*67e74705SXin Li ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1496*67e74705SXin Li FullSourceLoc(StringToks[i].getLocation(), SM),
1497*67e74705SXin Li CharByteWidth*8, Diags, Features);
1498*67e74705SXin Li
1499*67e74705SXin Li if (CharByteWidth == 4) {
1500*67e74705SXin Li // FIXME: Make the type of the result buffer correct instead of
1501*67e74705SXin Li // using reinterpret_cast.
1502*67e74705SXin Li UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
1503*67e74705SXin Li *ResultWidePtr = ResultChar;
1504*67e74705SXin Li ResultPtr += 4;
1505*67e74705SXin Li } else if (CharByteWidth == 2) {
1506*67e74705SXin Li // FIXME: Make the type of the result buffer correct instead of
1507*67e74705SXin Li // using reinterpret_cast.
1508*67e74705SXin Li UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
1509*67e74705SXin Li *ResultWidePtr = ResultChar & 0xFFFF;
1510*67e74705SXin Li ResultPtr += 2;
1511*67e74705SXin Li } else {
1512*67e74705SXin Li assert(CharByteWidth == 1 && "Unexpected char width");
1513*67e74705SXin Li *ResultPtr++ = ResultChar & 0xFF;
1514*67e74705SXin Li }
1515*67e74705SXin Li }
1516*67e74705SXin Li }
1517*67e74705SXin Li }
1518*67e74705SXin Li
1519*67e74705SXin Li if (Pascal) {
1520*67e74705SXin Li if (CharByteWidth == 4) {
1521*67e74705SXin Li // FIXME: Make the type of the result buffer correct instead of
1522*67e74705SXin Li // using reinterpret_cast.
1523*67e74705SXin Li UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1524*67e74705SXin Li ResultWidePtr[0] = GetNumStringChars() - 1;
1525*67e74705SXin Li } else if (CharByteWidth == 2) {
1526*67e74705SXin Li // FIXME: Make the type of the result buffer correct instead of
1527*67e74705SXin Li // using reinterpret_cast.
1528*67e74705SXin Li UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1529*67e74705SXin Li ResultWidePtr[0] = GetNumStringChars() - 1;
1530*67e74705SXin Li } else {
1531*67e74705SXin Li assert(CharByteWidth == 1 && "Unexpected char width");
1532*67e74705SXin Li ResultBuf[0] = GetNumStringChars() - 1;
1533*67e74705SXin Li }
1534*67e74705SXin Li
1535*67e74705SXin Li // Verify that pascal strings aren't too large.
1536*67e74705SXin Li if (GetStringLength() > 256) {
1537*67e74705SXin Li if (Diags)
1538*67e74705SXin Li Diags->Report(StringToks.front().getLocation(),
1539*67e74705SXin Li diag::err_pascal_string_too_long)
1540*67e74705SXin Li << SourceRange(StringToks.front().getLocation(),
1541*67e74705SXin Li StringToks.back().getLocation());
1542*67e74705SXin Li hadError = true;
1543*67e74705SXin Li return;
1544*67e74705SXin Li }
1545*67e74705SXin Li } else if (Diags) {
1546*67e74705SXin Li // Complain if this string literal has too many characters.
1547*67e74705SXin Li unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1548*67e74705SXin Li
1549*67e74705SXin Li if (GetNumStringChars() > MaxChars)
1550*67e74705SXin Li Diags->Report(StringToks.front().getLocation(),
1551*67e74705SXin Li diag::ext_string_too_long)
1552*67e74705SXin Li << GetNumStringChars() << MaxChars
1553*67e74705SXin Li << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1554*67e74705SXin Li << SourceRange(StringToks.front().getLocation(),
1555*67e74705SXin Li StringToks.back().getLocation());
1556*67e74705SXin Li }
1557*67e74705SXin Li }
1558*67e74705SXin Li
resyncUTF8(const char * Err,const char * End)1559*67e74705SXin Li static const char *resyncUTF8(const char *Err, const char *End) {
1560*67e74705SXin Li if (Err == End)
1561*67e74705SXin Li return End;
1562*67e74705SXin Li End = Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End-Err);
1563*67e74705SXin Li while (++Err != End && (*Err & 0xC0) == 0x80)
1564*67e74705SXin Li ;
1565*67e74705SXin Li return Err;
1566*67e74705SXin Li }
1567*67e74705SXin Li
1568*67e74705SXin Li /// \brief This function copies from Fragment, which is a sequence of bytes
1569*67e74705SXin Li /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1570*67e74705SXin Li /// Performs widening for multi-byte characters.
CopyStringFragment(const Token & Tok,const char * TokBegin,StringRef Fragment)1571*67e74705SXin Li bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1572*67e74705SXin Li const char *TokBegin,
1573*67e74705SXin Li StringRef Fragment) {
1574*67e74705SXin Li const UTF8 *ErrorPtrTmp;
1575*67e74705SXin Li if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1576*67e74705SXin Li return false;
1577*67e74705SXin Li
1578*67e74705SXin Li // If we see bad encoding for unprefixed string literals, warn and
1579*67e74705SXin Li // simply copy the byte values, for compatibility with gcc and older
1580*67e74705SXin Li // versions of clang.
1581*67e74705SXin Li bool NoErrorOnBadEncoding = isAscii();
1582*67e74705SXin Li if (NoErrorOnBadEncoding) {
1583*67e74705SXin Li memcpy(ResultPtr, Fragment.data(), Fragment.size());
1584*67e74705SXin Li ResultPtr += Fragment.size();
1585*67e74705SXin Li }
1586*67e74705SXin Li
1587*67e74705SXin Li if (Diags) {
1588*67e74705SXin Li const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1589*67e74705SXin Li
1590*67e74705SXin Li FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1591*67e74705SXin Li const DiagnosticBuilder &Builder =
1592*67e74705SXin Li Diag(Diags, Features, SourceLoc, TokBegin,
1593*67e74705SXin Li ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1594*67e74705SXin Li NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1595*67e74705SXin Li : diag::err_bad_string_encoding);
1596*67e74705SXin Li
1597*67e74705SXin Li const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1598*67e74705SXin Li StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1599*67e74705SXin Li
1600*67e74705SXin Li // Decode into a dummy buffer.
1601*67e74705SXin Li SmallString<512> Dummy;
1602*67e74705SXin Li Dummy.reserve(Fragment.size() * CharByteWidth);
1603*67e74705SXin Li char *Ptr = Dummy.data();
1604*67e74705SXin Li
1605*67e74705SXin Li while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
1606*67e74705SXin Li const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1607*67e74705SXin Li NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1608*67e74705SXin Li Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1609*67e74705SXin Li ErrorPtr, NextStart);
1610*67e74705SXin Li NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1611*67e74705SXin Li }
1612*67e74705SXin Li }
1613*67e74705SXin Li return !NoErrorOnBadEncoding;
1614*67e74705SXin Li }
1615*67e74705SXin Li
DiagnoseLexingError(SourceLocation Loc)1616*67e74705SXin Li void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1617*67e74705SXin Li hadError = true;
1618*67e74705SXin Li if (Diags)
1619*67e74705SXin Li Diags->Report(Loc, diag::err_lexing_string);
1620*67e74705SXin Li }
1621*67e74705SXin Li
1622*67e74705SXin Li /// getOffsetOfStringByte - This function returns the offset of the
1623*67e74705SXin Li /// specified byte of the string data represented by Token. This handles
1624*67e74705SXin Li /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const1625*67e74705SXin Li unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1626*67e74705SXin Li unsigned ByteNo) const {
1627*67e74705SXin Li // Get the spelling of the token.
1628*67e74705SXin Li SmallString<32> SpellingBuffer;
1629*67e74705SXin Li SpellingBuffer.resize(Tok.getLength());
1630*67e74705SXin Li
1631*67e74705SXin Li bool StringInvalid = false;
1632*67e74705SXin Li const char *SpellingPtr = &SpellingBuffer[0];
1633*67e74705SXin Li unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1634*67e74705SXin Li &StringInvalid);
1635*67e74705SXin Li if (StringInvalid)
1636*67e74705SXin Li return 0;
1637*67e74705SXin Li
1638*67e74705SXin Li const char *SpellingStart = SpellingPtr;
1639*67e74705SXin Li const char *SpellingEnd = SpellingPtr+TokLen;
1640*67e74705SXin Li
1641*67e74705SXin Li // Handle UTF-8 strings just like narrow strings.
1642*67e74705SXin Li if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1643*67e74705SXin Li SpellingPtr += 2;
1644*67e74705SXin Li
1645*67e74705SXin Li assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1646*67e74705SXin Li SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1647*67e74705SXin Li
1648*67e74705SXin Li // For raw string literals, this is easy.
1649*67e74705SXin Li if (SpellingPtr[0] == 'R') {
1650*67e74705SXin Li assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1651*67e74705SXin Li // Skip 'R"'.
1652*67e74705SXin Li SpellingPtr += 2;
1653*67e74705SXin Li while (*SpellingPtr != '(') {
1654*67e74705SXin Li ++SpellingPtr;
1655*67e74705SXin Li assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1656*67e74705SXin Li }
1657*67e74705SXin Li // Skip '('.
1658*67e74705SXin Li ++SpellingPtr;
1659*67e74705SXin Li return SpellingPtr - SpellingStart + ByteNo;
1660*67e74705SXin Li }
1661*67e74705SXin Li
1662*67e74705SXin Li // Skip over the leading quote
1663*67e74705SXin Li assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1664*67e74705SXin Li ++SpellingPtr;
1665*67e74705SXin Li
1666*67e74705SXin Li // Skip over bytes until we find the offset we're looking for.
1667*67e74705SXin Li while (ByteNo) {
1668*67e74705SXin Li assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1669*67e74705SXin Li
1670*67e74705SXin Li // Step over non-escapes simply.
1671*67e74705SXin Li if (*SpellingPtr != '\\') {
1672*67e74705SXin Li ++SpellingPtr;
1673*67e74705SXin Li --ByteNo;
1674*67e74705SXin Li continue;
1675*67e74705SXin Li }
1676*67e74705SXin Li
1677*67e74705SXin Li // Otherwise, this is an escape character. Advance over it.
1678*67e74705SXin Li bool HadError = false;
1679*67e74705SXin Li if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1680*67e74705SXin Li const char *EscapePtr = SpellingPtr;
1681*67e74705SXin Li unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1682*67e74705SXin Li 1, Features, HadError);
1683*67e74705SXin Li if (Len > ByteNo) {
1684*67e74705SXin Li // ByteNo is somewhere within the escape sequence.
1685*67e74705SXin Li SpellingPtr = EscapePtr;
1686*67e74705SXin Li break;
1687*67e74705SXin Li }
1688*67e74705SXin Li ByteNo -= Len;
1689*67e74705SXin Li } else {
1690*67e74705SXin Li ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1691*67e74705SXin Li FullSourceLoc(Tok.getLocation(), SM),
1692*67e74705SXin Li CharByteWidth*8, Diags, Features);
1693*67e74705SXin Li --ByteNo;
1694*67e74705SXin Li }
1695*67e74705SXin Li assert(!HadError && "This method isn't valid on erroneous strings");
1696*67e74705SXin Li }
1697*67e74705SXin Li
1698*67e74705SXin Li return SpellingPtr-SpellingStart;
1699*67e74705SXin Li }
1700