xref: /aosp_15_r20/external/clang/lib/Lex/LiteralSupport.cpp (revision 67e74705e28f6214e480b399dd47ea732279e315)
1*67e74705SXin Li //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2*67e74705SXin Li //
3*67e74705SXin Li //                     The LLVM Compiler Infrastructure
4*67e74705SXin Li //
5*67e74705SXin Li // This file is distributed under the University of Illinois Open Source
6*67e74705SXin Li // License. See LICENSE.TXT for details.
7*67e74705SXin Li //
8*67e74705SXin Li //===----------------------------------------------------------------------===//
9*67e74705SXin Li //
10*67e74705SXin Li // This file implements the NumericLiteralParser, CharLiteralParser, and
11*67e74705SXin Li // StringLiteralParser interfaces.
12*67e74705SXin Li //
13*67e74705SXin Li //===----------------------------------------------------------------------===//
14*67e74705SXin Li 
15*67e74705SXin Li #include "clang/Lex/LiteralSupport.h"
16*67e74705SXin Li #include "clang/Basic/CharInfo.h"
17*67e74705SXin Li #include "clang/Basic/TargetInfo.h"
18*67e74705SXin Li #include "clang/Lex/LexDiagnostic.h"
19*67e74705SXin Li #include "clang/Lex/Preprocessor.h"
20*67e74705SXin Li #include "llvm/ADT/StringExtras.h"
21*67e74705SXin Li #include "llvm/Support/ConvertUTF.h"
22*67e74705SXin Li #include "llvm/Support/ErrorHandling.h"
23*67e74705SXin Li 
24*67e74705SXin Li using namespace clang;
25*67e74705SXin Li 
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)26*67e74705SXin Li static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
27*67e74705SXin Li   switch (kind) {
28*67e74705SXin Li   default: llvm_unreachable("Unknown token type!");
29*67e74705SXin Li   case tok::char_constant:
30*67e74705SXin Li   case tok::string_literal:
31*67e74705SXin Li   case tok::utf8_char_constant:
32*67e74705SXin Li   case tok::utf8_string_literal:
33*67e74705SXin Li     return Target.getCharWidth();
34*67e74705SXin Li   case tok::wide_char_constant:
35*67e74705SXin Li   case tok::wide_string_literal:
36*67e74705SXin Li     return Target.getWCharWidth();
37*67e74705SXin Li   case tok::utf16_char_constant:
38*67e74705SXin Li   case tok::utf16_string_literal:
39*67e74705SXin Li     return Target.getChar16Width();
40*67e74705SXin Li   case tok::utf32_char_constant:
41*67e74705SXin Li   case tok::utf32_string_literal:
42*67e74705SXin Li     return Target.getChar32Width();
43*67e74705SXin Li   }
44*67e74705SXin Li }
45*67e74705SXin Li 
MakeCharSourceRange(const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd)46*67e74705SXin Li static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
47*67e74705SXin Li                                            FullSourceLoc TokLoc,
48*67e74705SXin Li                                            const char *TokBegin,
49*67e74705SXin Li                                            const char *TokRangeBegin,
50*67e74705SXin Li                                            const char *TokRangeEnd) {
51*67e74705SXin Li   SourceLocation Begin =
52*67e74705SXin Li     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
53*67e74705SXin Li                                    TokLoc.getManager(), Features);
54*67e74705SXin Li   SourceLocation End =
55*67e74705SXin Li     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
56*67e74705SXin Li                                    TokLoc.getManager(), Features);
57*67e74705SXin Li   return CharSourceRange::getCharRange(Begin, End);
58*67e74705SXin Li }
59*67e74705SXin Li 
60*67e74705SXin Li /// \brief Produce a diagnostic highlighting some portion of a literal.
61*67e74705SXin Li ///
62*67e74705SXin Li /// Emits the diagnostic \p DiagID, highlighting the range of characters from
63*67e74705SXin Li /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
64*67e74705SXin Li /// a substring of a spelling buffer for the token beginning at \p TokBegin.
Diag(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,unsigned DiagID)65*67e74705SXin Li static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
66*67e74705SXin Li                               const LangOptions &Features, FullSourceLoc TokLoc,
67*67e74705SXin Li                               const char *TokBegin, const char *TokRangeBegin,
68*67e74705SXin Li                               const char *TokRangeEnd, unsigned DiagID) {
69*67e74705SXin Li   SourceLocation Begin =
70*67e74705SXin Li     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
71*67e74705SXin Li                                    TokLoc.getManager(), Features);
72*67e74705SXin Li   return Diags->Report(Begin, DiagID) <<
73*67e74705SXin Li     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
74*67e74705SXin Li }
75*67e74705SXin Li 
76*67e74705SXin Li /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
77*67e74705SXin Li /// either a character or a string literal.
ProcessCharEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags,const LangOptions & Features)78*67e74705SXin Li static unsigned ProcessCharEscape(const char *ThisTokBegin,
79*67e74705SXin Li                                   const char *&ThisTokBuf,
80*67e74705SXin Li                                   const char *ThisTokEnd, bool &HadError,
81*67e74705SXin Li                                   FullSourceLoc Loc, unsigned CharWidth,
82*67e74705SXin Li                                   DiagnosticsEngine *Diags,
83*67e74705SXin Li                                   const LangOptions &Features) {
84*67e74705SXin Li   const char *EscapeBegin = ThisTokBuf;
85*67e74705SXin Li 
86*67e74705SXin Li   // Skip the '\' char.
87*67e74705SXin Li   ++ThisTokBuf;
88*67e74705SXin Li 
89*67e74705SXin Li   // We know that this character can't be off the end of the buffer, because
90*67e74705SXin Li   // that would have been \", which would not have been the end of string.
91*67e74705SXin Li   unsigned ResultChar = *ThisTokBuf++;
92*67e74705SXin Li   switch (ResultChar) {
93*67e74705SXin Li   // These map to themselves.
94*67e74705SXin Li   case '\\': case '\'': case '"': case '?': break;
95*67e74705SXin Li 
96*67e74705SXin Li     // These have fixed mappings.
97*67e74705SXin Li   case 'a':
98*67e74705SXin Li     // TODO: K&R: the meaning of '\\a' is different in traditional C
99*67e74705SXin Li     ResultChar = 7;
100*67e74705SXin Li     break;
101*67e74705SXin Li   case 'b':
102*67e74705SXin Li     ResultChar = 8;
103*67e74705SXin Li     break;
104*67e74705SXin Li   case 'e':
105*67e74705SXin Li     if (Diags)
106*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
107*67e74705SXin Li            diag::ext_nonstandard_escape) << "e";
108*67e74705SXin Li     ResultChar = 27;
109*67e74705SXin Li     break;
110*67e74705SXin Li   case 'E':
111*67e74705SXin Li     if (Diags)
112*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
113*67e74705SXin Li            diag::ext_nonstandard_escape) << "E";
114*67e74705SXin Li     ResultChar = 27;
115*67e74705SXin Li     break;
116*67e74705SXin Li   case 'f':
117*67e74705SXin Li     ResultChar = 12;
118*67e74705SXin Li     break;
119*67e74705SXin Li   case 'n':
120*67e74705SXin Li     ResultChar = 10;
121*67e74705SXin Li     break;
122*67e74705SXin Li   case 'r':
123*67e74705SXin Li     ResultChar = 13;
124*67e74705SXin Li     break;
125*67e74705SXin Li   case 't':
126*67e74705SXin Li     ResultChar = 9;
127*67e74705SXin Li     break;
128*67e74705SXin Li   case 'v':
129*67e74705SXin Li     ResultChar = 11;
130*67e74705SXin Li     break;
131*67e74705SXin Li   case 'x': { // Hex escape.
132*67e74705SXin Li     ResultChar = 0;
133*67e74705SXin Li     if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
134*67e74705SXin Li       if (Diags)
135*67e74705SXin Li         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
136*67e74705SXin Li              diag::err_hex_escape_no_digits) << "x";
137*67e74705SXin Li       HadError = 1;
138*67e74705SXin Li       break;
139*67e74705SXin Li     }
140*67e74705SXin Li 
141*67e74705SXin Li     // Hex escapes are a maximal series of hex digits.
142*67e74705SXin Li     bool Overflow = false;
143*67e74705SXin Li     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
144*67e74705SXin Li       int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
145*67e74705SXin Li       if (CharVal == -1) break;
146*67e74705SXin Li       // About to shift out a digit?
147*67e74705SXin Li       if (ResultChar & 0xF0000000)
148*67e74705SXin Li         Overflow = true;
149*67e74705SXin Li       ResultChar <<= 4;
150*67e74705SXin Li       ResultChar |= CharVal;
151*67e74705SXin Li     }
152*67e74705SXin Li 
153*67e74705SXin Li     // See if any bits will be truncated when evaluated as a character.
154*67e74705SXin Li     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
155*67e74705SXin Li       Overflow = true;
156*67e74705SXin Li       ResultChar &= ~0U >> (32-CharWidth);
157*67e74705SXin Li     }
158*67e74705SXin Li 
159*67e74705SXin Li     // Check for overflow.
160*67e74705SXin Li     if (Overflow && Diags)   // Too many digits to fit in
161*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
162*67e74705SXin Li            diag::err_escape_too_large) << 0;
163*67e74705SXin Li     break;
164*67e74705SXin Li   }
165*67e74705SXin Li   case '0': case '1': case '2': case '3':
166*67e74705SXin Li   case '4': case '5': case '6': case '7': {
167*67e74705SXin Li     // Octal escapes.
168*67e74705SXin Li     --ThisTokBuf;
169*67e74705SXin Li     ResultChar = 0;
170*67e74705SXin Li 
171*67e74705SXin Li     // Octal escapes are a series of octal digits with maximum length 3.
172*67e74705SXin Li     // "\0123" is a two digit sequence equal to "\012" "3".
173*67e74705SXin Li     unsigned NumDigits = 0;
174*67e74705SXin Li     do {
175*67e74705SXin Li       ResultChar <<= 3;
176*67e74705SXin Li       ResultChar |= *ThisTokBuf++ - '0';
177*67e74705SXin Li       ++NumDigits;
178*67e74705SXin Li     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
179*67e74705SXin Li              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
180*67e74705SXin Li 
181*67e74705SXin Li     // Check for overflow.  Reject '\777', but not L'\777'.
182*67e74705SXin Li     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
183*67e74705SXin Li       if (Diags)
184*67e74705SXin Li         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
185*67e74705SXin Li              diag::err_escape_too_large) << 1;
186*67e74705SXin Li       ResultChar &= ~0U >> (32-CharWidth);
187*67e74705SXin Li     }
188*67e74705SXin Li     break;
189*67e74705SXin Li   }
190*67e74705SXin Li 
191*67e74705SXin Li     // Otherwise, these are not valid escapes.
192*67e74705SXin Li   case '(': case '{': case '[': case '%':
193*67e74705SXin Li     // GCC accepts these as extensions.  We warn about them as such though.
194*67e74705SXin Li     if (Diags)
195*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
196*67e74705SXin Li            diag::ext_nonstandard_escape)
197*67e74705SXin Li         << std::string(1, ResultChar);
198*67e74705SXin Li     break;
199*67e74705SXin Li   default:
200*67e74705SXin Li     if (!Diags)
201*67e74705SXin Li       break;
202*67e74705SXin Li 
203*67e74705SXin Li     if (isPrintable(ResultChar))
204*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
205*67e74705SXin Li            diag::ext_unknown_escape)
206*67e74705SXin Li         << std::string(1, ResultChar);
207*67e74705SXin Li     else
208*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
209*67e74705SXin Li            diag::ext_unknown_escape)
210*67e74705SXin Li         << "x" + llvm::utohexstr(ResultChar);
211*67e74705SXin Li     break;
212*67e74705SXin Li   }
213*67e74705SXin Li 
214*67e74705SXin Li   return ResultChar;
215*67e74705SXin Li }
216*67e74705SXin Li 
appendCodePoint(unsigned Codepoint,llvm::SmallVectorImpl<char> & Str)217*67e74705SXin Li static void appendCodePoint(unsigned Codepoint,
218*67e74705SXin Li                             llvm::SmallVectorImpl<char> &Str) {
219*67e74705SXin Li   char ResultBuf[4];
220*67e74705SXin Li   char *ResultPtr = ResultBuf;
221*67e74705SXin Li   bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
222*67e74705SXin Li   (void)Res;
223*67e74705SXin Li   assert(Res && "Unexpected conversion failure");
224*67e74705SXin Li   Str.append(ResultBuf, ResultPtr);
225*67e74705SXin Li }
226*67e74705SXin Li 
expandUCNs(SmallVectorImpl<char> & Buf,StringRef Input)227*67e74705SXin Li void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
228*67e74705SXin Li   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
229*67e74705SXin Li     if (*I != '\\') {
230*67e74705SXin Li       Buf.push_back(*I);
231*67e74705SXin Li       continue;
232*67e74705SXin Li     }
233*67e74705SXin Li 
234*67e74705SXin Li     ++I;
235*67e74705SXin Li     assert(*I == 'u' || *I == 'U');
236*67e74705SXin Li 
237*67e74705SXin Li     unsigned NumHexDigits;
238*67e74705SXin Li     if (*I == 'u')
239*67e74705SXin Li       NumHexDigits = 4;
240*67e74705SXin Li     else
241*67e74705SXin Li       NumHexDigits = 8;
242*67e74705SXin Li 
243*67e74705SXin Li     assert(I + NumHexDigits <= E);
244*67e74705SXin Li 
245*67e74705SXin Li     uint32_t CodePoint = 0;
246*67e74705SXin Li     for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
247*67e74705SXin Li       unsigned Value = llvm::hexDigitValue(*I);
248*67e74705SXin Li       assert(Value != -1U);
249*67e74705SXin Li 
250*67e74705SXin Li       CodePoint <<= 4;
251*67e74705SXin Li       CodePoint += Value;
252*67e74705SXin Li     }
253*67e74705SXin Li 
254*67e74705SXin Li     appendCodePoint(CodePoint, Buf);
255*67e74705SXin Li     --I;
256*67e74705SXin Li   }
257*67e74705SXin Li }
258*67e74705SXin Li 
259*67e74705SXin Li /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
260*67e74705SXin Li /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)261*67e74705SXin Li static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
262*67e74705SXin Li                              const char *ThisTokEnd,
263*67e74705SXin Li                              uint32_t &UcnVal, unsigned short &UcnLen,
264*67e74705SXin Li                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
265*67e74705SXin Li                              const LangOptions &Features,
266*67e74705SXin Li                              bool in_char_string_literal = false) {
267*67e74705SXin Li   const char *UcnBegin = ThisTokBuf;
268*67e74705SXin Li 
269*67e74705SXin Li   // Skip the '\u' char's.
270*67e74705SXin Li   ThisTokBuf += 2;
271*67e74705SXin Li 
272*67e74705SXin Li   if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
273*67e74705SXin Li     if (Diags)
274*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
275*67e74705SXin Li            diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
276*67e74705SXin Li     return false;
277*67e74705SXin Li   }
278*67e74705SXin Li   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
279*67e74705SXin Li   unsigned short UcnLenSave = UcnLen;
280*67e74705SXin Li   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
281*67e74705SXin Li     int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
282*67e74705SXin Li     if (CharVal == -1) break;
283*67e74705SXin Li     UcnVal <<= 4;
284*67e74705SXin Li     UcnVal |= CharVal;
285*67e74705SXin Li   }
286*67e74705SXin Li   // If we didn't consume the proper number of digits, there is a problem.
287*67e74705SXin Li   if (UcnLenSave) {
288*67e74705SXin Li     if (Diags)
289*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
290*67e74705SXin Li            diag::err_ucn_escape_incomplete);
291*67e74705SXin Li     return false;
292*67e74705SXin Li   }
293*67e74705SXin Li 
294*67e74705SXin Li   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
295*67e74705SXin Li   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
296*67e74705SXin Li       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
297*67e74705SXin Li     if (Diags)
298*67e74705SXin Li       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
299*67e74705SXin Li            diag::err_ucn_escape_invalid);
300*67e74705SXin Li     return false;
301*67e74705SXin Li   }
302*67e74705SXin Li 
303*67e74705SXin Li   // C++11 allows UCNs that refer to control characters and basic source
304*67e74705SXin Li   // characters inside character and string literals
305*67e74705SXin Li   if (UcnVal < 0xa0 &&
306*67e74705SXin Li       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
307*67e74705SXin Li     bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
308*67e74705SXin Li     if (Diags) {
309*67e74705SXin Li       char BasicSCSChar = UcnVal;
310*67e74705SXin Li       if (UcnVal >= 0x20 && UcnVal < 0x7f)
311*67e74705SXin Li         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
312*67e74705SXin Li              IsError ? diag::err_ucn_escape_basic_scs :
313*67e74705SXin Li                        diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
314*67e74705SXin Li             << StringRef(&BasicSCSChar, 1);
315*67e74705SXin Li       else
316*67e74705SXin Li         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
317*67e74705SXin Li              IsError ? diag::err_ucn_control_character :
318*67e74705SXin Li                        diag::warn_cxx98_compat_literal_ucn_control_character);
319*67e74705SXin Li     }
320*67e74705SXin Li     if (IsError)
321*67e74705SXin Li       return false;
322*67e74705SXin Li   }
323*67e74705SXin Li 
324*67e74705SXin Li   if (!Features.CPlusPlus && !Features.C99 && Diags)
325*67e74705SXin Li     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
326*67e74705SXin Li          diag::warn_ucn_not_valid_in_c89_literal);
327*67e74705SXin Li 
328*67e74705SXin Li   return true;
329*67e74705SXin Li }
330*67e74705SXin Li 
331*67e74705SXin Li /// MeasureUCNEscape - Determine the number of bytes within the resulting string
332*67e74705SXin Li /// which this UCN will occupy.
MeasureUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,unsigned CharByteWidth,const LangOptions & Features,bool & HadError)333*67e74705SXin Li static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
334*67e74705SXin Li                             const char *ThisTokEnd, unsigned CharByteWidth,
335*67e74705SXin Li                             const LangOptions &Features, bool &HadError) {
336*67e74705SXin Li   // UTF-32: 4 bytes per escape.
337*67e74705SXin Li   if (CharByteWidth == 4)
338*67e74705SXin Li     return 4;
339*67e74705SXin Li 
340*67e74705SXin Li   uint32_t UcnVal = 0;
341*67e74705SXin Li   unsigned short UcnLen = 0;
342*67e74705SXin Li   FullSourceLoc Loc;
343*67e74705SXin Li 
344*67e74705SXin Li   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
345*67e74705SXin Li                         UcnLen, Loc, nullptr, Features, true)) {
346*67e74705SXin Li     HadError = true;
347*67e74705SXin Li     return 0;
348*67e74705SXin Li   }
349*67e74705SXin Li 
350*67e74705SXin Li   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
351*67e74705SXin Li   if (CharByteWidth == 2)
352*67e74705SXin Li     return UcnVal <= 0xFFFF ? 2 : 4;
353*67e74705SXin Li 
354*67e74705SXin Li   // UTF-8.
355*67e74705SXin Li   if (UcnVal < 0x80)
356*67e74705SXin Li     return 1;
357*67e74705SXin Li   if (UcnVal < 0x800)
358*67e74705SXin Li     return 2;
359*67e74705SXin Li   if (UcnVal < 0x10000)
360*67e74705SXin Li     return 3;
361*67e74705SXin Li   return 4;
362*67e74705SXin Li }
363*67e74705SXin Li 
364*67e74705SXin Li /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
365*67e74705SXin Li /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
366*67e74705SXin Li /// StringLiteralParser. When we decide to implement UCN's for identifiers,
367*67e74705SXin Li /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)368*67e74705SXin Li static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
369*67e74705SXin Li                             const char *ThisTokEnd,
370*67e74705SXin Li                             char *&ResultBuf, bool &HadError,
371*67e74705SXin Li                             FullSourceLoc Loc, unsigned CharByteWidth,
372*67e74705SXin Li                             DiagnosticsEngine *Diags,
373*67e74705SXin Li                             const LangOptions &Features) {
374*67e74705SXin Li   typedef uint32_t UTF32;
375*67e74705SXin Li   UTF32 UcnVal = 0;
376*67e74705SXin Li   unsigned short UcnLen = 0;
377*67e74705SXin Li   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
378*67e74705SXin Li                         Loc, Diags, Features, true)) {
379*67e74705SXin Li     HadError = true;
380*67e74705SXin Li     return;
381*67e74705SXin Li   }
382*67e74705SXin Li 
383*67e74705SXin Li   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
384*67e74705SXin Li          "only character widths of 1, 2, or 4 bytes supported");
385*67e74705SXin Li 
386*67e74705SXin Li   (void)UcnLen;
387*67e74705SXin Li   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
388*67e74705SXin Li 
389*67e74705SXin Li   if (CharByteWidth == 4) {
390*67e74705SXin Li     // FIXME: Make the type of the result buffer correct instead of
391*67e74705SXin Li     // using reinterpret_cast.
392*67e74705SXin Li     UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
393*67e74705SXin Li     *ResultPtr = UcnVal;
394*67e74705SXin Li     ResultBuf += 4;
395*67e74705SXin Li     return;
396*67e74705SXin Li   }
397*67e74705SXin Li 
398*67e74705SXin Li   if (CharByteWidth == 2) {
399*67e74705SXin Li     // FIXME: Make the type of the result buffer correct instead of
400*67e74705SXin Li     // using reinterpret_cast.
401*67e74705SXin Li     UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
402*67e74705SXin Li 
403*67e74705SXin Li     if (UcnVal <= (UTF32)0xFFFF) {
404*67e74705SXin Li       *ResultPtr = UcnVal;
405*67e74705SXin Li       ResultBuf += 2;
406*67e74705SXin Li       return;
407*67e74705SXin Li     }
408*67e74705SXin Li 
409*67e74705SXin Li     // Convert to UTF16.
410*67e74705SXin Li     UcnVal -= 0x10000;
411*67e74705SXin Li     *ResultPtr     = 0xD800 + (UcnVal >> 10);
412*67e74705SXin Li     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
413*67e74705SXin Li     ResultBuf += 4;
414*67e74705SXin Li     return;
415*67e74705SXin Li   }
416*67e74705SXin Li 
417*67e74705SXin Li   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
418*67e74705SXin Li 
419*67e74705SXin Li   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
420*67e74705SXin Li   // The conversion below was inspired by:
421*67e74705SXin Li   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
422*67e74705SXin Li   // First, we determine how many bytes the result will require.
423*67e74705SXin Li   typedef uint8_t UTF8;
424*67e74705SXin Li 
425*67e74705SXin Li   unsigned short bytesToWrite = 0;
426*67e74705SXin Li   if (UcnVal < (UTF32)0x80)
427*67e74705SXin Li     bytesToWrite = 1;
428*67e74705SXin Li   else if (UcnVal < (UTF32)0x800)
429*67e74705SXin Li     bytesToWrite = 2;
430*67e74705SXin Li   else if (UcnVal < (UTF32)0x10000)
431*67e74705SXin Li     bytesToWrite = 3;
432*67e74705SXin Li   else
433*67e74705SXin Li     bytesToWrite = 4;
434*67e74705SXin Li 
435*67e74705SXin Li   const unsigned byteMask = 0xBF;
436*67e74705SXin Li   const unsigned byteMark = 0x80;
437*67e74705SXin Li 
438*67e74705SXin Li   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
439*67e74705SXin Li   // into the first byte, depending on how many bytes follow.
440*67e74705SXin Li   static const UTF8 firstByteMark[5] = {
441*67e74705SXin Li     0x00, 0x00, 0xC0, 0xE0, 0xF0
442*67e74705SXin Li   };
443*67e74705SXin Li   // Finally, we write the bytes into ResultBuf.
444*67e74705SXin Li   ResultBuf += bytesToWrite;
445*67e74705SXin Li   switch (bytesToWrite) { // note: everything falls through.
446*67e74705SXin Li   case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
447*67e74705SXin Li   case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
448*67e74705SXin Li   case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
449*67e74705SXin Li   case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
450*67e74705SXin Li   }
451*67e74705SXin Li   // Update the buffer.
452*67e74705SXin Li   ResultBuf += bytesToWrite;
453*67e74705SXin Li }
454*67e74705SXin Li 
455*67e74705SXin Li 
456*67e74705SXin Li ///       integer-constant: [C99 6.4.4.1]
457*67e74705SXin Li ///         decimal-constant integer-suffix
458*67e74705SXin Li ///         octal-constant integer-suffix
459*67e74705SXin Li ///         hexadecimal-constant integer-suffix
460*67e74705SXin Li ///         binary-literal integer-suffix [GNU, C++1y]
461*67e74705SXin Li ///       user-defined-integer-literal: [C++11 lex.ext]
462*67e74705SXin Li ///         decimal-literal ud-suffix
463*67e74705SXin Li ///         octal-literal ud-suffix
464*67e74705SXin Li ///         hexadecimal-literal ud-suffix
465*67e74705SXin Li ///         binary-literal ud-suffix [GNU, C++1y]
466*67e74705SXin Li ///       decimal-constant:
467*67e74705SXin Li ///         nonzero-digit
468*67e74705SXin Li ///         decimal-constant digit
469*67e74705SXin Li ///       octal-constant:
470*67e74705SXin Li ///         0
471*67e74705SXin Li ///         octal-constant octal-digit
472*67e74705SXin Li ///       hexadecimal-constant:
473*67e74705SXin Li ///         hexadecimal-prefix hexadecimal-digit
474*67e74705SXin Li ///         hexadecimal-constant hexadecimal-digit
475*67e74705SXin Li ///       hexadecimal-prefix: one of
476*67e74705SXin Li ///         0x 0X
477*67e74705SXin Li ///       binary-literal:
478*67e74705SXin Li ///         0b binary-digit
479*67e74705SXin Li ///         0B binary-digit
480*67e74705SXin Li ///         binary-literal binary-digit
481*67e74705SXin Li ///       integer-suffix:
482*67e74705SXin Li ///         unsigned-suffix [long-suffix]
483*67e74705SXin Li ///         unsigned-suffix [long-long-suffix]
484*67e74705SXin Li ///         long-suffix [unsigned-suffix]
485*67e74705SXin Li ///         long-long-suffix [unsigned-suffix]
486*67e74705SXin Li ///       nonzero-digit:
487*67e74705SXin Li ///         1 2 3 4 5 6 7 8 9
488*67e74705SXin Li ///       octal-digit:
489*67e74705SXin Li ///         0 1 2 3 4 5 6 7
490*67e74705SXin Li ///       hexadecimal-digit:
491*67e74705SXin Li ///         0 1 2 3 4 5 6 7 8 9
492*67e74705SXin Li ///         a b c d e f
493*67e74705SXin Li ///         A B C D E F
494*67e74705SXin Li ///       binary-digit:
495*67e74705SXin Li ///         0
496*67e74705SXin Li ///         1
497*67e74705SXin Li ///       unsigned-suffix: one of
498*67e74705SXin Li ///         u U
499*67e74705SXin Li ///       long-suffix: one of
500*67e74705SXin Li ///         l L
501*67e74705SXin Li ///       long-long-suffix: one of
502*67e74705SXin Li ///         ll LL
503*67e74705SXin Li ///
504*67e74705SXin Li ///       floating-constant: [C99 6.4.4.2]
505*67e74705SXin Li ///         TODO: add rules...
506*67e74705SXin Li ///
NumericLiteralParser(StringRef TokSpelling,SourceLocation TokLoc,Preprocessor & PP)507*67e74705SXin Li NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
508*67e74705SXin Li                                            SourceLocation TokLoc,
509*67e74705SXin Li                                            Preprocessor &PP)
510*67e74705SXin Li   : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
511*67e74705SXin Li 
512*67e74705SXin Li   // This routine assumes that the range begin/end matches the regex for integer
513*67e74705SXin Li   // and FP constants (specifically, the 'pp-number' regex), and assumes that
514*67e74705SXin Li   // the byte at "*end" is both valid and not part of the regex.  Because of
515*67e74705SXin Li   // this, it doesn't have to check for 'overscan' in various places.
516*67e74705SXin Li   assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");
517*67e74705SXin Li 
518*67e74705SXin Li   s = DigitsBegin = ThisTokBegin;
519*67e74705SXin Li   saw_exponent = false;
520*67e74705SXin Li   saw_period = false;
521*67e74705SXin Li   saw_ud_suffix = false;
522*67e74705SXin Li   isLong = false;
523*67e74705SXin Li   isUnsigned = false;
524*67e74705SXin Li   isLongLong = false;
525*67e74705SXin Li   isHalf = false;
526*67e74705SXin Li   isFloat = false;
527*67e74705SXin Li   isImaginary = false;
528*67e74705SXin Li   isFloat128 = false;
529*67e74705SXin Li   MicrosoftInteger = 0;
530*67e74705SXin Li   hadError = false;
531*67e74705SXin Li 
532*67e74705SXin Li   if (*s == '0') { // parse radix
533*67e74705SXin Li     ParseNumberStartingWithZero(TokLoc);
534*67e74705SXin Li     if (hadError)
535*67e74705SXin Li       return;
536*67e74705SXin Li   } else { // the first digit is non-zero
537*67e74705SXin Li     radix = 10;
538*67e74705SXin Li     s = SkipDigits(s);
539*67e74705SXin Li     if (s == ThisTokEnd) {
540*67e74705SXin Li       // Done.
541*67e74705SXin Li     } else {
542*67e74705SXin Li       ParseDecimalOrOctalCommon(TokLoc);
543*67e74705SXin Li       if (hadError)
544*67e74705SXin Li         return;
545*67e74705SXin Li     }
546*67e74705SXin Li   }
547*67e74705SXin Li 
548*67e74705SXin Li   SuffixBegin = s;
549*67e74705SXin Li   checkSeparator(TokLoc, s, CSK_AfterDigits);
550*67e74705SXin Li 
551*67e74705SXin Li   // Parse the suffix.  At this point we can classify whether we have an FP or
552*67e74705SXin Li   // integer constant.
553*67e74705SXin Li   bool isFPConstant = isFloatingLiteral();
554*67e74705SXin Li   const char *ImaginarySuffixLoc = nullptr;
555*67e74705SXin Li 
556*67e74705SXin Li   // Loop over all of the characters of the suffix.  If we see something bad,
557*67e74705SXin Li   // we break out of the loop.
558*67e74705SXin Li   for (; s != ThisTokEnd; ++s) {
559*67e74705SXin Li     switch (*s) {
560*67e74705SXin Li     case 'h':      // FP Suffix for "half".
561*67e74705SXin Li     case 'H':
562*67e74705SXin Li       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
563*67e74705SXin Li       if (!PP.getLangOpts().Half) break;
564*67e74705SXin Li       if (!isFPConstant) break;  // Error for integer constant.
565*67e74705SXin Li       if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid.
566*67e74705SXin Li       isHalf = true;
567*67e74705SXin Li       continue;  // Success.
568*67e74705SXin Li     case 'f':      // FP Suffix for "float"
569*67e74705SXin Li     case 'F':
570*67e74705SXin Li       if (!isFPConstant) break;  // Error for integer constant.
571*67e74705SXin Li       if (isHalf || isFloat || isLong || isFloat128)
572*67e74705SXin Li         break; // HF, FF, LF, QF invalid.
573*67e74705SXin Li       isFloat = true;
574*67e74705SXin Li       continue;  // Success.
575*67e74705SXin Li     case 'q':    // FP Suffix for "__float128"
576*67e74705SXin Li     case 'Q':
577*67e74705SXin Li       if (!isFPConstant) break;  // Error for integer constant.
578*67e74705SXin Li       if (isHalf || isFloat || isLong || isFloat128)
579*67e74705SXin Li         break; // HQ, FQ, LQ, QQ invalid.
580*67e74705SXin Li       isFloat128 = true;
581*67e74705SXin Li       continue;  // Success.
582*67e74705SXin Li     case 'u':
583*67e74705SXin Li     case 'U':
584*67e74705SXin Li       if (isFPConstant) break;  // Error for floating constant.
585*67e74705SXin Li       if (isUnsigned) break;    // Cannot be repeated.
586*67e74705SXin Li       isUnsigned = true;
587*67e74705SXin Li       continue;  // Success.
588*67e74705SXin Li     case 'l':
589*67e74705SXin Li     case 'L':
590*67e74705SXin Li       if (isLong || isLongLong) break;  // Cannot be repeated.
591*67e74705SXin Li       if (isHalf || isFloat || isFloat128) break;     // LH, LF, LQ invalid.
592*67e74705SXin Li 
593*67e74705SXin Li       // Check for long long.  The L's need to be adjacent and the same case.
594*67e74705SXin Li       if (s[1] == s[0]) {
595*67e74705SXin Li         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
596*67e74705SXin Li         if (isFPConstant) break;        // long long invalid for floats.
597*67e74705SXin Li         isLongLong = true;
598*67e74705SXin Li         ++s;  // Eat both of them.
599*67e74705SXin Li       } else {
600*67e74705SXin Li         isLong = true;
601*67e74705SXin Li       }
602*67e74705SXin Li       continue;  // Success.
603*67e74705SXin Li     case 'i':
604*67e74705SXin Li     case 'I':
605*67e74705SXin Li       if (PP.getLangOpts().MicrosoftExt) {
606*67e74705SXin Li         if (isLong || isLongLong || MicrosoftInteger)
607*67e74705SXin Li           break;
608*67e74705SXin Li 
609*67e74705SXin Li         if (!isFPConstant) {
610*67e74705SXin Li           // Allow i8, i16, i32, and i64.
611*67e74705SXin Li           switch (s[1]) {
612*67e74705SXin Li           case '8':
613*67e74705SXin Li             s += 2; // i8 suffix
614*67e74705SXin Li             MicrosoftInteger = 8;
615*67e74705SXin Li             break;
616*67e74705SXin Li           case '1':
617*67e74705SXin Li             if (s[2] == '6') {
618*67e74705SXin Li               s += 3; // i16 suffix
619*67e74705SXin Li               MicrosoftInteger = 16;
620*67e74705SXin Li             }
621*67e74705SXin Li             break;
622*67e74705SXin Li           case '3':
623*67e74705SXin Li             if (s[2] == '2') {
624*67e74705SXin Li               s += 3; // i32 suffix
625*67e74705SXin Li               MicrosoftInteger = 32;
626*67e74705SXin Li             }
627*67e74705SXin Li             break;
628*67e74705SXin Li           case '6':
629*67e74705SXin Li             if (s[2] == '4') {
630*67e74705SXin Li               s += 3; // i64 suffix
631*67e74705SXin Li               MicrosoftInteger = 64;
632*67e74705SXin Li             }
633*67e74705SXin Li             break;
634*67e74705SXin Li           default:
635*67e74705SXin Li             break;
636*67e74705SXin Li           }
637*67e74705SXin Li         }
638*67e74705SXin Li         if (MicrosoftInteger) {
639*67e74705SXin Li           assert(s <= ThisTokEnd && "didn't maximally munch?");
640*67e74705SXin Li           break;
641*67e74705SXin Li         }
642*67e74705SXin Li       }
643*67e74705SXin Li       // "i", "if", and "il" are user-defined suffixes in C++1y.
644*67e74705SXin Li       if (*s == 'i' && PP.getLangOpts().CPlusPlus14)
645*67e74705SXin Li         break;
646*67e74705SXin Li       // fall through.
647*67e74705SXin Li     case 'j':
648*67e74705SXin Li     case 'J':
649*67e74705SXin Li       if (isImaginary) break;   // Cannot be repeated.
650*67e74705SXin Li       isImaginary = true;
651*67e74705SXin Li       ImaginarySuffixLoc = s;
652*67e74705SXin Li       continue;  // Success.
653*67e74705SXin Li     }
654*67e74705SXin Li     // If we reached here, there was an error or a ud-suffix.
655*67e74705SXin Li     break;
656*67e74705SXin Li   }
657*67e74705SXin Li 
658*67e74705SXin Li   if (s != ThisTokEnd) {
659*67e74705SXin Li     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
660*67e74705SXin Li     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
661*67e74705SXin Li     if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
662*67e74705SXin Li       // Any suffix pieces we might have parsed are actually part of the
663*67e74705SXin Li       // ud-suffix.
664*67e74705SXin Li       isLong = false;
665*67e74705SXin Li       isUnsigned = false;
666*67e74705SXin Li       isLongLong = false;
667*67e74705SXin Li       isFloat = false;
668*67e74705SXin Li       isHalf = false;
669*67e74705SXin Li       isImaginary = false;
670*67e74705SXin Li       MicrosoftInteger = 0;
671*67e74705SXin Li 
672*67e74705SXin Li       saw_ud_suffix = true;
673*67e74705SXin Li       return;
674*67e74705SXin Li     }
675*67e74705SXin Li 
676*67e74705SXin Li     // Report an error if there are any.
677*67e74705SXin Li     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
678*67e74705SXin Li             diag::err_invalid_suffix_constant)
679*67e74705SXin Li       << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin) << isFPConstant;
680*67e74705SXin Li     hadError = true;
681*67e74705SXin Li     return;
682*67e74705SXin Li   }
683*67e74705SXin Li 
684*67e74705SXin Li   if (isImaginary) {
685*67e74705SXin Li     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc,
686*67e74705SXin Li                                        ImaginarySuffixLoc - ThisTokBegin),
687*67e74705SXin Li             diag::ext_imaginary_constant);
688*67e74705SXin Li   }
689*67e74705SXin Li }
690*67e74705SXin Li 
691*67e74705SXin Li /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
692*67e74705SXin Li /// numbers. It issues an error for illegal digits, and handles floating point
693*67e74705SXin Li /// parsing. If it detects a floating point number, the radix is set to 10.
ParseDecimalOrOctalCommon(SourceLocation TokLoc)694*67e74705SXin Li void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
695*67e74705SXin Li   assert((radix == 8 || radix == 10) && "Unexpected radix");
696*67e74705SXin Li 
697*67e74705SXin Li   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
698*67e74705SXin Li   // the code is using an incorrect base.
699*67e74705SXin Li   if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
700*67e74705SXin Li     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
701*67e74705SXin Li             diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
702*67e74705SXin Li     hadError = true;
703*67e74705SXin Li     return;
704*67e74705SXin Li   }
705*67e74705SXin Li 
706*67e74705SXin Li   if (*s == '.') {
707*67e74705SXin Li     checkSeparator(TokLoc, s, CSK_AfterDigits);
708*67e74705SXin Li     s++;
709*67e74705SXin Li     radix = 10;
710*67e74705SXin Li     saw_period = true;
711*67e74705SXin Li     checkSeparator(TokLoc, s, CSK_BeforeDigits);
712*67e74705SXin Li     s = SkipDigits(s); // Skip suffix.
713*67e74705SXin Li   }
714*67e74705SXin Li   if (*s == 'e' || *s == 'E') { // exponent
715*67e74705SXin Li     checkSeparator(TokLoc, s, CSK_AfterDigits);
716*67e74705SXin Li     const char *Exponent = s;
717*67e74705SXin Li     s++;
718*67e74705SXin Li     radix = 10;
719*67e74705SXin Li     saw_exponent = true;
720*67e74705SXin Li     if (*s == '+' || *s == '-')  s++; // sign
721*67e74705SXin Li     const char *first_non_digit = SkipDigits(s);
722*67e74705SXin Li     if (containsDigits(s, first_non_digit)) {
723*67e74705SXin Li       checkSeparator(TokLoc, s, CSK_BeforeDigits);
724*67e74705SXin Li       s = first_non_digit;
725*67e74705SXin Li     } else {
726*67e74705SXin Li       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
727*67e74705SXin Li               diag::err_exponent_has_no_digits);
728*67e74705SXin Li       hadError = true;
729*67e74705SXin Li       return;
730*67e74705SXin Li     }
731*67e74705SXin Li   }
732*67e74705SXin Li }
733*67e74705SXin Li 
734*67e74705SXin Li /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
735*67e74705SXin Li /// suffixes as ud-suffixes, because the diagnostic experience is better if we
736*67e74705SXin Li /// treat it as an invalid suffix.
isValidUDSuffix(const LangOptions & LangOpts,StringRef Suffix)737*67e74705SXin Li bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
738*67e74705SXin Li                                            StringRef Suffix) {
739*67e74705SXin Li   if (!LangOpts.CPlusPlus11 || Suffix.empty())
740*67e74705SXin Li     return false;
741*67e74705SXin Li 
742*67e74705SXin Li   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
743*67e74705SXin Li   if (Suffix[0] == '_')
744*67e74705SXin Li     return true;
745*67e74705SXin Li 
746*67e74705SXin Li   // In C++11, there are no library suffixes.
747*67e74705SXin Li   if (!LangOpts.CPlusPlus14)
748*67e74705SXin Li     return false;
749*67e74705SXin Li 
750*67e74705SXin Li   // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
751*67e74705SXin Li   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
752*67e74705SXin Li   return llvm::StringSwitch<bool>(Suffix)
753*67e74705SXin Li       .Cases("h", "min", "s", true)
754*67e74705SXin Li       .Cases("ms", "us", "ns", true)
755*67e74705SXin Li       .Cases("il", "i", "if", true)
756*67e74705SXin Li       .Default(false);
757*67e74705SXin Li }
758*67e74705SXin Li 
checkSeparator(SourceLocation TokLoc,const char * Pos,CheckSeparatorKind IsAfterDigits)759*67e74705SXin Li void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
760*67e74705SXin Li                                           const char *Pos,
761*67e74705SXin Li                                           CheckSeparatorKind IsAfterDigits) {
762*67e74705SXin Li   if (IsAfterDigits == CSK_AfterDigits) {
763*67e74705SXin Li     if (Pos == ThisTokBegin)
764*67e74705SXin Li       return;
765*67e74705SXin Li     --Pos;
766*67e74705SXin Li   } else if (Pos == ThisTokEnd)
767*67e74705SXin Li     return;
768*67e74705SXin Li 
769*67e74705SXin Li   if (isDigitSeparator(*Pos))
770*67e74705SXin Li     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
771*67e74705SXin Li             diag::err_digit_separator_not_between_digits)
772*67e74705SXin Li       << IsAfterDigits;
773*67e74705SXin Li }
774*67e74705SXin Li 
775*67e74705SXin Li /// ParseNumberStartingWithZero - This method is called when the first character
776*67e74705SXin Li /// of the number is found to be a zero.  This means it is either an octal
777*67e74705SXin Li /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
778*67e74705SXin Li /// a floating point number (01239.123e4).  Eat the prefix, determining the
779*67e74705SXin Li /// radix etc.
ParseNumberStartingWithZero(SourceLocation TokLoc)780*67e74705SXin Li void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
781*67e74705SXin Li   assert(s[0] == '0' && "Invalid method call");
782*67e74705SXin Li   s++;
783*67e74705SXin Li 
784*67e74705SXin Li   int c1 = s[0];
785*67e74705SXin Li 
786*67e74705SXin Li   // Handle a hex number like 0x1234.
787*67e74705SXin Li   if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
788*67e74705SXin Li     s++;
789*67e74705SXin Li     assert(s < ThisTokEnd && "didn't maximally munch?");
790*67e74705SXin Li     radix = 16;
791*67e74705SXin Li     DigitsBegin = s;
792*67e74705SXin Li     s = SkipHexDigits(s);
793*67e74705SXin Li     bool HasSignificandDigits = containsDigits(DigitsBegin, s);
794*67e74705SXin Li     if (s == ThisTokEnd) {
795*67e74705SXin Li       // Done.
796*67e74705SXin Li     } else if (*s == '.') {
797*67e74705SXin Li       s++;
798*67e74705SXin Li       saw_period = true;
799*67e74705SXin Li       const char *floatDigitsBegin = s;
800*67e74705SXin Li       s = SkipHexDigits(s);
801*67e74705SXin Li       if (containsDigits(floatDigitsBegin, s))
802*67e74705SXin Li         HasSignificandDigits = true;
803*67e74705SXin Li       if (HasSignificandDigits)
804*67e74705SXin Li         checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
805*67e74705SXin Li     }
806*67e74705SXin Li 
807*67e74705SXin Li     if (!HasSignificandDigits) {
808*67e74705SXin Li       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
809*67e74705SXin Li               diag::err_hex_constant_requires)
810*67e74705SXin Li           << PP.getLangOpts().CPlusPlus << 1;
811*67e74705SXin Li       hadError = true;
812*67e74705SXin Li       return;
813*67e74705SXin Li     }
814*67e74705SXin Li 
815*67e74705SXin Li     // A binary exponent can appear with or with a '.'. If dotted, the
816*67e74705SXin Li     // binary exponent is required.
817*67e74705SXin Li     if (*s == 'p' || *s == 'P') {
818*67e74705SXin Li       checkSeparator(TokLoc, s, CSK_AfterDigits);
819*67e74705SXin Li       const char *Exponent = s;
820*67e74705SXin Li       s++;
821*67e74705SXin Li       saw_exponent = true;
822*67e74705SXin Li       if (*s == '+' || *s == '-')  s++; // sign
823*67e74705SXin Li       const char *first_non_digit = SkipDigits(s);
824*67e74705SXin Li       if (!containsDigits(s, first_non_digit)) {
825*67e74705SXin Li         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
826*67e74705SXin Li                 diag::err_exponent_has_no_digits);
827*67e74705SXin Li         hadError = true;
828*67e74705SXin Li         return;
829*67e74705SXin Li       }
830*67e74705SXin Li       checkSeparator(TokLoc, s, CSK_BeforeDigits);
831*67e74705SXin Li       s = first_non_digit;
832*67e74705SXin Li 
833*67e74705SXin Li       if (!PP.getLangOpts().HexFloats)
834*67e74705SXin Li         PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus
835*67e74705SXin Li                             ? diag::ext_hex_literal_invalid
836*67e74705SXin Li                             : diag::ext_hex_constant_invalid);
837*67e74705SXin Li       else if (PP.getLangOpts().CPlusPlus1z)
838*67e74705SXin Li         PP.Diag(TokLoc, diag::warn_cxx1z_hex_literal);
839*67e74705SXin Li     } else if (saw_period) {
840*67e74705SXin Li       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
841*67e74705SXin Li               diag::err_hex_constant_requires)
842*67e74705SXin Li           << PP.getLangOpts().CPlusPlus << 0;
843*67e74705SXin Li       hadError = true;
844*67e74705SXin Li     }
845*67e74705SXin Li     return;
846*67e74705SXin Li   }
847*67e74705SXin Li 
848*67e74705SXin Li   // Handle simple binary numbers 0b01010
849*67e74705SXin Li   if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
850*67e74705SXin Li     // 0b101010 is a C++1y / GCC extension.
851*67e74705SXin Li     PP.Diag(TokLoc,
852*67e74705SXin Li             PP.getLangOpts().CPlusPlus14
853*67e74705SXin Li               ? diag::warn_cxx11_compat_binary_literal
854*67e74705SXin Li               : PP.getLangOpts().CPlusPlus
855*67e74705SXin Li                 ? diag::ext_binary_literal_cxx14
856*67e74705SXin Li                 : diag::ext_binary_literal);
857*67e74705SXin Li     ++s;
858*67e74705SXin Li     assert(s < ThisTokEnd && "didn't maximally munch?");
859*67e74705SXin Li     radix = 2;
860*67e74705SXin Li     DigitsBegin = s;
861*67e74705SXin Li     s = SkipBinaryDigits(s);
862*67e74705SXin Li     if (s == ThisTokEnd) {
863*67e74705SXin Li       // Done.
864*67e74705SXin Li     } else if (isHexDigit(*s)) {
865*67e74705SXin Li       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
866*67e74705SXin Li               diag::err_invalid_digit) << StringRef(s, 1) << 2;
867*67e74705SXin Li       hadError = true;
868*67e74705SXin Li     }
869*67e74705SXin Li     // Other suffixes will be diagnosed by the caller.
870*67e74705SXin Li     return;
871*67e74705SXin Li   }
872*67e74705SXin Li 
873*67e74705SXin Li   // For now, the radix is set to 8. If we discover that we have a
874*67e74705SXin Li   // floating point constant, the radix will change to 10. Octal floating
875*67e74705SXin Li   // point constants are not permitted (only decimal and hexadecimal).
876*67e74705SXin Li   radix = 8;
877*67e74705SXin Li   DigitsBegin = s;
878*67e74705SXin Li   s = SkipOctalDigits(s);
879*67e74705SXin Li   if (s == ThisTokEnd)
880*67e74705SXin Li     return; // Done, simple octal number like 01234
881*67e74705SXin Li 
882*67e74705SXin Li   // If we have some other non-octal digit that *is* a decimal digit, see if
883*67e74705SXin Li   // this is part of a floating point number like 094.123 or 09e1.
884*67e74705SXin Li   if (isDigit(*s)) {
885*67e74705SXin Li     const char *EndDecimal = SkipDigits(s);
886*67e74705SXin Li     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
887*67e74705SXin Li       s = EndDecimal;
888*67e74705SXin Li       radix = 10;
889*67e74705SXin Li     }
890*67e74705SXin Li   }
891*67e74705SXin Li 
892*67e74705SXin Li   ParseDecimalOrOctalCommon(TokLoc);
893*67e74705SXin Li }
894*67e74705SXin Li 
alwaysFitsInto64Bits(unsigned Radix,unsigned NumDigits)895*67e74705SXin Li static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
896*67e74705SXin Li   switch (Radix) {
897*67e74705SXin Li   case 2:
898*67e74705SXin Li     return NumDigits <= 64;
899*67e74705SXin Li   case 8:
900*67e74705SXin Li     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
901*67e74705SXin Li   case 10:
902*67e74705SXin Li     return NumDigits <= 19; // floor(log10(2^64))
903*67e74705SXin Li   case 16:
904*67e74705SXin Li     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
905*67e74705SXin Li   default:
906*67e74705SXin Li     llvm_unreachable("impossible Radix");
907*67e74705SXin Li   }
908*67e74705SXin Li }
909*67e74705SXin Li 
910*67e74705SXin Li /// GetIntegerValue - Convert this numeric literal value to an APInt that
911*67e74705SXin Li /// matches Val's input width.  If there is an overflow, set Val to the low bits
912*67e74705SXin Li /// of the result and return true.  Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)913*67e74705SXin Li bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
914*67e74705SXin Li   // Fast path: Compute a conservative bound on the maximum number of
915*67e74705SXin Li   // bits per digit in this radix. If we can't possibly overflow a
916*67e74705SXin Li   // uint64 based on that bound then do the simple conversion to
917*67e74705SXin Li   // integer. This avoids the expensive overflow checking below, and
918*67e74705SXin Li   // handles the common cases that matter (small decimal integers and
919*67e74705SXin Li   // hex/octal values which don't overflow).
920*67e74705SXin Li   const unsigned NumDigits = SuffixBegin - DigitsBegin;
921*67e74705SXin Li   if (alwaysFitsInto64Bits(radix, NumDigits)) {
922*67e74705SXin Li     uint64_t N = 0;
923*67e74705SXin Li     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
924*67e74705SXin Li       if (!isDigitSeparator(*Ptr))
925*67e74705SXin Li         N = N * radix + llvm::hexDigitValue(*Ptr);
926*67e74705SXin Li 
927*67e74705SXin Li     // This will truncate the value to Val's input width. Simply check
928*67e74705SXin Li     // for overflow by comparing.
929*67e74705SXin Li     Val = N;
930*67e74705SXin Li     return Val.getZExtValue() != N;
931*67e74705SXin Li   }
932*67e74705SXin Li 
933*67e74705SXin Li   Val = 0;
934*67e74705SXin Li   const char *Ptr = DigitsBegin;
935*67e74705SXin Li 
936*67e74705SXin Li   llvm::APInt RadixVal(Val.getBitWidth(), radix);
937*67e74705SXin Li   llvm::APInt CharVal(Val.getBitWidth(), 0);
938*67e74705SXin Li   llvm::APInt OldVal = Val;
939*67e74705SXin Li 
940*67e74705SXin Li   bool OverflowOccurred = false;
941*67e74705SXin Li   while (Ptr < SuffixBegin) {
942*67e74705SXin Li     if (isDigitSeparator(*Ptr)) {
943*67e74705SXin Li       ++Ptr;
944*67e74705SXin Li       continue;
945*67e74705SXin Li     }
946*67e74705SXin Li 
947*67e74705SXin Li     unsigned C = llvm::hexDigitValue(*Ptr++);
948*67e74705SXin Li 
949*67e74705SXin Li     // If this letter is out of bound for this radix, reject it.
950*67e74705SXin Li     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
951*67e74705SXin Li 
952*67e74705SXin Li     CharVal = C;
953*67e74705SXin Li 
954*67e74705SXin Li     // Add the digit to the value in the appropriate radix.  If adding in digits
955*67e74705SXin Li     // made the value smaller, then this overflowed.
956*67e74705SXin Li     OldVal = Val;
957*67e74705SXin Li 
958*67e74705SXin Li     // Multiply by radix, did overflow occur on the multiply?
959*67e74705SXin Li     Val *= RadixVal;
960*67e74705SXin Li     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
961*67e74705SXin Li 
962*67e74705SXin Li     // Add value, did overflow occur on the value?
963*67e74705SXin Li     //   (a + b) ult b  <=> overflow
964*67e74705SXin Li     Val += CharVal;
965*67e74705SXin Li     OverflowOccurred |= Val.ult(CharVal);
966*67e74705SXin Li   }
967*67e74705SXin Li   return OverflowOccurred;
968*67e74705SXin Li }
969*67e74705SXin Li 
970*67e74705SXin Li llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)971*67e74705SXin Li NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
972*67e74705SXin Li   using llvm::APFloat;
973*67e74705SXin Li 
974*67e74705SXin Li   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
975*67e74705SXin Li 
976*67e74705SXin Li   llvm::SmallString<16> Buffer;
977*67e74705SXin Li   StringRef Str(ThisTokBegin, n);
978*67e74705SXin Li   if (Str.find('\'') != StringRef::npos) {
979*67e74705SXin Li     Buffer.reserve(n);
980*67e74705SXin Li     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
981*67e74705SXin Li                         &isDigitSeparator);
982*67e74705SXin Li     Str = Buffer;
983*67e74705SXin Li   }
984*67e74705SXin Li 
985*67e74705SXin Li   return Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
986*67e74705SXin Li }
987*67e74705SXin Li 
988*67e74705SXin Li 
989*67e74705SXin Li /// \verbatim
990*67e74705SXin Li ///       user-defined-character-literal: [C++11 lex.ext]
991*67e74705SXin Li ///         character-literal ud-suffix
992*67e74705SXin Li ///       ud-suffix:
993*67e74705SXin Li ///         identifier
994*67e74705SXin Li ///       character-literal: [C++11 lex.ccon]
995*67e74705SXin Li ///         ' c-char-sequence '
996*67e74705SXin Li ///         u' c-char-sequence '
997*67e74705SXin Li ///         U' c-char-sequence '
998*67e74705SXin Li ///         L' c-char-sequence '
999*67e74705SXin Li ///         u8' c-char-sequence ' [C++1z lex.ccon]
1000*67e74705SXin Li ///       c-char-sequence:
1001*67e74705SXin Li ///         c-char
1002*67e74705SXin Li ///         c-char-sequence c-char
1003*67e74705SXin Li ///       c-char:
1004*67e74705SXin Li ///         any member of the source character set except the single-quote ',
1005*67e74705SXin Li ///           backslash \, or new-line character
1006*67e74705SXin Li ///         escape-sequence
1007*67e74705SXin Li ///         universal-character-name
1008*67e74705SXin Li ///       escape-sequence:
1009*67e74705SXin Li ///         simple-escape-sequence
1010*67e74705SXin Li ///         octal-escape-sequence
1011*67e74705SXin Li ///         hexadecimal-escape-sequence
1012*67e74705SXin Li ///       simple-escape-sequence:
1013*67e74705SXin Li ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1014*67e74705SXin Li ///       octal-escape-sequence:
1015*67e74705SXin Li ///         \ octal-digit
1016*67e74705SXin Li ///         \ octal-digit octal-digit
1017*67e74705SXin Li ///         \ octal-digit octal-digit octal-digit
1018*67e74705SXin Li ///       hexadecimal-escape-sequence:
1019*67e74705SXin Li ///         \x hexadecimal-digit
1020*67e74705SXin Li ///         hexadecimal-escape-sequence hexadecimal-digit
1021*67e74705SXin Li ///       universal-character-name: [C++11 lex.charset]
1022*67e74705SXin Li ///         \u hex-quad
1023*67e74705SXin Li ///         \U hex-quad hex-quad
1024*67e74705SXin Li ///       hex-quad:
1025*67e74705SXin Li ///         hex-digit hex-digit hex-digit hex-digit
1026*67e74705SXin Li /// \endverbatim
1027*67e74705SXin Li ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)1028*67e74705SXin Li CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1029*67e74705SXin Li                                      SourceLocation Loc, Preprocessor &PP,
1030*67e74705SXin Li                                      tok::TokenKind kind) {
1031*67e74705SXin Li   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1032*67e74705SXin Li   HadError = false;
1033*67e74705SXin Li 
1034*67e74705SXin Li   Kind = kind;
1035*67e74705SXin Li 
1036*67e74705SXin Li   const char *TokBegin = begin;
1037*67e74705SXin Li 
1038*67e74705SXin Li   // Skip over wide character determinant.
1039*67e74705SXin Li   if (Kind != tok::char_constant)
1040*67e74705SXin Li     ++begin;
1041*67e74705SXin Li   if (Kind == tok::utf8_char_constant)
1042*67e74705SXin Li     ++begin;
1043*67e74705SXin Li 
1044*67e74705SXin Li   // Skip over the entry quote.
1045*67e74705SXin Li   assert(begin[0] == '\'' && "Invalid token lexed");
1046*67e74705SXin Li   ++begin;
1047*67e74705SXin Li 
1048*67e74705SXin Li   // Remove an optional ud-suffix.
1049*67e74705SXin Li   if (end[-1] != '\'') {
1050*67e74705SXin Li     const char *UDSuffixEnd = end;
1051*67e74705SXin Li     do {
1052*67e74705SXin Li       --end;
1053*67e74705SXin Li     } while (end[-1] != '\'');
1054*67e74705SXin Li     // FIXME: Don't bother with this if !tok.hasUCN().
1055*67e74705SXin Li     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1056*67e74705SXin Li     UDSuffixOffset = end - TokBegin;
1057*67e74705SXin Li   }
1058*67e74705SXin Li 
1059*67e74705SXin Li   // Trim the ending quote.
1060*67e74705SXin Li   assert(end != begin && "Invalid token lexed");
1061*67e74705SXin Li   --end;
1062*67e74705SXin Li 
1063*67e74705SXin Li   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1064*67e74705SXin Li   // up to 64-bits.
1065*67e74705SXin Li   // FIXME: This extensively assumes that 'char' is 8-bits.
1066*67e74705SXin Li   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1067*67e74705SXin Li          "Assumes char is 8 bits");
1068*67e74705SXin Li   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1069*67e74705SXin Li          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1070*67e74705SXin Li          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1071*67e74705SXin Li   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1072*67e74705SXin Li          "Assumes sizeof(wchar) on target is <= 64");
1073*67e74705SXin Li 
1074*67e74705SXin Li   SmallVector<uint32_t, 4> codepoint_buffer;
1075*67e74705SXin Li   codepoint_buffer.resize(end - begin);
1076*67e74705SXin Li   uint32_t *buffer_begin = &codepoint_buffer.front();
1077*67e74705SXin Li   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1078*67e74705SXin Li 
1079*67e74705SXin Li   // Unicode escapes representing characters that cannot be correctly
1080*67e74705SXin Li   // represented in a single code unit are disallowed in character literals
1081*67e74705SXin Li   // by this implementation.
1082*67e74705SXin Li   uint32_t largest_character_for_kind;
1083*67e74705SXin Li   if (tok::wide_char_constant == Kind) {
1084*67e74705SXin Li     largest_character_for_kind =
1085*67e74705SXin Li         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1086*67e74705SXin Li   } else if (tok::utf8_char_constant == Kind) {
1087*67e74705SXin Li     largest_character_for_kind = 0x7F;
1088*67e74705SXin Li   } else if (tok::utf16_char_constant == Kind) {
1089*67e74705SXin Li     largest_character_for_kind = 0xFFFF;
1090*67e74705SXin Li   } else if (tok::utf32_char_constant == Kind) {
1091*67e74705SXin Li     largest_character_for_kind = 0x10FFFF;
1092*67e74705SXin Li   } else {
1093*67e74705SXin Li     largest_character_for_kind = 0x7Fu;
1094*67e74705SXin Li   }
1095*67e74705SXin Li 
1096*67e74705SXin Li   while (begin != end) {
1097*67e74705SXin Li     // Is this a span of non-escape characters?
1098*67e74705SXin Li     if (begin[0] != '\\') {
1099*67e74705SXin Li       char const *start = begin;
1100*67e74705SXin Li       do {
1101*67e74705SXin Li         ++begin;
1102*67e74705SXin Li       } while (begin != end && *begin != '\\');
1103*67e74705SXin Li 
1104*67e74705SXin Li       char const *tmp_in_start = start;
1105*67e74705SXin Li       uint32_t *tmp_out_start = buffer_begin;
1106*67e74705SXin Li       ConversionResult res =
1107*67e74705SXin Li           ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
1108*67e74705SXin Li                              reinterpret_cast<UTF8 const *>(begin),
1109*67e74705SXin Li                              &buffer_begin, buffer_end, strictConversion);
1110*67e74705SXin Li       if (res != conversionOK) {
1111*67e74705SXin Li         // If we see bad encoding for unprefixed character literals, warn and
1112*67e74705SXin Li         // simply copy the byte values, for compatibility with gcc and
1113*67e74705SXin Li         // older versions of clang.
1114*67e74705SXin Li         bool NoErrorOnBadEncoding = isAscii();
1115*67e74705SXin Li         unsigned Msg = diag::err_bad_character_encoding;
1116*67e74705SXin Li         if (NoErrorOnBadEncoding)
1117*67e74705SXin Li           Msg = diag::warn_bad_character_encoding;
1118*67e74705SXin Li         PP.Diag(Loc, Msg);
1119*67e74705SXin Li         if (NoErrorOnBadEncoding) {
1120*67e74705SXin Li           start = tmp_in_start;
1121*67e74705SXin Li           buffer_begin = tmp_out_start;
1122*67e74705SXin Li           for (; start != begin; ++start, ++buffer_begin)
1123*67e74705SXin Li             *buffer_begin = static_cast<uint8_t>(*start);
1124*67e74705SXin Li         } else {
1125*67e74705SXin Li           HadError = true;
1126*67e74705SXin Li         }
1127*67e74705SXin Li       } else {
1128*67e74705SXin Li         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1129*67e74705SXin Li           if (*tmp_out_start > largest_character_for_kind) {
1130*67e74705SXin Li             HadError = true;
1131*67e74705SXin Li             PP.Diag(Loc, diag::err_character_too_large);
1132*67e74705SXin Li           }
1133*67e74705SXin Li         }
1134*67e74705SXin Li       }
1135*67e74705SXin Li 
1136*67e74705SXin Li       continue;
1137*67e74705SXin Li     }
1138*67e74705SXin Li     // Is this a Universal Character Name escape?
1139*67e74705SXin Li     if (begin[1] == 'u' || begin[1] == 'U') {
1140*67e74705SXin Li       unsigned short UcnLen = 0;
1141*67e74705SXin Li       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1142*67e74705SXin Li                             FullSourceLoc(Loc, PP.getSourceManager()),
1143*67e74705SXin Li                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1144*67e74705SXin Li         HadError = true;
1145*67e74705SXin Li       } else if (*buffer_begin > largest_character_for_kind) {
1146*67e74705SXin Li         HadError = true;
1147*67e74705SXin Li         PP.Diag(Loc, diag::err_character_too_large);
1148*67e74705SXin Li       }
1149*67e74705SXin Li 
1150*67e74705SXin Li       ++buffer_begin;
1151*67e74705SXin Li       continue;
1152*67e74705SXin Li     }
1153*67e74705SXin Li     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1154*67e74705SXin Li     uint64_t result =
1155*67e74705SXin Li       ProcessCharEscape(TokBegin, begin, end, HadError,
1156*67e74705SXin Li                         FullSourceLoc(Loc,PP.getSourceManager()),
1157*67e74705SXin Li                         CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1158*67e74705SXin Li     *buffer_begin++ = result;
1159*67e74705SXin Li   }
1160*67e74705SXin Li 
1161*67e74705SXin Li   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1162*67e74705SXin Li 
1163*67e74705SXin Li   if (NumCharsSoFar > 1) {
1164*67e74705SXin Li     if (isWide())
1165*67e74705SXin Li       PP.Diag(Loc, diag::warn_extraneous_char_constant);
1166*67e74705SXin Li     else if (isAscii() && NumCharsSoFar == 4)
1167*67e74705SXin Li       PP.Diag(Loc, diag::ext_four_char_character_literal);
1168*67e74705SXin Li     else if (isAscii())
1169*67e74705SXin Li       PP.Diag(Loc, diag::ext_multichar_character_literal);
1170*67e74705SXin Li     else
1171*67e74705SXin Li       PP.Diag(Loc, diag::err_multichar_utf_character_literal);
1172*67e74705SXin Li     IsMultiChar = true;
1173*67e74705SXin Li   } else {
1174*67e74705SXin Li     IsMultiChar = false;
1175*67e74705SXin Li   }
1176*67e74705SXin Li 
1177*67e74705SXin Li   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1178*67e74705SXin Li 
1179*67e74705SXin Li   // Narrow character literals act as though their value is concatenated
1180*67e74705SXin Li   // in this implementation, but warn on overflow.
1181*67e74705SXin Li   bool multi_char_too_long = false;
1182*67e74705SXin Li   if (isAscii() && isMultiChar()) {
1183*67e74705SXin Li     LitVal = 0;
1184*67e74705SXin Li     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1185*67e74705SXin Li       // check for enough leading zeros to shift into
1186*67e74705SXin Li       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1187*67e74705SXin Li       LitVal <<= 8;
1188*67e74705SXin Li       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1189*67e74705SXin Li     }
1190*67e74705SXin Li   } else if (NumCharsSoFar > 0) {
1191*67e74705SXin Li     // otherwise just take the last character
1192*67e74705SXin Li     LitVal = buffer_begin[-1];
1193*67e74705SXin Li   }
1194*67e74705SXin Li 
1195*67e74705SXin Li   if (!HadError && multi_char_too_long) {
1196*67e74705SXin Li     PP.Diag(Loc, diag::warn_char_constant_too_large);
1197*67e74705SXin Li   }
1198*67e74705SXin Li 
1199*67e74705SXin Li   // Transfer the value from APInt to uint64_t
1200*67e74705SXin Li   Value = LitVal.getZExtValue();
1201*67e74705SXin Li 
1202*67e74705SXin Li   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1203*67e74705SXin Li   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1204*67e74705SXin Li   // character constants are not sign extended in this implementation:
1205*67e74705SXin Li   // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1206*67e74705SXin Li   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
1207*67e74705SXin Li       PP.getLangOpts().CharIsSigned)
1208*67e74705SXin Li     Value = (signed char)Value;
1209*67e74705SXin Li }
1210*67e74705SXin Li 
1211*67e74705SXin Li /// \verbatim
1212*67e74705SXin Li ///       string-literal: [C++0x lex.string]
1213*67e74705SXin Li ///         encoding-prefix " [s-char-sequence] "
1214*67e74705SXin Li ///         encoding-prefix R raw-string
1215*67e74705SXin Li ///       encoding-prefix:
1216*67e74705SXin Li ///         u8
1217*67e74705SXin Li ///         u
1218*67e74705SXin Li ///         U
1219*67e74705SXin Li ///         L
1220*67e74705SXin Li ///       s-char-sequence:
1221*67e74705SXin Li ///         s-char
1222*67e74705SXin Li ///         s-char-sequence s-char
1223*67e74705SXin Li ///       s-char:
1224*67e74705SXin Li ///         any member of the source character set except the double-quote ",
1225*67e74705SXin Li ///           backslash \, or new-line character
1226*67e74705SXin Li ///         escape-sequence
1227*67e74705SXin Li ///         universal-character-name
1228*67e74705SXin Li ///       raw-string:
1229*67e74705SXin Li ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1230*67e74705SXin Li ///       r-char-sequence:
1231*67e74705SXin Li ///         r-char
1232*67e74705SXin Li ///         r-char-sequence r-char
1233*67e74705SXin Li ///       r-char:
1234*67e74705SXin Li ///         any member of the source character set, except a right parenthesis )
1235*67e74705SXin Li ///           followed by the initial d-char-sequence (which may be empty)
1236*67e74705SXin Li ///           followed by a double quote ".
1237*67e74705SXin Li ///       d-char-sequence:
1238*67e74705SXin Li ///         d-char
1239*67e74705SXin Li ///         d-char-sequence d-char
1240*67e74705SXin Li ///       d-char:
1241*67e74705SXin Li ///         any member of the basic source character set except:
1242*67e74705SXin Li ///           space, the left parenthesis (, the right parenthesis ),
1243*67e74705SXin Li ///           the backslash \, and the control characters representing horizontal
1244*67e74705SXin Li ///           tab, vertical tab, form feed, and newline.
1245*67e74705SXin Li ///       escape-sequence: [C++0x lex.ccon]
1246*67e74705SXin Li ///         simple-escape-sequence
1247*67e74705SXin Li ///         octal-escape-sequence
1248*67e74705SXin Li ///         hexadecimal-escape-sequence
1249*67e74705SXin Li ///       simple-escape-sequence:
1250*67e74705SXin Li ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1251*67e74705SXin Li ///       octal-escape-sequence:
1252*67e74705SXin Li ///         \ octal-digit
1253*67e74705SXin Li ///         \ octal-digit octal-digit
1254*67e74705SXin Li ///         \ octal-digit octal-digit octal-digit
1255*67e74705SXin Li ///       hexadecimal-escape-sequence:
1256*67e74705SXin Li ///         \x hexadecimal-digit
1257*67e74705SXin Li ///         hexadecimal-escape-sequence hexadecimal-digit
1258*67e74705SXin Li ///       universal-character-name:
1259*67e74705SXin Li ///         \u hex-quad
1260*67e74705SXin Li ///         \U hex-quad hex-quad
1261*67e74705SXin Li ///       hex-quad:
1262*67e74705SXin Li ///         hex-digit hex-digit hex-digit hex-digit
1263*67e74705SXin Li /// \endverbatim
1264*67e74705SXin Li ///
1265*67e74705SXin Li StringLiteralParser::
StringLiteralParser(ArrayRef<Token> StringToks,Preprocessor & PP,bool Complain)1266*67e74705SXin Li StringLiteralParser(ArrayRef<Token> StringToks,
1267*67e74705SXin Li                     Preprocessor &PP, bool Complain)
1268*67e74705SXin Li   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1269*67e74705SXin Li     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
1270*67e74705SXin Li     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1271*67e74705SXin Li     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1272*67e74705SXin Li   init(StringToks);
1273*67e74705SXin Li }
1274*67e74705SXin Li 
init(ArrayRef<Token> StringToks)1275*67e74705SXin Li void StringLiteralParser::init(ArrayRef<Token> StringToks){
1276*67e74705SXin Li   // The literal token may have come from an invalid source location (e.g. due
1277*67e74705SXin Li   // to a PCH error), in which case the token length will be 0.
1278*67e74705SXin Li   if (StringToks.empty() || StringToks[0].getLength() < 2)
1279*67e74705SXin Li     return DiagnoseLexingError(SourceLocation());
1280*67e74705SXin Li 
1281*67e74705SXin Li   // Scan all of the string portions, remember the max individual token length,
1282*67e74705SXin Li   // computing a bound on the concatenated string length, and see whether any
1283*67e74705SXin Li   // piece is a wide-string.  If any of the string portions is a wide-string
1284*67e74705SXin Li   // literal, the result is a wide-string literal [C99 6.4.5p4].
1285*67e74705SXin Li   assert(!StringToks.empty() && "expected at least one token");
1286*67e74705SXin Li   MaxTokenLength = StringToks[0].getLength();
1287*67e74705SXin Li   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1288*67e74705SXin Li   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1289*67e74705SXin Li   Kind = StringToks[0].getKind();
1290*67e74705SXin Li 
1291*67e74705SXin Li   hadError = false;
1292*67e74705SXin Li 
1293*67e74705SXin Li   // Implement Translation Phase #6: concatenation of string literals
1294*67e74705SXin Li   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1295*67e74705SXin Li   for (unsigned i = 1; i != StringToks.size(); ++i) {
1296*67e74705SXin Li     if (StringToks[i].getLength() < 2)
1297*67e74705SXin Li       return DiagnoseLexingError(StringToks[i].getLocation());
1298*67e74705SXin Li 
1299*67e74705SXin Li     // The string could be shorter than this if it needs cleaning, but this is a
1300*67e74705SXin Li     // reasonable bound, which is all we need.
1301*67e74705SXin Li     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1302*67e74705SXin Li     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1303*67e74705SXin Li 
1304*67e74705SXin Li     // Remember maximum string piece length.
1305*67e74705SXin Li     if (StringToks[i].getLength() > MaxTokenLength)
1306*67e74705SXin Li       MaxTokenLength = StringToks[i].getLength();
1307*67e74705SXin Li 
1308*67e74705SXin Li     // Remember if we see any wide or utf-8/16/32 strings.
1309*67e74705SXin Li     // Also check for illegal concatenations.
1310*67e74705SXin Li     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1311*67e74705SXin Li       if (isAscii()) {
1312*67e74705SXin Li         Kind = StringToks[i].getKind();
1313*67e74705SXin Li       } else {
1314*67e74705SXin Li         if (Diags)
1315*67e74705SXin Li           Diags->Report(StringToks[i].getLocation(),
1316*67e74705SXin Li                         diag::err_unsupported_string_concat);
1317*67e74705SXin Li         hadError = true;
1318*67e74705SXin Li       }
1319*67e74705SXin Li     }
1320*67e74705SXin Li   }
1321*67e74705SXin Li 
1322*67e74705SXin Li   // Include space for the null terminator.
1323*67e74705SXin Li   ++SizeBound;
1324*67e74705SXin Li 
1325*67e74705SXin Li   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1326*67e74705SXin Li 
1327*67e74705SXin Li   // Get the width in bytes of char/wchar_t/char16_t/char32_t
1328*67e74705SXin Li   CharByteWidth = getCharWidth(Kind, Target);
1329*67e74705SXin Li   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1330*67e74705SXin Li   CharByteWidth /= 8;
1331*67e74705SXin Li 
1332*67e74705SXin Li   // The output buffer size needs to be large enough to hold wide characters.
1333*67e74705SXin Li   // This is a worst-case assumption which basically corresponds to L"" "long".
1334*67e74705SXin Li   SizeBound *= CharByteWidth;
1335*67e74705SXin Li 
1336*67e74705SXin Li   // Size the temporary buffer to hold the result string data.
1337*67e74705SXin Li   ResultBuf.resize(SizeBound);
1338*67e74705SXin Li 
1339*67e74705SXin Li   // Likewise, but for each string piece.
1340*67e74705SXin Li   SmallString<512> TokenBuf;
1341*67e74705SXin Li   TokenBuf.resize(MaxTokenLength);
1342*67e74705SXin Li 
1343*67e74705SXin Li   // Loop over all the strings, getting their spelling, and expanding them to
1344*67e74705SXin Li   // wide strings as appropriate.
1345*67e74705SXin Li   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1346*67e74705SXin Li 
1347*67e74705SXin Li   Pascal = false;
1348*67e74705SXin Li 
1349*67e74705SXin Li   SourceLocation UDSuffixTokLoc;
1350*67e74705SXin Li 
1351*67e74705SXin Li   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1352*67e74705SXin Li     const char *ThisTokBuf = &TokenBuf[0];
1353*67e74705SXin Li     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1354*67e74705SXin Li     // that ThisTokBuf points to a buffer that is big enough for the whole token
1355*67e74705SXin Li     // and 'spelled' tokens can only shrink.
1356*67e74705SXin Li     bool StringInvalid = false;
1357*67e74705SXin Li     unsigned ThisTokLen =
1358*67e74705SXin Li       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1359*67e74705SXin Li                          &StringInvalid);
1360*67e74705SXin Li     if (StringInvalid)
1361*67e74705SXin Li       return DiagnoseLexingError(StringToks[i].getLocation());
1362*67e74705SXin Li 
1363*67e74705SXin Li     const char *ThisTokBegin = ThisTokBuf;
1364*67e74705SXin Li     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1365*67e74705SXin Li 
1366*67e74705SXin Li     // Remove an optional ud-suffix.
1367*67e74705SXin Li     if (ThisTokEnd[-1] != '"') {
1368*67e74705SXin Li       const char *UDSuffixEnd = ThisTokEnd;
1369*67e74705SXin Li       do {
1370*67e74705SXin Li         --ThisTokEnd;
1371*67e74705SXin Li       } while (ThisTokEnd[-1] != '"');
1372*67e74705SXin Li 
1373*67e74705SXin Li       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1374*67e74705SXin Li 
1375*67e74705SXin Li       if (UDSuffixBuf.empty()) {
1376*67e74705SXin Li         if (StringToks[i].hasUCN())
1377*67e74705SXin Li           expandUCNs(UDSuffixBuf, UDSuffix);
1378*67e74705SXin Li         else
1379*67e74705SXin Li           UDSuffixBuf.assign(UDSuffix);
1380*67e74705SXin Li         UDSuffixToken = i;
1381*67e74705SXin Li         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1382*67e74705SXin Li         UDSuffixTokLoc = StringToks[i].getLocation();
1383*67e74705SXin Li       } else {
1384*67e74705SXin Li         SmallString<32> ExpandedUDSuffix;
1385*67e74705SXin Li         if (StringToks[i].hasUCN()) {
1386*67e74705SXin Li           expandUCNs(ExpandedUDSuffix, UDSuffix);
1387*67e74705SXin Li           UDSuffix = ExpandedUDSuffix;
1388*67e74705SXin Li         }
1389*67e74705SXin Li 
1390*67e74705SXin Li         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1391*67e74705SXin Li         // result of a concatenation involving at least one user-defined-string-
1392*67e74705SXin Li         // literal, all the participating user-defined-string-literals shall
1393*67e74705SXin Li         // have the same ud-suffix.
1394*67e74705SXin Li         if (UDSuffixBuf != UDSuffix) {
1395*67e74705SXin Li           if (Diags) {
1396*67e74705SXin Li             SourceLocation TokLoc = StringToks[i].getLocation();
1397*67e74705SXin Li             Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1398*67e74705SXin Li               << UDSuffixBuf << UDSuffix
1399*67e74705SXin Li               << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1400*67e74705SXin Li               << SourceRange(TokLoc, TokLoc);
1401*67e74705SXin Li           }
1402*67e74705SXin Li           hadError = true;
1403*67e74705SXin Li         }
1404*67e74705SXin Li       }
1405*67e74705SXin Li     }
1406*67e74705SXin Li 
1407*67e74705SXin Li     // Strip the end quote.
1408*67e74705SXin Li     --ThisTokEnd;
1409*67e74705SXin Li 
1410*67e74705SXin Li     // TODO: Input character set mapping support.
1411*67e74705SXin Li 
1412*67e74705SXin Li     // Skip marker for wide or unicode strings.
1413*67e74705SXin Li     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1414*67e74705SXin Li       ++ThisTokBuf;
1415*67e74705SXin Li       // Skip 8 of u8 marker for utf8 strings.
1416*67e74705SXin Li       if (ThisTokBuf[0] == '8')
1417*67e74705SXin Li         ++ThisTokBuf;
1418*67e74705SXin Li     }
1419*67e74705SXin Li 
1420*67e74705SXin Li     // Check for raw string
1421*67e74705SXin Li     if (ThisTokBuf[0] == 'R') {
1422*67e74705SXin Li       ThisTokBuf += 2; // skip R"
1423*67e74705SXin Li 
1424*67e74705SXin Li       const char *Prefix = ThisTokBuf;
1425*67e74705SXin Li       while (ThisTokBuf[0] != '(')
1426*67e74705SXin Li         ++ThisTokBuf;
1427*67e74705SXin Li       ++ThisTokBuf; // skip '('
1428*67e74705SXin Li 
1429*67e74705SXin Li       // Remove same number of characters from the end
1430*67e74705SXin Li       ThisTokEnd -= ThisTokBuf - Prefix;
1431*67e74705SXin Li       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1432*67e74705SXin Li 
1433*67e74705SXin Li       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
1434*67e74705SXin Li       // results in a new-line in the resulting execution string-literal.
1435*67e74705SXin Li       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
1436*67e74705SXin Li       while (!RemainingTokenSpan.empty()) {
1437*67e74705SXin Li         // Split the string literal on \r\n boundaries.
1438*67e74705SXin Li         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
1439*67e74705SXin Li         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
1440*67e74705SXin Li         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
1441*67e74705SXin Li 
1442*67e74705SXin Li         // Copy everything before the \r\n sequence into the string literal.
1443*67e74705SXin Li         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
1444*67e74705SXin Li           hadError = true;
1445*67e74705SXin Li 
1446*67e74705SXin Li         // Point into the \n inside the \r\n sequence and operate on the
1447*67e74705SXin Li         // remaining portion of the literal.
1448*67e74705SXin Li         RemainingTokenSpan = AfterCRLF.substr(1);
1449*67e74705SXin Li       }
1450*67e74705SXin Li     } else {
1451*67e74705SXin Li       if (ThisTokBuf[0] != '"') {
1452*67e74705SXin Li         // The file may have come from PCH and then changed after loading the
1453*67e74705SXin Li         // PCH; Fail gracefully.
1454*67e74705SXin Li         return DiagnoseLexingError(StringToks[i].getLocation());
1455*67e74705SXin Li       }
1456*67e74705SXin Li       ++ThisTokBuf; // skip "
1457*67e74705SXin Li 
1458*67e74705SXin Li       // Check if this is a pascal string
1459*67e74705SXin Li       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1460*67e74705SXin Li           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1461*67e74705SXin Li 
1462*67e74705SXin Li         // If the \p sequence is found in the first token, we have a pascal string
1463*67e74705SXin Li         // Otherwise, if we already have a pascal string, ignore the first \p
1464*67e74705SXin Li         if (i == 0) {
1465*67e74705SXin Li           ++ThisTokBuf;
1466*67e74705SXin Li           Pascal = true;
1467*67e74705SXin Li         } else if (Pascal)
1468*67e74705SXin Li           ThisTokBuf += 2;
1469*67e74705SXin Li       }
1470*67e74705SXin Li 
1471*67e74705SXin Li       while (ThisTokBuf != ThisTokEnd) {
1472*67e74705SXin Li         // Is this a span of non-escape characters?
1473*67e74705SXin Li         if (ThisTokBuf[0] != '\\') {
1474*67e74705SXin Li           const char *InStart = ThisTokBuf;
1475*67e74705SXin Li           do {
1476*67e74705SXin Li             ++ThisTokBuf;
1477*67e74705SXin Li           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1478*67e74705SXin Li 
1479*67e74705SXin Li           // Copy the character span over.
1480*67e74705SXin Li           if (CopyStringFragment(StringToks[i], ThisTokBegin,
1481*67e74705SXin Li                                  StringRef(InStart, ThisTokBuf - InStart)))
1482*67e74705SXin Li             hadError = true;
1483*67e74705SXin Li           continue;
1484*67e74705SXin Li         }
1485*67e74705SXin Li         // Is this a Universal Character Name escape?
1486*67e74705SXin Li         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1487*67e74705SXin Li           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1488*67e74705SXin Li                           ResultPtr, hadError,
1489*67e74705SXin Li                           FullSourceLoc(StringToks[i].getLocation(), SM),
1490*67e74705SXin Li                           CharByteWidth, Diags, Features);
1491*67e74705SXin Li           continue;
1492*67e74705SXin Li         }
1493*67e74705SXin Li         // Otherwise, this is a non-UCN escape character.  Process it.
1494*67e74705SXin Li         unsigned ResultChar =
1495*67e74705SXin Li           ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1496*67e74705SXin Li                             FullSourceLoc(StringToks[i].getLocation(), SM),
1497*67e74705SXin Li                             CharByteWidth*8, Diags, Features);
1498*67e74705SXin Li 
1499*67e74705SXin Li         if (CharByteWidth == 4) {
1500*67e74705SXin Li           // FIXME: Make the type of the result buffer correct instead of
1501*67e74705SXin Li           // using reinterpret_cast.
1502*67e74705SXin Li           UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
1503*67e74705SXin Li           *ResultWidePtr = ResultChar;
1504*67e74705SXin Li           ResultPtr += 4;
1505*67e74705SXin Li         } else if (CharByteWidth == 2) {
1506*67e74705SXin Li           // FIXME: Make the type of the result buffer correct instead of
1507*67e74705SXin Li           // using reinterpret_cast.
1508*67e74705SXin Li           UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
1509*67e74705SXin Li           *ResultWidePtr = ResultChar & 0xFFFF;
1510*67e74705SXin Li           ResultPtr += 2;
1511*67e74705SXin Li         } else {
1512*67e74705SXin Li           assert(CharByteWidth == 1 && "Unexpected char width");
1513*67e74705SXin Li           *ResultPtr++ = ResultChar & 0xFF;
1514*67e74705SXin Li         }
1515*67e74705SXin Li       }
1516*67e74705SXin Li     }
1517*67e74705SXin Li   }
1518*67e74705SXin Li 
1519*67e74705SXin Li   if (Pascal) {
1520*67e74705SXin Li     if (CharByteWidth == 4) {
1521*67e74705SXin Li       // FIXME: Make the type of the result buffer correct instead of
1522*67e74705SXin Li       // using reinterpret_cast.
1523*67e74705SXin Li       UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1524*67e74705SXin Li       ResultWidePtr[0] = GetNumStringChars() - 1;
1525*67e74705SXin Li     } else if (CharByteWidth == 2) {
1526*67e74705SXin Li       // FIXME: Make the type of the result buffer correct instead of
1527*67e74705SXin Li       // using reinterpret_cast.
1528*67e74705SXin Li       UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1529*67e74705SXin Li       ResultWidePtr[0] = GetNumStringChars() - 1;
1530*67e74705SXin Li     } else {
1531*67e74705SXin Li       assert(CharByteWidth == 1 && "Unexpected char width");
1532*67e74705SXin Li       ResultBuf[0] = GetNumStringChars() - 1;
1533*67e74705SXin Li     }
1534*67e74705SXin Li 
1535*67e74705SXin Li     // Verify that pascal strings aren't too large.
1536*67e74705SXin Li     if (GetStringLength() > 256) {
1537*67e74705SXin Li       if (Diags)
1538*67e74705SXin Li         Diags->Report(StringToks.front().getLocation(),
1539*67e74705SXin Li                       diag::err_pascal_string_too_long)
1540*67e74705SXin Li           << SourceRange(StringToks.front().getLocation(),
1541*67e74705SXin Li                          StringToks.back().getLocation());
1542*67e74705SXin Li       hadError = true;
1543*67e74705SXin Li       return;
1544*67e74705SXin Li     }
1545*67e74705SXin Li   } else if (Diags) {
1546*67e74705SXin Li     // Complain if this string literal has too many characters.
1547*67e74705SXin Li     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1548*67e74705SXin Li 
1549*67e74705SXin Li     if (GetNumStringChars() > MaxChars)
1550*67e74705SXin Li       Diags->Report(StringToks.front().getLocation(),
1551*67e74705SXin Li                     diag::ext_string_too_long)
1552*67e74705SXin Li         << GetNumStringChars() << MaxChars
1553*67e74705SXin Li         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1554*67e74705SXin Li         << SourceRange(StringToks.front().getLocation(),
1555*67e74705SXin Li                        StringToks.back().getLocation());
1556*67e74705SXin Li   }
1557*67e74705SXin Li }
1558*67e74705SXin Li 
resyncUTF8(const char * Err,const char * End)1559*67e74705SXin Li static const char *resyncUTF8(const char *Err, const char *End) {
1560*67e74705SXin Li   if (Err == End)
1561*67e74705SXin Li     return End;
1562*67e74705SXin Li   End = Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End-Err);
1563*67e74705SXin Li   while (++Err != End && (*Err & 0xC0) == 0x80)
1564*67e74705SXin Li     ;
1565*67e74705SXin Li   return Err;
1566*67e74705SXin Li }
1567*67e74705SXin Li 
1568*67e74705SXin Li /// \brief This function copies from Fragment, which is a sequence of bytes
1569*67e74705SXin Li /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1570*67e74705SXin Li /// Performs widening for multi-byte characters.
CopyStringFragment(const Token & Tok,const char * TokBegin,StringRef Fragment)1571*67e74705SXin Li bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1572*67e74705SXin Li                                              const char *TokBegin,
1573*67e74705SXin Li                                              StringRef Fragment) {
1574*67e74705SXin Li   const UTF8 *ErrorPtrTmp;
1575*67e74705SXin Li   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1576*67e74705SXin Li     return false;
1577*67e74705SXin Li 
1578*67e74705SXin Li   // If we see bad encoding for unprefixed string literals, warn and
1579*67e74705SXin Li   // simply copy the byte values, for compatibility with gcc and older
1580*67e74705SXin Li   // versions of clang.
1581*67e74705SXin Li   bool NoErrorOnBadEncoding = isAscii();
1582*67e74705SXin Li   if (NoErrorOnBadEncoding) {
1583*67e74705SXin Li     memcpy(ResultPtr, Fragment.data(), Fragment.size());
1584*67e74705SXin Li     ResultPtr += Fragment.size();
1585*67e74705SXin Li   }
1586*67e74705SXin Li 
1587*67e74705SXin Li   if (Diags) {
1588*67e74705SXin Li     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1589*67e74705SXin Li 
1590*67e74705SXin Li     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1591*67e74705SXin Li     const DiagnosticBuilder &Builder =
1592*67e74705SXin Li       Diag(Diags, Features, SourceLoc, TokBegin,
1593*67e74705SXin Li            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1594*67e74705SXin Li            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1595*67e74705SXin Li                                 : diag::err_bad_string_encoding);
1596*67e74705SXin Li 
1597*67e74705SXin Li     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1598*67e74705SXin Li     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1599*67e74705SXin Li 
1600*67e74705SXin Li     // Decode into a dummy buffer.
1601*67e74705SXin Li     SmallString<512> Dummy;
1602*67e74705SXin Li     Dummy.reserve(Fragment.size() * CharByteWidth);
1603*67e74705SXin Li     char *Ptr = Dummy.data();
1604*67e74705SXin Li 
1605*67e74705SXin Li     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
1606*67e74705SXin Li       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1607*67e74705SXin Li       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1608*67e74705SXin Li       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1609*67e74705SXin Li                                      ErrorPtr, NextStart);
1610*67e74705SXin Li       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1611*67e74705SXin Li     }
1612*67e74705SXin Li   }
1613*67e74705SXin Li   return !NoErrorOnBadEncoding;
1614*67e74705SXin Li }
1615*67e74705SXin Li 
DiagnoseLexingError(SourceLocation Loc)1616*67e74705SXin Li void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1617*67e74705SXin Li   hadError = true;
1618*67e74705SXin Li   if (Diags)
1619*67e74705SXin Li     Diags->Report(Loc, diag::err_lexing_string);
1620*67e74705SXin Li }
1621*67e74705SXin Li 
1622*67e74705SXin Li /// getOffsetOfStringByte - This function returns the offset of the
1623*67e74705SXin Li /// specified byte of the string data represented by Token.  This handles
1624*67e74705SXin Li /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const1625*67e74705SXin Li unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1626*67e74705SXin Li                                                     unsigned ByteNo) const {
1627*67e74705SXin Li   // Get the spelling of the token.
1628*67e74705SXin Li   SmallString<32> SpellingBuffer;
1629*67e74705SXin Li   SpellingBuffer.resize(Tok.getLength());
1630*67e74705SXin Li 
1631*67e74705SXin Li   bool StringInvalid = false;
1632*67e74705SXin Li   const char *SpellingPtr = &SpellingBuffer[0];
1633*67e74705SXin Li   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1634*67e74705SXin Li                                        &StringInvalid);
1635*67e74705SXin Li   if (StringInvalid)
1636*67e74705SXin Li     return 0;
1637*67e74705SXin Li 
1638*67e74705SXin Li   const char *SpellingStart = SpellingPtr;
1639*67e74705SXin Li   const char *SpellingEnd = SpellingPtr+TokLen;
1640*67e74705SXin Li 
1641*67e74705SXin Li   // Handle UTF-8 strings just like narrow strings.
1642*67e74705SXin Li   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1643*67e74705SXin Li     SpellingPtr += 2;
1644*67e74705SXin Li 
1645*67e74705SXin Li   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1646*67e74705SXin Li          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1647*67e74705SXin Li 
1648*67e74705SXin Li   // For raw string literals, this is easy.
1649*67e74705SXin Li   if (SpellingPtr[0] == 'R') {
1650*67e74705SXin Li     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1651*67e74705SXin Li     // Skip 'R"'.
1652*67e74705SXin Li     SpellingPtr += 2;
1653*67e74705SXin Li     while (*SpellingPtr != '(') {
1654*67e74705SXin Li       ++SpellingPtr;
1655*67e74705SXin Li       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1656*67e74705SXin Li     }
1657*67e74705SXin Li     // Skip '('.
1658*67e74705SXin Li     ++SpellingPtr;
1659*67e74705SXin Li     return SpellingPtr - SpellingStart + ByteNo;
1660*67e74705SXin Li   }
1661*67e74705SXin Li 
1662*67e74705SXin Li   // Skip over the leading quote
1663*67e74705SXin Li   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1664*67e74705SXin Li   ++SpellingPtr;
1665*67e74705SXin Li 
1666*67e74705SXin Li   // Skip over bytes until we find the offset we're looking for.
1667*67e74705SXin Li   while (ByteNo) {
1668*67e74705SXin Li     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1669*67e74705SXin Li 
1670*67e74705SXin Li     // Step over non-escapes simply.
1671*67e74705SXin Li     if (*SpellingPtr != '\\') {
1672*67e74705SXin Li       ++SpellingPtr;
1673*67e74705SXin Li       --ByteNo;
1674*67e74705SXin Li       continue;
1675*67e74705SXin Li     }
1676*67e74705SXin Li 
1677*67e74705SXin Li     // Otherwise, this is an escape character.  Advance over it.
1678*67e74705SXin Li     bool HadError = false;
1679*67e74705SXin Li     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1680*67e74705SXin Li       const char *EscapePtr = SpellingPtr;
1681*67e74705SXin Li       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1682*67e74705SXin Li                                       1, Features, HadError);
1683*67e74705SXin Li       if (Len > ByteNo) {
1684*67e74705SXin Li         // ByteNo is somewhere within the escape sequence.
1685*67e74705SXin Li         SpellingPtr = EscapePtr;
1686*67e74705SXin Li         break;
1687*67e74705SXin Li       }
1688*67e74705SXin Li       ByteNo -= Len;
1689*67e74705SXin Li     } else {
1690*67e74705SXin Li       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1691*67e74705SXin Li                         FullSourceLoc(Tok.getLocation(), SM),
1692*67e74705SXin Li                         CharByteWidth*8, Diags, Features);
1693*67e74705SXin Li       --ByteNo;
1694*67e74705SXin Li     }
1695*67e74705SXin Li     assert(!HadError && "This method isn't valid on erroneous strings");
1696*67e74705SXin Li   }
1697*67e74705SXin Li 
1698*67e74705SXin Li   return SpellingPtr-SpellingStart;
1699*67e74705SXin Li }
1700