// xref: /aosp_15_r20/external/clang/lib/AST/CommentLexer.cpp (revision 67e74705e28f6214e480b399dd47ea732279e315)
//===--- CommentLexer.cpp -------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
9*67e74705SXin Li 
10*67e74705SXin Li #include "clang/AST/CommentLexer.h"
11*67e74705SXin Li #include "clang/AST/CommentCommandTraits.h"
12*67e74705SXin Li #include "clang/AST/CommentDiagnostic.h"
13*67e74705SXin Li #include "clang/Basic/CharInfo.h"
14*67e74705SXin Li #include "llvm/ADT/StringExtras.h"
15*67e74705SXin Li #include "llvm/ADT/StringSwitch.h"
16*67e74705SXin Li #include "llvm/Support/ConvertUTF.h"
17*67e74705SXin Li #include "llvm/Support/ErrorHandling.h"
18*67e74705SXin Li 
19*67e74705SXin Li namespace clang {
20*67e74705SXin Li namespace comments {
21*67e74705SXin Li 
dump(const Lexer & L,const SourceManager & SM) const22*67e74705SXin Li void Token::dump(const Lexer &L, const SourceManager &SM) const {
23*67e74705SXin Li   llvm::errs() << "comments::Token Kind=" << Kind << " ";
24*67e74705SXin Li   Loc.dump(SM);
25*67e74705SXin Li   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
26*67e74705SXin Li }
27*67e74705SXin Li 
isHTMLNamedCharacterReferenceCharacter(char C)28*67e74705SXin Li static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
29*67e74705SXin Li   return isLetter(C);
30*67e74705SXin Li }
31*67e74705SXin Li 
isHTMLDecimalCharacterReferenceCharacter(char C)32*67e74705SXin Li static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
33*67e74705SXin Li   return isDigit(C);
34*67e74705SXin Li }
35*67e74705SXin Li 
isHTMLHexCharacterReferenceCharacter(char C)36*67e74705SXin Li static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
37*67e74705SXin Li   return isHexDigit(C);
38*67e74705SXin Li }
39*67e74705SXin Li 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)40*67e74705SXin Li static inline StringRef convertCodePointToUTF8(
41*67e74705SXin Li                                       llvm::BumpPtrAllocator &Allocator,
42*67e74705SXin Li                                       unsigned CodePoint) {
43*67e74705SXin Li   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44*67e74705SXin Li   char *ResolvedPtr = Resolved;
45*67e74705SXin Li   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46*67e74705SXin Li     return StringRef(Resolved, ResolvedPtr - Resolved);
47*67e74705SXin Li   else
48*67e74705SXin Li     return StringRef();
49*67e74705SXin Li }
50*67e74705SXin Li 
51*67e74705SXin Li namespace {
52*67e74705SXin Li 
53*67e74705SXin Li #include "clang/AST/CommentHTMLTags.inc"
54*67e74705SXin Li #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
55*67e74705SXin Li 
56*67e74705SXin Li } // end anonymous namespace
57*67e74705SXin Li 
resolveHTMLNamedCharacterReference(StringRef Name) const58*67e74705SXin Li StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
59*67e74705SXin Li   // Fast path, first check a few most widely used named character references.
60*67e74705SXin Li   return llvm::StringSwitch<StringRef>(Name)
61*67e74705SXin Li       .Case("amp", "&")
62*67e74705SXin Li       .Case("lt", "<")
63*67e74705SXin Li       .Case("gt", ">")
64*67e74705SXin Li       .Case("quot", "\"")
65*67e74705SXin Li       .Case("apos", "\'")
66*67e74705SXin Li       // Slow path.
67*67e74705SXin Li       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
68*67e74705SXin Li }
69*67e74705SXin Li 
resolveHTMLDecimalCharacterReference(StringRef Name) const70*67e74705SXin Li StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
71*67e74705SXin Li   unsigned CodePoint = 0;
72*67e74705SXin Li   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73*67e74705SXin Li     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
74*67e74705SXin Li     CodePoint *= 10;
75*67e74705SXin Li     CodePoint += Name[i] - '0';
76*67e74705SXin Li   }
77*67e74705SXin Li   return convertCodePointToUTF8(Allocator, CodePoint);
78*67e74705SXin Li }
79*67e74705SXin Li 
resolveHTMLHexCharacterReference(StringRef Name) const80*67e74705SXin Li StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
81*67e74705SXin Li   unsigned CodePoint = 0;
82*67e74705SXin Li   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
83*67e74705SXin Li     CodePoint *= 16;
84*67e74705SXin Li     const char C = Name[i];
85*67e74705SXin Li     assert(isHTMLHexCharacterReferenceCharacter(C));
86*67e74705SXin Li     CodePoint += llvm::hexDigitValue(C);
87*67e74705SXin Li   }
88*67e74705SXin Li   return convertCodePointToUTF8(Allocator, CodePoint);
89*67e74705SXin Li }
90*67e74705SXin Li 
skipLineStartingDecorations()91*67e74705SXin Li void Lexer::skipLineStartingDecorations() {
92*67e74705SXin Li   // This function should be called only for C comments
93*67e74705SXin Li   assert(CommentState == LCS_InsideCComment);
94*67e74705SXin Li 
95*67e74705SXin Li   if (BufferPtr == CommentEnd)
96*67e74705SXin Li     return;
97*67e74705SXin Li 
98*67e74705SXin Li   switch (*BufferPtr) {
99*67e74705SXin Li   case ' ':
100*67e74705SXin Li   case '\t':
101*67e74705SXin Li   case '\f':
102*67e74705SXin Li   case '\v': {
103*67e74705SXin Li     const char *NewBufferPtr = BufferPtr;
104*67e74705SXin Li     NewBufferPtr++;
105*67e74705SXin Li     if (NewBufferPtr == CommentEnd)
106*67e74705SXin Li       return;
107*67e74705SXin Li 
108*67e74705SXin Li     char C = *NewBufferPtr;
109*67e74705SXin Li     while (isHorizontalWhitespace(C)) {
110*67e74705SXin Li       NewBufferPtr++;
111*67e74705SXin Li       if (NewBufferPtr == CommentEnd)
112*67e74705SXin Li         return;
113*67e74705SXin Li       C = *NewBufferPtr;
114*67e74705SXin Li     }
115*67e74705SXin Li     if (C == '*')
116*67e74705SXin Li       BufferPtr = NewBufferPtr + 1;
117*67e74705SXin Li     break;
118*67e74705SXin Li   }
119*67e74705SXin Li   case '*':
120*67e74705SXin Li     BufferPtr++;
121*67e74705SXin Li     break;
122*67e74705SXin Li   }
123*67e74705SXin Li }
124*67e74705SXin Li 
125*67e74705SXin Li namespace {
126*67e74705SXin Li /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)127*67e74705SXin Li const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
128*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
129*67e74705SXin Li     if (isVerticalWhitespace(*BufferPtr))
130*67e74705SXin Li       return BufferPtr;
131*67e74705SXin Li   }
132*67e74705SXin Li   return BufferEnd;
133*67e74705SXin Li }
134*67e74705SXin Li 
/// Skip one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \returns pointer just past the terminator, or \p BufferPtr unchanged when
/// it already equals \p BufferEnd.  The caller must be positioned on a
/// newline character otherwise.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  const char *Next = BufferPtr + 1;
  if (First == '\n')
    return Next;

  assert(First == '\r');
  // A carriage return may be followed by a line feed that belongs to it.
  const bool IsCRLF = Next != BufferEnd && *Next == '\n';
  return IsCRLF ? Next + 1 : Next;
}
149*67e74705SXin Li 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)150*67e74705SXin Li const char *skipNamedCharacterReference(const char *BufferPtr,
151*67e74705SXin Li                                         const char *BufferEnd) {
152*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153*67e74705SXin Li     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
154*67e74705SXin Li       return BufferPtr;
155*67e74705SXin Li   }
156*67e74705SXin Li   return BufferEnd;
157*67e74705SXin Li }
158*67e74705SXin Li 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)159*67e74705SXin Li const char *skipDecimalCharacterReference(const char *BufferPtr,
160*67e74705SXin Li                                           const char *BufferEnd) {
161*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162*67e74705SXin Li     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
163*67e74705SXin Li       return BufferPtr;
164*67e74705SXin Li   }
165*67e74705SXin Li   return BufferEnd;
166*67e74705SXin Li }
167*67e74705SXin Li 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)168*67e74705SXin Li const char *skipHexCharacterReference(const char *BufferPtr,
169*67e74705SXin Li                                       const char *BufferEnd) {
170*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
171*67e74705SXin Li     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
172*67e74705SXin Li       return BufferPtr;
173*67e74705SXin Li   }
174*67e74705SXin Li   return BufferEnd;
175*67e74705SXin Li }
176*67e74705SXin Li 
isHTMLIdentifierStartingCharacter(char C)177*67e74705SXin Li bool isHTMLIdentifierStartingCharacter(char C) {
178*67e74705SXin Li   return isLetter(C);
179*67e74705SXin Li }
180*67e74705SXin Li 
isHTMLIdentifierCharacter(char C)181*67e74705SXin Li bool isHTMLIdentifierCharacter(char C) {
182*67e74705SXin Li   return isAlphanumeric(C);
183*67e74705SXin Li }
184*67e74705SXin Li 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)185*67e74705SXin Li const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
186*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
187*67e74705SXin Li     if (!isHTMLIdentifierCharacter(*BufferPtr))
188*67e74705SXin Li       return BufferPtr;
189*67e74705SXin Li   }
190*67e74705SXin Li   return BufferEnd;
191*67e74705SXin Li }
192*67e74705SXin Li 
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// immediately preceded by a backslash does not terminate the string.
///
/// \param BufferPtr points at the opening quote.
/// \returns pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool PrecededByBackslash = Cur[-1] == '\\';
    if (*Cur == Quote && !PrecededByBackslash)
      break;
    ++Cur;
  }
  return Cur;
}
210*67e74705SXin Li 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)211*67e74705SXin Li const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
212*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213*67e74705SXin Li     if (!isWhitespace(*BufferPtr))
214*67e74705SXin Li       return BufferPtr;
215*67e74705SXin Li   }
216*67e74705SXin Li   return BufferEnd;
217*67e74705SXin Li }
218*67e74705SXin Li 
isWhitespace(const char * BufferPtr,const char * BufferEnd)219*67e74705SXin Li bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
220*67e74705SXin Li   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
221*67e74705SXin Li }
222*67e74705SXin Li 
isCommandNameStartCharacter(char C)223*67e74705SXin Li bool isCommandNameStartCharacter(char C) {
224*67e74705SXin Li   return isLetter(C);
225*67e74705SXin Li }
226*67e74705SXin Li 
isCommandNameCharacter(char C)227*67e74705SXin Li bool isCommandNameCharacter(char C) {
228*67e74705SXin Li   return isAlphanumeric(C);
229*67e74705SXin Li }
230*67e74705SXin Li 
skipCommandName(const char * BufferPtr,const char * BufferEnd)231*67e74705SXin Li const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
232*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
233*67e74705SXin Li     if (!isCommandNameCharacter(*BufferPtr))
234*67e74705SXin Li       return BufferPtr;
235*67e74705SXin Li   }
236*67e74705SXin Li   return BufferEnd;
237*67e74705SXin Li }
238*67e74705SXin Li 
239*67e74705SXin Li /// Return the one past end pointer for BCPL comments.
240*67e74705SXin Li /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)241*67e74705SXin Li const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
242*67e74705SXin Li   const char *CurPtr = BufferPtr;
243*67e74705SXin Li   while (CurPtr != BufferEnd) {
244*67e74705SXin Li     while (!isVerticalWhitespace(*CurPtr)) {
245*67e74705SXin Li       CurPtr++;
246*67e74705SXin Li       if (CurPtr == BufferEnd)
247*67e74705SXin Li         return BufferEnd;
248*67e74705SXin Li     }
249*67e74705SXin Li     // We found a newline, check if it is escaped.
250*67e74705SXin Li     const char *EscapePtr = CurPtr - 1;
251*67e74705SXin Li     while(isHorizontalWhitespace(*EscapePtr))
252*67e74705SXin Li       EscapePtr--;
253*67e74705SXin Li 
254*67e74705SXin Li     if (*EscapePtr == '\\' ||
255*67e74705SXin Li         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
256*67e74705SXin Li          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
257*67e74705SXin Li       // We found an escaped newline.
258*67e74705SXin Li       CurPtr = skipNewline(CurPtr, BufferEnd);
259*67e74705SXin Li     } else
260*67e74705SXin Li       return CurPtr; // Not an escaped newline.
261*67e74705SXin Li   }
262*67e74705SXin Li   return BufferEnd;
263*67e74705SXin Li }
264*67e74705SXin Li 
265*67e74705SXin Li /// Return the one past end pointer for C comments.
266*67e74705SXin Li /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)267*67e74705SXin Li const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
268*67e74705SXin Li   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
269*67e74705SXin Li     if (*BufferPtr == '*') {
270*67e74705SXin Li       assert(BufferPtr + 1 != BufferEnd);
271*67e74705SXin Li       if (*(BufferPtr + 1) == '/')
272*67e74705SXin Li         return BufferPtr;
273*67e74705SXin Li     }
274*67e74705SXin Li   }
275*67e74705SXin Li   llvm_unreachable("buffer end hit before '*/' was seen");
276*67e74705SXin Li }
277*67e74705SXin Li 
278*67e74705SXin Li } // end anonymous namespace
279*67e74705SXin Li 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)280*67e74705SXin Li void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
281*67e74705SXin Li                                tok::TokenKind Kind) {
282*67e74705SXin Li   const unsigned TokLen = TokEnd - BufferPtr;
283*67e74705SXin Li   Result.setLocation(getSourceLocation(BufferPtr));
284*67e74705SXin Li   Result.setKind(Kind);
285*67e74705SXin Li   Result.setLength(TokLen);
286*67e74705SXin Li #ifndef NDEBUG
287*67e74705SXin Li   Result.TextPtr = "<UNSET>";
288*67e74705SXin Li   Result.IntVal = 7;
289*67e74705SXin Li #endif
290*67e74705SXin Li   BufferPtr = TokEnd;
291*67e74705SXin Li }
292*67e74705SXin Li 
lexCommentText(Token & T)293*67e74705SXin Li void Lexer::lexCommentText(Token &T) {
294*67e74705SXin Li   assert(CommentState == LCS_InsideBCPLComment ||
295*67e74705SXin Li          CommentState == LCS_InsideCComment);
296*67e74705SXin Li 
297*67e74705SXin Li   switch (State) {
298*67e74705SXin Li   case LS_Normal:
299*67e74705SXin Li     break;
300*67e74705SXin Li   case LS_VerbatimBlockFirstLine:
301*67e74705SXin Li     lexVerbatimBlockFirstLine(T);
302*67e74705SXin Li     return;
303*67e74705SXin Li   case LS_VerbatimBlockBody:
304*67e74705SXin Li     lexVerbatimBlockBody(T);
305*67e74705SXin Li     return;
306*67e74705SXin Li   case LS_VerbatimLineText:
307*67e74705SXin Li     lexVerbatimLineText(T);
308*67e74705SXin Li     return;
309*67e74705SXin Li   case LS_HTMLStartTag:
310*67e74705SXin Li     lexHTMLStartTag(T);
311*67e74705SXin Li     return;
312*67e74705SXin Li   case LS_HTMLEndTag:
313*67e74705SXin Li     lexHTMLEndTag(T);
314*67e74705SXin Li     return;
315*67e74705SXin Li   }
316*67e74705SXin Li 
317*67e74705SXin Li   assert(State == LS_Normal);
318*67e74705SXin Li 
319*67e74705SXin Li   const char *TokenPtr = BufferPtr;
320*67e74705SXin Li   assert(TokenPtr < CommentEnd);
321*67e74705SXin Li   while (TokenPtr != CommentEnd) {
322*67e74705SXin Li     switch(*TokenPtr) {
323*67e74705SXin Li       case '\\':
324*67e74705SXin Li       case '@': {
325*67e74705SXin Li         // Commands that start with a backslash and commands that start with
326*67e74705SXin Li         // 'at' have equivalent semantics.  But we keep information about the
327*67e74705SXin Li         // exact syntax in AST for comments.
328*67e74705SXin Li         tok::TokenKind CommandKind =
329*67e74705SXin Li             (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
330*67e74705SXin Li         TokenPtr++;
331*67e74705SXin Li         if (TokenPtr == CommentEnd) {
332*67e74705SXin Li           formTextToken(T, TokenPtr);
333*67e74705SXin Li           return;
334*67e74705SXin Li         }
335*67e74705SXin Li         char C = *TokenPtr;
336*67e74705SXin Li         switch (C) {
337*67e74705SXin Li         default:
338*67e74705SXin Li           break;
339*67e74705SXin Li 
340*67e74705SXin Li         case '\\': case '@': case '&': case '$':
341*67e74705SXin Li         case '#':  case '<': case '>': case '%':
342*67e74705SXin Li         case '\"': case '.': case ':':
343*67e74705SXin Li           // This is one of \\ \@ \& \$ etc escape sequences.
344*67e74705SXin Li           TokenPtr++;
345*67e74705SXin Li           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
346*67e74705SXin Li             // This is the \:: escape sequence.
347*67e74705SXin Li             TokenPtr++;
348*67e74705SXin Li           }
349*67e74705SXin Li           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
350*67e74705SXin Li           formTokenWithChars(T, TokenPtr, tok::text);
351*67e74705SXin Li           T.setText(UnescapedText);
352*67e74705SXin Li           return;
353*67e74705SXin Li         }
354*67e74705SXin Li 
355*67e74705SXin Li         // Don't make zero-length commands.
356*67e74705SXin Li         if (!isCommandNameStartCharacter(*TokenPtr)) {
357*67e74705SXin Li           formTextToken(T, TokenPtr);
358*67e74705SXin Li           return;
359*67e74705SXin Li         }
360*67e74705SXin Li 
361*67e74705SXin Li         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
362*67e74705SXin Li         unsigned Length = TokenPtr - (BufferPtr + 1);
363*67e74705SXin Li 
364*67e74705SXin Li         // Hardcoded support for lexing LaTeX formula commands
365*67e74705SXin Li         // \f$ \f[ \f] \f{ \f} as a single command.
366*67e74705SXin Li         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
367*67e74705SXin Li           C = *TokenPtr;
368*67e74705SXin Li           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
369*67e74705SXin Li             TokenPtr++;
370*67e74705SXin Li             Length++;
371*67e74705SXin Li           }
372*67e74705SXin Li         }
373*67e74705SXin Li 
374*67e74705SXin Li         StringRef CommandName(BufferPtr + 1, Length);
375*67e74705SXin Li 
376*67e74705SXin Li         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
377*67e74705SXin Li         if (!Info) {
378*67e74705SXin Li           if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
379*67e74705SXin Li             StringRef CorrectedName = Info->Name;
380*67e74705SXin Li             SourceLocation Loc = getSourceLocation(BufferPtr);
381*67e74705SXin Li             SourceRange CommandRange(Loc.getLocWithOffset(1),
382*67e74705SXin Li                                      getSourceLocation(TokenPtr));
383*67e74705SXin Li             Diag(Loc, diag::warn_correct_comment_command_name)
384*67e74705SXin Li               << CommandName << CorrectedName
385*67e74705SXin Li               << FixItHint::CreateReplacement(CommandRange, CorrectedName);
386*67e74705SXin Li           } else {
387*67e74705SXin Li             formTokenWithChars(T, TokenPtr, tok::unknown_command);
388*67e74705SXin Li             T.setUnknownCommandName(CommandName);
389*67e74705SXin Li             Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
390*67e74705SXin Li             return;
391*67e74705SXin Li           }
392*67e74705SXin Li         }
393*67e74705SXin Li         if (Info->IsVerbatimBlockCommand) {
394*67e74705SXin Li           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
395*67e74705SXin Li           return;
396*67e74705SXin Li         }
397*67e74705SXin Li         if (Info->IsVerbatimLineCommand) {
398*67e74705SXin Li           setupAndLexVerbatimLine(T, TokenPtr, Info);
399*67e74705SXin Li           return;
400*67e74705SXin Li         }
401*67e74705SXin Li         formTokenWithChars(T, TokenPtr, CommandKind);
402*67e74705SXin Li         T.setCommandID(Info->getID());
403*67e74705SXin Li         return;
404*67e74705SXin Li       }
405*67e74705SXin Li 
406*67e74705SXin Li       case '&':
407*67e74705SXin Li         lexHTMLCharacterReference(T);
408*67e74705SXin Li         return;
409*67e74705SXin Li 
410*67e74705SXin Li       case '<': {
411*67e74705SXin Li         TokenPtr++;
412*67e74705SXin Li         if (TokenPtr == CommentEnd) {
413*67e74705SXin Li           formTextToken(T, TokenPtr);
414*67e74705SXin Li           return;
415*67e74705SXin Li         }
416*67e74705SXin Li         const char C = *TokenPtr;
417*67e74705SXin Li         if (isHTMLIdentifierStartingCharacter(C))
418*67e74705SXin Li           setupAndLexHTMLStartTag(T);
419*67e74705SXin Li         else if (C == '/')
420*67e74705SXin Li           setupAndLexHTMLEndTag(T);
421*67e74705SXin Li         else
422*67e74705SXin Li           formTextToken(T, TokenPtr);
423*67e74705SXin Li         return;
424*67e74705SXin Li       }
425*67e74705SXin Li 
426*67e74705SXin Li       case '\n':
427*67e74705SXin Li       case '\r':
428*67e74705SXin Li         TokenPtr = skipNewline(TokenPtr, CommentEnd);
429*67e74705SXin Li         formTokenWithChars(T, TokenPtr, tok::newline);
430*67e74705SXin Li 
431*67e74705SXin Li         if (CommentState == LCS_InsideCComment)
432*67e74705SXin Li           skipLineStartingDecorations();
433*67e74705SXin Li         return;
434*67e74705SXin Li 
435*67e74705SXin Li       default: {
436*67e74705SXin Li         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
437*67e74705SXin Li                          find_first_of("\n\r\\@&<");
438*67e74705SXin Li         if (End != StringRef::npos)
439*67e74705SXin Li           TokenPtr += End;
440*67e74705SXin Li         else
441*67e74705SXin Li           TokenPtr = CommentEnd;
442*67e74705SXin Li         formTextToken(T, TokenPtr);
443*67e74705SXin Li         return;
444*67e74705SXin Li       }
445*67e74705SXin Li     }
446*67e74705SXin Li   }
447*67e74705SXin Li }
448*67e74705SXin Li 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)449*67e74705SXin Li void Lexer::setupAndLexVerbatimBlock(Token &T,
450*67e74705SXin Li                                      const char *TextBegin,
451*67e74705SXin Li                                      char Marker, const CommandInfo *Info) {
452*67e74705SXin Li   assert(Info->IsVerbatimBlockCommand);
453*67e74705SXin Li 
454*67e74705SXin Li   VerbatimBlockEndCommandName.clear();
455*67e74705SXin Li   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
456*67e74705SXin Li   VerbatimBlockEndCommandName.append(Info->EndCommandName);
457*67e74705SXin Li 
458*67e74705SXin Li   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
459*67e74705SXin Li   T.setVerbatimBlockID(Info->getID());
460*67e74705SXin Li 
461*67e74705SXin Li   // If there is a newline following the verbatim opening command, skip the
462*67e74705SXin Li   // newline so that we don't create an tok::verbatim_block_line with empty
463*67e74705SXin Li   // text content.
464*67e74705SXin Li   if (BufferPtr != CommentEnd &&
465*67e74705SXin Li       isVerticalWhitespace(*BufferPtr)) {
466*67e74705SXin Li     BufferPtr = skipNewline(BufferPtr, CommentEnd);
467*67e74705SXin Li     State = LS_VerbatimBlockBody;
468*67e74705SXin Li     return;
469*67e74705SXin Li   }
470*67e74705SXin Li 
471*67e74705SXin Li   State = LS_VerbatimBlockFirstLine;
472*67e74705SXin Li }
473*67e74705SXin Li 
lexVerbatimBlockFirstLine(Token & T)474*67e74705SXin Li void Lexer::lexVerbatimBlockFirstLine(Token &T) {
475*67e74705SXin Li again:
476*67e74705SXin Li   assert(BufferPtr < CommentEnd);
477*67e74705SXin Li 
478*67e74705SXin Li   // FIXME: It would be better to scan the text once, finding either the block
479*67e74705SXin Li   // end command or newline.
480*67e74705SXin Li   //
481*67e74705SXin Li   // Extract current line.
482*67e74705SXin Li   const char *Newline = findNewline(BufferPtr, CommentEnd);
483*67e74705SXin Li   StringRef Line(BufferPtr, Newline - BufferPtr);
484*67e74705SXin Li 
485*67e74705SXin Li   // Look for end command in current line.
486*67e74705SXin Li   size_t Pos = Line.find(VerbatimBlockEndCommandName);
487*67e74705SXin Li   const char *TextEnd;
488*67e74705SXin Li   const char *NextLine;
489*67e74705SXin Li   if (Pos == StringRef::npos) {
490*67e74705SXin Li     // Current line is completely verbatim.
491*67e74705SXin Li     TextEnd = Newline;
492*67e74705SXin Li     NextLine = skipNewline(Newline, CommentEnd);
493*67e74705SXin Li   } else if (Pos == 0) {
494*67e74705SXin Li     // Current line contains just an end command.
495*67e74705SXin Li     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
496*67e74705SXin Li     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
497*67e74705SXin Li     formTokenWithChars(T, End, tok::verbatim_block_end);
498*67e74705SXin Li     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
499*67e74705SXin Li     State = LS_Normal;
500*67e74705SXin Li     return;
501*67e74705SXin Li   } else {
502*67e74705SXin Li     // There is some text, followed by end command.  Extract text first.
503*67e74705SXin Li     TextEnd = BufferPtr + Pos;
504*67e74705SXin Li     NextLine = TextEnd;
505*67e74705SXin Li     // If there is only whitespace before end command, skip whitespace.
506*67e74705SXin Li     if (isWhitespace(BufferPtr, TextEnd)) {
507*67e74705SXin Li       BufferPtr = TextEnd;
508*67e74705SXin Li       goto again;
509*67e74705SXin Li     }
510*67e74705SXin Li   }
511*67e74705SXin Li 
512*67e74705SXin Li   StringRef Text(BufferPtr, TextEnd - BufferPtr);
513*67e74705SXin Li   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
514*67e74705SXin Li   T.setVerbatimBlockText(Text);
515*67e74705SXin Li 
516*67e74705SXin Li   State = LS_VerbatimBlockBody;
517*67e74705SXin Li }
518*67e74705SXin Li 
lexVerbatimBlockBody(Token & T)519*67e74705SXin Li void Lexer::lexVerbatimBlockBody(Token &T) {
520*67e74705SXin Li   assert(State == LS_VerbatimBlockBody);
521*67e74705SXin Li 
522*67e74705SXin Li   if (CommentState == LCS_InsideCComment)
523*67e74705SXin Li     skipLineStartingDecorations();
524*67e74705SXin Li 
525*67e74705SXin Li   if (BufferPtr == CommentEnd) {
526*67e74705SXin Li     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
527*67e74705SXin Li     T.setVerbatimBlockText("");
528*67e74705SXin Li     return;
529*67e74705SXin Li   }
530*67e74705SXin Li 
531*67e74705SXin Li   lexVerbatimBlockFirstLine(T);
532*67e74705SXin Li }
533*67e74705SXin Li 
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)534*67e74705SXin Li void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
535*67e74705SXin Li                                     const CommandInfo *Info) {
536*67e74705SXin Li   assert(Info->IsVerbatimLineCommand);
537*67e74705SXin Li   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
538*67e74705SXin Li   T.setVerbatimLineID(Info->getID());
539*67e74705SXin Li 
540*67e74705SXin Li   State = LS_VerbatimLineText;
541*67e74705SXin Li }
542*67e74705SXin Li 
lexVerbatimLineText(Token & T)543*67e74705SXin Li void Lexer::lexVerbatimLineText(Token &T) {
544*67e74705SXin Li   assert(State == LS_VerbatimLineText);
545*67e74705SXin Li 
546*67e74705SXin Li   // Extract current line.
547*67e74705SXin Li   const char *Newline = findNewline(BufferPtr, CommentEnd);
548*67e74705SXin Li   StringRef Text(BufferPtr, Newline - BufferPtr);
549*67e74705SXin Li   formTokenWithChars(T, Newline, tok::verbatim_line_text);
550*67e74705SXin Li   T.setVerbatimLineText(Text);
551*67e74705SXin Li 
552*67e74705SXin Li   State = LS_Normal;
553*67e74705SXin Li }
554*67e74705SXin Li 
lexHTMLCharacterReference(Token & T)555*67e74705SXin Li void Lexer::lexHTMLCharacterReference(Token &T) {
556*67e74705SXin Li   const char *TokenPtr = BufferPtr;
557*67e74705SXin Li   assert(*TokenPtr == '&');
558*67e74705SXin Li   TokenPtr++;
559*67e74705SXin Li   if (TokenPtr == CommentEnd) {
560*67e74705SXin Li     formTextToken(T, TokenPtr);
561*67e74705SXin Li     return;
562*67e74705SXin Li   }
563*67e74705SXin Li   const char *NamePtr;
564*67e74705SXin Li   bool isNamed = false;
565*67e74705SXin Li   bool isDecimal = false;
566*67e74705SXin Li   char C = *TokenPtr;
567*67e74705SXin Li   if (isHTMLNamedCharacterReferenceCharacter(C)) {
568*67e74705SXin Li     NamePtr = TokenPtr;
569*67e74705SXin Li     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
570*67e74705SXin Li     isNamed = true;
571*67e74705SXin Li   } else if (C == '#') {
572*67e74705SXin Li     TokenPtr++;
573*67e74705SXin Li     if (TokenPtr == CommentEnd) {
574*67e74705SXin Li       formTextToken(T, TokenPtr);
575*67e74705SXin Li       return;
576*67e74705SXin Li     }
577*67e74705SXin Li     C = *TokenPtr;
578*67e74705SXin Li     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
579*67e74705SXin Li       NamePtr = TokenPtr;
580*67e74705SXin Li       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
581*67e74705SXin Li       isDecimal = true;
582*67e74705SXin Li     } else if (C == 'x' || C == 'X') {
583*67e74705SXin Li       TokenPtr++;
584*67e74705SXin Li       NamePtr = TokenPtr;
585*67e74705SXin Li       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
586*67e74705SXin Li     } else {
587*67e74705SXin Li       formTextToken(T, TokenPtr);
588*67e74705SXin Li       return;
589*67e74705SXin Li     }
590*67e74705SXin Li   } else {
591*67e74705SXin Li     formTextToken(T, TokenPtr);
592*67e74705SXin Li     return;
593*67e74705SXin Li   }
594*67e74705SXin Li   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
595*67e74705SXin Li       *TokenPtr != ';') {
596*67e74705SXin Li     formTextToken(T, TokenPtr);
597*67e74705SXin Li     return;
598*67e74705SXin Li   }
599*67e74705SXin Li   StringRef Name(NamePtr, TokenPtr - NamePtr);
600*67e74705SXin Li   TokenPtr++; // Skip semicolon.
601*67e74705SXin Li   StringRef Resolved;
602*67e74705SXin Li   if (isNamed)
603*67e74705SXin Li     Resolved = resolveHTMLNamedCharacterReference(Name);
604*67e74705SXin Li   else if (isDecimal)
605*67e74705SXin Li     Resolved = resolveHTMLDecimalCharacterReference(Name);
606*67e74705SXin Li   else
607*67e74705SXin Li     Resolved = resolveHTMLHexCharacterReference(Name);
608*67e74705SXin Li 
609*67e74705SXin Li   if (Resolved.empty()) {
610*67e74705SXin Li     formTextToken(T, TokenPtr);
611*67e74705SXin Li     return;
612*67e74705SXin Li   }
613*67e74705SXin Li   formTokenWithChars(T, TokenPtr, tok::text);
614*67e74705SXin Li   T.setText(Resolved);
615*67e74705SXin Li }
616*67e74705SXin Li 
setupAndLexHTMLStartTag(Token & T)617*67e74705SXin Li void Lexer::setupAndLexHTMLStartTag(Token &T) {
618*67e74705SXin Li   assert(BufferPtr[0] == '<' &&
619*67e74705SXin Li          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
620*67e74705SXin Li   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
621*67e74705SXin Li   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
622*67e74705SXin Li   if (!isHTMLTagName(Name)) {
623*67e74705SXin Li     formTextToken(T, TagNameEnd);
624*67e74705SXin Li     return;
625*67e74705SXin Li   }
626*67e74705SXin Li 
627*67e74705SXin Li   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
628*67e74705SXin Li   T.setHTMLTagStartName(Name);
629*67e74705SXin Li 
630*67e74705SXin Li   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
631*67e74705SXin Li 
632*67e74705SXin Li   const char C = *BufferPtr;
633*67e74705SXin Li   if (BufferPtr != CommentEnd &&
634*67e74705SXin Li       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
635*67e74705SXin Li     State = LS_HTMLStartTag;
636*67e74705SXin Li }
637*67e74705SXin Li 
lexHTMLStartTag(Token & T)638*67e74705SXin Li void Lexer::lexHTMLStartTag(Token &T) {
639*67e74705SXin Li   assert(State == LS_HTMLStartTag);
640*67e74705SXin Li 
641*67e74705SXin Li   const char *TokenPtr = BufferPtr;
642*67e74705SXin Li   char C = *TokenPtr;
643*67e74705SXin Li   if (isHTMLIdentifierCharacter(C)) {
644*67e74705SXin Li     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
645*67e74705SXin Li     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
646*67e74705SXin Li     formTokenWithChars(T, TokenPtr, tok::html_ident);
647*67e74705SXin Li     T.setHTMLIdent(Ident);
648*67e74705SXin Li   } else {
649*67e74705SXin Li     switch (C) {
650*67e74705SXin Li     case '=':
651*67e74705SXin Li       TokenPtr++;
652*67e74705SXin Li       formTokenWithChars(T, TokenPtr, tok::html_equals);
653*67e74705SXin Li       break;
654*67e74705SXin Li     case '\"':
655*67e74705SXin Li     case '\'': {
656*67e74705SXin Li       const char *OpenQuote = TokenPtr;
657*67e74705SXin Li       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
658*67e74705SXin Li       const char *ClosingQuote = TokenPtr;
659*67e74705SXin Li       if (TokenPtr != CommentEnd) // Skip closing quote.
660*67e74705SXin Li         TokenPtr++;
661*67e74705SXin Li       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
662*67e74705SXin Li       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
663*67e74705SXin Li                                       ClosingQuote - (OpenQuote + 1)));
664*67e74705SXin Li       break;
665*67e74705SXin Li     }
666*67e74705SXin Li     case '>':
667*67e74705SXin Li       TokenPtr++;
668*67e74705SXin Li       formTokenWithChars(T, TokenPtr, tok::html_greater);
669*67e74705SXin Li       State = LS_Normal;
670*67e74705SXin Li       return;
671*67e74705SXin Li     case '/':
672*67e74705SXin Li       TokenPtr++;
673*67e74705SXin Li       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
674*67e74705SXin Li         TokenPtr++;
675*67e74705SXin Li         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
676*67e74705SXin Li       } else
677*67e74705SXin Li         formTextToken(T, TokenPtr);
678*67e74705SXin Li 
679*67e74705SXin Li       State = LS_Normal;
680*67e74705SXin Li       return;
681*67e74705SXin Li     }
682*67e74705SXin Li   }
683*67e74705SXin Li 
684*67e74705SXin Li   // Now look ahead and return to normal state if we don't see any HTML tokens
685*67e74705SXin Li   // ahead.
686*67e74705SXin Li   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
687*67e74705SXin Li   if (BufferPtr == CommentEnd) {
688*67e74705SXin Li     State = LS_Normal;
689*67e74705SXin Li     return;
690*67e74705SXin Li   }
691*67e74705SXin Li 
692*67e74705SXin Li   C = *BufferPtr;
693*67e74705SXin Li   if (!isHTMLIdentifierStartingCharacter(C) &&
694*67e74705SXin Li       C != '=' && C != '\"' && C != '\'' && C != '>') {
695*67e74705SXin Li     State = LS_Normal;
696*67e74705SXin Li     return;
697*67e74705SXin Li   }
698*67e74705SXin Li }
699*67e74705SXin Li 
setupAndLexHTMLEndTag(Token & T)700*67e74705SXin Li void Lexer::setupAndLexHTMLEndTag(Token &T) {
701*67e74705SXin Li   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
702*67e74705SXin Li 
703*67e74705SXin Li   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
704*67e74705SXin Li   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
705*67e74705SXin Li   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
706*67e74705SXin Li   if (!isHTMLTagName(Name)) {
707*67e74705SXin Li     formTextToken(T, TagNameEnd);
708*67e74705SXin Li     return;
709*67e74705SXin Li   }
710*67e74705SXin Li 
711*67e74705SXin Li   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
712*67e74705SXin Li 
713*67e74705SXin Li   formTokenWithChars(T, End, tok::html_end_tag);
714*67e74705SXin Li   T.setHTMLTagEndName(Name);
715*67e74705SXin Li 
716*67e74705SXin Li   if (BufferPtr != CommentEnd && *BufferPtr == '>')
717*67e74705SXin Li     State = LS_HTMLEndTag;
718*67e74705SXin Li }
719*67e74705SXin Li 
lexHTMLEndTag(Token & T)720*67e74705SXin Li void Lexer::lexHTMLEndTag(Token &T) {
721*67e74705SXin Li   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
722*67e74705SXin Li 
723*67e74705SXin Li   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
724*67e74705SXin Li   State = LS_Normal;
725*67e74705SXin Li }
726*67e74705SXin Li 
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd)727*67e74705SXin Li Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
728*67e74705SXin Li              const CommandTraits &Traits,
729*67e74705SXin Li              SourceLocation FileLoc,
730*67e74705SXin Li              const char *BufferStart, const char *BufferEnd):
731*67e74705SXin Li     Allocator(Allocator), Diags(Diags), Traits(Traits),
732*67e74705SXin Li     BufferStart(BufferStart), BufferEnd(BufferEnd),
733*67e74705SXin Li     FileLoc(FileLoc), BufferPtr(BufferStart),
734*67e74705SXin Li     CommentState(LCS_BeforeComment), State(LS_Normal) {
735*67e74705SXin Li }
736*67e74705SXin Li 
lex(Token & T)737*67e74705SXin Li void Lexer::lex(Token &T) {
738*67e74705SXin Li again:
739*67e74705SXin Li   switch (CommentState) {
740*67e74705SXin Li   case LCS_BeforeComment:
741*67e74705SXin Li     if (BufferPtr == BufferEnd) {
742*67e74705SXin Li       formTokenWithChars(T, BufferPtr, tok::eof);
743*67e74705SXin Li       return;
744*67e74705SXin Li     }
745*67e74705SXin Li 
746*67e74705SXin Li     assert(*BufferPtr == '/');
747*67e74705SXin Li     BufferPtr++; // Skip first slash.
748*67e74705SXin Li     switch(*BufferPtr) {
749*67e74705SXin Li     case '/': { // BCPL comment.
750*67e74705SXin Li       BufferPtr++; // Skip second slash.
751*67e74705SXin Li 
752*67e74705SXin Li       if (BufferPtr != BufferEnd) {
753*67e74705SXin Li         // Skip Doxygen magic marker, if it is present.
754*67e74705SXin Li         // It might be missing because of a typo //< or /*<, or because we
755*67e74705SXin Li         // merged this non-Doxygen comment into a bunch of Doxygen comments
756*67e74705SXin Li         // around it: /** ... */ /* ... */ /** ... */
757*67e74705SXin Li         const char C = *BufferPtr;
758*67e74705SXin Li         if (C == '/' || C == '!')
759*67e74705SXin Li           BufferPtr++;
760*67e74705SXin Li       }
761*67e74705SXin Li 
762*67e74705SXin Li       // Skip less-than symbol that marks trailing comments.
763*67e74705SXin Li       // Skip it even if the comment is not a Doxygen one, because //< and /*<
764*67e74705SXin Li       // are frequent typos.
765*67e74705SXin Li       if (BufferPtr != BufferEnd && *BufferPtr == '<')
766*67e74705SXin Li         BufferPtr++;
767*67e74705SXin Li 
768*67e74705SXin Li       CommentState = LCS_InsideBCPLComment;
769*67e74705SXin Li       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
770*67e74705SXin Li         State = LS_Normal;
771*67e74705SXin Li       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
772*67e74705SXin Li       goto again;
773*67e74705SXin Li     }
774*67e74705SXin Li     case '*': { // C comment.
775*67e74705SXin Li       BufferPtr++; // Skip star.
776*67e74705SXin Li 
777*67e74705SXin Li       // Skip Doxygen magic marker.
778*67e74705SXin Li       const char C = *BufferPtr;
779*67e74705SXin Li       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
780*67e74705SXin Li         BufferPtr++;
781*67e74705SXin Li 
782*67e74705SXin Li       // Skip less-than symbol that marks trailing comments.
783*67e74705SXin Li       if (BufferPtr != BufferEnd && *BufferPtr == '<')
784*67e74705SXin Li         BufferPtr++;
785*67e74705SXin Li 
786*67e74705SXin Li       CommentState = LCS_InsideCComment;
787*67e74705SXin Li       State = LS_Normal;
788*67e74705SXin Li       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
789*67e74705SXin Li       goto again;
790*67e74705SXin Li     }
791*67e74705SXin Li     default:
792*67e74705SXin Li       llvm_unreachable("second character of comment should be '/' or '*'");
793*67e74705SXin Li     }
794*67e74705SXin Li 
795*67e74705SXin Li   case LCS_BetweenComments: {
796*67e74705SXin Li     // Consecutive comments are extracted only if there is only whitespace
797*67e74705SXin Li     // between them.  So we can search for the start of the next comment.
798*67e74705SXin Li     const char *EndWhitespace = BufferPtr;
799*67e74705SXin Li     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
800*67e74705SXin Li       EndWhitespace++;
801*67e74705SXin Li 
802*67e74705SXin Li     // Turn any whitespace between comments (and there is only whitespace
803*67e74705SXin Li     // between them -- guaranteed by comment extraction) into a newline.  We
804*67e74705SXin Li     // have two newlines between C comments in total (first one was synthesized
805*67e74705SXin Li     // after a comment).
806*67e74705SXin Li     formTokenWithChars(T, EndWhitespace, tok::newline);
807*67e74705SXin Li 
808*67e74705SXin Li     CommentState = LCS_BeforeComment;
809*67e74705SXin Li     break;
810*67e74705SXin Li   }
811*67e74705SXin Li 
812*67e74705SXin Li   case LCS_InsideBCPLComment:
813*67e74705SXin Li   case LCS_InsideCComment:
814*67e74705SXin Li     if (BufferPtr != CommentEnd) {
815*67e74705SXin Li       lexCommentText(T);
816*67e74705SXin Li       break;
817*67e74705SXin Li     } else {
818*67e74705SXin Li       // Skip C comment closing sequence.
819*67e74705SXin Li       if (CommentState == LCS_InsideCComment) {
820*67e74705SXin Li         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
821*67e74705SXin Li         BufferPtr += 2;
822*67e74705SXin Li         assert(BufferPtr <= BufferEnd);
823*67e74705SXin Li 
824*67e74705SXin Li         // Synthenize newline just after the C comment, regardless if there is
825*67e74705SXin Li         // actually a newline.
826*67e74705SXin Li         formTokenWithChars(T, BufferPtr, tok::newline);
827*67e74705SXin Li 
828*67e74705SXin Li         CommentState = LCS_BetweenComments;
829*67e74705SXin Li         break;
830*67e74705SXin Li       } else {
831*67e74705SXin Li         // Don't synthesized a newline after BCPL comment.
832*67e74705SXin Li         CommentState = LCS_BetweenComments;
833*67e74705SXin Li         goto again;
834*67e74705SXin Li       }
835*67e74705SXin Li     }
836*67e74705SXin Li   }
837*67e74705SXin Li }
838*67e74705SXin Li 
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const839*67e74705SXin Li StringRef Lexer::getSpelling(const Token &Tok,
840*67e74705SXin Li                              const SourceManager &SourceMgr,
841*67e74705SXin Li                              bool *Invalid) const {
842*67e74705SXin Li   SourceLocation Loc = Tok.getLocation();
843*67e74705SXin Li   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
844*67e74705SXin Li 
845*67e74705SXin Li   bool InvalidTemp = false;
846*67e74705SXin Li   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
847*67e74705SXin Li   if (InvalidTemp) {
848*67e74705SXin Li     *Invalid = true;
849*67e74705SXin Li     return StringRef();
850*67e74705SXin Li   }
851*67e74705SXin Li 
852*67e74705SXin Li   const char *Begin = File.data() + LocInfo.second;
853*67e74705SXin Li   return StringRef(Begin, Tok.getLength());
854*67e74705SXin Li }
855*67e74705SXin Li 
856*67e74705SXin Li } // end namespace comments
857*67e74705SXin Li } // end namespace clang
858