CommentLexer.cpp source code [clang_source_code/lib/AST/CommentLexer.cpp]

1	//===--- CommentLexer.cpp -------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang/AST/CommentLexer.h"
10	#include "clang/AST/CommentCommandTraits.h"
11	#include "clang/AST/CommentDiagnostic.h"
12	#include "clang/Basic/CharInfo.h"
13	#include "llvm/ADT/StringExtras.h"
14	#include "llvm/ADT/StringSwitch.h"
15	#include "llvm/Support/ConvertUTF.h"
16	#include "llvm/Support/ErrorHandling.h"
17
18	namespace clang {
19	namespace comments {
20
21	void Token::dump(const Lexer &L, const SourceManager &SM) const {
22	llvm::errs() << "comments::Token Kind=" << Kind << " ";
23	Loc.print(llvm::errs(), SM);
24	llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25	}
26
27	static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28	return isLetter(C);
29	}
30
31	static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32	return isDigit(C);
33	}
34
35	static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36	return isHexDigit(C);
37	}
38
39	static inline StringRef convertCodePointToUTF8(
40	llvm::BumpPtrAllocator &Allocator,
41	unsigned CodePoint) {
42	char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43	char *ResolvedPtr = Resolved;
44	if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45	return StringRef(Resolved, ResolvedPtr - Resolved);
46	else
47	return StringRef();
48	}
49
50	namespace {
51
52	#include "clang/AST/CommentHTMLTags.inc"
53	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55	} // end anonymous namespace
56
57	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58	// Fast path, first check a few most widely used named character references.
59	return llvm::StringSwitch<StringRef>(Name)
60	.Case("amp", "&")
61	.Case("lt", "<")
62	.Case("gt", ">")
63	.Case("quot", "\"")
64	.Case("apos", "\'")
65	// Slow path.
66	.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67	}
68
69	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70	unsigned CodePoint = 0;
71	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73	CodePoint *= 10;
74	CodePoint += Name[i] - '0';
75	}
76	return convertCodePointToUTF8(Allocator, CodePoint);
77	}
78
79	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80	unsigned CodePoint = 0;
81	for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82	CodePoint *= 16;
83	const char C = Name[i];
84	assert(isHTMLHexCharacterReferenceCharacter(C));
85	CodePoint += llvm::hexDigitValue(C);
86	}
87	return convertCodePointToUTF8(Allocator, CodePoint);
88	}
89
90	void Lexer::skipLineStartingDecorations() {
91	// This function should be called only for C comments
92	assert(CommentState == LCS_InsideCComment);
93
94	if (BufferPtr == CommentEnd)
95	return;
96
97	switch (*BufferPtr) {
98	case ' ':
99	case '\t':
100	case '\f':
101	case '\v': {
102	const char *NewBufferPtr = BufferPtr;
103	NewBufferPtr++;
104	if (NewBufferPtr == CommentEnd)
105	return;
106
107	char C = *NewBufferPtr;
108	while (isHorizontalWhitespace(C)) {
109	NewBufferPtr++;
110	if (NewBufferPtr == CommentEnd)
111	return;
112	C = *NewBufferPtr;
113	}
114	if (C == '*')
115	BufferPtr = NewBufferPtr + 1;
116	break;
117	}
118	case '*':
119	BufferPtr++;
120	break;
121	}
122	}
123
124	namespace {
125	/// Returns pointer to the first newline character in the string.
126	const char findNewline(const char BufferPtr, const char *BufferEnd) {
127	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
128	if (isVerticalWhitespace(*BufferPtr))
129	return BufferPtr;
130	}
131	return BufferEnd;
132	}
133
/// Skips one newline sequence ("\n", "\r" or "\r\n") starting at
/// \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
/// \returns pointer to the first character after the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single newline.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
148
149	const char skipNamedCharacterReference(const char BufferPtr,
150	const char *BufferEnd) {
151	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
152	if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
153	return BufferPtr;
154	}
155	return BufferEnd;
156	}
157
158	const char skipDecimalCharacterReference(const char BufferPtr,
159	const char *BufferEnd) {
160	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
161	if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
162	return BufferPtr;
163	}
164	return BufferEnd;
165	}
166
167	const char skipHexCharacterReference(const char BufferPtr,
168	const char *BufferEnd) {
169	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
170	if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
171	return BufferPtr;
172	}
173	return BufferEnd;
174	}
175
176	bool isHTMLIdentifierStartingCharacter(char C) {
177	return isLetter(C);
178	}
179
180	bool isHTMLIdentifierCharacter(char C) {
181	return isAlphanumeric(C);
182	}
183
184	const char skipHTMLIdentifier(const char BufferPtr, const char *BufferEnd) {
185	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
186	if (!isHTMLIdentifierCharacter(*BufferPtr))
187	return BufferPtr;
188	}
189	return BufferEnd;
190	}
191
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
/// string allowed.
///
/// \p BufferPtr must point at the opening quote character.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated within the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    // A quote directly preceded by a backslash does not close the string.
    // Reading BufferPtr[-1] is safe: we are already past the opening quote.
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
209
210	const char skipWhitespace(const char BufferPtr, const char *BufferEnd) {
211	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
212	if (!isWhitespace(*BufferPtr))
213	return BufferPtr;
214	}
215	return BufferEnd;
216	}
217
218	bool isWhitespace(const char BufferPtr, const char BufferEnd) {
219	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
220	}
221
222	bool isCommandNameStartCharacter(char C) {
223	return isLetter(C);
224	}
225
226	bool isCommandNameCharacter(char C) {
227	return isAlphanumeric(C);
228	}
229
230	const char skipCommandName(const char BufferPtr, const char *BufferEnd) {
231	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
232	if (!isCommandNameCharacter(*BufferPtr))
233	return BufferPtr;
234	}
235	return BufferEnd;
236	}
237
238	/// Return the one past end pointer for BCPL comments.
239	/// Handles newlines escaped with backslash or trigraph for backslahs.
240	const char findBCPLCommentEnd(const char BufferPtr, const char *BufferEnd) {
241	const char *CurPtr = BufferPtr;
242	while (CurPtr != BufferEnd) {
243	while (!isVerticalWhitespace(*CurPtr)) {
244	CurPtr++;
245	if (CurPtr == BufferEnd)
246	return BufferEnd;
247	}
248	// We found a newline, check if it is escaped.
249	const char *EscapePtr = CurPtr - 1;
250	while(isHorizontalWhitespace(*EscapePtr))
251	EscapePtr--;
252
253	if (*EscapePtr == '\\' \|\|
254	(EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
255	EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
256	// We found an escaped newline.
257	CurPtr = skipNewline(CurPtr, BufferEnd);
258	} else
259	return CurPtr; // Not an escaped newline.
260	}
261	return BufferEnd;
262	}
263
264	/// Return the one past end pointer for C comments.
265	/// Very dumb, does not handle escaped newlines or trigraphs.
266	const char findCCommentEnd(const char BufferPtr, const char *BufferEnd) {
267	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
268	if (BufferPtr == '') {
269	assert(BufferPtr + 1 != BufferEnd);
270	if (*(BufferPtr + 1) == '/')
271	return BufferPtr;
272	}
273	}
274	llvm_unreachable("buffer end hit before '*/' was seen");
275	}
276
277	} // end anonymous namespace
278
279	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
280	tok::TokenKind Kind) {
281	const unsigned TokLen = TokEnd - BufferPtr;
282	Result.setLocation(getSourceLocation(BufferPtr));
283	Result.setKind(Kind);
284	Result.setLength(TokLen);
285	#ifndef NDEBUG
286	Result.TextPtr = "<UNSET>";
287	Result.IntVal = 7;
288	#endif
289	BufferPtr = TokEnd;
290	}
291
292	void Lexer::lexCommentText(Token &T) {
293	assert(CommentState == LCS_InsideBCPLComment \|\|
294	CommentState == LCS_InsideCComment);
295
296	// Handles lexing non-command text, i.e. text and newline.
297	auto HandleNonCommandToken = [&]() -> void {
298	assert(State == LS_Normal);
299
300	const char *TokenPtr = BufferPtr;
301	assert(TokenPtr < CommentEnd);
302	switch (*TokenPtr) {
303	case '\n':
304	case '\r':
305	TokenPtr = skipNewline(TokenPtr, CommentEnd);
306	formTokenWithChars(T, TokenPtr, tok::newline);
307
308	if (CommentState == LCS_InsideCComment)
309	skipLineStartingDecorations();
310	return;
311
312	default: {
313	StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
314	size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
315	.find_first_of(TokStartSymbols);
316	if (End != StringRef::npos)
317	TokenPtr += End;
318	else
319	TokenPtr = CommentEnd;
320	formTextToken(T, TokenPtr);
321	return;
322	}
323	}
324	};
325
326	if (!ParseCommands)
327	return HandleNonCommandToken();
328
329	switch (State) {
330	case LS_Normal:
331	break;
332	case LS_VerbatimBlockFirstLine:
333	lexVerbatimBlockFirstLine(T);
334	return;
335	case LS_VerbatimBlockBody:
336	lexVerbatimBlockBody(T);
337	return;
338	case LS_VerbatimLineText:
339	lexVerbatimLineText(T);
340	return;
341	case LS_HTMLStartTag:
342	lexHTMLStartTag(T);
343	return;
344	case LS_HTMLEndTag:
345	lexHTMLEndTag(T);
346	return;
347	}
348
349	assert(State == LS_Normal);
350	const char *TokenPtr = BufferPtr;
351	assert(TokenPtr < CommentEnd);
352	switch(*TokenPtr) {
353	case '\\':
354	case '@': {
355	// Commands that start with a backslash and commands that start with
356	// 'at' have equivalent semantics. But we keep information about the
357	// exact syntax in AST for comments.
358	tok::TokenKind CommandKind =
359	(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
360	TokenPtr++;
361	if (TokenPtr == CommentEnd) {
362	formTextToken(T, TokenPtr);
363	return;
364	}
365	char C = *TokenPtr;
366	switch (C) {
367	default:
368	break;
369
370	case '\\': case '@': case '&': case '$':
371	case '#': case '<': case '>': case '%':
372	case '\"': case '.': case ':':
373	// This is one of \\ \@ \& \$ etc escape sequences.
374	TokenPtr++;
375	if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
376	// This is the \:: escape sequence.
377	TokenPtr++;
378	}
379	StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
380	formTokenWithChars(T, TokenPtr, tok::text);
381	T.setText(UnescapedText);
382	return;
383	}
384
385	// Don't make zero-length commands.
386	if (!isCommandNameStartCharacter(*TokenPtr)) {
387	formTextToken(T, TokenPtr);
388	return;
389	}
390
391	TokenPtr = skipCommandName(TokenPtr, CommentEnd);
392	unsigned Length = TokenPtr - (BufferPtr + 1);
393
394	// Hardcoded support for lexing LaTeX formula commands
395	// \f$ \f[ \f] \f{ \f} as a single command.
396	if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
397	C = *TokenPtr;
398	if (C == '$' \|\| C == '[' \|\| C == ']' \|\| C == '{' \|\| C == '}') {
399	TokenPtr++;
400	Length++;
401	}
402	}
403
404	StringRef CommandName(BufferPtr + 1, Length);
405
406	const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
407	if (!Info) {
408	if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
409	StringRef CorrectedName = Info->Name;
410	SourceLocation Loc = getSourceLocation(BufferPtr);
411	SourceLocation EndLoc = getSourceLocation(TokenPtr);
412	SourceRange FullRange = SourceRange(Loc, EndLoc);
413	SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
414	Diag(Loc, diag::warn_correct_comment_command_name)
415	<< FullRange << CommandName << CorrectedName
416	<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
417	} else {
418	formTokenWithChars(T, TokenPtr, tok::unknown_command);
419	T.setUnknownCommandName(CommandName);
420	Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
421	<< SourceRange(T.getLocation(), T.getEndLocation());
422	return;
423	}
424	}
425	if (Info->IsVerbatimBlockCommand) {
426	setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
427	return;
428	}
429	if (Info->IsVerbatimLineCommand) {
430	setupAndLexVerbatimLine(T, TokenPtr, Info);
431	return;
432	}
433	formTokenWithChars(T, TokenPtr, CommandKind);
434	T.setCommandID(Info->getID());
435	return;
436	}
437
438	case '&':
439	lexHTMLCharacterReference(T);
440	return;
441
442	case '<': {
443	TokenPtr++;
444	if (TokenPtr == CommentEnd) {
445	formTextToken(T, TokenPtr);
446	return;
447	}
448	const char C = *TokenPtr;
449	if (isHTMLIdentifierStartingCharacter(C))
450	setupAndLexHTMLStartTag(T);
451	else if (C == '/')
452	setupAndLexHTMLEndTag(T);
453	else
454	formTextToken(T, TokenPtr);
455	return;
456	}
457
458	default:
459	return HandleNonCommandToken();
460	}
461	}
462
463	void Lexer::setupAndLexVerbatimBlock(Token &T,
464	const char *TextBegin,
465	char Marker, const CommandInfo *Info) {
466	IsVerbatimBlockCommand", "/home/seafit/code_projects/clang_source/clang/lib/AST/CommentLexer.cpp", 466, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(Info->IsVerbatimBlockCommand);
467
468	VerbatimBlockEndCommandName.clear();
469	VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
470	VerbatimBlockEndCommandName.append(Info->EndCommandName);
471
472	formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
473	T.setVerbatimBlockID(Info->getID());
474
475	// If there is a newline following the verbatim opening command, skip the
476	// newline so that we don't create an tok::verbatim_block_line with empty
477	// text content.
478	if (BufferPtr != CommentEnd &&
479	isVerticalWhitespace(*BufferPtr)) {
480	BufferPtr = skipNewline(BufferPtr, CommentEnd);
481	State = LS_VerbatimBlockBody;
482	return;
483	}
484
485	State = LS_VerbatimBlockFirstLine;
486	}
487
488	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
489	again:
490	assert(BufferPtr < CommentEnd);
491
492	// FIXME: It would be better to scan the text once, finding either the block
493	// end command or newline.
494	//
495	// Extract current line.
496	const char *Newline = findNewline(BufferPtr, CommentEnd);
497	StringRef Line(BufferPtr, Newline - BufferPtr);
498
499	// Look for end command in current line.
500	size_t Pos = Line.find(VerbatimBlockEndCommandName);
501	const char *TextEnd;
502	const char *NextLine;
503	if (Pos == StringRef::npos) {
504	// Current line is completely verbatim.
505	TextEnd = Newline;
506	NextLine = skipNewline(Newline, CommentEnd);
507	} else if (Pos == 0) {
508	// Current line contains just an end command.
509	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
510	StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
511	formTokenWithChars(T, End, tok::verbatim_block_end);
512	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
513	State = LS_Normal;
514	return;
515	} else {
516	// There is some text, followed by end command. Extract text first.
517	TextEnd = BufferPtr + Pos;
518	NextLine = TextEnd;
519	// If there is only whitespace before end command, skip whitespace.
520	if (isWhitespace(BufferPtr, TextEnd)) {
521	BufferPtr = TextEnd;
522	goto again;
523	}
524	}
525
526	StringRef Text(BufferPtr, TextEnd - BufferPtr);
527	formTokenWithChars(T, NextLine, tok::verbatim_block_line);
528	T.setVerbatimBlockText(Text);
529
530	State = LS_VerbatimBlockBody;
531	}
532
533	void Lexer::lexVerbatimBlockBody(Token &T) {
534	assert(State == LS_VerbatimBlockBody);
535
536	if (CommentState == LCS_InsideCComment)
537	skipLineStartingDecorations();
538
539	if (BufferPtr == CommentEnd) {
540	formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
541	T.setVerbatimBlockText("");
542	return;
543	}
544
545	lexVerbatimBlockFirstLine(T);
546	}
547
548	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
549	const CommandInfo *Info) {
550	IsVerbatimLineCommand", "/home/seafit/code_projects/clang_source/clang/lib/AST/CommentLexer.cpp", 550, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(Info->IsVerbatimLineCommand);
551	formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
552	T.setVerbatimLineID(Info->getID());
553
554	State = LS_VerbatimLineText;
555	}
556
557	void Lexer::lexVerbatimLineText(Token &T) {
558	assert(State == LS_VerbatimLineText);
559
560	// Extract current line.
561	const char *Newline = findNewline(BufferPtr, CommentEnd);
562	StringRef Text(BufferPtr, Newline - BufferPtr);
563	formTokenWithChars(T, Newline, tok::verbatim_line_text);
564	T.setVerbatimLineText(Text);
565
566	State = LS_Normal;
567	}
568
569	void Lexer::lexHTMLCharacterReference(Token &T) {
570	const char *TokenPtr = BufferPtr;
571	assert(*TokenPtr == '&');
572	TokenPtr++;
573	if (TokenPtr == CommentEnd) {
574	formTextToken(T, TokenPtr);
575	return;
576	}
577	const char *NamePtr;
578	bool isNamed = false;
579	bool isDecimal = false;
580	char C = *TokenPtr;
581	if (isHTMLNamedCharacterReferenceCharacter(C)) {
582	NamePtr = TokenPtr;
583	TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
584	isNamed = true;
585	} else if (C == '#') {
586	TokenPtr++;
587	if (TokenPtr == CommentEnd) {
588	formTextToken(T, TokenPtr);
589	return;
590	}
591	C = *TokenPtr;
592	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
593	NamePtr = TokenPtr;
594	TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
595	isDecimal = true;
596	} else if (C == 'x' \|\| C == 'X') {
597	TokenPtr++;
598	NamePtr = TokenPtr;
599	TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
600	} else {
601	formTextToken(T, TokenPtr);
602	return;
603	}
604	} else {
605	formTextToken(T, TokenPtr);
606	return;
607	}
608	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
609	*TokenPtr != ';') {
610	formTextToken(T, TokenPtr);
611	return;
612	}
613	StringRef Name(NamePtr, TokenPtr - NamePtr);
614	TokenPtr++; // Skip semicolon.
615	StringRef Resolved;
616	if (isNamed)
617	Resolved = resolveHTMLNamedCharacterReference(Name);
618	else if (isDecimal)
619	Resolved = resolveHTMLDecimalCharacterReference(Name);
620	else
621	Resolved = resolveHTMLHexCharacterReference(Name);
622
623	if (Resolved.empty()) {
624	formTextToken(T, TokenPtr);
625	return;
626	}
627	formTokenWithChars(T, TokenPtr, tok::text);
628	T.setText(Resolved);
629	}
630
631	void Lexer::setupAndLexHTMLStartTag(Token &T) {
632	assert(BufferPtr[0] == '<' &&
633	isHTMLIdentifierStartingCharacter(BufferPtr[1]));
634	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
635	StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
636	if (!isHTMLTagName(Name)) {
637	formTextToken(T, TagNameEnd);
638	return;
639	}
640
641	formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
642	T.setHTMLTagStartName(Name);
643
644	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
645
646	const char C = *BufferPtr;
647	if (BufferPtr != CommentEnd &&
648	(C == '>' \|\| C == '/' \|\| isHTMLIdentifierStartingCharacter(C)))
649	State = LS_HTMLStartTag;
650	}
651
652	void Lexer::lexHTMLStartTag(Token &T) {
653	assert(State == LS_HTMLStartTag);
654
655	const char *TokenPtr = BufferPtr;
656	char C = *TokenPtr;
657	if (isHTMLIdentifierCharacter(C)) {
658	TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
659	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
660	formTokenWithChars(T, TokenPtr, tok::html_ident);
661	T.setHTMLIdent(Ident);
662	} else {
663	switch (C) {
664	case '=':
665	TokenPtr++;
666	formTokenWithChars(T, TokenPtr, tok::html_equals);
667	break;
668	case '\"':
669	case '\'': {
670	const char *OpenQuote = TokenPtr;
671	TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
672	const char *ClosingQuote = TokenPtr;
673	if (TokenPtr != CommentEnd) // Skip closing quote.
674	TokenPtr++;
675	formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
676	T.setHTMLQuotedString(StringRef(OpenQuote + 1,
677	ClosingQuote - (OpenQuote + 1)));
678	break;
679	}
680	case '>':
681	TokenPtr++;
682	formTokenWithChars(T, TokenPtr, tok::html_greater);
683	State = LS_Normal;
684	return;
685	case '/':
686	TokenPtr++;
687	if (TokenPtr != CommentEnd && *TokenPtr == '>') {
688	TokenPtr++;
689	formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
690	} else
691	formTextToken(T, TokenPtr);
692
693	State = LS_Normal;
694	return;
695	}
696	}
697
698	// Now look ahead and return to normal state if we don't see any HTML tokens
699	// ahead.
700	BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
701	if (BufferPtr == CommentEnd) {
702	State = LS_Normal;
703	return;
704	}
705
706	C = *BufferPtr;
707	if (!isHTMLIdentifierStartingCharacter(C) &&
708	C != '=' && C != '\"' && C != '\'' && C != '>') {
709	State = LS_Normal;
710	return;
711	}
712	}
713
714	void Lexer::setupAndLexHTMLEndTag(Token &T) {
715	assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
716
717	const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
718	const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
719	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
720	if (!isHTMLTagName(Name)) {
721	formTextToken(T, TagNameEnd);
722	return;
723	}
724
725	const char *End = skipWhitespace(TagNameEnd, CommentEnd);
726
727	formTokenWithChars(T, End, tok::html_end_tag);
728	T.setHTMLTagEndName(Name);
729
730	if (BufferPtr != CommentEnd && *BufferPtr == '>')
731	State = LS_HTMLEndTag;
732	}
733
734	void Lexer::lexHTMLEndTag(Token &T) {
735	'", "/home/seafit/code_projects/clang_source/clang/lib/AST/CommentLexer.cpp", 735, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(BufferPtr != CommentEnd && *BufferPtr == '>');
736
737	formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
738	State = LS_Normal;
739	}
740
741	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
742	const CommandTraits &Traits, SourceLocation FileLoc,
743	const char BufferStart, const char BufferEnd,
744	bool ParseCommands)
745	: Allocator(Allocator), Diags(Diags), Traits(Traits),
746	BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
747	BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
748	ParseCommands(ParseCommands) {}
749
750	void Lexer::lex(Token &T) {
751	again:
752	switch (CommentState) {
753	case LCS_BeforeComment:
754	if (BufferPtr == BufferEnd) {
755	formTokenWithChars(T, BufferPtr, tok::eof);
756	return;
757	}
758
759	assert(*BufferPtr == '/');
760	BufferPtr++; // Skip first slash.
761	switch(*BufferPtr) {
762	case '/': { // BCPL comment.
763	BufferPtr++; // Skip second slash.
764
765	if (BufferPtr != BufferEnd) {
766	// Skip Doxygen magic marker, if it is present.
767	// It might be missing because of a typo //< or /*<, or because we
768	// merged this non-Doxygen comment into a bunch of Doxygen comments
769	// around it: /** ... / / ... / /* ... */
770	const char C = *BufferPtr;
771	if (C == '/' \|\| C == '!')
772	BufferPtr++;
773	}
774
775	// Skip less-than symbol that marks trailing comments.
776	// Skip it even if the comment is not a Doxygen one, because //< and /*<
777	// are frequent typos.
778	if (BufferPtr != BufferEnd && *BufferPtr == '<')
779	BufferPtr++;
780
781	CommentState = LCS_InsideBCPLComment;
782	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
783	State = LS_Normal;
784	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
785	goto again;
786	}
787	case '*': { // C comment.
788	BufferPtr++; // Skip star.
789
790	// Skip Doxygen magic marker.
791	const char C = *BufferPtr;
792	if ((C == '' && (BufferPtr + 1) != '/') \|\| C == '!')
793	BufferPtr++;
794
795	// Skip less-than symbol that marks trailing comments.
796	if (BufferPtr != BufferEnd && *BufferPtr == '<')
797	BufferPtr++;
798
799	CommentState = LCS_InsideCComment;
800	State = LS_Normal;
801	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
802	goto again;
803	}
804	default:
805	llvm_unreachable("second character of comment should be '/' or '*'");
806	}
807
808	case LCS_BetweenComments: {
809	// Consecutive comments are extracted only if there is only whitespace
810	// between them. So we can search for the start of the next comment.
811	const char *EndWhitespace = BufferPtr;
812	while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
813	EndWhitespace++;
814
815	// Turn any whitespace between comments (and there is only whitespace
816	// between them -- guaranteed by comment extraction) into a newline. We
817	// have two newlines between C comments in total (first one was synthesized
818	// after a comment).
819	formTokenWithChars(T, EndWhitespace, tok::newline);
820
821	CommentState = LCS_BeforeComment;
822	break;
823	}
824
825	case LCS_InsideBCPLComment:
826	case LCS_InsideCComment:
827	if (BufferPtr != CommentEnd) {
828	lexCommentText(T);
829	break;
830	} else {
831	// Skip C comment closing sequence.
832	if (CommentState == LCS_InsideCComment) {
833	assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
834	BufferPtr += 2;
835	assert(BufferPtr <= BufferEnd);
836
837	// Synthenize newline just after the C comment, regardless if there is
838	// actually a newline.
839	formTokenWithChars(T, BufferPtr, tok::newline);
840
841	CommentState = LCS_BetweenComments;
842	break;
843	} else {
844	// Don't synthesized a newline after BCPL comment.
845	CommentState = LCS_BetweenComments;
846	goto again;
847	}
848	}
849	}
850	}
851
852	StringRef Lexer::getSpelling(const Token &Tok,
853	const SourceManager &SourceMgr,
854	bool *Invalid) const {
855	SourceLocation Loc = Tok.getLocation();
856	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
857
858	bool InvalidTemp = false;
859	StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
860	if (InvalidTemp) {
861	*Invalid = true;
862	return StringRef();
863	}
864
865	const char *Begin = File.data() + LocInfo.second;
866	return StringRef(Begin, Tok.getLength());
867	}
868
869	} // end namespace comments
870	} // end namespace clang
871