BreakableToken.h source code [clang_source_code/lib/Format/BreakableToken.h]

1	//===--- BreakableToken.h - Format C++ code ---------------------*- C++ -*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// Declares BreakableToken, BreakableStringLiteral, BreakableComment,
11	/// BreakableBlockComment and BreakableLineCommentSection classes, that contain
12	/// token type-specific logic to break long lines in tokens and reflow content
13	/// between tokens.
14	///
15	//===----------------------------------------------------------------------===//
16
17	#ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
18	#define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19
20	#include "Encoding.h"
21	#include "TokenAnnotator.h"
22	#include "WhitespaceManager.h"
23	#include "llvm/ADT/StringSet.h"
24	#include "llvm/Support/Regex.h"
25	#include <utility>
26
27	namespace clang {
28	namespace format {
29
30	/// Checks if \p Token switches formatting, like /* clang-format off */.
31	/// \p Token must be a comment.
32	bool switchesFormatting(const FormatToken &Token);
33
34	struct FormatStyle;
35
36	/// Base class for tokens / ranges of tokens that can allow breaking
37	/// within the tokens - for example, to avoid whitespace beyond the column
38	/// limit, or to reflow text.
39	///
40	/// Generally, a breakable token consists of logical lines, addressed by a line
41	/// index. For example, in a sequence of line comments, each line comment is its
42	/// own logical line; similarly, for a block comment, each line in the block
43	/// comment is on its own logical line.
44	///
45	/// There are two methods to compute the layout of the token:
46	/// - getRangeLength measures the number of columns needed for a range of text
47	/// within a logical line, and
48	/// - getContentStartColumn returns the start column at which we want the
49	/// content of a logical line to start (potentially after introducing a line
50	/// break).
51	///
52	/// The mechanism to adapt the layout of the breakable token is organised
53	/// around the concept of a \c Split, which is a whitespace range that signifies
54	/// a position of the content of a token where a reformatting might be done.
55	///
56	/// Operating with splits is divided into two operations:
57	/// - getSplit, for finding a split starting at a position,
58	/// - insertBreak, for executing the split using a whitespace manager.
59	///
60	/// There is a pair of operations that are used to compress a long whitespace
61	/// range with a single space if that will bring the line length under the
62	/// column limit:
63	/// - getLineLengthAfterCompression, for calculating the size in columns of the
64	/// line after a whitespace range has been compressed, and
65	/// - compressWhitespace, for executing the whitespace compression using a
66	/// whitespace manager; note that the compressed whitespace may be in the
67	/// middle of the original line and of the reformatted line.
68	///
69	/// For tokens where the whitespace before each line needs to be also
70	/// reformatted, for example for tokens supporting reflow, there are analogous
71	/// operations that might be executed before the main line breaking occurs:
72	/// - getReflowSplit, for finding a split such that the content preceding it
73	/// needs to be specially reflown,
74	/// - reflow, for executing the split using a whitespace manager,
75	/// - introducesBreakBefore, for checking if reformatting the beginning
76	/// of the content introduces a line break before it,
77	/// - adaptStartOfLine, for executing the reflow using a whitespace
78	/// manager.
79	///
80	/// For tokens that require the whitespace after the last line to be
81	/// reformatted, for example in multiline jsdoc comments that require the
82	/// trailing '*/' to be on a line of itself, there are analogous operations
83	/// that might be executed after the last line has been reformatted:
84	/// - getSplitAfterLastLine, for finding a split after the last line that needs
85	/// to be reflown,
86	/// - replaceWhitespaceAfterLastLine, for executing the reflow using a
87	/// whitespace manager.
88	///
89	class BreakableToken {
90	public:
91	/// Contains starting character index and length of split.
92	typedef std::pair<StringRef::size_type, unsigned> Split;
93
94	virtual ~BreakableToken() {}
95
96	/// Returns the number of lines in this token in the original code.
97	virtual unsigned getLineCount() const = 0;
98
99	/// Returns the number of columns required to format the text in the
100	/// byte range [\p Offset, \p Offset \c + \p Length).
101	///
102	/// \p Offset is the byte offset from the start of the content of the line
103	/// at \p LineIndex.
104	///
105	/// \p StartColumn is the column at which the text starts in the formatted
106	/// file, needed to compute tab stops correctly.
107	virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
108	StringRef::size_type Length,
109	unsigned StartColumn) const = 0;
110
111	/// Returns the number of columns required to format the text following
112	/// the byte \p Offset in the line \p LineIndex, including potentially
113	/// unbreakable sequences of tokens following after the end of the token.
114	///
115	/// \p Offset is the byte offset from the start of the content of the line
116	/// at \p LineIndex.
117	///
118	/// \p StartColumn is the column at which the text starts in the formatted
119	/// file, needed to compute tab stops correctly.
120	///
121	/// For breakable tokens that never use extra space at the end of a line, this
122	/// is equivalent to getRangeLength with a Length of StringRef::npos.
123	virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
124	unsigned StartColumn) const {
125	return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
126	}
127
128	/// Returns the column at which content in line \p LineIndex starts,
129	/// assuming no reflow.
130	///
131	/// If \p Break is true, returns the column at which the line should start
132	/// after the line break.
133	/// If \p Break is false, returns the column at which the line itself will
134	/// start.
135	virtual unsigned getContentStartColumn(unsigned LineIndex,
136	bool Break) const = 0;
137
138	/// Returns additional content indent required for the second line after the
139	/// content at line \p LineIndex is broken.
140	///
141	// (Next lines do not start with `///` since otherwise -Wdocumentation picks
142	// up the example annotations and generates warnings for them)
143	// For example, Javadoc @param annotations require and indent of 4 spaces and
144	// in this example getContentIndex(1) returns 4.
145	// /**
146	// * @param loooooooooooooong line
147	// * continuation
148	// */
149	virtual unsigned getContentIndent(unsigned LineIndex) const { return 0; }
150
151	/// Returns a range (offset, length) at which to break the line at
152	/// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
153	/// violate \p ColumnLimit, assuming the text starting at \p TailOffset in
154	/// the token is formatted starting at ContentStartColumn in the reformatted
155	/// file.
156	virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
157	unsigned ColumnLimit, unsigned ContentStartColumn,
158	llvm::Regex &CommentPragmasRegex) const = 0;
159
160	/// Emits the previously retrieved \p Split via \p Whitespaces.
161	virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
162	unsigned ContentIndent,
163	WhitespaceManager &Whitespaces) const = 0;
164
165	/// Returns the number of columns needed to format
166	/// \p RemainingTokenColumns, assuming that Split is within the range measured
167	/// by \p RemainingTokenColumns, and that the whitespace in Split is reduced
168	/// to a single space.
169	unsigned getLengthAfterCompression(unsigned RemainingTokenColumns,
170	Split Split) const;
171
172	/// Replaces the whitespace range described by \p Split with a single
173	/// space.
174	virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
175	Split Split,
176	WhitespaceManager &Whitespaces) const = 0;
177
178	/// Returns whether the token supports reflowing text.
179	virtual bool supportsReflow() const { return false; }
180
181	/// Returns a whitespace range (offset, length) of the content at \p
182	/// LineIndex such that the content of that line is reflown to the end of the
183	/// previous one.
184	///
185	/// Returning (StringRef::npos, 0) indicates reflowing is not possible.
186	///
187	/// The range will include any whitespace preceding the specified line's
188	/// content.
189	///
190	/// If the split is not contained within one token, for example when reflowing
191	/// line comments, returns (0, <length>).
192	virtual Split getReflowSplit(unsigned LineIndex,
193	llvm::Regex &CommentPragmasRegex) const {
194	return Split(StringRef::npos, 0);
195	}
196
197	/// Reflows the current line into the end of the previous one.
198	virtual void reflow(unsigned LineIndex,
199	WhitespaceManager &Whitespaces) const {}
200
201	/// Returns whether there will be a line break at the start of the
202	/// token.
203	virtual bool introducesBreakBeforeToken() const { return false; }
204
205	/// Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
206	virtual void adaptStartOfLine(unsigned LineIndex,
207	WhitespaceManager &Whitespaces) const {}
208
209	/// Returns a whitespace range (offset, length) of the content at
210	/// the last line that needs to be reformatted after the last line has been
211	/// reformatted.
212	///
213	/// A result having offset == StringRef::npos means that no reformat is
214	/// necessary.
215	virtual Split getSplitAfterLastLine(unsigned TailOffset) const {
216	return Split(StringRef::npos, 0);
217	}
218
219	/// Replaces the whitespace from \p SplitAfterLastLine on the last line
220	/// after the last line has been formatted by performing a reformatting.
221	void replaceWhitespaceAfterLastLine(unsigned TailOffset,
222	Split SplitAfterLastLine,
223	WhitespaceManager &Whitespaces) const {
224	insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
225	/ContentIndent=/0, Whitespaces);
226	}
227
228	/// Updates the next token of \p State to the next token after this
229	/// one. This can be used when this token manages a set of underlying tokens
230	/// as a unit and is responsible for the formatting of the them.
231	virtual void updateNextToken(LineState &State) const {}
232
233	protected:
234	BreakableToken(const FormatToken &Tok, bool InPPDirective,
235	encoding::Encoding Encoding, const FormatStyle &Style)
236	: Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
237	Style(Style) {}
238
239	const FormatToken &Tok;
240	const bool InPPDirective;
241	const encoding::Encoding Encoding;
242	const FormatStyle &Style;
243	};
244
245	class BreakableStringLiteral : public BreakableToken {
246	public:
247	/// Creates a breakable token for a single line string literal.
248	///
249	/// \p StartColumn specifies the column in which the token will start
250	/// after formatting.
251	BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
252	StringRef Prefix, StringRef Postfix,
253	unsigned UnbreakableTailLength, bool InPPDirective,
254	encoding::Encoding Encoding, const FormatStyle &Style);
255
256	Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
257	unsigned ContentStartColumn,
258	llvm::Regex &CommentPragmasRegex) const override;
259	void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
260	unsigned ContentIndent,
261	WhitespaceManager &Whitespaces) const override;
262	void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
263	WhitespaceManager &Whitespaces) const override {}
264	unsigned getLineCount() const override;
265	unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
266	StringRef::size_type Length,
267	unsigned StartColumn) const override;
268	unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
269	unsigned StartColumn) const override;
270	unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
271
272	protected:
273	// The column in which the token starts.
274	unsigned StartColumn;
275	// The prefix a line needs after a break in the token.
276	StringRef Prefix;
277	// The postfix a line needs before introducing a break.
278	StringRef Postfix;
279	// The token text excluding the prefix and postfix.
280	StringRef Line;
281	// Length of the sequence of tokens after this string literal that cannot
282	// contain line breaks.
283	unsigned UnbreakableTailLength;
284	};
285
286	class BreakableComment : public BreakableToken {
287	protected:
288	/// Creates a breakable token for a comment.
289	///
290	/// \p StartColumn specifies the column in which the comment will start after
291	/// formatting.
292	BreakableComment(const FormatToken &Token, unsigned StartColumn,
293	bool InPPDirective, encoding::Encoding Encoding,
294	const FormatStyle &Style);
295
296	public:
297	bool supportsReflow() const override { return true; }
298	unsigned getLineCount() const override;
299	Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
300	unsigned ContentStartColumn,
301	llvm::Regex &CommentPragmasRegex) const override;
302	void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
303	WhitespaceManager &Whitespaces) const override;
304
305	protected:
306	// Returns the token containing the line at LineIndex.
307	const FormatToken &tokenAt(unsigned LineIndex) const;
308
309	// Checks if the content of line LineIndex may be reflown with the previous
310	// line.
311	virtual bool mayReflow(unsigned LineIndex,
312	llvm::Regex &CommentPragmasRegex) const = 0;
313
314	// Contains the original text of the lines of the block comment.
315	//
316	// In case of a block comments, excludes the leading /* in the first line and
317	// trailing */ in the last line. In case of line comments, excludes the
318	// leading // and spaces.
319	SmallVector<StringRef, 16> Lines;
320
321	// Contains the text of the lines excluding all leading and trailing
322	// whitespace between the lines. Note that the decoration (if present) is also
323	// not considered part of the text.
324	SmallVector<StringRef, 16> Content;
325
326	// Tokens[i] contains a reference to the token containing Lines[i] if the
327	// whitespace range before that token is managed by this block.
328	// Otherwise, Tokens[i] is a null pointer.
329	SmallVector<FormatToken *, 16> Tokens;
330
331	// ContentColumn[i] is the target column at which Content[i] should be.
332	// Note that this excludes a leading "* " or "*" in case of block comments
333	// where all lines have a "*" prefix, or the leading "// " or "//" in case of
334	// line comments.
335	//
336	// In block comments, the first line's target column is always positive. The
337	// remaining lines' target columns are relative to the first line to allow
338	// correct indentation of comments in \c WhitespaceManager. Thus they can be
339	// negative as well (in case the first line needs to be unindented more than
340	// there's actual whitespace in another line).
341	SmallVector<int, 16> ContentColumn;
342
343	// The intended start column of the first line of text from this section.
344	unsigned StartColumn;
345
346	// The prefix to use in front a line that has been reflown up.
347	// For example, when reflowing the second line after the first here:
348	// // comment 1
349	// // comment 2
350	// we expect:
351	// // comment 1 comment 2
352	// and not:
353	// // comment 1comment 2
354	StringRef ReflowPrefix = " ";
355	};
356
357	class BreakableBlockComment : public BreakableComment {
358	public:
359	BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
360	unsigned OriginalStartColumn, bool FirstInLine,
361	bool InPPDirective, encoding::Encoding Encoding,
362	const FormatStyle &Style);
363
364	unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
365	StringRef::size_type Length,
366	unsigned StartColumn) const override;
367	unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
368	unsigned StartColumn) const override;
369	unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
370	unsigned getContentIndent(unsigned LineIndex) const override;
371	void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
372	unsigned ContentIndent,
373	WhitespaceManager &Whitespaces) const override;
374	Split getReflowSplit(unsigned LineIndex,
375	llvm::Regex &CommentPragmasRegex) const override;
376	void reflow(unsigned LineIndex,
377	WhitespaceManager &Whitespaces) const override;
378	bool introducesBreakBeforeToken() const override;
379	void adaptStartOfLine(unsigned LineIndex,
380	WhitespaceManager &Whitespaces) const override;
381	Split getSplitAfterLastLine(unsigned TailOffset) const override;
382
383	bool mayReflow(unsigned LineIndex,
384	llvm::Regex &CommentPragmasRegex) const override;
385
386	// Contains Javadoc annotations that require additional indent when continued
387	// on multiple lines.
388	static const llvm::StringSet<> ContentIndentingJavadocAnnotations;
389
390	private:
391	// Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
392	//
393	// Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
394	// leading and trailing whitespace.
395	//
396	// Sets ContentColumn to the intended column in which the text at
397	// Lines[LineIndex] starts (note that the decoration, if present, is not
398	// considered part of the text).
399	void adjustWhitespace(unsigned LineIndex, int IndentDelta);
400
401	// The column at which the text of a broken line should start.
402	// Note that an optional decoration would go before that column.
403	// IndentAtLineBreak is a uniform position for all lines in a block comment,
404	// regardless of their relative position.
405	// FIXME: Revisit the decision to do this; the main reason was to support
406	// patterns like
407	// /************//
408	// * Comment
409	// We could also support such patterns by special casing the first line
410	// instead.
411	unsigned IndentAtLineBreak;
412
413	// This is to distinguish between the case when the last line was empty and
414	// the case when it started with a decoration ("" or " ").
415	bool LastLineNeedsDecoration;
416
417	// Either "* " if all lines begin with a "*", or empty.
418	StringRef Decoration;
419
420	// If this block comment has decorations, this is the column of the start of
421	// the decorations.
422	unsigned DecorationColumn;
423
424	// If true, make sure that the opening '/*' and the closing '/' ends on a
425	// line of itself. Styles like jsdoc require this for multiline comments.
426	bool DelimitersOnNewline;
427
428	// Length of the sequence of tokens after this string literal that cannot
429	// contain line breaks.
430	unsigned UnbreakableTailLength;
431	};
432
433	class BreakableLineCommentSection : public BreakableComment {
434	public:
435	BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
436	unsigned OriginalStartColumn, bool FirstInLine,
437	bool InPPDirective, encoding::Encoding Encoding,
438	const FormatStyle &Style);
439
440	unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
441	StringRef::size_type Length,
442	unsigned StartColumn) const override;
443	unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
444	void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
445	unsigned ContentIndent,
446	WhitespaceManager &Whitespaces) const override;
447	Split getReflowSplit(unsigned LineIndex,
448	llvm::Regex &CommentPragmasRegex) const override;
449	void reflow(unsigned LineIndex,
450	WhitespaceManager &Whitespaces) const override;
451	void adaptStartOfLine(unsigned LineIndex,
452	WhitespaceManager &Whitespaces) const override;
453	void updateNextToken(LineState &State) const override;
454	bool mayReflow(unsigned LineIndex,
455	llvm::Regex &CommentPragmasRegex) const override;
456
457	private:
458	// OriginalPrefix[i] contains the original prefix of line i, including
459	// trailing whitespace before the start of the content. The indentation
460	// preceding the prefix is not included.
461	// For example, if the line is:
462	// // content
463	// then the original prefix is "// ".
464	SmallVector<StringRef, 16> OriginalPrefix;
465
466	// Prefix[i] contains the intended leading "//" with trailing spaces to
467	// account for the indentation of content within the comment at line i after
468	// formatting. It can be different than the original prefix when the original
469	// line starts like this:
470	// //content
471	// Then the original prefix is "//", but the prefix is "// ".
472	SmallVector<StringRef, 16> Prefix;
473
474	SmallVector<unsigned, 16> OriginalContentColumn;
475
476	/// The token to which the last line of this breakable token belongs
477	/// to; nullptr if that token is the initial token.
478	///
479	/// The distinction is because if the token of the last line of this breakable
480	/// token is distinct from the initial token, this breakable token owns the
481	/// whitespace before the token of the last line, and the whitespace manager
482	/// must be able to modify it.
483	FormatToken *LastLineTok = nullptr;
484	};
485	} // namespace format
486	} // namespace clang
487
488	#endif
489

Clang Project