1 | //===--- BreakableToken.h - Format C++ code ---------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// Declares BreakableToken, BreakableStringLiteral, BreakableComment, |
11 | /// BreakableBlockComment and BreakableLineCommentSection classes, that contain |
12 | /// token type-specific logic to break long lines in tokens and reflow content |
13 | /// between tokens. |
14 | /// |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H |
18 | #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H |
19 | |
20 | #include "Encoding.h" |
21 | #include "TokenAnnotator.h" |
22 | #include "WhitespaceManager.h" |
23 | #include "llvm/ADT/StringSet.h" |
24 | #include "llvm/Support/Regex.h" |
25 | #include <utility> |
26 | |
27 | namespace clang { |
28 | namespace format { |
29 | |
30 | /// Checks if \p Token switches formatting, like /* clang-format off */. |
31 | /// \p Token must be a comment. |
32 | bool switchesFormatting(const FormatToken &Token); |
33 | |
34 | struct FormatStyle; |
35 | |
36 | /// Base class for tokens / ranges of tokens that can allow breaking |
37 | /// within the tokens - for example, to avoid whitespace beyond the column |
38 | /// limit, or to reflow text. |
39 | /// |
40 | /// Generally, a breakable token consists of logical lines, addressed by a line |
41 | /// index. For example, in a sequence of line comments, each line comment is its |
42 | /// own logical line; similarly, for a block comment, each line in the block |
43 | /// comment is on its own logical line. |
44 | /// |
45 | /// There are two methods to compute the layout of the token: |
46 | /// - getRangeLength measures the number of columns needed for a range of text |
47 | /// within a logical line, and |
48 | /// - getContentStartColumn returns the start column at which we want the |
49 | /// content of a logical line to start (potentially after introducing a line |
50 | /// break). |
51 | /// |
52 | /// The mechanism to adapt the layout of the breakable token is organised |
53 | /// around the concept of a \c Split, which is a whitespace range that signifies |
54 | /// a position of the content of a token where a reformatting might be done. |
55 | /// |
56 | /// Operating with splits is divided into two operations: |
57 | /// - getSplit, for finding a split starting at a position, |
58 | /// - insertBreak, for executing the split using a whitespace manager. |
59 | /// |
60 | /// There is a pair of operations that are used to compress a long whitespace |
61 | /// range with a single space if that will bring the line length under the |
62 | /// column limit: |
63 | /// - getLineLengthAfterCompression, for calculating the size in columns of the |
64 | /// line after a whitespace range has been compressed, and |
65 | /// - compressWhitespace, for executing the whitespace compression using a |
66 | /// whitespace manager; note that the compressed whitespace may be in the |
67 | /// middle of the original line and of the reformatted line. |
68 | /// |
69 | /// For tokens where the whitespace before each line needs to be also |
70 | /// reformatted, for example for tokens supporting reflow, there are analogous |
71 | /// operations that might be executed before the main line breaking occurs: |
72 | /// - getReflowSplit, for finding a split such that the content preceding it |
73 | /// needs to be specially reflown, |
74 | /// - reflow, for executing the split using a whitespace manager, |
75 | /// - introducesBreakBefore, for checking if reformatting the beginning |
76 | /// of the content introduces a line break before it, |
77 | /// - adaptStartOfLine, for executing the reflow using a whitespace |
78 | /// manager. |
79 | /// |
80 | /// For tokens that require the whitespace after the last line to be |
81 | /// reformatted, for example in multiline jsdoc comments that require the |
82 | /// trailing '*/' to be on a line of itself, there are analogous operations |
83 | /// that might be executed after the last line has been reformatted: |
84 | /// - getSplitAfterLastLine, for finding a split after the last line that needs |
85 | /// to be reflown, |
86 | /// - replaceWhitespaceAfterLastLine, for executing the reflow using a |
87 | /// whitespace manager. |
88 | /// |
89 | class BreakableToken { |
90 | public: |
91 | /// Contains starting character index and length of split. |
92 | typedef std::pair<StringRef::size_type, unsigned> Split; |
93 | |
94 | virtual ~BreakableToken() {} |
95 | |
96 | /// Returns the number of lines in this token in the original code. |
97 | virtual unsigned getLineCount() const = 0; |
98 | |
99 | /// Returns the number of columns required to format the text in the |
100 | /// byte range [\p Offset, \p Offset \c + \p Length). |
101 | /// |
102 | /// \p Offset is the byte offset from the start of the content of the line |
103 | /// at \p LineIndex. |
104 | /// |
105 | /// \p StartColumn is the column at which the text starts in the formatted |
106 | /// file, needed to compute tab stops correctly. |
107 | virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset, |
108 | StringRef::size_type Length, |
109 | unsigned StartColumn) const = 0; |
110 | |
111 | /// Returns the number of columns required to format the text following |
112 | /// the byte \p Offset in the line \p LineIndex, including potentially |
113 | /// unbreakable sequences of tokens following after the end of the token. |
114 | /// |
115 | /// \p Offset is the byte offset from the start of the content of the line |
116 | /// at \p LineIndex. |
117 | /// |
118 | /// \p StartColumn is the column at which the text starts in the formatted |
119 | /// file, needed to compute tab stops correctly. |
120 | /// |
121 | /// For breakable tokens that never use extra space at the end of a line, this |
122 | /// is equivalent to getRangeLength with a Length of StringRef::npos. |
123 | virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset, |
124 | unsigned StartColumn) const { |
125 | return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn); |
126 | } |
127 | |
128 | /// Returns the column at which content in line \p LineIndex starts, |
129 | /// assuming no reflow. |
130 | /// |
131 | /// If \p Break is true, returns the column at which the line should start |
132 | /// after the line break. |
133 | /// If \p Break is false, returns the column at which the line itself will |
134 | /// start. |
135 | virtual unsigned getContentStartColumn(unsigned LineIndex, |
136 | bool Break) const = 0; |
137 | |
138 | /// Returns additional content indent required for the second line after the |
139 | /// content at line \p LineIndex is broken. |
140 | /// |
141 | // (Next lines do not start with `///` since otherwise -Wdocumentation picks |
142 | // up the example annotations and generates warnings for them) |
143 | // For example, Javadoc @param annotations require and indent of 4 spaces and |
144 | // in this example getContentIndex(1) returns 4. |
145 | // /** |
146 | // * @param loooooooooooooong line |
147 | // * continuation |
148 | // */ |
149 | virtual unsigned getContentIndent(unsigned LineIndex) const { return 0; } |
150 | |
151 | /// Returns a range (offset, length) at which to break the line at |
152 | /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not |
153 | /// violate \p ColumnLimit, assuming the text starting at \p TailOffset in |
154 | /// the token is formatted starting at ContentStartColumn in the reformatted |
155 | /// file. |
156 | virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, |
157 | unsigned ColumnLimit, unsigned ContentStartColumn, |
158 | llvm::Regex &CommentPragmasRegex) const = 0; |
159 | |
160 | /// Emits the previously retrieved \p Split via \p Whitespaces. |
161 | virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, |
162 | unsigned ContentIndent, |
163 | WhitespaceManager &Whitespaces) const = 0; |
164 | |
165 | /// Returns the number of columns needed to format |
166 | /// \p RemainingTokenColumns, assuming that Split is within the range measured |
167 | /// by \p RemainingTokenColumns, and that the whitespace in Split is reduced |
168 | /// to a single space. |
169 | unsigned getLengthAfterCompression(unsigned RemainingTokenColumns, |
170 | Split Split) const; |
171 | |
172 | /// Replaces the whitespace range described by \p Split with a single |
173 | /// space. |
174 | virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset, |
175 | Split Split, |
176 | WhitespaceManager &Whitespaces) const = 0; |
177 | |
178 | /// Returns whether the token supports reflowing text. |
179 | virtual bool supportsReflow() const { return false; } |
180 | |
181 | /// Returns a whitespace range (offset, length) of the content at \p |
182 | /// LineIndex such that the content of that line is reflown to the end of the |
183 | /// previous one. |
184 | /// |
185 | /// Returning (StringRef::npos, 0) indicates reflowing is not possible. |
186 | /// |
187 | /// The range will include any whitespace preceding the specified line's |
188 | /// content. |
189 | /// |
190 | /// If the split is not contained within one token, for example when reflowing |
191 | /// line comments, returns (0, <length>). |
192 | virtual Split getReflowSplit(unsigned LineIndex, |
193 | llvm::Regex &CommentPragmasRegex) const { |
194 | return Split(StringRef::npos, 0); |
195 | } |
196 | |
197 | /// Reflows the current line into the end of the previous one. |
198 | virtual void reflow(unsigned LineIndex, |
199 | WhitespaceManager &Whitespaces) const {} |
200 | |
201 | /// Returns whether there will be a line break at the start of the |
202 | /// token. |
203 | virtual bool introducesBreakBeforeToken() const { return false; } |
204 | |
205 | /// Replaces the whitespace between \p LineIndex-1 and \p LineIndex. |
206 | virtual void adaptStartOfLine(unsigned LineIndex, |
207 | WhitespaceManager &Whitespaces) const {} |
208 | |
209 | /// Returns a whitespace range (offset, length) of the content at |
210 | /// the last line that needs to be reformatted after the last line has been |
211 | /// reformatted. |
212 | /// |
213 | /// A result having offset == StringRef::npos means that no reformat is |
214 | /// necessary. |
215 | virtual Split getSplitAfterLastLine(unsigned TailOffset) const { |
216 | return Split(StringRef::npos, 0); |
217 | } |
218 | |
219 | /// Replaces the whitespace from \p SplitAfterLastLine on the last line |
220 | /// after the last line has been formatted by performing a reformatting. |
221 | void replaceWhitespaceAfterLastLine(unsigned TailOffset, |
222 | Split SplitAfterLastLine, |
223 | WhitespaceManager &Whitespaces) const { |
224 | insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine, |
225 | /*ContentIndent=*/0, Whitespaces); |
226 | } |
227 | |
228 | /// Updates the next token of \p State to the next token after this |
229 | /// one. This can be used when this token manages a set of underlying tokens |
230 | /// as a unit and is responsible for the formatting of the them. |
231 | virtual void updateNextToken(LineState &State) const {} |
232 | |
233 | protected: |
234 | BreakableToken(const FormatToken &Tok, bool InPPDirective, |
235 | encoding::Encoding Encoding, const FormatStyle &Style) |
236 | : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding), |
237 | Style(Style) {} |
238 | |
239 | const FormatToken &Tok; |
240 | const bool InPPDirective; |
241 | const encoding::Encoding Encoding; |
242 | const FormatStyle &Style; |
243 | }; |
244 | |
245 | class BreakableStringLiteral : public BreakableToken { |
246 | public: |
247 | /// Creates a breakable token for a single line string literal. |
248 | /// |
249 | /// \p StartColumn specifies the column in which the token will start |
250 | /// after formatting. |
251 | BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn, |
252 | StringRef Prefix, StringRef Postfix, |
253 | unsigned UnbreakableTailLength, bool InPPDirective, |
254 | encoding::Encoding Encoding, const FormatStyle &Style); |
255 | |
256 | Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, |
257 | unsigned ContentStartColumn, |
258 | llvm::Regex &CommentPragmasRegex) const override; |
259 | void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, |
260 | unsigned ContentIndent, |
261 | WhitespaceManager &Whitespaces) const override; |
262 | void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, |
263 | WhitespaceManager &Whitespaces) const override {} |
264 | unsigned getLineCount() const override; |
265 | unsigned getRangeLength(unsigned LineIndex, unsigned Offset, |
266 | StringRef::size_type Length, |
267 | unsigned StartColumn) const override; |
268 | unsigned getRemainingLength(unsigned LineIndex, unsigned Offset, |
269 | unsigned StartColumn) const override; |
270 | unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override; |
271 | |
272 | protected: |
273 | // The column in which the token starts. |
274 | unsigned StartColumn; |
275 | // The prefix a line needs after a break in the token. |
276 | StringRef Prefix; |
277 | // The postfix a line needs before introducing a break. |
278 | StringRef Postfix; |
279 | // The token text excluding the prefix and postfix. |
280 | StringRef Line; |
281 | // Length of the sequence of tokens after this string literal that cannot |
282 | // contain line breaks. |
283 | unsigned UnbreakableTailLength; |
284 | }; |
285 | |
286 | class BreakableComment : public BreakableToken { |
287 | protected: |
288 | /// Creates a breakable token for a comment. |
289 | /// |
290 | /// \p StartColumn specifies the column in which the comment will start after |
291 | /// formatting. |
292 | BreakableComment(const FormatToken &Token, unsigned StartColumn, |
293 | bool InPPDirective, encoding::Encoding Encoding, |
294 | const FormatStyle &Style); |
295 | |
296 | public: |
297 | bool supportsReflow() const override { return true; } |
298 | unsigned getLineCount() const override; |
299 | Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, |
300 | unsigned ContentStartColumn, |
301 | llvm::Regex &CommentPragmasRegex) const override; |
302 | void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, |
303 | WhitespaceManager &Whitespaces) const override; |
304 | |
305 | protected: |
306 | // Returns the token containing the line at LineIndex. |
307 | const FormatToken &tokenAt(unsigned LineIndex) const; |
308 | |
309 | // Checks if the content of line LineIndex may be reflown with the previous |
310 | // line. |
311 | virtual bool mayReflow(unsigned LineIndex, |
312 | llvm::Regex &CommentPragmasRegex) const = 0; |
313 | |
314 | // Contains the original text of the lines of the block comment. |
315 | // |
316 | // In case of a block comments, excludes the leading /* in the first line and |
317 | // trailing */ in the last line. In case of line comments, excludes the |
318 | // leading // and spaces. |
319 | SmallVector<StringRef, 16> Lines; |
320 | |
321 | // Contains the text of the lines excluding all leading and trailing |
322 | // whitespace between the lines. Note that the decoration (if present) is also |
323 | // not considered part of the text. |
324 | SmallVector<StringRef, 16> Content; |
325 | |
326 | // Tokens[i] contains a reference to the token containing Lines[i] if the |
327 | // whitespace range before that token is managed by this block. |
328 | // Otherwise, Tokens[i] is a null pointer. |
329 | SmallVector<FormatToken *, 16> Tokens; |
330 | |
331 | // ContentColumn[i] is the target column at which Content[i] should be. |
332 | // Note that this excludes a leading "* " or "*" in case of block comments |
333 | // where all lines have a "*" prefix, or the leading "// " or "//" in case of |
334 | // line comments. |
335 | // |
336 | // In block comments, the first line's target column is always positive. The |
337 | // remaining lines' target columns are relative to the first line to allow |
338 | // correct indentation of comments in \c WhitespaceManager. Thus they can be |
339 | // negative as well (in case the first line needs to be unindented more than |
340 | // there's actual whitespace in another line). |
341 | SmallVector<int, 16> ContentColumn; |
342 | |
343 | // The intended start column of the first line of text from this section. |
344 | unsigned StartColumn; |
345 | |
346 | // The prefix to use in front a line that has been reflown up. |
347 | // For example, when reflowing the second line after the first here: |
348 | // // comment 1 |
349 | // // comment 2 |
350 | // we expect: |
351 | // // comment 1 comment 2 |
352 | // and not: |
353 | // // comment 1comment 2 |
354 | StringRef ReflowPrefix = " "; |
355 | }; |
356 | |
357 | class BreakableBlockComment : public BreakableComment { |
358 | public: |
359 | BreakableBlockComment(const FormatToken &Token, unsigned StartColumn, |
360 | unsigned OriginalStartColumn, bool FirstInLine, |
361 | bool InPPDirective, encoding::Encoding Encoding, |
362 | const FormatStyle &Style); |
363 | |
364 | unsigned getRangeLength(unsigned LineIndex, unsigned Offset, |
365 | StringRef::size_type Length, |
366 | unsigned StartColumn) const override; |
367 | unsigned getRemainingLength(unsigned LineIndex, unsigned Offset, |
368 | unsigned StartColumn) const override; |
369 | unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override; |
370 | unsigned getContentIndent(unsigned LineIndex) const override; |
371 | void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, |
372 | unsigned ContentIndent, |
373 | WhitespaceManager &Whitespaces) const override; |
374 | Split getReflowSplit(unsigned LineIndex, |
375 | llvm::Regex &CommentPragmasRegex) const override; |
376 | void reflow(unsigned LineIndex, |
377 | WhitespaceManager &Whitespaces) const override; |
378 | bool introducesBreakBeforeToken() const override; |
379 | void adaptStartOfLine(unsigned LineIndex, |
380 | WhitespaceManager &Whitespaces) const override; |
381 | Split getSplitAfterLastLine(unsigned TailOffset) const override; |
382 | |
383 | bool mayReflow(unsigned LineIndex, |
384 | llvm::Regex &CommentPragmasRegex) const override; |
385 | |
386 | // Contains Javadoc annotations that require additional indent when continued |
387 | // on multiple lines. |
388 | static const llvm::StringSet<> ContentIndentingJavadocAnnotations; |
389 | |
390 | private: |
391 | // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex]. |
392 | // |
393 | // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off |
394 | // leading and trailing whitespace. |
395 | // |
396 | // Sets ContentColumn to the intended column in which the text at |
397 | // Lines[LineIndex] starts (note that the decoration, if present, is not |
398 | // considered part of the text). |
399 | void adjustWhitespace(unsigned LineIndex, int IndentDelta); |
400 | |
401 | // The column at which the text of a broken line should start. |
402 | // Note that an optional decoration would go before that column. |
403 | // IndentAtLineBreak is a uniform position for all lines in a block comment, |
404 | // regardless of their relative position. |
405 | // FIXME: Revisit the decision to do this; the main reason was to support |
406 | // patterns like |
407 | // /**************//** |
408 | // * Comment |
409 | // We could also support such patterns by special casing the first line |
410 | // instead. |
411 | unsigned IndentAtLineBreak; |
412 | |
413 | // This is to distinguish between the case when the last line was empty and |
414 | // the case when it started with a decoration ("*" or "* "). |
415 | bool LastLineNeedsDecoration; |
416 | |
417 | // Either "* " if all lines begin with a "*", or empty. |
418 | StringRef Decoration; |
419 | |
420 | // If this block comment has decorations, this is the column of the start of |
421 | // the decorations. |
422 | unsigned DecorationColumn; |
423 | |
424 | // If true, make sure that the opening '/**' and the closing '*/' ends on a |
425 | // line of itself. Styles like jsdoc require this for multiline comments. |
426 | bool DelimitersOnNewline; |
427 | |
428 | // Length of the sequence of tokens after this string literal that cannot |
429 | // contain line breaks. |
430 | unsigned UnbreakableTailLength; |
431 | }; |
432 | |
433 | class BreakableLineCommentSection : public BreakableComment { |
434 | public: |
435 | BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn, |
436 | unsigned OriginalStartColumn, bool FirstInLine, |
437 | bool InPPDirective, encoding::Encoding Encoding, |
438 | const FormatStyle &Style); |
439 | |
440 | unsigned getRangeLength(unsigned LineIndex, unsigned Offset, |
441 | StringRef::size_type Length, |
442 | unsigned StartColumn) const override; |
443 | unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override; |
444 | void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, |
445 | unsigned ContentIndent, |
446 | WhitespaceManager &Whitespaces) const override; |
447 | Split getReflowSplit(unsigned LineIndex, |
448 | llvm::Regex &CommentPragmasRegex) const override; |
449 | void reflow(unsigned LineIndex, |
450 | WhitespaceManager &Whitespaces) const override; |
451 | void adaptStartOfLine(unsigned LineIndex, |
452 | WhitespaceManager &Whitespaces) const override; |
453 | void updateNextToken(LineState &State) const override; |
454 | bool mayReflow(unsigned LineIndex, |
455 | llvm::Regex &CommentPragmasRegex) const override; |
456 | |
457 | private: |
458 | // OriginalPrefix[i] contains the original prefix of line i, including |
459 | // trailing whitespace before the start of the content. The indentation |
460 | // preceding the prefix is not included. |
461 | // For example, if the line is: |
462 | // // content |
463 | // then the original prefix is "// ". |
464 | SmallVector<StringRef, 16> OriginalPrefix; |
465 | |
466 | // Prefix[i] contains the intended leading "//" with trailing spaces to |
467 | // account for the indentation of content within the comment at line i after |
468 | // formatting. It can be different than the original prefix when the original |
469 | // line starts like this: |
470 | // //content |
471 | // Then the original prefix is "//", but the prefix is "// ". |
472 | SmallVector<StringRef, 16> Prefix; |
473 | |
474 | SmallVector<unsigned, 16> OriginalContentColumn; |
475 | |
476 | /// The token to which the last line of this breakable token belongs |
477 | /// to; nullptr if that token is the initial token. |
478 | /// |
479 | /// The distinction is because if the token of the last line of this breakable |
480 | /// token is distinct from the initial token, this breakable token owns the |
481 | /// whitespace before the token of the last line, and the whitespace manager |
482 | /// must be able to modify it. |
483 | FormatToken *LastLineTok = nullptr; |
484 | }; |
485 | } // namespace format |
486 | } // namespace clang |
487 | |
488 | #endif |
489 | |