JavaParser/com/github/javaparser/utils/StringEscapeUtils.java

1	/*
2	* Copyright (C) 2007-2010 Júlio Vilmar Gesser.
3	* Copyright (C) 2011, 2013-2020 The JavaParser Team.
4	*
5	* This file is part of JavaParser.
6	*
7	* JavaParser can be used either under the terms of
8	* a) the GNU Lesser General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	* b) the terms of the Apache License
12	*
13	* You should have received a copy of both licenses in LICENCE.LGPL and
14	* LICENCE.APACHE. Please refer to those files for details.
15	*
16	* JavaParser is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU Lesser General Public License for more details.
20	*/
21	package com.github.javaparser.utils;
22
23	import java.io.IOException;
24	import java.io.StringWriter;
25	import java.io.Writer;
26	import java.util.HashMap;
27	import java.util.HashSet;
28
/**
 * Adapted from apache commons-lang3 project.
 * <p>
 * Escapes and unescapes characters in strings according to the Java string literal rules.
 */
public final class StringEscapeUtils {

    private StringEscapeUtils() {
    }

    /**
     * <p>Escapes the characters in a {@code String} using Java String rules.</p>
     *
     * <p>Double quotes, backslashes and the control characters backspace, newline, tab,
     * form feed and carriage return are replaced by their two-character escape sequences.
     * So a tab becomes the characters {@code '\\'} and {@code 't'}. Single quotes and all
     * other characters are copied unchanged.</p>
     *
     * <p>Example:</p>
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn't say, \"Stop!\"
     * </pre>
     *
     * @param input String to escape values in, may be null
     * @return String with escaped values, {@code null} if null string input
     */
    public static String escapeJava(final String input) {
        return ESCAPE_JAVA.translate(input);
    }

    /**
     * <p>Unescapes any Java literals found in the {@code String}.
     * For example, it will turn a sequence of {@code '\'} and
     * {@code 'n'} into a newline character, unless the {@code '\'}
     * is preceded by another {@code '\'}.</p>
     *
     * <p>Octal escapes, Unicode escapes (backslash, one or more {@code 'u'}, four hex digits),
     * the control-character escapes, the space escape (backslash followed by {@code 's'})
     * and escaped quotes/backslashes are translated. A backslash that does not start any
     * known escape is dropped and the following character is kept.</p>
     *
     * <p>This is similar to {@code String::translateEscapes} (available since JDK 15), which
     * however does not translate Unicode escapes.</p>
     *
     * @param input the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     * @throws IllegalArgumentException if a Unicode escape is truncated or contains
     *                                  characters that are not hexadecimal digits
     */
    public static String unescapeJava(final String input) {
        return UNESCAPE_JAVA.translate(input);
    }

    /**
     * <p>Unescapes any Java literals found in the content of a text block.</p>
     *
     * <p>Handles everything {@link #unescapeJava(String)} handles and, in addition, the line
     * continuation escape: a backslash immediately followed by a line terminator (LF, CR or
     * CR LF) is removed together with that line terminator.</p>
     *
     * @param input the text block content to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     * @throws IllegalArgumentException if a Unicode escape is truncated or contains
     *                                  characters that are not hexadecimal digits
     */
    public static String unescapeJavaTextBlock(final String input) {
        return UNESCAPE_JAVA_TEXT_BLOCK.translate(input);
    }

    // NOTE: the declaration order of the following constants matters, later initializers
    // reference earlier constants.

    private static final LookupTranslator JAVA_CTRL_CHARS_UNESCAPE = new LookupTranslator(new String[][]{
            {"\\b", "\b"},
            {"\\n", "\n"},
            {"\\t", "\t"},
            {"\\f", "\f"},
            {"\\r", "\r"}});

    private static final LookupTranslator JAVA_CTRL_CHARS_ESCAPE = new LookupTranslator(new String[][]{
            {"\b", "\\b"},
            {"\n", "\\n"},
            {"\t", "\\t"},
            {"\f", "\\f"},
            {"\r", "\\r"}});

    private static final CharSequenceTranslator ESCAPE_JAVA = new AggregateTranslator(
            new LookupTranslator(
                    new String[][]{
                            {"\"", "\\\""},
                            {"\\", "\\\\"},
                    }),
            JAVA_CTRL_CHARS_ESCAPE);

    private static final CharSequenceTranslator UNESCAPE_JAVA = new AggregateTranslator(
            new OctalUnescaper(),
            new UnicodeUnescaper(),
            JAVA_CTRL_CHARS_UNESCAPE,
            new LookupTranslator(new String[][]{
                    {"\\\\", "\\"},
                    {"\\\"", "\""},
                    {"\\'", "'"},
                    // the space escape is valid in every string literal, not only in text blocks
                    {"\\s", " "},
                    // fallback: a backslash that starts no known escape is dropped
                    {"\\", ""}}));

    private static final CharSequenceTranslator UNESCAPE_JAVA_TEXT_BLOCK = new AggregateTranslator(
            new OctalUnescaper(),
            new UnicodeUnescaper(),
            JAVA_CTRL_CHARS_UNESCAPE,
            new LookupTranslator(new String[][]{
                    {"\\\\", "\\"},
                    {"\\\"", "\""},
                    {"\\'", "'"},
                    {"\\s", " "},
                    // line continuation: backslash + line terminator vanishes; the greedy lookup
                    // tries the three-character CR LF form before the two-character forms
                    {"\\\r\n", ""},
                    {"\\\n", ""},
                    {"\\\r", ""},
                    // fallback: a backslash that starts no known escape is dropped
                    {"\\", ""}}));

    /**
     * Adapted from apache commons-lang3 project.
     * <p>
     * An API for translating text.
     * Its core use is to escape and unescape text. Because escaping and unescaping
     * is completely contextual, the API does not present two separate signatures.
     *
     * @since 3.0
     */
    private static abstract class CharSequenceTranslator {

        /**
         * Translate a set of codepoints, represented by an int index into a CharSequence,
         * into another set of codepoints. The number of codepoints consumed must be returned,
         * and the only IOExceptions thrown must be from interacting with the Writer so that
         * the top level API may reliably ignore StringWriter IOExceptions.
         *
         * @param input CharSequence that is being translated
         * @param index int representing the current point of translation
         * @param out Writer to translate the text to
         * @return int count of codepoints consumed, {@code 0} if nothing was translated
         * @throws IOException if and only if the Writer produces an IOException
         */
        protected abstract int translate(CharSequence input, int index, Writer out) throws IOException;

        /**
         * Helper for non-Writer usage.
         *
         * @param input CharSequence to be translated, may be null
         * @return String output of translation, {@code null} if the input is null
         */
        private String translate(final CharSequence input) {
            if (input == null) {
                return null;
            }
            try {
                final StringWriter writer = new StringWriter(input.length() * 2);
                translate(input, writer);
                return writer.toString();
            } catch (final IOException ioe) {
                // this should never ever happen while writing to a StringWriter
                throw new RuntimeException(ioe);
            }
        }

        /**
         * Translate an input onto a Writer. Its algorithm is tightly coupled with the
         * abstract method of this class: whenever the abstract method consumes nothing,
         * the current code point is copied verbatim to the Writer.
         *
         * @param input CharSequence that is being translated, a null input writes nothing
         * @param out Writer to translate the text to
         * @throws IOException if and only if the Writer produces an IOException
         * @throws IllegalArgumentException if the Writer is null
         */
        private void translate(final CharSequence input, final Writer out) throws IOException {
            if (out == null) {
                throw new IllegalArgumentException("The Writer must not be null");
            }
            if (input == null) {
                return;
            }
            int pos = 0;
            final int len = input.length();
            while (pos < len) {
                final int consumed = translate(input, pos, out);
                if (consumed == 0) {
                    // inlined implementation of Character.toChars(Character.codePointAt(input, pos))
                    // avoids allocating temp char arrays and duplicate checks
                    final char c1 = input.charAt(pos);
                    out.write(c1);
                    pos++;
                    if (Character.isHighSurrogate(c1) && pos < len) {
                        final char c2 = input.charAt(pos);
                        if (Character.isLowSurrogate(c2)) {
                            out.write(c2);
                            pos++;
                        }
                    }
                    continue;
                }
                // contract with translators is that they have to understand codepoints
                // and they just took care of a surrogate pair
                for (int pt = 0; pt < consumed; pt++) {
                    pos += Character.charCount(Character.codePointAt(input, pos));
                }
            }
        }
    }

    /**
     * Adapted from apache commons-lang3 project.
     * <p>
     * Translates a value using a lookup table.
     *
     * @since 3.0
     */
    private static class LookupTranslator extends CharSequenceTranslator {

        private final HashMap<String, String> lookupMap;
        private final HashSet<Character> prefixSet;
        private final int shortest;
        private final int longest;

        /**
         * Define the lookup table to be used in translation
         * <p>
         * Note that, as of Lang 3.1, the key to the lookup table is converted to a
         * java.lang.String. This is because we need the key to support hashCode and
         * equals(Object), allowing it to be the key for a HashMap. See LANG-882.
         *
         * @param lookup CharSequence[][] table of size [*][2]; each key must be non-empty
         */
        private LookupTranslator(final CharSequence[]... lookup) {
            lookupMap = new HashMap<>();
            prefixSet = new HashSet<>();
            int minKeyLength = Integer.MAX_VALUE;
            int maxKeyLength = 0;
            if (lookup != null) {
                for (final CharSequence[] seq : lookup) {
                    this.lookupMap.put(seq[0].toString(), seq[1].toString());
                    // remember the first char of every key for a cheap "can anything match" check
                    this.prefixSet.add(seq[0].charAt(0));
                    final int sz = seq[0].length();
                    minKeyLength = Math.min(minKeyLength, sz);
                    maxKeyLength = Math.max(maxKeyLength, sz);
                }
            }
            shortest = minKeyLength;
            longest = maxKeyLength;
        }

        /**
         * Looks for the longest key of the lookup table that starts at {@code index} and
         * writes its replacement.
         * <p>
         * The returned count is the length of the matched key in chars; the keys used in this
         * file contain no supplementary characters, so this equals the number of code points.
         * {@inheritDoc}
         */
        @Override
        protected int translate(final CharSequence input, final int index, final Writer out) throws IOException {
            // check if translation exists for the input at position index
            if (!prefixSet.contains(input.charAt(index))) {
                return 0;
            }
            // never look past the end of the input
            final int max = Math.min(longest, input.length() - index);
            // implement greedy algorithm by trying maximum match first
            for (int i = max; i >= shortest; i--) {
                final CharSequence subSeq = input.subSequence(index, index + i);
                final String result = lookupMap.get(subSeq.toString());
                if (result != null) {
                    out.write(result);
                    return i;
                }
            }
            return 0;
        }
    }

    /**
     * Adapted from apache commons-lang3 project.
     * <p>
     * Executes a sequence of translators one after the other. Execution ends whenever
     * the first translator consumes codepoints from the input.
     *
     * @since 3.0
     */
    private static class AggregateTranslator extends CharSequenceTranslator {

        private final CharSequenceTranslator[] translators;

        /**
         * Specify the translators to be used at creation time.
         *
         * @param translators CharSequenceTranslator array to aggregate; a null array is
         *                    treated as an empty one, so the aggregate then translates nothing
         */
        private AggregateTranslator(final CharSequenceTranslator... translators) {
            // defensive copy; never keep null so that translate() cannot fail on iteration
            this.translators = translators == null ? new CharSequenceTranslator[0] : translators.clone();
        }

        /**
         * The first translator to consume codepoints from the input is the 'winner'.
         * Execution stops with the number of consumed codepoints being returned.
         * {@inheritDoc}
         */
        @Override
        protected int translate(final CharSequence input, final int index, final Writer out) throws IOException {
            for (final CharSequenceTranslator translator : translators) {
                final int consumed = translator.translate(input, index, out);
                if (consumed != 0) {
                    return consumed;
                }
            }
            return 0;
        }

    }

    /**
     * Adapted from apache commons-lang3 project.
     * <p>
     * Translate escaped octal Strings back to their octal values.
     * <p>
     * For example, "\45" should go back to being the specific value (a %).
     * <p>
     * Note that this currently only supports the viable range of octal for Java; namely
     * 0 to 377. This is because parsing Java is the main use case.
     *
     * @since 3.0
     */
    private static class OctalUnescaper extends CharSequenceTranslator {

        /**
         * Translates a backslash followed by one to three octal digits. A third digit is
         * only consumed when the first digit is in the range 0 to 3, which keeps the value
         * at or below octal 377.
         * {@inheritDoc}
         */
        @Override
        protected int translate(final CharSequence input, final int index, final Writer out) throws IOException {
            final int remaining = input.length() - index - 1; // how many characters left, ignoring the first \
            if (input.charAt(index) != '\\' || remaining <= 0 || !isOctalDigit(input.charAt(index + 1))) {
                return 0;
            }

            final int next = index + 1;
            final int next2 = index + 2;
            final int next3 = index + 3;

            // allocated only once we know there is an octal escape to decode
            final StringBuilder builder = new StringBuilder(3);
            // we know this is good as we checked it in the guard above
            builder.append(input.charAt(next));

            if (remaining > 1 && isOctalDigit(input.charAt(next2))) {
                builder.append(input.charAt(next2));
                if (remaining > 2 && isZeroToThree(input.charAt(next)) && isOctalDigit(input.charAt(next3))) {
                    builder.append(input.charAt(next3));
                }
            }

            out.write(Integer.parseInt(builder.toString(), 8));
            // the backslash plus the digits that were consumed
            return 1 + builder.length();
        }

        /**
         * Checks if the given char is an octal digit. Octal digits are the character representations of the digits 0 to
         * 7.
         *
         * @param ch the char to check
         * @return true if the given char is the character representation of one of the digits from 0 to 7
         */
        private boolean isOctalDigit(final char ch) {
            return ch >= '0' && ch <= '7';
        }

        /**
         * Checks if the given char is the character representation of one of the digit from 0 to 3.
         *
         * @param ch the char to check
         * @return true if the given char is the character representation of one of the digits from 0 to 3
         */
        private boolean isZeroToThree(final char ch) {
            return ch >= '0' && ch <= '3';
        }
    }

    /**
     * Adapted from apache commons-lang3 project.
     * <p>
     * Translates escaped Unicode values of the form \\u+\d\d\d\d back to
     * Unicode. It supports multiple 'u' characters and will work with or
     * without the +.
     *
     * @since 3.0
     */
    private static class UnicodeUnescaper extends CharSequenceTranslator {

        /**
         * Translates a Unicode escape starting at {@code index} into the char it denotes.
         * {@inheritDoc}
         *
         * @throws IllegalArgumentException if fewer than four characters follow the escape
         *                                  prefix or if any of them is not a hexadecimal digit
         */
        @Override
        protected int translate(final CharSequence input, final int index, final Writer out) throws IOException {
            if (input.charAt(index) == '\\' && index + 1 < input.length() && input.charAt(index + 1) == 'u') {
                // consume optional additional 'u' chars
                int i = 2;
                while (index + i < input.length() && input.charAt(index + i) == 'u') {
                    i++;
                }

                if (index + i < input.length() && input.charAt(index + i) == '+') {
                    i++;
                }

                if (index + i + 4 <= input.length()) {
                    // Get 4 hex digits
                    final CharSequence unicode = input.subSequence(index + i, index + i + 4);

                    // Decode digit by digit instead of Integer.parseInt: parseInt accepts a
                    // leading sign, which would silently turn malformed input into a bogus char.
                    int value = 0;
                    for (int k = 0; k < 4; k++) {
                        final int digit = Character.digit(unicode.charAt(k), 16);
                        if (digit < 0) {
                            throw new IllegalArgumentException("Unable to parse unicode value: " + unicode);
                        }
                        value = (value << 4) | digit;
                    }
                    out.write((char) value);
                    return i + 4;
                }
                throw new IllegalArgumentException("Less than 4 hex digits in unicode value: '" + input.subSequence(index, input.length())
                        + "' due to end of CharSequence");
            }
            return 0;
        }
    }

}
435