1 /*
2 * SPDX-FileCopyrightText: Copyright (c) 2011-2026 Yegor Bugayenko
3 * SPDX-License-Identifier: MIT
4 */
5 package com.qulice.checkstyle;
6
7 import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
8 import com.puppycrawl.tools.checkstyle.api.DetailAST;
9 import com.puppycrawl.tools.checkstyle.api.TokenTypes;
10 import java.util.Optional;
11
12 /**
13 * Checks that String.split is only invoked with regex arguments that the JDK
14 * handles via its fastpath.
15 *
16 * <p>For anything beyond the fastpath, String.split builds a fresh Pattern
17 * on every call, which is wasteful in tight loops. Extract the regex into a
18 * private static final Pattern field and use Pattern.split(CharSequence)
19 * instead.
20 *
21 * <p>The JDK fastpath accepts only a one-char string whose sole character is
22 * not a regex meta character, or a two-char string whose first character is
23 * a backslash and whose second character is not an ASCII letter or digit.
24 *
25 * <p>Examples that are flagged:
26 *
27 * <pre>
28 * "abxxdexxzy".split("xx");
29 * "abxxdexxzy".split("xx", 1);
30 * "abxxdexxzy".split(".");
31 * </pre>
32 *
33 * <p>Examples that are accepted:
34 *
35 * <pre>
36 * "abxdexzy".split("x");
37 * "abxdexzy".split("x", 2);
38 * "abxdexzy".split("\n");
39 * "ab.ex.zy".split("\\.");
40 * </pre>
41 *
42 * <p>The check only reports calls whose first argument is a string literal:
43 * when the regex is a variable the optimization cannot be determined from
44 * the AST alone.
45 *
46 * @since 0.24
47 */
48 public final class SimpleStringSplitCheck extends AbstractCheck {
49
50 /**
51 * Regex meta characters the JDK fastpath refuses for a one-char pattern.
52 */
53 private static final String META = ".$|()[{^?*+\\";
54
55 @Override
56 public int[] getDefaultTokens() {
57 return new int[] {TokenTypes.METHOD_CALL};
58 }
59
60 @Override
61 public int[] getAcceptableTokens() {
62 return this.getDefaultTokens();
63 }
64
65 @Override
66 public int[] getRequiredTokens() {
67 return this.getDefaultTokens();
68 }
69
70 @Override
71 public void visitToken(final DetailAST ast) {
72 final Optional<DetailAST> literal =
73 SimpleStringSplitCheck.regexLiteral(ast);
74 final Optional<String> regex = literal.flatMap(
75 node -> SimpleStringSplitCheck.decode(node.getText())
76 );
77 if (regex.isPresent()
78 && !SimpleStringSplitCheck.optimized(regex.get())) {
79 this.log(
80 literal.get(),
81 "String.split regex is not JDK-optimized, use a precompiled java.util.regex.Pattern"
82 );
83 }
84 }
85
86 /**
87 * Returns the STRING_LITERAL node that is the first argument of a
88 * split call with one or two arguments, if any.
89 * @param call METHOD_CALL AST node
90 * @return Node of the literal regex, or empty when the call is not a
91 * split with a literal first argument
92 */
93 private static Optional<DetailAST> regexLiteral(final DetailAST call) {
94 Optional<DetailAST> result = Optional.empty();
95 if (SimpleStringSplitCheck.isSplitCall(call)) {
96 result = SimpleStringSplitCheck.firstLiteralArg(call);
97 }
98 return result;
99 }
100
101 /**
102 * Tells whether the method call invokes a method named "split".
103 * @param call METHOD_CALL AST node
104 * @return True when the call is of the form receiver.split(...)
105 */
106 private static boolean isSplitCall(final DetailAST call) {
107 final DetailAST dot = call.getFirstChild();
108 final boolean result;
109 if (dot == null || dot.getType() != TokenTypes.DOT) {
110 result = false;
111 } else {
112 final DetailAST name = dot.getLastChild();
113 result = name != null && name.getType() == TokenTypes.IDENT
114 && "split".equals(name.getText());
115 }
116 return result;
117 }
118
119 /**
120 * Finds the first STRING_LITERAL argument when the call has one or two
121 * arguments.
122 * @param call METHOD_CALL AST node
123 * @return STRING_LITERAL node, or empty when the shape does not match
124 */
125 private static Optional<DetailAST> firstLiteralArg(final DetailAST call) {
126 final DetailAST elist = call.findFirstToken(TokenTypes.ELIST);
127 Optional<DetailAST> result = Optional.empty();
128 if (elist != null && SimpleStringSplitCheck.isOneOrTwoArgs(elist)) {
129 final DetailAST expr = elist.findFirstToken(TokenTypes.EXPR);
130 if (expr != null && SimpleStringSplitCheck.isLoneStringLiteral(expr)) {
131 result = Optional.of(expr.getFirstChild());
132 }
133 }
134 return result;
135 }
136
137 /**
138 * Tells whether the given ELIST holds one or two EXPR children.
139 * @param elist ELIST AST node
140 * @return True if ELIST has 1 or 2 EXPR children
141 */
142 private static boolean isOneOrTwoArgs(final DetailAST elist) {
143 final int args = elist.getChildCount(TokenTypes.EXPR);
144 return args == 1 || args == 2;
145 }
146
147 /**
148 * Tells whether the given EXPR is a lone STRING_LITERAL.
149 * @param expr EXPR AST node
150 * @return True if EXPR has a single STRING_LITERAL child
151 */
152 private static boolean isLoneStringLiteral(final DetailAST expr) {
153 return expr.getChildCount() == 1
154 && expr.getFirstChild().getType() == TokenTypes.STRING_LITERAL;
155 }
156
157 /**
158 * Tells whether the given regex string would hit the JDK fastpath.
159 * @param regex Runtime regex string
160 * @return True if JDK fastpath applies
161 */
162 private static boolean optimized(final String regex) {
163 final boolean result;
164 final int len = regex.length();
165 if (len == 1) {
166 result = SimpleStringSplitCheck.META.indexOf(regex.charAt(0)) < 0;
167 } else if (len == 2 && regex.charAt(0) == '\\') {
168 result = !SimpleStringSplitCheck.isAsciiAlphanumeric(regex.charAt(1));
169 } else {
170 result = false;
171 }
172 return result;
173 }
174
175 /**
176 * Tells whether the given character is an ASCII letter or digit.
177 * @param chr Character to test
178 * @return True if ASCII letter or digit
179 */
180 private static boolean isAsciiAlphanumeric(final char chr) {
181 return SimpleStringSplitCheck.isAsciiDigit(chr)
182 || SimpleStringSplitCheck.isAsciiLetter(chr);
183 }
184
185 /**
186 * Tells whether the given character is an ASCII digit 0-9.
187 * @param chr Character to test
188 * @return True if ASCII digit
189 */
190 private static boolean isAsciiDigit(final char chr) {
191 return chr >= '0' && chr <= '9';
192 }
193
194 /**
195 * Tells whether the given character is an ASCII letter a-z or A-Z.
196 * @param chr Character to test
197 * @return True if ASCII letter
198 */
199 private static boolean isAsciiLetter(final char chr) {
200 return chr >= 'a' && chr <= 'z'
201 || chr >= 'A' && chr <= 'Z';
202 }
203
204 /**
205 * Decodes a Java string literal (with surrounding double quotes) to the
206 * runtime string it denotes. Returns empty when the literal contains
207 * escape sequences the check does not understand, so the check can stay
208 * silent rather than guess.
209 * @param text Raw token text including the surrounding double quotes
210 * @return Decoded runtime string, or empty on unsupported escapes
211 */
212 private static Optional<String> decode(final String text) {
213 final String body = text.substring(1, text.length() - 1);
214 final StringBuilder out = new StringBuilder(body.length());
215 int idx = 0;
216 boolean failed = false;
217 while (idx < body.length() && !failed) {
218 final int advance = SimpleStringSplitCheck.step(body, idx, out);
219 if (advance < 0) {
220 failed = true;
221 } else {
222 idx += advance;
223 }
224 }
225 final Optional<String> result;
226 if (failed) {
227 result = Optional.empty();
228 } else {
229 result = Optional.of(out.toString());
230 }
231 return result;
232 }
233
234 /**
235 * Advances one step through the string literal body.
236 * @param body Literal body without surrounding quotes
237 * @param idx Current index
238 * @param out Receiver for the decoded char
239 * @return Number of source chars consumed, or -1 on unsupported escape
240 */
241 private static int step(
242 final String body, final int idx, final StringBuilder out
243 ) {
244 final char chr = body.charAt(idx);
245 final int advance;
246 if (chr == '\\') {
247 advance = SimpleStringSplitCheck.handleEscape(body, idx, out);
248 } else {
249 out.append(chr);
250 advance = 1;
251 }
252 return advance;
253 }
254
255 /**
256 * Handles a backslash escape starting at {@code idx} in {@code body}.
257 * @param body Literal body without surrounding quotes
258 * @param idx Index of the backslash
259 * @param out Receiver for the decoded char
260 * @return Number of source chars consumed, or -1 on unsupported escape
261 */
262 private static int handleEscape(
263 final String body, final int idx, final StringBuilder out
264 ) {
265 final int advance;
266 if (idx + 1 >= body.length()) {
267 advance = -1;
268 } else {
269 final int decoded = SimpleStringSplitCheck.escape(
270 body.charAt(idx + 1)
271 );
272 if (decoded < 0) {
273 advance = -1;
274 } else {
275 out.append((char) decoded);
276 advance = 2;
277 }
278 }
279 return advance;
280 }
281
282 /**
283 * Translates a single Java escape letter to its runtime character.
284 * @param chr Character after the backslash
285 * @return Runtime character code, or -1 for unsupported escapes
286 */
287 private static int escape(final char chr) {
288 final int result;
289 switch (chr) {
290 case 'n':
291 result = '\n';
292 break;
293 case 't':
294 result = '\t';
295 break;
296 case 'r':
297 result = '\r';
298 break;
299 case 'b':
300 result = '\b';
301 break;
302 case 'f':
303 result = '\f';
304 break;
305 case 's':
306 result = ' ';
307 break;
308 case '\'':
309 result = '\'';
310 break;
311 case '"':
312 result = '"';
313 break;
314 case '\\':
315 result = '\\';
316 break;
317 default:
318 result = -1;
319 break;
320 }
321 return result;
322 }
323 }