View Javadoc
1   /*
2    * SPDX-FileCopyrightText: Copyright (c) 2011-2026 Yegor Bugayenko
3    * SPDX-License-Identifier: MIT
4    */
5   package com.qulice.checkstyle;
6   
7   import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
8   import com.puppycrawl.tools.checkstyle.api.DetailAST;
9   import com.puppycrawl.tools.checkstyle.api.TokenTypes;
10  import java.util.Optional;
11  
12  /**
13   * Checks that String.split is only invoked with regex arguments that the JDK
14   * handles via its fastpath.
15   *
16   * <p>For anything beyond the fastpath, String.split builds a fresh Pattern
17   * on every call, which is wasteful in tight loops. Extract the regex into a
18   * private static final Pattern field and use Pattern.split(CharSequence)
19   * instead.
20   *
21   * <p>The JDK fastpath accepts only a one-char string whose sole character is
22   * not a regex meta character, or a two-char string whose first character is
23   * a backslash and whose second character is not an ASCII letter or digit.
24   *
25   * <p>Examples that are flagged:
26   *
27   * <pre>
28   * "abxxdexxzy".split("xx");
29   * "abxxdexxzy".split("xx", 1);
30   * "abxxdexxzy".split(".");
31   * </pre>
32   *
33   * <p>Examples that are accepted:
34   *
35   * <pre>
36   * "abxdexzy".split("x");
37   * "abxdexzy".split("x", 2);
38   * "abxdexzy".split("\n");
39   * "ab.ex.zy".split("\\.");
40   * </pre>
41   *
42   * <p>The check only reports calls whose first argument is a string literal:
43   * when the regex is a variable the optimization cannot be determined from
44   * the AST alone.
45   *
46   * @since 0.24
47   */
48  public final class SimpleStringSplitCheck extends AbstractCheck {
49  
50      /**
51       * Regex meta characters the JDK fastpath refuses for a one-char pattern.
52       */
53      private static final String META = ".$|()[{^?*+\\";
54  
55      @Override
56      public int[] getDefaultTokens() {
57          return new int[] {TokenTypes.METHOD_CALL};
58      }
59  
60      @Override
61      public int[] getAcceptableTokens() {
62          return this.getDefaultTokens();
63      }
64  
65      @Override
66      public int[] getRequiredTokens() {
67          return this.getDefaultTokens();
68      }
69  
70      @Override
71      public void visitToken(final DetailAST ast) {
72          final Optional<DetailAST> literal =
73              SimpleStringSplitCheck.regexLiteral(ast);
74          final Optional<String> regex = literal.flatMap(
75              node -> SimpleStringSplitCheck.decode(node.getText())
76          );
77          if (regex.isPresent()
78              && !SimpleStringSplitCheck.optimized(regex.get())) {
79              this.log(
80                  literal.get(),
81                  "String.split regex is not JDK-optimized, use a precompiled java.util.regex.Pattern"
82              );
83          }
84      }
85  
86      /**
87       * Returns the STRING_LITERAL node that is the first argument of a
88       * split call with one or two arguments, if any.
89       * @param call METHOD_CALL AST node
90       * @return Node of the literal regex, or empty when the call is not a
91       *  split with a literal first argument
92       */
93      private static Optional<DetailAST> regexLiteral(final DetailAST call) {
94          Optional<DetailAST> result = Optional.empty();
95          if (SimpleStringSplitCheck.isSplitCall(call)) {
96              result = SimpleStringSplitCheck.firstLiteralArg(call);
97          }
98          return result;
99      }
100 
101     /**
102      * Tells whether the method call invokes a method named "split".
103      * @param call METHOD_CALL AST node
104      * @return True when the call is of the form receiver.split(...)
105      */
106     private static boolean isSplitCall(final DetailAST call) {
107         final DetailAST dot = call.getFirstChild();
108         final boolean result;
109         if (dot == null || dot.getType() != TokenTypes.DOT) {
110             result = false;
111         } else {
112             final DetailAST name = dot.getLastChild();
113             result = name != null && name.getType() == TokenTypes.IDENT
114                 && "split".equals(name.getText());
115         }
116         return result;
117     }
118 
119     /**
120      * Finds the first STRING_LITERAL argument when the call has one or two
121      * arguments.
122      * @param call METHOD_CALL AST node
123      * @return STRING_LITERAL node, or empty when the shape does not match
124      */
125     private static Optional<DetailAST> firstLiteralArg(final DetailAST call) {
126         final DetailAST elist = call.findFirstToken(TokenTypes.ELIST);
127         Optional<DetailAST> result = Optional.empty();
128         if (elist != null && SimpleStringSplitCheck.isOneOrTwoArgs(elist)) {
129             final DetailAST expr = elist.findFirstToken(TokenTypes.EXPR);
130             if (expr != null && SimpleStringSplitCheck.isLoneStringLiteral(expr)) {
131                 result = Optional.of(expr.getFirstChild());
132             }
133         }
134         return result;
135     }
136 
137     /**
138      * Tells whether the given ELIST holds one or two EXPR children.
139      * @param elist ELIST AST node
140      * @return True if ELIST has 1 or 2 EXPR children
141      */
142     private static boolean isOneOrTwoArgs(final DetailAST elist) {
143         final int args = elist.getChildCount(TokenTypes.EXPR);
144         return args == 1 || args == 2;
145     }
146 
147     /**
148      * Tells whether the given EXPR is a lone STRING_LITERAL.
149      * @param expr EXPR AST node
150      * @return True if EXPR has a single STRING_LITERAL child
151      */
152     private static boolean isLoneStringLiteral(final DetailAST expr) {
153         return expr.getChildCount() == 1
154             && expr.getFirstChild().getType() == TokenTypes.STRING_LITERAL;
155     }
156 
157     /**
158      * Tells whether the given regex string would hit the JDK fastpath.
159      * @param regex Runtime regex string
160      * @return True if JDK fastpath applies
161      */
162     private static boolean optimized(final String regex) {
163         final boolean result;
164         final int len = regex.length();
165         if (len == 1) {
166             result = SimpleStringSplitCheck.META.indexOf(regex.charAt(0)) < 0;
167         } else if (len == 2 && regex.charAt(0) == '\\') {
168             result = !SimpleStringSplitCheck.isAsciiAlphanumeric(regex.charAt(1));
169         } else {
170             result = false;
171         }
172         return result;
173     }
174 
175     /**
176      * Tells whether the given character is an ASCII letter or digit.
177      * @param chr Character to test
178      * @return True if ASCII letter or digit
179      */
180     private static boolean isAsciiAlphanumeric(final char chr) {
181         return SimpleStringSplitCheck.isAsciiDigit(chr)
182             || SimpleStringSplitCheck.isAsciiLetter(chr);
183     }
184 
185     /**
186      * Tells whether the given character is an ASCII digit 0-9.
187      * @param chr Character to test
188      * @return True if ASCII digit
189      */
190     private static boolean isAsciiDigit(final char chr) {
191         return chr >= '0' && chr <= '9';
192     }
193 
194     /**
195      * Tells whether the given character is an ASCII letter a-z or A-Z.
196      * @param chr Character to test
197      * @return True if ASCII letter
198      */
199     private static boolean isAsciiLetter(final char chr) {
200         return chr >= 'a' && chr <= 'z'
201             || chr >= 'A' && chr <= 'Z';
202     }
203 
204     /**
205      * Decodes a Java string literal (with surrounding double quotes) to the
206      * runtime string it denotes. Returns empty when the literal contains
207      * escape sequences the check does not understand, so the check can stay
208      * silent rather than guess.
209      * @param text Raw token text including the surrounding double quotes
210      * @return Decoded runtime string, or empty on unsupported escapes
211      */
212     private static Optional<String> decode(final String text) {
213         final String body = text.substring(1, text.length() - 1);
214         final StringBuilder out = new StringBuilder(body.length());
215         int idx = 0;
216         boolean failed = false;
217         while (idx < body.length() && !failed) {
218             final int advance = SimpleStringSplitCheck.step(body, idx, out);
219             if (advance < 0) {
220                 failed = true;
221             } else {
222                 idx += advance;
223             }
224         }
225         final Optional<String> result;
226         if (failed) {
227             result = Optional.empty();
228         } else {
229             result = Optional.of(out.toString());
230         }
231         return result;
232     }
233 
234     /**
235      * Advances one step through the string literal body.
236      * @param body Literal body without surrounding quotes
237      * @param idx Current index
238      * @param out Receiver for the decoded char
239      * @return Number of source chars consumed, or -1 on unsupported escape
240      */
241     private static int step(
242         final String body, final int idx, final StringBuilder out
243     ) {
244         final char chr = body.charAt(idx);
245         final int advance;
246         if (chr == '\\') {
247             advance = SimpleStringSplitCheck.handleEscape(body, idx, out);
248         } else {
249             out.append(chr);
250             advance = 1;
251         }
252         return advance;
253     }
254 
255     /**
256      * Handles a backslash escape starting at {@code idx} in {@code body}.
257      * @param body Literal body without surrounding quotes
258      * @param idx Index of the backslash
259      * @param out Receiver for the decoded char
260      * @return Number of source chars consumed, or -1 on unsupported escape
261      */
262     private static int handleEscape(
263         final String body, final int idx, final StringBuilder out
264     ) {
265         final int advance;
266         if (idx + 1 >= body.length()) {
267             advance = -1;
268         } else {
269             final int decoded = SimpleStringSplitCheck.escape(
270                 body.charAt(idx + 1)
271             );
272             if (decoded < 0) {
273                 advance = -1;
274             } else {
275                 out.append((char) decoded);
276                 advance = 2;
277             }
278         }
279         return advance;
280     }
281 
282     /**
283      * Translates a single Java escape letter to its runtime character.
284      * @param chr Character after the backslash
285      * @return Runtime character code, or -1 for unsupported escapes
286      */
287     private static int escape(final char chr) {
288         final int result;
289         switch (chr) {
290             case 'n':
291                 result = '\n';
292                 break;
293             case 't':
294                 result = '\t';
295                 break;
296             case 'r':
297                 result = '\r';
298                 break;
299             case 'b':
300                 result = '\b';
301                 break;
302             case 'f':
303                 result = '\f';
304                 break;
305             case 's':
306                 result = ' ';
307                 break;
308             case '\'':
309                 result = '\'';
310                 break;
311             case '"':
312                 result = '"';
313                 break;
314             case '\\':
315                 result = '\\';
316                 break;
317             default:
318                 result = -1;
319                 break;
320         }
321         return result;
322     }
323 }