package hirondelle.web4j.util;

/**
|
|
|
4 |
(UNPUBLISHED) Simple collection of commonly used regular expressions, as <tt>String</tt>s.
|
|
|
5 |
|
|
|
6 |
<P>Regular expressions are both cryptic and error-prone. Reuse of
|
|
|
7 |
the regular expressions in this class should increase legibility in the caller, and
|
|
|
8 |
reduce testing time.
|
|
|
9 |
|
|
|
10 |
<P>(These are presented here as <tt>String</tt>s, and not as <tt>Pattern</tt>s, to
|
|
|
11 |
aid in constructing complex expressions out of simpler ones.)
|
|
|
12 |
|
|
|
13 |
<P>Some items follow the style of <em>Mastering Regular Expressions</em>, by
|
|
|
14 |
Jeffrey Friedl (0-596-00289-0).
|
|
|
15 |
|
|
|
16 |
<P>Grouping represents a problem for this class, since the caller may or
|
|
|
17 |
may not desire a particular element to be a capturing group.
|
|
|
18 |
*/
|
|
|
19 |
public final class Regex {
|
|
|
20 |
|
|
|
21 |
/**
|
|
|
22 |
Not a well-formed regex, but a symbolic name for alternation, intended
|
|
|
23 |
simply to improve legibility of regexes.
|
|
|
24 |
*/
|
|
|
25 |
public static final String OR = "|";
|
|
|
26 |
|
|
|
27 |
/**
|
|
|
28 |
Whitespace.
|
|
|
29 |
*/
|
|
|
30 |
public static final String WS = "\\s*";
|
|
|
31 |
public static final String DOT = "\\.";
|
|
|
32 |
public static final String ANY_CHARS = ".*";
|
|
|
33 |
public static final String START_TAG = "<";
|
|
|
34 |
public static final String END_TAG = WS + ">";
|
|
|
35 |
public static final String ALL_BUT_END_OF_TAG = "[^>]*";
|
|
|
36 |
public static final String ALL_BUT_START_OF_TAG = "[^<]*";
|
|
|
37 |
public static final String QUOTE = "(?:'|\")";
|
|
|
38 |
public static final String NO_SPECIAL_HTML_CHAR = "[^<>'\"]";
|
|
|
39 |
|
|
|
40 |
/** Group 1 returns the attribute value. */
|
|
|
41 |
public static final String QUOTED_ATTR = QUOTE+"((?:"+NO_SPECIAL_HTML_CHAR+")*)"+QUOTE;
|
|
|
42 |
|
|
|
43 |
/** Group 1 returns the trimmed text. */
|
|
|
44 |
public static final String TRIMMED_TEXT = "(?:\\s)*((?:\\S(?:.)*\\S)|(?:\\S))(?:\\s)*";
|
|
|
45 |
|
|
|
46 |
//Might be used for unquoted attributes:
|
|
|
47 |
//public static final String fNO_SPECIAL_HTML_CHARS_OR_SPACES = "[^<>'\"\\s]";
|
|
|
48 |
|
|
|
49 |
public static final int ENTIRE_MATCH = 0;
|
|
|
50 |
public static final int FIRST_GROUP = 1;
|
|
|
51 |
public static final int SECOND_GROUP = 2;
|
|
|
52 |
public static final int THIRD_GROUP = 3;
|
|
|
53 |
public static final int FOURTH_GROUP = 4;
|
|
|
54 |
|
|
|
55 |
// NEW items added after web4j 1.3.0 :
|
|
|
56 |
|
|
|
57 |
public static final String SINGLE_QUOTED_ATTR = "'[^']*'";
|
|
|
58 |
public static final String DOUBLE_QUOTED_ATTR = "\"[^\"]*\"";
|
|
|
59 |
public static final String UNQUOTED_ATTR = "[-.:\\w]+";
|
|
|
60 |
|
|
|
61 |
/**
|
|
|
62 |
Group 1 is the attribute value, <em>with</em> quotes.
|
|
|
63 |
<P>Use {@link Util#removeQuotes(String)} to remove any quotes, if needed.
|
|
|
64 |
<P>The content of an HTML tag attribute is specified at
|
|
|
65 |
http://www.w3.org/TR/html4/intro/sgmltut.html#attributes
|
|
|
66 |
*/
|
|
|
67 |
public static final String ATTR_VALUE =
|
|
|
68 |
"(" +
|
|
|
69 |
SINGLE_QUOTED_ATTR +
|
|
|
70 |
OR +
|
|
|
71 |
DOUBLE_QUOTED_ATTR +
|
|
|
72 |
OR +
|
|
|
73 |
UNQUOTED_ATTR +
|
|
|
74 |
")"
|
|
|
75 |
;
|
|
|
76 |
|
|
|
77 |
public static final String ATTR_NAME = "[a-zA-Z]+";
|
|
|
78 |
public static final String TAG_NAME = "[a-zA-Z]+";
|
|
|
79 |
public static final String ATTR = "\\s+" + ATTR_NAME + WS + "=" + WS + ATTR_VALUE;
|
|
|
80 |
public static final String FIRST_TAG = "<" + TAG_NAME + "(?:" + ATTR + ")*" + WS + ">";
|
|
|
81 |
public static final String SECOND_TAG = "</" + TAG_NAME + ">";
|
|
|
82 |
public static final String TAG_BODY = "(.*?)";
|
|
|
83 |
public static final String ENTIRE_TAG = FIRST_TAG + TAG_BODY + SECOND_TAG;
|
|
|
84 |
|
|
|
85 |
public static final String BLANK_LINE = "^\\s*$";
|
|
|
86 |
|
|
|
87 |
/**
|
|
|
88 |
Finds positions where a comma should be placed in a number, in the style
|
|
|
89 |
<tt>1,000,000</tt>. Intended for integers, but can also handle up to 3 decimal
|
|
|
90 |
places. For example, <tt>10000.001</tt> gives <tt>10,000.001</tt>.
|
|
|
91 |
*/
|
|
|
92 |
public static final String COMMA_INSERTION = "(?<=\\d)(?=(\\d\\d\\d)+(?!\\d))";
|
|
|
93 |
|
|
|
94 |
/**
|
|
|
95 |
Either integer or floating point number. Has the following properties
|
|
|
96 |
<ul>
|
|
|
97 |
<li>digits with possible decimal point
|
|
|
98 |
<li>possible leading plus or minus sign
|
|
|
99 |
<li>no grouping delimiters (such as a comma)
|
|
|
100 |
<li>no leading or trailing whitespace
|
|
|
101 |
</ul>
|
|
|
102 |
<P>Example matches: 1, 100, 2.3, -2.3, +2.3, -272.13, -.0, 2.
|
|
|
103 |
<P>Example mismatches: '1,000', '123 ', ' 123'.
|
|
|
104 |
*/
|
|
|
105 |
public static final String NUMBER =
|
|
|
106 |
"(?:-|\\+)?" +
|
|
|
107 |
"(" +
|
|
|
108 |
"[0-9]+" + "(" + DOT + "[0-9]*)?" +
|
|
|
109 |
OR +
|
|
|
110 |
DOT + "[0-9]+" +
|
|
|
111 |
")"
|
|
|
112 |
;
|
|
|
113 |
|
|
|
114 |
/**
|
|
|
115 |
Similar to {@link #NUMBER}, except the number of decimals, if present, is always 2.
|
|
|
116 |
|
|
|
117 |
<P>Example matches : <tt>1000, 100.25, .25, 0.25, -.13, -0.13</tt>
|
|
|
118 |
<P>Example mismatches : <tt>1,000.00, 100., 100.0, 100.123, -56.000, .123</tt>
|
|
|
119 |
*/
|
|
|
120 |
public static final String DOLLARS =
|
|
|
121 |
"(?:-|\\+)?" +
|
|
|
122 |
"(" +
|
|
|
123 |
"[0-9]+" + "(" + Regex.DOT + "[0-9]{2})?" +
|
|
|
124 |
Regex.OR +
|
|
|
125 |
Regex.DOT + "[0-9]{2}" +
|
|
|
126 |
")"
|
|
|
127 |
;
|
|
|
128 |
|
|
|
129 |
/**
|
|
|
130 |
An amount in any currency.
|
|
|
131 |
|
|
|
132 |
<P>There are two permitted decimal separators: <tt>'.'</tt> and <tt>','</tt>. The
|
|
|
133 |
permitted number of decimals is <tt>0,2,3</tt>.
|
|
|
134 |
|
|
|
135 |
<P>Example matches : <tt>'1000', '100.25', '.253', '0.25', '-.13', '-0.00'</tt>
|
|
|
136 |
<P>Example mismatches : <tt>'1,000.00', '100.', '100.0', '100.1234', ',1', ',1234'</tt>
|
|
|
137 |
|
|
|
138 |
<P>Note as well that <tt>'1,000'</tt> matches as well! A grouping separator
|
|
|
139 |
in one <tt>Locale</tt> is a decimal separator in others.
|
|
|
140 |
|
|
|
141 |
<P>Any <tt>String</tt> that matches this pattern will be accepted by
|
|
|
142 |
{@link java.math.BigDecimal#BigDecimal(String)}.
|
|
|
143 |
*/
|
|
|
144 |
public static final String MONEY =
|
|
|
145 |
"(?:-|\\+)?" +
|
|
|
146 |
"(" +
|
|
|
147 |
"[0-9]+((?:\\.|,)[0-9]{2,3})?" +
|
|
|
148 |
Regex.OR +
|
|
|
149 |
"(?:\\.|,)[0-9]{2,3}" +
|
|
|
150 |
")"
|
|
|
151 |
;
|
|
|
152 |
|
|
|
153 |
/**
|
|
|
154 |
An arbitrary number of digits. Has the following properties
|
|
|
155 |
<ul>
|
|
|
156 |
<li>value greater than or equal to <tt>0</tt>
|
|
|
157 |
<li>possible leading zeros, as in <tt>0100</tt>, or <tt>0002</tt>
|
|
|
158 |
<li>no decimal point
|
|
|
159 |
<li>no leading plus or minus sign
|
|
|
160 |
<li>no leading or trailing whitespace
|
|
|
161 |
</ul>
|
|
|
162 |
|
|
|
163 |
<P><em>Design Note:</em><br>
|
|
|
164 |
Allowing leading zeros is not a problem for creating <tt>Integer</tt> objects,
|
|
|
165 |
since the <tt>Integer(String)</tt> constructor allows them.
|
|
|
166 |
|
|
|
167 |
<P>Example matches: 0, 1, 2, 9, 10, 99, 789, 010, 0018.<br>
|
|
|
168 |
Example mismatches: -1, +1, 2.0, ' 0', '2 '.
|
|
|
169 |
*/
|
|
|
170 |
public static final String DIGITS = "(\\d)+";
|
|
|
171 |
|
|
|
172 |
/**
|
|
|
173 |
Return a regular expression corresponding to <tt>DIGITS</tt>, but having
|
|
|
174 |
number of digits in range <tt>1..aMaxNumDigits</tt>.
|
|
|
175 |
|
|
|
176 |
@param aMaxNumDigits must be <tt>1</tt> or more.
|
|
|
177 |
*/
|
|
|
178 |
public static String forNDigits(int aMaxNumDigits){
|
|
|
179 |
Args.checkForRange(aMaxNumDigits, 1, Integer.MAX_VALUE);
|
|
|
180 |
return "(\\d){1," + aMaxNumDigits + "}";
|
|
|
181 |
}
|
|
|
182 |
|
|
|
183 |
/**
|
|
|
184 |
Email address.
|
|
|
185 |
|
|
|
186 |
<P>The {@link javax.mail.internet.InternetAddress} class permits validation of an
|
|
|
187 |
email address. See also
|
|
|
188 |
{@link hirondelle.web4j.util.WebUtil#isValidEmailAddress(String)}. Thus, this
|
|
|
189 |
regex <em>should be used only when those classes are not available</em>.
|
|
|
190 |
*/
|
|
|
191 |
public static final String EMAIL_ADDR =
|
|
|
192 |
"\\w[-.\\w]*@" +
|
|
|
193 |
"[a-z0-9]+(\\.[a-z0-9]+)*" + DOT +
|
|
|
194 |
"(com|org|net|edu|gov|int|mil|biz|info|name|museum|coop|aero|[a-z][a-z])"
|
|
|
195 |
;
|
|
|
196 |
|
|
|
197 |
/**
|
|
|
198 |
Matches numbers in the range 0-255.
|
|
|
199 |
<P>Example: 1, 001, 010, 199, 255.
|
|
|
200 |
*/
|
|
|
201 |
private static final String IP_ADDR_ITEM =
|
|
|
202 |
"(?:[01]?\\d\\d?" +
|
|
|
203 |
OR +
|
|
|
204 |
"2[0-4]\\d" +
|
|
|
205 |
OR +
|
|
|
206 |
"25[0-5])"
|
|
|
207 |
;
|
|
|
208 |
|
|
|
209 |
/**
|
|
|
210 |
IP addresses.
|
|
|
211 |
<P>Example match: 1.01.001.255
|
|
|
212 |
*/
|
|
|
213 |
public static final String IP_ADDR =
|
|
|
214 |
"(?<![\\w.])" +
|
|
|
215 |
IP_ADDR_ITEM + DOT +
|
|
|
216 |
IP_ADDR_ITEM + DOT +
|
|
|
217 |
IP_ADDR_ITEM + DOT +
|
|
|
218 |
IP_ADDR_ITEM +
|
|
|
219 |
"(?![\\w.])"
|
|
|
220 |
;
|
|
|
221 |
|
|
|
222 |
|
|
|
223 |
/**
|
|
|
224 |
A positional regex which returns the position where lower case text is
|
|
|
225 |
immediately followed by upper case text.
|
|
|
226 |
|
|
|
227 |
<P>Intended for manipulation of text in camel hump style, which looks like this :
|
|
|
228 |
BlahBlahBlah, LoginName, EmailAddress.
|
|
|
229 |
|
|
|
230 |
<P>Example:<br>
|
|
|
231 |
To change 'LoginName' into the more user-friendly 'Login Name' (with an added space),
|
|
|
232 |
replace the matches returned by this regex with the replacement string <tt>' $1'</tt>.
|
|
|
233 |
*/
|
|
|
234 |
public static final String CAMEL_HUMP_TEXT = "(?<=[a-z0-9])([A-Z])";
|
|
|
235 |
|
|
|
236 |
/**
|
|
|
237 |
Simple identifier.
|
|
|
238 |
One or more letters/underscores, with possible trailing digits.
|
|
|
239 |
Matching examples include :
|
|
|
240 |
<ul>
|
|
|
241 |
<li><tt>blah</tt>
|
|
|
242 |
<li><tt>blah42</tt>
|
|
|
243 |
<li><tt>blah_42</tt>
|
|
|
244 |
<li><tt>BlahBlah</tt>
|
|
|
245 |
<li><tt>BLAH_BLAH</tt>
|
|
|
246 |
</ul>
|
|
|
247 |
*/
|
|
|
248 |
public static final String SIMPLE_IDENTIFIER = "([a-zA-Z_]+(?:\\d)*)";
|
|
|
249 |
|
|
|
250 |
/**
|
|
|
251 |
Scoped identifier.
|
|
|
252 |
|
|
|
253 |
<P>Either two {@link #SIMPLE_IDENTIFIER}s separated by a period, or a single
|
|
|
254 |
{@link #SIMPLE_IDENTIFIER}. The item before the period represents an <em>optional</em>
|
|
|
255 |
scoping qualifier. (This style is used by SQL statement identifiers, where the
|
|
|
256 |
scoping qualifier represents the target database.)
|
|
|
257 |
*/
|
|
|
258 |
public static final String SIMPLE_SCOPED_IDENTIFIER = "(?:[a-zA-Z_]+(?:\\d)*\\.)?(?:[a-zA-Z_]+(?:\\d)*)";
|
|
|
259 |
|
|
|
260 |
/**
|
|
|
261 |
A link or anchor tag.
|
|
|
262 |
|
|
|
263 |
<P>Here, <tt>HREF</tt> must be the first attribute to appear in the tag.
|
|
|
264 |
The following groups are defined :
|
|
|
265 |
<ul>
|
|
|
266 |
<li> group 1 - value of the HREF attr
|
|
|
267 |
<li> group 2 - all text after the HREF attr, but still inside the tag - the "remainder"
|
|
|
268 |
attributes
|
|
|
269 |
<li> group 3 - the body of the A tag
|
|
|
270 |
</ul>
|
|
|
271 |
*/
|
|
|
272 |
public static final String LINK =
|
|
|
273 |
"<a href=" + Regex.QUOTED_ATTR + "(" + Regex.ALL_BUT_END_OF_TAG + ")" + Regex.END_TAG
|
|
|
274 |
+ Regex.TRIMMED_TEXT + "</a>"
|
|
|
275 |
;
|
|
|
276 |
|
|
|
277 |
/**
|
|
|
278 |
Month in the Gregorian calendar: <tt>01..12</tt>.
|
|
|
279 |
*/
|
|
|
280 |
public static final String MONTH =
|
|
|
281 |
"(01|02|03|04|05|06|07|08|09|10|11|12)"
|
|
|
282 |
;
|
|
|
283 |
|
|
|
284 |
/**
|
|
|
285 |
Day of the month in the Gregorian calendar: <tt>01..31</tt>.
|
|
|
286 |
*/
|
|
|
287 |
public static final String DAY_OF_MONTH =
|
|
|
288 |
"(01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31)"
|
|
|
289 |
;
|
|
|
290 |
|
|
|
291 |
/** Hours in the day <tt>00..23</tt>. */
|
|
|
292 |
public static final String HOURS =
|
|
|
293 |
"(00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23)"
|
|
|
294 |
;
|
|
|
295 |
|
|
|
296 |
/** Minutes in an hour <tt>00..59</tt>. */
|
|
|
297 |
public static final String MINUTES =
|
|
|
298 |
"((0|1|2|3|4|5)\\d)"
|
|
|
299 |
;
|
|
|
300 |
|
|
|
301 |
/** Hours and minutes, in the form <tt>00:59</tt>. */
|
|
|
302 |
public static final String HOURS_AND_MINUTES =
|
|
|
303 |
HOURS + ":" + MINUTES
|
|
|
304 |
;
|
|
|
305 |
|
|
|
306 |
// PRIVATE //
|
|
|
307 |
|
|
|
308 |
/** Prevents instantiation of this class. */
|
|
|
309 |
private Regex(){
|
|
|
310 |
//emtpy
|
|
|
311 |
}
|
|
|
312 |
}
|