最近项目中有一些地方用到了正则表达式,之前对这个东西了解不多,这次正好多看了一些,也发现了正则表达式的一些需要注意的地方。
正则总结:
正则表达式的优势是进行样式匹配,而不是具体的逻辑处理;
元字符注意使用英文字符,使用中文符号不会报错,但意义不同了;
注意零长度匹配的情况:可以匹配空串的表达式(如 a?)在没有 a 的位置也会得到一个长度为 0 的匹配;
逆向引用(反向引用):在 Java 正则表达式内部使用“\ 加数字”(如 \1),在代码的替换字符串(如 replaceAll 的第二个参数)中使用“$ 加数字”(如 $1);
Java 7 之前的正则引擎不支持命名捕获组,从 Java 7 开始支持该功能(语法为 (?<name>...));
前向断言和后向断言中的表达式应当是具体明确或是长度确定的,原因是正则引擎不会对断言的内容进行回溯操作。
比如下面的demo里展示的一些点:
package demo.regex;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Console walkthrough of a handful of {@code java.util.regex} behaviours:
 * zero-length matches, greedy / lazy / possessive quantifiers, single-line
 * versus multi-line anchors, capturing groups with back-references, and
 * look-around assertions.
 *
 * <p>Every demo writes its findings to {@code System.out}; nothing is returned.
 */
public class RegexDemo {

    /**
     * Runs all demos in a fixed order: zero-length, match mode, line mode,
     * group, assertion.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        testZerolength();
        testMatchMode();
        testLineMode();
        testGroup();
        testAssertion();
    }

    /**
     * Prints an empty line followed by the given section title.
     *
     * @param title text printed on the line after the blank one
     */
    private static void printHeader(String title) {
        System.out.println();
        System.out.println(title);
    }

    /**
     * Runs the same "anything, then foo" idea against one input with a greedy,
     * a lazy and a possessive quantifier, printing the label, the pattern and
     * every match for each variant.
     */
    private static void testMatchMode() {
        printHeader("test match mode");

        final String input = "xfooxxxxxxfoo";
        System.out.println(input);

        // Each row is {label printed in front of the pattern, pattern}.
        final String[][] variants = {
            {"greedy mode :", ".*foo"},
            {"lazy mode :", ".*?foo"},
            {"possessive mode :", ".*+foo"},
        };
        for (String[] variant : variants) {
            final String label = variant[0];
            final String expression = variant[1];
            System.out.println(label + expression);
            printMatch(input, Pattern.compile(expression));
        }
    }

    /**
     * Shows that an expression which may match the empty string ({@code a?})
     * still reports matches at positions where no {@code a} is present.
     * Replacing the pattern with {@code a+} is a useful comparison.
     */
    private static void testZerolength() {
        printHeader("test Zero-length");
        printMatch("aaaaabb", Pattern.compile("a?"));
    }

    /**
     * Demonstrates look-behind / look-ahead on a Chinese sentence, then uses
     * look-behind to insert thousands separators into digit strings.
     */
    private static void testAssertion() {
        printHeader("test assertion");

        final String sentence = "前赴后继,前无古人后无来者,前事不忘后事之师";
        // Alternatives that were tried for the part between the assertions:
        //   "(?<=[前])\\w+(?=[后])"     - does not work on this input
        //   "(?<=[前])[^\\s]+(?=[后])"  - greedy
        // The lazy form below is the one in use.
        final Pattern betweenMarkers = Pattern.compile("(?<=[前])[^\\s]+?(?=[后])");
        printMatch(sentence, betweenMarkers);

        System.out.println("test spilt num");
        // Expected lines: 1,234,567,890 and 123,456,789
        for (String digits : new String[] {"1234567890", "123456789"}) {
            System.out.println(groupThousands(digits));
        }
    }

    /**
     * Applies the two replacement passes used by the separator demo.
     *
     * @param digits the text to process
     * @return the text after both replacements
     */
    private static String groupThousands(String digits) {
        // Pass 1: put a comma after the leading 1-3 digits when the remainder
        // consists of whole groups of three digits.
        final String headSeparated = digits.replaceAll("^(\\d{1,3})((\\d{3})+)$", "$1,$2");
        // Pass 2: put a comma in front of every three-digit run that directly
        // follows three digits.
        return headSeparated.replaceAll("(?<=\\d{3})(\\d{3})", ",$1");
    }

    /**
     * Demonstrates capturing groups: a back-reference inside a pattern
     * ({@code \1}), group references in a replacement string ({@code $1},
     * {@code $2}), and greedy versus lazy matching of tag-like text.
     */
    private static void testGroup() {
        printHeader("test group");

        // Back-reference inside the pattern: two digits followed by the same two digits.
        final String repeated = "1212";
        final String repeatedPair = "(\\d\\d)\\1";
        System.out.println(repeatedPair + " for " + repeated);
        printMatch(repeated, Pattern.compile(repeatedPair));

        // Swap the two sides of the dash via group references in the replacement.
        // "(\\w+)\\W*(\\w+)" was tried first and does not work on this input.
        // Inside the replacement string the syntax is $n; "\\2-\\1" is not the
        // way to refer to groups there.
        final String pair = "中国-CN";
        final String swapped = pair.replaceAll("([^\\-]+)\\-(\\w+)", "$2-$1");
        System.out.println(pair);
        System.out.println(swapped);

        final String titleMarkup = "<title>This is a demo</title>";
        printMatch(titleMarkup, Pattern.compile("<.+>"));   // greedy
        printMatch(titleMarkup, Pattern.compile("<.+?>"));  // lazy

        // The closing tag must repeat whatever group 1 captured in the opening
        // tag. A simpler variant without attribute handling would be
        // "<([^>]+?)>.*?</\\1>".
        final Pattern pairedTag = Pattern.compile("<([^>]+?)\\s*?.*?>.*?</\\1>");
        final String scriptMarkup =
                "<script language=\"JavaScript\" type=\"text/javascript\"></script>";
        printMatch(titleMarkup, pairedTag);
        printMatch(scriptMarkup, pairedTag);
    }

    /**
     * Matches the same {@code ^}-anchored pattern against a multi-line text,
     * first with default flags and then with {@link Pattern#MULTILINE}.
     *
     * <p>With multi-line off, "^" matches the start of the string and "$" the
     * end of the string. With multi-line on, "^" additionally matches the
     * position after a "\n" or "\r", and "$" the position before one. Put
     * simply, multi-line mode treats the text as several single lines split on
     * "\r" or "\n"; it only matters in combination with ^ or $.
     */
    protected static void testLineMode() {
        System.out.println();

        final String verses = "山清水秀\r\n 山穷水尽 \r\n山舞银蛇,原驰蜡象,欲与天公试比高。\r\n高山仰止";
        // Greedy form; the lazy counterpart would be "^山.+?".
        final String lineStartingWithShan = "^山.+";

        System.out.println("单行模式:");
        printMatch(verses, Pattern.compile(lineStartingWithShan));

        printHeader("多行模式:");
        printMatch(verses, Pattern.compile(lineStartingWithShan, Pattern.MULTILINE));
    }

    /**
     * Prints one line per match of the pattern in the text, numbered from 1,
     * showing the matched text and its start and end offsets.
     *
     * @param content text to search
     * @param p1      compiled pattern to apply
     */
    protected static void printMatch(String content, Pattern p1) {
        final Matcher hits = p1.matcher(content);
        for (int ordinal = 1; hits.find(); ordinal++) {
            final StringBuilder line = new StringBuilder("match found ");
            line.append(ordinal)
                .append(":\"")
                .append(hits.group())
                .append("\",start:")
                .append(hits.start())
                .append(",end:")
                .append(hits.end());
            System.out.println(line.toString());
        }
    }
}
运行结果如下:
test Zero-length match found 1:"a",start:0,end:1 match found 2:"a",start:1,end:2 match found 3:"a",start:2,end:3 match found 4:"a",start:3,end:4 match found 5:"a",start:4,end:5 match found 6:"",start:5,end:5 match found 7:"",start:6,end:6 match found 8:"",start:7,end:7 test match mode xfooxxxxxxfoo greedy mode :.*foo match found 1:"xfooxxxxxxfoo",start:0,end:13 lazy mode :.*?foo match found 1:"xfoo",start:0,end:4 match found 2:"xxxxxxfoo",start:4,end:13 possessive mode :.*+foo 单行模式: match found 1:"山清水秀",start:0,end:4 多行模式: match found 1:"山清水秀",start:0,end:4 match found 2:"山舞银蛇,原驰蜡象,欲与天公试比高。",start:14,end:32 test group (\d\d)\1 for 1212 match found 1:"1212",start:0,end:4 中国-CN CN-中国 match found 1:"<title>This is a demo</title>",start:0,end:29 match found 1:"<title>",start:0,end:7 match found 2:"</title>",start:21,end:29 match found 1:"<title>This is a demo</title>",start:0,end:29 match found 1:"<script language="JavaScript" type="text/javascript"></script>",start:0,end:62 test assertion match found 1:"赴",start:1,end:2 match found 2:"无古人",start:6,end:9 match found 3:"事不忘",start:15,end:18 test spilt num 1,234,567,890 123,456,789
demo下载:http://download.csdn.net/download/candyguy242/4534068