理解了下面这段示例代码,正则表达式就算学会了一半:
import re


def test_patterns(text, patterns=None):
    """Print every match of each pattern in ``text`` to stdout.

    The output starts with a blank line, a two-row column ruler (tens
    digit above units digit) and the text itself, so that the reported
    match positions can be read off visually.  Then, for each pattern, a
    ``Matching "<pattern>"`` header is printed followed by one line per
    match showing the start index, the *inclusive* end index and the
    matched substring.

    Args:
        text: The string to search.
        patterns: Iterable of regular-expression strings to try, in
            order.  ``None`` (the default) or an empty iterable prints
            only the ruler and the text.

    Returns:
        None.  All results are written to stdout.

    Raises:
        re.error: If a pattern is not a valid regular expression.
    """
    positions = range(len(text))

    # print('') rather than print(): under Python 2 a bare print() would
    # emit "()" instead of an empty line.
    print('')
    # Tens row: floor division keeps the digits integral on Python 3;
    # ``or ' '`` blanks out the leading zeros for columns 0-9.
    print(''.join(str(i // 10 or ' ') for i in positions))
    print(''.join(str(i % 10) for i in positions))
    print(text)

    # Look for each pattern in the text and print the results.
    for pattern in patterns or ():
        print('')
        print('Matching "%s"' % pattern)
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            # match.end() is exclusive; report the last matched index.
            print(' %2d : %2d = "%s"' % (s, e - 1, text[s:e]))


# The name starts with "test_" but this is a demo helper, not a unit
# test; keep pytest from trying to collect it.
test_patterns.__test__ = False


if __name__ == '__main__':
    print("*" * 50)
    # Pattern Syntax
    test_patterns('abbaaabbbbaaaaa', ['ab'])

    print("*" * 50)
    # Repetition
    test_patterns('abbaaabbbbaaaaa', [
        'ab*',      # a followed by zero or more b
        'ab+',      # a followed by one or more b
        'ab?',      # a followed by zero or one b
        'ab{3}',    # a followed by three b
        'ab{2,3}',  # a followed by two to three b
    ])

    print("*" * 50)
    # Character Sets
    test_patterns('abbaaabbbbaaaaa', [
        '[ab]',     # either a or b
        'a[ab]+',   # a followed by one or more a or b
        'a[ab]+?',  # a followed by one or more a or b, not greedy
    ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[^-. ]+',  # sequences without -, ., or space
    ])

    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[a-z]+',       # sequences of lower case letters
        '[A-Z]+',       # sequences of upper case letters
        '[a-zA-Z]+',    # sequences of lower or upper case letters
        '[A-Z][a-z]+',  # one upper case letter followed by lower case letters
    ])

    print("*" * 50)
    test_patterns('abbaaabbbbaaaaa', [
        'a.',     # a followed by any one character
        'b.',     # b followed by any one character
        'a.*b',   # a followed by anything, ending in b
        'a.*?b',  # a followed by anything, ending in b, not greedy
    ])

    print("*" * 50)
    # Escape Codes
    # Code  Meaning
    # \d    a digit
    # \D    a non-digit
    # \s    whitespace (tab, space, newline, etc.)
    # \S    non-whitespace
    # \w    alphanumeric
    # \W    non-alphanumeric
    test_patterns('This is a prime #1 example!', [
        r'\d+',  # sequence of digits
        r'\D+',  # sequence of non-digits
        r'\s+',  # sequence of whitespace
        r'\S+',  # sequence of non-whitespace
        r'\w+',  # alphanumeric characters
        r'\W+',  # non-alphanumeric
    ])

    print("*" * 50)
    # Escaping: match the escape codes themselves as literal text.
    test_patterns(r'\d+ \D+ \s+ \S+ \w+ \W+', [
        r'\\d\+',
        r'\\D\+',
        r'\\s\+',
        r'\\S\+',
        r'\\w\+',
        r'\\W\+',
    ])

    # Anchoring
    # Code  Meaning
    # ^     start of string, or line
    # $     end of string, or line
    # \A    start of string
    # \Z    end of string
    # \b    empty string at the beginning or end of a word
    # \B    empty string not at the beginning or end of a word
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        r'^\w+',      # word at start of string
        r'\A\w+',     # word at start of string
        r'\w+\S*$',   # word at end of string, with optional punctuation
        r'\w+\S*\Z',  # word at end of string, with optional punctuation
        r'\w*t\w*',   # word containing 't'
        r'\bt\w+',    # 't' at start of word
        r'\w+t\b',    # 't' at end of word
        r'\Bt\B',     # 't', not start or end of word
    ])

# Output:
************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "ab" 0 : 1 = "ab" 5 : 6 = "ab" ************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "ab*" 0 : 2 = "abb" 3 : 3 = "a" 4 : 4 = "a" 5 : 9 = "abbbb" 10 : 10 = "a" 11 : 11 = "a" 12 : 12 = "a" 13 : 13 = "a" 14 : 14 = "a" Matching "ab+" 0 : 2 = "abb" 5 : 9 = "abbbb" Matching "ab?" 0 : 1 = "ab" 3 : 3 = "a" 4 : 4 = "a" 5 : 6 = "ab" 10 : 10 = "a" 11 : 11 = "a" 12 : 12 = "a" 13 : 13 = "a" 14 : 14 = "a" Matching "ab{3}" 5 : 8 = "abbb" Matching "ab{2,3}" 0 : 2 = "abb" 5 : 8 = "abbb" ************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "[ab]" 0 : 0 = "a" 1 : 1 = "b" 2 : 2 = "b" 3 : 3 = "a" 4 : 4 = "a" 5 : 5 = "a" 6 : 6 = "b" 7 : 7 = "b" 8 : 8 = "b" 9 : 9 = "b" 10 : 10 = "a" 11 : 11 = "a" 12 : 12 = "a" 13 : 13 = "a" 14 : 14 = "a" Matching "a[ab]+" 0 : 14 = "abbaaabbbbaaaaa" Matching "a[ab]+?" 0 : 1 = "ab" 3 : 4 = "aa" 5 : 6 = "ab" 10 : 11 = "aa" 12 : 13 = "aa" ************************************************** 1111111111222222222233333333 01234567890123456789012345678901234567 This is some text -- with punctuation. Matching "[^-. ]+" 0 : 3 = "This" 5 : 6 = "is" 8 : 11 = "some" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" ************************************************** 1111111111222222222233333333 01234567890123456789012345678901234567 This is some text -- with punctuation. Matching "[a-z]+" 1 : 3 = "his" 5 : 6 = "is" 8 : 11 = "some" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" Matching "[A-Z]+" 0 : 0 = "T" Matching "[a-zA-Z]+" 0 : 3 = "This" 5 : 6 = "is" 8 : 11 = "some" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" Matching "[A-Z][a-z]+" 0 : 3 = "This" ************************************************** 11111 012345678901234 abbaaabbbbaaaaa Matching "a." 0 : 1 = "ab" 3 : 4 = "aa" 5 : 6 = "ab" 10 : 11 = "aa" 12 : 13 = "aa" Matching "b." 
1 : 2 = "bb" 6 : 7 = "bb" 8 : 9 = "bb" Matching "a.*b" 0 : 9 = "abbaaabbbb" Matching "a.*?b" 0 : 1 = "ab" 3 : 6 = "aaab" ************************************************** 11111111112222222 012345678901234567890123456 This is a prime #1 example! Matching "\d+" 17 : 17 = "1" Matching "\D+" 0 : 16 = "This is a prime #" 18 : 26 = " example!" Matching "\s+" 4 : 4 = " " 7 : 7 = " " 9 : 9 = " " 15 : 15 = " " 18 : 18 = " " Matching "\S+" 0 : 3 = "This" 5 : 6 = "is" 8 : 8 = "a" 10 : 14 = "prime" 16 : 17 = "#1" 19 : 26 = "example!" Matching "\w+" 0 : 3 = "This" 5 : 6 = "is" 8 : 8 = "a" 10 : 14 = "prime" 17 : 17 = "1" 19 : 25 = "example" Matching "\W+" 4 : 4 = " " 7 : 7 = " " 9 : 9 = " " 15 : 16 = " #" 18 : 18 = " " 26 : 26 = "!" ************************************************** 1111111111222 01234567890123456789012 \d+ \D+ \s+ \S+ \w+ \W+ Matching "\\d\+" 0 : 2 = "\d+" Matching "\\D\+" 4 : 6 = "\D+" Matching "\\s\+" 8 : 10 = "\s+" Matching "\\S\+" 12 : 14 = "\S+" Matching "\\w\+" 16 : 18 = "\w+" Matching "\\W\+" 20 : 22 = "\W+" ************************************************** 1111111111222222222233333333 01234567890123456789012345678901234567 This is some text -- with punctuation. Matching "^\w+" 0 : 3 = "This" Matching "\A\w+" 0 : 3 = "This" Matching "\w+\S*$" 26 : 37 = "punctuation." Matching "\w+\S*\Z" 26 : 37 = "punctuation." Matching "\w*t\w*" 13 : 16 = "text" 21 : 24 = "with" 26 : 36 = "punctuation" Matching "\bt\w+" 13 : 16 = "text" Matching "\w+t\b" 13 : 16 = "text" Matching "\Bt\B" 23 : 23 = "t" 30 : 30 = "t" 33 : 33 = "t"待续...