Python: Regular Expression

Reference : https://www.w3resource.com/python/python-regular-expression.php

A regular expression (or RE) specifies a set of strings that matches it; the functions in this module let you check if a particular string matches a given regular expression (or if a given regular expression matches a particular string, which comes down to the same thing).
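
As a minimal sketch of that idea, re.search() looks for a match anywhere in the string, while re.match() only tries to match at the beginning (the pattern and sample string below are illustrative):

>>> import re
>>> re.search(r'\d+', 'order 66') is not None   # a match anywhere in the string
True
>>> re.match(r'\d+', 'order 66') is not None    # must match at the start of the string
False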

The following table lists some of the special pattern-matching characters that can be used in regular expressions.

notation description
.        Matches any character except a newline.
^        Matches at the start of the string.
$        Matches only at the end of the string.
*        Matches 0 or more repetitions of the preceding RE.
+        Matches 1 or more repetitions of the preceding RE.
?        Matches 0 or 1 repetition of the preceding RE.
{m,n}    Matches from m to n repetitions of the preceding RE.
[...]    Matches any single character from the set.
|        Matches either the RE before or the RE after the bar.
(...)    Groups a regular expression and captures the matched text.
\        Escapes a special character or introduces a special sequence such as \d, \w or \s.
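
A few of these can be tried directly in the interpreter; a small illustrative sketch:

>>> import re
>>> re.search(r'ing$', 'matching') is not None    # $ anchors the match to the end
True
>>> re.search(r'^mat', 'matching') is not None    # ^ anchors the match to the start
True
>>> re.findall(r'a.c', 'abc adc aXc')             # . matches any character except a newline
['abc', 'adc', 'aXc']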

Compare HTML tags :

tag type    format        example
open tag    <[^/>][^>]*>  <a>, <table>
close tag   </[^>]+>      </p>, </a>
self close  <[^/>]+/>     <br />
all tag     <[^>]+>       <br />, <a>
# open tag
>>> import re
>>> re.search('<[^/>][^>]*>', '<table>') != None
True
>>> import re
>>> re.search('<[^/>][^>]*>', '<a href="#label">') != None
True
>>> import re
>>> re.search('<[^/>][^>]*>', '<img src="/img">') != None
True
>>> import re
>>> re.search('<[^/>][^>]*>', '</table>') != None
False

# close tag
>>> import re
>>> re.search('</[^>]+>', '</table>') != None
True

# self close
>>> import re
>>> re.search('<[^/>]+/>', '<br />') != None
True

Match strings with re.findall() :

# extract all words from a string
>>> import re
>>> source = "split all string"
>>> re.findall(r'[\w]+', source)
['split', 'all', 'string']
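
Note that when the pattern contains capturing groups, re.findall() returns the captured groups rather than the whole match; a short sketch with illustrative values:

# findall with capturing groups returns tuples of the groups
>>> import re
>>> re.findall(r'(\w+)=(\d+)', 'x=1, y=2')
[('x', '1'), ('y', '2')]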

# parsing the w3resource website
>>> import urllib.request
>>> import re
>>> x = urllib.request.urlopen('https://www.w3resource.com')
>>> html = x.read().decode('utf-8')
>>> x.close()
>>> print("open tags")
open tags
>>> re.findall('<[^/>][^>]*>', html)[0:2]
['<!doctype html>', '<!-[if lt IE 7]>']
>>> print("close tags")
close tags
>>> re.findall('</[^>]+>', html)[0:2]
['</script>', '</title>']
>>> print("self-closing tags")
self-closing tags
>>> re.findall('<[^/>]+/>', html)[0:2]

Group Comparison :

# (...) group a regular expression
>>> import re
>>> mon = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2018-09-01')
>>> mon
<_sre.SRE_Match object at 0x019A72F0>
>>> mon.groups()
('2018', '09', '01')
>>> mon.group()
'2018-09-01'
>>> mon.group(1)
'2018'
>>> mon.group(2)
'09'
>>> mon.group(3)
'01'

# Nesting groups
>>> import re
>>> mon = re.search(r'(((\d{4})-\d{2})-\d{2})', '2018-09-01')
>>> mon.groups()
('2018-09-01', '2018-09', '2018')
>>> mon.group()
'2018-09-01'
>>> mon.group(1)
'2018-09-01'
>>> mon.group(2)
'2018-09'
>>> mon.group(3)
'2018'

Non-capturing group :

# non capturing group
>>> import re
>>> url = 'http://w3resource.com/'
>>> mon = re.search('(?:http|ftp)://([^/\r\n]+)(/[^\r\n]*)?', url)
>>> mon.groups()
('w3resource.com', '/')

# capturing group
>>> import re
>>> mon = re.search('(http|ftp)://([^/\r\n]+)(/[^\r\n]*)?', url)
>>> mon.groups()
('http', 'w3resource.com', '/')

Back Reference :

# compare 'xx', 'yy'
>>> import re
>>> re.search(r'([a-z])\1$','xx') != None
True
>>> import re
>>> re.search(r'([a-z])\1$','yy') != None
True
>>> import re
>>> re.search(r'([a-z])\1$','xy') != None
False

# compare open tag and close tag
>>> import re
>>> pattern = r'<([^>]+)>[\s\S]*?</\1>'
>>> re.search(pattern, '<bold> test </bold>') != None
True
>>> re.search(pattern, '<h1> test </h1>') != None
True
>>> re.search(pattern, '<bold> test </h1>') != None
False

Named Grouping (?P<name>) :

# group reference (?P<name>...)
>>> import re
>>> pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
>>> mon = re.search(pattern, '2018-09-01')
>>> mon.group('year')
'2018'
>>> mon.group('month')
'09'
>>> mon.group('day')
'01'
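
Match.groupdict() collects all of the named groups into a dictionary in a single call; a small sketch continuing the example above:

# collect all named groups into a dict
>>> mon.groupdict()
{'year': '2018', 'month': '09', 'day': '01'}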

# back reference (?P=name)
>>> import re
>>> re.search('^(?P<char>[a-z])(?P=char)','aa')
<_sre.SRE_Match object at 0x01A08660>

Substitute String :

# basic substitute
>>> import re
>>> res = "4x5y6z"
>>> re.sub(r'[a-z]',' ', res)
'4 5 6 '

# substitute with group reference
>>> import re
>>> date = r'2018-09-01'
>>> re.sub(r'(\d{4})-(\d{2})-(\d{2})',r'\2/\3/\1/',date)
'09/01/2018/'

# camelcase to underscore
>>> def convert(s):
...     res = re.sub(r'(.)([A-Z][a-z]+)',r'\1_\2', s)
...     return re.sub(r'([a-z])([A-Z])',r'\1_\2', res).lower()
...
>>> convert('SentenceCase')
'sentence_case'
>>> convert('SentenceSentenceCase')
'sentence_sentence_case'
>>> convert('SampleExampleHTTPServer')
'sample_example_http_server'

Look around :

notation compare direction
(?<=…)   right to left (positive lookbehind)
(?=…)    left to right (positive lookahead)
(?<!…)   right to left (negative lookbehind)
(?!…)    left to right (negative lookahead)
# basic
>>> import re
>>> re.sub(r'(?=\d{3})', ' ', '56789')
' 5 6 789'
>>> re.sub(r'(?!\d{3})', ' ', '56789')
'567 8 9 '
>>> re.sub(r'(?<=\d{3})', ' ', '56789')
'567 8 9 '
>>> re.sub(r'(?<!\d{3})', ' ', '56789')
' 5 6 789'
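
The substitutions above insert a space at every position where the assertion holds. For a more direct sketch, the same assertions can be used inside re.search() (the dollar-amount strings here are illustrative):

>>> import re
>>> re.search(r'\d+(?= dollars)', '100 dollars').group()   # lookahead: digits followed by ' dollars'
'100'
>>> re.search(r'(?<=\$)\d+', 'price: $25').group()         # lookbehind: digits preceded by '$'
'25'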

Match common username or password :

>>> import re
>>> re.match('^[a-zA-Z0-9-_]{3,16}$', 'Foo') is not None
True
>>> re.match(r'^[\w-]{3,16}$', 'Foo') is not None
True

Match hex color value :

>>> import re
>>> re.match('^#?([a-f0-9]{6}|[a-f0-9]{3})$', '#ff0000')
<_sre.SRE_Match object at 0x019E7720>
>>> re.match('^#?([a-f0-9]{6}|[a-f0-9]{3})$', '#000000')
<_sre.SRE_Match object at 0x019E77A0>

Match email :

>>> import re
>>> re.match(r'^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$',
...          'citi.world@example.com')
<_sre.SRE_Match object; span=(0, 22), match='citi.world@example.com'>

# or

>>> import re
>>> example = re.compile(r'''^([a-zA-Z0-9._%-]+@
... [a-zA-Z0-9.-]+
... \.[a-zA-Z]{2,4})*$''', re.X)
>>> example.match('citi.world@example.citi.com')
<_sre.SRE_Match object; span=(0, 27), match='citi.world@example.citi.com'>
>>> example.match('citi%world@example.citi.com')
<_sre.SRE_Match object; span=(0, 27), match='citi%world@example.citi.com'>

Match URL :

>>> import re
>>> example = re.compile(r'''^(https?:\/\/)? # match http or https
...             ([\da-z\.-]+)            # match domain
...             \.([a-z\.]{2,6})         # match top-level domain
...             ([\/\w \.-]*)\/?$        # match api or file
...             ''', re.X)
>>> example.match('www.yahoo.com')
<_sre.SRE_Match object; span=(0, 13), match='www.yahoo.com'>
>>> example.match('http://www.example')
<_sre.SRE_Match object; span=(0, 18), match='http://www.example'>
>>> example.match('http://www.example/w3r.html')
<_sre.SRE_Match object; span=(0, 27), match='http://www.example/w3r.html'>
>>> example.match('http://www.example/w3r!.html')
>>> example
re.compile('^(https?:\\/\\/)?\n([\\da-z\\.-]+)\n\\.([a-z\\.]{2,6})\n([\\/\\w \\.-]*)\\/?$\n', re.VERBOSE)

Match IP address :

notation        description
[1]?[0-9][0-9]? Match 0-199 pattern
2[0-4][0-9]     Match 200-249 pattern
25[0-5]         Match 250-255 pattern
(?:…)           Group without capturing
>>> import re
>>> example = re.compile(r'''^(?:(?:25[0-5]
... |2[0-4][0-9]
... |[1]?[0-9][0-9]?)\.){3}
... (?:25[0-5]
... |2[0-4][0-9]
... |[1]?[0-9][0-9]?)$''', re.X)
>>> example.match('192.168.1.1')
<_sre.SRE_Match object at 0x0134A608>
>>> example.match('255.255.255.0')
<_sre.SRE_Match object at 0x01938678>
>>> example.match('172.17.0.5')
<_sre.SRE_Match object at 0x0134A608>
>>> example.match('256.0.0.0') is None
True

Match MAC address :

>>> import random
>>> mac = [random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b),
... random.randint(0x00, 0x6b)]
>>> mac = ':'.join(map(lambda x: "%02x" % x, mac))
>>> mac
'05:38:64:60:55:63'
>>> import re
>>> example = re.compile(r'''[0-9a-f]{2}([:])
... [0-9a-f]{2}
... (\1[0-9a-f]{2}){4}$''', re.X)
>>> example.match(mac) is not None
True

Lexer :

>>> import re
>>> from collections import namedtuple
>>> tokens = [r'(?P<NUMBER>\d+)',
...           r'(?P<PLUS>\+)',
...           r'(?P<MINUS>-)',
...           r'(?P<TIMES>\*)',
...           r'(?P<DIVIDE>/)',
...           r'(?P<WS>\s+)']
>>> lex = re.compile('|'.join(tokens))
>>> Token = namedtuple('Token', ['type', 'value'])
>>> def tokenize(text):
...     scan = lex.scanner(text)
...     return (Token(m.lastgroup, m.group())
...             for m in iter(scan.match, None) if m.lastgroup != 'WS')
...
>>> for _t in tokenize('9 + 5 * 2 - 7'):
...     print(_t)
...
Token(type='NUMBER', value='9')
Token(type='PLUS', value='+')
Token(type='NUMBER', value='5')
Token(type='TIMES', value='*')
Token(type='NUMBER', value='2')
Token(type='MINUS', value='-')
Token(type='NUMBER', value='7')
>>> tokens
['(?P<NUMBER>\\d+)', '(?P<PLUS>\\+)', '(?P<MINUS>-)', '(?P<TIMES>\\*)', '(?P<DIVIDE>/)', '(?P<WS>\\s+)']