Python re 模块,L 实例源码
我们从Python开源项目中,提取了以下33个代码示例,用于说明如何使用re.L。
def preprocessing(content):
    """Strip digits, punctuation, newlines and extra whitespace from *content*.

    Steps, in order:
      1. remove every run of digits;
      2. remove all ASCII punctuation (``string.punctuation``);
      3. remove newlines and the extra punctuation listed in ``remove_punc``;
      4. collapse repeated whitespace via ``parsing.strip_multiple_whitespaces``
         (best effort: on failure a warning is printed and the text is
         returned without that final step).

    :param content: text to clean.
    :return: the cleaned text.
    """
    # NOTE(review): the '?' entries look like full-width punctuation that was
    # lost in an encoding round-trip -- confirm against the upstream source.
    remove_punc = ('? ? ? ? ? ? ? ? ? —').split(' ')

    # Step 1: digits only (use r'\w+' instead to also drop Latin letters).
    # The re.L flag is gone: it never influenced \d, and Python 3.6+ raises
    # ValueError when LOCALE is combined with a str pattern. '\d+' gives the
    # same result as '\d*' without substituting empty matches.
    digits = re.compile(r'\d+')
    content = digits.sub("", content)

    # Step 2: ASCII punctuation.
    ascii_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    content = ascii_punctuation.sub("", content)

    # Step 3: newlines and the extra punctuation characters.
    content = content.replace('\n', '')
    for punc in remove_punc:
        content = content.replace(punc, '')

    # Step 4: best-effort whitespace collapsing. Catch Exception rather than
    # everything so KeyboardInterrupt/SystemExit still propagate.
    try:
        content = parsing.strip_multiple_whitespaces(content)
    except Exception:
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('Warning : failed to strip whitespaces @ ')
    return content
def defSyntax(self):
    '''Define re patterns according to syntax.

    Reads ``self.syntax`` ('markdown' or 'zim') and sets three attributes:

    * ``self._img_re``    -- compiled pattern locating an image reference;
    * ``self._h_re_base`` -- verbose header pattern template with one ``%s``
      placeholder for the allowed marker-count range;
    * ``self._all_h_re``  -- the template compiled for 1 to 6 markers.

    The patterns are compiled without re.L: nothing in them is
    locale-sensitive, and Python 3.6+ raises ValueError when LOCALE is
    combined with a str pattern.

    Raises:
        ValueError: if ``self.syntax`` is not a known syntax name.
    '''
    #------------------REGEX patterns------------------
    if self.syntax == 'markdown':
        self._img_re = re.compile('^(.*)!\\[(.+?)\\]\\((.+?)\\)', re.M)
        # Verbose (re.X) pattern: the first alternative is an underlined
        # header, the second a '#'-prefixed one.
        self._h_re_base = r'''
(^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
|
(^(\#{%s}) # \1 = string of #'s
[ \t]*
(.+?) # \2 = Header text
[ \t]*
(?<!\\) # ensure not an escaped trailing '#'
\#* # optional closing #'s (not counted)
\n+
)
'''
    elif self.syntax == 'zim':
        self._img_re = re.compile('^(.*)\\{\\{(.+?)\\}\\}(.*)$', re.M)
        # Verbose (re.X) pattern: header text wrapped in matching '=' runs.
        self._h_re_base = r'''
^(\={%s}) # \1 = string of ='s
[ \t]*
(.+?) # \2 = Header text
[ \t]*
\1
\n+
'''
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Unknown syntax %s" % self.syntax)

    # Both syntaxes compile the template the same way.
    self._all_h_re = re.compile(self._h_re_base % '1,6', re.X | re.M)
    return
def test_from_re(self):
    """Check how RegExp.from_re handles compiled patterns per flag."""
    # re.U and re.S are treated as implicitly set: the result equals a
    # plain RegExp("a").
    for implied in (re.U, re.S):
        converted = RegExp.from_re(re.compile("a", implied))
        self.assertEqual(converted, RegExp("a"))

    # re.I maps onto the explicit ignore_case option.
    insensitive = RegExp.from_re(re.compile("a", re.I))
    self.assertEqual(insensitive, RegExp("a", ignore_case=True))

    # re.M, re.L and re.X must be rejected by from_re. The pattern is
    # compiled outside the assertRaises block so that only from_re itself
    # is expected to raise.
    # NOTE(review): Python 3.6+ refuses re.L on a str pattern at compile
    # time -- confirm which interpreter this test targets.
    for forbidden in (re.M, re.L, re.X):
        compiled = re.compile("a", forbidden)
        with self.assertRaises(ValueError):
            RegExp.from_re(compiled)
def iternext(self):
    """
    Return the next token of the search string and advance the cursor.

    A backslash followed by text that matches the search-reference pattern
    is returned as one token (backslash plus the matched group); a POSIX
    style bracket expression is returned whole; anything else is returned
    one character at a time.

    Raises StopIteration once the cursor has passed the last index, and
    SyntaxError when a Unicode name or (inverse) Unicode property escape
    is given without its braces.
    """
    if self.index > self.max_index:
        raise StopIteration

    start = self.index
    token = self.string[start:start + 1]

    if token == self._b_slash:
        found = self._re_search_ref.match(self.string[start + 1:])
        if found:
            escaped = found.group(0)
            # A lone long-form reference letter means its {...} part is
            # missing; tell the user which format was expected.
            if len(escaped) == 1 and escaped in self._long_search_refs:
                if escaped == self._unicode_name:
                    raise SyntaxError('Format for Unicode name is \\N{name}!')
                if escaped == self._uni_prop:
                    raise SyntaxError('Format for Unicode property is \\p{property}!')
                if escaped == self._inverse_uni_prop:
                    raise SyntaxError('Format for inverse Unicode property is \\P{property}!')
            first, second = found.group(1), found.group(2)
            token += first if first else second
    elif token == self._ls_bracket:
        found = self._re_posix.match(self.string[start:])
        if found:
            token = found.group(0)

    # Step over everything that was consumed and remember it.
    self.index += len(token)
    self.current = token
    return self.current
# Templates
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertNotEqual(re.compile('^pattern$', flag), None)
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertTrue(re.compile('^pattern$', flag))
def check_en_US_iso88591(self):
    """Check locale-aware case folding of bytes under en_US.iso88591.

    Switches LC_CTYPE to 'en_US.iso88591' and asserts that the bytes 0xC5
    and 0xE5 match each other case-insensitively, both with the
    ``re.L | re.I`` flags argument and with the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    locale_icase = re.L | re.I
    # Each entry is the exact argument list handed to re.match.
    match_calls = (
        (b'\xc5\xe5', b'\xc5\xe5', locale_icase),
        (b'\xc5', b'\xe5', locale_icase),
        (b'\xe5', b'\xc5', locale_icase),
        (b'(?Li)\xc5\xe5', b'\xc5\xe5'),
        (b'(?Li)\xc5', b'\xe5'),
        (b'(?Li)\xe5', b'\xc5'),
    )
    for call_args in match_calls:
        self.assertTrue(re.match(*call_args))
def check_en_US_utf8(self):
    """Check locale-aware case folding of bytes under en_US.utf8.

    Switches LC_CTYPE to 'en_US.utf8' and asserts that the two-byte
    sequence still matches itself, while the single bytes 0xC5 and 0xE5
    do not match each other case-insensitively -- with both the
    ``re.L | re.I`` flags argument and the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    locale_icase = re.L | re.I
    # (should_match, exact argument list handed to re.match)
    expectations = (
        (True, (b'\xc5\xe5', b'\xc5\xe5', locale_icase)),
        (False, (b'\xc5', b'\xe5', locale_icase)),
        (False, (b'\xe5', b'\xc5', locale_icase)),
        (True, (b'(?Li)\xc5\xe5', b'\xc5\xe5')),
        (False, (b'(?Li)\xc5', b'\xe5')),
        (False, (b'(?Li)\xe5', b'\xc5')),
    )
    for should_match, call_args in expectations:
        found = re.match(*call_args)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertTrue(re.compile('^pattern$', flag))
def check_en_US_iso88591(self):
    """Check locale-aware case folding of bytes under en_US.iso88591.

    Switches LC_CTYPE to 'en_US.iso88591' and asserts that the bytes 0xC5
    and 0xE5 match each other case-insensitively, both with the
    ``re.L | re.I`` flags argument and with the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    locale_icase = re.L | re.I
    # Each entry is the exact argument list handed to re.match.
    match_calls = (
        (b'\xc5\xe5', b'\xc5\xe5', locale_icase),
        (b'\xc5', b'\xe5', locale_icase),
        (b'\xe5', b'\xc5', locale_icase),
        (b'(?Li)\xc5\xe5', b'\xc5\xe5'),
        (b'(?Li)\xc5', b'\xe5'),
        (b'(?Li)\xe5', b'\xc5'),
    )
    for call_args in match_calls:
        self.assertTrue(re.match(*call_args))
def check_en_US_utf8(self):
    """Check locale-aware case folding of bytes under en_US.utf8.

    Switches LC_CTYPE to 'en_US.utf8' and asserts that the two-byte
    sequence still matches itself, while the single bytes 0xC5 and 0xE5
    do not match each other case-insensitively -- with both the
    ``re.L | re.I`` flags argument and the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    locale_icase = re.L | re.I
    # (should_match, exact argument list handed to re.match)
    expectations = (
        (True, (b'\xc5\xe5', b'\xc5\xe5', locale_icase)),
        (False, (b'\xc5', b'\xe5', locale_icase)),
        (False, (b'\xe5', b'\xc5', locale_icase)),
        (True, (b'(?Li)\xc5\xe5', b'\xc5\xe5')),
        (False, (b'(?Li)\xc5', b'\xe5')),
        (False, (b'(?Li)\xe5', b'\xc5')),
    )
    for should_match, call_args in expectations:
        found = re.match(*call_args)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)
def parse_string(self, txt):
    """Validate *txt* against a character whitelist and return a slug.

    Bytes input is decoded as UTF-8 first. The text must start with a run
    of whitelisted characters (ASCII letters and digits, space, '.', '-',
    '_' and the accented letters listed in the pattern) reaching the end
    of the string.

    :param txt: text (``str``) or UTF-8 encoded bytes to validate.
    :return: ``False`` when the text is rejected; otherwise the text with
        accents removed and spaces replaced by underscores.
    """
    import re
    import unicodedata

    if not isinstance(txt, str):
        txt = txt.decode('utf-8')

    # Compiled without re.L: the class lists every accepted character
    # explicitly, so the locale never mattered, and Python 3.6+ raises
    # ValueError when LOCALE is combined with a str pattern.
    prog = re.compile("[-_àèìòùáéíóúñçÀÈÌÒÙÁÉÍÓÚÑÇ .a-zA-Z0-9]+$")
    if not prog.match(txt):
        return False

    # Replace accents: decompose each letter (NFD) and drop the combining
    # marks (category 'Mn'), leaving the base letters.
    unaccented = ''.join(
        c for c in unicodedata.normalize('NFD', txt)
        if unicodedata.category(c) != 'Mn'
    )
    return unaccented.replace(" ", "_")
def validCharacters(txt):
    """Return *txt* as text if it only uses whitelisted characters.

    Accepted characters are ASCII letters and digits, space, '.', '-', '_'
    and the accented letters listed in the pattern. UTF-8 encoded bytes
    are decoded first; text input is used as is.

    :param txt: UTF-8 encoded bytes or text to validate.
    :return: the decoded text when it is accepted, ``False`` otherwise.
    """
    import locale
    import re

    # Decode only real byte strings: text has no usable .decode on
    # Python 3, and on Python 2 decoding unicode forces an ASCII round-trip.
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8')

    # The locale switch is kept for callers that may rely on it, but the
    # validation below no longer depends on it, so a machine without the
    # ca_ES locale can still validate.
    try:
        locale.setlocale(locale.LC_ALL, 'ca_ES')
    except locale.Error:
        pass

    # Text pattern without re.L: every accepted character is listed
    # explicitly, and Python 3.6+ raises ValueError for LOCALE on a text
    # pattern.
    prog = re.compile(u"[-_àèìòùáéíóúñçÀÈÌÒÙÁÉÍÓÚÑÇ .a-zA-Z0-9]+$")
    if not prog.match(txt):
        return False
    return txt
def _parseString(self, txt):
import re, unicodedata, locale
if type(txt) is not str:
txt = txt.decode('utf-8')
locale.setlocale(locale.LC_ALL, 'ca_ES')
prog = re.compile("[-_àèìòùáéíóúñçÀÈÌÒÙÁÉÍÓÚÑÇ .a-zA-Z0-9]+$", re.L)
if not prog.match(txt):
return False
else:
# ~ Replace accents
txt = ''.join((c for c in unicodedata.normalize('NFD', txt) if unicodedata.category(c) != 'Mn'))
return txt.replace(" ", "_")
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertNotEqual(re.compile('^pattern$', flag), None)
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertTrue(re.compile('^pattern$', flag))
def check_en_US_iso88591(self):
    """Check locale-aware case folding of bytes under en_US.iso88591.

    Switches LC_CTYPE to 'en_US.iso88591' and asserts that the bytes 0xC5
    and 0xE5 match each other case-insensitively, both with the
    ``re.L | re.I`` flags argument and with the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    locale_icase = re.L | re.I
    # Each entry is the exact argument list handed to re.match.
    match_calls = (
        (b'\xc5\xe5', b'\xc5\xe5', locale_icase),
        (b'\xc5', b'\xe5', locale_icase),
        (b'\xe5', b'\xc5', locale_icase),
        (b'(?Li)\xc5\xe5', b'\xc5\xe5'),
        (b'(?Li)\xc5', b'\xe5'),
        (b'(?Li)\xe5', b'\xc5'),
    )
    for call_args in match_calls:
        self.assertTrue(re.match(*call_args))
def check_en_US_utf8(self):
    """Check locale-aware case folding of bytes under en_US.utf8.

    Switches LC_CTYPE to 'en_US.utf8' and asserts that the two-byte
    sequence still matches itself, while the single bytes 0xC5 and 0xE5
    do not match each other case-insensitively -- with both the
    ``re.L | re.I`` flags argument and the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    locale_icase = re.L | re.I
    # (should_match, exact argument list handed to re.match)
    expectations = (
        (True, (b'\xc5\xe5', b'\xc5\xe5', locale_icase)),
        (False, (b'\xc5', b'\xe5', locale_icase)),
        (False, (b'\xe5', b'\xc5', locale_icase)),
        (True, (b'(?Li)\xc5\xe5', b'\xc5\xe5')),
        (False, (b'(?Li)\xc5', b'\xe5')),
        (False, (b'(?Li)\xe5', b'\xc5')),
    )
    for should_match, call_args in expectations:
        found = re.match(*call_args)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertTrue(re.compile('^pattern$', flag))
def check_en_US_iso88591(self):
    """Check locale-aware case folding of bytes under en_US.iso88591.

    Switches LC_CTYPE to 'en_US.iso88591' and asserts that the bytes 0xC5
    and 0xE5 match each other case-insensitively, both with the
    ``re.L | re.I`` flags argument and with the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    locale_icase = re.L | re.I
    # Each entry is the exact argument list handed to re.match.
    match_calls = (
        (b'\xc5\xe5', b'\xc5\xe5', locale_icase),
        (b'\xc5', b'\xe5', locale_icase),
        (b'\xe5', b'\xc5', locale_icase),
        (b'(?Li)\xc5\xe5', b'\xc5\xe5'),
        (b'(?Li)\xc5', b'\xe5'),
        (b'(?Li)\xe5', b'\xc5'),
    )
    for call_args in match_calls:
        self.assertTrue(re.match(*call_args))
def check_en_US_utf8(self):
    """Check locale-aware case folding of bytes under en_US.utf8.

    Switches LC_CTYPE to 'en_US.utf8' and asserts that the two-byte
    sequence still matches itself, while the single bytes 0xC5 and 0xE5
    do not match each other case-insensitively -- with both the
    ``re.L | re.I`` flags argument and the inline ``(?Li)`` prefix.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    locale_icase = re.L | re.I
    # (should_match, exact argument list handed to re.match)
    expectations = (
        (True, (b'\xc5\xe5', b'\xc5\xe5', locale_icase)),
        (False, (b'\xc5', b'\xe5', locale_icase)),
        (False, (b'\xe5', b'\xc5', locale_icase)),
        (True, (b'(?Li)\xc5\xe5', b'\xc5\xe5')),
        (False, (b'(?Li)\xc5', b'\xe5')),
        (False, (b'(?Li)\xe5', b'\xc5')),
    )
    for should_match, call_args in expectations:
        found = re.match(*call_args)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertNotEqual(re.compile('^pattern$', flag), None)
def test_constants(self):
self.assertEqual(re.I, re.IGNORECASE)
self.assertEqual(re.L, re.LOCALE)
self.assertEqual(re.M, re.MULTILINE)
self.assertEqual(re.S, re.DOTALL)
self.assertEqual(re.X, re.VERBOSE)
def test_flags(self):
for flag in [re.I, re.M, re.X, re.S, re.L]:
self.assertTrue(re.compile('^pattern$', flag))
def get_info(host):
    """Fetch *host* and return the text of its first ``<title>`` element.

    The response is decoded with the encoding detected from its content
    (``apparent_encoding``) before the title is extracted.

    :param host: URL to request (10 second timeout).
    :return: the title text, or ``None`` when the page has no
        ``<title>...</title>`` pair or the request fails (the error is
        printed, not raised -- this is a best-effort probe).
    """
    try:
        req = requests.get(host, timeout=10)
        req.encoding = req.apparent_encoding
        # No re.L here: Python 3.6+ raises ValueError for LOCALE on a str
        # pattern, and the broad handler below turned that into a silent
        # None for every host.
        result = re.findall('<title>(.*?)</title>', req.text)
    except Exception as e:
        print(e)
        return None
    # A page without a title is an ordinary outcome, not an error.
    return result[0] if result else None
#----------------------------------------------------------------------
def iternext(self):
"""
Return the next token of the replace template and advance the cursor.

A backslash whose following text matches the replace-reference pattern
is combined with the matched text into one token (the original note
lists escaped l, L, c, C, E and backslash); other input is returned one
character at a time. In format mode, curly brackets are checked against
the format-group pattern.

Raises StopIteration once the index has passed max_index, SyntaxError
when a one-character long-form reference is found without its required
suffix, and ValueError for an unmatched curly bracket in format mode.

NOTE(review): the indentation of this snippet was lost, so the exact
nesting of the branches below cannot be read from the text -- confirm
against the upstream source before restructuring.
"""
# Nothing left to read.
if self.index > self.max_index:
raise StopIteration
# Slice (rather than index) to take the current character.
char = self.string[self.index:self.index + 1]
if char == self._b_slash:
# Try to recognise an escape reference in the text after the backslash.
m = self._replace_ref.match(self.string[self.index + 1:])
if m:
ref = m.group(0)
# A one-character match listed in _long_replace_refs is reported with
# the format that was expected for it.
if len(ref) == 1 and ref in self._long_replace_refs:
if ref == self._hex:
raise SyntaxError('Format for byte is \\xXX!')
elif ref == self._group:
raise SyntaxError('Format for group is \\g<group_name_or_index>!')
elif ref == self._unicode_name:
raise SyntaxError('Format for Unicode name is \\N{name}!')
elif ref == self._unicode_narrow: # pragma: no cover
raise SyntaxError('Format for Unicode is \\uXXXX!')
elif ref == self._unicode_wide: # pragma: no cover
raise SyntaxError('Format for wide Unicode is \\UXXXXXXXX!')
# Format mode with a match on group 3 or 4: an extra backslash is
# appended (presumably together with the cursor rewind that follows --
# nesting to be confirmed).
if self.use_format and (m.group(3) or m.group(4)):
char += self._b_slash
self.index -= 1
# Append the matched text (group 1, else group 2) unless format mode
# matched group 4.
if not self.use_format or not m.group(4):
char += m.group(1) if m.group(1) else m.group(2)
# Format mode only: a curly bracket is checked against the format-group
# pattern.
elif self.use_format and char in (self._lc_bracket, self._rc_bracket):
m = self._format_replace_group.match(self.string[self.index:])
if m:
if m.group(2):
char = m.group(2)
else:
self.index += 1
else:
raise ValueError("Single unmatched curly bracket!")
# Advance by the length of the token and remember it as the current one.
self.index += len(char)
self.current = char
return self.current