Python re module: re.U usage examples
The following 49 code examples, extracted from open-source Python projects, illustrate how to use re.U.
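For orientation: re.U (re.UNICODE) makes \w, \b, \s and related classes follow the Unicode database. The flag mainly matters on Python 2, where str patterns default to ASCII semantics; on Python 3 it is already the default for str patterns. A minimal sketch:

import re

# Without re.U, Python 2 matches only ASCII word characters here
# (yielding [u'caf', u'ol']); with the flag, the accented letters are kept.
# On Python 3 the flag is a no-op for str patterns.
print(re.findall(r'\w+', u'caf\xe9 ol\xe9', re.U))  # -> ['café', 'olé']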
def setup(self, config):
"""
Compile configured regular expressions.
:param config: Configuration object.
:type config: ``dict``
"""
self.matches = {}
patterns = []
for entity_type, pattern_conf in config.get(helper.ENTITIES, {}).items():
patterns.append(
r'\b(?P<{}>{})\b'.format(entity_type, pattern_conf[helper.PATTERN]))
self.pattern = regex.compile(
'|'.join(patterns),
regex.I | regex.U)
def split_into_sentences(text):
potential_end_pat = re.compile(r"".join([
r"([\w\.'’&\]\)]+[\.\?!])", # A word that ends with punctuation
r"([‘’“”'\"\)\]]*)", # Followed by optional quote/parens/etc
r"(\s+(?![a-z\-–—]))", # Followed by whitespace + non-(lowercase or dash)
]),
re.U
)
dot_iter = re.finditer(potential_end_pat, text)
end_indices = [
(x.start() + len(x.group(1)) + len(x.group(2)))
for x in dot_iter
if is_sentence_ender(x.group(1))
]
spans = zip([None] + end_indices, end_indices + [None])
sentences = [
text[start:end].strip() for start, end in spans
]
return sentences
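A quick usage sketch for the function above. It depends on an is_sentence_ender helper that is not part of the snippet, so a simplified stand-in is assumed here:

def is_sentence_ender(word):
    # simplified stand-in: any word ending in '.', '?' or '!' ends a sentence
    return word[-1] in u'.?!'

print(split_into_sentences(u'It was late! Was it? Yes.'))
# -> ['It was late!', 'Was it?', 'Yes.']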
def replace_wiki_links(text, raw_link=False):
"""
Replace wiki links of the form '[user_id|link_text]' in the text with proper HTML
:param text: the text to process
:param raw_link: if True, render the wiki link as plain text instead of an HTML anchor
"""
link_format = "{1} (vk.com/{0})" if raw_link else "<a href=\"https://vk.com/{0}\">{1}</a>"
pattern = re.compile(r"\[([^|]+)\|([^|]+)\]", re.U)
results = pattern.findall(text)  # flags are already baked into the compiled pattern
for i in results:
user_id = i[0]
link_text = i[1]
before = "[{0}|{1}]".format(user_id, link_text)
after = link_format.format(user_id, link_text)
text = text.replace(before, after)
return text
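A quick check of both output modes (the id and link text below are made up for illustration):

print(replace_wiki_links(u'hi [id1|Pavel]!'))
# -> hi <a href="https://vk.com/id1">Pavel</a>!
print(replace_wiki_links(u'hi [id1|Pavel]!', raw_link=True))
# -> hi Pavel (vk.com/id1)!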
def _extract_info(self, soup):
empty_info = {'from': 0, 'to': 0, 'total': 0}
div_ssb = soup.find('div', id='ssb')
if not div_ssb:
self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
return empty_info
p = div_ssb.find('p')
if not p:
self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
return empty_info
txt = ''.join(p.findAll(text=True))
txt = txt.replace(',', '')
matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
if not matches:
return empty_info
return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
else:
return m.group(0)
def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
else:
return m.group(0)
s = re.sub(r'&#(\d+);', ascii_replacer, str, flags=re.U)
return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
def _extract_info(self, soup):
empty_info = {'from': 0, 'to': 0, 'total': 0}
td_rsb = soup.find('td', 'rsb')
if not td_rsb:
self._maybe_raise(ParseError, "Td with number of results was not found on Blogs search page", soup)
return empty_info
font = td_rsb.find('font')
if not font:
self._maybe_raise(ParseError, """<p> tag within <tr class='rsb'> was not found on Blogs search page""", soup)
return empty_info
txt = ''.join(font.findAll(text=True))
txt = txt.replace(',', '')
matches = None
if self.hl == 'es':
matches = re.search(r'Resultados (\d+) - (\d+) de (?:aproximadamente )?(\d+)', txt, re.U)
elif self.hl == 'en':
matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt, re.U)
if not matches:
return empty_info
return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
else:
return m.group(0)
def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
else:
return m.group(0)
s = re.sub(r'&#(\d+);', ascii_replacer, str, flags=re.U)
return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
def preprocess_simple( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True ):
"""
Preprocess a list containing text documents stored as strings, where the documents have already been tokenized and are separated by whitespace
"""
token_pattern = re.compile(r"[\s\-]+", re.U)
def custom_tokenizer( s ):
return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length) ]
# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
if apply_norm:
norm_function = "l2"
else:
norm_function = None
tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range)
X = tfidf.fit_transform(docs)
terms = []
# store the vocabulary map
v = tfidf.vocabulary_
for i in range(len(v)):
terms.append("")
for term in v.keys():
terms[ v[term] ] = term
return (X,terms)
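Assuming scikit-learn is installed and TfidfVectorizer has been imported as the snippet expects, a minimal run could look like this:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['the quick brown fox', 'the lazy dog', 'quick quick dog']
X, terms = preprocess_simple(docs, stopwords=None, min_df=1)
print(X.shape)        # (3, vocabulary size) sparse TF-IDF matrix
print(sorted(terms))  # the recovered vocabulary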
def search_filename(fname, fields):
"""Extract movie title/date from filename and return dict with movies infos
"""
path_tokens = os.path.normpath(fname).split(os.sep)
candidate = path_tokens[-1]
res = re.split(FNAME_SPLIT_RE, candidate,
flags=re.I | re.U)[0].strip()
res = scrub(res, '[({])}', ' ')
res = ' '.join([x for x in re.split(r'[\s\._]', res, flags=re.U) if x])
years = re.findall(r'((?:19|20)\d\d)', res)
if years:
toks = re.split(r'(%s)' % years[-1], res)
else:
toks = [res]
title = toks[0].strip()
year = toks[1] if len(toks) > 1 else None
item = search_by(title, year, fields)
if item:
item['filename'] = fname
return item
def __get_series_data(program, ext_info):
episode = int(program['episode'])
season = int(program['season'])
desc = ext_info['synopsis'] if ext_info else u'Año: %s' % program['year']
if season == 0:
sn = re.findall(r'.*\sT(\d*/?\d+).*', program['full_title'], re.U)
season = int(sn[0].replace('/', '')) if sn else season
if 'episode_title' in program:
title = program['serie']
stitle = '%ix%02d %s' % (season, episode, program['episode_title'])
else:
title = re.findall(r'(.*)\sT\d*/?\d+.*', program['full_title'], re.U)
title = title[0] if title else program['full_title']
stitle = '%ix%02d %s' % (
season, episode, ext_info['originalTitle']
if ext_info and 'originalTitle' in ext_info else 'Episodio %i' % episode
)
return {
'title': title,
'sub-title': stitle,
'season': season if season > 0 else '',
'episode': episode,
'desc': desc
}
def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
if have_unicode:
assert u(r'\u212a').lower() == u'k' # 'K' (KELVIN SIGN)
self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
assert u(r'\u017f').upper() == u'S' # 'ſ' (LATIN SMALL LETTER LONG S)
self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
def test_ignore_case_set(self):
self.assertTrue(re.match(r'[19A]', 'A', re.I))
self.assertTrue(re.match(r'[19a]', 'a', re.I))
self.assertTrue(re.match(r'[19a]', 'A', re.I))
self.assertTrue(re.match(r'[19A]', 'a', re.I))
if have_unicode:
self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
assert u(r'\u212a').lower() == u'k' # 'K' (KELVIN SIGN)
self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
assert u(r'\u017f').upper() == u'S' # 'ſ' (LATIN SMALL LETTER LONG S)
self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
if have_unicode:
assert u(r'\u212a').lower() == u'k' # 'K' (KELVIN SIGN)
self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
assert u(r'\u017f').upper() == u'S' # 'ſ' (LATIN SMALL LETTER LONG S)
self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
def test_ignore_case_set(self):
self.assertTrue(re.match(r'[19A]', 'A', re.I))
self.assertTrue(re.match(r'[19a]', 'a', re.I))
self.assertTrue(re.match(r'[19a]', 'A', re.I))
self.assertTrue(re.match(r'[19A]', 'a', re.I))
if have_unicode:
self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
assert u(r'\u212a').lower() == u'k' # 'K' (KELVIN SIGN)
self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
assert u(r'\u017f').upper() == u'S' # 'ſ' (LATIN SMALL LETTER LONG S)
self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
def test_sre_character_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
if i < 256:
self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
if i < 0x10000:
self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
self.assertRaises(re.error, re.match, r"[\911]", "")
self.assertRaises(re.error, re.match, r"[\x1z]", "")
self.assertRaises(re.error, re.match, r"[\u123z]", "")
self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
self.assertRaises(re.error, re.match, r"[\U00110000]", "")
def test_sre_byte_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
self.assertIsNotNone(re.match(br"\u", b'u'))
self.assertIsNotNone(re.match(br"\U", b'U'))
self.assertIsNotNone(re.match(br"\0", b"\000"))
self.assertIsNotNone(re.match(br"\08", b"\0008"))
self.assertIsNotNone(re.match(br"\01", b"\001"))
self.assertIsNotNone(re.match(br"\018", b"\0018"))
self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
self.assertRaises(re.error, re.match, br"\911", b"")
self.assertRaises(re.error, re.match, br"\x1", b"")
self.assertRaises(re.error, re.match, br"\x1z", b"")
def m3u2list(data):
"""convert an m3u data to a list"""
matches = re.compile('^#EXTINF:-?[0-9]*(.*?),(.*?)\n(.*?)$', re.I | re.M | re.U | re.S).findall(data)
li = []
for params, display_name, url in matches:
item_data = {'params': params, 'display_name': display_name, 'url': url}
li.append(item_data)
playlist = []
for channel in li:
item_data = {'display_name': channel['display_name'], 'url': channel['url']}
matches = re.compile(' (.+?)="(.+?)"', re.I | re.M | re.U | re.S).findall(channel['params'])
for field, value in matches:
item_data[field.strip().lower().replace('-', '_')] = value.strip()
playlist.append(item_data)
return playlist
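Feeding it a small made-up #EXTINF fragment shows the shape of the result:

sample = ('#EXTINF:-1 tvg-id="one" group-title="News",Channel One\n'
          'http://example.com/stream1\n')
print(m3u2list(sample))
# -> [{'display_name': 'Channel One', 'url': 'http://example.com/stream1',
#      'tvg_id': 'one', 'group_title': 'News'}]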
def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
if have_unicode:
assert u(r'\u212a').lower() == u'k' # 'K' (KELVIN SIGN)
self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
assert u(r'\u017f').upper() == u'S' # 'ſ' (LATIN SMALL LETTER LONG S)
self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
def test_ignore_case_set(self):
self.assertTrue(re.match(r'[19A]', 'A', re.I))
self.assertTrue(re.match(r'[19a]', 'a', re.I))
self.assertTrue(re.match(r'[19a]', 'A', re.I))
self.assertTrue(re.match(r'[19A]', 'a', re.I))
if have_unicode:
self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
assert u(r'\u212a').lower() == u'k' # 'K' (KELVIN SIGN)
self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
assert u(r'\u017f').upper() == u'S' # 'ſ' (LATIN SMALL LETTER LONG S)
self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
def is_hebrew(string):
'A hacky way to check if our string is in Hebrew - check the first char'
# Drop digits from the string
string = re.sub(r'\d', '', string)
# Drop special characters from the string
string = re.sub(r'\W', '', string, flags=re.U)
# Strip the string
string = string.strip()
# Support empty strings
if not string:
return None
# Make sure the string is UTF-8
if type(string) != unicode:
string = string.decode('utf-8')
HEBREW_AB = unicode(u'אבגדהוזחטיכךלמםנןסעפףצץקרשת')
if string[0] in HEBREW_AB:
return True
else:
return False
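Since the snippet relies on unicode and str.decode, it targets Python 2; usage looks like:

print is_hebrew(u'\u05e9\u05dc\u05d5\u05dd')  # Hebrew word ("shalom") -> True
print is_hebrew(u'hello')                     # -> False
print is_hebrew(u'123')                       # nothing left after cleanup -> None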
def test_sre_character_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
if i < 256:
self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
if i < 0x10000:
self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
self.assertRaises(re.error, re.match, r"[\911]", "")
self.assertRaises(re.error, re.match, r"[\x1z]", "")
self.assertRaises(re.error, re.match, r"[\u123z]", "")
self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
self.assertRaises(re.error, re.match, r"[\U00110000]", "")
def test_sre_byte_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
self.assertTrue(re.match(br"\u", b'u'))
self.assertTrue(re.match(br"\U", b'U'))
self.assertTrue(re.match(br"\0", b"\000"))
self.assertTrue(re.match(br"\08", b"\0008"))
self.assertTrue(re.match(br"\01", b"\001"))
self.assertTrue(re.match(br"\018", b"\0018"))
self.assertTrue(re.match(br"\567", bytes([0o167])))
self.assertRaises(re.error, re.match, br"\911", b"")
self.assertRaises(re.error, re.match, br"\x1", b"")
self.assertRaises(re.error, re.match, br"\x1z", b"")
def has_omnifunc(self, ft):
if ft not in self.trigger_cache:
name = '{}_omni_trigger'.format(ft)
option = self.get_option(name)
if not option:
return False
try:
self.trigger_cache[ft] = re.compile(
to_unicode(option, 'utf-8'), re.X | re.U)
except Exception:
return False
try:
return bool(vim.current.buffer.options['omnifunc'])
except vim.error:
return False
def test_sre_character_class_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
if i < 256:
self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
if i < 0x10000:
self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
self.assertRaises(re.error, re.match, r"[\911]", "")
self.assertRaises(re.error, re.match, r"[\x1z]", "")
self.assertRaises(re.error, re.match, r"[\u123z]", "")
self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
self.assertRaises(re.error, re.match, r"[\U00110000]", "")
def test_sre_byte_literals(self):
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
self.assertTrue(re.match(br"\u", b'u'))
self.assertTrue(re.match(br"\U", b'U'))
self.assertTrue(re.match(br"\0", b"\000"))
self.assertTrue(re.match(br"\08", b"\0008"))
self.assertTrue(re.match(br"\01", b"\001"))
self.assertTrue(re.match(br"\018", b"\0018"))
self.assertTrue(re.match(br"\567", bytes([0o167])))
self.assertRaises(re.error, re.match, br"\911", b"")
self.assertRaises(re.error, re.match, br"\x1", b"")
self.assertRaises(re.error, re.match, br"\x1z", b"")
def compile(searchString):
r"""
Return the user's searchString compiled to a regular expression.
Example terms: @call +work (A) carrots
Term may be prefixed with ! or ~ for negation.
Terms may be combined with "," or " " (AND) or with "|" (OR).
Terms only match the beginning of a word in the task.
Terms are case-insensitive.
Expressions may NOT be nested with parentheses.
Only \-character special regular expression sets are allowed, everything else is escaped.
"""
if not searchString:
return None
terms = SimpleTextFilter._splitter.split(searchString)
terms = [SimpleTextFilter._term2re(term) for term in terms]
return re.compile("".join(terms), re.I | re.U)
def slugify(string, separator=r'-'):
r"""
Slugify a unicode string using unicodedata to normalize the string.
:Example:
>>> slugify(u"H\xe9ll\xf8 W\xc3\xb6rld")
'hell-world'
>>> slugify("Bonjour, tout l'monde !", separator="_")
'bonjour_tout_lmonde'
>>> slugify("\tStuff with -- dashes and... spaces \n")
'stuff-with-dashes-and-spaces'
"""
string = normalize(string)
string = re.sub(r'[^\w\s' + separator + ']', '', string, flags=re.U)
string = string.strip().lower()
return re.sub(r'[' + separator + r'\s]+', separator, string, flags=re.U)
def token_words (self,source):
list_words=[]
source_without_urls=u''
# remove URLs from the tweet
urls=re.findall (r'(http[s]*://\S+)', source,re.U)
for url in urls:
start=source.find(url)
end=len(url)
source_without_urls=source_without_urls+source[0:start-1]
source=source[start+end:]
source_without_urls=source_without_urls+source
list_tokens=re.findall (r'[#@]*\w+', source_without_urls,re.U)
# remove users and hashtags
for token in list_tokens:
if (token.find('#') == -1) and (token.find('@') == -1):
number= re.search(r'\d+',token)
if not number:
token=token.lower()
list_words.append(token)
return list_words
def set_user_mention_day(self,date,text):
list_mentions=re.findall (r'@\w+', text)
if len (list_mentions) >0:
user=list_mentions[0]
if re.match(r'[\.]*(@\w+)[^\t\n]+',text):
if user in self.top_users_reply:
index= self.top_users_reply.index(user)
self.dict_top_users_reply_day.store(date,index,1)
elif re.match(r'rt[\s]*(@\w+)[:]*',text,re.U):
if user in self.top_users_RT:
index= self.top_users_RT.index(user)
self.dict_top_users_RT_day.store(date,index,1)
for user in list_mentions:
if user in self.top_users_mention:
index= self.top_users_mention.index(user)
self.dict_top_users_mention_day.store(date,index,1)
return
def get_tweet (tweet):
data = tweet.split('\t')
if len (data) >= 10:
id_tweet = data[0]
timestamp = data[1]
date_hour =re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)',timestamp,re.U)
(year,month,day,hour,minutes,seconds) = date_hour[0]
author= data[2]
text = data[3]
app = data[4]
user_id = data[5]
followers = data[6]
following = data[7]
statuses = data[8]
loc = data[9]
return (year,month,day,hour,minutes,seconds, author,text,app,user_id,followers,following,statuses,loc)
else:
print 'tweet did not match'
return None
def get_tweet (tweet):
data = tweet.split('\t')
if len (data) >= 8:
id_tweet = data[0]
timestamp = data[1]
date_hour =re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)',timestamp,re.U)
(year,month,day,hour,minutes,seconds) = date_hour[0]
author= data[2]
text = data[3]
app = data[4]
id_user = data[5]
followers = data[6]
following = data [7]
return (id_tweet,year,month,day,hour,minutes,seconds, author,text,app,id_user,followers,following)
else:
print 'tweet did not match'
return None
def get_tweet_source (text):
source=None
text_aux=text
start=text_aux.find('RT')
while start != -1:
#print start
text=text_aux[start:]
#print text
RT = re.match(r'RT[\s]*(@\w+)[:]*',text,re.U)
if RT:
source=RT.group(1)
text_aux=text[len(RT.group(0)):]
#print text_aux
#print source
start=text_aux.find('RT')
else:
break
return (source, text_aux)
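For instance, on a retweet-style text (handle made up for illustration):

print get_tweet_source(u'RT @alice: check this out')
# -> (u'@alice', u' check this out')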
def __parse_episode_number(self, eps_title):
'''
Parse the episode number from an episode title using a list of regular
expressions; a pattern's position in the list determines its priority.
:param eps_title: the title of the episode.
:return: the episode number if matched, otherwise -1
'''
try:
for regex in episode_regex_tuple:
search_result = re.search(regex, eps_title, re.U | re.I)
if search_result is not None:
return int(search_result.group(1))
return -1
except Exception:
return -1
def parse_episode_number(self, eps_title):
'''
Parse the episode number from an episode title using a list of regular
expressions; a pattern's position in the list determines its priority.
:param eps_title: the title of the episode.
:return: the episode number if matched, otherwise -1
'''
try:
for regex in episode_regex_tuple:
search_result = re.search(regex, eps_title, re.U | re.I)
if search_result is not None:
matched_number = int(search_result.group(1))
if self.bangumi.eps_no_offset is not None:
matched_number = matched_number + self.bangumi.eps_no_offset
return matched_number
return -1
except Exception as error:
logger.warn(error)
return -1
def parse_episode_number(self, eps_title):
'''
Parse the episode number from an episode title using a list of regular
expressions; a pattern's position in the list determines its priority.
:param eps_title: the title of the episode.
:return: the episode number if matched, otherwise -1
'''
try:
for regex in episode_regex_tuple:
search_result = re.search(regex, eps_title, re.U | re.I)
if search_result is not None:
return int(search_result.group(1))
return -1
except Exception:
return -1
def normalize(text):
"""
Fix common Thai spelling artifacts in the given text; for instance a
doubled sara e ("เเ") is normalized to sara ae ("แ").
normalize(text)
returns str
For example:
>>> print(normalize("เเปลก")=="แปลก") # เ เ ป ล ก vs แปลก
True
"""
if six.PY2:
for data in rule2py2:
text=re.sub(data[0].replace(u"t",u"[่้๊๋]"),data[1],text,flags=re.U)
else:
for data in rule2:
text=re.sub(data[0].replace("t","[่้๊๋]"),data[1],text,flags=re.U)
for data in list(zip(rule1,rule1)):
text=re.sub(data[0].replace(u"t",u"[่้๊๋]")+"+",data[1],text,flags=re.U)
return text
def __init__(self,
width=70,
initial_indent="",
subsequent_indent="",
expand_tabs=True,
replace_whitespace=True,
fix_sentence_endings=False,
break_long_words=True,
drop_whitespace=True,
break_on_hyphens=True):
self.width = width
self.initial_indent = initial_indent
self.subsequent_indent = subsequent_indent
self.expand_tabs = expand_tabs
self.replace_whitespace = replace_whitespace
self.fix_sentence_endings = fix_sentence_endings
self.break_long_words = break_long_words
self.drop_whitespace = drop_whitespace
self.break_on_hyphens = break_on_hyphens
# recompile the regexes for Unicode mode -- done in this clumsy way for
# backwards compatibility because it's rather common to monkey-patch
# the TextWrapper class' wordsep_re attribute.
self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
self.wordsep_simple_re_uni = re.compile(
self.wordsep_simple_re.pattern, re.U)
# -- Private methods -----------------------------------------------
# (possibly useful for subclasses to override)
def __init__(self):
# Initialize the standard TreebankWordTokenizer.
super(self.__class__, self).__init__()
# Adding to TreebankWordTokenizer, the splits on
# - chevron quotes u'\xab' and u'\xbb'.
# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
improved_close_quote_regex = re.compile(u'([»”’])', re.U)
improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
self.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
self.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
self.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
def translate(original):#original <type 'unicode'>
# collect runs of Japanese text: hiragana, katakana, kanji, plus 'ー' and '・'
waittrans = re.findall(u'[ぁ-ん|ァ-ン|一-龥|ー|・]+',original,re.U)
findnum = len(waittrans)
subnum = 0
waitfill = original
while(findnum!=subnum):
waitfill = re.sub(u'[ぁ-ん|ァ-ン|一-龥|ー|・]+',"%s",waitfill,flags=re.U)
subnum = len(re.findall('%s',waitfill))
# if len(re.findall('%',waitfill)) != subnum:
# protect literal percent signs before using waitfill as a format string
waitfill = re.sub(u'%(?!s)',u'％',waitfill)
filltext=[]
print "workload",len(waittrans)
for line in waittrans:
# runs that are only a prolonged-sound mark or middle dot need no translation
if line in (u"ー", u"・"):
filltext.append(line)
continue
send = line.encode("utf-8")
gettrans = baidufanyi(send)
# drop a trailing sentence-final punctuation mark from the translation
if re.search(u"[。！？]",gettrans[-1]):
gettrans = gettrans[0:-1]
filltext.append(gettrans)
translation = waitfill %tuple(filltext)
translation = re.sub(u"％",u'%',translation)
return translation
def tweetassembler(**args):
in_reply_to_status = args['in_reply_to_status']
if in_reply_to_status is not None:
regex = u'.*??.*'
if re.match(regex, in_reply_to_status.text, re.U):
# get the ID of the tweet being replied to
id = in_reply_to_status.in_reply_to_status_id
# fetch that tweet
qkou_status = api.get_status(id)
entities = qkou_status.entities['hashtags']
# if the tweet carries any hashtags
if len(entities) > 0:
hashtag = entities[0]['text']
# pull the lecture/news number out of the hashtag
info_num = re.search("(?<=lec)[0-9]*", hashtag)
news_num = re.search("(?<=news)[0-9]*", hashtag)
if info_num is not None:
qkou_id = info_num.group()
log.debug("[ Stream ] Info??????")
dm_text = get_info(qkou_id)
elif news_num is not None:
news_id = news_num.group()
log.debug("[ Stream ] News??????")
dm_text = get_news(news_id)
else:
pass
try:
api.send_direct_message(
user_id=in_reply_to_status.user.id, text=dm_text)
log.debug('[ Stream ] DM sent')
except Exception as e:
log.exception(e)
else:
pass
def load(self):
Pref.view = False
Pref.elapsed_time = 0.4
Pref.running = False
Pref.wrdRx = re.compile(s.get('word_regexp', "^[^\w]?`*\w+[^\w]*$"), re.U)
Pref.wrdRx = Pref.wrdRx.match
Pref.splitRx = s.get('word_split', None)
if Pref.splitRx:
Pref.splitRx = re.compile(Pref.splitRx, re.U)
Pref.splitRx = Pref.splitRx.findall
Pref.enable_live_count = s.get('enable_live_count', True)
Pref.enable_readtime = s.get('enable_readtime', False)
Pref.enable_line_word_count = s.get('enable_line_word_count', False)
Pref.enable_line_char_count = s.get('enable_line_char_count', False)
Pref.enable_count_lines = s.get('enable_count_lines', False)
Pref.enable_count_chars = s.get('enable_count_chars', False)
Pref.enable_count_pages = s.get('enable_count_pages', True)
Pref.words_per_page = s.get('words_per_page', 300)
Pref.page_count_mode_count_words = s.get('page_count_mode_count_words', True)
Pref.char_ignore_whitespace = s.get('char_ignore_whitespace', True)
Pref.readtime_wpm = s.get('readtime_wpm', 200)
Pref.whitelist = [x.lower() for x in s.get('whitelist_syntaxes', []) or []]
Pref.blacklist = [x.lower() for x in s.get('blacklist_syntaxes', []) or []]
Pref.strip = s.get('strip', [])
for window in sublime.windows():
for view in window.views():
view.erase_status('WordCount')
view.settings().erase('WordCount')
def __init__(self, version, pattern):
self._version = version
self._ip_rex = re.compile(r"(" + pattern + r")", re.U | re.I)
self._cidr_rex = re.compile(r"\s*/\s*(\d{1,5})", re.U | re.I)
self._range_rex = re.compile(r"\s*-\s*(" + pattern + r")", re.U | re.I)
def test_from_re(self):
# re.U and re.S flags are implicitly set
self.assertEqual(RegExp.from_re(re.compile("a", re.U)), RegExp("a"))
self.assertEqual(RegExp.from_re(re.compile("a", re.S)), RegExp("a"))
# re.I flag can be set explicitly
self.assertEqual(
RegExp.from_re(re.compile("a", re.I)),
RegExp("a", ignore_case=True))
# re.M, re.L and re.X are forbidden
for flag in [re.M, re.L, re.X]:
self.assertRaises(ValueError, RegExp.from_re, re.compile("a", flag))
def init(self, pattern, ignore_case=False):
Atom.init(self)
flags = re.U | re.S | (re.I if ignore_case else 0)
self._regexp = re.compile(pattern, flags)
def format_regexp(format, regexp):
escape_slash_rex = re.compile(r"((?:^|[^\\])(?:\\\\)*?)(/+)", re.U)
def escape_slash(match):
return match.group(1) + match.group(2).replace("/", "\\/")
pattern = regexp.pattern
pattern = escape_slash_rex.sub(escape_slash, pattern)
result = "/" + pattern + "/"
if regexp.ignore_case:
result += "i"
yield result
def escape_whitespace(unicode_string):
r"""
Return the given unicode string with the whitespace escaped
using 'unicode-escape' encoding.
>>> escape_whitespace(u"space is not escaped")
u'space is not escaped'
>>> escape_whitespace(u"multi\nline\nwith\ttabs")
u'multi\\nline\\nwith\\ttabs'
"""
return re.sub(r"\s", lambda x: unicode(x.group(0).encode("unicode-escape")), unicode_string, re.U)
def slugify(value, allow_unicode=False):
"""
Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
Remove characters that aren't alphanumerics, underscores, or hyphens.
Convert to lowercase. Also strip leading and trailing whitespace.
"""
value = force_text(value)
if allow_unicode:
value = unicodedata.normalize('NFKC', value)
value = re.sub('[^\w\s-]', '', value, flags=re.U).strip().lower()
return mark_safe(re.sub('[-\s]+', '-', value, flags=re.U))
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
value = re.sub('[^\w\s-]', '', value).strip().lower()
return mark_safe(re.sub('[-\s]+', '-', value))
def remove_tags(html, tags):
"""Returns the given HTML with given tags removed."""
warnings.warn(
"django.utils.html.remove_tags() and the removetags template filter "
"are deprecated. Consider using the bleach library instead.",
RemovedInDjango110Warning, stacklevel=3
)
tags = [re.escape(tag) for tag in tags.split()]
tags_re = '(%s)' % '|'.join(tags)
starttag_re = re.compile(r'<%s(/?>|(\s+[^>]*>))' % tags_re, re.U)
endtag_re = re.compile('</%s>' % tags_re)
html = starttag_re.sub('', html)
html = endtag_re.sub('', html)
return html
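For example (calling it also emits the deprecation warning shown above):

print(remove_tags('<b>Bold</b> and <i>italic</i> text', 'b i'))
# -> Bold and italic text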
def __init__(self,
width=70,
initial_indent="",
subsequent_indent="",
expand_tabs=True,
replace_whitespace=True,
fix_sentence_endings=False,
break_long_words=True,
drop_whitespace=True,
break_on_hyphens=True):
self.width = width
self.initial_indent = initial_indent
self.subsequent_indent = subsequent_indent
self.expand_tabs = expand_tabs
self.replace_whitespace = replace_whitespace
self.fix_sentence_endings = fix_sentence_endings
self.break_long_words = break_long_words
self.drop_whitespace = drop_whitespace
self.break_on_hyphens = break_on_hyphens
# recompile the regexes for Unicode mode -- done in this clumsy way for
# backwards compatibility because it's rather common to monkey-patch
# the TextWrapper class' wordsep_re attribute.
self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
self.wordsep_simple_re_uni = re.compile(
self.wordsep_simple_re.pattern, re.U)
# -- Private methods -----------------------------------------------
# (possibly useful for subclasses to override)