The following 46 code examples, extracted from open-source Python projects, illustrate how to use lxml.html.tostring().
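Before the examples, a minimal orientation sketch (the snippet itself is illustrative, not taken from any of the projects below): tostring() serializes an element back to markup, returning bytes by default and a str when encoding='unicode' is passed; method='text' extracts only the text content.

from lxml import html

# fragment_fromstring parses a snippet containing exactly one element
# and returns that HtmlElement.
el = html.fragment_fromstring('<p>Hello <b>world</b></p>')

html.tostring(el)                                     # -> b'<p>Hello <b>world</b></p>'
html.tostring(el, encoding='unicode')                 # -> '<p>Hello <b>world</b></p>'
html.tostring(el, method='text', encoding='unicode')  # -> 'Hello world'
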
def lxml_test():
    url = "http://www.caixunzz.com"
    req = urllib2.Request(url=url)
    resp = urllib2.urlopen(req)
    #print resp.read()
    '''
    parse_body=html.fromstring(resp.read())
    href=parse_body.xpath('//a[@class="label"]/@href')
    print href
    #not working from above
    '''
    tree = etree.HTML(resp.read())
    href = tree.xpath('//a[@class="label"]/@href')
    #print href.tag
    for i in href:
        #print html.tostring(i)
        #print type(i)
        print i
    print type(href)
    #not working yet

def process_item(self, item, spider):
    if item is not None:
        doc = item['content']
        if not isinstance(doc, (str, bytes)):
            if isinstance(doc, HtmlElement):
                item['content'] = tostring(doc, encoding='UTF-8', pretty_print=True, method='html')
                item['encoding'] = 'UTF-8'
            else:
                raise Exception((
                    'Error in store pipeline unsupported doc type[{}]'
                ).format(doc.__class__.__name__))
        item_ = dict(item)
        item_['lang'] = get_article_lang(item)
        item_['spider'] = spider._id
        item_['source'] = spider.title
        item_['category'] = get_category(item_)
        if not is_exists_article(item_):
            save_article(item_)
    return item

def setUp(self):
    super(TestViewSaving, self).setUp()
    self.arch = h.DIV(
        h.DIV(
            h.H3("Column 1"),
            h.UL(
                h.LI("Item 1"),
                h.LI("Item 2"),
                h.LI("Item 3"))),
        h.DIV(
            h.H3("Column 2"),
            h.UL(
                h.LI("Item 1"),
                h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                h.LI(h.SPAN("+00 00 000 00 0 000", attrs(model='res.company', id=1, field='phone', type='char')))
            ))
    )
    self.view_id = self.registry('ir.ui.view').create(self.cr, self.uid, {
        'name': "Test View",
        'type': 'qweb',
        'arch': ET.tostring(self.arch, encoding='utf-8').decode('utf-8')
    })

def ingest(self, file_path):
    """Ingestor implementation."""
    file_size = self.result.size or os.path.getsize(file_path)
    if file_size > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")
    try:
        doc = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")
    text = self.extract_html_text(doc.getroot())
    transform = etree.XSLT(self.XSLT)
    html_doc = transform(doc)
    html_body = html.tostring(html_doc, encoding='unicode', pretty_print=True)
    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)

def WriteHTML(self, testcaseinfo):
    self.CreateHtmlFile()
    f = open(self.reportfile, "r")
    htmlcontent = f.read()
    f.close()
    #tree = mytree.fromstring(str(htmlcontent))
    htmlcontent.encode('utf-8')
    tree = html.fromstring(htmlcontent)
    tableElem = tree.find(".//table")
    if testcaseinfo.result == "Failed":
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
            testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result,
            testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    else:
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
            testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result,
            testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    tableElem.append(mytree.HTML(str(mytablerow)))
    f = open(self.reportfile, "w")
    #html.tostring
    newContent = repr(html.tostring(tree, method="html", with_tail=False))
    newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
    newContent = newContent[:len(newContent) - 1]
    f.write(newContent)
    f.close()

def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        # So Instapaper doesn't close <li> tags
        # This was causing infinite recursion when using BS directly
        # Hence why the stuff below is being done, so that the <li> tags get closed
        self.html = html.document_fromstring(self.opened_file.read())
        self.html = html.tostring(self.html)
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            _sanitize_html_rec(f, valid_tags, valid_attributes)
            if f.tag in valid_tags:
                _clean_attributes(f, valid_attributes)
                yield html.tostring(f, encoding="unicode")
            else:
                if f.text:
                    yield f.text
                for sub in f:
                    yield html.tostring(sub, encoding="unicode")
                if f.tail:
                    yield f.tail
            if f.tag in ('p', 'br'):
                yield '\n'
        else:
            yield f

def totext(self, elem):
    return self.tostring(elem, encoding=unicode, method='text').strip()

def parse_results_page(self, root):  # {{{
    from lxml.html import tostring
    matches = []

    def title_ok(title):
        title = title.lower()
        bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)',
               '( slipcase with door )', ': free sampler']
        for x in bad:
            if x in title:
                return False
        # if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
        #     # Bad entries in the catalog
        #     return False
        return True

    for a in root.xpath(r'//li[starts-with(@class, "line")]//a[@href and contains(@name, "itemlist-picture")]'):
        # title = a.get('title')
        # if title_ok(title):
        url = a.get('href')
        if url.startswith('/'):
            url = 'http://product.dangdang.com/%s' % (url)
        matches.append(url)

    # Keep only the top 5 matches as the matches are sorted by relevance by
    # Amazon so lower matches are not likely to be very relevant
    return matches[:5]
    # }}}

def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text
    return mark_safe(tostring(fragment, encoding=unicode))

def lxml_case3():
    text = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html">first item</a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item><span>Hello world</span></a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a>
            <li class="de-item-0"><a href="link5.html">fifth item</a>
        </ul>
    </div>
    '''
    tree = etree.HTML(text)
    html_s = etree.tostring(tree)
    #print html_s
    #print tree.xpath('//li//span/text()')[0]
    '''
    reg_case=tree.xpath('//*[starts-with(@class,"item")]')
    for i in reg_case:
        print i.xpath('.//a/@href')
    '''
    # re:match is an EXSLT extension function; the namespace must be declared,
    # otherwise the xpath() call raises an XPathEvalError.
    result = tree.xpath(r'//*[re:match(@class, "item-0")]',
                        namespaces={'re': 'http://exslt.org/regular-expressions'})
    print result
    for i in result[0]:
        print i.xpath('.//a/@href')

def test_body(self):
    html = '''<body><p>test</p></body>'''
    res = b'''<html><body><p>test</p></body></html>'''
    tree = self.soupparser.fromstring(html)
    self.assertEqual(tostring(tree), res)

def test_head_body(self):
    # HTML tag missing, parser should fix that
    html = '<head><title>test</title></head><body><p>test</p></body>'
    res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
    tree = self.soupparser.fromstring(html)
    self.assertEqual(tostring(tree), res)

def test_wrap_html(self):
    # <head> outside <html>, parser should fix that
    html = '<head><title>title</test></head><html><body/></html>'
    res = b'<html><head><title>title</title></head><body></body></html>'
    tree = self.soupparser.fromstring(html)
    self.assertEqual(tostring(tree), res)

def test_comment_pi(self):
    html = '''<!-- comment -->
<?test asdf?>
<head><title>test</title></head><body><p>test</p></body>
<!-- another comment -->'''
    res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
    tree = self.soupparser.fromstring(html).getroottree()
    self.assertEqual(tostring(tree, method='html'), res)

def test_doctype1(self):
    # Test document type declaration, comments and PI's
    # outside the root
    html = \
        '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''
    res = \
        b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
    tree = self.soupparser.fromstring(html).getroottree()
    self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
    self.assertEqual(tostring(tree), res)

def test_doctype_html5(self):
    # html 5 doctype declaration
    html = b'<!DOCTYPE html>\n<html lang="en"></html>'
    tree = self.soupparser.fromstring(html).getroottree()
    self.assertTrue(tree.docinfo.public_id is None)
    self.assertEqual(tostring(tree), html)

def get_language(self, s_intervention, p, i_lang, new_paragraphs):
    language = p.xpath(
        './/span[@class="italic"][text()[re:test(.,"^[\xad\s\.—–\-?,\(]*({})[\xad\s\.—–\-?,\)]*")]]'.format(
            '|'.join(self.langs)),
        namespaces=self.ns)
    if len(language) > 0 and not self.explanations_of_vote.match(language[0].text):
        lang = re.match(
            r'.*({}).*'.format('|'.join(self.langs)),
            language[0].text)
        output = lang.group(1)
        for l in language:
            l.drop_tree()
    else:
        p = html.tostring(p, with_tail=True, encoding='utf-8').decode('utf-8')
        lang_in_text = re.search(
            r'\(({})\)'.format('|'.join(self.langs)), p)
        if lang_in_text is not None:
            output = lang_in_text.group(1)
            p = re.sub(r'\(({})\) *'.format('|'.join(self.langs)), r'', p)
        else:
            if len(new_paragraphs) == 0:
                if 'role' in s_intervention.keys():
                    president_pattern = '|'.join(self.loc['president'])
                    if re.match(r'{}\Z'.format(president_pattern), s_intervention['role']):
                        output = 'unknown'
                    else:
                        if i_lang is None:
                            output = self.language.upper()
                        else:
                            output = i_lang
                else:
                    if i_lang is None:
                        output = self.language.upper()
                    else:
                        output = i_lang
            else:
                output = new_paragraphs[-1]['language']
        p = html.fromstring(p)
    return output, p

def serialize(self, infile, root):
    ofile_name = os.path.splitext(os.path.basename(infile))[0]
    ofile_path = os.path.join(self.outdir, ofile_name + '.xml')
    xml = etree.tostring(
        root,
        encoding='utf-8',
        xml_declaration=True,
        pretty_print=True).decode('utf-8')
    with open(ofile_path, mode='w', encoding='utf-8') as ofile:
        ofile.write(xml)

def get_name(self, tree):
    name = tree.xpath('//li[@class="mep_name"]')[0]
    name = self.rm_a.clean_html(name)
    name = html.tostring(name).decode('utf-8')
    name = re.sub(r'[\t\n]', r'', name)
    name = name.split('<br>')
    name = [html.fromstring(x).text_content() for x in name]
    name = ' '.join(name)
    return name

def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)
        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode')

def from_html(self, cr, uid, model, field, element, context=None):
    content = []
    if element.text:
        content.append(element.text)
    content.extend(html.tostring(child)
                   for child in element.iterchildren(tag=etree.Element))
    return '\n'.join(content)

def test_save(self):
    Company = self.registry('res.company')
    View = self.registry('ir.ui.view')

    replacement = ET.tostring(h.DIV(
        h.H3("Column 2"),
        h.UL(
            h.LI("wob wob wob"),
            h.LI(h.SPAN("Acme Corporation", attrs(model='res.company', id=1, field='name', expression="bob", type='char'))),
            h.LI(h.SPAN("+12 3456789", attrs(model='res.company', id=1, field='phone', expression="edmund", type='char'))),
        )
    ), encoding='utf-8')
    View.save(self.cr, self.uid, res_id=self.view_id, value=replacement,
              xpath='/div/div[2]')

    company = Company.browse(self.cr, self.uid, 1)
    self.assertEqual(company.name, "Acme Corporation")
    self.assertEqual(company.phone, "+12 3456789")
    self.eq(
        ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
        h.DIV(
            h.DIV(
                h.H3("Column 1"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI("Item 2"),
                    h.LI("Item 3"))),
            h.DIV(
                h.H3("Column 2"),
                h.UL(
                    h.LI("wob wob wob"),
                    h.LI(h.SPAN({'t-field': "bob"})),
                    h.LI(h.SPAN({'t-field': "edmund"}))
                ))
        )
    )

def test_save_only_embedded(self):
    Company = self.registry('res.company')
    company_id = 1
    Company.write(self.cr, self.uid, company_id, {'name': "Foo Corporation"})
    node = html.tostring(h.SPAN(
        "Acme Corporation",
        attrs(model='res.company', id=company_id, field="name", expression='bob', type='char')))
    self.registry('ir.ui.view').save(self.cr, self.uid, res_id=company_id, value=node)
    company = Company.browse(self.cr, self.uid, company_id)
    self.assertEqual(company.name, "Acme Corporation")

def test_field_tail(self):
    View = self.registry('ir.ui.view')
    replacement = ET.tostring(
        h.LI(h.SPAN("+12 3456789", attrs(
                    model='res.company', id=1, type='char',
                    field='phone', expression="edmund")),
             "whop whop"
        ), encoding="utf-8")
    View.save(self.cr, self.uid, res_id=self.view_id, value=replacement,
              xpath='/div/div[2]/ul/li[3]')

    self.eq(
        ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
        h.DIV(
            h.DIV(
                h.H3("Column 1"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI("Item 2"),
                    h.LI("Item 3"))),
            h.DIV(
                h.H3("Column 2"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                    h.LI(h.SPAN({'t-field': "edmund"}), "whop whop"),
                ))
        )
    )

def modify_html(content, prop='_content'):
    html_string = getattr(content, prop)
    html_tree = html.fromstring(html_string)

    yield html_tree

    html_string = html.tostring(html_tree, encoding='unicode')
    html_string = re.sub(r'%7B(\w+)%7D', r'{\1}', html_string)
    html_string = re.sub(r'%7C(\w+)%7C', r'|\1|', html_string)
    setattr(content, prop, html_string)

def get_content(self, page, meta):
    if not page.is_html:
        return page.content
    check_path = self.config.data.get('check_path')
    if check_path is not None:
        if page.doc.find(check_path) is None:
            log.info("Failed XML path check: %r", page.url)
            return None

    for meta_el in ['title', 'author', 'date']:
        path = self.config.data.get('%s_path' % meta_el)
        if path is not None and page.doc.findtext(path):
            meta[meta_el] = page.doc.findtext(path)

    if 'date' in meta:
        try:
            date = meta.pop('date')
            date = parse(date)
            if 'dates' not in meta:
                meta['dates'] = []
            meta['dates'].append(date.isoformat())
        except Exception as ex:
            log.exception(ex)

    body = page.doc
    if self.config.data.get('body_path') is not None:
        body = page.doc.find(self.config.data.get('body_path'))
    for path in self.config.data.get('remove_paths', []):
        for el in body.findall(path):
            el.drop_tree()
    return html.tostring(body)

def parse_movie_details(self, response):
    html_root = html.fromstring(response.content, base_url=response.base_url)
    movie_info = dict()
    movie_info['??'] = self.xpath_first(html_root, '//div[@id="content"]'
                                                   '/h1/span[1]/text()').strip()
    try:
        # to pure text
        soup = BeautifulSoup(html.tostring(
            self.xpath_first(html_root, '//div[@id="info"]')), 'html')
    except TypeError:
        return None
    else:
        for line in soup.get_text().splitlines():
            try:
                left, *right = line.split(':')
            except AttributeError:
                pass
            else:
                key = left.strip()
                value = ''.join(x.strip() for x in right)
                if key and value:
                    movie_info[key] = value
    yield movie_info

def test_convert_spans(self):
    expected = '''
        <p>
            <em><strong>
                foobar
                <em>
                    lala
                    <strong>
                        yum
                    </strong>
                </em>
                <span>
                    hey hey
                </span>
                <strong>
                    uh oh
                </strong>
                <span>
                    yes
                </span>
            </strong></em>
        </p>
    '''
    h = fromstring(html)
    for span in h.findall('.//span'):
        html2md.convert_span(span)
    result = tostring(h).decode('utf-8')

    results = [x.replace('\n', '').replace(' ', '') for x in [result, expected]]
    print('=========')
    print(results[0])
    print('=========')
    print(results[1])
    self.assertEqual(results[0], results[1])

def html_to_markdown(html):
    """convert html to markdown.
    this will try and convert span styling to the proper tags as well.
    e.g. `<span style='font-weight:bold;'>foo</span>` will become
    `<strong>foo</strong>`.
    """
    h = fromstring(html)
    clean_highlighted_code(h)
    for span in h.findall('.//span') + h.findall('.//font'):
        convert_span(span)
    html = tostring(h).decode('utf-8')

    # not ideal but works in a pinch
    html = html.replace('<mark>', '==')
    html = html.replace('</mark>', '==')

    md = to_md(html)

    # sometimes html2text returns a ton of extra whitespace.
    # clean up lines with only whitespace.
    # condense line break streaks of 3 or more.
    md = re.sub(r'\n([\s\*_]+)\n', '\n\n', md)
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md

def rewrite_links(raw_html, rewrite_func):
    """
    Take an HTML input string, rewrite links according to the
    `rewrite_func`, return the rewritten HTML string.
    """
    html = fromstring(raw_html)
    html.rewrite_links(rewrite_func)
    return tostring(html)

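A quick usage sketch of the wrapper above; the prefix function and its URL are made up for illustration, and note that because tostring() is called without encoding='unicode', the result is bytes:

def add_prefix(link):
    # Prepend a (hypothetical) redirect endpoint to every link target.
    return 'https://t.example.com/?u=' + link

rewrite_links('<a href="http://example.org">x</a>', add_prefix)
# -> b'<a href="https://t.example.com/?u=http://example.org">x</a>'
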
def view_source(self):
    for line in html.tostring(self.lxml_html, pretty_print=True, encoding='unicode').split('\n'):
        print(line)

def get_html_for(self, locator):
    """Returns the HTML of the element (including its own tags) targeted by the given `locator`

    :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
    """
    xpath = six.text_type(locator)
    element = self.xpath(xpath)[0]
    return html.tostring(element, encoding='unicode')

def get_inner_html_for(self, locator):
    """Returns the HTML of the children of the element targeted by the given
    `locator` (excluding the element's own tags).

    :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
    """
    xpath = six.text_type(locator)
    element = self.xpath(xpath)[0]
    return ''.join(html.tostring(child, encoding='unicode')
                   for child in element.getchildren())

def WriteHTML(self, testcaseinfo):
    self.CreateHtmlFile()
    f = open(self.reportfile, "r")
    htmlcontent = f.read()
    f.close()
    # tree = mytree.fromstring(str(htmlcontent))
    htmlcontent.encode('utf-8')
    tree = html.fromstring(htmlcontent)
    tableElem = tree.find(".//table")
    if testcaseinfo.result == "Failed":
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
            testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result,
            testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    elif testcaseinfo.result == "Pass":
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#00FF00\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
            testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result,
            testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    else:
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
            testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result,
            testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    tableElem.append(mytree.HTML(str(mytablerow)))
    f = open(self.reportfile, "w")
    # html.tostring
    newContent = repr(html.tostring(tree, method="html", with_tail=False))
    newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
    newContent = newContent[:len(newContent) - 1]
    f.write(newContent)
    f.close()

def process_content(jsonBody, item_dict):
    entry = json.loads(jsonBody)
    content = Xhtml.fromstring(entry['body'])
    # get author
    # print item_dict['json_url']
    try:
        author = content.xpath('//span[@class="author"]/text()')[0].strip()
    except IndexError:
        author = ''
    try:
        bio = content.xpath('//span[@class="bio"]/text()')[0].strip()
    except IndexError:
        bio = ''
    item_dict['author'] = author + bio
    coverelement = Element('img')
    coverelement.set('src', item_dict['cover'])
    content.insert(0, coverelement)
    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    # print "++++\tGet zhihu items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['author']
    print item_dict['link']
    return item_dict

def process_content(html, item_dict):
    root = Xhtml.fromstring(html)
    # locate the article body
    try:
        content = root.xpath('//*[@class="article-content"]')[0]
    except IndexError:
        return ''
    # item_dict['cover'] = None
    imgs = root.xpath('//img[@src]')
    if imgs:
        for img in imgs:
            src = img.attrib['src'].strip()
            if (not item_dict['cover']) and src[-3:].lower() in ['jpg', 'png', 'gif']:
                item_dict['cover'] = 'http:' + src
                # insert the cover image at the top
                coverelement = Element('img')
                coverelement.set('src', item_dict['cover'])
                content.insert(0, coverelement)
            elif src[:22] == "data:image/png;base64,":
                img.set("src", "")
            else:
                pass
    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    # print "++++\tGet jaq items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['desc']
    print item_dict['link']
    return item_dict

def __init__(self, failure):
    traceback = html.Element("pre")
    traceback.text = failure.getTraceback()
    super(StartFailedPage, self).__init__(
        status=int(SERVICE_UNAVAILABLE),
        brief="MAAS failed to start",
        detail=html.tostring(traceback, encoding=str))

def adapt_html(
        html_text, extra_metadata, click_tracking=True, open_tracking=True,
        configuration=None, **kwargs):
    """Changes an HTML string by replacing links (<a href...>) with tracking
    links and by adding a 1x1 transparent pixel just before the closing body
    tag.

    :param html_text: The HTML to change (unicode or bytestring).
    :param extra_metadata: A dict that can be json-encoded and that will
        be encoded in the tracking link.
    :param click_tracking: If links (<a href...>) must be changed.
    :param open_tracking: If a transparent pixel must be added before the
        closing body tag.
    :param configuration: An optional Configuration instance.
    :param kwargs: Optional configuration parameters. If provided with a
        Configuration instance, the kwargs parameters will override the
        Configuration parameters.
    """
    configuration = get_configuration(configuration, kwargs)
    tree = html.fromstring(html_text)
    if click_tracking:
        _replace_links(tree, extra_metadata, configuration)
    if open_tracking:
        _add_tracking_pixel(tree, extra_metadata, configuration)
    new_html_text = html.tostring(tree)
    return new_html_text.decode("utf-8")

def _clean_html(html_value, cleaner):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            cleaner(f)
            yield html.tostring(f, encoding="unicode")
        else:
            yield f

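Because _clean_html is a generator that yields serialized elements interleaved with bare text nodes, a caller would typically join its output back into one string. A minimal sketch, with a toy cleaner that simply drops attributes (for illustration only, not the project's actual cleaner):

def strip_attrs(el):
    # Remove every attribute from the element in place.
    el.attrib.clear()

''.join(_clean_html('hello <b class="x">world</b>', strip_attrs))
# -> 'hello <b>world</b>'
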
def markdown(target, image=False):
    fragment = _markdown_fragment(target, image)
    return html.tostring(fragment, encoding=unicode)[5:-6]  # <div>...</div>

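The [5:-6] slice works because the serialized fragment starts with '<div>' (five characters) and ends with '</div>' (six), so slicing leaves only the inner HTML. A standalone illustration of the same trick:

from lxml import html

div = html.fragment_fromstring('<div><p>inner</p></div>')
html.tostring(div, encoding='unicode')[5:-6]  # -> '<p>inner</p>'
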
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector

def _render_comments(self, desc):
    from calibre.library.comments import sanitize_comments_html
    import html5lib
    # html5lib parsed noscript as CDATA
    desc = html5lib.parseFragment('<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')),
                                  treebuilder='lxml', namespaceHTMLElements=False)[0]
    matches = desc.xpath('descendant::*[contains(text(), "????") \
        or contains(text(), "????") or contains(text(), "????") \
        or contains(text(), "????") or contains(text(), "????")]/../*[self::p or self::div or self::span]')
    if matches:
        if len(matches) > 1:
            desc = matches[-1]
            for item in matches:
                content_len = len(self.totext(item))
                if content_len > 50 and content_len < 200:
                    desc = item
                    break

    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for c in desc.xpath('descendant::*[@class="seeAll" or'
                        ' @class="emptyClear" or @id="collapsePS" or'
                        ' @id="expandPS"]'):
        c.getparent().remove(c)
    #
    for a in desc.xpath('descendant::a[@href]'):
        del a.attrib['href']
        a.tag = 'span'
    desc = self.tostring(desc, method='text', encoding=unicode).strip()
    # return desc
    # Encoding bug in Amazon data U+fffd (replacement char)
    # in some examples it is present in place of '
    desc = desc.replace('\ufffd', "'")
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    desc = re.sub('\n+', '\n', desc)
    desc = re.sub(' +', ' ', desc)
    # Remove the notice about text referring to out of print editions
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)

def parse_series(self, root):
    ans = (None, None)

    # This is found on the paperback/hardback pages for books on amazon.com
    series = root.xpath('//div[@data-feature-name="seriesTitle"]')
    if series:
        series = series[0]
        spans = series.xpath('./span')
        if spans:
            raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
            m = re.search('\s+([0-9.]+)$', raw.strip())
            if m is not None:
                series_index = float(m.group(1))
                s = series.xpath('./a[@id="series-page-link"]')
                if s:
                    series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
                    if series:
                        ans = (series, series_index)

    # This is found on Kindle edition pages on amazon.com
    if ans == (None, None):
        for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
            text = (span.text or '').strip()
            m = re.match('Book\s+([0-9.]+)', text)
            if m is not None:
                series_index = float(m.group(1))
                a = span.xpath('./a[@href]')
                if a:
                    series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
                    if series:
                        ans = (series, series_index)

    if ans == (None, None):
        desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
        if desc:
            raw = self.tostring(desc[0], method='text', encoding=unicode)
            raw = re.sub(r'\s+', ' ', raw)
            match = self.series_pat.search(raw)
            if match is not None:
                s, i = match.group('series'), float(match.group('index'))
                if s:
                    ans = (s, i)

    if ans[0]:
        ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
        ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
    return ans

def update_zendesk_article_html(self):
    '''
    rewrite the html of zendesk articles to point anchor tags at new
    zendesk articles, instead of old uservoice articles
    '''
    print "**UPDATING HTML to switch anchor hrefs to zendesk"
    url = '{}/api/v2/help_center/categories/{}/articles.json'.format(self.zendesk_url, self.zendesk_destination_category_id)
    articles = []
    while url:
        response = requests.get(url, headers=self.headers, auth=self.credentials)
        if response.status_code != 200:
            print('FAILED to get article list with error {}'.format(response.status_code))
            exit()
        data = response.json()
        for article in data['articles']:
            articles.append(article)
        url = data['next_page']
    print "UPDATING HTML for {} articles".format(len(articles))
    for article in articles:
        url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
        response = requests.get(url, headers=self.headers, auth=self.credentials)
        if response.status_code != 200:
            print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
            exit()
        html_doc = fromstring(article['body'])
        for anchor_tag in html_doc.cssselect('a'):
            if not anchor_tag.get('href'):
                continue
            number_from_string_regex = re.search('(\d+)', anchor_tag.get('href'))
            if not number_from_string_regex:
                continue
            uv_id = int(number_from_string_regex.group(0))
            if uv_id in self.uvid_to_zdid:
                url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, self.uvid_to_zdid[uv_id])
                response = requests.get(url, headers=self.headers, auth=self.credentials)
                if response.status_code != 200:
                    print('FAILED to get article {} with error {}'.format(self.uvid_to_zdid[uv_id], response.status_code))
                    exit()
                new_url = response.json()['article']['html_url']
                try:
                    print('CHANGING {} to {}'.format(anchor_tag.get('href'), new_url))
                except:
                    e = sys.exc_info()[0]
                    print "lxml parsing error {}".format(e)
                anchor_tag.set('href', new_url)
                info = {
                    'body': tostring(html_doc)
                }
                payload = json.dumps({'article': info})
                url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
                response = requests.put(url, data=payload, headers=self.headers, auth=self.credentials)
                if response.status_code != 200:
                    print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
                    exit()
            else:
                print "SKIPPING this href {}".format(anchor_tag.get('href'))